From 65e17236df8f542fbce8993a9ab87f8fc43eac8f Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Sat, 17 Feb 2024 23:06:49 -0800 Subject: [PATCH 01/57] update deps --- package-lock.json | 19 ++++++++++--------- poetry.lock | 36 ++++++++++++++++++------------------ 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/package-lock.json b/package-lock.json index 364dbeb..15c9aed 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1361,9 +1361,9 @@ } }, "node_modules/caniuse-lite": { - "version": "1.0.30001587", - "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001587.tgz", - "integrity": "sha512-HMFNotUmLXn71BQxg8cijvqxnIAofforZOwGsxyXJ0qugTdspUF4sPSJ2vhgprHCB996tIDzEq1ubumPDV8ULA==", + "version": "1.0.30001588", + "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001588.tgz", + "integrity": "sha512-+hVY9jE44uKLkH0SrUTqxjxqNTOWHsbnQDIKjwkZ3lNTzUUVdBLBGXtj/q5Mp5u98r3droaZAewQuEDzjQdZlQ==", "dev": true, "funding": [ { @@ -11554,16 +11554,17 @@ } }, "node_modules/typed-array-byte-offset": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/typed-array-byte-offset/-/typed-array-byte-offset-1.0.0.tgz", - "integrity": "sha512-RD97prjEt9EL8YgAgpOkf3O4IF9lhJFr9g0htQkm0rchFp/Vx7LW5Q8fSXXub7BXAODyUQohRMyOc3faCPd0hg==", + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/typed-array-byte-offset/-/typed-array-byte-offset-1.0.1.tgz", + "integrity": "sha512-tcqKMrTRXjqvHN9S3553NPCaGL0VPgFI92lXszmrE8DMhiDPLBYLlvo8Uu4WZAAX/aGqp/T1sbA4ph8EWjDF9Q==", "dev": true, "dependencies": { - "available-typed-arrays": "^1.0.5", - "call-bind": "^1.0.2", + "available-typed-arrays": "^1.0.6", + "call-bind": "^1.0.7", "for-each": "^0.3.3", + "gopd": "^1.0.1", "has-proto": "^1.0.1", - "is-typed-array": "^1.1.10" + "is-typed-array": "^1.1.13" }, "engines": { "node": ">= 0.4" diff --git a/poetry.lock b/poetry.lock index 0a7a93b..407bbc2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -791,28 +791,28 @@ files = [ 
[[package]] name = "ruff" -version = "0.2.1" +version = "0.2.2" description = "An extremely fast Python linter and code formatter, written in Rust." optional = false python-versions = ">=3.7" files = [ - { file = "ruff-0.2.1-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:dd81b911d28925e7e8b323e8d06951554655021df8dd4ac3045d7212ac4ba080" }, - { file = "ruff-0.2.1-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:dc586724a95b7d980aa17f671e173df00f0a2eef23f8babbeee663229a938fec" }, - { file = "ruff-0.2.1-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c92db7101ef5bfc18e96777ed7bc7c822d545fa5977e90a585accac43d22f18a" }, - { file = "ruff-0.2.1-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:13471684694d41ae0f1e8e3a7497e14cd57ccb7dd72ae08d56a159d6c9c3e30e" }, - { file = "ruff-0.2.1-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a11567e20ea39d1f51aebd778685582d4c56ccb082c1161ffc10f79bebe6df35" }, - { file = "ruff-0.2.1-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:00a818e2db63659570403e44383ab03c529c2b9678ba4ba6c105af7854008105" }, - { file = "ruff-0.2.1-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:be60592f9d218b52f03384d1325efa9d3b41e4c4d55ea022cd548547cc42cd2b" }, - { file = "ruff-0.2.1-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fbd2288890b88e8aab4499e55148805b58ec711053588cc2f0196a44f6e3d855" }, - { file = "ruff-0.2.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3ef052283da7dec1987bba8d8733051c2325654641dfe5877a4022108098683" }, - { file = "ruff-0.2.1-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:7022d66366d6fded4ba3889f73cd791c2d5621b2ccf34befc752cb0df70f5fad" }, - { file = "ruff-0.2.1-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0a725823cb2a3f08ee743a534cb6935727d9e47409e4ad72c10a3faf042ad5ba" }, - { file = 
"ruff-0.2.1-py3-none-musllinux_1_2_i686.whl", hash = "sha256:0034d5b6323e6e8fe91b2a1e55b02d92d0b582d2953a2b37a67a2d7dedbb7acc" }, - { file = "ruff-0.2.1-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:e5cb5526d69bb9143c2e4d2a115d08ffca3d8e0fddc84925a7b54931c96f5c02" }, - { file = "ruff-0.2.1-py3-none-win32.whl", hash = "sha256:6b95ac9ce49b4fb390634d46d6ece32ace3acdd52814671ccaf20b7f60adb232" }, - { file = "ruff-0.2.1-py3-none-win_amd64.whl", hash = "sha256:e3affdcbc2afb6f5bd0eb3130139ceedc5e3f28d206fe49f63073cb9e65988e0" }, - { file = "ruff-0.2.1-py3-none-win_arm64.whl", hash = "sha256:efababa8e12330aa94a53e90a81eb6e2d55f348bc2e71adbf17d9cad23c03ee6" }, - { file = "ruff-0.2.1.tar.gz", hash = "sha256:3b42b5d8677cd0c72b99fcaf068ffc62abb5a19e71b4a3b9cfa50658a0af02f1" }, + { file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:0a9efb032855ffb3c21f6405751d5e147b0c6b631e3ca3f6b20f917572b97eb6" }, + { file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:d450b7fbff85913f866a5384d8912710936e2b96da74541c82c1b458472ddb39" }, + { file = "ruff-0.2.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecd46e3106850a5c26aee114e562c329f9a1fbe9e4821b008c4404f64ff9ce73" }, + { file = "ruff-0.2.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5e22676a5b875bd72acd3d11d5fa9075d3a5f53b877fe7b4793e4673499318ba" }, + { file = "ruff-0.2.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1695700d1e25a99d28f7a1636d85bafcc5030bba9d0578c0781ba1790dbcf51c" }, + { file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:b0c232af3d0bd8f521806223723456ffebf8e323bd1e4e82b0befb20ba18388e" }, + { file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f63d96494eeec2fc70d909393bcd76c69f35334cdbd9e20d089fb3f0640216ca" }, + { file = 
"ruff-0.2.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6a61ea0ff048e06de273b2e45bd72629f470f5da8f71daf09fe481278b175001" }, + { file = "ruff-0.2.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e1439c8f407e4f356470e54cdecdca1bd5439a0673792dbe34a2b0a551a2fe3" }, + { file = "ruff-0.2.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:940de32dc8853eba0f67f7198b3e79bc6ba95c2edbfdfac2144c8235114d6726" }, + { file = "ruff-0.2.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0c126da55c38dd917621552ab430213bdb3273bb10ddb67bc4b761989210eb6e" }, + { file = "ruff-0.2.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:3b65494f7e4bed2e74110dac1f0d17dc8e1f42faaa784e7c58a98e335ec83d7e" }, + { file = "ruff-0.2.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:1ec49be4fe6ddac0503833f3ed8930528e26d1e60ad35c2446da372d16651ce9" }, + { file = "ruff-0.2.2-py3-none-win32.whl", hash = "sha256:d920499b576f6c68295bc04e7b17b6544d9d05f196bb3aac4358792ef6f34325" }, + { file = "ruff-0.2.2-py3-none-win_amd64.whl", hash = "sha256:cc9a91ae137d687f43a44c900e5d95e9617cb37d4c989e462980ba27039d239d" }, + { file = "ruff-0.2.2-py3-none-win_arm64.whl", hash = "sha256:c9d15fc41e6054bfc7200478720570078f0b41c9ae4f010bcc16bd6f4d1aacdd" }, + { file = "ruff-0.2.2.tar.gz", hash = "sha256:e62ed7f36b3068a30ba39193a14274cd706bc486fad521276458022f7bccb31d" }, ] [[package]] From 3ce61254dcf8ca26cb614e76d4e09b96f1ffae54 Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Mon, 19 Feb 2024 14:11:47 -0800 Subject: [PATCH 02/57] titles after tokens --- NEWS.md | 5 ++ comicfn2dict/parse.py | 103 ++++++++++++++++++----- comicfn2dict/regex.py | 2 +- pyproject.toml | 2 +- tests/comic_filenames.py | 175 +++++++++++++++++++++++++++++++++++++-- 5 files changed, 256 insertions(+), 31 deletions(-) diff --git a/NEWS.md b/NEWS.md index 65cf227..525bf58 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,10 @@ # 📰 comicfn2dict News +## v0.2.0 + +- Titles are now parsed only if 
they occur after the series token AND after + either issue, year or volume. + ## v0.1.4 - Require Python 3.10 diff --git a/comicfn2dict/parse.py b/comicfn2dict/parse.py index 6a75aef..f41a7f9 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -1,10 +1,10 @@ """Parse comic book archive names using the simple 'parse' parser.""" +from pprint import pprint from pathlib import Path from re import Match, Pattern from typing import Any from comicfn2dict.regex import ( - DASH_SPLIT_RE, EXTRA_SPACES_RE, ISSUE_ANYWHERE_RE, ISSUE_BEGIN_RE, @@ -26,9 +26,13 @@ from comicfn2dict.regex import ( _REMAINING_GROUP_KEYS = ("series", "title") -def _parse_ext(name: str, suffix: str, metadata: dict) -> str: +def _parse_ext(name: str | Path, metadata: dict) -> str: """Pop the extension from the pathname.""" - data = name.removesuffix(suffix) + if isinstance(name, str): + name = name.strip() + path = Path(name) + suffix = path.suffix + data = path.name.removesuffix(suffix) ext = suffix.lstrip(".") if ext: metadata["ext"] = ext @@ -43,17 +47,18 @@ def _clean_dividers(data: str) -> str: def _get_data_list(path: str | Path, metadata: dict) -> list[str]: """Prepare data list from a path or string.""" - if isinstance(path, str): - path = path.strip() - path = Path(path) - data = _parse_ext(path.name, path.suffix, metadata) + data = _parse_ext(path, metadata) data = _clean_dividers(data) - return DASH_SPLIT_RE.split(data) + return [data] -def _paren_strip(value: str) -> str: +def _grouping_operators_strip(value: str) -> str: """Strip spaces and parens.""" - return value.strip().strip("()").strip() + value = value.strip() + value = value.strip("()").strip() + value = value.strip("-").strip() + value = value.strip("'").strip('"').strip() + return value def _splicey_dicey( @@ -71,7 +76,7 @@ def _splicey_dicey( if data_after := data[match.end() :].strip(): data_ends.append(data_after) data_list[index:index] = data_ends - return _paren_strip(value) + return 
_grouping_operators_strip(value) def _match_original_format_and_scan_info( @@ -83,10 +88,10 @@ def _match_original_format_and_scan_info( scan_info = match.group("scan_info") except IndexError: scan_info = None - metadata["original_format"] = _paren_strip(original_format) + metadata["original_format"] = _grouping_operators_strip(original_format) match_group = 1 if scan_info: - metadata["scan_info"] = _paren_strip(scan_info) + metadata["scan_info"] = _grouping_operators_strip(scan_info) match_group = 0 _splicey_dicey(data_list, index, match, match_group=match_group) @@ -112,14 +117,16 @@ def _pop_value_from_token( regex: Pattern, key: str, index: int = 0, -) -> Match: +) -> str: """Search token for value, splice and assign to metadata.""" data = data_list[index] match = regex.search(data) if match: value = _splicey_dicey(data_list, index, match, key) metadata[key] = value - return match + else: + value = "" + return value def _parse_item( @@ -128,21 +135,25 @@ def _parse_item( regex: Pattern, key: str, start_index: int = 0, + path: str = "", ) -> int: """Parse a value from the data list into metadata and alter the data list.""" + path_index = -1 index = start_index dl_len = end_index = len(data_list) if index >= end_index: index = 0 while index < end_index: - match = _pop_value_from_token(data_list, metadata, regex, key, index) - if match: + value = _pop_value_from_token(data_list, metadata, regex, key, index) + if value: + if "key" == "issue": + path_index = path.find(value) break index += 1 if index > dl_len and start_index > 0: index = 0 end_index = start_index - return index + return path_index def _pop_issue_from_text_fields( @@ -156,7 +167,39 @@ def _pop_issue_from_text_fields( return data_list.pop(index) -def _assign_remaining_groups(data_list: list[str], metadata: dict): +TITLE_PRECEDING_KEYS = ("issue", "year", "volume") + + +def _is_title_in_position(path, value, metadata): + """Does the title come after series and one other token if they exist.""" + # TODO 
this could be faster if indexes could be grabbed for these tokens + # when they are extracted. + title_index = path.find(value) + + # Does a series come first. + series = metadata.get("series") + if not series: + return False + series_index = path.find(series) + if title_index < series_index: + return False + + # If other tokens exist then they much precede the title. + title_ok = False + other_tokens_exist = False + for preceding_key in TITLE_PRECEDING_KEYS: + preceding_value = metadata.get(preceding_key) + if not preceding_value: + continue + other_tokens_exist = True + preceding_index = path.find(preceding_value) + if title_index > preceding_index: + title_ok = True + break + return title_ok or not other_tokens_exist + + +def _assign_remaining_groups(data_list: list[str], metadata: dict, path: str): """Assign series and title.""" index = 0 for key in _REMAINING_GROUP_KEYS: @@ -167,7 +210,9 @@ def _assign_remaining_groups(data_list: list[str], metadata: dict): match = REMAINING_GROUP_RE.search(data) if data else None if match: value = _pop_issue_from_text_fields(data_list, metadata, index) - value = _paren_strip(value) + if key == "title" and not _is_title_in_position(path, value, metadata): + continue + value = _grouping_operators_strip(value) if value: metadata[key] = value else: @@ -184,10 +229,17 @@ def _pickup_issue(remainders: list[str], metadata: dict) -> None: _parse_item(remainders, metadata, ISSUE_ANYWHERE_RE, "issue") +def _log_progress(label, metadata, data_list): + print(label + ":") + pprint(metadata) + pprint(data_list) + + def comicfn2dict(path: str | Path) -> dict[str, Any]: """Parse the filename with a hierarchy of regexes.""" metadata = {} data_list = _get_data_list(path, metadata) + _log_progress("INITIAL", metadata, data_list) # Parse paren tokens _parse_item(data_list, metadata, ISSUE_COUNT_RE, "issue_count") @@ -206,26 +258,33 @@ def comicfn2dict(path: str | Path) -> dict[str, Any]: "scan_info", start_index=of_index + 1, ) + 
_log_progress("AFTER PAREN TOKENS", metadata, data_list) # Parse regular tokens _parse_item(data_list, metadata, VOLUME_RE, "volume") - _parse_item(data_list, metadata, ISSUE_NUMBER_RE, "issue") + _parse_item(data_list, metadata, ISSUE_NUMBER_RE, "issue", path=str(path)) + _log_progress("AFTER REGULAR TOKENS", metadata, data_list) # Pickup year if not gotten. if "year" not in metadata: _parse_item(data_list, metadata, YEAR_BEGIN_RE, "year") if "year" not in metadata: _parse_item(data_list, metadata, YEAR_END_RE, "year") + _log_progress("AFTER YEAR PICKUP", metadata, data_list) # Pickup issue if it's a standalone token if "issue" not in metadata: _parse_item(data_list, metadata, ISSUE_TOKEN_RE, "issue") + _log_progress("AFTER ISSUE PICKUP", metadata, data_list) + # Series and Title. Also looks for issue. - _assign_remaining_groups(data_list, metadata) + _assign_remaining_groups(data_list, metadata, str(path)) + _log_progress("AFTER SERIES AND TITLE", metadata, data_list) # Final try for issue number. _pickup_issue(data_list, metadata) + _log_progress("AFTER ISSUE PICKUP", metadata, data_list) # Add Remainders if data_list: diff --git a/comicfn2dict/regex.py b/comicfn2dict/regex.py index d5f8145..d49273d 100644 --- a/comicfn2dict/regex.py +++ b/comicfn2dict/regex.py @@ -72,4 +72,4 @@ ISSUE_BEGIN_RE = re_compile(r"^(" + _ISSUE_RE_EXP + r")\b") ISSUE_ANYWHERE_RE = re_compile(r"\b(" + _ISSUE_RE_EXP + r")\b") # LONG STRINGS -REMAINING_GROUP_RE = re_compile(r"^[\w].*[^\)]") +REMAINING_GROUP_RE = re_compile(r"^[^\()].*[^\)]") diff --git a/pyproject.toml b/pyproject.toml index c9299fc..5f662e4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "comicfn2dict" -version = "0.1.4" +version = "0.2.0" description = "Parse common comic filenames and return a dict of metadata attributes. Includes a cli." 
license = "GPL-3.0-only" authors = ["AJ Slater "] diff --git a/tests/comic_filenames.py b/tests/comic_filenames.py index 753c047..3700d65 100644 --- a/tests/comic_filenames.py +++ b/tests/comic_filenames.py @@ -80,13 +80,11 @@ FNS = { "original_format": "digital", }, "Bardude - The Last Thing I Remember.cbz": { - "series": "Bardude", - "title": "The Last Thing I Remember", + "series": "Bardude - The Last Thing I Remember", "ext": "cbz", }, "Drunkguy - The Man Without Fear - 01.cbz": { - "series": "Drunkguy", - "title": "The Man Without Fear", + "series": "Drunkguy - The Man Without Fear", "issue": "01", "ext": "cbz", }, @@ -125,9 +123,8 @@ FNS = { "scan_info": "Zone-Empire", "title": "Last Bullet", }, - "Jeremy John - A Big Long Title (2017) (digital-Minutement).cbz": { - "series": "Jeremy John", - "title": "A Big Long Title", + "Jeremy John - Not A Title (2017) (digital-Minutement).cbz": { + "series": "Jeremy John - Not A Title", "year": "2017", "ext": "cbz", "original_format": "digital", @@ -243,3 +240,167 @@ FNS = { "ext": "cbz", }, } + +FNS.update( # Newly fixed. + { + "'Batman - Superman - World's Finest 022 (2024) (Webrip) (The Last Kryptonian-DCP).cbz": { + "ext": "cbz", + "issue": "022", + "remainders": ("(The Last Kryptonian-DCP)",), + "scan_info": "Webrip", + "series": "Batman - Superman - World's Finest", + "year": "2024", + }, + } +) + +FNS.update( + { + # Issue number starting with a letter requested in https://github.com/comictagger/comictagger/issues/543 + "batman #B01 title.cbz": { + "ext": "cbz", + "issue": "B01", + "series": "batman", + "title": "title", + }, # Leading issue number is usually an alternate sequence number + "52 action comics #2024.cbz": { + "ext": "cbz", + "issue": "2024", + "series": "action comics", + "alternate": "52", + }, # 4 digit issue number + "action comics 1024.cbz": { + "ext": "cbz", + "issue": "1024", + "series": "action comics", + }, # Only the issue number. 
CT ensures that the series always has a value if possible + "#52.cbz": { + "ext": "cbz", + "issue": "52", + "series": "52", + }, # CT treats double-underscore the same as double-dash + "Monster_Island_v1_#2__repaired__c2c.cbz": { + "ext": "cbz", + "issue": "2", + "series": "Monster Island", + "volume": "1", + }, # I'm not sure there's a right way to parse this. This might also be a madeup filename I don't remember + "Super Strange Yarns (1957) #92 (1969).cbz": { + "ext": "cbz", + "issue": "92", + "series": "Super Strange Yarns", + "volume": "1957", + "year": "1969", + }, # Extra - in the series + " X-Men-V1-#067.cbr": { + "ext": "cbr", + "issue": "067", + "series": "X-Men", + "volume": "1", + }, # CT only separates this into a title if the '-' is attached to the previous word eg 'aquaman- Green Arrow'. @bpepple opened a ticket for this https://github.com/ajslater/comicfn2dict/issues/1 already + "Aquaman - Green Arrow - Deep Target #01 (of 07) (2021).cbr": { + "ext": "cbr", + "issue": "01", + "series": "Aquaman - Green Arrow - Deep Target", + "year": "2021", + "issue_count": "7", + }, + "Batman_-_Superman_#020_(2021).cbr": { + "ext": "cbr", + "issue": "020", + "series": "Batman - Superman", + "year": "2021", + }, + "Free Comic Book Day - Avengers.Hulk (2021).cbz": { + "ext": "cbz", + "series": "Free Comic Book Day - Avengers Hulk", + "year": "2021", + }, # CT assumes the volume is also the issue number if it can't find an issue number + "Avengers By Brian Michael Bendis volume 03 (2013).cbz": { + "ext": "cbz", + "issue": "3", + "series": "Avengers By Brian Michael Bendis", + "volume": "03", + "year": "2013", + }, # Publishers like to re-print some of their annuals using this format for the year + "Batman '89 (2021) .cbr": { + "ext": "cbr", + "series": "Batman '89", + "year": "2021", + }, # CT has extra processing to re-attach the year in this case + "Blade Runner Free Comic Book Day 2021 (2021).cbr": { + "ext": "cbr", + "series": "Blade Runner Free Comic Book Day 
2021", + "year": "2021", + }, # CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series) + "Bloodshot Book 03 (2020).cbr": { + "ext": "cbr", + "issue": "03", + "series": "Bloodshot", + "title": "Book 03", + "volume": "03", + "year": "2020", + }, # CT checks for the following '(of 06)' after the '03' and marks it as the volume + "Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021).cbr": { + "ext": "cbr", + "issue": "008", + "series": "Elephantmen 2259", + "title": "Simple Truth", + "volume": "03", + "year": "2021", + "volume_count": "06", + }, # CT catches the year + "Marvel Previews #002 (January 2022).cbr": { + "ext": "cbr", + "issue": "002", + "series": "Marvel Previews", + "year": "2022", + }, # c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder + "Marvel Two In One V1 #090 c2c.cbr": { + "ext": "cbr", + "issue": "090", + "series": "Marvel Two In One", + "publisher": "Marvel", + "volume": "1", + }, # This made the parser in CT much more complicated. It's understandable that this isn't parsed on the first few iterations of this project + "Star Wars - War of the Bounty Hunters - IG-88 (2021).cbz": { + "ext": "cbz", + "series": "Star Wars - War of the Bounty Hunters - IG-88", + "year": "2021", + }, # The addition of the '#1' turns this into the same as 'Aquaman - Green Arrow - Deep Target' above + "Star Wars - War of the Bounty Hunters - IG-88 #1 (2021).cbz": { + "ext": "cbz", + "issue": "1", + "series": "Star Wars - War of the Bounty Hunters - IG-88", + "year": "2021", + }, # CT treats '[]' as equivalent to '()', catches DC as a publisher and 'Sep-Oct 1951' as dates and removes them. 
CT doesn't catch the digital though so that could be better but I blame whoever made this atrocious filename + "Wonder Woman #49 DC Sep-Oct 1951 digital [downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire).cbz": { + "ext": "cbz", + "issue": "49", + "series": "Wonder Woman", + "title": "digital", + "publisher": "DC", + "year": "1951", + }, # CT notices that this is a full date, CT doesn't actually return the month or day though just removes it + "X-Men, 2021-08-04 (#02).cbz": { + "ext": "cbz", + "issue": "02", + "series": "X-Men", + "year": "2021", + }, # CT treats ':' the same as '-' but here the ':' is attached to 'Now' which CT sees as a title separation + "Cory Doctorow's Futuristic Tales of the Here and Now: Anda's Game #001 (2007).cbz": { + "ext": "cbz", + "issue": "001", + "series": "Cory Doctorow's Futuristic Tales of the Here and Now", + "title": "Anda's Game", + "year": "2007", + }, # This is a contrived test case. I've never seen this I just wanted to handle it with my parser + "Cory Doctorow's Futuristic Tales of the Here and Now #0.0.1 (2007).cbz": { + "ext": "cbz", + "issue": "0.1", + "series": "Cory Doctorow's Futuristic Tales of the Here and Now", + "year": "2007", + "issue_count": "", + }, + } +) From 71dd1d3972892df8044de2dba1fd375e7eb8b43d Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Mon, 19 Feb 2024 14:18:38 -0800 Subject: [PATCH 03/57] alphabetical leading characters after # for issue --- NEWS.md | 2 ++ comicfn2dict/regex.py | 4 ++-- tests/comic_filenames.py | 14 ++++++++------ 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index 525bf58..dcabf89 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,8 @@ - Titles are now parsed only if they occur after the series token AND after either issue, year or volume. +- Issue numbers that start with a '#' character may contain alphabetical + characters. 
## v0.1.4 diff --git a/comicfn2dict/regex.py b/comicfn2dict/regex.py index d49273d..73fdc45 100644 --- a/comicfn2dict/regex.py +++ b/comicfn2dict/regex.py @@ -41,7 +41,6 @@ ORIGINAL_FORMAT_PATTERNS = ( # CLEAN NON_SPACE_DIVIDER_RE = re_compile(r"[_\+]") -DASH_SPLIT_RE = re_compile(r"\s-\s") EXTRA_SPACES_RE = re_compile(r"\s\s+") # PAREN GROUPS @@ -64,8 +63,9 @@ ORIGINAL_FORMAT_SCAN_INFO_RE = re_compile( # REGULAR TOKENS VOLUME_RE = re_compile(r"((?:v(?:ol(?:ume)?)?\.?)\s*(?P\d+))") +_ISSUE_NUMBER_RE_EXP = r"(?P[\w½]+\.?\d*\w*)" +ISSUE_NUMBER_RE = re_compile(r"(#" + _ISSUE_NUMBER_RE_EXP + r")") _ISSUE_RE_EXP = r"(?P[\d½]+\.?\d*\w*)" -ISSUE_NUMBER_RE = re_compile(r"(#" + _ISSUE_RE_EXP + r")") ISSUE_TOKEN_RE = re_compile(r"^(" + _ISSUE_RE_EXP + r")$") ISSUE_END_RE = re_compile(r"\b(" + _ISSUE_RE_EXP + r")$") ISSUE_BEGIN_RE = re_compile(r"^(" + _ISSUE_RE_EXP + r")\b") diff --git a/tests/comic_filenames.py b/tests/comic_filenames.py index 3700d65..67e8ac5 100644 --- a/tests/comic_filenames.py +++ b/tests/comic_filenames.py @@ -251,18 +251,20 @@ FNS.update( # Newly fixed. "series": "Batman - Superman - World's Finest", "year": "2024", }, - } -) - -FNS.update( - { # Issue number starting with a letter requested in https://github.com/comictagger/comictagger/issues/543 "batman #B01 title.cbz": { "ext": "cbz", "issue": "B01", "series": "batman", "title": "title", - }, # Leading issue number is usually an alternate sequence number + }, + } +) + + +FNS.update( + { + # Leading issue number is usually an alternate sequence number "52 action comics #2024.cbz": { "ext": "cbz", "issue": "2024", From 664f54cecb807f05e36b5ae6939b4d96fedff865 Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Tue, 20 Feb 2024 00:21:54 -0800 Subject: [PATCH 04/57] make parser a class. 
use delimeters in a string instead of the data_list --- comicfn2dict/__init__.py | 4 +- comicfn2dict/cli.py | 12 +- comicfn2dict/comicfn2dict.py | 2 +- comicfn2dict/parse.py | 424 ++++++++++++++--------------------- comicfn2dict/regex.py | 15 +- comicfn2dict/unparse.py | 21 +- tests/comic_filenames.py | 56 +++-- tests/test_comicfn2dict.py | 4 +- 8 files changed, 246 insertions(+), 292 deletions(-) diff --git a/comicfn2dict/__init__.py b/comicfn2dict/__init__.py index 19f419b..e4f0a78 100644 --- a/comicfn2dict/__init__.py +++ b/comicfn2dict/__init__.py @@ -1,3 +1,3 @@ """Comic Filename to Dict parser and unparser.""" -from .parse import comicfn2dict # noqa: F401 -from .unparse import dict2comicfn # noqa: F401 +from .parse import ComicFilenameParser # noqa: F401 +from .unparse import serialize # noqa: F401 diff --git a/comicfn2dict/cli.py b/comicfn2dict/cli.py index d26a4ec..d1e6880 100755 --- a/comicfn2dict/cli.py +++ b/comicfn2dict/cli.py @@ -3,8 +3,7 @@ from argparse import ArgumentParser from pathlib import Path from pprint import pprint - -from comicfn2dict.parse import comicfn2dict +from comicfn2dict.parse import ComicFilenameParser def main(): @@ -12,9 +11,16 @@ def main(): description = "Comic book archive read/write tool." parser = ArgumentParser(description=description) parser.add_argument("path", help="Path of comic filename to parse", type=Path) + parser.add_argument( + "-v", + "--verbose", + default=0, + action="count", + help="Display intermediate parsing steps. 
Good for debugging.", + ) args = parser.parse_args() name = args.path.name - metadata = comicfn2dict(name) + metadata = ComicFilenameParser(name, verbose=args.verbose).parse() pprint(metadata) # noqa:T203 diff --git a/comicfn2dict/comicfn2dict.py b/comicfn2dict/comicfn2dict.py index 3b3faf8..2beb9f5 100644 --- a/comicfn2dict/comicfn2dict.py +++ b/comicfn2dict/comicfn2dict.py @@ -1,3 +1,3 @@ """API import source.""" -from comicfn2dict.parse import comicfn2dict # noqa: F401 +from comicfn2dict.parse import ComicFilenameParser # noqa: F401 from comicfn2dict.unparse import dict2comicfn # noqa: F401 diff --git a/comicfn2dict/parse.py b/comicfn2dict/parse.py index f41a7f9..327a9ec 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -1,22 +1,21 @@ """Parse comic book archive names using the simple 'parse' parser.""" from pprint import pprint +from copy import copy from pathlib import Path -from re import Match, Pattern +from re import Pattern from typing import Any from comicfn2dict.regex import ( EXTRA_SPACES_RE, ISSUE_ANYWHERE_RE, - ISSUE_BEGIN_RE, ISSUE_COUNT_RE, - ISSUE_END_RE, ISSUE_NUMBER_RE, - ISSUE_TOKEN_RE, + ISSUE_BEGIN_RE, + ISSUE_END_RE, NON_SPACE_DIVIDER_RE, - ORIGINAL_FORMAT_RE, + ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE, ORIGINAL_FORMAT_SCAN_INFO_RE, REMAINING_GROUP_RE, - SCAN_INFO_RE, VOLUME_RE, YEAR_BEGIN_RE, YEAR_END_RE, @@ -24,270 +23,195 @@ from comicfn2dict.regex import ( ) _REMAINING_GROUP_KEYS = ("series", "title") +_TITLE_PRECEDING_KEYS = ("issue", "year", "volume") +_TOKEN_DELIMETER = "/" -def _parse_ext(name: str | Path, metadata: dict) -> str: - """Pop the extension from the pathname.""" - if isinstance(name, str): - name = name.strip() - path = Path(name) - suffix = path.suffix - data = path.name.removesuffix(suffix) - ext = suffix.lstrip(".") - if ext: - metadata["ext"] = ext - return data +class ComicFilenameParser: + @staticmethod + def _clean_dividers(data: str) -> str: + """Replace non space dividers and clean extra spaces out of 
string.""" + data = NON_SPACE_DIVIDER_RE.sub(" ", data) + return EXTRA_SPACES_RE.sub(" ", data).strip() + def _parse_ext(self): + """Pop the extension from the pathname.""" + path = Path(self._unparsed_path) + suffix = path.suffix + if not suffix: + return + self.path_indexes["ext"] = self.path.rfind(suffix) -def _clean_dividers(data: str) -> str: - """Replace non space dividers and clean extra spaces out of string.""" - data = NON_SPACE_DIVIDER_RE.sub(" ", data) - return EXTRA_SPACES_RE.sub(" ", data) + data = path.name.removesuffix(suffix) + ext = suffix.lstrip(".") + self.metadata["ext"] = ext + self._unparsed_path = data + def _grouping_operators_strip(self, value: str) -> str: + """Strip spaces and parens.""" + value = value.strip() + value = value.strip("()").strip() + value = value.strip("-").strip() + value = value.strip("'").strip('"').strip() + return value -def _get_data_list(path: str | Path, metadata: dict) -> list[str]: - """Prepare data list from a path or string.""" - data = _parse_ext(path, metadata) - data = _clean_dividers(data) - return [data] - - -def _grouping_operators_strip(value: str) -> str: - """Strip spaces and parens.""" - value = value.strip() - value = value.strip("()").strip() - value = value.strip("-").strip() - value = value.strip("'").strip('"').strip() - return value - - -def _splicey_dicey( - data_list: list[str], index: int, match: Match, match_group: int | str = 0 -) -> str: - """Replace a string token from a list with two strings and the value removed. - - And return the value. 
- """ - value = match.group(match_group) - data = data_list.pop(index) - data_ends = [] - if data_before := data[: match.start()].strip(): - data_ends.append(data_before) - if data_after := data[match.end() :].strip(): - data_ends.append(data_after) - data_list[index:index] = data_ends - return _grouping_operators_strip(value) - - -def _match_original_format_and_scan_info( - match: Match, metadata: dict[str, Any], data_list: list[str], index: int -) -> None: - """Match (ORIGINAL_FORMAT-SCAN_INFO).""" - original_format = match.group("original_format") - try: - scan_info = match.group("scan_info") - except IndexError: - scan_info = None - metadata["original_format"] = _grouping_operators_strip(original_format) - match_group = 1 - if scan_info: - metadata["scan_info"] = _grouping_operators_strip(scan_info) - match_group = 0 - _splicey_dicey(data_list, index, match, match_group=match_group) - - -def _parse_original_format_and_scan_info(data_list: list[str], metadata: dict) -> int: - """Parse (ORIGINAL_FORMAT-SCAN_INFO).""" - index = 0 - match = None - for data in data_list: - match = ORIGINAL_FORMAT_SCAN_INFO_RE.search(data) - if match: - _match_original_format_and_scan_info(match, metadata, data_list, index) - break - index += 1 - else: - index = 0 - return index - - -def _pop_value_from_token( - data_list: list, - metadata: dict, - regex: Pattern, - key: str, - index: int = 0, -) -> str: - """Search token for value, splice and assign to metadata.""" - data = data_list[index] - match = regex.search(data) - if match: - value = _splicey_dicey(data_list, index, match, key) - metadata[key] = value - else: - value = "" - return value - - -def _parse_item( - data_list: list[str], - metadata: dict, - regex: Pattern, - key: str, - start_index: int = 0, - path: str = "", -) -> int: - """Parse a value from the data list into metadata and alter the data list.""" - path_index = -1 - index = start_index - dl_len = end_index = len(data_list) - if index >= end_index: - index = 0 - 
while index < end_index: - value = _pop_value_from_token(data_list, metadata, regex, key, index) - if value: - if "key" == "issue": - path_index = path.find(value) - break - index += 1 - if index > dl_len and start_index > 0: - index = 0 - end_index = start_index - return path_index - - -def _pop_issue_from_text_fields( - data_list: list[str], metadata: dict, index: int -) -> str: - """Search issue from ends of text fields.""" - if "issue" not in metadata: - _pop_value_from_token(data_list, metadata, ISSUE_END_RE, "issue", index=index) - if "issue" not in metadata: - _pop_value_from_token(data_list, metadata, ISSUE_BEGIN_RE, "issue", index=index) - return data_list.pop(index) - - -TITLE_PRECEDING_KEYS = ("issue", "year", "volume") - - -def _is_title_in_position(path, value, metadata): - """Does the title come after series and one other token if they exist.""" - # TODO this could be faster if indexes could be grabbed for these tokens - # when they are extracted. - title_index = path.find(value) - - # Does a series come first. - series = metadata.get("series") - if not series: - return False - series_index = path.find(series) - if title_index < series_index: - return False - - # If other tokens exist then they much precede the title. 
- title_ok = False - other_tokens_exist = False - for preceding_key in TITLE_PRECEDING_KEYS: - preceding_value = metadata.get(preceding_key) - if not preceding_value: - continue - other_tokens_exist = True - preceding_index = path.find(preceding_value) - if title_index > preceding_index: - title_ok = True - break - return title_ok or not other_tokens_exist - - -def _assign_remaining_groups(data_list: list[str], metadata: dict, path: str): - """Assign series and title.""" - index = 0 - for key in _REMAINING_GROUP_KEYS: - try: - data = data_list[index] - except (IndexError, TypeError): - break - match = REMAINING_GROUP_RE.search(data) if data else None - if match: - value = _pop_issue_from_text_fields(data_list, metadata, index) - if key == "title" and not _is_title_in_position(path, value, metadata): + def _parse_item( + self, + regex: Pattern, + require_all: bool = False, + ) -> None: + """Parse a value from the data list into metadata and alter the data list.""" + matches = regex.search(self._unparsed_path) + if not matches: + return + matched_metadata = {} + matched_path_indexes = {} + for key, value in matches.groupdict().items(): + if not value: + if require_all: + return continue - value = _grouping_operators_strip(value) - if value: - metadata[key] = value - else: - index += 1 + matched_path_indexes[key] = self.path.find(value) + # TODO idk if strip is necceesary here + matched_metadata[key] = self._grouping_operators_strip(value) + self.metadata.update(matched_metadata) + self.path_indexes.update(matched_path_indexes) + marked_str = regex.sub(_TOKEN_DELIMETER, self._unparsed_path) + parts = [] + for part in marked_str.split(_TOKEN_DELIMETER): + if token := part.strip(): + parts.append(token) + self._unparsed_path = _TOKEN_DELIMETER.join(parts) -def _pickup_issue(remainders: list[str], metadata: dict) -> None: - """Get issue from remaining tokens or anywhere in a pinch.""" - if "issue" in metadata: - return - _parse_item(remainders, metadata, ISSUE_TOKEN_RE, 
"issue") - if "issue" in metadata: - return - _parse_item(remainders, metadata, ISSUE_ANYWHERE_RE, "issue") + def _is_title_in_position(self, value): + """Does the title come after series and one other token if they exist.""" + title_index = self.path.find(value) + # Does a series come first. + if title_index < self.path_indexes.get("series", -1): + return False -def _log_progress(label, metadata, data_list): - print(label + ":") - pprint(metadata) - pprint(data_list) + # If other tokens exist then they much precede the title. + title_ok = False + other_tokens_exist = False + for preceding_key in _TITLE_PRECEDING_KEYS: + other_tokens_exist = True + if title_index > self.path_indexes.get(preceding_key, -1): + title_ok = True + break + return title_ok or not other_tokens_exist + def _assign_remaining_groups(self): + """Assign series and title.""" + if not self._unparsed_path: + return -def comicfn2dict(path: str | Path) -> dict[str, Any]: - """Parse the filename with a hierarchy of regexes.""" - metadata = {} - data_list = _get_data_list(path, metadata) - _log_progress("INITIAL", metadata, data_list) + # TODO fix REMAINING GROUP_RE to use token delim + tokens = self._unparsed_path.split(_TOKEN_DELIMETER) - # Parse paren tokens - _parse_item(data_list, metadata, ISSUE_COUNT_RE, "issue_count") - _parse_item(data_list, metadata, YEAR_TOKEN_RE, "year") - of_index = _parse_original_format_and_scan_info(data_list, metadata) - if "original_format" not in metadata: - of_index = _parse_item( - data_list, metadata, ORIGINAL_FORMAT_RE, "original_format" + # ASSIGN GROUPS + remaining_key_index = 0 + unused_tokens = [] + while tokens and remaining_key_index < len(_REMAINING_GROUP_KEYS): + key = _REMAINING_GROUP_KEYS[remaining_key_index] + token = tokens.pop(0) + match = REMAINING_GROUP_RE.search(token) + if match: + value = match.group() + if key == "title" and not self._is_title_in_position(value): + unused_tokens.append(token) + continue + value = 
self._grouping_operators_strip(value) + self.metadata[key] = value + self.path_indexes[key] = self.path.find(value) + remaining_key_index += 1 + else: + unused_tokens.append(token) + + self._unparsed_path = " ".join(unused_tokens) if unused_tokens else "" + + def _add_remainders(self): + """Add Remainders.""" + remainders = [] + for token in self._unparsed_path.split(_TOKEN_DELIMETER): + if remainder := token.strip(): + remainders.append(remainder) + + if remainders: + self.metadata["remainders"] = tuple(remainders) + + def _log_progress(self, label): + if not self._debug: + return + print(label + ":") + combined = {} + for key in self.metadata: + combined[key] = (self.metadata.get(key), self.path_indexes.get(key)) + pprint(combined) + print(self._unparsed_path) + + def parse(self) -> dict[str, Any]: + """Parse the filename with a hierarchy of regexes.""" + self._unparsed_path = self._clean_dividers(self._unparsed_path) + self._log_progress("INITIAL") + self._parse_ext() + + # Parse paren tokens + self._parse_item(ISSUE_COUNT_RE) + self._parse_item(YEAR_TOKEN_RE) + self._parse_item( + ORIGINAL_FORMAT_SCAN_INFO_RE, + require_all=True, ) - if "scan_info" not in metadata: - # Start searching for scan_info after original format. - _parse_item( - data_list, - metadata, - SCAN_INFO_RE, - "scan_info", - start_index=of_index + 1, - ) - _log_progress("AFTER PAREN TOKENS", metadata, data_list) + if "original_format" not in self.metadata: + self._parse_item( + ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE, + ) + self._log_progress("AFTER PAREN TOKENS") - # Parse regular tokens - _parse_item(data_list, metadata, VOLUME_RE, "volume") - _parse_item(data_list, metadata, ISSUE_NUMBER_RE, "issue", path=str(path)) - _log_progress("AFTER REGULAR TOKENS", metadata, data_list) + # Parse regular tokens + self._parse_item(VOLUME_RE) + self._parse_item(ISSUE_NUMBER_RE) + self._log_progress("AFTER REGULAR TOKENS") - # Pickup year if not gotten. 
- if "year" not in metadata: - _parse_item(data_list, metadata, YEAR_BEGIN_RE, "year") - if "year" not in metadata: - _parse_item(data_list, metadata, YEAR_END_RE, "year") - _log_progress("AFTER YEAR PICKUP", metadata, data_list) + # Pickup year if not gotten. + if "year" not in self.metadata: + self._parse_item(YEAR_BEGIN_RE) + if "year" not in self.metadata: + self._parse_item(YEAR_END_RE) + self._log_progress("AFTER YEAR PICKUP") - # Pickup issue if it's a standalone token - if "issue" not in metadata: - _parse_item(data_list, metadata, ISSUE_TOKEN_RE, "issue") + # Pickup issue if it's a standalone token + if "issue" not in self.metadata: + self._parse_item(ISSUE_END_RE) + if "issue" not in self.metadata: + self._parse_item(ISSUE_BEGIN_RE) - _log_progress("AFTER ISSUE PICKUP", metadata, data_list) + self._log_progress("AFTER ISSUE PICKUP") - # Series and Title. Also looks for issue. - _assign_remaining_groups(data_list, metadata, str(path)) - _log_progress("AFTER SERIES AND TITLE", metadata, data_list) + # Series and Title. Also looks for issue. + self._assign_remaining_groups() + self._log_progress("AFTER SERIES AND TITLE") - # Final try for issue number. - _pickup_issue(data_list, metadata) - _log_progress("AFTER ISSUE PICKUP", metadata, data_list) + # Final try for issue number. + if "issue" not in self.metadata: + # TODO is this useful? 
+ self._parse_item(ISSUE_ANYWHERE_RE) + self._log_progress("AFTER ISSUE PICKUP") - # Add Remainders - if data_list: - metadata["remainders"] = tuple(data_list) + self._add_remainders() - return metadata + return self.metadata + + def __init__(self, path: str | Path, verbose: int = 0): + """Initialize.""" + self._debug: bool = verbose > 0 + self.metadata: dict[str, str | tuple[str, ...]] = {} + self.path_indexes: dict[str, int] = {} + # munge path + if isinstance(path, str): + path = path.strip() + p_path = Path(path) + self.path = str(p_path.name).strip() + self._unparsed_path = copy(self.path) diff --git a/comicfn2dict/regex.py b/comicfn2dict/regex.py index 73fdc45..43ae9ea 100644 --- a/comicfn2dict/regex.py +++ b/comicfn2dict/regex.py @@ -51,24 +51,27 @@ YEAR_BEGIN_RE = re_compile(r"^" + _YEAR_RE_EXP + r"\b") YEAR_END_RE = re_compile(r"\b" + _YEAR_RE_EXP + r"$") _OF_PATTERNS = r"|".join(ORIGINAL_FORMAT_PATTERNS) _ORIGINAL_FORMAT_RE_EXP = r"(?P" + _OF_PATTERNS + r")" -ORIGINAL_FORMAT_RE = re_compile(_ORIGINAL_FORMAT_RE_EXP, parenthify=True) _SCAN_INFO_RE_EXP = r"(?P[^()]+?)" -SCAN_INFO_RE = re_compile(_SCAN_INFO_RE_EXP, parenthify=True) _ORIGINAL_FORMAT_SCAN_INFO_RE_EXP = ( - _ORIGINAL_FORMAT_RE_EXP + r"(?:-" + _SCAN_INFO_RE_EXP + r")?" + _ORIGINAL_FORMAT_RE_EXP + r"\s*[\(:-]" + _SCAN_INFO_RE_EXP # + r")?" 
) ORIGINAL_FORMAT_SCAN_INFO_RE = re_compile( _ORIGINAL_FORMAT_SCAN_INFO_RE_EXP, parenthify=True ) +ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE = re_compile( + r"\(" + _ORIGINAL_FORMAT_RE_EXP + r"\).*\(" + _SCAN_INFO_RE_EXP + r"\)" +) # REGULAR TOKENS VOLUME_RE = re_compile(r"((?:v(?:ol(?:ume)?)?\.?)\s*(?P\d+))") _ISSUE_NUMBER_RE_EXP = r"(?P[\w½]+\.?\d*\w*)" ISSUE_NUMBER_RE = re_compile(r"(#" + _ISSUE_NUMBER_RE_EXP + r")") _ISSUE_RE_EXP = r"(?P[\d½]+\.?\d*\w*)" -ISSUE_TOKEN_RE = re_compile(r"^(" + _ISSUE_RE_EXP + r")$") -ISSUE_END_RE = re_compile(r"\b(" + _ISSUE_RE_EXP + r")$") -ISSUE_BEGIN_RE = re_compile(r"^(" + _ISSUE_RE_EXP + r")\b") + +ISSUE_END_RE = re_compile(r"([\/\s]" + _ISSUE_RE_EXP + r"(\/|$))") +ISSUE_BEGIN_RE = re_compile(r"((^|\/)" + _ISSUE_RE_EXP + r"[\/|\s])") + +# TODO is this used? ISSUE_ANYWHERE_RE = re_compile(r"\b(" + _ISSUE_RE_EXP + r")\b") # LONG STRINGS diff --git a/comicfn2dict/unparse.py b/comicfn2dict/unparse.py index 19761d1..74f77cb 100644 --- a/comicfn2dict/unparse.py +++ b/comicfn2dict/unparse.py @@ -28,22 +28,27 @@ _FILENAME_FORMAT_TAGS: tuple[tuple[str, str | Callable], ...] 
= ( _EMPTY_VALUES: tuple[None, str] = (None, "") -def dict2comicfn(md: Mapping, ext: bool = True) -> str | None: +def _tokenize_tag(md: Mapping, tag: str, fmt: str | Callable) -> str: + val = md.get(tag) + if val in _EMPTY_VALUES: + return "" + final_fmt = fmt(val) if isinstance(fmt, Callable) else fmt + token = final_fmt.format(val).strip() + return token + + +def serialize(md: Mapping, ext: bool = True) -> str: """Get our preferred basename from a metadata dict.""" if not md: - return None + return "" tokens = [] for tag, fmt in _FILENAME_FORMAT_TAGS: - val = md.get(tag) - if val in _EMPTY_VALUES: - continue - final_fmt = fmt(val) if isinstance(fmt, Callable) else fmt - token = final_fmt.format(val).strip() - if token: + if token := _tokenize_tag(md, tag, fmt): tokens.append(token) fn = " ".join(tokens) if remainders := md.get("remainders"): remainder = " ".join(remainders) + # TODO oh this is the - delineated remainder :( fn += f" - {remainder}" if ext: fn += "." + md.get("ext", "cbz") diff --git a/tests/comic_filenames.py b/tests/comic_filenames.py index 67e8ac5..b5bd175 100644 --- a/tests/comic_filenames.py +++ b/tests/comic_filenames.py @@ -136,8 +136,7 @@ FNS = { "year": "2006", "ext": "cbz", "scan_info": "Minutemen-Faessla", - # "original_format": "digital", - "remainders": ("(digital",), + "original_format": "digital", }, "Jeremy John 003 (2007) (4 covers) (digital) (Minutemen-Faessla).cbz": { "series": "Jeremy John", @@ -243,6 +242,7 @@ FNS = { FNS.update( # Newly fixed. { + # BIG Change. title after token. more stripping. "'Batman - Superman - World's Finest 022 (2024) (Webrip) (The Last Kryptonian-DCP).cbz": { "ext": "cbz", "issue": "022", @@ -252,6 +252,7 @@ FNS.update( # Newly fixed. 
"year": "2024", }, # Issue number starting with a letter requested in https://github.com/comictagger/comictagger/issues/543 + # word characters now allowed to lead issue numbers only if preceded by a # marker "batman #B01 title.cbz": { "ext": "cbz", "issue": "B01", @@ -261,32 +262,47 @@ FNS.update( # Newly fixed. } ) +WONFIX = { + # Leading issue number is usually an alternate sequence number + # WONTFIX: Series names may begin with numerals. + "52 action comics #2024.cbz": { + "ext": "cbz", + "issue": "2024", + "series": "action comics", + "alternate": "52", + }, + # Only the issue number. CT ensures that the series always has a value if possible + # I don't think making the series the same as the number is valuable. + "#52.cbz": { + "ext": "cbz", + "issue": "52", + "series": "52", + }, +} + +LATER = { + # 4 digit issue number + # should this be an issue number if year DONE?. + "action comics 1024.cbz": { + "ext": "cbz", + "issue": "1024", + "series": "action comics", + }, +} FNS.update( { - # Leading issue number is usually an alternate sequence number - "52 action comics #2024.cbz": { - "ext": "cbz", - "issue": "2024", - "series": "action comics", - "alternate": "52", - }, # 4 digit issue number - "action comics 1024.cbz": { - "ext": "cbz", - "issue": "1024", - "series": "action comics", - }, # Only the issue number. CT ensures that the series always has a value if possible - "#52.cbz": { - "ext": "cbz", - "issue": "52", - "series": "52", - }, # CT treats double-underscore the same as double-dash + # CT treats double-underscore the same as double-dash + # BUG: should be title right now. + # FEATURE: double dash should be a token delimiter? "Monster_Island_v1_#2__repaired__c2c.cbz": { "ext": "cbz", "issue": "2", "series": "Monster Island", "volume": "1", - }, # I'm not sure there's a right way to parse this. This might also be a madeup filename I don't remember + "remainders": ("repaired c2c",), + }, + # I'm not sure there's a right way to parse this. 
This might also be a madeup filename I don't remember "Super Strange Yarns (1957) #92 (1969).cbz": { "ext": "cbz", "issue": "92", diff --git a/tests/test_comicfn2dict.py b/tests/test_comicfn2dict.py index 82e5229..33f4d82 100644 --- a/tests/test_comicfn2dict.py +++ b/tests/test_comicfn2dict.py @@ -5,7 +5,7 @@ from types import MappingProxyType import pytest from deepdiff.diff import DeepDiff -from comicfn2dict import comicfn2dict +from comicfn2dict import ComicFilenameParser from tests.comic_filenames import FNS ALL_FIELDS = frozenset({"series", "volume", "issue", "issue_count", "year", "ext"}) @@ -16,7 +16,7 @@ FIELD_SCHEMA = MappingProxyType({key: None for key in ALL_FIELDS}) def test_parse_filename(item): """Test filename parsing.""" fn, defined_fields = item - md = comicfn2dict(fn) + md = ComicFilenameParser(fn, verbose=1).parse() diff = DeepDiff(defined_fields, md, ignore_order=True) print(fn) pprint(defined_fields) From c84d1db13d5f9821285e9ece30dd371877dbfca6 Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Tue, 20 Feb 2024 00:55:09 -0800 Subject: [PATCH 05/57] Add webrip info type, fix test for Worlds Finest --- comicfn2dict/regex.py | 6 ++---- tests/comic_filenames.py | 21 +++++++++------------ 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/comicfn2dict/regex.py b/comicfn2dict/regex.py index 43ae9ea..f3f1362 100644 --- a/comicfn2dict/regex.py +++ b/comicfn2dict/regex.py @@ -35,7 +35,7 @@ ORIGINAL_FORMAT_PATTERNS = ( r"Sketch", r"TPB", r"Trade[-\s]Paper[-\s]?Back", - r"Web([-\s]?Comic)?", + r"Web([-\s]?(Comic|Rip))?", ) @@ -51,7 +51,7 @@ YEAR_BEGIN_RE = re_compile(r"^" + _YEAR_RE_EXP + r"\b") YEAR_END_RE = re_compile(r"\b" + _YEAR_RE_EXP + r"$") _OF_PATTERNS = r"|".join(ORIGINAL_FORMAT_PATTERNS) _ORIGINAL_FORMAT_RE_EXP = r"(?P" + _OF_PATTERNS + r")" -_SCAN_INFO_RE_EXP = r"(?P[^()]+?)" +_SCAN_INFO_RE_EXP = r"(?P[^()]*)" _ORIGINAL_FORMAT_SCAN_INFO_RE_EXP = ( _ORIGINAL_FORMAT_RE_EXP + r"\s*[\(:-]" + _SCAN_INFO_RE_EXP # + r")?" 
) @@ -70,8 +70,6 @@ _ISSUE_RE_EXP = r"(?P[\d½]+\.?\d*\w*)" ISSUE_END_RE = re_compile(r"([\/\s]" + _ISSUE_RE_EXP + r"(\/|$))") ISSUE_BEGIN_RE = re_compile(r"((^|\/)" + _ISSUE_RE_EXP + r"[\/|\s])") - -# TODO is this used? ISSUE_ANYWHERE_RE = re_compile(r"\b(" + _ISSUE_RE_EXP + r")\b") # LONG STRINGS diff --git a/tests/comic_filenames.py b/tests/comic_filenames.py index b5bd175..61d38c0 100644 --- a/tests/comic_filenames.py +++ b/tests/comic_filenames.py @@ -246,9 +246,9 @@ FNS.update( # Newly fixed. "'Batman - Superman - World's Finest 022 (2024) (Webrip) (The Last Kryptonian-DCP).cbz": { "ext": "cbz", "issue": "022", - "remainders": ("(The Last Kryptonian-DCP)",), - "scan_info": "Webrip", + "original_format": "Webrip", "series": "Batman - Superman - World's Finest", + "scan_info": "The Last Kryptonian-DCP", "year": "2024", }, # Issue number starting with a letter requested in https://github.com/comictagger/comictagger/issues/543 @@ -259,6 +259,13 @@ FNS.update( # Newly fixed. "series": "batman", "title": "title", }, + "Monster_Island_v1_#2__repaired__c2c.cbz": { + "ext": "cbz", + "issue": "2", + "series": "Monster Island", + "volume": "1", + "remainders": ("repaired c2c",), + }, } ) @@ -292,16 +299,6 @@ LATER = { FNS.update( { - # CT treats double-underscore the same as double-dash - # BUG: should be title right now. - # FEATURE: double dash should be a token delimiter? - "Monster_Island_v1_#2__repaired__c2c.cbz": { - "ext": "cbz", - "issue": "2", - "series": "Monster Island", - "volume": "1", - "remainders": ("repaired c2c",), - }, # I'm not sure there's a right way to parse this. 
This might also be a madeup filename I don't remember "Super Strange Yarns (1957) #92 (1969).cbz": { "ext": "cbz", From 6f1b96c23a4f36116220bc8d4a55c1254f4d702a Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Tue, 20 Feb 2024 00:55:40 -0800 Subject: [PATCH 06/57] lint --- comicfn2dict/regex.py | 1 - 1 file changed, 1 deletion(-) diff --git a/comicfn2dict/regex.py b/comicfn2dict/regex.py index f3f1362..01bbcbe 100644 --- a/comicfn2dict/regex.py +++ b/comicfn2dict/regex.py @@ -38,7 +38,6 @@ ORIGINAL_FORMAT_PATTERNS = ( r"Web([-\s]?(Comic|Rip))?", ) - # CLEAN NON_SPACE_DIVIDER_RE = re_compile(r"[_\+]") EXTRA_SPACES_RE = re_compile(r"\s\s+") From 31fd809aee9932a4c98ed66162a9500adb0fa7f3 Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Tue, 20 Feb 2024 01:06:47 -0800 Subject: [PATCH 07/57] move working tests into newly working block --- tests/comic_filenames.py | 102 +++++++++++++++++++++------------------ 1 file changed, 56 insertions(+), 46 deletions(-) diff --git a/tests/comic_filenames.py b/tests/comic_filenames.py index 61d38c0..d1ea099 100644 --- a/tests/comic_filenames.py +++ b/tests/comic_filenames.py @@ -266,6 +266,46 @@ FNS.update( # Newly fixed. "volume": "1", "remainders": ("repaired c2c",), }, + # Extra - in the series + " X-Men-V1-#067.cbr": { + "ext": "cbr", + "issue": "067", + "series": "X-Men", + "volume": "1", + "remainders": ("-",), + }, + "Aquaman - Green Arrow - Deep Target #01 (of 07) (2021).cbr": { + "ext": "cbr", + "issue": "01", + "series": "Aquaman - Green Arrow - Deep Target", + "year": "2021", + "issue_count": "07", + }, + # CT only separates this into a title if the '-' is attached to the previous word eg 'aquaman- Green Arrow'. 
@bpepple opened a ticket for this https://github.com/ajslater/comicfn2dict/issues/1 already + "Batman_-_Superman_#020_(2021).cbr": { + "ext": "cbr", + "issue": "020", + "series": "Batman - Superman", + "year": "2021", + }, + # Publishers like to re-print some of their annuals using this format for the year + "Batman '89 (2021) .cbr": { + "ext": "cbr", + "series": "Batman '89", + "year": "2021", + }, + # This made the parser in CT much more complicated. It's understandable that this isn't parsed on the first few iterations of this project + "Star Wars - War of the Bounty Hunters - IG-88 (2021).cbz": { + "ext": "cbz", + "series": "Star Wars - War of the Bounty Hunters - IG-88", + "year": "2021", + }, # The addition of the '#1' turns this into the same as 'Aquaman - Green Arrow - Deep Target' above + "Star Wars - War of the Bounty Hunters - IG-88 #1 (2021).cbz": { + "ext": "cbz", + "issue": "1", + "series": "Star Wars - War of the Bounty Hunters - IG-88", + "year": "2021", + }, } ) @@ -295,54 +335,34 @@ LATER = { "issue": "1024", "series": "action comics", }, + # I'm not sure there's a right way to parse this. This might also be a madeup filename I don't remember + # if a year occurs after another year, and no volume, do volume / year + "Super Strange Yarns (1957) #92 (1969).cbz": { + "ext": "cbz", + "issue": "92", + "series": "Super Strange Yarns", + "volume": "1957", + "year": "1969", + }, } +# Not examined yet. FNS.update( { - # I'm not sure there's a right way to parse this. This might also be a madeup filename I don't remember - "Super Strange Yarns (1957) #92 (1969).cbz": { - "ext": "cbz", - "issue": "92", - "series": "Super Strange Yarns", - "volume": "1957", - "year": "1969", - }, # Extra - in the series - " X-Men-V1-#067.cbr": { - "ext": "cbr", - "issue": "067", - "series": "X-Men", - "volume": "1", - }, # CT only separates this into a title if the '-' is attached to the previous word eg 'aquaman- Green Arrow'. 
@bpepple opened a ticket for this https://github.com/ajslater/comicfn2dict/issues/1 already - "Aquaman - Green Arrow - Deep Target #01 (of 07) (2021).cbr": { - "ext": "cbr", - "issue": "01", - "series": "Aquaman - Green Arrow - Deep Target", - "year": "2021", - "issue_count": "7", - }, - "Batman_-_Superman_#020_(2021).cbr": { - "ext": "cbr", - "issue": "020", - "series": "Batman - Superman", - "year": "2021", - }, "Free Comic Book Day - Avengers.Hulk (2021).cbz": { "ext": "cbz", "series": "Free Comic Book Day - Avengers Hulk", "year": "2021", - }, # CT assumes the volume is also the issue number if it can't find an issue number + }, + # CT assumes the volume is also the issue number if it can't find an issue number "Avengers By Brian Michael Bendis volume 03 (2013).cbz": { "ext": "cbz", "issue": "3", "series": "Avengers By Brian Michael Bendis", "volume": "03", "year": "2013", - }, # Publishers like to re-print some of their annuals using this format for the year - "Batman '89 (2021) .cbr": { - "ext": "cbr", - "series": "Batman '89", - "year": "2021", - }, # CT has extra processing to re-attach the year in this case + }, + # CT has extra processing to re-attach the year in this case "Blade Runner Free Comic Book Day 2021 (2021).cbr": { "ext": "cbr", "series": "Blade Runner Free Comic Book Day 2021", @@ -377,18 +397,8 @@ FNS.update( "series": "Marvel Two In One", "publisher": "Marvel", "volume": "1", - }, # This made the parser in CT much more complicated. 
It's understandable that this isn't parsed on the first few iterations of this project - "Star Wars - War of the Bounty Hunters - IG-88 (2021).cbz": { - "ext": "cbz", - "series": "Star Wars - War of the Bounty Hunters - IG-88", - "year": "2021", - }, # The addition of the '#1' turns this into the same as 'Aquaman - Green Arrow - Deep Target' above - "Star Wars - War of the Bounty Hunters - IG-88 #1 (2021).cbz": { - "ext": "cbz", - "issue": "1", - "series": "Star Wars - War of the Bounty Hunters - IG-88", - "year": "2021", - }, # CT treats '[]' as equivalent to '()', catches DC as a publisher and 'Sep-Oct 1951' as dates and removes them. CT doesn't catch the digital though so that could be better but I blame whoever made this atrocious filename + }, + # CT treats '[]' as equivalent to '()', catches DC as a publisher and 'Sep-Oct 1951' as dates and removes them. CT doesn't catch the digital though so that could be better but I blame whoever made this atrocious filename "Wonder Woman #49 DC Sep-Oct 1951 digital [downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire).cbz": { "ext": "cbz", "issue": "49", From b510864c30b426f446d748d6548d1228953fa2aa Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Tue, 20 Feb 2024 01:09:51 -0800 Subject: [PATCH 08/57] move wontfix block --- tests/comic_filenames.py | 60 +++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/tests/comic_filenames.py b/tests/comic_filenames.py index d1ea099..4d67eaa 100644 --- a/tests/comic_filenames.py +++ b/tests/comic_filenames.py @@ -29,6 +29,7 @@ TEST_COMIC_VOL_ONLY = { "ext": "cbr", } +# Working with 0.1.0 FNS = { "Night of 1000 Wolves 001 (2013).cbz": { "series": "Night of 1000 Wolves", @@ -79,15 +80,6 @@ FNS = { "title": "The Smell of Burnt Toast", "original_format": "digital", }, - "Bardude - The Last Thing I Remember.cbz": { - "series": "Bardude - The Last Thing I Remember", - "ext": "cbz", - }, - "Drunkguy - The Man Without Fear - 
01.cbz": { - "series": "Drunkguy - The Man Without Fear", - "issue": "01", - "ext": "cbz", - }, "The_Arkenstone_v03_(2002)_(Digital)_(DR_&_Quenya-Elves).cbr": { "series": "The Arkenstone", "volume": "03", @@ -240,8 +232,19 @@ FNS = { }, } -FNS.update( # Newly fixed. +# Fixed with 0.2.0 +FNS.update( { + # Philosopy change regarding dashes. + "Bardude - The Last Thing I Remember.cbz": { + "series": "Bardude - The Last Thing I Remember", + "ext": "cbz", + }, + "Drunkguy - The Man Without Fear - 01.cbz": { + "series": "Drunkguy - The Man Without Fear", + "issue": "01", + "ext": "cbz", + }, # BIG Change. title after token. more stripping. "'Batman - Superman - World's Finest 022 (2024) (Webrip) (The Last Kryptonian-DCP).cbz": { "ext": "cbz", @@ -308,25 +311,6 @@ FNS.update( # Newly fixed. }, } ) - -WONFIX = { - # Leading issue number is usually an alternate sequence number - # WONTFIX: Series names may begin with numerals. - "52 action comics #2024.cbz": { - "ext": "cbz", - "issue": "2024", - "series": "action comics", - "alternate": "52", - }, - # Only the issue number. CT ensures that the series always has a value if possible - # I don't think making the series the same as the number is valuable. - "#52.cbz": { - "ext": "cbz", - "issue": "52", - "series": "52", - }, -} - LATER = { # 4 digit issue number # should this be an issue number if year DONE?. @@ -429,3 +413,21 @@ FNS.update( }, } ) + +WONFIX = { + # Leading issue number is usually an alternate sequence number + # WONTFIX: Series names may begin with numerals. + "52 action comics #2024.cbz": { + "ext": "cbz", + "issue": "2024", + "series": "action comics", + "alternate": "52", + }, + # Only the issue number. CT ensures that the series always has a value if possible + # I don't think making the series the same as the number is valuable. 
+ "#52.cbz": { + "ext": "cbz", + "issue": "52", + "series": "52", + }, +} From fd4b4bc99c99bcc9b2a5dad00021890472daf07e Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Tue, 20 Feb 2024 12:45:42 -0800 Subject: [PATCH 09/57] make parse_index a lazy function. re-add old 0.1.0 API --- comicfn2dict/comicfn2dict.py | 4 ++-- comicfn2dict/parse.py | 37 +++++++++++++++++++++++++----------- comicfn2dict/unparse.py | 5 +++++ 3 files changed, 33 insertions(+), 13 deletions(-) diff --git a/comicfn2dict/comicfn2dict.py b/comicfn2dict/comicfn2dict.py index 2beb9f5..6399337 100644 --- a/comicfn2dict/comicfn2dict.py +++ b/comicfn2dict/comicfn2dict.py @@ -1,3 +1,3 @@ """API import source.""" -from comicfn2dict.parse import ComicFilenameParser # noqa: F401 -from comicfn2dict.unparse import dict2comicfn # noqa: F401 +from comicfn2dict.parse import comicfn2dict, ComicFilenameParser # noqa: F401 +from comicfn2dict.unparse import dict2comicfn, serialize # noqa: F401 diff --git a/comicfn2dict/parse.py b/comicfn2dict/parse.py index 327a9ec..4c01a60 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -34,13 +34,27 @@ class ComicFilenameParser: data = NON_SPACE_DIVIDER_RE.sub(" ", data) return EXTRA_SPACES_RE.sub(" ", data).strip() + def path_index(self, key: str): + """Retrieve and memoize the key's location in the path.""" + if key == "remainders": + return -1 + value: str = self.metadata.get(key, "") # type: ignore + if not value: + return -1 + if value not in self._path_indexes: + if key == "ext": + index = self.path.rfind(value) + else: + index = self.path.find(value) + self._path_indexes[value] = index + return self._path_indexes[value] + def _parse_ext(self): """Pop the extension from the pathname.""" path = Path(self._unparsed_path) suffix = path.suffix if not suffix: return - self.path_indexes["ext"] = self.path.rfind(suffix) data = path.name.removesuffix(suffix) ext = suffix.lstrip(".") @@ -65,17 +79,14 @@ class ComicFilenameParser: if not matches: return 
matched_metadata = {} - matched_path_indexes = {} for key, value in matches.groupdict().items(): if not value: if require_all: return continue - matched_path_indexes[key] = self.path.find(value) - # TODO idk if strip is necceesary here + # TODO idk if strip is necessary here matched_metadata[key] = self._grouping_operators_strip(value) self.metadata.update(matched_metadata) - self.path_indexes.update(matched_path_indexes) marked_str = regex.sub(_TOKEN_DELIMETER, self._unparsed_path) parts = [] @@ -89,7 +100,7 @@ class ComicFilenameParser: title_index = self.path.find(value) # Does a series come first. - if title_index < self.path_indexes.get("series", -1): + if title_index < self.path_index("series"): return False # If other tokens exist then they much precede the title. @@ -97,7 +108,7 @@ class ComicFilenameParser: other_tokens_exist = False for preceding_key in _TITLE_PRECEDING_KEYS: other_tokens_exist = True - if title_index > self.path_indexes.get(preceding_key, -1): + if title_index > self.path_index(preceding_key): title_ok = True break return title_ok or not other_tokens_exist @@ -124,7 +135,6 @@ class ComicFilenameParser: continue value = self._grouping_operators_strip(value) self.metadata[key] = value - self.path_indexes[key] = self.path.find(value) remaining_key_index += 1 else: unused_tokens.append(token) @@ -147,7 +157,7 @@ class ComicFilenameParser: print(label + ":") combined = {} for key in self.metadata: - combined[key] = (self.metadata.get(key), self.path_indexes.get(key)) + combined[key] = (self.metadata.get(key), self.path_index(key)) pprint(combined) print(self._unparsed_path) @@ -207,11 +217,16 @@ class ComicFilenameParser: def __init__(self, path: str | Path, verbose: int = 0): """Initialize.""" self._debug: bool = verbose > 0 - self.metadata: dict[str, str | tuple[str, ...]] = {} - self.path_indexes: dict[str, int] = {} # munge path if isinstance(path, str): path = path.strip() p_path = Path(path) self.path = str(p_path.name).strip() + 
self.metadata: dict[str, str | tuple[str, ...]] = {} self._unparsed_path = copy(self.path) + self._path_indexes: dict[str, int] = {} + + +def comicfn2dict(path: str | Path): + """Simple API.""" + return ComicFilenameParser(path).parse() diff --git a/comicfn2dict/unparse.py b/comicfn2dict/unparse.py index 74f77cb..3d32bde 100644 --- a/comicfn2dict/unparse.py +++ b/comicfn2dict/unparse.py @@ -53,3 +53,8 @@ def serialize(md: Mapping, ext: bool = True) -> str: if ext: fn += "." + md.get("ext", "cbz") return fn + + +def dict2comicfn(md: Mapping, ext: bool = True) -> str: + """Simple API.""" + return serialize(md, ext=ext) From 36729799a0e488eaa609714d21ddd6fc576cbdd6 Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Tue, 20 Feb 2024 12:46:42 -0800 Subject: [PATCH 10/57] update deps --- package-lock.json | 99 ++++++++++++++++++++++---------------- poetry.lock | 120 +++++++++++++++++++++++----------------------- 2 files changed, 119 insertions(+), 100 deletions(-) diff --git a/package-lock.json b/package-lock.json index 15c9aed..0455f0e 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1195,10 +1195,13 @@ } }, "node_modules/available-typed-arrays": { - "version": "1.0.6", - "resolved": "https://registry.npmjs.org/available-typed-arrays/-/available-typed-arrays-1.0.6.tgz", - "integrity": "sha512-j1QzY8iPNPG4o4xmO3ptzpRxTciqD3MgEHtifP/YnJpIo58Xu+ne4BejlbkuaLfXn/nz6HFiw29bLpj2PNMdGg==", + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/available-typed-arrays/-/available-typed-arrays-1.0.7.tgz", + "integrity": "sha512-wvUjBtSGN7+7SjNpq/9M2Tg350UZD3q62IFZLbRAR1bSMlCo1ZaeW+BJ+D090e4hIIZLBcTDWe4Mh4jvUDajzQ==", "dev": true, + "dependencies": { + "possible-typed-array-names": "^1.0.0" + }, "engines": { "node": ">= 0.4" }, @@ -1777,9 +1780,9 @@ "dev": true }, "node_modules/electron-to-chromium": { - "version": "1.4.673", - "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.4.673.tgz", - "integrity": 
"sha512-zjqzx4N7xGdl5468G+vcgzDhaHkaYgVcf9MqgexcTqsl2UHSCmOj/Bi3HAprg4BZCpC7HyD8a6nZl6QAZf72gw==", + "version": "1.4.677", + "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.4.677.tgz", + "integrity": "sha512-erDa3CaDzwJOpyvfKhOiJjBVNnMM0qxHq47RheVVwsSQrgBA9ZSGV9kdaOfZDPXcHzhG7lBxhj6A7KvfLJBd6Q==", "dev": true }, "node_modules/emoji-regex": { @@ -1880,14 +1883,14 @@ } }, "node_modules/es-set-tostringtag": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.0.2.tgz", - "integrity": "sha512-BuDyupZt65P9D2D2vA/zqcI3G5xRsklm5N3xCwuiy+/vKy8i0ifdsQP1sLgO4tZDSCaQUSnmC48khknGMV3D2Q==", + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.0.3.tgz", + "integrity": "sha512-3T8uNMC3OQTHkFUsFq8r/BwAXLHvU/9O9mE0fBc/MY5iq/8H7ncvO947LmYA6ldWw9Uh8Yhf25zu6n7nML5QWQ==", "dev": true, "dependencies": { - "get-intrinsic": "^1.2.2", - "has-tostringtag": "^1.0.0", - "hasown": "^2.0.0" + "get-intrinsic": "^1.2.4", + "has-tostringtag": "^1.0.2", + "hasown": "^2.0.1" }, "engines": { "node": ">= 0.4" @@ -2736,9 +2739,9 @@ } }, "node_modules/flatted": { - "version": "3.2.9", - "resolved": "https://registry.npmjs.org/flatted/-/flatted-3.2.9.tgz", - "integrity": "sha512-36yxDn5H7OFZQla0/jFJmbIKTdZAQHngCedGxiMmpNfEZM0sdEeT+WczLQrjK6D7o2aiyLYDnkw0R3JK0Qv1RQ==", + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/flatted/-/flatted-3.3.0.tgz", + "integrity": "sha512-noqGuLw158+DuD9UPRKHpJ2hGxpFyDlYYrfM0mWt4XhT4n0lwzTLh70Tkdyy4kyTmyTT9Bv7bWAJqw7cgkEXDg==", "dev": true }, "node_modules/for-each": { @@ -3042,9 +3045,9 @@ } }, "node_modules/has-proto": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/has-proto/-/has-proto-1.0.1.tgz", - "integrity": "sha512-7qE+iP+O+bgF9clE5+UoBFzE65mlBiVj3tKCrlNQ0Ogwm0BjpT/gK4SlLYDMybDh5I3TCTKnPPa0oMG7JDYrhg==", + "version": "1.0.3", + "resolved": 
"https://registry.npmjs.org/has-proto/-/has-proto-1.0.3.tgz", + "integrity": "sha512-SJ1amZAJUiZS+PhsVLf5tGydlaVB8EdFpaSO4gmiUKUOxk8qzn5AIy4ZeJUmh22znIdk/uMAUT2pl3FxzVUH+Q==", "dev": true, "engines": { "node": ">= 0.4" @@ -3429,9 +3432,9 @@ } }, "node_modules/is-negative-zero": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/is-negative-zero/-/is-negative-zero-2.0.2.tgz", - "integrity": "sha512-dqJvarLawXsFbNDeJW7zAz8ItJ9cd28YufuuFzh0G8pNHjJMnY08Dv7sYX2uF5UpQOwieAeOExEYAWWfu7ZZUA==", + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/is-negative-zero/-/is-negative-zero-2.0.3.tgz", + "integrity": "sha512-5KoIu2Ngpyek75jXodFvnafB6DJgr3u8uuK0LEZJjrU19DrMD3EVERaR8sjz8CCGgpZvxPl9SuE1GMVPFHx1mw==", "dev": true, "engines": { "node": ">= 0.4" @@ -6230,6 +6233,15 @@ "node": ">=4" } }, + "node_modules/possible-typed-array-names": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/possible-typed-array-names/-/possible-typed-array-names-1.0.0.tgz", + "integrity": "sha512-d7Uw+eZoloe0EHDIYoe+bQ5WXnGMOpmiZFTuMWCwpjzzkL2nTjcKiAk4hh8TjnGye2TwWOk3UXucZ+3rbmBa8Q==", + "dev": true, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/prelude-ls": { "version": "1.2.1", "resolved": "https://registry.npmjs.org/prelude-ls/-/prelude-ls-1.2.1.tgz", @@ -11022,14 +11034,15 @@ } }, "node_modules/set-function-name": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/set-function-name/-/set-function-name-2.0.1.tgz", - "integrity": "sha512-tMNCiqYVkXIZgc2Hnoy2IvC/f8ezc5koaRFkCjrpWzGpCd3qbZXPzVy9MAZzK1ch/X0jvSkojys3oqJN0qCmdA==", + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/set-function-name/-/set-function-name-2.0.2.tgz", + "integrity": "sha512-7PGFlmtwsEADb0WYyvCMa1t+yke6daIG4Wirafur5kcf+MhUnPms1UeR0CKQdTZD81yESwMHbtn+TR+dMviakQ==", "dev": true, "dependencies": { - "define-data-property": "^1.0.1", + "define-data-property": "^1.1.4", + "es-errors": "^1.3.0", "functions-have-names": "^1.2.3", - 
"has-property-descriptors": "^1.0.0" + "has-property-descriptors": "^1.0.2" }, "engines": { "node": ">= 0.4" @@ -11522,12 +11535,12 @@ } }, "node_modules/typed-array-buffer": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/typed-array-buffer/-/typed-array-buffer-1.0.1.tgz", - "integrity": "sha512-RSqu1UEuSlrBhHTWC8O9FnPjOduNs4M7rJ4pRKoEjtx1zUNOPN2sSXHLDX+Y2WPbHIxbvg4JFo2DNAEfPIKWoQ==", + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/typed-array-buffer/-/typed-array-buffer-1.0.2.tgz", + "integrity": "sha512-gEymJYKZtKXzzBzM4jqa9w6Q1Jjm7x2d+sh19AdsD4wqnMPDYyvwpsIc2Q/835kHuo3BEQ7CjelGhfTsoBb2MQ==", "dev": true, "dependencies": { - "call-bind": "^1.0.6", + "call-bind": "^1.0.7", "es-errors": "^1.3.0", "is-typed-array": "^1.1.13" }, @@ -11554,16 +11567,16 @@ } }, "node_modules/typed-array-byte-offset": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/typed-array-byte-offset/-/typed-array-byte-offset-1.0.1.tgz", - "integrity": "sha512-tcqKMrTRXjqvHN9S3553NPCaGL0VPgFI92lXszmrE8DMhiDPLBYLlvo8Uu4WZAAX/aGqp/T1sbA4ph8EWjDF9Q==", + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/typed-array-byte-offset/-/typed-array-byte-offset-1.0.2.tgz", + "integrity": "sha512-Ous0vodHa56FviZucS2E63zkgtgrACj7omjwd/8lTEMEPFFyjfixMZ1ZXenpgCFBBt4EC1J2XsyVS2gkG0eTFA==", "dev": true, "dependencies": { - "available-typed-arrays": "^1.0.6", + "available-typed-arrays": "^1.0.7", "call-bind": "^1.0.7", "for-each": "^0.3.3", "gopd": "^1.0.1", - "has-proto": "^1.0.1", + "has-proto": "^1.0.3", "is-typed-array": "^1.1.13" }, "engines": { @@ -11574,14 +11587,20 @@ } }, "node_modules/typed-array-length": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/typed-array-length/-/typed-array-length-1.0.4.tgz", - "integrity": "sha512-KjZypGq+I/H7HI5HlOoGHkWUUGq+Q0TPhQurLbyrVrvnKTBgzLhIJ7j6J/XTQOi0d1RjyZ0wdas8bKs2p0x3Ng==", + "version": "1.0.5", + "resolved": 
"https://registry.npmjs.org/typed-array-length/-/typed-array-length-1.0.5.tgz", + "integrity": "sha512-yMi0PlwuznKHxKmcpoOdeLwxBoVPkqZxd7q2FgMkmD3bNwvF5VW0+UlUQ1k1vmktTu4Yu13Q0RIxEP8+B+wloA==", "dev": true, "dependencies": { - "call-bind": "^1.0.2", + "call-bind": "^1.0.7", "for-each": "^0.3.3", - "is-typed-array": "^1.1.9" + "gopd": "^1.0.1", + "has-proto": "^1.0.3", + "is-typed-array": "^1.1.13", + "possible-typed-array-names": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" }, "funding": { "url": "https://github.com/sponsors/ljharb" diff --git a/poetry.lock b/poetry.lock index 407bbc2..efd8604 100644 --- a/poetry.lock +++ b/poetry.lock @@ -61,63 +61,63 @@ files = [ [[package]] name = "coverage" -version = "7.4.1" +version = "7.4.2" description = "Code coverage measurement for Python" optional = false python-versions = ">=3.8" files = [ - { file = "coverage-7.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:077d366e724f24fc02dbfe9d946534357fda71af9764ff99d73c3c596001bbd7" }, - { file = "coverage-7.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0193657651f5399d433c92f8ae264aff31fc1d066deee4b831549526433f3f61" }, - { file = "coverage-7.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d17bbc946f52ca67adf72a5ee783cd7cd3477f8f8796f59b4974a9b59cacc9ee" }, - { file = "coverage-7.4.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3277f5fa7483c927fe3a7b017b39351610265308f5267ac6d4c2b64cc1d8d25" }, - { file = "coverage-7.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6dceb61d40cbfcf45f51e59933c784a50846dc03211054bd76b421a713dcdf19" }, - { file = "coverage-7.4.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6008adeca04a445ea6ef31b2cbaf1d01d02986047606f7da266629afee982630" }, - { file = "coverage-7.4.1-cp310-cp310-musllinux_1_1_i686.whl", hash = 
"sha256:c61f66d93d712f6e03369b6a7769233bfda880b12f417eefdd4f16d1deb2fc4c" }, - { file = "coverage-7.4.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b9bb62fac84d5f2ff523304e59e5c439955fb3b7f44e3d7b2085184db74d733b" }, - { file = "coverage-7.4.1-cp310-cp310-win32.whl", hash = "sha256:f86f368e1c7ce897bf2457b9eb61169a44e2ef797099fb5728482b8d69f3f016" }, - { file = "coverage-7.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:869b5046d41abfea3e381dd143407b0d29b8282a904a19cb908fa24d090cc018" }, - { file = "coverage-7.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b8ffb498a83d7e0305968289441914154fb0ef5d8b3157df02a90c6695978295" }, - { file = "coverage-7.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3cacfaefe6089d477264001f90f55b7881ba615953414999c46cc9713ff93c8c" }, - { file = "coverage-7.4.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d6850e6e36e332d5511a48a251790ddc545e16e8beaf046c03985c69ccb2676" }, - { file = "coverage-7.4.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:18e961aa13b6d47f758cc5879383d27b5b3f3dcd9ce8cdbfdc2571fe86feb4dd" }, - { file = "coverage-7.4.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dfd1e1b9f0898817babf840b77ce9fe655ecbe8b1b327983df485b30df8cc011" }, - { file = "coverage-7.4.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6b00e21f86598b6330f0019b40fb397e705135040dbedc2ca9a93c7441178e74" }, - { file = "coverage-7.4.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:536d609c6963c50055bab766d9951b6c394759190d03311f3e9fcf194ca909e1" }, - { file = "coverage-7.4.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7ac8f8eb153724f84885a1374999b7e45734bf93a87d8df1e7ce2146860edef6" }, - { file = "coverage-7.4.1-cp311-cp311-win32.whl", hash = "sha256:f3771b23bb3675a06f5d885c3630b1d01ea6cac9e84a01aaf5508706dba546c5" }, - { file = 
"coverage-7.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:9d2f9d4cc2a53b38cabc2d6d80f7f9b7e3da26b2f53d48f05876fef7956b6968" }, - { file = "coverage-7.4.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f68ef3660677e6624c8cace943e4765545f8191313a07288a53d3da188bd8581" }, - { file = "coverage-7.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:23b27b8a698e749b61809fb637eb98ebf0e505710ec46a8aa6f1be7dc0dc43a6" }, - { file = "coverage-7.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e3424c554391dc9ef4a92ad28665756566a28fecf47308f91841f6c49288e66" }, - { file = "coverage-7.4.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e0860a348bf7004c812c8368d1fc7f77fe8e4c095d661a579196a9533778e156" }, - { file = "coverage-7.4.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe558371c1bdf3b8fa03e097c523fb9645b8730399c14fe7721ee9c9e2a545d3" }, - { file = "coverage-7.4.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:3468cc8720402af37b6c6e7e2a9cdb9f6c16c728638a2ebc768ba1ef6f26c3a1" }, - { file = "coverage-7.4.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:02f2edb575d62172aa28fe00efe821ae31f25dc3d589055b3fb64d51e52e4ab1" }, - { file = "coverage-7.4.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ca6e61dc52f601d1d224526360cdeab0d0712ec104a2ce6cc5ccef6ed9a233bc" }, - { file = "coverage-7.4.1-cp312-cp312-win32.whl", hash = "sha256:ca7b26a5e456a843b9b6683eada193fc1f65c761b3a473941efe5a291f604c74" }, - { file = "coverage-7.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:85ccc5fa54c2ed64bd91ed3b4a627b9cce04646a659512a051fa82a92c04a448" }, - { file = "coverage-7.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8bdb0285a0202888d19ec6b6d23d5990410decb932b709f2b0dfe216d031d218" }, - { file = "coverage-7.4.1-cp38-cp38-macosx_11_0_arm64.whl", hash = 
"sha256:918440dea04521f499721c039863ef95433314b1db00ff826a02580c1f503e45" }, - { file = "coverage-7.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:379d4c7abad5afbe9d88cc31ea8ca262296480a86af945b08214eb1a556a3e4d" }, - { file = "coverage-7.4.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b094116f0b6155e36a304ff912f89bbb5067157aff5f94060ff20bbabdc8da06" }, - { file = "coverage-7.4.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2f5968608b1fe2a1d00d01ad1017ee27efd99b3437e08b83ded9b7af3f6f766" }, - { file = "coverage-7.4.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:10e88e7f41e6197ea0429ae18f21ff521d4f4490aa33048f6c6f94c6045a6a75" }, - { file = "coverage-7.4.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a4a3907011d39dbc3e37bdc5df0a8c93853c369039b59efa33a7b6669de04c60" }, - { file = "coverage-7.4.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6d224f0c4c9c98290a6990259073f496fcec1b5cc613eecbd22786d398ded3ad" }, - { file = "coverage-7.4.1-cp38-cp38-win32.whl", hash = "sha256:23f5881362dcb0e1a92b84b3c2809bdc90db892332daab81ad8f642d8ed55042" }, - { file = "coverage-7.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:a07f61fc452c43cd5328b392e52555f7d1952400a1ad09086c4a8addccbd138d" }, - { file = "coverage-7.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8e738a492b6221f8dcf281b67129510835461132b03024830ac0e554311a5c54" }, - { file = "coverage-7.4.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:46342fed0fff72efcda77040b14728049200cbba1279e0bf1188f1f2078c1d70" }, - { file = "coverage-7.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9641e21670c68c7e57d2053ddf6c443e4f0a6e18e547e86af3fad0795414a628" }, - { file = "coverage-7.4.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:aeb2c2688ed93b027eb0d26aa188ada34acb22dceea256d76390eea135083950" }, - { file = "coverage-7.4.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d12c923757de24e4e2110cf8832d83a886a4cf215c6e61ed506006872b43a6d1" }, - { file = "coverage-7.4.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0491275c3b9971cdbd28a4595c2cb5838f08036bca31765bad5e17edf900b2c7" }, - { file = "coverage-7.4.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:8dfc5e195bbef80aabd81596ef52a1277ee7143fe419efc3c4d8ba2754671756" }, - { file = "coverage-7.4.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:1a78b656a4d12b0490ca72651fe4d9f5e07e3c6461063a9b6265ee45eb2bdd35" }, - { file = "coverage-7.4.1-cp39-cp39-win32.whl", hash = "sha256:f90515974b39f4dea2f27c0959688621b46d96d5a626cf9c53dbc653a895c05c" }, - { file = "coverage-7.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:64e723ca82a84053dd7bfcc986bdb34af8d9da83c521c19d6b472bc6880e191a" }, - { file = "coverage-7.4.1-pp38.pp39.pp310-none-any.whl", hash = "sha256:32a8d985462e37cfdab611a6f95b09d7c091d07668fdc26e47a725ee575fe166" }, - { file = "coverage-7.4.1.tar.gz", hash = "sha256:1ed4b95480952b1a26d863e546fa5094564aa0065e1e5f0d4d0041f293251d04" }, + { file = "coverage-7.4.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bf54c3e089179d9d23900e3efc86d46e4431188d9a657f345410eecdd0151f50" }, + { file = "coverage-7.4.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fe6e43c8b510719b48af7db9631b5fbac910ade4bd90e6378c85ac5ac706382c" }, + { file = "coverage-7.4.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b98c89db1b150d851a7840142d60d01d07677a18f0f46836e691c38134ed18b" }, + { file = "coverage-7.4.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c5f9683be6a5b19cd776ee4e2f2ffb411424819c69afab6b2db3a0a364ec6642" }, + { file = 
"coverage-7.4.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78cdcbf7b9cb83fe047ee09298e25b1cd1636824067166dc97ad0543b079d22f" }, + { file = "coverage-7.4.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:2599972b21911111114100d362aea9e70a88b258400672626efa2b9e2179609c" }, + { file = "coverage-7.4.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ef00d31b7569ed3cb2036f26565f1984b9fc08541731ce01012b02a4c238bf03" }, + { file = "coverage-7.4.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:20a875bfd8c282985c4720c32aa05056f77a68e6d8bbc5fe8632c5860ee0b49b" }, + { file = "coverage-7.4.2-cp310-cp310-win32.whl", hash = "sha256:b3f2b1eb229f23c82898eedfc3296137cf1f16bb145ceab3edfd17cbde273fb7" }, + { file = "coverage-7.4.2-cp310-cp310-win_amd64.whl", hash = "sha256:7df95fdd1432a5d2675ce630fef5f239939e2b3610fe2f2b5bf21fa505256fa3" }, + { file = "coverage-7.4.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a8ddbd158e069dded57738ea69b9744525181e99974c899b39f75b2b29a624e2" }, + { file = "coverage-7.4.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81a5fb41b0d24447a47543b749adc34d45a2cf77b48ca74e5bf3de60a7bd9edc" }, + { file = "coverage-7.4.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2412e98e70f16243be41d20836abd5f3f32edef07cbf8f407f1b6e1ceae783ac" }, + { file = "coverage-7.4.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ddb79414c15c6f03f56cc68fa06994f047cf20207c31b5dad3f6bab54a0f66ef" }, + { file = "coverage-7.4.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cf89ab85027427d351f1de918aff4b43f4eb5f33aff6835ed30322a86ac29c9e" }, + { file = "coverage-7.4.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a178b7b1ac0f1530bb28d2e51f88c0bab3e5949835851a60dda80bff6052510c" }, + { file = 
"coverage-7.4.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:06fe398145a2e91edaf1ab4eee66149c6776c6b25b136f4a86fcbbb09512fd10" }, + { file = "coverage-7.4.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:18cac867950943fe93d6cd56a67eb7dcd2d4a781a40f4c1e25d6f1ed98721a55" }, + { file = "coverage-7.4.2-cp311-cp311-win32.whl", hash = "sha256:f72cdd2586f9a769570d4b5714a3837b3a59a53b096bb954f1811f6a0afad305" }, + { file = "coverage-7.4.2-cp311-cp311-win_amd64.whl", hash = "sha256:d779a48fac416387dd5673fc5b2d6bd903ed903faaa3247dc1865c65eaa5a93e" }, + { file = "coverage-7.4.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:adbdfcda2469d188d79771d5696dc54fab98a16d2ef7e0875013b5f56a251047" }, + { file = "coverage-7.4.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ac4bab32f396b03ebecfcf2971668da9275b3bb5f81b3b6ba96622f4ef3f6e17" }, + { file = "coverage-7.4.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:006d220ba2e1a45f1de083d5022d4955abb0aedd78904cd5a779b955b019ec73" }, + { file = "coverage-7.4.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3733545eb294e5ad274abe131d1e7e7de4ba17a144505c12feca48803fea5f64" }, + { file = "coverage-7.4.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42a9e754aa250fe61f0f99986399cec086d7e7a01dd82fd863a20af34cbce962" }, + { file = "coverage-7.4.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:2ed37e16cf35c8d6e0b430254574b8edd242a367a1b1531bd1adc99c6a5e00fe" }, + { file = "coverage-7.4.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:b953275d4edfab6cc0ed7139fa773dfb89e81fee1569a932f6020ce7c6da0e8f" }, + { file = "coverage-7.4.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:32b4ab7e6c924f945cbae5392832e93e4ceb81483fd6dc4aa8fb1a97b9d3e0e1" }, + { file = "coverage-7.4.2-cp312-cp312-win32.whl", hash = 
"sha256:f5df76c58977bc35a49515b2fbba84a1d952ff0ec784a4070334dfbec28a2def" }, + { file = "coverage-7.4.2-cp312-cp312-win_amd64.whl", hash = "sha256:34423abbaad70fea9d0164add189eabaea679068ebdf693baa5c02d03e7db244" }, + { file = "coverage-7.4.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5b11f9c6587668e495cc7365f85c93bed34c3a81f9f08b0920b87a89acc13469" }, + { file = "coverage-7.4.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:51593a1f05c39332f623d64d910445fdec3d2ac2d96b37ce7f331882d5678ddf" }, + { file = "coverage-7.4.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69f1665165ba2fe7614e2f0c1aed71e14d83510bf67e2ee13df467d1c08bf1e8" }, + { file = "coverage-7.4.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3c8bbb95a699c80a167478478efe5e09ad31680931ec280bf2087905e3b95ec" }, + { file = "coverage-7.4.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:175f56572f25e1e1201d2b3e07b71ca4d201bf0b9cb8fad3f1dfae6a4188de86" }, + { file = "coverage-7.4.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:8562ca91e8c40864942615b1d0b12289d3e745e6b2da901d133f52f2d510a1e3" }, + { file = "coverage-7.4.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:d9a1ef0f173e1a19738f154fb3644f90d0ada56fe6c9b422f992b04266c55d5a" }, + { file = "coverage-7.4.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f40ac873045db4fd98a6f40387d242bde2708a3f8167bd967ccd43ad46394ba2" }, + { file = "coverage-7.4.2-cp38-cp38-win32.whl", hash = "sha256:d1b750a8409bec61caa7824bfd64a8074b6d2d420433f64c161a8335796c7c6b" }, + { file = "coverage-7.4.2-cp38-cp38-win_amd64.whl", hash = "sha256:b4ae777bebaed89e3a7e80c4a03fac434a98a8abb5251b2a957d38fe3fd30088" }, + { file = "coverage-7.4.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3ff7f92ae5a456101ca8f48387fd3c56eb96353588e686286f50633a611afc95" }, + { file = 
"coverage-7.4.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:861d75402269ffda0b33af94694b8e0703563116b04c681b1832903fac8fd647" }, + { file = "coverage-7.4.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3507427d83fa961cbd73f11140f4a5ce84208d31756f7238d6257b2d3d868405" }, + { file = "coverage-7.4.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bf711d517e21fb5bc429f5c4308fbc430a8585ff2a43e88540264ae87871e36a" }, + { file = "coverage-7.4.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c00e54f0bd258ab25e7f731ca1d5144b0bf7bec0051abccd2bdcff65fa3262c9" }, + { file = "coverage-7.4.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f8e845d894e39fb53834da826078f6dc1a933b32b1478cf437007367efaf6f6a" }, + { file = "coverage-7.4.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:840456cb1067dc350af9080298c7c2cfdddcedc1cb1e0b30dceecdaf7be1a2d3" }, + { file = "coverage-7.4.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c11ca2df2206a4e3e4c4567f52594637392ed05d7c7fb73b4ea1c658ba560265" }, + { file = "coverage-7.4.2-cp39-cp39-win32.whl", hash = "sha256:3ff5bdb08d8938d336ce4088ca1a1e4b6c8cd3bef8bb3a4c0eb2f37406e49643" }, + { file = "coverage-7.4.2-cp39-cp39-win_amd64.whl", hash = "sha256:ac9e95cefcf044c98d4e2c829cd0669918585755dd9a92e28a1a7012322d0a95" }, + { file = "coverage-7.4.2-pp38.pp39.pp310-none-any.whl", hash = "sha256:f593a4a90118d99014517c2679e04a4ef5aee2d81aa05c26c734d271065efcb6" }, + { file = "coverage-7.4.2.tar.gz", hash = "sha256:1a5ee18e3a8d766075ce9314ed1cb695414bae67df6a4b0805f5137d93d6f1cb" }, ] [package.dependencies] @@ -328,13 +328,13 @@ six = ">=1.13.0" [[package]] name = "json5" -version = "0.9.14" +version = "0.9.17" description = "A Python implementation of the JSON5 data format." 
optional = false -python-versions = "*" +python-versions = ">=3.8" files = [ - { file = "json5-0.9.14-py2.py3-none-any.whl", hash = "sha256:740c7f1b9e584a468dbb2939d8d458db3427f2c93ae2139d05f47e453eae964f" }, - { file = "json5-0.9.14.tar.gz", hash = "sha256:9ed66c3a6ca3510a976a9ef9b8c0787de24802724ab1860bc0153c7fdd589b02" }, + { file = "json5-0.9.17-py2.py3-none-any.whl", hash = "sha256:f8ec1ecf985951d70f780f6f877c4baca6a47b6e61e02c4cd190138d10a7805a" }, + { file = "json5-0.9.17.tar.gz", hash = "sha256:717d99d657fa71b7094877b1d921b1cce40ab444389f6d770302563bb7dfd9ae" }, ] [package.extras] @@ -521,13 +521,13 @@ test = ["pytest"] [[package]] name = "pyright" -version = "1.1.350" +version = "1.1.351" description = "Command line wrapper for pyright" optional = false python-versions = ">=3.7" files = [ - { file = "pyright-1.1.350-py3-none-any.whl", hash = "sha256:f1dde6bcefd3c90aedbe9dd1c573e4c1ddbca8c74bf4fa664dd3b1a599ac9a66" }, - { file = "pyright-1.1.350.tar.gz", hash = "sha256:a8ba676de3a3737ea4d8590604da548d4498cc5ee9ee00b1a403c6db987916c6" }, + { file = "pyright-1.1.351-py3-none-any.whl", hash = "sha256:83b44b25396ae20661fc5f133c3fce30928ff1296d4f2e5ff0bca5fcf03eb89d" }, + { file = "pyright-1.1.351.tar.gz", hash = "sha256:01124099714eebd7f6525d8cbfa350626b56dfaf771cfcd55c03e69f0f1efbbd" }, ] [package.dependencies] From 45806114543a863f5a151c6dd5a85f57380dcd34 Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Tue, 20 Feb 2024 12:55:54 -0800 Subject: [PATCH 11/57] make serializer class --- comicfn2dict/cli.py | 3 ++- comicfn2dict/unparse.py | 55 +++++++++++++++++++++++------------------ 2 files changed, 33 insertions(+), 25 deletions(-) diff --git a/comicfn2dict/cli.py b/comicfn2dict/cli.py index d1e6880..e72508e 100755 --- a/comicfn2dict/cli.py +++ b/comicfn2dict/cli.py @@ -20,7 +20,8 @@ def main(): ) args = parser.parse_args() name = args.path.name - metadata = ComicFilenameParser(name, verbose=args.verbose).parse() + cfnparser = ComicFilenameParser(name, 
verbose=args.verbose) + metadata = cfnparser.parse() pprint(metadata) # noqa:T203 diff --git a/comicfn2dict/unparse.py b/comicfn2dict/unparse.py index 3d32bde..a04380c 100644 --- a/comicfn2dict/unparse.py +++ b/comicfn2dict/unparse.py @@ -26,35 +26,42 @@ _FILENAME_FORMAT_TAGS: tuple[tuple[str, str | Callable], ...] = ( ("scan_info", _PAREN_FMT), ) _EMPTY_VALUES: tuple[None, str] = (None, "") +_DEFAULT_EXT = "cbz" -def _tokenize_tag(md: Mapping, tag: str, fmt: str | Callable) -> str: - val = md.get(tag) - if val in _EMPTY_VALUES: - return "" - final_fmt = fmt(val) if isinstance(fmt, Callable) else fmt - token = final_fmt.format(val).strip() - return token +class ComicFilenameSerializer: + def _tokenize_tag(self, tag: str, fmt: str | Callable) -> str: + val = self.metadata.get(tag) + if val in _EMPTY_VALUES: + return "" + final_fmt = fmt(val) if isinstance(fmt, Callable) else fmt + token = final_fmt.format(val).strip() + return token + def serialize(self) -> str: + """Get our preferred basename from a metadata dict.""" + tokens = [] + for tag, fmt in _FILENAME_FORMAT_TAGS: + if token := self._tokenize_tag(tag, fmt): + tokens.append(token) + fn = " ".join(tokens) -def serialize(md: Mapping, ext: bool = True) -> str: - """Get our preferred basename from a metadata dict.""" - if not md: - return "" - tokens = [] - for tag, fmt in _FILENAME_FORMAT_TAGS: - if token := _tokenize_tag(md, tag, fmt): - tokens.append(token) - fn = " ".join(tokens) - if remainders := md.get("remainders"): - remainder = " ".join(remainders) - # TODO oh this is the - delineated remainder :( - fn += f" - {remainder}" - if ext: - fn += "." + md.get("ext", "cbz") - return fn + if remainders := self.metadata.get("remainders"): + # TODO make token and add before join? + remainder = " ".join(remainders) + # TODO oh this is the - delineated remainder :( + fn += f" - {remainder}" + + if self._ext: + fn += "." 
+ self.metadata.get("ext", _DEFAULT_EXT) + + return fn + + def __init__(self, metadata: Mapping, ext: bool = True): + self.metadata: Mapping = metadata + self._ext: bool = ext def dict2comicfn(md: Mapping, ext: bool = True) -> str: """Simple API.""" - return serialize(md, ext=ext) + return ComicFilenameSerializer(md, ext=ext).serialize() From 207e3451d408af8f8ed7738a0e35258c614e3ea8 Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Tue, 20 Feb 2024 12:57:59 -0800 Subject: [PATCH 12/57] bump news for classes --- NEWS.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index dcabf89..e2c0973 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,8 +4,10 @@ - Titles are now parsed only if they occur after the series token AND after either issue, year or volume. -- Issue numbers that start with a '#' character may contain alphabetical +- Issue numbers that lead with a '#' character may start with alphabetical characters. +- ComicFilenameParser and ComicFilenameSerializer classes are available as well + as the old function API. 
## v0.1.4 From 2d40518df6319519138d565aa210553f4848b44d Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Tue, 20 Feb 2024 12:59:51 -0800 Subject: [PATCH 13/57] fix imports --- comicfn2dict/__init__.py | 4 ++-- comicfn2dict/comicfn2dict.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/comicfn2dict/__init__.py b/comicfn2dict/__init__.py index e4f0a78..68464e2 100644 --- a/comicfn2dict/__init__.py +++ b/comicfn2dict/__init__.py @@ -1,3 +1,3 @@ """Comic Filename to Dict parser and unparser.""" -from .parse import ComicFilenameParser # noqa: F401 -from .unparse import serialize # noqa: F401 +from .parse import ComicFilenameParser, comicfn2dict # noqa: F401 +from .unparse import ComicFilenameSerializer, dict2comicfn # noqa: F401 diff --git a/comicfn2dict/comicfn2dict.py b/comicfn2dict/comicfn2dict.py index 6399337..980a6ca 100644 --- a/comicfn2dict/comicfn2dict.py +++ b/comicfn2dict/comicfn2dict.py @@ -1,3 +1,3 @@ """API import source.""" from comicfn2dict.parse import comicfn2dict, ComicFilenameParser # noqa: F401 -from comicfn2dict.unparse import dict2comicfn, serialize # noqa: F401 +from comicfn2dict.unparse import dict2comicfn, ComicFilenameSerializer # noqa: F401 From a6b61fc0312f4d8100583bf8334cd50b5da9bd8c Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Tue, 20 Feb 2024 13:00:46 -0800 Subject: [PATCH 14/57] remove redundant namespace --- comicfn2dict/comicfn2dict.py | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 comicfn2dict/comicfn2dict.py diff --git a/comicfn2dict/comicfn2dict.py b/comicfn2dict/comicfn2dict.py deleted file mode 100644 index 980a6ca..0000000 --- a/comicfn2dict/comicfn2dict.py +++ /dev/null @@ -1,3 +0,0 @@ -"""API import source.""" -from comicfn2dict.parse import comicfn2dict, ComicFilenameParser # noqa: F401 -from comicfn2dict.unparse import dict2comicfn, ComicFilenameSerializer # noqa: F401 From da825abda7307973c0f15bb5fc8c8a1e7276620e Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Tue, 20 Feb 2024 13:06:52 -0800 
Subject: [PATCH 15/57] move path cleaning until after ext extraction --- comicfn2dict/parse.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/comicfn2dict/parse.py b/comicfn2dict/parse.py index 4c01a60..5ed2c23 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -28,14 +28,8 @@ _TOKEN_DELIMETER = "/" class ComicFilenameParser: - @staticmethod - def _clean_dividers(data: str) -> str: - """Replace non space dividers and clean extra spaces out of string.""" - data = NON_SPACE_DIVIDER_RE.sub(" ", data) - return EXTRA_SPACES_RE.sub(" ", data).strip() - def path_index(self, key: str): - """Retrieve and memoize the key's location in the path.""" + """Lazily retrieve and memoize the key's location in the path.""" if key == "remainders": return -1 value: str = self.metadata.get(key, "") # type: ignore @@ -61,6 +55,11 @@ class ComicFilenameParser: self.metadata["ext"] = ext self._unparsed_path = data + def _clean_dividers(self): + """Replace non space dividers and clean extra spaces out of string.""" + data = NON_SPACE_DIVIDER_RE.sub(" ", self._unparsed_path) + self._unparsed_path = EXTRA_SPACES_RE.sub(" ", data).strip() + def _grouping_operators_strip(self, value: str) -> str: """Strip spaces and parens.""" value = value.strip() @@ -163,9 +162,9 @@ class ComicFilenameParser: def parse(self) -> dict[str, Any]: """Parse the filename with a hierarchy of regexes.""" - self._unparsed_path = self._clean_dividers(self._unparsed_path) self._log_progress("INITIAL") self._parse_ext() + self._clean_dividers() # Parse paren tokens self._parse_item(ISSUE_COUNT_RE) From 3304ba76d6bc94e6ab9b2cd5391d6475f6970043 Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Tue, 20 Feb 2024 13:33:48 -0800 Subject: [PATCH 16/57] remove dots from series and title if not near digits --- comicfn2dict/parse.py | 4 ++++ comicfn2dict/regex.py | 2 ++ tests/comic_filenames.py | 10 +++++----- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git 
a/comicfn2dict/parse.py b/comicfn2dict/parse.py index 5ed2c23..8f72170 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -6,6 +6,7 @@ from re import Pattern from typing import Any from comicfn2dict.regex import ( + NON_NUMBER_DOT_RE, EXTRA_SPACES_RE, ISSUE_ANYWHERE_RE, ISSUE_COUNT_RE, @@ -133,6 +134,8 @@ class ComicFilenameParser: unused_tokens.append(token) continue value = self._grouping_operators_strip(value) + value = NON_NUMBER_DOT_RE.sub(r"\1 \2", value) + self.metadata[key] = value remaining_key_index += 1 else: @@ -165,6 +168,7 @@ class ComicFilenameParser: self._log_progress("INITIAL") self._parse_ext() self._clean_dividers() + self._log_progress("CLEANED") # Parse paren tokens self._parse_item(ISSUE_COUNT_RE) diff --git a/comicfn2dict/regex.py b/comicfn2dict/regex.py index 01bbcbe..296fab9 100644 --- a/comicfn2dict/regex.py +++ b/comicfn2dict/regex.py @@ -73,3 +73,5 @@ ISSUE_ANYWHERE_RE = re_compile(r"\b(" + _ISSUE_RE_EXP + r")\b") # LONG STRINGS REMAINING_GROUP_RE = re_compile(r"^[^\()].*[^\)]") + +NON_NUMBER_DOT_RE = re_compile(r"(\D)\.(\D)") diff --git a/tests/comic_filenames.py b/tests/comic_filenames.py index 4d67eaa..74e04db 100644 --- a/tests/comic_filenames.py +++ b/tests/comic_filenames.py @@ -309,6 +309,11 @@ FNS.update( "series": "Star Wars - War of the Bounty Hunters - IG-88", "year": "2021", }, + "Free Comic Book Day - Avengers.Hulk (2021).cbz": { + "ext": "cbz", + "series": "Free Comic Book Day - Avengers Hulk", + "year": "2021", + }, } ) LATER = { @@ -333,11 +338,6 @@ LATER = { # Not examined yet. 
FNS.update( { - "Free Comic Book Day - Avengers.Hulk (2021).cbz": { - "ext": "cbz", - "series": "Free Comic Book Day - Avengers Hulk", - "year": "2021", - }, # CT assumes the volume is also the issue number if it can't find an issue number "Avengers By Brian Michael Bendis volume 03 (2013).cbz": { "ext": "cbz", From 93ac5760a0737cc8ec6c99f76ecfb58a79f54bdc Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Tue, 20 Feb 2024 13:42:37 -0800 Subject: [PATCH 17/57] copy volume into issue if issue not available --- NEWS.md | 2 ++ comicfn2dict/parse.py | 4 ++++ tests/comic_filenames.py | 23 +++++++++++++++-------- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index e2c0973..4037888 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,8 @@ either issue, year or volume. - Issue numbers that lead with a '#' character may start with alphabetical characters. +- If volume is parsed, but issue number is not, the issue number is copied from + the volume number. - ComicFilenameParser and ComicFilenameSerializer classes are available as well as the old function API. diff --git a/comicfn2dict/parse.py b/comicfn2dict/parse.py index 8f72170..77c8d5a 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -213,6 +213,10 @@ class ComicFilenameParser: self._parse_item(ISSUE_ANYWHERE_RE) self._log_progress("AFTER ISSUE PICKUP") + # Copy volume into issue if it's all we have. + if "issue" not in self.metadata and "volume" in self.metadata: + self.metadata["issue"] = self.metadata["volume"] + self._add_remainders() return self.metadata diff --git a/tests/comic_filenames.py b/tests/comic_filenames.py index 74e04db..db4238e 100644 --- a/tests/comic_filenames.py +++ b/tests/comic_filenames.py @@ -22,6 +22,7 @@ TEST_COMIC_FIELDS_VOL = { TEST_COMIC_VOL_ONLY = { "series": "Long Series Name", "volume": "1", + "issue": "1", "title": "Title", "original_format": "TPB", "year": "2000", @@ -74,6 +75,7 @@ FNS = { "Arkenstone Vol. 
01 - The Smell of Burnt Toast (2020) (digital) (My-brother).cbr": { "series": "Arkenstone", "volume": "01", + "issue": "01", "year": "2020", "ext": "cbr", "scan_info": "My-brother", @@ -83,6 +85,7 @@ FNS = { "The_Arkenstone_v03_(2002)_(Digital)_(DR_&_Quenya-Elves).cbr": { "series": "The Arkenstone", "volume": "03", + "issue": "03", "year": "2002", "ext": "cbr", "scan_info": "DR & Quenya-Elves", @@ -101,6 +104,7 @@ FNS = { "Kartalk Library Edition v01 (1992) (digital) (Son of Ultron-Empire).cbr": { "series": "Kartalk Library Edition", "volume": "01", + "issue": "01", "year": "1992", "ext": "cbr", "original_format": "digital", @@ -109,6 +113,7 @@ FNS = { "Kind of Deadly v02 - Last Bullet (2006) (Digital) (Zone-Empire).cbr": { "series": "Kind of Deadly", "volume": "02", + "issue": "02", "year": "2006", "ext": "cbr", "original_format": "Digital", @@ -142,6 +147,7 @@ FNS = { "Jeremy John v01 - Uninterested! (2007) (Digital) (Asgard-Empire).cbr": { "series": "Jeremy John", "volume": "01", + "issue": "01", "year": "2007", "ext": "cbr", "original_format": "Digital", @@ -168,6 +174,7 @@ FNS = { "Darkwad by Carlos Zemo v01 - Knuckle Fight (2009) (Digital) (Zone-Empire).cbr": { "series": "Darkwad by Carlos Zemo", "volume": "01", + "issue": "01", "year": "2009", "ext": "cbr", "title": "Knuckle Fight", @@ -314,6 +321,14 @@ FNS.update( "series": "Free Comic Book Day - Avengers Hulk", "year": "2021", }, + # CT assumes the volume is also the issue number if it can't find an issue number + "Avengers By Brian Michael Bendis volume 03 (2013).cbz": { + "ext": "cbz", + "issue": "03", + "series": "Avengers By Brian Michael Bendis", + "volume": "03", + "year": "2013", + }, } ) LATER = { @@ -338,14 +353,6 @@ LATER = { # Not examined yet. 
FNS.update( { - # CT assumes the volume is also the issue number if it can't find an issue number - "Avengers By Brian Michael Bendis volume 03 (2013).cbz": { - "ext": "cbz", - "issue": "3", - "series": "Avengers By Brian Michael Bendis", - "volume": "03", - "year": "2013", - }, # CT has extra processing to re-attach the year in this case "Blade Runner Free Comic Book Day 2021 (2021).cbr": { "ext": "cbr", From 120feab7af886fa8f0323efc21edb14120627394 Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Tue, 20 Feb 2024 17:14:03 -0800 Subject: [PATCH 18/57] sophisticated date parsing --- NEWS.md | 1 + comicfn2dict/parse.py | 57 +++++++++++++++++--------- comicfn2dict/regex.py | 56 +++++++++++++++++++++++-- tests/comic_filenames.py | 88 +++++++++++++++++++++++++--------------- 4 files changed, 147 insertions(+), 55 deletions(-) diff --git a/NEWS.md b/NEWS.md index 4037888..f002b1d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,7 @@ - Titles are now parsed only if they occur after the series token AND after either issue, year or volume. +- A more sophisticated date parser. - Issue numbers that lead with a '#' character may start with alphabetical characters. 
- If volume is parsed, but issue number is not, the issue number is copied from diff --git a/comicfn2dict/parse.py b/comicfn2dict/parse.py index 77c8d5a..0628ca4 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -1,5 +1,6 @@ """Parse comic book archive names using the simple 'parse' parser.""" from pprint import pprint +from calendar import month_abbr from copy import copy from pathlib import Path from re import Pattern @@ -7,6 +8,7 @@ from typing import Any from comicfn2dict.regex import ( NON_NUMBER_DOT_RE, + YEAR_FIRST_DATE_RE, EXTRA_SPACES_RE, ISSUE_ANYWHERE_RE, ISSUE_COUNT_RE, @@ -18,14 +20,14 @@ from comicfn2dict.regex import ( ORIGINAL_FORMAT_SCAN_INFO_RE, REMAINING_GROUP_RE, VOLUME_RE, - YEAR_BEGIN_RE, - YEAR_END_RE, + MONTH_FIRST_DATE_RE, YEAR_TOKEN_RE, ) _REMAINING_GROUP_KEYS = ("series", "title") _TITLE_PRECEDING_KEYS = ("issue", "year", "volume") _TOKEN_DELIMETER = "/" +_DATE_KEYS = frozenset({"year", "month", "day"}) class ComicFilenameParser: @@ -69,7 +71,7 @@ class ComicFilenameParser: value = value.strip("'").strip('"').strip() return value - def _parse_item( + def _parse_items( self, regex: Pattern, require_all: bool = False, @@ -95,6 +97,30 @@ class ComicFilenameParser: parts.append(token) self._unparsed_path = _TOKEN_DELIMETER.join(parts) + def _alpha_month_to_numeric(self): + """Translate alpha_month to numeric month.""" + if alpha_month := self.metadata.get("alpha_month", ""): + alpha_month = alpha_month.capitalize() # type: ignore + for index, abbr in enumerate(month_abbr): + if abbr and alpha_month.startswith(abbr): + month = f"{index:02d}" + self.metadata["month"] = month + break + + def _parse_dates(self): + """Parse date schemes.""" + # Month first date + self._parse_items(MONTH_FIRST_DATE_RE) + self._alpha_month_to_numeric() + + # Year first date + if _DATE_KEYS - self.metadata.keys(): + self._parse_items(YEAR_FIRST_DATE_RE) + self._alpha_month_to_numeric() + + if "year" not in self.metadata: + 
self._parse_items(YEAR_TOKEN_RE) + def _is_title_in_position(self, value): """Does the title come after series and one other token if they exist.""" title_index = self.path.find(value) @@ -171,35 +197,28 @@ class ComicFilenameParser: self._log_progress("CLEANED") # Parse paren tokens - self._parse_item(ISSUE_COUNT_RE) - self._parse_item(YEAR_TOKEN_RE) - self._parse_item( + self._parse_items(ISSUE_COUNT_RE) + self._parse_dates() + self._parse_items( ORIGINAL_FORMAT_SCAN_INFO_RE, require_all=True, ) if "original_format" not in self.metadata: - self._parse_item( + self._parse_items( ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE, ) self._log_progress("AFTER PAREN TOKENS") # Parse regular tokens - self._parse_item(VOLUME_RE) - self._parse_item(ISSUE_NUMBER_RE) + self._parse_items(VOLUME_RE) + self._parse_items(ISSUE_NUMBER_RE) self._log_progress("AFTER REGULAR TOKENS") - # Pickup year if not gotten. - if "year" not in self.metadata: - self._parse_item(YEAR_BEGIN_RE) - if "year" not in self.metadata: - self._parse_item(YEAR_END_RE) - self._log_progress("AFTER YEAR PICKUP") - # Pickup issue if it's a standalone token if "issue" not in self.metadata: - self._parse_item(ISSUE_END_RE) + self._parse_items(ISSUE_END_RE) if "issue" not in self.metadata: - self._parse_item(ISSUE_BEGIN_RE) + self._parse_items(ISSUE_BEGIN_RE) self._log_progress("AFTER ISSUE PICKUP") @@ -210,7 +229,7 @@ class ComicFilenameParser: # Final try for issue number. if "issue" not in self.metadata: # TODO is this useful? - self._parse_item(ISSUE_ANYWHERE_RE) + self._parse_items(ISSUE_ANYWHERE_RE) self._log_progress("AFTER ISSUE PICKUP") # Copy volume into issue if it's all we have. 
diff --git a/comicfn2dict/regex.py b/comicfn2dict/regex.py index 296fab9..5a61484 100644 --- a/comicfn2dict/regex.py +++ b/comicfn2dict/regex.py @@ -38,16 +38,64 @@ ORIGINAL_FORMAT_PATTERNS = ( r"Web([-\s]?(Comic|Rip))?", ) +MONTHS = ( + r"Jan(uary)?", + r"Feb(ruary)?", + r"Mar(ch)?", + r"Apr(il)?", + r"May", + r"Jun(e)?", + r"Jul(y)?", + r"Aug(ust)?", + r"Sept(ember)?", + r"Oct(ober)?", + r"Nov(ember)?", + r"Dec(ember)?", +) + # CLEAN NON_SPACE_DIVIDER_RE = re_compile(r"[_\+]") EXTRA_SPACES_RE = re_compile(r"\s\s+") +### DATES +_YEAR_RE_EXP = r"(?P<year>[12]\d{3})" +_MONTH_ALPHA_RE_EXP = r"(?P<alpha_month>" + r"|".join(MONTHS) + r")\.?" +_MONTH_NUMERIC_RE_EXP = r"(?P<month>0?\d|1[0-2]?)" +_MONTH_RE_EXP = r"(" + _MONTH_ALPHA_RE_EXP + r"|" + _MONTH_NUMERIC_RE_EXP + r")" + +_DAY_RE_EXP = r"(?P<day>([0-2]?\d|(3)[0-1]))" +_DATE_DELIM = r"[-\s]+" +_MONTH_FIRST_DATE_RE_EXP = ( + r"((\b|\(?)" + # Month + + _MONTH_RE_EXP + # Day + + r"(" + + _DATE_DELIM + + _DAY_RE_EXP + + r")?" + # Year + + r"[,]?" + + _DATE_DELIM + + _YEAR_RE_EXP + + r"(\)?|\b))" +) +_YEAR_FIRST_DATE_RE_EXP = ( + r"(\b\(?"
+ + _YEAR_RE_EXP + + _DATE_DELIM + + _MONTH_RE_EXP + + _DATE_DELIM + + _DAY_RE_EXP + + r"\b\)?)" +) + +MONTH_FIRST_DATE_RE = re_compile(_MONTH_FIRST_DATE_RE_EXP) +YEAR_FIRST_DATE_RE = re_compile(_YEAR_FIRST_DATE_RE_EXP) +YEAR_TOKEN_RE = re_compile(_YEAR_RE_EXP, parenthify=True) + # PAREN GROUPS ISSUE_COUNT_RE = re_compile(r"of\s*(?P<issue_count>\d+)", parenthify=True) -_YEAR_RE_EXP = r"(?P<year>[12]\d{3})" -YEAR_TOKEN_RE = re_compile(_YEAR_RE_EXP, parenthify=True) -YEAR_BEGIN_RE = re_compile(r"^" + _YEAR_RE_EXP + r"\b") -YEAR_END_RE = re_compile(r"\b" + _YEAR_RE_EXP + r"$") _OF_PATTERNS = r"|".join(ORIGINAL_FORMAT_PATTERNS) _ORIGINAL_FORMAT_RE_EXP = r"(?P<original_format>" + _OF_PATTERNS + r")" _SCAN_INFO_RE_EXP = r"(?P<scan_info>[^()]*)" diff --git a/tests/comic_filenames.py b/tests/comic_filenames.py index db4238e..1c860dd 100644 --- a/tests/comic_filenames.py +++ b/tests/comic_filenames.py @@ -329,6 +329,30 @@ FNS.update( "volume": "03", "year": "2013", }, + # CT catches the year + "Marvel Previews #002 (January 2022).cbr": { + "ext": "cbr", + "issue": "002", + "series": "Marvel Previews", + "alpha_month": "January", + "month": "01", + "year": "2022", + }, + "Test Numeric Year #2 2001-02-24.cbz": { + "ext": "cbz", + "issue": "2", + "series": "Test Numeric Year", + "year": "2002", + "month": "02", + "day": "24", + }, + "Test Month First Date 02-24-2001.cbz": { + "ext": "cbz", + "series": "Test Month First Date", + "year": "2002", + "month": "02", + "day": "24", + }, } ) LATER = { @@ -348,40 +372,37 @@ LATER = { "volume": "1957", "year": "1969", }, + # CT has extra processing to re-attach the year in this case + "Blade Runner Free Comic Book Day 2021 (2021).cbr": { + "ext": "cbr", + "series": "Blade Runner Free Comic Book Day 2021", + "year": "2021", + }, + # CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series) + "Bloodshot Book 03 (2020).cbr": { + "ext": "cbr", + "issue": "03", + "series": "Bloodshot", + "title": "Book 03", + "volume": "03", + "year": "2020", +
}, + # CT checks for the following '(of 06)' after the '03' and marks it as the volume + "Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021).cbr": { + "ext": "cbr", + "issue": "008", + "series": "Elephantmen 2259", + "title": "Simple Truth", + "volume": "03", + "year": "2021", + "volume_count": "06", + }, } # Not examined yet. FNS.update( { - # CT has extra processing to re-attach the year in this case - "Blade Runner Free Comic Book Day 2021 (2021).cbr": { - "ext": "cbr", - "series": "Blade Runner Free Comic Book Day 2021", - "year": "2021", - }, # CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series) - "Bloodshot Book 03 (2020).cbr": { - "ext": "cbr", - "issue": "03", - "series": "Bloodshot", - "title": "Book 03", - "volume": "03", - "year": "2020", - }, # CT checks for the following '(of 06)' after the '03' and marks it as the volume - "Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021).cbr": { - "ext": "cbr", - "issue": "008", - "series": "Elephantmen 2259", - "title": "Simple Truth", - "volume": "03", - "year": "2021", - "volume_count": "06", - }, # CT catches the year - "Marvel Previews #002 (January 2022).cbr": { - "ext": "cbr", - "issue": "002", - "series": "Marvel Previews", - "year": "2022", - }, # c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder + # c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder "Marvel Two In One V1 #090 c2c.cbr": { "ext": "cbr", "issue": "090", @@ -397,20 +418,23 @@ FNS.update( "title": "digital", "publisher": "DC", "year": "1951", - }, # CT notices that this is a full date, CT doesn't actually return the month or day though just removes it + }, + # CT notices that this is a full date, CT doesn't actually return the month or day though just removes it "X-Men, 2021-08-04 (#02).cbz": { "ext": "cbz", "issue": "02", "series": "X-Men", "year": "2021", - }, # CT treats ':' the same as '-' but here the ':' is attached to 
'Now' which CT sees as a title separation + }, + # CT treats ':' the same as '-' but here the ':' is attached to 'Now' which CT sees as a title separation "Cory Doctorow's Futuristic Tales of the Here and Now: Anda's Game #001 (2007).cbz": { "ext": "cbz", "issue": "001", "series": "Cory Doctorow's Futuristic Tales of the Here and Now", "title": "Anda's Game", "year": "2007", - }, # This is a contrived test case. I've never seen this I just wanted to handle it with my parser + }, + # This is a contrived test case. I've never seen this I just wanted to handle it with my parser "Cory Doctorow's Futuristic Tales of the Here and Now #0.0.1 (2007).cbz": { "ext": "cbz", "issue": "0.1", From 656310c609433692a56542d9d35e68b3afc4cbb1 Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Tue, 20 Feb 2024 17:17:40 -0800 Subject: [PATCH 19/57] fix test result --- tests/comic_filenames.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/comic_filenames.py b/tests/comic_filenames.py index 1c860dd..a485662 100644 --- a/tests/comic_filenames.py +++ b/tests/comic_filenames.py @@ -342,7 +342,7 @@ FNS.update( "ext": "cbz", "issue": "2", "series": "Test Numeric Year", - "year": "2002", + "year": "2001", "month": "02", "day": "24", }, From e34cfb9d1337e2b9c8f32ed0d071ab555a47f125 Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Tue, 20 Feb 2024 17:18:47 -0800 Subject: [PATCH 20/57] fix test result --- tests/comic_filenames.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/comic_filenames.py b/tests/comic_filenames.py index a485662..aff31b4 100644 --- a/tests/comic_filenames.py +++ b/tests/comic_filenames.py @@ -349,7 +349,7 @@ FNS.update( "Test Month First Date 02-24-2001.cbz": { "ext": "cbz", "series": "Test Month First Date", - "year": "2002", + "year": "2001", "month": "02", "day": "24", }, From 57ecf4f354b54acecefad3808df2ceb67a8cf63e Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Tue, 20 Feb 2024 17:34:31 -0800 Subject: [PATCH 21/57] add 
optional parens to issue --- comicfn2dict/regex.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/comicfn2dict/regex.py b/comicfn2dict/regex.py index 5a61484..c7272c7 100644 --- a/comicfn2dict/regex.py +++ b/comicfn2dict/regex.py @@ -112,12 +112,12 @@ ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE = re_compile( # REGULAR TOKENS VOLUME_RE = re_compile(r"((?:v(?:ol(?:ume)?)?\.?)\s*(?P<volume>\d+))") _ISSUE_NUMBER_RE_EXP = r"(?P<issue>[\w½]+\.?\d*\w*)" -ISSUE_NUMBER_RE = re_compile(r"(#" + _ISSUE_NUMBER_RE_EXP + r")") +ISSUE_NUMBER_RE = re_compile(r"(\(?#" + _ISSUE_NUMBER_RE_EXP + r"\)?)") _ISSUE_RE_EXP = r"(?P<issue>[\d½]+\.?\d*\w*)" -ISSUE_END_RE = re_compile(r"([\/\s]" + _ISSUE_RE_EXP + r"(\/|$))") -ISSUE_BEGIN_RE = re_compile(r"((^|\/)" + _ISSUE_RE_EXP + r"[\/|\s])") -ISSUE_ANYWHERE_RE = re_compile(r"\b(" + _ISSUE_RE_EXP + r")\b") +ISSUE_END_RE = re_compile(r"([\/\s]\(?" + _ISSUE_RE_EXP + r"\)?(\/|$))") +ISSUE_BEGIN_RE = re_compile(r"((^|\/)\(?" + _ISSUE_RE_EXP + r"\)?[\/|\s])") +ISSUE_ANYWHERE_RE = re_compile(r"\b(\(?" + _ISSUE_RE_EXP + r"\)?)\b") # LONG STRINGS REMAINING_GROUP_RE = re_compile(r"^[^\()].*[^\)]") From a3bc43b2d619b1c7d568b677affb8afe408fbeed Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Tue, 20 Feb 2024 17:35:03 -0800 Subject: [PATCH 22/57] parse number issues earlier. pop alpha_month.
strip commas from titles and series --- comicfn2dict/parse.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/comicfn2dict/parse.py b/comicfn2dict/parse.py index 0628ca4..18a284f 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -68,6 +68,7 @@ class ComicFilenameParser: value = value.strip() value = value.strip("()").strip() value = value.strip("-").strip() + value = value.strip(",").strip() value = value.strip("'").strip('"').strip() return value @@ -99,7 +100,7 @@ class ComicFilenameParser: def _alpha_month_to_numeric(self): """Translate alpha_month to numeric month.""" - if alpha_month := self.metadata.get("alpha_month", ""): + if alpha_month := self.metadata.pop("alpha_month", ""): alpha_month = alpha_month.capitalize() # type: ignore for index, abbr in enumerate(month_abbr): if abbr and alpha_month.startswith(abbr): @@ -197,6 +198,7 @@ class ComicFilenameParser: self._log_progress("CLEANED") # Parse paren tokens + self._parse_items(ISSUE_NUMBER_RE) self._parse_items(ISSUE_COUNT_RE) self._parse_dates() self._parse_items( @@ -211,7 +213,6 @@ class ComicFilenameParser: # Parse regular tokens self._parse_items(VOLUME_RE) - self._parse_items(ISSUE_NUMBER_RE) self._log_progress("AFTER REGULAR TOKENS") # Pickup issue if it's a standalone token From de1e0949c0c72f9706625c87974453daad9ac14b Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Tue, 20 Feb 2024 17:35:44 -0800 Subject: [PATCH 23/57] move working tests into block. 
move difficult tests into difficult block --- tests/comic_filenames.py | 107 +++++++++++++++++++-------------------- 1 file changed, 53 insertions(+), 54 deletions(-) diff --git a/tests/comic_filenames.py b/tests/comic_filenames.py index aff31b4..18635c8 100644 --- a/tests/comic_filenames.py +++ b/tests/comic_filenames.py @@ -334,7 +334,6 @@ FNS.update( "ext": "cbr", "issue": "002", "series": "Marvel Previews", - "alpha_month": "January", "month": "01", "year": "2022", }, @@ -353,16 +352,25 @@ FNS.update( "month": "02", "day": "24", }, + # CT notices that this is a full date, CT doesn't actually return the month or day though just removes it + "X-Men, 2021-08-04 (#02).cbz": { + "ext": "cbz", + "issue": "02", + "series": "X-Men", + "year": "2021", + "month": "08", + "day": "04", + }, + # 4 digit issue number + # should this be an issue number if year DONE?. + "action comics 1024.cbz": { + "ext": "cbz", + "issue": "1024", + "series": "action comics", + }, } ) -LATER = { - # 4 digit issue number - # should this be an issue number if year DONE?. - "action comics 1024.cbz": { - "ext": "cbz", - "issue": "1024", - "series": "action comics", - }, +DIFFICULT = { # I'm not sure there's a right way to parse this. This might also be a madeup filename I don't remember # if a year occurs after another year, and no volume, do volume / year "Super Strange Yarns (1957) #92 (1969).cbz": { @@ -397,53 +405,44 @@ LATER = { "year": "2021", "volume_count": "06", }, + # c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder + "Marvel Two In One V1 #090 c2c.cbr": { + "ext": "cbr", + "issue": "090", + "series": "Marvel Two In One", + "publisher": "Marvel", + "volume": "1", + }, + # CT treats '[]' as equivalent to '()', catches DC as a publisher and 'Sep-Oct 1951' as dates and removes them. 
CT doesn't catch the digital though so that could be better but I blame whoever made this atrocious filename + "Wonder Woman #49 DC Sep-Oct 1951 digital [downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire).cbz": { + "ext": "cbz", + "issue": "49", + "series": "Wonder Woman", + "title": "digital", + "publisher": "DC", + "year": "1951", + "month": "10", + }, + # CT treats ':' the same as '-' but here the ':' is attached to 'Now' which CT sees as a title separation + "Cory Doctorow's Futuristic Tales of the Here and Now: Anda's Game #001 (2007).cbz": { + "ext": "cbz", + "issue": "001", + "series": "Cory Doctorow's Futuristic Tales of the Here and Now", + "title": "Anda's Game", + "year": "2007", + }, + # This is a contrived test case. I've never seen this I just wanted to handle it with my parser + "Cory Doctorow's Futuristic Tales of the Here and Now #0.0.1 (2007).cbz": { + "ext": "cbz", + "issue": "0.1", + "series": "Cory Doctorow's Futuristic Tales of the Here and Now", + "year": "2007", + "issue_count": "", + }, } -# Not examined yet. -FNS.update( - { - # c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder - "Marvel Two In One V1 #090 c2c.cbr": { - "ext": "cbr", - "issue": "090", - "series": "Marvel Two In One", - "publisher": "Marvel", - "volume": "1", - }, - # CT treats '[]' as equivalent to '()', catches DC as a publisher and 'Sep-Oct 1951' as dates and removes them. 
CT doesn't catch the digital though so that could be better but I blame whoever made this atrocious filename - "Wonder Woman #49 DC Sep-Oct 1951 digital [downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire).cbz": { - "ext": "cbz", - "issue": "49", - "series": "Wonder Woman", - "title": "digital", - "publisher": "DC", - "year": "1951", - }, - # CT notices that this is a full date, CT doesn't actually return the month or day though just removes it - "X-Men, 2021-08-04 (#02).cbz": { - "ext": "cbz", - "issue": "02", - "series": "X-Men", - "year": "2021", - }, - # CT treats ':' the same as '-' but here the ':' is attached to 'Now' which CT sees as a title separation - "Cory Doctorow's Futuristic Tales of the Here and Now: Anda's Game #001 (2007).cbz": { - "ext": "cbz", - "issue": "001", - "series": "Cory Doctorow's Futuristic Tales of the Here and Now", - "title": "Anda's Game", - "year": "2007", - }, - # This is a contrived test case. I've never seen this I just wanted to handle it with my parser - "Cory Doctorow's Futuristic Tales of the Here and Now #0.0.1 (2007).cbz": { - "ext": "cbz", - "issue": "0.1", - "series": "Cory Doctorow's Futuristic Tales of the Here and Now", - "year": "2007", - "issue_count": "", - }, - } -) +# FNS.update(LATER) + WONFIX = { # Leading issue number is usually an alternate sequence number From 55423a9f10b87f107f83f6bd60792322c02bf737 Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Tue, 20 Feb 2024 18:36:34 -0800 Subject: [PATCH 24/57] formatting --- comicfn2dict/parse.py | 5 +---- comicfn2dict/regex.py | 4 ++-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/comicfn2dict/parse.py b/comicfn2dict/parse.py index 18a284f..1484637 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -145,12 +145,9 @@ class ComicFilenameParser: if not self._unparsed_path: return - # TODO fix REMAINING GROUP_RE to use token delim - tokens = self._unparsed_path.split(_TOKEN_DELIMETER) - - # ASSIGN GROUPS 
remaining_key_index = 0 unused_tokens = [] + tokens = self._unparsed_path.split(_TOKEN_DELIMETER) while tokens and remaining_key_index < len(_REMAINING_GROUP_KEYS): key = _REMAINING_GROUP_KEYS[remaining_key_index] token = tokens.pop(0) diff --git a/comicfn2dict/regex.py b/comicfn2dict/regex.py index c7272c7..e9c24be 100644 --- a/comicfn2dict/regex.py +++ b/comicfn2dict/regex.py @@ -111,15 +111,15 @@ ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE = re_compile( # REGULAR TOKENS VOLUME_RE = re_compile(r"((?:v(?:ol(?:ume)?)?\.?)\s*(?P<volume>\d+))") + +# ISSUE _ISSUE_NUMBER_RE_EXP = r"(?P<issue>[\w½]+\.?\d*\w*)" ISSUE_NUMBER_RE = re_compile(r"(\(?#" + _ISSUE_NUMBER_RE_EXP + r"\)?)") _ISSUE_RE_EXP = r"(?P<issue>[\d½]+\.?\d*\w*)" - ISSUE_END_RE = re_compile(r"([\/\s]\(?" + _ISSUE_RE_EXP + r"\)?(\/|$))") ISSUE_BEGIN_RE = re_compile(r"((^|\/)\(?" + _ISSUE_RE_EXP + r"\)?[\/|\s])") ISSUE_ANYWHERE_RE = re_compile(r"\b(\(?" + _ISSUE_RE_EXP + r"\)?)\b") # LONG STRINGS REMAINING_GROUP_RE = re_compile(r"^[^\()].*[^\)]") - NON_NUMBER_DOT_RE = re_compile(r"(\D)\.(\D)") From 4b1f5fbdb9a89ca10f1d025d3bd17be97a554a2d Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Wed, 21 Feb 2024 09:37:00 -0800 Subject: [PATCH 25/57] better issue regex --- comicfn2dict/regex.py | 5 ++--- tests/comic_filenames.py | 19 +++++++++---------- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/comicfn2dict/regex.py b/comicfn2dict/regex.py index e9c24be..7ffbd4b 100644 --- a/comicfn2dict/regex.py +++ b/comicfn2dict/regex.py @@ -113,9 +113,8 @@ ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE = re_compile( VOLUME_RE = re_compile(r"((?:v(?:ol(?:ume)?)?\.?)\s*(?P<volume>\d+))") # ISSUE -_ISSUE_NUMBER_RE_EXP = r"(?P<issue>[\w½]+\.?\d*\w*)" -ISSUE_NUMBER_RE = re_compile(r"(\(?#" + _ISSUE_NUMBER_RE_EXP + r"\)?)") -_ISSUE_RE_EXP = r"(?P<issue>[\d½]+\.?\d*\w*)" +_ISSUE_RE_EXP = r"(?P<issue>\w*(½|\d+)[\.\d+]*\w*)" +ISSUE_NUMBER_RE = re_compile(r"(\(?#" + _ISSUE_RE_EXP + r"\)?)") ISSUE_END_RE = re_compile(r"([\/\s]\(?"
+ _ISSUE_RE_EXP + r"\)?(\/|$))") ISSUE_BEGIN_RE = re_compile(r"((^|\/)\(?" + _ISSUE_RE_EXP + r"\)?[\/|\s])") ISSUE_ANYWHERE_RE = re_compile(r"\b(\(?" + _ISSUE_RE_EXP + r"\)?)\b") diff --git a/tests/comic_filenames.py b/tests/comic_filenames.py index 18635c8..79dc507 100644 --- a/tests/comic_filenames.py +++ b/tests/comic_filenames.py @@ -368,6 +368,13 @@ FNS.update( "issue": "1024", "series": "action comics", }, + # This is a contrived test case. I've never seen this I just wanted to handle it with my parser + "Cory Doctorow's Futuristic Tales of the Here and Now #0.0.1 (2007).cbz": { + "ext": "cbz", + "issue": "0.0.1", + "series": "Cory Doctorow's Futuristic Tales of the Here and Now", + "year": "2007", + }, } ) DIFFICULT = { @@ -431,18 +438,10 @@ DIFFICULT = { "title": "Anda's Game", "year": "2007", }, - # This is a contrived test case. I've never seen this I just wanted to handle it with my parser - "Cory Doctorow's Futuristic Tales of the Here and Now #0.0.1 (2007).cbz": { - "ext": "cbz", - "issue": "0.1", - "series": "Cory Doctorow's Futuristic Tales of the Here and Now", - "year": "2007", - "issue_count": "", - }, } -# FNS.update(LATER) - +# first_key, first_val = DIFFICULT.popitem() +# FNS[first_key] = first_val WONFIX = { # Leading issue number is usually an alternate sequence number From 7d9b4efeee4db54b5fd150f5d3b2107a98200d96 Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Wed, 21 Feb 2024 10:08:51 -0800 Subject: [PATCH 26/57] reorganize code. only substitute first colon out of caution. 
--- comicfn2dict/parse.py | 34 ++++++++++++++++++---------------- comicfn2dict/regex.py | 17 +++++++++++++++-- tests/comic_filenames.py | 16 ++++++++-------- 3 files changed, 41 insertions(+), 26 deletions(-) diff --git a/comicfn2dict/parse.py b/comicfn2dict/parse.py index 1484637..644ffb8 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -9,13 +9,13 @@ from typing import Any from comicfn2dict.regex import ( NON_NUMBER_DOT_RE, YEAR_FIRST_DATE_RE, - EXTRA_SPACES_RE, ISSUE_ANYWHERE_RE, + REGEX_SUBS, + TOKEN_DELIMETER, ISSUE_COUNT_RE, ISSUE_NUMBER_RE, ISSUE_BEGIN_RE, ISSUE_END_RE, - NON_SPACE_DIVIDER_RE, ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE, ORIGINAL_FORMAT_SCAN_INFO_RE, REMAINING_GROUP_RE, @@ -26,7 +26,6 @@ from comicfn2dict.regex import ( _REMAINING_GROUP_KEYS = ("series", "title") _TITLE_PRECEDING_KEYS = ("issue", "year", "volume") -_TOKEN_DELIMETER = "/" _DATE_KEYS = frozenset({"year", "month", "day"}) @@ -58,19 +57,22 @@ class ComicFilenameParser: self.metadata["ext"] = ext self._unparsed_path = data - def _clean_dividers(self): - """Replace non space dividers and clean extra spaces out of string.""" - data = NON_SPACE_DIVIDER_RE.sub(" ", self._unparsed_path) - self._unparsed_path = EXTRA_SPACES_RE.sub(" ", data).strip() - def _grouping_operators_strip(self, value: str) -> str: """Strip spaces and parens.""" value = value.strip() value = value.strip("()").strip() value = value.strip("-").strip() value = value.strip(",").strip() - value = value.strip("'").strip('"').strip() - return value + value = value.strip("'").strip() + return value.strip('"').strip() + + def _clean_dividers(self): + """Replace non space dividers and clean extra spaces out of string.""" + data = self._unparsed_path + for regex, pair in REGEX_SUBS.items(): + replacement, count = pair + data = regex.sub(replacement, data, count=count) + self._unparsed_path = data.strip() def _parse_items( self, @@ -91,12 +93,12 @@ class ComicFilenameParser: matched_metadata[key] = 
self._grouping_operators_strip(value) self.metadata.update(matched_metadata) - marked_str = regex.sub(_TOKEN_DELIMETER, self._unparsed_path) + marked_str = regex.sub(TOKEN_DELIMETER, self._unparsed_path) parts = [] - for part in marked_str.split(_TOKEN_DELIMETER): + for part in marked_str.split(TOKEN_DELIMETER): if token := part.strip(): parts.append(token) - self._unparsed_path = _TOKEN_DELIMETER.join(parts) + self._unparsed_path = TOKEN_DELIMETER.join(parts) def _alpha_month_to_numeric(self): """Translate alpha_month to numeric month.""" @@ -147,7 +149,7 @@ class ComicFilenameParser: remaining_key_index = 0 unused_tokens = [] - tokens = self._unparsed_path.split(_TOKEN_DELIMETER) + tokens = self._unparsed_path.split(TOKEN_DELIMETER) while tokens and remaining_key_index < len(_REMAINING_GROUP_KEYS): key = _REMAINING_GROUP_KEYS[remaining_key_index] token = tokens.pop(0) @@ -170,7 +172,7 @@ class ComicFilenameParser: def _add_remainders(self): """Add Remainders.""" remainders = [] - for token in self._unparsed_path.split(_TOKEN_DELIMETER): + for token in self._unparsed_path.split(TOKEN_DELIMETER): if remainder := token.strip(): remainders.append(remainder) @@ -225,8 +227,8 @@ class ComicFilenameParser: self._log_progress("AFTER SERIES AND TITLE") # Final try for issue number. + # TODO unused if "issue" not in self.metadata: - # TODO is this useful? 
self._parse_items(ISSUE_ANYWHERE_RE) self._log_progress("AFTER ISSUE PICKUP") diff --git a/comicfn2dict/regex.py b/comicfn2dict/regex.py index 7ffbd4b..718df9b 100644 --- a/comicfn2dict/regex.py +++ b/comicfn2dict/regex.py @@ -1,5 +1,6 @@ """Parsing regexes.""" import re +from types import MappingProxyType def re_compile(exp, parenthify=False): @@ -53,9 +54,19 @@ MONTHS = ( r"Dec(ember)?", ) +TOKEN_DELIMETER = r"/" + # CLEAN -NON_SPACE_DIVIDER_RE = re_compile(r"[_\+]") -EXTRA_SPACES_RE = re_compile(r"\s\s+") +_TOKEN_DIVIDERS_RE = re_compile(r":") +_SPACE_EQUIVALENT_RE = re_compile(r"_") +_EXTRA_SPACES_RE = re_compile(r"\s\s+") +REGEX_SUBS: MappingProxyType[re.Pattern, tuple[str, int]] = MappingProxyType( + { + _TOKEN_DIVIDERS_RE: (TOKEN_DELIMETER, 1), + _SPACE_EQUIVALENT_RE: (r" ", 0), + _EXTRA_SPACES_RE: (r" ", 0), + } +) ### DATES _YEAR_RE_EXP = r"(?P[12]\d{3})" @@ -117,6 +128,8 @@ _ISSUE_RE_EXP = r"(?P\w*(½|\d+)[\.\d+]*\w*)" ISSUE_NUMBER_RE = re_compile(r"(\(?#" + _ISSUE_RE_EXP + r"\)?)") ISSUE_END_RE = re_compile(r"([\/\s]\(?" + _ISSUE_RE_EXP + r"\)?(\/|$))") ISSUE_BEGIN_RE = re_compile(r"((^|\/)\(?" + _ISSUE_RE_EXP + r"\)?[\/|\s])") + +# TODO unused ISSUE_ANYWHERE_RE = re_compile(r"\b(\(?" 
+ _ISSUE_RE_EXP + r"\)?)\b") # LONG STRINGS diff --git a/tests/comic_filenames.py b/tests/comic_filenames.py index 79dc507..4c63c91 100644 --- a/tests/comic_filenames.py +++ b/tests/comic_filenames.py @@ -375,6 +375,14 @@ FNS.update( "series": "Cory Doctorow's Futuristic Tales of the Here and Now", "year": "2007", }, + # CT treats ':' the same as '-' but here the ':' is attached to 'Now' which CT sees as a title separation + "Cory Doctorow's Futuristic Tales of the Here and Now: Anda's Game #001 (2007).cbz": { + "ext": "cbz", + "issue": "001", + "series": "Cory Doctorow's Futuristic Tales of the Here and Now", + "title": "Anda's Game", + "year": "2007", + }, } ) DIFFICULT = { @@ -430,14 +438,6 @@ DIFFICULT = { "year": "1951", "month": "10", }, - # CT treats ':' the same as '-' but here the ':' is attached to 'Now' which CT sees as a title separation - "Cory Doctorow's Futuristic Tales of the Here and Now: Anda's Game #001 (2007).cbz": { - "ext": "cbz", - "issue": "001", - "series": "Cory Doctorow's Futuristic Tales of the Here and Now", - "title": "Anda's Game", - "year": "2007", - }, } # first_key, first_val = DIFFICULT.popitem() From 2c0ab37d8316943170c84f50d33c792a87669f8d Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Wed, 21 Feb 2024 11:32:55 -0800 Subject: [PATCH 27/57] complicated year and volume parsing --- comicfn2dict/parse.py | 59 ++++++++++++++++++++++++++++++++-------- comicfn2dict/regex.py | 5 ++++ tests/comic_filenames.py | 46 +++++++++++++++++++------------ 3 files changed, 81 insertions(+), 29 deletions(-) diff --git a/comicfn2dict/parse.py b/comicfn2dict/parse.py index 644ffb8..2ad37e0 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -16,6 +16,7 @@ from comicfn2dict.regex import ( ISSUE_NUMBER_RE, ISSUE_BEGIN_RE, ISSUE_END_RE, + YEAR_END_RE, ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE, ORIGINAL_FORMAT_SCAN_INFO_RE, REMAINING_GROUP_RE, @@ -78,6 +79,9 @@ class ComicFilenameParser: self, regex: Pattern, require_all: bool = False, + 
exclude: str = "", + first_only: bool = False, + pop: bool = True, ) -> None: """Parse a value from the data list into metadata and alter the data list.""" matches = regex.search(self._unparsed_path) @@ -85,15 +89,23 @@ class ComicFilenameParser: return matched_metadata = {} for key, value in matches.groupdict().items(): + print(f"{value=} == {exclude=}") + if value == exclude: + continue if not value: if require_all: return continue # TODO idk if strip is necessary here matched_metadata[key] = self._grouping_operators_strip(value) + if first_only: + break self.metadata.update(matched_metadata) - marked_str = regex.sub(TOKEN_DELIMETER, self._unparsed_path) + if not matched_metadata or not pop: + return + count = 1 if first_only else 0 + marked_str = regex.sub(TOKEN_DELIMETER, self._unparsed_path, count=count) parts = [] for part in marked_str.split(TOKEN_DELIMETER): if token := part.strip(): @@ -122,7 +134,15 @@ class ComicFilenameParser: self._alpha_month_to_numeric() if "year" not in self.metadata: - self._parse_items(YEAR_TOKEN_RE) + self._parse_items(YEAR_TOKEN_RE, first_only=True) + if "volume" in self.metadata: + return + # A second year will be the real year. 
+ # Move the first year to volume + if volume := self.metadata.get("year", ""): + self._parse_items(YEAR_TOKEN_RE) + if self.metadata.get("year", "") != volume: + self.metadata["volume"] = volume def _is_title_in_position(self, value): """Does the title come after series and one other token if they exist.""" @@ -191,15 +211,27 @@ class ComicFilenameParser: def parse(self) -> dict[str, Any]: """Parse the filename with a hierarchy of regexes.""" + # Init + # self._log_progress("INITIAL") self._parse_ext() self._clean_dividers() self._log_progress("CLEANED") - # Parse paren tokens + # Main issue parsing + # self._parse_items(ISSUE_NUMBER_RE) self._parse_items(ISSUE_COUNT_RE) + self._log_progress("AFTER ISSUE") + + # Volume and date + # + self._parse_items(VOLUME_RE) self._parse_dates() + self._log_progress("AFTER VOLUME & DATE") + + # Format & Scan Info + # self._parse_items( ORIGINAL_FORMAT_SCAN_INFO_RE, require_all=True, @@ -210,19 +242,21 @@ class ComicFilenameParser: ) self._log_progress("AFTER PAREN TOKENS") - # Parse regular tokens - self._parse_items(VOLUME_RE) - self._log_progress("AFTER REGULAR TOKENS") + # Series and Title + # + # Match years on the end of series and title tokens + year_end_matched = False + if "year" not in self.metadata: + self._parse_items(YEAR_END_RE, pop=False) + year_end_matched = "year" in self.metadata - # Pickup issue if it's a standalone token - if "issue" not in self.metadata: - self._parse_items(ISSUE_END_RE) + # Pickup issue if it's out on the end of a token + if "issue" not in self.metadata and not year_end_matched: + exclude: str = self.metadata.get("year", "") # type: ignore + self._parse_items(ISSUE_END_RE, exclude=exclude) if "issue" not in self.metadata: self._parse_items(ISSUE_BEGIN_RE) - self._log_progress("AFTER ISSUE PICKUP") - - # Series and Title. Also looks for issue. 
self._assign_remaining_groups() self._log_progress("AFTER SERIES AND TITLE") @@ -233,6 +267,7 @@ class ComicFilenameParser: self._log_progress("AFTER ISSUE PICKUP") # Copy volume into issue if it's all we have. + # if "issue" not in self.metadata and "volume" in self.metadata: self.metadata["issue"] = self.metadata["volume"] diff --git a/comicfn2dict/regex.py b/comicfn2dict/regex.py index 718df9b..6d23685 100644 --- a/comicfn2dict/regex.py +++ b/comicfn2dict/regex.py @@ -60,11 +60,15 @@ TOKEN_DELIMETER = r"/" _TOKEN_DIVIDERS_RE = re_compile(r":") _SPACE_EQUIVALENT_RE = re_compile(r"_") _EXTRA_SPACES_RE = re_compile(r"\s\s+") +_LEFT_PAREN_EQUIVALENT_RE = re_compile(r"\[") +_RIGHT_PAREN_EQUIVALENT_RE = re_compile(r"\]") REGEX_SUBS: MappingProxyType[re.Pattern, tuple[str, int]] = MappingProxyType( { _TOKEN_DIVIDERS_RE: (TOKEN_DELIMETER, 1), _SPACE_EQUIVALENT_RE: (r" ", 0), _EXTRA_SPACES_RE: (r" ", 0), + _LEFT_PAREN_EQUIVALENT_RE: (r"(", 0), + _RIGHT_PAREN_EQUIVALENT_RE: (r")", 0), } ) @@ -104,6 +108,7 @@ _YEAR_FIRST_DATE_RE_EXP = ( MONTH_FIRST_DATE_RE = re_compile(_MONTH_FIRST_DATE_RE_EXP) YEAR_FIRST_DATE_RE = re_compile(_YEAR_FIRST_DATE_RE_EXP) YEAR_TOKEN_RE = re_compile(_YEAR_RE_EXP, parenthify=True) +YEAR_END_RE = re_compile(_YEAR_RE_EXP + r"\/|$") # PAREN GROUPS ISSUE_COUNT_RE = re_compile(r"of\s*(?P\d+)", parenthify=True) diff --git a/tests/comic_filenames.py b/tests/comic_filenames.py index 4c63c91..42c4c3e 100644 --- a/tests/comic_filenames.py +++ b/tests/comic_filenames.py @@ -383,25 +383,26 @@ FNS.update( "title": "Anda's Game", "year": "2007", }, + # If a title ends in a year, it's not an issue (and is a year if no year) + "Blade Runner Free Comic Book Day 2021 (2021).cbr": { + "ext": "cbr", + "series": "Blade Runner Free Comic Book Day 2021", + "year": "2021", + }, + # If a year occurs after another year, and no volume, do volume / year + "Super Strange Yarns (1957) #92 (1969).cbz": { + "ext": "cbz", + "issue": "92", + "series": "Super Strange Yarns", + 
"volume": "1957", + "year": "1969", + }, } ) -DIFFICULT = { - # I'm not sure there's a right way to parse this. This might also be a madeup filename I don't remember - # if a year occurs after another year, and no volume, do volume / year - "Super Strange Yarns (1957) #92 (1969).cbz": { - "ext": "cbz", - "issue": "92", - "series": "Super Strange Yarns", - "volume": "1957", - "year": "1969", - }, - # CT has extra processing to re-attach the year in this case - "Blade Runner Free Comic Book Day 2021 (2021).cbr": { - "ext": "cbr", - "series": "Blade Runner Free Comic Book Day 2021", - "year": "2021", - }, +VOLUME = { # CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series) + # + # Book \d is a non-popped volume not an issue "Bloodshot Book 03 (2020).cbr": { "ext": "cbr", "issue": "03", @@ -411,6 +412,9 @@ DIFFICULT = { "year": "2020", }, # CT checks for the following '(of 06)' after the '03' and marks it as the volume + # + # issue count is not popped if does not occur near issue + # \d (of \d) is volume & volume count if not issue "Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021).cbr": { "ext": "cbr", "issue": "008", @@ -420,7 +424,12 @@ DIFFICULT = { "year": "2021", "volume_count": "06", }, +} +PUBLISHER = { # c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder + # + # 1. c2c is not a title and is an original_format + # Leading common publisher may be a publisher? Do not pop "Marvel Two In One V1 #090 c2c.cbr": { "ext": "cbr", "issue": "090", @@ -429,6 +438,9 @@ DIFFICULT = { "volume": "1", }, # CT treats '[]' as equivalent to '()', catches DC as a publisher and 'Sep-Oct 1951' as dates and removes them. CT doesn't catch the digital though so that could be better but I blame whoever made this atrocious filename + # + # 1. Month-Month should be handled + # 2. DC is a common publisher, no pop? 
"Wonder Woman #49 DC Sep-Oct 1951 digital [downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire).cbz": { "ext": "cbz", "issue": "49", @@ -440,7 +452,7 @@ DIFFICULT = { }, } -# first_key, first_val = DIFFICULT.popitem() +# first_key, first_val = YEAR.popitem() # FNS[first_key] = first_val WONFIX = { From d550d9c54e293387e6d11f02ff63a5a60748adb5 Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Wed, 21 Feb 2024 13:15:18 -0800 Subject: [PATCH 28/57] Complicated volume parsing --- comicfn2dict/parse.py | 44 +++++++++++++++++++++++-------------- comicfn2dict/regex.py | 25 ++++++++++++++++----- tests/comic_filenames.py | 47 +++++++++++++++++----------------------- 3 files changed, 68 insertions(+), 48 deletions(-) diff --git a/comicfn2dict/parse.py b/comicfn2dict/parse.py index 2ad37e0..897dc63 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -7,21 +7,23 @@ from re import Pattern from typing import Any from comicfn2dict.regex import ( - NON_NUMBER_DOT_RE, - YEAR_FIRST_DATE_RE, + BOOK_VOLUME_RE, ISSUE_ANYWHERE_RE, - REGEX_SUBS, - TOKEN_DELIMETER, - ISSUE_COUNT_RE, - ISSUE_NUMBER_RE, ISSUE_BEGIN_RE, ISSUE_END_RE, - YEAR_END_RE, - ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE, - ORIGINAL_FORMAT_SCAN_INFO_RE, - REMAINING_GROUP_RE, - VOLUME_RE, + ISSUE_NUMBER_RE, + ISSUE_WITH_COUNT_RE, MONTH_FIRST_DATE_RE, + NON_NUMBER_DOT_RE, + ORIGINAL_FORMAT_SCAN_INFO_RE, + ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE, + REGEX_SUBS, + REMAINING_GROUP_RE, + TOKEN_DELIMETER, + VOLUME_RE, + VOLUME_WITH_COUNT_RE, + YEAR_END_RE, + YEAR_FIRST_DATE_RE, YEAR_TOKEN_RE, ) @@ -172,6 +174,8 @@ class ComicFilenameParser: tokens = self._unparsed_path.split(TOKEN_DELIMETER) while tokens and remaining_key_index < len(_REMAINING_GROUP_KEYS): key = _REMAINING_GROUP_KEYS[remaining_key_index] + if key in self.metadata: + continue token = tokens.pop(0) match = REMAINING_GROUP_RE.search(token) if match: @@ -218,15 +222,19 @@ class ComicFilenameParser: self._clean_dividers() 
self._log_progress("CLEANED") - # Main issue parsing + # Issue # self._parse_items(ISSUE_NUMBER_RE) - self._parse_items(ISSUE_COUNT_RE) + if "issue" not in self.metadata: + self._parse_items(ISSUE_WITH_COUNT_RE) + # self._parse_items(ISSUE_COUNT_RE) self._log_progress("AFTER ISSUE") - # Volume and date + # Volume and Date # self._parse_items(VOLUME_RE) + if "volume" not in self.metadata: + self._parse_items(VOLUME_WITH_COUNT_RE) self._parse_dates() self._log_progress("AFTER VOLUME & DATE") @@ -244,13 +252,17 @@ class ComicFilenameParser: # Series and Title # - # Match years on the end of series and title tokens + # Volume left on the end of string tokens + if "volume" not in self.metadata: + self._parse_items(BOOK_VOLUME_RE) + + # Years left on the end of string tokens year_end_matched = False if "year" not in self.metadata: self._parse_items(YEAR_END_RE, pop=False) year_end_matched = "year" in self.metadata - # Pickup issue if it's out on the end of a token + # Issue left on the end of string tokens if "issue" not in self.metadata and not year_end_matched: exclude: str = self.metadata.get("year", "") # type: ignore self._parse_items(ISSUE_END_RE, exclude=exclude) diff --git a/comicfn2dict/regex.py b/comicfn2dict/regex.py index 6d23685..33b7295 100644 --- a/comicfn2dict/regex.py +++ b/comicfn2dict/regex.py @@ -111,7 +111,6 @@ YEAR_TOKEN_RE = re_compile(_YEAR_RE_EXP, parenthify=True) YEAR_END_RE = re_compile(_YEAR_RE_EXP + r"\/|$") # PAREN GROUPS -ISSUE_COUNT_RE = re_compile(r"of\s*(?P\d+)", parenthify=True) _OF_PATTERNS = r"|".join(ORIGINAL_FORMAT_PATTERNS) _ORIGINAL_FORMAT_RE_EXP = r"(?P" + _OF_PATTERNS + r")" _SCAN_INFO_RE_EXP = r"(?P[^()]*)" @@ -125,18 +124,34 @@ ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE = re_compile( r"\(" + _ORIGINAL_FORMAT_RE_EXP + r"\).*\(" + _SCAN_INFO_RE_EXP + r"\)" ) -# REGULAR TOKENS -VOLUME_RE = re_compile(r"((?:v(?:ol(?:ume)?)?\.?)\s*(?P\d+))") - # ISSUE _ISSUE_RE_EXP = r"(?P\w*(½|\d+)[\.\d+]*\w*)" -ISSUE_NUMBER_RE = re_compile(r"(\(?#" + 
_ISSUE_RE_EXP + r"\)?)") +_ISSUE_COUNT_RE_EXP = r"\(of\s*(?P\d+)\)" +ISSUE_NUMBER_RE = re_compile( + r"(\(?#" + _ISSUE_RE_EXP + r"\)?)" + r"(\W*" + _ISSUE_COUNT_RE_EXP + r")?" +) +ISSUE_WITH_COUNT_RE = re_compile( + r"(\(?" + _ISSUE_RE_EXP + r"\)?" + r"\W*" + _ISSUE_COUNT_RE_EXP + r")" +) + ISSUE_END_RE = re_compile(r"([\/\s]\(?" + _ISSUE_RE_EXP + r"\)?(\/|$))") ISSUE_BEGIN_RE = re_compile(r"((^|\/)\(?" + _ISSUE_RE_EXP + r"\)?[\/|\s])") # TODO unused ISSUE_ANYWHERE_RE = re_compile(r"\b(\(?" + _ISSUE_RE_EXP + r"\)?)\b") +# Volume +_VOLUME_COUNT_RE_EXP = r"\(of\s*(?P\d+)\)" +VOLUME_RE = re_compile( + r"(" + r"(?:v(?:ol(?:ume)?)?\.?)\s*(?P\d+)" + r"(\W*" + _VOLUME_COUNT_RE_EXP + r")?" + r")" +) +VOLUME_WITH_COUNT_RE = re_compile( + r"(\(?" + r"(?P\d+)" + r"\)?" + r"\W*" + _VOLUME_COUNT_RE_EXP + r")" +) +BOOK_VOLUME_RE = re_compile(r"(?P" + r"book\s*(?P<volume>\d+)" + r")") + + # LONG STRINGS REMAINING_GROUP_RE = re_compile(r"^[^\()].*[^\)]") NON_NUMBER_DOT_RE = re_compile(r"(\D)\.(\D)") diff --git a/tests/comic_filenames.py b/tests/comic_filenames.py index 42c4c3e..5e3c0e0 100644 --- a/tests/comic_filenames.py +++ b/tests/comic_filenames.py @@ -397,34 +397,27 @@ FNS.update( "volume": "1957", "year": "1969", }, + # CT checks for the following '(of 06)' after the '03' and marks it as the volume + "Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021).cbr": { + "ext": "cbr", + "issue": "008", + "series": "Elephantmen 2259", + "title": "Simple Truth", + "volume": "03", + "year": "2021", + "volume_count": "06", + }, + # CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series) + "Bloodshot Book 03 (2020).cbr": { + "ext": "cbr", + "issue": "03", + "series": "Bloodshot", + "title": "Book 03", + "volume": "03", + "year": "2020", + }, } ) -VOLUME = { - # CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series) - # - # Book \d is a non-popped volume not an issue - "Bloodshot Book 03 
(2020).cbr": { - "ext": "cbr", - "issue": "03", - "series": "Bloodshot", - "title": "Book 03", - "volume": "03", - "year": "2020", - }, - # CT checks for the following '(of 06)' after the '03' and marks it as the volume - # - # issue count is not popped if does not occur near issue - # \d (of \d) is volume & volume count if not issue - "Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021).cbr": { - "ext": "cbr", - "issue": "008", - "series": "Elephantmen 2259", - "title": "Simple Truth", - "volume": "03", - "year": "2021", - "volume_count": "06", - }, -} PUBLISHER = { # c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder # @@ -452,7 +445,7 @@ PUBLISHER = { }, } -# first_key, first_val = YEAR.popitem() +# first_key, first_val = VOLUME.popitem() # FNS[first_key] = first_val WONFIX = { From 241e13f2d67a201e72b759306d307efd03c094f4 Mon Sep 17 00:00:00 2001 From: AJ Slater <aj@slater.net> Date: Wed, 21 Feb 2024 13:16:30 -0800 Subject: [PATCH 29/57] update deps --- package-lock.json | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/package-lock.json b/package-lock.json index 0455f0e..bd836dd 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1780,9 +1780,9 @@ "dev": true }, "node_modules/electron-to-chromium": { - "version": "1.4.677", - "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.4.677.tgz", - "integrity": "sha512-erDa3CaDzwJOpyvfKhOiJjBVNnMM0qxHq47RheVVwsSQrgBA9ZSGV9kdaOfZDPXcHzhG7lBxhj6A7KvfLJBd6Q==", + "version": "1.4.678", + "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.4.678.tgz", + "integrity": "sha512-NbdGC2p0O5Q5iVhLEsNBSfytaw7wbEFJlIvaF71wi6QDtLAph5/rVogjyOpf/QggJIt8hNK3KdwNJnc2bzckbw==", "dev": true }, "node_modules/emoji-regex": { @@ -2739,9 +2739,9 @@ } }, "node_modules/flatted": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/flatted/-/flatted-3.3.0.tgz", - 
"integrity": "sha512-noqGuLw158+DuD9UPRKHpJ2hGxpFyDlYYrfM0mWt4XhT4n0lwzTLh70Tkdyy4kyTmyTT9Bv7bWAJqw7cgkEXDg==", + "version": "3.3.1", + "resolved": "https://registry.npmjs.org/flatted/-/flatted-3.3.1.tgz", + "integrity": "sha512-X8cqMLLie7KsNUDSdzeN8FYK9rEt4Dt67OsG/DNGnYTSDBG4uFAJFBnUeiV+zCVAvwFy56IjM9sH51jVaEhNxw==", "dev": true }, "node_modules/for-each": { @@ -3525,12 +3525,15 @@ } }, "node_modules/is-shared-array-buffer": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/is-shared-array-buffer/-/is-shared-array-buffer-1.0.2.tgz", - "integrity": "sha512-sqN2UDu1/0y6uvXyStCOzyhAjCSlHceFoMKJW8W9EU9cvic/QdsZ0kEU93HEy3IUEFZIiH/3w+AH/UQbPHNdhA==", + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/is-shared-array-buffer/-/is-shared-array-buffer-1.0.3.tgz", + "integrity": "sha512-nA2hv5XIhLR3uVzDDfCIknerhx8XUKnstuOERPNNIinXG7v9u+ohXF67vxm4TPTEPU6lm61ZkwP3c9PCB97rhg==", "dev": true, "dependencies": { - "call-bind": "^1.0.2" + "call-bind": "^1.0.7" + }, + "engines": { + "node": ">= 0.4" }, "funding": { "url": "https://github.com/sponsors/ljharb" @@ -11549,15 +11552,16 @@ } }, "node_modules/typed-array-byte-length": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/typed-array-byte-length/-/typed-array-byte-length-1.0.0.tgz", - "integrity": "sha512-Or/+kvLxNpeQ9DtSydonMxCx+9ZXOswtwJn17SNLvhptaXYDJvkFFP5zbfU/uLmvnBJlI4yrnXRxpdWH/M5tNA==", + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/typed-array-byte-length/-/typed-array-byte-length-1.0.1.tgz", + "integrity": "sha512-3iMJ9q0ao7WE9tWcaYKIptkNBuOIcZCCT0d4MRvuuH88fEoEH62IuQe0OtraD3ebQEoTRk8XCBoknUNc1Y67pw==", "dev": true, "dependencies": { - "call-bind": "^1.0.2", + "call-bind": "^1.0.7", "for-each": "^0.3.3", - "has-proto": "^1.0.1", - "is-typed-array": "^1.1.10" + "gopd": "^1.0.1", + "has-proto": "^1.0.3", + "is-typed-array": "^1.1.13" }, "engines": { "node": ">= 0.4" From 0ff6feb3ea255a42f4a1fc3a2539c9df6175360f Mon Sep 17 00:00:00 2001 From: AJ Slater 
<aj@slater.net> Date: Wed, 21 Feb 2024 16:25:39 -0800 Subject: [PATCH 30/57] all tests work --- comicfn2dict/parse.py | 55 +++++++++++++++++++++++++++++++++++--- comicfn2dict/regex.py | 57 +++++++++++++++++++++++++++++++++++++--- tests/comic_filenames.py | 54 ++++++++++++++++++------------------- 3 files changed, 132 insertions(+), 34 deletions(-) diff --git a/comicfn2dict/parse.py b/comicfn2dict/parse.py index 897dc63..3783fc6 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -7,6 +7,7 @@ from re import Pattern from typing import Any from comicfn2dict.regex import ( + ALPHA_MONTH_RANGE_RE, BOOK_VOLUME_RE, ISSUE_ANYWHERE_RE, ISSUE_BEGIN_RE, @@ -17,8 +18,13 @@ from comicfn2dict.regex import ( NON_NUMBER_DOT_RE, ORIGINAL_FORMAT_SCAN_INFO_RE, ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE, + PUBLISHER_AMBIGUOUS_RE, + PUBLISHER_UNAMBIGUOUS_RE, + PUBLISHER_AMBIGUOUS_TOKEN_RE, + PUBLISHER_UNAMBIGUOUS_TOKEN_RE, REGEX_SUBS, REMAINING_GROUP_RE, + SCAN_INFO_SECONDARY_RE, TOKEN_DELIMETER, VOLUME_RE, VOLUME_WITH_COUNT_RE, @@ -41,6 +47,8 @@ class ComicFilenameParser: if not value: return -1 if value not in self._path_indexes: + # TODO This is fragile. + # Better to get it at match time. 
if key == "ext": index = self.path.rfind(value) else: @@ -69,12 +77,32 @@ class ComicFilenameParser: value = value.strip("'").strip() return value.strip('"').strip() + def _parenthify_double_underscores(self) -> str: + """Replace double underscores with parens.""" + parts = self._unparsed_path.split("__") + num_parts = len(parts) + print(f"{num_parts=} {num_parts % 2}") + if num_parts < 3 or not num_parts % 2: + return self._unparsed_path + index = 0 + mode = " (" + parenthified = parts[index] + index += 1 + while index < len(parts): + parenthified += mode + parts[index] + print(f"{parenthified=}") + mode = ") " if mode == " (" else ") " + index += 1 + return parenthified.strip() + def _clean_dividers(self): """Replace non space dividers and clean extra spaces out of string.""" - data = self._unparsed_path + data = self._parenthify_double_underscores() + + # Simple substitutions for regex, pair in REGEX_SUBS.items(): replacement, count = pair - data = regex.sub(replacement, data, count=count) + data = regex.sub(replacement, data, count=count).strip() self._unparsed_path = data.strip() def _parse_items( @@ -91,7 +119,6 @@ class ComicFilenameParser: return matched_metadata = {} for key, value in matches.groupdict().items(): - print(f"{value=} == {exclude=}") if value == exclude: continue if not value: @@ -126,6 +153,9 @@ class ComicFilenameParser: def _parse_dates(self): """Parse date schemes.""" + # Discard second month of alpha month ranges. 
+ self._unparsed_path = ALPHA_MONTH_RANGE_RE.sub(r"\1", self._unparsed_path) + # Month first date self._parse_items(MONTH_FIRST_DATE_RE) self._alpha_month_to_numeric() @@ -248,6 +278,13 @@ class ComicFilenameParser: self._parse_items( ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE, ) + + self._parse_items(SCAN_INFO_SECONDARY_RE) + if ( + scan_info_secondary := self.metadata.pop("secondary_scan_info", "") + ) and "scan_info" not in self.metadata: + self.metadata["scan_info"] = scan_info_secondary # type: ignore + self._log_progress("AFTER PAREN TOKENS") # Series and Title @@ -269,6 +306,18 @@ class ComicFilenameParser: if "issue" not in self.metadata: self._parse_items(ISSUE_BEGIN_RE) self._log_progress("AFTER ISSUE PICKUP") + + # Publisher + # + # Pop single tokens so they don't end up titles. + self._parse_items(PUBLISHER_UNAMBIGUOUS_TOKEN_RE, first_only=True) + if "publisher" not in self.metadata: + self._parse_items(PUBLISHER_AMBIGUOUS_TOKEN_RE, first_only=True) + if "publisher" not in self.metadata: + self._parse_items(PUBLISHER_UNAMBIGUOUS_RE, pop=False, first_only=True) + if "publisher" not in self.metadata: + self._parse_items(PUBLISHER_AMBIGUOUS_RE, pop=False, first_only=True) + self._assign_remaining_groups() self._log_progress("AFTER SERIES AND TITLE") diff --git a/comicfn2dict/regex.py b/comicfn2dict/regex.py index 33b7295..f9a456e 100644 --- a/comicfn2dict/regex.py +++ b/comicfn2dict/regex.py @@ -10,6 +10,30 @@ def re_compile(exp, parenthify=False): return re.compile(exp, flags=re.IGNORECASE) +PUBLISHERS_UNAMBIGUOUS = ( + r"Abrams ComicArts", + r"BOOM! 
Studios", + r"DC(\sComics)?", + r"Dark Horse Comics", + r"Drawn & Quarterly", + r"Dynamite Entertainment", + r"IDW Publishing", + r"Icon Comics", + r"Kodansha", + r"Oni Press", + r"Pantheon Books", + r"SLG Publishing", + r"SelfMadeHero", + r"Titan Comics", +) +PUBLISHERS_AMBIGUOUS = ( + r"Marvel", + r"Heavy Metal", + r"Epic", + r"Image", + r"Mirage", +) + ORIGINAL_FORMAT_PATTERNS = ( r"Anthology", r"(One|1)[-\s]Shot", @@ -48,7 +72,7 @@ MONTHS = ( r"Jun(e)?", r"Jul(y)?", r"Aug(ust)?", - r"Sept(ember)?", + r"Sep(tember)?", r"Oct(ober)?", r"Nov(ember)?", r"Dec(ember)?", @@ -74,9 +98,19 @@ REGEX_SUBS: MappingProxyType[re.Pattern, tuple[str, int]] = MappingProxyType( ### DATES _YEAR_RE_EXP = r"(?P<year>[12]\d{3})" -_MONTH_ALPHA_RE_EXP = r"(?P<alpha_month>" + r"|".join(MONTHS) + r")\.?" +_MONTH_ALPHA_RE_EXP = r"(" + "(?P<alpha_month>" + r"|".join(MONTHS) + r")\.?" r")" _MONTH_NUMERIC_RE_EXP = r"(?P<month>0?\d|1[0-2]?)" _MONTH_RE_EXP = r"(" + _MONTH_ALPHA_RE_EXP + r"|" + _MONTH_NUMERIC_RE_EXP + r")" +_ALPHA_MONTH_RANGE = ( + r"\b" + + r"(" + r"|".join(MONTHS) + r")" + + r"(" + + r"\.?-" + + r"(" + r"|".join(MONTHS) + r")" + + r")\b" +) +print(_ALPHA_MONTH_RANGE) +ALPHA_MONTH_RANGE_RE = re_compile(_ALPHA_MONTH_RANGE) _DAY_RE_EXP = r"(?P<day>([0-2]?\d|(3)[0-1]))" _DATE_DELIM = r"[-\s]+" @@ -124,6 +158,8 @@ ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE = re_compile( r"\(" + _ORIGINAL_FORMAT_RE_EXP + r"\).*\(" + _SCAN_INFO_RE_EXP + r"\)" ) +SCAN_INFO_SECONDARY_RE = re_compile(r"\b(?P<secondary_scan_info>c2c)\b") + # ISSUE _ISSUE_RE_EXP = r"(?P<issue>\w*(½|\d+)[\.\d+]*\w*)" _ISSUE_COUNT_RE_EXP = r"\(of\s*(?P<issue_count>\d+)\)" @@ -151,7 +187,22 @@ VOLUME_WITH_COUNT_RE = re_compile( ) BOOK_VOLUME_RE = re_compile(r"(?P<title>" + r"book\s*(?P<volume>\d+)" + r")") +# Publisher +_PUBLISHER_UNAMBIGUOUS_RE_EXP = ( + r"(\b(?P<publisher>" + r"|".join(PUBLISHERS_UNAMBIGUOUS) + r")\b)" +) +_PUBLISHER_AMBIGUOUS_RE_EXP = ( + r"(\b(?P<publisher>" + r"|".join(PUBLISHERS_AMBIGUOUS) + r")\b)" +) 
+PUBLISHER_UNAMBIGUOUS_TOKEN_RE = re_compile( + r"(^|\/)" + _PUBLISHER_UNAMBIGUOUS_RE_EXP + r"($|\/)" +) +PUBLISHER_AMBIGUOUS_TOKEN_RE = re_compile( + r"(^|\/)" + _PUBLISHER_AMBIGUOUS_RE_EXP + r"($|\/)" +) +PUBLISHER_UNAMBIGUOUS_RE = re_compile(_PUBLISHER_UNAMBIGUOUS_RE_EXP) +PUBLISHER_AMBIGUOUS_RE = re_compile(_PUBLISHER_AMBIGUOUS_RE_EXP) # LONG STRINGS -REMAINING_GROUP_RE = re_compile(r"^[^\()].*[^\)]") +REMAINING_GROUP_RE = re_compile(r"^[^\(].*[^\)]") NON_NUMBER_DOT_RE = re_compile(r"(\D)\.(\D)") diff --git a/tests/comic_filenames.py b/tests/comic_filenames.py index 5e3c0e0..e85a18e 100644 --- a/tests/comic_filenames.py +++ b/tests/comic_filenames.py @@ -274,7 +274,8 @@ FNS.update( "issue": "2", "series": "Monster Island", "volume": "1", - "remainders": ("repaired c2c",), + "scan_info": "c2c", + "remainders": ("(repaired)",), }, # Extra - in the series " X-Men-V1-#067.cbr": { @@ -334,6 +335,7 @@ FNS.update( "ext": "cbr", "issue": "002", "series": "Marvel Previews", + "publisher": "Marvel", "month": "01", "year": "2022", }, @@ -416,36 +418,32 @@ FNS.update( "volume": "03", "year": "2020", }, + # c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder + "Marvel Two In One V1 #090 c2c.cbr": { + "ext": "cbr", + "issue": "090", + "series": "Marvel Two In One", + "publisher": "Marvel", + "volume": "1", + "scan_info": "c2c", + }, + # CT treats '[]' as equivalent to '()', catches DC as a publisher and 'Sep-Oct 1951' as dates and removes them. 
CT doesn't catch the digital though so that could be better but I blame whoever made this atrocious filename + "Wonder Woman #49 DC Sep-Oct 1951 digital [downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire).cbz": { + "ext": "cbz", + "issue": "49", + "series": "Wonder Woman", + "publisher": "DC", + "year": "1951", + "month": "09", + "remainders": ( + "digital (downsized, lightened, 4 missing story pages " + "restored) (Shadowcat-Empire)", + ), + }, } ) -PUBLISHER = { - # c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder - # - # 1. c2c is not a title and is an original_format - # Leading common publisher may be a publisher? Do not pop - "Marvel Two In One V1 #090 c2c.cbr": { - "ext": "cbr", - "issue": "090", - "series": "Marvel Two In One", - "publisher": "Marvel", - "volume": "1", - }, - # CT treats '[]' as equivalent to '()', catches DC as a publisher and 'Sep-Oct 1951' as dates and removes them. CT doesn't catch the digital though so that could be better but I blame whoever made this atrocious filename - # - # 1. Month-Month should be handled - # 2. DC is a common publisher, no pop? 
- "Wonder Woman #49 DC Sep-Oct 1951 digital [downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire).cbz": { - "ext": "cbz", - "issue": "49", - "series": "Wonder Woman", - "title": "digital", - "publisher": "DC", - "year": "1951", - "month": "10", - }, -} -# first_key, first_val = VOLUME.popitem() +# first_key, first_val = NEW.popitem() # FNS[first_key] = first_val WONFIX = { From 4b9015878d894ecf42c4a449cbee8e039a0eb87d Mon Sep 17 00:00:00 2001 From: AJ Slater <aj@slater.net> Date: Wed, 21 Feb 2024 17:03:42 -0800 Subject: [PATCH 31/57] replace code with regex --- comicfn2dict/parse.py | 20 +------------------- comicfn2dict/regex.py | 2 ++ 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/comicfn2dict/parse.py b/comicfn2dict/parse.py index 3783fc6..2dd0f19 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -77,27 +77,9 @@ class ComicFilenameParser: value = value.strip("'").strip() return value.strip('"').strip() - def _parenthify_double_underscores(self) -> str: - """Replace double underscores with parens.""" - parts = self._unparsed_path.split("__") - num_parts = len(parts) - print(f"{num_parts=} {num_parts % 2}") - if num_parts < 3 or not num_parts % 2: - return self._unparsed_path - index = 0 - mode = " (" - parenthified = parts[index] - index += 1 - while index < len(parts): - parenthified += mode + parts[index] - print(f"{parenthified=}") - mode = ") " if mode == " (" else ") " - index += 1 - return parenthified.strip() - def _clean_dividers(self): """Replace non space dividers and clean extra spaces out of string.""" - data = self._parenthify_double_underscores() + data = self._unparsed_path # Simple substitutions for regex, pair in REGEX_SUBS.items(): diff --git a/comicfn2dict/regex.py b/comicfn2dict/regex.py index f9a456e..c17ecd8 100644 --- a/comicfn2dict/regex.py +++ b/comicfn2dict/regex.py @@ -86,8 +86,10 @@ _SPACE_EQUIVALENT_RE = re_compile(r"_") _EXTRA_SPACES_RE = re_compile(r"\s\s+") 
_LEFT_PAREN_EQUIVALENT_RE = re_compile(r"\[") _RIGHT_PAREN_EQUIVALENT_RE = re_compile(r"\]") +_DOUBLE_UNDERSCORE_RE = re_compile(r"__(.*)__") REGEX_SUBS: MappingProxyType[re.Pattern, tuple[str, int]] = MappingProxyType( { + _DOUBLE_UNDERSCORE_RE: (r"(\1)", 0), _TOKEN_DIVIDERS_RE: (TOKEN_DELIMETER, 1), _SPACE_EQUIVALENT_RE: (r" ", 0), _EXTRA_SPACES_RE: (r" ", 0), From 439a904c5445dd9f1847c13daaa1058de2815756 Mon Sep 17 00:00:00 2001 From: AJ Slater <aj@slater.net> Date: Wed, 21 Feb 2024 18:08:50 -0800 Subject: [PATCH 32/57] full dates in serialization. remainder in square brackets. tests for serializer --- comicfn2dict/regex.py | 8 +++++-- comicfn2dict/unparse.py | 48 +++++++++++++++++++++++++++++++------- tests/comic_filenames.py | 35 +++++++++++++++++++++++++-- tests/test_comicfn2dict.py | 8 ++----- tests/test_dict2comicfn.py | 13 +++++++++++ 5 files changed, 94 insertions(+), 18 deletions(-) create mode 100644 tests/test_dict2comicfn.py diff --git a/comicfn2dict/regex.py b/comicfn2dict/regex.py index c17ecd8..dc2e987 100644 --- a/comicfn2dict/regex.py +++ b/comicfn2dict/regex.py @@ -105,10 +105,14 @@ _MONTH_NUMERIC_RE_EXP = r"(?P<month>0?\d|1[0-2]?)" _MONTH_RE_EXP = r"(" + _MONTH_ALPHA_RE_EXP + r"|" + _MONTH_NUMERIC_RE_EXP + r")" _ALPHA_MONTH_RANGE = ( r"\b" - + r"(" + r"|".join(MONTHS) + r")" + + r"(" + + r"|".join(MONTHS) + + r")" + r"(" + r"\.?-" - + r"(" + r"|".join(MONTHS) + r")" + + r"(" + + r"|".join(MONTHS) + + r")" + r")\b" ) print(_ALPHA_MONTH_RANGE) diff --git a/comicfn2dict/unparse.py b/comicfn2dict/unparse.py index a04380c..abe5fab 100644 --- a/comicfn2dict/unparse.py +++ b/comicfn2dict/unparse.py @@ -1,5 +1,8 @@ """Unparse comic filenames.""" -from collections.abc import Callable, Mapping +from collections.abc import Callable, Mapping, Sequence +from contextlib import suppress +from calendar import month_abbr +from types import MappingProxyType def issue_formatter(issue: str) -> str: @@ -18,18 +21,39 @@ _PAREN_FMT: str = "({})" 
_FILENAME_FORMAT_TAGS: tuple[tuple[str, str | Callable], ...] = ( ("series", "{}"), ("volume", "v{}"), + ("volume_count", "(of {:03})"), ("issue", issue_formatter), ("issue_count", "(of {:03})"), - ("year", _PAREN_FMT), + ("date", _PAREN_FMT), ("title", "{}"), + ("publisher", _PAREN_FMT), ("original_format", _PAREN_FMT), ("scan_info", _PAREN_FMT), ) _EMPTY_VALUES: tuple[None, str] = (None, "") _DEFAULT_EXT = "cbz" +_DATE_KEYS = ("year", "month", "day") class ComicFilenameSerializer: + def _add_date(self) -> None: + if "date" in self.metadata: + return + parts = [] + for key in _DATE_KEYS: + if part := self.metadata.get(key): + if key == "month" and not parts: + with suppress(TypeError): + part = month_abbr[int(part)] + + parts.append(part) + if key == "month" and not parts: + # noop if only day. + break + if parts: + date = "-".join(parts) + self.metadata = MappingProxyType({**self.metadata, "date": date}) + def _tokenize_tag(self, tag: str, fmt: str | Callable) -> str: val = self.metadata.get(tag) if val in _EMPTY_VALUES: @@ -38,22 +62,30 @@ class ComicFilenameSerializer: token = final_fmt.format(val).strip() return token + def _add_remainder(self) -> str: + if remainders := self.metadata.get("remainders"): + if isinstance(remainders, Sequence): + remainder = " ".join(remainders) + else: + remainder = str(remainders) + return f"[{remainder}]" + return "" + def serialize(self) -> str: """Get our preferred basename from a metadata dict.""" + self._add_date() + tokens = [] for tag, fmt in _FILENAME_FORMAT_TAGS: if token := self._tokenize_tag(tag, fmt): tokens.append(token) fn = " ".join(tokens) - if remainders := self.metadata.get("remainders"): - # TODO make token and add before join? - remainder = " ".join(remainders) - # TODO oh this is the - delineated remainder :( - fn += f" - {remainder}" + fn += self._add_remainder() if self._ext: - fn += "." 
+ self.metadata.get("ext", _DEFAULT_EXT) + ext = self.metadata.get("ext", _DEFAULT_EXT) + fn += f".{ext}" return fn diff --git a/tests/comic_filenames.py b/tests/comic_filenames.py index e85a18e..71823f6 100644 --- a/tests/comic_filenames.py +++ b/tests/comic_filenames.py @@ -1,5 +1,8 @@ """Test filenames with human parsed correct results.""" +from types import MappingProxyType + + TEST_COMIC_FIELDS = { "series": "Long Series Name", "issue": "001", @@ -30,7 +33,7 @@ TEST_COMIC_VOL_ONLY = { "ext": "cbr", } -# Working with 0.1.0 +# Tests for 0.1.0 FNS = { "Night of 1000 Wolves 001 (2013).cbz": { "series": "Night of 1000 Wolves", @@ -239,7 +242,7 @@ FNS = { }, } -# Fixed with 0.2.0 +# Tests for 0.2.0 FNS.update( { # Philosopy change regarding dashes. @@ -442,6 +445,34 @@ FNS.update( }, } ) +PARSE_FNS = MappingProxyType(FNS) + +SERIALIZE_FNS = MappingProxyType( + { + "Long Series Name #001 (2000) Title (TPB) (Releaser).cbz": TEST_COMIC_FIELDS, + "Long Series Name v1 #001 " + "(2000) Title (TPB) (Releaser & Releaser-Releaser).cbr": TEST_COMIC_VOL_ONLY, + "Series Name (2000-12-31).cbz": { + "series": "Series Name", + "year": "2000", + "month": "12", + "day": "31", + "ext": "cbz", + }, + "Series Name (2000-12).cbz": { + "series": "Series Name", + "year": "2000", + "month": "12", + "ext": "cbz", + }, + "Series Name (Dec-31).cbz": { + "series": "Series Name", + "month": "12", + "day": "31", + "ext": "cbz", + }, + } +) # first_key, first_val = NEW.popitem() # FNS[first_key] = first_val diff --git a/tests/test_comicfn2dict.py b/tests/test_comicfn2dict.py index 33f4d82..768c5c2 100644 --- a/tests/test_comicfn2dict.py +++ b/tests/test_comicfn2dict.py @@ -1,18 +1,14 @@ """Tests for filename parsing.""" from pprint import pprint -from types import MappingProxyType import pytest from deepdiff.diff import DeepDiff from comicfn2dict import ComicFilenameParser -from tests.comic_filenames import FNS - -ALL_FIELDS = frozenset({"series", "volume", "issue", "issue_count", "year", "ext"}) 
-FIELD_SCHEMA = MappingProxyType({key: None for key in ALL_FIELDS}) +from tests.comic_filenames import PARSE_FNS -@pytest.mark.parametrize("item", FNS.items()) +@pytest.mark.parametrize("item", PARSE_FNS.items()) def test_parse_filename(item): """Test filename parsing.""" fn, defined_fields = item diff --git a/tests/test_dict2comicfn.py b/tests/test_dict2comicfn.py new file mode 100644 index 0000000..787183f --- /dev/null +++ b/tests/test_dict2comicfn.py @@ -0,0 +1,13 @@ +"""Tests for filename parsing.""" +import pytest + +from comicfn2dict import ComicFilenameSerializer +from tests.comic_filenames import SERIALIZE_FNS + + +@pytest.mark.parametrize("item", SERIALIZE_FNS.items()) +def test_serialize_dict(item): + """Test metadata serialization.""" + test_fn, md = item + fn = ComicFilenameSerializer(md).serialize() + assert test_fn == fn From 9ec6c8492a9c1b5ea5558676c33f6123a4c20e96 Mon Sep 17 00:00:00 2001 From: AJ Slater <aj@slater.net> Date: Wed, 21 Feb 2024 18:10:28 -0800 Subject: [PATCH 33/57] move code --- tests/comic_filenames.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/comic_filenames.py b/tests/comic_filenames.py index 71823f6..10a631f 100644 --- a/tests/comic_filenames.py +++ b/tests/comic_filenames.py @@ -445,6 +445,9 @@ FNS.update( }, } ) + +# first_key, first_val = NEW.popitem() +# FNS[first_key] = first_val PARSE_FNS = MappingProxyType(FNS) SERIALIZE_FNS = MappingProxyType( @@ -473,10 +476,6 @@ SERIALIZE_FNS = MappingProxyType( }, } ) - -# first_key, first_val = NEW.popitem() -# FNS[first_key] = first_val - WONFIX = { # Leading issue number is usually an alternate sequence number # WONTFIX: Series names may begin with numerals. 
From 71b84e9540cebd076fd8e7a0c8b9ced54da98511 Mon Sep 17 00:00:00 2001 From: AJ Slater <aj@slater.net> Date: Wed, 21 Feb 2024 21:13:13 -0800 Subject: [PATCH 34/57] remove cruft --- comicfn2dict/regex.py | 1 - 1 file changed, 1 deletion(-) diff --git a/comicfn2dict/regex.py b/comicfn2dict/regex.py index dc2e987..55a208f 100644 --- a/comicfn2dict/regex.py +++ b/comicfn2dict/regex.py @@ -115,7 +115,6 @@ _ALPHA_MONTH_RANGE = ( + r")" + r")\b" ) -print(_ALPHA_MONTH_RANGE) ALPHA_MONTH_RANGE_RE = re_compile(_ALPHA_MONTH_RANGE) _DAY_RE_EXP = r"(?P<day>([0-2]?\d|(3)[0-1]))" From 0d2fd4b9d2e534559e23b7e34889fefd9bbe140c Mon Sep 17 00:00:00 2001 From: AJ Slater <aj@slater.net> Date: Wed, 21 Feb 2024 21:13:41 -0800 Subject: [PATCH 35/57] simple logging activeated by verbose --- comicfn2dict/cli.py | 2 ++ comicfn2dict/log.py | 9 +++++++++ comicfn2dict/parse.py | 40 +++++++++++++++++++++++----------------- comicfn2dict/unparse.py | 20 +++++++++++++++++++- 4 files changed, 53 insertions(+), 18 deletions(-) create mode 100644 comicfn2dict/log.py diff --git a/comicfn2dict/cli.py b/comicfn2dict/cli.py index e72508e..c0a7199 100755 --- a/comicfn2dict/cli.py +++ b/comicfn2dict/cli.py @@ -22,6 +22,8 @@ def main(): name = args.path.name cfnparser = ComicFilenameParser(name, verbose=args.verbose) metadata = cfnparser.parse() + if args.verbose: + print("=" * 80) pprint(metadata) # noqa:T203 diff --git a/comicfn2dict/log.py b/comicfn2dict/log.py new file mode 100644 index 0000000..3265889 --- /dev/null +++ b/comicfn2dict/log.py @@ -0,0 +1,9 @@ +"""Print log header.""" + + +def print_log_header(label: str) -> None: + """Print log header.""" + prefix = "-" * 3 + label + suffix_len = 80 - len(prefix) + suffix = "-" * suffix_len + print(prefix + suffix) diff --git a/comicfn2dict/parse.py b/comicfn2dict/parse.py index 2dd0f19..d0fa781 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -1,11 +1,11 @@ """Parse comic book archive names using the simple 'parse' parser.""" -from 
pprint import pprint +from pprint import pformat from calendar import month_abbr from copy import copy from pathlib import Path from re import Pattern from typing import Any - +from comicfn2dict.log import print_log_header from comicfn2dict.regex import ( ALPHA_MONTH_RANGE_RE, BOOK_VOLUME_RE, @@ -215,24 +215,24 @@ class ComicFilenameParser: if remainders: self.metadata["remainders"] = tuple(remainders) - def _log_progress(self, label): + def _log(self, label): if not self._debug: return - print(label + ":") + print_log_header(label) combined = {} for key in self.metadata: combined[key] = (self.metadata.get(key), self.path_index(key)) - pprint(combined) - print(self._unparsed_path) + print(" " + self._unparsed_path) + print(" " + pformat(combined)) def parse(self) -> dict[str, Any]: """Parse the filename with a hierarchy of regexes.""" # Init # - self._log_progress("INITIAL") + self._log("Init") self._parse_ext() self._clean_dividers() - self._log_progress("CLEANED") + self._log("After Clean Path") # Issue # @@ -240,15 +240,19 @@ class ComicFilenameParser: if "issue" not in self.metadata: self._parse_items(ISSUE_WITH_COUNT_RE) # self._parse_items(ISSUE_COUNT_RE) - self._log_progress("AFTER ISSUE") + self._log("After Issue") - # Volume and Date + # Volume # self._parse_items(VOLUME_RE) if "volume" not in self.metadata: self._parse_items(VOLUME_WITH_COUNT_RE) + self._log("After Volume") + + # Date + # self._parse_dates() - self._log_progress("AFTER VOLUME & DATE") + self._log("After Date") # Format & Scan Info # @@ -260,26 +264,26 @@ class ComicFilenameParser: self._parse_items( ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE, ) - self._parse_items(SCAN_INFO_SECONDARY_RE) if ( scan_info_secondary := self.metadata.pop("secondary_scan_info", "") ) and "scan_info" not in self.metadata: self.metadata["scan_info"] = scan_info_secondary # type: ignore - - self._log_progress("AFTER PAREN TOKENS") + self._log("After original_format & scan_info") # Series and Title # # Volume left on 
the end of string tokens if "volume" not in self.metadata: self._parse_items(BOOK_VOLUME_RE) + self._log("After original_format & scan_info") # Years left on the end of string tokens year_end_matched = False if "year" not in self.metadata: self._parse_items(YEAR_END_RE, pop=False) year_end_matched = "year" in self.metadata + self._log("After Year on end of token") # Issue left on the end of string tokens if "issue" not in self.metadata and not year_end_matched: @@ -287,7 +291,7 @@ class ComicFilenameParser: self._parse_items(ISSUE_END_RE, exclude=exclude) if "issue" not in self.metadata: self._parse_items(ISSUE_BEGIN_RE) - self._log_progress("AFTER ISSUE PICKUP") + self._log("After Issue on ends of tokens") # Publisher # @@ -299,20 +303,22 @@ class ComicFilenameParser: self._parse_items(PUBLISHER_UNAMBIGUOUS_RE, pop=False, first_only=True) if "publisher" not in self.metadata: self._parse_items(PUBLISHER_AMBIGUOUS_RE, pop=False, first_only=True) + self._log("After publisher") self._assign_remaining_groups() - self._log_progress("AFTER SERIES AND TITLE") + self._log("After Series & Title") # Final try for issue number. # TODO unused if "issue" not in self.metadata: self._parse_items(ISSUE_ANYWHERE_RE) - self._log_progress("AFTER ISSUE PICKUP") + self._log("AFTER ISSUE PICKUP") # Copy volume into issue if it's all we have. 
# if "issue" not in self.metadata and "volume" in self.metadata: self.metadata["issue"] = self.metadata["volume"] + self._log("After issue can be volume") self._add_remainders() diff --git a/comicfn2dict/unparse.py b/comicfn2dict/unparse.py index abe5fab..a0c4b91 100644 --- a/comicfn2dict/unparse.py +++ b/comicfn2dict/unparse.py @@ -3,6 +3,7 @@ from collections.abc import Callable, Mapping, Sequence from contextlib import suppress from calendar import month_abbr from types import MappingProxyType +from comicfn2dict.log import print_log_header def issue_formatter(issue: str) -> str: @@ -36,7 +37,17 @@ _DATE_KEYS = ("year", "month", "day") class ComicFilenameSerializer: + """Serialize Comic Filenames from dict.""" + + def _log(self, label, fn): + """Log progress.""" + if not self._debug: + return + print_log_header(label) + print(fn) + def _add_date(self) -> None: + """Construct date from Y-m-D if they exist.""" if "date" in self.metadata: return parts = [] @@ -52,9 +63,11 @@ class ComicFilenameSerializer: break if parts: date = "-".join(parts) + self._log("After date", date) self.metadata = MappingProxyType({**self.metadata, "date": date}) def _tokenize_tag(self, tag: str, fmt: str | Callable) -> str: + """Add tags to the string.""" val = self.metadata.get(tag) if val in _EMPTY_VALUES: return "" @@ -63,6 +76,7 @@ class ComicFilenameSerializer: return token def _add_remainder(self) -> str: + """Add the remainders specially.""" if remainders := self.metadata.get("remainders"): if isinstance(remainders, Sequence): remainder = " ".join(remainders) @@ -79,19 +93,23 @@ class ComicFilenameSerializer: for tag, fmt in _FILENAME_FORMAT_TAGS: if token := self._tokenize_tag(tag, fmt): tokens.append(token) + self._log(f"After {tag}", tokens) fn = " ".join(tokens) fn += self._add_remainder() + self._log("After remainder", fn) if self._ext: ext = self.metadata.get("ext", _DEFAULT_EXT) fn += f".{ext}" + self._log("After ext", fn) return fn - def __init__(self, metadata: Mapping, 
ext: bool = True): + def __init__(self, metadata: Mapping, ext: bool = True, verbose: int = 0): self.metadata: Mapping = metadata self._ext: bool = ext + self._debug: bool = bool(verbose) def dict2comicfn(md: Mapping, ext: bool = True) -> str: From 4f42e0afad4d687b7947fb1bdf8af80bed54e782 Mon Sep 17 00:00:00 2001 From: AJ Slater <aj@slater.net> Date: Wed, 21 Feb 2024 21:16:03 -0800 Subject: [PATCH 36/57] minor code cleanup --- comicfn2dict/parse.py | 5 +++-- comicfn2dict/unparse.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/comicfn2dict/parse.py b/comicfn2dict/parse.py index d0fa781..9e621a3 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -337,6 +337,7 @@ class ComicFilenameParser: self._path_indexes: dict[str, int] = {} -def comicfn2dict(path: str | Path): +def comicfn2dict(path: str | Path, verbose: int = 0): """Simple API.""" - return ComicFilenameParser(path).parse() + parser = ComicFilenameParser(path, verbose=verbose) + return parser.parse() diff --git a/comicfn2dict/unparse.py b/comicfn2dict/unparse.py index a0c4b91..351a115 100644 --- a/comicfn2dict/unparse.py +++ b/comicfn2dict/unparse.py @@ -112,6 +112,7 @@ class ComicFilenameSerializer: self._debug: bool = bool(verbose) -def dict2comicfn(md: Mapping, ext: bool = True) -> str: +def dict2comicfn(md: Mapping, ext: bool = True, verbose: int = 0) -> str: """Simple API.""" - return ComicFilenameSerializer(md, ext=ext).serialize() + serializer = ComicFilenameSerializer(md, ext=ext, verbose=verbose) + return serializer.serialize() From 83cca4a84636b4fe202b8b0db7a85ab07fd8ab43 Mon Sep 17 00:00:00 2001 From: AJ Slater <aj@slater.net> Date: Wed, 21 Feb 2024 21:18:32 -0800 Subject: [PATCH 37/57] cleanup cruft --- tests/comic_filenames.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/tests/comic_filenames.py b/tests/comic_filenames.py index 10a631f..3c92ee0 100644 --- a/tests/comic_filenames.py +++ b/tests/comic_filenames.py @@ -476,20 +476,3 @@ 
SERIALIZE_FNS = MappingProxyType( }, } ) -WONFIX = { - # Leading issue number is usually an alternate sequence number - # WONTFIX: Series names may begin with numerals. - "52 action comics #2024.cbz": { - "ext": "cbz", - "issue": "2024", - "series": "action comics", - "alternate": "52", - }, - # Only the issue number. CT ensures that the series always has a value if possible - # I don't think making the series the same as the number is valuable. - "#52.cbz": { - "ext": "cbz", - "issue": "52", - "series": "52", - }, -} From f61fe41850ca107088dd0402287011b6feaedab4 Mon Sep 17 00:00:00 2001 From: AJ Slater <aj@slater.net> Date: Fri, 23 Feb 2024 17:35:13 -0800 Subject: [PATCH 38/57] remove unused issue pickup --- comicfn2dict/parse.py | 7 --- comicfn2dict/regex.py | 4 -- package-lock.json | 54 ++++++++++---------- poetry.lock | 115 +++++++++++++++++++++--------------------- 4 files changed, 85 insertions(+), 95 deletions(-) diff --git a/comicfn2dict/parse.py b/comicfn2dict/parse.py index 9e621a3..e49f903 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -9,7 +9,6 @@ from comicfn2dict.log import print_log_header from comicfn2dict.regex import ( ALPHA_MONTH_RANGE_RE, BOOK_VOLUME_RE, - ISSUE_ANYWHERE_RE, ISSUE_BEGIN_RE, ISSUE_END_RE, ISSUE_NUMBER_RE, @@ -308,12 +307,6 @@ class ComicFilenameParser: self._assign_remaining_groups() self._log("After Series & Title") - # Final try for issue number. - # TODO unused - if "issue" not in self.metadata: - self._parse_items(ISSUE_ANYWHERE_RE) - self._log("AFTER ISSUE PICKUP") - # Copy volume into issue if it's all we have. # if "issue" not in self.metadata and "volume" in self.metadata: diff --git a/comicfn2dict/regex.py b/comicfn2dict/regex.py index 55a208f..5e4444c 100644 --- a/comicfn2dict/regex.py +++ b/comicfn2dict/regex.py @@ -174,13 +174,9 @@ ISSUE_NUMBER_RE = re_compile( ISSUE_WITH_COUNT_RE = re_compile( r"(\(?" + _ISSUE_RE_EXP + r"\)?" 
+ r"\W*" + _ISSUE_COUNT_RE_EXP + r")" ) - ISSUE_END_RE = re_compile(r"([\/\s]\(?" + _ISSUE_RE_EXP + r"\)?(\/|$))") ISSUE_BEGIN_RE = re_compile(r"((^|\/)\(?" + _ISSUE_RE_EXP + r"\)?[\/|\s])") -# TODO unused -ISSUE_ANYWHERE_RE = re_compile(r"\b(\(?" + _ISSUE_RE_EXP + r"\)?)\b") - # Volume _VOLUME_COUNT_RE_EXP = r"\(of\s*(?P<volume_count>\d+)\)" VOLUME_RE = re_compile( diff --git a/package-lock.json b/package-lock.json index bd836dd..7572c4a 100644 --- a/package-lock.json +++ b/package-lock.json @@ -442,9 +442,9 @@ } }, "node_modules/@eslint/js": { - "version": "8.56.0", - "resolved": "https://registry.npmjs.org/@eslint/js/-/js-8.56.0.tgz", - "integrity": "sha512-gMsVel9D7f2HLkBma9VbtzZRehRogVRfbr++f06nL2vnCGCNlzOD+/MUov/F4p8myyAHspEhVobgjpX64q5m6A==", + "version": "8.57.0", + "resolved": "https://registry.npmjs.org/@eslint/js/-/js-8.57.0.tgz", + "integrity": "sha512-Ys+3g2TaW7gADOJzPt83SJtCDhMjndcDMFVQ/Tj9iA1BfJzFKD9mAUXT3OenpuPHbI6P/myECxRJrofUsDx/5g==", "dev": true, "engines": { "node": "^12.22.0 || ^14.17.0 || >=16.0.0" @@ -811,9 +811,9 @@ "dev": true }, "node_modules/@types/estree-jsx": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/@types/estree-jsx/-/estree-jsx-1.0.4.tgz", - "integrity": "sha512-5idy3hvI9lAMqsyilBM+N+boaCf1MgoefbDxN6KEO5aK17TOHwFAYT9sjxzeKAiIWRUBgLxmZ9mPcnzZXtTcRQ==", + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/@types/estree-jsx/-/estree-jsx-1.0.5.tgz", + "integrity": "sha512-52CcUVNFyfb1A2ALocQw/Dd1BQFNmSdkuC3BkZ6iqhdMfQz7JWOFRuJFloOzjk+6WijU56m9oKXFAXc7o3Towg==", "dev": true, "dependencies": { "@types/estree": "*" @@ -856,9 +856,9 @@ "dev": true }, "node_modules/@types/node": { - "version": "20.11.19", - "resolved": "https://registry.npmjs.org/@types/node/-/node-20.11.19.tgz", - "integrity": "sha512-7xMnVEcZFu0DikYjWOlRq7NTPETrm7teqUT2WkQjrTIkEgUyyGdWsj/Zg8bEJt5TNklzbPD1X3fqfsHw3SpapQ==", + "version": "20.11.20", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.11.20.tgz", + "integrity": 
"sha512-7/rR21OS+fq8IyHTgtLkDK949uzsa6n8BkziAKtPVpugIkO6D+/ooXMvzXxDnZrmtXVfjb1bKQafYpb8s89LOg==", "dev": true, "dependencies": { "undici-types": "~5.26.4" @@ -1364,9 +1364,9 @@ } }, "node_modules/caniuse-lite": { - "version": "1.0.30001588", - "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001588.tgz", - "integrity": "sha512-+hVY9jE44uKLkH0SrUTqxjxqNTOWHsbnQDIKjwkZ3lNTzUUVdBLBGXtj/q5Mp5u98r3droaZAewQuEDzjQdZlQ==", + "version": "1.0.30001589", + "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001589.tgz", + "integrity": "sha512-vNQWS6kI+q6sBlHbh71IIeC+sRwK2N3EDySc/updIGhIee2x5z00J4c1242/5/d6EpEMdOnk/m+6tuk4/tcsqg==", "dev": true, "funding": [ { @@ -1780,9 +1780,9 @@ "dev": true }, "node_modules/electron-to-chromium": { - "version": "1.4.678", - "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.4.678.tgz", - "integrity": "sha512-NbdGC2p0O5Q5iVhLEsNBSfytaw7wbEFJlIvaF71wi6QDtLAph5/rVogjyOpf/QggJIt8hNK3KdwNJnc2bzckbw==", + "version": "1.4.681", + "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.4.681.tgz", + "integrity": "sha512-1PpuqJUFWoXZ1E54m8bsLPVYwIVCRzvaL+n5cjigGga4z854abDnFRc+cTa2th4S79kyGqya/1xoR7h+Y5G5lg==", "dev": true }, "node_modules/emoji-regex": { @@ -1944,16 +1944,16 @@ } }, "node_modules/eslint": { - "version": "8.56.0", - "resolved": "https://registry.npmjs.org/eslint/-/eslint-8.56.0.tgz", - "integrity": "sha512-Go19xM6T9puCOWntie1/P997aXxFsOi37JIHRWI514Hc6ZnaHGKY9xFhrU65RT6CcBEzZoGG1e6Nq+DT04ZtZQ==", + "version": "8.57.0", + "resolved": "https://registry.npmjs.org/eslint/-/eslint-8.57.0.tgz", + "integrity": "sha512-dZ6+mexnaTIbSBZWgou51U6OmzIhYM2VcNdtiTtI7qPNZm35Akpr0f6vtw3w1Kmn5PYo+tZVfh13WrhpS6oLqQ==", "dev": true, "dependencies": { "@eslint-community/eslint-utils": "^4.2.0", "@eslint-community/regexpp": "^4.6.1", "@eslint/eslintrc": "^2.1.4", - "@eslint/js": "8.56.0", - "@humanwhocodes/config-array": "^0.11.13", + 
"@eslint/js": "8.57.0", + "@humanwhocodes/config-array": "^0.11.14", "@humanwhocodes/module-importer": "^1.0.1", "@nodelib/fs.walk": "^1.2.8", "@ungap/structured-clone": "^1.2.0", @@ -6288,12 +6288,12 @@ "dev": true }, "node_modules/prettier-plugin-packagejson": { - "version": "2.4.11", - "resolved": "https://registry.npmjs.org/prettier-plugin-packagejson/-/prettier-plugin-packagejson-2.4.11.tgz", - "integrity": "sha512-zmOmM96GkAjT2zUdHSQJnpyVpbisBkewDluo2NLHjI/JN7uOCZlEzWVaMhdqyZ8LVdQDfzamvbvSw4swd3Az1A==", + "version": "2.4.12", + "resolved": "https://registry.npmjs.org/prettier-plugin-packagejson/-/prettier-plugin-packagejson-2.4.12.tgz", + "integrity": "sha512-hifuuOgw5rHHTdouw9VrhT8+Nd7UwxtL1qco8dUfd4XUFQL6ia3xyjSxhPQTsGnSYFraTWy5Omb+MZm/OWDTpQ==", "dev": true, "dependencies": { - "sort-package-json": "2.7.0", + "sort-package-json": "2.8.0", "synckit": "0.9.0" }, "peerDependencies": { @@ -11134,9 +11134,9 @@ "dev": true }, "node_modules/sort-package-json": { - "version": "2.7.0", - "resolved": "https://registry.npmjs.org/sort-package-json/-/sort-package-json-2.7.0.tgz", - "integrity": "sha512-6AayF8bp6L+WROgpbhTMUtB9JSFmpGHjmW7DyaNPS1HwlTw2oSVlUUtlkHSEZmg5o89F3zvLBZNvMeZ1T4fjQg==", + "version": "2.8.0", + "resolved": "https://registry.npmjs.org/sort-package-json/-/sort-package-json-2.8.0.tgz", + "integrity": "sha512-PxeNg93bTJWmDGnu0HADDucoxfFiKkIr73Kv85EBThlI1YQPdc0XovBgg2llD0iABZbu2SlKo8ntGmOP9wOj/g==", "dev": true, "dependencies": { "detect-indent": "^7.0.1", diff --git a/poetry.lock b/poetry.lock index efd8604..0e6736b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -61,63 +61,63 @@ files = [ [[package]] name = "coverage" -version = "7.4.2" +version = "7.4.3" description = "Code coverage measurement for Python" optional = false python-versions = ">=3.8" files = [ - { file = "coverage-7.4.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bf54c3e089179d9d23900e3efc86d46e4431188d9a657f345410eecdd0151f50" }, - { file = 
"coverage-7.4.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fe6e43c8b510719b48af7db9631b5fbac910ade4bd90e6378c85ac5ac706382c" }, - { file = "coverage-7.4.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b98c89db1b150d851a7840142d60d01d07677a18f0f46836e691c38134ed18b" }, - { file = "coverage-7.4.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c5f9683be6a5b19cd776ee4e2f2ffb411424819c69afab6b2db3a0a364ec6642" }, - { file = "coverage-7.4.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78cdcbf7b9cb83fe047ee09298e25b1cd1636824067166dc97ad0543b079d22f" }, - { file = "coverage-7.4.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:2599972b21911111114100d362aea9e70a88b258400672626efa2b9e2179609c" }, - { file = "coverage-7.4.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ef00d31b7569ed3cb2036f26565f1984b9fc08541731ce01012b02a4c238bf03" }, - { file = "coverage-7.4.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:20a875bfd8c282985c4720c32aa05056f77a68e6d8bbc5fe8632c5860ee0b49b" }, - { file = "coverage-7.4.2-cp310-cp310-win32.whl", hash = "sha256:b3f2b1eb229f23c82898eedfc3296137cf1f16bb145ceab3edfd17cbde273fb7" }, - { file = "coverage-7.4.2-cp310-cp310-win_amd64.whl", hash = "sha256:7df95fdd1432a5d2675ce630fef5f239939e2b3610fe2f2b5bf21fa505256fa3" }, - { file = "coverage-7.4.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a8ddbd158e069dded57738ea69b9744525181e99974c899b39f75b2b29a624e2" }, - { file = "coverage-7.4.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81a5fb41b0d24447a47543b749adc34d45a2cf77b48ca74e5bf3de60a7bd9edc" }, - { file = "coverage-7.4.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2412e98e70f16243be41d20836abd5f3f32edef07cbf8f407f1b6e1ceae783ac" }, - { file = 
"coverage-7.4.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ddb79414c15c6f03f56cc68fa06994f047cf20207c31b5dad3f6bab54a0f66ef" }, - { file = "coverage-7.4.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cf89ab85027427d351f1de918aff4b43f4eb5f33aff6835ed30322a86ac29c9e" }, - { file = "coverage-7.4.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a178b7b1ac0f1530bb28d2e51f88c0bab3e5949835851a60dda80bff6052510c" }, - { file = "coverage-7.4.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:06fe398145a2e91edaf1ab4eee66149c6776c6b25b136f4a86fcbbb09512fd10" }, - { file = "coverage-7.4.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:18cac867950943fe93d6cd56a67eb7dcd2d4a781a40f4c1e25d6f1ed98721a55" }, - { file = "coverage-7.4.2-cp311-cp311-win32.whl", hash = "sha256:f72cdd2586f9a769570d4b5714a3837b3a59a53b096bb954f1811f6a0afad305" }, - { file = "coverage-7.4.2-cp311-cp311-win_amd64.whl", hash = "sha256:d779a48fac416387dd5673fc5b2d6bd903ed903faaa3247dc1865c65eaa5a93e" }, - { file = "coverage-7.4.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:adbdfcda2469d188d79771d5696dc54fab98a16d2ef7e0875013b5f56a251047" }, - { file = "coverage-7.4.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ac4bab32f396b03ebecfcf2971668da9275b3bb5f81b3b6ba96622f4ef3f6e17" }, - { file = "coverage-7.4.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:006d220ba2e1a45f1de083d5022d4955abb0aedd78904cd5a779b955b019ec73" }, - { file = "coverage-7.4.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3733545eb294e5ad274abe131d1e7e7de4ba17a144505c12feca48803fea5f64" }, - { file = "coverage-7.4.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:42a9e754aa250fe61f0f99986399cec086d7e7a01dd82fd863a20af34cbce962" }, - { file = "coverage-7.4.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:2ed37e16cf35c8d6e0b430254574b8edd242a367a1b1531bd1adc99c6a5e00fe" }, - { file = "coverage-7.4.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:b953275d4edfab6cc0ed7139fa773dfb89e81fee1569a932f6020ce7c6da0e8f" }, - { file = "coverage-7.4.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:32b4ab7e6c924f945cbae5392832e93e4ceb81483fd6dc4aa8fb1a97b9d3e0e1" }, - { file = "coverage-7.4.2-cp312-cp312-win32.whl", hash = "sha256:f5df76c58977bc35a49515b2fbba84a1d952ff0ec784a4070334dfbec28a2def" }, - { file = "coverage-7.4.2-cp312-cp312-win_amd64.whl", hash = "sha256:34423abbaad70fea9d0164add189eabaea679068ebdf693baa5c02d03e7db244" }, - { file = "coverage-7.4.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5b11f9c6587668e495cc7365f85c93bed34c3a81f9f08b0920b87a89acc13469" }, - { file = "coverage-7.4.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:51593a1f05c39332f623d64d910445fdec3d2ac2d96b37ce7f331882d5678ddf" }, - { file = "coverage-7.4.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69f1665165ba2fe7614e2f0c1aed71e14d83510bf67e2ee13df467d1c08bf1e8" }, - { file = "coverage-7.4.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3c8bbb95a699c80a167478478efe5e09ad31680931ec280bf2087905e3b95ec" }, - { file = "coverage-7.4.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:175f56572f25e1e1201d2b3e07b71ca4d201bf0b9cb8fad3f1dfae6a4188de86" }, - { file = "coverage-7.4.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:8562ca91e8c40864942615b1d0b12289d3e745e6b2da901d133f52f2d510a1e3" }, - { file = "coverage-7.4.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:d9a1ef0f173e1a19738f154fb3644f90d0ada56fe6c9b422f992b04266c55d5a" }, - { file = 
"coverage-7.4.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f40ac873045db4fd98a6f40387d242bde2708a3f8167bd967ccd43ad46394ba2" }, - { file = "coverage-7.4.2-cp38-cp38-win32.whl", hash = "sha256:d1b750a8409bec61caa7824bfd64a8074b6d2d420433f64c161a8335796c7c6b" }, - { file = "coverage-7.4.2-cp38-cp38-win_amd64.whl", hash = "sha256:b4ae777bebaed89e3a7e80c4a03fac434a98a8abb5251b2a957d38fe3fd30088" }, - { file = "coverage-7.4.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3ff7f92ae5a456101ca8f48387fd3c56eb96353588e686286f50633a611afc95" }, - { file = "coverage-7.4.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:861d75402269ffda0b33af94694b8e0703563116b04c681b1832903fac8fd647" }, - { file = "coverage-7.4.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3507427d83fa961cbd73f11140f4a5ce84208d31756f7238d6257b2d3d868405" }, - { file = "coverage-7.4.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bf711d517e21fb5bc429f5c4308fbc430a8585ff2a43e88540264ae87871e36a" }, - { file = "coverage-7.4.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c00e54f0bd258ab25e7f731ca1d5144b0bf7bec0051abccd2bdcff65fa3262c9" }, - { file = "coverage-7.4.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f8e845d894e39fb53834da826078f6dc1a933b32b1478cf437007367efaf6f6a" }, - { file = "coverage-7.4.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:840456cb1067dc350af9080298c7c2cfdddcedc1cb1e0b30dceecdaf7be1a2d3" }, - { file = "coverage-7.4.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c11ca2df2206a4e3e4c4567f52594637392ed05d7c7fb73b4ea1c658ba560265" }, - { file = "coverage-7.4.2-cp39-cp39-win32.whl", hash = "sha256:3ff5bdb08d8938d336ce4088ca1a1e4b6c8cd3bef8bb3a4c0eb2f37406e49643" }, - { file = "coverage-7.4.2-cp39-cp39-win_amd64.whl", hash = "sha256:ac9e95cefcf044c98d4e2c829cd0669918585755dd9a92e28a1a7012322d0a95" }, - { 
file = "coverage-7.4.2-pp38.pp39.pp310-none-any.whl", hash = "sha256:f593a4a90118d99014517c2679e04a4ef5aee2d81aa05c26c734d271065efcb6" }, - { file = "coverage-7.4.2.tar.gz", hash = "sha256:1a5ee18e3a8d766075ce9314ed1cb695414bae67df6a4b0805f5137d93d6f1cb" }, + { file = "coverage-7.4.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8580b827d4746d47294c0e0b92854c85a92c2227927433998f0d3320ae8a71b6" }, + { file = "coverage-7.4.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:718187eeb9849fc6cc23e0d9b092bc2348821c5e1a901c9f8975df0bc785bfd4" }, + { file = "coverage-7.4.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:767b35c3a246bcb55b8044fd3a43b8cd553dd1f9f2c1eeb87a302b1f8daa0524" }, + { file = "coverage-7.4.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae7f19afe0cce50039e2c782bff379c7e347cba335429678450b8fe81c4ef96d" }, + { file = "coverage-7.4.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba3a8aaed13770e970b3df46980cb068d1c24af1a1968b7818b69af8c4347efb" }, + { file = "coverage-7.4.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:ee866acc0861caebb4f2ab79f0b94dbfbdbfadc19f82e6e9c93930f74e11d7a0" }, + { file = "coverage-7.4.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:506edb1dd49e13a2d4cac6a5173317b82a23c9d6e8df63efb4f0380de0fbccbc" }, + { file = "coverage-7.4.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fd6545d97c98a192c5ac995d21c894b581f1fd14cf389be90724d21808b657e2" }, + { file = "coverage-7.4.3-cp310-cp310-win32.whl", hash = "sha256:f6a09b360d67e589236a44f0c39218a8efba2593b6abdccc300a8862cffc2f94" }, + { file = "coverage-7.4.3-cp310-cp310-win_amd64.whl", hash = "sha256:18d90523ce7553dd0b7e23cbb28865db23cddfd683a38fb224115f7826de78d0" }, + { file = "coverage-7.4.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:cbbe5e739d45a52f3200a771c6d2c7acf89eb2524890a4a3aa1a7fa0695d2a47" }, + { file = "coverage-7.4.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:489763b2d037b164846ebac0cbd368b8a4ca56385c4090807ff9fad817de4113" }, + { file = "coverage-7.4.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:451f433ad901b3bb00184d83fd83d135fb682d780b38af7944c9faeecb1e0bfe" }, + { file = "coverage-7.4.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fcc66e222cf4c719fe7722a403888b1f5e1682d1679bd780e2b26c18bb648cdc" }, + { file = "coverage-7.4.3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b3ec74cfef2d985e145baae90d9b1b32f85e1741b04cd967aaf9cfa84c1334f3" }, + { file = "coverage-7.4.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:abbbd8093c5229c72d4c2926afaee0e6e3140de69d5dcd918b2921f2f0c8baba" }, + { file = "coverage-7.4.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:35eb581efdacf7b7422af677b92170da4ef34500467381e805944a3201df2079" }, + { file = "coverage-7.4.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:8249b1c7334be8f8c3abcaaa996e1e4927b0e5a23b65f5bf6cfe3180d8ca7840" }, + { file = "coverage-7.4.3-cp311-cp311-win32.whl", hash = "sha256:cf30900aa1ba595312ae41978b95e256e419d8a823af79ce670835409fc02ad3" }, + { file = "coverage-7.4.3-cp311-cp311-win_amd64.whl", hash = "sha256:18c7320695c949de11a351742ee001849912fd57e62a706d83dfc1581897fa2e" }, + { file = "coverage-7.4.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b51bfc348925e92a9bd9b2e48dad13431b57011fd1038f08316e6bf1df107d10" }, + { file = "coverage-7.4.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d6cdecaedea1ea9e033d8adf6a0ab11107b49571bbb9737175444cea6eb72328" }, + { file = "coverage-7.4.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:3b2eccb883368f9e972e216c7b4c7c06cabda925b5f06dde0650281cb7666a30" }, + { file = "coverage-7.4.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6c00cdc8fa4e50e1cc1f941a7f2e3e0f26cb2a1233c9696f26963ff58445bac7" }, + { file = "coverage-7.4.3-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b9a4a8dd3dcf4cbd3165737358e4d7dfbd9d59902ad11e3b15eebb6393b0446e" }, + { file = "coverage-7.4.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:062b0a75d9261e2f9c6d071753f7eef0fc9caf3a2c82d36d76667ba7b6470003" }, + { file = "coverage-7.4.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:ebe7c9e67a2d15fa97b77ea6571ce5e1e1f6b0db71d1d5e96f8d2bf134303c1d" }, + { file = "coverage-7.4.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:c0a120238dd71c68484f02562f6d446d736adcc6ca0993712289b102705a9a3a" }, + { file = "coverage-7.4.3-cp312-cp312-win32.whl", hash = "sha256:37389611ba54fd6d278fde86eb2c013c8e50232e38f5c68235d09d0a3f8aa352" }, + { file = "coverage-7.4.3-cp312-cp312-win_amd64.whl", hash = "sha256:d25b937a5d9ffa857d41be042b4238dd61db888533b53bc76dc082cb5a15e914" }, + { file = "coverage-7.4.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:28ca2098939eabab044ad68850aac8f8db6bf0b29bc7f2887d05889b17346454" }, + { file = "coverage-7.4.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:280459f0a03cecbe8800786cdc23067a8fc64c0bd51dc614008d9c36e1659d7e" }, + { file = "coverage-7.4.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c0cdedd3500e0511eac1517bf560149764b7d8e65cb800d8bf1c63ebf39edd2" }, + { file = "coverage-7.4.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9a9babb9466fe1da12417a4aed923e90124a534736de6201794a3aea9d98484e" }, + { file = 
"coverage-7.4.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dec9de46a33cf2dd87a5254af095a409ea3bf952d85ad339751e7de6d962cde6" }, + { file = "coverage-7.4.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:16bae383a9cc5abab9bb05c10a3e5a52e0a788325dc9ba8499e821885928968c" }, + { file = "coverage-7.4.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:2c854ce44e1ee31bda4e318af1dbcfc929026d12c5ed030095ad98197eeeaed0" }, + { file = "coverage-7.4.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:ce8c50520f57ec57aa21a63ea4f325c7b657386b3f02ccaedeccf9ebe27686e1" }, + { file = "coverage-7.4.3-cp38-cp38-win32.whl", hash = "sha256:708a3369dcf055c00ddeeaa2b20f0dd1ce664eeabde6623e516c5228b753654f" }, + { file = "coverage-7.4.3-cp38-cp38-win_amd64.whl", hash = "sha256:1bf25fbca0c8d121a3e92a2a0555c7e5bc981aee5c3fdaf4bb7809f410f696b9" }, + { file = "coverage-7.4.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3b253094dbe1b431d3a4ac2f053b6d7ede2664ac559705a704f621742e034f1f" }, + { file = "coverage-7.4.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:77fbfc5720cceac9c200054b9fab50cb2a7d79660609200ab83f5db96162d20c" }, + { file = "coverage-7.4.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6679060424faa9c11808598504c3ab472de4531c571ab2befa32f4971835788e" }, + { file = "coverage-7.4.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4af154d617c875b52651dd8dd17a31270c495082f3d55f6128e7629658d63765" }, + { file = "coverage-7.4.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8640f1fde5e1b8e3439fe482cdc2b0bb6c329f4bb161927c28d2e8879c6029ee" }, + { file = "coverage-7.4.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:69b9f6f66c0af29642e73a520b6fed25ff9fd69a25975ebe6acb297234eda501" }, + { file = "coverage-7.4.3-cp39-cp39-musllinux_1_1_i686.whl", hash 
= "sha256:0842571634f39016a6c03e9d4aba502be652a6e4455fadb73cd3a3a49173e38f" }, + { file = "coverage-7.4.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a78ed23b08e8ab524551f52953a8a05d61c3a760781762aac49f8de6eede8c45" }, + { file = "coverage-7.4.3-cp39-cp39-win32.whl", hash = "sha256:c0524de3ff096e15fcbfe8f056fdb4ea0bf497d584454f344d59fce069d3e6e9" }, + { file = "coverage-7.4.3-cp39-cp39-win_amd64.whl", hash = "sha256:0209a6369ccce576b43bb227dc8322d8ef9e323d089c6f3f26a597b09cb4d2aa" }, + { file = "coverage-7.4.3-pp38.pp39.pp310-none-any.whl", hash = "sha256:7cbde573904625509a3f37b6fecea974e363460b556a627c60dc2f47e2fffa51" }, + { file = "coverage-7.4.3.tar.gz", hash = "sha256:276f6077a5c61447a48d133ed13e759c09e62aff0dc84274a68dc18660104d52" }, ] [package.dependencies] @@ -817,13 +817,13 @@ files = [ [[package]] name = "setuptools" -version = "69.1.0" +version = "69.1.1" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.8" files = [ - { file = "setuptools-69.1.0-py3-none-any.whl", hash = "sha256:c054629b81b946d63a9c6e732bc8b2513a7c3ea645f11d0139a2191d735c60c6" }, - { file = "setuptools-69.1.0.tar.gz", hash = "sha256:850894c4195f09c4ed30dba56213bf7c3f21d86ed6bdaafb5df5972593bfc401" }, + { file = "setuptools-69.1.1-py3-none-any.whl", hash = "sha256:02fa291a0471b3a18b2b2481ed902af520c69e8ae0919c13da936542754b4c56" }, + { file = "setuptools-69.1.1.tar.gz", hash = "sha256:5c0806c7d9af348e6dd3777b4f4dbb42c7ad85b190104837488eab9a7c945cf8" }, ] [package.extras] @@ -850,6 +850,7 @@ testing = [ "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", + "packaging (>=23.2)", "pip (>=19.1)", "pytest (>=6)", "pytest-checkdocs (>=2.4)", @@ -870,7 +871,7 @@ testing-integration = [ "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", - "packaging (>=23.1)", + "packaging (>=23.2)", "pytest", "pytest-enabler", "pytest-xdist", From 
551804ad5010936a0823666f62239427981233c6 Mon Sep 17 00:00:00 2001 From: AJ Slater <aj@slater.net> Date: Fri, 23 Feb 2024 17:37:16 -0800 Subject: [PATCH 39/57] remove unused strip --- comicfn2dict/parse.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/comicfn2dict/parse.py b/comicfn2dict/parse.py index e49f903..856db94 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -106,8 +106,7 @@ class ComicFilenameParser: if require_all: return continue - # TODO idk if strip is necessary here - matched_metadata[key] = self._grouping_operators_strip(value) + matched_metadata[key] = value if first_only: break self.metadata.update(matched_metadata) From 7d98a8cea64f191921fb5732e7b798df09133349 Mon Sep 17 00:00:00 2001 From: AJ Slater <aj@slater.net> Date: Fri, 23 Feb 2024 17:39:02 -0800 Subject: [PATCH 40/57] move function --- comicfn2dict/parse.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/comicfn2dict/parse.py b/comicfn2dict/parse.py index 856db94..c31ba6d 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -47,7 +47,7 @@ class ComicFilenameParser: return -1 if value not in self._path_indexes: # TODO This is fragile. - # Better to get it at match time. + # Can I get it at match time? 
if key == "ext": index = self.path.rfind(value) else: @@ -67,15 +67,6 @@ class ComicFilenameParser: self.metadata["ext"] = ext self._unparsed_path = data - def _grouping_operators_strip(self, value: str) -> str: - """Strip spaces and parens.""" - value = value.strip() - value = value.strip("()").strip() - value = value.strip("-").strip() - value = value.strip(",").strip() - value = value.strip("'").strip() - return value.strip('"').strip() - def _clean_dividers(self): """Replace non space dividers and clean extra spaces out of string.""" data = self._unparsed_path @@ -174,6 +165,15 @@ class ComicFilenameParser: break return title_ok or not other_tokens_exist + def _grouping_operators_strip(self, value: str) -> str: + """Strip spaces and parens.""" + value = value.strip() + value = value.strip("()").strip() + value = value.strip("-").strip() + value = value.strip(",").strip() + value = value.strip("'").strip() + return value.strip('"').strip() + def _assign_remaining_groups(self): """Assign series and title.""" if not self._unparsed_path: From 51cb5eac7e07ef08687d9f5ade2fce2c64aa4661 Mon Sep 17 00:00:00 2001 From: AJ Slater <aj@slater.net> Date: Fri, 23 Feb 2024 17:41:42 -0800 Subject: [PATCH 41/57] add class comment --- comicfn2dict/parse.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/comicfn2dict/parse.py b/comicfn2dict/parse.py index c31ba6d..0978c0f 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -38,6 +38,8 @@ _DATE_KEYS = frozenset({"year", "month", "day"}) class ComicFilenameParser: + """Parse a filename metadata into a dict.""" + def path_index(self, key: str): """Lazily retrieve and memoize the key's location in the path.""" if key == "remainders": From e15feb587d7358fabbbfe71d3ab7734603c83720 Mon Sep 17 00:00:00 2001 From: AJ Slater <aj@slater.net> Date: Fri, 23 Feb 2024 17:52:25 -0800 Subject: [PATCH 42/57] comment on path_index() fragility --- comicfn2dict/parse.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff 
--git a/comicfn2dict/parse.py b/comicfn2dict/parse.py index 0978c0f..74af443 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -48,8 +48,8 @@ class ComicFilenameParser: if not value: return -1 if value not in self._path_indexes: - # TODO This is fragile. - # Can I get it at match time? + # XXX This is fragile, but it's difficult to calculate the original + # position at match time from the ever changing _unparsed_path. if key == "ext": index = self.path.rfind(value) else: From b57899d954e1685e829e825007a52a0aa395ea19 Mon Sep 17 00:00:00 2001 From: AJ Slater <aj@slater.net> Date: Fri, 23 Feb 2024 18:01:07 -0800 Subject: [PATCH 43/57] break up parsed items function --- comicfn2dict/parse.py | 59 ++++++++++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 21 deletions(-) diff --git a/comicfn2dict/parse.py b/comicfn2dict/parse.py index 74af443..b75578a 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -3,7 +3,7 @@ from pprint import pformat from calendar import month_abbr from copy import copy from pathlib import Path -from re import Pattern +from re import Match, Pattern from typing import Any from comicfn2dict.log import print_log_header from comicfn2dict.regex import ( @@ -79,6 +79,36 @@ class ComicFilenameParser: data = regex.sub(replacement, data, count=count).strip() self._unparsed_path = data.strip() + def _parse_items_update_metadata( + self, matches: Match, exclude: str, require_all: bool, first_only: bool + ) -> bool: + """Update Metadata.""" + matched_metadata = {} + for key, value in matches.groupdict().items(): + if value == exclude: + continue + if not value: + if require_all: + return False + continue + matched_metadata[key] = value + if first_only: + break + if not matched_metadata: + return False + self.metadata.update(matched_metadata) + return True + + def _parse_items_pop_tokens(self, regex: Pattern, first_only: bool) -> None: + """Pop tokens from unparsed path.""" + count = 1 if first_only else 0 + 
marked_str = regex.sub(TOKEN_DELIMETER, self._unparsed_path, count=count) + parts = [] + for part in marked_str.split(TOKEN_DELIMETER): + if token := part.strip(): + parts.append(token) + self._unparsed_path = TOKEN_DELIMETER.join(parts) + def _parse_items( self, regex: Pattern, @@ -88,31 +118,18 @@ class ComicFilenameParser: pop: bool = True, ) -> None: """Parse a value from the data list into metadata and alter the data list.""" + # Match matches = regex.search(self._unparsed_path) if not matches: return - matched_metadata = {} - for key, value in matches.groupdict().items(): - if value == exclude: - continue - if not value: - if require_all: - return - continue - matched_metadata[key] = value - if first_only: - break - self.metadata.update(matched_metadata) - if not matched_metadata or not pop: + if not self._parse_items_update_metadata( + matches, exclude, require_all, first_only + ): return - count = 1 if first_only else 0 - marked_str = regex.sub(TOKEN_DELIMETER, self._unparsed_path, count=count) - parts = [] - for part in marked_str.split(TOKEN_DELIMETER): - if token := part.strip(): - parts.append(token) - self._unparsed_path = TOKEN_DELIMETER.join(parts) + + if pop: + self._parse_items_pop_tokens(regex, first_only) def _alpha_month_to_numeric(self): """Translate alpha_month to numeric month.""" From 4466fa67233c91463747a9574f5f125d5a7c4acf Mon Sep 17 00:00:00 2001 From: AJ Slater <aj@slater.net> Date: Fri, 23 Feb 2024 18:15:53 -0800 Subject: [PATCH 44/57] break up parse method and sort methods --- comicfn2dict/parse.py | 173 +++++++++++++++++++++--------------------- 1 file changed, 85 insertions(+), 88 deletions(-) diff --git a/comicfn2dict/parse.py b/comicfn2dict/parse.py index b75578a..5df8506 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -57,6 +57,16 @@ class ComicFilenameParser: self._path_indexes[value] = index return self._path_indexes[value] + def _log(self, label): + if not self._debug: + return + print_log_header(label) + 
combined = {} + for key in self.metadata: + combined[key] = (self.metadata.get(key), self.path_index(key)) + print(" " + self._unparsed_path) + print(" " + pformat(combined)) + def _parse_ext(self): """Pop the extension from the pathname.""" path = Path(self._unparsed_path) @@ -78,6 +88,7 @@ class ComicFilenameParser: replacement, count = pair data = regex.sub(replacement, data, count=count).strip() self._unparsed_path = data.strip() + self._log("After Clean Path") def _parse_items_update_metadata( self, matches: Match, exclude: str, require_all: bool, first_only: bool @@ -131,6 +142,20 @@ class ComicFilenameParser: if pop: self._parse_items_pop_tokens(regex, first_only) + def _parse_issue(self): + """Parse Issue.""" + self._parse_items(ISSUE_NUMBER_RE) + if "issue" not in self.metadata: + self._parse_items(ISSUE_WITH_COUNT_RE) + self._log("After Issue") + + def _parse_volume(self): + """Parse Volume.""" + self._parse_items(VOLUME_RE) + if "volume" not in self.metadata: + self._parse_items(VOLUME_WITH_COUNT_RE) + self._log("After Volume") + def _alpha_month_to_numeric(self): """Translate alpha_month to numeric month.""" if alpha_month := self.metadata.pop("alpha_month", ""): @@ -165,6 +190,58 @@ class ComicFilenameParser: self._parse_items(YEAR_TOKEN_RE) if self.metadata.get("year", "") != volume: self.metadata["volume"] = volume + self._log("After Date") + + def _parse_format_and_scan_info(self): + # Format & Scan Info + # + self._parse_items( + ORIGINAL_FORMAT_SCAN_INFO_RE, + require_all=True, + ) + if "original_format" not in self.metadata: + self._parse_items( + ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE, + ) + self._parse_items(SCAN_INFO_SECONDARY_RE) + if ( + scan_info_secondary := self.metadata.pop("secondary_scan_info", "") + ) and "scan_info" not in self.metadata: + self.metadata["scan_info"] = scan_info_secondary # type: ignore + self._log("After original_format & scan_info") + + def _parse_ends_of_remaining_tokens(self): + # Volume left on the end of string 
tokens + if "volume" not in self.metadata: + self._parse_items(BOOK_VOLUME_RE) + self._log("After original_format & scan_info") + + # Years left on the end of string tokens + year_end_matched = False + if "year" not in self.metadata: + self._parse_items(YEAR_END_RE, pop=False) + year_end_matched = "year" in self.metadata + self._log("After Year on end of token") + + # Issue left on the end of string tokens + if "issue" not in self.metadata and not year_end_matched: + exclude: str = self.metadata.get("year", "") # type: ignore + self._parse_items(ISSUE_END_RE, exclude=exclude) + if "issue" not in self.metadata: + self._parse_items(ISSUE_BEGIN_RE) + self._log("After Issue on ends of tokens") + + def _parse_publisher(self): + """Parse Publisher.""" + # Pop single tokens so they don't end up titles. + self._parse_items(PUBLISHER_UNAMBIGUOUS_TOKEN_RE, first_only=True) + if "publisher" not in self.metadata: + self._parse_items(PUBLISHER_AMBIGUOUS_TOKEN_RE, first_only=True) + if "publisher" not in self.metadata: + self._parse_items(PUBLISHER_UNAMBIGUOUS_RE, pop=False, first_only=True) + if "publisher" not in self.metadata: + self._parse_items(PUBLISHER_AMBIGUOUS_RE, pop=False, first_only=True) + self._log("After publisher") def _is_title_in_position(self, value): """Does the title come after series and one other token if they exist.""" @@ -193,7 +270,7 @@ class ComicFilenameParser: value = value.strip("'").strip() return value.strip('"').strip() - def _assign_remaining_groups(self): + def _parse_series_and_title(self): """Assign series and title.""" if not self._unparsed_path: return @@ -221,6 +298,7 @@ class ComicFilenameParser: unused_tokens.append(token) self._unparsed_path = " ".join(unused_tokens) if unused_tokens else "" + self._log("After Series & Title") def _add_remainders(self): """Add Remainders.""" @@ -232,101 +310,20 @@ class ComicFilenameParser: if remainders: self.metadata["remainders"] = tuple(remainders) - def _log(self, label): - if not self._debug: - 
return - print_log_header(label) - combined = {} - for key in self.metadata: - combined[key] = (self.metadata.get(key), self.path_index(key)) - print(" " + self._unparsed_path) - print(" " + pformat(combined)) - def parse(self) -> dict[str, Any]: """Parse the filename with a hierarchy of regexes.""" - # Init - # self._log("Init") self._parse_ext() self._clean_dividers() - self._log("After Clean Path") - - # Issue - # - self._parse_items(ISSUE_NUMBER_RE) - if "issue" not in self.metadata: - self._parse_items(ISSUE_WITH_COUNT_RE) - # self._parse_items(ISSUE_COUNT_RE) - self._log("After Issue") - - # Volume - # - self._parse_items(VOLUME_RE) - if "volume" not in self.metadata: - self._parse_items(VOLUME_WITH_COUNT_RE) - self._log("After Volume") - - # Date - # + self._parse_issue() + self._parse_volume() self._parse_dates() - self._log("After Date") - - # Format & Scan Info - # - self._parse_items( - ORIGINAL_FORMAT_SCAN_INFO_RE, - require_all=True, - ) - if "original_format" not in self.metadata: - self._parse_items( - ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE, - ) - self._parse_items(SCAN_INFO_SECONDARY_RE) - if ( - scan_info_secondary := self.metadata.pop("secondary_scan_info", "") - ) and "scan_info" not in self.metadata: - self.metadata["scan_info"] = scan_info_secondary # type: ignore - self._log("After original_format & scan_info") - - # Series and Title - # - # Volume left on the end of string tokens - if "volume" not in self.metadata: - self._parse_items(BOOK_VOLUME_RE) - self._log("After original_format & scan_info") - - # Years left on the end of string tokens - year_end_matched = False - if "year" not in self.metadata: - self._parse_items(YEAR_END_RE, pop=False) - year_end_matched = "year" in self.metadata - self._log("After Year on end of token") - - # Issue left on the end of string tokens - if "issue" not in self.metadata and not year_end_matched: - exclude: str = self.metadata.get("year", "") # type: ignore - self._parse_items(ISSUE_END_RE, 
exclude=exclude) - if "issue" not in self.metadata: - self._parse_items(ISSUE_BEGIN_RE) - self._log("After Issue on ends of tokens") - - # Publisher - # - # Pop single tokens so they don't end up titles. - self._parse_items(PUBLISHER_UNAMBIGUOUS_TOKEN_RE, first_only=True) - if "publisher" not in self.metadata: - self._parse_items(PUBLISHER_AMBIGUOUS_TOKEN_RE, first_only=True) - if "publisher" not in self.metadata: - self._parse_items(PUBLISHER_UNAMBIGUOUS_RE, pop=False, first_only=True) - if "publisher" not in self.metadata: - self._parse_items(PUBLISHER_AMBIGUOUS_RE, pop=False, first_only=True) - self._log("After publisher") - - self._assign_remaining_groups() - self._log("After Series & Title") + self._parse_format_and_scan_info() + self._parse_ends_of_remaining_tokens() + self._parse_publisher() + self._parse_series_and_title() # Copy volume into issue if it's all we have. - # if "issue" not in self.metadata and "volume" in self.metadata: self.metadata["issue"] = self.metadata["volume"] self._log("After issue can be volume") From 95ceefd0fea2184ff3b48664ee7a517a7f84b04e Mon Sep 17 00:00:00 2001 From: AJ Slater <aj@slater.net> Date: Fri, 23 Feb 2024 18:23:08 -0800 Subject: [PATCH 45/57] update docs --- README.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 57b5f90..d88bdef 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,17 @@ pip install comicfn2dict ## API -look at `comicfn2dict/comicfn2dict.py` +<!-- eslint-skip --> + +```python +from comicfn2dict import comicfn2dict, dict2comicfn + +path = "Comic Series #001 Title (2024).cbz" + +metadata: dict[str, str| tuple[str,...]] = comicfn2dict(path, verbose=0) + +filename: str = dict2comicfn(metadata, bool=True, verbose=0) +``` ## CLI From 2500fa351bcbb5e4f029310c4014ae770d07bcea Mon Sep 17 00:00:00 2001 From: AJ Slater <aj@slater.net> Date: Fri, 23 Feb 2024 18:29:45 -0800 Subject: [PATCH 46/57] circleci build scripts --- .circleci/config.yml | 60 
++++++++++++++++++++++++++++++++++++++ Dockerfile | 20 +++++++++++++ bin/docker-compose-exit.sh | 6 ++++ debian.sources | 11 +++++++ docker-compose.yaml | 21 +++++++++++++ pyproject.toml | 2 +- 6 files changed, 119 insertions(+), 1 deletion(-) create mode 100644 .circleci/config.yml create mode 100644 Dockerfile create mode 100755 bin/docker-compose-exit.sh create mode 100644 debian.sources create mode 100644 docker-compose.yaml diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 0000000..65e2777 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,60 @@ +jobs: + build: + machine: + image: ubuntu-2204:current + environment: + DOCKER_CLI_EXPERIMENTAL: enabled + DOCKER_BUILDKIT: 1 + steps: + - checkout + - run: + command: docker compose build comicfn2dict-builder + name: Build Builder + - run: + command: ./bin/docker-compose-exit.sh comicfn2dict-lint + name: comicfn2dict Lint + - run: + command: ./bin/docker-compose-exit.sh comicfn2dict-test + name: comicfn2dict Test + - store_test_results: + path: test-results/pytest + - store_artifacts: + path: test-results/coverage + - run: + command: ./bin/docker-compose-exit.sh comicfn2dict-build + name: Build comicfn2dict Dist + - persist_to_workspace: + paths: + - ./README.md + - ./bin + - ./dist + - ./pyproject.toml + root: . + deploy: + docker: + - image: cimg/python:3.12.1 + steps: + - attach_workspace: + at: . 
+ - run: + command: ./bin/publish-pypi.sh +version: 2.1 +workflows: + main: + jobs: + - build: + filters: + branches: + only: + - develop + - pre-release + - main + - deploy: + filters: + branches: + only: + - pre-release + - main + requires: + - build + version: 2.1 diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..28b7b97 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,20 @@ +FROM python:3.12.1-bookworm +LABEL maintainer="AJ Slater <aj@slater.net>" + +COPY debian.sources /etc/apt/sources.list.d/ +# hadolint ignore=DL3008 +RUN apt-get clean \ + && apt-get update \ + && apt-get install --no-install-recommends -y \ + bash \ + npm \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY bin ./bin +COPY package.json package-lock.json pyproject.toml poetry.lock Makefile ./ +RUN make install-all + +COPY . . diff --git a/bin/docker-compose-exit.sh b/bin/docker-compose-exit.sh new file mode 100755 index 0000000..e2ad011 --- /dev/null +++ b/bin/docker-compose-exit.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# Run a docker compose service and return its exit code +set -euo pipefail +SERVICE=$1 +# docker compose without the dash doesn't have the exit-code-from param +docker compose up --exit-code-from "$SERVICE" "$SERVICE" diff --git a/debian.sources b/debian.sources new file mode 100644 index 0000000..0780fac --- /dev/null +++ b/debian.sources @@ -0,0 +1,11 @@ +Types: deb +URIs: http://deb.debian.org/debian +Suites: bookworm bookworm-updates +Components: main contrib non-free +Signed-By: /usr/share/keyrings/debian-archive-keyring.gpg + +Types: deb +URIs: http://deb.debian.org/debian-security +Suites: bookworm-security +Components: main contrib non-free +Signed-By: /usr/share/keyrings/debian-archive-keyring.gpg diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 0000000..c10d215 --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,21 @@ +services: + comicfn2dict-builder: + build: . 
+ image: comicfn2dict-builder + container_name: comicfn2dict-builder + comicfn2dict-lint: + image: comicfn2dict-builder + container_name: comicfn2dict-lint + command: make lint + comicfn2dict-test: + image: comicfn2dict-builder + container_name: comicfn2dict-test + command: make test + volumes: + - ./test-results/:/app/test-results/ + comicfn2dict-build: + image: comicfn2dict-builder + container_name: comicfn2dict-build + volumes: + - ./dist/:/app/dist/ + command: poetry build diff --git a/pyproject.toml b/pyproject.toml index 5f662e4..fb23c4d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "comicfn2dict" -version = "0.2.0" +version = "0.2.0a0" description = "Parse common comic filenames and return a dict of metadata attributes. Includes a cli." license = "GPL-3.0-only" authors = ["AJ Slater <aj@slater.net>"] From 9c052298a3aace830b9e2b7cde57897f3f320a35 Mon Sep 17 00:00:00 2001 From: AJ Slater <aj@slater.net> Date: Fri, 23 Feb 2024 19:04:45 -0800 Subject: [PATCH 47/57] install npm in install deps --- Makefile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 519844f..577ebac 100644 --- a/Makefile +++ b/Makefile @@ -2,27 +2,25 @@ ## Update pip and install poetry pip install --upgrade pip pip install --upgrade poetry + npm install .PHONY: install ## Install for production ## @category Install install-prod: install-deps poetry install --no-root --only-root - npm install .PHONY: install-dev ## Install dev requirements ## @category Install install-dev: install-deps poetry install --no-root --only-root --with dev - npm install .PHONY: install-all ## Install with all extras ## @category Install install-all: install-deps poetry install --no-root --all-extras - npm install .PHONY: clean ## Clean pycaches From 51f184e546b911e58a525004478afaecb9d35bd8 Mon Sep 17 00:00:00 2001 From: AJ Slater <aj@slater.net> Date: Fri, 23 Feb 2024 19:06:35 -0800 
Subject: [PATCH 48/57] fix install-deps in makefile --- Dockerfile | 2 +- Makefile | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 28b7b97..135775c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,6 +15,6 @@ WORKDIR /app COPY bin ./bin COPY package.json package-lock.json pyproject.toml poetry.lock Makefile ./ -RUN make install-all +RUN make install-deps install-all COPY . . diff --git a/Makefile b/Makefile index 577ebac..84ad134 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,7 @@ .PHONY: install-deps ## Update pip and install poetry +## @category Install +install-deps: pip install --upgrade pip pip install --upgrade poetry npm install From 76691dfb998da22422dfc1c6786b34bfc6765336 Mon Sep 17 00:00:00 2001 From: AJ Slater <aj@slater.net> Date: Fri, 23 Feb 2024 19:07:10 -0800 Subject: [PATCH 49/57] revert dockerfile to just use install-all" --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 135775c..28b7b97 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,6 +15,6 @@ WORKDIR /app COPY bin ./bin COPY package.json package-lock.json pyproject.toml poetry.lock Makefile ./ -RUN make install-deps install-all +RUN make install-all COPY . . From 8608d4e805ea2888bef82d67b32bb988ad1ec337 Mon Sep 17 00:00:00 2001 From: AJ Slater <aj@slater.net> Date: Fri, 23 Feb 2024 19:09:59 -0800 Subject: [PATCH 50/57] bump news --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index f002b1d..91c10e4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -11,6 +11,7 @@ the volume number. - ComicFilenameParser and ComicFilenameSerializer classes are available as well as the old function API. 
+- New test cases thanks to @lordwelch & @bpepple ## v0.1.4 From 014c7191cd3c3f1c38c9551b7bca1cc7e9b67164 Mon Sep 17 00:00:00 2001 From: AJ Slater <aj@slater.net> Date: Fri, 23 Feb 2024 19:20:36 -0800 Subject: [PATCH 51/57] add publish script --- bin/publish-pypi.sh | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100755 bin/publish-pypi.sh diff --git a/bin/publish-pypi.sh b/bin/publish-pypi.sh new file mode 100755 index 0000000..7bf553d --- /dev/null +++ b/bin/publish-pypi.sh @@ -0,0 +1,7 @@ +#!/bin/bash +# Publish the created package +set -euo pipefail +cd "$(dirname "$0")/.." +pip3 install --upgrade pip +pip3 install --upgrade poetry +poetry publish -u "$PYPI_USER" -p "$PYPI_PASS" From 29e6068db25f88ab85a6b8a5b40563272c647eb3 Mon Sep 17 00:00:00 2001 From: AJ Slater <aj@slater.net> Date: Fri, 23 Feb 2024 19:40:39 -0800 Subject: [PATCH 52/57] restore ORIGINAL_FORMAT_RE --- comicfn2dict/regex.py | 2 ++ pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/comicfn2dict/regex.py b/comicfn2dict/regex.py index 5e4444c..9168438 100644 --- a/comicfn2dict/regex.py +++ b/comicfn2dict/regex.py @@ -156,6 +156,8 @@ _SCAN_INFO_RE_EXP = r"(?P<scan_info>[^()]*)" _ORIGINAL_FORMAT_SCAN_INFO_RE_EXP = ( _ORIGINAL_FORMAT_RE_EXP + r"\s*[\(:-]" + _SCAN_INFO_RE_EXP # + r")?" ) +# Keep this even though comicfn2dict doesn't use it directly +ORIGINAL_FORMAT_RE = re_compile(_ORIGINAL_FORMAT_RE_EXP, parenthify=True) ORIGINAL_FORMAT_SCAN_INFO_RE = re_compile( _ORIGINAL_FORMAT_SCAN_INFO_RE_EXP, parenthify=True ) diff --git a/pyproject.toml b/pyproject.toml index fb23c4d..49f7a9c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "comicfn2dict" -version = "0.2.0a0" +version = "0.2.0a1" description = "Parse common comic filenames and return a dict of metadata attributes. Includes a cli." 
license = "GPL-3.0-only" authors = ["AJ Slater <aj@slater.net>"] From d3b11d6361b055d7e7ae8596ef5461d44a6dbbae Mon Sep 17 00:00:00 2001 From: AJ Slater <aj@slater.net> Date: Fri, 23 Feb 2024 22:16:51 -0800 Subject: [PATCH 53/57] cast date and remainder parts as strings --- comicfn2dict/unparse.py | 2 ++ pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/comicfn2dict/unparse.py b/comicfn2dict/unparse.py index 351a115..ee65211 100644 --- a/comicfn2dict/unparse.py +++ b/comicfn2dict/unparse.py @@ -62,6 +62,7 @@ class ComicFilenameSerializer: # noop if only day. break if parts: + parts = (str(part) for part in parts) date = "-".join(parts) self._log("After date", date) self.metadata = MappingProxyType({**self.metadata, "date": date}) @@ -79,6 +80,7 @@ class ComicFilenameSerializer: """Add the remainders specially.""" if remainders := self.metadata.get("remainders"): if isinstance(remainders, Sequence): + remainders = (str(remainder) for remainder in remainders) remainder = " ".join(remainders) else: remainder = str(remainders) diff --git a/pyproject.toml b/pyproject.toml index 49f7a9c..5c8d950 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "comicfn2dict" -version = "0.2.0a1" +version = "0.2.0a2" description = "Parse common comic filenames and return a dict of metadata attributes. Includes a cli." license = "GPL-3.0-only" authors = ["AJ Slater <aj@slater.net>"] From 7694a3e2fdba3b412730e8c4dd616d24acb788dd Mon Sep 17 00:00:00 2001 From: AJ Slater <aj@slater.net> Date: Sat, 24 Feb 2024 18:21:07 -0800 Subject: [PATCH 54/57] enforce title position limits. reduce parse_series_and_title complexity. add type hints. 
--- NEWS.md | 2 + comicfn2dict/parse.py | 101 +++++++++++++++++++++++---------------- comicfn2dict/unparse.py | 4 +- pyproject.toml | 2 +- tests/comic_filenames.py | 40 ++++++++++++++-- 5 files changed, 99 insertions(+), 50 deletions(-) diff --git a/NEWS.md b/NEWS.md index 91c10e4..01d79b0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,6 +12,8 @@ - ComicFilenameParser and ComicFilenameSerializer classes are available as well as the old function API. - New test cases thanks to @lordwelch & @bpepple +- Titles must come after series and one other token, but before format and scan + info. ## v0.1.4 diff --git a/comicfn2dict/parse.py b/comicfn2dict/parse.py index 5df8506..a754d22 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -4,7 +4,7 @@ from calendar import month_abbr from copy import copy from pathlib import Path from re import Match, Pattern -from typing import Any +from sys import maxsize from comicfn2dict.log import print_log_header from comicfn2dict.regex import ( ALPHA_MONTH_RANGE_RE, @@ -32,21 +32,22 @@ from comicfn2dict.regex import ( YEAR_TOKEN_RE, ) -_REMAINING_GROUP_KEYS = ("series", "title") -_TITLE_PRECEDING_KEYS = ("issue", "year", "volume") _DATE_KEYS = frozenset({"year", "month", "day"}) +_REMAINING_GROUP_KEYS = ("series", "title") +# Ordered by commonness. +_TITLE_PRECEDING_KEYS = ("issue", "year", "volume", "month") class ComicFilenameParser: """Parse a filename metadata into a dict.""" - def path_index(self, key: str): + def path_index(self, key: str, default: int = -1) -> int: """Lazily retrieve and memoize the key's location in the path.""" if key == "remainders": - return -1 + return default value: str = self.metadata.get(key, "") # type: ignore if not value: - return -1 + return default if value not in self._path_indexes: # XXX This is fragile, but it's difficult to calculate the original # position at match time from the ever changing _unparsed_path. 
@@ -57,7 +58,7 @@ class ComicFilenameParser: self._path_indexes[value] = index return self._path_indexes[value] - def _log(self, label): + def _log(self, label: str) -> None: if not self._debug: return print_log_header(label) @@ -67,7 +68,7 @@ class ComicFilenameParser: print(" " + self._unparsed_path) print(" " + pformat(combined)) - def _parse_ext(self): + def _parse_ext(self) -> None: """Pop the extension from the pathname.""" path = Path(self._unparsed_path) suffix = path.suffix @@ -79,7 +80,7 @@ class ComicFilenameParser: self.metadata["ext"] = ext self._unparsed_path = data - def _clean_dividers(self): + def _clean_dividers(self) -> None: """Replace non space dividers and clean extra spaces out of string.""" data = self._unparsed_path @@ -142,21 +143,21 @@ class ComicFilenameParser: if pop: self._parse_items_pop_tokens(regex, first_only) - def _parse_issue(self): + def _parse_issue(self) -> None: """Parse Issue.""" self._parse_items(ISSUE_NUMBER_RE) if "issue" not in self.metadata: self._parse_items(ISSUE_WITH_COUNT_RE) self._log("After Issue") - def _parse_volume(self): + def _parse_volume(self) -> None: """Parse Volume.""" self._parse_items(VOLUME_RE) if "volume" not in self.metadata: self._parse_items(VOLUME_WITH_COUNT_RE) self._log("After Volume") - def _alpha_month_to_numeric(self): + def _alpha_month_to_numeric(self) -> None: """Translate alpha_month to numeric month.""" if alpha_month := self.metadata.pop("alpha_month", ""): alpha_month = alpha_month.capitalize() # type: ignore @@ -166,7 +167,7 @@ class ComicFilenameParser: self.metadata["month"] = month break - def _parse_dates(self): + def _parse_dates(self) -> None: """Parse date schemes.""" # Discard second month of alpha month ranges. 
self._unparsed_path = ALPHA_MONTH_RANGE_RE.sub(r"\1", self._unparsed_path) @@ -192,9 +193,8 @@ class ComicFilenameParser: self.metadata["volume"] = volume self._log("After Date") - def _parse_format_and_scan_info(self): - # Format & Scan Info - # + def _parse_format_and_scan_info(self) -> None: + """Format & Scan Info.""" self._parse_items( ORIGINAL_FORMAT_SCAN_INFO_RE, require_all=True, @@ -231,7 +231,7 @@ class ComicFilenameParser: self._parse_items(ISSUE_BEGIN_RE) self._log("After Issue on ends of tokens") - def _parse_publisher(self): + def _parse_publisher(self) -> None: """Parse Publisher.""" # Pop single tokens so they don't end up titles. self._parse_items(PUBLISHER_UNAMBIGUOUS_TOKEN_RE, first_only=True) @@ -243,15 +243,19 @@ class ComicFilenameParser: self._parse_items(PUBLISHER_AMBIGUOUS_RE, pop=False, first_only=True) self._log("After publisher") - def _is_title_in_position(self, value): + def _is_at_title_position(self, value: str) -> bool: """Does the title come after series and one other token if they exist.""" title_index = self.path.find(value) - # Does a series come first. - if title_index < self.path_index("series"): + # Titles must come after series but before format and scan_info + if ( + title_index < self.path_index("series") + or title_index > self.path_index("original_format", maxsize) + or title_index > self.path_index("scan_info", maxsize) + ): return False - # If other tokens exist then they much precede the title. + # Titles must be after the series and one other token. 
title_ok = False other_tokens_exist = False for preceding_key in _TITLE_PRECEDING_KEYS: @@ -270,7 +274,28 @@ class ComicFilenameParser: value = value.strip("'").strip() return value.strip('"').strip() - def _parse_series_and_title(self): + def _parse_series_and_title_token( + self, remaining_key_index: int, tokens: list[str] + ) -> str: + """Parse one series or title token.""" + key = _REMAINING_GROUP_KEYS[remaining_key_index] + if key in self.metadata: + return "" + token = tokens.pop(0) + match = REMAINING_GROUP_RE.search(token) + if not match: + return token + value = match.group() + if key == "title": + if not self._is_at_title_position(value): + return token + value = NON_NUMBER_DOT_RE.sub(r"\1 \2", value) + value = self._grouping_operators_strip(value) + if value: + self.metadata[key] = value + return "" + + def _parse_series_and_title(self) -> None: """Assign series and title.""" if not self._unparsed_path: return @@ -279,28 +304,18 @@ class ComicFilenameParser: unused_tokens = [] tokens = self._unparsed_path.split(TOKEN_DELIMETER) while tokens and remaining_key_index < len(_REMAINING_GROUP_KEYS): - key = _REMAINING_GROUP_KEYS[remaining_key_index] - if key in self.metadata: - continue - token = tokens.pop(0) - match = REMAINING_GROUP_RE.search(token) - if match: - value = match.group() - if key == "title" and not self._is_title_in_position(value): - unused_tokens.append(token) - continue - value = self._grouping_operators_strip(value) - value = NON_NUMBER_DOT_RE.sub(r"\1 \2", value) - - self.metadata[key] = value - remaining_key_index += 1 - else: - unused_tokens.append(token) + unused_token = self._parse_series_and_title_token( + remaining_key_index, tokens + ) + if unused_token: + unused_tokens.append(unused_token) + remaining_key_index += 1 + print(f"{unused_tokens=}") self._unparsed_path = " ".join(unused_tokens) if unused_tokens else "" self._log("After Series & Title") - def _add_remainders(self): + def _add_remainders(self) -> None: """Add 
Remainders.""" remainders = [] for token in self._unparsed_path.split(TOKEN_DELIMETER): @@ -310,7 +325,7 @@ class ComicFilenameParser: if remainders: self.metadata["remainders"] = tuple(remainders) - def parse(self) -> dict[str, Any]: + def parse(self) -> dict[str, str | tuple[str, ...]]: """Parse the filename with a hierarchy of regexes.""" self._log("Init") self._parse_ext() @@ -345,7 +360,9 @@ class ComicFilenameParser: self._path_indexes: dict[str, int] = {} -def comicfn2dict(path: str | Path, verbose: int = 0): +def comicfn2dict( + path: str | Path, verbose: int = 0 +) -> dict[str, str | tuple[str, ...]]: """Simple API.""" parser = ComicFilenameParser(path, verbose=verbose) return parser.parse() diff --git a/comicfn2dict/unparse.py b/comicfn2dict/unparse.py index ee65211..7907113 100644 --- a/comicfn2dict/unparse.py +++ b/comicfn2dict/unparse.py @@ -39,7 +39,7 @@ _DATE_KEYS = ("year", "month", "day") class ComicFilenameSerializer: """Serialize Comic Filenames from dict.""" - def _log(self, label, fn): + def _log(self, label: str, fn: str) -> None: """Log progress.""" if not self._debug: return @@ -95,7 +95,7 @@ class ComicFilenameSerializer: for tag, fmt in _FILENAME_FORMAT_TAGS: if token := self._tokenize_tag(tag, fmt): tokens.append(token) - self._log(f"After {tag}", tokens) + self._log(f"After {tag}", str(tokens)) fn = " ".join(tokens) fn += self._add_remainder() diff --git a/pyproject.toml b/pyproject.toml index 5c8d950..3d63bad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "comicfn2dict" -version = "0.2.0a2" +version = "0.2.0a3" description = "Parse common comic filenames and return a dict of metadata attributes. Includes a cli." 
license = "GPL-3.0-only" authors = ["AJ Slater <aj@slater.net>"] diff --git a/tests/comic_filenames.py b/tests/comic_filenames.py index 3c92ee0..4d299f6 100644 --- a/tests/comic_filenames.py +++ b/tests/comic_filenames.py @@ -56,11 +56,6 @@ FNS = { "Long Series Name #001 (2000) Title (TPB) (Releaser).cbz": TEST_COMIC_FIELDS, "Long Series Name (2000) 001 Title (TPB) (Releaser).cbz": TEST_COMIC_FIELDS, "Long Series Name (2000) #001 Title (TPB) (Releaser).cbz": TEST_COMIC_FIELDS, - "Long Series Name v1 (2000) #001 " - "Title (TPB) (Releaser).cbz": TEST_COMIC_FIELDS_VOL, - "Long Series Name 001 (2000) (TPB-Releaser) Title.cbz": TEST_COMIC_FIELDS, - "Long Series Name Vol 1 " - "(2000) (TPB) (Releaser & Releaser-Releaser) Title.cbr": TEST_COMIC_VOL_ONLY, "Ultimate Craziness (2019) (Digital) (Friends-of-Bill).cbr": { "series": "Ultimate Craziness", "year": "2019", @@ -443,6 +438,41 @@ FNS.update( "restored) (Shadowcat-Empire)", ), }, + "Captain Science #001 (1950) The Beginning - nothing.cbz": { + "ext": "cbz", + "issue": "001", + "title": "The Beginning - nothing", + "series": "Captain Science", + "year": "1950", + }, + "Captain Science #001-cix-cbi.cbr": { + "ext": "cbr", + "issue": "001", + "series": "Captain Science", + "remainders": ("cix-cbi",), + }, + "Long Series Name v1 (2000) #001 " + "Title (TPB) (Releaser).cbz": TEST_COMIC_FIELDS_VOL, + "Long Series Name 001 (2000) (TPB-Releaser) Title.cbz": { + "series": "Long Series Name", + "issue": "001", + "year": "2000", + "original_format": "TPB", + "scan_info": "Releaser", + "remainders": ("Title",), + "ext": "cbz", + }, + "Long Series Name Vol 1 " + "(2000) (TPB) (Releaser & Releaser-Releaser) Title.cbr": { + "series": "Long Series Name", + "volume": "1", + "issue": "1", + "remainders": ("Title",), + "original_format": "TPB", + "year": "2000", + "scan_info": "Releaser & Releaser-Releaser", + "ext": "cbr", + }, } ) From 0a17bbc0d9cf0d09e7990659093e2670a67926b4 Mon Sep 17 00:00:00 2001 From: AJ Slater <aj@slater.net> 
Date: Sat, 24 Feb 2024 19:04:45 -0800 Subject: [PATCH 55/57] fix test for title and not remainders --- tests/comic_filenames.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/comic_filenames.py b/tests/comic_filenames.py index 4d299f6..b52edf9 100644 --- a/tests/comic_filenames.py +++ b/tests/comic_filenames.py @@ -449,7 +449,7 @@ FNS.update( "ext": "cbr", "issue": "001", "series": "Captain Science", - "remainders": ("cix-cbi",), + "title": "cix-cbi", }, "Long Series Name v1 (2000) #001 " "Title (TPB) (Releaser).cbz": TEST_COMIC_FIELDS_VOL, From 32f8cb0f226877193c7c5084d231b6c6e7b7001a Mon Sep 17 00:00:00 2001 From: AJ Slater <aj@slater.net> Date: Sat, 24 Feb 2024 19:40:33 -0800 Subject: [PATCH 56/57] lint and add type notations --- comicfn2dict/cli.py | 5 +-- comicfn2dict/log.py | 2 +- comicfn2dict/parse.py | 26 ++++++-------- comicfn2dict/regex.py | 76 ++++++++++++++++++++-------------------- comicfn2dict/unparse.py | 11 +++--- pyproject.toml | 4 +-- tests/comic_filenames.py | 1 - 7 files changed, 61 insertions(+), 64 deletions(-) diff --git a/comicfn2dict/cli.py b/comicfn2dict/cli.py index c0a7199..e8ab8cd 100755 --- a/comicfn2dict/cli.py +++ b/comicfn2dict/cli.py @@ -3,10 +3,11 @@ from argparse import ArgumentParser from pathlib import Path from pprint import pprint + from comicfn2dict.parse import ComicFilenameParser -def main(): +def main() -> None: """Test parser.""" description = "Comic book archive read/write tool."
parser = ArgumentParser(description=description) @@ -23,7 +24,7 @@ def main(): cfnparser = ComicFilenameParser(name, verbose=args.verbose) metadata = cfnparser.parse() if args.verbose: - print("=" * 80) + print("=" * 80) # noqa:T201 pprint(metadata) # noqa:T203 diff --git a/comicfn2dict/log.py b/comicfn2dict/log.py index 3265889..0626325 100644 --- a/comicfn2dict/log.py +++ b/comicfn2dict/log.py @@ -6,4 +6,4 @@ def print_log_header(label: str) -> None: prefix = "-" * 3 + label suffix_len = 80 - len(prefix) suffix = "-" * suffix_len - print(prefix + suffix) + print(prefix + suffix) # noqa: T201 diff --git a/comicfn2dict/parse.py b/comicfn2dict/parse.py index a754d22..0cca5af 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -1,10 +1,11 @@ """Parse comic book archive names using the simple 'parse' parser.""" -from pprint import pformat from calendar import month_abbr from copy import copy from pathlib import Path +from pprint import pformat from re import Match, Pattern from sys import maxsize + from comicfn2dict.log import print_log_header from comicfn2dict.regex import ( ALPHA_MONTH_RANGE_RE, @@ -18,8 +19,8 @@ from comicfn2dict.regex import ( ORIGINAL_FORMAT_SCAN_INFO_RE, ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE, PUBLISHER_AMBIGUOUS_RE, - PUBLISHER_UNAMBIGUOUS_RE, PUBLISHER_AMBIGUOUS_TOKEN_RE, + PUBLISHER_UNAMBIGUOUS_RE, PUBLISHER_UNAMBIGUOUS_TOKEN_RE, REGEX_SUBS, REMAINING_GROUP_RE, @@ -51,10 +52,7 @@ class ComicFilenameParser: if value not in self._path_indexes: # XXX This is fragile, but it's difficult to calculate the original # position at match time from the ever changing _unparsed_path. 
- if key == "ext": - index = self.path.rfind(value) - else: - index = self.path.find(value) + index = self.path.rfind(value) if key == "ext" else self.path.find(value) self._path_indexes[value] = index return self._path_indexes[value] @@ -65,8 +63,8 @@ class ComicFilenameParser: combined = {} for key in self.metadata: combined[key] = (self.metadata.get(key), self.path_index(key)) - print(" " + self._unparsed_path) - print(" " + pformat(combined)) + print(" " + self._unparsed_path) # noqa: T201 + print(" " + pformat(combined)) # noqa: T201 def _parse_ext(self) -> None: """Pop the extension from the pathname.""" @@ -121,7 +119,7 @@ class ComicFilenameParser: parts.append(token) self._unparsed_path = TOKEN_DELIMETER.join(parts) - def _parse_items( + def _parse_items( # noqa: PLR0913 self, regex: Pattern, require_all: bool = False, @@ -244,7 +242,7 @@ class ComicFilenameParser: self._log("After publisher") def _is_at_title_position(self, value: str) -> bool: - """Does the title come after series and one other token if they exist.""" + """Title is in correct position.""" title_index = self.path.find(value) # Titles must come after series but before format and scan_info @@ -286,9 +284,8 @@ class ComicFilenameParser: if not match: return token value = match.group() - if key == "title": - if not self._is_at_title_position(value): - return token + if key == "title" and not self._is_at_title_position(value): + return token value = NON_NUMBER_DOT_RE.sub(r"\1 \2", value) value = self._grouping_operators_strip(value) if value: @@ -311,7 +308,6 @@ class ComicFilenameParser: unused_tokens.append(unused_token) remaining_key_index += 1 - print(f"{unused_tokens=}") self._unparsed_path = " ".join(unused_tokens) if unused_tokens else "" self._log("After Series & Title") @@ -363,6 +359,6 @@ class ComicFilenameParser: def comicfn2dict( path: str | Path, verbose: int = 0 ) -> dict[str, str | tuple[str, ...]]: - """Simple API.""" + """Simplfily the API.""" parser = 
ComicFilenameParser(path, verbose=verbose) return parser.parse() diff --git a/comicfn2dict/regex.py b/comicfn2dict/regex.py index 9168438..6daee7d 100644 --- a/comicfn2dict/regex.py +++ b/comicfn2dict/regex.py @@ -1,16 +1,8 @@ """Parsing regexes.""" -import re +from re import IGNORECASE, Pattern, compile from types import MappingProxyType - -def re_compile(exp, parenthify=False): - """Compile regex with options.""" - if parenthify: - exp = r"\(" + exp + r"\)" - return re.compile(exp, flags=re.IGNORECASE) - - -PUBLISHERS_UNAMBIGUOUS = ( +PUBLISHERS_UNAMBIGUOUS: tuple[str, ...] = ( r"Abrams ComicArts", r"BOOM! Studios", r"DC(\sComics)?", @@ -26,7 +18,7 @@ PUBLISHERS_UNAMBIGUOUS = ( r"SelfMadeHero", r"Titan Comics", ) -PUBLISHERS_AMBIGUOUS = ( +PUBLISHERS_AMBIGUOUS: tuple[str, ...] = ( r"Marvel", r"Heavy Metal", r"Epic", @@ -34,7 +26,7 @@ PUBLISHERS_AMBIGUOUS = ( r"Mirage", ) -ORIGINAL_FORMAT_PATTERNS = ( +ORIGINAL_FORMAT_PATTERNS: tuple[str, ...] = ( r"Anthology", r"(One|1)[-\s]Shot", r"Annual", @@ -63,7 +55,7 @@ ORIGINAL_FORMAT_PATTERNS = ( r"Web([-\s]?(Comic|Rip))?", ) -MONTHS = ( +MONTHS: tuple[str, ...] 
= ( r"Jan(uary)?", r"Feb(ruary)?", r"Mar(ch)?", @@ -78,7 +70,15 @@ MONTHS = ( r"Dec(ember)?", ) -TOKEN_DELIMETER = r"/" +TOKEN_DELIMETER: str = r"/" + + +def re_compile(exp: str, parenthify: bool = False) -> Pattern: + """Compile regex with options.""" + if parenthify: + exp = r"\(" + exp + r"\)" + return compile(exp, flags=IGNORECASE) + # CLEAN _TOKEN_DIVIDERS_RE = re_compile(r":") @@ -87,7 +87,7 @@ _EXTRA_SPACES_RE = re_compile(r"\s\s+") _LEFT_PAREN_EQUIVALENT_RE = re_compile(r"\[") _RIGHT_PAREN_EQUIVALENT_RE = re_compile(r"\]") _DOUBLE_UNDERSCORE_RE = re_compile(r"__(.*)__") -REGEX_SUBS: MappingProxyType[re.Pattern, tuple[str, int]] = MappingProxyType( +REGEX_SUBS: MappingProxyType[Pattern, tuple[str, int]] = MappingProxyType( { _DOUBLE_UNDERSCORE_RE: (r"(\1)", 0), _TOKEN_DIVIDERS_RE: (TOKEN_DELIMETER, 1), @@ -104,7 +104,7 @@ _MONTH_ALPHA_RE_EXP = r"(" + "(?P<alpha_month>" + r"|".join(MONTHS) + r")\.?" r" _MONTH_NUMERIC_RE_EXP = r"(?P<month>0?\d|1[0-2]?)" _MONTH_RE_EXP = r"(" + _MONTH_ALPHA_RE_EXP + r"|" + _MONTH_NUMERIC_RE_EXP + r")" _ALPHA_MONTH_RANGE = ( - r"\b" + r"\b" # noqa: ISC003 + r"(" + r"|".join(MONTHS) + r")" @@ -115,7 +115,7 @@ _ALPHA_MONTH_RANGE = ( + r")" + r")\b" ) -ALPHA_MONTH_RANGE_RE = re_compile(_ALPHA_MONTH_RANGE) +ALPHA_MONTH_RANGE_RE: Pattern = re_compile(_ALPHA_MONTH_RANGE) _DAY_RE_EXP = r"(?P<day>([0-2]?\d|(3)[0-1]))" _DATE_DELIM = r"[-\s]+" @@ -144,10 +144,10 @@ _YEAR_FIRST_DATE_RE_EXP = ( + r"\b\)?)" ) -MONTH_FIRST_DATE_RE = re_compile(_MONTH_FIRST_DATE_RE_EXP) -YEAR_FIRST_DATE_RE = re_compile(_YEAR_FIRST_DATE_RE_EXP) -YEAR_TOKEN_RE = re_compile(_YEAR_RE_EXP, parenthify=True) -YEAR_END_RE = re_compile(_YEAR_RE_EXP + r"\/|$") +MONTH_FIRST_DATE_RE: Pattern = re_compile(_MONTH_FIRST_DATE_RE_EXP) +YEAR_FIRST_DATE_RE: Pattern = re_compile(_YEAR_FIRST_DATE_RE_EXP) +YEAR_TOKEN_RE: Pattern = re_compile(_YEAR_RE_EXP, parenthify=True) +YEAR_END_RE: Pattern = re_compile(_YEAR_RE_EXP + r"\/|$") # PAREN GROUPS _OF_PATTERNS = 
r"|".join(ORIGINAL_FORMAT_PATTERNS) @@ -157,38 +157,38 @@ _ORIGINAL_FORMAT_SCAN_INFO_RE_EXP = ( _ORIGINAL_FORMAT_RE_EXP + r"\s*[\(:-]" + _SCAN_INFO_RE_EXP # + r")?" ) # Keep this even though comicfn2dict doesn't use it directly -ORIGINAL_FORMAT_RE = re_compile(_ORIGINAL_FORMAT_RE_EXP, parenthify=True) -ORIGINAL_FORMAT_SCAN_INFO_RE = re_compile( +ORIGINAL_FORMAT_RE: Pattern = re_compile(_ORIGINAL_FORMAT_RE_EXP, parenthify=True) +ORIGINAL_FORMAT_SCAN_INFO_RE: Pattern = re_compile( _ORIGINAL_FORMAT_SCAN_INFO_RE_EXP, parenthify=True ) -ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE = re_compile( +ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE: Pattern = re_compile( r"\(" + _ORIGINAL_FORMAT_RE_EXP + r"\).*\(" + _SCAN_INFO_RE_EXP + r"\)" ) -SCAN_INFO_SECONDARY_RE = re_compile(r"\b(?P<secondary_scan_info>c2c)\b") +SCAN_INFO_SECONDARY_RE: Pattern = re_compile(r"\b(?P<secondary_scan_info>c2c)\b") # ISSUE _ISSUE_RE_EXP = r"(?P<issue>\w*(½|\d+)[\.\d+]*\w*)" _ISSUE_COUNT_RE_EXP = r"\(of\s*(?P<issue_count>\d+)\)" -ISSUE_NUMBER_RE = re_compile( +ISSUE_NUMBER_RE: Pattern = re_compile( r"(\(?#" + _ISSUE_RE_EXP + r"\)?)" + r"(\W*" + _ISSUE_COUNT_RE_EXP + r")?" ) -ISSUE_WITH_COUNT_RE = re_compile( +ISSUE_WITH_COUNT_RE: Pattern = re_compile( r"(\(?" + _ISSUE_RE_EXP + r"\)?" + r"\W*" + _ISSUE_COUNT_RE_EXP + r")" ) -ISSUE_END_RE = re_compile(r"([\/\s]\(?" + _ISSUE_RE_EXP + r"\)?(\/|$))") -ISSUE_BEGIN_RE = re_compile(r"((^|\/)\(?" + _ISSUE_RE_EXP + r"\)?[\/|\s])") +ISSUE_END_RE: Pattern = re_compile(r"([\/\s]\(?" + _ISSUE_RE_EXP + r"\)?(\/|$))") +ISSUE_BEGIN_RE: Pattern = re_compile(r"((^|\/)\(?" + _ISSUE_RE_EXP + r"\)?[\/|\s])") # Volume _VOLUME_COUNT_RE_EXP = r"\(of\s*(?P<volume_count>\d+)\)" -VOLUME_RE = re_compile( - r"(" + r"(?:v(?:ol(?:ume)?)?\.?)\s*(?P<volume>\d+)" +VOLUME_RE: Pattern = re_compile( + r"(" + r"(?:v(?:ol(?:ume)?)?\.?)\s*(?P<volume>\d+)" # noqa: ISC003 r"(\W*" + _VOLUME_COUNT_RE_EXP + r")?" 
+ r")" ) -VOLUME_WITH_COUNT_RE = re_compile( +VOLUME_WITH_COUNT_RE: Pattern = re_compile( r"(\(?" + r"(?P<volume>\d+)" + r"\)?" + r"\W*" + _VOLUME_COUNT_RE_EXP + r")" ) -BOOK_VOLUME_RE = re_compile(r"(?P<title>" + r"book\s*(?P<volume>\d+)" + r")") +BOOK_VOLUME_RE: Pattern = re_compile(r"(?P<title>" + r"book\s*(?P<volume>\d+)" + r")") # Publisher _PUBLISHER_UNAMBIGUOUS_RE_EXP = ( @@ -197,15 +197,15 @@ _PUBLISHER_UNAMBIGUOUS_RE_EXP = ( _PUBLISHER_AMBIGUOUS_RE_EXP = ( r"(\b(?P<publisher>" + r"|".join(PUBLISHERS_AMBIGUOUS) + r")\b)" ) -PUBLISHER_UNAMBIGUOUS_TOKEN_RE = re_compile( +PUBLISHER_UNAMBIGUOUS_TOKEN_RE: Pattern = re_compile( r"(^|\/)" + _PUBLISHER_UNAMBIGUOUS_RE_EXP + r"($|\/)" ) -PUBLISHER_AMBIGUOUS_TOKEN_RE = re_compile( +PUBLISHER_AMBIGUOUS_TOKEN_RE: Pattern = re_compile( r"(^|\/)" + _PUBLISHER_AMBIGUOUS_RE_EXP + r"($|\/)" ) -PUBLISHER_UNAMBIGUOUS_RE = re_compile(_PUBLISHER_UNAMBIGUOUS_RE_EXP) +PUBLISHER_UNAMBIGUOUS_RE: Pattern = re_compile(_PUBLISHER_UNAMBIGUOUS_RE_EXP) PUBLISHER_AMBIGUOUS_RE = re_compile(_PUBLISHER_AMBIGUOUS_RE_EXP) # LONG STRINGS -REMAINING_GROUP_RE = re_compile(r"^[^\(].*[^\)]") -NON_NUMBER_DOT_RE = re_compile(r"(\D)\.(\D)") +REMAINING_GROUP_RE: Pattern = re_compile(r"^[^\(].*[^\)]") +NON_NUMBER_DOT_RE: Pattern = re_compile(r"(\D)\.(\D)") diff --git a/comicfn2dict/unparse.py b/comicfn2dict/unparse.py index 7907113..2b454b1 100644 --- a/comicfn2dict/unparse.py +++ b/comicfn2dict/unparse.py @@ -1,8 +1,9 @@ """Unparse comic filenames.""" +from calendar import month_abbr from collections.abc import Callable, Mapping, Sequence from contextlib import suppress -from calendar import month_abbr from types import MappingProxyType + from comicfn2dict.log import print_log_header @@ -44,7 +45,7 @@ class ComicFilenameSerializer: if not self._debug: return print_log_header(label) - print(fn) + print(fn) # noqa: T201 def _add_date(self) -> None: """Construct date from Y-m-D if they exist.""" @@ -73,8 +74,7 @@ class ComicFilenameSerializer: if val in 
_EMPTY_VALUES: return "" final_fmt = fmt(val) if isinstance(fmt, Callable) else fmt - token = final_fmt.format(val).strip() - return token + return final_fmt.format(val).strip() def _add_remainder(self) -> str: """Add the remainders specially.""" @@ -109,12 +109,13 @@ class ComicFilenameSerializer: return fn def __init__(self, metadata: Mapping, ext: bool = True, verbose: int = 0): + """Initialize.""" self.metadata: Mapping = metadata self._ext: bool = ext self._debug: bool = bool(verbose) def dict2comicfn(md: Mapping, ext: bool = True, verbose: int = 0) -> str: - """Simple API.""" + """Simplify API.""" serializer = ComicFilenameSerializer(md, ext=ext, verbose=verbose) return serializer.serialize() diff --git a/pyproject.toml b/pyproject.toml index 3d63bad..0fc18b0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "comicfn2dict" -version = "0.2.0a3" +version = "0.2.0a4" description = "Parse common comic filenames and return a dict of metadata attributes. Includes a cli." 
license = "GPL-3.0-only" authors = ["AJ Slater <aj@slater.net>"] @@ -125,7 +125,7 @@ exclude = "*~,.git/*,.mypy_cache/*,.pytest_cache/*,.venv*,__pycache__/*,cache/*, extend-exclude = ["typings"] target-version = "py310" -[tool.lint.ruff] +[tool.ruff.lint] extend-ignore = [ "S101", "D203", diff --git a/tests/comic_filenames.py b/tests/comic_filenames.py index b52edf9..3d00ddd 100644 --- a/tests/comic_filenames.py +++ b/tests/comic_filenames.py @@ -2,7 +2,6 @@ from types import MappingProxyType - TEST_COMIC_FIELDS = { "series": "Long Series Name", "issue": "001", From 16d362da8a332f1f952da9b099707434d5fd39c4 Mon Sep 17 00:00:00 2001 From: AJ Slater <aj@slater.net> Date: Sun, 25 Feb 2024 01:56:28 -0800 Subject: [PATCH 57/57] exclude captain marvel from publisher detector --- comicfn2dict/regex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comicfn2dict/regex.py b/comicfn2dict/regex.py index 6daee7d..6b8ff65 100644 --- a/comicfn2dict/regex.py +++ b/comicfn2dict/regex.py @@ -19,7 +19,7 @@ PUBLISHERS_UNAMBIGUOUS: tuple[str, ...] = ( r"Titan Comics", ) PUBLISHERS_AMBIGUOUS: tuple[str, ...] = ( - r"Marvel", + r"(?<!Capt\.\s)(?<!Capt\s)(?<!Captain\s)Marvel", r"Heavy Metal", r"Epic", r"Image",