From 3ce61254dcf8ca26cb614e76d4e09b96f1ffae54 Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Mon, 19 Feb 2024 14:11:47 -0800 Subject: [PATCH] titles after tokens --- NEWS.md | 5 ++ comicfn2dict/parse.py | 103 ++++++++++++++++++----- comicfn2dict/regex.py | 2 +- pyproject.toml | 2 +- tests/comic_filenames.py | 175 +++++++++++++++++++++++++++++++++++++-- 5 files changed, 256 insertions(+), 31 deletions(-) diff --git a/NEWS.md b/NEWS.md index 65cf227..525bf58 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,10 @@ # 📰 comicfn2dict News +## v0.2.0 + +- Titles are now parsed only if they occur after the series token AND after + either issue, year or volume. + ## v0.1.4 - Require Python 3.10 diff --git a/comicfn2dict/parse.py b/comicfn2dict/parse.py index 6a75aef..f41a7f9 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -1,10 +1,10 @@ """Parse comic book archive names using the simple 'parse' parser.""" +from pprint import pprint from pathlib import Path from re import Match, Pattern from typing import Any from comicfn2dict.regex import ( - DASH_SPLIT_RE, EXTRA_SPACES_RE, ISSUE_ANYWHERE_RE, ISSUE_BEGIN_RE, @@ -26,9 +26,13 @@ from comicfn2dict.regex import ( _REMAINING_GROUP_KEYS = ("series", "title") -def _parse_ext(name: str, suffix: str, metadata: dict) -> str: +def _parse_ext(name: str | Path, metadata: dict) -> str: """Pop the extension from the pathname.""" - data = name.removesuffix(suffix) + if isinstance(name, str): + name = name.strip() + path = Path(name) + suffix = path.suffix + data = path.name.removesuffix(suffix) ext = suffix.lstrip(".") if ext: metadata["ext"] = ext @@ -43,17 +47,18 @@ def _clean_dividers(data: str) -> str: def _get_data_list(path: str | Path, metadata: dict) -> list[str]: """Prepare data list from a path or string.""" - if isinstance(path, str): - path = path.strip() - path = Path(path) - data = _parse_ext(path.name, path.suffix, metadata) + data = _parse_ext(path, metadata) data = _clean_dividers(data) - return DASH_SPLIT_RE.split(data) + return [data] -def _paren_strip(value: str) -> str: +def _grouping_operators_strip(value: str) -> str: """Strip spaces and parens.""" - return value.strip().strip("()").strip() + value = value.strip() + value = value.strip("()").strip() + value = value.strip("-").strip() + value = value.strip("'").strip('"').strip() + return value def _splicey_dicey( @@ -71,7 +76,7 @@ def _splicey_dicey( if data_after := data[match.end() :].strip(): data_ends.append(data_after) data_list[index:index] = data_ends - return _paren_strip(value) + return _grouping_operators_strip(value) def _match_original_format_and_scan_info( @@ -83,10 +88,10 @@ def _match_original_format_and_scan_info( scan_info = match.group("scan_info") except IndexError: scan_info = None - metadata["original_format"] = _paren_strip(original_format) + metadata["original_format"] = _grouping_operators_strip(original_format) match_group = 1 if scan_info: - metadata["scan_info"] = _paren_strip(scan_info) + metadata["scan_info"] = _grouping_operators_strip(scan_info) match_group = 0 _splicey_dicey(data_list, index, match, match_group=match_group) @@ -112,14 +117,16 @@ def _pop_value_from_token( regex: Pattern, key: str, index: int = 0, -) -> Match: +) -> str: """Search token for value, splice and assign to metadata.""" data = data_list[index] match = regex.search(data) if match: value = _splicey_dicey(data_list, index, match, key) metadata[key] = value - return match + else: + value = "" + return value def _parse_item( @@ -128,21 +135,25 @@ def _parse_item( regex: 
Pattern, key: str, start_index: int = 0, + path: str = "", ) -> int: """Parse a value from the data list into metadata and alter the data list.""" + path_index = -1 index = start_index dl_len = end_index = len(data_list) if index >= end_index: index = 0 while index < end_index: - match = _pop_value_from_token(data_list, metadata, regex, key, index) - if match: + value = _pop_value_from_token(data_list, metadata, regex, key, index) + if value: + if key == "issue": + path_index = path.find(value) break index += 1 if index > dl_len and start_index > 0: index = 0 end_index = start_index - return index + return path_index def _pop_issue_from_text_fields( @@ -156,7 +167,39 @@ def _pop_issue_from_text_fields( return data_list.pop(index) -def _assign_remaining_groups(data_list: list[str], metadata: dict): +TITLE_PRECEDING_KEYS = ("issue", "year", "volume") + + +def _is_title_in_position(path, value, metadata): + """Return whether the title comes after the series and one other token, if they exist.""" + # TODO this could be faster if indexes could be grabbed for these tokens + # when they are extracted. + title_index = path.find(value) + + # Does a series come first. + series = metadata.get("series") + if not series: + return False + series_index = path.find(series) + if title_index < series_index: + return False + + # If other tokens exist then they must precede the title. + title_ok = False + other_tokens_exist = False + for preceding_key in TITLE_PRECEDING_KEYS: + preceding_value = metadata.get(preceding_key) + if not preceding_value: + continue + other_tokens_exist = True + preceding_index = path.find(preceding_value) + if title_index > preceding_index: + title_ok = True + break + return title_ok or not other_tokens_exist + + +def _assign_remaining_groups(data_list: list[str], metadata: dict, path: str): """Assign series and title.""" index = 0 for key in _REMAINING_GROUP_KEYS: @@ -167,7 +210,9 @@ def _assign_remaining_groups(data_list: list[str], metadata: dict): match = REMAINING_GROUP_RE.search(data) if data else None if match: value = _pop_issue_from_text_fields(data_list, metadata, index) - value = _paren_strip(value) + if key == "title" and not _is_title_in_position(path, value, metadata): + continue + value = _grouping_operators_strip(value) if value: metadata[key] = value else: @@ -184,10 +229,17 @@ def _pickup_issue(remainders: list[str], metadata: dict) -> None: _parse_item(remainders, metadata, ISSUE_ANYWHERE_RE, "issue") +def _log_progress(label, metadata, data_list): + print(label + ":") + pprint(metadata) + pprint(data_list) + + def comicfn2dict(path: str | Path) -> dict[str, Any]: """Parse the filename with a hierarchy of regexes.""" metadata = {} data_list = _get_data_list(path, metadata) + _log_progress("INITIAL", metadata, data_list) # Parse paren tokens _parse_item(data_list, metadata, ISSUE_COUNT_RE, "issue_count") @@ -206,26 +258,33 @@ def comicfn2dict(path: str | Path) -> dict[str, Any]: "scan_info", start_index=of_index + 1, ) + _log_progress("AFTER PAREN TOKENS", metadata, data_list) # Parse regular tokens _parse_item(data_list, metadata, VOLUME_RE, "volume") - _parse_item(data_list, metadata, ISSUE_NUMBER_RE, "issue") + _parse_item(data_list, metadata, ISSUE_NUMBER_RE, "issue", path=str(path)) + _log_progress("AFTER REGULAR TOKENS", metadata, data_list) # Pickup year if not gotten.
if "year" not in metadata: _parse_item(data_list, metadata, YEAR_BEGIN_RE, "year") if "year" not in metadata: _parse_item(data_list, metadata, YEAR_END_RE, "year") + _log_progress("AFTER YEAR PICKUP", metadata, data_list) # Pickup issue if it's a standalone token if "issue" not in metadata: _parse_item(data_list, metadata, ISSUE_TOKEN_RE, "issue") + _log_progress("AFTER ISSUE PICKUP", metadata, data_list) + # Series and Title. Also looks for issue. - _assign_remaining_groups(data_list, metadata) + _assign_remaining_groups(data_list, metadata, str(path)) + _log_progress("AFTER SERIES AND TITLE", metadata, data_list) # Final try for issue number. _pickup_issue(data_list, metadata) + _log_progress("AFTER ISSUE PICKUP", metadata, data_list) # Add Remainders if data_list: diff --git a/comicfn2dict/regex.py b/comicfn2dict/regex.py index d5f8145..d49273d 100644 --- a/comicfn2dict/regex.py +++ b/comicfn2dict/regex.py @@ -72,4 +72,4 @@ ISSUE_BEGIN_RE = re_compile(r"^(" + _ISSUE_RE_EXP + r")\b") ISSUE_ANYWHERE_RE = re_compile(r"\b(" + _ISSUE_RE_EXP + r")\b") # LONG STRINGS -REMAINING_GROUP_RE = re_compile(r"^[\w].*[^\)]") +REMAINING_GROUP_RE = re_compile(r"^[^\()].*[^\)]") diff --git a/pyproject.toml b/pyproject.toml index c9299fc..5f662e4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "comicfn2dict" -version = "0.1.4" +version = "0.2.0" description = "Parse common comic filenames and return a dict of metadata attributes. Includes a cli." license = "GPL-3.0-only" authors = ["AJ Slater "] diff --git a/tests/comic_filenames.py b/tests/comic_filenames.py index 753c047..3700d65 100644 --- a/tests/comic_filenames.py +++ b/tests/comic_filenames.py @@ -80,13 +80,11 @@ FNS = { "original_format": "digital", }, "Bardude - The Last Thing I Remember.cbz": { - "series": "Bardude", - "title": "The Last Thing I Remember", + "series": "Bardude - The Last Thing I Remember", "ext": "cbz", }, "Drunkguy - The Man Without Fear - 01.cbz": { - "series": "Drunkguy", - "title": "The Man Without Fear", + "series": "Drunkguy - The Man Without Fear", "issue": "01", "ext": "cbz", }, @@ -125,9 +123,8 @@ FNS = { "scan_info": "Zone-Empire", "title": "Last Bullet", }, - "Jeremy John - A Big Long Title (2017) (digital-Minutement).cbz": { - "series": "Jeremy John", - "title": "A Big Long Title", + "Jeremy John - Not A Title (2017) (digital-Minutement).cbz": { + "series": "Jeremy John - Not A Title", "year": "2017", "ext": "cbz", "original_format": "digital", @@ -243,3 +240,167 @@ FNS = { "ext": "cbz", }, } + +FNS.update( # Newly fixed. + { + "'Batman - Superman - World's Finest 022 (2024) (Webrip) (The Last Kryptonian-DCP).cbz": { + "ext": "cbz", + "issue": "022", + "remainders": ("(The Last Kryptonian-DCP)",), + "scan_info": "Webrip", + "series": "Batman - Superman - World's Finest", + "year": "2024", + }, + } +) + +FNS.update( + { + # Issue number starting with a letter requested in https://github.com/comictagger/comictagger/issues/543 + "batman #B01 title.cbz": { + "ext": "cbz", + "issue": "B01", + "series": "batman", + "title": "title", + }, # Leading issue number is usually an alternate sequence number + "52 action comics #2024.cbz": { + "ext": "cbz", + "issue": "2024", + "series": "action comics", + "alternate": "52", + }, # 4 digit issue number + "action comics 1024.cbz": { + "ext": "cbz", + "issue": "1024", + "series": "action comics", + }, # Only the issue number. 
CT ensures that the series always has a value if possible + "#52.cbz": { + "ext": "cbz", + "issue": "52", + "series": "52", + }, # CT treats double-underscore the same as double-dash + "Monster_Island_v1_#2__repaired__c2c.cbz": { + "ext": "cbz", + "issue": "2", + "series": "Monster Island", + "volume": "1", + }, # I'm not sure there's a right way to parse this. This might also be a madeup filename I don't remember + "Super Strange Yarns (1957) #92 (1969).cbz": { + "ext": "cbz", + "issue": "92", + "series": "Super Strange Yarns", + "volume": "1957", + "year": "1969", + }, # Extra - in the series + " X-Men-V1-#067.cbr": { + "ext": "cbr", + "issue": "067", + "series": "X-Men", + "volume": "1", + }, # CT only separates this into a title if the '-' is attached to the previous word eg 'aquaman- Green Arrow'. @bpepple opened a ticket for this https://github.com/ajslater/comicfn2dict/issues/1 already + "Aquaman - Green Arrow - Deep Target #01 (of 07) (2021).cbr": { + "ext": "cbr", + "issue": "01", + "series": "Aquaman - Green Arrow - Deep Target", + "year": "2021", + "issue_count": "7", + }, + "Batman_-_Superman_#020_(2021).cbr": { + "ext": "cbr", + "issue": "020", + "series": "Batman - Superman", + "year": "2021", + }, + "Free Comic Book Day - Avengers.Hulk (2021).cbz": { + "ext": "cbz", + "series": "Free Comic Book Day - Avengers Hulk", + "year": "2021", + }, # CT assumes the volume is also the issue number if it can't find an issue number + "Avengers By Brian Michael Bendis volume 03 (2013).cbz": { + "ext": "cbz", + "issue": "3", + "series": "Avengers By Brian Michael Bendis", + "volume": "03", + "year": "2013", + }, # Publishers like to re-print some of their annuals using this format for the year + "Batman '89 (2021) .cbr": { + "ext": "cbr", + "series": "Batman '89", + "year": "2021", + }, # CT has extra processing to re-attach the year in this case + "Blade Runner Free Comic Book Day 2021 (2021).cbr": { + "ext": "cbr", + "series": "Blade Runner Free Comic Book Day 2021", + "year": "2021", + }, # CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series) + "Bloodshot Book 03 (2020).cbr": { + "ext": "cbr", + "issue": "03", + "series": "Bloodshot", + "title": "Book 03", + "volume": "03", + "year": "2020", + }, # CT checks for the following '(of 06)' after the '03' and marks it as the volume + "Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021).cbr": { + "ext": "cbr", + "issue": "008", + "series": "Elephantmen 2259", + "title": "Simple Truth", + "volume": "03", + "year": "2021", + "volume_count": "06", + }, # CT catches the year + "Marvel Previews #002 (January 2022).cbr": { + "ext": "cbr", + "issue": "002", + "series": "Marvel Previews", + "year": "2022", + }, # c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder + "Marvel Two In One V1 #090 c2c.cbr": { + "ext": "cbr", + "issue": "090", + "series": "Marvel Two In One", + "publisher": "Marvel", + "volume": "1", + }, # This made the parser in CT much more complicated. 
It's understandable that this isn't parsed on the first few iterations of this project + "Star Wars - War of the Bounty Hunters - IG-88 (2021).cbz": { + "ext": "cbz", + "series": "Star Wars - War of the Bounty Hunters - IG-88", + "year": "2021", + }, # The addition of the '#1' turns this into the same as 'Aquaman - Green Arrow - Deep Target' above + "Star Wars - War of the Bounty Hunters - IG-88 #1 (2021).cbz": { + "ext": "cbz", + "issue": "1", + "series": "Star Wars - War of the Bounty Hunters - IG-88", + "year": "2021", + }, # CT treats '[]' as equivalent to '()', catches DC as a publisher and 'Sep-Oct 1951' as dates and removes them. CT doesn't catch the digital though so that could be better but I blame whoever made this atrocious filename + "Wonder Woman #49 DC Sep-Oct 1951 digital [downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire).cbz": { + "ext": "cbz", + "issue": "49", + "series": "Wonder Woman", + "title": "digital", + "publisher": "DC", + "year": "1951", + }, # CT notices that this is a full date, CT doesn't actually return the month or day though just removes it + "X-Men, 2021-08-04 (#02).cbz": { + "ext": "cbz", + "issue": "02", + "series": "X-Men", + "year": "2021", + }, # CT treats ':' the same as '-' but here the ':' is attached to 'Now' which CT sees as a title separation + "Cory Doctorow's Futuristic Tales of the Here and Now: Anda's Game #001 (2007).cbz": { + "ext": "cbz", + "issue": "001", + "series": "Cory Doctorow's Futuristic Tales of the Here and Now", + "title": "Anda's Game", + "year": "2007", + }, # This is a contrived test case. I've never seen this I just wanted to handle it with my parser + "Cory Doctorow's Futuristic Tales of the Here and Now #0.0.1 (2007).cbz": { + "ext": "cbz", + "issue": "0.1", + "series": "Cory Doctorow's Futuristic Tales of the Here and Now", + "year": "2007", + "issue_count": "", + }, + } +)
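Below is a minimal usage sketch (not part of the patch) of how the v0.2.0 "titles only after other tokens" rule plays out, using filenames taken from the updated fixtures above. The import path assumes the comicfn2dict() entry point defined in comicfn2dict/parse.py, and with this patch applied the calls will also emit the _log_progress() debug output.

from comicfn2dict.parse import comicfn2dict

# Dashes no longer split series from title (DASH_SPLIT_RE was dropped), and this
# name has no issue, year, or volume token at all, so nothing after the series
# qualifies as a title; the whole stem stays the series.
print(comicfn2dict("Bardude - The Last Thing I Remember.cbz"))
# expected per the updated fixture: {"series": "Bardude - The Last Thing I Remember", "ext": "cbz"}

# The would-be title sits before the year token rather than after it, so it is
# likewise folded into the series.
print(comicfn2dict("Jeremy John - Not A Title (2017) (digital-Minutement).cbz"))
# expected per the updated fixture includes: series="Jeremy John - Not A Title",
# year="2017", original_format="digital", ext="cbz"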
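A second small sketch (not part of the patch) of what the REMAINING_GROUP_RE change in comicfn2dict/regex.py allows: the old pattern required a leading word character, while the new one only rejects a leading paren, which is presumably what lets the new leading-apostrophe "'Batman - Superman - World's Finest ..." fixture be treated as a remaining series group. The sample strings come from that fixture.

from re import compile as re_compile

OLD_REMAINING_GROUP_RE = re_compile(r"^[\w].*[^\)]")    # pattern before this patch
NEW_REMAINING_GROUP_RE = re_compile(r"^[^\()].*[^\)]")  # pattern after this patch

candidate = "'Batman - Superman - World's Finest"
paren_group = "(The Last Kryptonian-DCP)"

print(bool(OLD_REMAINING_GROUP_RE.search(candidate)))    # False: "'" is not a word character
print(bool(NEW_REMAINING_GROUP_RE.search(candidate)))    # True: any first character other than a paren is accepted
print(bool(NEW_REMAINING_GROUP_RE.search(paren_group)))  # False: parenthesized chunks are still skipped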