From 7694a3e2fdba3b412730e8c4dd616d24acb788dd Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Sat, 24 Feb 2024 18:21:07 -0800 Subject: [PATCH] enforce title position limits. reduce parse_series_and_title complexity. add type hints. --- NEWS.md | 2 + comicfn2dict/parse.py | 101 +++++++++++++++++++++++---------------- comicfn2dict/unparse.py | 4 +- pyproject.toml | 2 +- tests/comic_filenames.py | 40 ++++++++++++++-- 5 files changed, 99 insertions(+), 50 deletions(-) diff --git a/NEWS.md b/NEWS.md index 91c10e4..01d79b0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,6 +12,8 @@ - ComicFilenameParser and ComicFilenameSerializer classes are available as well as the old function API. - New test cases thanks to @lordwelch & @bpepple +- Titles must come after series and one other token, but before format and scan + info. ## v0.1.4 diff --git a/comicfn2dict/parse.py b/comicfn2dict/parse.py index 5df8506..a754d22 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -4,7 +4,7 @@ from calendar import month_abbr from copy import copy from pathlib import Path from re import Match, Pattern -from typing import Any +from sys import maxsize from comicfn2dict.log import print_log_header from comicfn2dict.regex import ( ALPHA_MONTH_RANGE_RE, @@ -32,21 +32,22 @@ from comicfn2dict.regex import ( YEAR_TOKEN_RE, ) -_REMAINING_GROUP_KEYS = ("series", "title") -_TITLE_PRECEDING_KEYS = ("issue", "year", "volume") _DATE_KEYS = frozenset({"year", "month", "day"}) +_REMAINING_GROUP_KEYS = ("series", "title") +# Ordered by commonness. +_TITLE_PRECEDING_KEYS = ("issue", "year", "volume", "month") class ComicFilenameParser: """Parse a filename metadata into a dict.""" - def path_index(self, key: str): + def path_index(self, key: str, default: int = -1) -> int: """Lazily retrieve and memoize the key's location in the path.""" if key == "remainders": - return -1 + return default value: str = self.metadata.get(key, "") # type: ignore if not value: - return -1 + return default if value not in self._path_indexes: # XXX This is fragile, but it's difficult to calculate the original # position at match time from the ever changing _unparsed_path. @@ -57,7 +58,7 @@ class ComicFilenameParser: self._path_indexes[value] = index return self._path_indexes[value] - def _log(self, label): + def _log(self, label: str) -> None: if not self._debug: return print_log_header(label) @@ -67,7 +68,7 @@ class ComicFilenameParser: print(" " + self._unparsed_path) print(" " + pformat(combined)) - def _parse_ext(self): + def _parse_ext(self) -> None: """Pop the extension from the pathname.""" path = Path(self._unparsed_path) suffix = path.suffix @@ -79,7 +80,7 @@ class ComicFilenameParser: self.metadata["ext"] = ext self._unparsed_path = data - def _clean_dividers(self): + def _clean_dividers(self) -> None: """Replace non space dividers and clean extra spaces out of string.""" data = self._unparsed_path @@ -142,21 +143,21 @@ class ComicFilenameParser: if pop: self._parse_items_pop_tokens(regex, first_only) - def _parse_issue(self): + def _parse_issue(self) -> None: """Parse Issue.""" self._parse_items(ISSUE_NUMBER_RE) if "issue" not in self.metadata: self._parse_items(ISSUE_WITH_COUNT_RE) self._log("After Issue") - def _parse_volume(self): + def _parse_volume(self) -> None: """Parse Volume.""" self._parse_items(VOLUME_RE) if "volume" not in self.metadata: self._parse_items(VOLUME_WITH_COUNT_RE) self._log("After Volume") - def _alpha_month_to_numeric(self): + def _alpha_month_to_numeric(self) -> None: """Translate alpha_month to numeric month.""" if alpha_month := self.metadata.pop("alpha_month", ""): alpha_month = alpha_month.capitalize() # type: ignore @@ -166,7 +167,7 @@ class ComicFilenameParser: self.metadata["month"] = month break - def _parse_dates(self): + def _parse_dates(self) -> None: """Parse date schemes.""" # Discard second month of alpha month ranges. self._unparsed_path = ALPHA_MONTH_RANGE_RE.sub(r"\1", self._unparsed_path) @@ -192,9 +193,8 @@ class ComicFilenameParser: self.metadata["volume"] = volume self._log("After Date") - def _parse_format_and_scan_info(self): - # Format & Scan Info - # + def _parse_format_and_scan_info(self) -> None: + """Format & Scan Info.""" self._parse_items( ORIGINAL_FORMAT_SCAN_INFO_RE, require_all=True, @@ -231,7 +231,7 @@ class ComicFilenameParser: self._parse_items(ISSUE_BEGIN_RE) self._log("After Issue on ends of tokens") - def _parse_publisher(self): + def _parse_publisher(self) -> None: """Parse Publisher.""" # Pop single tokens so they don't end up titles. self._parse_items(PUBLISHER_UNAMBIGUOUS_TOKEN_RE, first_only=True) @@ -243,15 +243,19 @@ class ComicFilenameParser: self._parse_items(PUBLISHER_AMBIGUOUS_RE, pop=False, first_only=True) self._log("After publisher") - def _is_title_in_position(self, value): + def _is_at_title_position(self, value: str) -> bool: """Does the title come after series and one other token if they exist.""" title_index = self.path.find(value) - # Does a series come first. - if title_index < self.path_index("series"): + # Titles must come after series but before format and scan_info + if ( + title_index < self.path_index("series") + or title_index > self.path_index("original_format", maxsize) + or title_index > self.path_index("scan_info", maxsize) + ): return False - # If other tokens exist then they much precede the title. + # Titles must be after the series and one other token. title_ok = False other_tokens_exist = False for preceding_key in _TITLE_PRECEDING_KEYS: @@ -270,7 +274,28 @@ class ComicFilenameParser: value = value.strip("'").strip() return value.strip('"').strip() - def _parse_series_and_title(self): + def _parse_series_and_title_token( + self, remaining_key_index: int, tokens: list[str] + ) -> str: + """Parse one series or title token.""" + key = _REMAINING_GROUP_KEYS[remaining_key_index] + if key in self.metadata: + return "" + token = tokens.pop(0) + match = REMAINING_GROUP_RE.search(token) + if not match: + return token + value = match.group() + if key == "title": + if not self._is_at_title_position(value): + return token + value = NON_NUMBER_DOT_RE.sub(r"\1 \2", value) + value = self._grouping_operators_strip(value) + if value: + self.metadata[key] = value + return "" + + def _parse_series_and_title(self) -> None: """Assign series and title.""" if not self._unparsed_path: return @@ -279,28 +304,18 @@ class ComicFilenameParser: unused_tokens = [] tokens = self._unparsed_path.split(TOKEN_DELIMETER) while tokens and remaining_key_index < len(_REMAINING_GROUP_KEYS): - key = _REMAINING_GROUP_KEYS[remaining_key_index] - if key in self.metadata: - continue - token = tokens.pop(0) - match = REMAINING_GROUP_RE.search(token) - if match: - value = match.group() - if key == "title" and not self._is_title_in_position(value): - unused_tokens.append(token) - continue - value = self._grouping_operators_strip(value) - value = NON_NUMBER_DOT_RE.sub(r"\1 \2", value) - - self.metadata[key] = value - remaining_key_index += 1 - else: - unused_tokens.append(token) + unused_token = self._parse_series_and_title_token( + remaining_key_index, tokens + ) + if unused_token: + unused_tokens.append(unused_token) + remaining_key_index += 1 + print(f"{unused_tokens=}") self._unparsed_path = " ".join(unused_tokens) if unused_tokens else "" self._log("After Series & Title") - def _add_remainders(self): + def _add_remainders(self) -> None: """Add Remainders.""" remainders = [] for token in self._unparsed_path.split(TOKEN_DELIMETER): @@ -310,7 +325,7 @@ class ComicFilenameParser: if remainders: self.metadata["remainders"] = tuple(remainders) - def parse(self) -> dict[str, Any]: + def parse(self) -> dict[str, str | tuple[str, ...]]: """Parse the filename with a hierarchy of regexes.""" self._log("Init") self._parse_ext() @@ -345,7 +360,9 @@ class ComicFilenameParser: self._path_indexes: dict[str, int] = {} -def comicfn2dict(path: str | Path, verbose: int = 0): +def comicfn2dict( + path: str | Path, verbose: int = 0 +) -> dict[str, str | tuple[str, ...]]: """Simple API.""" parser = ComicFilenameParser(path, verbose=verbose) return parser.parse() diff --git a/comicfn2dict/unparse.py b/comicfn2dict/unparse.py index ee65211..7907113 100644 --- a/comicfn2dict/unparse.py +++ b/comicfn2dict/unparse.py @@ -39,7 +39,7 @@ _DATE_KEYS = ("year", "month", "day") class ComicFilenameSerializer: """Serialize Comic Filenames from dict.""" - def _log(self, label, fn): + def _log(self, label: str, fn: str) -> None: """Log progress.""" if not self._debug: return @@ -95,7 +95,7 @@ class ComicFilenameSerializer: for tag, fmt in _FILENAME_FORMAT_TAGS: if token := self._tokenize_tag(tag, fmt): tokens.append(token) - self._log(f"After {tag}", tokens) + self._log(f"After {tag}", str(tokens)) fn = " ".join(tokens) fn += self._add_remainder() diff --git a/pyproject.toml b/pyproject.toml index 5c8d950..3d63bad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "comicfn2dict" -version = "0.2.0a2" +version = "0.2.0a3" description = "Parse common comic filenames and return a dict of metadata attributes. Includes a cli." license = "GPL-3.0-only" authors = ["AJ Slater "] diff --git a/tests/comic_filenames.py b/tests/comic_filenames.py index 3c92ee0..4d299f6 100644 --- a/tests/comic_filenames.py +++ b/tests/comic_filenames.py @@ -56,11 +56,6 @@ FNS = { "Long Series Name #001 (2000) Title (TPB) (Releaser).cbz": TEST_COMIC_FIELDS, "Long Series Name (2000) 001 Title (TPB) (Releaser).cbz": TEST_COMIC_FIELDS, "Long Series Name (2000) #001 Title (TPB) (Releaser).cbz": TEST_COMIC_FIELDS, - "Long Series Name v1 (2000) #001 " - "Title (TPB) (Releaser).cbz": TEST_COMIC_FIELDS_VOL, - "Long Series Name 001 (2000) (TPB-Releaser) Title.cbz": TEST_COMIC_FIELDS, - "Long Series Name Vol 1 " - "(2000) (TPB) (Releaser & Releaser-Releaser) Title.cbr": TEST_COMIC_VOL_ONLY, "Ultimate Craziness (2019) (Digital) (Friends-of-Bill).cbr": { "series": "Ultimate Craziness", "year": "2019", @@ -443,6 +438,41 @@ FNS.update( "restored) (Shadowcat-Empire)", ), }, + "Captain Science #001 (1950) The Beginning - nothing.cbz": { + "ext": "cbz", + "issue": "001", + "title": "The Beginning - nothing", + "series": "Captain Science", + "year": "1950", + }, + "Captain Science #001-cix-cbi.cbr": { + "ext": "cbr", + "issue": "001", + "series": "Captain Science", + "remainders": ("cix-cbi",), + }, + "Long Series Name v1 (2000) #001 " + "Title (TPB) (Releaser).cbz": TEST_COMIC_FIELDS_VOL, + "Long Series Name 001 (2000) (TPB-Releaser) Title.cbz": { + "series": "Long Series Name", + "issue": "001", + "year": "2000", + "original_format": "TPB", + "scan_info": "Releaser", + "remainders": ("Title",), + "ext": "cbz", + }, + "Long Series Name Vol 1 " + "(2000) (TPB) (Releaser & Releaser-Releaser) Title.cbr": { + "series": "Long Series Name", + "volume": "1", + "issue": "1", + "remainders": ("Title",), + "original_format": "TPB", + "year": "2000", + "scan_info": "Releaser & Releaser-Releaser", + "ext": "cbr", + }, } )