From d550d9c54e293387e6d11f02ff63a5a60748adb5 Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Wed, 21 Feb 2024 13:15:18 -0800 Subject: [PATCH] Complicated volume parsing --- comicfn2dict/parse.py | 44 +++++++++++++++++++++++-------------- comicfn2dict/regex.py | 25 ++++++++++++++++----- tests/comic_filenames.py | 47 +++++++++++++++++----------------------- 3 files changed, 68 insertions(+), 48 deletions(-) diff --git a/comicfn2dict/parse.py b/comicfn2dict/parse.py index 2ad37e0..897dc63 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -7,21 +7,23 @@ from re import Pattern from typing import Any from comicfn2dict.regex import ( - NON_NUMBER_DOT_RE, - YEAR_FIRST_DATE_RE, + BOOK_VOLUME_RE, ISSUE_ANYWHERE_RE, - REGEX_SUBS, - TOKEN_DELIMETER, - ISSUE_COUNT_RE, - ISSUE_NUMBER_RE, ISSUE_BEGIN_RE, ISSUE_END_RE, - YEAR_END_RE, - ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE, - ORIGINAL_FORMAT_SCAN_INFO_RE, - REMAINING_GROUP_RE, - VOLUME_RE, + ISSUE_NUMBER_RE, + ISSUE_WITH_COUNT_RE, MONTH_FIRST_DATE_RE, + NON_NUMBER_DOT_RE, + ORIGINAL_FORMAT_SCAN_INFO_RE, + ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE, + REGEX_SUBS, + REMAINING_GROUP_RE, + TOKEN_DELIMETER, + VOLUME_RE, + VOLUME_WITH_COUNT_RE, + YEAR_END_RE, + YEAR_FIRST_DATE_RE, YEAR_TOKEN_RE, ) @@ -172,6 +174,8 @@ class ComicFilenameParser: tokens = self._unparsed_path.split(TOKEN_DELIMETER) while tokens and remaining_key_index < len(_REMAINING_GROUP_KEYS): key = _REMAINING_GROUP_KEYS[remaining_key_index] + if key in self.metadata: + continue token = tokens.pop(0) match = REMAINING_GROUP_RE.search(token) if match: @@ -218,15 +222,19 @@ class ComicFilenameParser: self._clean_dividers() self._log_progress("CLEANED") - # Main issue parsing + # Issue # self._parse_items(ISSUE_NUMBER_RE) - self._parse_items(ISSUE_COUNT_RE) + if "issue" not in self.metadata: + self._parse_items(ISSUE_WITH_COUNT_RE) + # self._parse_items(ISSUE_COUNT_RE) self._log_progress("AFTER ISSUE") - # Volume and date + # Volume and Date # 
self._parse_items(VOLUME_RE) + if "volume" not in self.metadata: + self._parse_items(VOLUME_WITH_COUNT_RE) self._parse_dates() self._log_progress("AFTER VOLUME & DATE") @@ -244,13 +252,17 @@ class ComicFilenameParser: # Series and Title # - # Match years on the end of series and title tokens + # Volume left on the end of string tokens + if "volume" not in self.metadata: + self._parse_items(BOOK_VOLUME_RE) + + # Years left on the end of string tokens year_end_matched = False if "year" not in self.metadata: self._parse_items(YEAR_END_RE, pop=False) year_end_matched = "year" in self.metadata - # Pickup issue if it's out on the end of a token + # Issue left on the end of string tokens if "issue" not in self.metadata and not year_end_matched: exclude: str = self.metadata.get("year", "") # type: ignore self._parse_items(ISSUE_END_RE, exclude=exclude) diff --git a/comicfn2dict/regex.py b/comicfn2dict/regex.py index 6d23685..33b7295 100644 --- a/comicfn2dict/regex.py +++ b/comicfn2dict/regex.py @@ -111,7 +111,6 @@ YEAR_TOKEN_RE = re_compile(_YEAR_RE_EXP, parenthify=True) YEAR_END_RE = re_compile(_YEAR_RE_EXP + r"\/|$") # PAREN GROUPS -ISSUE_COUNT_RE = re_compile(r"of\s*(?P\d+)", parenthify=True) _OF_PATTERNS = r"|".join(ORIGINAL_FORMAT_PATTERNS) _ORIGINAL_FORMAT_RE_EXP = r"(?P" + _OF_PATTERNS + r")" _SCAN_INFO_RE_EXP = r"(?P[^()]*)" @@ -125,18 +124,34 @@ ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE = re_compile( r"\(" + _ORIGINAL_FORMAT_RE_EXP + r"\).*\(" + _SCAN_INFO_RE_EXP + r"\)" ) -# REGULAR TOKENS -VOLUME_RE = re_compile(r"((?:v(?:ol(?:ume)?)?\.?)\s*(?P\d+))") - # ISSUE _ISSUE_RE_EXP = r"(?P\w*(½|\d+)[\.\d+]*\w*)" -ISSUE_NUMBER_RE = re_compile(r"(\(?#" + _ISSUE_RE_EXP + r"\)?)") +_ISSUE_COUNT_RE_EXP = r"\(of\s*(?P\d+)\)" +ISSUE_NUMBER_RE = re_compile( + r"(\(?#" + _ISSUE_RE_EXP + r"\)?)" + r"(\W*" + _ISSUE_COUNT_RE_EXP + r")?" +) +ISSUE_WITH_COUNT_RE = re_compile( + r"(\(?" + _ISSUE_RE_EXP + r"\)?" 
+ r"\W*" + _ISSUE_COUNT_RE_EXP + r")" +) + ISSUE_END_RE = re_compile(r"([\/\s]\(?" + _ISSUE_RE_EXP + r"\)?(\/|$))") ISSUE_BEGIN_RE = re_compile(r"((^|\/)\(?" + _ISSUE_RE_EXP + r"\)?[\/|\s])") # TODO unused ISSUE_ANYWHERE_RE = re_compile(r"\b(\(?" + _ISSUE_RE_EXP + r"\)?)\b") +# Volume +_VOLUME_COUNT_RE_EXP = r"\(of\s*(?P<volume_count>\d+)\)" +VOLUME_RE = re_compile( + r"(" + r"(?:v(?:ol(?:ume)?)?\.?)\s*(?P<volume>\d+)" + r"(\W*" + _VOLUME_COUNT_RE_EXP + r")?" + r")" +) +VOLUME_WITH_COUNT_RE = re_compile( + r"(\(?" + r"(?P<volume>\d+)" + r"\)?" + r"\W*" + _VOLUME_COUNT_RE_EXP + r")" +) +BOOK_VOLUME_RE = re_compile(r"(?P<title>" + r"book\s*(?P<volume>\d+)" + r")") + + # LONG STRINGS REMAINING_GROUP_RE = re_compile(r"^[^\()].*[^\)]") NON_NUMBER_DOT_RE = re_compile(r"(\D)\.(\D)") diff --git a/tests/comic_filenames.py b/tests/comic_filenames.py index 42c4c3e..5e3c0e0 100644 --- a/tests/comic_filenames.py +++ b/tests/comic_filenames.py @@ -397,34 +397,27 @@ FNS.update( "volume": "1957", "year": "1969", }, + # CT checks for the following '(of 06)' after the '03' and marks it as the volume + "Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021).cbr": { + "ext": "cbr", + "issue": "008", + "series": "Elephantmen 2259", + "title": "Simple Truth", + "volume": "03", + "year": "2021", + "volume_count": "06", + }, + # CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series) + "Bloodshot Book 03 (2020).cbr": { + "ext": "cbr", + "issue": "03", + "series": "Bloodshot", + "title": "Book 03", + "volume": "03", + "year": "2020", + }, } ) -VOLUME = { - # CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series) - # - # Book \d is a non-popped volume not an issue - "Bloodshot Book 03 (2020).cbr": { - "ext": "cbr", - "issue": "03", - "series": "Bloodshot", - "title": "Book 03", - "volume": "03", - "year": "2020", - }, - # CT checks for the following '(of 06)' after the '03' and marks it as the volume - # - # issue count is not popped if 
does not occur near issue - # \d (of \d) is volume & volume count if not issue - "Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021).cbr": { - "ext": "cbr", - "issue": "008", - "series": "Elephantmen 2259", - "title": "Simple Truth", - "volume": "03", - "year": "2021", - "volume_count": "06", - }, -} PUBLISHER = { # c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder # @@ -452,7 +445,7 @@ PUBLISHER = { }, } -# first_key, first_val = YEAR.popitem() +# first_key, first_val = VOLUME.popitem() # FNS[first_key] = first_val WONFIX = {