From 0ff6feb3ea255a42f4a1fc3a2539c9df6175360f Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Wed, 21 Feb 2024 16:25:39 -0800 Subject: [PATCH] all tests work --- comicfn2dict/parse.py | 55 +++++++++++++++++++++++++++++++++++--- comicfn2dict/regex.py | 57 +++++++++++++++++++++++++++++++++++++--- tests/comic_filenames.py | 54 ++++++++++++++++++------------------- 3 files changed, 132 insertions(+), 34 deletions(-) diff --git a/comicfn2dict/parse.py b/comicfn2dict/parse.py index 897dc63..3783fc6 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -7,6 +7,7 @@ from re import Pattern from typing import Any from comicfn2dict.regex import ( + ALPHA_MONTH_RANGE_RE, BOOK_VOLUME_RE, ISSUE_ANYWHERE_RE, ISSUE_BEGIN_RE, @@ -17,8 +18,13 @@ from comicfn2dict.regex import ( NON_NUMBER_DOT_RE, ORIGINAL_FORMAT_SCAN_INFO_RE, ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE, + PUBLISHER_AMBIGUOUS_RE, + PUBLISHER_UNAMBIGUOUS_RE, + PUBLISHER_AMBIGUOUS_TOKEN_RE, + PUBLISHER_UNAMBIGUOUS_TOKEN_RE, REGEX_SUBS, REMAINING_GROUP_RE, + SCAN_INFO_SECONDARY_RE, TOKEN_DELIMETER, VOLUME_RE, VOLUME_WITH_COUNT_RE, @@ -41,6 +47,8 @@ class ComicFilenameParser: if not value: return -1 if value not in self._path_indexes: + # TODO This is fragile. + # Better to get it at match time. 
if key == "ext": index = self.path.rfind(value) else:
@@ -69,12 +77,32 @@ class ComicFilenameParser:
         value = value.strip("'").strip()
         return value.strip('"').strip()
 
+    def _parenthify_double_underscores(self) -> str:
+        """Replace balanced pairs of double underscores with parentheses.
+
+        ``a__b__c`` becomes ``a (b) c``. The path is returned unchanged when
+        the double underscores cannot all be paired up.
+        """
+        parts = self._unparsed_path.split("__")
+        num_parts = len(parts)
+        if num_parts < 3 or not num_parts % 2:
+            # Fewer than two delimiters, or an odd one out: leave as is.
+            return self._unparsed_path
+        # Delimiters alternate between opening and closing a group.
+        parenthified = parts[0]
+        for index, part in enumerate(parts[1:]):
+            delimiter = ") " if index % 2 else " ("
+            parenthified += delimiter + part
+        return parenthified.strip()
+
     def _clean_dividers(self):
         """Replace non space dividers and clean extra spaces out of string."""
-        data = self._unparsed_path
+        data = self._parenthify_double_underscores()
+
+        # Simple substitutions
         for regex, pair in REGEX_SUBS.items():
             replacement, count = pair
-            data = regex.sub(replacement, data, count=count)
+            data = regex.sub(replacement, data, count=count).strip()
         self._unparsed_path = data.strip()
 
     def _parse_items(
@@ -91,7 +119,6 @@ class ComicFilenameParser:
             return
         matched_metadata = {}
         for key, value in matches.groupdict().items():
-            print(f"{value=} == {exclude=}")
             if value == exclude:
                 continue
             if not value:
@@ -126,6 +153,9 @@ class ComicFilenameParser:
 
     def _parse_dates(self):
         """Parse date schemes."""
+        # Discard second month of alpha month ranges.
+ self._unparsed_path = ALPHA_MONTH_RANGE_RE.sub(r"\1", self._unparsed_path) + # Month first date self._parse_items(MONTH_FIRST_DATE_RE) self._alpha_month_to_numeric() @@ -248,6 +278,13 @@ class ComicFilenameParser: self._parse_items( ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE, ) + + self._parse_items(SCAN_INFO_SECONDARY_RE) + if ( + scan_info_secondary := self.metadata.pop("secondary_scan_info", "") + ) and "scan_info" not in self.metadata: + self.metadata["scan_info"] = scan_info_secondary # type: ignore + self._log_progress("AFTER PAREN TOKENS") # Series and Title @@ -269,6 +306,18 @@ class ComicFilenameParser: if "issue" not in self.metadata: self._parse_items(ISSUE_BEGIN_RE) self._log_progress("AFTER ISSUE PICKUP") + + # Publisher + # + # Pop single tokens so they don't end up titles. + self._parse_items(PUBLISHER_UNAMBIGUOUS_TOKEN_RE, first_only=True) + if "publisher" not in self.metadata: + self._parse_items(PUBLISHER_AMBIGUOUS_TOKEN_RE, first_only=True) + if "publisher" not in self.metadata: + self._parse_items(PUBLISHER_UNAMBIGUOUS_RE, pop=False, first_only=True) + if "publisher" not in self.metadata: + self._parse_items(PUBLISHER_AMBIGUOUS_RE, pop=False, first_only=True) + self._assign_remaining_groups() self._log_progress("AFTER SERIES AND TITLE") diff --git a/comicfn2dict/regex.py b/comicfn2dict/regex.py index 33b7295..f9a456e 100644 --- a/comicfn2dict/regex.py +++ b/comicfn2dict/regex.py @@ -10,6 +10,30 @@ def re_compile(exp, parenthify=False): return re.compile(exp, flags=re.IGNORECASE) +PUBLISHERS_UNAMBIGUOUS = ( + r"Abrams ComicArts", + r"BOOM! 
Studios",
+    r"DC(\sComics)?",
+    r"Dark Horse Comics",
+    r"Drawn & Quarterly",
+    r"Dynamite Entertainment",
+    r"IDW Publishing",
+    r"Icon Comics",
+    r"Kodansha",
+    r"Oni Press",
+    r"Pantheon Books",
+    r"SLG Publishing",
+    r"SelfMadeHero",
+    r"Titan Comics",
+)
+PUBLISHERS_AMBIGUOUS = (
+    r"Marvel",
+    r"Heavy Metal",
+    r"Epic",
+    r"Image",
+    r"Mirage",
+)
+
 ORIGINAL_FORMAT_PATTERNS = (
     r"Anthology",
     r"(One|1)[-\s]Shot",
@@ -48,7 +72,7 @@ MONTHS = (
     r"Jun(e)?",
     r"Jul(y)?",
     r"Aug(ust)?",
-    r"Sept(ember)?",
+    r"Sep(t(ember)?)?",
     r"Oct(ober)?",
     r"Nov(ember)?",
     r"Dec(ember)?",
@@ -74,9 +98,19 @@ REGEX_SUBS: MappingProxyType[re.Pattern, tuple[str, int]] = MappingProxyType(
 
 ### DATES
 _YEAR_RE_EXP = r"(?P<year>[12]\d{3})"
-_MONTH_ALPHA_RE_EXP = r"(?P<alpha_month>" + r"|".join(MONTHS) + r")\.?"
+_MONTH_ALPHA_RE_EXP = r"(" + "(?P<alpha_month>" + r"|".join(MONTHS) + r")\.?" r")"
 _MONTH_NUMERIC_RE_EXP = r"(?P<month>0?\d|1[0-2]?)"
 _MONTH_RE_EXP = r"(" + _MONTH_ALPHA_RE_EXP + r"|" + _MONTH_NUMERIC_RE_EXP + r")"
+# A hyphenated alpha month range such as "Sep-Oct"; group 1 is the first month.
+_ALPHA_MONTH_RANGE = (
+    r"\b"
+    + r"(" + r"|".join(MONTHS) + r")"
+    + r"("
+    + r"\.?-"
+    + r"(" + r"|".join(MONTHS) + r")"
+    + r")\b"
+)
+ALPHA_MONTH_RANGE_RE = re_compile(_ALPHA_MONTH_RANGE)
 _DAY_RE_EXP = r"(?P<day>([0-2]?\d|(3)[0-1]))"
 
 _DATE_DELIM = r"[-\s]+"
@@ -124,6 +158,8 @@ ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE = re_compile(
     r"\(" + _ORIGINAL_FORMAT_RE_EXP + r"\).*\(" + _SCAN_INFO_RE_EXP + r"\)"
 )
 
+SCAN_INFO_SECONDARY_RE = re_compile(r"\b(?P<secondary_scan_info>c2c)\b")
+
 # ISSUE
 _ISSUE_RE_EXP = r"(?P<issue>\w*(½|\d+)[\.\d+]*\w*)"
 _ISSUE_COUNT_RE_EXP = r"\(of\s*(?P<issue_count>\d+)\)"
@@ -151,7 +187,22 @@ VOLUME_WITH_COUNT_RE = re_compile(
 )
 BOOK_VOLUME_RE = re_compile(r"(?P<title>" + r"book\s*(?P<volume>\d+)" + r")")
 
+# Publisher
+_PUBLISHER_UNAMBIGUOUS_RE_EXP = (
+    r"(\b(?P<publisher>" + r"|".join(PUBLISHERS_UNAMBIGUOUS) + r")\b)"
+)
+_PUBLISHER_AMBIGUOUS_RE_EXP = (
+    r"(\b(?P<publisher>" + r"|".join(PUBLISHERS_AMBIGUOUS) + r")\b)"
+)
+PUBLISHER_UNAMBIGUOUS_TOKEN_RE = re_compile(
+    r"(^|\/)" + _PUBLISHER_UNAMBIGUOUS_RE_EXP +
r"($|\/)" +) +PUBLISHER_AMBIGUOUS_TOKEN_RE = re_compile( + r"(^|\/)" + _PUBLISHER_AMBIGUOUS_RE_EXP + r"($|\/)" +) +PUBLISHER_UNAMBIGUOUS_RE = re_compile(_PUBLISHER_UNAMBIGUOUS_RE_EXP) +PUBLISHER_AMBIGUOUS_RE = re_compile(_PUBLISHER_AMBIGUOUS_RE_EXP) # LONG STRINGS -REMAINING_GROUP_RE = re_compile(r"^[^\()].*[^\)]") +REMAINING_GROUP_RE = re_compile(r"^[^\(].*[^\)]") NON_NUMBER_DOT_RE = re_compile(r"(\D)\.(\D)") diff --git a/tests/comic_filenames.py b/tests/comic_filenames.py index 5e3c0e0..e85a18e 100644 --- a/tests/comic_filenames.py +++ b/tests/comic_filenames.py @@ -274,7 +274,8 @@ FNS.update( "issue": "2", "series": "Monster Island", "volume": "1", - "remainders": ("repaired c2c",), + "scan_info": "c2c", + "remainders": ("(repaired)",), }, # Extra - in the series " X-Men-V1-#067.cbr": { @@ -334,6 +335,7 @@ FNS.update( "ext": "cbr", "issue": "002", "series": "Marvel Previews", + "publisher": "Marvel", "month": "01", "year": "2022", }, @@ -416,36 +418,32 @@ FNS.update( "volume": "03", "year": "2020", }, + # c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder + "Marvel Two In One V1 #090 c2c.cbr": { + "ext": "cbr", + "issue": "090", + "series": "Marvel Two In One", + "publisher": "Marvel", + "volume": "1", + "scan_info": "c2c", + }, + # CT treats '[]' as equivalent to '()', catches DC as a publisher and 'Sep-Oct 1951' as dates and removes them. CT doesn't catch the digital though so that could be better but I blame whoever made this atrocious filename + "Wonder Woman #49 DC Sep-Oct 1951 digital [downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire).cbz": { + "ext": "cbz", + "issue": "49", + "series": "Wonder Woman", + "publisher": "DC", + "year": "1951", + "month": "09", + "remainders": ( + "digital (downsized, lightened, 4 missing story pages " + "restored) (Shadowcat-Empire)", + ), + }, } ) -PUBLISHER = { - # c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder - # - # 1. 
c2c is not a title and is an original_format - # Leading common publisher may be a publisher? Do not pop - "Marvel Two In One V1 #090 c2c.cbr": { - "ext": "cbr", - "issue": "090", - "series": "Marvel Two In One", - "publisher": "Marvel", - "volume": "1", - }, - # CT treats '[]' as equivalent to '()', catches DC as a publisher and 'Sep-Oct 1951' as dates and removes them. CT doesn't catch the digital though so that could be better but I blame whoever made this atrocious filename - # - # 1. Month-Month should be handled - # 2. DC is a common publisher, no pop? - "Wonder Woman #49 DC Sep-Oct 1951 digital [downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire).cbz": { - "ext": "cbz", - "issue": "49", - "series": "Wonder Woman", - "title": "digital", - "publisher": "DC", - "year": "1951", - "month": "10", - }, -} -# first_key, first_val = VOLUME.popitem() +# first_key, first_val = NEW.popitem() # FNS[first_key] = first_val WONFIX = {