diff --git a/comicfn2dict/parse.py b/comicfn2dict/parse.py index 1484637..644ffb8 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -9,13 +9,13 @@ from typing import Any from comicfn2dict.regex import ( NON_NUMBER_DOT_RE, YEAR_FIRST_DATE_RE, - EXTRA_SPACES_RE, ISSUE_ANYWHERE_RE, + REGEX_SUBS, + TOKEN_DELIMETER, ISSUE_COUNT_RE, ISSUE_NUMBER_RE, ISSUE_BEGIN_RE, ISSUE_END_RE, - NON_SPACE_DIVIDER_RE, ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE, ORIGINAL_FORMAT_SCAN_INFO_RE, REMAINING_GROUP_RE, @@ -26,7 +26,6 @@ from comicfn2dict.regex import ( _REMAINING_GROUP_KEYS = ("series", "title") _TITLE_PRECEDING_KEYS = ("issue", "year", "volume") -_TOKEN_DELIMETER = "/" _DATE_KEYS = frozenset({"year", "month", "day"}) @@ -58,19 +57,22 @@ class ComicFilenameParser: self.metadata["ext"] = ext self._unparsed_path = data - def _clean_dividers(self): - """Replace non space dividers and clean extra spaces out of string.""" - data = NON_SPACE_DIVIDER_RE.sub(" ", self._unparsed_path) - self._unparsed_path = EXTRA_SPACES_RE.sub(" ", data).strip() - def _grouping_operators_strip(self, value: str) -> str: """Strip spaces and parens.""" value = value.strip() value = value.strip("()").strip() value = value.strip("-").strip() value = value.strip(",").strip() - value = value.strip("'").strip('"').strip() - return value + value = value.strip("'").strip() + return value.strip('"').strip() + + def _clean_dividers(self): + """Replace non space dividers and clean extra spaces out of string.""" + data = self._unparsed_path + for regex, pair in REGEX_SUBS.items(): + replacement, count = pair + data = regex.sub(replacement, data, count=count) + self._unparsed_path = data.strip() def _parse_items( self, @@ -91,12 +93,12 @@ class ComicFilenameParser: matched_metadata[key] = self._grouping_operators_strip(value) self.metadata.update(matched_metadata) - marked_str = regex.sub(_TOKEN_DELIMETER, self._unparsed_path) + marked_str = regex.sub(TOKEN_DELIMETER, self._unparsed_path) parts = [] - for part in marked_str.split(_TOKEN_DELIMETER): + for part in marked_str.split(TOKEN_DELIMETER): if token := part.strip(): parts.append(token) - self._unparsed_path = _TOKEN_DELIMETER.join(parts) + self._unparsed_path = TOKEN_DELIMETER.join(parts) def _alpha_month_to_numeric(self): """Translate alpha_month to numeric month.""" @@ -147,7 +149,7 @@ class ComicFilenameParser: remaining_key_index = 0 unused_tokens = [] - tokens = self._unparsed_path.split(_TOKEN_DELIMETER) + tokens = self._unparsed_path.split(TOKEN_DELIMETER) while tokens and remaining_key_index < len(_REMAINING_GROUP_KEYS): key = _REMAINING_GROUP_KEYS[remaining_key_index] token = tokens.pop(0) @@ -170,7 +172,7 @@ class ComicFilenameParser: def _add_remainders(self): """Add Remainders.""" remainders = [] - for token in self._unparsed_path.split(_TOKEN_DELIMETER): + for token in self._unparsed_path.split(TOKEN_DELIMETER): if remainder := token.strip(): remainders.append(remainder) @@ -225,8 +227,8 @@ class ComicFilenameParser: self._log_progress("AFTER SERIES AND TITLE") # Final try for issue number. + # TODO unused if "issue" not in self.metadata: - # TODO is this useful? self._parse_items(ISSUE_ANYWHERE_RE) self._log_progress("AFTER ISSUE PICKUP") diff --git a/comicfn2dict/regex.py b/comicfn2dict/regex.py index 7ffbd4b..718df9b 100644 --- a/comicfn2dict/regex.py +++ b/comicfn2dict/regex.py @@ -1,5 +1,6 @@ """Parsing regexes.""" import re +from types import MappingProxyType def re_compile(exp, parenthify=False): @@ -53,9 +54,19 @@ MONTHS = ( r"Dec(ember)?", ) +TOKEN_DELIMETER = r"/" + # CLEAN -NON_SPACE_DIVIDER_RE = re_compile(r"[_\+]") -EXTRA_SPACES_RE = re_compile(r"\s\s+") +_TOKEN_DIVIDERS_RE = re_compile(r":") +_SPACE_EQUIVALENT_RE = re_compile(r"_") +_EXTRA_SPACES_RE = re_compile(r"\s\s+") +REGEX_SUBS: MappingProxyType[re.Pattern, tuple[str, int]] = MappingProxyType( + { + _TOKEN_DIVIDERS_RE: (TOKEN_DELIMETER, 1), + _SPACE_EQUIVALENT_RE: (r" ", 0), + _EXTRA_SPACES_RE: (r" ", 0), + } +) ### DATES _YEAR_RE_EXP = r"(?P[12]\d{3})" @@ -117,6 +128,8 @@ _ISSUE_RE_EXP = r"(?P\w*(½|\d+)[\.\d+]*\w*)" ISSUE_NUMBER_RE = re_compile(r"(\(?#" + _ISSUE_RE_EXP + r"\)?)") ISSUE_END_RE = re_compile(r"([\/\s]\(?" + _ISSUE_RE_EXP + r"\)?(\/|$))") ISSUE_BEGIN_RE = re_compile(r"((^|\/)\(?" + _ISSUE_RE_EXP + r"\)?[\/|\s])") + +# TODO unused ISSUE_ANYWHERE_RE = re_compile(r"\b(\(?" + _ISSUE_RE_EXP + r"\)?)\b") # LONG STRINGS diff --git a/tests/comic_filenames.py b/tests/comic_filenames.py index 79dc507..4c63c91 100644 --- a/tests/comic_filenames.py +++ b/tests/comic_filenames.py @@ -375,6 +375,14 @@ FNS.update( "series": "Cory Doctorow's Futuristic Tales of the Here and Now", "year": "2007", }, + # CT treats ':' the same as '-' but here the ':' is attached to 'Now' which CT sees as a title separation + "Cory Doctorow's Futuristic Tales of the Here and Now: Anda's Game #001 (2007).cbz": { + "ext": "cbz", + "issue": "001", + "series": "Cory Doctorow's Futuristic Tales of the Here and Now", + "title": "Anda's Game", + "year": "2007", + }, } ) DIFFICULT = { @@ -430,14 +438,6 @@ DIFFICULT = { "year": "1951", "month": "10", }, - # CT treats ':' the same as '-' but here the ':' is attached to 'Now' which CT sees as a title separation - "Cory Doctorow's Futuristic Tales of the Here and Now: Anda's Game #001 (2007).cbz": { - "ext": "cbz", - "issue": "001", - "series": "Cory Doctorow's Futuristic Tales of the Here and Now", - "title": "Anda's Game", - "year": "2007", - }, } # first_key, first_val = DIFFICULT.popitem()