Reorganize code. Only substitute the first colon, out of caution.

2024-02-21 10:08:51 -08:00 · 2024-02-21 10:08:51 -08:00 · 7d9b4efeee
commit 7d9b4efeee
parent 4b1f5fbdb9
3 changed files with 41 additions and 26 deletions
--- a/comicfn2dict/parse.py
+++ b/comicfn2dict/parse.py
@@ -9,13 +9,13 @@ from typing import Any
 from comicfn2dict.regex import (
    NON_NUMBER_DOT_RE,
    YEAR_FIRST_DATE_RE,
-    EXTRA_SPACES_RE,
    ISSUE_ANYWHERE_RE,
+    REGEX_SUBS,
+    TOKEN_DELIMETER,
    ISSUE_COUNT_RE,
    ISSUE_NUMBER_RE,
    ISSUE_BEGIN_RE,
    ISSUE_END_RE,
-    NON_SPACE_DIVIDER_RE,
    ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
    ORIGINAL_FORMAT_SCAN_INFO_RE,
    REMAINING_GROUP_RE,
@@ -26,7 +26,6 @@ from comicfn2dict.regex import (

 _REMAINING_GROUP_KEYS = ("series", "title")
 _TITLE_PRECEDING_KEYS = ("issue", "year", "volume")
-_TOKEN_DELIMETER = "/"
 _DATE_KEYS = frozenset({"year", "month", "day"})


@@ -58,19 +57,22 @@ class ComicFilenameParser:
        self.metadata["ext"] = ext
        self._unparsed_path = data

-    def _clean_dividers(self):
-        """Replace non space dividers and clean extra spaces out of string."""
-        data = NON_SPACE_DIVIDER_RE.sub(" ", self._unparsed_path)
-        self._unparsed_path = EXTRA_SPACES_RE.sub(" ", data).strip()
-
    def _grouping_operators_strip(self, value: str) -> str:
        """Strip spaces and parens."""
        value = value.strip()
        value = value.strip("()").strip()
        value = value.strip("-").strip()
        value = value.strip(",").strip()
-        value = value.strip("'").strip('"').strip()
-        return value
+        value = value.strip("'").strip()
+        return value.strip('"').strip()
+
+    def _clean_dividers(self):
+        """Apply each REGEX_SUBS substitution in order, then strip the ends."""
+        data = self._unparsed_path
+        for regex, pair in REGEX_SUBS.items():
+            replacement, count = pair
+            data = regex.sub(replacement, data, count=count)
+        self._unparsed_path = data.strip()

    def _parse_items(
        self,
@@ -91,12 +93,12 @@ class ComicFilenameParser:
            matched_metadata[key] = self._grouping_operators_strip(value)
        self.metadata.update(matched_metadata)

-        marked_str = regex.sub(_TOKEN_DELIMETER, self._unparsed_path)
+        marked_str = regex.sub(TOKEN_DELIMETER, self._unparsed_path)
        parts = []
-        for part in marked_str.split(_TOKEN_DELIMETER):
+        for part in marked_str.split(TOKEN_DELIMETER):
            if token := part.strip():
                parts.append(token)
-        self._unparsed_path = _TOKEN_DELIMETER.join(parts)
+        self._unparsed_path = TOKEN_DELIMETER.join(parts)

    def _alpha_month_to_numeric(self):
        """Translate alpha_month to numeric month."""
@@ -147,7 +149,7 @@ class ComicFilenameParser:

        remaining_key_index = 0
        unused_tokens = []
-        tokens = self._unparsed_path.split(_TOKEN_DELIMETER)
+        tokens = self._unparsed_path.split(TOKEN_DELIMETER)
        while tokens and remaining_key_index < len(_REMAINING_GROUP_KEYS):
            key = _REMAINING_GROUP_KEYS[remaining_key_index]
            token = tokens.pop(0)
@@ -170,7 +172,7 @@ class ComicFilenameParser:
    def _add_remainders(self):
        """Add Remainders."""
        remainders = []
-        for token in self._unparsed_path.split(_TOKEN_DELIMETER):
+        for token in self._unparsed_path.split(TOKEN_DELIMETER):
            if remainder := token.strip():
                remainders.append(remainder)

@@ -225,8 +227,8 @@ class ComicFilenameParser:
        self._log_progress("AFTER SERIES AND TITLE")

        # Final try for issue number.
+        # TODO unused
        if "issue" not in self.metadata:
-            # TODO is this useful?
            self._parse_items(ISSUE_ANYWHERE_RE)
        self._log_progress("AFTER ISSUE PICKUP")

--- a/comicfn2dict/regex.py
+++ b/comicfn2dict/regex.py
@@ -1,5 +1,6 @@
 """Parsing regexes."""
 import re
+from types import MappingProxyType


 def re_compile(exp, parenthify=False):
@@ -53,9 +54,19 @@ MONTHS = (
    r"Dec(ember)?",
 )

+TOKEN_DELIMETER = r"/"
+
 # CLEAN
-NON_SPACE_DIVIDER_RE = re_compile(r"[_\+]")
-EXTRA_SPACES_RE = re_compile(r"\s\s+")
+_TOKEN_DIVIDERS_RE = re_compile(r":")
+_SPACE_EQUIVALENT_RE = re_compile(r"_")
+_EXTRA_SPACES_RE = re_compile(r"\s\s+")
+REGEX_SUBS: MappingProxyType[re.Pattern, tuple[str, int]] = MappingProxyType(
+    {
+        _TOKEN_DIVIDERS_RE: (TOKEN_DELIMETER, 1),
+        _SPACE_EQUIVALENT_RE: (r" ", 0),
+        _EXTRA_SPACES_RE: (r" ", 0),
+    }
+)

 ### DATES
 _YEAR_RE_EXP = r"(?P<year>[12]\d{3})"
@@ -117,6 +128,8 @@ _ISSUE_RE_EXP = r"(?P<issue>\w*(½|\d+)[\.\d+]*\w*)"
 ISSUE_NUMBER_RE = re_compile(r"(\(?#" + _ISSUE_RE_EXP + r"\)?)")
 ISSUE_END_RE = re_compile(r"([\/\s]\(?" + _ISSUE_RE_EXP + r"\)?(\/|$))")
 ISSUE_BEGIN_RE = re_compile(r"((^|\/)\(?" + _ISSUE_RE_EXP + r"\)?[\/|\s])")
+
+# TODO unused
 ISSUE_ANYWHERE_RE = re_compile(r"\b(\(?" + _ISSUE_RE_EXP + r"\)?)\b")

 # LONG STRINGS
--- a/tests/comic_filenames.py
+++ b/tests/comic_filenames.py
@@ -375,6 +375,14 @@ FNS.update(
            "series": "Cory Doctorow's Futuristic Tales of the Here and Now",
            "year": "2007",
        },
+        # CT treats ':' the same as '-' but here the ':' is attached to 'Now' which CT sees as a title separation
+        "Cory Doctorow's Futuristic Tales of the Here and Now: Anda's Game #001 (2007).cbz": {
+            "ext": "cbz",
+            "issue": "001",
+            "series": "Cory Doctorow's Futuristic Tales of the Here and Now",
+            "title": "Anda's Game",
+            "year": "2007",
+        },
    }
 )
 DIFFICULT = {
@@ -430,14 +438,6 @@ DIFFICULT = {
        "year": "1951",
        "month": "10",
    },
-    # CT treats ':' the same as '-' but here the ':' is attached to 'Now' which CT sees as a title separation
-    "Cory Doctorow's Futuristic Tales of the Here and Now: Anda's Game #001 (2007).cbz": {
-        "ext": "cbz",
-        "issue": "001",
-        "series": "Cory Doctorow's Futuristic Tales of the Here and Now",
-        "title": "Anda's Game",
-        "year": "2007",
-    },
 }

 # first_key, first_val = DIFFICULT.popitem()