Reorganize code. Substitute only the first colon, out of caution.

This commit is contained in:
AJ Slater 2024-02-21 10:08:51 -08:00
parent 4b1f5fbdb9
commit 7d9b4efeee
3 changed files with 41 additions and 26 deletions

View File

@ -9,13 +9,13 @@ from typing import Any
from comicfn2dict.regex import (
NON_NUMBER_DOT_RE,
YEAR_FIRST_DATE_RE,
EXTRA_SPACES_RE,
ISSUE_ANYWHERE_RE,
REGEX_SUBS,
TOKEN_DELIMETER,
ISSUE_COUNT_RE,
ISSUE_NUMBER_RE,
ISSUE_BEGIN_RE,
ISSUE_END_RE,
NON_SPACE_DIVIDER_RE,
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
ORIGINAL_FORMAT_SCAN_INFO_RE,
REMAINING_GROUP_RE,
@ -26,7 +26,6 @@ from comicfn2dict.regex import (
_REMAINING_GROUP_KEYS = ("series", "title")
_TITLE_PRECEDING_KEYS = ("issue", "year", "volume")
_TOKEN_DELIMETER = "/"
_DATE_KEYS = frozenset({"year", "month", "day"})
@ -58,19 +57,22 @@ class ComicFilenameParser:
self.metadata["ext"] = ext
self._unparsed_path = data
def _clean_dividers(self):
"""Replace non-space dividers ('_' and '+') with spaces and collapse runs of whitespace.

The result is stripped of leading and trailing whitespace and stored back
in ``self._unparsed_path``.
"""
data = NON_SPACE_DIVIDER_RE.sub(" ", self._unparsed_path)
self._unparsed_path = EXTRA_SPACES_RE.sub(" ", data).strip()
def _grouping_operators_strip(self, value: str) -> str:
"""Strip surrounding whitespace, parens, dashes, commas and quotes.

Each strip pass runs once, in the order written, and is followed by a
whitespace strip. A character that only becomes the outermost one after a
later pass (e.g. a paren uncovered by removing a dash) is not stripped again.
"""
value = value.strip()
value = value.strip("()").strip()
value = value.strip("-").strip()
value = value.strip(",").strip()
value = value.strip("'").strip('"').strip()
return value
value = value.strip("'").strip()
return value.strip('"').strip()
def _clean_dividers(self):
"""Apply the REGEX_SUBS clean-up substitutions to the unparsed path.

Every (replacement, count) entry in REGEX_SUBS is applied in the mapping's
iteration order; the result is stripped of leading and trailing whitespace
and stored back in ``self._unparsed_path``.
"""
data = self._unparsed_path
for regex, pair in REGEX_SUBS.items():
replacement, count = pair
# count is forwarded to sub(): 0 replaces every match, N > 0 only the first N.
data = regex.sub(replacement, data, count=count)
self._unparsed_path = data.strip()
def _parse_items(
self,
@ -91,12 +93,12 @@ class ComicFilenameParser:
matched_metadata[key] = self._grouping_operators_strip(value)
self.metadata.update(matched_metadata)
marked_str = regex.sub(_TOKEN_DELIMETER, self._unparsed_path)
marked_str = regex.sub(TOKEN_DELIMETER, self._unparsed_path)
parts = []
for part in marked_str.split(_TOKEN_DELIMETER):
for part in marked_str.split(TOKEN_DELIMETER):
if token := part.strip():
parts.append(token)
self._unparsed_path = _TOKEN_DELIMETER.join(parts)
self._unparsed_path = TOKEN_DELIMETER.join(parts)
def _alpha_month_to_numeric(self):
"""Translate alpha_month to numeric month."""
@ -147,7 +149,7 @@ class ComicFilenameParser:
remaining_key_index = 0
unused_tokens = []
tokens = self._unparsed_path.split(_TOKEN_DELIMETER)
tokens = self._unparsed_path.split(TOKEN_DELIMETER)
while tokens and remaining_key_index < len(_REMAINING_GROUP_KEYS):
key = _REMAINING_GROUP_KEYS[remaining_key_index]
token = tokens.pop(0)
@ -170,7 +172,7 @@ class ComicFilenameParser:
def _add_remainders(self):
"""Add Remainders."""
remainders = []
for token in self._unparsed_path.split(_TOKEN_DELIMETER):
for token in self._unparsed_path.split(TOKEN_DELIMETER):
if remainder := token.strip():
remainders.append(remainder)
@ -225,8 +227,8 @@ class ComicFilenameParser:
self._log_progress("AFTER SERIES AND TITLE")
# Final try for issue number.
# TODO unused
if "issue" not in self.metadata:
# TODO is this useful?
self._parse_items(ISSUE_ANYWHERE_RE)
self._log_progress("AFTER ISSUE PICKUP")

View File

@ -1,5 +1,6 @@
"""Parsing regexes."""
import re
from types import MappingProxyType
def re_compile(exp, parenthify=False):
@ -53,9 +54,19 @@ MONTHS = (
r"Dec(ember)?",
)
TOKEN_DELIMETER = r"/"
# CLEAN
NON_SPACE_DIVIDER_RE = re_compile(r"[_\+]")
EXTRA_SPACES_RE = re_compile(r"\s\s+")
# Patterns for the clean-up substitutions listed in REGEX_SUBS below.
_TOKEN_DIVIDERS_RE = re_compile(r":")
_SPACE_EQUIVALENT_RE = re_compile(r"_")
_EXTRA_SPACES_RE = re_compile(r"\s\s+")
# Read-only mapping of pattern -> (replacement, count).
# count follows the re ``sub`` convention: 0 replaces every match, 1 only the first.
# Entry order matters to consumers that iterate the mapping: underscores are
# turned into spaces before runs of whitespace are collapsed.
REGEX_SUBS: MappingProxyType[re.Pattern, tuple[str, int]] = MappingProxyType(
{
# Only the first colon is turned into a token delimiter.
_TOKEN_DIVIDERS_RE: (TOKEN_DELIMETER, 1),
# Every underscore is treated as a space.
_SPACE_EQUIVALENT_RE: (r" ", 0),
# Every run of two or more whitespace characters becomes a single space.
_EXTRA_SPACES_RE: (r" ", 0),
}
)
### DATES
_YEAR_RE_EXP = r"(?P<year>[12]\d{3})"
@ -117,6 +128,8 @@ _ISSUE_RE_EXP = r"(?P<issue>\w*(½|\d+)[\.\d+]*\w*)"
ISSUE_NUMBER_RE = re_compile(r"(\(?#" + _ISSUE_RE_EXP + r"\)?)")
ISSUE_END_RE = re_compile(r"([\/\s]\(?" + _ISSUE_RE_EXP + r"\)?(\/|$))")
ISSUE_BEGIN_RE = re_compile(r"((^|\/)\(?" + _ISSUE_RE_EXP + r"\)?[\/|\s])")
# TODO unused
ISSUE_ANYWHERE_RE = re_compile(r"\b(\(?" + _ISSUE_RE_EXP + r"\)?)\b")
# LONG STRINGS

View File

@ -375,6 +375,14 @@ FNS.update(
"series": "Cory Doctorow's Futuristic Tales of the Here and Now",
"year": "2007",
},
# CT treats ':' the same as '-' but here the ':' is attached to 'Now' which CT sees as a title separation
"Cory Doctorow's Futuristic Tales of the Here and Now: Anda's Game #001 (2007).cbz": {
"ext": "cbz",
"issue": "001",
"series": "Cory Doctorow's Futuristic Tales of the Here and Now",
"title": "Anda's Game",
"year": "2007",
},
}
)
DIFFICULT = {
@ -430,14 +438,6 @@ DIFFICULT = {
"year": "1951",
"month": "10",
},
# CT treats ':' the same as '-' but here the ':' is attached to 'Now' which CT sees as a title separation
"Cory Doctorow's Futuristic Tales of the Here and Now: Anda's Game #001 (2007).cbz": {
"ext": "cbz",
"issue": "001",
"series": "Cory Doctorow's Futuristic Tales of the Here and Now",
"title": "Anda's Game",
"year": "2007",
},
}
# first_key, first_val = DIFFICULT.popitem()