Reorganize code. Only substitute the first colon, out of caution.

This commit is contained in:
AJ Slater 2024-02-21 10:08:51 -08:00
parent 4b1f5fbdb9
commit 7d9b4efeee
3 changed files with 41 additions and 26 deletions

View File

@ -9,13 +9,13 @@ from typing import Any
from comicfn2dict.regex import ( from comicfn2dict.regex import (
NON_NUMBER_DOT_RE, NON_NUMBER_DOT_RE,
YEAR_FIRST_DATE_RE, YEAR_FIRST_DATE_RE,
EXTRA_SPACES_RE,
ISSUE_ANYWHERE_RE, ISSUE_ANYWHERE_RE,
REGEX_SUBS,
TOKEN_DELIMETER,
ISSUE_COUNT_RE, ISSUE_COUNT_RE,
ISSUE_NUMBER_RE, ISSUE_NUMBER_RE,
ISSUE_BEGIN_RE, ISSUE_BEGIN_RE,
ISSUE_END_RE, ISSUE_END_RE,
NON_SPACE_DIVIDER_RE,
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE, ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
ORIGINAL_FORMAT_SCAN_INFO_RE, ORIGINAL_FORMAT_SCAN_INFO_RE,
REMAINING_GROUP_RE, REMAINING_GROUP_RE,
@ -26,7 +26,6 @@ from comicfn2dict.regex import (
_REMAINING_GROUP_KEYS = ("series", "title") _REMAINING_GROUP_KEYS = ("series", "title")
_TITLE_PRECEDING_KEYS = ("issue", "year", "volume") _TITLE_PRECEDING_KEYS = ("issue", "year", "volume")
_TOKEN_DELIMETER = "/"
_DATE_KEYS = frozenset({"year", "month", "day"}) _DATE_KEYS = frozenset({"year", "month", "day"})
@ -58,19 +57,22 @@ class ComicFilenameParser:
self.metadata["ext"] = ext self.metadata["ext"] = ext
self._unparsed_path = data self._unparsed_path = data
def _clean_dividers(self):
"""Replace non space dividers and clean extra spaces out of string."""
data = NON_SPACE_DIVIDER_RE.sub(" ", self._unparsed_path)
self._unparsed_path = EXTRA_SPACES_RE.sub(" ", data).strip()
def _grouping_operators_strip(self, value: str) -> str: def _grouping_operators_strip(self, value: str) -> str:
"""Strip spaces and parens.""" """Strip spaces and parens."""
value = value.strip() value = value.strip()
value = value.strip("()").strip() value = value.strip("()").strip()
value = value.strip("-").strip() value = value.strip("-").strip()
value = value.strip(",").strip() value = value.strip(",").strip()
value = value.strip("'").strip('"').strip() value = value.strip("'").strip()
return value return value.strip('"').strip()
def _clean_dividers(self):
"""Replace non space dividers and clean extra spaces out of string."""
data = self._unparsed_path
for regex, pair in REGEX_SUBS.items():
replacement, count = pair
data = regex.sub(replacement, data, count=count)
self._unparsed_path = data.strip()
def _parse_items( def _parse_items(
self, self,
@ -91,12 +93,12 @@ class ComicFilenameParser:
matched_metadata[key] = self._grouping_operators_strip(value) matched_metadata[key] = self._grouping_operators_strip(value)
self.metadata.update(matched_metadata) self.metadata.update(matched_metadata)
marked_str = regex.sub(_TOKEN_DELIMETER, self._unparsed_path) marked_str = regex.sub(TOKEN_DELIMETER, self._unparsed_path)
parts = [] parts = []
for part in marked_str.split(_TOKEN_DELIMETER): for part in marked_str.split(TOKEN_DELIMETER):
if token := part.strip(): if token := part.strip():
parts.append(token) parts.append(token)
self._unparsed_path = _TOKEN_DELIMETER.join(parts) self._unparsed_path = TOKEN_DELIMETER.join(parts)
def _alpha_month_to_numeric(self): def _alpha_month_to_numeric(self):
"""Translate alpha_month to numeric month.""" """Translate alpha_month to numeric month."""
@ -147,7 +149,7 @@ class ComicFilenameParser:
remaining_key_index = 0 remaining_key_index = 0
unused_tokens = [] unused_tokens = []
tokens = self._unparsed_path.split(_TOKEN_DELIMETER) tokens = self._unparsed_path.split(TOKEN_DELIMETER)
while tokens and remaining_key_index < len(_REMAINING_GROUP_KEYS): while tokens and remaining_key_index < len(_REMAINING_GROUP_KEYS):
key = _REMAINING_GROUP_KEYS[remaining_key_index] key = _REMAINING_GROUP_KEYS[remaining_key_index]
token = tokens.pop(0) token = tokens.pop(0)
@ -170,7 +172,7 @@ class ComicFilenameParser:
def _add_remainders(self): def _add_remainders(self):
"""Add Remainders.""" """Add Remainders."""
remainders = [] remainders = []
for token in self._unparsed_path.split(_TOKEN_DELIMETER): for token in self._unparsed_path.split(TOKEN_DELIMETER):
if remainder := token.strip(): if remainder := token.strip():
remainders.append(remainder) remainders.append(remainder)
@ -225,8 +227,8 @@ class ComicFilenameParser:
self._log_progress("AFTER SERIES AND TITLE") self._log_progress("AFTER SERIES AND TITLE")
# Final try for issue number. # Final try for issue number.
# TODO unused
if "issue" not in self.metadata: if "issue" not in self.metadata:
# TODO is this useful?
self._parse_items(ISSUE_ANYWHERE_RE) self._parse_items(ISSUE_ANYWHERE_RE)
self._log_progress("AFTER ISSUE PICKUP") self._log_progress("AFTER ISSUE PICKUP")

View File

@ -1,5 +1,6 @@
"""Parsing regexes.""" """Parsing regexes."""
import re import re
from types import MappingProxyType
def re_compile(exp, parenthify=False): def re_compile(exp, parenthify=False):
@ -53,9 +54,19 @@ MONTHS = (
r"Dec(ember)?", r"Dec(ember)?",
) )
TOKEN_DELIMETER = r"/"
# CLEAN # CLEAN
NON_SPACE_DIVIDER_RE = re_compile(r"[_\+]") _TOKEN_DIVIDERS_RE = re_compile(r":")
EXTRA_SPACES_RE = re_compile(r"\s\s+") _SPACE_EQUIVALENT_RE = re_compile(r"_")
_EXTRA_SPACES_RE = re_compile(r"\s\s+")
REGEX_SUBS: MappingProxyType[re.Pattern, tuple[str, int]] = MappingProxyType(
{
_TOKEN_DIVIDERS_RE: (TOKEN_DELIMETER, 1),
_SPACE_EQUIVALENT_RE: (r" ", 0),
_EXTRA_SPACES_RE: (r" ", 0),
}
)
### DATES ### DATES
_YEAR_RE_EXP = r"(?P<year>[12]\d{3})" _YEAR_RE_EXP = r"(?P<year>[12]\d{3})"
@ -117,6 +128,8 @@ _ISSUE_RE_EXP = r"(?P<issue>\w*(½|\d+)[\.\d+]*\w*)"
ISSUE_NUMBER_RE = re_compile(r"(\(?#" + _ISSUE_RE_EXP + r"\)?)") ISSUE_NUMBER_RE = re_compile(r"(\(?#" + _ISSUE_RE_EXP + r"\)?)")
ISSUE_END_RE = re_compile(r"([\/\s]\(?" + _ISSUE_RE_EXP + r"\)?(\/|$))") ISSUE_END_RE = re_compile(r"([\/\s]\(?" + _ISSUE_RE_EXP + r"\)?(\/|$))")
ISSUE_BEGIN_RE = re_compile(r"((^|\/)\(?" + _ISSUE_RE_EXP + r"\)?[\/|\s])") ISSUE_BEGIN_RE = re_compile(r"((^|\/)\(?" + _ISSUE_RE_EXP + r"\)?[\/|\s])")
# TODO unused
ISSUE_ANYWHERE_RE = re_compile(r"\b(\(?" + _ISSUE_RE_EXP + r"\)?)\b") ISSUE_ANYWHERE_RE = re_compile(r"\b(\(?" + _ISSUE_RE_EXP + r"\)?)\b")
# LONG STRINGS # LONG STRINGS

View File

@ -375,6 +375,14 @@ FNS.update(
"series": "Cory Doctorow's Futuristic Tales of the Here and Now", "series": "Cory Doctorow's Futuristic Tales of the Here and Now",
"year": "2007", "year": "2007",
}, },
# CT treats ':' the same as '-' but here the ':' is attached to 'Now' which CT sees as a title separation
"Cory Doctorow's Futuristic Tales of the Here and Now: Anda's Game #001 (2007).cbz": {
"ext": "cbz",
"issue": "001",
"series": "Cory Doctorow's Futuristic Tales of the Here and Now",
"title": "Anda's Game",
"year": "2007",
},
} }
) )
DIFFICULT = { DIFFICULT = {
@ -430,14 +438,6 @@ DIFFICULT = {
"year": "1951", "year": "1951",
"month": "10", "month": "10",
}, },
# CT treats ':' the same as '-' but here the ':' is attached to 'Now' which CT sees as a title separation
"Cory Doctorow's Futuristic Tales of the Here and Now: Anda's Game #001 (2007).cbz": {
"ext": "cbz",
"issue": "001",
"series": "Cory Doctorow's Futuristic Tales of the Here and Now",
"title": "Anda's Game",
"year": "2007",
},
} }
# first_key, first_val = DIFFICULT.popitem() # first_key, first_val = DIFFICULT.popitem()