all tests work

This commit is contained in:
AJ Slater 2024-02-21 16:25:39 -08:00
parent 241e13f2d6
commit 0ff6feb3ea
3 changed files with 132 additions and 34 deletions

View File

@ -7,6 +7,7 @@ from re import Pattern
from typing import Any from typing import Any
from comicfn2dict.regex import ( from comicfn2dict.regex import (
ALPHA_MONTH_RANGE_RE,
BOOK_VOLUME_RE, BOOK_VOLUME_RE,
ISSUE_ANYWHERE_RE, ISSUE_ANYWHERE_RE,
ISSUE_BEGIN_RE, ISSUE_BEGIN_RE,
@ -17,8 +18,13 @@ from comicfn2dict.regex import (
NON_NUMBER_DOT_RE, NON_NUMBER_DOT_RE,
ORIGINAL_FORMAT_SCAN_INFO_RE, ORIGINAL_FORMAT_SCAN_INFO_RE,
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE, ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
PUBLISHER_AMBIGUOUS_RE,
PUBLISHER_UNAMBIGUOUS_RE,
PUBLISHER_AMBIGUOUS_TOKEN_RE,
PUBLISHER_UNAMBIGUOUS_TOKEN_RE,
REGEX_SUBS, REGEX_SUBS,
REMAINING_GROUP_RE, REMAINING_GROUP_RE,
SCAN_INFO_SECONDARY_RE,
TOKEN_DELIMETER, TOKEN_DELIMETER,
VOLUME_RE, VOLUME_RE,
VOLUME_WITH_COUNT_RE, VOLUME_WITH_COUNT_RE,
@ -41,6 +47,8 @@ class ComicFilenameParser:
if not value: if not value:
return -1 return -1
if value not in self._path_indexes: if value not in self._path_indexes:
# TODO This is fragile.
# Better to get it at match time.
if key == "ext": if key == "ext":
index = self.path.rfind(value) index = self.path.rfind(value)
else: else:
@ -69,12 +77,32 @@ class ComicFilenameParser:
value = value.strip("'").strip() value = value.strip("'").strip()
return value.strip('"').strip() return value.strip('"').strip()
def _parenthify_double_underscores(self) -> str:
    """Replace paired double underscores with parentheses.

    Every odd-numbered ``__`` in ``self._unparsed_path`` becomes `` (`` and
    every even-numbered one becomes ``) ``, so ``a__b__c__d__e`` turns into
    ``a (b) c (d) e``.

    Returns:
        The rewritten, stripped string, or ``self._unparsed_path``
        unchanged when the delimiters cannot be paired up (none present,
        or an odd number of them).
    """
    parts = self._unparsed_path.split("__")
    num_parts = len(parts)
    # N delimiters yield N + 1 parts, so only an odd part count of at least
    # three means the delimiters form complete open/close pairs.
    if num_parts < 3 or not num_parts % 2:
        return self._unparsed_path

    pieces = [parts[0]]
    for index, part in enumerate(parts[1:], start=1):
        # Odd positions open a group, even positions close it.
        pieces.append(" (" if index % 2 else ") ")
        pieces.append(part)
    return "".join(pieces).strip()
def _clean_dividers(self): def _clean_dividers(self):
"""Replace non space dividers and clean extra spaces out of string.""" """Replace non space dividers and clean extra spaces out of string."""
data = self._unparsed_path data = self._parenthify_double_underscores()
# Simple substitutions
for regex, pair in REGEX_SUBS.items(): for regex, pair in REGEX_SUBS.items():
replacement, count = pair replacement, count = pair
data = regex.sub(replacement, data, count=count) data = regex.sub(replacement, data, count=count).strip()
self._unparsed_path = data.strip() self._unparsed_path = data.strip()
def _parse_items( def _parse_items(
@ -91,7 +119,6 @@ class ComicFilenameParser:
return return
matched_metadata = {} matched_metadata = {}
for key, value in matches.groupdict().items(): for key, value in matches.groupdict().items():
print(f"{value=} == {exclude=}")
if value == exclude: if value == exclude:
continue continue
if not value: if not value:
@ -126,6 +153,9 @@ class ComicFilenameParser:
def _parse_dates(self): def _parse_dates(self):
"""Parse date schemes.""" """Parse date schemes."""
# Discard second month of alpha month ranges.
self._unparsed_path = ALPHA_MONTH_RANGE_RE.sub(r"\1", self._unparsed_path)
# Month first date # Month first date
self._parse_items(MONTH_FIRST_DATE_RE) self._parse_items(MONTH_FIRST_DATE_RE)
self._alpha_month_to_numeric() self._alpha_month_to_numeric()
@ -248,6 +278,13 @@ class ComicFilenameParser:
self._parse_items( self._parse_items(
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE, ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
) )
self._parse_items(SCAN_INFO_SECONDARY_RE)
if (
scan_info_secondary := self.metadata.pop("secondary_scan_info", "")
) and "scan_info" not in self.metadata:
self.metadata["scan_info"] = scan_info_secondary # type: ignore
self._log_progress("AFTER PAREN TOKENS") self._log_progress("AFTER PAREN TOKENS")
# Series and Title # Series and Title
@ -269,6 +306,18 @@ class ComicFilenameParser:
if "issue" not in self.metadata: if "issue" not in self.metadata:
self._parse_items(ISSUE_BEGIN_RE) self._parse_items(ISSUE_BEGIN_RE)
self._log_progress("AFTER ISSUE PICKUP") self._log_progress("AFTER ISSUE PICKUP")
# Publisher
#
# Pop single tokens so they don't end up titles.
self._parse_items(PUBLISHER_UNAMBIGUOUS_TOKEN_RE, first_only=True)
if "publisher" not in self.metadata:
self._parse_items(PUBLISHER_AMBIGUOUS_TOKEN_RE, first_only=True)
if "publisher" not in self.metadata:
self._parse_items(PUBLISHER_UNAMBIGUOUS_RE, pop=False, first_only=True)
if "publisher" not in self.metadata:
self._parse_items(PUBLISHER_AMBIGUOUS_RE, pop=False, first_only=True)
self._assign_remaining_groups() self._assign_remaining_groups()
self._log_progress("AFTER SERIES AND TITLE") self._log_progress("AFTER SERIES AND TITLE")

View File

@ -10,6 +10,30 @@ def re_compile(exp, parenthify=False):
return re.compile(exp, flags=re.IGNORECASE) return re.compile(exp, flags=re.IGNORECASE)
# Publisher name regex fragments. Each tuple is joined with "|" into a single
# alternation for the publisher patterns, so every entry must be a valid
# regex fragment rather than a plain string.
#
# The parser tries the unambiguous names before the ambiguous ones.
PUBLISHERS_UNAMBIGUOUS = (
    r"Abrams ComicArts",
    r"BOOM! Studios",
    # The " Comics" suffix is optional, so a bare "DC" also matches.
    r"DC(\sComics)?",
    r"Dark Horse Comics",
    r"Drawn & Quarterly",
    r"Dynamite Entertainment",
    r"IDW Publishing",
    r"Icon Comics",
    r"Kodansha",
    r"Oni Press",
    r"Pantheon Books",
    r"SLG Publishing",
    r"SelfMadeHero",
    r"Titan Comics",
)
# NOTE(review): presumably "ambiguous" because these names also occur as
# ordinary words inside series titles (e.g. "Marvel Previews") -- confirm.
PUBLISHERS_AMBIGUOUS = (
    r"Marvel",
    r"Heavy Metal",
    r"Epic",
    r"Image",
    r"Mirage",
)
ORIGINAL_FORMAT_PATTERNS = ( ORIGINAL_FORMAT_PATTERNS = (
r"Anthology", r"Anthology",
r"(One|1)[-\s]Shot", r"(One|1)[-\s]Shot",
@ -48,7 +72,7 @@ MONTHS = (
r"Jun(e)?", r"Jun(e)?",
r"Jul(y)?", r"Jul(y)?",
r"Aug(ust)?", r"Aug(ust)?",
r"Sept(ember)?", r"Sep(tember)?",
r"Oct(ober)?", r"Oct(ober)?",
r"Nov(ember)?", r"Nov(ember)?",
r"Dec(ember)?", r"Dec(ember)?",
@ -74,9 +98,19 @@ REGEX_SUBS: MappingProxyType[re.Pattern, tuple[str, int]] = MappingProxyType(
### DATES ### DATES
_YEAR_RE_EXP = r"(?P<year>[12]\d{3})" _YEAR_RE_EXP = r"(?P<year>[12]\d{3})"
_MONTH_ALPHA_RE_EXP = r"(?P<alpha_month>" + r"|".join(MONTHS) + r")\.?" _MONTH_ALPHA_RE_EXP = r"(" + "(?P<alpha_month>" + r"|".join(MONTHS) + r")\.?" r")"
_MONTH_NUMERIC_RE_EXP = r"(?P<month>0?\d|1[0-2]?)" _MONTH_NUMERIC_RE_EXP = r"(?P<month>0?\d|1[0-2]?)"
_MONTH_RE_EXP = r"(" + _MONTH_ALPHA_RE_EXP + r"|" + _MONTH_NUMERIC_RE_EXP + r")" _MONTH_RE_EXP = r"(" + _MONTH_ALPHA_RE_EXP + r"|" + _MONTH_NUMERIC_RE_EXP + r")"
# Alphabetic month range such as "Sep-Oct" or "Sep.-Oct".
# Group 1 captures the first month and the following group captures the
# separator plus the second month, so substituting r"\1" keeps only the
# first month of the range.
_ALPHA_MONTH_RANGE = (
    r"\b"
    + r"(" + r"|".join(MONTHS) + r")"
    + r"("
    + r"\.?-"
    + r"(" + r"|".join(MONTHS) + r")"
    + r")\b"
)
ALPHA_MONTH_RANGE_RE = re_compile(_ALPHA_MONTH_RANGE)
_DAY_RE_EXP = r"(?P<day>([0-2]?\d|(3)[0-1]))" _DAY_RE_EXP = r"(?P<day>([0-2]?\d|(3)[0-1]))"
_DATE_DELIM = r"[-\s]+" _DATE_DELIM = r"[-\s]+"
@ -124,6 +158,8 @@ ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE = re_compile(
r"\(" + _ORIGINAL_FORMAT_RE_EXP + r"\).*\(" + _SCAN_INFO_RE_EXP + r"\)" r"\(" + _ORIGINAL_FORMAT_RE_EXP + r"\).*\(" + _SCAN_INFO_RE_EXP + r"\)"
) )
# "c2c" (described in the test data as "cover to cover") appearing as a
# standalone word. It is captured under its own group name so the parser can
# use it as scan_info only when no other scan_info was found.
SCAN_INFO_SECONDARY_RE = re_compile(r"\b(?P<secondary_scan_info>c2c)\b")
# ISSUE # ISSUE
_ISSUE_RE_EXP = r"(?P<issue>\w*(½|\d+)[\.\d+]*\w*)" _ISSUE_RE_EXP = r"(?P<issue>\w*(½|\d+)[\.\d+]*\w*)"
_ISSUE_COUNT_RE_EXP = r"\(of\s*(?P<issue_count>\d+)\)" _ISSUE_COUNT_RE_EXP = r"\(of\s*(?P<issue_count>\d+)\)"
@ -151,7 +187,22 @@ VOLUME_WITH_COUNT_RE = re_compile(
) )
BOOK_VOLUME_RE = re_compile(r"(?P<title>" + r"book\s*(?P<volume>\d+)" + r")") BOOK_VOLUME_RE = re_compile(r"(?P<title>" + r"book\s*(?P<volume>\d+)" + r")")
# PUBLISHER
#
# Word-bounded alternation of every known publisher name, captured in the
# "publisher" group.
_PUBLISHER_UNAMBIGUOUS_RE_EXP = (
    rf"(\b(?P<publisher>{'|'.join(PUBLISHERS_UNAMBIGUOUS)})\b)"
)
_PUBLISHER_AMBIGUOUS_RE_EXP = (
    rf"(\b(?P<publisher>{'|'.join(PUBLISHERS_AMBIGUOUS)})\b)"
)
# Token variants: the publisher must fill a whole path component, i.e. be
# bounded by the start/end of the string or by a "/" on each side.
PUBLISHER_UNAMBIGUOUS_TOKEN_RE = re_compile(
    rf"(^|\/){_PUBLISHER_UNAMBIGUOUS_RE_EXP}($|\/)"
)
PUBLISHER_AMBIGUOUS_TOKEN_RE = re_compile(
    rf"(^|\/){_PUBLISHER_AMBIGUOUS_RE_EXP}($|\/)"
)
# Anywhere variants: the publisher may appear at any word boundary.
PUBLISHER_UNAMBIGUOUS_RE = re_compile(_PUBLISHER_UNAMBIGUOUS_RE_EXP)
PUBLISHER_AMBIGUOUS_RE = re_compile(_PUBLISHER_AMBIGUOUS_RE_EXP)
# LONG STRINGS # LONG STRINGS
REMAINING_GROUP_RE = re_compile(r"^[^\()].*[^\)]") REMAINING_GROUP_RE = re_compile(r"^[^\(].*[^\)]")
NON_NUMBER_DOT_RE = re_compile(r"(\D)\.(\D)") NON_NUMBER_DOT_RE = re_compile(r"(\D)\.(\D)")

View File

@ -274,7 +274,8 @@ FNS.update(
"issue": "2", "issue": "2",
"series": "Monster Island", "series": "Monster Island",
"volume": "1", "volume": "1",
"remainders": ("repaired c2c",), "scan_info": "c2c",
"remainders": ("(repaired)",),
}, },
# Extra - in the series # Extra - in the series
" X-Men-V1-#067.cbr": { " X-Men-V1-#067.cbr": {
@ -334,6 +335,7 @@ FNS.update(
"ext": "cbr", "ext": "cbr",
"issue": "002", "issue": "002",
"series": "Marvel Previews", "series": "Marvel Previews",
"publisher": "Marvel",
"month": "01", "month": "01",
"year": "2022", "year": "2022",
}, },
@ -416,36 +418,32 @@ FNS.update(
"volume": "03", "volume": "03",
"year": "2020", "year": "2020",
}, },
}
)
PUBLISHER = {
# c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder # c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder
#
# 1. c2c is not a title and is an original_format
# Leading common publisher may be a publisher? Do not pop
"Marvel Two In One V1 #090 c2c.cbr": { "Marvel Two In One V1 #090 c2c.cbr": {
"ext": "cbr", "ext": "cbr",
"issue": "090", "issue": "090",
"series": "Marvel Two In One", "series": "Marvel Two In One",
"publisher": "Marvel", "publisher": "Marvel",
"volume": "1", "volume": "1",
"scan_info": "c2c",
}, },
# CT treats '[]' as equivalent to '()', catches DC as a publisher and 'Sep-Oct 1951' as dates and removes them. CT doesn't catch the digital though so that could be better but I blame whoever made this atrocious filename # CT treats '[]' as equivalent to '()', catches DC as a publisher and 'Sep-Oct 1951' as dates and removes them. CT doesn't catch the digital though so that could be better but I blame whoever made this atrocious filename
#
# 1. Month-Month should be handled
# 2. DC is a common publisher, no pop?
"Wonder Woman #49 DC Sep-Oct 1951 digital [downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire).cbz": { "Wonder Woman #49 DC Sep-Oct 1951 digital [downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire).cbz": {
"ext": "cbz", "ext": "cbz",
"issue": "49", "issue": "49",
"series": "Wonder Woman", "series": "Wonder Woman",
"title": "digital",
"publisher": "DC", "publisher": "DC",
"year": "1951", "year": "1951",
"month": "10", "month": "09",
"remainders": (
"digital (downsized, lightened, 4 missing story pages "
"restored) (Shadowcat-Empire)",
),
}, },
} }
)
# first_key, first_val = VOLUME.popitem() # first_key, first_val = NEW.popitem()
# FNS[first_key] = first_val # FNS[first_key] = first_val
WONFIX = { WONFIX = {