all tests work
This commit is contained in:
parent
241e13f2d6
commit
0ff6feb3ea
@ -7,6 +7,7 @@ from re import Pattern
|
||||
from typing import Any
|
||||
|
||||
from comicfn2dict.regex import (
|
||||
ALPHA_MONTH_RANGE_RE,
|
||||
BOOK_VOLUME_RE,
|
||||
ISSUE_ANYWHERE_RE,
|
||||
ISSUE_BEGIN_RE,
|
||||
@ -17,8 +18,13 @@ from comicfn2dict.regex import (
|
||||
NON_NUMBER_DOT_RE,
|
||||
ORIGINAL_FORMAT_SCAN_INFO_RE,
|
||||
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
|
||||
PUBLISHER_AMBIGUOUS_RE,
|
||||
PUBLISHER_UNAMBIGUOUS_RE,
|
||||
PUBLISHER_AMBIGUOUS_TOKEN_RE,
|
||||
PUBLISHER_UNAMBIGUOUS_TOKEN_RE,
|
||||
REGEX_SUBS,
|
||||
REMAINING_GROUP_RE,
|
||||
SCAN_INFO_SECONDARY_RE,
|
||||
TOKEN_DELIMETER,
|
||||
VOLUME_RE,
|
||||
VOLUME_WITH_COUNT_RE,
|
||||
@ -41,6 +47,8 @@ class ComicFilenameParser:
|
||||
if not value:
|
||||
return -1
|
||||
if value not in self._path_indexes:
|
||||
# TODO This is fragile.
|
||||
# Better to get it at match time.
|
||||
if key == "ext":
|
||||
index = self.path.rfind(value)
|
||||
else:
|
||||
@ -69,12 +77,32 @@ class ComicFilenameParser:
|
||||
value = value.strip("'").strip()
|
||||
return value.strip('"').strip()
|
||||
|
||||
def _parenthify_double_underscores(self) -> str:
    """Replace paired double underscores with parentheses.

    The unparsed path is split on ``"__"``. Delimiters are treated as
    alternating open/close markers, so ``"a__b__c"`` becomes ``"a (b) c"``
    and ``"a__b__c__d__e"`` becomes ``"a (b) c (d) e"``.

    Returns:
        The rewritten, whitespace-stripped string, or the unparsed path
        unchanged when the double underscores cannot be paired (fewer than
        two delimiters, or an odd number of delimiters).
    """
    parts = self._unparsed_path.split("__")
    num_parts = len(parts)
    # N delimiters yield N + 1 parts, so balanced pairs always produce an
    # odd part count of at least three.
    if num_parts < 3 or num_parts % 2 == 0:
        return self._unparsed_path

    pieces = [parts[0]]
    for index, part in enumerate(parts[1:], start=1):
        # Odd-numbered delimiters open a group, even-numbered ones close it.
        pieces.append(" (" if index % 2 else ") ")
        pieces.append(part)
    return "".join(pieces).strip()
|
||||
|
||||
def _clean_dividers(self):
    """Replace non space dividers and clean extra spaces out of string.

    Paired double underscores are first converted to parentheses, then each
    pattern in ``REGEX_SUBS`` is applied exactly once, in mapping order, with
    its configured replacement and count. The result is written back to
    ``self._unparsed_path``.
    """
    data = self._parenthify_double_underscores()

    # Simple substitutions
    for regex, (replacement, count) in REGEX_SUBS.items():
        # Strip after every pass so later patterns see a trimmed string.
        data = regex.sub(replacement, data, count=count).strip()
    self._unparsed_path = data.strip()
|
||||
|
||||
def _parse_items(
|
||||
@ -91,7 +119,6 @@ class ComicFilenameParser:
|
||||
return
|
||||
matched_metadata = {}
|
||||
for key, value in matches.groupdict().items():
|
||||
print(f"{value=} == {exclude=}")
|
||||
if value == exclude:
|
||||
continue
|
||||
if not value:
|
||||
@ -126,6 +153,9 @@ class ComicFilenameParser:
|
||||
|
||||
def _parse_dates(self):
|
||||
"""Parse date schemes."""
|
||||
# Discard second month of alpha month ranges.
|
||||
self._unparsed_path = ALPHA_MONTH_RANGE_RE.sub(r"\1", self._unparsed_path)
|
||||
|
||||
# Month first date
|
||||
self._parse_items(MONTH_FIRST_DATE_RE)
|
||||
self._alpha_month_to_numeric()
|
||||
@ -248,6 +278,13 @@ class ComicFilenameParser:
|
||||
self._parse_items(
|
||||
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
|
||||
)
|
||||
|
||||
self._parse_items(SCAN_INFO_SECONDARY_RE)
|
||||
if (
|
||||
scan_info_secondary := self.metadata.pop("secondary_scan_info", "")
|
||||
) and "scan_info" not in self.metadata:
|
||||
self.metadata["scan_info"] = scan_info_secondary # type: ignore
|
||||
|
||||
self._log_progress("AFTER PAREN TOKENS")
|
||||
|
||||
# Series and Title
|
||||
@ -269,6 +306,18 @@ class ComicFilenameParser:
|
||||
if "issue" not in self.metadata:
|
||||
self._parse_items(ISSUE_BEGIN_RE)
|
||||
self._log_progress("AFTER ISSUE PICKUP")
|
||||
|
||||
# Publisher
|
||||
#
|
||||
# Pop single tokens so they don't end up titles.
|
||||
self._parse_items(PUBLISHER_UNAMBIGUOUS_TOKEN_RE, first_only=True)
|
||||
if "publisher" not in self.metadata:
|
||||
self._parse_items(PUBLISHER_AMBIGUOUS_TOKEN_RE, first_only=True)
|
||||
if "publisher" not in self.metadata:
|
||||
self._parse_items(PUBLISHER_UNAMBIGUOUS_RE, pop=False, first_only=True)
|
||||
if "publisher" not in self.metadata:
|
||||
self._parse_items(PUBLISHER_AMBIGUOUS_RE, pop=False, first_only=True)
|
||||
|
||||
self._assign_remaining_groups()
|
||||
self._log_progress("AFTER SERIES AND TITLE")
|
||||
|
||||
|
@ -10,6 +10,30 @@ def re_compile(exp, parenthify=False):
|
||||
return re.compile(exp, flags=re.IGNORECASE)
|
||||
|
||||
|
||||
# Regex fragments for publisher names. Each entry is joined with "|" into an
# alternation, so entries must stay valid regex source and keep their order.
# NOTE(review): "unambiguous" presumably means the name is unlikely to be part
# of a series title -- confirm against the parser's publisher handling.
PUBLISHERS_UNAMBIGUOUS = (
    r"Abrams ComicArts",
    r"BOOM! Studios",
    r"DC(\sComics)?",
    r"Dark Horse Comics",
    r"Drawn & Quarterly",
    r"Dynamite Entertainment",
    r"IDW Publishing",
    r"Icon Comics",
    r"Kodansha",
    r"Oni Press",
    r"Pantheon Books",
    r"SLG Publishing",
    r"SelfMadeHero",
    r"Titan Comics",
)
# NOTE(review): these names presumably also occur inside series titles
# (e.g. "Marvel Previews") -- confirm before matching them more aggressively.
PUBLISHERS_AMBIGUOUS = (
    r"Marvel",
    r"Heavy Metal",
    r"Epic",
    r"Image",
    r"Mirage",
)
|
||||
|
||||
ORIGINAL_FORMAT_PATTERNS = (
|
||||
r"Anthology",
|
||||
r"(One|1)[-\s]Shot",
|
||||
@ -48,7 +72,7 @@ MONTHS = (
|
||||
r"Jun(e)?",
|
||||
r"Jul(y)?",
|
||||
r"Aug(ust)?",
|
||||
r"Sept(ember)?",
|
||||
r"Sep(tember)?",
|
||||
r"Oct(ober)?",
|
||||
r"Nov(ember)?",
|
||||
r"Dec(ember)?",
|
||||
@ -74,9 +98,19 @@ REGEX_SUBS: MappingProxyType[re.Pattern, tuple[str, int]] = MappingProxyType(
|
||||
|
||||
### DATES
|
||||
# Four-digit year starting with 1 or 2, captured as ``year``.
_YEAR_RE_EXP = r"(?P<year>[12]\d{3})"
# Alphabetic month captured as ``alpha_month``; the optional trailing dot sits
# outside the named group but inside the outer group.
_MONTH_ALPHA_RE_EXP = r"((?P<alpha_month>" + r"|".join(MONTHS) + r")\.?)"
# Numeric month captured as ``month``.
_MONTH_NUMERIC_RE_EXP = r"(?P<month>0?\d|1[0-2]?)"
# Either month form.
_MONTH_RE_EXP = r"(" + _MONTH_ALPHA_RE_EXP + r"|" + _MONTH_NUMERIC_RE_EXP + r")"
|
||||
# Alphabetic month range such as "Sep-Oct": group 1 is the first month, and
# the following group holds the optional dot, the hyphen and the second month.
_ALPHA_MONTH_RANGE = (
    r"\b"
    + r"(" + r"|".join(MONTHS) + r")"
    + r"("
    + r"\.?-"
    + r"(" + r"|".join(MONTHS) + r")"
    + r")\b"
)
ALPHA_MONTH_RANGE_RE = re_compile(_ALPHA_MONTH_RANGE)
|
||||
|
||||
_DAY_RE_EXP = r"(?P<day>([0-2]?\d|(3)[0-1]))"
|
||||
_DATE_DELIM = r"[-\s]+"
|
||||
@ -124,6 +158,8 @@ ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE = re_compile(
|
||||
r"\(" + _ORIGINAL_FORMAT_RE_EXP + r"\).*\(" + _SCAN_INFO_RE_EXP + r"\)"
|
||||
)
|
||||
|
||||
SCAN_INFO_SECONDARY_RE = re_compile(r"\b(?P<secondary_scan_info>c2c)\b")
|
||||
|
||||
# ISSUE
|
||||
_ISSUE_RE_EXP = r"(?P<issue>\w*(½|\d+)[\.\d+]*\w*)"
|
||||
_ISSUE_COUNT_RE_EXP = r"\(of\s*(?P<issue_count>\d+)\)"
|
||||
@ -151,7 +187,22 @@ VOLUME_WITH_COUNT_RE = re_compile(
|
||||
)
|
||||
BOOK_VOLUME_RE = re_compile(r"(?P<title>" + r"book\s*(?P<volume>\d+)" + r")")
|
||||
|
||||
# Publisher
|
||||
# Word-bounded alternations of the publisher names; the matched name is
# captured in the ``publisher`` group.
_PUBLISHER_UNAMBIGUOUS_RE_EXP = (
    rf"(\b(?P<publisher>{'|'.join(PUBLISHERS_UNAMBIGUOUS)})\b)"
)
_PUBLISHER_AMBIGUOUS_RE_EXP = (
    rf"(\b(?P<publisher>{'|'.join(PUBLISHERS_AMBIGUOUS)})\b)"
)
# "Token" variants: the publisher must span a whole segment, bounded by the
# string start/end or a "/".
PUBLISHER_UNAMBIGUOUS_TOKEN_RE = re_compile(
    rf"(^|\/){_PUBLISHER_UNAMBIGUOUS_RE_EXP}($|\/)"
)
PUBLISHER_AMBIGUOUS_TOKEN_RE = re_compile(
    rf"(^|\/){_PUBLISHER_AMBIGUOUS_RE_EXP}($|\/)"
)
# Unanchored variants: the publisher may appear anywhere in the string.
PUBLISHER_UNAMBIGUOUS_RE = re_compile(_PUBLISHER_UNAMBIGUOUS_RE_EXP)
PUBLISHER_AMBIGUOUS_RE = re_compile(_PUBLISHER_AMBIGUOUS_RE_EXP)
|
||||
|
||||
# LONG STRINGS
|
||||
# Text that starts with a character other than "(" and runs to the last
# character that is not ")".
REMAINING_GROUP_RE = re_compile(r"^[^\(].*[^\)]")
# A dot sitting between two non-digit characters; both neighbours are captured.
NON_NUMBER_DOT_RE = re_compile(r"(\D)\.(\D)")
|
||||
|
@ -274,7 +274,8 @@ FNS.update(
|
||||
"issue": "2",
|
||||
"series": "Monster Island",
|
||||
"volume": "1",
|
||||
"remainders": ("repaired c2c",),
|
||||
"scan_info": "c2c",
|
||||
"remainders": ("(repaired)",),
|
||||
},
|
||||
# Extra - in the series
|
||||
" X-Men-V1-#067.cbr": {
|
||||
@ -334,6 +335,7 @@ FNS.update(
|
||||
"ext": "cbr",
|
||||
"issue": "002",
|
||||
"series": "Marvel Previews",
|
||||
"publisher": "Marvel",
|
||||
"month": "01",
|
||||
"year": "2022",
|
||||
},
|
||||
@ -416,36 +418,32 @@ FNS.update(
|
||||
"volume": "03",
|
||||
"year": "2020",
|
||||
},
|
||||
# c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder
|
||||
"Marvel Two In One V1 #090 c2c.cbr": {
|
||||
"ext": "cbr",
|
||||
"issue": "090",
|
||||
"series": "Marvel Two In One",
|
||||
"publisher": "Marvel",
|
||||
"volume": "1",
|
||||
"scan_info": "c2c",
|
||||
},
|
||||
# CT treats '[]' as equivalent to '()', catches DC as a publisher and 'Sep-Oct 1951' as dates and removes them. CT doesn't catch the digital though so that could be better but I blame whoever made this atrocious filename
|
||||
"Wonder Woman #49 DC Sep-Oct 1951 digital [downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire).cbz": {
|
||||
"ext": "cbz",
|
||||
"issue": "49",
|
||||
"series": "Wonder Woman",
|
||||
"publisher": "DC",
|
||||
"year": "1951",
|
||||
"month": "09",
|
||||
"remainders": (
|
||||
"digital (downsized, lightened, 4 missing story pages "
|
||||
"restored) (Shadowcat-Empire)",
|
||||
),
|
||||
},
|
||||
}
|
||||
)
|
||||
PUBLISHER = {
|
||||
# c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder
|
||||
#
|
||||
# 1. c2c is not a title and is an original_format
|
||||
# Leading common publisher may be a publisher? Do not pop
|
||||
"Marvel Two In One V1 #090 c2c.cbr": {
|
||||
"ext": "cbr",
|
||||
"issue": "090",
|
||||
"series": "Marvel Two In One",
|
||||
"publisher": "Marvel",
|
||||
"volume": "1",
|
||||
},
|
||||
# CT treats '[]' as equivalent to '()', catches DC as a publisher and 'Sep-Oct 1951' as dates and removes them. CT doesn't catch the digital though so that could be better but I blame whoever made this atrocious filename
|
||||
#
|
||||
# 1. Month-Month should be handled
|
||||
# 2. DC is a common publisher, no pop?
|
||||
"Wonder Woman #49 DC Sep-Oct 1951 digital [downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire).cbz": {
|
||||
"ext": "cbz",
|
||||
"issue": "49",
|
||||
"series": "Wonder Woman",
|
||||
"title": "digital",
|
||||
"publisher": "DC",
|
||||
"year": "1951",
|
||||
"month": "10",
|
||||
},
|
||||
}
|
||||
|
||||
# first_key, first_val = VOLUME.popitem()
|
||||
# first_key, first_val = NEW.popitem()
|
||||
# FNS[first_key] = first_val
|
||||
|
||||
WONFIX = {
|
||||
|
Loading…
Reference in New Issue
Block a user