all tests work
This commit is contained in:
parent
241e13f2d6
commit
0ff6feb3ea
@ -7,6 +7,7 @@ from re import Pattern
|
|||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
from comicfn2dict.regex import (
|
from comicfn2dict.regex import (
|
||||||
|
ALPHA_MONTH_RANGE_RE,
|
||||||
BOOK_VOLUME_RE,
|
BOOK_VOLUME_RE,
|
||||||
ISSUE_ANYWHERE_RE,
|
ISSUE_ANYWHERE_RE,
|
||||||
ISSUE_BEGIN_RE,
|
ISSUE_BEGIN_RE,
|
||||||
@ -17,8 +18,13 @@ from comicfn2dict.regex import (
|
|||||||
NON_NUMBER_DOT_RE,
|
NON_NUMBER_DOT_RE,
|
||||||
ORIGINAL_FORMAT_SCAN_INFO_RE,
|
ORIGINAL_FORMAT_SCAN_INFO_RE,
|
||||||
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
|
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
|
||||||
|
PUBLISHER_AMBIGUOUS_RE,
|
||||||
|
PUBLISHER_UNAMBIGUOUS_RE,
|
||||||
|
PUBLISHER_AMBIGUOUS_TOKEN_RE,
|
||||||
|
PUBLISHER_UNAMBIGUOUS_TOKEN_RE,
|
||||||
REGEX_SUBS,
|
REGEX_SUBS,
|
||||||
REMAINING_GROUP_RE,
|
REMAINING_GROUP_RE,
|
||||||
|
SCAN_INFO_SECONDARY_RE,
|
||||||
TOKEN_DELIMETER,
|
TOKEN_DELIMETER,
|
||||||
VOLUME_RE,
|
VOLUME_RE,
|
||||||
VOLUME_WITH_COUNT_RE,
|
VOLUME_WITH_COUNT_RE,
|
||||||
@ -41,6 +47,8 @@ class ComicFilenameParser:
|
|||||||
if not value:
|
if not value:
|
||||||
return -1
|
return -1
|
||||||
if value not in self._path_indexes:
|
if value not in self._path_indexes:
|
||||||
|
# TODO This is fragile.
|
||||||
|
# Better to get it at match time.
|
||||||
if key == "ext":
|
if key == "ext":
|
||||||
index = self.path.rfind(value)
|
index = self.path.rfind(value)
|
||||||
else:
|
else:
|
||||||
@ -69,12 +77,32 @@ class ComicFilenameParser:
|
|||||||
value = value.strip("'").strip()
|
value = value.strip("'").strip()
|
||||||
return value.strip('"').strip()
|
return value.strip('"').strip()
|
||||||
|
|
||||||
|
def _parenthify_double_underscores(self) -> str:
    """Turn ``__``-delimited segments of the unparsed path into paren groups.

    The unparsed path is split on double underscores. Segments at odd
    positions (the text between an opening and a closing ``__``) are
    wrapped in parentheses, e.g. ``a__b__c`` becomes ``a (b) c``.

    Returns:
        The rewritten, whitespace-stripped string, or the unparsed path
        unchanged when there are fewer than three segments or the segment
        count is even (i.e. the ``__`` markers are not paired).
    """
    parts = self._unparsed_path.split("__")
    num_parts = len(parts)
    if num_parts < 3 or num_parts % 2 == 0:
        return self._unparsed_path

    pieces = [parts[0]]
    for index, part in enumerate(parts[1:], start=1):
        # Odd segments sit inside a __...__ pair, so they are preceded by an
        # opening paren; the even segment that follows closes it. The
        # delimiter must alternate for every pair, not just the first one.
        pieces.append(" (" if index % 2 else ") ")
        pieces.append(part)
    return "".join(pieces).strip()
|
||||||
|
|
||||||
def _clean_dividers(self):
|
def _clean_dividers(self):
|
||||||
"""Replace non space dividers and clean extra spaces out of string."""
|
"""Replace non space dividers and clean extra spaces out of string."""
|
||||||
data = self._unparsed_path
|
data = self._parenthify_double_underscores()
|
||||||
|
|
||||||
|
# Simple substitutions
|
||||||
for regex, pair in REGEX_SUBS.items():
|
for regex, pair in REGEX_SUBS.items():
|
||||||
replacement, count = pair
|
replacement, count = pair
|
||||||
data = regex.sub(replacement, data, count=count)
|
data = regex.sub(replacement, data, count=count).strip()
|
||||||
self._unparsed_path = data.strip()
|
self._unparsed_path = data.strip()
|
||||||
|
|
||||||
def _parse_items(
|
def _parse_items(
|
||||||
@ -91,7 +119,6 @@ class ComicFilenameParser:
|
|||||||
return
|
return
|
||||||
matched_metadata = {}
|
matched_metadata = {}
|
||||||
for key, value in matches.groupdict().items():
|
for key, value in matches.groupdict().items():
|
||||||
print(f"{value=} == {exclude=}")
|
|
||||||
if value == exclude:
|
if value == exclude:
|
||||||
continue
|
continue
|
||||||
if not value:
|
if not value:
|
||||||
@ -126,6 +153,9 @@ class ComicFilenameParser:
|
|||||||
|
|
||||||
def _parse_dates(self):
|
def _parse_dates(self):
|
||||||
"""Parse date schemes."""
|
"""Parse date schemes."""
|
||||||
|
# Discard second month of alpha month ranges.
|
||||||
|
self._unparsed_path = ALPHA_MONTH_RANGE_RE.sub(r"\1", self._unparsed_path)
|
||||||
|
|
||||||
# Month first date
|
# Month first date
|
||||||
self._parse_items(MONTH_FIRST_DATE_RE)
|
self._parse_items(MONTH_FIRST_DATE_RE)
|
||||||
self._alpha_month_to_numeric()
|
self._alpha_month_to_numeric()
|
||||||
@ -248,6 +278,13 @@ class ComicFilenameParser:
|
|||||||
self._parse_items(
|
self._parse_items(
|
||||||
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
|
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self._parse_items(SCAN_INFO_SECONDARY_RE)
|
||||||
|
if (
|
||||||
|
scan_info_secondary := self.metadata.pop("secondary_scan_info", "")
|
||||||
|
) and "scan_info" not in self.metadata:
|
||||||
|
self.metadata["scan_info"] = scan_info_secondary # type: ignore
|
||||||
|
|
||||||
self._log_progress("AFTER PAREN TOKENS")
|
self._log_progress("AFTER PAREN TOKENS")
|
||||||
|
|
||||||
# Series and Title
|
# Series and Title
|
||||||
@ -269,6 +306,18 @@ class ComicFilenameParser:
|
|||||||
if "issue" not in self.metadata:
|
if "issue" not in self.metadata:
|
||||||
self._parse_items(ISSUE_BEGIN_RE)
|
self._parse_items(ISSUE_BEGIN_RE)
|
||||||
self._log_progress("AFTER ISSUE PICKUP")
|
self._log_progress("AFTER ISSUE PICKUP")
|
||||||
|
|
||||||
|
# Publisher
|
||||||
|
#
|
||||||
|
# Pop single tokens so they don't end up titles.
|
||||||
|
self._parse_items(PUBLISHER_UNAMBIGUOUS_TOKEN_RE, first_only=True)
|
||||||
|
if "publisher" not in self.metadata:
|
||||||
|
self._parse_items(PUBLISHER_AMBIGUOUS_TOKEN_RE, first_only=True)
|
||||||
|
if "publisher" not in self.metadata:
|
||||||
|
self._parse_items(PUBLISHER_UNAMBIGUOUS_RE, pop=False, first_only=True)
|
||||||
|
if "publisher" not in self.metadata:
|
||||||
|
self._parse_items(PUBLISHER_AMBIGUOUS_RE, pop=False, first_only=True)
|
||||||
|
|
||||||
self._assign_remaining_groups()
|
self._assign_remaining_groups()
|
||||||
self._log_progress("AFTER SERIES AND TITLE")
|
self._log_progress("AFTER SERIES AND TITLE")
|
||||||
|
|
||||||
|
@ -10,6 +10,30 @@ def re_compile(exp, parenthify=False):
|
|||||||
return re.compile(exp, flags=re.IGNORECASE)
|
return re.compile(exp, flags=re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
|
PUBLISHERS_UNAMBIGUOUS = (
|
||||||
|
r"Abrams ComicArts",
|
||||||
|
r"BOOM! Studios",
|
||||||
|
r"DC(\sComics)?",
|
||||||
|
r"Dark Horse Comics",
|
||||||
|
r"Drawn & Quarterly",
|
||||||
|
r"Dynamite Entertainment",
|
||||||
|
r"IDW Publishing",
|
||||||
|
r"Icon Comics",
|
||||||
|
r"Kodansha",
|
||||||
|
r"Oni Press",
|
||||||
|
r"Pantheon Books",
|
||||||
|
r"SLG Publishing",
|
||||||
|
r"SelfMadeHero",
|
||||||
|
r"Titan Comics",
|
||||||
|
)
|
||||||
|
PUBLISHERS_AMBIGUOUS = (
|
||||||
|
r"Marvel",
|
||||||
|
r"Heavy Metal",
|
||||||
|
r"Epic",
|
||||||
|
r"Image",
|
||||||
|
r"Mirage",
|
||||||
|
)
|
||||||
|
|
||||||
ORIGINAL_FORMAT_PATTERNS = (
|
ORIGINAL_FORMAT_PATTERNS = (
|
||||||
r"Anthology",
|
r"Anthology",
|
||||||
r"(One|1)[-\s]Shot",
|
r"(One|1)[-\s]Shot",
|
||||||
@ -48,7 +72,7 @@ MONTHS = (
|
|||||||
r"Jun(e)?",
|
r"Jun(e)?",
|
||||||
r"Jul(y)?",
|
r"Jul(y)?",
|
||||||
r"Aug(ust)?",
|
r"Aug(ust)?",
|
||||||
r"Sept(ember)?",
|
r"Sep(tember)?",
|
||||||
r"Oct(ober)?",
|
r"Oct(ober)?",
|
||||||
r"Nov(ember)?",
|
r"Nov(ember)?",
|
||||||
r"Dec(ember)?",
|
r"Dec(ember)?",
|
||||||
@ -74,9 +98,19 @@ REGEX_SUBS: MappingProxyType[re.Pattern, tuple[str, int]] = MappingProxyType(
|
|||||||
|
|
||||||
### DATES
|
### DATES
|
||||||
_YEAR_RE_EXP = r"(?P<year>[12]\d{3})"
|
_YEAR_RE_EXP = r"(?P<year>[12]\d{3})"
|
||||||
_MONTH_ALPHA_RE_EXP = r"(?P<alpha_month>" + r"|".join(MONTHS) + r")\.?"
|
_MONTH_ALPHA_RE_EXP = r"(" + "(?P<alpha_month>" + r"|".join(MONTHS) + r")\.?" r")"
|
||||||
_MONTH_NUMERIC_RE_EXP = r"(?P<month>0?\d|1[0-2]?)"
|
_MONTH_NUMERIC_RE_EXP = r"(?P<month>0?\d|1[0-2]?)"
|
||||||
_MONTH_RE_EXP = r"(" + _MONTH_ALPHA_RE_EXP + r"|" + _MONTH_NUMERIC_RE_EXP + r")"
|
_MONTH_RE_EXP = r"(" + _MONTH_ALPHA_RE_EXP + r"|" + _MONTH_NUMERIC_RE_EXP + r")"
|
||||||
|
# Matches an alphabetic month range such as "Sep-Oct" or "Sep.-Oct".
# Group 1 is the first month; the second group spans the optional dot, the
# dash and the second month. Callers substitute r"\1" to keep only the first
# month, so the group order here must not change.
_ALPHA_MONTH_RANGE = (
    r"\b"
    + r"(" + r"|".join(MONTHS) + r")"
    + r"("
    + r"\.?-"
    + r"(" + r"|".join(MONTHS) + r")"
    + r")\b"
)
ALPHA_MONTH_RANGE_RE = re_compile(_ALPHA_MONTH_RANGE)
|
||||||
|
|
||||||
_DAY_RE_EXP = r"(?P<day>([0-2]?\d|(3)[0-1]))"
|
_DAY_RE_EXP = r"(?P<day>([0-2]?\d|(3)[0-1]))"
|
||||||
_DATE_DELIM = r"[-\s]+"
|
_DATE_DELIM = r"[-\s]+"
|
||||||
@ -124,6 +158,8 @@ ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE = re_compile(
|
|||||||
r"\(" + _ORIGINAL_FORMAT_RE_EXP + r"\).*\(" + _SCAN_INFO_RE_EXP + r"\)"
|
r"\(" + _ORIGINAL_FORMAT_RE_EXP + r"\).*\(" + _SCAN_INFO_RE_EXP + r"\)"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
SCAN_INFO_SECONDARY_RE = re_compile(r"\b(?P<secondary_scan_info>c2c)\b")
|
||||||
|
|
||||||
# ISSUE
|
# ISSUE
|
||||||
_ISSUE_RE_EXP = r"(?P<issue>\w*(½|\d+)[\.\d+]*\w*)"
|
_ISSUE_RE_EXP = r"(?P<issue>\w*(½|\d+)[\.\d+]*\w*)"
|
||||||
_ISSUE_COUNT_RE_EXP = r"\(of\s*(?P<issue_count>\d+)\)"
|
_ISSUE_COUNT_RE_EXP = r"\(of\s*(?P<issue_count>\d+)\)"
|
||||||
@ -151,7 +187,22 @@ VOLUME_WITH_COUNT_RE = re_compile(
|
|||||||
)
|
)
|
||||||
BOOK_VOLUME_RE = re_compile(r"(?P<title>" + r"book\s*(?P<volume>\d+)" + r")")
|
BOOK_VOLUME_RE = re_compile(r"(?P<title>" + r"book\s*(?P<volume>\d+)" + r")")
|
||||||
|
|
||||||
|
# Publisher
|
||||||
|
_PUBLISHER_UNAMBIGUOUS_RE_EXP = (
|
||||||
|
r"(\b(?P<publisher>" + r"|".join(PUBLISHERS_UNAMBIGUOUS) + r")\b)"
|
||||||
|
)
|
||||||
|
_PUBLISHER_AMBIGUOUS_RE_EXP = (
|
||||||
|
r"(\b(?P<publisher>" + r"|".join(PUBLISHERS_AMBIGUOUS) + r")\b)"
|
||||||
|
)
|
||||||
|
PUBLISHER_UNAMBIGUOUS_TOKEN_RE = re_compile(
|
||||||
|
r"(^|\/)" + _PUBLISHER_UNAMBIGUOUS_RE_EXP + r"($|\/)"
|
||||||
|
)
|
||||||
|
PUBLISHER_AMBIGUOUS_TOKEN_RE = re_compile(
|
||||||
|
r"(^|\/)" + _PUBLISHER_AMBIGUOUS_RE_EXP + r"($|\/)"
|
||||||
|
)
|
||||||
|
PUBLISHER_UNAMBIGUOUS_RE = re_compile(_PUBLISHER_UNAMBIGUOUS_RE_EXP)
|
||||||
|
PUBLISHER_AMBIGUOUS_RE = re_compile(_PUBLISHER_AMBIGUOUS_RE_EXP)
|
||||||
|
|
||||||
# LONG STRINGS
|
# LONG STRINGS
|
||||||
REMAINING_GROUP_RE = re_compile(r"^[^\()].*[^\)]")
|
REMAINING_GROUP_RE = re_compile(r"^[^\(].*[^\)]")
|
||||||
NON_NUMBER_DOT_RE = re_compile(r"(\D)\.(\D)")
|
NON_NUMBER_DOT_RE = re_compile(r"(\D)\.(\D)")
|
||||||
|
@ -274,7 +274,8 @@ FNS.update(
|
|||||||
"issue": "2",
|
"issue": "2",
|
||||||
"series": "Monster Island",
|
"series": "Monster Island",
|
||||||
"volume": "1",
|
"volume": "1",
|
||||||
"remainders": ("repaired c2c",),
|
"scan_info": "c2c",
|
||||||
|
"remainders": ("(repaired)",),
|
||||||
},
|
},
|
||||||
# Extra - in the series
|
# Extra - in the series
|
||||||
" X-Men-V1-#067.cbr": {
|
" X-Men-V1-#067.cbr": {
|
||||||
@ -334,6 +335,7 @@ FNS.update(
|
|||||||
"ext": "cbr",
|
"ext": "cbr",
|
||||||
"issue": "002",
|
"issue": "002",
|
||||||
"series": "Marvel Previews",
|
"series": "Marvel Previews",
|
||||||
|
"publisher": "Marvel",
|
||||||
"month": "01",
|
"month": "01",
|
||||||
"year": "2022",
|
"year": "2022",
|
||||||
},
|
},
|
||||||
@ -416,36 +418,32 @@ FNS.update(
|
|||||||
"volume": "03",
|
"volume": "03",
|
||||||
"year": "2020",
|
"year": "2020",
|
||||||
},
|
},
|
||||||
}
|
|
||||||
)
|
|
||||||
PUBLISHER = {
|
|
||||||
# c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder
|
# c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder
|
||||||
#
|
|
||||||
# 1. c2c is not a title and is an original_format
|
|
||||||
# Leading common publisher may be a publisher? Do not pop
|
|
||||||
"Marvel Two In One V1 #090 c2c.cbr": {
|
"Marvel Two In One V1 #090 c2c.cbr": {
|
||||||
"ext": "cbr",
|
"ext": "cbr",
|
||||||
"issue": "090",
|
"issue": "090",
|
||||||
"series": "Marvel Two In One",
|
"series": "Marvel Two In One",
|
||||||
"publisher": "Marvel",
|
"publisher": "Marvel",
|
||||||
"volume": "1",
|
"volume": "1",
|
||||||
|
"scan_info": "c2c",
|
||||||
},
|
},
|
||||||
# CT treats '[]' as equivalent to '()', catches DC as a publisher and 'Sep-Oct 1951' as dates and removes them. CT doesn't catch the digital though so that could be better but I blame whoever made this atrocious filename
|
# CT treats '[]' as equivalent to '()', catches DC as a publisher and 'Sep-Oct 1951' as dates and removes them. CT doesn't catch the digital though so that could be better but I blame whoever made this atrocious filename
|
||||||
#
|
|
||||||
# 1. Month-Month should be handled
|
|
||||||
# 2. DC is a common publisher, no pop?
|
|
||||||
"Wonder Woman #49 DC Sep-Oct 1951 digital [downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire).cbz": {
|
"Wonder Woman #49 DC Sep-Oct 1951 digital [downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire).cbz": {
|
||||||
"ext": "cbz",
|
"ext": "cbz",
|
||||||
"issue": "49",
|
"issue": "49",
|
||||||
"series": "Wonder Woman",
|
"series": "Wonder Woman",
|
||||||
"title": "digital",
|
|
||||||
"publisher": "DC",
|
"publisher": "DC",
|
||||||
"year": "1951",
|
"year": "1951",
|
||||||
"month": "10",
|
"month": "09",
|
||||||
|
"remainders": (
|
||||||
|
"digital (downsized, lightened, 4 missing story pages "
|
||||||
|
"restored) (Shadowcat-Empire)",
|
||||||
|
),
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
)
|
||||||
|
|
||||||
# first_key, first_val = VOLUME.popitem()
|
# first_key, first_val = NEW.popitem()
|
||||||
# FNS[first_key] = first_val
|
# FNS[first_key] = first_val
|
||||||
|
|
||||||
WONFIX = {
|
WONFIX = {
|
||||||
|
Loading…
Reference in New Issue
Block a user