complicated year and volume parsing
This commit is contained in:
parent
7d9b4efeee
commit
2c0ab37d83
@ -16,6 +16,7 @@ from comicfn2dict.regex import (
|
||||
ISSUE_NUMBER_RE,
|
||||
ISSUE_BEGIN_RE,
|
||||
ISSUE_END_RE,
|
||||
YEAR_END_RE,
|
||||
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
|
||||
ORIGINAL_FORMAT_SCAN_INFO_RE,
|
||||
REMAINING_GROUP_RE,
|
||||
@ -78,6 +79,9 @@ class ComicFilenameParser:
|
||||
self,
|
||||
regex: Pattern,
|
||||
require_all: bool = False,
|
||||
exclude: str = "",
|
||||
first_only: bool = False,
|
||||
pop: bool = True,
|
||||
) -> None:
|
||||
"""Parse a value from the data list into metadata and alter the data list."""
|
||||
matches = regex.search(self._unparsed_path)
|
||||
@ -85,15 +89,23 @@ class ComicFilenameParser:
|
||||
return
|
||||
matched_metadata = {}
|
||||
for key, value in matches.groupdict().items():
|
||||
print(f"{value=} == {exclude=}")
|
||||
if value == exclude:
|
||||
continue
|
||||
if not value:
|
||||
if require_all:
|
||||
return
|
||||
continue
|
||||
# TODO: unclear whether the strip is necessary here
|
||||
matched_metadata[key] = self._grouping_operators_strip(value)
|
||||
if first_only:
|
||||
break
|
||||
self.metadata.update(matched_metadata)
|
||||
|
||||
marked_str = regex.sub(TOKEN_DELIMETER, self._unparsed_path)
|
||||
if not matched_metadata or not pop:
|
||||
return
|
||||
count = 1 if first_only else 0
|
||||
marked_str = regex.sub(TOKEN_DELIMETER, self._unparsed_path, count=count)
|
||||
parts = []
|
||||
for part in marked_str.split(TOKEN_DELIMETER):
|
||||
if token := part.strip():
|
||||
@ -122,7 +134,15 @@ class ComicFilenameParser:
|
||||
self._alpha_month_to_numeric()
|
||||
|
||||
if "year" not in self.metadata:
|
||||
self._parse_items(YEAR_TOKEN_RE)
|
||||
self._parse_items(YEAR_TOKEN_RE, first_only=True)
|
||||
if "volume" in self.metadata:
|
||||
return
|
||||
# A second year will be the real year.
|
||||
# Move the first year to volume
|
||||
if volume := self.metadata.get("year", ""):
|
||||
self._parse_items(YEAR_TOKEN_RE)
|
||||
if self.metadata.get("year", "") != volume:
|
||||
self.metadata["volume"] = volume
|
||||
|
||||
def _is_title_in_position(self, value):
|
||||
"""Does the title come after the series and one other token, if they exist?"""
|
||||
@ -191,15 +211,27 @@ class ComicFilenameParser:
|
||||
|
||||
def parse(self) -> dict[str, Any]:
|
||||
"""Parse the filename with a hierarchy of regexes."""
|
||||
# Init
|
||||
#
|
||||
self._log_progress("INITIAL")
|
||||
self._parse_ext()
|
||||
self._clean_dividers()
|
||||
self._log_progress("CLEANED")
|
||||
|
||||
# Parse paren tokens
|
||||
# Main issue parsing
|
||||
#
|
||||
self._parse_items(ISSUE_NUMBER_RE)
|
||||
self._parse_items(ISSUE_COUNT_RE)
|
||||
self._log_progress("AFTER ISSUE")
|
||||
|
||||
# Volume and date
|
||||
#
|
||||
self._parse_items(VOLUME_RE)
|
||||
self._parse_dates()
|
||||
self._log_progress("AFTER VOLUME & DATE")
|
||||
|
||||
# Format & Scan Info
|
||||
#
|
||||
self._parse_items(
|
||||
ORIGINAL_FORMAT_SCAN_INFO_RE,
|
||||
require_all=True,
|
||||
@ -210,19 +242,21 @@ class ComicFilenameParser:
|
||||
)
|
||||
self._log_progress("AFTER PAREN TOKENS")
|
||||
|
||||
# Parse regular tokens
|
||||
self._parse_items(VOLUME_RE)
|
||||
self._log_progress("AFTER REGULAR TOKENS")
|
||||
# Series and Title
|
||||
#
|
||||
# Match years on the end of series and title tokens
|
||||
year_end_matched = False
|
||||
if "year" not in self.metadata:
|
||||
self._parse_items(YEAR_END_RE, pop=False)
|
||||
year_end_matched = "year" in self.metadata
|
||||
|
||||
# Pickup issue if it's a standalone token
|
||||
if "issue" not in self.metadata:
|
||||
self._parse_items(ISSUE_END_RE)
|
||||
# Pickup issue if it's out on the end of a token
|
||||
if "issue" not in self.metadata and not year_end_matched:
|
||||
exclude: str = self.metadata.get("year", "") # type: ignore
|
||||
self._parse_items(ISSUE_END_RE, exclude=exclude)
|
||||
if "issue" not in self.metadata:
|
||||
self._parse_items(ISSUE_BEGIN_RE)
|
||||
|
||||
self._log_progress("AFTER ISSUE PICKUP")
|
||||
|
||||
# Series and Title. Also looks for issue.
|
||||
self._assign_remaining_groups()
|
||||
self._log_progress("AFTER SERIES AND TITLE")
|
||||
|
||||
@ -233,6 +267,7 @@ class ComicFilenameParser:
|
||||
self._log_progress("AFTER ISSUE PICKUP")
|
||||
|
||||
# Copy volume into issue if it's all we have.
|
||||
#
|
||||
if "issue" not in self.metadata and "volume" in self.metadata:
|
||||
self.metadata["issue"] = self.metadata["volume"]
|
||||
|
||||
|
@ -60,11 +60,15 @@ TOKEN_DELIMETER = r"/"
|
||||
_TOKEN_DIVIDERS_RE = re_compile(r":")
|
||||
_SPACE_EQUIVALENT_RE = re_compile(r"_")
|
||||
_EXTRA_SPACES_RE = re_compile(r"\s\s+")
|
||||
_LEFT_PAREN_EQUIVALENT_RE = re_compile(r"\[")
|
||||
_RIGHT_PAREN_EQUIVALENT_RE = re_compile(r"\]")
|
||||
REGEX_SUBS: MappingProxyType[re.Pattern, tuple[str, int]] = MappingProxyType(
|
||||
{
|
||||
_TOKEN_DIVIDERS_RE: (TOKEN_DELIMETER, 1),
|
||||
_SPACE_EQUIVALENT_RE: (r" ", 0),
|
||||
_EXTRA_SPACES_RE: (r" ", 0),
|
||||
_LEFT_PAREN_EQUIVALENT_RE: (r"(", 0),
|
||||
_RIGHT_PAREN_EQUIVALENT_RE: (r")", 0),
|
||||
}
|
||||
)
|
||||
|
||||
@ -104,6 +108,7 @@ _YEAR_FIRST_DATE_RE_EXP = (
|
||||
MONTH_FIRST_DATE_RE = re_compile(_MONTH_FIRST_DATE_RE_EXP)
|
||||
YEAR_FIRST_DATE_RE = re_compile(_YEAR_FIRST_DATE_RE_EXP)
|
||||
YEAR_TOKEN_RE = re_compile(_YEAR_RE_EXP, parenthify=True)
|
||||
# Match a year that ends a token: it must be followed by the token delimiter
# ("/") or by the end of the string. The alternation is grouped so that "|"
# does not split the whole pattern into "<year>/" OR a bare "$", which would
# match the empty end of string without ever capturing a trailing year.
YEAR_END_RE = re_compile(_YEAR_RE_EXP + r"(?:\/|$)")
|
||||
|
||||
# PAREN GROUPS
|
||||
ISSUE_COUNT_RE = re_compile(r"of\s*(?P<issue_count>\d+)", parenthify=True)
|
||||
|
@ -383,25 +383,26 @@ FNS.update(
|
||||
"title": "Anda's Game",
|
||||
"year": "2007",
|
||||
},
|
||||
# If a title ends in a year, that number is not an issue (and it is used as the year if no other year is found)
|
||||
"Blade Runner Free Comic Book Day 2021 (2021).cbr": {
|
||||
"ext": "cbr",
|
||||
"series": "Blade Runner Free Comic Book Day 2021",
|
||||
"year": "2021",
|
||||
},
|
||||
# If a year occurs after another year and there is no volume, treat the first year as the volume and the second as the year
|
||||
"Super Strange Yarns (1957) #92 (1969).cbz": {
|
||||
"ext": "cbz",
|
||||
"issue": "92",
|
||||
"series": "Super Strange Yarns",
|
||||
"volume": "1957",
|
||||
"year": "1969",
|
||||
},
|
||||
}
|
||||
)
|
||||
DIFFICULT = {
|
||||
# I'm not sure there's a right way to parse this. This might also be a made-up filename; I don't remember.
|
||||
# If a year occurs after another year and there is no volume, treat the first year as the volume and the second as the year
|
||||
"Super Strange Yarns (1957) #92 (1969).cbz": {
|
||||
"ext": "cbz",
|
||||
"issue": "92",
|
||||
"series": "Super Strange Yarns",
|
||||
"volume": "1957",
|
||||
"year": "1969",
|
||||
},
|
||||
# CT has extra processing to re-attach the year in this case
|
||||
"Blade Runner Free Comic Book Day 2021 (2021).cbr": {
|
||||
"ext": "cbr",
|
||||
"series": "Blade Runner Free Comic Book Day 2021",
|
||||
"year": "2021",
|
||||
},
|
||||
VOLUME = {
|
||||
# CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series)
|
||||
#
|
||||
# Book \d is a non-popped volume not an issue
|
||||
"Bloodshot Book 03 (2020).cbr": {
|
||||
"ext": "cbr",
|
||||
"issue": "03",
|
||||
@ -411,6 +412,9 @@ DIFFICULT = {
|
||||
"year": "2020",
|
||||
},
|
||||
# CT checks for the following '(of 06)' after the '03' and marks it as the volume
|
||||
#
|
||||
# The issue count is not popped if it does not occur near the issue
|
||||
# \d (of \d) is volume & volume count if not issue
|
||||
"Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021).cbr": {
|
||||
"ext": "cbr",
|
||||
"issue": "008",
|
||||
@ -420,7 +424,12 @@ DIFFICULT = {
|
||||
"year": "2021",
|
||||
"volume_count": "06",
|
||||
},
|
||||
}
|
||||
PUBLISHER = {
|
||||
# c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder
|
||||
#
|
||||
# 1. c2c is not a title and is an original_format
|
||||
# Leading common publisher may be a publisher? Do not pop
|
||||
"Marvel Two In One V1 #090 c2c.cbr": {
|
||||
"ext": "cbr",
|
||||
"issue": "090",
|
||||
@ -429,6 +438,9 @@ DIFFICULT = {
|
||||
"volume": "1",
|
||||
},
|
||||
# CT treats '[]' as equivalent to '()', catches DC as a publisher and 'Sep-Oct 1951' as dates, and removes them. CT doesn't catch the 'digital', though, so that could be better, but I blame whoever made this atrocious filename.
|
||||
#
|
||||
# 1. Month-Month should be handled
|
||||
# 2. DC is a common publisher, no pop?
|
||||
"Wonder Woman #49 DC Sep-Oct 1951 digital [downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire).cbz": {
|
||||
"ext": "cbz",
|
||||
"issue": "49",
|
||||
@ -440,7 +452,7 @@ DIFFICULT = {
|
||||
},
|
||||
}
|
||||
|
||||
# first_key, first_val = DIFFICULT.popitem()
|
||||
# first_key, first_val = YEAR.popitem()
|
||||
# FNS[first_key] = first_val
|
||||
|
||||
WONFIX = {
|
||||
|
Loading…
Reference in New Issue
Block a user