complicated year and volume parsing

This commit is contained in:
AJ Slater 2024-02-21 11:32:55 -08:00
parent 7d9b4efeee
commit 2c0ab37d83
3 changed files with 81 additions and 29 deletions

View File

@ -16,6 +16,7 @@ from comicfn2dict.regex import (
ISSUE_NUMBER_RE,
ISSUE_BEGIN_RE,
ISSUE_END_RE,
YEAR_END_RE,
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
ORIGINAL_FORMAT_SCAN_INFO_RE,
REMAINING_GROUP_RE,
@ -78,6 +79,9 @@ class ComicFilenameParser:
self,
regex: Pattern,
require_all: bool = False,
exclude: str = "",
first_only: bool = False,
pop: bool = True,
) -> None:
"""Parse a value from the data list into metadata and alter the data list."""
matches = regex.search(self._unparsed_path)
@ -85,15 +89,23 @@ class ComicFilenameParser:
return
matched_metadata = {}
for key, value in matches.groupdict().items():
print(f"{value=} == {exclude=}")
if value == exclude:
continue
if not value:
if require_all:
return
continue
# TODO idk if strip is necessary here
matched_metadata[key] = self._grouping_operators_strip(value)
if first_only:
break
self.metadata.update(matched_metadata)
marked_str = regex.sub(TOKEN_DELIMETER, self._unparsed_path)
if not matched_metadata or not pop:
return
count = 1 if first_only else 0
marked_str = regex.sub(TOKEN_DELIMETER, self._unparsed_path, count=count)
parts = []
for part in marked_str.split(TOKEN_DELIMETER):
if token := part.strip():
@ -122,7 +134,15 @@ class ComicFilenameParser:
self._alpha_month_to_numeric()
if "year" not in self.metadata:
self._parse_items(YEAR_TOKEN_RE)
self._parse_items(YEAR_TOKEN_RE, first_only=True)
if "volume" in self.metadata:
return
# A second year will be the real year.
# Move the first year to volume
if volume := self.metadata.get("year", ""):
self._parse_items(YEAR_TOKEN_RE)
if self.metadata.get("year", "") != volume:
self.metadata["volume"] = volume
def _is_title_in_position(self, value):
"""Does the title come after series and one other token if they exist."""
@ -191,15 +211,27 @@ class ComicFilenameParser:
def parse(self) -> dict[str, Any]:
"""Parse the filename with a hierarchy of regexes."""
# Init
#
self._log_progress("INITIAL")
self._parse_ext()
self._clean_dividers()
self._log_progress("CLEANED")
# Parse paren tokens
# Main issue parsing
#
self._parse_items(ISSUE_NUMBER_RE)
self._parse_items(ISSUE_COUNT_RE)
self._log_progress("AFTER ISSUE")
# Volume and date
#
self._parse_items(VOLUME_RE)
self._parse_dates()
self._log_progress("AFTER VOLUME & DATE")
# Format & Scan Info
#
self._parse_items(
ORIGINAL_FORMAT_SCAN_INFO_RE,
require_all=True,
@ -210,19 +242,21 @@ class ComicFilenameParser:
)
self._log_progress("AFTER PAREN TOKENS")
# Parse regular tokens
self._parse_items(VOLUME_RE)
self._log_progress("AFTER REGULAR TOKENS")
# Series and Title
#
# Match years on the end of series and title tokens
year_end_matched = False
if "year" not in self.metadata:
self._parse_items(YEAR_END_RE, pop=False)
year_end_matched = "year" in self.metadata
# Pickup issue if it's a standalone token
if "issue" not in self.metadata:
self._parse_items(ISSUE_END_RE)
# Pickup issue if it's out on the end of a token
if "issue" not in self.metadata and not year_end_matched:
exclude: str = self.metadata.get("year", "") # type: ignore
self._parse_items(ISSUE_END_RE, exclude=exclude)
if "issue" not in self.metadata:
self._parse_items(ISSUE_BEGIN_RE)
self._log_progress("AFTER ISSUE PICKUP")
# Series and Title. Also looks for issue.
self._assign_remaining_groups()
self._log_progress("AFTER SERIES AND TITLE")
@ -233,6 +267,7 @@ class ComicFilenameParser:
self._log_progress("AFTER ISSUE PICKUP")
# Copy volume into issue if it's all we have.
#
if "issue" not in self.metadata and "volume" in self.metadata:
self.metadata["issue"] = self.metadata["volume"]

View File

@ -60,11 +60,15 @@ TOKEN_DELIMETER = r"/"
_TOKEN_DIVIDERS_RE = re_compile(r":")
_SPACE_EQUIVALENT_RE = re_compile(r"_")
_EXTRA_SPACES_RE = re_compile(r"\s\s+")
_LEFT_PAREN_EQUIVALENT_RE = re_compile(r"\[")
_RIGHT_PAREN_EQUIVALENT_RE = re_compile(r"\]")
REGEX_SUBS: MappingProxyType[re.Pattern, tuple[str, int]] = MappingProxyType(
{
_TOKEN_DIVIDERS_RE: (TOKEN_DELIMETER, 1),
_SPACE_EQUIVALENT_RE: (r" ", 0),
_EXTRA_SPACES_RE: (r" ", 0),
_LEFT_PAREN_EQUIVALENT_RE: (r"(", 0),
_RIGHT_PAREN_EQUIVALENT_RE: (r")", 0),
}
)
@ -104,6 +108,7 @@ _YEAR_FIRST_DATE_RE_EXP = (
MONTH_FIRST_DATE_RE = re_compile(_MONTH_FIRST_DATE_RE_EXP)
YEAR_FIRST_DATE_RE = re_compile(_YEAR_FIRST_DATE_RE_EXP)
YEAR_TOKEN_RE = re_compile(_YEAR_RE_EXP, parenthify=True)
YEAR_END_RE = re_compile(_YEAR_RE_EXP + r"\/|$")
# PAREN GROUPS
ISSUE_COUNT_RE = re_compile(r"of\s*(?P<issue_count>\d+)", parenthify=True)

View File

@ -383,25 +383,26 @@ FNS.update(
"title": "Anda's Game",
"year": "2007",
},
# If a title ends in a year, it's not an issue (and is a year if no year)
"Blade Runner Free Comic Book Day 2021 (2021).cbr": {
"ext": "cbr",
"series": "Blade Runner Free Comic Book Day 2021",
"year": "2021",
},
# If a year occurs after another year, and no volume, do volume / year
"Super Strange Yarns (1957) #92 (1969).cbz": {
"ext": "cbz",
"issue": "92",
"series": "Super Strange Yarns",
"volume": "1957",
"year": "1969",
},
}
)
DIFFICULT = {
# I'm not sure there's a right way to parse this. This might also be a made-up filename; I don't remember.
# if a year occurs after another year, and no volume, do volume / year
"Super Strange Yarns (1957) #92 (1969).cbz": {
"ext": "cbz",
"issue": "92",
"series": "Super Strange Yarns",
"volume": "1957",
"year": "1969",
},
# CT has extra processing to re-attach the year in this case
"Blade Runner Free Comic Book Day 2021 (2021).cbr": {
"ext": "cbr",
"series": "Blade Runner Free Comic Book Day 2021",
"year": "2021",
},
VOLUME = {
# CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series)
#
# Book \d is a non-popped volume not an issue
"Bloodshot Book 03 (2020).cbr": {
"ext": "cbr",
"issue": "03",
@ -411,6 +412,9 @@ DIFFICULT = {
"year": "2020",
},
# CT checks for the following '(of 06)' after the '03' and marks it as the volume
#
# issue count is not popped if it does not occur near the issue
# \d (of \d) is volume & volume count if not issue
"Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021).cbr": {
"ext": "cbr",
"issue": "008",
@ -420,7 +424,12 @@ DIFFICULT = {
"year": "2021",
"volume_count": "06",
},
}
PUBLISHER = {
# c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder
#
# 1. c2c is not a title and is an original_format
# Leading common publisher may be a publisher? Do not pop
"Marvel Two In One V1 #090 c2c.cbr": {
"ext": "cbr",
"issue": "090",
@ -429,6 +438,9 @@ DIFFICULT = {
"volume": "1",
},
# CT treats '[]' as equivalent to '()', catches DC as a publisher and 'Sep-Oct 1951' as dates and removes them. CT doesn't catch the digital though so that could be better but I blame whoever made this atrocious filename
#
# 1. Month-Month should be handled
# 2. DC is a common publisher, no pop?
"Wonder Woman #49 DC Sep-Oct 1951 digital [downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire).cbz": {
"ext": "cbz",
"issue": "49",
@ -440,7 +452,7 @@ DIFFICULT = {
},
}
# first_key, first_val = DIFFICULT.popitem()
# first_key, first_val = YEAR.popitem()
# FNS[first_key] = first_val
WONFIX = {