Complicated volume parsing

This commit is contained in:
AJ Slater 2024-02-21 13:15:18 -08:00
parent 2c0ab37d83
commit d550d9c54e
3 changed files with 68 additions and 48 deletions

View File

@ -7,21 +7,23 @@ from re import Pattern
from typing import Any from typing import Any
from comicfn2dict.regex import ( from comicfn2dict.regex import (
NON_NUMBER_DOT_RE, BOOK_VOLUME_RE,
YEAR_FIRST_DATE_RE,
ISSUE_ANYWHERE_RE, ISSUE_ANYWHERE_RE,
REGEX_SUBS,
TOKEN_DELIMETER,
ISSUE_COUNT_RE,
ISSUE_NUMBER_RE,
ISSUE_BEGIN_RE, ISSUE_BEGIN_RE,
ISSUE_END_RE, ISSUE_END_RE,
YEAR_END_RE, ISSUE_NUMBER_RE,
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE, ISSUE_WITH_COUNT_RE,
ORIGINAL_FORMAT_SCAN_INFO_RE,
REMAINING_GROUP_RE,
VOLUME_RE,
MONTH_FIRST_DATE_RE, MONTH_FIRST_DATE_RE,
NON_NUMBER_DOT_RE,
ORIGINAL_FORMAT_SCAN_INFO_RE,
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
REGEX_SUBS,
REMAINING_GROUP_RE,
TOKEN_DELIMETER,
VOLUME_RE,
VOLUME_WITH_COUNT_RE,
YEAR_END_RE,
YEAR_FIRST_DATE_RE,
YEAR_TOKEN_RE, YEAR_TOKEN_RE,
) )
@ -172,6 +174,8 @@ class ComicFilenameParser:
tokens = self._unparsed_path.split(TOKEN_DELIMETER) tokens = self._unparsed_path.split(TOKEN_DELIMETER)
while tokens and remaining_key_index < len(_REMAINING_GROUP_KEYS): while tokens and remaining_key_index < len(_REMAINING_GROUP_KEYS):
key = _REMAINING_GROUP_KEYS[remaining_key_index] key = _REMAINING_GROUP_KEYS[remaining_key_index]
if key in self.metadata:
continue
token = tokens.pop(0) token = tokens.pop(0)
match = REMAINING_GROUP_RE.search(token) match = REMAINING_GROUP_RE.search(token)
if match: if match:
@ -218,15 +222,19 @@ class ComicFilenameParser:
self._clean_dividers() self._clean_dividers()
self._log_progress("CLEANED") self._log_progress("CLEANED")
# Main issue parsing # Issue
# #
self._parse_items(ISSUE_NUMBER_RE) self._parse_items(ISSUE_NUMBER_RE)
self._parse_items(ISSUE_COUNT_RE) if "issue" not in self.metadata:
self._parse_items(ISSUE_WITH_COUNT_RE)
# self._parse_items(ISSUE_COUNT_RE)
self._log_progress("AFTER ISSUE") self._log_progress("AFTER ISSUE")
# Volume and date # Volume and Date
# #
self._parse_items(VOLUME_RE) self._parse_items(VOLUME_RE)
if "volume" not in self.metadata:
self._parse_items(VOLUME_WITH_COUNT_RE)
self._parse_dates() self._parse_dates()
self._log_progress("AFTER VOLUME & DATE") self._log_progress("AFTER VOLUME & DATE")
@ -244,13 +252,17 @@ class ComicFilenameParser:
# Series and Title # Series and Title
# #
# Match years on the end of series and title tokens # Volume left on the end of string tokens
if "volume" not in self.metadata:
self._parse_items(BOOK_VOLUME_RE)
# Years left on the end of string tokens
year_end_matched = False year_end_matched = False
if "year" not in self.metadata: if "year" not in self.metadata:
self._parse_items(YEAR_END_RE, pop=False) self._parse_items(YEAR_END_RE, pop=False)
year_end_matched = "year" in self.metadata year_end_matched = "year" in self.metadata
# Pickup issue if it's out on the end of a token # Issue left on the end of string tokens
if "issue" not in self.metadata and not year_end_matched: if "issue" not in self.metadata and not year_end_matched:
exclude: str = self.metadata.get("year", "") # type: ignore exclude: str = self.metadata.get("year", "") # type: ignore
self._parse_items(ISSUE_END_RE, exclude=exclude) self._parse_items(ISSUE_END_RE, exclude=exclude)

View File

@ -111,7 +111,6 @@ YEAR_TOKEN_RE = re_compile(_YEAR_RE_EXP, parenthify=True)
YEAR_END_RE = re_compile(_YEAR_RE_EXP + r"\/|$") YEAR_END_RE = re_compile(_YEAR_RE_EXP + r"\/|$")
# PAREN GROUPS # PAREN GROUPS
ISSUE_COUNT_RE = re_compile(r"of\s*(?P<issue_count>\d+)", parenthify=True)
_OF_PATTERNS = r"|".join(ORIGINAL_FORMAT_PATTERNS) _OF_PATTERNS = r"|".join(ORIGINAL_FORMAT_PATTERNS)
_ORIGINAL_FORMAT_RE_EXP = r"(?P<original_format>" + _OF_PATTERNS + r")" _ORIGINAL_FORMAT_RE_EXP = r"(?P<original_format>" + _OF_PATTERNS + r")"
_SCAN_INFO_RE_EXP = r"(?P<scan_info>[^()]*)" _SCAN_INFO_RE_EXP = r"(?P<scan_info>[^()]*)"
@ -125,18 +124,34 @@ ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE = re_compile(
r"\(" + _ORIGINAL_FORMAT_RE_EXP + r"\).*\(" + _SCAN_INFO_RE_EXP + r"\)" r"\(" + _ORIGINAL_FORMAT_RE_EXP + r"\).*\(" + _SCAN_INFO_RE_EXP + r"\)"
) )
# REGULAR TOKENS
VOLUME_RE = re_compile(r"((?:v(?:ol(?:ume)?)?\.?)\s*(?P<volume>\d+))")
# ISSUE # ISSUE
_ISSUE_RE_EXP = r"(?P<issue>\w*(½|\d+)[\.\d+]*\w*)" _ISSUE_RE_EXP = r"(?P<issue>\w*(½|\d+)[\.\d+]*\w*)"
ISSUE_NUMBER_RE = re_compile(r"(\(?#" + _ISSUE_RE_EXP + r"\)?)") _ISSUE_COUNT_RE_EXP = r"\(of\s*(?P<issue_count>\d+)\)"
ISSUE_NUMBER_RE = re_compile(
r"(\(?#" + _ISSUE_RE_EXP + r"\)?)" + r"(\W*" + _ISSUE_COUNT_RE_EXP + r")?"
)
ISSUE_WITH_COUNT_RE = re_compile(
r"(\(?" + _ISSUE_RE_EXP + r"\)?" + r"\W*" + _ISSUE_COUNT_RE_EXP + r")"
)
ISSUE_END_RE = re_compile(r"([\/\s]\(?" + _ISSUE_RE_EXP + r"\)?(\/|$))") ISSUE_END_RE = re_compile(r"([\/\s]\(?" + _ISSUE_RE_EXP + r"\)?(\/|$))")
ISSUE_BEGIN_RE = re_compile(r"((^|\/)\(?" + _ISSUE_RE_EXP + r"\)?[\/|\s])") ISSUE_BEGIN_RE = re_compile(r"((^|\/)\(?" + _ISSUE_RE_EXP + r"\)?[\/|\s])")
# TODO unused # TODO unused
ISSUE_ANYWHERE_RE = re_compile(r"\b(\(?" + _ISSUE_RE_EXP + r"\)?)\b") ISSUE_ANYWHERE_RE = re_compile(r"\b(\(?" + _ISSUE_RE_EXP + r"\)?)\b")
# Volume
_VOLUME_COUNT_RE_EXP = r"\(of\s*(?P<volume_count>\d+)\)"
VOLUME_RE = re_compile(
r"(" + r"(?:v(?:ol(?:ume)?)?\.?)\s*(?P<volume>\d+)"
r"(\W*" + _VOLUME_COUNT_RE_EXP + r")?" + r")"
)
VOLUME_WITH_COUNT_RE = re_compile(
r"(\(?" + r"(?P<volume>\d+)" + r"\)?" + r"\W*" + _VOLUME_COUNT_RE_EXP + r")"
)
BOOK_VOLUME_RE = re_compile(r"(?P<title>" + r"book\s*(?P<volume>\d+)" + r")")
# LONG STRINGS # LONG STRINGS
REMAINING_GROUP_RE = re_compile(r"^[^\()].*[^\)]") REMAINING_GROUP_RE = re_compile(r"^[^\()].*[^\)]")
NON_NUMBER_DOT_RE = re_compile(r"(\D)\.(\D)") NON_NUMBER_DOT_RE = re_compile(r"(\D)\.(\D)")

View File

@ -397,34 +397,27 @@ FNS.update(
"volume": "1957", "volume": "1957",
"year": "1969", "year": "1969",
}, },
# CT checks for the following '(of 06)' after the '03' and marks it as the volume
"Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021).cbr": {
"ext": "cbr",
"issue": "008",
"series": "Elephantmen 2259",
"title": "Simple Truth",
"volume": "03",
"year": "2021",
"volume_count": "06",
},
# CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series)
"Bloodshot Book 03 (2020).cbr": {
"ext": "cbr",
"issue": "03",
"series": "Bloodshot",
"title": "Book 03",
"volume": "03",
"year": "2020",
},
} }
) )
VOLUME = {
# CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series)
#
# 'Book \d' is a non-popped volume, not an issue
"Bloodshot Book 03 (2020).cbr": {
"ext": "cbr",
"issue": "03",
"series": "Bloodshot",
"title": "Book 03",
"volume": "03",
"year": "2020",
},
# CT checks for the following '(of 06)' after the '03' and marks it as the volume
#
# The issue count is not popped if it does not occur near the issue.
# '\d (of \d)' is the volume & volume count when it is not the issue.
"Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021).cbr": {
"ext": "cbr",
"issue": "008",
"series": "Elephantmen 2259",
"title": "Simple Truth",
"volume": "03",
"year": "2021",
"volume_count": "06",
},
}
PUBLISHER = { PUBLISHER = {
# c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder # c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder
# #
@ -452,7 +445,7 @@ PUBLISHER = {
}, },
} }
# first_key, first_val = YEAR.popitem() # first_key, first_val = VOLUME.popitem()
# FNS[first_key] = first_val # FNS[first_key] = first_val
WONFIX = { WONFIX = {