Complicated volume parsing
This commit is contained in:
parent
2c0ab37d83
commit
d550d9c54e
@ -7,21 +7,23 @@ from re import Pattern
|
||||
from typing import Any
|
||||
|
||||
from comicfn2dict.regex import (
|
||||
NON_NUMBER_DOT_RE,
|
||||
YEAR_FIRST_DATE_RE,
|
||||
BOOK_VOLUME_RE,
|
||||
ISSUE_ANYWHERE_RE,
|
||||
REGEX_SUBS,
|
||||
TOKEN_DELIMETER,
|
||||
ISSUE_COUNT_RE,
|
||||
ISSUE_NUMBER_RE,
|
||||
ISSUE_BEGIN_RE,
|
||||
ISSUE_END_RE,
|
||||
YEAR_END_RE,
|
||||
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
|
||||
ORIGINAL_FORMAT_SCAN_INFO_RE,
|
||||
REMAINING_GROUP_RE,
|
||||
VOLUME_RE,
|
||||
ISSUE_NUMBER_RE,
|
||||
ISSUE_WITH_COUNT_RE,
|
||||
MONTH_FIRST_DATE_RE,
|
||||
NON_NUMBER_DOT_RE,
|
||||
ORIGINAL_FORMAT_SCAN_INFO_RE,
|
||||
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
|
||||
REGEX_SUBS,
|
||||
REMAINING_GROUP_RE,
|
||||
TOKEN_DELIMETER,
|
||||
VOLUME_RE,
|
||||
VOLUME_WITH_COUNT_RE,
|
||||
YEAR_END_RE,
|
||||
YEAR_FIRST_DATE_RE,
|
||||
YEAR_TOKEN_RE,
|
||||
)
|
||||
|
||||
@ -172,6 +174,8 @@ class ComicFilenameParser:
|
||||
tokens = self._unparsed_path.split(TOKEN_DELIMETER)
|
||||
while tokens and remaining_key_index < len(_REMAINING_GROUP_KEYS):
|
||||
key = _REMAINING_GROUP_KEYS[remaining_key_index]
|
||||
if key in self.metadata:
|
||||
continue
|
||||
token = tokens.pop(0)
|
||||
match = REMAINING_GROUP_RE.search(token)
|
||||
if match:
|
||||
@ -218,15 +222,19 @@ class ComicFilenameParser:
|
||||
self._clean_dividers()
|
||||
self._log_progress("CLEANED")
|
||||
|
||||
# Main issue parsing
|
||||
# Issue
|
||||
#
|
||||
self._parse_items(ISSUE_NUMBER_RE)
|
||||
self._parse_items(ISSUE_COUNT_RE)
|
||||
if "issue" not in self.metadata:
|
||||
self._parse_items(ISSUE_WITH_COUNT_RE)
|
||||
# self._parse_items(ISSUE_COUNT_RE)
|
||||
self._log_progress("AFTER ISSUE")
|
||||
|
||||
# Volume and date
|
||||
# Volume and Date
|
||||
#
|
||||
self._parse_items(VOLUME_RE)
|
||||
if "volume" not in self.metadata:
|
||||
self._parse_items(VOLUME_WITH_COUNT_RE)
|
||||
self._parse_dates()
|
||||
self._log_progress("AFTER VOLUME & DATE")
|
||||
|
||||
@ -244,13 +252,17 @@ class ComicFilenameParser:
|
||||
|
||||
# Series and Title
|
||||
#
|
||||
# Match years on the end of series and title tokens
|
||||
# Volume left on the end of string tokens
|
||||
if "volume" not in self.metadata:
|
||||
self._parse_items(BOOK_VOLUME_RE)
|
||||
|
||||
# Years left on the end of string tokens
|
||||
year_end_matched = False
|
||||
if "year" not in self.metadata:
|
||||
self._parse_items(YEAR_END_RE, pop=False)
|
||||
year_end_matched = "year" in self.metadata
|
||||
|
||||
# Pick up the issue if it is left on the end of a token
|
||||
# Issue left on the end of string tokens
|
||||
if "issue" not in self.metadata and not year_end_matched:
|
||||
exclude: str = self.metadata.get("year", "") # type: ignore
|
||||
self._parse_items(ISSUE_END_RE, exclude=exclude)
|
||||
|
@ -111,7 +111,6 @@ YEAR_TOKEN_RE = re_compile(_YEAR_RE_EXP, parenthify=True)
|
||||
YEAR_END_RE = re_compile(_YEAR_RE_EXP + r"\/|$")
|
||||
|
||||
# PAREN GROUPS
|
||||
ISSUE_COUNT_RE = re_compile(r"of\s*(?P<issue_count>\d+)", parenthify=True)
|
||||
_OF_PATTERNS = r"|".join(ORIGINAL_FORMAT_PATTERNS)
|
||||
_ORIGINAL_FORMAT_RE_EXP = r"(?P<original_format>" + _OF_PATTERNS + r")"
|
||||
_SCAN_INFO_RE_EXP = r"(?P<scan_info>[^()]*)"
|
||||
@ -125,18 +124,34 @@ ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE = re_compile(
|
||||
r"\(" + _ORIGINAL_FORMAT_RE_EXP + r"\).*\(" + _SCAN_INFO_RE_EXP + r"\)"
|
||||
)
|
||||
|
||||
# REGULAR TOKENS
|
||||
VOLUME_RE = re_compile(r"((?:v(?:ol(?:ume)?)?\.?)\s*(?P<volume>\d+))")
|
||||
|
||||
# ISSUE
|
||||
_ISSUE_RE_EXP = r"(?P<issue>\w*(½|\d+)[\.\d+]*\w*)"
|
||||
ISSUE_NUMBER_RE = re_compile(r"(\(?#" + _ISSUE_RE_EXP + r"\)?)")
|
||||
_ISSUE_COUNT_RE_EXP = r"\(of\s*(?P<issue_count>\d+)\)"
|
||||
ISSUE_NUMBER_RE = re_compile(
|
||||
r"(\(?#" + _ISSUE_RE_EXP + r"\)?)" + r"(\W*" + _ISSUE_COUNT_RE_EXP + r")?"
|
||||
)
|
||||
ISSUE_WITH_COUNT_RE = re_compile(
|
||||
r"(\(?" + _ISSUE_RE_EXP + r"\)?" + r"\W*" + _ISSUE_COUNT_RE_EXP + r")"
|
||||
)
|
||||
|
||||
ISSUE_END_RE = re_compile(r"([\/\s]\(?" + _ISSUE_RE_EXP + r"\)?(\/|$))")
|
||||
ISSUE_BEGIN_RE = re_compile(r"((^|\/)\(?" + _ISSUE_RE_EXP + r"\)?[\/|\s])")
|
||||
|
||||
# TODO unused
|
||||
ISSUE_ANYWHERE_RE = re_compile(r"\b(\(?" + _ISSUE_RE_EXP + r"\)?)\b")
|
||||
|
||||
# Volume
|
||||
_VOLUME_COUNT_RE_EXP = r"\(of\s*(?P<volume_count>\d+)\)"
|
||||
VOLUME_RE = re_compile(
|
||||
r"(" + r"(?:v(?:ol(?:ume)?)?\.?)\s*(?P<volume>\d+)"
|
||||
r"(\W*" + _VOLUME_COUNT_RE_EXP + r")?" + r")"
|
||||
)
|
||||
VOLUME_WITH_COUNT_RE = re_compile(
|
||||
r"(\(?" + r"(?P<volume>\d+)" + r"\)?" + r"\W*" + _VOLUME_COUNT_RE_EXP + r")"
|
||||
)
|
||||
BOOK_VOLUME_RE = re_compile(r"(?P<title>" + r"book\s*(?P<volume>\d+)" + r")")
|
||||
|
||||
|
||||
# LONG STRINGS
|
||||
REMAINING_GROUP_RE = re_compile(r"^[^\()].*[^\)]")
|
||||
NON_NUMBER_DOT_RE = re_compile(r"(\D)\.(\D)")
|
||||
|
@ -397,34 +397,27 @@ FNS.update(
|
||||
"volume": "1957",
|
||||
"year": "1969",
|
||||
},
|
||||
# CT checks for the following '(of 06)' after the '03' and marks it as the volume
|
||||
"Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021).cbr": {
|
||||
"ext": "cbr",
|
||||
"issue": "008",
|
||||
"series": "Elephantmen 2259",
|
||||
"title": "Simple Truth",
|
||||
"volume": "03",
|
||||
"year": "2021",
|
||||
"volume_count": "06",
|
||||
},
|
||||
# CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series)
|
||||
"Bloodshot Book 03 (2020).cbr": {
|
||||
"ext": "cbr",
|
||||
"issue": "03",
|
||||
"series": "Bloodshot",
|
||||
"title": "Book 03",
|
||||
"volume": "03",
|
||||
"year": "2020",
|
||||
},
|
||||
}
|
||||
)
|
||||
VOLUME = {
|
||||
# CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series)
|
||||
#
|
||||
# 'Book \d' is a non-popped volume, not an issue
|
||||
"Bloodshot Book 03 (2020).cbr": {
|
||||
"ext": "cbr",
|
||||
"issue": "03",
|
||||
"series": "Bloodshot",
|
||||
"title": "Book 03",
|
||||
"volume": "03",
|
||||
"year": "2020",
|
||||
},
|
||||
# CT checks for the following '(of 06)' after the '03' and marks it as the volume
|
||||
#
|
||||
# The issue count is not popped if it does not occur near the issue
|
||||
# '\d (of \d)' is the volume & volume count when it is not the issue
|
||||
"Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021).cbr": {
|
||||
"ext": "cbr",
|
||||
"issue": "008",
|
||||
"series": "Elephantmen 2259",
|
||||
"title": "Simple Truth",
|
||||
"volume": "03",
|
||||
"year": "2021",
|
||||
"volume_count": "06",
|
||||
},
|
||||
}
|
||||
PUBLISHER = {
|
||||
# c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder
|
||||
#
|
||||
@ -452,7 +445,7 @@ PUBLISHER = {
|
||||
},
|
||||
}
|
||||
|
||||
# first_key, first_val = YEAR.popitem()
|
||||
# first_key, first_val = VOLUME.popitem()
|
||||
# FNS[first_key] = first_val
|
||||
|
||||
WONFIX = {
|
||||
|
Loading…
Reference in New Issue
Block a user