Complicated volume parsing
This commit is contained in:
parent
2c0ab37d83
commit
d550d9c54e
@ -7,21 +7,23 @@ from re import Pattern
|
|||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
from comicfn2dict.regex import (
|
from comicfn2dict.regex import (
|
||||||
NON_NUMBER_DOT_RE,
|
BOOK_VOLUME_RE,
|
||||||
YEAR_FIRST_DATE_RE,
|
|
||||||
ISSUE_ANYWHERE_RE,
|
ISSUE_ANYWHERE_RE,
|
||||||
REGEX_SUBS,
|
|
||||||
TOKEN_DELIMETER,
|
|
||||||
ISSUE_COUNT_RE,
|
|
||||||
ISSUE_NUMBER_RE,
|
|
||||||
ISSUE_BEGIN_RE,
|
ISSUE_BEGIN_RE,
|
||||||
ISSUE_END_RE,
|
ISSUE_END_RE,
|
||||||
YEAR_END_RE,
|
ISSUE_NUMBER_RE,
|
||||||
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
|
ISSUE_WITH_COUNT_RE,
|
||||||
ORIGINAL_FORMAT_SCAN_INFO_RE,
|
|
||||||
REMAINING_GROUP_RE,
|
|
||||||
VOLUME_RE,
|
|
||||||
MONTH_FIRST_DATE_RE,
|
MONTH_FIRST_DATE_RE,
|
||||||
|
NON_NUMBER_DOT_RE,
|
||||||
|
ORIGINAL_FORMAT_SCAN_INFO_RE,
|
||||||
|
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
|
||||||
|
REGEX_SUBS,
|
||||||
|
REMAINING_GROUP_RE,
|
||||||
|
TOKEN_DELIMETER,
|
||||||
|
VOLUME_RE,
|
||||||
|
VOLUME_WITH_COUNT_RE,
|
||||||
|
YEAR_END_RE,
|
||||||
|
YEAR_FIRST_DATE_RE,
|
||||||
YEAR_TOKEN_RE,
|
YEAR_TOKEN_RE,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -172,6 +174,8 @@ class ComicFilenameParser:
|
|||||||
tokens = self._unparsed_path.split(TOKEN_DELIMETER)
|
tokens = self._unparsed_path.split(TOKEN_DELIMETER)
|
||||||
while tokens and remaining_key_index < len(_REMAINING_GROUP_KEYS):
|
while tokens and remaining_key_index < len(_REMAINING_GROUP_KEYS):
|
||||||
key = _REMAINING_GROUP_KEYS[remaining_key_index]
|
key = _REMAINING_GROUP_KEYS[remaining_key_index]
|
||||||
|
if key in self.metadata:
|
||||||
|
continue
|
||||||
token = tokens.pop(0)
|
token = tokens.pop(0)
|
||||||
match = REMAINING_GROUP_RE.search(token)
|
match = REMAINING_GROUP_RE.search(token)
|
||||||
if match:
|
if match:
|
||||||
@ -218,15 +222,19 @@ class ComicFilenameParser:
|
|||||||
self._clean_dividers()
|
self._clean_dividers()
|
||||||
self._log_progress("CLEANED")
|
self._log_progress("CLEANED")
|
||||||
|
|
||||||
# Main issue parsing
|
# Issue
|
||||||
#
|
#
|
||||||
self._parse_items(ISSUE_NUMBER_RE)
|
self._parse_items(ISSUE_NUMBER_RE)
|
||||||
self._parse_items(ISSUE_COUNT_RE)
|
if "issue" not in self.metadata:
|
||||||
|
self._parse_items(ISSUE_WITH_COUNT_RE)
|
||||||
|
# self._parse_items(ISSUE_COUNT_RE)
|
||||||
self._log_progress("AFTER ISSUE")
|
self._log_progress("AFTER ISSUE")
|
||||||
|
|
||||||
# Volume and date
|
# Volume and Date
|
||||||
#
|
#
|
||||||
self._parse_items(VOLUME_RE)
|
self._parse_items(VOLUME_RE)
|
||||||
|
if "volume" not in self.metadata:
|
||||||
|
self._parse_items(VOLUME_WITH_COUNT_RE)
|
||||||
self._parse_dates()
|
self._parse_dates()
|
||||||
self._log_progress("AFTER VOLUME & DATE")
|
self._log_progress("AFTER VOLUME & DATE")
|
||||||
|
|
||||||
@ -244,13 +252,17 @@ class ComicFilenameParser:
|
|||||||
|
|
||||||
# Series and Title
|
# Series and Title
|
||||||
#
|
#
|
||||||
# Match years on the end of series and title tokens
|
# Volume left on the end of string tokens
|
||||||
|
if "volume" not in self.metadata:
|
||||||
|
self._parse_items(BOOK_VOLUME_RE)
|
||||||
|
|
||||||
|
# Years left on the end of string tokens
|
||||||
year_end_matched = False
|
year_end_matched = False
|
||||||
if "year" not in self.metadata:
|
if "year" not in self.metadata:
|
||||||
self._parse_items(YEAR_END_RE, pop=False)
|
self._parse_items(YEAR_END_RE, pop=False)
|
||||||
year_end_matched = "year" in self.metadata
|
year_end_matched = "year" in self.metadata
|
||||||
|
|
||||||
# Pickup issue if it's out on the end of a token
|
# Issue left on the end of string tokens
|
||||||
if "issue" not in self.metadata and not year_end_matched:
|
if "issue" not in self.metadata and not year_end_matched:
|
||||||
exclude: str = self.metadata.get("year", "") # type: ignore
|
exclude: str = self.metadata.get("year", "") # type: ignore
|
||||||
self._parse_items(ISSUE_END_RE, exclude=exclude)
|
self._parse_items(ISSUE_END_RE, exclude=exclude)
|
||||||
|
@ -111,7 +111,6 @@ YEAR_TOKEN_RE = re_compile(_YEAR_RE_EXP, parenthify=True)
|
|||||||
YEAR_END_RE = re_compile(_YEAR_RE_EXP + r"\/|$")
|
YEAR_END_RE = re_compile(_YEAR_RE_EXP + r"\/|$")
|
||||||
|
|
||||||
# PAREN GROUPS
|
# PAREN GROUPS
|
||||||
ISSUE_COUNT_RE = re_compile(r"of\s*(?P<issue_count>\d+)", parenthify=True)
|
|
||||||
_OF_PATTERNS = r"|".join(ORIGINAL_FORMAT_PATTERNS)
|
_OF_PATTERNS = r"|".join(ORIGINAL_FORMAT_PATTERNS)
|
||||||
_ORIGINAL_FORMAT_RE_EXP = r"(?P<original_format>" + _OF_PATTERNS + r")"
|
_ORIGINAL_FORMAT_RE_EXP = r"(?P<original_format>" + _OF_PATTERNS + r")"
|
||||||
_SCAN_INFO_RE_EXP = r"(?P<scan_info>[^()]*)"
|
_SCAN_INFO_RE_EXP = r"(?P<scan_info>[^()]*)"
|
||||||
@ -125,18 +124,34 @@ ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE = re_compile(
|
|||||||
r"\(" + _ORIGINAL_FORMAT_RE_EXP + r"\).*\(" + _SCAN_INFO_RE_EXP + r"\)"
|
r"\(" + _ORIGINAL_FORMAT_RE_EXP + r"\).*\(" + _SCAN_INFO_RE_EXP + r"\)"
|
||||||
)
|
)
|
||||||
|
|
||||||
# REGULAR TOKENS
|
|
||||||
VOLUME_RE = re_compile(r"((?:v(?:ol(?:ume)?)?\.?)\s*(?P<volume>\d+))")
|
|
||||||
|
|
||||||
# ISSUE
|
# ISSUE
|
||||||
_ISSUE_RE_EXP = r"(?P<issue>\w*(½|\d+)[\.\d+]*\w*)"
|
_ISSUE_RE_EXP = r"(?P<issue>\w*(½|\d+)[\.\d+]*\w*)"
|
||||||
ISSUE_NUMBER_RE = re_compile(r"(\(?#" + _ISSUE_RE_EXP + r"\)?)")
|
_ISSUE_COUNT_RE_EXP = r"\(of\s*(?P<issue_count>\d+)\)"
|
||||||
|
ISSUE_NUMBER_RE = re_compile(
|
||||||
|
r"(\(?#" + _ISSUE_RE_EXP + r"\)?)" + r"(\W*" + _ISSUE_COUNT_RE_EXP + r")?"
|
||||||
|
)
|
||||||
|
ISSUE_WITH_COUNT_RE = re_compile(
|
||||||
|
r"(\(?" + _ISSUE_RE_EXP + r"\)?" + r"\W*" + _ISSUE_COUNT_RE_EXP + r")"
|
||||||
|
)
|
||||||
|
|
||||||
ISSUE_END_RE = re_compile(r"([\/\s]\(?" + _ISSUE_RE_EXP + r"\)?(\/|$))")
|
ISSUE_END_RE = re_compile(r"([\/\s]\(?" + _ISSUE_RE_EXP + r"\)?(\/|$))")
|
||||||
ISSUE_BEGIN_RE = re_compile(r"((^|\/)\(?" + _ISSUE_RE_EXP + r"\)?[\/|\s])")
|
ISSUE_BEGIN_RE = re_compile(r"((^|\/)\(?" + _ISSUE_RE_EXP + r"\)?[\/|\s])")
|
||||||
|
|
||||||
# TODO unused
|
# TODO unused
|
||||||
ISSUE_ANYWHERE_RE = re_compile(r"\b(\(?" + _ISSUE_RE_EXP + r"\)?)\b")
|
ISSUE_ANYWHERE_RE = re_compile(r"\b(\(?" + _ISSUE_RE_EXP + r"\)?)\b")
|
||||||
|
|
||||||
|
# Volume
|
||||||
|
_VOLUME_COUNT_RE_EXP = r"\(of\s*(?P<volume_count>\d+)\)"
|
||||||
|
VOLUME_RE = re_compile(
|
||||||
|
r"(" + r"(?:v(?:ol(?:ume)?)?\.?)\s*(?P<volume>\d+)"
|
||||||
|
r"(\W*" + _VOLUME_COUNT_RE_EXP + r")?" + r")"
|
||||||
|
)
|
||||||
|
VOLUME_WITH_COUNT_RE = re_compile(
|
||||||
|
r"(\(?" + r"(?P<volume>\d+)" + r"\)?" + r"\W*" + _VOLUME_COUNT_RE_EXP + r")"
|
||||||
|
)
|
||||||
|
BOOK_VOLUME_RE = re_compile(r"(?P<title>" + r"book\s*(?P<volume>\d+)" + r")")
|
||||||
|
|
||||||
|
|
||||||
# LONG STRINGS
|
# LONG STRINGS
|
||||||
REMAINING_GROUP_RE = re_compile(r"^[^\()].*[^\)]")
|
REMAINING_GROUP_RE = re_compile(r"^[^\()].*[^\)]")
|
||||||
NON_NUMBER_DOT_RE = re_compile(r"(\D)\.(\D)")
|
NON_NUMBER_DOT_RE = re_compile(r"(\D)\.(\D)")
|
||||||
|
@ -397,24 +397,7 @@ FNS.update(
|
|||||||
"volume": "1957",
|
"volume": "1957",
|
||||||
"year": "1969",
|
"year": "1969",
|
||||||
},
|
},
|
||||||
}
|
|
||||||
)
|
|
||||||
VOLUME = {
|
|
||||||
# CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series)
|
|
||||||
#
|
|
||||||
# Book \d is a non-popped volume not an issue
|
|
||||||
"Bloodshot Book 03 (2020).cbr": {
|
|
||||||
"ext": "cbr",
|
|
||||||
"issue": "03",
|
|
||||||
"series": "Bloodshot",
|
|
||||||
"title": "Book 03",
|
|
||||||
"volume": "03",
|
|
||||||
"year": "2020",
|
|
||||||
},
|
|
||||||
# CT checks for the following '(of 06)' after the '03' and marks it as the volume
|
# CT checks for the following '(of 06)' after the '03' and marks it as the volume
|
||||||
#
|
|
||||||
# issue count is not popped if does not occur near issue
|
|
||||||
# \d (of \d) is volume & volume count if not issue
|
|
||||||
"Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021).cbr": {
|
"Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021).cbr": {
|
||||||
"ext": "cbr",
|
"ext": "cbr",
|
||||||
"issue": "008",
|
"issue": "008",
|
||||||
@ -424,7 +407,17 @@ VOLUME = {
|
|||||||
"year": "2021",
|
"year": "2021",
|
||||||
"volume_count": "06",
|
"volume_count": "06",
|
||||||
},
|
},
|
||||||
}
|
# CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series)
|
||||||
|
"Bloodshot Book 03 (2020).cbr": {
|
||||||
|
"ext": "cbr",
|
||||||
|
"issue": "03",
|
||||||
|
"series": "Bloodshot",
|
||||||
|
"title": "Book 03",
|
||||||
|
"volume": "03",
|
||||||
|
"year": "2020",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
)
|
||||||
PUBLISHER = {
|
PUBLISHER = {
|
||||||
# c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder
|
# c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder
|
||||||
#
|
#
|
||||||
@ -452,7 +445,7 @@ PUBLISHER = {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
# first_key, first_val = YEAR.popitem()
|
# first_key, first_val = VOLUME.popitem()
|
||||||
# FNS[first_key] = first_val
|
# FNS[first_key] = first_val
|
||||||
|
|
||||||
WONFIX = {
|
WONFIX = {
|
||||||
|
Loading…
Reference in New Issue
Block a user