Reorganize code. Only substitute the first colon, out of caution.
This commit is contained in:
parent
4b1f5fbdb9
commit
7d9b4efeee
@ -9,13 +9,13 @@ from typing import Any
|
|||||||
from comicfn2dict.regex import (
|
from comicfn2dict.regex import (
|
||||||
NON_NUMBER_DOT_RE,
|
NON_NUMBER_DOT_RE,
|
||||||
YEAR_FIRST_DATE_RE,
|
YEAR_FIRST_DATE_RE,
|
||||||
EXTRA_SPACES_RE,
|
|
||||||
ISSUE_ANYWHERE_RE,
|
ISSUE_ANYWHERE_RE,
|
||||||
|
REGEX_SUBS,
|
||||||
|
TOKEN_DELIMETER,
|
||||||
ISSUE_COUNT_RE,
|
ISSUE_COUNT_RE,
|
||||||
ISSUE_NUMBER_RE,
|
ISSUE_NUMBER_RE,
|
||||||
ISSUE_BEGIN_RE,
|
ISSUE_BEGIN_RE,
|
||||||
ISSUE_END_RE,
|
ISSUE_END_RE,
|
||||||
NON_SPACE_DIVIDER_RE,
|
|
||||||
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
|
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
|
||||||
ORIGINAL_FORMAT_SCAN_INFO_RE,
|
ORIGINAL_FORMAT_SCAN_INFO_RE,
|
||||||
REMAINING_GROUP_RE,
|
REMAINING_GROUP_RE,
|
||||||
@ -26,7 +26,6 @@ from comicfn2dict.regex import (
|
|||||||
|
|
||||||
_REMAINING_GROUP_KEYS = ("series", "title")
|
_REMAINING_GROUP_KEYS = ("series", "title")
|
||||||
_TITLE_PRECEDING_KEYS = ("issue", "year", "volume")
|
_TITLE_PRECEDING_KEYS = ("issue", "year", "volume")
|
||||||
_TOKEN_DELIMETER = "/"
|
|
||||||
_DATE_KEYS = frozenset({"year", "month", "day"})
|
_DATE_KEYS = frozenset({"year", "month", "day"})
|
||||||
|
|
||||||
|
|
||||||
@ -58,19 +57,22 @@ class ComicFilenameParser:
|
|||||||
self.metadata["ext"] = ext
|
self.metadata["ext"] = ext
|
||||||
self._unparsed_path = data
|
self._unparsed_path = data
|
||||||
|
|
||||||
def _clean_dividers(self):
|
|
||||||
"""Replace non space dividers and clean extra spaces out of string."""
|
|
||||||
data = NON_SPACE_DIVIDER_RE.sub(" ", self._unparsed_path)
|
|
||||||
self._unparsed_path = EXTRA_SPACES_RE.sub(" ", data).strip()
|
|
||||||
|
|
||||||
def _grouping_operators_strip(self, value: str) -> str:
|
def _grouping_operators_strip(self, value: str) -> str:
|
||||||
"""Strip spaces and parens."""
|
"""Strip spaces and parens."""
|
||||||
value = value.strip()
|
value = value.strip()
|
||||||
value = value.strip("()").strip()
|
value = value.strip("()").strip()
|
||||||
value = value.strip("-").strip()
|
value = value.strip("-").strip()
|
||||||
value = value.strip(",").strip()
|
value = value.strip(",").strip()
|
||||||
value = value.strip("'").strip('"').strip()
|
value = value.strip("'").strip()
|
||||||
return value
|
return value.strip('"').strip()
|
||||||
|
|
||||||
|
def _clean_dividers(self):
|
||||||
|
"""Replace non space dividers and clean extra spaces out of string."""
|
||||||
|
data = self._unparsed_path
|
||||||
|
for regex, pair in REGEX_SUBS.items():
|
||||||
|
replacement, count = pair
|
||||||
|
data = regex.sub(replacement, data, count=count)
|
||||||
|
self._unparsed_path = data.strip()
|
||||||
|
|
||||||
def _parse_items(
|
def _parse_items(
|
||||||
self,
|
self,
|
||||||
@ -91,12 +93,12 @@ class ComicFilenameParser:
|
|||||||
matched_metadata[key] = self._grouping_operators_strip(value)
|
matched_metadata[key] = self._grouping_operators_strip(value)
|
||||||
self.metadata.update(matched_metadata)
|
self.metadata.update(matched_metadata)
|
||||||
|
|
||||||
marked_str = regex.sub(_TOKEN_DELIMETER, self._unparsed_path)
|
marked_str = regex.sub(TOKEN_DELIMETER, self._unparsed_path)
|
||||||
parts = []
|
parts = []
|
||||||
for part in marked_str.split(_TOKEN_DELIMETER):
|
for part in marked_str.split(TOKEN_DELIMETER):
|
||||||
if token := part.strip():
|
if token := part.strip():
|
||||||
parts.append(token)
|
parts.append(token)
|
||||||
self._unparsed_path = _TOKEN_DELIMETER.join(parts)
|
self._unparsed_path = TOKEN_DELIMETER.join(parts)
|
||||||
|
|
||||||
def _alpha_month_to_numeric(self):
|
def _alpha_month_to_numeric(self):
|
||||||
"""Translate alpha_month to numeric month."""
|
"""Translate alpha_month to numeric month."""
|
||||||
@ -147,7 +149,7 @@ class ComicFilenameParser:
|
|||||||
|
|
||||||
remaining_key_index = 0
|
remaining_key_index = 0
|
||||||
unused_tokens = []
|
unused_tokens = []
|
||||||
tokens = self._unparsed_path.split(_TOKEN_DELIMETER)
|
tokens = self._unparsed_path.split(TOKEN_DELIMETER)
|
||||||
while tokens and remaining_key_index < len(_REMAINING_GROUP_KEYS):
|
while tokens and remaining_key_index < len(_REMAINING_GROUP_KEYS):
|
||||||
key = _REMAINING_GROUP_KEYS[remaining_key_index]
|
key = _REMAINING_GROUP_KEYS[remaining_key_index]
|
||||||
token = tokens.pop(0)
|
token = tokens.pop(0)
|
||||||
@ -170,7 +172,7 @@ class ComicFilenameParser:
|
|||||||
def _add_remainders(self):
|
def _add_remainders(self):
|
||||||
"""Add Remainders."""
|
"""Add Remainders."""
|
||||||
remainders = []
|
remainders = []
|
||||||
for token in self._unparsed_path.split(_TOKEN_DELIMETER):
|
for token in self._unparsed_path.split(TOKEN_DELIMETER):
|
||||||
if remainder := token.strip():
|
if remainder := token.strip():
|
||||||
remainders.append(remainder)
|
remainders.append(remainder)
|
||||||
|
|
||||||
@ -225,8 +227,8 @@ class ComicFilenameParser:
|
|||||||
self._log_progress("AFTER SERIES AND TITLE")
|
self._log_progress("AFTER SERIES AND TITLE")
|
||||||
|
|
||||||
# Final try for issue number.
|
# Final try for issue number.
|
||||||
|
# TODO unused
|
||||||
if "issue" not in self.metadata:
|
if "issue" not in self.metadata:
|
||||||
# TODO is this useful?
|
|
||||||
self._parse_items(ISSUE_ANYWHERE_RE)
|
self._parse_items(ISSUE_ANYWHERE_RE)
|
||||||
self._log_progress("AFTER ISSUE PICKUP")
|
self._log_progress("AFTER ISSUE PICKUP")
|
||||||
|
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
"""Parsing regexes."""
|
"""Parsing regexes."""
|
||||||
import re
|
import re
|
||||||
|
from types import MappingProxyType
|
||||||
|
|
||||||
|
|
||||||
def re_compile(exp, parenthify=False):
|
def re_compile(exp, parenthify=False):
|
||||||
@ -53,9 +54,19 @@ MONTHS = (
|
|||||||
r"Dec(ember)?",
|
r"Dec(ember)?",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
TOKEN_DELIMETER = r"/"
|
||||||
|
|
||||||
# CLEAN
|
# CLEAN
|
||||||
NON_SPACE_DIVIDER_RE = re_compile(r"[_\+]")
|
_TOKEN_DIVIDERS_RE = re_compile(r":")
|
||||||
EXTRA_SPACES_RE = re_compile(r"\s\s+")
|
_SPACE_EQUIVALENT_RE = re_compile(r"_")
|
||||||
|
_EXTRA_SPACES_RE = re_compile(r"\s\s+")
|
||||||
|
REGEX_SUBS: MappingProxyType[re.Pattern, tuple[str, int]] = MappingProxyType(
|
||||||
|
{
|
||||||
|
_TOKEN_DIVIDERS_RE: (TOKEN_DELIMETER, 1),
|
||||||
|
_SPACE_EQUIVALENT_RE: (r" ", 0),
|
||||||
|
_EXTRA_SPACES_RE: (r" ", 0),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
### DATES
|
### DATES
|
||||||
_YEAR_RE_EXP = r"(?P<year>[12]\d{3})"
|
_YEAR_RE_EXP = r"(?P<year>[12]\d{3})"
|
||||||
@ -117,6 +128,8 @@ _ISSUE_RE_EXP = r"(?P<issue>\w*(½|\d+)[\.\d+]*\w*)"
|
|||||||
ISSUE_NUMBER_RE = re_compile(r"(\(?#" + _ISSUE_RE_EXP + r"\)?)")
|
ISSUE_NUMBER_RE = re_compile(r"(\(?#" + _ISSUE_RE_EXP + r"\)?)")
|
||||||
ISSUE_END_RE = re_compile(r"([\/\s]\(?" + _ISSUE_RE_EXP + r"\)?(\/|$))")
|
ISSUE_END_RE = re_compile(r"([\/\s]\(?" + _ISSUE_RE_EXP + r"\)?(\/|$))")
|
||||||
ISSUE_BEGIN_RE = re_compile(r"((^|\/)\(?" + _ISSUE_RE_EXP + r"\)?[\/|\s])")
|
ISSUE_BEGIN_RE = re_compile(r"((^|\/)\(?" + _ISSUE_RE_EXP + r"\)?[\/|\s])")
|
||||||
|
|
||||||
|
# TODO unused
|
||||||
ISSUE_ANYWHERE_RE = re_compile(r"\b(\(?" + _ISSUE_RE_EXP + r"\)?)\b")
|
ISSUE_ANYWHERE_RE = re_compile(r"\b(\(?" + _ISSUE_RE_EXP + r"\)?)\b")
|
||||||
|
|
||||||
# LONG STRINGS
|
# LONG STRINGS
|
||||||
|
@ -375,6 +375,14 @@ FNS.update(
|
|||||||
"series": "Cory Doctorow's Futuristic Tales of the Here and Now",
|
"series": "Cory Doctorow's Futuristic Tales of the Here and Now",
|
||||||
"year": "2007",
|
"year": "2007",
|
||||||
},
|
},
|
||||||
|
# CT treats ':' the same as '-' but here the ':' is attached to 'Now' which CT sees as a title separation
|
||||||
|
"Cory Doctorow's Futuristic Tales of the Here and Now: Anda's Game #001 (2007).cbz": {
|
||||||
|
"ext": "cbz",
|
||||||
|
"issue": "001",
|
||||||
|
"series": "Cory Doctorow's Futuristic Tales of the Here and Now",
|
||||||
|
"title": "Anda's Game",
|
||||||
|
"year": "2007",
|
||||||
|
},
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
DIFFICULT = {
|
DIFFICULT = {
|
||||||
@ -430,14 +438,6 @@ DIFFICULT = {
|
|||||||
"year": "1951",
|
"year": "1951",
|
||||||
"month": "10",
|
"month": "10",
|
||||||
},
|
},
|
||||||
# CT treats ':' the same as '-' but here the ':' is attached to 'Now' which CT sees as a title separation
|
|
||||||
"Cory Doctorow's Futuristic Tales of the Here and Now: Anda's Game #001 (2007).cbz": {
|
|
||||||
"ext": "cbz",
|
|
||||||
"issue": "001",
|
|
||||||
"series": "Cory Doctorow's Futuristic Tales of the Here and Now",
|
|
||||||
"title": "Anda's Game",
|
|
||||||
"year": "2007",
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# first_key, first_val = DIFFICULT.popitem()
|
# first_key, first_val = DIFFICULT.popitem()
|
||||||
|
Loading…
Reference in New Issue
Block a user