enforce title position limits. reduce parse_series_and_title complexity. add type hints.

This commit is contained in:
AJ Slater 2024-02-24 18:21:07 -08:00
parent d3b11d6361
commit 7694a3e2fd
5 changed files with 99 additions and 50 deletions

View File

@ -12,6 +12,8 @@
- ComicFilenameParser and ComicFilenameSerializer classes are available as well
as the old function API.
- New test cases thanks to @lordwelch & @bpepple
- Titles must come after series and one other token, but before format and scan
info.
## v0.1.4

View File

@ -4,7 +4,7 @@ from calendar import month_abbr
from copy import copy
from pathlib import Path
from re import Match, Pattern
from typing import Any
from sys import maxsize
from comicfn2dict.log import print_log_header
from comicfn2dict.regex import (
ALPHA_MONTH_RANGE_RE,
@ -32,21 +32,22 @@ from comicfn2dict.regex import (
YEAR_TOKEN_RE,
)
_REMAINING_GROUP_KEYS = ("series", "title")
_TITLE_PRECEDING_KEYS = ("issue", "year", "volume")
_DATE_KEYS = frozenset({"year", "month", "day"})
_REMAINING_GROUP_KEYS = ("series", "title")
# Ordered by commonness.
_TITLE_PRECEDING_KEYS = ("issue", "year", "volume", "month")
class ComicFilenameParser:
"""Parse a filename metadata into a dict."""
def path_index(self, key: str):
def path_index(self, key: str, default: int = -1) -> int:
"""Lazily retrieve and memoize the key's location in the path."""
if key == "remainders":
return -1
return default
value: str = self.metadata.get(key, "") # type: ignore
if not value:
return -1
return default
if value not in self._path_indexes:
# XXX This is fragile, but it's difficult to calculate the original
# position at match time from the ever changing _unparsed_path.
@ -57,7 +58,7 @@ class ComicFilenameParser:
self._path_indexes[value] = index
return self._path_indexes[value]
def _log(self, label):
def _log(self, label: str) -> None:
if not self._debug:
return
print_log_header(label)
@ -67,7 +68,7 @@ class ComicFilenameParser:
print(" " + self._unparsed_path)
print(" " + pformat(combined))
def _parse_ext(self):
def _parse_ext(self) -> None:
"""Pop the extension from the pathname."""
path = Path(self._unparsed_path)
suffix = path.suffix
@ -79,7 +80,7 @@ class ComicFilenameParser:
self.metadata["ext"] = ext
self._unparsed_path = data
def _clean_dividers(self):
def _clean_dividers(self) -> None:
"""Replace non space dividers and clean extra spaces out of string."""
data = self._unparsed_path
@ -142,21 +143,21 @@ class ComicFilenameParser:
if pop:
self._parse_items_pop_tokens(regex, first_only)
def _parse_issue(self):
def _parse_issue(self) -> None:
"""Parse Issue."""
self._parse_items(ISSUE_NUMBER_RE)
if "issue" not in self.metadata:
self._parse_items(ISSUE_WITH_COUNT_RE)
self._log("After Issue")
def _parse_volume(self):
def _parse_volume(self) -> None:
"""Parse Volume."""
self._parse_items(VOLUME_RE)
if "volume" not in self.metadata:
self._parse_items(VOLUME_WITH_COUNT_RE)
self._log("After Volume")
def _alpha_month_to_numeric(self):
def _alpha_month_to_numeric(self) -> None:
"""Translate alpha_month to numeric month."""
if alpha_month := self.metadata.pop("alpha_month", ""):
alpha_month = alpha_month.capitalize() # type: ignore
@ -166,7 +167,7 @@ class ComicFilenameParser:
self.metadata["month"] = month
break
def _parse_dates(self):
def _parse_dates(self) -> None:
"""Parse date schemes."""
# Discard second month of alpha month ranges.
self._unparsed_path = ALPHA_MONTH_RANGE_RE.sub(r"\1", self._unparsed_path)
@ -192,9 +193,8 @@ class ComicFilenameParser:
self.metadata["volume"] = volume
self._log("After Date")
def _parse_format_and_scan_info(self):
# Format & Scan Info
#
def _parse_format_and_scan_info(self) -> None:
"""Format & Scan Info."""
self._parse_items(
ORIGINAL_FORMAT_SCAN_INFO_RE,
require_all=True,
@ -231,7 +231,7 @@ class ComicFilenameParser:
self._parse_items(ISSUE_BEGIN_RE)
self._log("After Issue on ends of tokens")
def _parse_publisher(self):
def _parse_publisher(self) -> None:
"""Parse Publisher."""
# Pop single tokens so they don't end up titles.
self._parse_items(PUBLISHER_UNAMBIGUOUS_TOKEN_RE, first_only=True)
@ -243,15 +243,19 @@ class ComicFilenameParser:
self._parse_items(PUBLISHER_AMBIGUOUS_RE, pop=False, first_only=True)
self._log("After publisher")
def _is_title_in_position(self, value):
def _is_at_title_position(self, value: str) -> bool:
"""Does the title come after series and one other token if they exist."""
title_index = self.path.find(value)
# Does a series come first.
if title_index < self.path_index("series"):
# Titles must come after series but before format and scan_info
if (
title_index < self.path_index("series")
or title_index > self.path_index("original_format", maxsize)
or title_index > self.path_index("scan_info", maxsize)
):
return False
# If other tokens exist then they must precede the title.
# Titles must be after the series and one other token.
title_ok = False
other_tokens_exist = False
for preceding_key in _TITLE_PRECEDING_KEYS:
@ -270,7 +274,28 @@ class ComicFilenameParser:
value = value.strip("'").strip()
return value.strip('"').strip()
def _parse_series_and_title(self):
def _parse_series_and_title_token(
    self, remaining_key_index: int, tokens: list[str]
) -> str:
    """Try to fill one series/title slot from the next token.

    The slot is ``_REMAINING_GROUP_KEYS[remaining_key_index]``. When that
    key is already present in the metadata, nothing is consumed. Otherwise
    the first entry of ``tokens`` is popped (the caller's list is mutated).

    Returns the popped token unchanged when it could not be used for the
    slot, or an empty string when there is nothing to hand back.
    """
    slot = _REMAINING_GROUP_KEYS[remaining_key_index]
    if slot in self.metadata:
        # Already assigned; leave the token list untouched.
        return ""

    candidate = tokens.pop(0)
    found = REMAINING_GROUP_RE.search(candidate)
    if not found:
        return candidate

    text = found.group()
    if slot == "title" and not self._is_at_title_position(text):
        # An out-of-position title is handed back to the caller.
        return candidate

    # The position check uses the raw match; cleanup happens afterwards.
    cleaned = self._grouping_operators_strip(
        NON_NUMBER_DOT_RE.sub(r"\1 \2", text)
    )
    if cleaned:
        self.metadata[slot] = cleaned
    # A token that matched is consumed even if cleanup left it empty.
    return ""
def _parse_series_and_title(self) -> None:
"""Assign series and title."""
if not self._unparsed_path:
return
@ -279,28 +304,18 @@ class ComicFilenameParser:
unused_tokens = []
tokens = self._unparsed_path.split(TOKEN_DELIMETER)
while tokens and remaining_key_index < len(_REMAINING_GROUP_KEYS):
key = _REMAINING_GROUP_KEYS[remaining_key_index]
if key in self.metadata:
continue
token = tokens.pop(0)
match = REMAINING_GROUP_RE.search(token)
if match:
value = match.group()
if key == "title" and not self._is_title_in_position(value):
unused_tokens.append(token)
continue
value = self._grouping_operators_strip(value)
value = NON_NUMBER_DOT_RE.sub(r"\1 \2", value)
self.metadata[key] = value
remaining_key_index += 1
else:
unused_tokens.append(token)
unused_token = self._parse_series_and_title_token(
remaining_key_index, tokens
)
if unused_token:
unused_tokens.append(unused_token)
remaining_key_index += 1
print(f"{unused_tokens=}")
self._unparsed_path = " ".join(unused_tokens) if unused_tokens else ""
self._log("After Series & Title")
def _add_remainders(self):
def _add_remainders(self) -> None:
"""Add Remainders."""
remainders = []
for token in self._unparsed_path.split(TOKEN_DELIMETER):
@ -310,7 +325,7 @@ class ComicFilenameParser:
if remainders:
self.metadata["remainders"] = tuple(remainders)
def parse(self) -> dict[str, Any]:
def parse(self) -> dict[str, str | tuple[str, ...]]:
"""Parse the filename with a hierarchy of regexes."""
self._log("Init")
self._parse_ext()
@ -345,7 +360,9 @@ class ComicFilenameParser:
self._path_indexes: dict[str, int] = {}
def comicfn2dict(path: str | Path, verbose: int = 0):
def comicfn2dict(
path: str | Path, verbose: int = 0
) -> dict[str, str | tuple[str, ...]]:
"""Simple API."""
parser = ComicFilenameParser(path, verbose=verbose)
return parser.parse()

View File

@ -39,7 +39,7 @@ _DATE_KEYS = ("year", "month", "day")
class ComicFilenameSerializer:
"""Serialize Comic Filenames from dict."""
def _log(self, label, fn):
def _log(self, label: str, fn: str) -> None:
"""Log progress."""
if not self._debug:
return
@ -95,7 +95,7 @@ class ComicFilenameSerializer:
for tag, fmt in _FILENAME_FORMAT_TAGS:
if token := self._tokenize_tag(tag, fmt):
tokens.append(token)
self._log(f"After {tag}", tokens)
self._log(f"After {tag}", str(tokens))
fn = " ".join(tokens)
fn += self._add_remainder()

View File

@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry]
name = "comicfn2dict"
version = "0.2.0a2"
version = "0.2.0a3"
description = "Parse common comic filenames and return a dict of metadata attributes. Includes a cli."
license = "GPL-3.0-only"
authors = ["AJ Slater <aj@slater.net>"]

View File

@ -56,11 +56,6 @@ FNS = {
"Long Series Name #001 (2000) Title (TPB) (Releaser).cbz": TEST_COMIC_FIELDS,
"Long Series Name (2000) 001 Title (TPB) (Releaser).cbz": TEST_COMIC_FIELDS,
"Long Series Name (2000) #001 Title (TPB) (Releaser).cbz": TEST_COMIC_FIELDS,
"Long Series Name v1 (2000) #001 "
"Title (TPB) (Releaser).cbz": TEST_COMIC_FIELDS_VOL,
"Long Series Name 001 (2000) (TPB-Releaser) Title.cbz": TEST_COMIC_FIELDS,
"Long Series Name Vol 1 "
"(2000) (TPB) (Releaser & Releaser-Releaser) Title.cbr": TEST_COMIC_VOL_ONLY,
"Ultimate Craziness (2019) (Digital) (Friends-of-Bill).cbr": {
"series": "Ultimate Craziness",
"year": "2019",
@ -443,6 +438,41 @@ FNS.update(
"restored) (Shadowcat-Empire)",
),
},
"Captain Science #001 (1950) The Beginning - nothing.cbz": {
"ext": "cbz",
"issue": "001",
"title": "The Beginning - nothing",
"series": "Captain Science",
"year": "1950",
},
"Captain Science #001-cix-cbi.cbr": {
"ext": "cbr",
"issue": "001",
"series": "Captain Science",
"remainders": ("cix-cbi",),
},
"Long Series Name v1 (2000) #001 "
"Title (TPB) (Releaser).cbz": TEST_COMIC_FIELDS_VOL,
"Long Series Name 001 (2000) (TPB-Releaser) Title.cbz": {
"series": "Long Series Name",
"issue": "001",
"year": "2000",
"original_format": "TPB",
"scan_info": "Releaser",
"remainders": ("Title",),
"ext": "cbz",
},
"Long Series Name Vol 1 "
"(2000) (TPB) (Releaser & Releaser-Releaser) Title.cbr": {
"series": "Long Series Name",
"volume": "1",
"issue": "1",
"remainders": ("Title",),
"original_format": "TPB",
"year": "2000",
"scan_info": "Releaser & Releaser-Releaser",
"ext": "cbr",
},
}
)