2024-02-23 17:41:42 -08:00

338 lines
11 KiB
Python

"""Parse comic book archive names using the simple 'parse' parser."""
from pprint import pformat
from calendar import month_abbr
from copy import copy
from pathlib import Path
from re import Pattern
from typing import Any
from comicfn2dict.log import print_log_header
from comicfn2dict.regex import (
ALPHA_MONTH_RANGE_RE,
BOOK_VOLUME_RE,
ISSUE_BEGIN_RE,
ISSUE_END_RE,
ISSUE_NUMBER_RE,
ISSUE_WITH_COUNT_RE,
MONTH_FIRST_DATE_RE,
NON_NUMBER_DOT_RE,
ORIGINAL_FORMAT_SCAN_INFO_RE,
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
PUBLISHER_AMBIGUOUS_RE,
PUBLISHER_UNAMBIGUOUS_RE,
PUBLISHER_AMBIGUOUS_TOKEN_RE,
PUBLISHER_UNAMBIGUOUS_TOKEN_RE,
REGEX_SUBS,
REMAINING_GROUP_RE,
SCAN_INFO_SECONDARY_RE,
TOKEN_DELIMETER,
VOLUME_RE,
VOLUME_WITH_COUNT_RE,
YEAR_END_RE,
YEAR_FIRST_DATE_RE,
YEAR_TOKEN_RE,
)
# Metadata keys filled, in this order, from the text that is still unparsed
# once every other pattern has been tried (see ``_assign_remaining_groups``).
_REMAINING_GROUP_KEYS = ("series", "title")
# Keys whose positions in the path are compared with the position of a title
# candidate (see ``_is_title_in_position``).
_TITLE_PRECEDING_KEYS = ("issue", "year", "volume")
# The parts of a full date. The year-first date pattern is only tried while
# at least one of these keys is still missing from the metadata.
_DATE_KEYS = frozenset({"year", "month", "day"})
class ComicFilenameParser:
"""Parse a filename metadata into a dict."""
def path_index(self, key: str):
"""Lazily retrieve and memoize the key's location in the path."""
if key == "remainders":
return -1
value: str = self.metadata.get(key, "") # type: ignore
if not value:
return -1
if value not in self._path_indexes:
# TODO This is fragile.
# Can I get it at match time?
if key == "ext":
index = self.path.rfind(value)
else:
index = self.path.find(value)
self._path_indexes[value] = index
return self._path_indexes[value]
def _parse_ext(self):
"""Pop the extension from the pathname."""
path = Path(self._unparsed_path)
suffix = path.suffix
if not suffix:
return
data = path.name.removesuffix(suffix)
ext = suffix.lstrip(".")
self.metadata["ext"] = ext
self._unparsed_path = data
def _clean_dividers(self):
"""Replace non space dividers and clean extra spaces out of string."""
data = self._unparsed_path
# Simple substitutions
for regex, pair in REGEX_SUBS.items():
replacement, count = pair
data = regex.sub(replacement, data, count=count).strip()
self._unparsed_path = data.strip()
def _parse_items(
self,
regex: Pattern,
require_all: bool = False,
exclude: str = "",
first_only: bool = False,
pop: bool = True,
) -> None:
"""Parse a value from the data list into metadata and alter the data list."""
matches = regex.search(self._unparsed_path)
if not matches:
return
matched_metadata = {}
for key, value in matches.groupdict().items():
if value == exclude:
continue
if not value:
if require_all:
return
continue
matched_metadata[key] = value
if first_only:
break
self.metadata.update(matched_metadata)
if not matched_metadata or not pop:
return
count = 1 if first_only else 0
marked_str = regex.sub(TOKEN_DELIMETER, self._unparsed_path, count=count)
parts = []
for part in marked_str.split(TOKEN_DELIMETER):
if token := part.strip():
parts.append(token)
self._unparsed_path = TOKEN_DELIMETER.join(parts)
def _alpha_month_to_numeric(self):
"""Translate alpha_month to numeric month."""
if alpha_month := self.metadata.pop("alpha_month", ""):
alpha_month = alpha_month.capitalize() # type: ignore
for index, abbr in enumerate(month_abbr):
if abbr and alpha_month.startswith(abbr):
month = f"{index:02d}"
self.metadata["month"] = month
break
def _parse_dates(self):
"""Parse date schemes."""
# Discard second month of alpha month ranges.
self._unparsed_path = ALPHA_MONTH_RANGE_RE.sub(r"\1", self._unparsed_path)
# Month first date
self._parse_items(MONTH_FIRST_DATE_RE)
self._alpha_month_to_numeric()
# Year first date
if _DATE_KEYS - self.metadata.keys():
self._parse_items(YEAR_FIRST_DATE_RE)
self._alpha_month_to_numeric()
if "year" not in self.metadata:
self._parse_items(YEAR_TOKEN_RE, first_only=True)
if "volume" in self.metadata:
return
# A second year will be the real year.
# Move the first year to volume
if volume := self.metadata.get("year", ""):
self._parse_items(YEAR_TOKEN_RE)
if self.metadata.get("year", "") != volume:
self.metadata["volume"] = volume
def _is_title_in_position(self, value):
"""Does the title come after series and one other token if they exist."""
title_index = self.path.find(value)
# Does a series come first.
if title_index < self.path_index("series"):
return False
# If other tokens exist then they much precede the title.
title_ok = False
other_tokens_exist = False
for preceding_key in _TITLE_PRECEDING_KEYS:
other_tokens_exist = True
if title_index > self.path_index(preceding_key):
title_ok = True
break
return title_ok or not other_tokens_exist
def _grouping_operators_strip(self, value: str) -> str:
"""Strip spaces and parens."""
value = value.strip()
value = value.strip("()").strip()
value = value.strip("-").strip()
value = value.strip(",").strip()
value = value.strip("'").strip()
return value.strip('"').strip()
def _assign_remaining_groups(self):
"""Assign series and title."""
if not self._unparsed_path:
return
remaining_key_index = 0
unused_tokens = []
tokens = self._unparsed_path.split(TOKEN_DELIMETER)
while tokens and remaining_key_index < len(_REMAINING_GROUP_KEYS):
key = _REMAINING_GROUP_KEYS[remaining_key_index]
if key in self.metadata:
continue
token = tokens.pop(0)
match = REMAINING_GROUP_RE.search(token)
if match:
value = match.group()
if key == "title" and not self._is_title_in_position(value):
unused_tokens.append(token)
continue
value = self._grouping_operators_strip(value)
value = NON_NUMBER_DOT_RE.sub(r"\1 \2", value)
self.metadata[key] = value
remaining_key_index += 1
else:
unused_tokens.append(token)
self._unparsed_path = " ".join(unused_tokens) if unused_tokens else ""
def _add_remainders(self):
"""Add Remainders."""
remainders = []
for token in self._unparsed_path.split(TOKEN_DELIMETER):
if remainder := token.strip():
remainders.append(remainder)
if remainders:
self.metadata["remainders"] = tuple(remainders)
def _log(self, label):
if not self._debug:
return
print_log_header(label)
combined = {}
for key in self.metadata:
combined[key] = (self.metadata.get(key), self.path_index(key))
print(" " + self._unparsed_path)
print(" " + pformat(combined))
def parse(self) -> dict[str, Any]:
"""Parse the filename with a hierarchy of regexes."""
# Init
#
self._log("Init")
self._parse_ext()
self._clean_dividers()
self._log("After Clean Path")
# Issue
#
self._parse_items(ISSUE_NUMBER_RE)
if "issue" not in self.metadata:
self._parse_items(ISSUE_WITH_COUNT_RE)
# self._parse_items(ISSUE_COUNT_RE)
self._log("After Issue")
# Volume
#
self._parse_items(VOLUME_RE)
if "volume" not in self.metadata:
self._parse_items(VOLUME_WITH_COUNT_RE)
self._log("After Volume")
# Date
#
self._parse_dates()
self._log("After Date")
# Format & Scan Info
#
self._parse_items(
ORIGINAL_FORMAT_SCAN_INFO_RE,
require_all=True,
)
if "original_format" not in self.metadata:
self._parse_items(
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
)
self._parse_items(SCAN_INFO_SECONDARY_RE)
if (
scan_info_secondary := self.metadata.pop("secondary_scan_info", "")
) and "scan_info" not in self.metadata:
self.metadata["scan_info"] = scan_info_secondary # type: ignore
self._log("After original_format & scan_info")
# Series and Title
#
# Volume left on the end of string tokens
if "volume" not in self.metadata:
self._parse_items(BOOK_VOLUME_RE)
self._log("After original_format & scan_info")
# Years left on the end of string tokens
year_end_matched = False
if "year" not in self.metadata:
self._parse_items(YEAR_END_RE, pop=False)
year_end_matched = "year" in self.metadata
self._log("After Year on end of token")
# Issue left on the end of string tokens
if "issue" not in self.metadata and not year_end_matched:
exclude: str = self.metadata.get("year", "") # type: ignore
self._parse_items(ISSUE_END_RE, exclude=exclude)
if "issue" not in self.metadata:
self._parse_items(ISSUE_BEGIN_RE)
self._log("After Issue on ends of tokens")
# Publisher
#
# Pop single tokens so they don't end up titles.
self._parse_items(PUBLISHER_UNAMBIGUOUS_TOKEN_RE, first_only=True)
if "publisher" not in self.metadata:
self._parse_items(PUBLISHER_AMBIGUOUS_TOKEN_RE, first_only=True)
if "publisher" not in self.metadata:
self._parse_items(PUBLISHER_UNAMBIGUOUS_RE, pop=False, first_only=True)
if "publisher" not in self.metadata:
self._parse_items(PUBLISHER_AMBIGUOUS_RE, pop=False, first_only=True)
self._log("After publisher")
self._assign_remaining_groups()
self._log("After Series & Title")
# Copy volume into issue if it's all we have.
#
if "issue" not in self.metadata and "volume" in self.metadata:
self.metadata["issue"] = self.metadata["volume"]
self._log("After issue can be volume")
self._add_remainders()
return self.metadata
def __init__(self, path: str | Path, verbose: int = 0):
"""Initialize."""
self._debug: bool = verbose > 0
# munge path
if isinstance(path, str):
path = path.strip()
p_path = Path(path)
self.path = str(p_path.name).strip()
self.metadata: dict[str, str | tuple[str, ...]] = {}
self._unparsed_path = copy(self.path)
self._path_indexes: dict[str, int] = {}
def comicfn2dict(path: str | Path, verbose: int = 0):
    """Parse one comic archive filename and return its metadata dict.

    Convenience wrapper that builds a ``ComicFilenameParser`` for ``path``
    and runs it once; ``verbose`` is passed through to the parser.
    """
    return ComicFilenameParser(path, verbose=verbose).parse()