338 lines
11 KiB
Python
338 lines
11 KiB
Python
"""Parse comic book archive names using the simple 'parse' parser."""
|
|
from pprint import pformat
|
|
from calendar import month_abbr
|
|
from copy import copy
|
|
from pathlib import Path
|
|
from re import Pattern
|
|
from typing import Any
|
|
from comicfn2dict.log import print_log_header
|
|
from comicfn2dict.regex import (
|
|
ALPHA_MONTH_RANGE_RE,
|
|
BOOK_VOLUME_RE,
|
|
ISSUE_BEGIN_RE,
|
|
ISSUE_END_RE,
|
|
ISSUE_NUMBER_RE,
|
|
ISSUE_WITH_COUNT_RE,
|
|
MONTH_FIRST_DATE_RE,
|
|
NON_NUMBER_DOT_RE,
|
|
ORIGINAL_FORMAT_SCAN_INFO_RE,
|
|
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
|
|
PUBLISHER_AMBIGUOUS_RE,
|
|
PUBLISHER_UNAMBIGUOUS_RE,
|
|
PUBLISHER_AMBIGUOUS_TOKEN_RE,
|
|
PUBLISHER_UNAMBIGUOUS_TOKEN_RE,
|
|
REGEX_SUBS,
|
|
REMAINING_GROUP_RE,
|
|
SCAN_INFO_SECONDARY_RE,
|
|
TOKEN_DELIMETER,
|
|
VOLUME_RE,
|
|
VOLUME_WITH_COUNT_RE,
|
|
YEAR_END_RE,
|
|
YEAR_FIRST_DATE_RE,
|
|
YEAR_TOKEN_RE,
|
|
)
|
|
|
|
# Metadata keys handed out, in this order, to the tokens that are still
# unparsed once every other field has been extracted
# (see ``ComicFilenameParser._assign_remaining_groups``).
_REMAINING_GROUP_KEYS = ("series", "title")
|
|
# Metadata keys whose position in the path is compared with the position of a
# candidate title (see ``ComicFilenameParser._is_title_in_position``).
_TITLE_PRECEDING_KEYS = ("issue", "year", "volume")
|
|
# All date fields. If any of them is still missing after the month-first date
# pass, the year-first date pass is attempted
# (see ``ComicFilenameParser._parse_dates``).
_DATE_KEYS = frozenset({"year", "month", "day"})
|
|
|
|
|
|
class ComicFilenameParser:
|
|
"""Parse a filename metadata into a dict."""
|
|
|
|
def path_index(self, key: str):
|
|
"""Lazily retrieve and memoize the key's location in the path."""
|
|
if key == "remainders":
|
|
return -1
|
|
value: str = self.metadata.get(key, "") # type: ignore
|
|
if not value:
|
|
return -1
|
|
if value not in self._path_indexes:
|
|
# TODO This is fragile.
|
|
# Can I get it at match time?
|
|
if key == "ext":
|
|
index = self.path.rfind(value)
|
|
else:
|
|
index = self.path.find(value)
|
|
self._path_indexes[value] = index
|
|
return self._path_indexes[value]
|
|
|
|
def _parse_ext(self):
|
|
"""Pop the extension from the pathname."""
|
|
path = Path(self._unparsed_path)
|
|
suffix = path.suffix
|
|
if not suffix:
|
|
return
|
|
|
|
data = path.name.removesuffix(suffix)
|
|
ext = suffix.lstrip(".")
|
|
self.metadata["ext"] = ext
|
|
self._unparsed_path = data
|
|
|
|
def _clean_dividers(self):
|
|
"""Replace non space dividers and clean extra spaces out of string."""
|
|
data = self._unparsed_path
|
|
|
|
# Simple substitutions
|
|
for regex, pair in REGEX_SUBS.items():
|
|
replacement, count = pair
|
|
data = regex.sub(replacement, data, count=count).strip()
|
|
self._unparsed_path = data.strip()
|
|
|
|
def _parse_items(
|
|
self,
|
|
regex: Pattern,
|
|
require_all: bool = False,
|
|
exclude: str = "",
|
|
first_only: bool = False,
|
|
pop: bool = True,
|
|
) -> None:
|
|
"""Parse a value from the data list into metadata and alter the data list."""
|
|
matches = regex.search(self._unparsed_path)
|
|
if not matches:
|
|
return
|
|
matched_metadata = {}
|
|
for key, value in matches.groupdict().items():
|
|
if value == exclude:
|
|
continue
|
|
if not value:
|
|
if require_all:
|
|
return
|
|
continue
|
|
matched_metadata[key] = value
|
|
if first_only:
|
|
break
|
|
self.metadata.update(matched_metadata)
|
|
|
|
if not matched_metadata or not pop:
|
|
return
|
|
count = 1 if first_only else 0
|
|
marked_str = regex.sub(TOKEN_DELIMETER, self._unparsed_path, count=count)
|
|
parts = []
|
|
for part in marked_str.split(TOKEN_DELIMETER):
|
|
if token := part.strip():
|
|
parts.append(token)
|
|
self._unparsed_path = TOKEN_DELIMETER.join(parts)
|
|
|
|
def _alpha_month_to_numeric(self):
|
|
"""Translate alpha_month to numeric month."""
|
|
if alpha_month := self.metadata.pop("alpha_month", ""):
|
|
alpha_month = alpha_month.capitalize() # type: ignore
|
|
for index, abbr in enumerate(month_abbr):
|
|
if abbr and alpha_month.startswith(abbr):
|
|
month = f"{index:02d}"
|
|
self.metadata["month"] = month
|
|
break
|
|
|
|
def _parse_dates(self):
|
|
"""Parse date schemes."""
|
|
# Discard second month of alpha month ranges.
|
|
self._unparsed_path = ALPHA_MONTH_RANGE_RE.sub(r"\1", self._unparsed_path)
|
|
|
|
# Month first date
|
|
self._parse_items(MONTH_FIRST_DATE_RE)
|
|
self._alpha_month_to_numeric()
|
|
|
|
# Year first date
|
|
if _DATE_KEYS - self.metadata.keys():
|
|
self._parse_items(YEAR_FIRST_DATE_RE)
|
|
self._alpha_month_to_numeric()
|
|
|
|
if "year" not in self.metadata:
|
|
self._parse_items(YEAR_TOKEN_RE, first_only=True)
|
|
if "volume" in self.metadata:
|
|
return
|
|
# A second year will be the real year.
|
|
# Move the first year to volume
|
|
if volume := self.metadata.get("year", ""):
|
|
self._parse_items(YEAR_TOKEN_RE)
|
|
if self.metadata.get("year", "") != volume:
|
|
self.metadata["volume"] = volume
|
|
|
|
def _is_title_in_position(self, value):
|
|
"""Does the title come after series and one other token if they exist."""
|
|
title_index = self.path.find(value)
|
|
|
|
# Does a series come first.
|
|
if title_index < self.path_index("series"):
|
|
return False
|
|
|
|
# If other tokens exist then they much precede the title.
|
|
title_ok = False
|
|
other_tokens_exist = False
|
|
for preceding_key in _TITLE_PRECEDING_KEYS:
|
|
other_tokens_exist = True
|
|
if title_index > self.path_index(preceding_key):
|
|
title_ok = True
|
|
break
|
|
return title_ok or not other_tokens_exist
|
|
|
|
def _grouping_operators_strip(self, value: str) -> str:
|
|
"""Strip spaces and parens."""
|
|
value = value.strip()
|
|
value = value.strip("()").strip()
|
|
value = value.strip("-").strip()
|
|
value = value.strip(",").strip()
|
|
value = value.strip("'").strip()
|
|
return value.strip('"').strip()
|
|
|
|
def _assign_remaining_groups(self):
|
|
"""Assign series and title."""
|
|
if not self._unparsed_path:
|
|
return
|
|
|
|
remaining_key_index = 0
|
|
unused_tokens = []
|
|
tokens = self._unparsed_path.split(TOKEN_DELIMETER)
|
|
while tokens and remaining_key_index < len(_REMAINING_GROUP_KEYS):
|
|
key = _REMAINING_GROUP_KEYS[remaining_key_index]
|
|
if key in self.metadata:
|
|
continue
|
|
token = tokens.pop(0)
|
|
match = REMAINING_GROUP_RE.search(token)
|
|
if match:
|
|
value = match.group()
|
|
if key == "title" and not self._is_title_in_position(value):
|
|
unused_tokens.append(token)
|
|
continue
|
|
value = self._grouping_operators_strip(value)
|
|
value = NON_NUMBER_DOT_RE.sub(r"\1 \2", value)
|
|
|
|
self.metadata[key] = value
|
|
remaining_key_index += 1
|
|
else:
|
|
unused_tokens.append(token)
|
|
|
|
self._unparsed_path = " ".join(unused_tokens) if unused_tokens else ""
|
|
|
|
def _add_remainders(self):
|
|
"""Add Remainders."""
|
|
remainders = []
|
|
for token in self._unparsed_path.split(TOKEN_DELIMETER):
|
|
if remainder := token.strip():
|
|
remainders.append(remainder)
|
|
|
|
if remainders:
|
|
self.metadata["remainders"] = tuple(remainders)
|
|
|
|
def _log(self, label):
|
|
if not self._debug:
|
|
return
|
|
print_log_header(label)
|
|
combined = {}
|
|
for key in self.metadata:
|
|
combined[key] = (self.metadata.get(key), self.path_index(key))
|
|
print(" " + self._unparsed_path)
|
|
print(" " + pformat(combined))
|
|
|
|
def parse(self) -> dict[str, Any]:
|
|
"""Parse the filename with a hierarchy of regexes."""
|
|
# Init
|
|
#
|
|
self._log("Init")
|
|
self._parse_ext()
|
|
self._clean_dividers()
|
|
self._log("After Clean Path")
|
|
|
|
# Issue
|
|
#
|
|
self._parse_items(ISSUE_NUMBER_RE)
|
|
if "issue" not in self.metadata:
|
|
self._parse_items(ISSUE_WITH_COUNT_RE)
|
|
# self._parse_items(ISSUE_COUNT_RE)
|
|
self._log("After Issue")
|
|
|
|
# Volume
|
|
#
|
|
self._parse_items(VOLUME_RE)
|
|
if "volume" not in self.metadata:
|
|
self._parse_items(VOLUME_WITH_COUNT_RE)
|
|
self._log("After Volume")
|
|
|
|
# Date
|
|
#
|
|
self._parse_dates()
|
|
self._log("After Date")
|
|
|
|
# Format & Scan Info
|
|
#
|
|
self._parse_items(
|
|
ORIGINAL_FORMAT_SCAN_INFO_RE,
|
|
require_all=True,
|
|
)
|
|
if "original_format" not in self.metadata:
|
|
self._parse_items(
|
|
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
|
|
)
|
|
self._parse_items(SCAN_INFO_SECONDARY_RE)
|
|
if (
|
|
scan_info_secondary := self.metadata.pop("secondary_scan_info", "")
|
|
) and "scan_info" not in self.metadata:
|
|
self.metadata["scan_info"] = scan_info_secondary # type: ignore
|
|
self._log("After original_format & scan_info")
|
|
|
|
# Series and Title
|
|
#
|
|
# Volume left on the end of string tokens
|
|
if "volume" not in self.metadata:
|
|
self._parse_items(BOOK_VOLUME_RE)
|
|
self._log("After original_format & scan_info")
|
|
|
|
# Years left on the end of string tokens
|
|
year_end_matched = False
|
|
if "year" not in self.metadata:
|
|
self._parse_items(YEAR_END_RE, pop=False)
|
|
year_end_matched = "year" in self.metadata
|
|
self._log("After Year on end of token")
|
|
|
|
# Issue left on the end of string tokens
|
|
if "issue" not in self.metadata and not year_end_matched:
|
|
exclude: str = self.metadata.get("year", "") # type: ignore
|
|
self._parse_items(ISSUE_END_RE, exclude=exclude)
|
|
if "issue" not in self.metadata:
|
|
self._parse_items(ISSUE_BEGIN_RE)
|
|
self._log("After Issue on ends of tokens")
|
|
|
|
# Publisher
|
|
#
|
|
# Pop single tokens so they don't end up titles.
|
|
self._parse_items(PUBLISHER_UNAMBIGUOUS_TOKEN_RE, first_only=True)
|
|
if "publisher" not in self.metadata:
|
|
self._parse_items(PUBLISHER_AMBIGUOUS_TOKEN_RE, first_only=True)
|
|
if "publisher" not in self.metadata:
|
|
self._parse_items(PUBLISHER_UNAMBIGUOUS_RE, pop=False, first_only=True)
|
|
if "publisher" not in self.metadata:
|
|
self._parse_items(PUBLISHER_AMBIGUOUS_RE, pop=False, first_only=True)
|
|
self._log("After publisher")
|
|
|
|
self._assign_remaining_groups()
|
|
self._log("After Series & Title")
|
|
|
|
# Copy volume into issue if it's all we have.
|
|
#
|
|
if "issue" not in self.metadata and "volume" in self.metadata:
|
|
self.metadata["issue"] = self.metadata["volume"]
|
|
self._log("After issue can be volume")
|
|
|
|
self._add_remainders()
|
|
|
|
return self.metadata
|
|
|
|
def __init__(self, path: str | Path, verbose: int = 0):
|
|
"""Initialize."""
|
|
self._debug: bool = verbose > 0
|
|
# munge path
|
|
if isinstance(path, str):
|
|
path = path.strip()
|
|
p_path = Path(path)
|
|
self.path = str(p_path.name).strip()
|
|
self.metadata: dict[str, str | tuple[str, ...]] = {}
|
|
self._unparsed_path = copy(self.path)
|
|
self._path_indexes: dict[str, int] = {}
|
|
|
|
|
|
def comicfn2dict(path: str | Path, verbose: int = 0):
    """Simple API.

    Parse ``path`` with a fresh ``ComicFilenameParser`` and return the
    resulting metadata dict. A ``verbose`` value above zero enables the
    parser's debug logging.
    """
    return ComicFilenameParser(path, verbose=verbose).parse()
|