2024-02-20 13:42:37 -08:00

240 lines
7.8 KiB
Python

"""Parse comic book archive names using the simple 'parse' parser."""
from pprint import pprint
from copy import copy
from pathlib import Path
from re import Pattern
from typing import Any
from comicfn2dict.regex import (
NON_NUMBER_DOT_RE,
EXTRA_SPACES_RE,
ISSUE_ANYWHERE_RE,
ISSUE_COUNT_RE,
ISSUE_NUMBER_RE,
ISSUE_BEGIN_RE,
ISSUE_END_RE,
NON_SPACE_DIVIDER_RE,
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
ORIGINAL_FORMAT_SCAN_INFO_RE,
REMAINING_GROUP_RE,
VOLUME_RE,
YEAR_BEGIN_RE,
YEAR_END_RE,
YEAR_TOKEN_RE,
)
# Metadata keys that are filled, in this order, from the tokens still left in
# the unparsed path once the regex-driven tokens have been removed.
_REMAINING_GROUP_KEYS = ("series", "title")
# Keys whose positions in the original path are compared against the position
# of a candidate title before that title is accepted.
_TITLE_PRECEDING_KEYS = ("issue", "year", "volume")
# Marker substituted for every matched token; the unparsed path is split on
# this marker when the leftover tokens are processed.
_TOKEN_DELIMETER = "/"
class ComicFilenameParser:
def path_index(self, key: str):
"""Lazily retrieve and memoize the key's location in the path."""
if key == "remainders":
return -1
value: str = self.metadata.get(key, "") # type: ignore
if not value:
return -1
if value not in self._path_indexes:
if key == "ext":
index = self.path.rfind(value)
else:
index = self.path.find(value)
self._path_indexes[value] = index
return self._path_indexes[value]
def _parse_ext(self):
"""Pop the extension from the pathname."""
path = Path(self._unparsed_path)
suffix = path.suffix
if not suffix:
return
data = path.name.removesuffix(suffix)
ext = suffix.lstrip(".")
self.metadata["ext"] = ext
self._unparsed_path = data
def _clean_dividers(self):
"""Replace non space dividers and clean extra spaces out of string."""
data = NON_SPACE_DIVIDER_RE.sub(" ", self._unparsed_path)
self._unparsed_path = EXTRA_SPACES_RE.sub(" ", data).strip()
def _grouping_operators_strip(self, value: str) -> str:
"""Strip spaces and parens."""
value = value.strip()
value = value.strip("()").strip()
value = value.strip("-").strip()
value = value.strip("'").strip('"').strip()
return value
def _parse_item(
self,
regex: Pattern,
require_all: bool = False,
) -> None:
"""Parse a value from the data list into metadata and alter the data list."""
matches = regex.search(self._unparsed_path)
if not matches:
return
matched_metadata = {}
for key, value in matches.groupdict().items():
if not value:
if require_all:
return
continue
# TODO idk if strip is necessary here
matched_metadata[key] = self._grouping_operators_strip(value)
self.metadata.update(matched_metadata)
marked_str = regex.sub(_TOKEN_DELIMETER, self._unparsed_path)
parts = []
for part in marked_str.split(_TOKEN_DELIMETER):
if token := part.strip():
parts.append(token)
self._unparsed_path = _TOKEN_DELIMETER.join(parts)
def _is_title_in_position(self, value):
"""Does the title come after series and one other token if they exist."""
title_index = self.path.find(value)
# Does a series come first.
if title_index < self.path_index("series"):
return False
# If other tokens exist then they much precede the title.
title_ok = False
other_tokens_exist = False
for preceding_key in _TITLE_PRECEDING_KEYS:
other_tokens_exist = True
if title_index > self.path_index(preceding_key):
title_ok = True
break
return title_ok or not other_tokens_exist
def _assign_remaining_groups(self):
"""Assign series and title."""
if not self._unparsed_path:
return
# TODO fix REMAINING GROUP_RE to use token delim
tokens = self._unparsed_path.split(_TOKEN_DELIMETER)
# ASSIGN GROUPS
remaining_key_index = 0
unused_tokens = []
while tokens and remaining_key_index < len(_REMAINING_GROUP_KEYS):
key = _REMAINING_GROUP_KEYS[remaining_key_index]
token = tokens.pop(0)
match = REMAINING_GROUP_RE.search(token)
if match:
value = match.group()
if key == "title" and not self._is_title_in_position(value):
unused_tokens.append(token)
continue
value = self._grouping_operators_strip(value)
value = NON_NUMBER_DOT_RE.sub(r"\1 \2", value)
self.metadata[key] = value
remaining_key_index += 1
else:
unused_tokens.append(token)
self._unparsed_path = " ".join(unused_tokens) if unused_tokens else ""
def _add_remainders(self):
"""Add Remainders."""
remainders = []
for token in self._unparsed_path.split(_TOKEN_DELIMETER):
if remainder := token.strip():
remainders.append(remainder)
if remainders:
self.metadata["remainders"] = tuple(remainders)
def _log_progress(self, label):
if not self._debug:
return
print(label + ":")
combined = {}
for key in self.metadata:
combined[key] = (self.metadata.get(key), self.path_index(key))
pprint(combined)
print(self._unparsed_path)
def parse(self) -> dict[str, Any]:
"""Parse the filename with a hierarchy of regexes."""
self._log_progress("INITIAL")
self._parse_ext()
self._clean_dividers()
self._log_progress("CLEANED")
# Parse paren tokens
self._parse_item(ISSUE_COUNT_RE)
self._parse_item(YEAR_TOKEN_RE)
self._parse_item(
ORIGINAL_FORMAT_SCAN_INFO_RE,
require_all=True,
)
if "original_format" not in self.metadata:
self._parse_item(
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
)
self._log_progress("AFTER PAREN TOKENS")
# Parse regular tokens
self._parse_item(VOLUME_RE)
self._parse_item(ISSUE_NUMBER_RE)
self._log_progress("AFTER REGULAR TOKENS")
# Pickup year if not gotten.
if "year" not in self.metadata:
self._parse_item(YEAR_BEGIN_RE)
if "year" not in self.metadata:
self._parse_item(YEAR_END_RE)
self._log_progress("AFTER YEAR PICKUP")
# Pickup issue if it's a standalone token
if "issue" not in self.metadata:
self._parse_item(ISSUE_END_RE)
if "issue" not in self.metadata:
self._parse_item(ISSUE_BEGIN_RE)
self._log_progress("AFTER ISSUE PICKUP")
# Series and Title. Also looks for issue.
self._assign_remaining_groups()
self._log_progress("AFTER SERIES AND TITLE")
# Final try for issue number.
if "issue" not in self.metadata:
# TODO is this useful?
self._parse_item(ISSUE_ANYWHERE_RE)
self._log_progress("AFTER ISSUE PICKUP")
# Copy volume into issue if it's all we have.
if "issue" not in self.metadata and "volume" in self.metadata:
self.metadata["issue"] = self.metadata["volume"]
self._add_remainders()
return self.metadata
def __init__(self, path: str | Path, verbose: int = 0):
"""Initialize."""
self._debug: bool = verbose > 0
# munge path
if isinstance(path, str):
path = path.strip()
p_path = Path(path)
self.path = str(p_path.name).strip()
self.metadata: dict[str, str | tuple[str, ...]] = {}
self._unparsed_path = copy(self.path)
self._path_indexes: dict[str, int] = {}
def comicfn2dict(path: str | Path, verbose: int = 0) -> dict[str, Any]:
    """Parse a comic archive filename and return its metadata dictionary.

    Args:
        path: A filename or full path handed to ``ComicFilenameParser``.
        verbose: Forwarded to ``ComicFilenameParser``; any value above zero
            makes the parser print its progress after each stage.

    Returns:
        The dictionary produced by ``ComicFilenameParser.parse()``.
    """
    return ComicFilenameParser(path, verbose=verbose).parse()