make parser a class. use delimiters in a string instead of the data_list

This commit is contained in:
AJ Slater 2024-02-20 00:21:54 -08:00
parent 71dd1d3972
commit 664f54cecb
8 changed files with 246 additions and 292 deletions
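
For orientation, a rough sketch of how the public API changes with this commit (the filename below is taken from the test fixtures; the re-exports are assumed to match the package __init__ shown in the first changed file):

from comicfn2dict import ComicFilenameParser
from comicfn2dict.unparse import serialize

name = "Jeremy John 003 (2007) (digital) (Minutemen-Faessla).cbz"
# The old module-level comicfn2dict(name) call becomes a parser object;
# verbose > 0 prints the intermediate parsing steps after each pass.
metadata = ComicFilenameParser(name, verbose=0).parse()
# dict2comicfn() appears to be renamed serialize() in the unparse module.
filename = serialize(metadata, ext=True)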

View File

@ -1,3 +1,3 @@
"""Comic Filename to Dict parser and unparser."""
from .parse import comicfn2dict # noqa: F401
from .unparse import dict2comicfn # noqa: F401
from .parse import ComicFilenameParser # noqa: F401
from .unparse import serialize # noqa: F401

View File

@ -3,8 +3,7 @@
from argparse import ArgumentParser
from pathlib import Path
from pprint import pprint
from comicfn2dict.parse import comicfn2dict
from comicfn2dict.parse import ComicFilenameParser
def main():
@ -12,9 +11,16 @@ def main():
description = "Comic book archive read/write tool."
parser = ArgumentParser(description=description)
parser.add_argument("path", help="Path of comic filename to parse", type=Path)
parser.add_argument(
"-v",
"--verbose",
default=0,
action="count",
help="Display intermediate parsing steps. Good for debugging.",
)
args = parser.parse_args()
name = args.path.name
metadata = comicfn2dict(name)
metadata = ComicFilenameParser(name, verbose=args.verbose).parse()
pprint(metadata) # noqa:T203

View File

@ -1,3 +1,3 @@
"""API import source."""
from comicfn2dict.parse import comicfn2dict # noqa: F401
from comicfn2dict.parse import ComicFilenameParser # noqa: F401
from comicfn2dict.unparse import dict2comicfn # noqa: F401

View File

@ -1,22 +1,21 @@
"""Parse comic book archive names using the simple 'parse' parser."""
from pprint import pprint
from copy import copy
from pathlib import Path
from re import Match, Pattern
from re import Pattern
from typing import Any
from comicfn2dict.regex import (
EXTRA_SPACES_RE,
ISSUE_ANYWHERE_RE,
ISSUE_BEGIN_RE,
ISSUE_COUNT_RE,
ISSUE_END_RE,
ISSUE_NUMBER_RE,
ISSUE_TOKEN_RE,
ISSUE_BEGIN_RE,
ISSUE_END_RE,
NON_SPACE_DIVIDER_RE,
ORIGINAL_FORMAT_RE,
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
ORIGINAL_FORMAT_SCAN_INFO_RE,
REMAINING_GROUP_RE,
SCAN_INFO_RE,
VOLUME_RE,
YEAR_BEGIN_RE,
YEAR_END_RE,
@ -24,35 +23,31 @@ from comicfn2dict.regex import (
)
_REMAINING_GROUP_KEYS = ("series", "title")
_TITLE_PRECEDING_KEYS = ("issue", "year", "volume")
_TOKEN_DELIMETER = "/"
def _parse_ext(name: str | Path, metadata: dict) -> str:
"""Pop the extension from the pathname."""
if isinstance(name, str):
name = name.strip()
path = Path(name)
suffix = path.suffix
data = path.name.removesuffix(suffix)
ext = suffix.lstrip(".")
if ext:
metadata["ext"] = ext
return data
class ComicFilenameParser:
@staticmethod
def _clean_dividers(data: str) -> str:
"""Replace non space dividers and clean extra spaces out of string."""
data = NON_SPACE_DIVIDER_RE.sub(" ", data)
return EXTRA_SPACES_RE.sub(" ", data)
return EXTRA_SPACES_RE.sub(" ", data).strip()
def _parse_ext(self):
"""Pop the extension from the pathname."""
path = Path(self._unparsed_path)
suffix = path.suffix
if not suffix:
return
self.path_indexes["ext"] = self.path.rfind(suffix)
def _get_data_list(path: str | Path, metadata: dict) -> list[str]:
"""Prepare data list from a path or string."""
data = _parse_ext(path, metadata)
data = _clean_dividers(data)
return [data]
data = path.name.removesuffix(suffix)
ext = suffix.lstrip(".")
self.metadata["ext"] = ext
self._unparsed_path = data
def _grouping_operators_strip(value: str) -> str:
def _grouping_operators_strip(self, value: str) -> str:
"""Strip spaces and parens."""
value = value.strip()
value = value.strip("()").strip()
@ -60,234 +55,163 @@ def _grouping_operators_strip(value: str) -> str:
value = value.strip("'").strip('"').strip()
return value
def _splicey_dicey(
data_list: list[str], index: int, match: Match, match_group: int | str = 0
) -> str:
"""Replace a string token from a list with two strings and the value removed.
And return the value.
"""
value = match.group(match_group)
data = data_list.pop(index)
data_ends = []
if data_before := data[: match.start()].strip():
data_ends.append(data_before)
if data_after := data[match.end() :].strip():
data_ends.append(data_after)
data_list[index:index] = data_ends
return _grouping_operators_strip(value)
def _match_original_format_and_scan_info(
match: Match, metadata: dict[str, Any], data_list: list[str], index: int
) -> None:
"""Match (ORIGINAL_FORMAT-SCAN_INFO)."""
original_format = match.group("original_format")
try:
scan_info = match.group("scan_info")
except IndexError:
scan_info = None
metadata["original_format"] = _grouping_operators_strip(original_format)
match_group = 1
if scan_info:
metadata["scan_info"] = _grouping_operators_strip(scan_info)
match_group = 0
_splicey_dicey(data_list, index, match, match_group=match_group)
def _parse_original_format_and_scan_info(data_list: list[str], metadata: dict) -> int:
"""Parse (ORIGINAL_FORMAT-SCAN_INFO)."""
index = 0
match = None
for data in data_list:
match = ORIGINAL_FORMAT_SCAN_INFO_RE.search(data)
if match:
_match_original_format_and_scan_info(match, metadata, data_list, index)
break
index += 1
else:
index = 0
return index
def _pop_value_from_token(
data_list: list,
metadata: dict,
regex: Pattern,
key: str,
index: int = 0,
) -> str:
"""Search token for value, splice and assign to metadata."""
data = data_list[index]
match = regex.search(data)
if match:
value = _splicey_dicey(data_list, index, match, key)
metadata[key] = value
else:
value = ""
return value
def _parse_item(
data_list: list[str],
metadata: dict,
self,
regex: Pattern,
key: str,
start_index: int = 0,
path: str = "",
) -> int:
require_all: bool = False,
) -> None:
"""Parse a value from the data list into metadata and alter the data list."""
path_index = -1
index = start_index
dl_len = end_index = len(data_list)
if index >= end_index:
index = 0
while index < end_index:
value = _pop_value_from_token(data_list, metadata, regex, key, index)
if value:
if "key" == "issue":
path_index = path.find(value)
break
index += 1
if index > dl_len and start_index > 0:
index = 0
end_index = start_index
return path_index
matches = regex.search(self._unparsed_path)
if not matches:
return
matched_metadata = {}
matched_path_indexes = {}
for key, value in matches.groupdict().items():
if not value:
if require_all:
return
continue
matched_path_indexes[key] = self.path.find(value)
# TODO idk if strip is necessary here
matched_metadata[key] = self._grouping_operators_strip(value)
self.metadata.update(matched_metadata)
self.path_indexes.update(matched_path_indexes)
marked_str = regex.sub(_TOKEN_DELIMETER, self._unparsed_path)
parts = []
for part in marked_str.split(_TOKEN_DELIMETER):
if token := part.strip():
parts.append(token)
self._unparsed_path = _TOKEN_DELIMETER.join(parts)
def _pop_issue_from_text_fields(
data_list: list[str], metadata: dict, index: int
) -> str:
"""Search issue from ends of text fields."""
if "issue" not in metadata:
_pop_value_from_token(data_list, metadata, ISSUE_END_RE, "issue", index=index)
if "issue" not in metadata:
_pop_value_from_token(data_list, metadata, ISSUE_BEGIN_RE, "issue", index=index)
return data_list.pop(index)
TITLE_PRECEDING_KEYS = ("issue", "year", "volume")
def _is_title_in_position(path, value, metadata):
def _is_title_in_position(self, value):
"""Does the title come after series and one other token if they exist."""
# TODO this could be faster if indexes could be grabbed for these tokens
# when they are extracted.
title_index = path.find(value)
title_index = self.path.find(value)
# Does a series come first.
series = metadata.get("series")
if not series:
return False
series_index = path.find(series)
if title_index < series_index:
if title_index < self.path_indexes.get("series", -1):
return False
# If other tokens exist then they must precede the title.
title_ok = False
other_tokens_exist = False
for preceding_key in TITLE_PRECEDING_KEYS:
preceding_value = metadata.get(preceding_key)
if not preceding_value:
continue
for preceding_key in _TITLE_PRECEDING_KEYS:
other_tokens_exist = True
preceding_index = path.find(preceding_value)
if title_index > preceding_index:
if title_index > self.path_indexes.get(preceding_key, -1):
title_ok = True
break
return title_ok or not other_tokens_exist
def _assign_remaining_groups(data_list: list[str], metadata: dict, path: str):
def _assign_remaining_groups(self):
"""Assign series and title."""
index = 0
for key in _REMAINING_GROUP_KEYS:
try:
data = data_list[index]
except (IndexError, TypeError):
break
match = REMAINING_GROUP_RE.search(data) if data else None
if not self._unparsed_path:
return
# TODO fix REMAINING_GROUP_RE to use token delim
tokens = self._unparsed_path.split(_TOKEN_DELIMETER)
# ASSIGN GROUPS
remaining_key_index = 0
unused_tokens = []
while tokens and remaining_key_index < len(_REMAINING_GROUP_KEYS):
key = _REMAINING_GROUP_KEYS[remaining_key_index]
token = tokens.pop(0)
match = REMAINING_GROUP_RE.search(token)
if match:
value = _pop_issue_from_text_fields(data_list, metadata, index)
if key == "title" and not _is_title_in_position(path, value, metadata):
value = match.group()
if key == "title" and not self._is_title_in_position(value):
unused_tokens.append(token)
continue
value = _grouping_operators_strip(value)
if value:
metadata[key] = value
value = self._grouping_operators_strip(value)
self.metadata[key] = value
self.path_indexes[key] = self.path.find(value)
remaining_key_index += 1
else:
index += 1
unused_tokens.append(token)
self._unparsed_path = " ".join(unused_tokens) if unused_tokens else ""
def _pickup_issue(remainders: list[str], metadata: dict) -> None:
"""Get issue from remaining tokens or anywhere in a pinch."""
if "issue" in metadata:
def _add_remainders(self):
"""Add Remainders."""
remainders = []
for token in self._unparsed_path.split(_TOKEN_DELIMETER):
if remainder := token.strip():
remainders.append(remainder)
if remainders:
self.metadata["remainders"] = tuple(remainders)
def _log_progress(self, label):
if not self._debug:
return
_parse_item(remainders, metadata, ISSUE_TOKEN_RE, "issue")
if "issue" in metadata:
return
_parse_item(remainders, metadata, ISSUE_ANYWHERE_RE, "issue")
def _log_progress(label, metadata, data_list):
print(label + ":")
pprint(metadata)
pprint(data_list)
combined = {}
for key in self.metadata:
combined[key] = (self.metadata.get(key), self.path_indexes.get(key))
pprint(combined)
print(self._unparsed_path)
def comicfn2dict(path: str | Path) -> dict[str, Any]:
def parse(self) -> dict[str, Any]:
"""Parse the filename with a hierarchy of regexes."""
metadata = {}
data_list = _get_data_list(path, metadata)
_log_progress("INITIAL", metadata, data_list)
self._unparsed_path = self._clean_dividers(self._unparsed_path)
self._log_progress("INITIAL")
self._parse_ext()
# Parse paren tokens
_parse_item(data_list, metadata, ISSUE_COUNT_RE, "issue_count")
_parse_item(data_list, metadata, YEAR_TOKEN_RE, "year")
of_index = _parse_original_format_and_scan_info(data_list, metadata)
if "original_format" not in metadata:
of_index = _parse_item(
data_list, metadata, ORIGINAL_FORMAT_RE, "original_format"
self._parse_item(ISSUE_COUNT_RE)
self._parse_item(YEAR_TOKEN_RE)
self._parse_item(
ORIGINAL_FORMAT_SCAN_INFO_RE,
require_all=True,
)
if "scan_info" not in metadata:
# Start searching for scan_info after original format.
_parse_item(
data_list,
metadata,
SCAN_INFO_RE,
"scan_info",
start_index=of_index + 1,
if "original_format" not in self.metadata:
self._parse_item(
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
)
_log_progress("AFTER PAREN TOKENS", metadata, data_list)
self._log_progress("AFTER PAREN TOKENS")
# Parse regular tokens
_parse_item(data_list, metadata, VOLUME_RE, "volume")
_parse_item(data_list, metadata, ISSUE_NUMBER_RE, "issue", path=str(path))
_log_progress("AFTER REGULAR TOKENS", metadata, data_list)
self._parse_item(VOLUME_RE)
self._parse_item(ISSUE_NUMBER_RE)
self._log_progress("AFTER REGULAR TOKENS")
# Pickup year if not gotten.
if "year" not in metadata:
_parse_item(data_list, metadata, YEAR_BEGIN_RE, "year")
if "year" not in metadata:
_parse_item(data_list, metadata, YEAR_END_RE, "year")
_log_progress("AFTER YEAR PICKUP", metadata, data_list)
if "year" not in self.metadata:
self._parse_item(YEAR_BEGIN_RE)
if "year" not in self.metadata:
self._parse_item(YEAR_END_RE)
self._log_progress("AFTER YEAR PICKUP")
# Pickup issue if it's a standalone token
if "issue" not in metadata:
_parse_item(data_list, metadata, ISSUE_TOKEN_RE, "issue")
if "issue" not in self.metadata:
self._parse_item(ISSUE_END_RE)
if "issue" not in self.metadata:
self._parse_item(ISSUE_BEGIN_RE)
_log_progress("AFTER ISSUE PICKUP", metadata, data_list)
self._log_progress("AFTER ISSUE PICKUP")
# Series and Title. Also looks for issue.
_assign_remaining_groups(data_list, metadata, str(path))
_log_progress("AFTER SERIES AND TITLE", metadata, data_list)
self._assign_remaining_groups()
self._log_progress("AFTER SERIES AND TITLE")
# Final try for issue number.
_pickup_issue(data_list, metadata)
_log_progress("AFTER ISSUE PICKUP", metadata, data_list)
if "issue" not in self.metadata:
# TODO is this useful?
self._parse_item(ISSUE_ANYWHERE_RE)
self._log_progress("AFTER ISSUE PICKUP")
# Add Remainders
if data_list:
metadata["remainders"] = tuple(data_list)
self._add_remainders()
return metadata
return self.metadata
def __init__(self, path: str | Path, verbose: int = 0):
"""Initialize."""
self._debug: bool = verbose > 0
self.metadata: dict[str, str | tuple[str, ...]] = {}
self.path_indexes: dict[str, int] = {}
# munge path
if isinstance(path, str):
path = path.strip()
p_path = Path(path)
self.path = str(p_path.name).strip()
self._unparsed_path = copy(self.path)
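
The heart of the refactor in _parse_item(): instead of splicing matched values out of a data_list, each consumed span of a single working string is overwritten with the "/" token delimiter and the leftover fragments are re-joined. A minimal standalone sketch of that idea, using an illustrative regex and filename rather than the library's own (TOKEN_DELIMITER here mirrors the module's _TOKEN_DELIMETER constant):

import re

TOKEN_DELIMITER = "/"
unparsed = "Jeremy John 003 (2007) (digital)"
year_re = re.compile(r"\((?P<year>\d{4})\)")

match = year_re.search(unparsed)
if match:
    year = match.group("year")  # "2007"
    # Mark the consumed span with the delimiter instead of removing a list item.
    marked = year_re.sub(TOKEN_DELIMITER, unparsed)  # "Jeremy John 003 / (digital)"
    parts = [part.strip() for part in marked.split(TOKEN_DELIMITER) if part.strip()]
    unparsed = TOKEN_DELIMITER.join(parts)  # "Jeremy John 003/(digital)"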

View File

@ -51,24 +51,27 @@ YEAR_BEGIN_RE = re_compile(r"^" + _YEAR_RE_EXP + r"\b")
YEAR_END_RE = re_compile(r"\b" + _YEAR_RE_EXP + r"$")
_OF_PATTERNS = r"|".join(ORIGINAL_FORMAT_PATTERNS)
_ORIGINAL_FORMAT_RE_EXP = r"(?P<original_format>" + _OF_PATTERNS + r")"
ORIGINAL_FORMAT_RE = re_compile(_ORIGINAL_FORMAT_RE_EXP, parenthify=True)
_SCAN_INFO_RE_EXP = r"(?P<scan_info>[^()]+?)"
SCAN_INFO_RE = re_compile(_SCAN_INFO_RE_EXP, parenthify=True)
_ORIGINAL_FORMAT_SCAN_INFO_RE_EXP = (
_ORIGINAL_FORMAT_RE_EXP + r"(?:-" + _SCAN_INFO_RE_EXP + r")?"
_ORIGINAL_FORMAT_RE_EXP + r"\s*[\(:-]" + _SCAN_INFO_RE_EXP # + r")?"
)
ORIGINAL_FORMAT_SCAN_INFO_RE = re_compile(
_ORIGINAL_FORMAT_SCAN_INFO_RE_EXP, parenthify=True
)
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE = re_compile(
r"\(" + _ORIGINAL_FORMAT_RE_EXP + r"\).*\(" + _SCAN_INFO_RE_EXP + r"\)"
)
# REGULAR TOKENS
VOLUME_RE = re_compile(r"((?:v(?:ol(?:ume)?)?\.?)\s*(?P<volume>\d+))")
_ISSUE_NUMBER_RE_EXP = r"(?P<issue>[\w½]+\.?\d*\w*)"
ISSUE_NUMBER_RE = re_compile(r"(#" + _ISSUE_NUMBER_RE_EXP + r")")
_ISSUE_RE_EXP = r"(?P<issue>[\d½]+\.?\d*\w*)"
ISSUE_TOKEN_RE = re_compile(r"^(" + _ISSUE_RE_EXP + r")$")
ISSUE_END_RE = re_compile(r"\b(" + _ISSUE_RE_EXP + r")$")
ISSUE_BEGIN_RE = re_compile(r"^(" + _ISSUE_RE_EXP + r")\b")
ISSUE_END_RE = re_compile(r"([\/\s]" + _ISSUE_RE_EXP + r"(\/|$))")
ISSUE_BEGIN_RE = re_compile(r"((^|\/)" + _ISSUE_RE_EXP + r"[\/|\s])")
# TODO is this used?
ISSUE_ANYWHERE_RE = re_compile(r"\b(" + _ISSUE_RE_EXP + r")\b")
# LONG STRINGS
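
The new ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE covers the case where the format and the scan info sit in separate parenthesized groups, as in the "Jeremy John" test fixtures below. A rough standard-library equivalent (the project's re_compile wrapper and the full ORIGINAL_FORMAT_PATTERNS list are not shown in this hunk, so both are simplified here):

import re

original_format_exp = r"(?P<original_format>digital)"  # stand-in for the full pattern list
scan_info_exp = r"(?P<scan_info>[^()]+?)"
separate_re = re.compile(
    r"\(" + original_format_exp + r"\).*\(" + scan_info_exp + r"\)", re.IGNORECASE
)

m = separate_re.search("Jeremy John 003 (2007) (digital) (Minutemen-Faessla)")
assert m is not None
assert m.group("original_format") == "digital"
assert m.group("scan_info") == "Minutemen-Faessla"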

View File

@ -28,22 +28,27 @@ _FILENAME_FORMAT_TAGS: tuple[tuple[str, str | Callable], ...] = (
_EMPTY_VALUES: tuple[None, str] = (None, "")
def dict2comicfn(md: Mapping, ext: bool = True) -> str | None:
"""Get our preferred basename from a metadata dict."""
if not md:
return None
tokens = []
for tag, fmt in _FILENAME_FORMAT_TAGS:
def _tokenize_tag(md: Mapping, tag: str, fmt: str | Callable) -> str:
val = md.get(tag)
if val in _EMPTY_VALUES:
continue
return ""
final_fmt = fmt(val) if isinstance(fmt, Callable) else fmt
token = final_fmt.format(val).strip()
if token:
return token
def serialize(md: Mapping, ext: bool = True) -> str:
"""Get our preferred basename from a metadata dict."""
if not md:
return ""
tokens = []
for tag, fmt in _FILENAME_FORMAT_TAGS:
if token := _tokenize_tag(md, tag, fmt):
tokens.append(token)
fn = " ".join(tokens)
if remainders := md.get("remainders"):
remainder = " ".join(remainders)
# TODO oh this is the - delimited remainder :(
fn += f" - {remainder}"
if ext:
fn += "." + md.get("ext", "cbz")

View File

@ -136,8 +136,7 @@ FNS = {
"year": "2006",
"ext": "cbz",
"scan_info": "Minutemen-Faessla",
# "original_format": "digital",
"remainders": ("(digital",),
"original_format": "digital",
},
"Jeremy John 003 (2007) (4 covers) (digital) (Minutemen-Faessla).cbz": {
"series": "Jeremy John",
@ -243,6 +242,7 @@ FNS = {
FNS.update( # Newly fixed.
{
# BIG Change. title after token. more stripping.
"'Batman - Superman - World's Finest 022 (2024) (Webrip) (The Last Kryptonian-DCP).cbz": {
"ext": "cbz",
"issue": "022",
@ -252,6 +252,7 @@ FNS.update( # Newly fixed.
"year": "2024",
},
# Issue number starting with a letter requested in https://github.com/comictagger/comictagger/issues/543
# word characters now allowed to lead issue numbers only if preceded by a # marker
"batman #B01 title.cbz": {
"ext": "cbz",
"issue": "B01",
@ -261,32 +262,47 @@ FNS.update( # Newly fixed.
}
)
FNS.update(
{
WONFIX = {
# Leading issue number is usually an alternate sequence number
# WONTFIX: Series names may begin with numerals.
"52 action comics #2024.cbz": {
"ext": "cbz",
"issue": "2024",
"series": "action comics",
"alternate": "52",
}, # 4 digit issue number
"action comics 1024.cbz": {
"ext": "cbz",
"issue": "1024",
"series": "action comics",
}, # Only the issue number. CT ensures that the series always has a value if possible
},
# Only the issue number. CT ensures that the series always has a value if possible
# I don't think making the series the same as the number is valuable.
"#52.cbz": {
"ext": "cbz",
"issue": "52",
"series": "52",
}, # CT treats double-underscore the same as double-dash
},
}
LATER = {
# 4 digit issue number
# should this be an issue number if year is DONE?
"action comics 1024.cbz": {
"ext": "cbz",
"issue": "1024",
"series": "action comics",
},
}
FNS.update(
{
# CT treats double-underscore the same as double-dash
# BUG: should be title right now.
# FEATURE: double dash should be a token delimiter?
"Monster_Island_v1_#2__repaired__c2c.cbz": {
"ext": "cbz",
"issue": "2",
"series": "Monster Island",
"volume": "1",
}, # I'm not sure there's a right way to parse this. This might also be a made-up filename I don't remember
"remainders": ("repaired c2c",),
},
# I'm not sure there's a right way to parse this. This might also be a made-up filename I don't remember
"Super Strange Yarns (1957) #92 (1969).cbz": {
"ext": "cbz",
"issue": "92",

View File

@ -5,7 +5,7 @@ from types import MappingProxyType
import pytest
from deepdiff.diff import DeepDiff
from comicfn2dict import comicfn2dict
from comicfn2dict import ComicFilenameParser
from tests.comic_filenames import FNS
ALL_FIELDS = frozenset({"series", "volume", "issue", "issue_count", "year", "ext"})
@ -16,7 +16,7 @@ FIELD_SCHEMA = MappingProxyType({key: None for key in ALL_FIELDS})
def test_parse_filename(item):
"""Test filename parsing."""
fn, defined_fields = item
md = comicfn2dict(fn)
md = ComicFilenameParser(fn, verbose=1).parse()
diff = DeepDiff(defined_fields, md, ignore_order=True)
print(fn)
pprint(defined_fields)