make parser a class. use delimiters in a string instead of the data_list
This commit is contained in:
parent 71dd1d3972
commit 664f54cecb
@@ -1,3 +1,3 @@
 """Comic Filename to Dict parser and unparser."""
-from .parse import comicfn2dict  # noqa: F401
-from .unparse import dict2comicfn  # noqa: F401
+from .parse import ComicFilenameParser  # noqa: F401
+from .unparse import serialize  # noqa: F401
@@ -3,8 +3,7 @@
 from argparse import ArgumentParser
 from pathlib import Path
 from pprint import pprint

-from comicfn2dict.parse import comicfn2dict
+from comicfn2dict.parse import ComicFilenameParser


 def main():
@@ -12,9 +11,16 @@ def main():
     description = "Comic book archive read/write tool."
     parser = ArgumentParser(description=description)
     parser.add_argument("path", help="Path of comic filename to parse", type=Path)
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        default=0,
+        action="count",
+        help="Display intermediate parsing steps. Good for debugging.",
+    )
     args = parser.parse_args()
     name = args.path.name
-    metadata = comicfn2dict(name)
+    metadata = ComicFilenameParser(name, verbose=args.verbose).parse()
     pprint(metadata)  # noqa:T203
@@ -1,3 +1,3 @@
 """API import source."""
-from comicfn2dict.parse import comicfn2dict  # noqa: F401
+from comicfn2dict.parse import ComicFilenameParser  # noqa: F401
 from comicfn2dict.unparse import dict2comicfn  # noqa: F401
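Taken together, the import changes above swap the old function-style entry point for the new class-based API. A minimal usage sketch (not part of the commit; the filename is only an example):

    from comicfn2dict.parse import ComicFilenameParser

    # verbose > 0 prints the intermediate parsing steps, mirroring the new --verbose CLI flag.
    metadata = ComicFilenameParser("Jeremy John 003 (2007) (digital).cbz", verbose=0).parse()
    print(metadata.get("series"), metadata.get("issue"), metadata.get("year"))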
@@ -1,22 +1,21 @@
 """Parse comic book archive names using the simple 'parse' parser."""
 from pprint import pprint
+from copy import copy
 from pathlib import Path
-from re import Match, Pattern
+from re import Pattern
 from typing import Any

 from comicfn2dict.regex import (
     EXTRA_SPACES_RE,
     ISSUE_ANYWHERE_RE,
-    ISSUE_BEGIN_RE,
     ISSUE_COUNT_RE,
-    ISSUE_END_RE,
     ISSUE_NUMBER_RE,
-    ISSUE_TOKEN_RE,
+    ISSUE_BEGIN_RE,
+    ISSUE_END_RE,
     NON_SPACE_DIVIDER_RE,
-    ORIGINAL_FORMAT_RE,
+    ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
     ORIGINAL_FORMAT_SCAN_INFO_RE,
     REMAINING_GROUP_RE,
-    SCAN_INFO_RE,
     VOLUME_RE,
     YEAR_BEGIN_RE,
     YEAR_END_RE,
@@ -24,270 +23,195 @@ from comicfn2dict.regex import (
 )

 _REMAINING_GROUP_KEYS = ("series", "title")
+_TITLE_PRECEDING_KEYS = ("issue", "year", "volume")
+_TOKEN_DELIMETER = "/"


-def _parse_ext(name: str | Path, metadata: dict) -> str:
-    """Pop the extension from the pathname."""
-    if isinstance(name, str):
-        name = name.strip()
-    path = Path(name)
-    suffix = path.suffix
-    data = path.name.removesuffix(suffix)
-    ext = suffix.lstrip(".")
-    if ext:
-        metadata["ext"] = ext
-    return data
-
-
-def _clean_dividers(data: str) -> str:
-    """Replace non space dividers and clean extra spaces out of string."""
-    data = NON_SPACE_DIVIDER_RE.sub(" ", data)
-    return EXTRA_SPACES_RE.sub(" ", data)
-
-
-def _get_data_list(path: str | Path, metadata: dict) -> list[str]:
-    """Prepare data list from a path or string."""
-    data = _parse_ext(path, metadata)
-    data = _clean_dividers(data)
-    return [data]
-
-
-def _grouping_operators_strip(value: str) -> str:
-    """Strip spaces and parens."""
-    value = value.strip()
-    value = value.strip("()").strip()
-    value = value.strip("-").strip()
-    value = value.strip("'").strip('"').strip()
-    return value
-
-
-def _splicey_dicey(
-    data_list: list[str], index: int, match: Match, match_group: int | str = 0
-) -> str:
-    """Replace a string token from a list with two strings and the value removed.
-
-    And return the value.
-    """
-    value = match.group(match_group)
-    data = data_list.pop(index)
-    data_ends = []
-    if data_before := data[: match.start()].strip():
-        data_ends.append(data_before)
-    if data_after := data[match.end() :].strip():
-        data_ends.append(data_after)
-    data_list[index:index] = data_ends
-    return _grouping_operators_strip(value)
-
-
-def _match_original_format_and_scan_info(
-    match: Match, metadata: dict[str, Any], data_list: list[str], index: int
-) -> None:
-    """Match (ORIGINAL_FORMAT-SCAN_INFO)."""
-    original_format = match.group("original_format")
-    try:
-        scan_info = match.group("scan_info")
-    except IndexError:
-        scan_info = None
-    metadata["original_format"] = _grouping_operators_strip(original_format)
-    match_group = 1
-    if scan_info:
-        metadata["scan_info"] = _grouping_operators_strip(scan_info)
-        match_group = 0
-    _splicey_dicey(data_list, index, match, match_group=match_group)
-
-
-def _parse_original_format_and_scan_info(data_list: list[str], metadata: dict) -> int:
-    """Parse (ORIGINAL_FORMAT-SCAN_INFO)."""
-    index = 0
-    match = None
-    for data in data_list:
-        match = ORIGINAL_FORMAT_SCAN_INFO_RE.search(data)
-        if match:
-            _match_original_format_and_scan_info(match, metadata, data_list, index)
-            break
-        index += 1
-    else:
-        index = 0
-    return index
-
-
-def _pop_value_from_token(
-    data_list: list,
-    metadata: dict,
-    regex: Pattern,
-    key: str,
-    index: int = 0,
-) -> str:
-    """Search token for value, splice and assign to metadata."""
-    data = data_list[index]
-    match = regex.search(data)
-    if match:
-        value = _splicey_dicey(data_list, index, match, key)
-        metadata[key] = value
-    else:
-        value = ""
-    return value
-
-
-def _parse_item(
-    data_list: list[str],
-    metadata: dict,
-    regex: Pattern,
-    key: str,
-    start_index: int = 0,
-    path: str = "",
-) -> int:
-    """Parse a value from the data list into metadata and alter the data list."""
-    path_index = -1
-    index = start_index
-    dl_len = end_index = len(data_list)
-    if index >= end_index:
-        index = 0
-    while index < end_index:
-        value = _pop_value_from_token(data_list, metadata, regex, key, index)
-        if value:
-            if "key" == "issue":
-                path_index = path.find(value)
-            break
-        index += 1
-    if index > dl_len and start_index > 0:
-        index = 0
-        end_index = start_index
-    return path_index
-
-
-def _pop_issue_from_text_fields(
-    data_list: list[str], metadata: dict, index: int
-) -> str:
-    """Search issue from ends of text fields."""
-    if "issue" not in metadata:
-        _pop_value_from_token(data_list, metadata, ISSUE_END_RE, "issue", index=index)
-    if "issue" not in metadata:
-        _pop_value_from_token(data_list, metadata, ISSUE_BEGIN_RE, "issue", index=index)
-    return data_list.pop(index)
-
-
-TITLE_PRECEDING_KEYS = ("issue", "year", "volume")
-
-
-def _is_title_in_position(path, value, metadata):
-    """Does the title come after series and one other token if they exist."""
-    # TODO this could be faster if indexes could be grabbed for these tokens
-    # when they are extracted.
-    title_index = path.find(value)
-
-    # Does a series come first.
-    series = metadata.get("series")
-    if not series:
-        return False
-    series_index = path.find(series)
-    if title_index < series_index:
-        return False
-
-    # If other tokens exist then they much precede the title.
-    title_ok = False
-    other_tokens_exist = False
-    for preceding_key in TITLE_PRECEDING_KEYS:
-        preceding_value = metadata.get(preceding_key)
-        if not preceding_value:
-            continue
-        other_tokens_exist = True
-        preceding_index = path.find(preceding_value)
-        if title_index > preceding_index:
-            title_ok = True
-            break
-    return title_ok or not other_tokens_exist
-
-
-def _assign_remaining_groups(data_list: list[str], metadata: dict, path: str):
-    """Assign series and title."""
-    index = 0
-    for key in _REMAINING_GROUP_KEYS:
-        try:
-            data = data_list[index]
-        except (IndexError, TypeError):
-            break
-        match = REMAINING_GROUP_RE.search(data) if data else None
-        if match:
-            value = _pop_issue_from_text_fields(data_list, metadata, index)
-            if key == "title" and not _is_title_in_position(path, value, metadata):
-                continue
-            value = _grouping_operators_strip(value)
-            if value:
-                metadata[key] = value
-        else:
-            index += 1
-
-
-def _pickup_issue(remainders: list[str], metadata: dict) -> None:
-    """Get issue from remaining tokens or anywhere in a pinch."""
-    if "issue" in metadata:
-        return
-    _parse_item(remainders, metadata, ISSUE_TOKEN_RE, "issue")
-    if "issue" in metadata:
-        return
-    _parse_item(remainders, metadata, ISSUE_ANYWHERE_RE, "issue")
-
-
-def _log_progress(label, metadata, data_list):
-    print(label + ":")
-    pprint(metadata)
-    pprint(data_list)
-
-
-def comicfn2dict(path: str | Path) -> dict[str, Any]:
-    """Parse the filename with a hierarchy of regexes."""
-    metadata = {}
-    data_list = _get_data_list(path, metadata)
-    _log_progress("INITIAL", metadata, data_list)
-
-    # Parse paren tokens
-    _parse_item(data_list, metadata, ISSUE_COUNT_RE, "issue_count")
-    _parse_item(data_list, metadata, YEAR_TOKEN_RE, "year")
-    of_index = _parse_original_format_and_scan_info(data_list, metadata)
-    if "original_format" not in metadata:
-        of_index = _parse_item(
-            data_list, metadata, ORIGINAL_FORMAT_RE, "original_format"
-        )
-    if "scan_info" not in metadata:
-        # Start searching for scan_info after original format.
-        _parse_item(
-            data_list,
-            metadata,
-            SCAN_INFO_RE,
-            "scan_info",
-            start_index=of_index + 1,
-        )
-    _log_progress("AFTER PAREN TOKENS", metadata, data_list)
-
-    # Parse regular tokens
-    _parse_item(data_list, metadata, VOLUME_RE, "volume")
-    _parse_item(data_list, metadata, ISSUE_NUMBER_RE, "issue", path=str(path))
-    _log_progress("AFTER REGULAR TOKENS", metadata, data_list)
-
-    # Pickup year if not gotten.
-    if "year" not in metadata:
-        _parse_item(data_list, metadata, YEAR_BEGIN_RE, "year")
-    if "year" not in metadata:
-        _parse_item(data_list, metadata, YEAR_END_RE, "year")
-    _log_progress("AFTER YEAR PICKUP", metadata, data_list)
-
-    # Pickup issue if it's a standalone token
-    if "issue" not in metadata:
-        _parse_item(data_list, metadata, ISSUE_TOKEN_RE, "issue")
-    _log_progress("AFTER ISSUE PICKUP", metadata, data_list)
-
-    # Series and Title. Also looks for issue.
-    _assign_remaining_groups(data_list, metadata, str(path))
-    _log_progress("AFTER SERIES AND TITLE", metadata, data_list)
-
-    # Final try for issue number.
-    _pickup_issue(data_list, metadata)
-    _log_progress("AFTER ISSUE PICKUP", metadata, data_list)
-
-    # Add Remainders
-    if data_list:
-        metadata["remainders"] = tuple(data_list)
-
-    return metadata
+class ComicFilenameParser:
+    @staticmethod
+    def _clean_dividers(data: str) -> str:
+        """Replace non space dividers and clean extra spaces out of string."""
+        data = NON_SPACE_DIVIDER_RE.sub(" ", data)
+        return EXTRA_SPACES_RE.sub(" ", data).strip()
+
+    def _parse_ext(self):
+        """Pop the extension from the pathname."""
+        path = Path(self._unparsed_path)
+        suffix = path.suffix
+        if not suffix:
+            return
+        self.path_indexes["ext"] = self.path.rfind(suffix)
+        data = path.name.removesuffix(suffix)
+        ext = suffix.lstrip(".")
+        self.metadata["ext"] = ext
+        self._unparsed_path = data
+
+    def _grouping_operators_strip(self, value: str) -> str:
+        """Strip spaces and parens."""
+        value = value.strip()
+        value = value.strip("()").strip()
+        value = value.strip("-").strip()
+        value = value.strip("'").strip('"').strip()
+        return value
+
+    def _parse_item(
+        self,
+        regex: Pattern,
+        require_all: bool = False,
+    ) -> None:
+        """Parse a value from the data list into metadata and alter the data list."""
+        matches = regex.search(self._unparsed_path)
+        if not matches:
+            return
+        matched_metadata = {}
+        matched_path_indexes = {}
+        for key, value in matches.groupdict().items():
+            if not value:
+                if require_all:
+                    return
+                continue
+            matched_path_indexes[key] = self.path.find(value)
+            # TODO idk if strip is necceesary here
+            matched_metadata[key] = self._grouping_operators_strip(value)
+        self.metadata.update(matched_metadata)
+        self.path_indexes.update(matched_path_indexes)
+
+        marked_str = regex.sub(_TOKEN_DELIMETER, self._unparsed_path)
+        parts = []
+        for part in marked_str.split(_TOKEN_DELIMETER):
+            if token := part.strip():
+                parts.append(token)
+        self._unparsed_path = _TOKEN_DELIMETER.join(parts)
+
+    def _is_title_in_position(self, value):
+        """Does the title come after series and one other token if they exist."""
+        title_index = self.path.find(value)
+
+        # Does a series come first.
+        if title_index < self.path_indexes.get("series", -1):
+            return False
+
+        # If other tokens exist then they much precede the title.
+        title_ok = False
+        other_tokens_exist = False
+        for preceding_key in _TITLE_PRECEDING_KEYS:
+            other_tokens_exist = True
+            if title_index > self.path_indexes.get(preceding_key, -1):
+                title_ok = True
+                break
+        return title_ok or not other_tokens_exist
+
+    def _assign_remaining_groups(self):
+        """Assign series and title."""
+        if not self._unparsed_path:
+            return
+
+        # TODO fix REMAINING GROUP_RE to use token delim
+        tokens = self._unparsed_path.split(_TOKEN_DELIMETER)
+
+        # ASSIGN GROUPS
+        remaining_key_index = 0
+        unused_tokens = []
+        while tokens and remaining_key_index < len(_REMAINING_GROUP_KEYS):
+            key = _REMAINING_GROUP_KEYS[remaining_key_index]
+            token = tokens.pop(0)
+            match = REMAINING_GROUP_RE.search(token)
+            if match:
+                value = match.group()
+                if key == "title" and not self._is_title_in_position(value):
+                    unused_tokens.append(token)
+                    continue
+                value = self._grouping_operators_strip(value)
+                self.metadata[key] = value
+                self.path_indexes[key] = self.path.find(value)
+                remaining_key_index += 1
+            else:
+                unused_tokens.append(token)
+
+        self._unparsed_path = " ".join(unused_tokens) if unused_tokens else ""
+
+    def _add_remainders(self):
+        """Add Remainders."""
+        remainders = []
+        for token in self._unparsed_path.split(_TOKEN_DELIMETER):
+            if remainder := token.strip():
+                remainders.append(remainder)
+
+        if remainders:
+            self.metadata["remainders"] = tuple(remainders)
+
+    def _log_progress(self, label):
+        if not self._debug:
+            return
+        print(label + ":")
+        combined = {}
+        for key in self.metadata:
+            combined[key] = (self.metadata.get(key), self.path_indexes.get(key))
+        pprint(combined)
+        print(self._unparsed_path)
+
+    def parse(self) -> dict[str, Any]:
+        """Parse the filename with a hierarchy of regexes."""
+        self._unparsed_path = self._clean_dividers(self._unparsed_path)
+        self._log_progress("INITIAL")
+        self._parse_ext()
+
+        # Parse paren tokens
+        self._parse_item(ISSUE_COUNT_RE)
+        self._parse_item(YEAR_TOKEN_RE)
+        self._parse_item(
+            ORIGINAL_FORMAT_SCAN_INFO_RE,
+            require_all=True,
+        )
+        if "original_format" not in self.metadata:
+            self._parse_item(
+                ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
+            )
+        self._log_progress("AFTER PAREN TOKENS")
+
+        # Parse regular tokens
+        self._parse_item(VOLUME_RE)
+        self._parse_item(ISSUE_NUMBER_RE)
+        self._log_progress("AFTER REGULAR TOKENS")
+
+        # Pickup year if not gotten.
+        if "year" not in self.metadata:
+            self._parse_item(YEAR_BEGIN_RE)
+        if "year" not in self.metadata:
+            self._parse_item(YEAR_END_RE)
+        self._log_progress("AFTER YEAR PICKUP")
+
+        # Pickup issue if it's a standalone token
+        if "issue" not in self.metadata:
+            self._parse_item(ISSUE_END_RE)
+        if "issue" not in self.metadata:
+            self._parse_item(ISSUE_BEGIN_RE)
+        self._log_progress("AFTER ISSUE PICKUP")
+
+        # Series and Title. Also looks for issue.
+        self._assign_remaining_groups()
+        self._log_progress("AFTER SERIES AND TITLE")
+
+        # Final try for issue number.
+        if "issue" not in self.metadata:
+            # TODO is this useful?
+            self._parse_item(ISSUE_ANYWHERE_RE)
+        self._log_progress("AFTER ISSUE PICKUP")
+
+        # Add Remainders
+        self._add_remainders()
+
+        return self.metadata
+
+    def __init__(self, path: str | Path, verbose: int = 0):
+        """Initialize."""
+        self._debug: bool = verbose > 0
+        self.metadata: dict[str, str | tuple[str, ...]] = {}
+        self.path_indexes: dict[str, int] = {}
+        # munge path
+        if isinstance(path, str):
+            path = path.strip()
+        p_path = Path(path)
+        self.path = str(p_path.name).strip()
+        self._unparsed_path = copy(self.path)
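The core of this rewrite shows up in the new _parse_item: instead of popping matched values out of a data_list, every match is overwritten with the _TOKEN_DELIMETER ("/") marker inside a single _unparsed_path string, and the leftover tokens are re-joined on that delimiter. A standalone sketch of that marking step (not part of the commit), using plain re and an illustrative pattern rather than the project's re_compile helpers:

    import re

    TOKEN_DELIMITER = "/"  # the commit names this constant _TOKEN_DELIMETER

    def consume(regex: re.Pattern, unparsed: str) -> str:
        """Blank out whatever the regex matched and re-join the surviving tokens."""
        marked = regex.sub(TOKEN_DELIMITER, unparsed)
        parts = [part.strip() for part in marked.split(TOKEN_DELIMITER) if part.strip()]
        return TOKEN_DELIMITER.join(parts)

    # Consuming the year leaves the rest of the name as delimiter-separated tokens.
    print(consume(re.compile(r"\(2007\)"), "Jeremy John 003 (2007) (digital)"))
    # -> Jeremy John 003/(digital)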
@@ -51,24 +51,27 @@ YEAR_BEGIN_RE = re_compile(r"^" + _YEAR_RE_EXP + r"\b")
 YEAR_END_RE = re_compile(r"\b" + _YEAR_RE_EXP + r"$")
 _OF_PATTERNS = r"|".join(ORIGINAL_FORMAT_PATTERNS)
 _ORIGINAL_FORMAT_RE_EXP = r"(?P<original_format>" + _OF_PATTERNS + r")"
-ORIGINAL_FORMAT_RE = re_compile(_ORIGINAL_FORMAT_RE_EXP, parenthify=True)
 _SCAN_INFO_RE_EXP = r"(?P<scan_info>[^()]+?)"
-SCAN_INFO_RE = re_compile(_SCAN_INFO_RE_EXP, parenthify=True)
 _ORIGINAL_FORMAT_SCAN_INFO_RE_EXP = (
-    _ORIGINAL_FORMAT_RE_EXP + r"(?:-" + _SCAN_INFO_RE_EXP + r")?"
+    _ORIGINAL_FORMAT_RE_EXP + r"\s*[\(:-]" + _SCAN_INFO_RE_EXP  # + r")?"
 )
 ORIGINAL_FORMAT_SCAN_INFO_RE = re_compile(
     _ORIGINAL_FORMAT_SCAN_INFO_RE_EXP, parenthify=True
 )
+ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE = re_compile(
+    r"\(" + _ORIGINAL_FORMAT_RE_EXP + r"\).*\(" + _SCAN_INFO_RE_EXP + r"\)"
+)

 # REGULAR TOKENS
 VOLUME_RE = re_compile(r"((?:v(?:ol(?:ume)?)?\.?)\s*(?P<volume>\d+))")
 _ISSUE_NUMBER_RE_EXP = r"(?P<issue>[\w½]+\.?\d*\w*)"
 ISSUE_NUMBER_RE = re_compile(r"(#" + _ISSUE_NUMBER_RE_EXP + r")")
 _ISSUE_RE_EXP = r"(?P<issue>[\d½]+\.?\d*\w*)"
-ISSUE_TOKEN_RE = re_compile(r"^(" + _ISSUE_RE_EXP + r")$")
-ISSUE_END_RE = re_compile(r"\b(" + _ISSUE_RE_EXP + r")$")
-ISSUE_BEGIN_RE = re_compile(r"^(" + _ISSUE_RE_EXP + r")\b")
+ISSUE_END_RE = re_compile(r"([\/\s]" + _ISSUE_RE_EXP + r"(\/|$))")
+ISSUE_BEGIN_RE = re_compile(r"((^|\/)" + _ISSUE_RE_EXP + r"[\/|\s])")
+# TODO is this used?
 ISSUE_ANYWHERE_RE = re_compile(r"\b(" + _ISSUE_RE_EXP + r")\b")

 # LONG STRINGS
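The new ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE targets names where the format and the scan info sit in separate parenthesized groups. A rough illustration (not part of the commit) with plain re, substituting the single literal "digital" for the ORIGINAL_FORMAT_PATTERNS alternation, which is not shown in this diff:

    import re

    # "digital" stands in for the real _ORIGINAL_FORMAT_RE_EXP alternation.
    separate_re = re.compile(r"\((?P<original_format>digital)\).*\((?P<scan_info>[^()]+?)\)")

    m = separate_re.search("Jeremy John 003 (2007) (digital) (Minutemen-Faessla)")
    if m:
        print(m.group("original_format"), m.group("scan_info"))  # digital Minutemen-Faessla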
@@ -28,22 +28,27 @@ _FILENAME_FORMAT_TAGS: tuple[tuple[str, str | Callable], ...] = (
 _EMPTY_VALUES: tuple[None, str] = (None, "")


-def dict2comicfn(md: Mapping, ext: bool = True) -> str | None:
+def _tokenize_tag(md: Mapping, tag: str, fmt: str | Callable) -> str:
+    val = md.get(tag)
+    if val in _EMPTY_VALUES:
+        return ""
+    final_fmt = fmt(val) if isinstance(fmt, Callable) else fmt
+    token = final_fmt.format(val).strip()
+    return token
+
+
+def serialize(md: Mapping, ext: bool = True) -> str:
     """Get our preferred basename from a metadata dict."""
     if not md:
-        return None
+        return ""
     tokens = []
     for tag, fmt in _FILENAME_FORMAT_TAGS:
-        val = md.get(tag)
-        if val in _EMPTY_VALUES:
-            continue
-        final_fmt = fmt(val) if isinstance(fmt, Callable) else fmt
-        token = final_fmt.format(val).strip()
-        if token:
-            tokens.append(token)
+        if token := _tokenize_tag(md, tag, fmt):
+            tokens.append(token)
     fn = " ".join(tokens)
     if remainders := md.get("remainders"):
         remainder = " ".join(remainders)
+        # TODO oh this is the - delineated remainder :(
         fn += f" - {remainder}"
     if ext:
         fn += "." + md.get("ext", "cbz")
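On the serialization side, the old dict2comicfn body is split into _tokenize_tag plus serialize, and an empty mapping now yields "" rather than None. A small sketch of the new call (not part of the commit; the exact token order comes from _FILENAME_FORMAT_TAGS, which is outside this hunk):

    from comicfn2dict.unparse import serialize

    md = {"series": "Jeremy John", "volume": "1", "issue": "003", "year": "2007", "ext": "cbz"}
    filename = serialize(md)    # tokens joined with spaces, ".cbz" appended because ext=True
    assert serialize({}) == ""  # empty metadata no longer returns None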
@@ -136,8 +136,7 @@ FNS = {
         "year": "2006",
         "ext": "cbz",
         "scan_info": "Minutemen-Faessla",
-        # "original_format": "digital",
-        "remainders": ("(digital",),
+        "original_format": "digital",
     },
     "Jeremy John 003 (2007) (4 covers) (digital) (Minutemen-Faessla).cbz": {
         "series": "Jeremy John",
@@ -243,6 +242,7 @@ FNS = {

 FNS.update(  # Newly fixed.
     {
+        # BIG Change. title after token. more stripping.
         "'Batman - Superman - World's Finest 022 (2024) (Webrip) (The Last Kryptonian-DCP).cbz": {
             "ext": "cbz",
             "issue": "022",
@@ -252,6 +252,7 @@ FNS.update(  # Newly fixed.
             "year": "2024",
         },
         # Issue number starting with a letter requested in https://github.com/comictagger/comictagger/issues/543
+        # word characters now allowed to lead issue numbers only if preceded by a # marker
        "batman #B01 title.cbz": {
             "ext": "cbz",
             "issue": "B01",
@@ -261,32 +262,47 @@ FNS.update(  # Newly fixed.
     }
 )

+WONFIX = {
+    # Leading issue number is usually an alternate sequence number
+    # WONTFIX: Series names may begin with numerals.
+    "52 action comics #2024.cbz": {
+        "ext": "cbz",
+        "issue": "2024",
+        "series": "action comics",
+        "alternate": "52",
+    },
+    # Only the issue number. CT ensures that the series always has a value if possible
+    # I don't think making the series the same as the number is valuable.
+    "#52.cbz": {
+        "ext": "cbz",
+        "issue": "52",
+        "series": "52",
+    },
+}
+
+LATER = {
+    # 4 digit issue number
+    # should this be an issue number if year DONE?.
+    "action comics 1024.cbz": {
+        "ext": "cbz",
+        "issue": "1024",
+        "series": "action comics",
+    },
+}
+
 FNS.update(
     {
-        # Leading issue number is usually an alternate sequence number
-        "52 action comics #2024.cbz": {
-            "ext": "cbz",
-            "issue": "2024",
-            "series": "action comics",
-            "alternate": "52",
-        },  # 4 digit issue number
-        "action comics 1024.cbz": {
-            "ext": "cbz",
-            "issue": "1024",
-            "series": "action comics",
-        },  # Only the issue number. CT ensures that the series always has a value if possible
-        "#52.cbz": {
-            "ext": "cbz",
-            "issue": "52",
-            "series": "52",
-        },  # CT treats double-underscore the same as double-dash
+        # CT treats double-underscore the same as double-dash
+        # BUG: should be title right now.
+        # FEATURE: double dash should be a token delimiter?
         "Monster_Island_v1_#2__repaired__c2c.cbz": {
             "ext": "cbz",
             "issue": "2",
             "series": "Monster Island",
             "volume": "1",
-        },  # I'm not sure there's a right way to parse this. This might also be a madeup filename I don't remember
+            "remainders": ("repaired c2c",),
+        },
+        # I'm not sure there's a right way to parse this. This might also be a madeup filename I don't remember
         "Super Strange Yarns (1957) #92 (1969).cbz": {
             "ext": "cbz",
             "issue": "92",
@@ -5,7 +5,7 @@ from types import MappingProxyType
 import pytest
 from deepdiff.diff import DeepDiff

-from comicfn2dict import comicfn2dict
+from comicfn2dict import ComicFilenameParser
 from tests.comic_filenames import FNS

 ALL_FIELDS = frozenset({"series", "volume", "issue", "issue_count", "year", "ext"})
@@ -16,7 +16,7 @@ FIELD_SCHEMA = MappingProxyType({key: None for key in ALL_FIELDS})
 def test_parse_filename(item):
     """Test filename parsing."""
     fn, defined_fields = item
-    md = comicfn2dict(fn)
+    md = ComicFilenameParser(fn, verbose=1).parse()
     diff = DeepDiff(defined_fields, md, ignore_order=True)
     print(fn)
     pprint(defined_fields)