Merge pull request #4 from ajslater/develop

v0.2.0
This commit is contained in:
AJ Slater 2024-02-28 13:13:33 -08:00 committed by GitHub
commit 624b64d6ca
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
26 changed files with 3566 additions and 5457 deletions

60
.circleci/config.yml Normal file
View File

@ -0,0 +1,60 @@
# CircleCI pipeline: build, lint and test through docker compose, then publish.
jobs:
  # Builds the builder image, runs lint and tests, and produces the dist.
  build:
    machine:
      image: ubuntu-2204:current
    environment:
      DOCKER_CLI_EXPERIMENTAL: enabled
      DOCKER_BUILDKIT: 1
    steps:
      - checkout
      - run:
          command: docker compose build comicfn2dict-builder
          name: Build Builder
      - run:
          command: ./bin/docker-compose-exit.sh comicfn2dict-lint
          name: comicfn2dict Lint
      - run:
          command: ./bin/docker-compose-exit.sh comicfn2dict-test
          name: comicfn2dict Test
      - store_test_results:
          path: test-results/pytest
      - store_artifacts:
          path: test-results/coverage
      - run:
          command: ./bin/docker-compose-exit.sh comicfn2dict-build
          name: Build comicfn2dict Dist
      # Hand the files the deploy job attaches over to the workspace.
      - persist_to_workspace:
          paths:
            - ./README.md
            - ./bin
            - ./dist
            - ./pyproject.toml
          root: .
  # Publishes the persisted dist with the publish script.
  deploy:
    docker:
      - image: cimg/python:3.12.1
    steps:
      - attach_workspace:
          at: .
      - run:
          command: ./bin/publish-pypi.sh
version: 2.1
workflows:
  main:
    jobs:
      - build:
          filters:
            branches:
              only:
                - develop
                - pre-release
                - main
      # deploy runs only on release branches and only after build.
      - deploy:
          filters:
            branches:
              only:
                - pre-release
                - main
          requires:
            - build
  version: 2.1

13
.eslintignore Normal file
View File

@ -0,0 +1,13 @@
!.circleci
**/__pycache__
*test-results*
*~
.git
.mypy_cache
.pytest_cache
.ruff_cache
.venv
dist
node_modules
package-lock.json
typings

View File

@ -1,101 +0,0 @@
module.exports = {
root: true,
env: {
browser: true,
es2022: true,
node: true,
},
extends: [
"eslint:recommended",
// LANGS
"plugin:json/recommended",
"plugin:mdx/recommended",
"plugin:yaml/recommended",
// CODE QUALITY
"plugin:sonarjs/recommended",
"plugin:unicorn/all",
// PRACTICES
"plugin:array-func/recommended",
"plugin:eslint-comments/recommended",
"plugin:no-use-extend-native/recommended",
"plugin:optimize-regex/all",
"plugin:promise/recommended",
"plugin:import/recommended",
"plugin:switch-case/recommended",
// PRETTIER
"plugin:prettier/recommended",
"prettier", // prettier-config
// SECURITY
"plugin:no-unsanitized/DOM",
"plugin:security/recommended-legacy",
],
overrides: [
{
files: ["*.md"],
rules: {
"prettier/prettier": ["warn", { parser: "markdown" }],
},
},
],
parserOptions: {
ecmaVersion: "latest",
ecmaFeatures: {
impliedStrict: true,
},
},
plugins: [
"array-func",
"eslint-comments",
"json",
"import",
"no-constructor-bind",
"no-secrets",
"no-unsanitized",
"no-use-extend-native",
"optimize-regex",
"prettier",
"promise",
"simple-import-sort",
"switch-case",
"security",
"sonarjs",
"unicorn",
"yaml",
],
rules: {
"array-func/prefer-array-from": "off", // for modern browsers the spread operator, as preferred by unicorn, works fine.
"max-params": ["warn", 4],
"no-console": process.env.NODE_ENV === "production" ? "warn" : "off",
"no-debugger": process.env.NODE_ENV === "production" ? "warn" : "off",
"no-constructor-bind/no-constructor-bind": "error",
"no-constructor-bind/no-constructor-state": "error",
"no-secrets/no-secrets": "error",
"eslint-comments/no-unused-disable": 1,
"prettier/prettier": "warn",
"security/detect-object-injection": "off",
"simple-import-sort/exports": "warn",
"simple-import-sort/imports": "warn",
"space-before-function-paren": "off",
"switch-case/newline-between-switch-case": "off", // Malfunctioning
"unicorn/switch-case-braces": ["warn", "avoid"],
"unicorn/prefer-node-protocol": 0,
"unicorn/prevent-abbreviations": "off",
"unicorn/filename-case": [
"error",
{ case: "kebabCase", ignore: [".*.md"] },
],
},
ignorePatterns: [
"*~",
"**/__pycache__",
".git",
"!.circleci",
".mypy_cache",
".pytest_cache",
".venv*",
"dist",
"package-lock.json",
"test-results",
"typings",
],
};

20
Dockerfile Normal file
View File

@ -0,0 +1,20 @@
# Builder image for comicfn2dict on the Python 3.12.1 bookworm base.
FROM python:3.12.1-bookworm
LABEL maintainer="AJ Slater <aj@slater.net>"
# Add the repository's apt source definitions to the image.
COPY debian.sources /etc/apt/sources.list.d/
# Install bash and npm without recommended extras, then drop apt caches and lists.
# hadolint ignore=DL3008
RUN apt-get clean \
&& apt-get update \
&& apt-get install --no-install-recommends -y \
bash \
npm \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Copy only the helper scripts and dependency manifests before installing;
# presumably so the install layer stays cached when only source files change.
COPY bin ./bin
COPY package.json package-lock.json pyproject.toml poetry.lock Makefile ./
RUN make install-all
# Copy the rest of the project into /app.
COPY . .

View File

@ -1,28 +1,28 @@
.PHONY: install-deps
## Update pip and install poetry
## @category Install
install-deps:
pip install --upgrade pip
pip install --upgrade poetry
npm install
# The phony declaration names the rule below, matching every other target here.
.PHONY: install-prod
## Install for production
## @category Install
install-prod: install-deps
	poetry install --no-root --only-root
	npm install
.PHONY: install-dev
## Install dev requirements
## @category Install
install-dev: install-deps
poetry install --no-root --only-root --with dev
npm install
.PHONY: install-all
## Install with all extras
## @category Install
install-all: install-deps
poetry install --no-root --all-extras
npm install
.PHONY: clean
## Clean pycaches

19
NEWS.md
View File

@ -1,5 +1,24 @@
# 📰 comicfn2dict News
## v0.2.0
- Titles are now parsed only if they occur after the series token AND after
either issue, year or volume.
- A more sophisticated date parser.
- Issue numbers that lead with a '#' character may start with alphabetical
characters.
- If volume is parsed, but issue number is not, the issue number is copied from
the volume number.
- ComicFilenameParser and ComicFilenameSerializer classes are available as well
as the old function API.
- New test cases thanks to @lordwelch & @bpepple
- Titles must come after series and one other token, but before format and scan
info.
## v0.1.4
- Require Python 3.10
## v0.1.3
- Fix README

View File

@ -4,16 +4,30 @@ An API and CLI for extracting structured comic metadata from filenames.
## Install
<!-- eslint-skip -->
```sh
pip install comicfn2dict
```
## API
look at `comicfn2dict/comicfn2dict.py`
<!-- eslint-skip -->
```python
from comicfn2dict import comicfn2dict, dict2comicfn
path = "Comic Series #001 Title (2024).cbz"
metadata: dict[str, str| tuple[str,...]] = comicfn2dict(path, verbose=0)
filename: str = dict2comicfn(metadata, ext=True, verbose=0)
```
## CLI
<!-- eslint-skip -->
```sh
comicfn2dict "Series Name #01 - Title (2023).cbz"
{'ext': 'cbz',

6
bin/docker-compose-exit.sh Executable file
View File

@ -0,0 +1,6 @@
#!/bin/bash
# Run a docker compose service and return its exit code.
# Usage: docker-compose-exit.sh SERVICE
set -euo pipefail
# Fail with a usage message instead of a bare "unbound variable" error.
if [ "$#" -lt 1 ] || [ -z "$1" ]; then
    echo "usage: $(basename "$0") SERVICE" >&2
    exit 2
fi
SERVICE=$1
# --exit-code-from makes `up` exit with the named service's exit status.
docker compose up --exit-code-from "$SERVICE" "$SERVICE"

7
bin/publish-pypi.sh Executable file
View File

@ -0,0 +1,7 @@
#!/bin/bash
# Publish the created package with poetry.
# Requires PYPI_USER and PYPI_PASS in the environment.
set -euo pipefail
# Check credentials up front so a missing variable fails before the installs.
: "${PYPI_USER:?PYPI_USER must be set}"
: "${PYPI_PASS:?PYPI_PASS must be set}"
cd "$(dirname "$0")/.."
pip3 install --upgrade pip
pip3 install --upgrade poetry
# Pass credentials through poetry's environment variables rather than -u/-p
# so the password never appears in the process argument list.
export POETRY_HTTP_BASIC_PYPI_USERNAME="$PYPI_USER"
export POETRY_HTTP_BASIC_PYPI_PASSWORD="$PYPI_PASS"
poetry publish

View File

@ -1,3 +1,3 @@
"""Comic Filename to Dict parser and unparser."""
from .parse import comicfn2dict # noqa: F401
from .unparse import dict2comicfn # noqa: F401
from .parse import ComicFilenameParser, comicfn2dict # noqa: F401
from .unparse import ComicFilenameSerializer, dict2comicfn # noqa: F401

View File

@ -4,17 +4,27 @@ from argparse import ArgumentParser
from pathlib import Path
from pprint import pprint
from comicfn2dict.parse import comicfn2dict
from comicfn2dict.parse import ComicFilenameParser
def main():
def main() -> None:
"""Test parser."""
description = "Comic book archive read/write tool."
parser = ArgumentParser(description=description)
parser.add_argument("path", help="Path of comic filename to parse", type=Path)
parser.add_argument(
"-v",
"--verbose",
default=0,
action="count",
help="Display intermediate parsing steps. Good for debugging.",
)
args = parser.parse_args()
name = args.path.name
metadata = comicfn2dict(name)
cfnparser = ComicFilenameParser(name, verbose=args.verbose)
metadata = cfnparser.parse()
if args.verbose:
print("=" * 80) # noqa:T201
pprint(metadata) # noqa:T203

View File

@ -1,3 +0,0 @@
"""API import source."""
from comicfn2dict.parse import comicfn2dict # noqa: F401
from comicfn2dict.unparse import dict2comicfn # noqa: F401

9
comicfn2dict/log.py Normal file
View File

@ -0,0 +1,9 @@
"""Print log header."""
def print_log_header(label: str) -> None:
    """Print ``label`` behind three dashes, dash-padded on the right to 80 columns.

    Headers that are already 80 characters or longer are printed unpadded.
    """
    header = f"---{label}"
    # ljust only pads when the header is shorter than the target width.
    print(header.ljust(80, "-"))  # noqa: T201

View File

@ -1,226 +1,364 @@
"""Parse comic book archive names using the simple 'parse' parser."""
import re
from calendar import month_abbr
from copy import copy
from pathlib import Path
from typing import Union
from pprint import pformat
from re import Match, Pattern
from sys import maxsize
from comicfn2dict.log import print_log_header
from comicfn2dict.regex import (
DASH_SPLIT_RE,
EXTRA_SPACES_RE,
ISSUE_ANYWHERE_RE,
ALPHA_MONTH_RANGE_RE,
BOOK_VOLUME_RE,
ISSUE_BEGIN_RE,
ISSUE_COUNT_RE,
ISSUE_END_RE,
ISSUE_NUMBER_RE,
ISSUE_TOKEN_RE,
NON_SPACE_DIVIDER_RE,
ORIGINAL_FORMAT_RE,
ISSUE_WITH_COUNT_RE,
MONTH_FIRST_DATE_RE,
NON_NUMBER_DOT_RE,
ORIGINAL_FORMAT_SCAN_INFO_RE,
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
PUBLISHER_AMBIGUOUS_RE,
PUBLISHER_AMBIGUOUS_TOKEN_RE,
PUBLISHER_UNAMBIGUOUS_RE,
PUBLISHER_UNAMBIGUOUS_TOKEN_RE,
REGEX_SUBS,
REMAINING_GROUP_RE,
SCAN_INFO_RE,
SCAN_INFO_SECONDARY_RE,
TOKEN_DELIMETER,
VOLUME_RE,
YEAR_BEGIN_RE,
VOLUME_WITH_COUNT_RE,
YEAR_END_RE,
YEAR_FIRST_DATE_RE,
YEAR_TOKEN_RE,
)
_DATE_KEYS = frozenset({"year", "month", "day"})
_REMAINING_GROUP_KEYS = ("series", "title")
# Ordered by commonness.
_TITLE_PRECEDING_KEYS = ("issue", "year", "volume", "month")
def _parse_ext(name, suffix, metadata):
"""Pop the extension from the pathname."""
data = name.removesuffix(suffix)
ext = suffix.lstrip(".")
if ext:
metadata["ext"] = ext
return data
class ComicFilenameParser:
"""Parse a filename metadata into a dict."""
def path_index(self, key: str, default: int = -1) -> int:
    """Return where the parsed value for ``key`` sits in the original path.

    ``default`` is returned for the "remainders" key and for keys whose
    metadata value is missing or empty. Lookups are memoized per value.
    The extension is searched from the right, everything else from the left.
    """
    if key == "remainders":
        return default
    needle: str = self.metadata.get(key, "")  # type: ignore
    if not needle:
        return default
    cache = self._path_indexes
    try:
        return cache[needle]
    except KeyError:
        pass
    # XXX This is fragile, but it's difficult to calculate the original
    # position at match time from the ever changing _unparsed_path.
    locate = self.path.rfind if key == "ext" else self.path.find
    position = locate(needle)
    cache[needle] = position
    return position
def _clean_dividers(data):
"""Replace non space dividers and clean extra spaces out of string."""
data = NON_SPACE_DIVIDER_RE.sub(" ", data)
return EXTRA_SPACES_RE.sub(" ", data)
def _log(self, label: str) -> None:
if not self._debug:
return
print_log_header(label)
combined = {}
for key in self.metadata:
combined[key] = (self.metadata.get(key), self.path_index(key))
print(" " + self._unparsed_path) # noqa: T201
print(" " + pformat(combined)) # noqa: T201
def _parse_ext(self) -> None:
"""Pop the extension from the pathname."""
path = Path(self._unparsed_path)
suffix = path.suffix
if not suffix:
return
def _get_data_list(path, metadata):
"""Prepare data list from a path or string."""
if isinstance(path, str):
path = path.strip()
path = Path(path)
data = _parse_ext(path.name, path.suffix, metadata)
data = _clean_dividers(data)
return DASH_SPLIT_RE.split(data)
data = path.name.removesuffix(suffix)
ext = suffix.lstrip(".")
self.metadata["ext"] = ext
self._unparsed_path = data
def _clean_dividers(self) -> None:
"""Replace non space dividers and clean extra spaces out of string."""
data = self._unparsed_path
def _paren_strip(value: str):
"""Strip spaces and parens."""
return value.strip().strip("()").strip()
# Simple substitutions
for regex, pair in REGEX_SUBS.items():
replacement, count = pair
data = regex.sub(replacement, data, count=count).strip()
self._unparsed_path = data.strip()
self._log("After Clean Path")
def _parse_items_update_metadata(
self, matches: Match, exclude: str, require_all: bool, first_only: bool
) -> bool:
"""Update Metadata."""
matched_metadata = {}
for key, value in matches.groupdict().items():
if value == exclude:
continue
if not value:
if require_all:
return False
continue
matched_metadata[key] = value
if first_only:
break
if not matched_metadata:
return False
self.metadata.update(matched_metadata)
return True
def _splicey_dicey(data_list, index, match, match_group: Union[int, str] = 0):
"""Replace a string token from a list with two strings and the value removed.
def _parse_items_pop_tokens(self, regex: Pattern, first_only: bool) -> None:
    """Cut the text matched by ``regex`` out of the unparsed path.

    Each match (or only the first when ``first_only`` is set) is replaced by
    the token delimiter. The path is then rebuilt from the non-blank,
    whitespace-stripped pieces between delimiters.
    """
    limit = 1 if first_only else 0
    marked = regex.sub(TOKEN_DELIMETER, self._unparsed_path, count=limit)
    pieces = (piece.strip() for piece in marked.split(TOKEN_DELIMETER))
    self._unparsed_path = TOKEN_DELIMETER.join(filter(None, pieces))
And return the value.
"""
value = match.group(match_group)
data = data_list.pop(index)
data_ends = []
if data_before := data[: match.start()].strip():
data_ends.append(data_before)
if data_after := data[match.end() :].strip():
data_ends.append(data_after)
data_list[index:index] = data_ends
return _paren_strip(value)
def _parse_items( # noqa: PLR0913
self,
regex: Pattern,
require_all: bool = False,
exclude: str = "",
first_only: bool = False,
pop: bool = True,
) -> None:
"""Parse a value from the data list into metadata and alter the data list."""
# Match
matches = regex.search(self._unparsed_path)
if not matches:
return
if not self._parse_items_update_metadata(
matches, exclude, require_all, first_only
):
return
def _parse_original_format_and_scan_info(data_list, metadata):
"""Parse (ORIGINAL_FORMAT-SCAN_INFO)."""
original_format = None
scan_info = None
index = 0
match = None
for data in data_list:
match = ORIGINAL_FORMAT_SCAN_INFO_RE.search(data)
if match:
original_format = match.group("original_format")
try:
scan_info = match.group("scan_info")
except IndexError:
scan_info = None
break
index += 1
if original_format:
metadata["original_format"] = _paren_strip(original_format)
match_group = 1
if scan_info:
metadata["scan_info"] = _paren_strip(scan_info)
match_group = 0
_splicey_dicey(data_list, index, match, match_group=match_group)
else:
index = 0
return index
if pop:
self._parse_items_pop_tokens(regex, first_only)
def _parse_issue(self) -> None:
"""Parse Issue."""
self._parse_items(ISSUE_NUMBER_RE)
if "issue" not in self.metadata:
self._parse_items(ISSUE_WITH_COUNT_RE)
self._log("After Issue")
def _pop_value_from_token(
data_list: list,
metadata: dict,
regex: re.Pattern,
key: str,
index: int = 0,
):
"""Search token for value, splice and assign to metadata."""
data = data_list[index]
match = regex.search(data)
if match:
value = _splicey_dicey(data_list, index, match, key)
metadata[key] = value
return match
def _parse_volume(self) -> None:
"""Parse Volume."""
self._parse_items(VOLUME_RE)
if "volume" not in self.metadata:
self._parse_items(VOLUME_WITH_COUNT_RE)
self._log("After Volume")
def _alpha_month_to_numeric(self) -> None:
"""Translate alpha_month to numeric month."""
if alpha_month := self.metadata.pop("alpha_month", ""):
alpha_month = alpha_month.capitalize() # type: ignore
for index, abbr in enumerate(month_abbr):
if abbr and alpha_month.startswith(abbr):
month = f"{index:02d}"
self.metadata["month"] = month
break
def _parse_item(
data_list,
metadata,
regex,
key,
start_index: int = 0,
):
"""Parse a value from the data list into metadata and alter the data list."""
index = start_index
dl_len = end_index = len(data_list)
if index >= end_index:
index = 0
while index < end_index:
match = _pop_value_from_token(data_list, metadata, regex, key, index)
if match:
break
index += 1
if index > dl_len and start_index > 0:
index = 0
end_index = start_index
return index
def _parse_dates(self) -> None:
"""Parse date schemes."""
# Discard second month of alpha month ranges.
self._unparsed_path = ALPHA_MONTH_RANGE_RE.sub(r"\1", self._unparsed_path)
# Month first date
self._parse_items(MONTH_FIRST_DATE_RE)
self._alpha_month_to_numeric()
def _pop_issue_from_text_fields(data_list, metadata, index):
"""Search issue from ends of text fields."""
if "issue" not in metadata:
_pop_value_from_token(data_list, metadata, ISSUE_END_RE, "issue", index=index)
if "issue" not in metadata:
_pop_value_from_token(data_list, metadata, ISSUE_BEGIN_RE, "issue", index=index)
return data_list.pop(index)
# Year first date
if _DATE_KEYS - self.metadata.keys():
self._parse_items(YEAR_FIRST_DATE_RE)
self._alpha_month_to_numeric()
if "year" not in self.metadata:
self._parse_items(YEAR_TOKEN_RE, first_only=True)
if "volume" in self.metadata:
return
# A second year will be the real year.
# Move the first year to volume
if volume := self.metadata.get("year", ""):
self._parse_items(YEAR_TOKEN_RE)
if self.metadata.get("year", "") != volume:
self.metadata["volume"] = volume
self._log("After Date")
def _assign_remaining_groups(data_list, metadata):
"""Assign series and title."""
index = 0
for key in _REMAINING_GROUP_KEYS:
try:
data = data_list[index]
except (IndexError, TypeError):
break
match = REMAINING_GROUP_RE.search(data) if data else None
if match:
value = _pop_issue_from_text_fields(data_list, metadata, index)
value = _paren_strip(value)
if value:
metadata[key] = value
else:
index += 1
def _pickup_issue(remainders, metadata):
"""Get issue from remaining tokens or anywhere in a pinch."""
if "issue" in metadata:
return
_parse_item(remainders, metadata, ISSUE_TOKEN_RE, "issue")
if "issue" in metadata:
return
_parse_item(remainders, metadata, ISSUE_ANYWHERE_RE, "issue")
def comicfn2dict(path):
"""Parse the filename with a hierarchy of regexes."""
metadata = {}
data_list = _get_data_list(path, metadata)
# Parse paren tokens
_parse_item(data_list, metadata, ISSUE_COUNT_RE, "issue_count")
_parse_item(data_list, metadata, YEAR_TOKEN_RE, "year")
of_index = _parse_original_format_and_scan_info(data_list, metadata)
if "original_format" not in metadata:
of_index = _parse_item(
data_list, metadata, ORIGINAL_FORMAT_RE, "original_format"
)
if "scan_info" not in metadata:
# Start searching for scan_info after original format.
_parse_item(
data_list,
metadata,
SCAN_INFO_RE,
"scan_info",
start_index=of_index + 1,
def _parse_format_and_scan_info(self) -> None:
"""Format & Scan Info."""
self._parse_items(
ORIGINAL_FORMAT_SCAN_INFO_RE,
require_all=True,
)
if "original_format" not in self.metadata:
self._parse_items(
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
)
self._parse_items(SCAN_INFO_SECONDARY_RE)
if (
scan_info_secondary := self.metadata.pop("secondary_scan_info", "")
) and "scan_info" not in self.metadata:
self.metadata["scan_info"] = scan_info_secondary # type: ignore
self._log("After original_format & scan_info")
# Parse regular tokens
_parse_item(data_list, metadata, VOLUME_RE, "volume")
_parse_item(data_list, metadata, ISSUE_NUMBER_RE, "issue")
def _parse_ends_of_remaining_tokens(self):
# Volume left on the end of string tokens
if "volume" not in self.metadata:
self._parse_items(BOOK_VOLUME_RE)
self._log("After original_format & scan_info")
# Pickup year if not gotten.
if "year" not in metadata:
_parse_item(data_list, metadata, YEAR_BEGIN_RE, "year")
if "year" not in metadata:
_parse_item(data_list, metadata, YEAR_END_RE, "year")
# Years left on the end of string tokens
year_end_matched = False
if "year" not in self.metadata:
self._parse_items(YEAR_END_RE, pop=False)
year_end_matched = "year" in self.metadata
self._log("After Year on end of token")
# Pickup issue if it's a standalone token
if "issue" not in metadata:
_parse_item(data_list, metadata, ISSUE_TOKEN_RE, "issue")
# Issue left on the end of string tokens
if "issue" not in self.metadata and not year_end_matched:
exclude: str = self.metadata.get("year", "") # type: ignore
self._parse_items(ISSUE_END_RE, exclude=exclude)
if "issue" not in self.metadata:
self._parse_items(ISSUE_BEGIN_RE)
self._log("After Issue on ends of tokens")
# Series and Title. Also looks for issue.
_assign_remaining_groups(data_list, metadata)
def _parse_publisher(self) -> None:
"""Parse Publisher."""
# Pop single tokens so they don't end up titles.
self._parse_items(PUBLISHER_UNAMBIGUOUS_TOKEN_RE, first_only=True)
if "publisher" not in self.metadata:
self._parse_items(PUBLISHER_AMBIGUOUS_TOKEN_RE, first_only=True)
if "publisher" not in self.metadata:
self._parse_items(PUBLISHER_UNAMBIGUOUS_RE, pop=False, first_only=True)
if "publisher" not in self.metadata:
self._parse_items(PUBLISHER_AMBIGUOUS_RE, pop=False, first_only=True)
self._log("After publisher")
# Final try for issue number.
_pickup_issue(data_list, metadata)
def _is_at_title_position(self, value: str) -> bool:
"""Title is in correct position."""
title_index = self.path.find(value)
# Add Remainders
if data_list:
metadata["remainders"] = tuple(data_list)
# Titles must come after series but before format and scan_info
if (
title_index < self.path_index("series")
or title_index > self.path_index("original_format", maxsize)
or title_index > self.path_index("scan_info", maxsize)
):
return False
return metadata
# Titles must be after the series and one other token.
title_ok = False
other_tokens_exist = False
for preceding_key in _TITLE_PRECEDING_KEYS:
other_tokens_exist = True
if title_index > self.path_index(preceding_key):
title_ok = True
break
return title_ok or not other_tokens_exist
def _grouping_operators_strip(self, value: str) -> str:
"""Strip spaces and parens."""
value = value.strip()
value = value.strip("()").strip()
value = value.strip("-").strip()
value = value.strip(",").strip()
value = value.strip("'").strip()
return value.strip('"').strip()
def _parse_series_and_title_token(
    self, remaining_key_index: int, tokens: list[str]
) -> str:
    """Try to assign the next token to the series or title key.

    Returns the token when it could not be used for the key, otherwise an
    empty string. When the key is already set, no token is consumed.
    """
    field = _REMAINING_GROUP_KEYS[remaining_key_index]
    if field in self.metadata:
        return ""
    candidate = tokens.pop(0)
    found = REMAINING_GROUP_RE.search(candidate)
    if found is None:
        return candidate
    text = found.group()
    misplaced_title = field == "title" and not self._is_at_title_position(text)
    if misplaced_title:
        return candidate
    # Dots between two non-digits act as word separators.
    text = NON_NUMBER_DOT_RE.sub(r"\1 \2", text)
    text = self._grouping_operators_strip(text)
    if text:
        self.metadata[field] = text
    return ""
def _parse_series_and_title(self) -> None:
    """Assign series and title from the leading tokens of the unparsed path.

    One token is offered per key in ``_REMAINING_GROUP_KEYS``. Tokens a key
    rejects are kept, space-joined, as the new unparsed path.
    """
    if not self._unparsed_path:
        return
    pending = self._unparsed_path.split(TOKEN_DELIMETER)
    leftovers = []
    for key_index in range(len(_REMAINING_GROUP_KEYS)):
        if not pending:
            break
        rejected = self._parse_series_and_title_token(key_index, pending)
        if rejected:
            leftovers.append(rejected)
    # NOTE(review): tokens still pending once every key has been tried are
    # not carried into the unparsed path - confirm this is intended.
    self._unparsed_path = " ".join(leftovers)
    self._log("After Series & Title")
def _add_remainders(self) -> None:
    """Store the non-blank leftover tokens as the ``remainders`` tuple.

    Nothing is written when no token survives stripping.
    """
    leftover = tuple(
        stripped
        for chunk in self._unparsed_path.split(TOKEN_DELIMETER)
        if (stripped := chunk.strip())
    )
    if leftover:
        self.metadata["remainders"] = leftover
def parse(self) -> dict[str, str | tuple[str, ...]]:
"""Parse the filename with a hierarchy of regexes."""
self._log("Init")
self._parse_ext()
self._clean_dividers()
self._parse_issue()
self._parse_volume()
self._parse_dates()
self._parse_format_and_scan_info()
self._parse_ends_of_remaining_tokens()
self._parse_publisher()
self._parse_series_and_title()
# Copy volume into issue if it's all we have.
if "issue" not in self.metadata and "volume" in self.metadata:
self.metadata["issue"] = self.metadata["volume"]
self._log("After issue can be volume")
self._add_remainders()
return self.metadata
def __init__(self, path: str | Path, verbose: int = 0):
"""Initialize."""
self._debug: bool = verbose > 0
# munge path
if isinstance(path, str):
path = path.strip()
p_path = Path(path)
self.path = str(p_path.name).strip()
self.metadata: dict[str, str | tuple[str, ...]] = {}
self._unparsed_path = copy(self.path)
self._path_indexes: dict[str, int] = {}
def comicfn2dict(
    path: str | Path, verbose: int = 0
) -> dict[str, str | tuple[str, ...]]:
    """Simplify the API: parse ``path`` with a ComicFilenameParser and return its metadata."""
    parser = ComicFilenameParser(path, verbose=verbose)
    return parser.parse()

View File

@ -1,15 +1,32 @@
"""Parsing regexes."""
import re
from re import IGNORECASE, Pattern, compile
from types import MappingProxyType
PUBLISHERS_UNAMBIGUOUS: tuple[str, ...] = (
r"Abrams ComicArts",
r"BOOM! Studios",
r"DC(\sComics)?",
r"Dark Horse Comics",
r"Drawn & Quarterly",
r"Dynamite Entertainment",
r"IDW Publishing",
r"Icon Comics",
r"Kodansha",
r"Oni Press",
r"Pantheon Books",
r"SLG Publishing",
r"SelfMadeHero",
r"Titan Comics",
)
PUBLISHERS_AMBIGUOUS: tuple[str, ...] = (
r"(?<!Capt\.\s)(?<!Capt\s)(?<!Captain\s)Marvel",
r"Heavy Metal",
r"Epic",
r"Image",
r"Mirage",
)
def re_compile(exp, parenthify=False):
"""Compile regex with options."""
if parenthify:
exp = r"\(" + exp + r"\)"
return re.compile(exp, flags=re.IGNORECASE)
ORIGINAL_FORMAT_PATTERNS = (
ORIGINAL_FORMAT_PATTERNS: tuple[str, ...] = (
r"Anthology",
r"(One|1)[-\s]Shot",
r"Annual",
@ -35,41 +52,160 @@ ORIGINAL_FORMAT_PATTERNS = (
r"Sketch",
r"TPB",
r"Trade[-\s]Paper[-\s]?Back",
r"Web([-\s]?Comic)?",
r"Web([-\s]?(Comic|Rip))?",
)
MONTHS: tuple[str, ...] = (
r"Jan(uary)?",
r"Feb(ruary)?",
r"Mar(ch)?",
r"Apr(il)?",
r"May",
r"Jun(e)?",
r"Jul(y)?",
r"Aug(ust)?",
r"Sep(tember)?",
r"Oct(ober)?",
r"Nov(ember)?",
r"Dec(ember)?",
)
TOKEN_DELIMETER: str = r"/"
def re_compile(exp: str, parenthify: bool = False) -> Pattern:
    """Compile ``exp`` case-insensitively.

    With ``parenthify`` the expression is wrapped in escaped literal
    parentheses first.
    """
    pattern = rf"\({exp}\)" if parenthify else exp
    return compile(pattern, flags=IGNORECASE)
# CLEAN
NON_SPACE_DIVIDER_RE = re_compile(r"[_\+]")
DASH_SPLIT_RE = re_compile(r"\s-\s")
EXTRA_SPACES_RE = re_compile(r"\s\s+")
_TOKEN_DIVIDERS_RE = re_compile(r":")
_SPACE_EQUIVALENT_RE = re_compile(r"_")
_EXTRA_SPACES_RE = re_compile(r"\s\s+")
_LEFT_PAREN_EQUIVALENT_RE = re_compile(r"\[")
_RIGHT_PAREN_EQUIVALENT_RE = re_compile(r"\]")
_DOUBLE_UNDERSCORE_RE = re_compile(r"__(.*)__")
REGEX_SUBS: MappingProxyType[Pattern, tuple[str, int]] = MappingProxyType(
{
_DOUBLE_UNDERSCORE_RE: (r"(\1)", 0),
_TOKEN_DIVIDERS_RE: (TOKEN_DELIMETER, 1),
_SPACE_EQUIVALENT_RE: (r" ", 0),
_EXTRA_SPACES_RE: (r" ", 0),
_LEFT_PAREN_EQUIVALENT_RE: (r"(", 0),
_RIGHT_PAREN_EQUIVALENT_RE: (r")", 0),
}
)
### DATES
_YEAR_RE_EXP = r"(?P<year>[12]\d{3})"
_MONTH_ALPHA_RE_EXP = r"(" + "(?P<alpha_month>" + r"|".join(MONTHS) + r")\.?" r")"
_MONTH_NUMERIC_RE_EXP = r"(?P<month>0?\d|1[0-2]?)"
_MONTH_RE_EXP = r"(" + _MONTH_ALPHA_RE_EXP + r"|" + _MONTH_NUMERIC_RE_EXP + r")"
_ALPHA_MONTH_RANGE = (
r"\b" # noqa: ISC003
+ r"("
+ r"|".join(MONTHS)
+ r")"
+ r"("
+ r"\.?-"
+ r"("
+ r"|".join(MONTHS)
+ r")"
+ r")\b"
)
ALPHA_MONTH_RANGE_RE: Pattern = re_compile(_ALPHA_MONTH_RANGE)
_DAY_RE_EXP = r"(?P<day>([0-2]?\d|(3)[0-1]))"
_DATE_DELIM = r"[-\s]+"
_MONTH_FIRST_DATE_RE_EXP = (
r"((\b|\(?)"
# Month
+ _MONTH_RE_EXP
# Day
+ r"("
+ _DATE_DELIM
+ _DAY_RE_EXP
+ r")?"
# Year
+ r"[,]?"
+ _DATE_DELIM
+ _YEAR_RE_EXP
+ r"(\)?|\b))"
)
_YEAR_FIRST_DATE_RE_EXP = (
r"(\b\(?"
+ _YEAR_RE_EXP
+ _DATE_DELIM
+ _MONTH_RE_EXP
+ _DATE_DELIM
+ _DAY_RE_EXP
+ r"\b\)?)"
)
MONTH_FIRST_DATE_RE: Pattern = re_compile(_MONTH_FIRST_DATE_RE_EXP)
YEAR_FIRST_DATE_RE: Pattern = re_compile(_YEAR_FIRST_DATE_RE_EXP)
YEAR_TOKEN_RE: Pattern = re_compile(_YEAR_RE_EXP, parenthify=True)
# NOTE(review): `|` has the lowest precedence, so this reads as "<year> followed
# by /" OR a bare `$`. The `$` branch matches with the year group unset. If
# "year followed by / or end of string" was meant, the alternation needs
# grouping, e.g. `(\/|$)` - confirm against the test fixtures before changing.
YEAR_END_RE: Pattern = re_compile(_YEAR_RE_EXP + r"\/|$")
# PAREN GROUPS
ISSUE_COUNT_RE = re_compile(r"of\s*(?P<issue_count>\d+)", parenthify=True)
_YEAR_RE_EXP = r"(?P<year>[12]\d{3})"
YEAR_TOKEN_RE = re_compile(_YEAR_RE_EXP, parenthify=True)
YEAR_BEGIN_RE = re_compile(r"^" + _YEAR_RE_EXP + r"\b")
YEAR_END_RE = re_compile(r"\b" + _YEAR_RE_EXP + r"$")
_OF_PATTERNS = r"|".join(ORIGINAL_FORMAT_PATTERNS)
_ORIGINAL_FORMAT_RE_EXP = r"(?P<original_format>" + _OF_PATTERNS + r")"
ORIGINAL_FORMAT_RE = re_compile(_ORIGINAL_FORMAT_RE_EXP, parenthify=True)
_SCAN_INFO_RE_EXP = r"(?P<scan_info>[^()]+?)"
SCAN_INFO_RE = re_compile(_SCAN_INFO_RE_EXP, parenthify=True)
_SCAN_INFO_RE_EXP = r"(?P<scan_info>[^()]*)"
_ORIGINAL_FORMAT_SCAN_INFO_RE_EXP = (
_ORIGINAL_FORMAT_RE_EXP + r"(?:-" + _SCAN_INFO_RE_EXP + r")?"
_ORIGINAL_FORMAT_RE_EXP + r"\s*[\(:-]" + _SCAN_INFO_RE_EXP # + r")?"
)
ORIGINAL_FORMAT_SCAN_INFO_RE = re_compile(
# Keep this even though comicfn2dict doesn't use it directly
ORIGINAL_FORMAT_RE: Pattern = re_compile(_ORIGINAL_FORMAT_RE_EXP, parenthify=True)
ORIGINAL_FORMAT_SCAN_INFO_RE: Pattern = re_compile(
_ORIGINAL_FORMAT_SCAN_INFO_RE_EXP, parenthify=True
)
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE: Pattern = re_compile(
r"\(" + _ORIGINAL_FORMAT_RE_EXP + r"\).*\(" + _SCAN_INFO_RE_EXP + r"\)"
)
# REGULAR TOKENS
VOLUME_RE = re_compile(r"((?:v(?:ol(?:ume)?)?\.?)\s*(?P<volume>\d+))")
_ISSUE_RE_EXP = r"(?P<issue>[\d½]+\.?\d*\w*)"
ISSUE_NUMBER_RE = re_compile(r"(#" + _ISSUE_RE_EXP + r")")
ISSUE_TOKEN_RE = re_compile(r"^(" + _ISSUE_RE_EXP + r")$")
ISSUE_END_RE = re_compile(r"\b(" + _ISSUE_RE_EXP + r")$")
ISSUE_BEGIN_RE = re_compile(r"^(" + _ISSUE_RE_EXP + r")\b")
ISSUE_ANYWHERE_RE = re_compile(r"\b(" + _ISSUE_RE_EXP + r")\b")
SCAN_INFO_SECONDARY_RE: Pattern = re_compile(r"\b(?P<secondary_scan_info>c2c)\b")
# ISSUE
_ISSUE_RE_EXP = r"(?P<issue>\w*(½|\d+)[\.\d+]*\w*)"
_ISSUE_COUNT_RE_EXP = r"\(of\s*(?P<issue_count>\d+)\)"
ISSUE_NUMBER_RE: Pattern = re_compile(
r"(\(?#" + _ISSUE_RE_EXP + r"\)?)" + r"(\W*" + _ISSUE_COUNT_RE_EXP + r")?"
)
ISSUE_WITH_COUNT_RE: Pattern = re_compile(
r"(\(?" + _ISSUE_RE_EXP + r"\)?" + r"\W*" + _ISSUE_COUNT_RE_EXP + r")"
)
ISSUE_END_RE: Pattern = re_compile(r"([\/\s]\(?" + _ISSUE_RE_EXP + r"\)?(\/|$))")
ISSUE_BEGIN_RE: Pattern = re_compile(r"((^|\/)\(?" + _ISSUE_RE_EXP + r"\)?[\/|\s])")
# Volume
_VOLUME_COUNT_RE_EXP = r"\(of\s*(?P<volume_count>\d+)\)"
VOLUME_RE: Pattern = re_compile(
r"(" + r"(?:v(?:ol(?:ume)?)?\.?)\s*(?P<volume>\d+)" # noqa: ISC003
r"(\W*" + _VOLUME_COUNT_RE_EXP + r")?" + r")"
)
VOLUME_WITH_COUNT_RE: Pattern = re_compile(
r"(\(?" + r"(?P<volume>\d+)" + r"\)?" + r"\W*" + _VOLUME_COUNT_RE_EXP + r")"
)
BOOK_VOLUME_RE: Pattern = re_compile(r"(?P<title>" + r"book\s*(?P<volume>\d+)" + r")")
# Publisher
_PUBLISHER_UNAMBIGUOUS_RE_EXP = (
r"(\b(?P<publisher>" + r"|".join(PUBLISHERS_UNAMBIGUOUS) + r")\b)"
)
_PUBLISHER_AMBIGUOUS_RE_EXP = (
r"(\b(?P<publisher>" + r"|".join(PUBLISHERS_AMBIGUOUS) + r")\b)"
)
PUBLISHER_UNAMBIGUOUS_TOKEN_RE: Pattern = re_compile(
r"(^|\/)" + _PUBLISHER_UNAMBIGUOUS_RE_EXP + r"($|\/)"
)
PUBLISHER_AMBIGUOUS_TOKEN_RE: Pattern = re_compile(
r"(^|\/)" + _PUBLISHER_AMBIGUOUS_RE_EXP + r"($|\/)"
)
PUBLISHER_UNAMBIGUOUS_RE: Pattern = re_compile(_PUBLISHER_UNAMBIGUOUS_RE_EXP)
PUBLISHER_AMBIGUOUS_RE = re_compile(_PUBLISHER_AMBIGUOUS_RE_EXP)
# LONG STRINGS
REMAINING_GROUP_RE = re_compile(r"^[\w].*[^\)]")
REMAINING_GROUP_RE: Pattern = re_compile(r"^[^\(].*[^\)]")
NON_NUMBER_DOT_RE: Pattern = re_compile(r"(\D)\.(\D)")

View File

@ -1,8 +1,13 @@
"""Unparse comic filenames."""
from typing import Callable
from calendar import month_abbr
from collections.abc import Callable, Mapping, Sequence
from contextlib import suppress
from types import MappingProxyType
from comicfn2dict.log import print_log_header
def issue_formatter(issue):
def issue_formatter(issue: str) -> str:
"""Formatter to zero pad issues."""
i = 0
issue = issue.lstrip("0")
@ -14,37 +19,103 @@ def issue_formatter(issue):
return "#{:0>" + str(pad) + "}"
_PAREN_FMT = "({})"
_FILENAME_FORMAT_TAGS = (
_PAREN_FMT: str = "({})"
_FILENAME_FORMAT_TAGS: tuple[tuple[str, str | Callable], ...] = (
("series", "{}"),
("volume", "v{}"),
("volume_count", "(of {:03})"),
("issue", issue_formatter),
("issue_count", "(of {:03})"),
("year", _PAREN_FMT),
("date", _PAREN_FMT),
("title", "{}"),
("publisher", _PAREN_FMT),
("original_format", _PAREN_FMT),
("scan_info", _PAREN_FMT),
)
_EMPTY_VALUES = (None, "")
_EMPTY_VALUES: tuple[None, str] = (None, "")
_DEFAULT_EXT = "cbz"
_DATE_KEYS = ("year", "month", "day")
def dict2comicfn(md, ext=True):
"""Get our preferred basename from a metadata dict."""
if not md:
return None
tokens = []
for tag, fmt in _FILENAME_FORMAT_TAGS:
val = md.get(tag)
class ComicFilenameSerializer:
"""Serialize Comic Filenames from dict."""
def _log(self, label: str, fn: str) -> None:
"""Log progress."""
if not self._debug:
return
print_log_header(label)
print(fn) # noqa: T201
def _add_date(self) -> None:
    """Build a combined ``date`` entry from the year, month and day fields.

    Does nothing when the metadata already has a ``date`` key, or when none
    of the ``_DATE_KEYS`` fields yields a usable part. Otherwise the parts
    found are joined with ``-`` and ``self.metadata`` is replaced by a new
    read-only mapping that also carries the result under ``date``.
    """
    if "date" in self.metadata:
        return

    parts = []
    for key in _DATE_KEYS:
        if part := self.metadata.get(key):
            if key == "month" and not parts:
                # No year precedes the month, so prefer its abbreviated name.
                # A month that is not an integer in 1..12 (already a name,
                # out of range, wrong type) is kept as is instead of raising
                # or silently mapping to an empty/wrapped-around name.
                with suppress(TypeError, ValueError):
                    month_num = int(part)
                    if 1 <= month_num <= 12:
                        part = month_abbr[month_num]
            parts.append(part)
        if key == "month" and not parts:
            # Neither year nor month was found: a lone day is not emitted.
            break

    if parts:
        date = "-".join(str(part) for part in parts)
        self._log("After date", date)
        self.metadata = MappingProxyType({**self.metadata, "date": date})
def _tokenize_tag(self, tag: str, fmt: str | Callable) -> str:
"""Add tags to the string."""
val = self.metadata.get(tag)
if val in _EMPTY_VALUES:
continue
return ""
final_fmt = fmt(val) if isinstance(fmt, Callable) else fmt
token = final_fmt.format(val).strip()
if token:
tokens.append(token)
fn = " ".join(tokens)
if remainders := md.get("remainders"):
remainder = " ".join(remainders)
fn += f" - {remainder}"
if ext:
fn += "." + md.get("ext", "cbz")
return fn
return final_fmt.format(val).strip()
def _add_remainder(self) -> str:
"""Add the remainders specially."""
if remainders := self.metadata.get("remainders"):
if isinstance(remainders, Sequence):
remainders = (str(remainder) for remainder in remainders)
remainder = " ".join(remainders)
else:
remainder = str(remainders)
return f"[{remainder}]"
return ""
def serialize(self) -> str:
"""Get our preferred basename from a metadata dict.

Tokens are produced for the entries of _FILENAME_FORMAT_TAGS, in that
order, and joined with single spaces. The remainder suffix and, when
self._ext is set, the extension are then appended.
"""
# Fold year/month/day into a "date" entry before tokenizing.
self._add_date()
tokens = []
for tag, fmt in _FILENAME_FORMAT_TAGS:
# Tags that produce an empty token are left out.
if token := self._tokenize_tag(tag, fmt):
tokens.append(token)
self._log(f"After {tag}", str(tokens))
fn = " ".join(tokens)
# The remainder suffix is appended with no separating space.
fn += self._add_remainder()
self._log("After remainder", fn)
if self._ext:
# Use the metadata's "ext" value; _DEFAULT_EXT applies only when the key is absent.
ext = self.metadata.get("ext", _DEFAULT_EXT)
fn += f".{ext}"
self._log("After ext", fn)
return fn
def __init__(self, metadata: Mapping, ext: bool = True, verbose: int = 0):
"""Initialize."""
self.metadata: Mapping = metadata
self._ext: bool = ext
self._debug: bool = bool(verbose)
def dict2comicfn(md: Mapping, ext: bool = True, verbose: int = 0) -> str:
    """Serialize a metadata mapping to a filename in a single call."""
    return ComicFilenameSerializer(md, ext=ext, verbose=verbose).serialize()

11
debian.sources Normal file
View File

@ -0,0 +1,11 @@
Types: deb
URIs: http://deb.debian.org/debian
Suites: bookworm bookworm-updates
Components: main contrib non-free
Signed-By: /usr/share/keyrings/debian-archive-keyring.gpg
Types: deb
URIs: http://deb.debian.org/debian-security
Suites: bookworm-security
Components: main contrib non-free
Signed-By: /usr/share/keyrings/debian-archive-keyring.gpg

21
docker-compose.yaml Normal file
View File

@ -0,0 +1,21 @@
services:
comicfn2dict-builder:
build: .
image: comicfn2dict-builder
container_name: comicfn2dict-builder
comicfn2dict-lint:
image: comicfn2dict-builder
container_name: comicfn2dict-lint
command: make lint
comicfn2dict-test:
image: comicfn2dict-builder
container_name: comicfn2dict-test
command: make test
volumes:
- ./test-results/:/app/test-results/
comicfn2dict-build:
image: comicfn2dict-builder
container_name: comicfn2dict-build
volumes:
- ./dist/:/app/dist/
command: poetry build

186
eslint.config.js Normal file
View File

@ -0,0 +1,186 @@
import { FlatCompat } from "@eslint/eslintrc";
import js from "@eslint/js";
import arrayFunc from "eslint-plugin-array-func";
// import plugin broken for flat config
// https://github.com/import-js/eslint-plugin-import/issues/2556
// import importPlugin from "eslint-plugin-import";
import eslintPluginPrettierRecommended from "eslint-plugin-prettier/recommended";
import pluginSecurity from "eslint-plugin-security";
import eslintPluginUnicorn from "eslint-plugin-unicorn";
import globals from "globals";
const compat = new FlatCompat();
export default [
{
languageOptions: {
globals: {
...globals.node,
...globals.browser,
},
},
linterOptions: {
reportUnusedDisableDirectives: "warn",
},
plugins: {
// import: importPlugin,
unicorn: eslintPluginUnicorn,
},
rules: {
"array-func/prefer-array-from": "off", // for modern browsers the spread operator, as preferred by unicorn, works fine.
"max-params": ["warn", 4],
"no-console": "warn",
"no-debugger": "warn",
"no-constructor-bind/no-constructor-bind": "error",
"no-constructor-bind/no-constructor-state": "error",
"no-secrets/no-secrets": "error",
"prettier/prettier": "warn",
"security/detect-object-injection": "off",
"space-before-function-paren": "off",
"unicorn/switch-case-braces": ["warn", "avoid"],
"unicorn/prefer-node-protocol": 0,
"unicorn/prevent-abbreviations": "off",
"unicorn/filename-case": [
"error",
{ case: "kebabCase", ignore: [".*.md"] },
],
/*
...importPlugin.configs["recommended"].rules,
"import/no-unresolved": [
"error",
{
ignore: ["^[@]"],
},
],
*/
},
/*
settings: {
"import/parsers": {
espree: [".js", ".cjs", ".mjs", ".jsx"],
"@typescript-eslint/parser": [".ts"],
},
"import/resolver": {
typescript: true,
node: true,
},
},
*/
},
js.configs.recommended,
arrayFunc.configs.all,
pluginSecurity.configs.recommended,
eslintPluginPrettierRecommended,
...compat.config({
root: true,
env: {
browser: true,
es2024: true,
node: true,
},
extends: [
// LANGS
"plugin:jsonc/recommended-with-jsonc",
"plugin:markdown/recommended",
"plugin:toml/recommended",
"plugin:yml/standard",
"plugin:yml/prettier",
// CODE QUALITY
"plugin:sonarjs/recommended",
// PRACTICES
"plugin:eslint-comments/recommended",
// "plugin:import/recommended",
"plugin:no-use-extend-native/recommended",
"plugin:optimize-regex/all",
"plugin:promise/recommended",
"plugin:switch-case/recommended",
// SECURITY
"plugin:no-unsanitized/DOM",
],
overrides: [
{
files: ["**/*.md"],
processor: "markdown/markdown",
rules: {
"prettier/prettier": ["warn", { parser: "markdown" }],
},
},
{
files: ["**/*.md/*.js"], // Will match js code inside *.md files
rules: {
"no-unused-vars": "off",
"no-undef": "off",
},
},
{
files: ["**/*.md/*.sh"],
rules: {
"prettier/prettier": ["error", { parser: "sh" }],
},
},
{
files: ["*.yaml", "*.yml"],
//parser: "yaml-eslint-parser",
rules: {
"unicorn/filename-case": "off",
},
},
{
files: ["*.toml"],
//parser: "toml-eslint-parser",
rules: {
"prettier/prettier": ["error", { parser: "toml" }],
},
},
{
files: ["*.json", "*.json5", "*.jsonc"],
//parser: "jsonc-eslint-parser",
},
],
parserOptions: {
ecmaFeatures: {
impliedStrict: true,
},
ecmaVersion: "latest",
},
plugins: [
"eslint-comments",
//"import",
"markdown",
"no-constructor-bind",
"no-secrets",
"no-unsanitized",
"no-use-extend-native",
"optimize-regex",
"promise",
"simple-import-sort",
"sonarjs",
"switch-case",
"unicorn",
],
rules: {
"no-constructor-bind/no-constructor-bind": "error",
"no-constructor-bind/no-constructor-state": "error",
"no-secrets/no-secrets": "error",
"eslint-comments/no-unused-disable": 1,
"simple-import-sort/exports": "warn",
"simple-import-sort/imports": "warn",
"switch-case/newline-between-switch-case": "off", // Malfunctioning
},
ignorePatterns: [
"*~",
"**/__pycache__",
".git",
"!.circleci",
".mypy_cache",
".ruff_cache",
".pytest_cache",
".venv*",
"dist",
"node_modules",
"package-lock.json",
"test-results",
"typings",
],
}),
];

6339
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@ -1,10 +1,10 @@
{
"version": "0.1.0",
"description": "linting",
"version": "0.2.0",
"description": "comicfn2dict linting",
"type": "module",
"scripts": {
"fix": "eslint_d --cache --fix --ignore-pattern frontend --ext .cjs,.mjs,.js,.json,.yaml,.md . && prettier --write .",
"lint": "eslint_d --cache --ignore-pattern frontend --ext .cjs,.mjs,.js,.json,.yaml,.md . && prettier --check .",
"fix": "eslint --cache --fix . && prettier --write .",
"lint": "eslint --cache . && prettier --check .",
"remark-check": "remark .",
"remark-fix": "remark . --output"
},
@ -13,12 +13,13 @@
"@prettier/plugin-xml",
"prettier-plugin-nginx",
"prettier-plugin-packagejson",
"prettier-plugin-sh"
"prettier-plugin-sh",
"prettier-plugin-toml"
],
"overrides": [
{
"files": [
"*.md"
"**/*.md"
],
"options": {
"proseWrap": "always"
@ -28,6 +29,7 @@
},
"remarkConfig": {
"plugins": [
"gfm",
"preset-lint-consistent",
"preset-lint-recommended",
"preset-lint-markdown-style-guide",
@ -42,36 +44,37 @@
"@prettier/plugin-xml": "^3.0.0",
"eslint": "^8.34.0",
"eslint-config-prettier": "^9.0.0",
"eslint-plugin-array-func": "^4.0.0",
"eslint-plugin-array-func": "^5.0.1",
"eslint-plugin-eslint-comments": "^3.2.0",
"eslint-plugin-import": "^2.25.4",
"eslint-plugin-json": "^3.1.0",
"eslint-plugin-mdx": "^3.0.0",
"eslint-plugin-jsonc": "^2.13.0",
"eslint-plugin-markdown": "^3.0.0",
"eslint-plugin-no-constructor-bind": "^2.0.4",
"eslint-plugin-no-secrets": "^0.8.9",
"eslint-plugin-no-unsanitized": "^4.0.0",
"eslint-plugin-no-use-extend-native": "^0.5.0",
"eslint-plugin-only-warn": "^1.0.2",
"eslint-plugin-optimize-regex": "^1.2.0",
"eslint-plugin-prettier": "^5.0.0-alpha.2",
"eslint-plugin-promise": "^6.0.0",
"eslint-plugin-scanjs-rules": "^0.2.1",
"eslint-plugin-security": "^2.1.0",
"eslint-plugin-simple-import-sort": "^10.0.0",
"eslint-plugin-sonarjs": "^0.23.0",
"eslint-plugin-simple-import-sort": "^12.0.0",
"eslint-plugin-sonarjs": "^0.24.0",
"eslint-plugin-switch-case": "^1.1.2",
"eslint-plugin-unicorn": "^50.0.1",
"eslint-plugin-yaml": "^0.5.0",
"eslint-plugin-toml": "^0.9.2",
"eslint-plugin-unicorn": "^51.0.1",
"eslint-plugin-yml": "^1.12.2",
"eslint_d": "^13.0.0",
"prettier": "^3.0.0",
"prettier-plugin-nginx": "^1.0.3",
"prettier-plugin-packagejson": "^2.4.4",
"prettier-plugin-sh": "^0.13.0",
"prettier-plugin-sh": "^0.14.0",
"prettier-plugin-toml": "^2.0.1",
"remark-cli": "^12.0.0",
"remark-gfm": "^4.0.0",
"remark-preset-lint-consistent": "^5.1.1",
"remark-preset-lint-markdown-style-guide": "^5.1.2",
"remark-preset-lint-recommended": "^6.1.2",
"remark-preset-prettier": "^2.0.1",
"toml": "^3.0.0"
"remark-preset-prettier": "^2.0.1"
}
}

942
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry]
name = "comicfn2dict"
version = "0.1.3"
version = "0.2.0a4"
description = "Parse common comic filenames and return a dict of metadata attributes. Includes a cli."
license = "GPL-3.0-only"
authors = ["AJ Slater <aj@slater.net>"]
@ -20,12 +20,15 @@ classifiers = [
"Operating System :: OS Independent",
"Programming Language :: Python :: 3 :: Only",
]
packages = [{ include = "comicfn2dict" }, { include = "tests", format = "sdist" }]
packages = [
{ include = "comicfn2dict" },
{ include = "tests", format = "sdist" },
]
exclude = ["*/**/*~"]
include = []
[tool.poetry.dependencies]
python = "^3.9"
python = "^3.10"
[tool.poetry.group.dev.dependencies]
neovim = "^0.3.1"
@ -42,7 +45,7 @@ pytest-gitignore = "^1.3"
codespell = "^2.1.0"
pyright = "^1.1.232"
radon = { version = "^6.0.1", extras = ["toml"] }
ruff = "^0.1.2"
ruff = "^0.2.1"
types-python-dateutil = "^2.8.19"
vulture = "^2.3"
@ -77,7 +80,7 @@ omit = [
"dist/*",
"node_modules/*",
"test-results/*",
"typings/*"
"typings/*",
]
[tool.pyright]
@ -98,12 +101,10 @@ exclude = [
useLibraryCodeForTypes = true
reportMissingImports = true
reportImportCycles = true
pythonVersion = "3.9"
pythonVersion = "3.10"
pythonPlatform = "All"
[tool.pytest.ini_options]
junit_family = "xunit2"
# --black
addopts = """
--junit-xml=test-results/pytest/results.xml
-ra
@ -113,21 +114,38 @@ addopts = """
--cov-append
--cov-report=html
--cov-report=term
--ignore=.git
--ignore=cache
--ignore=frontend
--ignore=typings
"""
junit_family = "xunit2"
testpaths = "tests"
[tool.radon]
exclude = "*~,.git/*,.mypy_cache/*,.pytest_cache/*,.venv*,__pycache__/*,cache/*,dist/*,node_modules/*,test-results/*,typings/*"
[tool.ruff]
extend-exclude = ["typings"]
extend-ignore = ["S101", "D203", "D213",
target-version = "py310"
[tool.ruff.lint]
extend-ignore = [
"S101",
"D203",
"D213",
# Format ignores
"W191", "E501", "E111", "E114", "E117", "D206", "D300", "Q000", "Q001",
"Q002", "Q003", "COM812", "COM819", "ISC001", "ISC002"
"W191",
"E501",
"E111",
"E114",
"E117",
"D206",
"D300",
"Q000",
"Q001",
"Q002",
"Q003",
"COM812",
"COM819",
"ISC001",
"ISC002",
]
extend-select = [
"A",
@ -168,19 +186,16 @@ extend-select = [
"TRY",
"UP",
"W",
"YTT"
"YTT",
# "ANN", "ERA", "COM"
]
external = ["V101"]
# format = "grouped"
# show-source = true
target-version = "py39"
task-tags = ["TODO", "FIXME", "XXX", "http", "HACK"]
[tool.ruff.per-file-ignores]
[tool.ruff.lint.per-file-ignores]
"tests/*" = ["SLF001", "T201", "T203"]
[tool.ruff.pycodestyle]
[tool.ruff.lint.pycodestyle]
ignore-overlong-task-comments = true
[tool.vulture]

View File

@ -1,5 +1,7 @@
"""Test filenames with human parsed correct results."""
from types import MappingProxyType
TEST_COMIC_FIELDS = {
"series": "Long Series Name",
"issue": "001",
@ -22,6 +24,7 @@ TEST_COMIC_FIELDS_VOL = {
TEST_COMIC_VOL_ONLY = {
"series": "Long Series Name",
"volume": "1",
"issue": "1",
"title": "Title",
"original_format": "TPB",
"year": "2000",
@ -29,6 +32,7 @@ TEST_COMIC_VOL_ONLY = {
"ext": "cbr",
}
# Tests for 0.1.0
FNS = {
"Night of 1000 Wolves 001 (2013).cbz": {
"series": "Night of 1000 Wolves",
@ -51,11 +55,6 @@ FNS = {
"Long Series Name #001 (2000) Title (TPB) (Releaser).cbz": TEST_COMIC_FIELDS,
"Long Series Name (2000) 001 Title (TPB) (Releaser).cbz": TEST_COMIC_FIELDS,
"Long Series Name (2000) #001 Title (TPB) (Releaser).cbz": TEST_COMIC_FIELDS,
"Long Series Name v1 (2000) #001 "
"Title (TPB) (Releaser).cbz": TEST_COMIC_FIELDS_VOL,
"Long Series Name 001 (2000) (TPB-Releaser) Title.cbz": TEST_COMIC_FIELDS,
"Long Series Name Vol 1 "
"(2000) (TPB) (Releaser & Releaser-Releaser) Title.cbr": TEST_COMIC_VOL_ONLY,
"Ultimate Craziness (2019) (Digital) (Friends-of-Bill).cbr": {
"series": "Ultimate Craziness",
"year": "2019",
@ -73,26 +72,17 @@ FNS = {
"Arkenstone Vol. 01 - The Smell of Burnt Toast (2020) (digital) (My-brother).cbr": {
"series": "Arkenstone",
"volume": "01",
"issue": "01",
"year": "2020",
"ext": "cbr",
"scan_info": "My-brother",
"title": "The Smell of Burnt Toast",
"original_format": "digital",
},
"Bardude - The Last Thing I Remember.cbz": {
"series": "Bardude",
"title": "The Last Thing I Remember",
"ext": "cbz",
},
"Drunkguy - The Man Without Fear - 01.cbz": {
"series": "Drunkguy",
"title": "The Man Without Fear",
"issue": "01",
"ext": "cbz",
},
"The_Arkenstone_v03_(2002)_(Digital)_(DR_&amp;_Quenya-Elves).cbr": {
"series": "The Arkenstone",
"volume": "03",
"issue": "03",
"year": "2002",
"ext": "cbr",
"scan_info": "DR &amp; Quenya-Elves",
@ -111,6 +101,7 @@ FNS = {
"Kartalk Library Edition v01 (1992) (digital) (Son of Ultron-Empire).cbr": {
"series": "Kartalk Library Edition",
"volume": "01",
"issue": "01",
"year": "1992",
"ext": "cbr",
"original_format": "digital",
@ -119,15 +110,15 @@ FNS = {
"Kind of Deadly v02 - Last Bullet (2006) (Digital) (Zone-Empire).cbr": {
"series": "Kind of Deadly",
"volume": "02",
"issue": "02",
"year": "2006",
"ext": "cbr",
"original_format": "Digital",
"scan_info": "Zone-Empire",
"title": "Last Bullet",
},
"Jeremy John - A Big Long Title (2017) (digital-Minutement).cbz": {
"series": "Jeremy John",
"title": "A Big Long Title",
"Jeremy John - Not A Title (2017) (digital-Minutement).cbz": {
"series": "Jeremy John - Not A Title",
"year": "2017",
"ext": "cbz",
"original_format": "digital",
@ -139,8 +130,7 @@ FNS = {
"year": "2006",
"ext": "cbz",
"scan_info": "Minutemen-Faessla",
# "original_format": "digital",
"remainders": ("(digital",),
"original_format": "digital",
},
"Jeremy John 003 (2007) (4 covers) (digital) (Minutemen-Faessla).cbz": {
"series": "Jeremy John",
@ -154,6 +144,7 @@ FNS = {
"Jeremy John v01 - Uninterested! (2007) (Digital) (Asgard-Empire).cbr": {
"series": "Jeremy John",
"volume": "01",
"issue": "01",
"year": "2007",
"ext": "cbr",
"original_format": "Digital",
@ -180,6 +171,7 @@ FNS = {
"Darkwad by Carlos Zemo v01 - Knuckle Fight (2009) (Digital) (Zone-Empire).cbr": {
"series": "Darkwad by Carlos Zemo",
"volume": "01",
"issue": "01",
"year": "2009",
"ext": "cbr",
"title": "Knuckle Fight",
@ -243,3 +235,273 @@ FNS = {
"ext": "cbz",
},
}
# Tests for 0.2.0
FNS.update(
{
# Philosophy change regarding dashes.
"Bardude - The Last Thing I Remember.cbz": {
"series": "Bardude - The Last Thing I Remember",
"ext": "cbz",
},
"Drunkguy - The Man Without Fear - 01.cbz": {
"series": "Drunkguy - The Man Without Fear",
"issue": "01",
"ext": "cbz",
},
# BIG Change. title after token. more stripping.
"'Batman - Superman - World's Finest 022 (2024) (Webrip) (The Last Kryptonian-DCP).cbz": {
"ext": "cbz",
"issue": "022",
"original_format": "Webrip",
"series": "Batman - Superman - World's Finest",
"scan_info": "The Last Kryptonian-DCP",
"year": "2024",
},
# Issue number starting with a letter requested in https://github.com/comictagger/comictagger/issues/543
# word characters now allowed to lead issue numbers only if preceded by a # marker
"batman #B01 title.cbz": {
"ext": "cbz",
"issue": "B01",
"series": "batman",
"title": "title",
},
"Monster_Island_v1_#2__repaired__c2c.cbz": {
"ext": "cbz",
"issue": "2",
"series": "Monster Island",
"volume": "1",
"scan_info": "c2c",
"remainders": ("(repaired)",),
},
# Extra - in the series
" X-Men-V1-#067.cbr": {
"ext": "cbr",
"issue": "067",
"series": "X-Men",
"volume": "1",
"remainders": ("-",),
},
"Aquaman - Green Arrow - Deep Target #01 (of 07) (2021).cbr": {
"ext": "cbr",
"issue": "01",
"series": "Aquaman - Green Arrow - Deep Target",
"year": "2021",
"issue_count": "07",
},
# CT only separates this into a title if the '-' is attached to the previous word eg 'aquaman- Green Arrow'. @bpepple opened a ticket for this https://github.com/ajslater/comicfn2dict/issues/1 already
"Batman_-_Superman_#020_(2021).cbr": {
"ext": "cbr",
"issue": "020",
"series": "Batman - Superman",
"year": "2021",
},
# Publishers like to re-print some of their annuals using this format for the year
"Batman '89 (2021) .cbr": {
"ext": "cbr",
"series": "Batman '89",
"year": "2021",
},
# This made the parser in CT much more complicated. It's understandable that this isn't parsed on the first few iterations of this project
"Star Wars - War of the Bounty Hunters - IG-88 (2021).cbz": {
"ext": "cbz",
"series": "Star Wars - War of the Bounty Hunters - IG-88",
"year": "2021",
}, # The addition of the '#1' turns this into the same as 'Aquaman - Green Arrow - Deep Target' above
"Star Wars - War of the Bounty Hunters - IG-88 #1 (2021).cbz": {
"ext": "cbz",
"issue": "1",
"series": "Star Wars - War of the Bounty Hunters - IG-88",
"year": "2021",
},
"Free Comic Book Day - Avengers.Hulk (2021).cbz": {
"ext": "cbz",
"series": "Free Comic Book Day - Avengers Hulk",
"year": "2021",
},
# CT assumes the volume is also the issue number if it can't find an issue number
"Avengers By Brian Michael Bendis volume 03 (2013).cbz": {
"ext": "cbz",
"issue": "03",
"series": "Avengers By Brian Michael Bendis",
"volume": "03",
"year": "2013",
},
# CT catches the year
"Marvel Previews #002 (January 2022).cbr": {
"ext": "cbr",
"issue": "002",
"series": "Marvel Previews",
"publisher": "Marvel",
"month": "01",
"year": "2022",
},
"Test Numeric Year #2 2001-02-24.cbz": {
"ext": "cbz",
"issue": "2",
"series": "Test Numeric Year",
"year": "2001",
"month": "02",
"day": "24",
},
"Test Month First Date 02-24-2001.cbz": {
"ext": "cbz",
"series": "Test Month First Date",
"year": "2001",
"month": "02",
"day": "24",
},
# CT notices that this is a full date, CT doesn't actually return the month or day though just removes it
"X-Men, 2021-08-04 (#02).cbz": {
"ext": "cbz",
"issue": "02",
"series": "X-Men",
"year": "2021",
"month": "08",
"day": "04",
},
# 4 digit issue number
# should this be an issue number if year DONE?.
"action comics 1024.cbz": {
"ext": "cbz",
"issue": "1024",
"series": "action comics",
},
# This is a contrived test case. I've never seen this I just wanted to handle it with my parser
"Cory Doctorow's Futuristic Tales of the Here and Now #0.0.1 (2007).cbz": {
"ext": "cbz",
"issue": "0.0.1",
"series": "Cory Doctorow's Futuristic Tales of the Here and Now",
"year": "2007",
},
# CT treats ':' the same as '-' but here the ':' is attached to 'Now' which CT sees as a title separation
"Cory Doctorow's Futuristic Tales of the Here and Now: Anda's Game #001 (2007).cbz": {
"ext": "cbz",
"issue": "001",
"series": "Cory Doctorow's Futuristic Tales of the Here and Now",
"title": "Anda's Game",
"year": "2007",
},
# If a title ends in a year, it's not an issue (and is a year if no year)
"Blade Runner Free Comic Book Day 2021 (2021).cbr": {
"ext": "cbr",
"series": "Blade Runner Free Comic Book Day 2021",
"year": "2021",
},
# If a year occurs after another year, and no volume, do volume / year
"Super Strange Yarns (1957) #92 (1969).cbz": {
"ext": "cbz",
"issue": "92",
"series": "Super Strange Yarns",
"volume": "1957",
"year": "1969",
},
# CT checks for the following '(of 06)' after the '03' and marks it as the volume
"Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021).cbr": {
"ext": "cbr",
"issue": "008",
"series": "Elephantmen 2259",
"title": "Simple Truth",
"volume": "03",
"year": "2021",
"volume_count": "06",
},
# CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series)
"Bloodshot Book 03 (2020).cbr": {
"ext": "cbr",
"issue": "03",
"series": "Bloodshot",
"title": "Book 03",
"volume": "03",
"year": "2020",
},
# c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder
"Marvel Two In One V1 #090 c2c.cbr": {
"ext": "cbr",
"issue": "090",
"series": "Marvel Two In One",
"publisher": "Marvel",
"volume": "1",
"scan_info": "c2c",
},
# CT treats '[]' as equivalent to '()', catches DC as a publisher and 'Sep-Oct 1951' as dates and removes them. CT doesn't catch the digital though so that could be better but I blame whoever made this atrocious filename
"Wonder Woman #49 DC Sep-Oct 1951 digital [downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire).cbz": {
"ext": "cbz",
"issue": "49",
"series": "Wonder Woman",
"publisher": "DC",
"year": "1951",
"month": "09",
"remainders": (
"digital (downsized, lightened, 4 missing story pages "
"restored) (Shadowcat-Empire)",
),
},
"Captain Science #001 (1950) The Beginning - nothing.cbz": {
"ext": "cbz",
"issue": "001",
"title": "The Beginning - nothing",
"series": "Captain Science",
"year": "1950",
},
"Captain Science #001-cix-cbi.cbr": {
"ext": "cbr",
"issue": "001",
"series": "Captain Science",
"title": "cix-cbi",
},
"Long Series Name v1 (2000) #001 "
"Title (TPB) (Releaser).cbz": TEST_COMIC_FIELDS_VOL,
"Long Series Name 001 (2000) (TPB-Releaser) Title.cbz": {
"series": "Long Series Name",
"issue": "001",
"year": "2000",
"original_format": "TPB",
"scan_info": "Releaser",
"remainders": ("Title",),
"ext": "cbz",
},
"Long Series Name Vol 1 "
"(2000) (TPB) (Releaser & Releaser-Releaser) Title.cbr": {
"series": "Long Series Name",
"volume": "1",
"issue": "1",
"remainders": ("Title",),
"original_format": "TPB",
"year": "2000",
"scan_info": "Releaser & Releaser-Releaser",
"ext": "cbr",
},
}
)
# first_key, first_val = NEW.popitem()
# FNS[first_key] = first_val
PARSE_FNS = MappingProxyType(FNS)
SERIALIZE_FNS = MappingProxyType(
{
"Long Series Name #001 (2000) Title (TPB) (Releaser).cbz": TEST_COMIC_FIELDS,
"Long Series Name v1 #001 "
"(2000) Title (TPB) (Releaser & Releaser-Releaser).cbr": TEST_COMIC_VOL_ONLY,
"Series Name (2000-12-31).cbz": {
"series": "Series Name",
"year": "2000",
"month": "12",
"day": "31",
"ext": "cbz",
},
"Series Name (2000-12).cbz": {
"series": "Series Name",
"year": "2000",
"month": "12",
"ext": "cbz",
},
"Series Name (Dec-31).cbz": {
"series": "Series Name",
"month": "12",
"day": "31",
"ext": "cbz",
},
}
)

View File

@ -1,22 +1,18 @@
"""Tests for filename parsing."""
from pprint import pprint
from types import MappingProxyType
import pytest
from deepdiff.diff import DeepDiff
from comicfn2dict import comicfn2dict
from tests.comic_filenames import FNS
ALL_FIELDS = frozenset({"series", "volume", "issue", "issue_count", "year", "ext"})
FIELD_SCHEMA = MappingProxyType({key: None for key in ALL_FIELDS})
from comicfn2dict import ComicFilenameParser
from tests.comic_filenames import PARSE_FNS
@pytest.mark.parametrize("item", FNS.items())
@pytest.mark.parametrize("item", PARSE_FNS.items())
def test_parse_filename(item):
"""Test filename parsing."""
fn, defined_fields = item
md = comicfn2dict(fn)
md = ComicFilenameParser(fn, verbose=1).parse()
diff = DeepDiff(defined_fields, md, ignore_order=True)
print(fn)
pprint(defined_fields)

View File

@ -0,0 +1,13 @@
"""Tests for filename serialization."""
import pytest
from comicfn2dict import ComicFilenameSerializer
from tests.comic_filenames import SERIALIZE_FNS
@pytest.mark.parametrize("item", SERIALIZE_FNS.items())
def test_serialize_dict(item):
    """Each metadata dict must serialize to the filename it is keyed by."""
    expected_fn, metadata = item
    serialized_fn = ComicFilenameSerializer(metadata).serialize()
    assert expected_fn == serialized_fn