titles after tokens

This commit is contained in:
AJ Slater 2024-02-19 14:11:47 -08:00
parent 65e17236df
commit 3ce61254dc
5 changed files with 256 additions and 31 deletions

View File

@ -1,5 +1,10 @@
# 📰 comicfn2dict News
## v0.2.0
- Titles are now parsed only if they occur after the series token AND after
  either the issue, year, or volume token.
## v0.1.4
- Require Python 3.10

View File

@ -1,10 +1,10 @@
"""Parse comic book archive names using the simple 'parse' parser."""
from pprint import pprint
from pathlib import Path
from re import Match, Pattern
from typing import Any
from comicfn2dict.regex import (
DASH_SPLIT_RE,
EXTRA_SPACES_RE,
ISSUE_ANYWHERE_RE,
ISSUE_BEGIN_RE,
@ -26,9 +26,13 @@ from comicfn2dict.regex import (
_REMAINING_GROUP_KEYS = ("series", "title")
def _parse_ext(name: str, suffix: str, metadata: dict) -> str:
def _parse_ext(name: str | Path, metadata: dict) -> str:
"""Pop the extension from the pathname."""
data = name.removesuffix(suffix)
if isinstance(name, str):
name = name.strip()
path = Path(name)
suffix = path.suffix
data = path.name.removesuffix(suffix)
ext = suffix.lstrip(".")
if ext:
metadata["ext"] = ext
@ -43,17 +47,18 @@ def _clean_dividers(data: str) -> str:
def _get_data_list(path: str | Path, metadata: dict) -> list[str]:
"""Prepare data list from a path or string."""
if isinstance(path, str):
path = path.strip()
path = Path(path)
data = _parse_ext(path.name, path.suffix, metadata)
data = _parse_ext(path, metadata)
data = _clean_dividers(data)
return DASH_SPLIT_RE.split(data)
return [data]
def _paren_strip(value: str) -> str:
def _grouping_operators_strip(value: str) -> str:
"""Strip spaces and parens."""
return value.strip().strip("()").strip()
value = value.strip()
value = value.strip("()").strip()
value = value.strip("-").strip()
value = value.strip("'").strip('"').strip()
return value
def _splicey_dicey(
@ -71,7 +76,7 @@ def _splicey_dicey(
if data_after := data[match.end() :].strip():
data_ends.append(data_after)
data_list[index:index] = data_ends
return _paren_strip(value)
return _grouping_operators_strip(value)
def _match_original_format_and_scan_info(
@ -83,10 +88,10 @@ def _match_original_format_and_scan_info(
scan_info = match.group("scan_info")
except IndexError:
scan_info = None
metadata["original_format"] = _paren_strip(original_format)
metadata["original_format"] = _grouping_operators_strip(original_format)
match_group = 1
if scan_info:
metadata["scan_info"] = _paren_strip(scan_info)
metadata["scan_info"] = _grouping_operators_strip(scan_info)
match_group = 0
_splicey_dicey(data_list, index, match, match_group=match_group)
@ -112,14 +117,16 @@ def _pop_value_from_token(
regex: Pattern,
key: str,
index: int = 0,
) -> Match:
) -> str:
"""Search token for value, splice and assign to metadata."""
data = data_list[index]
match = regex.search(data)
if match:
value = _splicey_dicey(data_list, index, match, key)
metadata[key] = value
return match
else:
value = ""
return value
def _parse_item(
    data_list: list[str],
    metadata: dict,
    regex: Pattern,
    key: str,
    start_index: int = 0,
    path: str = "",
) -> int:
    """Parse a value from the data list into metadata and alter the data list.

    Scans tokens starting at start_index, popping the first regex match into
    metadata[key]. Returns the position of an issue value within the original
    path string, or -1 when key is not "issue" or nothing matched.
    """
    path_index = -1
    index = start_index
    dl_len = end_index = len(data_list)
    if index >= end_index:
        index = 0
    while index < end_index:
        value = _pop_value_from_token(data_list, metadata, regex, key, index)
        if value:
            # BUG FIX: was `if "key" == "issue":` — a comparison of two
            # string literals that is always False, so path_index was never
            # computed for issue tokens.
            if key == "issue":
                path_index = path.find(value)
            break
        index += 1
        # BUG FIX: was `index > dl_len`, which the loop bound makes
        # unreachable (index never exceeds dl_len). Use >= so a search that
        # began mid-list wraps around to scan the tokens before start_index.
        if index >= dl_len and start_index > 0:
            index = 0
            end_index = start_index
    return path_index
def _pop_issue_from_text_fields(
@ -156,7 +167,39 @@ def _pop_issue_from_text_fields(
return data_list.pop(index)
def _assign_remaining_groups(data_list: list[str], metadata: dict):
TITLE_PRECEDING_KEYS = ("issue", "year", "volume")
def _is_title_in_position(path, value, metadata):
"""Does the title come after series and one other token if they exist."""
# TODO this could be faster if indexes could be grabbed for these tokens
# when they are extracted.
title_index = path.find(value)
# Does a series come first.
series = metadata.get("series")
if not series:
return False
series_index = path.find(series)
if title_index < series_index:
return False
# If other tokens exist then they much precede the title.
title_ok = False
other_tokens_exist = False
for preceding_key in TITLE_PRECEDING_KEYS:
preceding_value = metadata.get(preceding_key)
if not preceding_value:
continue
other_tokens_exist = True
preceding_index = path.find(preceding_value)
if title_index > preceding_index:
title_ok = True
break
return title_ok or not other_tokens_exist
def _assign_remaining_groups(data_list: list[str], metadata: dict, path: str):
"""Assign series and title."""
index = 0
for key in _REMAINING_GROUP_KEYS:
@ -167,7 +210,9 @@ def _assign_remaining_groups(data_list: list[str], metadata: dict):
match = REMAINING_GROUP_RE.search(data) if data else None
if match:
value = _pop_issue_from_text_fields(data_list, metadata, index)
value = _paren_strip(value)
if key == "title" and not _is_title_in_position(path, value, metadata):
continue
value = _grouping_operators_strip(value)
if value:
metadata[key] = value
else:
@ -184,10 +229,17 @@ def _pickup_issue(remainders: list[str], metadata: dict) -> None:
_parse_item(remainders, metadata, ISSUE_ANYWHERE_RE, "issue")
def _log_progress(label, metadata, data_list):
print(label + ":")
pprint(metadata)
pprint(data_list)
def comicfn2dict(path: str | Path) -> dict[str, Any]:
"""Parse the filename with a hierarchy of regexes."""
metadata = {}
data_list = _get_data_list(path, metadata)
_log_progress("INITIAL", metadata, data_list)
# Parse paren tokens
_parse_item(data_list, metadata, ISSUE_COUNT_RE, "issue_count")
@ -206,26 +258,33 @@ def comicfn2dict(path: str | Path) -> dict[str, Any]:
"scan_info",
start_index=of_index + 1,
)
_log_progress("AFTER PAREN TOKENS", metadata, data_list)
# Parse regular tokens
_parse_item(data_list, metadata, VOLUME_RE, "volume")
_parse_item(data_list, metadata, ISSUE_NUMBER_RE, "issue")
_parse_item(data_list, metadata, ISSUE_NUMBER_RE, "issue", path=str(path))
_log_progress("AFTER REGULAR TOKENS", metadata, data_list)
# Pickup year if not gotten.
if "year" not in metadata:
_parse_item(data_list, metadata, YEAR_BEGIN_RE, "year")
if "year" not in metadata:
_parse_item(data_list, metadata, YEAR_END_RE, "year")
_log_progress("AFTER YEAR PICKUP", metadata, data_list)
# Pickup issue if it's a standalone token
if "issue" not in metadata:
_parse_item(data_list, metadata, ISSUE_TOKEN_RE, "issue")
_log_progress("AFTER ISSUE PICKUP", metadata, data_list)
# Series and Title. Also looks for issue.
_assign_remaining_groups(data_list, metadata)
_assign_remaining_groups(data_list, metadata, str(path))
_log_progress("AFTER SERIES AND TITLE", metadata, data_list)
# Final try for issue number.
_pickup_issue(data_list, metadata)
_log_progress("AFTER ISSUE PICKUP", metadata, data_list)
# Add Remainders
if data_list:

View File

@ -72,4 +72,4 @@ ISSUE_BEGIN_RE = re_compile(r"^(" + _ISSUE_RE_EXP + r")\b")
ISSUE_ANYWHERE_RE = re_compile(r"\b(" + _ISSUE_RE_EXP + r")\b")
# LONG STRINGS
REMAINING_GROUP_RE = re_compile(r"^[\w].*[^\)]")
REMAINING_GROUP_RE = re_compile(r"^[^\()].*[^\)]")

View File

@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry]
name = "comicfn2dict"
version = "0.1.4"
version = "0.2.0"
description = "Parse common comic filenames and return a dict of metadata attributes. Includes a cli."
license = "GPL-3.0-only"
authors = ["AJ Slater <aj@slater.net>"]

View File

@ -80,13 +80,11 @@ FNS = {
"original_format": "digital",
},
"Bardude - The Last Thing I Remember.cbz": {
"series": "Bardude",
"title": "The Last Thing I Remember",
"series": "Bardude - The Last Thing I Remember",
"ext": "cbz",
},
"Drunkguy - The Man Without Fear - 01.cbz": {
"series": "Drunkguy",
"title": "The Man Without Fear",
"series": "Drunkguy - The Man Without Fear",
"issue": "01",
"ext": "cbz",
},
@ -125,9 +123,8 @@ FNS = {
"scan_info": "Zone-Empire",
"title": "Last Bullet",
},
"Jeremy John - A Big Long Title (2017) (digital-Minutement).cbz": {
"series": "Jeremy John",
"title": "A Big Long Title",
"Jeremy John - Not A Title (2017) (digital-Minutement).cbz": {
"series": "Jeremy John - Not A Title",
"year": "2017",
"ext": "cbz",
"original_format": "digital",
@ -243,3 +240,167 @@ FNS = {
"ext": "cbz",
},
}
FNS.update( # Newly fixed.
{
"'Batman - Superman - World's Finest 022 (2024) (Webrip) (The Last Kryptonian-DCP).cbz": {
"ext": "cbz",
"issue": "022",
"remainders": ("(The Last Kryptonian-DCP)",),
"scan_info": "Webrip",
"series": "Batman - Superman - World's Finest",
"year": "2024",
},
}
)
FNS.update(
{
# Issue number starting with a letter requested in https://github.com/comictagger/comictagger/issues/543
"batman #B01 title.cbz": {
"ext": "cbz",
"issue": "B01",
"series": "batman",
"title": "title",
}, # Leading issue number is usually an alternate sequence number
"52 action comics #2024.cbz": {
"ext": "cbz",
"issue": "2024",
"series": "action comics",
"alternate": "52",
}, # 4 digit issue number
"action comics 1024.cbz": {
"ext": "cbz",
"issue": "1024",
"series": "action comics",
}, # Only the issue number. CT ensures that the series always has a value if possible
"#52.cbz": {
"ext": "cbz",
"issue": "52",
"series": "52",
}, # CT treats double-underscore the same as double-dash
"Monster_Island_v1_#2__repaired__c2c.cbz": {
"ext": "cbz",
"issue": "2",
"series": "Monster Island",
"volume": "1",
}, # I'm not sure there's a right way to parse this. This might also be a madeup filename I don't remember
"Super Strange Yarns (1957) #92 (1969).cbz": {
"ext": "cbz",
"issue": "92",
"series": "Super Strange Yarns",
"volume": "1957",
"year": "1969",
}, # Extra - in the series
" X-Men-V1-#067.cbr": {
"ext": "cbr",
"issue": "067",
"series": "X-Men",
"volume": "1",
}, # CT only separates this into a title if the '-' is attached to the previous word eg 'aquaman- Green Arrow'. @bpepple opened a ticket for this https://github.com/ajslater/comicfn2dict/issues/1 already
"Aquaman - Green Arrow - Deep Target #01 (of 07) (2021).cbr": {
"ext": "cbr",
"issue": "01",
"series": "Aquaman - Green Arrow - Deep Target",
"year": "2021",
"issue_count": "7",
},
"Batman_-_Superman_#020_(2021).cbr": {
"ext": "cbr",
"issue": "020",
"series": "Batman - Superman",
"year": "2021",
},
"Free Comic Book Day - Avengers.Hulk (2021).cbz": {
"ext": "cbz",
"series": "Free Comic Book Day - Avengers Hulk",
"year": "2021",
}, # CT assumes the volume is also the issue number if it can't find an issue number
"Avengers By Brian Michael Bendis volume 03 (2013).cbz": {
"ext": "cbz",
"issue": "3",
"series": "Avengers By Brian Michael Bendis",
"volume": "03",
"year": "2013",
}, # Publishers like to re-print some of their annuals using this format for the year
"Batman '89 (2021) .cbr": {
"ext": "cbr",
"series": "Batman '89",
"year": "2021",
}, # CT has extra processing to re-attach the year in this case
"Blade Runner Free Comic Book Day 2021 (2021).cbr": {
"ext": "cbr",
"series": "Blade Runner Free Comic Book Day 2021",
"year": "2021",
}, # CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series)
"Bloodshot Book 03 (2020).cbr": {
"ext": "cbr",
"issue": "03",
"series": "Bloodshot",
"title": "Book 03",
"volume": "03",
"year": "2020",
}, # CT checks for the following '(of 06)' after the '03' and marks it as the volume
"Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021).cbr": {
"ext": "cbr",
"issue": "008",
"series": "Elephantmen 2259",
"title": "Simple Truth",
"volume": "03",
"year": "2021",
"volume_count": "06",
}, # CT catches the year
"Marvel Previews #002 (January 2022).cbr": {
"ext": "cbr",
"issue": "002",
"series": "Marvel Previews",
"year": "2022",
}, # c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder
"Marvel Two In One V1 #090 c2c.cbr": {
"ext": "cbr",
"issue": "090",
"series": "Marvel Two In One",
"publisher": "Marvel",
"volume": "1",
}, # This made the parser in CT much more complicated. It's understandable that this isn't parsed on the first few iterations of this project
"Star Wars - War of the Bounty Hunters - IG-88 (2021).cbz": {
"ext": "cbz",
"series": "Star Wars - War of the Bounty Hunters - IG-88",
"year": "2021",
}, # The addition of the '#1' turns this into the same as 'Aquaman - Green Arrow - Deep Target' above
"Star Wars - War of the Bounty Hunters - IG-88 #1 (2021).cbz": {
"ext": "cbz",
"issue": "1",
"series": "Star Wars - War of the Bounty Hunters - IG-88",
"year": "2021",
}, # CT treats '[]' as equivalent to '()', catches DC as a publisher and 'Sep-Oct 1951' as dates and removes them. CT doesn't catch the digital though so that could be better but I blame whoever made this atrocious filename
"Wonder Woman #49 DC Sep-Oct 1951 digital [downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire).cbz": {
"ext": "cbz",
"issue": "49",
"series": "Wonder Woman",
"title": "digital",
"publisher": "DC",
"year": "1951",
}, # CT notices that this is a full date, CT doesn't actually return the month or day though just removes it
"X-Men, 2021-08-04 (#02).cbz": {
"ext": "cbz",
"issue": "02",
"series": "X-Men",
"year": "2021",
}, # CT treats ':' the same as '-' but here the ':' is attached to 'Now' which CT sees as a title separation
"Cory Doctorow's Futuristic Tales of the Here and Now: Anda's Game #001 (2007).cbz": {
"ext": "cbz",
"issue": "001",
"series": "Cory Doctorow's Futuristic Tales of the Here and Now",
"title": "Anda's Game",
"year": "2007",
}, # This is a contrived test case. I've never seen this I just wanted to handle it with my parser
"Cory Doctorow's Futuristic Tales of the Here and Now #0.0.1 (2007).cbz": {
"ext": "cbz",
"issue": "0.1",
"series": "Cory Doctorow's Futuristic Tales of the Here and Now",
"year": "2007",
"issue_count": "",
},
}
)