titles after tokens

This commit is contained in:
AJ Slater 2024-02-19 14:11:47 -08:00
parent 65e17236df
commit 3ce61254dc
5 changed files with 256 additions and 31 deletions

View File

@ -1,5 +1,10 @@
# 📰 comicfn2dict News # 📰 comicfn2dict News
## v0.2.0
- Titles are now parsed only if they occur after the series token AND after
either issue, year or volume.
## v0.1.4 ## v0.1.4
- Require Python 3.10 - Require Python 3.10

View File

@ -1,10 +1,10 @@
"""Parse comic book archive names using the simple 'parse' parser.""" """Parse comic book archive names using the simple 'parse' parser."""
from pprint import pprint
from pathlib import Path from pathlib import Path
from re import Match, Pattern from re import Match, Pattern
from typing import Any from typing import Any
from comicfn2dict.regex import ( from comicfn2dict.regex import (
DASH_SPLIT_RE,
EXTRA_SPACES_RE, EXTRA_SPACES_RE,
ISSUE_ANYWHERE_RE, ISSUE_ANYWHERE_RE,
ISSUE_BEGIN_RE, ISSUE_BEGIN_RE,
@ -26,9 +26,13 @@ from comicfn2dict.regex import (
_REMAINING_GROUP_KEYS = ("series", "title") _REMAINING_GROUP_KEYS = ("series", "title")
def _parse_ext(name: str, suffix: str, metadata: dict) -> str: def _parse_ext(name: str | Path, metadata: dict) -> str:
"""Pop the extension from the pathname.""" """Pop the extension from the pathname."""
data = name.removesuffix(suffix) if isinstance(name, str):
name = name.strip()
path = Path(name)
suffix = path.suffix
data = path.name.removesuffix(suffix)
ext = suffix.lstrip(".") ext = suffix.lstrip(".")
if ext: if ext:
metadata["ext"] = ext metadata["ext"] = ext
@ -43,17 +47,18 @@ def _clean_dividers(data: str) -> str:
def _get_data_list(path: str | Path, metadata: dict) -> list[str]: def _get_data_list(path: str | Path, metadata: dict) -> list[str]:
"""Prepare data list from a path or string.""" """Prepare data list from a path or string."""
if isinstance(path, str): data = _parse_ext(path, metadata)
path = path.strip()
path = Path(path)
data = _parse_ext(path.name, path.suffix, metadata)
data = _clean_dividers(data) data = _clean_dividers(data)
return DASH_SPLIT_RE.split(data) return [data]
def _paren_strip(value: str) -> str: def _grouping_operators_strip(value: str) -> str:
"""Strip spaces and parens.""" """Strip spaces and parens."""
return value.strip().strip("()").strip() value = value.strip()
value = value.strip("()").strip()
value = value.strip("-").strip()
value = value.strip("'").strip('"').strip()
return value
def _splicey_dicey( def _splicey_dicey(
@ -71,7 +76,7 @@ def _splicey_dicey(
if data_after := data[match.end() :].strip(): if data_after := data[match.end() :].strip():
data_ends.append(data_after) data_ends.append(data_after)
data_list[index:index] = data_ends data_list[index:index] = data_ends
return _paren_strip(value) return _grouping_operators_strip(value)
def _match_original_format_and_scan_info( def _match_original_format_and_scan_info(
@ -83,10 +88,10 @@ def _match_original_format_and_scan_info(
scan_info = match.group("scan_info") scan_info = match.group("scan_info")
except IndexError: except IndexError:
scan_info = None scan_info = None
metadata["original_format"] = _paren_strip(original_format) metadata["original_format"] = _grouping_operators_strip(original_format)
match_group = 1 match_group = 1
if scan_info: if scan_info:
metadata["scan_info"] = _paren_strip(scan_info) metadata["scan_info"] = _grouping_operators_strip(scan_info)
match_group = 0 match_group = 0
_splicey_dicey(data_list, index, match, match_group=match_group) _splicey_dicey(data_list, index, match, match_group=match_group)
@ -112,14 +117,16 @@ def _pop_value_from_token(
regex: Pattern, regex: Pattern,
key: str, key: str,
index: int = 0, index: int = 0,
) -> Match: ) -> str:
"""Search token for value, splice and assign to metadata.""" """Search token for value, splice and assign to metadata."""
data = data_list[index] data = data_list[index]
match = regex.search(data) match = regex.search(data)
if match: if match:
value = _splicey_dicey(data_list, index, match, key) value = _splicey_dicey(data_list, index, match, key)
metadata[key] = value metadata[key] = value
return match else:
value = ""
return value
def _parse_item( def _parse_item(
@ -128,21 +135,25 @@ def _parse_item(
regex: Pattern, regex: Pattern,
key: str, key: str,
start_index: int = 0, start_index: int = 0,
path: str = "",
) -> int: ) -> int:
"""Parse a value from the data list into metadata and alter the data list.""" """Parse a value from the data list into metadata and alter the data list."""
path_index = -1
index = start_index index = start_index
dl_len = end_index = len(data_list) dl_len = end_index = len(data_list)
if index >= end_index: if index >= end_index:
index = 0 index = 0
while index < end_index: while index < end_index:
match = _pop_value_from_token(data_list, metadata, regex, key, index) value = _pop_value_from_token(data_list, metadata, regex, key, index)
if match: if value:
if "key" == "issue":
path_index = path.find(value)
break break
index += 1 index += 1
if index > dl_len and start_index > 0: if index > dl_len and start_index > 0:
index = 0 index = 0
end_index = start_index end_index = start_index
return index return path_index
def _pop_issue_from_text_fields( def _pop_issue_from_text_fields(
@ -156,7 +167,39 @@ def _pop_issue_from_text_fields(
return data_list.pop(index) return data_list.pop(index)
def _assign_remaining_groups(data_list: list[str], metadata: dict): TITLE_PRECEDING_KEYS = ("issue", "year", "volume")
def _is_title_in_position(path, value, metadata):
"""Does the title come after series and one other token if they exist."""
# TODO this could be faster if indexes could be grabbed for these tokens
# when they are extracted.
title_index = path.find(value)
# Does a series come first.
series = metadata.get("series")
if not series:
return False
series_index = path.find(series)
if title_index < series_index:
return False
    # If other tokens exist then they must precede the title.
title_ok = False
other_tokens_exist = False
for preceding_key in TITLE_PRECEDING_KEYS:
preceding_value = metadata.get(preceding_key)
if not preceding_value:
continue
other_tokens_exist = True
preceding_index = path.find(preceding_value)
if title_index > preceding_index:
title_ok = True
break
return title_ok or not other_tokens_exist
def _assign_remaining_groups(data_list: list[str], metadata: dict, path: str):
"""Assign series and title.""" """Assign series and title."""
index = 0 index = 0
for key in _REMAINING_GROUP_KEYS: for key in _REMAINING_GROUP_KEYS:
@ -167,7 +210,9 @@ def _assign_remaining_groups(data_list: list[str], metadata: dict):
match = REMAINING_GROUP_RE.search(data) if data else None match = REMAINING_GROUP_RE.search(data) if data else None
if match: if match:
value = _pop_issue_from_text_fields(data_list, metadata, index) value = _pop_issue_from_text_fields(data_list, metadata, index)
value = _paren_strip(value) if key == "title" and not _is_title_in_position(path, value, metadata):
continue
value = _grouping_operators_strip(value)
if value: if value:
metadata[key] = value metadata[key] = value
else: else:
@ -184,10 +229,17 @@ def _pickup_issue(remainders: list[str], metadata: dict) -> None:
_parse_item(remainders, metadata, ISSUE_ANYWHERE_RE, "issue") _parse_item(remainders, metadata, ISSUE_ANYWHERE_RE, "issue")
def _log_progress(label, metadata, data_list):
print(label + ":")
pprint(metadata)
pprint(data_list)
def comicfn2dict(path: str | Path) -> dict[str, Any]: def comicfn2dict(path: str | Path) -> dict[str, Any]:
"""Parse the filename with a hierarchy of regexes.""" """Parse the filename with a hierarchy of regexes."""
metadata = {} metadata = {}
data_list = _get_data_list(path, metadata) data_list = _get_data_list(path, metadata)
_log_progress("INITIAL", metadata, data_list)
# Parse paren tokens # Parse paren tokens
_parse_item(data_list, metadata, ISSUE_COUNT_RE, "issue_count") _parse_item(data_list, metadata, ISSUE_COUNT_RE, "issue_count")
@ -206,26 +258,33 @@ def comicfn2dict(path: str | Path) -> dict[str, Any]:
"scan_info", "scan_info",
start_index=of_index + 1, start_index=of_index + 1,
) )
_log_progress("AFTER PAREN TOKENS", metadata, data_list)
# Parse regular tokens # Parse regular tokens
_parse_item(data_list, metadata, VOLUME_RE, "volume") _parse_item(data_list, metadata, VOLUME_RE, "volume")
_parse_item(data_list, metadata, ISSUE_NUMBER_RE, "issue") _parse_item(data_list, metadata, ISSUE_NUMBER_RE, "issue", path=str(path))
_log_progress("AFTER REGULAR TOKENS", metadata, data_list)
# Pickup year if not gotten. # Pickup year if not gotten.
if "year" not in metadata: if "year" not in metadata:
_parse_item(data_list, metadata, YEAR_BEGIN_RE, "year") _parse_item(data_list, metadata, YEAR_BEGIN_RE, "year")
if "year" not in metadata: if "year" not in metadata:
_parse_item(data_list, metadata, YEAR_END_RE, "year") _parse_item(data_list, metadata, YEAR_END_RE, "year")
_log_progress("AFTER YEAR PICKUP", metadata, data_list)
# Pickup issue if it's a standalone token # Pickup issue if it's a standalone token
if "issue" not in metadata: if "issue" not in metadata:
_parse_item(data_list, metadata, ISSUE_TOKEN_RE, "issue") _parse_item(data_list, metadata, ISSUE_TOKEN_RE, "issue")
_log_progress("AFTER ISSUE PICKUP", metadata, data_list)
# Series and Title. Also looks for issue. # Series and Title. Also looks for issue.
_assign_remaining_groups(data_list, metadata) _assign_remaining_groups(data_list, metadata, str(path))
_log_progress("AFTER SERIES AND TITLE", metadata, data_list)
# Final try for issue number. # Final try for issue number.
_pickup_issue(data_list, metadata) _pickup_issue(data_list, metadata)
_log_progress("AFTER ISSUE PICKUP", metadata, data_list)
# Add Remainders # Add Remainders
if data_list: if data_list:

View File

@ -72,4 +72,4 @@ ISSUE_BEGIN_RE = re_compile(r"^(" + _ISSUE_RE_EXP + r")\b")
ISSUE_ANYWHERE_RE = re_compile(r"\b(" + _ISSUE_RE_EXP + r")\b") ISSUE_ANYWHERE_RE = re_compile(r"\b(" + _ISSUE_RE_EXP + r")\b")
# LONG STRINGS # LONG STRINGS
REMAINING_GROUP_RE = re_compile(r"^[\w].*[^\)]") REMAINING_GROUP_RE = re_compile(r"^[^\()].*[^\)]")

View File

@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry] [tool.poetry]
name = "comicfn2dict" name = "comicfn2dict"
version = "0.1.4" version = "0.2.0"
description = "Parse common comic filenames and return a dict of metadata attributes. Includes a cli." description = "Parse common comic filenames and return a dict of metadata attributes. Includes a cli."
license = "GPL-3.0-only" license = "GPL-3.0-only"
authors = ["AJ Slater <aj@slater.net>"] authors = ["AJ Slater <aj@slater.net>"]

View File

@ -80,13 +80,11 @@ FNS = {
"original_format": "digital", "original_format": "digital",
}, },
"Bardude - The Last Thing I Remember.cbz": { "Bardude - The Last Thing I Remember.cbz": {
"series": "Bardude", "series": "Bardude - The Last Thing I Remember",
"title": "The Last Thing I Remember",
"ext": "cbz", "ext": "cbz",
}, },
"Drunkguy - The Man Without Fear - 01.cbz": { "Drunkguy - The Man Without Fear - 01.cbz": {
"series": "Drunkguy", "series": "Drunkguy - The Man Without Fear",
"title": "The Man Without Fear",
"issue": "01", "issue": "01",
"ext": "cbz", "ext": "cbz",
}, },
@ -125,9 +123,8 @@ FNS = {
"scan_info": "Zone-Empire", "scan_info": "Zone-Empire",
"title": "Last Bullet", "title": "Last Bullet",
}, },
"Jeremy John - A Big Long Title (2017) (digital-Minutement).cbz": { "Jeremy John - Not A Title (2017) (digital-Minutement).cbz": {
"series": "Jeremy John", "series": "Jeremy John - Not A Title",
"title": "A Big Long Title",
"year": "2017", "year": "2017",
"ext": "cbz", "ext": "cbz",
"original_format": "digital", "original_format": "digital",
@ -243,3 +240,167 @@ FNS = {
"ext": "cbz", "ext": "cbz",
}, },
} }
FNS.update( # Newly fixed.
{
"'Batman - Superman - World's Finest 022 (2024) (Webrip) (The Last Kryptonian-DCP).cbz": {
"ext": "cbz",
"issue": "022",
"remainders": ("(The Last Kryptonian-DCP)",),
"scan_info": "Webrip",
"series": "Batman - Superman - World's Finest",
"year": "2024",
},
}
)
FNS.update(
{
# Issue number starting with a letter requested in https://github.com/comictagger/comictagger/issues/543
"batman #B01 title.cbz": {
"ext": "cbz",
"issue": "B01",
"series": "batman",
"title": "title",
}, # Leading issue number is usually an alternate sequence number
"52 action comics #2024.cbz": {
"ext": "cbz",
"issue": "2024",
"series": "action comics",
"alternate": "52",
}, # 4 digit issue number
"action comics 1024.cbz": {
"ext": "cbz",
"issue": "1024",
"series": "action comics",
}, # Only the issue number. CT ensures that the series always has a value if possible
"#52.cbz": {
"ext": "cbz",
"issue": "52",
"series": "52",
}, # CT treats double-underscore the same as double-dash
"Monster_Island_v1_#2__repaired__c2c.cbz": {
"ext": "cbz",
"issue": "2",
"series": "Monster Island",
"volume": "1",
        }, # I'm not sure there's a right way to parse this. This might also be a made-up filename I don't remember
"Super Strange Yarns (1957) #92 (1969).cbz": {
"ext": "cbz",
"issue": "92",
"series": "Super Strange Yarns",
"volume": "1957",
"year": "1969",
}, # Extra - in the series
" X-Men-V1-#067.cbr": {
"ext": "cbr",
"issue": "067",
"series": "X-Men",
"volume": "1",
}, # CT only separates this into a title if the '-' is attached to the previous word eg 'aquaman- Green Arrow'. @bpepple opened a ticket for this https://github.com/ajslater/comicfn2dict/issues/1 already
"Aquaman - Green Arrow - Deep Target #01 (of 07) (2021).cbr": {
"ext": "cbr",
"issue": "01",
"series": "Aquaman - Green Arrow - Deep Target",
"year": "2021",
"issue_count": "7",
},
"Batman_-_Superman_#020_(2021).cbr": {
"ext": "cbr",
"issue": "020",
"series": "Batman - Superman",
"year": "2021",
},
"Free Comic Book Day - Avengers.Hulk (2021).cbz": {
"ext": "cbz",
"series": "Free Comic Book Day - Avengers Hulk",
"year": "2021",
}, # CT assumes the volume is also the issue number if it can't find an issue number
"Avengers By Brian Michael Bendis volume 03 (2013).cbz": {
"ext": "cbz",
"issue": "3",
"series": "Avengers By Brian Michael Bendis",
"volume": "03",
"year": "2013",
}, # Publishers like to re-print some of their annuals using this format for the year
"Batman '89 (2021) .cbr": {
"ext": "cbr",
"series": "Batman '89",
"year": "2021",
}, # CT has extra processing to re-attach the year in this case
"Blade Runner Free Comic Book Day 2021 (2021).cbr": {
"ext": "cbr",
"series": "Blade Runner Free Comic Book Day 2021",
"year": "2021",
}, # CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series)
"Bloodshot Book 03 (2020).cbr": {
"ext": "cbr",
"issue": "03",
"series": "Bloodshot",
"title": "Book 03",
"volume": "03",
"year": "2020",
}, # CT checks for the following '(of 06)' after the '03' and marks it as the volume
"Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021).cbr": {
"ext": "cbr",
"issue": "008",
"series": "Elephantmen 2259",
"title": "Simple Truth",
"volume": "03",
"year": "2021",
"volume_count": "06",
}, # CT catches the year
"Marvel Previews #002 (January 2022).cbr": {
"ext": "cbr",
"issue": "002",
"series": "Marvel Previews",
"year": "2022",
}, # c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder
"Marvel Two In One V1 #090 c2c.cbr": {
"ext": "cbr",
"issue": "090",
"series": "Marvel Two In One",
"publisher": "Marvel",
"volume": "1",
}, # This made the parser in CT much more complicated. It's understandable that this isn't parsed on the first few iterations of this project
"Star Wars - War of the Bounty Hunters - IG-88 (2021).cbz": {
"ext": "cbz",
"series": "Star Wars - War of the Bounty Hunters - IG-88",
"year": "2021",
}, # The addition of the '#1' turns this into the same as 'Aquaman - Green Arrow - Deep Target' above
"Star Wars - War of the Bounty Hunters - IG-88 #1 (2021).cbz": {
"ext": "cbz",
"issue": "1",
"series": "Star Wars - War of the Bounty Hunters - IG-88",
"year": "2021",
}, # CT treats '[]' as equivalent to '()', catches DC as a publisher and 'Sep-Oct 1951' as dates and removes them. CT doesn't catch the digital though so that could be better but I blame whoever made this atrocious filename
"Wonder Woman #49 DC Sep-Oct 1951 digital [downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire).cbz": {
"ext": "cbz",
"issue": "49",
"series": "Wonder Woman",
"title": "digital",
"publisher": "DC",
"year": "1951",
}, # CT notices that this is a full date, CT doesn't actually return the month or day though just removes it
"X-Men, 2021-08-04 (#02).cbz": {
"ext": "cbz",
"issue": "02",
"series": "X-Men",
"year": "2021",
}, # CT treats ':' the same as '-' but here the ':' is attached to 'Now' which CT sees as a title separation
"Cory Doctorow's Futuristic Tales of the Here and Now: Anda's Game #001 (2007).cbz": {
"ext": "cbz",
"issue": "001",
"series": "Cory Doctorow's Futuristic Tales of the Here and Now",
"title": "Anda's Game",
"year": "2007",
}, # This is a contrived test case. I've never seen this I just wanted to handle it with my parser
"Cory Doctorow's Futuristic Tales of the Here and Now #0.0.1 (2007).cbz": {
"ext": "cbz",
"issue": "0.1",
"series": "Cory Doctorow's Futuristic Tales of the Here and Now",
"year": "2007",
"issue_count": "",
},
}
)