titles after tokens
This commit is contained in:
parent
65e17236df
commit
3ce61254dc
5
NEWS.md
5
NEWS.md
@ -1,5 +1,10 @@
|
||||
# 📰 comicfn2dict News
|
||||
|
||||
## v0.2.0
|
||||
|
||||
- Titles are now parsed only if they occur after the series token AND after
|
||||
either issue, year or volume.
|
||||
|
||||
## v0.1.4
|
||||
|
||||
- Require Python 3.10
|
||||
|
@ -1,10 +1,10 @@
|
||||
"""Parse comic book archive names using the simple 'parse' parser."""
|
||||
from pprint import pprint
|
||||
from pathlib import Path
|
||||
from re import Match, Pattern
|
||||
from typing import Any
|
||||
|
||||
from comicfn2dict.regex import (
|
||||
DASH_SPLIT_RE,
|
||||
EXTRA_SPACES_RE,
|
||||
ISSUE_ANYWHERE_RE,
|
||||
ISSUE_BEGIN_RE,
|
||||
@ -26,9 +26,13 @@ from comicfn2dict.regex import (
|
||||
_REMAINING_GROUP_KEYS = ("series", "title")
|
||||
|
||||
|
||||
def _parse_ext(name: str, suffix: str, metadata: dict) -> str:
|
||||
def _parse_ext(name: str | Path, metadata: dict) -> str:
|
||||
"""Pop the extension from the pathname."""
|
||||
data = name.removesuffix(suffix)
|
||||
if isinstance(name, str):
|
||||
name = name.strip()
|
||||
path = Path(name)
|
||||
suffix = path.suffix
|
||||
data = path.name.removesuffix(suffix)
|
||||
ext = suffix.lstrip(".")
|
||||
if ext:
|
||||
metadata["ext"] = ext
|
||||
@ -43,17 +47,18 @@ def _clean_dividers(data: str) -> str:
|
||||
|
||||
def _get_data_list(path: str | Path, metadata: dict) -> list[str]:
|
||||
"""Prepare data list from a path or string."""
|
||||
if isinstance(path, str):
|
||||
path = path.strip()
|
||||
path = Path(path)
|
||||
data = _parse_ext(path.name, path.suffix, metadata)
|
||||
data = _parse_ext(path, metadata)
|
||||
data = _clean_dividers(data)
|
||||
return DASH_SPLIT_RE.split(data)
|
||||
return [data]
|
||||
|
||||
|
||||
def _paren_strip(value: str) -> str:
|
||||
def _grouping_operators_strip(value: str) -> str:
|
||||
"""Strip spaces and parens."""
|
||||
return value.strip().strip("()").strip()
|
||||
value = value.strip()
|
||||
value = value.strip("()").strip()
|
||||
value = value.strip("-").strip()
|
||||
value = value.strip("'").strip('"').strip()
|
||||
return value
|
||||
|
||||
|
||||
def _splicey_dicey(
|
||||
@ -71,7 +76,7 @@ def _splicey_dicey(
|
||||
if data_after := data[match.end() :].strip():
|
||||
data_ends.append(data_after)
|
||||
data_list[index:index] = data_ends
|
||||
return _paren_strip(value)
|
||||
return _grouping_operators_strip(value)
|
||||
|
||||
|
||||
def _match_original_format_and_scan_info(
|
||||
@ -83,10 +88,10 @@ def _match_original_format_and_scan_info(
|
||||
scan_info = match.group("scan_info")
|
||||
except IndexError:
|
||||
scan_info = None
|
||||
metadata["original_format"] = _paren_strip(original_format)
|
||||
metadata["original_format"] = _grouping_operators_strip(original_format)
|
||||
match_group = 1
|
||||
if scan_info:
|
||||
metadata["scan_info"] = _paren_strip(scan_info)
|
||||
metadata["scan_info"] = _grouping_operators_strip(scan_info)
|
||||
match_group = 0
|
||||
_splicey_dicey(data_list, index, match, match_group=match_group)
|
||||
|
||||
@ -112,14 +117,16 @@ def _pop_value_from_token(
|
||||
regex: Pattern,
|
||||
key: str,
|
||||
index: int = 0,
|
||||
) -> Match:
|
||||
) -> str:
|
||||
"""Search token for value, splice and assign to metadata."""
|
||||
data = data_list[index]
|
||||
match = regex.search(data)
|
||||
if match:
|
||||
value = _splicey_dicey(data_list, index, match, key)
|
||||
metadata[key] = value
|
||||
return match
|
||||
else:
|
||||
value = ""
|
||||
return value
|
||||
|
||||
|
||||
def _parse_item(
|
||||
@ -128,21 +135,25 @@ def _parse_item(
|
||||
regex: Pattern,
|
||||
key: str,
|
||||
start_index: int = 0,
|
||||
path: str = "",
|
||||
) -> int:
|
||||
"""Parse a value from the data list into metadata and alter the data list."""
|
||||
path_index = -1
|
||||
index = start_index
|
||||
dl_len = end_index = len(data_list)
|
||||
if index >= end_index:
|
||||
index = 0
|
||||
while index < end_index:
|
||||
match = _pop_value_from_token(data_list, metadata, regex, key, index)
|
||||
if match:
|
||||
value = _pop_value_from_token(data_list, metadata, regex, key, index)
|
||||
if value:
|
||||
if "key" == "issue":
|
||||
path_index = path.find(value)
|
||||
break
|
||||
index += 1
|
||||
if index > dl_len and start_index > 0:
|
||||
index = 0
|
||||
end_index = start_index
|
||||
return index
|
||||
return path_index
|
||||
|
||||
|
||||
def _pop_issue_from_text_fields(
|
||||
@ -156,7 +167,39 @@ def _pop_issue_from_text_fields(
|
||||
return data_list.pop(index)
|
||||
|
||||
|
||||
def _assign_remaining_groups(data_list: list[str], metadata: dict):
|
||||
TITLE_PRECEDING_KEYS = ("issue", "year", "volume")
|
||||
|
||||
|
||||
def _is_title_in_position(path, value, metadata):
|
||||
"""Does the title come after series and one other token if they exist."""
|
||||
# TODO this could be faster if indexes could be grabbed for these tokens
|
||||
# when they are extracted.
|
||||
title_index = path.find(value)
|
||||
|
||||
# Does a series come first.
|
||||
series = metadata.get("series")
|
||||
if not series:
|
||||
return False
|
||||
series_index = path.find(series)
|
||||
if title_index < series_index:
|
||||
return False
|
||||
|
||||
# If other tokens exist then they much precede the title.
|
||||
title_ok = False
|
||||
other_tokens_exist = False
|
||||
for preceding_key in TITLE_PRECEDING_KEYS:
|
||||
preceding_value = metadata.get(preceding_key)
|
||||
if not preceding_value:
|
||||
continue
|
||||
other_tokens_exist = True
|
||||
preceding_index = path.find(preceding_value)
|
||||
if title_index > preceding_index:
|
||||
title_ok = True
|
||||
break
|
||||
return title_ok or not other_tokens_exist
|
||||
|
||||
|
||||
def _assign_remaining_groups(data_list: list[str], metadata: dict, path: str):
|
||||
"""Assign series and title."""
|
||||
index = 0
|
||||
for key in _REMAINING_GROUP_KEYS:
|
||||
@ -167,7 +210,9 @@ def _assign_remaining_groups(data_list: list[str], metadata: dict):
|
||||
match = REMAINING_GROUP_RE.search(data) if data else None
|
||||
if match:
|
||||
value = _pop_issue_from_text_fields(data_list, metadata, index)
|
||||
value = _paren_strip(value)
|
||||
if key == "title" and not _is_title_in_position(path, value, metadata):
|
||||
continue
|
||||
value = _grouping_operators_strip(value)
|
||||
if value:
|
||||
metadata[key] = value
|
||||
else:
|
||||
@ -184,10 +229,17 @@ def _pickup_issue(remainders: list[str], metadata: dict) -> None:
|
||||
_parse_item(remainders, metadata, ISSUE_ANYWHERE_RE, "issue")
|
||||
|
||||
|
||||
def _log_progress(label, metadata, data_list):
|
||||
print(label + ":")
|
||||
pprint(metadata)
|
||||
pprint(data_list)
|
||||
|
||||
|
||||
def comicfn2dict(path: str | Path) -> dict[str, Any]:
|
||||
"""Parse the filename with a hierarchy of regexes."""
|
||||
metadata = {}
|
||||
data_list = _get_data_list(path, metadata)
|
||||
_log_progress("INITIAL", metadata, data_list)
|
||||
|
||||
# Parse paren tokens
|
||||
_parse_item(data_list, metadata, ISSUE_COUNT_RE, "issue_count")
|
||||
@ -206,26 +258,33 @@ def comicfn2dict(path: str | Path) -> dict[str, Any]:
|
||||
"scan_info",
|
||||
start_index=of_index + 1,
|
||||
)
|
||||
_log_progress("AFTER PAREN TOKENS", metadata, data_list)
|
||||
|
||||
# Parse regular tokens
|
||||
_parse_item(data_list, metadata, VOLUME_RE, "volume")
|
||||
_parse_item(data_list, metadata, ISSUE_NUMBER_RE, "issue")
|
||||
_parse_item(data_list, metadata, ISSUE_NUMBER_RE, "issue", path=str(path))
|
||||
_log_progress("AFTER REGULAR TOKENS", metadata, data_list)
|
||||
|
||||
# Pickup year if not gotten.
|
||||
if "year" not in metadata:
|
||||
_parse_item(data_list, metadata, YEAR_BEGIN_RE, "year")
|
||||
if "year" not in metadata:
|
||||
_parse_item(data_list, metadata, YEAR_END_RE, "year")
|
||||
_log_progress("AFTER YEAR PICKUP", metadata, data_list)
|
||||
|
||||
# Pickup issue if it's a standalone token
|
||||
if "issue" not in metadata:
|
||||
_parse_item(data_list, metadata, ISSUE_TOKEN_RE, "issue")
|
||||
|
||||
_log_progress("AFTER ISSUE PICKUP", metadata, data_list)
|
||||
|
||||
# Series and Title. Also looks for issue.
|
||||
_assign_remaining_groups(data_list, metadata)
|
||||
_assign_remaining_groups(data_list, metadata, str(path))
|
||||
_log_progress("AFTER SERIES AND TITLE", metadata, data_list)
|
||||
|
||||
# Final try for issue number.
|
||||
_pickup_issue(data_list, metadata)
|
||||
_log_progress("AFTER ISSUE PICKUP", metadata, data_list)
|
||||
|
||||
# Add Remainders
|
||||
if data_list:
|
||||
|
@ -72,4 +72,4 @@ ISSUE_BEGIN_RE = re_compile(r"^(" + _ISSUE_RE_EXP + r")\b")
|
||||
ISSUE_ANYWHERE_RE = re_compile(r"\b(" + _ISSUE_RE_EXP + r")\b")
|
||||
|
||||
# LONG STRINGS
|
||||
REMAINING_GROUP_RE = re_compile(r"^[\w].*[^\)]")
|
||||
REMAINING_GROUP_RE = re_compile(r"^[^\()].*[^\)]")
|
||||
|
@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
||||
|
||||
[tool.poetry]
|
||||
name = "comicfn2dict"
|
||||
version = "0.1.4"
|
||||
version = "0.2.0"
|
||||
description = "Parse common comic filenames and return a dict of metadata attributes. Includes a cli."
|
||||
license = "GPL-3.0-only"
|
||||
authors = ["AJ Slater <aj@slater.net>"]
|
||||
|
@ -80,13 +80,11 @@ FNS = {
|
||||
"original_format": "digital",
|
||||
},
|
||||
"Bardude - The Last Thing I Remember.cbz": {
|
||||
"series": "Bardude",
|
||||
"title": "The Last Thing I Remember",
|
||||
"series": "Bardude - The Last Thing I Remember",
|
||||
"ext": "cbz",
|
||||
},
|
||||
"Drunkguy - The Man Without Fear - 01.cbz": {
|
||||
"series": "Drunkguy",
|
||||
"title": "The Man Without Fear",
|
||||
"series": "Drunkguy - The Man Without Fear",
|
||||
"issue": "01",
|
||||
"ext": "cbz",
|
||||
},
|
||||
@ -125,9 +123,8 @@ FNS = {
|
||||
"scan_info": "Zone-Empire",
|
||||
"title": "Last Bullet",
|
||||
},
|
||||
"Jeremy John - A Big Long Title (2017) (digital-Minutement).cbz": {
|
||||
"series": "Jeremy John",
|
||||
"title": "A Big Long Title",
|
||||
"Jeremy John - Not A Title (2017) (digital-Minutement).cbz": {
|
||||
"series": "Jeremy John - Not A Title",
|
||||
"year": "2017",
|
||||
"ext": "cbz",
|
||||
"original_format": "digital",
|
||||
@ -243,3 +240,167 @@ FNS = {
|
||||
"ext": "cbz",
|
||||
},
|
||||
}
|
||||
|
||||
FNS.update( # Newly fixed.
|
||||
{
|
||||
"'Batman - Superman - World's Finest 022 (2024) (Webrip) (The Last Kryptonian-DCP).cbz": {
|
||||
"ext": "cbz",
|
||||
"issue": "022",
|
||||
"remainders": ("(The Last Kryptonian-DCP)",),
|
||||
"scan_info": "Webrip",
|
||||
"series": "Batman - Superman - World's Finest",
|
||||
"year": "2024",
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
FNS.update(
|
||||
{
|
||||
# Issue number starting with a letter requested in https://github.com/comictagger/comictagger/issues/543
|
||||
"batman #B01 title.cbz": {
|
||||
"ext": "cbz",
|
||||
"issue": "B01",
|
||||
"series": "batman",
|
||||
"title": "title",
|
||||
}, # Leading issue number is usually an alternate sequence number
|
||||
"52 action comics #2024.cbz": {
|
||||
"ext": "cbz",
|
||||
"issue": "2024",
|
||||
"series": "action comics",
|
||||
"alternate": "52",
|
||||
}, # 4 digit issue number
|
||||
"action comics 1024.cbz": {
|
||||
"ext": "cbz",
|
||||
"issue": "1024",
|
||||
"series": "action comics",
|
||||
}, # Only the issue number. CT ensures that the series always has a value if possible
|
||||
"#52.cbz": {
|
||||
"ext": "cbz",
|
||||
"issue": "52",
|
||||
"series": "52",
|
||||
}, # CT treats double-underscore the same as double-dash
|
||||
"Monster_Island_v1_#2__repaired__c2c.cbz": {
|
||||
"ext": "cbz",
|
||||
"issue": "2",
|
||||
"series": "Monster Island",
|
||||
"volume": "1",
|
||||
}, # I'm not sure there's a right way to parse this. This might also be a made-up filename; I don't remember
|
||||
"Super Strange Yarns (1957) #92 (1969).cbz": {
|
||||
"ext": "cbz",
|
||||
"issue": "92",
|
||||
"series": "Super Strange Yarns",
|
||||
"volume": "1957",
|
||||
"year": "1969",
|
||||
}, # Extra - in the series
|
||||
" X-Men-V1-#067.cbr": {
|
||||
"ext": "cbr",
|
||||
"issue": "067",
|
||||
"series": "X-Men",
|
||||
"volume": "1",
|
||||
}, # CT only separates this into a title if the '-' is attached to the previous word eg 'aquaman- Green Arrow'. @bpepple opened a ticket for this https://github.com/ajslater/comicfn2dict/issues/1 already
|
||||
"Aquaman - Green Arrow - Deep Target #01 (of 07) (2021).cbr": {
|
||||
"ext": "cbr",
|
||||
"issue": "01",
|
||||
"series": "Aquaman - Green Arrow - Deep Target",
|
||||
"year": "2021",
|
||||
"issue_count": "7",
|
||||
},
|
||||
"Batman_-_Superman_#020_(2021).cbr": {
|
||||
"ext": "cbr",
|
||||
"issue": "020",
|
||||
"series": "Batman - Superman",
|
||||
"year": "2021",
|
||||
},
|
||||
"Free Comic Book Day - Avengers.Hulk (2021).cbz": {
|
||||
"ext": "cbz",
|
||||
"series": "Free Comic Book Day - Avengers Hulk",
|
||||
"year": "2021",
|
||||
}, # CT assumes the volume is also the issue number if it can't find an issue number
|
||||
"Avengers By Brian Michael Bendis volume 03 (2013).cbz": {
|
||||
"ext": "cbz",
|
||||
"issue": "3",
|
||||
"series": "Avengers By Brian Michael Bendis",
|
||||
"volume": "03",
|
||||
"year": "2013",
|
||||
}, # Publishers like to re-print some of their annuals using this format for the year
|
||||
"Batman '89 (2021) .cbr": {
|
||||
"ext": "cbr",
|
||||
"series": "Batman '89",
|
||||
"year": "2021",
|
||||
}, # CT has extra processing to re-attach the year in this case
|
||||
"Blade Runner Free Comic Book Day 2021 (2021).cbr": {
|
||||
"ext": "cbr",
|
||||
"series": "Blade Runner Free Comic Book Day 2021",
|
||||
"year": "2021",
|
||||
}, # CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series)
|
||||
"Bloodshot Book 03 (2020).cbr": {
|
||||
"ext": "cbr",
|
||||
"issue": "03",
|
||||
"series": "Bloodshot",
|
||||
"title": "Book 03",
|
||||
"volume": "03",
|
||||
"year": "2020",
|
||||
}, # CT checks for the following '(of 06)' after the '03' and marks it as the volume
|
||||
"Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021).cbr": {
|
||||
"ext": "cbr",
|
||||
"issue": "008",
|
||||
"series": "Elephantmen 2259",
|
||||
"title": "Simple Truth",
|
||||
"volume": "03",
|
||||
"year": "2021",
|
||||
"volume_count": "06",
|
||||
}, # CT catches the year
|
||||
"Marvel Previews #002 (January 2022).cbr": {
|
||||
"ext": "cbr",
|
||||
"issue": "002",
|
||||
"series": "Marvel Previews",
|
||||
"year": "2022",
|
||||
}, # c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder
|
||||
"Marvel Two In One V1 #090 c2c.cbr": {
|
||||
"ext": "cbr",
|
||||
"issue": "090",
|
||||
"series": "Marvel Two In One",
|
||||
"publisher": "Marvel",
|
||||
"volume": "1",
|
||||
}, # This made the parser in CT much more complicated. It's understandable that this isn't parsed on the first few iterations of this project
|
||||
"Star Wars - War of the Bounty Hunters - IG-88 (2021).cbz": {
|
||||
"ext": "cbz",
|
||||
"series": "Star Wars - War of the Bounty Hunters - IG-88",
|
||||
"year": "2021",
|
||||
}, # The addition of the '#1' turns this into the same as 'Aquaman - Green Arrow - Deep Target' above
|
||||
"Star Wars - War of the Bounty Hunters - IG-88 #1 (2021).cbz": {
|
||||
"ext": "cbz",
|
||||
"issue": "1",
|
||||
"series": "Star Wars - War of the Bounty Hunters - IG-88",
|
||||
"year": "2021",
|
||||
}, # CT treats '[]' as equivalent to '()', catches DC as a publisher and 'Sep-Oct 1951' as dates and removes them. CT doesn't catch the digital though so that could be better but I blame whoever made this atrocious filename
|
||||
"Wonder Woman #49 DC Sep-Oct 1951 digital [downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire).cbz": {
|
||||
"ext": "cbz",
|
||||
"issue": "49",
|
||||
"series": "Wonder Woman",
|
||||
"title": "digital",
|
||||
"publisher": "DC",
|
||||
"year": "1951",
|
||||
}, # CT notices that this is a full date, CT doesn't actually return the month or day though just removes it
|
||||
"X-Men, 2021-08-04 (#02).cbz": {
|
||||
"ext": "cbz",
|
||||
"issue": "02",
|
||||
"series": "X-Men",
|
||||
"year": "2021",
|
||||
}, # CT treats ':' the same as '-' but here the ':' is attached to 'Now' which CT sees as a title separation
|
||||
"Cory Doctorow's Futuristic Tales of the Here and Now: Anda's Game #001 (2007).cbz": {
|
||||
"ext": "cbz",
|
||||
"issue": "001",
|
||||
"series": "Cory Doctorow's Futuristic Tales of the Here and Now",
|
||||
"title": "Anda's Game",
|
||||
"year": "2007",
|
||||
}, # This is a contrived test case. I've never seen this I just wanted to handle it with my parser
|
||||
"Cory Doctorow's Futuristic Tales of the Here and Now #0.0.1 (2007).cbz": {
|
||||
"ext": "cbz",
|
||||
"issue": "0.1",
|
||||
"series": "Cory Doctorow's Futuristic Tales of the Here and Now",
|
||||
"year": "2007",
|
||||
"issue_count": "",
|
||||
},
|
||||
}
|
||||
)
|
||||
|
Loading…
Reference in New Issue
Block a user