Improve filename parsing

This commit is contained in:
Timmy Welch 2024-06-29 18:43:20 -07:00
parent 56d8c507e2
commit c28dc19df6
4 changed files with 51 additions and 17 deletions

View File

@ -6,6 +6,7 @@ import calendar
import os
import unicodedata
from enum import Enum, auto
from itertools import chain
from typing import Any, Callable, Protocol
@ -307,21 +308,20 @@ def lex_text(lex: Lexer) -> LexerFunc:
if is_alpha_numeric(r):
if r.isnumeric(): # E.g. v1
word = lex.input[lex.start : lex.pos]
if word.casefold() in key and key[word.casefold()] == ItemType.InfoSpecifier:
if key.get(word.casefold(), None) == ItemType.InfoSpecifier:
lex.backup()
lex.emit(key[word.casefold()])
return lex_filename
else:
if r == "'" and lex.peek() == "s":
if r == "'" and lex.peek().casefold() == "s":
lex.get()
else:
lex.backup()
word = lex.input[lex.start : lex.pos + 1]
if word.casefold() == "vol" and lex.peek() == ".":
lex.get()
word = lex.input[lex.start : lex.pos + 1]
if word.casefold() in key:
if key[word.casefold()] in (ItemType.Honorific, ItemType.InfoSpecifier):
lex.accept(".")
lex.emit(key[word.casefold()])
elif cal(word):
lex.emit(ItemType.Calendar)
@ -332,12 +332,8 @@ def lex_text(lex: Lexer) -> LexerFunc:
return lex_filename
# Index-collecting form of ``cal``. Another ``def cal`` follows directly below
# in this listing; if both are defined, the later one takes precedence.
def cal(value: str) -> set[Any]:
# Each list holds the positions at which the title-cased value appears in the
# matching ``calendar`` sequence (abbreviated/full month names, then
# abbreviated/full day names).
month_abbr = [i for i, x in enumerate(calendar.month_abbr) if x == value.title()]
month_name = [i for i, x in enumerate(calendar.month_name) if x == value.title()]
day_abbr = [i for i, x in enumerate(calendar.day_abbr) if x == value.title()]
day_name = [i for i, x in enumerate(calendar.day_name) if x == value.title()]
# Merge all matching positions; an empty set means nothing matched, so the
# result can be used directly in a truthiness test.
return set(month_abbr + month_name + day_abbr + day_name)
def cal(value: str) -> bool:
    """Return True when *value* is a month or weekday name or abbreviation.

    The comparison uses the title-cased form of *value*, so inputs such as
    "jan", "JANUARY" or "mon" all match. The names are read from the
    ``calendar`` module on every call and therefore follow the current locale.
    """
    candidate = value.title()
    # calendar.month_name[0] and calendar.month_abbr[0] are empty-string
    # placeholders, so an empty value has to be rejected explicitly or it
    # would be reported as a calendar word.
    if not candidate:
        return False
    # A single membership test needs no intermediate set; scanning the
    # chained sequences stops at the first match.
    return candidate in chain(calendar.month_abbr, calendar.month_name, calendar.day_abbr, calendar.day_name)
def lex_number(lex: Lexer) -> LexerFunc | None:

View File

@ -417,10 +417,14 @@ class Parser:
self.remove_from_remainder.append(filenamelexer.ItemType.FCBD)
self.input = lexer_result
for i, item in enumerate(self.input):
self.error = None
for i, item in list(enumerate(self.input)):
if item.typ == filenamelexer.ItemType.IssueNumber:
self.issue_number_at = i
self.issue_number_marked = True
if item.typ == filenamelexer.ItemType.Error:
self.error = item
self.input.remove(self.error)
# Get returns the next Item in the input.
def get(self) -> filenamelexer.Item:
@ -1043,10 +1047,9 @@ def parse_finish(p: Parser) -> None:
if item in p.title_parts:
p.title_parts.remove(item)
p.filename_info["series"] = p.filename_info.get("issue", "")
if p.series_parts:
p.filename_info["series"] = join_title(p.series_parts)
else:
p.filename_info["series"] = p.filename_info.get("issue", "")
if "free comic book" in p.filename_info["series"].casefold():
p.filename_info["fcbd"] = True
@ -1092,7 +1095,6 @@ def get_remainder(p: Parser) -> str:
elif (
item.typ
in [
filenamelexer.ItemType.Space,
filenamelexer.ItemType.RightBrace,
filenamelexer.ItemType.RightParen,
filenamelexer.ItemType.RightSBrace,
@ -1111,7 +1113,7 @@ def get_remainder(p: Parser) -> str:
# Remove empty parentheses
remainder = re.sub(r"[\[{(]+[]})]+", "", remainder)
return remainder.strip()
return remainder.strip().rstrip("[{(")
def parse_info_specifier(p: Parser) -> ParserFunc:

View File

@ -162,6 +162,8 @@ def parse_filename(
remove_publisher=remove_publisher,
protofolius_issue_number_scheme=protofolius_issue_number_scheme,
)
if p.error:
logger.info("Issue parsing filename: '%s': %s ", filename, p.error.val)
fni = p.filename_info
elif parser == Parser.COMICFN2DICT:
fn2d = comicfn2dict(filename)

View File

@ -74,7 +74,7 @@ names: list[tuple[str, str, dict[str, str | bool], tuple[bool, bool]]] = [
),
(
"Michel Vaillant #5 Nr. 13 aan de start",
"Shortened word followed by a number eg No. 13, Mr. 13",
"Shortened word followed by a number eg No. 13, Nr. 13",
{
"issue": "5",
"series": "Michel Vaillant",
@ -276,6 +276,23 @@ names: list[tuple[str, str, dict[str, str | bool], tuple[bool, bool]]] = [
},
(False, True),
),
(
"batman #3 title (DC.cbz",
"publisher in title",
{
"archive": "cbz",
"issue": "3",
"series": "batman",
"title": "title",
"publisher": "DC",
"volume": "",
"year": "",
"remainder": "",
"issue_count": "",
"alternate": "",
},
(False, True),
),
(
"ms. Marvel #3.cbz",
"honorific and publisher in series",
@ -293,6 +310,23 @@ names: list[tuple[str, str, dict[str, str | bool], tuple[bool, bool]]] = [
},
(False, False),
),
(
"Dr. Doom And The Masters Of Evil #1 (2009).cbz",
"honorific and publisher in series",
{
"archive": "cbz",
"issue": "1",
"series": "Dr. Doom And The Masters Of Evil",
"title": "",
"publisher": "",
"volume": "",
"year": "2009",
"remainder": "",
"issue_count": "",
"alternate": "",
},
(False, False),
),
(
f"action comics #{datetime.datetime.now().year}.cbz",
"issue number is current year (digits == 4)",