From c28dc19df632891a8c64b3afc3b4b5cacf60d1f4 Mon Sep 17 00:00:00 2001 From: Timmy Welch Date: Sat, 29 Jun 2024 18:43:20 -0700 Subject: [PATCH] Improve filename parsing --- comicapi/filenamelexer.py | 18 +++++++----------- comicapi/filenameparser.py | 12 +++++++----- comicapi/utils.py | 2 ++ testing/filenames.py | 36 +++++++++++++++++++++++++++++++++++- 4 files changed, 51 insertions(+), 17 deletions(-) diff --git a/comicapi/filenamelexer.py b/comicapi/filenamelexer.py index 1726e15..e08a628 100644 --- a/comicapi/filenamelexer.py +++ b/comicapi/filenamelexer.py @@ -6,6 +6,7 @@ import calendar import os import unicodedata from enum import Enum, auto +from itertools import chain from typing import Any, Callable, Protocol @@ -307,21 +308,20 @@ def lex_text(lex: Lexer) -> LexerFunc: if is_alpha_numeric(r): if r.isnumeric(): # E.g. v1 word = lex.input[lex.start : lex.pos] - if word.casefold() in key and key[word.casefold()] == ItemType.InfoSpecifier: + if key.get(word.casefold(), None) == ItemType.InfoSpecifier: lex.backup() lex.emit(key[word.casefold()]) return lex_filename else: - if r == "'" and lex.peek() == "s": + if r == "'" and lex.peek().casefold() == "s": lex.get() else: lex.backup() word = lex.input[lex.start : lex.pos + 1] - if word.casefold() == "vol" and lex.peek() == ".": - lex.get() - word = lex.input[lex.start : lex.pos + 1] if word.casefold() in key: + if key[word.casefold()] in (ItemType.Honorific, ItemType.InfoSpecifier): + lex.accept(".") lex.emit(key[word.casefold()]) elif cal(word): lex.emit(ItemType.Calendar) @@ -332,12 +332,8 @@ def lex_text(lex: Lexer) -> LexerFunc: return lex_filename -def cal(value: str) -> set[Any]: - month_abbr = [i for i, x in enumerate(calendar.month_abbr) if x == value.title()] - month_name = [i for i, x in enumerate(calendar.month_name) if x == value.title()] - day_abbr = [i for i, x in enumerate(calendar.day_abbr) if x == value.title()] - day_name = [i for i, x in enumerate(calendar.day_name) if x == value.title()] - return set(month_abbr + month_name + day_abbr + day_name) +def cal(value: str) -> bool: + return value.title() in set(chain(calendar.month_abbr, calendar.month_name, calendar.day_abbr, calendar.day_name)) def lex_number(lex: Lexer) -> LexerFunc | None: diff --git a/comicapi/filenameparser.py b/comicapi/filenameparser.py index 7a3e2c2..979f346 100644 --- a/comicapi/filenameparser.py +++ b/comicapi/filenameparser.py @@ -417,10 +417,14 @@ class Parser: self.remove_from_remainder.append(filenamelexer.ItemType.FCBD) self.input = lexer_result - for i, item in enumerate(self.input): + self.error = None + for i, item in list(enumerate(self.input)): if item.typ == filenamelexer.ItemType.IssueNumber: self.issue_number_at = i self.issue_number_marked = True + if item.typ == filenamelexer.ItemType.Error: + self.error = item + self.input.remove(self.error) # Get returns the next Item in the input. def get(self) -> filenamelexer.Item: @@ -1043,10 +1047,9 @@ def parse_finish(p: Parser) -> None: if item in p.title_parts: p.title_parts.remove(item) + p.filename_info["series"] = p.filename_info.get("issue", "") if p.series_parts: p.filename_info["series"] = join_title(p.series_parts) - else: - p.filename_info["series"] = p.filename_info.get("issue", "") if "free comic book" in p.filename_info["series"].casefold(): p.filename_info["fcbd"] = True @@ -1092,7 +1095,6 @@ def get_remainder(p: Parser) -> str: elif ( item.typ in [ - filenamelexer.ItemType.Space, filenamelexer.ItemType.RightBrace, filenamelexer.ItemType.RightParen, filenamelexer.ItemType.RightSBrace, @@ -1111,7 +1113,7 @@ def get_remainder(p: Parser) -> str: # Remove empty parentheses remainder = re.sub(r"[\[{(]+[]})]+", "", remainder) - return remainder.strip() + return remainder.strip().rstrip("[{(") def parse_info_specifier(p: Parser) -> ParserFunc: diff --git a/comicapi/utils.py b/comicapi/utils.py index 78f0206..b522f20 100644 --- a/comicapi/utils.py +++ b/comicapi/utils.py @@ -162,6 +162,8 @@ def parse_filename( remove_publisher=remove_publisher, protofolius_issue_number_scheme=protofolius_issue_number_scheme, ) + if p.error: + logger.info("Issue parsing filename: '%s': %s ", filename, p.error.val) fni = p.filename_info elif parser == Parser.COMICFN2DICT: fn2d = comicfn2dict(filename) diff --git a/testing/filenames.py b/testing/filenames.py index 29e1d51..9c7da07 100644 --- a/testing/filenames.py +++ b/testing/filenames.py @@ -74,7 +74,7 @@ names: list[tuple[str, str, dict[str, str | bool], tuple[bool, bool]]] = [ ), ( "Michel Vaillant #5 Nr. 13 aan de start", - "Shortened word followed by a number eg No. 13, Mr. 13", + "Shortened word followed by a number eg No. 13, Nr. 13", { "issue": "5", "series": "Michel Vaillant", @@ -276,6 +276,23 @@ names: list[tuple[str, str, dict[str, str | bool], tuple[bool, bool]]] = [ }, (False, True), ), + ( + "batman #3 title (DC.cbz", + "publisher in title", + { + "archive": "cbz", + "issue": "3", + "series": "batman", + "title": "title", + "publisher": "DC", + "volume": "", + "year": "", + "remainder": "", + "issue_count": "", + "alternate": "", + }, + (False, True), + ), ( "ms. Marvel #3.cbz", "honorific and publisher in series", @@ -293,6 +310,23 @@ names: list[tuple[str, str, dict[str, str | bool], tuple[bool, bool]]] = [ }, (False, False), ), + ( + "Dr. Doom And The Masters Of Evil #1 (2009).cbz", + "honorific and publisher in series", + { + "archive": "cbz", + "issue": "1", + "series": "Dr. Doom And The Masters Of Evil", + "title": "", + "publisher": "", + "volume": "", + "year": "2009", + "remainder": "", + "issue_count": "", + "alternate": "", + }, + (False, False), + ), ( f"action comics #{datetime.datetime.now().year}.cbz", "issue number is current year (digits == 4)",