From f6be7919d725e06f9da7a9793784e4832e1a5971 Mon Sep 17 00:00:00 2001 From: Timmy Welch Date: Wed, 6 Sep 2023 04:50:05 -0400 Subject: [PATCH 1/2] Implement support for protofolius's permission scheme --- comicapi/comicarchive.py | 70 ++++++++----------- comicapi/filenamelexer.py | 39 +++++++++-- comicapi/filenameparser.py | 31 +++++++- comicapi/issuestring.py | 52 +++++++++----- comicapi/utils.py | 46 ++++++++++++ comictaggerlib/ctsettings/file.py | 21 +++++- .../ctsettings/settngs_namespace.py | 3 +- comictaggerlib/settingswindow.py | 67 ++++++++++++++++++ comictaggerlib/ui/settingswindow.ui | 40 +++++++++++ testing/filenames.py | 51 +++++++++++--- tests/filenameparser_test.py | 15 ++-- tests/issuestring_test.py | 3 + 12 files changed, 352 insertions(+), 86 deletions(-) diff --git a/comicapi/comicarchive.py b/comicapi/comicarchive.py index 87a234d..936150f 100644 --- a/comicapi/comicarchive.py +++ b/comicapi/comicarchive.py @@ -22,7 +22,7 @@ import shutil import sys from typing import cast -from comicapi import filenamelexer, filenameparser, utils +from comicapi import utils from comicapi.archivers import Archiver, UnknownArchiver, ZipArchiver from comicapi.comet import CoMet from comicapi.comicbookinfo import ComicBookInfo @@ -541,53 +541,39 @@ class ComicArchive: remove_fcbd: bool = False, remove_publisher: bool = False, split_words: bool = False, + allow_issue_start_with_letter: bool = False, + protofolius_issue_number_scheme: bool = False, ) -> GenericMetadata: metadata = GenericMetadata() - filename = self.path.name - if split_words: - import wordninja + filename_info = utils.parse_filename( + self.path.name, + complicated_parser=complicated_parser, + remove_c2c=remove_c2c, + remove_fcbd=remove_fcbd, + remove_publisher=remove_publisher, + split_words=split_words, + allow_issue_start_with_letter=allow_issue_start_with_letter, + protofolius_issue_number_scheme=protofolius_issue_number_scheme, + ) + metadata.alternate_number = 
utils.xlate(filename_info.get("alternate", None)) + metadata.issue = utils.xlate(filename_info.get("issue", None)) + metadata.issue_count = utils.xlate_int(filename_info.get("issue_count", None)) + metadata.publisher = utils.xlate(filename_info.get("publisher", None)) + metadata.series = utils.xlate(filename_info.get("series", None)) + metadata.title = utils.xlate(filename_info.get("title", None)) + metadata.volume = utils.xlate_int(filename_info.get("volume", None)) + metadata.volume_count = utils.xlate_int(filename_info.get("volume_count", None)) + metadata.year = utils.xlate_int(filename_info.get("year", None)) - filename = " ".join(wordninja.split(self.path.stem)) + self.path.suffix - - if complicated_parser: - lex = filenamelexer.Lex(filename) - p = filenameparser.Parse( - lex.items, remove_c2c=remove_c2c, remove_fcbd=remove_fcbd, remove_publisher=remove_publisher - ) - metadata.alternate_number = utils.xlate(p.filename_info["alternate"]) - metadata.issue = utils.xlate(p.filename_info["issue"]) - metadata.issue_count = utils.xlate_int(p.filename_info["issue_count"]) - metadata.publisher = utils.xlate(p.filename_info["publisher"]) - metadata.series = utils.xlate(p.filename_info["series"]) - metadata.title = utils.xlate(p.filename_info["title"]) - metadata.volume = utils.xlate_int(p.filename_info["volume"]) - metadata.volume_count = utils.xlate_int(p.filename_info["volume_count"]) - metadata.year = utils.xlate_int(p.filename_info["year"]) - - metadata.scan_info = utils.xlate(p.filename_info["remainder"]) - metadata.format = "FCBD" if p.filename_info["fcbd"] else None - if p.filename_info["annual"]: - metadata.format = "Annual" - else: - fnp = filenameparser.FileNameParser() - fnp.parse_filename(filename) - - if fnp.issue: - metadata.issue = fnp.issue - if fnp.series: - metadata.series = fnp.series - if fnp.volume: - metadata.volume = utils.xlate_int(fnp.volume) - if fnp.year: - metadata.year = utils.xlate_int(fnp.year) - if fnp.issue_count: - 
metadata.issue_count = utils.xlate_int(fnp.issue_count) - if fnp.remainder: - metadata.scan_info = fnp.remainder + metadata.scan_info = utils.xlate(filename_info.get("remainder", None)) + metadata.format = "FCBD" if filename_info.get("fcbd", None) else None + if filename_info.get("annual", None): + metadata.format = "Annual" + if filename_info.get("format", None): + metadata.format = filename_info["format"] metadata.is_empty = False - return metadata def export_as_zip(self, zip_filename: pathlib.Path) -> bool: diff --git a/comicapi/filenamelexer.py b/comicapi/filenamelexer.py index 0b40954..868fcb2 100644 --- a/comicapi/filenamelexer.py +++ b/comicapi/filenamelexer.py @@ -87,7 +87,7 @@ class Item: class Lexer: - def __init__(self, string: str) -> None: + def __init__(self, string: str, allow_issue_start_with_letter: bool = False) -> None: self.input: str = string # The string being scanned # The next lexing function to enter self.state: Callable[[Lexer], Callable | None] | None = None # type: ignore[type-arg] @@ -98,6 +98,7 @@ class Lexer: self.brace_depth: int = 0 # Nesting depth of { } self.sbrace_depth: int = 0 # Nesting depth of [ ] self.items: list[Item] = [] + self.allow_issue_start_with_letter = allow_issue_start_with_letter # Next returns the next rune in the input. 
def get(self) -> str: @@ -196,7 +197,7 @@ def lex_filename(lex: Lexer) -> Callable[[Lexer], Callable | None] | None: # ty return lex_space elif r == ".": r = lex.peek() - if r < "0" or "9" < r: + if not r.isdigit(): lex.emit(ItemType.Dot) return lex_filename @@ -204,15 +205,17 @@ def lex_filename(lex: Lexer) -> Callable[[Lexer], Callable | None] | None: # ty return lex_number elif r == "'": r = lex.peek() - if r in "0123456789": + if r.isdigit(): return lex_number lex.emit(ItemType.Text) # TODO: Change to Text elif "0" <= r <= "9": lex.backup() return lex_number elif r == "#": - if "0" <= lex.peek() <= "9": - return lex_number + if lex.allow_issue_start_with_letter and is_alpha_numeric(lex.peek()): + return lex_issue_number + elif lex.peek().isdigit() or lex.peek() in "-+.": + return lex_issue_number lex.emit(ItemType.Symbol) elif is_operator(r): if r == "-" and lex.peek() == "-": @@ -329,6 +332,28 @@ def lex_number(lex: Lexer) -> Callable[[Lexer], Callable | None] | None: # type return lex_filename +def lex_issue_number(lex: Lexer) -> Callable[[Lexer], Callable | None] | None: # type: ignore[type-arg] + # Only called when lex.input[lex.start] == "#" + original_start = lex.pos + found_number = False + while True: + r = lex.get() + if is_alpha_numeric(r): + if r.isnumeric(): + found_number = True + else: + lex.backup() + break + + if not found_number: + lex.pos = original_start + lex.emit(ItemType.Symbol) + else: + lex.emit(ItemType.IssueNumber) + + return lex_filename + + def is_space(character: str) -> bool: return character in "_ \t" @@ -346,7 +371,7 @@ def is_symbol(character: str) -> bool: return unicodedata.category(character)[0] in "PS" -def Lex(filename: str) -> Lexer: - lex = Lexer(string=os.path.basename(filename)) +def Lex(filename: str, allow_issue_start_with_letter: bool = False) -> Lexer: + lex = Lexer(os.path.basename(filename), allow_issue_start_with_letter) lex.run() return lex diff --git a/comicapi/filenameparser.py b/comicapi/filenameparser.py 
index 7c6681e..85d6262 100644 --- a/comicapi/filenameparser.py +++ b/comicapi/filenameparser.py @@ -324,6 +324,21 @@ class FilenameInfo(TypedDict, total=False): volume: str volume_count: str year: str + format: str + + +protofolius_issue_number_scheme = { + "B": "biography/best of", + "C": "compact edition", + "E": "entertainment/puzzle edition", + "F": "family book edition", + "J": "jubileum (anniversary) edition", + "P": "pocket edition", + "N": "newly brought out/restyled edition", + "O": "old editions (or oblong format)", + "S": "special edition", + "X": "X-rated edition", +} eof = filenamelexer.Item(filenamelexer.ItemType.EOF, -1, "") @@ -341,6 +356,7 @@ class Parser: remove_c2c: bool = False, remove_fcbd: bool = False, remove_publisher: bool = False, + protofolius_issue_number_scheme: bool = False, ) -> None: self.state: Callable[[Parser], Callable | None] | None = None # type: ignore[type-arg] self.pos = -1 @@ -366,6 +382,7 @@ class Parser: self.remove_c2c = remove_c2c self.remove_fcbd = remove_fcbd self.remove_publisher = remove_publisher + self.protofolius_issue_number_scheme = protofolius_issue_number_scheme self.remove_from_remainder = [] if remove_c2c: @@ -923,6 +940,16 @@ def resolve_issue(p: Parser) -> None: if "volume" in p.filename_info: p.filename_info["issue"] = p.filename_info["volume"] + if ( + "issue" in p.filename_info + and p.protofolius_issue_number_scheme + and len(p.filename_info["issue"]) > 1 + and p.filename_info["issue"][0].isalpha() + and p.filename_info["issue"][0].upper() in protofolius_issue_number_scheme + and p.filename_info["issue"][1].isnumeric() + ): + p.filename_info["format"] = protofolius_issue_number_scheme[p.filename_info["issue"][0].upper()] + def parse_finish(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ignore[type-arg] resolve_year(p) @@ -941,7 +968,7 @@ def parse_finish(p: Parser) -> Callable[[Parser], Callable | None] | None: # ty p.filename_info["series"] = join_title(p.series_parts)
p.used_items.extend(p.series_parts) else: - p.filename_info["series"] = p.filename_info["issue"] + p.filename_info["series"] = p.filename_info.get("issue", "") if "free comic book" in p.filename_info["series"].casefold(): p.filename_info["fcbd"] = True @@ -1137,6 +1164,7 @@ def Parse( remove_c2c: bool = False, remove_fcbd: bool = False, remove_publisher: bool = False, + protofolius_issue_number_scheme: bool = False, ) -> Parser: p = Parser( lexer_result=lexer_result, @@ -1144,6 +1172,7 @@ def Parse( remove_c2c=remove_c2c, remove_fcbd=remove_fcbd, remove_publisher=remove_publisher, + protofolius_issue_number_scheme=protofolius_issue_number_scheme, ) p.run() return p diff --git a/comicapi/issuestring.py b/comicapi/issuestring.py index 149e5b1..b2cda1d 100644 --- a/comicapi/issuestring.py +++ b/comicapi/issuestring.py @@ -32,6 +32,7 @@ class IssueString: self.num = None self.suffix = "" + self.prefix = "" if text is None: return @@ -41,18 +42,25 @@ class IssueString: if len(text) == 0: return + for idx, r in enumerate(text): + if not r.isalpha(): + break + self.prefix = text[:idx] + self.num, self.suffix = self.get_number(text[idx:]) + + def get_number(self, text: str) -> tuple[float | None, str]: + num, suffix = None, "" + start = 0 # skip the minus sign if it's first - if text[0] == "-": + if text[0] in ("-", "+"): start = 1 - else: - start = 0 # if it's still not numeric at start skip it if text[start].isdigit() or text[start] == ".": # walk through the string, look for split point (the first non-numeric) decimal_count = 0 for idx in range(start, len(text)): - if text[idx] not in "0123456789.": + if not (text[idx].isdigit() or text[idx] in "."): break # special case: also split on second "." 
if text[idx] == ".": @@ -71,42 +79,48 @@ class IssueString: if idx == 1 and start == 1: idx = 0 - part1 = text[0:idx] - part2 = text[idx : len(text)] - - if part1 != "": - self.num = float(part1) - self.suffix = part2 + if text[0:idx]: + num = float(text[0:idx]) + suffix = text[idx : len(text)] else: - self.suffix = text + suffix = text + return num, suffix def as_string(self, pad: int = 0) -> str: - # return the float, left side zero-padded, with suffix attached + """return the number, left side zero-padded, with suffix attached""" + + # if there is no number return the text if self.num is None: - return self.suffix + return self.prefix + self.suffix + # negative is added back in last negative = self.num < 0 - num_f = abs(self.num) + # used for padding num_int = int(num_f) - num_s = str(num_int) - if float(num_int) != num_f: - num_s = str(num_f) - num_s += self.suffix + if num_f.is_integer(): + num_s = str(num_int) + else: + num_s = str(num_f) # create padding padding = "" + # we only pad the whole number part, we don't care about the decimal length = len(str(num_int)) if length < pad: padding = "0" * (pad - length) + # add the padding to the front num_s = padding + num_s + + # finally add the negative back in if negative: num_s = "-" + num_s - return num_s + # return the prefix + formatted number + suffix + return self.prefix + num_s + self.suffix def as_float(self) -> float | None: # return the float, with no suffix diff --git a/comicapi/utils.py b/comicapi/utils.py index f2c362e..b7d4075 100644 --- a/comicapi/utils.py +++ b/comicapi/utils.py @@ -26,6 +26,7 @@ from shutil import which # noqa: F401 from typing import Any import comicapi.data +from comicapi import filenamelexer, filenameparser try: import icu @@ -60,6 +61,51 @@ def os_sorted(lst: Iterable) -> Iterable: return sorted(lst, key=key) +def parse_filename( + filename: str, + complicated_parser: bool = False, + remove_c2c: bool = False, + remove_fcbd: bool = False, + remove_publisher: bool = False, + 
split_words: bool = False, + allow_issue_start_with_letter: bool = False, + protofolius_issue_number_scheme: bool = False, +) -> filenameparser.FilenameInfo: + if split_words: + import wordninja + + filename, ext = os.path.splitext(filename) + filename = " ".join(wordninja.split(filename)) + ext + + if complicated_parser: + lex = filenamelexer.Lex(filename, allow_issue_start_with_letter) + p = filenameparser.Parse( + lex.items, + remove_c2c=remove_c2c, + remove_fcbd=remove_fcbd, + remove_publisher=remove_publisher, + protofolius_issue_number_scheme=protofolius_issue_number_scheme, + ) + return p.filename_info + else: + fnp = filenameparser.FileNameParser() + fnp.parse_filename(filename) + fni = filenameparser.FilenameInfo() + if fnp.issue: + fni["issue"] = fnp.issue + if fnp.series: + fni["series"] = fnp.series + if fnp.volume: + fni["volume"] = fnp.volume + if fnp.year: + fni["year"] = fnp.year + if fnp.issue_count: + fni["issue_count"] = fnp.issue_count + if fnp.remainder: + fni["remainder"] = fnp.remainder + return fni + + def combine_notes(existing_notes: str | None, new_notes: str | None, split: str) -> str: split_notes, split_str, untouched_notes = (existing_notes or "").rpartition(split) if split_notes or split_str: diff --git a/comictaggerlib/ctsettings/file.py b/comictaggerlib/ctsettings/file.py index 5a3e0db..860fcc4 100644 --- a/comictaggerlib/ctsettings/file.py +++ b/comictaggerlib/ctsettings/file.py @@ -119,6 +119,18 @@ def filename(parser: settngs.Manager) -> None: action=argparse.BooleanOptionalAction, help="Attempts to remove publisher names from filenames, currently limited to Marvel and DC. Requires --complicated-parser", ) + parser.add_setting( + "--protofolius-issue-number-scheme", + default=False, + action=argparse.BooleanOptionalAction, + help="Use an issue number scheme devised by protofolius for encoding format information as a letter in front of an issue number. Implies --allow-issue-start-with-letter.
Requires --complicated-parser", + ) + parser.add_setting( + "--allow-issue-start-with-letter", + default=False, + action=argparse.BooleanOptionalAction, + help="Allows an issue number to start with a single letter (e.g. '#X01'). Requires --complicated-parser", + ) def talker(parser: settngs.Manager) -> None: @@ -220,7 +232,7 @@ def autotag(parser: settngs.Manager) -> None: parser.add_setting("remove_archive_after_successful_match", default=False, cmdline=False) -def validate_file_settings(config: settngs.Config[ct_ns]) -> settngs.Config[ct_ns]: +def parse_filter(config: settngs.Config[ct_ns]) -> settngs.Config[ct_ns]: new_filter = [] remove = [] for x in config[0].Issue_Identifier_publisher_filter: @@ -235,6 +247,13 @@ def validate_file_settings(config: settngs.Config[ct_ns]) -> settngs.Config[ct_n if x in new_filter: new_filter.remove(x) config[0].Issue_Identifier_publisher_filter = new_filter + return config + + +def validate_file_settings(config: settngs.Config[ct_ns]) -> settngs.Config[ct_ns]: + config = parse_filter(config) + if config[0].Filename_Parsing_protofolius_issue_number_scheme: + config[0].Filename_Parsing_allow_issue_start_with_letter = True config[0].File_Rename_replacements = Replacements( [Replacement(x[0], x[1], x[2]) for x in config[0].File_Rename_replacements[0]], diff --git a/comictaggerlib/ctsettings/settngs_namespace.py b/comictaggerlib/ctsettings/settngs_namespace.py index 9d55fe6..64e432c 100644 --- a/comictaggerlib/ctsettings/settngs_namespace.py +++ b/comictaggerlib/ctsettings/settngs_namespace.py @@ -31,7 +31,6 @@ class settngs_namespace(settngs.TypedNS): Runtime_Options_summary: bool Runtime_Options_raw: bool Runtime_Options_recursive: bool - Runtime_Options_script: str Runtime_Options_split_words: bool Runtime_Options_dryrun: bool Runtime_Options_darkmode: bool @@ -70,6 +69,8 @@ class settngs_namespace(settngs.TypedNS): Filename_Parsing_remove_c2c: bool Filename_Parsing_remove_fcbd: bool Filename_Parsing_remove_publisher: bool + 
Filename_Parsing_protofolius_issue_number_scheme: bool + Filename_Parsing_allow_issue_start_with_letter: bool Sources_source: str Sources_remove_html_tables: bool diff --git a/comictaggerlib/settingswindow.py b/comictaggerlib/settingswindow.py index e8a4516..aaffe75 100644 --- a/comictaggerlib/settingswindow.py +++ b/comictaggerlib/settingswindow.py @@ -195,6 +195,8 @@ class SettingsWindow(QtWidgets.QDialog): self.settings_to_form() self.rename_test() self.dir_test() + self.leFilenameParserTest.setText(self.lblRenameTest.text()) + self.filename_parser_test() # Set General as start tab self.tabWidget.setCurrentIndex(0) @@ -222,6 +224,15 @@ class SettingsWindow(QtWidgets.QDialog): self.twLiteralReplacements.cellChanged.connect(self.rename_test) self.twValueReplacements.cellChanged.connect(self.rename_test) + self.leFilenameParserTest.textEdited.connect(self.filename_parser_test) + self.cbxRemoveC2C.clicked.connect(self.filename_parser_test) + self.cbxRemoveFCBD.clicked.connect(self.filename_parser_test) + self.cbxRemovePublisher.clicked.connect(self.filename_parser_test) + self.cbxProtofoliusIssueNumberScheme.clicked.connect(self.filename_parser_test) + self.cbxProtofoliusIssueNumberScheme.clicked.connect(self.protofolius_clicked) + self.cbxAllowIssueStartWithLetter.clicked.connect(self.filename_parser_test) + self.cbxSplitWords.clicked.connect(self.filename_parser_test) + def disconnect_signals(self) -> None: self.btnAddLiteralReplacement.clicked.disconnect() self.btnAddValueReplacement.clicked.disconnect() @@ -241,6 +252,55 @@ class SettingsWindow(QtWidgets.QDialog): self.leRenameTemplate.textEdited.disconnect() self.twLiteralReplacements.cellChanged.disconnect() self.twValueReplacements.cellChanged.disconnect() + self.leFilenameParserTest.textEdited.disconnect() + self.cbxRemoveC2C.clicked.disconnect() + self.cbxRemoveFCBD.clicked.disconnect() + self.cbxRemovePublisher.clicked.disconnect() + self.cbxProtofoliusIssueNumberScheme.clicked.disconnect() + 
self.cbxAllowIssueStartWithLetter.clicked.disconnect() + self.cbxSplitWords.clicked.disconnect() + + def protofolius_clicked(self, *args: Any, **kwargs: Any) -> None: + if self.cbxProtofoliusIssueNumberScheme.isChecked(): + self.cbxAllowIssueStartWithLetter.setEnabled(False) + self.cbxAllowIssueStartWithLetter.setChecked(True) + else: + self.cbxAllowIssueStartWithLetter.setEnabled(True) + self.filename_parser_test() + + def filename_parser_test(self, *args: Any, **kwargs: Any) -> None: + self._filename_parser_test(self.leFilenameParserTest.text()) + + def _filename_parser_test(self, filename: str) -> None: + filename_info = utils.parse_filename( + filename=filename, + complicated_parser=self.cbxComplicatedParser.isChecked(), + remove_c2c=self.cbxRemoveC2C.isChecked(), + remove_fcbd=self.cbxRemoveFCBD.isChecked(), + remove_publisher=self.cbxRemovePublisher.isChecked(), + split_words=self.cbxSplitWords.isChecked(), + allow_issue_start_with_letter=self.cbxAllowIssueStartWithLetter.isChecked(), + protofolius_issue_number_scheme=self.cbxProtofoliusIssueNumberScheme.isChecked(), + ) + report = "" + for item in ( + "series", + "issue", + "issue_count", + "title", + "volume", + "volume_count", + "year", + "alternate", + "publisher", + "archive", + "remainder", + "annual", + "c2c", + "fcbd", + ): + report += f"{item.title().replace('_', ' ')}: {dict(filename_info)[item]}\n" + self.lblFilenameParserTest.setText(report) def addLiteralReplacement(self) -> None: self.insertRow(self.twLiteralReplacements, self.twLiteralReplacements.rowCount(), Replacement("", "", False)) @@ -319,6 +379,9 @@ class SettingsWindow(QtWidgets.QDialog): self.cbxRemoveC2C.setChecked(self.config[0].Filename_Parsing_remove_c2c) self.cbxRemoveFCBD.setChecked(self.config[0].Filename_Parsing_remove_fcbd) self.cbxRemovePublisher.setChecked(self.config[0].Filename_Parsing_remove_publisher) + self.cbxProtofoliusIssueNumberScheme.setChecked(self.config[0].Filename_Parsing_protofolius_issue_number_scheme) + 
self.cbxAllowIssueStartWithLetter.setChecked(self.config[0].Filename_Parsing_allow_issue_start_with_letter) + self.switch_parser() self.cbxClearFormBeforePopulating.setChecked(self.config[0].Issue_Identifier_clear_form_before_populating) @@ -434,6 +497,10 @@ class SettingsWindow(QtWidgets.QDialog): self.config[0].Filename_Parsing_remove_c2c = self.cbxRemoveC2C.isChecked() self.config[0].Filename_Parsing_remove_fcbd = self.cbxRemoveFCBD.isChecked() self.config[0].Filename_Parsing_remove_publisher = self.cbxRemovePublisher.isChecked() + self.config[0].Filename_Parsing_allow_issue_start_with_letter = self.cbxAllowIssueStartWithLetter.isChecked() + self.config.values.Filename_Parsing_protofolius_issue_number_scheme = ( + self.cbxProtofoliusIssueNumberScheme.isChecked() + ) self.config[0].Issue_Identifier_clear_form_before_populating = self.cbxClearFormBeforePopulating.isChecked() self.config[0].Issue_Identifier_always_use_publisher_filter = self.cbxUseFilter.isChecked() diff --git a/comictaggerlib/ui/settingswindow.ui b/comictaggerlib/ui/settingswindow.ui index 43a1561..615b6a5 100644 --- a/comictaggerlib/ui/settingswindow.ui +++ b/comictaggerlib/ui/settingswindow.ui @@ -318,6 +318,46 @@ + + + + Use protofolius's issue number scheme + + + + + + + Allow issue numbers to start with a letter + + + + + + + + + + + + + !Preview only! Attempts to split words before parsing the filename. e.g. 
'judgedredd' to 'judge dredd' + + + + + + + + + + Qt::PlainText + + + Qt::LinksAccessibleByMouse|Qt::TextSelectableByKeyboard|Qt::TextSelectableByMouse + + + diff --git a/testing/filenames.py b/testing/filenames.py index ce62d84..bf6f89a 100644 --- a/testing/filenames.py +++ b/testing/filenames.py @@ -23,6 +23,23 @@ datadir = pathlib.Path(__file__).parent / "data" cbz_path = datadir / "Cory Doctorow's Futuristic Tales of the Here and Now #001 - Anda's Game (2007).cbz" names = [ + ( + "batman #B01 title (DC).cbz", + "protofolius_issue_number_scheme", + { + "issue": "B1", + "series": "batman", + "title": "title", + "publisher": "DC", + "volume": "", + "year": "", + "remainder": "", + "issue_count": "", + "alternate": "", + "format": "biography/best of", + }, + (False, True), + ), ( "batman #3 title (DC).cbz", "honorific and publisher in series", @@ -724,15 +741,33 @@ names = [ ), ] -fnames = [] +oldfnames = [] +newfnames = [] for p in names: - pp = list(p) - pp[3] = p[3][0] - fnames.append(tuple(pp)) - if "#" in p[0]: - pp[0] = p[0].replace("#", "") - pp[3] = p[3][1] - fnames.append(tuple(pp)) + filename, reason, info, xfail = p + nxfail = xfail[0] + newfnames.append(pytest.param(filename, reason, info, nxfail)) + oldfnames.append( + pytest.param(filename, reason, info, nxfail, marks=pytest.mark.xfail(condition=nxfail, reason="old parser")) + ) + if "#" in filename: + filename = filename.replace("#", "") + nxfail = xfail[1] + if reason == "protofolius_issue_number_scheme": + newfnames.append( + pytest.param( + filename, + reason, + info, + nxfail, + marks=pytest.mark.xfail(condition=nxfail, reason="protofolius_issue_number_scheme"), + ) + ) + else: + newfnames.append(pytest.param(filename, reason, info, nxfail)) + oldfnames.append( + pytest.param(filename, reason, info, nxfail, marks=pytest.mark.xfail(condition=nxfail, reason="old parser")) + ) rnames = [ ( diff --git a/tests/filenameparser_test.py b/tests/filenameparser_test.py index 06f9305..f457cef 100644 --- 
a/tests/filenameparser_test.py +++ b/tests/filenameparser_test.py @@ -2,18 +2,21 @@ from __future__ import annotations import pytest +import comicapi.filenamelexer import comicapi.filenameparser -from testing.filenames import fnames +from testing.filenames import newfnames, oldfnames -@pytest.mark.parametrize("filename, reason, expected, xfail", fnames) +@pytest.mark.parametrize("filename, reason, expected, xfail", newfnames) def test_file_name_parser_new(filename, reason, expected, xfail): + lex = comicapi.filenamelexer.Lex(filename, "protofolius_issue_number_scheme" == reason) p = comicapi.filenameparser.Parse( - comicapi.filenamelexer.Lex(filename).items, + lex.items, first_is_alt=True, remove_c2c=True, remove_fcbd=True, remove_publisher=True, + protofolius_issue_number_scheme="protofolius_issue_number_scheme" == reason, ) fp = p.filename_info @@ -30,13 +33,13 @@ def test_file_name_parser_new(filename, reason, expected, xfail): assert fp == expected -@pytest.mark.parametrize("filename, reason, expected, xfail", fnames) +@pytest.mark.parametrize("filename, reason, expected, xfail", oldfnames) def test_file_name_parser(filename, reason, expected, xfail): p = comicapi.filenameparser.FileNameParser() p.parse_filename(filename) fp = p.__dict__ # These are currently not tracked in this parser - for s in ["title", "alternate", "publisher", "fcbd", "c2c", "annual", "volume_count", "remainder"]: + for s in ["title", "alternate", "publisher", "fcbd", "c2c", "annual", "volume_count", "remainder", "format"]: if s in expected: del expected[s] @@ -44,6 +47,4 @@ def test_file_name_parser(filename, reason, expected, xfail): if "remainder" in fp: del fp["remainder"] - if xfail and fp != expected: - pytest.xfail("old parser") assert fp == expected diff --git a/tests/issuestring_test.py b/tests/issuestring_test.py index 3597686..d657736 100644 --- a/tests/issuestring_test.py +++ b/tests/issuestring_test.py @@ -12,6 +12,9 @@ issues = [ ("1", 1.0, "001"), ("22.BEY", 22.0, 
"022.BEY"), ("22A", 22.0, "022A"), + ("A22A", 22.0, "A022A"), + ("A22", 22.0, "A022"), + ("A22½", 22.5, "A022½"), ("22-A", 22.0, "022-A"), ("", None, ""), ] From 582b8cc57b36ea2e4c41f8e73c3f7ec1353e1dae Mon Sep 17 00:00:00 2001 From: Timmy Welch Date: Wed, 11 Oct 2023 17:03:07 -0700 Subject: [PATCH 2/2] Add more parseable filenames --- comicapi/filenamelexer.py | 30 ++----- comicapi/filenameparser.py | 179 +++++++++++++++++++++++++++---------- testing/filenames.py | 143 ++++++++++++++++++++++++++++- 3 files changed, 281 insertions(+), 71 deletions(-) diff --git a/comicapi/filenamelexer.py b/comicapi/filenamelexer.py index 868fcb2..10a0c78 100644 --- a/comicapi/filenamelexer.py +++ b/comicapi/filenamelexer.py @@ -81,6 +81,7 @@ class Item: self.typ: ItemType = typ self.pos: int = pos self.val: str = val + self.no_space = False def __repr__(self) -> str: return f"{self.val}: index: {self.pos}: {self.typ}" @@ -144,23 +145,14 @@ class Lexer: self.backup() def scan_number(self) -> bool: - digits = "0123456789" + digits = "0123456789.," self.accept_run(digits) - if self.accept("."): - if self.accept(digits): - self.accept_run(digits) - else: - self.backup() - if self.accept("s"): - if not self.accept("t"): - self.backup() - elif self.accept("nr"): - if not self.accept("d"): - self.backup() - elif self.accept("t"): - if not self.accept("h"): - self.backup() + if self.input[self.pos] == ".": + self.backup() + while self.get().isalpha(): + ... 
+ self.backup() return True @@ -197,12 +189,8 @@ def lex_filename(lex: Lexer) -> Callable[[Lexer], Callable | None] | None: # ty return lex_space elif r == ".": r = lex.peek() - if not r.isdigit(): - lex.emit(ItemType.Dot) - return lex_filename - - lex.backup() - return lex_number + lex.emit(ItemType.Dot) + return lex_filename elif r == "'": r = lex.peek() if r.isdigit(): diff --git a/comicapi/filenameparser.py b/comicapi/filenameparser.py index 85d6262..1138825 100644 --- a/comicapi/filenameparser.py +++ b/comicapi/filenameparser.py @@ -366,6 +366,8 @@ class Parser: self.alt = False self.filename_info: FilenameInfo = {"series": ""} self.issue_number_at = None + self.issue_number_marked = False + self.issue_number_passed = False self.in_something = 0 # In some sort of brackets {}[]() self.in_brace = 0 # In {} self.in_s_brace = 0 # In [] @@ -394,6 +396,7 @@ class Parser: for i, item in enumerate(self.input): if item.typ == filenamelexer.ItemType.IssueNumber: self.issue_number_at = i + self.issue_number_marked = True # Get returns the next Item in the input. def get(self) -> filenamelexer.Item: @@ -412,11 +415,11 @@ class Parser: return self.input[self.pos + 1] # Peek_back returns but does not step back the previous Item in the input. - def peek_back(self) -> filenamelexer.Item: - if int(self.pos) == 0: + def peek_back(self, length: int = 1) -> filenamelexer.Item: + if int(self.pos) - length < 0: return eof - return self.input[self.pos - 1] + return self.input[self.pos - length] # Backup steps back one Item. def backup(self) -> None: @@ -430,7 +433,6 @@ class Parser: def parse(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ignore[type-arg] item: filenamelexer.Item = p.get() - # We're done, time to do final processing if item.typ == filenamelexer.ItemType.EOF: return parse_finish @@ -446,7 +448,7 @@ def parse(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ign # Issue number is not 4 digits e.g. 
a year # If this is still used in 7978 years, something is terribly wrong - if len(item.val.lstrip("0")) != 4: + if len(item.val.lstrip("0")) < 4: # Assume that operators indicate a non-issue number e.g. IG-88 or 88-IG if filenamelexer.ItemType.Operator not in (p.peek().typ, p.peek_back().typ): # It is common to use '89 to refer to an annual reprint from 1989 @@ -460,7 +462,6 @@ def parse(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ign else: p.operator_rejected.append(item) # operator rejected used later to add back to the series/title - # It is more likely to be a year if it is inside parentheses. if p.in_something > 0: likely_year = len(item.val.lstrip("0")) == 4 @@ -517,23 +518,30 @@ def parse(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ign likely_issue_number = likely_issue_number and item.val[0] != "'" p.year_candidates.append((likely_year, likely_issue_number, item)) # Ensures that IG-88 gets added back to the series/title - elif ( - p.in_something == 0 - and p.peek_back().typ == filenamelexer.ItemType.Operator - or p.peek().typ == filenamelexer.ItemType.Operator - ): - # Were not in something and the next or previous type is an operator, add it to the series - p.series_parts.append(item) - p.used_items.append(item) + else: + if p.in_something == 0: + if p.peek_back().typ in (filenamelexer.ItemType.IssueNumber, filenamelexer.ItemType.Number) or ( + p.peek_back().typ == filenamelexer.ItemType.Space + and p.peek_back(2).typ in (filenamelexer.ItemType.IssueNumber, filenamelexer.ItemType.Number) + ): + return parse_series + if ( + p.peek_back().typ == filenamelexer.ItemType.Operator + or p.peek().typ == filenamelexer.ItemType.Operator + ): + # Were not in something and the next or previous type is an operator, add it to the series + p.series_parts.append(item) + p.used_items.append(item) - p.get() - return parse_series + p.get() + return parse_series # Number with a leading hash e.g. 
#003 elif item.typ == filenamelexer.ItemType.IssueNumber: # Unset first item if p.firstItem: p.firstItem = False + p.issue_number_passed = True return parse_issue_number # Matches FCBD. Not added to p.used_items so it will show in "remainder" @@ -720,23 +728,24 @@ def parse_issue_number(p: Parser) -> Callable[[Parser], Callable | None] | None: def parse_series(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ignore[type-arg] item = p.input[p.pos] - - series: list[list[filenamelexer.Item]] = [[]] - # Space and Dots are not useful at the beginning of a title/series - if not p.skip and item.typ not in [filenamelexer.ItemType.Space, filenamelexer.ItemType.Dot]: - series[0].append(item) - current_part = 0 + prev_space = False title_parts: list[filenamelexer.Item] = [] series_parts: list[filenamelexer.Item] = [] - - prev_space = False + series: list[list[filenamelexer.Item]] = [[]] # We stop parsing the series when certain things come up if nothing was done with them continue where we left off if p.peek_back().typ in [filenamelexer.ItemType.Number, filenamelexer.ItemType.Calendar]: series_parts = p.series_parts p.series_parts = [] + + # Space and Dots are not useful at the beginning of a title/series + if not p.skip and item.typ not in [filenamelexer.ItemType.Space, filenamelexer.ItemType.Dot]: + if item.typ == filenamelexer.ItemType.Text: + p.backup() + else: + series[0].append(item) # Skip is only true if we have come across '--' or '__' while not p.skip: item = p.get() @@ -752,9 +761,16 @@ def parse_series(p: Parser) -> Callable[[Parser], Callable | None] | None: # ty filenamelexer.ItemType.Honorific, ]: series[current_part].append(item) - if item.typ == filenamelexer.ItemType.Honorific and p.peek().typ == filenamelexer.ItemType.Dot: - series[current_part].append(p.get()) - elif item.typ == filenamelexer.ItemType.Publisher: + if p.peek().typ == filenamelexer.ItemType.Dot: + dot = p.get() + if item.typ == filenamelexer.ItemType.Honorific or ( + 
p.peek().typ == filenamelexer.ItemType.Space + and item.typ in (filenamelexer.ItemType.Text, filenamelexer.ItemType.Publisher) + ): + series[current_part].append(dot) + else: + p.backup() + if item.typ == filenamelexer.ItemType.Publisher: p.filename_info["publisher"] = item.val # Handle Volume @@ -798,9 +814,12 @@ def parse_series(p: Parser) -> Callable[[Parser], Callable | None] | None: # ty p.filename_info["volume"] = t2do.convert(item.val) break - # This is 6 in '1 of 6' - if series[current_part] and series[current_part][-1].val.casefold() == "of": - series[current_part].append(item) + count = get_number(p, p.pos + 1) + # this is an issue or volume number + if count is not None: + p.backup() + break + if p.peek().typ == filenamelexer.ItemType.Space: p.get() # We have 2 numbers, add the first to the series and then go back to parse @@ -808,24 +827,52 @@ def parse_series(p: Parser) -> Callable[[Parser], Callable | None] | None: # ty series[current_part].append(item) break - # We have 1 number break here, it's possible it's the issue - p.backup() # Whitespace - p.backup() # The number - break + # the issue number has been marked and passed, keep it as a part of the series + if ( + p.issue_number_marked + and p.issue_number_passed + or p.issue_number_at is not None + and not p.issue_number_marked + ): + # We already have an issue number, this should be a part of the series + series[current_part].append(item) + else: + # We have 1 number break here, it's possible it's the issue + p.backup() # Whitespace + p.backup() # The number + break # We have 1 number break here, it's possible it's the issue else: - p.backup() # The number - break + # the issue number has been #marked or passed, keep it as a part of the series + if ( + p.issue_number_marked + and p.issue_number_passed + or p.issue_number_at is not None + and not p.issue_number_marked + ): + # We already have an issue number, this should be a part of the series + series[current_part].append(item) + else: + 
p.backup() # The number + break else: # Ensure 'ms. marvel' parses 'ms.' correctly - if item.typ == filenamelexer.ItemType.Dot and p.peek_back().typ == filenamelexer.ItemType.Honorific: - series[current_part].append(item) - # Allows avengers.hulk to parse correctly - elif item.typ == filenamelexer.ItemType.Dot and p.peek().typ == filenamelexer.ItemType.Text: - # Marks the dot as used so that the remainder is clean - p.used_items.append(item) + if item.typ == filenamelexer.ItemType.Dot: + if p.peek_back().typ == filenamelexer.ItemType.Honorific: + series[current_part].append(item) + elif ( + p.peek().typ == filenamelexer.ItemType.Number + or p.peek_back().typ == filenamelexer.ItemType.Text + and len(p.peek_back().val) == 1 + ): + series[current_part].append(item) + item.no_space = True + # Allows avengers.hulk to parse correctly + elif p.peek().typ in (filenamelexer.ItemType.Text,): + # Marks the dot as used so that the remainder is clean + p.used_items.append(item) else: p.backup() break @@ -1075,7 +1122,7 @@ def parse_info_specifier(p: Parser) -> Callable[[Parser], Callable | None] | Non # 'of' is only special if it is inside a parenthesis. 
elif item.val.casefold() == "of": - i = get_number(p, index) + i = get_number_rev(p, index) if i is not None: if p.in_something > 0: if p.issue_number_at is None: @@ -1111,7 +1158,7 @@ def parse_info_specifier(p: Parser) -> Callable[[Parser], Callable | None] | Non # Gets 03 in '03 of 6' -def get_number(p: Parser, index: int) -> filenamelexer.Item | None: +def get_number_rev(p: Parser, index: int) -> filenamelexer.Item | None: # Go backward through the filename to see if we can find what this is of eg '03 (of 6)' or '008 title 03 (of 6)' rev = p.input[:index] rev.reverse() @@ -1129,6 +1176,36 @@ def get_number(p: Parser, index: int) -> filenamelexer.Item | None: # We got our number, time to leave return i # This is not a number and not an ignorable type, give up looking for the number this count belongs to + break + + return None + + +# Gets 6 in '03 of 6' +def get_number(p: Parser, index: int) -> filenamelexer.Item | None: + # Go forward through the filename to see if we can find what this is of eg '03 (of 6)' or '008 title 03 (of 6)' + filename = p.input[index:] + of_found = False + + for i in filename: + # We don't care about these types, we are looking to see if there is a number that is possibly different from + # the issue number for this count + if i.typ in [ + filenamelexer.ItemType.LeftParen, + filenamelexer.ItemType.LeftBrace, + filenamelexer.ItemType.LeftSBrace, + filenamelexer.ItemType.Space, + ]: + continue + if i.val == "of": + of_found = True + continue + if i.typ in [filenamelexer.ItemType.Number, filenamelexer.ItemType.IssueNumber]: + # We got our number, time to leave + if of_found: + return i + # This is not a number and not an ignorable type, give up looking for the number this count belongs to + break return None @@ -1146,11 +1223,21 @@ def join_title(lst: list[filenamelexer.Item]) -> str: if i == len(lst) - 1: continue # No space after honorifics with a dot - if item.typ == filenamelexer.ItemType.Honorific and lst[i + 1].typ == 
filenamelexer.ItemType.Dot: + if ( + item.typ in (filenamelexer.ItemType.Honorific, filenamelexer.ItemType.Text) + and lst[i + 1].typ == filenamelexer.ItemType.Dot + ): + continue + if item.no_space: continue # No space if the next item is an operator or symbol if lst[i + 1].typ in [filenamelexer.ItemType.Operator, filenamelexer.ItemType.Symbol]: - continue + # exept if followed by a dollarsign + if not ( + lst[i].typ in [filenamelexer.ItemType.Number, filenamelexer.ItemType.IssueNumber] + and lst[i + 1].val == "$" + ): + continue # Add a space title += " " diff --git a/testing/filenames.py b/testing/filenames.py index bf6f89a..cbfef76 100644 --- a/testing/filenames.py +++ b/testing/filenames.py @@ -23,6 +23,141 @@ datadir = pathlib.Path(__file__).parent / "data" cbz_path = datadir / "Cory Doctorow's Futuristic Tales of the Here and Now #001 - Anda's Game (2007).cbz" names = [ + ( + "Michel Vaillant #5 Nr. 13 aan de start", + "Shortened word followed by a number eg No. 13, Mr. 13", + { + "issue": "5", + "series": "Michel Vaillant", + "title": "Nr. 
13 aan de start", + "volume": "", + "year": "", + "remainder": "", + "issue_count": "", + "alternate": "", + }, + (False, True), + ), + ( + "Michel Vaillant #8 De 8ste man", + "Non english ordinal", + { + "issue": "8", + "series": "Michel Vaillant", + "title": "De 8ste man", + "volume": "", + "year": "", + "remainder": "", + "issue_count": "", + "alternate": "", + }, + (False, True), + ), + ( + "Michel Vaillant #13 Mach 1 voor Steve Warson", + "number in title", + { + "issue": "13", + "series": "Michel Vaillant", + "title": "Mach 1 voor Steve Warson", + "volume": "", + "year": "", + "remainder": "", + "issue_count": "", + "alternate": "", + }, + (False, True), + ), + ( + "Michel Vaillant #19 5 Meisjes in de race", + "number starting title", + { + "issue": "19", + "series": "Michel Vaillant", + "title": "5 Meisjes in de race", + "volume": "", + "year": "", + "remainder": "", + "issue_count": "", + "alternate": "", + }, + (False, True), + ), + ( + "Michel Vaillant #34 Steve Warson gaat K.O.", + "acronym", + { + "issue": "34", + "series": "Michel Vaillant", + "title": "Steve Warson gaat K.O.", + "volume": "", + "year": "", + "remainder": "", + "issue_count": "", + "alternate": "", + }, + (False, True), + ), + ( + "Michel Vaillant #40 F.1 in oproer", + "acronym with numbers", + { + "issue": "40", + "series": "Michel Vaillant", + "title": "F.1 in oproer", + "volume": "", + "year": "", + "remainder": "", + "issue_count": "", + "alternate": "", + }, + (False, True), + ), + ( + "Michel Vaillant #42 300 kmu door Parijs", + "number starting title", + { + "issue": "42", + "series": "Michel Vaillant", + "title": "300 kmu door Parijs", + "volume": "", + "year": "", + "remainder": "", + "issue_count": "", + "alternate": "", + }, + (False, True), + ), + ( + "Michel Vaillant #52 F 3000", + "title ends with number", + { + "issue": "52", + "series": "Michel Vaillant", + "title": "F 3000", + "volume": "", + "year": "", + "remainder": "", + "issue_count": "", + "alternate": "", + }, + 
(False, True), + ), + ( + "Michel Vaillant #66 100.000.000 $ voor Steve Warson", + "number separator is . and dollarsign after number", + { + "issue": "66", + "series": "Michel Vaillant", + "title": "100.000.000 $ voor Steve Warson", + "volume": "", + "year": "", + "remainder": "", + "issue_count": "", + "alternate": "", + }, + (False, True), + ), ( "batman #B01 title (DC).cbz", "protofolius_issue_number_scheme", @@ -42,7 +177,7 @@ names = [ ), ( "batman #3 title (DC).cbz", - "honorific and publisher in series", + "publisher in parenthesis", { "issue": "3", "series": "batman", @@ -58,7 +193,7 @@ names = [ ), ( "batman #3 title DC.cbz", - "honorific and publisher in series", + "publisher in title", { "issue": "3", "series": "batman", @@ -753,14 +888,14 @@ for p in names: if "#" in filename: filename = filename.replace("#", "") nxfail = xfail[1] - if reason == "protofolius_issue_number_scheme": + if reason in ("protofolius_issue_number_scheme", "number starting title"): newfnames.append( pytest.param( filename, reason, info, nxfail, - marks=pytest.mark.xfail(condition=nxfail, reason="protofolius_issue_number_scheme"), + marks=pytest.mark.xfail(condition=nxfail, reason=reason), ) ) else: