diff --git a/comicapi/comicarchive.py b/comicapi/comicarchive.py index 660a8dc..b4cc108 100644 --- a/comicapi/comicarchive.py +++ b/comicapi/comicarchive.py @@ -42,10 +42,10 @@ try: except ImportError: pil_available = False +from comicapi import filenamelexer, filenameparser from comicapi.comet import CoMet from comicapi.comicbookinfo import ComicBookInfo from comicapi.comicinfoxml import ComicInfoXml -from comicapi.filenameparser import FileNameParser from comicapi.genericmetadata import GenericMetadata, PageType logger = logging.getLogger(__name__) @@ -1127,25 +1127,46 @@ class ComicArchive: data = self.get_page(idx) p["ImageSize"] = str(len(data)) - def metadata_from_filename(self, parse_scan_info=True): + def metadata_from_filename( + self, complicated_parser=False, remove_c2c=False, remove_fcbd=False, remove_publisher=False + ): metadata = GenericMetadata() - fnp = FileNameParser() - fnp.parse_filename(self.path) + if complicated_parser: + lex = filenamelexer.Lex(self.path) + p = filenameparser.Parse( + lex.items, remove_c2c=remove_c2c, remove_fcbd=remove_fcbd, remove_publisher=remove_publisher + ) + metadata.alternate_number = p.filename_info["alternate"] or None + metadata.issue = p.filename_info["issue"] or None + metadata.issue_count = p.filename_info["issue_count"] or None + metadata.publisher = p.filename_info["publisher"] or None + metadata.series = p.filename_info["series"] or None + metadata.title = p.filename_info["title"] or None + metadata.volume = p.filename_info["volume"] or None + metadata.volume_count = p.filename_info["volume_count"] or None + metadata.year = p.filename_info["year"] or None - if fnp.issue != "": - metadata.issue = fnp.issue - if fnp.series != "": - metadata.series = fnp.series - if fnp.volume != "": - metadata.volume = fnp.volume - if fnp.year != "": - metadata.year = fnp.year - if fnp.issue_count != "": - metadata.issue_count = fnp.issue_count - if parse_scan_info: - if fnp.remainder != "": + metadata.scan_info = p.filename_info["remainder"] or None + metadata.format = "FCBD" if p.filename_info["fcbd"] else None + if p.filename_info["annual"]: + metadata.format = "Annual" + else: + fnp = filenameparser.FileNameParser() + fnp.parse_filename(self.path) + + if fnp.issue: + metadata.issue = fnp.issue + if fnp.series: + metadata.series = fnp.series + if fnp.volume: + metadata.volume = fnp.volume + if fnp.year: + metadata.year = fnp.year + if fnp.issue_count: + metadata.issue_count = fnp.issue_count + if fnp.remainder: metadata.scan_info = fnp.remainder metadata.is_empty = False diff --git a/comicapi/filenamelexer.py b/comicapi/filenamelexer.py new file mode 100644 index 0000000..ed3f27a --- /dev/null +++ b/comicapi/filenamelexer.py @@ -0,0 +1,353 @@ +import calendar +import os +import unicodedata +from enum import Enum, auto + + +class ItemType(Enum): + Error = auto() # Error occurred; value is text of error + EOF = auto() + Text = auto() # Text + LeftParen = auto() # '(' inside action + Number = auto() # Simple number + IssueNumber = auto() # Preceded by a # Symbol + RightParen = auto() # ')' inside action + Space = auto() # Run of spaces separating arguments + Dot = auto() + LeftBrace = auto() + RightBrace = auto() + LeftSBrace = auto() + RightSBrace = auto() + Symbol = auto() + Skip = auto() # __ or -- no title, issue or series information beyond + Operator = auto() + Calendar = auto() + InfoSpecifier = auto() # Specifies type of info e.g. v1 for 'volume': 1 + ArchiveType = auto() + Honorific = auto() + Keywords = auto() + FCBD = auto() + ComicType = auto() + Publisher = auto() + C2C = auto() + + +braces = [ + ItemType.LeftBrace, + ItemType.LeftParen, + ItemType.LeftSBrace, + ItemType.RightBrace, + ItemType.RightParen, + ItemType.RightSBrace, +] + +eof = chr(0) + +key = { + "fcbd": ItemType.FCBD, + "freecomicbookday": ItemType.FCBD, + "cbr": ItemType.ArchiveType, + "cbz": ItemType.ArchiveType, + "cbt": ItemType.ArchiveType, + "cb7": ItemType.ArchiveType, + "rar": ItemType.ArchiveType, + "zip": ItemType.ArchiveType, + "tar": ItemType.ArchiveType, + "7z": ItemType.ArchiveType, + "annual": ItemType.ComicType, + "book": ItemType.ComicType, + "volume": ItemType.InfoSpecifier, + "vol.": ItemType.InfoSpecifier, + "vol": ItemType.InfoSpecifier, + "v": ItemType.InfoSpecifier, + "of": ItemType.InfoSpecifier, + "dc": ItemType.Publisher, + "marvel": ItemType.Publisher, + "covers": ItemType.InfoSpecifier, + "c2c": ItemType.C2C, + "mr": ItemType.Honorific, + "ms": ItemType.Honorific, + "mrs": ItemType.Honorific, + "dr": ItemType.Honorific, +} + + +class Item: + def __init__(self, typ: ItemType, pos: int, val: str): + self.typ: ItemType = typ + self.pos: int = pos + self.val: str = val + + def __repr__(self): + return f"{self.val}: index: {self.pos}: {self.typ}" + + +class Lexer: + def __init__(self, string): + self.input: str = string # The string being scanned + self.state = None # The next lexing function to enter + self.pos: int = -1 # Current position in the input + self.start: int = 0 # Start position of this item + self.lastPos: int = 0 # Position of most recent item returned by nextItem + self.paren_depth: int = 0 # Nesting depth of ( ) exprs + self.brace_depth: int = 0 # Nesting depth of { } + self.sbrace_depth: int = 0 # Nesting depth of [ ] + self.items = [] + + # Next returns the next rune in the input. + def get(self) -> str: + if int(self.pos) >= len(self.input) - 1: + self.pos += 1 + return eof + + self.pos += 1 + return self.input[self.pos] + + # Peek returns but does not consume the next rune in the input. + def peek(self) -> str: + if int(self.pos) >= len(self.input) - 1: + return eof + + return self.input[self.pos + 1] + + def backup(self): + self.pos -= 1 + + # Emit passes an item back to the client. + def emit(self, t: ItemType): + self.items.append(Item(t, self.start, self.input[self.start : self.pos + 1])) + self.start = self.pos + 1 + + # Ignore skips over the pending input before this point. + def ignore(self): + self.start = self.pos + + # Accept consumes the next rune if it's from the valid se: + def accept(self, valid: str): + if self.get() in valid: + return True + + self.backup() + return False + + # AcceptRun consumes a run of runes from the valid set. + def accept_run(self, valid: str): + while self.get() in valid: + pass + + self.backup() + + # Errorf returns an error token and terminates the scan by passing + # Back a nil pointer that will be the next state, terminating self.nextItem. + def errorf(self, message: str): + self.items.append(Item(ItemType.Error, self.start, message)) + + # NextItem returns the next item from the input. + # Called by the parser, not in the lexing goroutine. + # def next_item(self) -> Item: + # item: Item = self.items.get() + # self.lastPos = item.pos + # return item + + def scan_number(self): + digits = "0123456789" + + self.accept_run(digits) + if self.accept("."): + if self.accept(digits): + self.accept_run(digits) + else: + self.backup() + if self.accept("s"): + if not self.accept("t"): + self.backup() + elif self.accept("nr"): + if not self.accept("d"): + self.backup() + elif self.accept("t"): + if not self.accept("h"): + self.backup() + + return True + + # Runs the state machine for the lexer. + def run(self): + self.state = lex_filename + while self.state is not None: + self.state = self.state(self) + + +# Scans the elements inside action delimiters. +def lex_filename(lex: Lexer): + r = lex.get() + if r == eof: + if lex.paren_depth != 0: + return lex.errorf("unclosed left paren") + + if lex.brace_depth != 0: + return lex.errorf("unclosed left paren") + lex.emit(ItemType.EOF) + return None + elif is_space(r): + if r == "_" and lex.peek() == "_": + lex.get() + lex.emit(ItemType.Skip) + else: + return lex_space + elif r == ".": + r = lex.peek() + if r < "0" or "9" < r: + lex.emit(ItemType.Dot) + return lex_filename + + lex.backup() + return lex_number + elif r == "'": + r = lex.peek() + if r in "0123456789": + return lex_number + lex.emit(ItemType.Text) # TODO: Change to Text + elif "0" <= r <= "9": + lex.backup() + return lex_number + elif r == "#": + if "0" <= lex.peek() <= "9": + return lex_number + lex.emit(ItemType.Symbol) + elif is_operator(r): + if r == "-" and lex.peek() == "-": + lex.get() + lex.emit(ItemType.Skip) + else: + return lex_operator + elif is_alpha_numeric(r): + lex.backup() + return lex_text + elif r == "(": + lex.emit(ItemType.LeftParen) + lex.paren_depth += 1 + elif r == ")": + lex.emit(ItemType.RightParen) + lex.paren_depth -= 1 + if lex.paren_depth < 0: + return lex.errorf("unexpected right paren " + r) + + elif r == "{": + lex.emit(ItemType.LeftBrace) + lex.brace_depth += 1 + elif r == "}": + lex.emit(ItemType.RightBrace) + lex.brace_depth -= 1 + if lex.brace_depth < 0: + return lex.errorf("unexpected right brace " + r) + + elif r == "[": + lex.emit(ItemType.LeftSBrace) + lex.sbrace_depth += 1 + elif r == "]": + lex.emit(ItemType.RightSBrace) + lex.sbrace_depth -= 1 + if lex.sbrace_depth < 0: + return lex.errorf("unexpected right brace " + r) + elif is_symbol(r): + # L.backup() + lex.emit(ItemType.Symbol) + else: + return lex.errorf("unrecognized character in action: " + r) + + return lex_filename + + +def lex_operator(lex: Lexer): + lex.accept_run("-|:;") + lex.emit(ItemType.Operator) + return lex_filename + + +# LexSpace scans a run of space characters. +# One space has already been seen. +def lex_space(lex: Lexer): + while is_space(lex.peek()): + lex.get() + + lex.emit(ItemType.Space) + return lex_filename + + +# Lex_text scans an alphanumeric. +def lex_text(lex: Lexer): + while True: + r = lex.get() + if is_alpha_numeric(r): + if r.isnumeric(): # E.g. v1 + word = lex.input[lex.start : lex.pos] + if word.lower() in key and key[word.lower()] == ItemType.InfoSpecifier: + lex.backup() + lex.emit(key[word.lower()]) + return lex_filename + else: + if r == "'" and lex.peek() == "s": + lex.get() + else: + lex.backup() + word = lex.input[lex.start : lex.pos + 1] + if word.lower() == "vol" and lex.peek() == ".": + lex.get() + word = lex.input[lex.start : lex.pos + 1] + + if word.lower() in key: + lex.emit(key[word.lower()]) + elif cal(word): + lex.emit(ItemType.Calendar) + else: + lex.emit(ItemType.Text) + break + + return lex_filename + + +def cal(value: str): + month_abbr = [i for i, x in enumerate(calendar.month_abbr) if x == value.title()] + month_name = [i for i, x in enumerate(calendar.month_name) if x == value.title()] + day_abbr = [i for i, x in enumerate(calendar.day_abbr) if x == value.title()] + day_name = [i for i, x in enumerate(calendar.day_name) if x == value.title()] + return set(month_abbr + month_name + day_abbr + day_name) + + +def lex_number(lex: Lexer): + if not lex.scan_number(): + return lex.errorf("bad number syntax: " + lex.input[lex.start : lex.pos]) + # Complex number logic removed. Messes with math operations without space + + if lex.input[lex.start] == "#": + lex.emit(ItemType.IssueNumber) + elif not lex.input[lex.pos].isdigit(): + # Assume that 80th is just text and not a number + lex.emit(ItemType.Text) + else: + lex.emit(ItemType.Number) + + return lex_filename + + +def is_space(character: str): + return character in "_ \t" + + +# IsAlphaNumeric reports whether r is an alphabetic, digit, or underscore. +def is_alpha_numeric(character: str): + return character.isalpha() or character.isnumeric() + + +def is_operator(character: str): + return character in "-|:;/\\" + + +def is_symbol(character: str): + return unicodedata.category(character)[0] in "PS" + + +def Lex(filename: str): + lex = Lexer(string=os.path.basename(filename)) + lex.run() + return lex diff --git a/comicapi/filenameparser.py b/comicapi/filenameparser.py index 9d67231..e4f829d 100644 --- a/comicapi/filenameparser.py +++ b/comicapi/filenameparser.py @@ -23,8 +23,17 @@ This should probably be re-written, but, well, it mostly works! import logging import os import re +from operator import itemgetter +from typing import TypedDict from urllib.parse import unquote +from text2digits import text2digits + +from comicapi import filenamelexer, issuestring + +t2d = text2digits.Text2Digits(add_ordinal_ending=False) +t2do = text2digits.Text2Digits(add_ordinal_ending=True) + logger = logging.getLogger(__name__) @@ -68,9 +77,7 @@ class FileNameParser: if match: count = match.group() - count = count.lstrip("0") - - return count + return count.lstrip("0") def get_issue_number(self, filename): """Returns a tuple of issue number string, and start and end indexes in the filename @@ -222,7 +229,7 @@ class FileNameParser: year = "" # look for four digit number with "(" ")" or "--" around it - match = re.search(r"(\(\d\d\d\d\))|(--\d\d\d\d--)", filename) + match = re.search(r"(\(\d{4}\))|(--\d{4}--)", filename) if match: year = match.group() # remove non-digits @@ -290,3 +297,814 @@ class FileNameParser: self.issue = "0" if self.issue[0] == ".": self.issue = "0" + self.issue + + +class FilenameInfo(TypedDict, total=False): + alternate: str + annual: bool + archive: str + c2c: bool + fcbd: bool + issue: str + issue_count: str + publisher: str + remainder: str + series: str + title: str + volume: str + volume_count: str + year: str + + +eof = filenamelexer.Item(filenamelexer.ItemType.EOF, -1, "") + + +class Parser: + """docstring for FilenameParser""" + + def __init__( + self, + lexer_result: list[filenamelexer.Item], + first_is_alt=False, + remove_c2c=False, + remove_fcbd=False, + remove_publisher=False, + ): + self.state = None + self.pos = -1 + + self.firstItem = True + self.skip = False + self.alt = False + self.filename_info: FilenameInfo = {"series": ""} + self.issue_number_at = None + self.in_something = 0 # In some sort of brackets {}[]() + self.in_brace = 0 # In {} + self.in_s_brace = 0 # In [] + self.in_paren = 0 # In () + self.year_candidates: list[tuple[bool, filenamelexer.Item]] = [] + self.series_parts: list[filenamelexer.Item] = [] + self.title_parts: list[filenamelexer.Item] = [] + self.used_items: list[filenamelexer.Item] = [] + self.irrelevant: list[filenamelexer.Item] = [] + self.operator_rejected: list[filenamelexer.Item] = [] + self.publisher_removed: list[filenamelexer.Item] = [] + + self.first_is_alt = first_is_alt + self.remove_c2c = remove_c2c + self.remove_fcbd = remove_fcbd + self.remove_publisher = remove_publisher + + self.input = lexer_result + for i, item in enumerate(self.input): + if item.typ == filenamelexer.ItemType.IssueNumber: + self.issue_number_at = i + + # Get returns the next Item in the input. + def get(self) -> filenamelexer.Item: + if int(self.pos) >= len(self.input) - 1: + self.pos += 1 + return eof + + self.pos += 1 + return self.input[self.pos] + + # Peek returns but does not consume the next Item in the input. + def peek(self) -> filenamelexer.Item: + if int(self.pos) >= len(self.input) - 1: + return eof + + return self.input[self.pos + 1] + + # Peek_back returns but does not step back the previous Item in the input. + def peek_back(self) -> filenamelexer.Item: + if int(self.pos) == 0: + return eof + + return self.input[self.pos - 1] + + # Backup steps back one Item. + def backup(self): + self.pos -= 1 + + def run(self): + self.state = parse + while self.state is not None: + self.state = self.state(self) + + +def parse(p: Parser): + item: filenamelexer.Item = p.get() + + # We're done, time to do final processing + if item.typ == filenamelexer.ItemType.EOF: + return parse_finish + + # Need to figure out if this is the issue number + if item.typ == filenamelexer.ItemType.Number: + likely_year = False + if p.firstItem and p.first_is_alt: + # raise Exception("fuck you") + p.alt = True + return parse_issue_number + + # The issue number should hopefully not be in parentheses + if p.in_something == 0: + # Assume that operators indicate a non-issue number e.g. IG-88 or 88-IG + if filenamelexer.ItemType.Operator not in (p.peek().typ, p.peek_back().typ): + # It is common to use '89 to refer to an annual reprint from 1989 + if item.val[0] != "'": + # Issue number is less than 4 digits. very few series go above 999 + if len(item.val.lstrip("0")) < 4: + # An issue number starting with # Was not found and no previous number was found + if p.issue_number_at is None: + # Series has already been started/parsed, filters out leading alternate numbers leading alternate number + if len(p.series_parts) > 0: + # Unset first item + if p.firstItem: + p.firstItem = False + return parse_issue_number + else: + p.operator_rejected.append(item) + # operator rejected used later to add back to the series/title + + # It is more likely to be a year if it is inside parentheses. + if p.in_something > 0: + likely_year = True + + # If numbers are directly followed by text it most likely isn't a year e.g. 2048px + if p.peek().typ == filenamelexer.ItemType.Text: + likely_year = False + + # Is either a full year '2001' or a short year "'89" + if len(item.val) == 4 or item.val[0] == "'": + if p.in_something == 0: + # Append to series in case it is a part of the title, but only if were not inside parenthesis + p.series_parts.append(item) + + # Look for a full date as in 2022-04-22 + if p.peek().typ in [ + filenamelexer.ItemType.Symbol, + filenamelexer.ItemType.Operator, + filenamelexer.ItemType.Dot, + ]: + op = [p.get()] + if p.peek().typ == filenamelexer.ItemType.Number: + month = p.get() + if p.peek().typ in [ + filenamelexer.ItemType.Symbol, + filenamelexer.ItemType.Operator, + filenamelexer.ItemType.Dot, + ]: + op.append(p.get()) + if p.peek().typ == filenamelexer.ItemType.Number: + day = p.get() + fulldate = [month, day, item] + p.used_items.extend(op) + p.used_items.extend(fulldate) + else: + p.backup() + p.backup() + p.backup() + # TODO never happens + else: + p.backup() + p.backup() + # TODO never happens + else: + p.backup() + # TODO never happens + + p.year_candidates.append((likely_year, item)) + # Ensures that IG-88 gets added back to the series/title + elif ( + p.in_something == 0 + and p.peek_back().typ == filenamelexer.ItemType.Operator + or p.peek().typ == filenamelexer.ItemType.Operator + ): + # Were not in something and the next or previous type is an operator, add it to the series + p.series_parts.append(item) + p.used_items.append(item) + + # Unset first item + if p.firstItem: + p.firstItem = False + p.get() + return parse_series + + # Number with a leading hash e.g. #003 + elif item.typ == filenamelexer.ItemType.IssueNumber: + # Unset first item + if p.firstItem: + p.firstItem = False + return parse_issue_number + + # Matches FCBD. Not added to p.used_items so it will show in "remainder" + elif item.typ == filenamelexer.ItemType.FCBD: + p.filename_info["fcbd"] = True + + # Matches c2c. Not added to p.used_items so it will show in "remainder" + elif item.typ == filenamelexer.ItemType.C2C: + p.filename_info["c2c"] = True + + # Matches the extension if it is known to be an archive format e.g. cbt,cbz,zip,rar + elif item.typ == filenamelexer.ItemType.ArchiveType: + p.filename_info["archive"] = item.val.lower() + p.used_items.append(item) + if p.peek_back().typ == filenamelexer.ItemType.Dot: + p.used_items.append(p.peek_back()) + + # Allows removing DC from 'Wonder Woman 49 DC Sep-Oct 1951' dependent on publisher being in a static list in the lexer + elif item.typ == filenamelexer.ItemType.Publisher: + p.filename_info["publisher"] = item.val + p.used_items.append(item) + if p.firstItem: + p.firstItem = False + if p.in_something == 0: + return parse_series + p.publisher_removed.append(item) + if p.in_something == 0: + return parse_series + + # Attempts to identify the type e.g. annual + elif item.typ == filenamelexer.ItemType.ComicType: + series_append = True + + if p.peek().typ == filenamelexer.ItemType.Space: + p.get() + + if p.series_parts and "free comic book" in (" ".join([x.val for x in p.series_parts]) + " " + item.val).lower(): + p.filename_info["fcbd"] = True + series_append = True + # If the next item is a number it's probably the volume + elif p.peek().typ == filenamelexer.ItemType.Number or ( + p.peek().typ == filenamelexer.ItemType.Text and t2d.convert(p.peek().val).isnumeric() + ): + number = p.get() + # Mark volume info. Text will be added to the title/series later + if item.val.lower() in ["book", "tpb"]: + p.title_parts.extend([item, number]) + p.filename_info["volume"] = t2do.convert(number.val) + p.filename_info["issue"] = t2do.convert(number.val) + + p.used_items.append(item) + series_append = False + + # Annuals usually mean the year + elif item.val.lower() in ["annual"]: + p.filename_info["annual"] = True + num = t2d.convert(number.val) + if num.isnumeric() and len(num) == 4: + p.year_candidates.append((True, number)) + else: + p.backup() + + elif item.val.lower() in ["annual"]: + p.filename_info["annual"] = True + + # If we don't have a reason to exclude it from the series go back to parsing the series immediately + if series_append: + p.series_parts.append(item) + p.used_items.append(item) + return parse_series + + # We found text, it's probably the title or series + elif item.typ in [filenamelexer.ItemType.Text, filenamelexer.ItemType.Honorific]: + # Unset first item + if p.firstItem: + p.firstItem = False + if p.in_something == 0: + return parse_series + + # Usually the word 'of' eg 1 (of 6) + elif item.typ == filenamelexer.ItemType.InfoSpecifier: + return parse_info_specifier + + # Operator is a symbol that acts as some sort of separator eg - : ; + elif item.typ == filenamelexer.ItemType.Operator: + if p.in_something == 0: + p.irrelevant.append(item) + + # Filter out Month and day names in filename + elif item.typ == filenamelexer.ItemType.Calendar: + # Month and day are currently irrelevant if they are inside parentheses e.g. (January 2002) + if p.in_something > 0: + p.irrelevant.append(item) + + # assume Sep-Oct is not useful in the series/title + elif p.peek().typ in [filenamelexer.ItemType.Symbol, filenamelexer.ItemType.Operator]: + p.get() + if p.peek().typ == filenamelexer.ItemType.Calendar: + p.irrelevant.extend([item, p.input[p.pos], p.get()]) + else: + p.backup() + return parse_series + # This is text that just happens to also be a month/day + else: + return parse_series + + # Specifically '__' or '--', no further title/series parsing is done to keep compatibility with wiki + elif item.typ == filenamelexer.ItemType.Skip: + p.skip = True + + # Keeping track of parentheses depth + elif item.typ == filenamelexer.ItemType.LeftParen: + p.in_paren += 1 + p.in_something += 1 + elif item.typ == filenamelexer.ItemType.LeftBrace: + p.in_brace += 1 + p.in_something += 1 + elif item.typ == filenamelexer.ItemType.LeftSBrace: + p.in_s_brace += 1 + p.in_something += 1 + + elif item.typ == filenamelexer.ItemType.RightParen: + p.in_paren -= 1 + p.in_something -= 1 + elif item.typ == filenamelexer.ItemType.RightBrace: + p.in_brace -= 1 + p.in_something -= 1 + elif item.typ == filenamelexer.ItemType.RightSBrace: + p.in_s_brace -= 1 + p.in_something -= 1 + + # Unset first item + if p.firstItem: + p.firstItem = False + + # Brace management, I don't like negative numbers + if p.in_paren < 0: + p.in_something += p.in_paren * -1 + if p.in_brace < 0: + p.in_something += p.in_brace * -1 + if p.in_s_brace < 0: + p.in_something += p.in_s_brace * -1 + + return parse + + +# TODO: What about more esoteric numbers??? +def parse_issue_number(p: Parser): + item = p.input[p.pos] + + if "issue" in p.filename_info: + if "alternate" in p.filename_info: + p.filename_info["alternate"] += "," + item.val + p.filename_info["alternate"] = item.val + else: + if p.alt: + p.filename_info["alternate"] = item.val + else: + p.filename_info["issue"] = item.val + p.issue_number_at = item.pos + p.used_items.append(item) + item = p.get() + if item.typ == filenamelexer.ItemType.Dot: + p.used_items.append(item) + item = p.get() + if item.typ in [filenamelexer.ItemType.Text, filenamelexer.ItemType.Number]: + if p.alt: + p.filename_info["alternate"] += "." + item.val + else: + p.filename_info["issue"] += "." + item.val + p.used_items.append(item) + else: + p.backup() + p.backup() + else: + p.backup() + p.alt = False + return parse + + +def parse_series(p: Parser): + item = p.input[p.pos] + + series: list[list[filenamelexer.Item]] = [[]] + # Space and Dots are not useful at the beginning of a title/series + if not p.skip and item.typ not in [filenamelexer.ItemType.Space, filenamelexer.ItemType.Dot]: + series[0].append(item) + + current_part = 0 + + title_parts: list[filenamelexer.Item] = [] + series_parts: list[filenamelexer.Item] = [] + + prev_space = False + + # 'free comic book day' screws things up. #TODO look into removing book from ComicType? + + # We stop parsing the series when certain things come up if nothing was done with them continue where we left off + if ( + p.series_parts + and p.series_parts[-1].val.lower() == "book" + or p.peek_back().typ == filenamelexer.ItemType.Number + or item.typ == filenamelexer.ItemType.Calendar + ): + series_parts = p.series_parts + p.series_parts = [] + # Skip is only true if we have come across '--' or '__' + while not p.skip: + item = p.get() + + # Spaces are evil + if item.typ == filenamelexer.ItemType.Space: + prev_space = True + continue + if item.typ in [ + filenamelexer.ItemType.Text, + filenamelexer.ItemType.Symbol, + filenamelexer.ItemType.Publisher, + filenamelexer.ItemType.Honorific, + ]: + series[current_part].append(item) + if item.typ == filenamelexer.ItemType.Honorific and p.peek().typ == filenamelexer.ItemType.Dot: + series[current_part].append(p.get()) + elif item.typ == filenamelexer.ItemType.Publisher: + p.filename_info["publisher"] = item.val + + # Handle Volume + elif item.typ == filenamelexer.ItemType.InfoSpecifier: + # Exception for 'of' + if item.val.lower() == "of": + series[current_part].append(item) + else: + # This specifically lets 'X-Men-V1-067' parse correctly as Series: X-Men Volume: 1 Issue: 67 + while len(series[current_part]) > 0 and series[current_part][-1].typ not in [ + filenamelexer.ItemType.Text, + filenamelexer.ItemType.Symbol, + ]: + p.irrelevant.append(series[current_part].pop()) + p.backup() + break + + elif item.typ == filenamelexer.ItemType.Operator: + peek = p.peek() + # ': ' separates the title from the series, only the last section is considered the title + if not prev_space and peek.typ in [filenamelexer.ItemType.Space]: + series.append([]) # Starts a new section + series[current_part].append(item) + current_part += 1 + else: + # Force space around '-' makes 'batman - superman' stay otherwise we get 'batman-superman' + if prev_space and peek.typ in [filenamelexer.ItemType.Space]: + item.val = " " + item.val + " " + series[current_part].append(item) + + # Stop processing series/title if a skip item is found + elif item.typ == filenamelexer.ItemType.Skip: + p.backup() + break + + elif item.typ == filenamelexer.ItemType.Number: + if p.peek().typ == filenamelexer.ItemType.Space: + p.get() + # We have 2 numbers, add the first to the series and then go back to parse + if p.peek().typ == filenamelexer.ItemType.Number: + series[current_part].append(item) + break + + # We have 1 number break here, it's possible it's the issue + p.backup() # Whitespace + p.backup() # The number + break + # This is 6 in '1 of 6' + if series[current_part] and series[current_part][-1].val.lower() == "of": + series[current_part].append(item) + + # We have 1 number break here, it's possible it's the issue + else: + p.backup() # The number + break + + else: + # Ensure 'ms. marvel' parses 'ms.' correctly + if item.typ == filenamelexer.ItemType.Dot and p.peek_back().typ == filenamelexer.ItemType.Honorific: + series[current_part].append(item) + # Allows avengers.hulk to parse correctly + elif item.typ == filenamelexer.ItemType.Dot and p.peek().typ == filenamelexer.ItemType.Text: + # Marks the dot as used so that the remainder is clean + p.used_items.append(item) + else: + p.backup() + break + + prev_space = False + + # We have a title separator e.g. ': " + if len(series) > 1: + title_parts.extend(series.pop()) + for s in series: + if s and s[-1].typ == filenamelexer.ItemType.Operator: + s[-1].val += " " # Ensures that when there are multiple separators that they display properly + series_parts.extend(s) + p.used_items.append(series_parts.pop()) + else: + series_parts.extend(series[0]) + + # If the series has already been set assume all of this is the title. + if len(p.series_parts) > 0: + p.title_parts.extend(series_parts) + p.title_parts.extend(title_parts) + else: + p.series_parts.extend(series_parts) + p.title_parts.extend(title_parts) + return parse + + +def resolve_year(p: Parser): + if len(p.year_candidates) > 0: + # Sort by likely_year boolean + p.year_candidates.sort(key=itemgetter(0)) + + # Take the last year e.g. (2007) 2099 (2008) becomes 2099 2007 2008 and takes 2008 + selected_year = p.year_candidates.pop()[1] + + p.filename_info["year"] = selected_year.val + p.used_items.append(selected_year) + + # (2008) Title (2009) is many times used to denote the series year if we don't have a volume we use it + if "volume" not in p.filename_info and p.year_candidates and p.year_candidates[-1][0]: + vol = p.year_candidates.pop()[1] + p.filename_info["volume"] = vol.val + p.used_items.append(vol) + + # Remove volume from series and title + if selected_year in p.series_parts: + p.series_parts.remove(selected_year) + if selected_year in p.title_parts: + p.title_parts.remove(selected_year) + + # Remove year from series and title + if selected_year in p.series_parts: + p.series_parts.remove(selected_year) + if selected_year in p.title_parts: + p.title_parts.remove(selected_year) + + +def parse_finish(p: Parser): + resolve_year(p) + + # If we don't have an issue try to find it in the series + if "issue" not in p.filename_info and p.series_parts and p.series_parts[-1].typ == filenamelexer.ItemType.Number: + issue_num = p.series_parts.pop() + + # If the number we just popped is a year put it back on it's probably part of the series e.g. Spider-Man 2099 + if issue_num in [x[1] for x in p.year_candidates]: + p.series_parts.append(issue_num) + else: + # If this number was rejected because of an operator and the operator is still there add it back e.g. 'IG-88' + if ( + issue_num in p.operator_rejected + and p.series_parts + and p.series_parts[-1].typ == filenamelexer.ItemType.Operator + ): + p.series_parts.append(issue_num) + # We have no reason to not use this number as the issue number. Specifically happens when parsing 'X-Men-V1-067.cbr' + else: + p.filename_info["issue"] = issue_num.val + p.used_items.append(issue_num) + p.issue_number_at = issue_num.pos + + # Remove publishers, currently only marvel and dc are defined, + # this is an option specifically because this can drastically screw up parsing + if p.remove_publisher: + for item in p.publisher_removed: + if item in p.series_parts: + p.series_parts.remove(item) + if item in p.title_parts: + p.title_parts.remove(item) + + p.filename_info["series"] = join_title(p.series_parts) + p.used_items.extend(p.series_parts) + + p.filename_info["title"] = join_title(p.title_parts) + p.used_items.extend(p.title_parts) + + if "issue" in p.filename_info: + p.filename_info["issue"] = issuestring.IssueString(p.filename_info["issue"].lstrip("#")).as_string() + + if "volume" in p.filename_info: + p.filename_info["volume"] = p.filename_info["volume"].lstrip("#").lstrip("0") + + if "issue" not in p.filename_info: + # We have an alternate move it to the issue + if "alternate" in p.filename_info: + p.filename_info["issue"] = p.filename_info["alternate"] + p.filename_info["alternate"] = "" + else: + # TODO: This never happens + inp = [x for x in p.input if x not in p.irrelevant and x not in p.used_items and x.typ != eof.typ] + if len(inp) == 1 and inp[0].typ == filenamelexer.ItemType.Number: + p.filename_info["issue"] = inp[0].val + p.used_items.append(inp[0]) + + remove_items = [] + if p.remove_fcbd: + remove_items.append(filenamelexer.ItemType.FCBD) + if p.remove_c2c: + remove_items.append(filenamelexer.ItemType.C2C) + + p.irrelevant.extend([x for x in p.input if x.typ in remove_items]) + + p.filename_info["remainder"] = get_remainder(p) + + # Ensure keys always exist + for s in [ + "alternate", + "issue", + "archive", + "series", + "title", + "volume", + "year", + "remainder", + "issue_count", + "volume_count", + "publisher", + ]: + if s not in p.filename_info: + p.filename_info[s] = "" + for s in ["fcbd", "c2c", "annual"]: + if s not in p.filename_info: + p.filename_info[s] = False + + +def get_remainder(p: Parser): + remainder = "" + rem = [] + + # Remove used items and irrelevant items e.g. the series and useless operators + inp = [x for x in p.input if x not in p.irrelevant and x not in p.used_items] + for i, item in enumerate(inp): + # No double space or space next to parentheses + if item.typ in [filenamelexer.ItemType.Space, filenamelexer.ItemType.Skip]: + if ( + i > 0 + and inp[i - 1].typ + not in [ + filenamelexer.ItemType.Space, + filenamelexer.ItemType.LeftBrace, + filenamelexer.ItemType.LeftParen, + filenamelexer.ItemType.LeftSBrace, + ] + and i + 1 < len(inp) + and inp[i + 1].typ + not in [ + filenamelexer.ItemType.RightBrace, + filenamelexer.ItemType.RightParen, + filenamelexer.ItemType.RightSBrace, + ] + ): + remainder += " " + + # Strip off useless opening parenthesis + elif ( + item.typ + in [ + filenamelexer.ItemType.Space, + filenamelexer.ItemType.RightBrace, + filenamelexer.ItemType.RightParen, + filenamelexer.ItemType.RightSBrace, + ] + and i > 0 + and inp[i - 1].typ + in [ + filenamelexer.ItemType.LeftBrace, + filenamelexer.ItemType.LeftParen, + filenamelexer.ItemType.LeftSBrace, + ] + ): + remainder = remainder.rstrip("[{(") + continue + + # Add the next item + else: + rem.append(item) + remainder += item.val + + # Remove empty parentheses + remainder = re.sub(r"[\[{(]+[]})]+", "", remainder) + return remainder.strip() + + +def parse_info_specifier(p: Parser): + item = p.input[p.pos] + index = p.pos + + if p.peek().typ == filenamelexer.ItemType.Space: + p.get() + + # Handles 'book 3' and 'book three' + if p.peek().typ == filenamelexer.ItemType.Number or ( + p.peek().typ == filenamelexer.ItemType.Text and t2d.convert(p.peek().val).isnumeric() + ): + + number = p.get() + if item.val.lower() in ["volume", "vol", "vol.", "v"]: + p.filename_info["volume"] = t2do.convert(number.val) + p.used_items.append(item) + p.used_items.append(number) + + # 'of' is only special if it is inside a parenthesis. + elif item.val.lower() == "of": + i = get_number(p, index) + if p.in_something > 0: + if p.issue_number_at is None: + # TODO: Figure out what to do here if it ever happens + p.filename_info["issue_count"] = str(int(t2do.convert(number.val))) + p.used_items.append(item) + p.used_items.append(number) + + # This is definitely the issue number + elif p.issue_number_at == i.pos: + p.filename_info["issue_count"] = str(int(t2do.convert(number.val))) + p.used_items.append(item) + p.used_items.append(number) + + # This is not for the issue number it is not in either the issue or the title, assume it is the volume number and count + elif p.issue_number_at != i.pos and i not in p.series_parts and i not in p.title_parts: + p.filename_info["volume"] = i.val + p.filename_info["volume_count"] = str(int(t2do.convert(number.val))) + p.used_items.append(i) + p.used_items.append(item) + p.used_items.append(number) + else: + # TODO: Figure out what to do here if it ever happens + pass + else: + # Lets 'The Wrath of Foobar-Man, Part 1 of 2' parse correctly as the title + if i is not None: + p.pos = [ind for ind, x in enumerate(p.input) if x == i][0] + + if not p.in_something: + return parse_series + return parse + + +# Gets 03 in '03 of 6' +def get_number(p: Parser, index: int): + # Go backward through the filename to see if we can find what this is of eg '03 (of 6)' or '008 title 03 (of 6)' + rev = p.input[:index] + rev.reverse() + for i in rev: + # We don't care about these types, we are looking to see if there is a number that is possibly different from the issue number for this count + if i.typ in [ + filenamelexer.ItemType.LeftParen, + filenamelexer.ItemType.LeftBrace, + filenamelexer.ItemType.LeftSBrace, + filenamelexer.ItemType.Space, + ]: + continue + if i.typ == filenamelexer.ItemType.Number: + # We got our number, time to leave + return i + # This is not a number and not an ignorable type, give up looking for the number this count belongs to + + return None + + +def join_title(lst: list[filenamelexer.Item]): + title = "" + for i, item in enumerate(lst): + if i + 1 == len(lst) and item.val == ",": # We ignore commas on the end + continue + title += item.val # Add the next item + # No space after operators + if item.typ == filenamelexer.ItemType.Operator: + continue + # No trailing space + if i == len(lst) - 1: + continue + # No space after honorifics with a dot + if item.typ == filenamelexer.ItemType.Honorific and lst[i + 1].typ == filenamelexer.ItemType.Dot: + continue + # No space if the next item is an operator or symbol + if lst[i + 1].typ in [ + filenamelexer.ItemType.Operator, + filenamelexer.ItemType.Symbol, + ]: + continue + + # Add a space + title += " " + + return title + + +def Parse( + lexer_result: list[filenamelexer.Item], + first_is_alt=False, + remove_c2c=False, + remove_fcbd=False, + remove_publisher=False, +): + p = Parser( + lexer_result=lexer_result, + first_is_alt=first_is_alt, + remove_c2c=remove_c2c, + remove_fcbd=remove_fcbd, + remove_publisher=remove_publisher, + ) + p.run() + return p diff --git a/comictaggerlib/autotagmatchwindow.py b/comictaggerlib/autotagmatchwindow.py index 03fc7e9..764cc24 100644 --- a/comictaggerlib/autotagmatchwindow.py +++ b/comictaggerlib/autotagmatchwindow.py @@ -32,11 +32,13 @@ logger = logging.getLogger(__name__) class AutoTagMatchWindow(QtWidgets.QDialog): volume_id = 0 - def __init__(self, parent, match_set_list: List[MultipleMatch], style, fetch_func): + def __init__(self, parent, match_set_list: List[MultipleMatch], style, fetch_func, settings): super().__init__(parent) uic.loadUi(ComicTaggerSettings.get_ui_file("matchselectionwindow.ui"), self) + self.settings = settings + self.current_match_set: Optional[MultipleMatch] = None self.altCoverWidget = CoverImageWidget(self.altCoverContainer, CoverImageWidget.AltCoverMode) @@ -221,7 +223,12 @@ class AutoTagMatchWindow(QtWidgets.QDialog): md = ca.read_metadata(self.style) if md.is_empty: - md = ca.metadata_from_filename() + md = ca.metadata_from_filename( + self.settings.complicated_parser, + self.settings.remove_c2c, + self.settings.remove_fcbd, + self.settings.remove_publisher, + ) # now get the particular issue data cv_md = self.fetch_func(match) diff --git a/comictaggerlib/cli.py b/comictaggerlib/cli.py index 43e02ae..f6faf26 100644 --- a/comictaggerlib/cli.py +++ b/comictaggerlib/cli.py @@ -101,7 +101,7 @@ def display_match_set_for_choice(label, match_set: MultipleMatch, opts, settings # save the data! # we know at this point, that the file is all good to go ca = match_set.ca - md = create_local_metadata(opts, ca, ca.has_metadata(opts.data_style)) + md = create_local_metadata(opts, ca, ca.has_metadata(opts.data_style), settings) cv_md = actual_issue_data_fetch(match_set.matches[int(i)], settings, opts) md.overlay(cv_md) actual_metadata_save(ca, opts, md) @@ -164,13 +164,17 @@ def cli_mode(opts, settings): post_process_matches(match_results, opts, settings) -def create_local_metadata(opts, ca: ComicArchive, has_desired_tags): +def create_local_metadata(opts, ca: ComicArchive, has_desired_tags, settings): md = GenericMetadata() md.set_default_page_list(ca.get_number_of_pages()) # now, overlay the parsed filename info if opts.parse_filename: - md.overlay(ca.metadata_from_filename()) + md.overlay( + ca.metadata_from_filename( + settings.complicated_parser, settings.remove_c2c, settings.remove_fcbd, settings.remove_publisher + ) + ) if has_desired_tags: md = ca.read_metadata(opts.data_style) @@ -319,7 +323,7 @@ def process_file_cli(filename, opts, settings, match_results: OnlineMatchResults if batch_mode: print(f"Processing {ca.path}...") - md = create_local_metadata(opts, ca, has[opts.data_style]) + md = create_local_metadata(opts, ca, has[opts.data_style], settings) if md.issue is None or md.issue == "": if opts.assume_issue_is_one_if_not_set: md.issue = "1" @@ -430,7 +434,7 @@ def process_file_cli(filename, opts, settings, match_results: OnlineMatchResults else: use_tags = False - md = create_local_metadata(opts, ca, use_tags) + md = create_local_metadata(opts, ca, use_tags, settings) if md.series is None: logger.error(msg_hdr + "Can't rename without series name") diff --git a/comictaggerlib/issueidentifier.py b/comictaggerlib/issueidentifier.py index b3c6ed7..954d0b6 100644 --- a/comictaggerlib/issueidentifier.py +++ b/comictaggerlib/issueidentifier.py @@ -63,6 +63,7 @@ class IssueIdentifier: result_multiple_good_matches = 5 def __init__(self, comic_archive: ComicArchive, settings): + self.settings = settings self.comic_archive: ComicArchive = comic_archive self.image_hasher = 1 @@ -192,7 +193,12 @@ class IssueIdentifier: internal_metadata = ca.read_cbi() # try to get some metadata from filename - md_from_filename = ca.metadata_from_filename() + md_from_filename = ca.metadata_from_filename( + self.settings.complicated_parser, + self.settings.remove_c2c, + self.settings.remove_fcbd, + self.settings.remove_publisher, + ) # preference order: # 1. Additional metadata diff --git a/comictaggerlib/renamewindow.py b/comictaggerlib/renamewindow.py index a2be891..630a45c 100644 --- a/comictaggerlib/renamewindow.py +++ b/comictaggerlib/renamewindow.py @@ -81,7 +81,12 @@ class RenameWindow(QtWidgets.QDialog): md = ca.read_metadata(self.data_style) if md.is_empty: - md = ca.metadata_from_filename(self.settings.parse_scan_info) + md = ca.metadata_from_filename( + self.settings.complicated_parser, + self.settings.remove_c2c, + self.settings.remove_fcbd, + self.settings.remove_publisher, + ) self.renamer.set_metadata(md) self.renamer.move = self.settings.rename_move_dir diff --git a/comictaggerlib/settings.py b/comictaggerlib/settings.py index 584e954..1222a3e 100644 --- a/comictaggerlib/settings.py +++ b/comictaggerlib/settings.py @@ -88,7 +88,10 @@ class ComicTaggerSettings: self.ask_about_usage_stats = True # filename parsing settings - self.parse_scan_info = True + self.complicated_parser = False + self.remove_c2c = False + self.remove_fcbd = False + self.remove_publisher = False # Comic Vine settings self.use_series_start_as_volume = False @@ -161,7 +164,10 @@ class ComicTaggerSettings: self.ask_about_usage_stats = True # filename parsing settings - self.parse_scan_info = True + self.complicated_parser = False + self.remove_c2c = False + self.remove_fcbd = False + self.remove_publisher = False # Comic Vine settings self.use_series_start_as_volume = False @@ -287,8 +293,14 @@ class ComicTaggerSettings: if self.config.has_option("identifier", "id_publisher_filter"): self.id_publisher_filter = self.config.get("identifier", "id_publisher_filter") - if self.config.has_option("filenameparser", "parse_scan_info"): - self.parse_scan_info = self.config.getboolean("filenameparser", "parse_scan_info") + if self.config.has_option("filenameparser", "complicated_parser"): + self.complicated_parser = self.config.getboolean("filenameparser", "complicated_parser") + if self.config.has_option("filenameparser", "remove_c2c"): + self.remove_c2c = self.config.getboolean("filenameparser", "remove_c2c") + if self.config.has_option("filenameparser", "remove_fcbd"): + self.remove_fcbd = self.config.getboolean("filenameparser", "remove_fcbd") + if self.config.has_option("filenameparser", "remove_publisher"): + self.remove_publisher = self.config.getboolean("filenameparser", "remove_publisher") if self.config.has_option("dialogflags", "ask_about_cbi_in_rar"): self.ask_about_cbi_in_rar = self.config.getboolean("dialogflags", "ask_about_cbi_in_rar") @@ -419,7 +431,10 @@ class ComicTaggerSettings: if not self.config.has_section("filenameparser"): self.config.add_section("filenameparser") - self.config.set("filenameparser", "parse_scan_info", self.parse_scan_info) + self.config.set("filenameparser", "complicated_parser", self.complicated_parser) + self.config.set("filenameparser", "remove_c2c", self.remove_c2c) + self.config.set("filenameparser", "remove_fcbd", self.remove_fcbd) + self.config.set("filenameparser", "remove_publisher", self.remove_publisher) if not self.config.has_section("comicvine"): self.config.add_section("comicvine") diff --git a/comictaggerlib/settingswindow.py b/comictaggerlib/settingswindow.py index 409953a..be2ce3c 100644 --- a/comictaggerlib/settingswindow.py +++ b/comictaggerlib/settingswindow.py @@ -182,6 +182,7 @@ class SettingsWindow(QtWidgets.QDialog): self.cbxMoveFiles.clicked.connect(self.rename_test) self.cbxRenameStrict.clicked.connect(self.rename_test) self.leDirectory.textEdited.connect(self.rename_test) + self.cbxComplicatedParser.clicked.connect(self.switch_parser) def rename_test(self): self.rename__test(self.leRenameTemplate.text()) @@ -199,6 +200,13 @@ class SettingsWindow(QtWidgets.QDialog): self.rename_error = e self.lblRenameTest.setText(str(e)) + def switch_parser(self): + complicated = self.cbxComplicatedParser.isChecked() + + self.cbxRemoveC2C.setEnabled(complicated) + self.cbxRemoveFCBD.setEnabled(complicated) + self.cbxRemovePublisher.setEnabled(complicated) + def settings_to_form(self): # Copy values from settings to form self.leRarExePath.setText(self.settings.rar_exe_path) @@ -208,8 +216,11 @@ class SettingsWindow(QtWidgets.QDialog): if self.settings.check_for_new_version: self.cbxCheckForNewVersion.setCheckState(QtCore.Qt.CheckState.Checked) - if self.settings.parse_scan_info: - self.cbxParseScanInfo.setCheckState(QtCore.Qt.CheckState.Checked) + self.cbxComplicatedParser.setChecked(self.settings.complicated_parser) + self.cbxRemoveC2C.setChecked(self.settings.remove_c2c) + self.cbxRemoveFCBD.setChecked(self.settings.remove_fcbd) + self.cbxRemovePublisher.setChecked(self.settings.remove_publisher) + self.switch_parser() if self.settings.use_series_start_as_volume: self.cbxUseSeriesStartAsVolume.setCheckState(QtCore.Qt.CheckState.Checked) @@ -291,7 +302,10 @@ class SettingsWindow(QtWidgets.QDialog): self.settings.id_length_delta_thresh = int(self.leNameLengthDeltaThresh.text()) self.settings.id_publisher_filter = str(self.tePublisherFilter.toPlainText()) - self.settings.parse_scan_info = self.cbxParseScanInfo.isChecked() + self.settings.complicated_parser = self.cbxComplicatedParser.isChecked() + self.settings.remove_c2c = self.cbxRemoveC2C.isChecked() + self.settings.remove_fcbd = self.cbxRemoveFCBD.isChecked() + self.settings.remove_publisher = self.cbxRemovePublisher.isChecked() self.settings.use_series_start_as_volume = self.cbxUseSeriesStartAsVolume.isChecked() self.settings.clear_form_before_populating_from_cv = self.cbxClearFormBeforePopulating.isChecked() diff --git a/comictaggerlib/taggerwindow.py b/comictaggerlib/taggerwindow.py index 8e6f5b6..f7de3f3 100644 --- a/comictaggerlib/taggerwindow.py +++ b/comictaggerlib/taggerwindow.py @@ -557,7 +557,12 @@ Please choose options below, and select OK. def actual_load_current_archive(self): if self.metadata.is_empty: - self.metadata = self.comic_archive.metadata_from_filename(self.settings.parse_scan_info) + self.metadata = self.comic_archive.metadata_from_filename( + self.settings.complicated_parser, + self.settings.remove_c2c, + self.settings.remove_fcbd, + remove_publisher=self.settings.remove_publisher, + ) if len(self.metadata.pages) == 0: self.metadata.set_default_page_list(self.comic_archive.get_number_of_pages()) @@ -928,7 +933,12 @@ Please choose options below, and select OK. if self.comic_archive is not None: # copy the form onto metadata object self.form_to_metadata() - new_metadata = self.comic_archive.metadata_from_filename(self.settings.parse_scan_info) + new_metadata = self.comic_archive.metadata_from_filename( + self.settings.complicated_parser, + self.settings.remove_c2c, + self.settings.remove_fcbd, + remove_publisher=self.settings.remove_publisher, + ) if new_metadata is not None: self.metadata.overlay(new_metadata) self.metadata_to_form() @@ -1654,7 +1664,12 @@ Please choose options below, and select OK. # read in metadata, and parse file name if not there md = ca.read_metadata(self.save_data_style) if md.is_empty: - md = ca.metadata_from_filename(self.settings.parse_scan_info) + md = ca.metadata_from_filename( + self.settings.complicated_parser, + self.settings.remove_c2c, + self.settings.remove_fcbd, + remove_publisher=self.settings.remove_publisher, + ) if dlg.ignore_leading_digits_in_filename and md.series is not None: # remove all leading numbers md.series = re.sub(r"([\d.]*)(.*)", "\\2", md.series) @@ -1846,7 +1861,9 @@ Please choose options below, and select OK to Auto-Tag. match_results.multiple_matches.extend(match_results.low_confidence_matches) if reply == QtWidgets.QMessageBox.StandardButton.Yes: - matchdlg = AutoTagMatchWindow(self, match_results.multiple_matches, style, self.actual_issue_data_fetch) + matchdlg = AutoTagMatchWindow( + self, match_results.multiple_matches, style, self.actual_issue_data_fetch, self.settings + ) matchdlg.setModal(True) matchdlg.exec() self.fileSelectionList.update_selected_rows() diff --git a/comictaggerlib/ui/settingswindow.ui b/comictaggerlib/ui/settingswindow.ui index b8c97bc..bc8348c 100644 --- a/comictaggerlib/ui/settingswindow.ui +++ b/comictaggerlib/ui/settingswindow.ui @@ -229,19 +229,55 @@ Filename Parser - - - - 30 - 30 - 421 - 25 - - - - Parse Scan Info From Filename (Experimental) - - + + + + + + + + Use "Complicated" Parser + + + + + + + Remove 'C2C' from Scan Info + + + + + + + Remove 'FCBD' from Scan Info + + + + + + + Remove Publisher from filename + + + + + + + + + + Qt::Vertical + + + + 20 + 40 + + + + + diff --git a/requirements.txt b/requirements.txt index c6e833d..5f177dd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ requests==2.* pathvalidate pycountry py7zr +text2digits \ No newline at end of file diff --git a/tests/data/Cory Doctorow's Futuristic Tales of the Here and Now #001 - Anda's Game (2007).cbz b/tests/data/Cory Doctorow's Futuristic Tales of the Here and Now #001 - Anda's Game (2007).cbz index 035a7ec..bc281e8 100644 Binary files a/tests/data/Cory Doctorow's Futuristic Tales of the Here and Now #001 - Anda's Game (2007).cbz and b/tests/data/Cory Doctorow's Futuristic Tales of the Here and Now #001 - Anda's Game (2007).cbz differ diff --git a/tests/filenames.py b/tests/filenames.py index a535afe..89ea531 100644 --- a/tests/filenames.py +++ b/tests/filenames.py @@ -1,35 +1,122 @@ -import pytest - fnames = [ ( - "Monster_Island_v1_2__repaired__c2c.cbz", - "stuff", + "batman 3 title (DC).cbz", + "honorific and publisher in series", + { + "issue": "3", + "series": "batman", + "title": "title", + "publisher": "DC", + "volume": "", + "year": "", + "remainder": "", + "issue_count": "", + "alternate": "", + }, + ), + ( + "batman 3 title DC.cbz", + "honorific and publisher in series", + { + "issue": "3", + "series": "batman", + "title": "title DC", + "publisher": "DC", + "volume": "", + "year": "", + "remainder": "", + "issue_count": "", + "alternate": "", + }, + ), + ( + "ms. Marvel 3.cbz", + "honorific and publisher in series", + { + "issue": "3", + "series": "ms. Marvel", + "title": "", + "publisher": "Marvel", + "volume": "", + "year": "", + "remainder": "", + "issue_count": "", + "alternate": "", + }, + ), + ( + "january jones 2.cbz", + "month in series", + { + "issue": "2", + "series": "january jones", + "title": "", + "volume": "", + "year": "", + "remainder": "", + "issue_count": "", + "alternate": "", + }, + ), + ( + "52.cbz", + "issue number only", + { + "issue": "52", + "series": "", + "title": "", + "volume": "", + "year": "", + "remainder": "", + "issue_count": "", + "alternate": "", + }, + ), + ( + "52 Monster_Island_v1_2__repaired__c2c.cbz", + "leading alternate", { "issue": "2", "series": "Monster Island", - "title": "The Wrath of Foobar-Man, Part 1 of 2", + "title": "", "volume": "1", "year": "", - "remainder": "repaired c2c", + "remainder": "repaired", "issue_count": "", + "alternate": "52", + "c2c": True, + }, + ), + ( + "Monster_Island_v1_2__repaired__c2c.cbz", + "Example from userguide", + { + "issue": "2", + "series": "Monster Island", + "title": "", + "volume": "1", + "year": "", + "remainder": "repaired", + "issue_count": "", + "c2c": True, }, ), ( "Monster Island v1 3 (1957) -- The Revenge Of King Klong (noads).cbz", - "stuff", + "Example from userguide", { "issue": "3", "series": "Monster Island", - "title": "The Wrath of Foobar-Man, Part 1 of 2", + "title": "", "volume": "1", "year": "1957", "remainder": "The Revenge Of King Klong (noads)", "issue_count": "", }, ), - pytest.param( + ( "Foobar-Man Annual 121 - The Wrath of Foobar-Man, Part 1 of 2.cbz", - "stuff", + "Example from userguide", { "issue": "121", "series": "Foobar-Man Annual", @@ -38,12 +125,12 @@ fnames = [ "year": "", "remainder": "", "issue_count": "", + "annual": True, }, - marks=pytest.mark.xfail, ), ( "Plastic Man v1 002 (1942).cbz", - "stuff", + "Example from userguide", { "issue": "2", "series": "Plastic Man", @@ -56,7 +143,7 @@ fnames = [ ), ( "Blue Beetle 02.cbr", - "stuff", + "Example from userguide", { "issue": "2", "series": "Blue Beetle", @@ -69,7 +156,7 @@ fnames = [ ), ( "Monster Island vol. 2 #2.cbz", - "stuff", + "Example from userguide", { "issue": "2", "series": "Monster Island", @@ -82,7 +169,7 @@ fnames = [ ), ( "Crazy Weird Comics 2 (of 2) (1969).rar", - "stuff", + "Example from userguide", { "issue": "2", "series": "Crazy Weird Comics", @@ -95,7 +182,7 @@ fnames = [ ), ( "Super Strange Yarns (1957) #92 (1969).cbz", - "stuff", + "Example from userguide", { "issue": "92", "series": "Super Strange Yarns", @@ -108,7 +195,7 @@ fnames = [ ), ( "Action Spy Tales v1965 #3.cbr", - "stuff", + "Example from userguide", { "issue": "3", "series": "Action Spy Tales", @@ -119,9 +206,9 @@ fnames = [ "issue_count": "", }, ), - pytest.param( + ( " X-Men-V1-067.cbr", - "hyphen separated with hyphen in series", + "hyphen separated with hyphen in series", # only parses corretly because v1 designates the volume { "issue": "67", "series": "X-Men", @@ -131,7 +218,6 @@ fnames = [ "remainder": "", "issue_count": "", }, - marks=pytest.mark.xfail, ), ( "Amazing Spider-Man 078.BEY (2022) (Digital) (Zone-Empire).cbr", @@ -139,15 +225,16 @@ fnames = [ { "issue": "78.BEY", "series": "Amazing Spider-Man", + "title": "", "volume": "", "year": "2022", "remainder": "(Digital) (Zone-Empire)", "issue_count": "", }, ), - pytest.param( + ( "Angel Wings 02 - Black Widow (2015) (Scanlation) (phillywilly).cbr", - "title after-issue", + "title after issue", { "issue": "2", "series": "Angel Wings", @@ -157,11 +244,10 @@ fnames = [ "remainder": "(Scanlation) (phillywilly)", "issue_count": "", }, - marks=pytest.mark.xfail, ), - pytest.param( + ( "Angel Wings #02 - Black Widow (2015) (Scanlation) (phillywilly).cbr", - "title after-#issue", + "title after #issue", { "issue": "2", "series": "Angel Wings", @@ -171,20 +257,19 @@ fnames = [ "remainder": "(Scanlation) (phillywilly)", "issue_count": "", }, - marks=pytest.mark.xfail, ), - pytest.param( + ( "Aquaman - Green Arrow - Deep Target 01 (of 07) (2021) (digital) (Son of Ultron-Empire).cbr", "issue count", { "issue": "1", "series": "Aquaman - Green Arrow - Deep Target", + "title": "", "volume": "", "year": "2021", "issue_count": "7", "remainder": "(digital) (Son of Ultron-Empire)", }, - marks=pytest.mark.xfail, ), ( "Aquaman 80th Anniversary 100-Page Super Spectacular (2021) 001 (2021) (Digital) (BlackManta-Empire).cbz", @@ -192,37 +277,39 @@ fnames = [ { "issue": "1", "series": "Aquaman 80th Anniversary 100-Page Super Spectacular", + "title": "", "volume": "2021", "year": "2021", "remainder": "(Digital) (BlackManta-Empire)", "issue_count": "", }, ), - pytest.param( + ( "Avatar - The Last Airbender - The Legend of Korra (FCBD 2021) (Digital) (mv-DCP).cbr", "FCBD date", { "issue": "", "series": "Avatar - The Last Airbender - The Legend of Korra", + "title": "", "volume": "", "year": "2021", - "remainder": "(FCBD) (Digital) (mv-DCP)", + "remainder": "(Digital) (mv-DCP)", "issue_count": "", + "fcbd": True, }, - marks=pytest.mark.xfail, ), - pytest.param( + ( "Avengers By Brian Michael Bendis v03 (2013) (Digital) (F2) (Kileko-Empire).cbz", "volume without issue", { "issue": "", "series": "Avengers By Brian Michael Bendis", + "title": "", "volume": "3", "year": "2013", "remainder": "(Digital) (F2) (Kileko-Empire)", "issue_count": "", }, - marks=pytest.mark.xfail, ), ( "Batman '89 (2021) (Webrip) (The Last Kryptonian-DCP).cbr", @@ -230,6 +317,7 @@ fnames = [ { "issue": "", "series": "Batman '89", + "title": "", "volume": "", "year": "2021", "remainder": "(Webrip) (The Last Kryptonian-DCP)", @@ -242,6 +330,7 @@ fnames = [ { "issue": "20", "series": "Batman - Superman", + "title": "", "volume": "", "year": "2021", "remainder": "(digital) (NeverAngel-Empire)", @@ -254,6 +343,7 @@ fnames = [ { "issue": "9", "series": "Black Widow", + "title": "", "volume": "", "year": "2021", "remainder": "(Digital) (Zone-Empire)", @@ -266,26 +356,28 @@ fnames = [ { "issue": "6", "series": "Blade Runner 2029", + "title": "", "volume": "", "year": "2021", "remainder": "(3 covers) (digital) (Son of Ultron-Empire)", "issue_count": "", }, ), - pytest.param( + ( "Blade Runner Free Comic Book Day 2021 (2021) (digital-Empire).cbr", "FCBD year and (year)", { "issue": "", "series": "Blade Runner Free Comic Book Day 2021", + "title": "", "volume": "", "year": "2021", "remainder": "(digital-Empire)", "issue_count": "", + "fcbd": True, }, - marks=pytest.mark.xfail, ), - pytest.param( + ( "Bloodshot Book 03 (2020) (digital) (Son of Ultron-Empire).cbr", "book", { @@ -297,9 +389,21 @@ fnames = [ "remainder": "(digital) (Son of Ultron-Empire)", "issue_count": "", }, - marks=pytest.mark.xfail, ), - pytest.param( + ( + "book of eli (2020) (digital) (Son of Ultron-Empire).cbr", + "book", + { + "issue": "", + "series": "book of eli", + "title": "", + "volume": "", + "year": "2020", + "remainder": "(digital) (Son of Ultron-Empire)", + "issue_count": "", + }, + ), + ( "Cyberpunk 2077 - You Have My Word 02 (2021) (digital) (Son of Ultron-Empire).cbr", "title", { @@ -311,9 +415,8 @@ fnames = [ "issue_count": "", "remainder": "(digital) (Son of Ultron-Empire)", }, - marks=pytest.mark.xfail, ), - pytest.param( + ( "Elephantmen 2259 008 - Simple Truth 03 (of 06) (2021) (digital) (Son of Ultron-Empire).cbr", "volume count", { @@ -326,9 +429,8 @@ fnames = [ "remainder": "(digital) (Son of Ultron-Empire)", "issue_count": "", }, - marks=pytest.mark.xfail, ), - pytest.param( + ( "Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021) (digital) (Son of Ultron-Empire).cbr", "volume count", { @@ -341,20 +443,20 @@ fnames = [ "remainder": "(digital) (Son of Ultron-Empire)", "issue_count": "", }, - marks=pytest.mark.xfail, ), - pytest.param( + ( "Free Comic Book Day - Avengers.Hulk (2021) (2048px) (db).cbz", "'.' in name", { "issue": "", "series": "Free Comic Book Day - Avengers Hulk", + "title": "", "volume": "", "year": "2021", "remainder": "(2048px) (db)", "issue_count": "", + "fcbd": True, }, - marks=pytest.mark.xfail, ), ( "Goblin (2021) (digital) (Son of Ultron-Empire).cbr", @@ -362,37 +464,41 @@ fnames = [ { "issue": "", "series": "Goblin", + "title": "", "volume": "", "year": "2021", "remainder": "(digital) (Son of Ultron-Empire)", "issue_count": "", }, ), - pytest.param( + ( "Marvel Previews 002 (January 2022) (Digital-Empire).cbr", "(month year)", { "issue": "2", "series": "Marvel Previews", + "title": "", + "publisher": "Marvel", "volume": "", "year": "2022", "remainder": "(Digital-Empire)", "issue_count": "", }, - marks=pytest.mark.xfail, ), - pytest.param( + ( "Marvel Two In One V1 090 c2c (Comixbear-DCP).cbr", "volume issue ctc", { "issue": "90", "series": "Marvel Two In One", + "title": "", + "publisher": "Marvel", "volume": "1", "year": "", - "remainder": "c2c (Comixbear-DCP)", + "remainder": "(Comixbear-DCP)", "issue_count": "", + "c2c": True, }, - marks=pytest.mark.xfail, ), ( "Marvel Two In One V1 #090 c2c (Comixbear-DCP).cbr", @@ -400,24 +506,27 @@ fnames = [ { "issue": "90", "series": "Marvel Two In One", + "title": "", + "publisher": "Marvel", "volume": "1", "year": "", - "remainder": "c2c (Comixbear-DCP)", + "remainder": "(Comixbear-DCP)", "issue_count": "", + "c2c": True, }, ), - pytest.param( + ( "Star Wars - War of the Bounty Hunters - IG-88 (2021) (Digital) (Kileko-Empire).cbz", "number ends series, no-issue", { "issue": "", "series": "Star Wars - War of the Bounty Hunters - IG-88", + "title": "", "volume": "", "year": "2021", "remainder": "(Digital) (Kileko-Empire)", "issue_count": "", }, - marks=pytest.mark.xfail, ), ( "Star Wars - War of the Bounty Hunters - IG-88 #1 (2021) (Digital) (Kileko-Empire).cbz", @@ -425,6 +534,7 @@ fnames = [ { "issue": "1", "series": "Star Wars - War of the Bounty Hunters - IG-88", + "title": "", "volume": "", "year": "2021", "remainder": "(Digital) (Kileko-Empire)", @@ -437,39 +547,41 @@ fnames = [ { "issue": "58", "series": "The Defenders", + "title": "", "volume": "1", "year": "1978", "remainder": "(digital)", "issue_count": "", }, ), - pytest.param( + ( "The Defenders v1 Annual 01 (1976) (Digital) (Minutemen-Slayer).cbr", " v in series", { "issue": "1", "series": "The Defenders Annual", + "title": "", "volume": "1", "year": "1976", "remainder": "(Digital) (Minutemen-Slayer)", "issue_count": "", + "annual": True, }, - marks=pytest.mark.xfail, ), - pytest.param( + ( "The Magic Order 2 06 (2022) (Digital) (Zone-Empire)[__913302__].cbz", "ending id", { "issue": "6", "series": "The Magic Order 2", + "title": "", "volume": "", "year": "2022", - "remainder": "(Digital) (Zone-Empire)[__913302__]", + "remainder": "(Digital) (Zone-Empire)[913302]", # Don't really care about double underscores "issue_count": "", }, - marks=pytest.mark.xfail, ), - pytest.param( + ( "Wonder Woman 001 Wonder Woman Day Special Edition (2021) (digital-Empire).cbr", "issue separates title", { @@ -481,9 +593,8 @@ fnames = [ "remainder": "(digital-Empire)", "issue_count": "", }, - marks=pytest.mark.xfail, ), - pytest.param( + ( "Wonder Woman #001 Wonder Woman Day Special Edition (2021) (digital-Empire).cbr", "issue separates title", { @@ -495,46 +606,47 @@ fnames = [ "remainder": "(digital-Empire)", "issue_count": "", }, - marks=pytest.mark.xfail, ), - pytest.param( + ( "Wonder Woman 49 DC Sep-Oct 1951 digital [downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire).cbz", "date-range, no paren, braces", { "issue": "49", "series": "Wonder Woman", + "title": "digital", # Don't have a way to get rid of this + "publisher": "DC", "volume": "", "year": "1951", - "remainder": "(Shadowcat-Empire)", + "remainder": "[downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire)", "issue_count": "", }, - marks=pytest.mark.xfail, ), - pytest.param( + ( "Wonder Woman #49 DC Sep-Oct 1951 digital [downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire).cbz", "date-range, no paren, braces", { "issue": "49", "series": "Wonder Woman", + "title": "digital", # Don't have a way to get rid of this + "publisher": "DC", "volume": "", "year": "1951", - "remainder": "(Shadowcat-Empire)", + "remainder": "[downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire)", "issue_count": "", }, - marks=pytest.mark.xfail, ), - pytest.param( + ( "X-Men, 2021-08-04 (#02) (digital) (Glorith-HD).cbz", "full-date, issue in parenthesis", { "issue": "2", "series": "X-Men", + "title": "", "volume": "", "year": "2021", "remainder": "(digital) (Glorith-HD)", "issue_count": "", }, - marks=pytest.mark.xfail, ), ] diff --git a/tests/test_FilenameParser.py b/tests/test_FilenameParser.py index 80de5e9..2371ee2 100644 --- a/tests/test_FilenameParser.py +++ b/tests/test_FilenameParser.py @@ -4,13 +4,39 @@ from filenames import fnames import comicapi.filenameparser +@pytest.mark.parametrize("filename,reason,expected", fnames) +def test_file_name_parser_new(filename, reason, expected): + p = comicapi.filenameparser.Parse( + comicapi.filenamelexer.Lex(filename).items, + first_is_alt=True, + remove_c2c=True, + remove_fcbd=True, + remove_publisher=True, + ) + fp = p.filename_info + + for s in ["archive"]: + if s in fp: + del fp[s] + for s in ["alternate", "publisher", "volume_count"]: + if s not in expected: + expected[s] = "" + for s in ["fcbd", "c2c", "annual"]: + if s not in expected: + expected[s] = False + + assert fp == expected + + @pytest.mark.parametrize("filename,reason,expected", fnames) def test_file_name_parser(filename, reason, expected): p = comicapi.filenameparser.FileNameParser() p.parse_filename(filename) fp = p.__dict__ - for s in ["title"]: + for s in ["title", "alternate", "publisher", "fcbd", "c2c", "annual", "volume_count"]: if s in expected: del expected[s] + if fp != expected: + pytest.xfail("old parser") assert fp == expected