From f03b2e58cf5763205818482be184a73b7771abdb Mon Sep 17 00:00:00 2001 From: Timmy Welch Date: Sun, 15 Oct 2023 15:47:04 -0700 Subject: [PATCH 1/4] Improve lexing numbers lex currency amounts as text lex a '.' followed by a number as a number if there is a preceding space --- comicapi/filenamelexer.py | 30 +++++++++++++++++++++++++++--- comicapi/filenameparser.py | 8 +------- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/comicapi/filenamelexer.py b/comicapi/filenamelexer.py index 10a0c78..385ea21 100644 --- a/comicapi/filenamelexer.py +++ b/comicapi/filenamelexer.py @@ -30,10 +30,10 @@ class ItemType(Enum): InfoSpecifier = auto() # Specifies type of info e.g. v1 for 'volume': 1 ArchiveType = auto() Honorific = auto() + Publisher = auto() Keywords = auto() FCBD = auto() ComicType = auto() - Publisher = auto() C2C = auto() @@ -189,6 +189,8 @@ def lex_filename(lex: Lexer) -> Callable[[Lexer], Callable | None] | None: # ty return lex_space elif r == ".": r = lex.peek() + if r.isnumeric() and lex.pos > 0 and is_space(lex.input[lex.pos - 1]): + return lex_number lex.emit(ItemType.Dot) return lex_filename elif r == "'": @@ -196,7 +198,7 @@ def lex_filename(lex: Lexer) -> Callable[[Lexer], Callable | None] | None: # ty if r.isdigit(): return lex_number lex.emit(ItemType.Text) # TODO: Change to Text - elif "0" <= r <= "9": + elif r.isnumeric(): lex.backup() return lex_number elif r == "#": @@ -241,6 +243,8 @@ def lex_filename(lex: Lexer) -> Callable[[Lexer], Callable | None] | None: # ty if lex.sbrace_depth < 0: return errorf(lex, "unexpected right brace " + r) elif is_symbol(r): + if unicodedata.category(r) == "Sc": + return lex_currency lex.emit(ItemType.Symbol) else: return errorf(lex, "unrecognized character in action: " + r) @@ -248,6 +252,19 @@ def lex_filename(lex: Lexer) -> Callable[[Lexer], Callable | None] | None: # ty return lex_filename +def lex_currency(lex: Lexer) -> Callable: + orig = lex.pos + while is_space(lex.peek()): + lex.get() + if lex.peek().isnumeric(): + return lex_number + else: + lex.pos = orig + # We don't have a number with this currency symbol. Don't treat it special + lex.emit(ItemType.Symbol) + return lex_filename + + def lex_operator(lex: Lexer) -> Callable: # type: ignore[type-arg] lex.accept_run("-|:;") lex.emit(ItemType.Operator) @@ -315,7 +332,14 @@ def lex_number(lex: Lexer) -> Callable[[Lexer], Callable | None] | None: # type # Assume that 80th is just text and not a number lex.emit(ItemType.Text) else: - lex.emit(ItemType.Number) + orig = lex.pos + while is_space(lex.peek()): + lex.get() + if "Sc" in [unicodedata.category(lex.input[lex.start]), unicodedata.category(lex.get())]: + lex.emit(ItemType.Text) + else: + lex.pos = orig + lex.emit(ItemType.Number) return lex_filename diff --git a/comicapi/filenameparser.py b/comicapi/filenameparser.py index 35249b7..ceb3b10 100644 --- a/comicapi/filenameparser.py +++ b/comicapi/filenameparser.py @@ -1233,13 +1233,7 @@ def join_title(lst: list[filenamelexer.Item]) -> str: # No space if the next item is an operator or symbol if lst[i + 1].typ in [filenamelexer.ItemType.Operator, filenamelexer.ItemType.Symbol]: # exept if followed by a dollarsign - if not ( - ( - lst[i].typ in [filenamelexer.ItemType.Number, filenamelexer.ItemType.IssueNumber] - and lst[i + 1].val == "$" - ) - or lst[i + 1].val == "&" - ): + if lst[i + 1].val != "&": continue # Add a space From 496f3f0e7532cd485a7bfa5f2e5d7b4bddb3e2d9 Mon Sep 17 00:00:00 2001 From: Timmy Welch Date: Mon, 23 Oct 2023 21:57:23 -0700 Subject: [PATCH 2/4] fix reset after space --- comicapi/filenamelexer.py | 7 +++-- comicapi/filenameparser.py | 57 ++++++++++++++++---------------------- 2 files changed, 29 insertions(+), 35 deletions(-) diff --git a/comicapi/filenamelexer.py b/comicapi/filenamelexer.py index 385ea21..e780c48 100644 --- a/comicapi/filenamelexer.py +++ b/comicapi/filenamelexer.py @@ -335,11 +335,14 @@ def lex_number(lex: Lexer) -> Callable[[Lexer], Callable | None] | None: # type orig = lex.pos while is_space(lex.peek()): lex.get() - if "Sc" in [unicodedata.category(lex.input[lex.start]), unicodedata.category(lex.get())]: + if "Sc" == unicodedata.category(lex.get()): lex.emit(ItemType.Text) else: lex.pos = orig - lex.emit(ItemType.Number) + if "Sc" == unicodedata.category(lex.input[lex.start]): + lex.emit(ItemType.Text) + else: + lex.emit(ItemType.Number) return lex_filename diff --git a/comicapi/filenameparser.py b/comicapi/filenameparser.py index ceb3b10..b2ceb43 100644 --- a/comicapi/filenameparser.py +++ b/comicapi/filenameparser.py @@ -20,6 +20,7 @@ This should probably be re-written, but, well, it mostly works! # http://code.google.com/p/pycomicmetathis/ from __future__ import annotations +import functools import logging import os import re @@ -540,17 +541,15 @@ def parse(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ign p.peek_back().typ == filenamelexer.ItemType.Space and p.peek_back(2).typ in (filenamelexer.ItemType.IssueNumber, filenamelexer.ItemType.Number) ): - return parse_series + return functools.partial(parse_series, i=item) if ( p.peek_back().typ == filenamelexer.ItemType.Operator or p.peek().typ == filenamelexer.ItemType.Operator ): # Were not in something and the next or previous type is an operator, add it to the series - p.series_parts.append(item) p.used_items.append(item) - p.get() - return parse_series + return functools.partial(parse_series, i=item) # Number with a leading hash e.g. #003 elif item.typ == filenamelexer.ItemType.IssueNumber: @@ -583,10 +582,10 @@ def parse(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ign if p.firstItem: p.firstItem = False if p.in_something == 0: - return parse_series + return functools.partial(parse_series, i=item) p.publisher_removed.append(item) if p.in_something == 0: - return parse_series + return functools.partial(parse_series, i=item) # Attempts to identify the type e.g. annual elif item.typ == filenamelexer.ItemType.ComicType: @@ -622,11 +621,10 @@ def parse(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ign # If we don't have a reason to exclude it from the series go back to parsing the series immediately if series_append: - p.series_parts.append(item) p.used_items.append(item) if p.firstItem: p.firstItem = False - return parse_series + return functools.partial(parse_series, i=item) # We found text, it's probably the title or series elif item.typ in [filenamelexer.ItemType.Text, filenamelexer.ItemType.Honorific]: @@ -634,7 +632,7 @@ def parse(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ign if p.firstItem: p.firstItem = False if p.in_something == 0: - return parse_series + return functools.partial(parse_series, i=None) # TODO # Usually the word 'of' eg 1 (of 6) elif item.typ == filenamelexer.ItemType.InfoSpecifier: @@ -662,15 +660,13 @@ def parse(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ign p.backup() if p.firstItem: p.firstItem = False - return parse_series + return functools.partial(parse_series, i=item) # This is text that just happens to also be a month/day else: - # Add this to the series and get the next item, parse_series expects the next item to be the current item - p.series_parts.append(item) p.get() if p.firstItem: p.firstItem = False - return parse_series + return functools.partial(parse_series, i=item) # Specifically '__' or '--', no further title/series parsing is done to keep compatibility with wiki elif item.typ == filenamelexer.ItemType.Skip: @@ -745,7 +741,7 @@ def parse_issue_number(p: Parser) -> Callable[[Parser], Callable | None] | None: return parse -def parse_series(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ignore[type-arg] +def parse_series(p: Parser, i: filenamelexer.Item) -> Callable[[Parser], Callable | None] | None: # type: ignore[type-arg] item = p.input[p.pos] current_part = 0 prev_space = False @@ -753,18 +749,23 @@ def parse_series(p: Parser) -> Callable[[Parser], Callable | None] | None: # ty title_parts: list[filenamelexer.Item] = [] series_parts: list[filenamelexer.Item] = [] series: list[list[filenamelexer.Item]] = [[]] + issue_marked_or_passed = ( + p.issue_number_marked and p.issue_number_passed or p.issue_number_at is not None and not p.issue_number_marked + ) # We stop parsing the series when certain things come up if nothing was done with them continue where we left off - if p.peek_back().typ in [filenamelexer.ItemType.Number, filenamelexer.ItemType.Calendar]: - series_parts = p.series_parts - p.series_parts = [] + if i: + if issue_marked_or_passed: + series[0].append(i) + else: + series_parts = p.series_parts + p.series_parts = [] + series_parts.append(i) # Space and Dots are not useful at the beginning of a title/series if not p.skip and item.typ not in [filenamelexer.ItemType.Space, filenamelexer.ItemType.Dot]: - if item.typ == filenamelexer.ItemType.Text: + if item.typ in [filenamelexer.ItemType.Text, filenamelexer.ItemType.Honorific]: p.backup() - else: - series[0].append(item) # Skip is only true if we have come across '--' or '__' while not p.skip: item = p.get() @@ -847,12 +848,7 @@ def parse_series(p: Parser) -> Callable[[Parser], Callable | None] | None: # ty break # the issue number has been marked and passed, keep it as a part of the series - if ( - p.issue_number_marked - and p.issue_number_passed - or p.issue_number_at is not None - and not p.issue_number_marked - ): + if issue_marked_or_passed: # We already have an issue number, this should be a part of the series series[current_part].append(item) else: @@ -864,12 +860,7 @@ def parse_series(p: Parser) -> Callable[[Parser], Callable | None] | None: # ty # We have 1 number break here, it's possible it's the issue else: # the issue number has been #marked or passed, keep it as a part of the series - if ( - p.issue_number_marked - and p.issue_number_passed - or p.issue_number_at is not None - and not p.issue_number_marked - ): + if issue_marked_or_passed: # We already have an issue number, this should be a part of the series series[current_part].append(item) else: @@ -1153,7 +1144,7 @@ def parse_info_specifier(p: Parser) -> Callable[[Parser], Callable | None] | Non p.pos = [ind for ind, x in enumerate(p.input) if x == i][0] if not p.in_something: - return parse_series + return functools.partial(parse_series, i=i) return parse From 78060dff613a83b365fb68775f290f8f802a5a9f Mon Sep 17 00:00:00 2001 From: Timmy Welch Date: Thu, 26 Oct 2023 20:51:53 -0700 Subject: [PATCH 3/4] Rework parse_series --- comicapi/filenameparser.py | 178 +++++++++++++++++++++---------------- 1 file changed, 102 insertions(+), 76 deletions(-) diff --git a/comicapi/filenameparser.py b/comicapi/filenameparser.py index b2ceb43..f21fcd5 100644 --- a/comicapi/filenameparser.py +++ b/comicapi/filenameparser.py @@ -21,6 +21,7 @@ This should probably be re-written, but, well, it mostly works! from __future__ import annotations import functools +import itertools import logging import os import re @@ -390,6 +391,7 @@ class Parser: self.in_s_brace = 0 # In [] self.in_paren = 0 # In () self.year_candidates: list[tuple[bool, bool, filenamelexer.Item]] = [] + self.series: list[list[filenamelexer.Item]] = [] self.series_parts: list[filenamelexer.Item] = [] self.title_parts: list[filenamelexer.Item] = [] self.used_items: list[filenamelexer.Item] = [] @@ -425,11 +427,11 @@ class Parser: return self.input[self.pos] # Peek returns but does not consume the next Item in the input. - def peek(self) -> filenamelexer.Item: - if int(self.pos) >= len(self.input) - 1: + def peek(self, length: int = 1) -> filenamelexer.Item: + if int(self.pos) + length >= len(self.input): return eof - return self.input[self.pos + 1] + return self.input[self.pos + length] # Peek_back returns but does not step back the previous Item in the input. def peek_back(self, length: int = 1) -> filenamelexer.Item: @@ -474,7 +476,7 @@ def parse(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ign if p.issue_number_at is None: # Series has already been started/parsed, # filters out leading alternate numbers leading alternate number - if len(p.series_parts) > 0: + if len(p.series) > 0: return parse_issue_number else: p.operator_rejected.append(item) @@ -491,12 +493,9 @@ def parse(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ign # Is either a full year '2001' or a short year "'89" if len(item.val.lstrip("0")) == 4 or item.val[0] == "'": - series = " ".join([x.val for x in p.series_parts]) - if p.series_parts and series.casefold().endswith("free comic book day"): + series = " ".join([x.val for x in (p.series[-1] if p.series else [])]) + if p.series and series.casefold().endswith("free comic book day"): likely_issue_number = False - if p.in_something == 0: - # Append to series in case it is a part of the title, but only if were not inside parenthesis - p.series_parts.append(item) # Look for a full date as in 2022-04-22 if p.peek().typ in [ @@ -534,21 +533,29 @@ def parse(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ign likely_issue_number = likely_issue_number and item.val[0] != "'" p.year_candidates.append((likely_year, likely_issue_number, item)) + if p.in_something == 0: + # Append to series in case it is a part of the title, but only if were not inside parenthesis + if not p.series: + p.series.append([]) + p.series[-1].append(item) + + # We would use i=item but we want to force a split after year candidates + return functools.partial(parse_series, i=None) # Ensures that IG-88 gets added back to the series/title else: if p.in_something == 0: - if p.peek_back().typ in (filenamelexer.ItemType.IssueNumber, filenamelexer.ItemType.Number) or ( - p.peek_back().typ == filenamelexer.ItemType.Space - and p.peek_back(2).typ in (filenamelexer.ItemType.IssueNumber, filenamelexer.ItemType.Number) - ): - return functools.partial(parse_series, i=item) + to_series = ( + filenamelexer.ItemType.IssueNumber, + filenamelexer.ItemType.Number, + filenamelexer.ItemType.Operator, + ) if ( - p.peek_back().typ == filenamelexer.ItemType.Operator - or p.peek().typ == filenamelexer.ItemType.Operator + p.peek().typ in to_series + or (p.peek().typ == filenamelexer.ItemType.Space and p.peek(2).typ in to_series) + or p.peek_back().typ in to_series + or (p.peek_back().typ == filenamelexer.ItemType.Space and p.peek_back(2).typ in to_series) ): - # Were not in something and the next or previous type is an operator, add it to the series - p.used_items.append(item) - + # Were not in something and the next or previous type is an operator or number, add it to the series return functools.partial(parse_series, i=item) # Number with a leading hash e.g. #003 @@ -600,7 +607,7 @@ def parse(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ign number = p.get() # Mark volume info. Text will be added to the title/series later if item.val.casefold() in ["tpb"]: - p.title_parts.extend([item, number]) + # p.title_parts.extend([item, number]) p.filename_info["volume"] = t2do.convert(number.val) p.filename_info["issue"] = t2do.convert(number.val) @@ -631,8 +638,9 @@ def parse(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ign # Unset first item if p.firstItem: p.firstItem = False - if p.in_something == 0: - return functools.partial(parse_series, i=None) # TODO + if p.in_something == 0 and not p.skip: + p.backup() + return functools.partial(parse_series, i=None) # Usually the word 'of' eg 1 (of 6) elif item.typ == filenamelexer.ItemType.InfoSpecifier: @@ -741,31 +749,29 @@ def parse_issue_number(p: Parser) -> Callable[[Parser], Callable | None] | None: return parse -def parse_series(p: Parser, i: filenamelexer.Item) -> Callable[[Parser], Callable | None] | None: # type: ignore[type-arg] - item = p.input[p.pos] - current_part = 0 +# i=None is a split in the series +def parse_series(p: Parser, i: filenamelexer.Item | None) -> Callable[[Parser], Callable | None] | None: + current = [] prev_space = False - title_parts: list[filenamelexer.Item] = [] - series_parts: list[filenamelexer.Item] = [] - series: list[list[filenamelexer.Item]] = [[]] issue_marked_or_passed = ( p.issue_number_marked and p.issue_number_passed or p.issue_number_at is not None and not p.issue_number_marked ) - # We stop parsing the series when certain things come up if nothing was done with them continue where we left off if i: - if issue_marked_or_passed: - series[0].append(i) - else: - series_parts = p.series_parts - p.series_parts = [] - series_parts.append(i) + if not issue_marked_or_passed: + if p.series: + current = p.series.pop() + current.append(i) + else: + # If we are splitting we don't want to sart with these + while p.peek().typ in [ + filenamelexer.ItemType.Space, + filenamelexer.ItemType.Operator, + filenamelexer.ItemType.Symbol, + ]: + p.irrelevant.append(p.get()) - # Space and Dots are not useful at the beginning of a title/series - if not p.skip and item.typ not in [filenamelexer.ItemType.Space, filenamelexer.ItemType.Dot]: - if item.typ in [filenamelexer.ItemType.Text, filenamelexer.ItemType.Honorific]: - p.backup() # Skip is only true if we have come across '--' or '__' while not p.skip: item = p.get() @@ -780,14 +786,14 @@ def parse_series(p: Parser, i: filenamelexer.Item) -> Callable[[Parser], Callabl filenamelexer.ItemType.Publisher, filenamelexer.ItemType.Honorific, ]: - series[current_part].append(item) + current.append(item) if p.peek().typ == filenamelexer.ItemType.Dot: dot = p.get() if item.typ == filenamelexer.ItemType.Honorific or ( p.peek().typ == filenamelexer.ItemType.Space and item.typ in (filenamelexer.ItemType.Text, filenamelexer.ItemType.Publisher) ): - series[current_part].append(dot) + current.append(dot) else: p.backup() if item.typ == filenamelexer.ItemType.Publisher: @@ -797,14 +803,14 @@ def parse_series(p: Parser, i: filenamelexer.Item) -> Callable[[Parser], Callabl elif item.typ == filenamelexer.ItemType.InfoSpecifier: # Exception for 'of' if item.val.casefold() == "of": - series[current_part].append(item) + current.append(item) else: # This specifically lets 'X-Men-V1-067' parse correctly as Series: X-Men Volume: 1 Issue: 67 - while len(series[current_part]) > 0 and series[current_part][-1].typ not in [ + while len(current) > 0 and current[-1].typ not in [ filenamelexer.ItemType.Text, filenamelexer.ItemType.Symbol, ]: - p.irrelevant.append(series[current_part].pop()) + p.irrelevant.append(current.pop()) p.backup() break @@ -812,14 +818,13 @@ def parse_series(p: Parser, i: filenamelexer.Item) -> Callable[[Parser], Callabl peek = p.peek() # ': ' separates the title from the series, only the last section is considered the title if not prev_space and peek.typ in [filenamelexer.ItemType.Space]: - series.append([]) # Starts a new section - series[current_part].append(item) - current_part += 1 + current.append(item) + break else: # Force space around '-' makes 'batman - superman' stay otherwise we get 'batman-superman' if prev_space and peek.typ in [filenamelexer.ItemType.Space]: item.val = " " + item.val + " " - series[current_part].append(item) + current.append(item) # Stop processing series/title if a skip item is found elif item.typ == filenamelexer.ItemType.Skip: @@ -828,14 +833,18 @@ def parse_series(p: Parser, i: filenamelexer.Item) -> Callable[[Parser], Callabl elif item.typ == filenamelexer.ItemType.Number: # Special case for the word 'book' - if series[current_part] and series[current_part][-1].val.casefold() == "book": - title_parts.append(series[current_part].pop()) - title_parts.append(item) + if current and current[-1].val.casefold() == "book": + # Mark the volume p.filename_info["volume"] = t2do.convert(item.val) + + # Add this section to the series EG [['bloodshot', 'book']] + p.series.append(current) + # Pop the last item and break to end this section EG [['bloodshot'], ['book', '3']] + current = [current.pop(), item] break count = get_number(p, p.pos + 1) - # this is an issue or volume number + # this is an issue or volume number eg '1 of 2' if count is not None: p.backup() break @@ -844,13 +853,13 @@ def parse_series(p: Parser, i: filenamelexer.Item) -> Callable[[Parser], Callabl p.get() # We have 2 numbers, add the first to the series and then go back to parse if p.peek().typ in [filenamelexer.ItemType.Number, filenamelexer.ItemType.IssueNumber]: - series[current_part].append(item) + current.append(item) break # the issue number has been marked and passed, keep it as a part of the series if issue_marked_or_passed: # We already have an issue number, this should be a part of the series - series[current_part].append(item) + current.append(item) else: # We have 1 number break here, it's possible it's the issue p.backup() # Whitespace @@ -862,7 +871,7 @@ def parse_series(p: Parser, i: filenamelexer.Item) -> Callable[[Parser], Callabl # the issue number has been #marked or passed, keep it as a part of the series if issue_marked_or_passed: # We already have an issue number, this should be a part of the series - series[current_part].append(item) + current.append(item) else: p.backup() # The number break @@ -871,13 +880,13 @@ def parse_series(p: Parser, i: filenamelexer.Item) -> Callable[[Parser], Callabl # Ensure 'ms. marvel' parses 'ms.' correctly if item.typ == filenamelexer.ItemType.Dot: if p.peek_back().typ == filenamelexer.ItemType.Honorific: - series[current_part].append(item) + current.append(item) elif ( p.peek().typ == filenamelexer.ItemType.Number or p.peek_back().typ == filenamelexer.ItemType.Text and len(p.peek_back().val) == 1 ): - series[current_part].append(item) + current.append(item) item.no_space = True # Allows avengers.hulk to parse correctly elif p.peek().typ in (filenamelexer.ItemType.Text,): @@ -889,24 +898,7 @@ def parse_series(p: Parser, i: filenamelexer.Item) -> Callable[[Parser], Callabl prev_space = False - # We have a title separator e.g. ': " - if len(series) > 1: - title_parts.extend(series.pop()) - for s in series: - if s and s[-1].typ == filenamelexer.ItemType.Operator: - s[-1].val += " " # Ensures that when there are multiple separators that they display properly - series_parts.extend(s) - p.used_items.append(series_parts.pop()) - else: - series_parts.extend(series[0]) - - # If the series has already been set assume all of this is the title. - if len(p.series_parts) > 0: - p.title_parts.extend(series_parts) - p.title_parts.extend(title_parts) - else: - p.series_parts.extend(series_parts) - p.title_parts.extend(title_parts) + p.series.append(current) return parse @@ -1008,7 +1000,40 @@ def resolve_issue(p: Parser) -> None: p.filename_info["format"] = protofolius_issue_number_scheme[p.filename_info["issue"][0].upper()] +def split_series(items: list[list[filenamelexer.Item]]) -> tuple[list[filenamelexer.Item], list[filenamelexer.Item]]: + series_parts: list[list[filenamelexer.Item]] = [] + title_parts: list[list[filenamelexer.Item]] = [] + current = series_parts + # We probably have a title + if len(items) > 1: + for i, s in enumerate(items): + # Switch to title if we are on the last part + if i == len(items) - 1: + current = title_parts + if s: + current.append(s) + if s[-1].typ == filenamelexer.ItemType.Operator: + s[-1].val += " " # Ensures that when there are multiple separators that they display properly + else: # We don't have an operator separating the parts, it's probably an issue number + current = title_parts + else: + if items: + series_parts.extend(items) + + series: list[filenamelexer.Item] = list(itertools.chain.from_iterable(series_parts)) + title: list[filenamelexer.Item] = list(itertools.chain.from_iterable(title_parts)) + if series and series[-1].typ == filenamelexer.ItemType.Operator: + series.pop() + return series, title + + def parse_finish(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ignore[type-arg] + for part in p.series: + p.used_items.extend(part) + p.series_parts, p.title_parts = split_series(p.series) + p.filename_info["series"] = join_title(p.series_parts) + p.filename_info["title"] = join_title(p.title_parts) + resolve_year(p) resolve_issue(p) @@ -1140,7 +1165,8 @@ def parse_info_specifier(p: Parser) -> Callable[[Parser], Callable | None] | Non # TODO: Figure out what to do here if it ever happens pass else: - # Lets 'The Wrath of Foobar-Man, Part 1 of 2' parse correctly as the title + # Resets back to '1' in 'The Wrath of Foobar-Man, Part 1 of 2' + # we then go to parse_series it adds i (the '1') and then continues parsing at of p.pos = [ind for ind, x in enumerate(p.input) if x == i][0] if not p.in_something: From bd9b3522d866dde456ad775a3448facb92135cc0 Mon Sep 17 00:00:00 2001 From: Timmy Welch Date: Fri, 27 Oct 2023 00:03:42 -0700 Subject: [PATCH 4/4] Improve edge cases Lex `'` as a symbol Lex multiple symbols as a single item Prefer `$` at the start of a number Simplify issue number parsing --- comicapi/filenamelexer.py | 90 +++++++++++++++++++++++--------------- comicapi/filenameparser.py | 29 ++++-------- testing/filenames.py | 15 +++++++ 3 files changed, 78 insertions(+), 56 deletions(-) diff --git a/comicapi/filenamelexer.py b/comicapi/filenamelexer.py index e780c48..7dc713a 100644 --- a/comicapi/filenamelexer.py +++ b/comicapi/filenamelexer.py @@ -130,17 +130,25 @@ class Lexer: self.start = self.pos # Accept consumes the next rune if it's from the valid se: - def accept(self, valid: str) -> bool: - if self.get() in valid: - return True + def accept(self, valid: str | Callable[[str], bool]) -> bool: + if isinstance(valid, str): + if self.get() in valid: + return True + else: + if valid(self.get()): + return True self.backup() return False # AcceptRun consumes a run of runes from the valid set. - def accept_run(self, valid: str) -> None: - while self.get() in valid: - continue + def accept_run(self, valid: str | Callable[[str], bool]) -> None: + if isinstance(valid, str): + while self.get() in valid: + continue + else: + while valid(self.get()): + continue self.backup() @@ -150,9 +158,7 @@ class Lexer: self.accept_run(digits) if self.input[self.pos] == ".": self.backup() - while self.get().isalpha(): - ... - self.backup() + self.accept_run(str.isalpha) return True @@ -197,7 +203,8 @@ def lex_filename(lex: Lexer) -> Callable[[Lexer], Callable | None] | None: # ty r = lex.peek() if r.isdigit(): return lex_number - lex.emit(ItemType.Text) # TODO: Change to Text + lex.accept_run(is_symbol) + lex.emit(ItemType.Symbol) elif r.isnumeric(): lex.backup() return lex_number @@ -245,17 +252,17 @@ def lex_filename(lex: Lexer) -> Callable[[Lexer], Callable | None] | None: # ty elif is_symbol(r): if unicodedata.category(r) == "Sc": return lex_currency + lex.accept_run(is_symbol) lex.emit(ItemType.Symbol) else: - return errorf(lex, "unrecognized character in action: " + r) + return errorf(lex, "unrecognized character in action: " + repr(r)) return lex_filename def lex_currency(lex: Lexer) -> Callable: orig = lex.pos - while is_space(lex.peek()): - lex.get() + lex.accept_run(is_space) if lex.peek().isnumeric(): return lex_number else: @@ -274,8 +281,7 @@ def lex_operator(lex: Lexer) -> Callable: # type: ignore[type-arg] # LexSpace scans a run of space characters. # One space has already been seen. def lex_space(lex: Lexer) -> Callable: # type: ignore[type-arg] - while is_space(lex.peek()): - lex.get() + lex.accept_run(is_space) lex.emit(ItemType.Space) return lex_filename @@ -332,17 +338,37 @@ def lex_number(lex: Lexer) -> Callable[[Lexer], Callable | None] | None: # type # Assume that 80th is just text and not a number lex.emit(ItemType.Text) else: - orig = lex.pos - while is_space(lex.peek()): - lex.get() - if "Sc" == unicodedata.category(lex.get()): + # Used to check for a '$' + endNumber = lex.pos + + # Consume any spaces + lex.accept_run(is_space) + + # This number starts with a '$' emit it as Text instead of a Number + if "Sc" == unicodedata.category(lex.input[lex.start]): + lex.pos = endNumber lex.emit(ItemType.Text) - else: - lex.pos = orig - if "Sc" == unicodedata.category(lex.input[lex.start]): - lex.emit(ItemType.Text) - else: + + # This number ends in a '$' if there is a number on the other side we assume it belongs to the following number + elif "Sc" == unicodedata.category(lex.get()): + # Store the end of the number '$'. We still need to check to see if there is a number coming up + endCurrency = lex.pos + # Consume any spaces + lex.accept_run(is_space) + + # This is a number + if lex.peek().isnumeric(): + # We go back to the original number before the '$' and emit a number + lex.pos = endNumber lex.emit(ItemType.Number) + else: + # There was no following number, reset to the '$' and emit a number + lex.pos = endCurrency + lex.emit(ItemType.Text) + else: + # We go back to the original number there is no '$' + lex.pos = endNumber + lex.emit(ItemType.Number) return lex_filename @@ -350,21 +376,13 @@ def lex_number(lex: Lexer) -> Callable[[Lexer], Callable | None] | None: # type def lex_issue_number(lex: Lexer) -> Callable[[Lexer], Callable | None] | None: # type: ignore[type-arg] # Only called when lex.input[lex.start] == "#" original_start = lex.pos - found_number = False - while True: - r = lex.get() - if is_alpha_numeric(r): - if r.isnumeric(): - found_number = True - else: - lex.backup() - break + lex.accept_run(str.isalpha) - if not found_number: + if lex.peek().isnumeric(): + return lex_number + else: lex.pos = original_start lex.emit(ItemType.Symbol) - else: - lex.emit(ItemType.IssueNumber) return lex_filename diff --git a/comicapi/filenameparser.py b/comicapi/filenameparser.py index f21fcd5..52dc92e 100644 --- a/comicapi/filenameparser.py +++ b/comicapi/filenameparser.py @@ -544,19 +544,8 @@ def parse(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ign # Ensures that IG-88 gets added back to the series/title else: if p.in_something == 0: - to_series = ( - filenamelexer.ItemType.IssueNumber, - filenamelexer.ItemType.Number, - filenamelexer.ItemType.Operator, - ) - if ( - p.peek().typ in to_series - or (p.peek().typ == filenamelexer.ItemType.Space and p.peek(2).typ in to_series) - or p.peek_back().typ in to_series - or (p.peek_back().typ == filenamelexer.ItemType.Space and p.peek_back(2).typ in to_series) - ): - # Were not in something and the next or previous type is an operator or number, add it to the series - return functools.partial(parse_series, i=item) + # We're not in something add it to the series + return functools.partial(parse_series, i=item) # Number with a leading hash e.g. #003 elif item.typ == filenamelexer.ItemType.IssueNumber: @@ -1031,8 +1020,6 @@ def parse_finish(p: Parser) -> Callable[[Parser], Callable | None] | None: # ty for part in p.series: p.used_items.extend(part) p.series_parts, p.title_parts = split_series(p.series) - p.filename_info["series"] = join_title(p.series_parts) - p.filename_info["title"] = join_title(p.title_parts) resolve_year(p) resolve_issue(p) @@ -1048,7 +1035,6 @@ def parse_finish(p: Parser) -> Callable[[Parser], Callable | None] | None: # ty if p.series_parts: p.filename_info["series"] = join_title(p.series_parts) - p.used_items.extend(p.series_parts) else: p.filename_info["series"] = p.filename_info.get("issue", "") @@ -1056,7 +1042,6 @@ def parse_finish(p: Parser) -> Callable[[Parser], Callable | None] | None: # ty p.filename_info["fcbd"] = True p.filename_info["title"] = join_title(p.title_parts) - p.used_items.extend(p.title_parts) p.irrelevant.extend([x for x in p.input if x.typ in p.remove_from_remainder]) @@ -1153,11 +1138,15 @@ def parse_info_specifier(p: Parser) -> Callable[[Parser], Callable | None] | Non p.used_items.append(item) p.used_items.append(number) - # This is not for the issue number it is not in either the issue or the title, - # assume it is the volume number and count - elif p.issue_number_at != i.pos and i not in p.series_parts and i not in p.title_parts: + # This is not for the issue number + # assume it is the volume number and count, remove from series + elif p.issue_number_at != i.pos: p.filename_info["volume"] = i.val p.filename_info["volume_count"] = str(int(t2do.convert(number.val))) + for part in p.series: + if i in part: + part.remove(i) + break p.used_items.append(i) p.used_items.append(item) p.used_items.append(number) diff --git a/testing/filenames.py b/testing/filenames.py index a2325bc..fa9138c 100644 --- a/testing/filenames.py +++ b/testing/filenames.py @@ -955,6 +955,21 @@ names: list[tuple[str, str, dict[str, str | bool], tuple[bool, bool]]] = [ }, (True, True), ), + ( + "Cory Doctorow's Futuristic Tales of the Here and Now $1$2 3 #0.0.1 (2007).cbz", + "$", + { + "archive": "cbz", + "issue": "0.1", + "series": "Cory Doctorow's Futuristic Tales of the Here and Now $1 $2 3", + "title": "", + "volume": "", + "year": "2007", + "remainder": "", + "issue_count": "", + }, + (True, True), + ), ] oldfnames = []