Merge branch 'lexNumbers' into develop

This commit is contained in:
Timmy Welch 2023-10-27 23:50:05 -07:00
commit b23c3195e3
3 changed files with 201 additions and 141 deletions

View File

@ -30,10 +30,10 @@ class ItemType(Enum):
InfoSpecifier = auto() # Specifies type of info e.g. v1 for 'volume': 1
ArchiveType = auto()
Honorific = auto()
Publisher = auto()
Keywords = auto()
FCBD = auto()
ComicType = auto()
Publisher = auto()
C2C = auto()
@ -130,17 +130,25 @@ class Lexer:
self.start = self.pos
# Accept consumes the next rune if it's from the valid set.
def accept(self, valid: str) -> bool:
if self.get() in valid:
return True
def accept(self, valid: str | Callable[[str], bool]) -> bool:
if isinstance(valid, str):
if self.get() in valid:
return True
else:
if valid(self.get()):
return True
self.backup()
return False
# AcceptRun consumes a run of runes from the valid set.
def accept_run(self, valid: str) -> None:
while self.get() in valid:
continue
def accept_run(self, valid: str | Callable[[str], bool]) -> None:
if isinstance(valid, str):
while self.get() in valid:
continue
else:
while valid(self.get()):
continue
self.backup()
@ -150,9 +158,7 @@ class Lexer:
self.accept_run(digits)
if self.input[self.pos] == ".":
self.backup()
while self.get().isalpha():
...
self.backup()
self.accept_run(str.isalpha)
return True
@ -189,14 +195,17 @@ def lex_filename(lex: Lexer) -> Callable[[Lexer], Callable | None] | None: # ty
return lex_space
elif r == ".":
r = lex.peek()
if r.isnumeric() and lex.pos > 0 and is_space(lex.input[lex.pos - 1]):
return lex_number
lex.emit(ItemType.Dot)
return lex_filename
elif r == "'":
r = lex.peek()
if r.isdigit():
return lex_number
lex.emit(ItemType.Text) # TODO: Change to Text
elif "0" <= r <= "9":
lex.accept_run(is_symbol)
lex.emit(ItemType.Symbol)
elif r.isnumeric():
lex.backup()
return lex_number
elif r == "#":
@ -241,13 +250,28 @@ def lex_filename(lex: Lexer) -> Callable[[Lexer], Callable | None] | None: # ty
if lex.sbrace_depth < 0:
return errorf(lex, "unexpected right brace " + r)
elif is_symbol(r):
if unicodedata.category(r) == "Sc":
return lex_currency
lex.accept_run(is_symbol)
lex.emit(ItemType.Symbol)
else:
return errorf(lex, "unrecognized character in action: " + r)
return errorf(lex, "unrecognized character in action: " + repr(r))
return lex_filename
def lex_currency(lex: Lexer) -> Callable:
    """Choose the next lexer state after a currency symbol.

    Entered from ``lex_filename`` when the current character is a symbol whose
    Unicode category is ``Sc``. Any run of spaces after the symbol is skipped
    and the following character is peeked at:

    * if it is numeric, lexing continues in ``lex_number`` with the skipped
      spaces left consumed;
    * otherwise the position is restored to where it was on entry, the pending
      text is emitted as a plain ``ItemType.Symbol``, and lexing resumes in
      ``lex_filename``.
    """
    resume_at = lex.pos
    lex.accept_run(is_space)
    if not lex.peek().isnumeric():
        # No number belongs to this currency symbol, so it gets no special treatment
        lex.pos = resume_at
        lex.emit(ItemType.Symbol)
        return lex_filename
    return lex_number
def lex_operator(lex: Lexer) -> Callable: # type: ignore[type-arg]
lex.accept_run("-|:;")
lex.emit(ItemType.Operator)
@ -257,8 +281,7 @@ def lex_operator(lex: Lexer) -> Callable: # type: ignore[type-arg]
# LexSpace scans a run of space characters.
# One space has already been seen.
def lex_space(lex: Lexer) -> Callable: # type: ignore[type-arg]
while is_space(lex.peek()):
lex.get()
lex.accept_run(is_space)
lex.emit(ItemType.Space)
return lex_filename
@ -315,7 +338,37 @@ def lex_number(lex: Lexer) -> Callable[[Lexer], Callable | None] | None: # type
# Assume that 80th is just text and not a number
lex.emit(ItemType.Text)
else:
lex.emit(ItemType.Number)
# Used to check for a '$'
endNumber = lex.pos
# Consume any spaces
lex.accept_run(is_space)
# This number starts with a '$' emit it as Text instead of a Number
if "Sc" == unicodedata.category(lex.input[lex.start]):
lex.pos = endNumber
lex.emit(ItemType.Text)
# This number ends in a '$' if there is a number on the other side we assume it belongs to the following number
elif "Sc" == unicodedata.category(lex.get()):
# Store the end of the number '$'. We still need to check to see if there is a number coming up
endCurrency = lex.pos
# Consume any spaces
lex.accept_run(is_space)
# This is a number
if lex.peek().isnumeric():
# We go back to the original number before the '$' and emit a number
lex.pos = endNumber
lex.emit(ItemType.Number)
else:
# There was no following number, reset to the '$' and emit it as Text
lex.pos = endCurrency
lex.emit(ItemType.Text)
else:
# We go back to the original number there is no '$'
lex.pos = endNumber
lex.emit(ItemType.Number)
return lex_filename
@ -323,21 +376,13 @@ def lex_number(lex: Lexer) -> Callable[[Lexer], Callable | None] | None: # type
def lex_issue_number(lex: Lexer) -> Callable[[Lexer], Callable | None] | None: # type: ignore[type-arg]
# Only called when lex.input[lex.start] == "#"
original_start = lex.pos
found_number = False
while True:
r = lex.get()
if is_alpha_numeric(r):
if r.isnumeric():
found_number = True
else:
lex.backup()
break
lex.accept_run(str.isalpha)
if not found_number:
if lex.peek().isnumeric():
return lex_number
else:
lex.pos = original_start
lex.emit(ItemType.Symbol)
else:
lex.emit(ItemType.IssueNumber)
return lex_filename

View File

@ -20,6 +20,8 @@ This should probably be re-written, but, well, it mostly works!
# http://code.google.com/p/pycomicmetathis/
from __future__ import annotations
import functools
import itertools
import logging
import os
import re
@ -389,6 +391,7 @@ class Parser:
self.in_s_brace = 0 # In []
self.in_paren = 0 # In ()
self.year_candidates: list[tuple[bool, bool, filenamelexer.Item]] = []
self.series: list[list[filenamelexer.Item]] = []
self.series_parts: list[filenamelexer.Item] = []
self.title_parts: list[filenamelexer.Item] = []
self.used_items: list[filenamelexer.Item] = []
@ -424,11 +427,11 @@ class Parser:
return self.input[self.pos]
# Peek returns but does not consume the next Item in the input.
def peek(self) -> filenamelexer.Item:
if int(self.pos) >= len(self.input) - 1:
def peek(self, length: int = 1) -> filenamelexer.Item:
if int(self.pos) + length >= len(self.input):
return eof
return self.input[self.pos + 1]
return self.input[self.pos + length]
# Peek_back returns but does not step back the previous Item in the input.
def peek_back(self, length: int = 1) -> filenamelexer.Item:
@ -473,7 +476,7 @@ def parse(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ign
if p.issue_number_at is None:
# Series has already been started/parsed,
# filters out leading alternate numbers
if len(p.series_parts) > 0:
if len(p.series) > 0:
return parse_issue_number
else:
p.operator_rejected.append(item)
@ -490,12 +493,9 @@ def parse(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ign
# Is either a full year '2001' or a short year "'89"
if len(item.val.lstrip("0")) == 4 or item.val[0] == "'":
series = " ".join([x.val for x in p.series_parts])
if p.series_parts and series.casefold().endswith("free comic book day"):
series = " ".join([x.val for x in (p.series[-1] if p.series else [])])
if p.series and series.casefold().endswith("free comic book day"):
likely_issue_number = False
if p.in_something == 0:
# Append to series in case it is a part of the title, but only if were not inside parenthesis
p.series_parts.append(item)
# Look for a full date as in 2022-04-22
if p.peek().typ in [
@ -533,24 +533,19 @@ def parse(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ign
likely_issue_number = likely_issue_number and item.val[0] != "'"
p.year_candidates.append((likely_year, likely_issue_number, item))
if p.in_something == 0:
# Append to series in case it is a part of the title, but only if we're not inside parentheses
if not p.series:
p.series.append([])
p.series[-1].append(item)
# We would use i=item but we want to force a split after year candidates
return functools.partial(parse_series, i=None)
# Ensures that IG-88 gets added back to the series/title
else:
if p.in_something == 0:
if p.peek_back().typ in (filenamelexer.ItemType.IssueNumber, filenamelexer.ItemType.Number) or (
p.peek_back().typ == filenamelexer.ItemType.Space
and p.peek_back(2).typ in (filenamelexer.ItemType.IssueNumber, filenamelexer.ItemType.Number)
):
return parse_series
if (
p.peek_back().typ == filenamelexer.ItemType.Operator
or p.peek().typ == filenamelexer.ItemType.Operator
):
# Were not in something and the next or previous type is an operator, add it to the series
p.series_parts.append(item)
p.used_items.append(item)
p.get()
return parse_series
# We're not in something add it to the series
return functools.partial(parse_series, i=item)
# Number with a leading hash e.g. #003
elif item.typ == filenamelexer.ItemType.IssueNumber:
@ -583,10 +578,10 @@ def parse(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ign
if p.firstItem:
p.firstItem = False
if p.in_something == 0:
return parse_series
return functools.partial(parse_series, i=item)
p.publisher_removed.append(item)
if p.in_something == 0:
return parse_series
return functools.partial(parse_series, i=item)
# Attempts to identify the type e.g. annual
elif item.typ == filenamelexer.ItemType.ComicType:
@ -601,7 +596,7 @@ def parse(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ign
number = p.get()
# Mark volume info. Text will be added to the title/series later
if item.val.casefold() in ["tpb"]:
p.title_parts.extend([item, number])
# p.title_parts.extend([item, number])
p.filename_info["volume"] = t2do.convert(number.val)
p.filename_info["issue"] = t2do.convert(number.val)
@ -622,19 +617,19 @@ def parse(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ign
# If we don't have a reason to exclude it from the series go back to parsing the series immediately
if series_append:
p.series_parts.append(item)
p.used_items.append(item)
if p.firstItem:
p.firstItem = False
return parse_series
return functools.partial(parse_series, i=item)
# We found text, it's probably the title or series
elif item.typ in [filenamelexer.ItemType.Text, filenamelexer.ItemType.Honorific]:
# Unset first item
if p.firstItem:
p.firstItem = False
if p.in_something == 0:
return parse_series
if p.in_something == 0 and not p.skip:
p.backup()
return functools.partial(parse_series, i=None)
# Usually the word 'of' eg 1 (of 6)
elif item.typ == filenamelexer.ItemType.InfoSpecifier:
@ -662,15 +657,13 @@ def parse(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ign
p.backup()
if p.firstItem:
p.firstItem = False
return parse_series
return functools.partial(parse_series, i=item)
# This is text that just happens to also be a month/day
else:
# Add this to the series and get the next item, parse_series expects the next item to be the current item
p.series_parts.append(item)
p.get()
if p.firstItem:
p.firstItem = False
return parse_series
return functools.partial(parse_series, i=item)
# Specifically '__' or '--', no further title/series parsing is done to keep compatibility with wiki
elif item.typ == filenamelexer.ItemType.Skip:
@ -745,26 +738,29 @@ def parse_issue_number(p: Parser) -> Callable[[Parser], Callable | None] | None:
return parse
def parse_series(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ignore[type-arg]
item = p.input[p.pos]
current_part = 0
# i=None is a split in the series
def parse_series(p: Parser, i: filenamelexer.Item | None) -> Callable[[Parser], Callable | None] | None:
current = []
prev_space = False
title_parts: list[filenamelexer.Item] = []
series_parts: list[filenamelexer.Item] = []
series: list[list[filenamelexer.Item]] = [[]]
issue_marked_or_passed = (
p.issue_number_marked and p.issue_number_passed or p.issue_number_at is not None and not p.issue_number_marked
)
# We stop parsing the series when certain things come up if nothing was done with them continue where we left off
if p.peek_back().typ in [filenamelexer.ItemType.Number, filenamelexer.ItemType.Calendar]:
series_parts = p.series_parts
p.series_parts = []
if i:
if not issue_marked_or_passed:
if p.series:
current = p.series.pop()
current.append(i)
else:
# If we are splitting we don't want to start with these
while p.peek().typ in [
filenamelexer.ItemType.Space,
filenamelexer.ItemType.Operator,
filenamelexer.ItemType.Symbol,
]:
p.irrelevant.append(p.get())
# Space and Dots are not useful at the beginning of a title/series
if not p.skip and item.typ not in [filenamelexer.ItemType.Space, filenamelexer.ItemType.Dot]:
if item.typ == filenamelexer.ItemType.Text:
p.backup()
else:
series[0].append(item)
# Skip is only true if we have come across '--' or '__'
while not p.skip:
item = p.get()
@ -779,14 +775,14 @@ def parse_series(p: Parser) -> Callable[[Parser], Callable | None] | None: # ty
filenamelexer.ItemType.Publisher,
filenamelexer.ItemType.Honorific,
]:
series[current_part].append(item)
current.append(item)
if p.peek().typ == filenamelexer.ItemType.Dot:
dot = p.get()
if item.typ == filenamelexer.ItemType.Honorific or (
p.peek().typ == filenamelexer.ItemType.Space
and item.typ in (filenamelexer.ItemType.Text, filenamelexer.ItemType.Publisher)
):
series[current_part].append(dot)
current.append(dot)
else:
p.backup()
if item.typ == filenamelexer.ItemType.Publisher:
@ -796,14 +792,14 @@ def parse_series(p: Parser) -> Callable[[Parser], Callable | None] | None: # ty
elif item.typ == filenamelexer.ItemType.InfoSpecifier:
# Exception for 'of'
if item.val.casefold() == "of":
series[current_part].append(item)
current.append(item)
else:
# This specifically lets 'X-Men-V1-067' parse correctly as Series: X-Men Volume: 1 Issue: 67
while len(series[current_part]) > 0 and series[current_part][-1].typ not in [
while len(current) > 0 and current[-1].typ not in [
filenamelexer.ItemType.Text,
filenamelexer.ItemType.Symbol,
]:
p.irrelevant.append(series[current_part].pop())
p.irrelevant.append(current.pop())
p.backup()
break
@ -811,14 +807,13 @@ def parse_series(p: Parser) -> Callable[[Parser], Callable | None] | None: # ty
peek = p.peek()
# ': ' separates the title from the series, only the last section is considered the title
if not prev_space and peek.typ in [filenamelexer.ItemType.Space]:
series.append([]) # Starts a new section
series[current_part].append(item)
current_part += 1
current.append(item)
break
else:
# Force space around '-' makes 'batman - superman' stay otherwise we get 'batman-superman'
if prev_space and peek.typ in [filenamelexer.ItemType.Space]:
item.val = " " + item.val + " "
series[current_part].append(item)
current.append(item)
# Stop processing series/title if a skip item is found
elif item.typ == filenamelexer.ItemType.Skip:
@ -827,14 +822,18 @@ def parse_series(p: Parser) -> Callable[[Parser], Callable | None] | None: # ty
elif item.typ == filenamelexer.ItemType.Number:
# Special case for the word 'book'
if series[current_part] and series[current_part][-1].val.casefold() == "book":
title_parts.append(series[current_part].pop())
title_parts.append(item)
if current and current[-1].val.casefold() == "book":
# Mark the volume
p.filename_info["volume"] = t2do.convert(item.val)
# Add this section to the series EG [['bloodshot', 'book']]
p.series.append(current)
# Pop the last item and break to end this section EG [['bloodshot'], ['book', '3']]
current = [current.pop(), item]
break
count = get_number(p, p.pos + 1)
# this is an issue or volume number
# this is an issue or volume number eg '1 of 2'
if count is not None:
p.backup()
break
@ -843,18 +842,13 @@ def parse_series(p: Parser) -> Callable[[Parser], Callable | None] | None: # ty
p.get()
# We have 2 numbers, add the first to the series and then go back to parse
if p.peek().typ in [filenamelexer.ItemType.Number, filenamelexer.ItemType.IssueNumber]:
series[current_part].append(item)
current.append(item)
break
# the issue number has been marked and passed, keep it as a part of the series
if (
p.issue_number_marked
and p.issue_number_passed
or p.issue_number_at is not None
and not p.issue_number_marked
):
if issue_marked_or_passed:
# We already have an issue number, this should be a part of the series
series[current_part].append(item)
current.append(item)
else:
# We have 1 number break here, it's possible it's the issue
p.backup() # Whitespace
@ -864,14 +858,9 @@ def parse_series(p: Parser) -> Callable[[Parser], Callable | None] | None: # ty
# We have 1 number break here, it's possible it's the issue
else:
# the issue number has been #marked or passed, keep it as a part of the series
if (
p.issue_number_marked
and p.issue_number_passed
or p.issue_number_at is not None
and not p.issue_number_marked
):
if issue_marked_or_passed:
# We already have an issue number, this should be a part of the series
series[current_part].append(item)
current.append(item)
else:
p.backup() # The number
break
@ -880,13 +869,13 @@ def parse_series(p: Parser) -> Callable[[Parser], Callable | None] | None: # ty
# Ensure 'ms. marvel' parses 'ms.' correctly
if item.typ == filenamelexer.ItemType.Dot:
if p.peek_back().typ == filenamelexer.ItemType.Honorific:
series[current_part].append(item)
current.append(item)
elif (
p.peek().typ == filenamelexer.ItemType.Number
or p.peek_back().typ == filenamelexer.ItemType.Text
and len(p.peek_back().val) == 1
):
series[current_part].append(item)
current.append(item)
item.no_space = True
# Allows avengers.hulk to parse correctly
elif p.peek().typ in (filenamelexer.ItemType.Text,):
@ -898,24 +887,7 @@ def parse_series(p: Parser) -> Callable[[Parser], Callable | None] | None: # ty
prev_space = False
# We have a title separator e.g. ': "
if len(series) > 1:
title_parts.extend(series.pop())
for s in series:
if s and s[-1].typ == filenamelexer.ItemType.Operator:
s[-1].val += " " # Ensures that when there are multiple separators that they display properly
series_parts.extend(s)
p.used_items.append(series_parts.pop())
else:
series_parts.extend(series[0])
# If the series has already been set assume all of this is the title.
if len(p.series_parts) > 0:
p.title_parts.extend(series_parts)
p.title_parts.extend(title_parts)
else:
p.series_parts.extend(series_parts)
p.title_parts.extend(title_parts)
p.series.append(current)
return parse
@ -1017,7 +989,38 @@ def resolve_issue(p: Parser) -> None:
p.filename_info["format"] = protofolius_issue_number_scheme[p.filename_info["issue"][0].upper()]
def split_series(items: list[list[filenamelexer.Item]]) -> tuple[list[filenamelexer.Item], list[filenamelexer.Item]]:
    """Divide the collected sections into flat series and title item lists.

    ``items`` is a list of sections, each a list of lexer items.

    * With zero or one section, everything goes to the series and the title
      is empty.
    * With several sections, non-empty sections go to the series until either
      the last section is reached or a section that does not end in an
      ``Operator`` has been stored; from then on sections go to the title.
      Empty sections are ignored.

    Side effect: the final item of every stored section that is an
    ``Operator`` gets a space appended to its ``val``.

    Returns ``(series, title)``; a trailing ``Operator`` is removed from the
    flattened series.
    """
    series_sections: list[list[filenamelexer.Item]] = []
    title_sections: list[list[filenamelexer.Item]] = []

    if len(items) > 1:
        # More than one section: we probably have a title
        target = series_sections
        final_index = len(items) - 1
        for index, section in enumerate(items):
            if index == final_index:
                # The last section always belongs to the title
                target = title_sections
            if not section:
                continue
            target.append(section)
            if section[-1].typ == filenamelexer.ItemType.Operator:
                # Ensures that when there are multiple separators that they display properly
                section[-1].val += " "
            else:
                # No operator separates this part from the next, it's probably an issue number
                target = title_sections
    elif items:
        series_sections.extend(items)

    series: list[filenamelexer.Item] = [item for section in series_sections for item in section]
    title: list[filenamelexer.Item] = [item for section in title_sections for item in section]

    # A separator left dangling at the end of the series is not part of its name
    if series and series[-1].typ == filenamelexer.ItemType.Operator:
        series.pop()
    return series, title
def parse_finish(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ignore[type-arg]
for part in p.series:
p.used_items.extend(part)
p.series_parts, p.title_parts = split_series(p.series)
resolve_year(p)
resolve_issue(p)
@ -1032,7 +1035,6 @@ def parse_finish(p: Parser) -> Callable[[Parser], Callable | None] | None: # ty
if p.series_parts:
p.filename_info["series"] = join_title(p.series_parts)
p.used_items.extend(p.series_parts)
else:
p.filename_info["series"] = p.filename_info.get("issue", "")
@ -1040,7 +1042,6 @@ def parse_finish(p: Parser) -> Callable[[Parser], Callable | None] | None: # ty
p.filename_info["fcbd"] = True
p.filename_info["title"] = join_title(p.title_parts)
p.used_items.extend(p.title_parts)
p.irrelevant.extend([x for x in p.input if x.typ in p.remove_from_remainder])
@ -1137,11 +1138,15 @@ def parse_info_specifier(p: Parser) -> Callable[[Parser], Callable | None] | Non
p.used_items.append(item)
p.used_items.append(number)
# This is not for the issue number it is not in either the issue or the title,
# assume it is the volume number and count
elif p.issue_number_at != i.pos and i not in p.series_parts and i not in p.title_parts:
# This is not for the issue number
# assume it is the volume number and count, remove from series
elif p.issue_number_at != i.pos:
p.filename_info["volume"] = i.val
p.filename_info["volume_count"] = str(int(t2do.convert(number.val)))
for part in p.series:
if i in part:
part.remove(i)
break
p.used_items.append(i)
p.used_items.append(item)
p.used_items.append(number)
@ -1149,11 +1154,12 @@ def parse_info_specifier(p: Parser) -> Callable[[Parser], Callable | None] | Non
# TODO: Figure out what to do here if it ever happens
pass
else:
# Lets 'The Wrath of Foobar-Man, Part 1 of 2' parse correctly as the title
# Resets back to '1' in 'The Wrath of Foobar-Man, Part 1 of 2'
# we then go to parse_series it adds i (the '1') and then continues parsing at of
p.pos = [ind for ind, x in enumerate(p.input) if x == i][0]
if not p.in_something:
return parse_series
return functools.partial(parse_series, i=i)
return parse
@ -1233,13 +1239,7 @@ def join_title(lst: list[filenamelexer.Item]) -> str:
# No space if the next item is an operator or symbol
if lst[i + 1].typ in [filenamelexer.ItemType.Operator, filenamelexer.ItemType.Symbol]:
# except when the next item is an ampersand
if not (
(
lst[i].typ in [filenamelexer.ItemType.Number, filenamelexer.ItemType.IssueNumber]
and lst[i + 1].val == "$"
)
or lst[i + 1].val == "&"
):
if lst[i + 1].val != "&":
continue
# Add a space

View File

@ -955,6 +955,21 @@ names: list[tuple[str, str, dict[str, str | bool], tuple[bool, bool]]] = [
},
(True, True),
),
(
"Cory Doctorow's Futuristic Tales of the Here and Now $1$2 3 #0.0.1 (2007).cbz",
"$",
{
"archive": "cbz",
"issue": "0.1",
"series": "Cory Doctorow's Futuristic Tales of the Here and Now $1 $2 3",
"title": "",
"volume": "",
"year": "2007",
"remainder": "",
"issue_count": "",
},
(True, True),
),
]
oldfnames = []