Improve edge cases
Lex `'` as a symbol. Lex multiple symbols as a single item. Prefer `$` at the start of a number. Simplify issue number parsing.
This commit is contained in:
parent
78060dff61
commit
bd9b3522d8
@ -130,17 +130,25 @@ class Lexer:
|
||||
self.start = self.pos
|
||||
|
||||
# Accept consumes the next rune if it's from the valid set.
|
||||
def accept(self, valid: str) -> bool:
|
||||
if self.get() in valid:
|
||||
return True
|
||||
def accept(self, valid: str | Callable[[str], bool]) -> bool:
|
||||
if isinstance(valid, str):
|
||||
if self.get() in valid:
|
||||
return True
|
||||
else:
|
||||
if valid(self.get()):
|
||||
return True
|
||||
|
||||
self.backup()
|
||||
return False
|
||||
|
||||
# AcceptRun consumes a run of runes from the valid set.
|
||||
def accept_run(self, valid: str) -> None:
|
||||
while self.get() in valid:
|
||||
continue
|
||||
def accept_run(self, valid: str | Callable[[str], bool]) -> None:
|
||||
if isinstance(valid, str):
|
||||
while self.get() in valid:
|
||||
continue
|
||||
else:
|
||||
while valid(self.get()):
|
||||
continue
|
||||
|
||||
self.backup()
|
||||
|
||||
@ -150,9 +158,7 @@ class Lexer:
|
||||
self.accept_run(digits)
|
||||
if self.input[self.pos] == ".":
|
||||
self.backup()
|
||||
while self.get().isalpha():
|
||||
...
|
||||
self.backup()
|
||||
self.accept_run(str.isalpha)
|
||||
|
||||
return True
|
||||
|
||||
@ -197,7 +203,8 @@ def lex_filename(lex: Lexer) -> Callable[[Lexer], Callable | None] | None: # ty
|
||||
r = lex.peek()
|
||||
if r.isdigit():
|
||||
return lex_number
|
||||
lex.emit(ItemType.Text) # TODO: Change to Text
|
||||
lex.accept_run(is_symbol)
|
||||
lex.emit(ItemType.Symbol)
|
||||
elif r.isnumeric():
|
||||
lex.backup()
|
||||
return lex_number
|
||||
@ -245,17 +252,17 @@ def lex_filename(lex: Lexer) -> Callable[[Lexer], Callable | None] | None: # ty
|
||||
elif is_symbol(r):
|
||||
if unicodedata.category(r) == "Sc":
|
||||
return lex_currency
|
||||
lex.accept_run(is_symbol)
|
||||
lex.emit(ItemType.Symbol)
|
||||
else:
|
||||
return errorf(lex, "unrecognized character in action: " + r)
|
||||
return errorf(lex, "unrecognized character in action: " + repr(r))
|
||||
|
||||
return lex_filename
|
||||
|
||||
|
||||
def lex_currency(lex: Lexer) -> Callable:
|
||||
orig = lex.pos
|
||||
while is_space(lex.peek()):
|
||||
lex.get()
|
||||
lex.accept_run(is_space)
|
||||
if lex.peek().isnumeric():
|
||||
return lex_number
|
||||
else:
|
||||
@ -274,8 +281,7 @@ def lex_operator(lex: Lexer) -> Callable: # type: ignore[type-arg]
|
||||
# LexSpace scans a run of space characters.
|
||||
# One space has already been seen.
|
||||
def lex_space(lex: Lexer) -> Callable: # type: ignore[type-arg]
|
||||
while is_space(lex.peek()):
|
||||
lex.get()
|
||||
lex.accept_run(is_space)
|
||||
|
||||
lex.emit(ItemType.Space)
|
||||
return lex_filename
|
||||
@ -332,17 +338,37 @@ def lex_number(lex: Lexer) -> Callable[[Lexer], Callable | None] | None: # type
|
||||
# Assume that 80th is just text and not a number
|
||||
lex.emit(ItemType.Text)
|
||||
else:
|
||||
orig = lex.pos
|
||||
while is_space(lex.peek()):
|
||||
lex.get()
|
||||
if "Sc" == unicodedata.category(lex.get()):
|
||||
# Used to check for a '$'
|
||||
endNumber = lex.pos
|
||||
|
||||
# Consume any spaces
|
||||
lex.accept_run(is_space)
|
||||
|
||||
# This number starts with a '$', so emit it as Text instead of a Number
|
||||
if "Sc" == unicodedata.category(lex.input[lex.start]):
|
||||
lex.pos = endNumber
|
||||
lex.emit(ItemType.Text)
|
||||
else:
|
||||
lex.pos = orig
|
||||
if "Sc" == unicodedata.category(lex.input[lex.start]):
|
||||
lex.emit(ItemType.Text)
|
||||
else:
|
||||
|
||||
# This number ends in a '$'; if there is a number on the other side, we assume the '$' belongs to the following number
|
||||
elif "Sc" == unicodedata.category(lex.get()):
|
||||
# Store the end of the number '$'. We still need to check to see if there is a number coming up
|
||||
endCurrency = lex.pos
|
||||
# Consume any spaces
|
||||
lex.accept_run(is_space)
|
||||
|
||||
# This is a number
|
||||
if lex.peek().isnumeric():
|
||||
# We go back to the original number before the '$' and emit a number
|
||||
lex.pos = endNumber
|
||||
lex.emit(ItemType.Number)
|
||||
else:
|
||||
# There was no following number, reset to the '$' and emit it as text
|
||||
lex.pos = endCurrency
|
||||
lex.emit(ItemType.Text)
|
||||
else:
|
||||
# We go back to the original number there is no '$'
|
||||
lex.pos = endNumber
|
||||
lex.emit(ItemType.Number)
|
||||
|
||||
return lex_filename
|
||||
|
||||
@ -350,21 +376,13 @@ def lex_number(lex: Lexer) -> Callable[[Lexer], Callable | None] | None: # type
|
||||
def lex_issue_number(lex: Lexer) -> Callable[[Lexer], Callable | None] | None: # type: ignore[type-arg]
|
||||
# Only called when lex.input[lex.start] == "#"
|
||||
original_start = lex.pos
|
||||
found_number = False
|
||||
while True:
|
||||
r = lex.get()
|
||||
if is_alpha_numeric(r):
|
||||
if r.isnumeric():
|
||||
found_number = True
|
||||
else:
|
||||
lex.backup()
|
||||
break
|
||||
lex.accept_run(str.isalpha)
|
||||
|
||||
if not found_number:
|
||||
if lex.peek().isnumeric():
|
||||
return lex_number
|
||||
else:
|
||||
lex.pos = original_start
|
||||
lex.emit(ItemType.Symbol)
|
||||
else:
|
||||
lex.emit(ItemType.IssueNumber)
|
||||
|
||||
return lex_filename
|
||||
|
||||
|
@ -544,19 +544,8 @@ def parse(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ign
|
||||
# Ensures that IG-88 gets added back to the series/title
|
||||
else:
|
||||
if p.in_something == 0:
|
||||
to_series = (
|
||||
filenamelexer.ItemType.IssueNumber,
|
||||
filenamelexer.ItemType.Number,
|
||||
filenamelexer.ItemType.Operator,
|
||||
)
|
||||
if (
|
||||
p.peek().typ in to_series
|
||||
or (p.peek().typ == filenamelexer.ItemType.Space and p.peek(2).typ in to_series)
|
||||
or p.peek_back().typ in to_series
|
||||
or (p.peek_back().typ == filenamelexer.ItemType.Space and p.peek_back(2).typ in to_series)
|
||||
):
|
||||
# We're not in something and the next or previous type is an operator or number, add it to the series
|
||||
return functools.partial(parse_series, i=item)
|
||||
# We're not in something add it to the series
|
||||
return functools.partial(parse_series, i=item)
|
||||
|
||||
# Number with a leading hash e.g. #003
|
||||
elif item.typ == filenamelexer.ItemType.IssueNumber:
|
||||
@ -1031,8 +1020,6 @@ def parse_finish(p: Parser) -> Callable[[Parser], Callable | None] | None: # ty
|
||||
for part in p.series:
|
||||
p.used_items.extend(part)
|
||||
p.series_parts, p.title_parts = split_series(p.series)
|
||||
p.filename_info["series"] = join_title(p.series_parts)
|
||||
p.filename_info["title"] = join_title(p.title_parts)
|
||||
|
||||
resolve_year(p)
|
||||
resolve_issue(p)
|
||||
@ -1048,7 +1035,6 @@ def parse_finish(p: Parser) -> Callable[[Parser], Callable | None] | None: # ty
|
||||
|
||||
if p.series_parts:
|
||||
p.filename_info["series"] = join_title(p.series_parts)
|
||||
p.used_items.extend(p.series_parts)
|
||||
else:
|
||||
p.filename_info["series"] = p.filename_info.get("issue", "")
|
||||
|
||||
@ -1056,7 +1042,6 @@ def parse_finish(p: Parser) -> Callable[[Parser], Callable | None] | None: # ty
|
||||
p.filename_info["fcbd"] = True
|
||||
|
||||
p.filename_info["title"] = join_title(p.title_parts)
|
||||
p.used_items.extend(p.title_parts)
|
||||
|
||||
p.irrelevant.extend([x for x in p.input if x.typ in p.remove_from_remainder])
|
||||
|
||||
@ -1153,11 +1138,15 @@ def parse_info_specifier(p: Parser) -> Callable[[Parser], Callable | None] | Non
|
||||
p.used_items.append(item)
|
||||
p.used_items.append(number)
|
||||
|
||||
# This is not for the issue number it is not in either the issue or the title,
|
||||
# assume it is the volume number and count
|
||||
elif p.issue_number_at != i.pos and i not in p.series_parts and i not in p.title_parts:
|
||||
# This is not for the issue number
|
||||
# assume it is the volume number and count, remove from series
|
||||
elif p.issue_number_at != i.pos:
|
||||
p.filename_info["volume"] = i.val
|
||||
p.filename_info["volume_count"] = str(int(t2do.convert(number.val)))
|
||||
for part in p.series:
|
||||
if i in part:
|
||||
part.remove(i)
|
||||
break
|
||||
p.used_items.append(i)
|
||||
p.used_items.append(item)
|
||||
p.used_items.append(number)
|
||||
|
@ -955,6 +955,21 @@ names: list[tuple[str, str, dict[str, str | bool], tuple[bool, bool]]] = [
|
||||
},
|
||||
(True, True),
|
||||
),
|
||||
(
|
||||
"Cory Doctorow's Futuristic Tales of the Here and Now $1$2 3 #0.0.1 (2007).cbz",
|
||||
"$",
|
||||
{
|
||||
"archive": "cbz",
|
||||
"issue": "0.1",
|
||||
"series": "Cory Doctorow's Futuristic Tales of the Here and Now $1 $2 3",
|
||||
"title": "",
|
||||
"volume": "",
|
||||
"year": "2007",
|
||||
"remainder": "",
|
||||
"issue_count": "",
|
||||
},
|
||||
(True, True),
|
||||
),
|
||||
]
|
||||
|
||||
oldfnames = []
|
||||
|
Loading…
Reference in New Issue
Block a user