Merge branch 'protofolius_issue_scheme' into develop

This commit is contained in:
Timmy Welch 2023-10-11 17:05:27 -07:00
commit abfd97d915
12 changed files with 630 additions and 153 deletions

View File

@ -23,7 +23,7 @@ import sys
import traceback
from typing import cast
from comicapi import filenamelexer, filenameparser, utils
from comicapi import utils
from comicapi.archivers import Archiver, UnknownArchiver, ZipArchiver
from comicapi.comet import CoMet
from comicapi.comicbookinfo import ComicBookInfo
@ -558,53 +558,39 @@ class ComicArchive:
remove_fcbd: bool = False,
remove_publisher: bool = False,
split_words: bool = False,
allow_issue_start_with_letter: bool = False,
protofolius_issue_number_scheme: bool = False,
) -> GenericMetadata:
metadata = GenericMetadata()
filename = self.path.name
if split_words:
import wordninja
filename_info = utils.parse_filename(
self.path.name,
complicated_parser=complicated_parser,
remove_c2c=remove_c2c,
remove_fcbd=remove_fcbd,
remove_publisher=remove_publisher,
split_words=split_words,
allow_issue_start_with_letter=allow_issue_start_with_letter,
protofolius_issue_number_scheme=protofolius_issue_number_scheme,
)
metadata.alternate_number = utils.xlate(filename_info.get("alternate", None))
metadata.issue = utils.xlate(filename_info.get("issue", None))
metadata.issue_count = utils.xlate_int(filename_info.get("issue_count", None))
metadata.publisher = utils.xlate(filename_info.get("publisher", None))
metadata.series = utils.xlate(filename_info.get("series", None))
metadata.title = utils.xlate(filename_info.get("title", None))
metadata.volume = utils.xlate_int(filename_info.get("volume", None))
metadata.volume_count = utils.xlate_int(filename_info.get("volume_count", None))
metadata.year = utils.xlate_int(filename_info.get("year", None))
filename = " ".join(wordninja.split(self.path.stem)) + self.path.suffix
if complicated_parser:
lex = filenamelexer.Lex(filename)
p = filenameparser.Parse(
lex.items, remove_c2c=remove_c2c, remove_fcbd=remove_fcbd, remove_publisher=remove_publisher
)
metadata.alternate_number = utils.xlate(p.filename_info["alternate"])
metadata.issue = utils.xlate(p.filename_info["issue"])
metadata.issue_count = utils.xlate_int(p.filename_info["issue_count"])
metadata.publisher = utils.xlate(p.filename_info["publisher"])
metadata.series = utils.xlate(p.filename_info["series"])
metadata.title = utils.xlate(p.filename_info["title"])
metadata.volume = utils.xlate_int(p.filename_info["volume"])
metadata.volume_count = utils.xlate_int(p.filename_info["volume_count"])
metadata.year = utils.xlate_int(p.filename_info["year"])
metadata.scan_info = utils.xlate(p.filename_info["remainder"])
metadata.format = "FCBD" if p.filename_info["fcbd"] else None
if p.filename_info["annual"]:
metadata.format = "Annual"
else:
fnp = filenameparser.FileNameParser()
fnp.parse_filename(filename)
if fnp.issue:
metadata.issue = fnp.issue
if fnp.series:
metadata.series = fnp.series
if fnp.volume:
metadata.volume = utils.xlate_int(fnp.volume)
if fnp.year:
metadata.year = utils.xlate_int(fnp.year)
if fnp.issue_count:
metadata.issue_count = utils.xlate_int(fnp.issue_count)
if fnp.remainder:
metadata.scan_info = fnp.remainder
metadata.scan_info = utils.xlate(filename_info.get("remainder", None))
metadata.format = "FCBD" if filename_info.get("fcbd", None) else None
if filename_info.get("annual", None):
metadata.format = "Annual"
if filename_info.get("format", None):
metadata.format = filename_info["format"]
metadata.is_empty = False
return metadata
def export_as_zip(self, zip_filename: pathlib.Path) -> bool:

View File

@ -81,13 +81,14 @@ class Item:
self.typ: ItemType = typ
self.pos: int = pos
self.val: str = val
self.no_space = False
def __repr__(self) -> str:
return f"{self.val}: index: {self.pos}: {self.typ}"
class Lexer:
def __init__(self, string: str) -> None:
def __init__(self, string: str, allow_issue_start_with_letter: bool = False) -> None:
self.input: str = string # The string being scanned
# The next lexing function to enter
self.state: Callable[[Lexer], Callable | None] | None = None # type: ignore[type-arg]
@ -98,6 +99,7 @@ class Lexer:
self.brace_depth: int = 0 # Nesting depth of { }
self.sbrace_depth: int = 0 # Nesting depth of [ ]
self.items: list[Item] = []
self.allow_issue_start_with_letter = allow_issue_start_with_letter
# Next returns the next rune in the input.
def get(self) -> str:
@ -143,23 +145,14 @@ class Lexer:
self.backup()
def scan_number(self) -> bool:
digits = "0123456789"
digits = "0123456789.,"
self.accept_run(digits)
if self.accept("."):
if self.accept(digits):
self.accept_run(digits)
else:
self.backup()
if self.accept("s"):
if not self.accept("t"):
self.backup()
elif self.accept("nr"):
if not self.accept("d"):
self.backup()
elif self.accept("t"):
if not self.accept("h"):
self.backup()
if self.input[self.pos] == ".":
self.backup()
while self.get().isalpha():
...
self.backup()
return True
@ -196,23 +189,21 @@ def lex_filename(lex: Lexer) -> Callable[[Lexer], Callable | None] | None: # ty
return lex_space
elif r == ".":
r = lex.peek()
if r < "0" or "9" < r:
lex.emit(ItemType.Dot)
return lex_filename
lex.backup()
return lex_number
lex.emit(ItemType.Dot)
return lex_filename
elif r == "'":
r = lex.peek()
if r in "0123456789":
if r.isdigit():
return lex_number
lex.emit(ItemType.Text) # TODO: Change to Text
elif "0" <= r <= "9":
lex.backup()
return lex_number
elif r == "#":
if "0" <= lex.peek() <= "9":
return lex_number
if lex.allow_issue_start_with_letter and is_alpha_numeric(lex.peek()):
return lex_issue_number
elif lex.peek().isdigit() or lex.peek() in "-+.":
return lex_issue_number
lex.emit(ItemType.Symbol)
elif is_operator(r):
if r == "-" and lex.peek() == "-":
@ -329,6 +320,28 @@ def lex_number(lex: Lexer) -> Callable[[Lexer], Callable | None] | None: # type
return lex_filename
def lex_issue_number(lex: Lexer) -> Callable[[Lexer], Callable | None] | None:  # type: ignore[type-arg]
    """Lex the token that follows a ``#`` as an issue number.

    Consumes the run of alphanumeric characters at the current position. If
    the run contains at least one numeric character the pending text is
    emitted as an ``IssueNumber``; otherwise the position is rewound to where
    this function started and the pending text is emitted as a ``Symbol``.

    Returns:
        ``lex_filename``, the state that continues lexing the filename.
    """
    # Only called when lex.input[lex.start] == "#"
    rewind_to = lex.pos
    saw_numeric = False

    r = lex.get()
    while is_alpha_numeric(r):
        saw_numeric = saw_numeric or r.isnumeric()
        r = lex.get()
    # The character that ended the run belongs to whatever comes next
    lex.backup()

    if saw_numeric:
        lex.emit(ItemType.IssueNumber)
    else:
        lex.pos = rewind_to
        lex.emit(ItemType.Symbol)

    return lex_filename
def is_space(character: str) -> bool:
    """Report whether ``character`` is an underscore, a space or a tab."""
    separators = "_ \t"
    return character in separators
@ -346,7 +359,7 @@ def is_symbol(character: str) -> bool:
return unicodedata.category(character)[0] in "PS"
def Lex(filename: str) -> Lexer:
lex = Lexer(string=os.path.basename(filename))
def Lex(filename: str, allow_issue_start_with_letter: bool = False) -> Lexer:
    """Create a ``Lexer`` for the base name of ``filename``, run it and return it.

    Args:
        filename: Path or name of the file; only its final path component is lexed.
        allow_issue_start_with_letter: Passed through to the ``Lexer``.
    """
    base_name = os.path.basename(filename)
    lexer = Lexer(base_name, allow_issue_start_with_letter)
    lexer.run()
    return lexer

View File

@ -324,6 +324,21 @@ class FilenameInfo(TypedDict, total=False):
volume: str
volume_count: str
year: str
format: str
# Maps the (upper-case) letter that prefixes an issue number in protofolius's
# scheme to the description that is stored as the comic's format.
protofolius_issue_number_scheme = {
    "B": "biography/best of",
    "C": "compact edition",
    "E": "entertainment/puzzle edition",
    "F": "family book edition",
    "J": "jubileum (anniversary) edition",
    "P": "pocket edition",
    "N": "newly brought out/restyled edition",
    "O": "old editions (or oblong format)",
    "S": "special edition",
    "X": "X-rated edition",
}
eof = filenamelexer.Item(filenamelexer.ItemType.EOF, -1, "")
@ -341,6 +356,7 @@ class Parser:
remove_c2c: bool = False,
remove_fcbd: bool = False,
remove_publisher: bool = False,
protofolius_issue_number_scheme: bool = False,
) -> None:
self.state: Callable[[Parser], Callable | None] | None = None # type: ignore[type-arg]
self.pos = -1
@ -350,6 +366,8 @@ class Parser:
self.alt = False
self.filename_info: FilenameInfo = {"series": ""}
self.issue_number_at = None
self.issue_number_marked = False
self.issue_number_passed = False
self.in_something = 0 # In some sort of brackets {}[]()
self.in_brace = 0 # In {}
self.in_s_brace = 0 # In []
@ -366,6 +384,7 @@ class Parser:
self.remove_c2c = remove_c2c
self.remove_fcbd = remove_fcbd
self.remove_publisher = remove_publisher
self.protofolius_issue_number_scheme = protofolius_issue_number_scheme
self.remove_from_remainder = []
if remove_c2c:
@ -377,6 +396,7 @@ class Parser:
for i, item in enumerate(self.input):
if item.typ == filenamelexer.ItemType.IssueNumber:
self.issue_number_at = i
self.issue_number_marked = True
# Get returns the next Item in the input.
def get(self) -> filenamelexer.Item:
@ -395,11 +415,11 @@ class Parser:
return self.input[self.pos + 1]
# Peek_back returns but does not step back the previous Item in the input.
def peek_back(self) -> filenamelexer.Item:
if int(self.pos) == 0:
def peek_back(self, length: int = 1) -> filenamelexer.Item:
if int(self.pos) - length < 0:
return eof
return self.input[self.pos - 1]
return self.input[self.pos - length]
# Backup steps back one Item.
def backup(self) -> None:
@ -413,7 +433,6 @@ class Parser:
def parse(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ignore[type-arg]
item: filenamelexer.Item = p.get()
# We're done, time to do final processing
if item.typ == filenamelexer.ItemType.EOF:
return parse_finish
@ -429,7 +448,7 @@ def parse(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ign
# Issue number is not 4 digits e.g. a year
# If this is still used in 7978 years, something is terribly wrong
if len(item.val.lstrip("0")) != 4:
if len(item.val.lstrip("0")) < 4:
# Assume that operators indicate a non-issue number e.g. IG-88 or 88-IG
if filenamelexer.ItemType.Operator not in (p.peek().typ, p.peek_back().typ):
# It is common to use '89 to refer to an annual reprint from 1989
@ -443,7 +462,6 @@ def parse(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ign
else:
p.operator_rejected.append(item)
# operator rejected used later to add back to the series/title
# It is more likely to be a year if it is inside parentheses.
if p.in_something > 0:
likely_year = len(item.val.lstrip("0")) == 4
@ -500,23 +518,30 @@ def parse(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ign
likely_issue_number = likely_issue_number and item.val[0] != "'"
p.year_candidates.append((likely_year, likely_issue_number, item))
# Ensures that IG-88 gets added back to the series/title
elif (
p.in_something == 0
and p.peek_back().typ == filenamelexer.ItemType.Operator
or p.peek().typ == filenamelexer.ItemType.Operator
):
# We're not in something and the next or previous type is an operator, add it to the series
p.series_parts.append(item)
p.used_items.append(item)
else:
if p.in_something == 0:
if p.peek_back().typ in (filenamelexer.ItemType.IssueNumber, filenamelexer.ItemType.Number) or (
p.peek_back().typ == filenamelexer.ItemType.Space
and p.peek_back(2).typ in (filenamelexer.ItemType.IssueNumber, filenamelexer.ItemType.Number)
):
return parse_series
if (
p.peek_back().typ == filenamelexer.ItemType.Operator
or p.peek().typ == filenamelexer.ItemType.Operator
):
# We're not in something and the next or previous type is an operator, add it to the series
p.series_parts.append(item)
p.used_items.append(item)
p.get()
return parse_series
p.get()
return parse_series
# Number with a leading hash e.g. #003
elif item.typ == filenamelexer.ItemType.IssueNumber:
# Unset first item
if p.firstItem:
p.firstItem = False
p.issue_number_passed = True
return parse_issue_number
# Matches FCBD. Not added to p.used_items so it will show in "remainder"
@ -706,23 +731,24 @@ def parse_issue_number(p: Parser) -> Callable[[Parser], Callable | None] | None:
def parse_series(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ignore[type-arg]
item = p.input[p.pos]
series: list[list[filenamelexer.Item]] = [[]]
# Space and Dots are not useful at the beginning of a title/series
if not p.skip and item.typ not in [filenamelexer.ItemType.Space, filenamelexer.ItemType.Dot]:
series[0].append(item)
current_part = 0
prev_space = False
title_parts: list[filenamelexer.Item] = []
series_parts: list[filenamelexer.Item] = []
prev_space = False
series: list[list[filenamelexer.Item]] = [[]]
# We stop parsing the series when certain things come up if nothing was done with them continue where we left off
if p.peek_back().typ in [filenamelexer.ItemType.Number, filenamelexer.ItemType.Calendar]:
series_parts = p.series_parts
p.series_parts = []
# Space and Dots are not useful at the beginning of a title/series
if not p.skip and item.typ not in [filenamelexer.ItemType.Space, filenamelexer.ItemType.Dot]:
if item.typ == filenamelexer.ItemType.Text:
p.backup()
else:
series[0].append(item)
# Skip is only true if we have come across '--' or '__'
while not p.skip:
item = p.get()
@ -738,9 +764,16 @@ def parse_series(p: Parser) -> Callable[[Parser], Callable | None] | None: # ty
filenamelexer.ItemType.Honorific,
]:
series[current_part].append(item)
if item.typ == filenamelexer.ItemType.Honorific and p.peek().typ == filenamelexer.ItemType.Dot:
series[current_part].append(p.get())
elif item.typ == filenamelexer.ItemType.Publisher:
if p.peek().typ == filenamelexer.ItemType.Dot:
dot = p.get()
if item.typ == filenamelexer.ItemType.Honorific or (
p.peek().typ == filenamelexer.ItemType.Space
and item.typ in (filenamelexer.ItemType.Text, filenamelexer.ItemType.Publisher)
):
series[current_part].append(dot)
else:
p.backup()
if item.typ == filenamelexer.ItemType.Publisher:
p.filename_info["publisher"] = item.val
# Handle Volume
@ -784,9 +817,12 @@ def parse_series(p: Parser) -> Callable[[Parser], Callable | None] | None: # ty
p.filename_info["volume"] = t2do.convert(item.val)
break
# This is 6 in '1 of 6'
if series[current_part] and series[current_part][-1].val.casefold() == "of":
series[current_part].append(item)
count = get_number(p, p.pos + 1)
# this is an issue or volume number
if count is not None:
p.backup()
break
if p.peek().typ == filenamelexer.ItemType.Space:
p.get()
# We have 2 numbers, add the first to the series and then go back to parse
@ -794,24 +830,52 @@ def parse_series(p: Parser) -> Callable[[Parser], Callable | None] | None: # ty
series[current_part].append(item)
break
# We have 1 number break here, it's possible it's the issue
p.backup() # Whitespace
p.backup() # The number
break
# the issue number has been marked and passed, keep it as a part of the series
if (
p.issue_number_marked
and p.issue_number_passed
or p.issue_number_at is not None
and not p.issue_number_marked
):
# We already have an issue number, this should be a part of the series
series[current_part].append(item)
else:
# We have 1 number break here, it's possible it's the issue
p.backup() # Whitespace
p.backup() # The number
break
# We have 1 number break here, it's possible it's the issue
else:
p.backup() # The number
break
# the issue number has been #marked or passed, keep it as a part of the series
if (
p.issue_number_marked
and p.issue_number_passed
or p.issue_number_at is not None
and not p.issue_number_marked
):
# We already have an issue number, this should be a part of the series
series[current_part].append(item)
else:
p.backup() # The number
break
else:
# Ensure 'ms. marvel' parses 'ms.' correctly
if item.typ == filenamelexer.ItemType.Dot and p.peek_back().typ == filenamelexer.ItemType.Honorific:
series[current_part].append(item)
# Allows avengers.hulk to parse correctly
elif item.typ == filenamelexer.ItemType.Dot and p.peek().typ == filenamelexer.ItemType.Text:
# Marks the dot as used so that the remainder is clean
p.used_items.append(item)
if item.typ == filenamelexer.ItemType.Dot:
if p.peek_back().typ == filenamelexer.ItemType.Honorific:
series[current_part].append(item)
elif (
p.peek().typ == filenamelexer.ItemType.Number
or p.peek_back().typ == filenamelexer.ItemType.Text
and len(p.peek_back().val) == 1
):
series[current_part].append(item)
item.no_space = True
# Allows avengers.hulk to parse correctly
elif p.peek().typ in (filenamelexer.ItemType.Text,):
# Marks the dot as used so that the remainder is clean
p.used_items.append(item)
else:
p.backup()
break
@ -926,6 +990,16 @@ def resolve_issue(p: Parser) -> None:
if "volume" in p.filename_info:
p.filename_info["issue"] = p.filename_info["volume"]
if (
"issue" in p.filename_info
and p.protofolius_issue_number_scheme
and len(p.filename_info["issue"]) > 1
and p.filename_info["issue"][0].isalpha()
and p.filename_info["issue"][0].upper() in protofolius_issue_number_scheme
and p.filename_info["issue"][1].isnumeric()
):
p.filename_info["format"] = protofolius_issue_number_scheme[p.filename_info["issue"][0].upper()]
def parse_finish(p: Parser) -> Callable[[Parser], Callable | None] | None: # type: ignore[type-arg]
resolve_year(p)
@ -944,7 +1018,7 @@ def parse_finish(p: Parser) -> Callable[[Parser], Callable | None] | None: # ty
p.filename_info["series"] = join_title(p.series_parts)
p.used_items.extend(p.series_parts)
else:
p.filename_info["series"] = p.filename_info["issue"]
p.filename_info["series"] = p.filename_info.get("issue", "")
if "free comic book" in p.filename_info["series"].casefold():
p.filename_info["fcbd"] = True
@ -1051,7 +1125,7 @@ def parse_info_specifier(p: Parser) -> Callable[[Parser], Callable | None] | Non
# 'of' is only special if it is inside a parenthesis.
elif item.val.casefold() == "of":
i = get_number(p, index)
i = get_number_rev(p, index)
if i is not None:
if p.in_something > 0:
if p.issue_number_at is None:
@ -1087,7 +1161,7 @@ def parse_info_specifier(p: Parser) -> Callable[[Parser], Callable | None] | Non
# Gets 03 in '03 of 6'
def get_number(p: Parser, index: int) -> filenamelexer.Item | None:
def get_number_rev(p: Parser, index: int) -> filenamelexer.Item | None:
# Go backward through the filename to see if we can find what this is of eg '03 (of 6)' or '008 title 03 (of 6)'
rev = p.input[:index]
rev.reverse()
@ -1105,6 +1179,36 @@ def get_number(p: Parser, index: int) -> filenamelexer.Item | None:
# We got our number, time to leave
return i
# This is not a number and not an ignorable type, give up looking for the number this count belongs to
break
return None
# Gets 6 in '03 of 6'
def get_number(p: Parser, index: int) -> filenamelexer.Item | None:
    """Look forward from ``index`` for the count in an '<number> of <count>' phrase.

    Args:
        p: The parser whose lexed items are searched.
        index: Position in ``p.input`` at which the forward scan starts.

    Returns:
        The first Number or IssueNumber item found after an "of", e.g. the 6 in
        '03 of 6' or '03 (of 6)'. ``None`` if any other item (including a number
        that is not preceded by "of") is met first, or if the input runs out.
    """
    # Items that may sit between the number, the "of" and the count
    skippable = (
        filenamelexer.ItemType.LeftParen,
        filenamelexer.ItemType.LeftBrace,
        filenamelexer.ItemType.LeftSBrace,
        filenamelexer.ItemType.Space,
    )
    numeric = (filenamelexer.ItemType.Number, filenamelexer.ItemType.IssueNumber)

    of_found = False
    for item in p.input[index:]:
        if item.typ in skippable:
            continue
        # Match "of" regardless of case, the same way the rest of the parser does
        if item.val.casefold() == "of":
            of_found = True
            continue
        if item.typ in numeric and of_found:
            # We got our number, time to leave
            return item
        # Not part of an 'of <count>' phrase, give up looking
        break
    return None
@ -1122,11 +1226,21 @@ def join_title(lst: list[filenamelexer.Item]) -> str:
if i == len(lst) - 1:
continue
# No space after honorifics with a dot
if item.typ == filenamelexer.ItemType.Honorific and lst[i + 1].typ == filenamelexer.ItemType.Dot:
if (
item.typ in (filenamelexer.ItemType.Honorific, filenamelexer.ItemType.Text)
and lst[i + 1].typ == filenamelexer.ItemType.Dot
):
continue
if item.no_space:
continue
# No space if the next item is an operator or symbol
if lst[i + 1].typ in [filenamelexer.ItemType.Operator, filenamelexer.ItemType.Symbol]:
continue
# except if followed by a dollar sign
if not (
lst[i].typ in [filenamelexer.ItemType.Number, filenamelexer.ItemType.IssueNumber]
and lst[i + 1].val == "$"
):
continue
# Add a space
title += " "
@ -1140,6 +1254,7 @@ def Parse(
remove_c2c: bool = False,
remove_fcbd: bool = False,
remove_publisher: bool = False,
protofolius_issue_number_scheme: bool = False,
) -> Parser:
p = Parser(
lexer_result=lexer_result,
@ -1147,6 +1262,7 @@ def Parse(
remove_c2c=remove_c2c,
remove_fcbd=remove_fcbd,
remove_publisher=remove_publisher,
protofolius_issue_number_scheme=protofolius_issue_number_scheme,
)
p.run()
return p

View File

@ -32,6 +32,7 @@ class IssueString:
self.num = None
self.suffix = ""
self.prefix = ""
if text is None:
return
@ -41,18 +42,25 @@ class IssueString:
if len(text) == 0:
return
for idx, r in enumerate(text):
if not r.isalpha():
break
self.prefix = text[:idx]
self.num, self.suffix = self.get_number(text[idx:])
def get_number(self, text: str) -> tuple[float | None, str]:
num, suffix = None, ""
start = 0
# skip the minus sign if it's first
if text[0] == "-":
if text[0] in ("-", "+"):
start = 1
else:
start = 0
# if it's still not numeric at start skip it
if text[start].isdigit() or text[start] == ".":
# walk through the string, look for split point (the first non-numeric)
decimal_count = 0
for idx in range(start, len(text)):
if text[idx] not in "0123456789.":
if not (text[idx].isdigit() or text[idx] in "."):
break
# special case: also split on second "."
if text[idx] == ".":
@ -71,42 +79,48 @@ class IssueString:
if idx == 1 and start == 1:
idx = 0
part1 = text[0:idx]
part2 = text[idx : len(text)]
if part1 != "":
self.num = float(part1)
self.suffix = part2
if text[0:idx]:
num = float(text[0:idx])
suffix = text[idx : len(text)]
else:
self.suffix = text
suffix = text
return num, suffix
def as_string(self, pad: int = 0) -> str:
# return the float, left side zero-padded, with suffix attached
"""return the number, left side zero-padded, with suffix attached"""
# if there is no number return the text
if self.num is None:
return self.suffix
return self.prefix + self.suffix
# negative is added back in last
negative = self.num < 0
num_f = abs(self.num)
# used for padding
num_int = int(num_f)
num_s = str(num_int)
if float(num_int) != num_f:
num_s = str(num_f)
num_s += self.suffix
if num_f.is_integer():
num_s = str(num_int)
else:
num_s = str(num_f)
# create padding
padding = ""
# we only pad the whole number part, we don't care about the decimal
length = len(str(num_int))
if length < pad:
padding = "0" * (pad - length)
# add the padding to the front
num_s = padding + num_s
# finally add the negative back in
if negative:
num_s = "-" + num_s
return num_s
# return the prefix + formatted number + suffix
return self.prefix + num_s + self.suffix
def as_float(self) -> float | None:
# return the float, with no suffix

View File

@ -26,6 +26,7 @@ from shutil import which # noqa: F401
from typing import Any
import comicapi.data
from comicapi import filenamelexer, filenameparser
try:
import icu
@ -60,6 +61,51 @@ def os_sorted(lst: Iterable) -> Iterable:
return sorted(lst, key=key)
def parse_filename(
    filename: str,
    complicated_parser: bool = False,
    remove_c2c: bool = False,
    remove_fcbd: bool = False,
    remove_publisher: bool = False,
    split_words: bool = False,
    allow_issue_start_with_letter: bool = False,
    protofolius_issue_number_scheme: bool = False,
) -> filenameparser.FilenameInfo:
    """Parse a comic filename into a ``FilenameInfo`` mapping.

    Args:
        filename: The filename to parse.
        complicated_parser: Use the lexer based parser instead of ``FileNameParser``.
        remove_c2c: Passed to the lexer based parser.
        remove_fcbd: Passed to the lexer based parser.
        remove_publisher: Passed to the lexer based parser.
        split_words: Split the name (without its extension) into words with
            ``wordninja`` before parsing.
        allow_issue_start_with_letter: Passed to the lexer; only used with
            ``complicated_parser``.
        protofolius_issue_number_scheme: Passed to the lexer based parser.

    Returns:
        The parsed fields. With the simple parser only the fields it found
        (issue, series, volume, year, issue_count, remainder) are present.
    """
    if split_words:
        import wordninja

        stem, extension = os.path.splitext(filename)
        filename = " ".join(wordninja.split(stem)) + extension

    if not complicated_parser:
        simple = filenameparser.FileNameParser()
        simple.parse_filename(filename)

        info = filenameparser.FilenameInfo()
        # Empty results are left out rather than stored
        if simple.issue:
            info["issue"] = simple.issue
        if simple.series:
            info["series"] = simple.series
        if simple.volume:
            info["volume"] = simple.volume
        if simple.year:
            info["year"] = simple.year
        if simple.issue_count:
            info["issue_count"] = simple.issue_count
        if simple.remainder:
            info["remainder"] = simple.remainder
        return info

    lexed = filenamelexer.Lex(filename, allow_issue_start_with_letter)
    parsed = filenameparser.Parse(
        lexed.items,
        remove_c2c=remove_c2c,
        remove_fcbd=remove_fcbd,
        remove_publisher=remove_publisher,
        protofolius_issue_number_scheme=protofolius_issue_number_scheme,
    )
    return parsed.filename_info
def combine_notes(existing_notes: str | None, new_notes: str | None, split: str) -> str:
split_notes, split_str, untouched_notes = (existing_notes or "").rpartition(split)
if split_notes or split_str:

View File

@ -119,6 +119,18 @@ def filename(parser: settngs.Manager) -> None:
action=argparse.BooleanOptionalAction,
help="Attempts to remove publisher names from filenames, currently limited to Marvel and DC. Requires --complicated-parser",
)
# Opt-in flag: treat a letter in front of an issue number as format information
parser.add_setting(
    "--protofolius-issue-number-scheme",
    default=False,
    action=argparse.BooleanOptionalAction,
    help="Use an issue number scheme devised by protofolius for encoding format information as a letter in front of an issue number. Implies --allow-issue-start-with-letter. Requires --complicated-parser",
)
parser.add_setting(
"--allow-issue-start-with-letter",
default=False,
action=argparse.BooleanOptionalAction,
help="Allows an issue number to start with a single letter (e.g. '#X01'). Requires --complicated-parser",
)
def talker(parser: settngs.Manager) -> None:
@ -220,7 +232,7 @@ def autotag(parser: settngs.Manager) -> None:
parser.add_setting("remove_archive_after_successful_match", default=False, cmdline=False)
def validate_file_settings(config: settngs.Config[ct_ns]) -> settngs.Config[ct_ns]:
def parse_filter(config: settngs.Config[ct_ns]) -> settngs.Config[ct_ns]:
new_filter = []
remove = []
for x in config[0].Issue_Identifier_publisher_filter:
@ -235,6 +247,13 @@ def validate_file_settings(config: settngs.Config[ct_ns]) -> settngs.Config[ct_n
if x in new_filter:
new_filter.remove(x)
config[0].Issue_Identifier_publisher_filter = new_filter
return config
def validate_file_settings(config: settngs.Config[ct_ns]) -> settngs.Config[ct_ns]:
config = parse_filter(config)
if config[0].Filename_Parsing_protofolius_issue_number_scheme:
config[0].Filename_Parsing_allow_issue_start_with_letter = True
config[0].File_Rename_replacements = Replacements(
[Replacement(x[0], x[1], x[2]) for x in config[0].File_Rename_replacements[0]],

View File

@ -69,6 +69,8 @@ class settngs_namespace(settngs.TypedNS):
Filename_Parsing_remove_c2c: bool
Filename_Parsing_remove_fcbd: bool
Filename_Parsing_remove_publisher: bool
Filename_Parsing_protofolius_issue_number_scheme: bool
Filename_Parsing_allow_issue_start_with_letter: bool
Sources_source: str
Sources_remove_html_tables: bool

View File

@ -195,6 +195,8 @@ class SettingsWindow(QtWidgets.QDialog):
self.settings_to_form()
self.rename_test()
self.dir_test()
self.leFilenameParserTest.setText(self.lblRenameTest.text())
self.filename_parser_test()
# Set General as start tab
self.tabWidget.setCurrentIndex(0)
@ -222,6 +224,15 @@ class SettingsWindow(QtWidgets.QDialog):
self.twLiteralReplacements.cellChanged.connect(self.rename_test)
self.twValueReplacements.cellChanged.connect(self.rename_test)
self.leFilenameParserTest.textEdited.connect(self.filename_parser_test)
self.cbxRemoveC2C.clicked.connect(self.filename_parser_test)
self.cbxRemoveFCBD.clicked.connect(self.filename_parser_test)
self.cbxRemovePublisher.clicked.connect(self.filename_parser_test)
self.cbxProtofoliusIssueNumberScheme.clicked.connect(self.filename_parser_test)
self.cbxProtofoliusIssueNumberScheme.clicked.connect(self.protofolius_clicked)
self.cbxAllowIssueStartWithLetter.clicked.connect(self.filename_parser_test)
self.cbxSplitWords.clicked.connect(self.filename_parser_test)
def disconnect_signals(self) -> None:
self.btnAddLiteralReplacement.clicked.disconnect()
self.btnAddValueReplacement.clicked.disconnect()
@ -241,6 +252,55 @@ class SettingsWindow(QtWidgets.QDialog):
self.leRenameTemplate.textEdited.disconnect()
self.twLiteralReplacements.cellChanged.disconnect()
self.twValueReplacements.cellChanged.disconnect()
self.leFilenameParserTest.textEdited.disconnect()
self.cbxRemoveC2C.clicked.disconnect()
self.cbxRemoveFCBD.clicked.disconnect()
self.cbxRemovePublisher.clicked.disconnect()
self.cbxProtofoliusIssueNumberScheme.clicked.disconnect()
self.cbxAllowIssueStartWithLetter.clicked.disconnect()
self.cbxSplitWords.clicked.disconnect()
def protofolius_clicked(self, *args: Any, **kwargs: Any) -> None:
    """Keep the 'allow issue start with letter' box in step with the protofolius box.

    While the protofolius scheme is checked the other checkbox is disabled and
    forced on; when it is unchecked the other checkbox is only re-enabled. The
    filename parser preview is refreshed afterwards. Any arguments are ignored.
    """
    scheme_enabled = self.cbxProtofoliusIssueNumberScheme.isChecked()
    self.cbxAllowIssueStartWithLetter.setEnabled(not scheme_enabled)
    if scheme_enabled:
        self.cbxAllowIssueStartWithLetter.setChecked(True)
    self.filename_parser_test()
def filename_parser_test(self, *args: Any, **kwargs: Any) -> None:
    """Refresh the parser preview from the test line edit; any arguments are ignored."""
    current_text = self.leFilenameParserTest.text()
    self._filename_parser_test(current_text)
def _filename_parser_test(self, filename: str) -> None:
    """Parse ``filename`` with the options selected in the form and display the result.

    The parsed fields are written to ``lblFilenameParserTest`` as one
    ``Name: value`` line per field.

    Args:
        filename: The filename to run through the parser.
    """
    filename_info = utils.parse_filename(
        filename=filename,
        complicated_parser=self.cbxComplicatedParser.isChecked(),
        remove_c2c=self.cbxRemoveC2C.isChecked(),
        remove_fcbd=self.cbxRemoveFCBD.isChecked(),
        remove_publisher=self.cbxRemovePublisher.isChecked(),
        split_words=self.cbxSplitWords.isChecked(),
        allow_issue_start_with_letter=self.cbxAllowIssueStartWithLetter.isChecked(),
        protofolius_issue_number_scheme=self.cbxProtofoliusIssueNumberScheme.isChecked(),
    )
    parsed = dict(filename_info)
    fields = (
        "series",
        "issue",
        "issue_count",
        "title",
        "volume",
        "volume_count",
        "year",
        "alternate",
        "publisher",
        "archive",
        "remainder",
        "annual",
        "c2c",
        "fcbd",
    )
    # The simple parser only returns the fields it found, so a missing key is
    # shown as an empty value instead of raising KeyError
    report = "".join(f"{item.title().replace('_', ' ')}: {parsed.get(item, '')}\n" for item in fields)
    self.lblFilenameParserTest.setText(report)
def addLiteralReplacement(self) -> None:
self.insertRow(self.twLiteralReplacements, self.twLiteralReplacements.rowCount(), Replacement("", "", False))
@ -319,6 +379,9 @@ class SettingsWindow(QtWidgets.QDialog):
self.cbxRemoveC2C.setChecked(self.config[0].Filename_Parsing_remove_c2c)
self.cbxRemoveFCBD.setChecked(self.config[0].Filename_Parsing_remove_fcbd)
self.cbxRemovePublisher.setChecked(self.config[0].Filename_Parsing_remove_publisher)
self.cbxProtofoliusIssueNumberScheme.setChecked(self.config[0].Filename_Parsing_protofolius_issue_number_scheme)
self.cbxAllowIssueStartWithLetter.setChecked(self.config[0].Filename_Parsing_allow_issue_start_with_letter)
self.switch_parser()
self.cbxClearFormBeforePopulating.setChecked(self.config[0].Issue_Identifier_clear_form_before_populating)
@ -434,6 +497,10 @@ class SettingsWindow(QtWidgets.QDialog):
self.config[0].Filename_Parsing_remove_c2c = self.cbxRemoveC2C.isChecked()
self.config[0].Filename_Parsing_remove_fcbd = self.cbxRemoveFCBD.isChecked()
self.config[0].Filename_Parsing_remove_publisher = self.cbxRemovePublisher.isChecked()
self.config[0].Filename_Parsing_allow_issue_start_with_letter = self.cbxAllowIssueStartWithLetter.isChecked()
self.config.values.Filename_Parsing_protofolius_issue_number_scheme = (
self.cbxProtofoliusIssueNumberScheme.isChecked()
)
self.config[0].Issue_Identifier_clear_form_before_populating = self.cbxClearFormBeforePopulating.isChecked()
self.config[0].Issue_Identifier_always_use_publisher_filter = self.cbxUseFilter.isChecked()

View File

@ -318,6 +318,46 @@
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="cbxProtofoliusIssueNumberScheme">
<property name="text">
<string>Use protofolius's issue number scheme</string>
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="cbxAllowIssueStartWithLetter">
<property name="text">
<string>Allow issue numbers to start with a letter</string>
</property>
</widget>
</item>
</layout>
</widget>
</item>
<item>
<widget class="QGroupBox" name="groupBox_3">
<layout class="QVBoxLayout" name="verticalLayout_8">
<item>
<widget class="QCheckBox" name="cbxSplitWords">
<property name="text">
<string>!Preview only! Attempts to split words before parsing the filename. e.g. 'judgedredd' to 'judge dredd'</string>
</property>
</widget>
</item>
<item>
<widget class="QLineEdit" name="leFilenameParserTest"/>
</item>
<item>
<widget class="QLabel" name="lblFilenameParserTest">
<property name="textFormat">
<enum>Qt::PlainText</enum>
</property>
<property name="textInteractionFlags">
<set>Qt::LinksAccessibleByMouse|Qt::TextSelectableByKeyboard|Qt::TextSelectableByMouse</set>
</property>
</widget>
</item>
</layout>
</widget>
</item>

View File

@ -23,6 +23,21 @@ datadir = pathlib.Path(__file__).parent / "data"
cbz_path = datadir / "Cory Doctorow's Futuristic Tales of the Here and Now #001 - Anda's Game (2007).cbz"
names = [
(
"Michel Vaillant #5 Nr. 13 aan de start",
"Shortened word followed by a number eg No. 13, Mr. 13",
{
"issue": "5",
"series": "Michel Vaillant",
"title": "Nr. 13 aan de start",
"volume": "",
"year": "",
"remainder": "",
"issue_count": "",
"alternate": "",
},
(False, True),
),
(
"Karl May #001 Old Shatterhand.cbr",
"Month in series",
@ -39,9 +54,146 @@ names = [
},
(False, True),
),
(
"Michel Vaillant #8 De 8ste man",
"Non english ordinal",
{
"issue": "8",
"series": "Michel Vaillant",
"title": "De 8ste man",
"volume": "",
"year": "",
"remainder": "",
"issue_count": "",
"alternate": "",
},
(False, True),
),
(
"Michel Vaillant #13 Mach 1 voor Steve Warson",
"number in title",
{
"issue": "13",
"series": "Michel Vaillant",
"title": "Mach 1 voor Steve Warson",
"volume": "",
"year": "",
"remainder": "",
"issue_count": "",
"alternate": "",
},
(False, True),
),
(
"Michel Vaillant #19 5 Meisjes in de race",
"number starting title",
{
"issue": "19",
"series": "Michel Vaillant",
"title": "5 Meisjes in de race",
"volume": "",
"year": "",
"remainder": "",
"issue_count": "",
"alternate": "",
},
(False, True),
),
(
"Michel Vaillant #34 Steve Warson gaat K.O.",
"acronym",
{
"issue": "34",
"series": "Michel Vaillant",
"title": "Steve Warson gaat K.O.",
"volume": "",
"year": "",
"remainder": "",
"issue_count": "",
"alternate": "",
},
(False, True),
),
(
"Michel Vaillant #40 F.1 in oproer",
"acronym with numbers",
{
"issue": "40",
"series": "Michel Vaillant",
"title": "F.1 in oproer",
"volume": "",
"year": "",
"remainder": "",
"issue_count": "",
"alternate": "",
},
(False, True),
),
(
"Michel Vaillant #42 300 kmu door Parijs",
"number starting title",
{
"issue": "42",
"series": "Michel Vaillant",
"title": "300 kmu door Parijs",
"volume": "",
"year": "",
"remainder": "",
"issue_count": "",
"alternate": "",
},
(False, True),
),
(
"Michel Vaillant #52 F 3000",
"title ends with number",
{
"issue": "52",
"series": "Michel Vaillant",
"title": "F 3000",
"volume": "",
"year": "",
"remainder": "",
"issue_count": "",
"alternate": "",
},
(False, True),
),
(
"Michel Vaillant #66 100.000.000 $ voor Steve Warson",
"number separator is . and dollarsign after number",
{
"issue": "66",
"series": "Michel Vaillant",
"title": "100.000.000 $ voor Steve Warson",
"volume": "",
"year": "",
"remainder": "",
"issue_count": "",
"alternate": "",
},
(False, True),
),
(
"batman #B01 title (DC).cbz",
"protofolius_issue_number_scheme",
{
"issue": "B1",
"series": "batman",
"title": "title",
"publisher": "DC",
"volume": "",
"year": "",
"remainder": "",
"issue_count": "",
"alternate": "",
"format": "biography/best of",
},
(False, True),
),
(
"batman #3 title (DC).cbz",
"honorific and publisher in series",
"publisher in parenthesis",
{
"issue": "3",
"series": "batman",
@ -57,7 +209,7 @@ names = [
),
(
"batman #3 title DC.cbz",
"honorific and publisher in series",
"publisher in title",
{
"issue": "3",
"series": "batman",
@ -740,15 +892,33 @@ names = [
),
]
# Parametrization tables derived from ``names``.
#
# Every entry of ``names`` is unpacked as (filename, reason, info, xfail),
# where ``xfail`` is indexed with [0] for the filename as written and with [1]
# for the same filename with every "#" removed.
#
# NOTE(review): this span comes from a diff whose indentation was lost; the
# nesting below is reconstructed from the statements themselves - confirm
# against the repository before relying on it.
fnames = []  # plain (filename, reason, info, xfail) tuples
oldfnames = []  # pytest.param cases, each marked xfail(reason="old parser") when its flag is set
newfnames = []  # pytest.param cases, only selectively marked xfail (see below)
for p in names:
    filename, reason, info, xfail = p

    # The filename as written is always a case.  It is never given an xfail
    # mark in ``newfnames``, hence the trailing False.
    variants = [(filename, xfail[0], False)]
    if "#" in filename:
        # A second case without the "#".  Only for these two reasons does the
        # "#"-less variant get an xfail mark in ``newfnames``.
        variants.append(
            (
                filename.replace("#", ""),
                xfail[1],
                reason in ("protofolius_issue_number_scheme", "number starting title"),
            )
        )

    for variant, nxfail, mark_new in variants:
        fnames.append((variant, reason, info, nxfail))

        if mark_new:
            newfnames.append(
                pytest.param(
                    variant,
                    reason,
                    info,
                    nxfail,
                    marks=pytest.mark.xfail(condition=nxfail, reason=reason),
                )
            )
        else:
            newfnames.append(pytest.param(variant, reason, info, nxfail))

        # ``condition=nxfail`` makes the mark a no-op for cases whose flag is False.
        oldfnames.append(
            pytest.param(variant, reason, info, nxfail, marks=pytest.mark.xfail(condition=nxfail, reason="old parser"))
        )
rnames = [
(

View File

@ -2,18 +2,21 @@ from __future__ import annotations
import pytest
import comicapi.filenamelexer
import comicapi.filenameparser
from testing.filenames import fnames
from testing.filenames import newfnames, oldfnames
@pytest.mark.parametrize("filename, reason, expected, xfail", fnames)
@pytest.mark.parametrize("filename, reason, expected, xfail", newfnames)
def test_file_name_parser_new(filename, reason, expected, xfail):
lex = comicapi.filenamelexer.Lex(filename, "protofolius_issue_number_scheme" == reason)
p = comicapi.filenameparser.Parse(
comicapi.filenamelexer.Lex(filename).items,
lex.items,
first_is_alt=True,
remove_c2c=True,
remove_fcbd=True,
remove_publisher=True,
protofolius_issue_number_scheme="protofolius_issue_number_scheme" == reason,
)
fp = p.filename_info
@ -30,13 +33,13 @@ def test_file_name_parser_new(filename, reason, expected, xfail):
assert fp == expected
@pytest.mark.parametrize("filename, reason, expected, xfail", fnames)
@pytest.mark.parametrize("filename, reason, expected, xfail", oldfnames)
def test_file_name_parser(filename, reason, expected, xfail):
p = comicapi.filenameparser.FileNameParser()
p.parse_filename(filename)
fp = p.__dict__
# These are currently not tracked in this parser
for s in ["title", "alternate", "publisher", "fcbd", "c2c", "annual", "volume_count", "remainder"]:
for s in ["title", "alternate", "publisher", "fcbd", "c2c", "annual", "volume_count", "remainder", "format"]:
if s in expected:
del expected[s]
@ -44,6 +47,4 @@ def test_file_name_parser(filename, reason, expected, xfail):
if "remainder" in fp:
del fp["remainder"]
if xfail and fp != expected:
pytest.xfail("old parser")
assert fp == expected

View File

@ -12,6 +12,9 @@ issues = [
("1", 1.0, "001"),
("22.BEY", 22.0, "022.BEY"),
("22A", 22.0, "022A"),
("A22A", 22.0, "A022A"),
("A22", 22.0, "A022"),
("A22½", 22.5, "A022½"),
("22-A", 22.0, "022-A"),
("", None, ""),
]