Add new filename parser

I created a new, mostly overcomplicated, filename parser.
The new parser works well in many cases and collects more data than
the original parser, but it will sometimes give odd results because of how
complicated it has become, e.g.
'100 page giant' will cause issues whereas '100-page giant' will not.

Remove the parse scan info setting as it was not respected in many cases
This commit is contained in:
Timmy Welch 2022-04-29 16:37:44 -07:00
parent 049971a78a
commit 205d337751
15 changed files with 1560 additions and 125 deletions

View File

@ -42,10 +42,10 @@ try:
except ImportError:
pil_available = False
from comicapi import filenamelexer, filenameparser
from comicapi.comet import CoMet
from comicapi.comicbookinfo import ComicBookInfo
from comicapi.comicinfoxml import ComicInfoXml
from comicapi.filenameparser import FileNameParser
from comicapi.genericmetadata import GenericMetadata, PageType
logger = logging.getLogger(__name__)
@ -1127,25 +1127,46 @@ class ComicArchive:
data = self.get_page(idx)
p["ImageSize"] = str(len(data))
def metadata_from_filename(self, parse_scan_info=True):
def metadata_from_filename(
self, complicated_parser=False, remove_c2c=False, remove_fcbd=False, remove_publisher=False
):
metadata = GenericMetadata()
fnp = FileNameParser()
fnp.parse_filename(self.path)
if complicated_parser:
lex = filenamelexer.Lex(self.path)
p = filenameparser.Parse(
lex.items, remove_c2c=remove_c2c, remove_fcbd=remove_fcbd, remove_publisher=remove_publisher
)
metadata.alternate_number = p.filename_info["alternate"] or None
metadata.issue = p.filename_info["issue"] or None
metadata.issue_count = p.filename_info["issue_count"] or None
metadata.publisher = p.filename_info["publisher"] or None
metadata.series = p.filename_info["series"] or None
metadata.title = p.filename_info["title"] or None
metadata.volume = p.filename_info["volume"] or None
metadata.volume_count = p.filename_info["volume_count"] or None
metadata.year = p.filename_info["year"] or None
if fnp.issue != "":
metadata.issue = fnp.issue
if fnp.series != "":
metadata.series = fnp.series
if fnp.volume != "":
metadata.volume = fnp.volume
if fnp.year != "":
metadata.year = fnp.year
if fnp.issue_count != "":
metadata.issue_count = fnp.issue_count
if parse_scan_info:
if fnp.remainder != "":
metadata.scan_info = p.filename_info["remainder"] or None
metadata.format = "FCBD" if p.filename_info["fcbd"] else None
if p.filename_info["annual"]:
metadata.format = "Annual"
else:
fnp = filenameparser.FileNameParser()
fnp.parse_filename(self.path)
if fnp.issue:
metadata.issue = fnp.issue
if fnp.series:
metadata.series = fnp.series
if fnp.volume:
metadata.volume = fnp.volume
if fnp.year:
metadata.year = fnp.year
if fnp.issue_count:
metadata.issue_count = fnp.issue_count
if fnp.remainder:
metadata.scan_info = fnp.remainder
metadata.is_empty = False

353
comicapi/filenamelexer.py Normal file
View File

@ -0,0 +1,353 @@
import calendar
import os
import unicodedata
from enum import Enum, auto
class ItemType(Enum):
    """Token categories attached to every ``Item`` the filename lexer produces."""

    Error = auto()  # Error occurred; value is text of error
    EOF = auto()  # End of the input string
    Text = auto()  # Plain word; also used for numbers with a textual suffix such as '80th'
    LeftParen = auto()  # '('
    Number = auto()  # Simple number
    IssueNumber = auto()  # Preceded by a # Symbol
    RightParen = auto()  # ')'
    Space = auto()  # Run of spaces, tabs or underscores
    Dot = auto()  # '.' that is not the start of a number
    LeftBrace = auto()  # '{'
    RightBrace = auto()  # '}'
    LeftSBrace = auto()  # '['
    RightSBrace = auto()  # ']'
    Symbol = auto()  # Any other punctuation/symbol character
    Skip = auto()  # __ or -- no title, issue or series information beyond
    Operator = auto()  # Run of separator characters such as - | : ;
    Calendar = auto()  # Month or day name (full or abbreviated)
    InfoSpecifier = auto()  # Specifies type of info e.g. v1 for 'volume': 1
    ArchiveType = auto()  # Known archive extension e.g. cbz, rar
    Honorific = auto()  # mr, ms, mrs, dr
    Keywords = auto()  # NOTE(review): not emitted by any lexer code visible here
    FCBD = auto()  # 'fcbd' / 'freecomicbookday'
    ComicType = auto()  # 'annual' or 'book'
    Publisher = auto()  # Publisher name from the static keyword table
    C2C = auto()  # 'c2c'
# All bracket-like item types, opening and closing.
braces = [
    ItemType.LeftBrace,
    ItemType.LeftParen,
    ItemType.LeftSBrace,
    ItemType.RightBrace,
    ItemType.RightParen,
    ItemType.RightSBrace,
]

# Sentinel character returned by Lexer.get/peek once the input is exhausted.
eof = chr(0)

# Maps a lower-cased word to the item type lex_text emits for it instead of plain Text.
key = {
    "fcbd": ItemType.FCBD,
    "freecomicbookday": ItemType.FCBD,
    "cbr": ItemType.ArchiveType,
    "cbz": ItemType.ArchiveType,
    "cbt": ItemType.ArchiveType,
    "cb7": ItemType.ArchiveType,
    "rar": ItemType.ArchiveType,
    "zip": ItemType.ArchiveType,
    "tar": ItemType.ArchiveType,
    "7z": ItemType.ArchiveType,
    "annual": ItemType.ComicType,
    "book": ItemType.ComicType,
    "volume": ItemType.InfoSpecifier,
    "vol.": ItemType.InfoSpecifier,
    "vol": ItemType.InfoSpecifier,
    "v": ItemType.InfoSpecifier,
    "of": ItemType.InfoSpecifier,
    "dc": ItemType.Publisher,
    "marvel": ItemType.Publisher,
    "covers": ItemType.InfoSpecifier,
    "c2c": ItemType.C2C,
    "mr": ItemType.Honorific,
    "ms": ItemType.Honorific,
    "mrs": ItemType.Honorific,
    "dr": ItemType.Honorific,
}
class Item:
    """One lexed token: its category, its start offset in the input and its text."""

    def __init__(self, typ: ItemType, pos: int, val: str):
        self.typ: ItemType = typ  # Category of the token
        self.pos: int = pos  # Offset of the token's first character in the lexed string
        self.val: str = val  # Text of the token (or the message for Error items)

    def __repr__(self):
        return "{}: index: {}: {}".format(self.val, self.pos, self.typ)
class Lexer:
    """Character-level scanner that splits a filename into ``Item`` tokens.

    The lexer is a small state machine: ``run`` starts in ``lex_filename`` and
    keeps calling the state function it gets back until one returns ``None``.
    Every token produced along the way is appended to ``self.items``.

    ``pos`` is the index of the most recently consumed character (``-1``
    before anything has been read); ``start`` is the index of the first
    character of the token currently being built.
    """

    def __init__(self, string):
        self.input: str = string  # The string being scanned
        self.state = None  # The next lexing function to enter
        self.pos: int = -1  # Index of the most recently consumed character
        self.start: int = 0  # Start position of the item being built
        self.lastPos: int = 0  # Not read or updated anywhere in this class
        self.paren_depth: int = 0  # Nesting depth of ( )
        self.brace_depth: int = 0  # Nesting depth of { }
        self.sbrace_depth: int = 0  # Nesting depth of [ ]
        self.items = []  # Items emitted so far, in input order

    def get(self) -> str:
        """Consume and return the next character, or ``eof`` past the end.

        ``pos`` is advanced even at end of input so that a following
        ``backup`` lands on the last real character again.
        """
        self.pos += 1
        if self.pos >= len(self.input):
            return eof
        return self.input[self.pos]

    def peek(self) -> str:
        """Return the next character without consuming it (``eof`` at the end)."""
        upcoming = self.pos + 1
        if upcoming >= len(self.input):
            return eof
        return self.input[upcoming]

    def backup(self):
        """Step back one character; undoes a single ``get``."""
        self.pos -= 1

    def emit(self, t: ItemType):
        """Append the pending text ``input[start:pos + 1]`` as an item of type ``t``."""
        self.items.append(Item(t, self.start, self.input[self.start : self.pos + 1]))
        self.start = self.pos + 1

    def ignore(self):
        """Discard the pending text so the next item starts after ``pos``.

        ``pos`` indexes the last consumed character, so the next item begins
        at ``pos + 1``, the same convention ``emit`` uses.
        """
        self.start = self.pos + 1

    def accept(self, valid: str):
        """Consume the next character if it occurs in ``valid``; report whether it did."""
        if self.get() in valid:
            return True
        self.backup()
        return False

    def accept_run(self, valid: str):
        """Consume characters for as long as they occur in ``valid``."""
        while self.get() in valid:
            pass
        # The loop consumed one character too many (or ran into eof).
        self.backup()

    def errorf(self, message: str):
        """Record an ``Error`` item carrying ``message``.

        Returns ``None`` so that ``return lex.errorf(...)`` inside a state
        function terminates ``run``.
        """
        self.items.append(Item(ItemType.Error, self.start, message))
        return None

    def scan_number(self):
        """Consume a number: digits, an optional fraction and an ordinal suffix.

        A ``.`` is only kept when at least one digit follows it. After the
        digits a trailing ``st``, ``nd``/``rd`` or ``th`` is consumed when
        complete; a partial suffix is put back. Always returns ``True``.
        """
        digits = "0123456789"

        self.accept_run(digits)
        if self.accept("."):
            if self.accept(digits):
                self.accept_run(digits)
            else:
                # Lone dot: not part of the number.
                self.backup()
        if self.accept("s"):
            if not self.accept("t"):
                self.backup()
        elif self.accept("nr"):
            if not self.accept("d"):
                self.backup()
        elif self.accept("t"):
            if not self.accept("h"):
                self.backup()

        return True

    def run(self):
        """Run the state machine until a state function returns ``None``."""
        self.state = lex_filename
        while self.state is not None:
            self.state = self.state(self)
def lex_filename(lex: Lexer):
    """Top-level lexer state: look at one character and dispatch on it.

    Single-character tokens (brackets, dots, symbols, skips) are emitted
    directly; everything else hands over to a specialised state function.
    Returns the next state function, or ``None`` to stop the lexer (end of
    input, or after an error item has been recorded).
    """
    r = lex.get()
    if r == eof:
        if lex.paren_depth != 0:
            return lex.errorf("unclosed left paren")
        if lex.brace_depth != 0:
            return lex.errorf("unclosed left brace")
        lex.emit(ItemType.EOF)
        return None
    elif is_space(r):
        # A double underscore is a hard separator rather than whitespace.
        if r == "_" and lex.peek() == "_":
            lex.get()
            lex.emit(ItemType.Skip)
        else:
            return lex_space
    elif r == ".":
        r = lex.peek()
        if r < "0" or "9" < r:
            lex.emit(ItemType.Dot)
            return lex_filename

        # '.5' style number: let lex_number re-read the dot.
        lex.backup()
        return lex_number
    elif r == "'":
        # Short year such as '89; the apostrophe stays part of the number.
        r = lex.peek()
        if r in "0123456789":
            return lex_number
        lex.emit(ItemType.Text)
    elif "0" <= r <= "9":
        lex.backup()
        return lex_number
    elif r == "#":
        # '#' directly followed by a digit starts an issue number.
        if "0" <= lex.peek() <= "9":
            return lex_number
        lex.emit(ItemType.Symbol)
    elif is_operator(r):
        # A double dash is a hard separator rather than an operator.
        if r == "-" and lex.peek() == "-":
            lex.get()
            lex.emit(ItemType.Skip)
        else:
            return lex_operator
    elif is_alpha_numeric(r):
        lex.backup()
        return lex_text
    elif r == "(":
        lex.emit(ItemType.LeftParen)
        lex.paren_depth += 1
    elif r == ")":
        lex.emit(ItemType.RightParen)
        lex.paren_depth -= 1
        if lex.paren_depth < 0:
            return lex.errorf("unexpected right paren " + r)
    elif r == "{":
        lex.emit(ItemType.LeftBrace)
        lex.brace_depth += 1
    elif r == "}":
        lex.emit(ItemType.RightBrace)
        lex.brace_depth -= 1
        if lex.brace_depth < 0:
            return lex.errorf("unexpected right brace " + r)
    elif r == "[":
        lex.emit(ItemType.LeftSBrace)
        lex.sbrace_depth += 1
    elif r == "]":
        lex.emit(ItemType.RightSBrace)
        lex.sbrace_depth -= 1
        if lex.sbrace_depth < 0:
            return lex.errorf("unexpected right brace " + r)
    elif is_symbol(r):
        lex.emit(ItemType.Symbol)
    else:
        return lex.errorf("unrecognized character in action: " + r)

    return lex_filename
def lex_operator(lex: Lexer):
    """Lexer state: extend the operator already read with any run of - | : ; and emit it."""
    run_chars = "-|:;"
    lex.accept_run(run_chars)
    lex.emit(ItemType.Operator)
    return lex_filename
def lex_space(lex: Lexer):
    """Lexer state: swallow the rest of a whitespace run and emit it as one Space item.

    The first whitespace character has already been consumed by the caller.
    """
    while True:
        if not is_space(lex.peek()):
            break
        lex.get()

    lex.emit(ItemType.Space)
    return lex_filename
def lex_text(lex: Lexer):
    """Lexer state: scan one word and emit it with the most specific type.

    An info specifier glued to a number (e.g. the ``v`` in ``v1``) is emitted
    on its own as soon as the digit is seen. Otherwise the word ends at the
    first non-alphanumeric character (a possessive ``'s`` is kept) and is
    emitted as its keyword type, as Calendar, or as plain Text.
    """
    while True:
        ch = lex.get()

        if is_alpha_numeric(ch):
            if not ch.isnumeric():
                continue
            # E.g. v1: the text before the digit may be an info specifier.
            prefix = lex.input[lex.start : lex.pos].lower()
            if prefix in key and key[prefix] == ItemType.InfoSpecifier:
                lex.backup()
                lex.emit(key[prefix])
                return lex_filename
            continue

        if ch == "'" and lex.peek() == "s":
            # Possessive: keep the 's inside the word.
            lex.get()
            continue

        # End of the word: put the terminating character back.
        lex.backup()
        word = lex.input[lex.start : lex.pos + 1]
        if word.lower() == "vol" and lex.peek() == ".":
            lex.get()
        word = lex.input[lex.start : lex.pos + 1]

        lowered = word.lower()
        if lowered in key:
            lex.emit(key[lowered])
        elif cal(word):
            lex.emit(ItemType.Calendar)
        else:
            lex.emit(ItemType.Text)
        return lex_filename
def cal(value: str):
    """Return the set of calendar indexes whose month/day name equals ``value``.

    ``value`` is title-cased and compared against the full and abbreviated
    month and day names of the ``calendar`` module. The result is empty
    (falsy) when ``value`` is not a month or day name.
    """
    wanted = value.title()
    name_tables = (
        calendar.month_abbr,
        calendar.month_name,
        calendar.day_abbr,
        calendar.day_name,
    )
    return {index for table in name_tables for index, name in enumerate(table) if name == wanted}
def lex_number(lex: Lexer):
    """Lexer state: scan a number and emit it as IssueNumber, Number or Text.

    A token starting with ``#`` is an issue number. A token that does not end
    in a digit (e.g. ``80th``) is treated as text rather than a number.
    """
    if not lex.scan_number():
        return lex.errorf("bad number syntax: " + lex.input[lex.start : lex.pos])

    if lex.input[lex.start] == "#":
        kind = ItemType.IssueNumber
    elif lex.input[lex.pos].isdigit():
        kind = ItemType.Number
    else:
        # Assume that 80th is just text and not a number
        kind = ItemType.Text

    lex.emit(kind)
    return lex_filename
def is_space(character: str):
    """Report whether ``character`` counts as whitespace: underscore, space or tab."""
    whitespace_like = "_ \t"
    return character in whitespace_like
def is_alpha_numeric(character: str):
    """Report whether ``character`` is alphabetic or numeric (underscore is not)."""
    if character.isalpha():
        return True
    return character.isnumeric()
def is_operator(character: str):
    """Report whether ``character`` is one of the separator characters - | : ; / \\."""
    operator_chars = "-|:;/\\"
    return character in operator_chars
def is_symbol(character: str):
    """Report whether ``character`` is in a Unicode punctuation (P*) or symbol (S*) category."""
    major_class = unicodedata.category(character)[0]
    return major_class in "PS"
def Lex(filename: str):
    """Lex the base name of ``filename`` and return the finished ``Lexer``.

    Only the final path component is scanned; the resulting tokens are
    available on the returned object's ``items`` attribute.
    """
    base_name = os.path.basename(filename)
    lexer = Lexer(string=base_name)
    lexer.run()
    return lexer

View File

@ -23,8 +23,17 @@ This should probably be re-written, but, well, it mostly works!
import logging
import os
import re
from operator import itemgetter
from typing import TypedDict
from urllib.parse import unquote
from text2digits import text2digits
from comicapi import filenamelexer, issuestring
t2d = text2digits.Text2Digits(add_ordinal_ending=False)
t2do = text2digits.Text2Digits(add_ordinal_ending=True)
logger = logging.getLogger(__name__)
@ -68,9 +77,7 @@ class FileNameParser:
if match:
count = match.group()
count = count.lstrip("0")
return count
return count.lstrip("0")
def get_issue_number(self, filename):
"""Returns a tuple of issue number string, and start and end indexes in the filename
@ -222,7 +229,7 @@ class FileNameParser:
year = ""
# look for four digit number with "(" ")" or "--" around it
match = re.search(r"(\(\d\d\d\d\))|(--\d\d\d\d--)", filename)
match = re.search(r"(\(\d{4}\))|(--\d{4}--)", filename)
if match:
year = match.group()
# remove non-digits
@ -290,3 +297,814 @@ class FileNameParser:
self.issue = "0"
if self.issue[0] == ".":
self.issue = "0" + self.issue
class FilenameInfo(TypedDict, total=False):
    """Result dictionary filled in by the filename parser.

    Keys are optional while parsing; ``parse_finish`` adds any missing key
    with an empty string (or ``False`` for the boolean flags).
    """

    alternate: str  # Alternate number(s) found in addition to the issue number
    annual: bool  # True when the word 'annual' was seen
    archive: str  # Lower-cased archive extension e.g. 'cbz'
    c2c: bool  # True when 'c2c' was seen
    fcbd: bool  # True when a free-comic-book-day marker was seen
    issue: str  # Issue number
    issue_count: str  # N from 'of N' when it refers to the issue
    publisher: str  # Publisher keyword as written in the filename
    remainder: str  # Whatever text was not used for any other field
    series: str  # Series name
    title: str  # Issue/story title
    volume: str  # Volume number
    volume_count: str  # N from 'of N' when it refers to the volume
    year: str  # Selected year candidate
# Sentinel item handed out by Parser.get/peek/peek_back when there is no item at that position.
eof = filenamelexer.Item(filenamelexer.ItemType.EOF, -1, "")
class Parser:
    """State-machine parser that turns lexed filename items into ``filename_info``.

    ``run`` starts in the ``parse`` state and keeps calling the state function
    it gets back until one returns ``None``. State functions read items through
    ``get``/``peek``/``peek_back``/``backup`` and record their findings on this
    object.

    Args:
        lexer_result: Items produced by the filename lexer, in input order.
        first_is_alt: Treat a number that is the very first item as an
            alternate number instead of the issue number.
        remove_c2c: Leave C2C items out of the remainder.
        remove_fcbd: Leave FCBD items out of the remainder.
        remove_publisher: Strip recognised publisher items from series/title.
    """

    def __init__(
        self,
        lexer_result: list[filenamelexer.Item],
        first_is_alt=False,
        remove_c2c=False,
        remove_fcbd=False,
        remove_publisher=False,
    ):
        self.state = None  # Next state function to call
        self.pos = -1  # Index of the most recently consumed item
        self.firstItem = True  # True until the first item has been handled
        self.skip = False  # Set once a Skip item ('--' or '__') was seen
        self.alt = False  # The number being parsed is an alternate number

        self.filename_info: FilenameInfo = {"series": ""}
        # Character offset (Item.pos) of the issue number, None while unknown
        self.issue_number_at = None
        self.in_something = 0  # In some sort of brackets {}[]()
        self.in_brace = 0  # In {}
        self.in_s_brace = 0  # In []
        self.in_paren = 0  # In ()
        self.year_candidates: list[tuple[bool, filenamelexer.Item]] = []
        self.series_parts: list[filenamelexer.Item] = []
        self.title_parts: list[filenamelexer.Item] = []
        self.used_items: list[filenamelexer.Item] = []
        self.irrelevant: list[filenamelexer.Item] = []
        self.operator_rejected: list[filenamelexer.Item] = []
        self.publisher_removed: list[filenamelexer.Item] = []

        self.first_is_alt = first_is_alt
        self.remove_c2c = remove_c2c
        self.remove_fcbd = remove_fcbd
        self.remove_publisher = remove_publisher

        self.input = lexer_result
        # An explicit '#NNN' item pins the issue number up front. Store its
        # character offset, the same unit every other writer and reader of
        # issue_number_at uses (they all work with Item.pos).
        for item in self.input:
            if item.typ == filenamelexer.ItemType.IssueNumber:
                self.issue_number_at = item.pos

    def get(self) -> filenamelexer.Item:
        """Consume and return the next item, or ``eof`` past the end.

        ``pos`` is advanced even at the end so a following ``backup`` lands on
        the last real item again.
        """
        self.pos += 1
        if self.pos >= len(self.input):
            return eof
        return self.input[self.pos]

    def peek(self) -> filenamelexer.Item:
        """Return the next item without consuming it (``eof`` at the end)."""
        upcoming = self.pos + 1
        if upcoming >= len(self.input):
            return eof
        return self.input[upcoming]

    def peek_back(self) -> filenamelexer.Item:
        """Return the item before the current one without moving.

        Yields ``eof`` when there is no previous item, including before the
        first ``get``, where a negative index would otherwise wrap around to
        the end of the input.
        """
        if self.pos <= 0:
            return eof
        return self.input[self.pos - 1]

    def backup(self):
        """Step back one item; undoes a single ``get``."""
        self.pos -= 1

    def run(self):
        """Run the state machine until a state function returns ``None``."""
        self.state = parse
        while self.state is not None:
            self.state = self.state(self)
def parse(p: Parser):
    """Main parser state: consume one item and dispatch on its type.

    Returns the next state function: ``parse_finish`` at EOF,
    ``parse_issue_number``, ``parse_series`` or ``parse_info_specifier`` when
    the item starts one of those constructs, and ``parse`` itself otherwise.
    """
    item: filenamelexer.Item = p.get()

    # We're done, time to do final processing
    if item.typ == filenamelexer.ItemType.EOF:
        return parse_finish

    # Need to figure out if this is the issue number
    if item.typ == filenamelexer.ItemType.Number:
        likely_year = False
        # A leading number is an alternate number when the caller asked for that
        if p.firstItem and p.first_is_alt:
            p.alt = True
            return parse_issue_number

        # The issue number should hopefully not be in parentheses
        if p.in_something == 0:
            # Assume that operators indicate a non-issue number e.g. IG-88 or 88-IG
            if filenamelexer.ItemType.Operator not in (p.peek().typ, p.peek_back().typ):
                # It is common to use '89 to refer to an annual reprint from 1989
                if item.val[0] != "'":
                    # Issue number is less than 4 digits. very few series go above 999
                    if len(item.val.lstrip("0")) < 4:
                        # An issue number starting with # was not found and no previous number was found
                        if p.issue_number_at is None:
                            # Series has already been started/parsed, filters out leading alternate numbers
                            if len(p.series_parts) > 0:
                                # Unset first item
                                if p.firstItem:
                                    p.firstItem = False
                                return parse_issue_number
            else:
                p.operator_rejected.append(item)
                # operator_rejected is used later (parse_finish) to add the number back to the series/title

        # It is more likely to be a year if it is inside parentheses.
        if p.in_something > 0:
            likely_year = True

        # If numbers are directly followed by text it most likely isn't a year e.g. 2048px
        if p.peek().typ == filenamelexer.ItemType.Text:
            likely_year = False

        # Is either a full year '2001' or a short year "'89"
        if len(item.val) == 4 or item.val[0] == "'":
            if p.in_something == 0:
                # Append to series in case it is a part of the title, but only if we're not inside parentheses
                p.series_parts.append(item)

            # Look for a full date as in 2022-04-22
            if p.peek().typ in [
                filenamelexer.ItemType.Symbol,
                filenamelexer.ItemType.Operator,
                filenamelexer.ItemType.Dot,
            ]:
                op = [p.get()]
                if p.peek().typ == filenamelexer.ItemType.Number:
                    month = p.get()
                    if p.peek().typ in [
                        filenamelexer.ItemType.Symbol,
                        filenamelexer.ItemType.Operator,
                        filenamelexer.ItemType.Dot,
                    ]:
                        op.append(p.get())
                        if p.peek().typ == filenamelexer.ItemType.Number:
                            day = p.get()
                            fulldate = [month, day, item]
                            # The whole date is consumed so it stays out of the remainder
                            p.used_items.extend(op)
                            p.used_items.extend(fulldate)
                        else:
                            # Not a full date: undo the three items read ahead
                            p.backup()
                            p.backup()
                            p.backup()
                            # TODO never happens
                    else:
                        # Not a full date: undo the two items read ahead
                        p.backup()
                        p.backup()
                        # TODO never happens
                else:
                    # Not a full date: undo the separator read ahead
                    p.backup()
                    # TODO never happens

            p.year_candidates.append((likely_year, item))
        # Ensures that IG-88 gets added back to the series/title
        # NOTE(review): `and` binds tighter than `or`, so the in_something check only guards the
        # peek_back test; a following operator matches even inside brackets. Confirm this is intended.
        elif (
            p.in_something == 0
            and p.peek_back().typ == filenamelexer.ItemType.Operator
            or p.peek().typ == filenamelexer.ItemType.Operator
        ):
            # The next or previous item is an operator, add the number to the series
            p.series_parts.append(item)
            p.used_items.append(item)

            # Unset first item
            if p.firstItem:
                p.firstItem = False
            p.get()
            return parse_series

    # Number with a leading hash e.g. #003
    elif item.typ == filenamelexer.ItemType.IssueNumber:
        # Unset first item
        if p.firstItem:
            p.firstItem = False
        return parse_issue_number

    # Matches FCBD. Not added to p.used_items so it will show in "remainder"
    elif item.typ == filenamelexer.ItemType.FCBD:
        p.filename_info["fcbd"] = True

    # Matches c2c. Not added to p.used_items so it will show in "remainder"
    elif item.typ == filenamelexer.ItemType.C2C:
        p.filename_info["c2c"] = True

    # Matches the extension if it is known to be an archive format e.g. cbt,cbz,zip,rar
    elif item.typ == filenamelexer.ItemType.ArchiveType:
        p.filename_info["archive"] = item.val.lower()
        p.used_items.append(item)
        # Also consume the dot in front of the extension
        if p.peek_back().typ == filenamelexer.ItemType.Dot:
            p.used_items.append(p.peek_back())

    # Allows removing DC from 'Wonder Woman 49 DC Sep-Oct 1951' dependent on publisher being in a static list in the lexer
    elif item.typ == filenamelexer.ItemType.Publisher:
        p.filename_info["publisher"] = item.val
        p.used_items.append(item)
        if p.firstItem:
            p.firstItem = False
            # A leading publisher outside brackets is not recorded in publisher_removed
            if p.in_something == 0:
                return parse_series
        p.publisher_removed.append(item)
        if p.in_something == 0:
            return parse_series

    # Attempts to identify the type e.g. annual
    elif item.typ == filenamelexer.ItemType.ComicType:
        series_append = True

        if p.peek().typ == filenamelexer.ItemType.Space:
            p.get()

        if p.series_parts and "free comic book" in (" ".join([x.val for x in p.series_parts]) + " " + item.val).lower():
            p.filename_info["fcbd"] = True
            series_append = True
        # If the next item is a number it's probably the volume
        elif p.peek().typ == filenamelexer.ItemType.Number or (
            p.peek().typ == filenamelexer.ItemType.Text and t2d.convert(p.peek().val).isnumeric()
        ):
            number = p.get()
            # Mark volume info. Text will be added to the title/series later
            if item.val.lower() in ["book", "tpb"]:
                p.title_parts.extend([item, number])
                p.filename_info["volume"] = t2do.convert(number.val)
                p.filename_info["issue"] = t2do.convert(number.val)

                p.used_items.append(item)
                series_append = False

            # Annuals usually mean the year
            elif item.val.lower() in ["annual"]:
                p.filename_info["annual"] = True
                num = t2d.convert(number.val)
                if num.isnumeric() and len(num) == 4:
                    p.year_candidates.append((True, number))
                else:
                    # Not a year: leave the number for the next state to look at
                    p.backup()

        elif item.val.lower() in ["annual"]:
            p.filename_info["annual"] = True

        # If we don't have a reason to exclude it from the series go back to parsing the series immediately
        if series_append:
            p.series_parts.append(item)
            p.used_items.append(item)
            return parse_series

    # We found text, it's probably the title or series
    elif item.typ in [filenamelexer.ItemType.Text, filenamelexer.ItemType.Honorific]:
        # Unset first item
        if p.firstItem:
            p.firstItem = False
        if p.in_something == 0:
            return parse_series

    # Usually the word 'of' eg 1 (of 6)
    elif item.typ == filenamelexer.ItemType.InfoSpecifier:
        return parse_info_specifier

    # Operator is a symbol that acts as some sort of separator eg - : ;
    elif item.typ == filenamelexer.ItemType.Operator:
        if p.in_something == 0:
            p.irrelevant.append(item)

    # Filter out Month and day names in filename
    elif item.typ == filenamelexer.ItemType.Calendar:
        # Month and day are currently irrelevant if they are inside parentheses e.g. (January 2002)
        if p.in_something > 0:
            p.irrelevant.append(item)

        # assume Sep-Oct is not useful in the series/title
        elif p.peek().typ in [filenamelexer.ItemType.Symbol, filenamelexer.ItemType.Operator]:
            p.get()
            if p.peek().typ == filenamelexer.ItemType.Calendar:
                # Drop the first name, the separator just consumed and the second name
                p.irrelevant.extend([item, p.input[p.pos], p.get()])
            else:
                p.backup()
                return parse_series
        # This is text that just happens to also be a month/day
        else:
            return parse_series

    # Specifically '__' or '--', no further title/series parsing is done to keep compatibility with wiki
    elif item.typ == filenamelexer.ItemType.Skip:
        p.skip = True

    # Keeping track of parentheses depth
    elif item.typ == filenamelexer.ItemType.LeftParen:
        p.in_paren += 1
        p.in_something += 1
    elif item.typ == filenamelexer.ItemType.LeftBrace:
        p.in_brace += 1
        p.in_something += 1
    elif item.typ == filenamelexer.ItemType.LeftSBrace:
        p.in_s_brace += 1
        p.in_something += 1

    elif item.typ == filenamelexer.ItemType.RightParen:
        p.in_paren -= 1
        p.in_something -= 1
    elif item.typ == filenamelexer.ItemType.RightBrace:
        p.in_brace -= 1
        p.in_something -= 1
    elif item.typ == filenamelexer.ItemType.RightSBrace:
        p.in_s_brace -= 1
        p.in_something -= 1

    # Unset first item
    if p.firstItem:
        p.firstItem = False

    # Brace management: add back to the combined depth for every counter that went negative
    if p.in_paren < 0:
        p.in_something += p.in_paren * -1
    if p.in_brace < 0:
        p.in_something += p.in_brace * -1
    if p.in_s_brace < 0:
        p.in_something += p.in_s_brace * -1

    return parse
# TODO: What about more esoteric numbers???
def parse_issue_number(p: Parser):
    """Parser state: record the current item as the issue or an alternate number.

    The number becomes the issue number unless an issue is already known or
    ``p.alt`` is set, in which case it is stored as an alternate number
    (additional alternates are comma-separated). A following ``.`` plus text
    or number is appended as a suffix to whichever field was just written.
    Always returns ``parse``.
    """
    item = p.input[p.pos]

    if "issue" in p.filename_info:
        # An issue is already known, so this number is an alternate.
        to_alternate = True
        if "alternate" in p.filename_info:
            p.filename_info["alternate"] += "," + item.val
        else:
            p.filename_info["alternate"] = item.val
    elif p.alt:
        to_alternate = True
        p.filename_info["alternate"] = item.val
    else:
        to_alternate = False
        p.filename_info["issue"] = item.val
        p.issue_number_at = item.pos
    p.used_items.append(item)

    item = p.get()
    if item.typ == filenamelexer.ItemType.Dot:
        p.used_items.append(item)
        item = p.get()
        if item.typ in [filenamelexer.ItemType.Text, filenamelexer.ItemType.Number]:
            # Attach the suffix to the field this number was stored in.
            if to_alternate:
                p.filename_info["alternate"] += "." + item.val
            else:
                p.filename_info["issue"] += "." + item.val
            p.used_items.append(item)
        else:
            # Nothing usable after the dot: give back both items read ahead.
            p.backup()
            p.backup()
    else:
        p.backup()
    p.alt = False
    return parse
def parse_series(p: Parser):
    """Parser state: collect consecutive items that form the series and/or title.

    Starts with the current item and keeps consuming until something that
    cannot belong to a name is found. A separator directly after a word and
    followed by a space (e.g. ': ') splits the text into sections; the last
    section becomes the title. If a series was already recorded, everything
    collected here is treated as title. Always returns ``parse``.
    """
    item = p.input[p.pos]

    # Sections of the name; a new section is started at each title separator
    series: list[list[filenamelexer.Item]] = [[]]
    # Space and Dots are not useful at the beginning of a title/series
    if not p.skip and item.typ not in [filenamelexer.ItemType.Space, filenamelexer.ItemType.Dot]:
        series[0].append(item)

    current_part = 0

    title_parts: list[filenamelexer.Item] = []
    series_parts: list[filenamelexer.Item] = []

    prev_space = False

    # 'free comic book day' screws things up. #TODO look into removing book from ComicType?

    # We stop parsing the series when certain things come up if nothing was done with them continue where we left off
    # NOTE(review): `and` binds tighter than `or`; the first two lines form one condition, the last two are alternatives.
    if (
        p.series_parts
        and p.series_parts[-1].val.lower() == "book"
        or p.peek_back().typ == filenamelexer.ItemType.Number
        or item.typ == filenamelexer.ItemType.Calendar
    ):
        # Take over what was collected so far and rebuild p.series_parts below
        series_parts = p.series_parts
        p.series_parts = []
    # Skip is only true if we have come across '--' or '__'
    while not p.skip:
        item = p.get()

        # Spaces are not kept; only remember that one was seen
        if item.typ == filenamelexer.ItemType.Space:
            prev_space = True
            continue
        if item.typ in [
            filenamelexer.ItemType.Text,
            filenamelexer.ItemType.Symbol,
            filenamelexer.ItemType.Publisher,
            filenamelexer.ItemType.Honorific,
        ]:
            series[current_part].append(item)
            # Keep the dot of an abbreviated honorific e.g. 'ms.'
            if item.typ == filenamelexer.ItemType.Honorific and p.peek().typ == filenamelexer.ItemType.Dot:
                series[current_part].append(p.get())
            elif item.typ == filenamelexer.ItemType.Publisher:
                p.filename_info["publisher"] = item.val

        # Handle Volume
        elif item.typ == filenamelexer.ItemType.InfoSpecifier:
            # Exception for 'of'
            if item.val.lower() == "of":
                series[current_part].append(item)
            else:
                # This specifically lets 'X-Men-V1-067' parse correctly as Series: X-Men Volume: 1 Issue: 67
                while len(series[current_part]) > 0 and series[current_part][-1].typ not in [
                    filenamelexer.ItemType.Text,
                    filenamelexer.ItemType.Symbol,
                ]:
                    p.irrelevant.append(series[current_part].pop())
                p.backup()
                break

        elif item.typ == filenamelexer.ItemType.Operator:
            peek = p.peek()
            # ': ' separates the title from the series, only the last section is considered the title
            if not prev_space and peek.typ in [filenamelexer.ItemType.Space]:
                series.append([])  # Starts a new section
                series[current_part].append(item)
                current_part += 1
            else:
                # Force space around '-' makes 'batman - superman' stay otherwise we get 'batman-superman'
                if prev_space and peek.typ in [filenamelexer.ItemType.Space]:
                    item.val = " " + item.val + " "
                series[current_part].append(item)

        # Stop processing series/title if a skip item is found
        elif item.typ == filenamelexer.ItemType.Skip:
            p.backup()
            break

        elif item.typ == filenamelexer.ItemType.Number:
            if p.peek().typ == filenamelexer.ItemType.Space:
                p.get()
                # We have 2 numbers, add the first to the series and then go back to parse
                if p.peek().typ == filenamelexer.ItemType.Number:
                    series[current_part].append(item)
                    break

                # We have 1 number break here, it's possible it's the issue
                p.backup()  # Whitespace
                p.backup()  # The number
                break
            # This is 6 in '1 of 6'
            if series[current_part] and series[current_part][-1].val.lower() == "of":
                series[current_part].append(item)

            # We have 1 number break here, it's possible it's the issue
            else:
                p.backup()  # The number
                break

        else:
            # Ensure 'ms. marvel' parses 'ms.' correctly
            if item.typ == filenamelexer.ItemType.Dot and p.peek_back().typ == filenamelexer.ItemType.Honorific:
                series[current_part].append(item)
            # Allows avengers.hulk to parse correctly
            elif item.typ == filenamelexer.ItemType.Dot and p.peek().typ == filenamelexer.ItemType.Text:
                # Marks the dot as used so that the remainder is clean
                p.used_items.append(item)
            else:
                # Anything else ends the name; let parse handle it
                p.backup()
                break

        prev_space = False

    # We have a title separator e.g. ': '
    if len(series) > 1:
        title_parts.extend(series.pop())
        for s in series:
            if s and s[-1].typ == filenamelexer.ItemType.Operator:
                s[-1].val += " "  # Ensures that when there are multiple separators that they display properly
            series_parts.extend(s)
        # The last collected series item (the final separator) is consumed rather than kept in the series
        p.used_items.append(series_parts.pop())
    else:
        series_parts.extend(series[0])

    # If the series has already been set assume all of this is the title.
    if len(p.series_parts) > 0:
        p.title_parts.extend(series_parts)
        p.title_parts.extend(title_parts)
    else:
        p.series_parts.extend(series_parts)
        p.title_parts.extend(title_parts)
    return parse
def resolve_year(p: Parser):
    """Pick the year (and possibly the volume) from the collected year candidates.

    Candidates flagged as likely years sort last, and the last candidate is
    taken as the year. If no volume is known and another likely-year candidate
    remains, that one is used as the volume. Items chosen here are marked as
    used and removed from the series/title parts. Does nothing when there are
    no candidates.
    """
    if len(p.year_candidates) > 0:
        # Sort by likely_year boolean (stable, so filename order is kept within each group)
        p.year_candidates.sort(key=itemgetter(0))

        # Take the last year e.g. (2007) 2099 (2008) becomes 2099 2007 2008 and takes 2008
        selected_year = p.year_candidates.pop()[1]

        p.filename_info["year"] = selected_year.val
        p.used_items.append(selected_year)

        # (2008) Title (2009) is many times used to denote the series year if we don't have a volume we use it
        if "volume" not in p.filename_info and p.year_candidates and p.year_candidates[-1][0]:
            vol = p.year_candidates.pop()[1]
            p.filename_info["volume"] = vol.val
            p.used_items.append(vol)

            # Remove volume from series and title
            if vol in p.series_parts:
                p.series_parts.remove(vol)
            if vol in p.title_parts:
                p.title_parts.remove(vol)

        # Remove year from series and title
        if selected_year in p.series_parts:
            p.series_parts.remove(selected_year)
        if selected_year in p.title_parts:
            p.title_parts.remove(selected_year)
def parse_finish(p: Parser):
    """Final parser state: resolve leftovers and fill in ``p.filename_info``.

    Resolves the year, falls back to a trailing series number (or the
    alternate number) for a missing issue, builds the series/title strings,
    normalises issue and volume, computes the remainder and makes sure every
    expected key exists. Returns ``None`` implicitly, which stops ``Parser.run``.
    """
    resolve_year(p)

    # If we don't have an issue try to find it in the series
    if "issue" not in p.filename_info and p.series_parts and p.series_parts[-1].typ == filenamelexer.ItemType.Number:
        issue_num = p.series_parts.pop()

        # If the number we just popped is a year put it back on it's probably part of the series e.g. Spider-Man 2099
        if issue_num in [x[1] for x in p.year_candidates]:
            p.series_parts.append(issue_num)
        else:
            # If this number was rejected because of an operator and the operator is still there add it back e.g. 'IG-88'
            if (
                issue_num in p.operator_rejected
                and p.series_parts
                and p.series_parts[-1].typ == filenamelexer.ItemType.Operator
            ):
                p.series_parts.append(issue_num)
            # We have no reason to not use this number as the issue number. Specifically happens when parsing 'X-Men-V1-067.cbr'
            else:
                p.filename_info["issue"] = issue_num.val
                p.used_items.append(issue_num)
                p.issue_number_at = issue_num.pos

    # Remove publishers, currently only marvel and dc are defined,
    # this is an option specifically because this can drastically screw up parsing
    if p.remove_publisher:
        for item in p.publisher_removed:
            if item in p.series_parts:
                p.series_parts.remove(item)
            if item in p.title_parts:
                p.title_parts.remove(item)

    p.filename_info["series"] = join_title(p.series_parts)
    p.used_items.extend(p.series_parts)

    p.filename_info["title"] = join_title(p.title_parts)
    p.used_items.extend(p.title_parts)

    # Normalise the issue: drop a leading '#' and let IssueString format it
    if "issue" in p.filename_info:
        p.filename_info["issue"] = issuestring.IssueString(p.filename_info["issue"].lstrip("#")).as_string()

    # Normalise the volume: drop a leading '#' and leading zeros
    if "volume" in p.filename_info:
        p.filename_info["volume"] = p.filename_info["volume"].lstrip("#").lstrip("0")

    if "issue" not in p.filename_info:
        # We have an alternate move it to the issue
        if "alternate" in p.filename_info:
            p.filename_info["issue"] = p.filename_info["alternate"]
            p.filename_info["alternate"] = ""
        else:
            # TODO: This never happens
            # A single unused number left over is taken as the issue
            inp = [x for x in p.input if x not in p.irrelevant and x not in p.used_items and x.typ != eof.typ]
            if len(inp) == 1 and inp[0].typ == filenamelexer.ItemType.Number:
                p.filename_info["issue"] = inp[0].val
                p.used_items.append(inp[0])

    # Item types the caller asked to keep out of the remainder
    remove_items = []
    if p.remove_fcbd:
        remove_items.append(filenamelexer.ItemType.FCBD)
    if p.remove_c2c:
        remove_items.append(filenamelexer.ItemType.C2C)

    p.irrelevant.extend([x for x in p.input if x.typ in remove_items])

    p.filename_info["remainder"] = get_remainder(p)

    # Ensure keys always exist
    for s in [
        "alternate",
        "issue",
        "archive",
        "series",
        "title",
        "volume",
        "year",
        "remainder",
        "issue_count",
        "volume_count",
        "publisher",
    ]:
        if s not in p.filename_info:
            p.filename_info[s] = ""
    for s in ["fcbd", "c2c", "annual"]:
        if s not in p.filename_info:
            p.filename_info[s] = False
def get_remainder(p: Parser):
    """Build the remainder string from every item that was neither used nor irrelevant.

    Whitespace is collapsed to single spaces and never placed next to the
    inside of a bracket; bracket pairs that end up empty are removed. The
    result is stripped of surrounding whitespace.
    """
    kinds = filenamelexer.ItemType
    blank = (kinds.Space, kinds.Skip)
    opening = (kinds.LeftBrace, kinds.LeftParen, kinds.LeftSBrace)
    closing = (kinds.RightBrace, kinds.RightParen, kinds.RightSBrace)

    # Remove used items and irrelevant items e.g. the series and useless operators
    leftovers = [tok for tok in p.input if tok not in p.irrelevant and tok not in p.used_items]

    text = ""
    for idx, tok in enumerate(leftovers):
        before = leftovers[idx - 1].typ if idx > 0 else None

        if tok.typ in blank:
            # No double space or space next to parentheses
            if (
                idx > 0
                and before != kinds.Space
                and before not in opening
                and idx + 1 < len(leftovers)
                and leftovers[idx + 1].typ not in closing
            ):
                text += " "
        elif tok.typ in closing and before in opening:
            # Closing bracket right after its opener: strip off the useless opening bracket
            text = text.rstrip("[{(")
        else:
            # Add the next item
            text += tok.val

    # Remove empty parentheses
    text = re.sub(r"[\[{(]+[]})]+", "", text)
    return text.strip()
def parse_info_specifier(p: Parser):
    """Handle an info specifier such as 'vol', 'v' or 'of' and the number after it.

    The item at ``p.pos`` is the specifier.  If the next item (after an
    optional space) is a Number, or a Text item that ``t2d`` converts to
    something numeric, that item is taken as the specifier's number:

    * 'volume', 'vol', 'vol.' and 'v' store the number as the volume.
    * 'of' inside brackets (``p.in_something > 0``) stores the number as the
      issue count, or as the volume count when the number found before the
      'of' is not the issue number and is not part of the series or title.
    * 'of' outside brackets moves ``p.pos`` back to the number found before
      it, so that text like 'Part 1 of 2' can be parsed as part of the title.

    Args:
        p: The parser state; ``filename_info``, ``used_items`` and ``pos`` may
            be updated.

    Returns:
        ``parse_series`` when ``p.in_something`` is falsy, otherwise ``parse``.
    """
    item = p.input[p.pos]
    index = p.pos

    if p.peek().typ == filenamelexer.ItemType.Space:
        p.get()

    # Handles 'book 3' and 'book three'
    if p.peek().typ == filenamelexer.ItemType.Number or (
        p.peek().typ == filenamelexer.ItemType.Text and t2d.convert(p.peek().val).isnumeric()
    ):
        number = p.get()
        if item.val.lower() in ["volume", "vol", "vol.", "v"]:
            p.filename_info["volume"] = t2do.convert(number.val)
            p.used_items.append(item)
            p.used_items.append(number)

        # 'of' is only special if it is inside a parenthesis.
        elif item.val.lower() == "of":
            # The number this count belongs to; None when the 'of' is not preceded by a number
            i = get_number(p, index)
            if p.in_something > 0:
                if p.issue_number_at is None:
                    # TODO: Figure out what to do here if it ever happens
                    p.filename_info["issue_count"] = str(int(t2do.convert(number.val)))
                    p.used_items.append(item)
                    p.used_items.append(number)

                # No number precedes this 'of' (e.g. '(foo of 6)'): there is nothing to
                # compare against the issue number, so leave the items unconsumed
                # instead of failing on ``i.pos``
                elif i is None:
                    pass

                # This is definitely the issue number
                elif p.issue_number_at == i.pos:
                    p.filename_info["issue_count"] = str(int(t2do.convert(number.val)))
                    p.used_items.append(item)
                    p.used_items.append(number)

                # This is not for the issue number it is not in either the issue or the title,
                # assume it is the volume number and count
                elif p.issue_number_at != i.pos and i not in p.series_parts and i not in p.title_parts:
                    p.filename_info["volume"] = i.val
                    p.filename_info["volume_count"] = str(int(t2do.convert(number.val)))
                    p.used_items.append(i)
                    p.used_items.append(item)
                    p.used_items.append(number)
                else:
                    # TODO: Figure out what to do here if it ever happens
                    pass
            else:
                # Lets 'The Wrath of Foobar-Man, Part 1 of 2' parse correctly as the title
                if i is not None:
                    p.pos = p.input.index(i)

    if not p.in_something:
        return parse_series
    return parse
# Gets 03 in '03 of 6'
def get_number(p: Parser, index: int):
    """Find the number that a count located at ``index`` belongs to.

    Looks backward through ``p.input`` from just before ``index``, e.g. to
    find the 03 in '03 (of 6)' or in '008 title 03 (of 6)'.  Opening brackets
    and spaces are stepped over; the first other item decides the result.

    Args:
        p: Parser whose ``input`` list is searched.
        index: Position in ``p.input`` of the item the search starts before.

    Returns:
        The closest preceding Number item, or ``None`` when the first item
        that is not a space or opening bracket is not a number, or when there
        is no such item at all.
    """
    skippable = (
        filenamelexer.ItemType.LeftParen,
        filenamelexer.ItemType.LeftBrace,
        filenamelexer.ItemType.LeftSBrace,
        filenamelexer.ItemType.Space,
    )
    for candidate in reversed(p.input[:index]):
        # These types tell us nothing; we are looking for a number that is possibly
        # different from the issue number for this count
        if candidate.typ in skippable:
            continue
        # The first meaningful item is either our number, or a reason to give up
        # looking for the number this count belongs to
        return candidate if candidate.typ == filenamelexer.ItemType.Number else None
    return None
def join_title(lst: list[filenamelexer.Item]):
    """Join the values of ``lst`` into a single title string.

    Items are separated by one space, except that no space is written:

    * after an Operator item,
    * after the final item,
    * between an Honorific item and a following Dot item,
    * before an Operator or Symbol item.

    A final item whose value is ``","`` is left out entirely.

    Args:
        lst: The lexer items that make up the title, in order.

    Returns:
        The assembled title.
    """
    item_type = filenamelexer.ItemType
    pieces = []
    last = len(lst) - 1
    for idx, token in enumerate(lst):
        is_last = idx == last
        # We ignore commas on the end
        if is_last and token.val == ",":
            continue

        pieces.append(token.val)

        # No trailing space, and no space after operators
        if is_last or token.typ == item_type.Operator:
            continue

        following = lst[idx + 1].typ
        # No space after honorifics with a dot
        if token.typ == item_type.Honorific and following == item_type.Dot:
            continue
        # No space if the next item is an operator or symbol
        if following in (item_type.Operator, item_type.Symbol):
            continue

        pieces.append(" ")
    return "".join(pieces)
def Parse(
    lexer_result: list[filenamelexer.Item],
    first_is_alt=False,
    remove_c2c=False,
    remove_fcbd=False,
    remove_publisher=False,
):
    """Create a ``Parser`` for the given lexer items, run it and return it.

    Args:
        lexer_result: Items produced by the filename lexer.
        first_is_alt: Forwarded unchanged to ``Parser``.
        remove_c2c: Forwarded unchanged to ``Parser``.
        remove_fcbd: Forwarded unchanged to ``Parser``.
        remove_publisher: Forwarded unchanged to ``Parser``.

    Returns:
        The ``Parser`` instance after ``run()`` has been called on it.
    """
    options = {
        "first_is_alt": first_is_alt,
        "remove_c2c": remove_c2c,
        "remove_fcbd": remove_fcbd,
        "remove_publisher": remove_publisher,
    }
    parser = Parser(lexer_result=lexer_result, **options)
    parser.run()
    return parser

View File

@ -32,11 +32,13 @@ logger = logging.getLogger(__name__)
class AutoTagMatchWindow(QtWidgets.QDialog):
volume_id = 0
def __init__(self, parent, match_set_list: List[MultipleMatch], style, fetch_func):
def __init__(self, parent, match_set_list: List[MultipleMatch], style, fetch_func, settings):
super().__init__(parent)
uic.loadUi(ComicTaggerSettings.get_ui_file("matchselectionwindow.ui"), self)
self.settings = settings
self.current_match_set: Optional[MultipleMatch] = None
self.altCoverWidget = CoverImageWidget(self.altCoverContainer, CoverImageWidget.AltCoverMode)
@ -221,7 +223,12 @@ class AutoTagMatchWindow(QtWidgets.QDialog):
md = ca.read_metadata(self.style)
if md.is_empty:
md = ca.metadata_from_filename()
md = ca.metadata_from_filename(
self.settings.complicated_parser,
self.settings.remove_c2c,
self.settings.remove_fcbd,
self.settings.remove_publisher,
)
# now get the particular issue data
cv_md = self.fetch_func(match)

View File

@ -101,7 +101,7 @@ def display_match_set_for_choice(label, match_set: MultipleMatch, opts, settings
# save the data!
# we know at this point, that the file is all good to go
ca = match_set.ca
md = create_local_metadata(opts, ca, ca.has_metadata(opts.data_style))
md = create_local_metadata(opts, ca, ca.has_metadata(opts.data_style), settings)
cv_md = actual_issue_data_fetch(match_set.matches[int(i)], settings, opts)
md.overlay(cv_md)
actual_metadata_save(ca, opts, md)
@ -164,13 +164,17 @@ def cli_mode(opts, settings):
post_process_matches(match_results, opts, settings)
def create_local_metadata(opts, ca: ComicArchive, has_desired_tags):
def create_local_metadata(opts, ca: ComicArchive, has_desired_tags, settings):
md = GenericMetadata()
md.set_default_page_list(ca.get_number_of_pages())
# now, overlay the parsed filename info
if opts.parse_filename:
md.overlay(ca.metadata_from_filename())
md.overlay(
ca.metadata_from_filename(
settings.complicated_parser, settings.remove_c2c, settings.remove_fcbd, settings.remove_publisher
)
)
if has_desired_tags:
md = ca.read_metadata(opts.data_style)
@ -319,7 +323,7 @@ def process_file_cli(filename, opts, settings, match_results: OnlineMatchResults
if batch_mode:
print(f"Processing {ca.path}...")
md = create_local_metadata(opts, ca, has[opts.data_style])
md = create_local_metadata(opts, ca, has[opts.data_style], settings)
if md.issue is None or md.issue == "":
if opts.assume_issue_is_one_if_not_set:
md.issue = "1"
@ -430,7 +434,7 @@ def process_file_cli(filename, opts, settings, match_results: OnlineMatchResults
else:
use_tags = False
md = create_local_metadata(opts, ca, use_tags)
md = create_local_metadata(opts, ca, use_tags, settings)
if md.series is None:
logger.error(msg_hdr + "Can't rename without series name")

View File

@ -63,6 +63,7 @@ class IssueIdentifier:
result_multiple_good_matches = 5
def __init__(self, comic_archive: ComicArchive, settings):
self.settings = settings
self.comic_archive: ComicArchive = comic_archive
self.image_hasher = 1
@ -192,7 +193,12 @@ class IssueIdentifier:
internal_metadata = ca.read_cbi()
# try to get some metadata from filename
md_from_filename = ca.metadata_from_filename()
md_from_filename = ca.metadata_from_filename(
self.settings.complicated_parser,
self.settings.remove_c2c,
self.settings.remove_fcbd,
self.settings.remove_publisher,
)
# preference order:
# 1. Additional metadata

View File

@ -81,7 +81,12 @@ class RenameWindow(QtWidgets.QDialog):
md = ca.read_metadata(self.data_style)
if md.is_empty:
md = ca.metadata_from_filename(self.settings.parse_scan_info)
md = ca.metadata_from_filename(
self.settings.complicated_parser,
self.settings.remove_c2c,
self.settings.remove_fcbd,
self.settings.remove_publisher,
)
self.renamer.set_metadata(md)
self.renamer.move = self.settings.rename_move_dir

View File

@ -88,7 +88,10 @@ class ComicTaggerSettings:
self.ask_about_usage_stats = True
# filename parsing settings
self.parse_scan_info = True
self.complicated_parser = False
self.remove_c2c = False
self.remove_fcbd = False
self.remove_publisher = False
# Comic Vine settings
self.use_series_start_as_volume = False
@ -161,7 +164,10 @@ class ComicTaggerSettings:
self.ask_about_usage_stats = True
# filename parsing settings
self.parse_scan_info = True
self.complicated_parser = False
self.remove_c2c = False
self.remove_fcbd = False
self.remove_publisher = False
# Comic Vine settings
self.use_series_start_as_volume = False
@ -287,8 +293,14 @@ class ComicTaggerSettings:
if self.config.has_option("identifier", "id_publisher_filter"):
self.id_publisher_filter = self.config.get("identifier", "id_publisher_filter")
if self.config.has_option("filenameparser", "parse_scan_info"):
self.parse_scan_info = self.config.getboolean("filenameparser", "parse_scan_info")
if self.config.has_option("filenameparser", "complicated_parser"):
self.complicated_parser = self.config.getboolean("filenameparser", "complicated_parser")
if self.config.has_option("filenameparser", "remove_c2c"):
self.remove_c2c = self.config.getboolean("filenameparser", "remove_c2c")
if self.config.has_option("filenameparser", "remove_fcbd"):
self.remove_fcbd = self.config.getboolean("filenameparser", "remove_fcbd")
if self.config.has_option("filenameparser", "remove_publisher"):
self.remove_publisher = self.config.getboolean("filenameparser", "remove_publisher")
if self.config.has_option("dialogflags", "ask_about_cbi_in_rar"):
self.ask_about_cbi_in_rar = self.config.getboolean("dialogflags", "ask_about_cbi_in_rar")
@ -419,7 +431,10 @@ class ComicTaggerSettings:
if not self.config.has_section("filenameparser"):
self.config.add_section("filenameparser")
self.config.set("filenameparser", "parse_scan_info", self.parse_scan_info)
self.config.set("filenameparser", "complicated_parser", self.complicated_parser)
self.config.set("filenameparser", "remove_c2c", self.remove_c2c)
self.config.set("filenameparser", "remove_fcbd", self.remove_fcbd)
self.config.set("filenameparser", "remove_publisher", self.remove_publisher)
if not self.config.has_section("comicvine"):
self.config.add_section("comicvine")

View File

@ -182,6 +182,7 @@ class SettingsWindow(QtWidgets.QDialog):
self.cbxMoveFiles.clicked.connect(self.rename_test)
self.cbxRenameStrict.clicked.connect(self.rename_test)
self.leDirectory.textEdited.connect(self.rename_test)
self.cbxComplicatedParser.clicked.connect(self.switch_parser)
def rename_test(self):
self.rename__test(self.leRenameTemplate.text())
@ -199,6 +200,13 @@ class SettingsWindow(QtWidgets.QDialog):
self.rename_error = e
self.lblRenameTest.setText(str(e))
def switch_parser(self):
    """Enable the three 'remove' checkboxes only while the complicated parser box is checked."""
    enabled = self.cbxComplicatedParser.isChecked()
    for checkbox in (self.cbxRemoveC2C, self.cbxRemoveFCBD, self.cbxRemovePublisher):
        checkbox.setEnabled(enabled)
def settings_to_form(self):
# Copy values from settings to form
self.leRarExePath.setText(self.settings.rar_exe_path)
@ -208,8 +216,11 @@ class SettingsWindow(QtWidgets.QDialog):
if self.settings.check_for_new_version:
self.cbxCheckForNewVersion.setCheckState(QtCore.Qt.CheckState.Checked)
if self.settings.parse_scan_info:
self.cbxParseScanInfo.setCheckState(QtCore.Qt.CheckState.Checked)
self.cbxComplicatedParser.setChecked(self.settings.complicated_parser)
self.cbxRemoveC2C.setChecked(self.settings.remove_c2c)
self.cbxRemoveFCBD.setChecked(self.settings.remove_fcbd)
self.cbxRemovePublisher.setChecked(self.settings.remove_publisher)
self.switch_parser()
if self.settings.use_series_start_as_volume:
self.cbxUseSeriesStartAsVolume.setCheckState(QtCore.Qt.CheckState.Checked)
@ -291,7 +302,10 @@ class SettingsWindow(QtWidgets.QDialog):
self.settings.id_length_delta_thresh = int(self.leNameLengthDeltaThresh.text())
self.settings.id_publisher_filter = str(self.tePublisherFilter.toPlainText())
self.settings.parse_scan_info = self.cbxParseScanInfo.isChecked()
self.settings.complicated_parser = self.cbxComplicatedParser.isChecked()
self.settings.remove_c2c = self.cbxRemoveC2C.isChecked()
self.settings.remove_fcbd = self.cbxRemoveFCBD.isChecked()
self.settings.remove_publisher = self.cbxRemovePublisher.isChecked()
self.settings.use_series_start_as_volume = self.cbxUseSeriesStartAsVolume.isChecked()
self.settings.clear_form_before_populating_from_cv = self.cbxClearFormBeforePopulating.isChecked()

View File

@ -557,7 +557,12 @@ Please choose options below, and select OK.
def actual_load_current_archive(self):
if self.metadata.is_empty:
self.metadata = self.comic_archive.metadata_from_filename(self.settings.parse_scan_info)
self.metadata = self.comic_archive.metadata_from_filename(
self.settings.complicated_parser,
self.settings.remove_c2c,
self.settings.remove_fcbd,
remove_publisher=self.settings.remove_publisher,
)
if len(self.metadata.pages) == 0:
self.metadata.set_default_page_list(self.comic_archive.get_number_of_pages())
@ -928,7 +933,12 @@ Please choose options below, and select OK.
if self.comic_archive is not None:
# copy the form onto metadata object
self.form_to_metadata()
new_metadata = self.comic_archive.metadata_from_filename(self.settings.parse_scan_info)
new_metadata = self.comic_archive.metadata_from_filename(
self.settings.complicated_parser,
self.settings.remove_c2c,
self.settings.remove_fcbd,
remove_publisher=self.settings.remove_publisher,
)
if new_metadata is not None:
self.metadata.overlay(new_metadata)
self.metadata_to_form()
@ -1654,7 +1664,12 @@ Please choose options below, and select OK.
# read in metadata, and parse file name if not there
md = ca.read_metadata(self.save_data_style)
if md.is_empty:
md = ca.metadata_from_filename(self.settings.parse_scan_info)
md = ca.metadata_from_filename(
self.settings.complicated_parser,
self.settings.remove_c2c,
self.settings.remove_fcbd,
remove_publisher=self.settings.remove_publisher,
)
if dlg.ignore_leading_digits_in_filename and md.series is not None:
# remove all leading numbers
md.series = re.sub(r"([\d.]*)(.*)", "\\2", md.series)
@ -1846,7 +1861,9 @@ Please choose options below, and select OK to Auto-Tag.
match_results.multiple_matches.extend(match_results.low_confidence_matches)
if reply == QtWidgets.QMessageBox.StandardButton.Yes:
matchdlg = AutoTagMatchWindow(self, match_results.multiple_matches, style, self.actual_issue_data_fetch)
matchdlg = AutoTagMatchWindow(
self, match_results.multiple_matches, style, self.actual_issue_data_fetch, self.settings
)
matchdlg.setModal(True)
matchdlg.exec()
self.fileSelectionList.update_selected_rows()

View File

@ -229,19 +229,55 @@
<attribute name="title">
<string>Filename Parser</string>
</attribute>
<widget class="QCheckBox" name="cbxParseScanInfo">
<property name="geometry">
<rect>
<x>30</x>
<y>30</y>
<width>421</width>
<height>25</height>
</rect>
</property>
<property name="text">
<string>Parse Scan Info From Filename (Experimental)</string>
</property>
</widget>
<layout class="QVBoxLayout" name="verticalLayout_6">
<item>
<widget class="QGroupBox" name="groupBox_2">
<layout class="QVBoxLayout" name="verticalLayout_7">
<item>
<widget class="QCheckBox" name="cbxComplicatedParser">
<property name="text">
<string>Use &quot;Complicated&quot; Parser</string>
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="cbxRemoveC2C">
<property name="text">
<string>Remove 'C2C' from Scan Info</string>
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="cbxRemoveFCBD">
<property name="text">
<string>Remove 'FCBD' from Scan Info</string>
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="cbxRemovePublisher">
<property name="text">
<string>Remove Publisher from filename</string>
</property>
</widget>
</item>
</layout>
</widget>
</item>
<item>
<spacer name="verticalSpacer_4">
<property name="orientation">
<enum>Qt::Vertical</enum>
</property>
<property name="sizeHint" stdset="0">
<size>
<width>20</width>
<height>40</height>
</size>
</property>
</spacer>
</item>
</layout>
</widget>
<widget class="QWidget" name="tab_3">
<attribute name="title">

View File

@ -5,3 +5,4 @@ requests==2.*
pathvalidate
pycountry
py7zr
text2digits

View File

@ -1,35 +1,122 @@
import pytest
fnames = [
(
"Monster_Island_v1_2__repaired__c2c.cbz",
"stuff",
"batman 3 title (DC).cbz",
"honorific and publisher in series",
{
"issue": "3",
"series": "batman",
"title": "title",
"publisher": "DC",
"volume": "",
"year": "",
"remainder": "",
"issue_count": "",
"alternate": "",
},
),
(
"batman 3 title DC.cbz",
"honorific and publisher in series",
{
"issue": "3",
"series": "batman",
"title": "title DC",
"publisher": "DC",
"volume": "",
"year": "",
"remainder": "",
"issue_count": "",
"alternate": "",
},
),
(
"ms. Marvel 3.cbz",
"honorific and publisher in series",
{
"issue": "3",
"series": "ms. Marvel",
"title": "",
"publisher": "Marvel",
"volume": "",
"year": "",
"remainder": "",
"issue_count": "",
"alternate": "",
},
),
(
"january jones 2.cbz",
"month in series",
{
"issue": "2",
"series": "january jones",
"title": "",
"volume": "",
"year": "",
"remainder": "",
"issue_count": "",
"alternate": "",
},
),
(
"52.cbz",
"issue number only",
{
"issue": "52",
"series": "",
"title": "",
"volume": "",
"year": "",
"remainder": "",
"issue_count": "",
"alternate": "",
},
),
(
"52 Monster_Island_v1_2__repaired__c2c.cbz",
"leading alternate",
{
"issue": "2",
"series": "Monster Island",
"title": "The Wrath of Foobar-Man, Part 1 of 2",
"title": "",
"volume": "1",
"year": "",
"remainder": "repaired c2c",
"remainder": "repaired",
"issue_count": "",
"alternate": "52",
"c2c": True,
},
),
(
"Monster_Island_v1_2__repaired__c2c.cbz",
"Example from userguide",
{
"issue": "2",
"series": "Monster Island",
"title": "",
"volume": "1",
"year": "",
"remainder": "repaired",
"issue_count": "",
"c2c": True,
},
),
(
"Monster Island v1 3 (1957) -- The Revenge Of King Klong (noads).cbz",
"stuff",
"Example from userguide",
{
"issue": "3",
"series": "Monster Island",
"title": "The Wrath of Foobar-Man, Part 1 of 2",
"title": "",
"volume": "1",
"year": "1957",
"remainder": "The Revenge Of King Klong (noads)",
"issue_count": "",
},
),
pytest.param(
(
"Foobar-Man Annual 121 - The Wrath of Foobar-Man, Part 1 of 2.cbz",
"stuff",
"Example from userguide",
{
"issue": "121",
"series": "Foobar-Man Annual",
@ -38,12 +125,12 @@ fnames = [
"year": "",
"remainder": "",
"issue_count": "",
"annual": True,
},
marks=pytest.mark.xfail,
),
(
"Plastic Man v1 002 (1942).cbz",
"stuff",
"Example from userguide",
{
"issue": "2",
"series": "Plastic Man",
@ -56,7 +143,7 @@ fnames = [
),
(
"Blue Beetle 02.cbr",
"stuff",
"Example from userguide",
{
"issue": "2",
"series": "Blue Beetle",
@ -69,7 +156,7 @@ fnames = [
),
(
"Monster Island vol. 2 #2.cbz",
"stuff",
"Example from userguide",
{
"issue": "2",
"series": "Monster Island",
@ -82,7 +169,7 @@ fnames = [
),
(
"Crazy Weird Comics 2 (of 2) (1969).rar",
"stuff",
"Example from userguide",
{
"issue": "2",
"series": "Crazy Weird Comics",
@ -95,7 +182,7 @@ fnames = [
),
(
"Super Strange Yarns (1957) #92 (1969).cbz",
"stuff",
"Example from userguide",
{
"issue": "92",
"series": "Super Strange Yarns",
@ -108,7 +195,7 @@ fnames = [
),
(
"Action Spy Tales v1965 #3.cbr",
"stuff",
"Example from userguide",
{
"issue": "3",
"series": "Action Spy Tales",
@ -119,9 +206,9 @@ fnames = [
"issue_count": "",
},
),
pytest.param(
(
" X-Men-V1-067.cbr",
"hyphen separated with hyphen in series",
"hyphen separated with hyphen in series", # only parses corretly because v1 designates the volume
{
"issue": "67",
"series": "X-Men",
@ -131,7 +218,6 @@ fnames = [
"remainder": "",
"issue_count": "",
},
marks=pytest.mark.xfail,
),
(
"Amazing Spider-Man 078.BEY (2022) (Digital) (Zone-Empire).cbr",
@ -139,15 +225,16 @@ fnames = [
{
"issue": "78.BEY",
"series": "Amazing Spider-Man",
"title": "",
"volume": "",
"year": "2022",
"remainder": "(Digital) (Zone-Empire)",
"issue_count": "",
},
),
pytest.param(
(
"Angel Wings 02 - Black Widow (2015) (Scanlation) (phillywilly).cbr",
"title after-issue",
"title after issue",
{
"issue": "2",
"series": "Angel Wings",
@ -157,11 +244,10 @@ fnames = [
"remainder": "(Scanlation) (phillywilly)",
"issue_count": "",
},
marks=pytest.mark.xfail,
),
pytest.param(
(
"Angel Wings #02 - Black Widow (2015) (Scanlation) (phillywilly).cbr",
"title after-#issue",
"title after #issue",
{
"issue": "2",
"series": "Angel Wings",
@ -171,20 +257,19 @@ fnames = [
"remainder": "(Scanlation) (phillywilly)",
"issue_count": "",
},
marks=pytest.mark.xfail,
),
pytest.param(
(
"Aquaman - Green Arrow - Deep Target 01 (of 07) (2021) (digital) (Son of Ultron-Empire).cbr",
"issue count",
{
"issue": "1",
"series": "Aquaman - Green Arrow - Deep Target",
"title": "",
"volume": "",
"year": "2021",
"issue_count": "7",
"remainder": "(digital) (Son of Ultron-Empire)",
},
marks=pytest.mark.xfail,
),
(
"Aquaman 80th Anniversary 100-Page Super Spectacular (2021) 001 (2021) (Digital) (BlackManta-Empire).cbz",
@ -192,37 +277,39 @@ fnames = [
{
"issue": "1",
"series": "Aquaman 80th Anniversary 100-Page Super Spectacular",
"title": "",
"volume": "2021",
"year": "2021",
"remainder": "(Digital) (BlackManta-Empire)",
"issue_count": "",
},
),
pytest.param(
(
"Avatar - The Last Airbender - The Legend of Korra (FCBD 2021) (Digital) (mv-DCP).cbr",
"FCBD date",
{
"issue": "",
"series": "Avatar - The Last Airbender - The Legend of Korra",
"title": "",
"volume": "",
"year": "2021",
"remainder": "(FCBD) (Digital) (mv-DCP)",
"remainder": "(Digital) (mv-DCP)",
"issue_count": "",
"fcbd": True,
},
marks=pytest.mark.xfail,
),
pytest.param(
(
"Avengers By Brian Michael Bendis v03 (2013) (Digital) (F2) (Kileko-Empire).cbz",
"volume without issue",
{
"issue": "",
"series": "Avengers By Brian Michael Bendis",
"title": "",
"volume": "3",
"year": "2013",
"remainder": "(Digital) (F2) (Kileko-Empire)",
"issue_count": "",
},
marks=pytest.mark.xfail,
),
(
"Batman '89 (2021) (Webrip) (The Last Kryptonian-DCP).cbr",
@ -230,6 +317,7 @@ fnames = [
{
"issue": "",
"series": "Batman '89",
"title": "",
"volume": "",
"year": "2021",
"remainder": "(Webrip) (The Last Kryptonian-DCP)",
@ -242,6 +330,7 @@ fnames = [
{
"issue": "20",
"series": "Batman - Superman",
"title": "",
"volume": "",
"year": "2021",
"remainder": "(digital) (NeverAngel-Empire)",
@ -254,6 +343,7 @@ fnames = [
{
"issue": "9",
"series": "Black Widow",
"title": "",
"volume": "",
"year": "2021",
"remainder": "(Digital) (Zone-Empire)",
@ -266,26 +356,28 @@ fnames = [
{
"issue": "6",
"series": "Blade Runner 2029",
"title": "",
"volume": "",
"year": "2021",
"remainder": "(3 covers) (digital) (Son of Ultron-Empire)",
"issue_count": "",
},
),
pytest.param(
(
"Blade Runner Free Comic Book Day 2021 (2021) (digital-Empire).cbr",
"FCBD year and (year)",
{
"issue": "",
"series": "Blade Runner Free Comic Book Day 2021",
"title": "",
"volume": "",
"year": "2021",
"remainder": "(digital-Empire)",
"issue_count": "",
"fcbd": True,
},
marks=pytest.mark.xfail,
),
pytest.param(
(
"Bloodshot Book 03 (2020) (digital) (Son of Ultron-Empire).cbr",
"book",
{
@ -297,9 +389,21 @@ fnames = [
"remainder": "(digital) (Son of Ultron-Empire)",
"issue_count": "",
},
marks=pytest.mark.xfail,
),
pytest.param(
(
"book of eli (2020) (digital) (Son of Ultron-Empire).cbr",
"book",
{
"issue": "",
"series": "book of eli",
"title": "",
"volume": "",
"year": "2020",
"remainder": "(digital) (Son of Ultron-Empire)",
"issue_count": "",
},
),
(
"Cyberpunk 2077 - You Have My Word 02 (2021) (digital) (Son of Ultron-Empire).cbr",
"title",
{
@ -311,9 +415,8 @@ fnames = [
"issue_count": "",
"remainder": "(digital) (Son of Ultron-Empire)",
},
marks=pytest.mark.xfail,
),
pytest.param(
(
"Elephantmen 2259 008 - Simple Truth 03 (of 06) (2021) (digital) (Son of Ultron-Empire).cbr",
"volume count",
{
@ -326,9 +429,8 @@ fnames = [
"remainder": "(digital) (Son of Ultron-Empire)",
"issue_count": "",
},
marks=pytest.mark.xfail,
),
pytest.param(
(
"Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021) (digital) (Son of Ultron-Empire).cbr",
"volume count",
{
@ -341,20 +443,20 @@ fnames = [
"remainder": "(digital) (Son of Ultron-Empire)",
"issue_count": "",
},
marks=pytest.mark.xfail,
),
pytest.param(
(
"Free Comic Book Day - Avengers.Hulk (2021) (2048px) (db).cbz",
"'.' in name",
{
"issue": "",
"series": "Free Comic Book Day - Avengers Hulk",
"title": "",
"volume": "",
"year": "2021",
"remainder": "(2048px) (db)",
"issue_count": "",
"fcbd": True,
},
marks=pytest.mark.xfail,
),
(
"Goblin (2021) (digital) (Son of Ultron-Empire).cbr",
@ -362,37 +464,41 @@ fnames = [
{
"issue": "",
"series": "Goblin",
"title": "",
"volume": "",
"year": "2021",
"remainder": "(digital) (Son of Ultron-Empire)",
"issue_count": "",
},
),
pytest.param(
(
"Marvel Previews 002 (January 2022) (Digital-Empire).cbr",
"(month year)",
{
"issue": "2",
"series": "Marvel Previews",
"title": "",
"publisher": "Marvel",
"volume": "",
"year": "2022",
"remainder": "(Digital-Empire)",
"issue_count": "",
},
marks=pytest.mark.xfail,
),
pytest.param(
(
"Marvel Two In One V1 090 c2c (Comixbear-DCP).cbr",
"volume issue ctc",
{
"issue": "90",
"series": "Marvel Two In One",
"title": "",
"publisher": "Marvel",
"volume": "1",
"year": "",
"remainder": "c2c (Comixbear-DCP)",
"remainder": "(Comixbear-DCP)",
"issue_count": "",
"c2c": True,
},
marks=pytest.mark.xfail,
),
(
"Marvel Two In One V1 #090 c2c (Comixbear-DCP).cbr",
@ -400,24 +506,27 @@ fnames = [
{
"issue": "90",
"series": "Marvel Two In One",
"title": "",
"publisher": "Marvel",
"volume": "1",
"year": "",
"remainder": "c2c (Comixbear-DCP)",
"remainder": "(Comixbear-DCP)",
"issue_count": "",
"c2c": True,
},
),
pytest.param(
(
"Star Wars - War of the Bounty Hunters - IG-88 (2021) (Digital) (Kileko-Empire).cbz",
"number ends series, no-issue",
{
"issue": "",
"series": "Star Wars - War of the Bounty Hunters - IG-88",
"title": "",
"volume": "",
"year": "2021",
"remainder": "(Digital) (Kileko-Empire)",
"issue_count": "",
},
marks=pytest.mark.xfail,
),
(
"Star Wars - War of the Bounty Hunters - IG-88 #1 (2021) (Digital) (Kileko-Empire).cbz",
@ -425,6 +534,7 @@ fnames = [
{
"issue": "1",
"series": "Star Wars - War of the Bounty Hunters - IG-88",
"title": "",
"volume": "",
"year": "2021",
"remainder": "(Digital) (Kileko-Empire)",
@ -437,39 +547,41 @@ fnames = [
{
"issue": "58",
"series": "The Defenders",
"title": "",
"volume": "1",
"year": "1978",
"remainder": "(digital)",
"issue_count": "",
},
),
pytest.param(
(
"The Defenders v1 Annual 01 (1976) (Digital) (Minutemen-Slayer).cbr",
" v in series",
{
"issue": "1",
"series": "The Defenders Annual",
"title": "",
"volume": "1",
"year": "1976",
"remainder": "(Digital) (Minutemen-Slayer)",
"issue_count": "",
"annual": True,
},
marks=pytest.mark.xfail,
),
pytest.param(
(
"The Magic Order 2 06 (2022) (Digital) (Zone-Empire)[__913302__].cbz",
"ending id",
{
"issue": "6",
"series": "The Magic Order 2",
"title": "",
"volume": "",
"year": "2022",
"remainder": "(Digital) (Zone-Empire)[__913302__]",
"remainder": "(Digital) (Zone-Empire)[913302]", # Don't really care about double underscores
"issue_count": "",
},
marks=pytest.mark.xfail,
),
pytest.param(
(
"Wonder Woman 001 Wonder Woman Day Special Edition (2021) (digital-Empire).cbr",
"issue separates title",
{
@ -481,9 +593,8 @@ fnames = [
"remainder": "(digital-Empire)",
"issue_count": "",
},
marks=pytest.mark.xfail,
),
pytest.param(
(
"Wonder Woman #001 Wonder Woman Day Special Edition (2021) (digital-Empire).cbr",
"issue separates title",
{
@ -495,46 +606,47 @@ fnames = [
"remainder": "(digital-Empire)",
"issue_count": "",
},
marks=pytest.mark.xfail,
),
pytest.param(
(
"Wonder Woman 49 DC Sep-Oct 1951 digital [downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire).cbz",
"date-range, no paren, braces",
{
"issue": "49",
"series": "Wonder Woman",
"title": "digital", # Don't have a way to get rid of this
"publisher": "DC",
"volume": "",
"year": "1951",
"remainder": "(Shadowcat-Empire)",
"remainder": "[downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire)",
"issue_count": "",
},
marks=pytest.mark.xfail,
),
pytest.param(
(
"Wonder Woman #49 DC Sep-Oct 1951 digital [downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire).cbz",
"date-range, no paren, braces",
{
"issue": "49",
"series": "Wonder Woman",
"title": "digital", # Don't have a way to get rid of this
"publisher": "DC",
"volume": "",
"year": "1951",
"remainder": "(Shadowcat-Empire)",
"remainder": "[downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire)",
"issue_count": "",
},
marks=pytest.mark.xfail,
),
pytest.param(
(
"X-Men, 2021-08-04 (#02) (digital) (Glorith-HD).cbz",
"full-date, issue in parenthesis",
{
"issue": "2",
"series": "X-Men",
"title": "",
"volume": "",
"year": "2021",
"remainder": "(digital) (Glorith-HD)",
"issue_count": "",
},
marks=pytest.mark.xfail,
),
]

View File

@ -4,13 +4,39 @@ from filenames import fnames
import comicapi.filenameparser
@pytest.mark.parametrize("filename,reason,expected", fnames)
def test_file_name_parser_new(filename, reason, expected):
    """Check the lexer-based parser against every entry in ``fnames``."""
    p = comicapi.filenameparser.Parse(
        comicapi.filenamelexer.Lex(filename).items,
        first_is_alt=True,
        remove_c2c=True,
        remove_fcbd=True,
        remove_publisher=True,
    )
    fp = p.filename_info

    # "archive" is not part of the expectations
    fp.pop("archive", None)

    # Work on a copy: the dicts in fnames are shared with the other parametrized
    # test, so filling in defaults in place would make results depend on test order
    expected = dict(expected)
    for s in ["alternate", "publisher", "volume_count"]:
        expected.setdefault(s, "")
    for s in ["fcbd", "c2c", "annual"]:
        expected.setdefault(s, False)

    assert fp == expected
@pytest.mark.parametrize("filename,reason,expected", fnames)
def test_file_name_parser(filename, reason, expected):
    """Check the original ``FileNameParser`` against every entry in ``fnames``.

    A mismatch is reported as an expected failure rather than an error.
    """
    p = comicapi.filenameparser.FileNameParser()
    p.parse_filename(filename)
    fp = p.__dict__

    # These keys are left out of the comparison for the original parser.  Filter
    # into a new dict: the dicts in fnames are shared with the other parametrized
    # test, so deleting keys in place would make results depend on test order
    ignored = ("title", "alternate", "publisher", "fcbd", "c2c", "annual", "volume_count")
    expected = {key: value for key, value in expected.items() if key not in ignored}

    if fp != expected:
        pytest.xfail("old parser")
    assert fp == expected