Merge branch 'filenameParser' into develop

This commit is contained in:
Timmy Welch 2022-05-06 00:33:36 -07:00
commit 1bbdebff42
15 changed files with 1560 additions and 125 deletions

View File

@ -42,10 +42,10 @@ try:
except ImportError:
pil_available = False
from comicapi import filenamelexer, filenameparser
from comicapi.comet import CoMet
from comicapi.comicbookinfo import ComicBookInfo
from comicapi.comicinfoxml import ComicInfoXml
from comicapi.filenameparser import FileNameParser
from comicapi.genericmetadata import GenericMetadata, PageType
logger = logging.getLogger(__name__)
@ -1127,25 +1127,46 @@ class ComicArchive:
data = self.get_page(idx)
p["ImageSize"] = str(len(data))
def metadata_from_filename(self, parse_scan_info=True):
def metadata_from_filename(
self, complicated_parser=False, remove_c2c=False, remove_fcbd=False, remove_publisher=False
):
metadata = GenericMetadata()
fnp = FileNameParser()
fnp.parse_filename(self.path)
if complicated_parser:
lex = filenamelexer.Lex(self.path)
p = filenameparser.Parse(
lex.items, remove_c2c=remove_c2c, remove_fcbd=remove_fcbd, remove_publisher=remove_publisher
)
metadata.alternate_number = p.filename_info["alternate"] or None
metadata.issue = p.filename_info["issue"] or None
metadata.issue_count = p.filename_info["issue_count"] or None
metadata.publisher = p.filename_info["publisher"] or None
metadata.series = p.filename_info["series"] or None
metadata.title = p.filename_info["title"] or None
metadata.volume = p.filename_info["volume"] or None
metadata.volume_count = p.filename_info["volume_count"] or None
metadata.year = p.filename_info["year"] or None
if fnp.issue != "":
metadata.issue = fnp.issue
if fnp.series != "":
metadata.series = fnp.series
if fnp.volume != "":
metadata.volume = fnp.volume
if fnp.year != "":
metadata.year = fnp.year
if fnp.issue_count != "":
metadata.issue_count = fnp.issue_count
if parse_scan_info:
if fnp.remainder != "":
metadata.scan_info = p.filename_info["remainder"] or None
metadata.format = "FCBD" if p.filename_info["fcbd"] else None
if p.filename_info["annual"]:
metadata.format = "Annual"
else:
fnp = filenameparser.FileNameParser()
fnp.parse_filename(self.path)
if fnp.issue:
metadata.issue = fnp.issue
if fnp.series:
metadata.series = fnp.series
if fnp.volume:
metadata.volume = fnp.volume
if fnp.year:
metadata.year = fnp.year
if fnp.issue_count:
metadata.issue_count = fnp.issue_count
if fnp.remainder:
metadata.scan_info = fnp.remainder
metadata.is_empty = False

353
comicapi/filenamelexer.py Normal file
View File

@ -0,0 +1,353 @@
import calendar
import os
import unicodedata
from enum import Enum, auto
class ItemType(Enum):
    """Categories of token that the filename lexer can emit."""

    Error = auto()  # Error occurred; the item's value is the text of the error
    EOF = auto()  # End of the input
    Text = auto()  # Plain word (also used for ordinals such as 80th)
    LeftParen = auto()  # '('
    Number = auto()  # Simple number
    IssueNumber = auto()  # Number preceded by a '#' symbol
    RightParen = auto()  # ')'
    Space = auto()  # Run of space, tab or underscore characters
    Dot = auto()  # '.' that is not followed by a digit
    LeftBrace = auto()  # '{'
    RightBrace = auto()  # '}'
    LeftSBrace = auto()  # '['
    RightSBrace = auto()  # ']'
    Symbol = auto()  # Punctuation/symbol character with no more specific type
    Skip = auto()  # __ or -- no title, issue or series information beyond
    Operator = auto()  # Run of separator characters, see is_operator()
    Calendar = auto()  # Month or day name, see cal()
    InfoSpecifier = auto()  # Specifies type of info e.g. v1 for 'volume': 1
    ArchiveType = auto()  # Archive extension keyword from the `key` table
    Honorific = auto()  # Honorific keyword from the `key` table, e.g. 'mr'
    Keywords = auto()
    FCBD = auto()  # Free Comic Book Day keyword from the `key` table
    ComicType = auto()  # 'annual' or 'book'
    Publisher = auto()  # Publisher keyword from the `key` table
    C2C = auto()  # The 'c2c' keyword
# Every opening and closing bracket item type.
braces = [
    ItemType.LeftBrace,
    ItemType.LeftParen,
    ItemType.LeftSBrace,
    ItemType.RightBrace,
    ItemType.RightParen,
    ItemType.RightSBrace,
]

# Sentinel character that Lexer.get()/Lexer.peek() return once the input is exhausted.
eof = chr(0)

# Keyword table: lower-cased word -> item type. lex_text() looks words up here
# before falling back to Calendar or Text.
key = {
    "fcbd": ItemType.FCBD,
    "freecomicbookday": ItemType.FCBD,
    "cbr": ItemType.ArchiveType,
    "cbz": ItemType.ArchiveType,
    "cbt": ItemType.ArchiveType,
    "cb7": ItemType.ArchiveType,
    "rar": ItemType.ArchiveType,
    "zip": ItemType.ArchiveType,
    "tar": ItemType.ArchiveType,
    "7z": ItemType.ArchiveType,
    "annual": ItemType.ComicType,
    "book": ItemType.ComicType,
    "volume": ItemType.InfoSpecifier,
    "vol.": ItemType.InfoSpecifier,
    "vol": ItemType.InfoSpecifier,
    "v": ItemType.InfoSpecifier,
    "of": ItemType.InfoSpecifier,
    "dc": ItemType.Publisher,
    "marvel": ItemType.Publisher,
    "covers": ItemType.InfoSpecifier,
    "c2c": ItemType.C2C,
    "mr": ItemType.Honorific,
    "ms": ItemType.Honorific,
    "mrs": ItemType.Honorific,
    "dr": ItemType.Honorific,
}
class Item:
    """One lexed token: its type, where it starts in the input, and its text."""

    def __init__(self, typ: ItemType, pos: int, val: str):
        self.val: str = val  # Text of the token (or the message for Error items)
        self.pos: int = pos  # Offset of the token's first character in the input
        self.typ: ItemType = typ  # Category of the token

    def __repr__(self):
        return "{}: index: {}: {}".format(self.val, self.pos, self.typ)
class Lexer:
    """State-machine lexer that splits a string into a list of ``Item`` tokens.

    Each state is a function that takes the lexer and returns the next state
    function, or ``None`` to stop. Emitted tokens accumulate in ``items``.
    """

    def __init__(self, string):
        self.input: str = string  # The string being scanned
        self.state = None  # The state function to run next
        self.pos: int = -1  # Index of the most recently consumed character
        self.start: int = 0  # Index at which the pending item begins
        self.lastPos: int = 0  # Not updated by any method of this class
        self.paren_depth: int = 0  # Number of '(' currently open
        self.brace_depth: int = 0  # Number of '{' currently open
        self.sbrace_depth: int = 0  # Number of '[' currently open
        self.items = []  # Items emitted so far, in order

    def get(self) -> str:
        """Consume the next character and return it, or ``eof`` past the end."""
        self.pos += 1
        if self.pos < len(self.input):
            return self.input[self.pos]
        return eof

    def peek(self) -> str:
        """Return the next character without consuming it, or ``eof`` past the end."""
        upcoming = self.pos + 1
        return self.input[upcoming] if upcoming < len(self.input) else eof

    def backup(self):
        """Step back one character."""
        self.pos -= 1

    def emit(self, t: ItemType):
        """Append the pending text as an item of type *t* and start a new item."""
        end = self.pos + 1
        self.items.append(Item(t, self.start, self.input[self.start : end]))
        self.start = end

    def ignore(self):
        """Move the start of the pending item to the current position."""
        self.start = self.pos

    def accept(self, valid: str):
        """Consume the next character if it is one of *valid*; report whether it was."""
        matched = self.get() in valid
        if not matched:
            self.backup()
        return matched

    def accept_run(self, valid: str):
        """Consume characters for as long as they are drawn from *valid*."""
        while self.accept(valid):
            pass

    def errorf(self, message: str):
        """Record an Error item carrying *message*.

        Returns ``None``, so a state function that does
        ``return lex.errorf(...)`` also terminates ``run()``.
        """
        self.items.append(Item(ItemType.Error, self.start, message))

    def scan_number(self):
        """Consume digits, an optional fraction and an optional ordinal suffix.

        Always returns ``True``.
        """
        digits = "0123456789"

        self.accept_run(digits)
        if self.accept("."):
            # Only keep the dot when at least one digit follows it
            if self.accept(digits):
                self.accept_run(digits)
            else:
                self.backup()

        # Ordinal suffixes: 'st', 'nd'/'rd', 'th'. When the second letter
        # is missing the first letter is given back.
        for first, second in (("s", "t"), ("nr", "d"), ("t", "h")):
            if self.accept(first):
                if not self.accept(second):
                    self.backup()
                break

        return True

    def run(self):
        """Run state functions, starting at ``lex_filename``, until one returns ``None``."""
        self.state = lex_filename
        while True:
            step = self.state
            if step is None:
                break
            self.state = step(self)
def lex_filename(lex: Lexer):
    """Top-level lexer state: consume one character and dispatch on it.

    Single-character tokens (brackets, lone symbols, '__' and '--' skips) are
    emitted here; everything else is handed to a more specific state function.

    Returns the next state function, or ``None`` when the input is exhausted
    or an error item has been recorded.
    """
    r = lex.get()
    if r == eof:
        if lex.paren_depth != 0:
            return lex.errorf("unclosed left paren")
        if lex.brace_depth != 0:
            # Previously reported with the paren message
            return lex.errorf("unclosed left brace")
        # NOTE(review): sbrace_depth is not checked here, so an unclosed '[' is not reported
        lex.emit(ItemType.EOF)
        return None
    elif is_space(r):
        # A double underscore is a Skip marker rather than whitespace
        if r == "_" and lex.peek() == "_":
            lex.get()
            lex.emit(ItemType.Skip)
        else:
            return lex_space
    elif r == ".":
        r = lex.peek()
        if r < "0" or "9" < r:
            lex.emit(ItemType.Dot)
            return lex_filename

        # '.5' style number: give the dot back so lex_number sees it
        lex.backup()
        return lex_number
    elif r == "'":
        r = lex.peek()
        # "'89" style short year; the apostrophe stays part of the number item
        if r in "0123456789":
            return lex_number
        lex.emit(ItemType.Text)
    elif "0" <= r <= "9":
        lex.backup()
        return lex_number
    elif r == "#":
        # '#' directly followed by a digit starts an issue number
        if "0" <= lex.peek() <= "9":
            return lex_number
        lex.emit(ItemType.Symbol)
    elif is_operator(r):
        # A double dash is a Skip marker rather than an operator
        if r == "-" and lex.peek() == "-":
            lex.get()
            lex.emit(ItemType.Skip)
        else:
            return lex_operator
    elif is_alpha_numeric(r):
        lex.backup()
        return lex_text
    elif r == "(":
        lex.emit(ItemType.LeftParen)
        lex.paren_depth += 1
    elif r == ")":
        lex.emit(ItemType.RightParen)
        lex.paren_depth -= 1
        if lex.paren_depth < 0:
            return lex.errorf("unexpected right paren " + r)
    elif r == "{":
        lex.emit(ItemType.LeftBrace)
        lex.brace_depth += 1
    elif r == "}":
        lex.emit(ItemType.RightBrace)
        lex.brace_depth -= 1
        if lex.brace_depth < 0:
            return lex.errorf("unexpected right brace " + r)
    elif r == "[":
        lex.emit(ItemType.LeftSBrace)
        lex.sbrace_depth += 1
    elif r == "]":
        lex.emit(ItemType.RightSBrace)
        lex.sbrace_depth -= 1
        if lex.sbrace_depth < 0:
            return lex.errorf("unexpected right brace " + r)
    elif is_symbol(r):
        lex.emit(ItemType.Symbol)
    else:
        return lex.errorf("unrecognized character in action: " + r)

    return lex_filename
def lex_operator(lex: Lexer):
    """Emit an Operator item; the first operator character is already consumed.

    Only '-', '|', ':' and ';' extend the run.
    """
    run_chars = "-|:;"
    while lex.accept(run_chars):
        pass
    lex.emit(ItemType.Operator)
    return lex_filename
def lex_space(lex: Lexer):
    """Emit a Space item for a run of space characters.

    The first space character has already been consumed by the caller.
    """
    while True:
        if not is_space(lex.peek()):
            break
        lex.get()

    lex.emit(ItemType.Space)
    return lex_filename
def lex_text(lex: Lexer):
    """Scan one alphanumeric word and emit it as a keyword, Calendar or Text item.

    Two special cases are handled while scanning:
    * a digit directly after an InfoSpecifier keyword (e.g. the 1 in 'v1')
      ends the word early so the number is lexed separately;
    * a trailing "'s" and the '.' of 'vol.' are kept as part of the word.
    """
    while True:
        r = lex.get()
        if is_alpha_numeric(r):
            if r.isnumeric():  # E.g. v1
                # Text consumed so far, excluding the digit just read
                word = lex.input[lex.start : lex.pos]
                if word.lower() in key and key[word.lower()] == ItemType.InfoSpecifier:
                    # Give the digit back and emit only the specifier
                    lex.backup()
                    lex.emit(key[word.lower()])
                    return lex_filename
        else:
            # First non-alphanumeric character: the word ends here
            if r == "'" and lex.peek() == "s":
                # Keep a possessive "'s" attached to the word
                lex.get()
            else:
                lex.backup()
            word = lex.input[lex.start : lex.pos + 1]
            if word.lower() == "vol" and lex.peek() == ".":
                # Absorb the dot so that 'vol.' matches its keyword entry
                lex.get()
            word = lex.input[lex.start : lex.pos + 1]

            if word.lower() in key:
                lex.emit(key[word.lower()])
            elif cal(word):
                lex.emit(ItemType.Calendar)
            else:
                lex.emit(ItemType.Text)
            break

    return lex_filename
def cal(value: str) -> set[int]:
    """Return the calendar indexes whose month or day name matches *value*.

    *value* is compared, title-cased, against the full and abbreviated month
    and day names of the ``calendar`` module. The result is empty (falsy)
    when nothing matches, so it can be used directly as a boolean test.

    An empty *value* never matches: index 0 of ``calendar.month_name`` and
    ``calendar.month_abbr`` is an empty placeholder string, which would
    otherwise make ``cal("")`` truthy.
    """
    if not value:
        return set()

    wanted = value.title()
    tables = (calendar.month_abbr, calendar.month_name, calendar.day_abbr, calendar.day_name)
    return {index for names in tables for index, name in enumerate(names) if name == wanted}
def lex_number(lex: Lexer):
    """Scan a number and emit it as IssueNumber, Number or Text.

    The item is an IssueNumber when it starts with '#', Text when it does not
    end in a digit (e.g. '80th'), and a plain Number otherwise.
    """
    if not lex.scan_number():
        return lex.errorf("bad number syntax: " + lex.input[lex.start : lex.pos])
    # Complex number logic removed. Messes with math operations without space

    if lex.input[lex.start] == "#":
        kind = ItemType.IssueNumber
    elif lex.input[lex.pos].isdigit():
        kind = ItemType.Number
    else:
        # Assume that 80th is just text and not a number
        kind = ItemType.Text
    lex.emit(kind)

    return lex_filename
def is_space(character: str):
    """Return True when *character* is an underscore, a space or a tab."""
    blanks = "_ \t"
    return character in blanks
def is_alpha_numeric(character: str):
    """Return True when *character* is alphabetic or numeric.

    The underscore is not included; it is treated as a space by is_space().
    """
    if character.isalpha():
        return True
    return character.isnumeric()
def is_operator(character: str):
    """Return True when *character* is one of the separators - | : ; / or backslash."""
    separators = "-|:;/\\"
    return character in separators
def is_symbol(character: str):
    """Return True when the Unicode category of *character* is punctuation (P*) or symbol (S*)."""
    major_class = unicodedata.category(character)[0]
    return major_class in "PS"
def Lex(filename: str):
    """Lex the base name of *filename* and return the finished ``Lexer``.

    Any directory part of *filename* is discarded before lexing; the tokens
    are available on the returned object's ``items`` list.
    """
    base_name = os.path.basename(filename)
    lexer = Lexer(string=base_name)
    lexer.run()
    return lexer

View File

@ -23,8 +23,17 @@ This should probably be re-written, but, well, it mostly works!
import logging
import os
import re
from operator import itemgetter
from typing import TypedDict
from urllib.parse import unquote
from text2digits import text2digits
from comicapi import filenamelexer, issuestring
t2d = text2digits.Text2Digits(add_ordinal_ending=False)
t2do = text2digits.Text2Digits(add_ordinal_ending=True)
logger = logging.getLogger(__name__)
@ -68,9 +77,7 @@ class FileNameParser:
if match:
count = match.group()
count = count.lstrip("0")
return count
return count.lstrip("0")
def get_issue_number(self, filename):
"""Returns a tuple of issue number string, and start and end indexes in the filename
@ -222,7 +229,7 @@ class FileNameParser:
year = ""
# look for four digit number with "(" ")" or "--" around it
match = re.search(r"(\(\d\d\d\d\))|(--\d\d\d\d--)", filename)
match = re.search(r"(\(\d{4}\))|(--\d{4}--)", filename)
if match:
year = match.group()
# remove non-digits
@ -290,3 +297,814 @@ class FileNameParser:
self.issue = "0"
if self.issue[0] == ".":
self.issue = "0" + self.issue
class FilenameInfo(TypedDict, total=False):
    """Fields extracted from a filename by ``Parser``.

    Keys are optional while parsing is in progress; ``parse_finish`` fills in
    every key it lists ("" for strings, False for flags) before it returns.
    """

    alternate: str  # Alternate issue number(s)
    annual: bool  # True when the 'annual' keyword was seen
    archive: str  # Lower-cased archive extension keyword, e.g. 'cbz'
    c2c: bool  # True when a C2C item was seen
    fcbd: bool  # True when a Free Comic Book Day marker was seen
    issue: str  # Issue number
    issue_count: str  # N from '(of N)' when it refers to the issue
    publisher: str  # Publisher keyword as written in the filename
    remainder: str  # Text left over once used and irrelevant items are removed
    series: str  # Series name
    title: str  # Issue/story title
    volume: str  # Volume number
    volume_count: str  # N from '(of N)' when it refers to the volume
    year: str  # Selected year
# Sentinel item returned by Parser.get()/peek()/peek_back() when there is no item at that position.
eof = filenamelexer.Item(filenamelexer.ItemType.EOF, -1, "")
class Parser:
    """State-machine parser that turns lexer items into a ``FilenameInfo`` dict.

    ``run()`` starts at the ``parse`` state function and keeps calling the
    returned state until one returns ``None``. Results accumulate in
    ``filename_info``.

    Args:
        lexer_result: Items produced by the filename lexer.
        first_is_alt: Treat a number that is the very first item as an
            alternate number instead of the issue number.
        remove_c2c: Drop C2C items from the remainder.
        remove_fcbd: Drop FCBD items from the remainder.
        remove_publisher: Remove recorded publisher items from the series
            and title.
    """

    def __init__(
        self,
        lexer_result: list[filenamelexer.Item],
        first_is_alt=False,
        remove_c2c=False,
        remove_fcbd=False,
        remove_publisher=False,
    ):
        self.state = None  # The state function to run next
        self.pos = -1  # Index of the most recently consumed item
        self.firstItem = True  # True until the first item has been handled
        self.skip = False  # Set once a Skip item ('__' or '--') is seen
        self.alt = False  # The number being parsed is an alternate number
        self.filename_info: FilenameInfo = {"series": ""}
        self.issue_number_at = None
        self.in_something = 0  # In some sort of brackets {}[]()
        self.in_brace = 0  # In {}
        self.in_s_brace = 0  # In []
        self.in_paren = 0  # In ()
        self.year_candidates: list[tuple[bool, filenamelexer.Item]] = []  # (likely_year, item)
        self.series_parts: list[filenamelexer.Item] = []
        self.title_parts: list[filenamelexer.Item] = []
        self.used_items: list[filenamelexer.Item] = []  # Consumed items, excluded from the remainder
        self.irrelevant: list[filenamelexer.Item] = []  # Ignored items, excluded from the remainder
        self.operator_rejected: list[filenamelexer.Item] = []  # Numbers skipped because an operator is adjacent
        self.publisher_removed: list[filenamelexer.Item] = []  # Publisher items eligible for removal

        self.first_is_alt = first_is_alt
        self.remove_c2c = remove_c2c
        self.remove_fcbd = remove_fcbd
        self.remove_publisher = remove_publisher

        self.input = lexer_result
        # Remember whether (and where) an explicit '#NNN' issue number exists.
        # NOTE(review): this stores a list index, while parse_issue_number and
        # parse_info_specifier store/compare Item.pos - confirm which is intended.
        for i, item in enumerate(self.input):
            if item.typ == filenamelexer.ItemType.IssueNumber:
                self.issue_number_at = i

    def get(self) -> filenamelexer.Item:
        """Consume and return the next item, or ``eof`` past the end."""
        if int(self.pos) >= len(self.input) - 1:
            self.pos += 1
            return eof

        self.pos += 1
        return self.input[self.pos]

    def peek(self) -> filenamelexer.Item:
        """Return the next item without consuming it, or ``eof`` past the end."""
        if int(self.pos) >= len(self.input) - 1:
            return eof

        return self.input[self.pos + 1]

    def peek_back(self) -> filenamelexer.Item:
        """Return the item before the current one without moving, or ``eof`` if there is none."""
        # <= 0 also covers the initial pos of -1, which would otherwise wrap
        # around to the end of the list through negative indexing.
        if int(self.pos) <= 0:
            return eof

        return self.input[self.pos - 1]

    def backup(self):
        """Step back one item."""
        self.pos -= 1

    def run(self):
        """Run state functions, starting at ``parse``, until one returns ``None``."""
        self.state = parse
        while self.state is not None:
            self.state = self.state(self)
def parse(p: Parser):
    """Main parser state: consume one item and decide what it means.

    Depending on the item type this records information directly in
    ``p.filename_info``/the bookkeeping lists, or hands over to a more
    specific state (``parse_issue_number``, ``parse_series``,
    ``parse_info_specifier``). At EOF it returns ``parse_finish``.

    Returns the next state function.
    """
    item: filenamelexer.Item = p.get()

    # We're done, time to do final processing
    if item.typ == filenamelexer.ItemType.EOF:
        return parse_finish

    # Need to figure out if this is the issue number
    if item.typ == filenamelexer.ItemType.Number:
        likely_year = False
        if p.firstItem and p.first_is_alt:
            p.alt = True
            return parse_issue_number

        # The issue number should hopefully not be in parentheses
        if p.in_something == 0:
            # Assume that operators indicate a non-issue number e.g. IG-88 or 88-IG
            if filenamelexer.ItemType.Operator not in (p.peek().typ, p.peek_back().typ):
                # It is common to use '89 to refer to an annual reprint from 1989
                if item.val[0] != "'":
                    # Issue number is less than 4 digits. very few series go above 999
                    if len(item.val.lstrip("0")) < 4:
                        # An issue number starting with # was not found and no previous number was found
                        if p.issue_number_at is None:
                            # Series has already been started/parsed, filters out leading alternate numbers
                            if len(p.series_parts) > 0:
                                # Unset first item
                                if p.firstItem:
                                    p.firstItem = False
                                return parse_issue_number
            else:
                p.operator_rejected.append(item)
                # operator rejected used later to add back to the series/title

        # It is more likely to be a year if it is inside parentheses.
        if p.in_something > 0:
            likely_year = True

        # If numbers are directly followed by text it most likely isn't a year e.g. 2048px
        if p.peek().typ == filenamelexer.ItemType.Text:
            likely_year = False

        # Is either a full year '2001' or a short year "'89"
        if len(item.val) == 4 or item.val[0] == "'":
            if p.in_something == 0:
                # Append to series in case it is a part of the title, but only if we're not inside parentheses
                p.series_parts.append(item)

            # Look for a full date as in 2022-04-22
            if p.peek().typ in [
                filenamelexer.ItemType.Symbol,
                filenamelexer.ItemType.Operator,
                filenamelexer.ItemType.Dot,
            ]:
                op = [p.get()]
                if p.peek().typ == filenamelexer.ItemType.Number:
                    month = p.get()
                    if p.peek().typ in [
                        filenamelexer.ItemType.Symbol,
                        filenamelexer.ItemType.Operator,
                        filenamelexer.ItemType.Dot,
                    ]:
                        op.append(p.get())
                        if p.peek().typ == filenamelexer.ItemType.Number:
                            day = p.get()
                            fulldate = [month, day, item]
                            p.used_items.extend(op)
                            p.used_items.extend(fulldate)
                        else:
                            # Undo the separator, month and second separator
                            p.backup()
                            p.backup()
                            p.backup()
                            # TODO never happens
                    else:
                        # Undo the separator and the month
                        p.backup()
                        p.backup()
                        # TODO never happens
                else:
                    # Undo the separator
                    p.backup()
                    # TODO never happens

            p.year_candidates.append((likely_year, item))
        # Ensures that IG-88 gets added back to the series/title
        # NOTE(review): `and` binds tighter than `or`, so this is
        # (not in brackets and previous is operator) or (next is operator) - confirm intent
        elif (
            p.in_something == 0
            and p.peek_back().typ == filenamelexer.ItemType.Operator
            or p.peek().typ == filenamelexer.ItemType.Operator
        ):
            # The next or previous type is an operator, add it to the series
            p.series_parts.append(item)
            p.used_items.append(item)

            # Unset first item
            if p.firstItem:
                p.firstItem = False
            p.get()
            return parse_series

    # Number with a leading hash e.g. #003
    elif item.typ == filenamelexer.ItemType.IssueNumber:
        # Unset first item
        if p.firstItem:
            p.firstItem = False
        return parse_issue_number

    # Matches FCBD. Not added to p.used_items so it will show in "remainder"
    elif item.typ == filenamelexer.ItemType.FCBD:
        p.filename_info["fcbd"] = True

    # Matches c2c. Not added to p.used_items so it will show in "remainder"
    elif item.typ == filenamelexer.ItemType.C2C:
        p.filename_info["c2c"] = True

    # Matches the extension if it is known to be an archive format e.g. cbt,cbz,zip,rar
    elif item.typ == filenamelexer.ItemType.ArchiveType:
        p.filename_info["archive"] = item.val.lower()
        p.used_items.append(item)
        # Also hide the dot in front of the extension from the remainder
        if p.peek_back().typ == filenamelexer.ItemType.Dot:
            p.used_items.append(p.peek_back())

    # Allows removing DC from 'Wonder Woman 49 DC Sep-Oct 1951' dependent on publisher being in a static list in the lexer
    elif item.typ == filenamelexer.ItemType.Publisher:
        p.filename_info["publisher"] = item.val
        p.used_items.append(item)
        if p.firstItem:
            p.firstItem = False
            # A leading publisher outside brackets goes straight to the series
            # and is not recorded in publisher_removed
            if p.in_something == 0:
                return parse_series
        p.publisher_removed.append(item)
        if p.in_something == 0:
            return parse_series

    # Attempts to identify the type e.g. annual
    elif item.typ == filenamelexer.ItemType.ComicType:
        series_append = True

        if p.peek().typ == filenamelexer.ItemType.Space:
            p.get()

        if p.series_parts and "free comic book" in (" ".join([x.val for x in p.series_parts]) + " " + item.val).lower():
            p.filename_info["fcbd"] = True
            series_append = True
        # If the next item is a number it's probably the volume
        elif p.peek().typ == filenamelexer.ItemType.Number or (
            p.peek().typ == filenamelexer.ItemType.Text and t2d.convert(p.peek().val).isnumeric()
        ):
            number = p.get()
            # Mark volume info. Text will be added to the title/series later
            if item.val.lower() in ["book", "tpb"]:
                p.title_parts.extend([item, number])
                p.filename_info["volume"] = t2do.convert(number.val)
                p.filename_info["issue"] = t2do.convert(number.val)

                p.used_items.append(item)
                series_append = False

            # Annuals usually mean the year
            elif item.val.lower() in ["annual"]:
                p.filename_info["annual"] = True
                num = t2d.convert(number.val)
                if num.isnumeric() and len(num) == 4:
                    p.year_candidates.append((True, number))
                else:
                    # Not a year: give the number back
                    p.backup()

        elif item.val.lower() in ["annual"]:
            p.filename_info["annual"] = True

        # If we don't have a reason to exclude it from the series go back to parsing the series immediately
        if series_append:
            p.series_parts.append(item)
            p.used_items.append(item)
            return parse_series

    # We found text, it's probably the title or series
    elif item.typ in [filenamelexer.ItemType.Text, filenamelexer.ItemType.Honorific]:
        # Unset first item
        if p.firstItem:
            p.firstItem = False
        if p.in_something == 0:
            return parse_series

    # Usually the word 'of' eg 1 (of 6)
    elif item.typ == filenamelexer.ItemType.InfoSpecifier:
        return parse_info_specifier

    # Operator is a symbol that acts as some sort of separator eg - : ;
    elif item.typ == filenamelexer.ItemType.Operator:
        if p.in_something == 0:
            p.irrelevant.append(item)

    # Filter out Month and day names in filename
    elif item.typ == filenamelexer.ItemType.Calendar:
        # Month and day are currently irrelevant if they are inside parentheses e.g. (January 2002)
        if p.in_something > 0:
            p.irrelevant.append(item)

        # assume Sep-Oct is not useful in the series/title
        elif p.peek().typ in [filenamelexer.ItemType.Symbol, filenamelexer.ItemType.Operator]:
            p.get()
            if p.peek().typ == filenamelexer.ItemType.Calendar:
                # Drop the first name, the separator just consumed and the second name
                p.irrelevant.extend([item, p.input[p.pos], p.get()])
            else:
                p.backup()
                return parse_series
        # This is text that just happens to also be a month/day
        else:
            return parse_series

    # Specifically '__' or '--', no further title/series parsing is done to keep compatibility with wiki
    elif item.typ == filenamelexer.ItemType.Skip:
        p.skip = True

    # Keeping track of parentheses depth
    elif item.typ == filenamelexer.ItemType.LeftParen:
        p.in_paren += 1
        p.in_something += 1
    elif item.typ == filenamelexer.ItemType.LeftBrace:
        p.in_brace += 1
        p.in_something += 1
    elif item.typ == filenamelexer.ItemType.LeftSBrace:
        p.in_s_brace += 1
        p.in_something += 1

    elif item.typ == filenamelexer.ItemType.RightParen:
        p.in_paren -= 1
        p.in_something -= 1
    elif item.typ == filenamelexer.ItemType.RightBrace:
        p.in_brace -= 1
        p.in_something -= 1
    elif item.typ == filenamelexer.ItemType.RightSBrace:
        p.in_s_brace -= 1
        p.in_something -= 1

    # Unset first item
    if p.firstItem:
        p.firstItem = False

    # Brace management, I don't like negative numbers
    if p.in_paren < 0:
        p.in_something += p.in_paren * -1
    if p.in_brace < 0:
        p.in_something += p.in_brace * -1
    if p.in_s_brace < 0:
        p.in_something += p.in_s_brace * -1

    return parse
# TODO: What about more esoteric numbers???
def parse_issue_number(p: Parser):
    """Record the current item as the issue number or as an alternate number.

    When an issue number is already known the item is stored under
    "alternate"; further alternates are appended, separated by commas.
    Otherwise the item becomes the issue number (or the alternate when
    ``p.alt`` is set), and a following '.<text or number>' is folded into it,
    e.g. '1' '.' '5' -> '1.5'.

    Always returns ``parse`` as the next state.
    """
    item = p.input[p.pos]

    if "issue" in p.filename_info:
        if "alternate" in p.filename_info:
            p.filename_info["alternate"] += "," + item.val
        else:
            # Without this else the assignment discarded the value appended above
            p.filename_info["alternate"] = item.val
    else:
        if p.alt:
            p.filename_info["alternate"] = item.val
        else:
            p.filename_info["issue"] = item.val
            p.issue_number_at = item.pos
        p.used_items.append(item)

        item = p.get()
        if item.typ == filenamelexer.ItemType.Dot:
            p.used_items.append(item)
            item = p.get()
            if item.typ in [filenamelexer.ItemType.Text, filenamelexer.ItemType.Number]:
                if p.alt:
                    p.filename_info["alternate"] += "." + item.val
                else:
                    p.filename_info["issue"] += "." + item.val
                p.used_items.append(item)
            else:
                # Not a decimal part: give back both the item and the dot
                p.backup()
                p.backup()
        else:
            # Nothing to fold in: give the item back
            p.backup()
    p.alt = False
    return parse
def parse_series(p: Parser):
    """Collect consecutive items that form the series name and/or title.

    Items are gathered into sections; an operator that is directly followed
    by a space but not preceded by one (e.g. ': ') starts a new section, and
    when there is more than one section the last one is treated as the title.
    Collection stops at the first item that cannot belong to a series/title,
    which is handed back so ``parse`` can look at it.

    Always returns ``parse`` as the next state.
    """
    item = p.input[p.pos]

    # Sections of the series/title; a new one is started at each title separator
    series: list[list[filenamelexer.Item]] = [[]]
    # Space and Dots are not useful at the beginning of a title/series
    if not p.skip and item.typ not in [filenamelexer.ItemType.Space, filenamelexer.ItemType.Dot]:
        series[0].append(item)

    current_part = 0

    title_parts: list[filenamelexer.Item] = []
    series_parts: list[filenamelexer.Item] = []

    # True while the items since the last non-space item were spaces
    prev_space = False

    # 'free comic book day' screws things up. #TODO look into removing book from ComicType?

    # We stop parsing the series when certain things come up if nothing was done with them continue where we left off
    # NOTE(review): `and` binds tighter than `or` in this condition - confirm intent
    if (
        p.series_parts
        and p.series_parts[-1].val.lower() == "book"
        or p.peek_back().typ == filenamelexer.ItemType.Number
        or item.typ == filenamelexer.ItemType.Calendar
    ):
        # Take over what was collected so far and continue it
        series_parts = p.series_parts
        p.series_parts = []
    # Skip is only true if we have come across '--' or '__'
    while not p.skip:
        item = p.get()

        # Spaces are evil
        if item.typ == filenamelexer.ItemType.Space:
            prev_space = True
            continue
        if item.typ in [
            filenamelexer.ItemType.Text,
            filenamelexer.ItemType.Symbol,
            filenamelexer.ItemType.Publisher,
            filenamelexer.ItemType.Honorific,
        ]:
            series[current_part].append(item)
            # Keep the dot of an abbreviated honorific, e.g. 'ms.'
            if item.typ == filenamelexer.ItemType.Honorific and p.peek().typ == filenamelexer.ItemType.Dot:
                series[current_part].append(p.get())
            elif item.typ == filenamelexer.ItemType.Publisher:
                p.filename_info["publisher"] = item.val

        # Handle Volume
        elif item.typ == filenamelexer.ItemType.InfoSpecifier:
            # Exception for 'of'
            if item.val.lower() == "of":
                series[current_part].append(item)

            else:
                # This specifically lets 'X-Men-V1-067' parse correctly as Series: X-Men Volume: 1 Issue: 67
                while len(series[current_part]) > 0 and series[current_part][-1].typ not in [
                    filenamelexer.ItemType.Text,
                    filenamelexer.ItemType.Symbol,
                ]:
                    p.irrelevant.append(series[current_part].pop())
                # Hand the specifier back to parse
                p.backup()
                break

        elif item.typ == filenamelexer.ItemType.Operator:
            peek = p.peek()
            # ': ' separates the title from the series, only the last section is considered the title
            if not prev_space and peek.typ in [filenamelexer.ItemType.Space]:
                series.append([])  # Starts a new section
                series[current_part].append(item)
                current_part += 1
            else:
                # Force space around '-' makes 'batman - superman' stay otherwise we get 'batman-superman'
                if prev_space and peek.typ in [filenamelexer.ItemType.Space]:
                    item.val = " " + item.val + " "
                series[current_part].append(item)

        # Stop processing series/title if a skip item is found
        elif item.typ == filenamelexer.ItemType.Skip:
            p.backup()
            break

        elif item.typ == filenamelexer.ItemType.Number:
            if p.peek().typ == filenamelexer.ItemType.Space:
                p.get()
                # We have 2 numbers, add the first to the series and then go back to parse
                if p.peek().typ == filenamelexer.ItemType.Number:
                    series[current_part].append(item)
                    break

                # We have 1 number break here, it's possible it's the issue
                p.backup()  # Whitespace
                p.backup()  # The number
                break
            # This is 6 in '1 of 6'
            if series[current_part] and series[current_part][-1].val.lower() == "of":
                series[current_part].append(item)

            # We have 1 number break here, it's possible it's the issue
            else:
                p.backup()  # The number
                break

        else:
            # Ensure 'ms. marvel' parses 'ms.' correctly
            if item.typ == filenamelexer.ItemType.Dot and p.peek_back().typ == filenamelexer.ItemType.Honorific:
                series[current_part].append(item)
            # Allows avengers.hulk to parse correctly
            elif item.typ == filenamelexer.ItemType.Dot and p.peek().typ == filenamelexer.ItemType.Text:
                # Marks the dot as used so that the remainder is clean
                p.used_items.append(item)
            else:
                # Anything else ends the series/title; hand it back to parse
                p.backup()
                break

        prev_space = False

    # We have a title separator e.g. ': '
    if len(series) > 1:
        title_parts.extend(series.pop())
        for s in series:
            if s and s[-1].typ == filenamelexer.ItemType.Operator:
                s[-1].val += " "  # Ensures that when there are multiple separators that they display properly
            series_parts.extend(s)
        # The last collected item (the separator in front of the title) is marked used instead of kept
        p.used_items.append(series_parts.pop())
    else:
        series_parts.extend(series[0])

    # If the series has already been set assume all of this is the title.
    if len(p.series_parts) > 0:
        p.title_parts.extend(series_parts)
        p.title_parts.extend(title_parts)
    else:
        p.series_parts.extend(series_parts)
        p.title_parts.extend(title_parts)
    return parse
def resolve_year(p: Parser):
    """Choose the year, and possibly a year-style volume, from ``p.year_candidates``.

    Candidates are ``(likely_year, item)`` tuples. They are stably sorted so
    that likely years come last, and the last one is taken as the year. If no
    volume is known yet and the next remaining candidate is also a likely
    year, that one is used as the volume. Items chosen this way are marked as
    used and removed from the series and title parts.
    """
    if len(p.year_candidates) > 0:
        # Sort by likely_year boolean
        p.year_candidates.sort(key=itemgetter(0))

        # Take the last year e.g. (2007) 2099 (2008) becomes 2099 2007 2008 and takes 2008
        selected_year = p.year_candidates.pop()[1]

        p.filename_info["year"] = selected_year.val
        p.used_items.append(selected_year)

        # (2008) Title (2009) is many times used to denote the series year if we don't have a volume we use it
        if "volume" not in p.filename_info and p.year_candidates and p.year_candidates[-1][0]:
            vol = p.year_candidates.pop()[1]
            p.filename_info["volume"] = vol.val
            p.used_items.append(vol)

            # Remove volume from series and title
            # (this previously tested selected_year, so the volume item itself was never removed)
            if vol in p.series_parts:
                p.series_parts.remove(vol)
            if vol in p.title_parts:
                p.title_parts.remove(vol)

        # Remove year from series and title
        if selected_year in p.series_parts:
            p.series_parts.remove(selected_year)
        if selected_year in p.title_parts:
            p.title_parts.remove(selected_year)
def parse_finish(p: Parser):
    """Final parser state: resolve leftovers and fill in ``p.filename_info``.

    Resolves the year, falls back to a trailing number of the series as the
    issue, optionally strips publishers, joins series and title, normalises
    issue and volume, computes the remainder and makes sure every expected
    key exists.

    Returns ``None`` (implicitly), which ends ``Parser.run()``.
    """
    resolve_year(p)

    # If we don't have an issue try to find it in the series
    if "issue" not in p.filename_info and p.series_parts and p.series_parts[-1].typ == filenamelexer.ItemType.Number:
        issue_num = p.series_parts.pop()

        # If the number we just popped is a year put it back on it's probably part of the series e.g. Spider-Man 2099
        if issue_num in [x[1] for x in p.year_candidates]:
            p.series_parts.append(issue_num)
        else:
            # If this number was rejected because of an operator and the operator is still there add it back e.g. 'IG-88'
            if (
                issue_num in p.operator_rejected
                and p.series_parts
                and p.series_parts[-1].typ == filenamelexer.ItemType.Operator
            ):
                p.series_parts.append(issue_num)
            # We have no reason to not use this number as the issue number. Specifically happens when parsing 'X-Men-V1-067.cbr'
            else:
                p.filename_info["issue"] = issue_num.val
                p.used_items.append(issue_num)
                p.issue_number_at = issue_num.pos

    # Remove publishers, currently only marvel and dc are defined,
    # this is an option specifically because this can drastically screw up parsing
    if p.remove_publisher:
        for item in p.publisher_removed:
            if item in p.series_parts:
                p.series_parts.remove(item)
            if item in p.title_parts:
                p.title_parts.remove(item)

    p.filename_info["series"] = join_title(p.series_parts)
    p.used_items.extend(p.series_parts)

    p.filename_info["title"] = join_title(p.title_parts)
    p.used_items.extend(p.title_parts)

    # Normalise the issue through IssueString after dropping a leading '#'
    if "issue" in p.filename_info:
        p.filename_info["issue"] = issuestring.IssueString(p.filename_info["issue"].lstrip("#")).as_string()

    # Drop a leading '#' and leading zeros from the volume
    if "volume" in p.filename_info:
        p.filename_info["volume"] = p.filename_info["volume"].lstrip("#").lstrip("0")

    if "issue" not in p.filename_info:
        # We have an alternate move it to the issue
        if "alternate" in p.filename_info:
            p.filename_info["issue"] = p.filename_info["alternate"]
            p.filename_info["alternate"] = ""
        else:
            # TODO: This never happens
            # A single unused number left in the whole input is taken as the issue
            inp = [x for x in p.input if x not in p.irrelevant and x not in p.used_items and x.typ != eof.typ]
            if len(inp) == 1 and inp[0].typ == filenamelexer.ItemType.Number:
                p.filename_info["issue"] = inp[0].val
                p.used_items.append(inp[0])

    # Item types the caller asked to keep out of the remainder
    remove_items = []
    if p.remove_fcbd:
        remove_items.append(filenamelexer.ItemType.FCBD)
    if p.remove_c2c:
        remove_items.append(filenamelexer.ItemType.C2C)

    p.irrelevant.extend([x for x in p.input if x.typ in remove_items])

    p.filename_info["remainder"] = get_remainder(p)

    # Ensure keys always exist
    for s in [
        "alternate",
        "issue",
        "archive",
        "series",
        "title",
        "volume",
        "year",
        "remainder",
        "issue_count",
        "volume_count",
        "publisher",
    ]:
        if s not in p.filename_info:
            p.filename_info[s] = ""
    for s in ["fcbd", "c2c", "annual"]:
        if s not in p.filename_info:
            p.filename_info[s] = False
def get_remainder(p: Parser):
    """Build the leftover text from every item that was neither used nor marked irrelevant.

    Spaces and skip markers collapse to at most one space and are dropped at
    the start, at the end, after another space, right after an opening
    bracket and right before a closing bracket. Bracket pairs left empty are
    removed, and the result is stripped.
    """
    kinds = filenamelexer.ItemType
    openers = (kinds.LeftBrace, kinds.LeftParen, kinds.LeftSBrace)
    closers = (kinds.RightBrace, kinds.RightParen, kinds.RightSBrace)
    no_space_after = (kinds.Space,) + openers

    # Remove used items and irrelevant items e.g. the series and useless operators
    leftovers = [tok for tok in p.input if tok not in p.irrelevant and tok not in p.used_items]
    count = len(leftovers)

    text = ""
    for idx, tok in enumerate(leftovers):
        has_previous = idx > 0

        # No double space or space next to parentheses
        if tok.typ in (kinds.Space, kinds.Skip):
            if (
                has_previous
                and leftovers[idx - 1].typ not in no_space_after
                and idx + 1 < count
                and leftovers[idx + 1].typ not in closers
            ):
                text += " "
            continue

        # A closing bracket directly after an opening one: strip the useless opening bracket(s)
        if tok.typ in closers and has_previous and leftovers[idx - 1].typ in openers:
            text = text.rstrip("[{(")
            continue

        # Add the next item
        text += tok.val

    # Remove empty parentheses
    text = re.sub(r"[\[{(]+[]})]+", "", text)
    return text.strip()
def parse_info_specifier(p: Parser):
    """Handle an info specifier (a volume marker or 'of') followed by a number.

    The item at ``p.pos`` is the specifier. An optional Space after it is
    consumed; if the next item is a Number, or a Text item that the ``t2d``
    converter turns into a numeric string, that item is consumed as the
    number and interpreted:

    * ``volume`` / ``vol`` / ``vol.`` / ``v``: stores the converted number as
      ``filename_info["volume"]``.
    * ``of`` while inside brackets (``p.in_something > 0``): stores an
      ``issue_count`` when no issue number is known yet or when the number
      found before 'of' is the issue number; otherwise, if that preceding
      number is in neither the series nor the title parts, it is stored as
      the ``volume`` and the count as ``volume_count``.
    * ``of`` outside brackets: rewinds ``p.pos`` to the number preceding
      'of' (when there is one) so the phrase can be parsed as title text.

    Returns the next state function: ``parse_series`` when not inside
    brackets, otherwise ``parse``.
    """
    item = p.input[p.pos]
    index = p.pos

    if p.peek().typ == filenamelexer.ItemType.Space:
        p.get()

    # Handles 'book 3' and 'book three'
    if p.peek().typ == filenamelexer.ItemType.Number or (
        p.peek().typ == filenamelexer.ItemType.Text and t2d.convert(p.peek().val).isnumeric()
    ):
        number = p.get()
        specifier = item.val.lower()

        if specifier in ["volume", "vol", "vol.", "v"]:
            p.filename_info["volume"] = t2do.convert(number.val)
            p.used_items.append(item)
            p.used_items.append(number)

        # 'of' is only special if it is inside a parenthesis.
        elif specifier == "of":
            # Number preceding 'of' (the 03 in '03 (of 6)'); None when the
            # nearest significant item before 'of' is not a number.
            i = get_number(p, index)
            if p.in_something > 0:
                if p.issue_number_at is None or (i is not None and p.issue_number_at == i.pos):
                    # Either no issue number is known yet
                    # (TODO: Figure out what to do here if it ever happens)
                    # or the preceding number is definitely the issue number.
                    p.filename_info["issue_count"] = str(int(t2do.convert(number.val)))
                    p.used_items.append(item)
                    p.used_items.append(number)

                # The preceding number is not the issue number and is in neither
                # the series nor the title: assume it is the volume number and
                # this is the volume count.
                elif i is not None and i not in p.series_parts and i not in p.title_parts:
                    p.filename_info["volume"] = i.val
                    p.filename_info["volume_count"] = str(int(t2do.convert(number.val)))
                    p.used_items.append(i)
                    p.used_items.append(item)
                    p.used_items.append(number)

                # Otherwise (including no preceding number at all, which used to
                # raise AttributeError on ``i.pos``) nothing is recorded.
                # TODO: Figure out what to do here if it ever happens

            # Lets 'The Wrath of Foobar-Man, Part 1 of 2' parse correctly as the title
            elif i is not None:
                p.pos = p.input.index(i)

    if not p.in_something:
        return parse_series
    return parse
def get_number(p: Parser, index: int):
    """Return the number an 'of' count refers to, e.g. the 03 in '03 of 6'.

    Walks backward from ``index`` through ``p.input`` (for names such as
    '03 (of 6)' or '008 title 03 (of 6)'), stepping over spaces and opening
    brackets. The first other item decides the result: it is returned when
    it is a Number item, otherwise the search gives up and returns None.
    None is also returned when nothing but ignorable items precede ``index``.
    """
    item_type = filenamelexer.ItemType
    ignorable = (
        item_type.LeftParen,
        item_type.LeftBrace,
        item_type.LeftSBrace,
        item_type.Space,
    )
    for candidate in reversed(p.input[:index]):
        if candidate.typ in ignorable:
            # Not interesting; keep looking for a number that may differ from the issue number
            continue
        # First significant item: either it is our number or there is none for this count
        return candidate if candidate.typ == item_type.Number else None
    return None
def join_title(lst: list[filenamelexer.Item]):
    """Concatenate lexer items into a title string with sensible spacing.

    Each item's ``val`` is appended and followed by one space, except that
    no space is added after an Operator item, after the last item, between
    an Honorific and a following Dot, or before an Operator or Symbol item.
    A comma in the final position is left out entirely.
    """
    item_type = filenamelexer.ItemType
    glued_to_previous = (item_type.Operator, item_type.Symbol)
    final_index = len(lst) - 1

    pieces = []
    for position, token in enumerate(lst):
        is_final = position == final_index
        # We ignore commas on the end
        if is_final and token.val == ",":
            continue

        pieces.append(token.val)

        # No space after operators and no trailing space
        if token.typ == item_type.Operator or is_final:
            continue

        following = lst[position + 1].typ
        # No space after honorifics with a dot
        if token.typ == item_type.Honorific and following == item_type.Dot:
            continue
        # No space if the next item is an operator or symbol
        if following in glued_to_previous:
            continue

        pieces.append(" ")

    return "".join(pieces)
def Parse(
    lexer_result: list[filenamelexer.Item],
    first_is_alt=False,
    remove_c2c=False,
    remove_fcbd=False,
    remove_publisher=False,
):
    """Build a Parser for the lexed items, run it and return it.

    All arguments are forwarded to ``Parser`` by keyword. The Parser object
    itself is returned after ``run()`` so callers can read its results
    (for example ``filename_info``).
    """
    options = {
        "first_is_alt": first_is_alt,
        "remove_c2c": remove_c2c,
        "remove_fcbd": remove_fcbd,
        "remove_publisher": remove_publisher,
    }
    parser = Parser(lexer_result=lexer_result, **options)
    parser.run()
    return parser

View File

@ -32,11 +32,13 @@ logger = logging.getLogger(__name__)
class AutoTagMatchWindow(QtWidgets.QDialog):
volume_id = 0
def __init__(self, parent, match_set_list: List[MultipleMatch], style, fetch_func):
def __init__(self, parent, match_set_list: List[MultipleMatch], style, fetch_func, settings):
super().__init__(parent)
uic.loadUi(ComicTaggerSettings.get_ui_file("matchselectionwindow.ui"), self)
self.settings = settings
self.current_match_set: Optional[MultipleMatch] = None
self.altCoverWidget = CoverImageWidget(self.altCoverContainer, CoverImageWidget.AltCoverMode)
@ -221,7 +223,12 @@ class AutoTagMatchWindow(QtWidgets.QDialog):
md = ca.read_metadata(self.style)
if md.is_empty:
md = ca.metadata_from_filename()
md = ca.metadata_from_filename(
self.settings.complicated_parser,
self.settings.remove_c2c,
self.settings.remove_fcbd,
self.settings.remove_publisher,
)
# now get the particular issue data
cv_md = self.fetch_func(match)

View File

@ -101,7 +101,7 @@ def display_match_set_for_choice(label, match_set: MultipleMatch, opts, settings
# save the data!
# we know at this point, that the file is all good to go
ca = match_set.ca
md = create_local_metadata(opts, ca, ca.has_metadata(opts.data_style))
md = create_local_metadata(opts, ca, ca.has_metadata(opts.data_style), settings)
cv_md = actual_issue_data_fetch(match_set.matches[int(i)], settings, opts)
md.overlay(cv_md)
actual_metadata_save(ca, opts, md)
@ -164,13 +164,17 @@ def cli_mode(opts, settings):
post_process_matches(match_results, opts, settings)
def create_local_metadata(opts, ca: ComicArchive, has_desired_tags):
def create_local_metadata(opts, ca: ComicArchive, has_desired_tags, settings):
md = GenericMetadata()
md.set_default_page_list(ca.get_number_of_pages())
# now, overlay the parsed filename info
if opts.parse_filename:
md.overlay(ca.metadata_from_filename())
md.overlay(
ca.metadata_from_filename(
settings.complicated_parser, settings.remove_c2c, settings.remove_fcbd, settings.remove_publisher
)
)
if has_desired_tags:
md = ca.read_metadata(opts.data_style)
@ -319,7 +323,7 @@ def process_file_cli(filename, opts, settings, match_results: OnlineMatchResults
if batch_mode:
print(f"Processing {ca.path}...")
md = create_local_metadata(opts, ca, has[opts.data_style])
md = create_local_metadata(opts, ca, has[opts.data_style], settings)
if md.issue is None or md.issue == "":
if opts.assume_issue_is_one_if_not_set:
md.issue = "1"
@ -430,7 +434,7 @@ def process_file_cli(filename, opts, settings, match_results: OnlineMatchResults
else:
use_tags = False
md = create_local_metadata(opts, ca, use_tags)
md = create_local_metadata(opts, ca, use_tags, settings)
if md.series is None:
logger.error(msg_hdr + "Can't rename without series name")

View File

@ -63,6 +63,7 @@ class IssueIdentifier:
result_multiple_good_matches = 5
def __init__(self, comic_archive: ComicArchive, settings):
self.settings = settings
self.comic_archive: ComicArchive = comic_archive
self.image_hasher = 1
@ -192,7 +193,12 @@ class IssueIdentifier:
internal_metadata = ca.read_cbi()
# try to get some metadata from filename
md_from_filename = ca.metadata_from_filename()
md_from_filename = ca.metadata_from_filename(
self.settings.complicated_parser,
self.settings.remove_c2c,
self.settings.remove_fcbd,
self.settings.remove_publisher,
)
# preference order:
# 1. Additional metadata

View File

@ -81,7 +81,12 @@ class RenameWindow(QtWidgets.QDialog):
md = ca.read_metadata(self.data_style)
if md.is_empty:
md = ca.metadata_from_filename(self.settings.parse_scan_info)
md = ca.metadata_from_filename(
self.settings.complicated_parser,
self.settings.remove_c2c,
self.settings.remove_fcbd,
self.settings.remove_publisher,
)
self.renamer.set_metadata(md)
self.renamer.move = self.settings.rename_move_dir

View File

@ -88,7 +88,10 @@ class ComicTaggerSettings:
self.ask_about_usage_stats = True
# filename parsing settings
self.parse_scan_info = True
self.complicated_parser = False
self.remove_c2c = False
self.remove_fcbd = False
self.remove_publisher = False
# Comic Vine settings
self.use_series_start_as_volume = False
@ -161,7 +164,10 @@ class ComicTaggerSettings:
self.ask_about_usage_stats = True
# filename parsing settings
self.parse_scan_info = True
self.complicated_parser = False
self.remove_c2c = False
self.remove_fcbd = False
self.remove_publisher = False
# Comic Vine settings
self.use_series_start_as_volume = False
@ -287,8 +293,14 @@ class ComicTaggerSettings:
if self.config.has_option("identifier", "id_publisher_filter"):
self.id_publisher_filter = self.config.get("identifier", "id_publisher_filter")
if self.config.has_option("filenameparser", "parse_scan_info"):
self.parse_scan_info = self.config.getboolean("filenameparser", "parse_scan_info")
if self.config.has_option("filenameparser", "complicated_parser"):
self.complicated_parser = self.config.getboolean("filenameparser", "complicated_parser")
if self.config.has_option("filenameparser", "remove_c2c"):
self.remove_c2c = self.config.getboolean("filenameparser", "remove_c2c")
if self.config.has_option("filenameparser", "remove_fcbd"):
self.remove_fcbd = self.config.getboolean("filenameparser", "remove_fcbd")
if self.config.has_option("filenameparser", "remove_publisher"):
self.remove_publisher = self.config.getboolean("filenameparser", "remove_publisher")
if self.config.has_option("dialogflags", "ask_about_cbi_in_rar"):
self.ask_about_cbi_in_rar = self.config.getboolean("dialogflags", "ask_about_cbi_in_rar")
@ -419,7 +431,10 @@ class ComicTaggerSettings:
if not self.config.has_section("filenameparser"):
self.config.add_section("filenameparser")
self.config.set("filenameparser", "parse_scan_info", self.parse_scan_info)
self.config.set("filenameparser", "complicated_parser", self.complicated_parser)
self.config.set("filenameparser", "remove_c2c", self.remove_c2c)
self.config.set("filenameparser", "remove_fcbd", self.remove_fcbd)
self.config.set("filenameparser", "remove_publisher", self.remove_publisher)
if not self.config.has_section("comicvine"):
self.config.add_section("comicvine")

View File

@ -182,6 +182,7 @@ class SettingsWindow(QtWidgets.QDialog):
self.cbxMoveFiles.clicked.connect(self.rename_test)
self.cbxRenameStrict.clicked.connect(self.rename_test)
self.leDirectory.textEdited.connect(self.rename_test)
self.cbxComplicatedParser.clicked.connect(self.switch_parser)
def rename_test(self):
self.rename__test(self.leRenameTemplate.text())
@ -199,6 +200,13 @@ class SettingsWindow(QtWidgets.QDialog):
self.rename_error = e
self.lblRenameTest.setText(str(e))
def switch_parser(self):
    """Enable the C2C/FCBD/publisher removal checkboxes only while the
    "complicated" parser checkbox is checked; disable them otherwise."""
    use_complicated = self.cbxComplicatedParser.isChecked()
    for option_box in (self.cbxRemoveC2C, self.cbxRemoveFCBD, self.cbxRemovePublisher):
        option_box.setEnabled(use_complicated)
def settings_to_form(self):
# Copy values from settings to form
self.leRarExePath.setText(self.settings.rar_exe_path)
@ -208,8 +216,11 @@ class SettingsWindow(QtWidgets.QDialog):
if self.settings.check_for_new_version:
self.cbxCheckForNewVersion.setCheckState(QtCore.Qt.CheckState.Checked)
if self.settings.parse_scan_info:
self.cbxParseScanInfo.setCheckState(QtCore.Qt.CheckState.Checked)
self.cbxComplicatedParser.setChecked(self.settings.complicated_parser)
self.cbxRemoveC2C.setChecked(self.settings.remove_c2c)
self.cbxRemoveFCBD.setChecked(self.settings.remove_fcbd)
self.cbxRemovePublisher.setChecked(self.settings.remove_publisher)
self.switch_parser()
if self.settings.use_series_start_as_volume:
self.cbxUseSeriesStartAsVolume.setCheckState(QtCore.Qt.CheckState.Checked)
@ -291,7 +302,10 @@ class SettingsWindow(QtWidgets.QDialog):
self.settings.id_length_delta_thresh = int(self.leNameLengthDeltaThresh.text())
self.settings.id_publisher_filter = str(self.tePublisherFilter.toPlainText())
self.settings.parse_scan_info = self.cbxParseScanInfo.isChecked()
self.settings.complicated_parser = self.cbxComplicatedParser.isChecked()
self.settings.remove_c2c = self.cbxRemoveC2C.isChecked()
self.settings.remove_fcbd = self.cbxRemoveFCBD.isChecked()
self.settings.remove_publisher = self.cbxRemovePublisher.isChecked()
self.settings.use_series_start_as_volume = self.cbxUseSeriesStartAsVolume.isChecked()
self.settings.clear_form_before_populating_from_cv = self.cbxClearFormBeforePopulating.isChecked()

View File

@ -557,7 +557,12 @@ Please choose options below, and select OK.
def actual_load_current_archive(self):
if self.metadata.is_empty:
self.metadata = self.comic_archive.metadata_from_filename(self.settings.parse_scan_info)
self.metadata = self.comic_archive.metadata_from_filename(
self.settings.complicated_parser,
self.settings.remove_c2c,
self.settings.remove_fcbd,
remove_publisher=self.settings.remove_publisher,
)
if len(self.metadata.pages) == 0:
self.metadata.set_default_page_list(self.comic_archive.get_number_of_pages())
@ -928,7 +933,12 @@ Please choose options below, and select OK.
if self.comic_archive is not None:
# copy the form onto metadata object
self.form_to_metadata()
new_metadata = self.comic_archive.metadata_from_filename(self.settings.parse_scan_info)
new_metadata = self.comic_archive.metadata_from_filename(
self.settings.complicated_parser,
self.settings.remove_c2c,
self.settings.remove_fcbd,
remove_publisher=self.settings.remove_publisher,
)
if new_metadata is not None:
self.metadata.overlay(new_metadata)
self.metadata_to_form()
@ -1654,7 +1664,12 @@ Please choose options below, and select OK.
# read in metadata, and parse file name if not there
md = ca.read_metadata(self.save_data_style)
if md.is_empty:
md = ca.metadata_from_filename(self.settings.parse_scan_info)
md = ca.metadata_from_filename(
self.settings.complicated_parser,
self.settings.remove_c2c,
self.settings.remove_fcbd,
remove_publisher=self.settings.remove_publisher,
)
if dlg.ignore_leading_digits_in_filename and md.series is not None:
# remove all leading numbers
md.series = re.sub(r"([\d.]*)(.*)", "\\2", md.series)
@ -1846,7 +1861,9 @@ Please choose options below, and select OK to Auto-Tag.
match_results.multiple_matches.extend(match_results.low_confidence_matches)
if reply == QtWidgets.QMessageBox.StandardButton.Yes:
matchdlg = AutoTagMatchWindow(self, match_results.multiple_matches, style, self.actual_issue_data_fetch)
matchdlg = AutoTagMatchWindow(
self, match_results.multiple_matches, style, self.actual_issue_data_fetch, self.settings
)
matchdlg.setModal(True)
matchdlg.exec()
self.fileSelectionList.update_selected_rows()

View File

@ -229,19 +229,55 @@
<attribute name="title">
<string>Filename Parser</string>
</attribute>
<widget class="QCheckBox" name="cbxParseScanInfo">
<property name="geometry">
<rect>
<x>30</x>
<y>30</y>
<width>421</width>
<height>25</height>
</rect>
</property>
<property name="text">
<string>Parse Scan Info From Filename (Experimental)</string>
</property>
</widget>
<layout class="QVBoxLayout" name="verticalLayout_6">
<item>
<widget class="QGroupBox" name="groupBox_2">
<layout class="QVBoxLayout" name="verticalLayout_7">
<item>
<widget class="QCheckBox" name="cbxComplicatedParser">
<property name="text">
<string>Use &quot;Complicated&quot; Parser</string>
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="cbxRemoveC2C">
<property name="text">
<string>Remove 'C2C' from Scan Info</string>
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="cbxRemoveFCBD">
<property name="text">
<string>Remove 'FCBD' from Scan Info</string>
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="cbxRemovePublisher">
<property name="text">
<string>Remove Publisher from filename</string>
</property>
</widget>
</item>
</layout>
</widget>
</item>
<item>
<spacer name="verticalSpacer_4">
<property name="orientation">
<enum>Qt::Vertical</enum>
</property>
<property name="sizeHint" stdset="0">
<size>
<width>20</width>
<height>40</height>
</size>
</property>
</spacer>
</item>
</layout>
</widget>
<widget class="QWidget" name="tab_3">
<attribute name="title">

View File

@ -5,3 +5,4 @@ requests==2.*
pathvalidate
pycountry
py7zr
text2digits

View File

@ -1,35 +1,122 @@
import pytest
fnames = [
(
"Monster_Island_v1_2__repaired__c2c.cbz",
"stuff",
"batman 3 title (DC).cbz",
"honorific and publisher in series",
{
"issue": "3",
"series": "batman",
"title": "title",
"publisher": "DC",
"volume": "",
"year": "",
"remainder": "",
"issue_count": "",
"alternate": "",
},
),
(
"batman 3 title DC.cbz",
"honorific and publisher in series",
{
"issue": "3",
"series": "batman",
"title": "title DC",
"publisher": "DC",
"volume": "",
"year": "",
"remainder": "",
"issue_count": "",
"alternate": "",
},
),
(
"ms. Marvel 3.cbz",
"honorific and publisher in series",
{
"issue": "3",
"series": "ms. Marvel",
"title": "",
"publisher": "Marvel",
"volume": "",
"year": "",
"remainder": "",
"issue_count": "",
"alternate": "",
},
),
(
"january jones 2.cbz",
"month in series",
{
"issue": "2",
"series": "january jones",
"title": "",
"volume": "",
"year": "",
"remainder": "",
"issue_count": "",
"alternate": "",
},
),
(
"52.cbz",
"issue number only",
{
"issue": "52",
"series": "",
"title": "",
"volume": "",
"year": "",
"remainder": "",
"issue_count": "",
"alternate": "",
},
),
(
"52 Monster_Island_v1_2__repaired__c2c.cbz",
"leading alternate",
{
"issue": "2",
"series": "Monster Island",
"title": "The Wrath of Foobar-Man, Part 1 of 2",
"title": "",
"volume": "1",
"year": "",
"remainder": "repaired c2c",
"remainder": "repaired",
"issue_count": "",
"alternate": "52",
"c2c": True,
},
),
(
"Monster_Island_v1_2__repaired__c2c.cbz",
"Example from userguide",
{
"issue": "2",
"series": "Monster Island",
"title": "",
"volume": "1",
"year": "",
"remainder": "repaired",
"issue_count": "",
"c2c": True,
},
),
(
"Monster Island v1 3 (1957) -- The Revenge Of King Klong (noads).cbz",
"stuff",
"Example from userguide",
{
"issue": "3",
"series": "Monster Island",
"title": "The Wrath of Foobar-Man, Part 1 of 2",
"title": "",
"volume": "1",
"year": "1957",
"remainder": "The Revenge Of King Klong (noads)",
"issue_count": "",
},
),
pytest.param(
(
"Foobar-Man Annual 121 - The Wrath of Foobar-Man, Part 1 of 2.cbz",
"stuff",
"Example from userguide",
{
"issue": "121",
"series": "Foobar-Man Annual",
@ -38,12 +125,12 @@ fnames = [
"year": "",
"remainder": "",
"issue_count": "",
"annual": True,
},
marks=pytest.mark.xfail,
),
(
"Plastic Man v1 002 (1942).cbz",
"stuff",
"Example from userguide",
{
"issue": "2",
"series": "Plastic Man",
@ -56,7 +143,7 @@ fnames = [
),
(
"Blue Beetle 02.cbr",
"stuff",
"Example from userguide",
{
"issue": "2",
"series": "Blue Beetle",
@ -69,7 +156,7 @@ fnames = [
),
(
"Monster Island vol. 2 #2.cbz",
"stuff",
"Example from userguide",
{
"issue": "2",
"series": "Monster Island",
@ -82,7 +169,7 @@ fnames = [
),
(
"Crazy Weird Comics 2 (of 2) (1969).rar",
"stuff",
"Example from userguide",
{
"issue": "2",
"series": "Crazy Weird Comics",
@ -95,7 +182,7 @@ fnames = [
),
(
"Super Strange Yarns (1957) #92 (1969).cbz",
"stuff",
"Example from userguide",
{
"issue": "92",
"series": "Super Strange Yarns",
@ -108,7 +195,7 @@ fnames = [
),
(
"Action Spy Tales v1965 #3.cbr",
"stuff",
"Example from userguide",
{
"issue": "3",
"series": "Action Spy Tales",
@ -119,9 +206,9 @@ fnames = [
"issue_count": "",
},
),
pytest.param(
(
" X-Men-V1-067.cbr",
"hyphen separated with hyphen in series",
"hyphen separated with hyphen in series", # only parses corretly because v1 designates the volume
{
"issue": "67",
"series": "X-Men",
@ -131,7 +218,6 @@ fnames = [
"remainder": "",
"issue_count": "",
},
marks=pytest.mark.xfail,
),
(
"Amazing Spider-Man 078.BEY (2022) (Digital) (Zone-Empire).cbr",
@ -139,15 +225,16 @@ fnames = [
{
"issue": "78.BEY",
"series": "Amazing Spider-Man",
"title": "",
"volume": "",
"year": "2022",
"remainder": "(Digital) (Zone-Empire)",
"issue_count": "",
},
),
pytest.param(
(
"Angel Wings 02 - Black Widow (2015) (Scanlation) (phillywilly).cbr",
"title after-issue",
"title after issue",
{
"issue": "2",
"series": "Angel Wings",
@ -157,11 +244,10 @@ fnames = [
"remainder": "(Scanlation) (phillywilly)",
"issue_count": "",
},
marks=pytest.mark.xfail,
),
pytest.param(
(
"Angel Wings #02 - Black Widow (2015) (Scanlation) (phillywilly).cbr",
"title after-#issue",
"title after #issue",
{
"issue": "2",
"series": "Angel Wings",
@ -171,20 +257,19 @@ fnames = [
"remainder": "(Scanlation) (phillywilly)",
"issue_count": "",
},
marks=pytest.mark.xfail,
),
pytest.param(
(
"Aquaman - Green Arrow - Deep Target 01 (of 07) (2021) (digital) (Son of Ultron-Empire).cbr",
"issue count",
{
"issue": "1",
"series": "Aquaman - Green Arrow - Deep Target",
"title": "",
"volume": "",
"year": "2021",
"issue_count": "7",
"remainder": "(digital) (Son of Ultron-Empire)",
},
marks=pytest.mark.xfail,
),
(
"Aquaman 80th Anniversary 100-Page Super Spectacular (2021) 001 (2021) (Digital) (BlackManta-Empire).cbz",
@ -192,37 +277,39 @@ fnames = [
{
"issue": "1",
"series": "Aquaman 80th Anniversary 100-Page Super Spectacular",
"title": "",
"volume": "2021",
"year": "2021",
"remainder": "(Digital) (BlackManta-Empire)",
"issue_count": "",
},
),
pytest.param(
(
"Avatar - The Last Airbender - The Legend of Korra (FCBD 2021) (Digital) (mv-DCP).cbr",
"FCBD date",
{
"issue": "",
"series": "Avatar - The Last Airbender - The Legend of Korra",
"title": "",
"volume": "",
"year": "2021",
"remainder": "(FCBD) (Digital) (mv-DCP)",
"remainder": "(Digital) (mv-DCP)",
"issue_count": "",
"fcbd": True,
},
marks=pytest.mark.xfail,
),
pytest.param(
(
"Avengers By Brian Michael Bendis v03 (2013) (Digital) (F2) (Kileko-Empire).cbz",
"volume without issue",
{
"issue": "",
"series": "Avengers By Brian Michael Bendis",
"title": "",
"volume": "3",
"year": "2013",
"remainder": "(Digital) (F2) (Kileko-Empire)",
"issue_count": "",
},
marks=pytest.mark.xfail,
),
(
"Batman '89 (2021) (Webrip) (The Last Kryptonian-DCP).cbr",
@ -230,6 +317,7 @@ fnames = [
{
"issue": "",
"series": "Batman '89",
"title": "",
"volume": "",
"year": "2021",
"remainder": "(Webrip) (The Last Kryptonian-DCP)",
@ -242,6 +330,7 @@ fnames = [
{
"issue": "20",
"series": "Batman - Superman",
"title": "",
"volume": "",
"year": "2021",
"remainder": "(digital) (NeverAngel-Empire)",
@ -254,6 +343,7 @@ fnames = [
{
"issue": "9",
"series": "Black Widow",
"title": "",
"volume": "",
"year": "2021",
"remainder": "(Digital) (Zone-Empire)",
@ -266,26 +356,28 @@ fnames = [
{
"issue": "6",
"series": "Blade Runner 2029",
"title": "",
"volume": "",
"year": "2021",
"remainder": "(3 covers) (digital) (Son of Ultron-Empire)",
"issue_count": "",
},
),
pytest.param(
(
"Blade Runner Free Comic Book Day 2021 (2021) (digital-Empire).cbr",
"FCBD year and (year)",
{
"issue": "",
"series": "Blade Runner Free Comic Book Day 2021",
"title": "",
"volume": "",
"year": "2021",
"remainder": "(digital-Empire)",
"issue_count": "",
"fcbd": True,
},
marks=pytest.mark.xfail,
),
pytest.param(
(
"Bloodshot Book 03 (2020) (digital) (Son of Ultron-Empire).cbr",
"book",
{
@ -297,9 +389,21 @@ fnames = [
"remainder": "(digital) (Son of Ultron-Empire)",
"issue_count": "",
},
marks=pytest.mark.xfail,
),
pytest.param(
(
"book of eli (2020) (digital) (Son of Ultron-Empire).cbr",
"book",
{
"issue": "",
"series": "book of eli",
"title": "",
"volume": "",
"year": "2020",
"remainder": "(digital) (Son of Ultron-Empire)",
"issue_count": "",
},
),
(
"Cyberpunk 2077 - You Have My Word 02 (2021) (digital) (Son of Ultron-Empire).cbr",
"title",
{
@ -311,9 +415,8 @@ fnames = [
"issue_count": "",
"remainder": "(digital) (Son of Ultron-Empire)",
},
marks=pytest.mark.xfail,
),
pytest.param(
(
"Elephantmen 2259 008 - Simple Truth 03 (of 06) (2021) (digital) (Son of Ultron-Empire).cbr",
"volume count",
{
@ -326,9 +429,8 @@ fnames = [
"remainder": "(digital) (Son of Ultron-Empire)",
"issue_count": "",
},
marks=pytest.mark.xfail,
),
pytest.param(
(
"Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021) (digital) (Son of Ultron-Empire).cbr",
"volume count",
{
@ -341,20 +443,20 @@ fnames = [
"remainder": "(digital) (Son of Ultron-Empire)",
"issue_count": "",
},
marks=pytest.mark.xfail,
),
pytest.param(
(
"Free Comic Book Day - Avengers.Hulk (2021) (2048px) (db).cbz",
"'.' in name",
{
"issue": "",
"series": "Free Comic Book Day - Avengers Hulk",
"title": "",
"volume": "",
"year": "2021",
"remainder": "(2048px) (db)",
"issue_count": "",
"fcbd": True,
},
marks=pytest.mark.xfail,
),
(
"Goblin (2021) (digital) (Son of Ultron-Empire).cbr",
@ -362,37 +464,41 @@ fnames = [
{
"issue": "",
"series": "Goblin",
"title": "",
"volume": "",
"year": "2021",
"remainder": "(digital) (Son of Ultron-Empire)",
"issue_count": "",
},
),
pytest.param(
(
"Marvel Previews 002 (January 2022) (Digital-Empire).cbr",
"(month year)",
{
"issue": "2",
"series": "Marvel Previews",
"title": "",
"publisher": "Marvel",
"volume": "",
"year": "2022",
"remainder": "(Digital-Empire)",
"issue_count": "",
},
marks=pytest.mark.xfail,
),
pytest.param(
(
"Marvel Two In One V1 090 c2c (Comixbear-DCP).cbr",
"volume issue ctc",
{
"issue": "90",
"series": "Marvel Two In One",
"title": "",
"publisher": "Marvel",
"volume": "1",
"year": "",
"remainder": "c2c (Comixbear-DCP)",
"remainder": "(Comixbear-DCP)",
"issue_count": "",
"c2c": True,
},
marks=pytest.mark.xfail,
),
(
"Marvel Two In One V1 #090 c2c (Comixbear-DCP).cbr",
@ -400,24 +506,27 @@ fnames = [
{
"issue": "90",
"series": "Marvel Two In One",
"title": "",
"publisher": "Marvel",
"volume": "1",
"year": "",
"remainder": "c2c (Comixbear-DCP)",
"remainder": "(Comixbear-DCP)",
"issue_count": "",
"c2c": True,
},
),
pytest.param(
(
"Star Wars - War of the Bounty Hunters - IG-88 (2021) (Digital) (Kileko-Empire).cbz",
"number ends series, no-issue",
{
"issue": "",
"series": "Star Wars - War of the Bounty Hunters - IG-88",
"title": "",
"volume": "",
"year": "2021",
"remainder": "(Digital) (Kileko-Empire)",
"issue_count": "",
},
marks=pytest.mark.xfail,
),
(
"Star Wars - War of the Bounty Hunters - IG-88 #1 (2021) (Digital) (Kileko-Empire).cbz",
@ -425,6 +534,7 @@ fnames = [
{
"issue": "1",
"series": "Star Wars - War of the Bounty Hunters - IG-88",
"title": "",
"volume": "",
"year": "2021",
"remainder": "(Digital) (Kileko-Empire)",
@ -437,39 +547,41 @@ fnames = [
{
"issue": "58",
"series": "The Defenders",
"title": "",
"volume": "1",
"year": "1978",
"remainder": "(digital)",
"issue_count": "",
},
),
pytest.param(
(
"The Defenders v1 Annual 01 (1976) (Digital) (Minutemen-Slayer).cbr",
" v in series",
{
"issue": "1",
"series": "The Defenders Annual",
"title": "",
"volume": "1",
"year": "1976",
"remainder": "(Digital) (Minutemen-Slayer)",
"issue_count": "",
"annual": True,
},
marks=pytest.mark.xfail,
),
pytest.param(
(
"The Magic Order 2 06 (2022) (Digital) (Zone-Empire)[__913302__].cbz",
"ending id",
{
"issue": "6",
"series": "The Magic Order 2",
"title": "",
"volume": "",
"year": "2022",
"remainder": "(Digital) (Zone-Empire)[__913302__]",
"remainder": "(Digital) (Zone-Empire)[913302]", # Don't really care about double underscores
"issue_count": "",
},
marks=pytest.mark.xfail,
),
pytest.param(
(
"Wonder Woman 001 Wonder Woman Day Special Edition (2021) (digital-Empire).cbr",
"issue separates title",
{
@ -481,9 +593,8 @@ fnames = [
"remainder": "(digital-Empire)",
"issue_count": "",
},
marks=pytest.mark.xfail,
),
pytest.param(
(
"Wonder Woman #001 Wonder Woman Day Special Edition (2021) (digital-Empire).cbr",
"issue separates title",
{
@ -495,46 +606,47 @@ fnames = [
"remainder": "(digital-Empire)",
"issue_count": "",
},
marks=pytest.mark.xfail,
),
pytest.param(
(
"Wonder Woman 49 DC Sep-Oct 1951 digital [downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire).cbz",
"date-range, no paren, braces",
{
"issue": "49",
"series": "Wonder Woman",
"title": "digital", # Don't have a way to get rid of this
"publisher": "DC",
"volume": "",
"year": "1951",
"remainder": "(Shadowcat-Empire)",
"remainder": "[downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire)",
"issue_count": "",
},
marks=pytest.mark.xfail,
),
pytest.param(
(
"Wonder Woman #49 DC Sep-Oct 1951 digital [downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire).cbz",
"date-range, no paren, braces",
{
"issue": "49",
"series": "Wonder Woman",
"title": "digital", # Don't have a way to get rid of this
"publisher": "DC",
"volume": "",
"year": "1951",
"remainder": "(Shadowcat-Empire)",
"remainder": "[downsized, lightened, 4 missing story pages restored] (Shadowcat-Empire)",
"issue_count": "",
},
marks=pytest.mark.xfail,
),
pytest.param(
(
"X-Men, 2021-08-04 (#02) (digital) (Glorith-HD).cbz",
"full-date, issue in parenthesis",
{
"issue": "2",
"series": "X-Men",
"title": "",
"volume": "",
"year": "2021",
"remainder": "(digital) (Glorith-HD)",
"issue_count": "",
},
marks=pytest.mark.xfail,
),
]

View File

@ -4,13 +4,39 @@ from filenames import fnames
import comicapi.filenameparser
@pytest.mark.parametrize("filename,reason,expected", fnames)
def test_file_name_parser_new(filename, reason, expected):
    """Check the lexer-based parser against every entry in ``fnames``.

    The parser is run with ``first_is_alt`` and all ``remove_*`` options
    enabled. Keys that an ``fnames`` entry does not spell out are expected
    to hold their empty default ("" or False); the parser's ``archive`` key
    is not compared.
    """
    p = comicapi.filenameparser.Parse(
        comicapi.filenamelexer.Lex(filename).items,
        first_is_alt=True,
        remove_c2c=True,
        remove_fcbd=True,
        remove_publisher=True,
    )
    fp = p.filename_info
    fp.pop("archive", None)

    # Merge into a fresh dict rather than writing the defaults into
    # ``expected``: those dicts live in ``fnames`` and are shared with the
    # other tests parametrized over the same list.
    defaults = {
        "alternate": "",
        "publisher": "",
        "volume_count": "",
        "fcbd": False,
        "c2c": False,
        "annual": False,
    }
    assert fp == {**defaults, **expected}
@pytest.mark.parametrize("filename,reason,expected", fnames)
def test_file_name_parser(filename, reason, expected):
p = comicapi.filenameparser.FileNameParser()
p.parse_filename(filename)
fp = p.__dict__
for s in ["title"]:
for s in ["title", "alternate", "publisher", "fcbd", "c2c", "annual", "volume_count"]:
if s in expected:
del expected[s]
if fp != expected:
pytest.xfail("old parser")
assert fp == expected