Merge branch 'filenameParser' into develop
This commit is contained in:
commit
42da653b6e
@ -42,10 +42,10 @@ try:
|
|||||||
except ImportError:
|
except ImportError:
|
||||||
pil_available = False
|
pil_available = False
|
||||||
|
|
||||||
|
from comicapi import filenamelexer, filenameparser
|
||||||
from comicapi.comet import CoMet
|
from comicapi.comet import CoMet
|
||||||
from comicapi.comicbookinfo import ComicBookInfo
|
from comicapi.comicbookinfo import ComicBookInfo
|
||||||
from comicapi.comicinfoxml import ComicInfoXml
|
from comicapi.comicinfoxml import ComicInfoXml
|
||||||
from comicapi.filenameparser import FileNameParser
|
|
||||||
from comicapi.genericmetadata import GenericMetadata, PageType
|
from comicapi.genericmetadata import GenericMetadata, PageType
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@ -1127,25 +1127,46 @@ class ComicArchive:
|
|||||||
data = self.get_page(idx)
|
data = self.get_page(idx)
|
||||||
p["ImageSize"] = str(len(data))
|
p["ImageSize"] = str(len(data))
|
||||||
|
|
||||||
def metadata_from_filename(self, parse_scan_info=True):
|
def metadata_from_filename(
|
||||||
|
self, complicated_parser=False, remove_c2c=False, remove_fcbd=False, remove_publisher=False
|
||||||
|
):
|
||||||
|
|
||||||
metadata = GenericMetadata()
|
metadata = GenericMetadata()
|
||||||
|
|
||||||
fnp = FileNameParser()
|
if complicated_parser:
|
||||||
|
lex = filenamelexer.Lex(self.path)
|
||||||
|
p = filenameparser.Parse(
|
||||||
|
lex.items, remove_c2c=remove_c2c, remove_fcbd=remove_fcbd, remove_publisher=remove_publisher
|
||||||
|
)
|
||||||
|
metadata.alternate_number = p.filename_info["alternate"] or None
|
||||||
|
metadata.issue = p.filename_info["issue"] or None
|
||||||
|
metadata.issue_count = p.filename_info["issue_count"] or None
|
||||||
|
metadata.publisher = p.filename_info["publisher"] or None
|
||||||
|
metadata.series = p.filename_info["series"] or None
|
||||||
|
metadata.title = p.filename_info["title"] or None
|
||||||
|
metadata.volume = p.filename_info["volume"] or None
|
||||||
|
metadata.volume_count = p.filename_info["volume_count"] or None
|
||||||
|
metadata.year = p.filename_info["year"] or None
|
||||||
|
|
||||||
|
metadata.scan_info = p.filename_info["remainder"] or None
|
||||||
|
metadata.format = "FCBD" if p.filename_info["fcbd"] else None
|
||||||
|
if p.filename_info["annual"]:
|
||||||
|
metadata.format = "Annual"
|
||||||
|
else:
|
||||||
|
fnp = filenameparser.FileNameParser()
|
||||||
fnp.parse_filename(self.path)
|
fnp.parse_filename(self.path)
|
||||||
|
|
||||||
if fnp.issue != "":
|
if fnp.issue:
|
||||||
metadata.issue = fnp.issue
|
metadata.issue = fnp.issue
|
||||||
if fnp.series != "":
|
if fnp.series:
|
||||||
metadata.series = fnp.series
|
metadata.series = fnp.series
|
||||||
if fnp.volume != "":
|
if fnp.volume:
|
||||||
metadata.volume = fnp.volume
|
metadata.volume = fnp.volume
|
||||||
if fnp.year != "":
|
if fnp.year:
|
||||||
metadata.year = fnp.year
|
metadata.year = fnp.year
|
||||||
if fnp.issue_count != "":
|
if fnp.issue_count:
|
||||||
metadata.issue_count = fnp.issue_count
|
metadata.issue_count = fnp.issue_count
|
||||||
if parse_scan_info:
|
if fnp.remainder:
|
||||||
if fnp.remainder != "":
|
|
||||||
metadata.scan_info = fnp.remainder
|
metadata.scan_info = fnp.remainder
|
||||||
|
|
||||||
metadata.is_empty = False
|
metadata.is_empty = False
|
||||||
|
353
comicapi/filenamelexer.py
Normal file
353
comicapi/filenamelexer.py
Normal file
@ -0,0 +1,353 @@
|
|||||||
|
import calendar
|
||||||
|
import os
|
||||||
|
import unicodedata
|
||||||
|
from enum import Enum, auto
|
||||||
|
|
||||||
|
|
||||||
|
class ItemType(Enum):
|
||||||
|
Error = auto() # Error occurred; value is text of error
|
||||||
|
EOF = auto()
|
||||||
|
Text = auto() # Text
|
||||||
|
LeftParen = auto() # '(' inside action
|
||||||
|
Number = auto() # Simple number
|
||||||
|
IssueNumber = auto() # Preceded by a # Symbol
|
||||||
|
RightParen = auto() # ')' inside action
|
||||||
|
Space = auto() # Run of spaces separating arguments
|
||||||
|
Dot = auto()
|
||||||
|
LeftBrace = auto()
|
||||||
|
RightBrace = auto()
|
||||||
|
LeftSBrace = auto()
|
||||||
|
RightSBrace = auto()
|
||||||
|
Symbol = auto()
|
||||||
|
Skip = auto() # __ or -- no title, issue or series information beyond
|
||||||
|
Operator = auto()
|
||||||
|
Calendar = auto()
|
||||||
|
InfoSpecifier = auto() # Specifies type of info e.g. v1 for 'volume': 1
|
||||||
|
ArchiveType = auto()
|
||||||
|
Honorific = auto()
|
||||||
|
Keywords = auto()
|
||||||
|
FCBD = auto()
|
||||||
|
ComicType = auto()
|
||||||
|
Publisher = auto()
|
||||||
|
C2C = auto()
|
||||||
|
|
||||||
|
|
||||||
|
braces = [
|
||||||
|
ItemType.LeftBrace,
|
||||||
|
ItemType.LeftParen,
|
||||||
|
ItemType.LeftSBrace,
|
||||||
|
ItemType.RightBrace,
|
||||||
|
ItemType.RightParen,
|
||||||
|
ItemType.RightSBrace,
|
||||||
|
]
|
||||||
|
|
||||||
|
eof = chr(0)
|
||||||
|
|
||||||
|
key = {
|
||||||
|
"fcbd": ItemType.FCBD,
|
||||||
|
"freecomicbookday": ItemType.FCBD,
|
||||||
|
"cbr": ItemType.ArchiveType,
|
||||||
|
"cbz": ItemType.ArchiveType,
|
||||||
|
"cbt": ItemType.ArchiveType,
|
||||||
|
"cb7": ItemType.ArchiveType,
|
||||||
|
"rar": ItemType.ArchiveType,
|
||||||
|
"zip": ItemType.ArchiveType,
|
||||||
|
"tar": ItemType.ArchiveType,
|
||||||
|
"7z": ItemType.ArchiveType,
|
||||||
|
"annual": ItemType.ComicType,
|
||||||
|
"book": ItemType.ComicType,
|
||||||
|
"volume": ItemType.InfoSpecifier,
|
||||||
|
"vol.": ItemType.InfoSpecifier,
|
||||||
|
"vol": ItemType.InfoSpecifier,
|
||||||
|
"v": ItemType.InfoSpecifier,
|
||||||
|
"of": ItemType.InfoSpecifier,
|
||||||
|
"dc": ItemType.Publisher,
|
||||||
|
"marvel": ItemType.Publisher,
|
||||||
|
"covers": ItemType.InfoSpecifier,
|
||||||
|
"c2c": ItemType.C2C,
|
||||||
|
"mr": ItemType.Honorific,
|
||||||
|
"ms": ItemType.Honorific,
|
||||||
|
"mrs": ItemType.Honorific,
|
||||||
|
"dr": ItemType.Honorific,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class Item:
|
||||||
|
def __init__(self, typ: ItemType, pos: int, val: str):
|
||||||
|
self.typ: ItemType = typ
|
||||||
|
self.pos: int = pos
|
||||||
|
self.val: str = val
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return f"{self.val}: index: {self.pos}: {self.typ}"
|
||||||
|
|
||||||
|
|
||||||
|
class Lexer:
|
||||||
|
def __init__(self, string):
|
||||||
|
self.input: str = string # The string being scanned
|
||||||
|
self.state = None # The next lexing function to enter
|
||||||
|
self.pos: int = -1 # Current position in the input
|
||||||
|
self.start: int = 0 # Start position of this item
|
||||||
|
self.lastPos: int = 0 # Position of most recent item returned by nextItem
|
||||||
|
self.paren_depth: int = 0 # Nesting depth of ( ) exprs
|
||||||
|
self.brace_depth: int = 0 # Nesting depth of { }
|
||||||
|
self.sbrace_depth: int = 0 # Nesting depth of [ ]
|
||||||
|
self.items = []
|
||||||
|
|
||||||
|
# Next returns the next rune in the input.
|
||||||
|
def get(self) -> str:
|
||||||
|
if int(self.pos) >= len(self.input) - 1:
|
||||||
|
self.pos += 1
|
||||||
|
return eof
|
||||||
|
|
||||||
|
self.pos += 1
|
||||||
|
return self.input[self.pos]
|
||||||
|
|
||||||
|
# Peek returns but does not consume the next rune in the input.
|
||||||
|
def peek(self) -> str:
|
||||||
|
if int(self.pos) >= len(self.input) - 1:
|
||||||
|
return eof
|
||||||
|
|
||||||
|
return self.input[self.pos + 1]
|
||||||
|
|
||||||
|
def backup(self):
|
||||||
|
self.pos -= 1
|
||||||
|
|
||||||
|
# Emit passes an item back to the client.
|
||||||
|
def emit(self, t: ItemType):
|
||||||
|
self.items.append(Item(t, self.start, self.input[self.start : self.pos + 1]))
|
||||||
|
self.start = self.pos + 1
|
||||||
|
|
||||||
|
# Ignore skips over the pending input before this point.
|
||||||
|
def ignore(self):
|
||||||
|
self.start = self.pos
|
||||||
|
|
||||||
|
# Accept consumes the next rune if it's from the valid se:
|
||||||
|
def accept(self, valid: str):
|
||||||
|
if self.get() in valid:
|
||||||
|
return True
|
||||||
|
|
||||||
|
self.backup()
|
||||||
|
return False
|
||||||
|
|
||||||
|
# AcceptRun consumes a run of runes from the valid set.
|
||||||
|
def accept_run(self, valid: str):
|
||||||
|
while self.get() in valid:
|
||||||
|
pass
|
||||||
|
|
||||||
|
self.backup()
|
||||||
|
|
||||||
|
# Errorf returns an error token and terminates the scan by passing
|
||||||
|
# Back a nil pointer that will be the next state, terminating self.nextItem.
|
||||||
|
def errorf(self, message: str):
|
||||||
|
self.items.append(Item(ItemType.Error, self.start, message))
|
||||||
|
|
||||||
|
# NextItem returns the next item from the input.
|
||||||
|
# Called by the parser, not in the lexing goroutine.
|
||||||
|
# def next_item(self) -> Item:
|
||||||
|
# item: Item = self.items.get()
|
||||||
|
# self.lastPos = item.pos
|
||||||
|
# return item
|
||||||
|
|
||||||
|
def scan_number(self):
|
||||||
|
digits = "0123456789"
|
||||||
|
|
||||||
|
self.accept_run(digits)
|
||||||
|
if self.accept("."):
|
||||||
|
if self.accept(digits):
|
||||||
|
self.accept_run(digits)
|
||||||
|
else:
|
||||||
|
self.backup()
|
||||||
|
if self.accept("s"):
|
||||||
|
if not self.accept("t"):
|
||||||
|
self.backup()
|
||||||
|
elif self.accept("nr"):
|
||||||
|
if not self.accept("d"):
|
||||||
|
self.backup()
|
||||||
|
elif self.accept("t"):
|
||||||
|
if not self.accept("h"):
|
||||||
|
self.backup()
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Runs the state machine for the lexer.
|
||||||
|
def run(self):
|
||||||
|
self.state = lex_filename
|
||||||
|
while self.state is not None:
|
||||||
|
self.state = self.state(self)
|
||||||
|
|
||||||
|
|
||||||
|
# Scans the elements inside action delimiters.
|
||||||
|
def lex_filename(lex: Lexer):
|
||||||
|
r = lex.get()
|
||||||
|
if r == eof:
|
||||||
|
if lex.paren_depth != 0:
|
||||||
|
return lex.errorf("unclosed left paren")
|
||||||
|
|
||||||
|
if lex.brace_depth != 0:
|
||||||
|
return lex.errorf("unclosed left paren")
|
||||||
|
lex.emit(ItemType.EOF)
|
||||||
|
return None
|
||||||
|
elif is_space(r):
|
||||||
|
if r == "_" and lex.peek() == "_":
|
||||||
|
lex.get()
|
||||||
|
lex.emit(ItemType.Skip)
|
||||||
|
else:
|
||||||
|
return lex_space
|
||||||
|
elif r == ".":
|
||||||
|
r = lex.peek()
|
||||||
|
if r < "0" or "9" < r:
|
||||||
|
lex.emit(ItemType.Dot)
|
||||||
|
return lex_filename
|
||||||
|
|
||||||
|
lex.backup()
|
||||||
|
return lex_number
|
||||||
|
elif r == "'":
|
||||||
|
r = lex.peek()
|
||||||
|
if r in "0123456789":
|
||||||
|
return lex_number
|
||||||
|
lex.emit(ItemType.Text) # TODO: Change to Text
|
||||||
|
elif "0" <= r <= "9":
|
||||||
|
lex.backup()
|
||||||
|
return lex_number
|
||||||
|
elif r == "#":
|
||||||
|
if "0" <= lex.peek() <= "9":
|
||||||
|
return lex_number
|
||||||
|
lex.emit(ItemType.Symbol)
|
||||||
|
elif is_operator(r):
|
||||||
|
if r == "-" and lex.peek() == "-":
|
||||||
|
lex.get()
|
||||||
|
lex.emit(ItemType.Skip)
|
||||||
|
else:
|
||||||
|
return lex_operator
|
||||||
|
elif is_alpha_numeric(r):
|
||||||
|
lex.backup()
|
||||||
|
return lex_text
|
||||||
|
elif r == "(":
|
||||||
|
lex.emit(ItemType.LeftParen)
|
||||||
|
lex.paren_depth += 1
|
||||||
|
elif r == ")":
|
||||||
|
lex.emit(ItemType.RightParen)
|
||||||
|
lex.paren_depth -= 1
|
||||||
|
if lex.paren_depth < 0:
|
||||||
|
return lex.errorf("unexpected right paren " + r)
|
||||||
|
|
||||||
|
elif r == "{":
|
||||||
|
lex.emit(ItemType.LeftBrace)
|
||||||
|
lex.brace_depth += 1
|
||||||
|
elif r == "}":
|
||||||
|
lex.emit(ItemType.RightBrace)
|
||||||
|
lex.brace_depth -= 1
|
||||||
|
if lex.brace_depth < 0:
|
||||||
|
return lex.errorf("unexpected right brace " + r)
|
||||||
|
|
||||||
|
elif r == "[":
|
||||||
|
lex.emit(ItemType.LeftSBrace)
|
||||||
|
lex.sbrace_depth += 1
|
||||||
|
elif r == "]":
|
||||||
|
lex.emit(ItemType.RightSBrace)
|
||||||
|
lex.sbrace_depth -= 1
|
||||||
|
if lex.sbrace_depth < 0:
|
||||||
|
return lex.errorf("unexpected right brace " + r)
|
||||||
|
elif is_symbol(r):
|
||||||
|
# L.backup()
|
||||||
|
lex.emit(ItemType.Symbol)
|
||||||
|
else:
|
||||||
|
return lex.errorf("unrecognized character in action: " + r)
|
||||||
|
|
||||||
|
return lex_filename
|
||||||
|
|
||||||
|
|
||||||
|
def lex_operator(lex: Lexer):
|
||||||
|
lex.accept_run("-|:;")
|
||||||
|
lex.emit(ItemType.Operator)
|
||||||
|
return lex_filename
|
||||||
|
|
||||||
|
|
||||||
|
# LexSpace scans a run of space characters.
|
||||||
|
# One space has already been seen.
|
||||||
|
def lex_space(lex: Lexer):
|
||||||
|
while is_space(lex.peek()):
|
||||||
|
lex.get()
|
||||||
|
|
||||||
|
lex.emit(ItemType.Space)
|
||||||
|
return lex_filename
|
||||||
|
|
||||||
|
|
||||||
|
# Lex_text scans an alphanumeric.
|
||||||
|
def lex_text(lex: Lexer):
|
||||||
|
while True:
|
||||||
|
r = lex.get()
|
||||||
|
if is_alpha_numeric(r):
|
||||||
|
if r.isnumeric(): # E.g. v1
|
||||||
|
word = lex.input[lex.start : lex.pos]
|
||||||
|
if word.lower() in key and key[word.lower()] == ItemType.InfoSpecifier:
|
||||||
|
lex.backup()
|
||||||
|
lex.emit(key[word.lower()])
|
||||||
|
return lex_filename
|
||||||
|
else:
|
||||||
|
if r == "'" and lex.peek() == "s":
|
||||||
|
lex.get()
|
||||||
|
else:
|
||||||
|
lex.backup()
|
||||||
|
word = lex.input[lex.start : lex.pos + 1]
|
||||||
|
if word.lower() == "vol" and lex.peek() == ".":
|
||||||
|
lex.get()
|
||||||
|
word = lex.input[lex.start : lex.pos + 1]
|
||||||
|
|
||||||
|
if word.lower() in key:
|
||||||
|
lex.emit(key[word.lower()])
|
||||||
|
elif cal(word):
|
||||||
|
lex.emit(ItemType.Calendar)
|
||||||
|
else:
|
||||||
|
lex.emit(ItemType.Text)
|
||||||
|
break
|
||||||
|
|
||||||
|
return lex_filename
|
||||||
|
|
||||||
|
|
||||||
|
def cal(value: str):
|
||||||
|
month_abbr = [i for i, x in enumerate(calendar.month_abbr) if x == value.title()]
|
||||||
|
month_name = [i for i, x in enumerate(calendar.month_name) if x == value.title()]
|
||||||
|
day_abbr = [i for i, x in enumerate(calendar.day_abbr) if x == value.title()]
|
||||||
|
day_name = [i for i, x in enumerate(calendar.day_name) if x == value.title()]
|
||||||
|
return set(month_abbr + month_name + day_abbr + day_name)
|
||||||
|
|
||||||
|
|
||||||
|
def lex_number(lex: Lexer):
|
||||||
|
if not lex.scan_number():
|
||||||
|
return lex.errorf("bad number syntax: " + lex.input[lex.start : lex.pos])
|
||||||
|
# Complex number logic removed. Messes with math operations without space
|
||||||
|
|
||||||
|
if lex.input[lex.start] == "#":
|
||||||
|
lex.emit(ItemType.IssueNumber)
|
||||||
|
elif not lex.input[lex.pos].isdigit():
|
||||||
|
# Assume that 80th is just text and not a number
|
||||||
|
lex.emit(ItemType.Text)
|
||||||
|
else:
|
||||||
|
lex.emit(ItemType.Number)
|
||||||
|
|
||||||
|
return lex_filename
|
||||||
|
|
||||||
|
|
||||||
|
def is_space(character: str):
|
||||||
|
return character in "_ \t"
|
||||||
|
|
||||||
|
|
||||||
|
# IsAlphaNumeric reports whether r is an alphabetic, digit, or underscore.
|
||||||
|
def is_alpha_numeric(character: str):
|
||||||
|
return character.isalpha() or character.isnumeric()
|
||||||
|
|
||||||
|
|
||||||
|
def is_operator(character: str):
|
||||||
|
return character in "-|:;/\\"
|
||||||
|
|
||||||
|
|
||||||
|
def is_symbol(character: str):
|
||||||
|
return unicodedata.category(character)[0] in "PS"
|
||||||
|
|
||||||
|
|
||||||
|
def Lex(filename: str):
|
||||||
|
lex = Lexer(string=os.path.basename(filename))
|
||||||
|
lex.run()
|
||||||
|
return lex
|
@ -23,8 +23,17 @@ This should probably be re-written, but, well, it mostly works!
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
from operator import itemgetter
|
||||||
|
from typing import TypedDict
|
||||||
from urllib.parse import unquote
|
from urllib.parse import unquote
|
||||||
|
|
||||||
|
from text2digits import text2digits
|
||||||
|
|
||||||
|
from comicapi import filenamelexer, issuestring
|
||||||
|
|
||||||
|
t2d = text2digits.Text2Digits(add_ordinal_ending=False)
|
||||||
|
t2do = text2digits.Text2Digits(add_ordinal_ending=True)
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
@ -68,9 +77,7 @@ class FileNameParser:
|
|||||||
if match:
|
if match:
|
||||||
count = match.group()
|
count = match.group()
|
||||||
|
|
||||||
count = count.lstrip("0")
|
return count.lstrip("0")
|
||||||
|
|
||||||
return count
|
|
||||||
|
|
||||||
def get_issue_number(self, filename):
|
def get_issue_number(self, filename):
|
||||||
"""Returns a tuple of issue number string, and start and end indexes in the filename
|
"""Returns a tuple of issue number string, and start and end indexes in the filename
|
||||||
@ -222,7 +229,7 @@ class FileNameParser:
|
|||||||
|
|
||||||
year = ""
|
year = ""
|
||||||
# look for four digit number with "(" ")" or "--" around it
|
# look for four digit number with "(" ")" or "--" around it
|
||||||
match = re.search(r"(\(\d\d\d\d\))|(--\d\d\d\d--)", filename)
|
match = re.search(r"(\(\d{4}\))|(--\d{4}--)", filename)
|
||||||
if match:
|
if match:
|
||||||
year = match.group()
|
year = match.group()
|
||||||
# remove non-digits
|
# remove non-digits
|
||||||
@ -290,3 +297,814 @@ class FileNameParser:
|
|||||||
self.issue = "0"
|
self.issue = "0"
|
||||||
if self.issue[0] == ".":
|
if self.issue[0] == ".":
|
||||||
self.issue = "0" + self.issue
|
self.issue = "0" + self.issue
|
||||||
|
|
||||||
|
|
||||||
|
class FilenameInfo(TypedDict, total=False):
|
||||||
|
alternate: str
|
||||||
|
annual: bool
|
||||||
|
archive: str
|
||||||
|
c2c: bool
|
||||||
|
fcbd: bool
|
||||||
|
issue: str
|
||||||
|
issue_count: str
|
||||||
|
publisher: str
|
||||||
|
remainder: str
|
||||||
|
series: str
|
||||||
|
title: str
|
||||||
|
volume: str
|
||||||
|
volume_count: str
|
||||||
|
year: str
|
||||||
|
|
||||||
|
|
||||||
|
eof = filenamelexer.Item(filenamelexer.ItemType.EOF, -1, "")
|
||||||
|
|
||||||
|
|
||||||
|
class Parser:
|
||||||
|
"""docstring for FilenameParser"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
lexer_result: list[filenamelexer.Item],
|
||||||
|
first_is_alt=False,
|
||||||
|
remove_c2c=False,
|
||||||
|
remove_fcbd=False,
|
||||||
|
remove_publisher=False,
|
||||||
|
):
|
||||||
|
self.state = None
|
||||||
|
self.pos = -1
|
||||||
|
|
||||||
|
self.firstItem = True
|
||||||
|
self.skip = False
|
||||||
|
self.alt = False
|
||||||
|
self.filename_info: FilenameInfo = {"series": ""}
|
||||||
|
self.issue_number_at = None
|
||||||
|
self.in_something = 0 # In some sort of brackets {}[]()
|
||||||
|
self.in_brace = 0 # In {}
|
||||||
|
self.in_s_brace = 0 # In []
|
||||||
|
self.in_paren = 0 # In ()
|
||||||
|
self.year_candidates: list[tuple[bool, filenamelexer.Item]] = []
|
||||||
|
self.series_parts: list[filenamelexer.Item] = []
|
||||||
|
self.title_parts: list[filenamelexer.Item] = []
|
||||||
|
self.used_items: list[filenamelexer.Item] = []
|
||||||
|
self.irrelevant: list[filenamelexer.Item] = []
|
||||||
|
self.operator_rejected: list[filenamelexer.Item] = []
|
||||||
|
self.publisher_removed: list[filenamelexer.Item] = []
|
||||||
|
|
||||||
|
self.first_is_alt = first_is_alt
|
||||||
|
self.remove_c2c = remove_c2c
|
||||||
|
self.remove_fcbd = remove_fcbd
|
||||||
|
self.remove_publisher = remove_publisher
|
||||||
|
|
||||||
|
self.input = lexer_result
|
||||||
|
for i, item in enumerate(self.input):
|
||||||
|
if item.typ == filenamelexer.ItemType.IssueNumber:
|
||||||
|
self.issue_number_at = i
|
||||||
|
|
||||||
|
# Get returns the next Item in the input.
|
||||||
|
def get(self) -> filenamelexer.Item:
|
||||||
|
if int(self.pos) >= len(self.input) - 1:
|
||||||
|
self.pos += 1
|
||||||
|
return eof
|
||||||
|
|
||||||
|
self.pos += 1
|
||||||
|
return self.input[self.pos]
|
||||||
|
|
||||||
|
# Peek returns but does not consume the next Item in the input.
|
||||||
|
def peek(self) -> filenamelexer.Item:
|
||||||
|
if int(self.pos) >= len(self.input) - 1:
|
||||||
|
return eof
|
||||||
|
|
||||||
|
return self.input[self.pos + 1]
|
||||||
|
|
||||||
|
# Peek_back returns but does not step back the previous Item in the input.
|
||||||
|
def peek_back(self) -> filenamelexer.Item:
|
||||||
|
if int(self.pos) == 0:
|
||||||
|
return eof
|
||||||
|
|
||||||
|
return self.input[self.pos - 1]
|
||||||
|
|
||||||
|
# Backup steps back one Item.
|
||||||
|
def backup(self):
|
||||||
|
self.pos -= 1
|
||||||
|
|
||||||
|
def run(self):
|
||||||
|
self.state = parse
|
||||||
|
while self.state is not None:
|
||||||
|
self.state = self.state(self)
|
||||||
|
|
||||||
|
|
||||||
|
def parse(p: Parser):
|
||||||
|
item: filenamelexer.Item = p.get()
|
||||||
|
|
||||||
|
# We're done, time to do final processing
|
||||||
|
if item.typ == filenamelexer.ItemType.EOF:
|
||||||
|
return parse_finish
|
||||||
|
|
||||||
|
# Need to figure out if this is the issue number
|
||||||
|
if item.typ == filenamelexer.ItemType.Number:
|
||||||
|
likely_year = False
|
||||||
|
if p.firstItem and p.first_is_alt:
|
||||||
|
# raise Exception("fuck you")
|
||||||
|
p.alt = True
|
||||||
|
return parse_issue_number
|
||||||
|
|
||||||
|
# The issue number should hopefully not be in parentheses
|
||||||
|
if p.in_something == 0:
|
||||||
|
# Assume that operators indicate a non-issue number e.g. IG-88 or 88-IG
|
||||||
|
if filenamelexer.ItemType.Operator not in (p.peek().typ, p.peek_back().typ):
|
||||||
|
# It is common to use '89 to refer to an annual reprint from 1989
|
||||||
|
if item.val[0] != "'":
|
||||||
|
# Issue number is less than 4 digits. very few series go above 999
|
||||||
|
if len(item.val.lstrip("0")) < 4:
|
||||||
|
# An issue number starting with # Was not found and no previous number was found
|
||||||
|
if p.issue_number_at is None:
|
||||||
|
# Series has already been started/parsed, filters out leading alternate numbers leading alternate number
|
||||||
|
if len(p.series_parts) > 0:
|
||||||
|
# Unset first item
|
||||||
|
if p.firstItem:
|
||||||
|
p.firstItem = False
|
||||||
|
return parse_issue_number
|
||||||
|
else:
|
||||||
|
p.operator_rejected.append(item)
|
||||||
|
# operator rejected used later to add back to the series/title
|
||||||
|
|
||||||
|
# It is more likely to be a year if it is inside parentheses.
|
||||||
|
if p.in_something > 0:
|
||||||
|
likely_year = True
|
||||||
|
|
||||||
|
# If numbers are directly followed by text it most likely isn't a year e.g. 2048px
|
||||||
|
if p.peek().typ == filenamelexer.ItemType.Text:
|
||||||
|
likely_year = False
|
||||||
|
|
||||||
|
# Is either a full year '2001' or a short year "'89"
|
||||||
|
if len(item.val) == 4 or item.val[0] == "'":
|
||||||
|
if p.in_something == 0:
|
||||||
|
# Append to series in case it is a part of the title, but only if were not inside parenthesis
|
||||||
|
p.series_parts.append(item)
|
||||||
|
|
||||||
|
# Look for a full date as in 2022-04-22
|
||||||
|
if p.peek().typ in [
|
||||||
|
filenamelexer.ItemType.Symbol,
|
||||||
|
filenamelexer.ItemType.Operator,
|
||||||
|
filenamelexer.ItemType.Dot,
|
||||||
|
]:
|
||||||
|
op = [p.get()]
|
||||||
|
if p.peek().typ == filenamelexer.ItemType.Number:
|
||||||
|
month = p.get()
|
||||||
|
if p.peek().typ in [
|
||||||
|
filenamelexer.ItemType.Symbol,
|
||||||
|
filenamelexer.ItemType.Operator,
|
||||||
|
filenamelexer.ItemType.Dot,
|
||||||
|
]:
|
||||||
|
op.append(p.get())
|
||||||
|
if p.peek().typ == filenamelexer.ItemType.Number:
|
||||||
|
day = p.get()
|
||||||
|
fulldate = [month, day, item]
|
||||||
|
p.used_items.extend(op)
|
||||||
|
p.used_items.extend(fulldate)
|
||||||
|
else:
|
||||||
|
p.backup()
|
||||||
|
p.backup()
|
||||||
|
p.backup()
|
||||||
|
# TODO never happens
|
||||||
|
else:
|
||||||
|
p.backup()
|
||||||
|
p.backup()
|
||||||
|
# TODO never happens
|
||||||
|
else:
|
||||||
|
p.backup()
|
||||||
|
# TODO never happens
|
||||||
|
|
||||||
|
p.year_candidates.append((likely_year, item))
|
||||||
|
# Ensures that IG-88 gets added back to the series/title
|
||||||
|
elif (
|
||||||
|
p.in_something == 0
|
||||||
|
and p.peek_back().typ == filenamelexer.ItemType.Operator
|
||||||
|
or p.peek().typ == filenamelexer.ItemType.Operator
|
||||||
|
):
|
||||||
|
# Were not in something and the next or previous type is an operator, add it to the series
|
||||||
|
p.series_parts.append(item)
|
||||||
|
p.used_items.append(item)
|
||||||
|
|
||||||
|
# Unset first item
|
||||||
|
if p.firstItem:
|
||||||
|
p.firstItem = False
|
||||||
|
p.get()
|
||||||
|
return parse_series
|
||||||
|
|
||||||
|
# Number with a leading hash e.g. #003
|
||||||
|
elif item.typ == filenamelexer.ItemType.IssueNumber:
|
||||||
|
# Unset first item
|
||||||
|
if p.firstItem:
|
||||||
|
p.firstItem = False
|
||||||
|
return parse_issue_number
|
||||||
|
|
||||||
|
# Matches FCBD. Not added to p.used_items so it will show in "remainder"
|
||||||
|
elif item.typ == filenamelexer.ItemType.FCBD:
|
||||||
|
p.filename_info["fcbd"] = True
|
||||||
|
|
||||||
|
# Matches c2c. Not added to p.used_items so it will show in "remainder"
|
||||||
|
elif item.typ == filenamelexer.ItemType.C2C:
|
||||||
|
p.filename_info["c2c"] = True
|
||||||
|
|
||||||
|
# Matches the extension if it is known to be an archive format e.g. cbt,cbz,zip,rar
|
||||||
|
elif item.typ == filenamelexer.ItemType.ArchiveType:
|
||||||
|
p.filename_info["archive"] = item.val.lower()
|
||||||
|
p.used_items.append(item)
|
||||||
|
if p.peek_back().typ == filenamelexer.ItemType.Dot:
|
||||||
|
p.used_items.append(p.peek_back())
|
||||||
|
|
||||||
|
# Allows removing DC from 'Wonder Woman 49 DC Sep-Oct 1951' dependent on publisher being in a static list in the lexer
|
||||||
|
elif item.typ == filenamelexer.ItemType.Publisher:
|
||||||
|
p.filename_info["publisher"] = item.val
|
||||||
|
p.used_items.append(item)
|
||||||
|
if p.firstItem:
|
||||||
|
p.firstItem = False
|
||||||
|
if p.in_something == 0:
|
||||||
|
return parse_series
|
||||||
|
p.publisher_removed.append(item)
|
||||||
|
if p.in_something == 0:
|
||||||
|
return parse_series
|
||||||
|
|
||||||
|
# Attempts to identify the type e.g. annual
|
||||||
|
elif item.typ == filenamelexer.ItemType.ComicType:
|
||||||
|
series_append = True
|
||||||
|
|
||||||
|
if p.peek().typ == filenamelexer.ItemType.Space:
|
||||||
|
p.get()
|
||||||
|
|
||||||
|
if p.series_parts and "free comic book" in (" ".join([x.val for x in p.series_parts]) + " " + item.val).lower():
|
||||||
|
p.filename_info["fcbd"] = True
|
||||||
|
series_append = True
|
||||||
|
# If the next item is a number it's probably the volume
|
||||||
|
elif p.peek().typ == filenamelexer.ItemType.Number or (
|
||||||
|
p.peek().typ == filenamelexer.ItemType.Text and t2d.convert(p.peek().val).isnumeric()
|
||||||
|
):
|
||||||
|
number = p.get()
|
||||||
|
# Mark volume info. Text will be added to the title/series later
|
||||||
|
if item.val.lower() in ["book", "tpb"]:
|
||||||
|
p.title_parts.extend([item, number])
|
||||||
|
p.filename_info["volume"] = t2do.convert(number.val)
|
||||||
|
p.filename_info["issue"] = t2do.convert(number.val)
|
||||||
|
|
||||||
|
p.used_items.append(item)
|
||||||
|
series_append = False
|
||||||
|
|
||||||
|
# Annuals usually mean the year
|
||||||
|
elif item.val.lower() in ["annual"]:
|
||||||
|
p.filename_info["annual"] = True
|
||||||
|
num = t2d.convert(number.val)
|
||||||
|
if num.isnumeric() and len(num) == 4:
|
||||||
|
p.year_candidates.append((True, number))
|
||||||
|
else:
|
||||||
|
p.backup()
|
||||||
|
|
||||||
|
elif item.val.lower() in ["annual"]:
|
||||||
|
p.filename_info["annual"] = True
|
||||||
|
|
||||||
|
# If we don't have a reason to exclude it from the series go back to parsing the series immediately
|
||||||
|
if series_append:
|
||||||
|
p.series_parts.append(item)
|
||||||
|
p.used_items.append(item)
|
||||||
|
return parse_series
|
||||||
|
|
||||||
|
# We found text, it's probably the title or series
|
||||||
|
elif item.typ in [filenamelexer.ItemType.Text, filenamelexer.ItemType.Honorific]:
|
||||||
|
# Unset first item
|
||||||
|
if p.firstItem:
|
||||||
|
p.firstItem = False
|
||||||
|
if p.in_something == 0:
|
||||||
|
return parse_series
|
||||||
|
|
||||||
|
# Usually the word 'of' eg 1 (of 6)
|
||||||
|
elif item.typ == filenamelexer.ItemType.InfoSpecifier:
|
||||||
|
return parse_info_specifier
|
||||||
|
|
||||||
|
# Operator is a symbol that acts as some sort of separator eg - : ;
|
||||||
|
elif item.typ == filenamelexer.ItemType.Operator:
|
||||||
|
if p.in_something == 0:
|
||||||
|
p.irrelevant.append(item)
|
||||||
|
|
||||||
|
# Filter out Month and day names in filename
|
||||||
|
elif item.typ == filenamelexer.ItemType.Calendar:
|
||||||
|
# Month and day are currently irrelevant if they are inside parentheses e.g. (January 2002)
|
||||||
|
if p.in_something > 0:
|
||||||
|
p.irrelevant.append(item)
|
||||||
|
|
||||||
|
# assume Sep-Oct is not useful in the series/title
|
||||||
|
elif p.peek().typ in [filenamelexer.ItemType.Symbol, filenamelexer.ItemType.Operator]:
|
||||||
|
p.get()
|
||||||
|
if p.peek().typ == filenamelexer.ItemType.Calendar:
|
||||||
|
p.irrelevant.extend([item, p.input[p.pos], p.get()])
|
||||||
|
else:
|
||||||
|
p.backup()
|
||||||
|
return parse_series
|
||||||
|
# This is text that just happens to also be a month/day
|
||||||
|
else:
|
||||||
|
return parse_series
|
||||||
|
|
||||||
|
# Specifically '__' or '--', no further title/series parsing is done to keep compatibility with wiki
|
||||||
|
elif item.typ == filenamelexer.ItemType.Skip:
|
||||||
|
p.skip = True
|
||||||
|
|
||||||
|
# Keeping track of parentheses depth
|
||||||
|
elif item.typ == filenamelexer.ItemType.LeftParen:
|
||||||
|
p.in_paren += 1
|
||||||
|
p.in_something += 1
|
||||||
|
elif item.typ == filenamelexer.ItemType.LeftBrace:
|
||||||
|
p.in_brace += 1
|
||||||
|
p.in_something += 1
|
||||||
|
elif item.typ == filenamelexer.ItemType.LeftSBrace:
|
||||||
|
p.in_s_brace += 1
|
||||||
|
p.in_something += 1
|
||||||
|
|
||||||
|
elif item.typ == filenamelexer.ItemType.RightParen:
|
||||||
|
p.in_paren -= 1
|
||||||
|
p.in_something -= 1
|
||||||
|
elif item.typ == filenamelexer.ItemType.RightBrace:
|
||||||
|
p.in_brace -= 1
|
||||||
|
p.in_something -= 1
|
||||||
|
elif item.typ == filenamelexer.ItemType.RightSBrace:
|
||||||
|
p.in_s_brace -= 1
|
||||||
|
p.in_something -= 1
|
||||||
|
|
||||||
|
# Unset first item
|
||||||
|
if p.firstItem:
|
||||||
|
p.firstItem = False
|
||||||
|
|
||||||
|
# Brace management, I don't like negative numbers
|
||||||
|
if p.in_paren < 0:
|
||||||
|
p.in_something += p.in_paren * -1
|
||||||
|
if p.in_brace < 0:
|
||||||
|
p.in_something += p.in_brace * -1
|
||||||
|
if p.in_s_brace < 0:
|
||||||
|
p.in_something += p.in_s_brace * -1
|
||||||
|
|
||||||
|
return parse
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: What about more esoteric numbers???
|
||||||
|
def parse_issue_number(p: Parser):
|
||||||
|
item = p.input[p.pos]
|
||||||
|
|
||||||
|
if "issue" in p.filename_info:
|
||||||
|
if "alternate" in p.filename_info:
|
||||||
|
p.filename_info["alternate"] += "," + item.val
|
||||||
|
p.filename_info["alternate"] = item.val
|
||||||
|
else:
|
||||||
|
if p.alt:
|
||||||
|
p.filename_info["alternate"] = item.val
|
||||||
|
else:
|
||||||
|
p.filename_info["issue"] = item.val
|
||||||
|
p.issue_number_at = item.pos
|
||||||
|
p.used_items.append(item)
|
||||||
|
item = p.get()
|
||||||
|
if item.typ == filenamelexer.ItemType.Dot:
|
||||||
|
p.used_items.append(item)
|
||||||
|
item = p.get()
|
||||||
|
if item.typ in [filenamelexer.ItemType.Text, filenamelexer.ItemType.Number]:
|
||||||
|
if p.alt:
|
||||||
|
p.filename_info["alternate"] += "." + item.val
|
||||||
|
else:
|
||||||
|
p.filename_info["issue"] += "." + item.val
|
||||||
|
p.used_items.append(item)
|
||||||
|
else:
|
||||||
|
p.backup()
|
||||||
|
p.backup()
|
||||||
|
else:
|
||||||
|
p.backup()
|
||||||
|
p.alt = False
|
||||||
|
return parse
|
||||||
|
|
||||||
|
|
||||||
|
def parse_series(p: Parser):
    """Consume the tokens that make up the series (and possibly the title).

    Tokens are collected into sections; an operator followed by a space
    (e.g. ': ') starts a new section, and when multiple sections exist the
    last one becomes the title while the rest become the series.  Stops on
    skip markers ('--'/'__'), info specifiers, and bare numbers that may be
    the issue number.  Returns the next state function for the driver.
    """
    item = p.input[p.pos]

    series: list[list[filenamelexer.Item]] = [[]]
    # Space and Dots are not useful at the beginning of a title/series
    if not p.skip and item.typ not in [filenamelexer.ItemType.Space, filenamelexer.ItemType.Dot]:
        series[0].append(item)

    # Index of the section currently being filled.
    current_part = 0

    title_parts: list[filenamelexer.Item] = []
    series_parts: list[filenamelexer.Item] = []

    prev_space = False

    # 'free comic book day' screws things up. #TODO look into removing book from ComicType?

    # We stop parsing the series when certain things come up if nothing was done with them continue where we left off
    if (
        p.series_parts
        and p.series_parts[-1].val.lower() == "book"
        or p.peek_back().typ == filenamelexer.ItemType.Number
        or item.typ == filenamelexer.ItemType.Calendar
    ):
        series_parts = p.series_parts
        p.series_parts = []
    # Skip is only true if we have come across '--' or '__'
    while not p.skip:
        item = p.get()

        # Spaces are evil
        if item.typ == filenamelexer.ItemType.Space:
            prev_space = True
            continue
        if item.typ in [
            filenamelexer.ItemType.Text,
            filenamelexer.ItemType.Symbol,
            filenamelexer.ItemType.Publisher,
            filenamelexer.ItemType.Honorific,
        ]:
            series[current_part].append(item)
            # Keep the dot attached to an honorific, e.g. 'Mr.'.
            if item.typ == filenamelexer.ItemType.Honorific and p.peek().typ == filenamelexer.ItemType.Dot:
                series[current_part].append(p.get())
            elif item.typ == filenamelexer.ItemType.Publisher:
                p.filename_info["publisher"] = item.val

        # Handle Volume
        elif item.typ == filenamelexer.ItemType.InfoSpecifier:
            # Exception for 'of'
            if item.val.lower() == "of":
                series[current_part].append(item)
            else:
                # This specifically lets 'X-Men-V1-067' parse correctly as Series: X-Men Volume: 1 Issue: 67
                while len(series[current_part]) > 0 and series[current_part][-1].typ not in [
                    filenamelexer.ItemType.Text,
                    filenamelexer.ItemType.Symbol,
                ]:
                    p.irrelevant.append(series[current_part].pop())
                p.backup()
                break

        elif item.typ == filenamelexer.ItemType.Operator:
            peek = p.peek()
            # ': ' separates the title from the series, only the last section is considered the title
            if not prev_space and peek.typ in [filenamelexer.ItemType.Space]:
                series.append([])  # Starts a new section
                series[current_part].append(item)
                current_part += 1
            else:
                # Force space around '-' makes 'batman - superman' stay otherwise we get 'batman-superman'
                if prev_space and peek.typ in [filenamelexer.ItemType.Space]:
                    item.val = " " + item.val + " "
                series[current_part].append(item)

        # Stop processing series/title if a skip item is found
        elif item.typ == filenamelexer.ItemType.Skip:
            p.backup()
            break

        elif item.typ == filenamelexer.ItemType.Number:
            if p.peek().typ == filenamelexer.ItemType.Space:
                p.get()
                # We have 2 numbers, add the first to the series and then go back to parse
                if p.peek().typ == filenamelexer.ItemType.Number:
                    series[current_part].append(item)
                    break
                # We have 1 number break here, it's possible it's the issue
                p.backup()  # Whitespace
                p.backup()  # The number
                break
            # This is 6 in '1 of 6'
            if series[current_part] and series[current_part][-1].val.lower() == "of":
                series[current_part].append(item)
            # We have 1 number break here, it's possible it's the issue
            else:
                p.backup()  # The number
                break

        else:
            # Ensure 'ms. marvel' parses 'ms.' correctly
            if item.typ == filenamelexer.ItemType.Dot and p.peek_back().typ == filenamelexer.ItemType.Honorific:
                series[current_part].append(item)
            # Allows avengers.hulk to parse correctly
            elif item.typ == filenamelexer.ItemType.Dot and p.peek().typ == filenamelexer.ItemType.Text:
                # Marks the dot as used so that the remainder is clean
                p.used_items.append(item)
            else:
                p.backup()
                break

        prev_space = False

    # We have a title separator e.g. ': '
    if len(series) > 1:
        title_parts.extend(series.pop())
        for s in series:
            if s and s[-1].typ == filenamelexer.ItemType.Operator:
                s[-1].val += " "  # Ensures that when there are multiple separators that they display properly
            series_parts.extend(s)
        # Drop the separator that ended the series from the visible parts.
        p.used_items.append(series_parts.pop())
    else:
        series_parts.extend(series[0])

    # If the series has already been set assume all of this is the title.
    if len(p.series_parts) > 0:
        p.title_parts.extend(series_parts)
        p.title_parts.extend(title_parts)
    else:
        p.series_parts.extend(series_parts)
        p.title_parts.extend(title_parts)
    return parse
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_year(p: Parser):
    """Choose the most likely year candidate and record it as the year.

    Candidates are sorted by their likely_year flag so the most plausible
    sorts last.  If no volume was found and a second likely candidate
    remains, it is promoted to the volume.  Selected items are removed from
    the series/title parts so they don't leak into those strings.
    """
    if len(p.year_candidates) > 0:
        # Sort by likely_year boolean
        p.year_candidates.sort(key=itemgetter(0))

        # Take the last year e.g. (2007) 2099 (2008) becomes 2099 2007 2008 and takes 2008
        selected_year = p.year_candidates.pop()[1]

        p.filename_info["year"] = selected_year.val
        p.used_items.append(selected_year)

        # (2008) Title (2009) is many times used to denote the series year if we don't have a volume we use it
        if "volume" not in p.filename_info and p.year_candidates and p.year_candidates[-1][0]:
            vol = p.year_candidates.pop()[1]
            p.filename_info["volume"] = vol.val
            p.used_items.append(vol)

            # Remove volume from series and title
            # BUG FIX: this block previously removed selected_year (an exact
            # no-op duplicate of the year-removal block below) instead of the
            # volume item consumed here.
            if vol in p.series_parts:
                p.series_parts.remove(vol)
            if vol in p.title_parts:
                p.title_parts.remove(vol)

        # Remove year from series and title
        if selected_year in p.series_parts:
            p.series_parts.remove(selected_year)
        if selected_year in p.title_parts:
            p.title_parts.remove(selected_year)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_finish(p: Parser):
    """Finalize the parse: resolve the year, recover a missing issue number,
    optionally strip publishers/FCBD/C2C markers, assemble the series/title
    strings, normalize issue/volume values, and guarantee that every expected
    key exists in p.filename_info.
    """
    resolve_year(p)

    # If we don't have an issue try to find it in the series
    if "issue" not in p.filename_info and p.series_parts and p.series_parts[-1].typ == filenamelexer.ItemType.Number:
        issue_num = p.series_parts.pop()

        # If the number we just popped is a year put it back on it's probably part of the series e.g. Spider-Man 2099
        if issue_num in [x[1] for x in p.year_candidates]:
            p.series_parts.append(issue_num)
        else:
            # If this number was rejected because of an operator and the operator is still there add it back e.g. 'IG-88'
            if (
                issue_num in p.operator_rejected
                and p.series_parts
                and p.series_parts[-1].typ == filenamelexer.ItemType.Operator
            ):
                p.series_parts.append(issue_num)
            # We have no reason to not use this number as the issue number. Specifically happens when parsing 'X-Men-V1-067.cbr'
            else:
                p.filename_info["issue"] = issue_num.val
                p.used_items.append(issue_num)
                p.issue_number_at = issue_num.pos

    # Remove publishers, currently only marvel and dc are defined,
    # this is an option specifically because this can drastically screw up parsing
    if p.remove_publisher:
        for item in p.publisher_removed:
            if item in p.series_parts:
                p.series_parts.remove(item)
            if item in p.title_parts:
                p.title_parts.remove(item)

    p.filename_info["series"] = join_title(p.series_parts)
    p.used_items.extend(p.series_parts)

    p.filename_info["title"] = join_title(p.title_parts)
    p.used_items.extend(p.title_parts)

    # Normalize the issue through IssueString, e.g. '#05' -> '5'.
    if "issue" in p.filename_info:
        p.filename_info["issue"] = issuestring.IssueString(p.filename_info["issue"].lstrip("#")).as_string()

    if "volume" in p.filename_info:
        p.filename_info["volume"] = p.filename_info["volume"].lstrip("#").lstrip("0")

    if "issue" not in p.filename_info:
        # We have an alternate move it to the issue
        if "alternate" in p.filename_info:
            p.filename_info["issue"] = p.filename_info["alternate"]
            p.filename_info["alternate"] = ""
        else:
            # TODO: This never happens
            inp = [x for x in p.input if x not in p.irrelevant and x not in p.used_items and x.typ != eof.typ]
            if len(inp) == 1 and inp[0].typ == filenamelexer.ItemType.Number:
                p.filename_info["issue"] = inp[0].val
                p.used_items.append(inp[0])

    # Optionally push FCBD/C2C markers into the irrelevant pile so they
    # don't show up in the remainder.
    remove_items = []
    if p.remove_fcbd:
        remove_items.append(filenamelexer.ItemType.FCBD)
    if p.remove_c2c:
        remove_items.append(filenamelexer.ItemType.C2C)

    p.irrelevant.extend([x for x in p.input if x.typ in remove_items])

    p.filename_info["remainder"] = get_remainder(p)

    # Ensure keys always exist
    for s in [
        "alternate",
        "issue",
        "archive",
        "series",
        "title",
        "volume",
        "year",
        "remainder",
        "issue_count",
        "volume_count",
        "publisher",
    ]:
        if s not in p.filename_info:
            p.filename_info[s] = ""
    for s in ["fcbd", "c2c", "annual"]:
        if s not in p.filename_info:
            p.filename_info[s] = False
|
||||||
|
|
||||||
|
|
||||||
|
def get_remainder(p: Parser):
    """Build the 'remainder' string from tokens not consumed elsewhere.

    Joins the values of all input items that were neither used for a known
    field nor marked irrelevant, normalising whitespace around brackets,
    stripping opening brackets that directly precede a closing one, and
    removing empty bracket pairs.
    """
    remainder = ""

    # Remove used items and irrelevant items e.g. the series and useless operators
    inp = [x for x in p.input if x not in p.irrelevant and x not in p.used_items]
    for i, item in enumerate(inp):
        # No double space or space next to parentheses
        if item.typ in [filenamelexer.ItemType.Space, filenamelexer.ItemType.Skip]:
            if (
                i > 0
                and inp[i - 1].typ
                not in [
                    filenamelexer.ItemType.Space,
                    filenamelexer.ItemType.LeftBrace,
                    filenamelexer.ItemType.LeftParen,
                    filenamelexer.ItemType.LeftSBrace,
                ]
                and i + 1 < len(inp)
                and inp[i + 1].typ
                not in [
                    filenamelexer.ItemType.RightBrace,
                    filenamelexer.ItemType.RightParen,
                    filenamelexer.ItemType.RightSBrace,
                ]
            ):
                remainder += " "

        # Strip off useless opening parenthesis
        # CLEANUP: the Space entry was removed from this type list; Space items
        # are always consumed by the branch above, so it was unreachable here.
        elif (
            item.typ
            in [
                filenamelexer.ItemType.RightBrace,
                filenamelexer.ItemType.RightParen,
                filenamelexer.ItemType.RightSBrace,
            ]
            and i > 0
            and inp[i - 1].typ
            in [
                filenamelexer.ItemType.LeftBrace,
                filenamelexer.ItemType.LeftParen,
                filenamelexer.ItemType.LeftSBrace,
            ]
        ):
            remainder = remainder.rstrip("[{(")
            continue

        # Add the next item
        # CLEANUP: the old `rem` list was appended to here but never read;
        # the dead local has been removed.
        else:
            remainder += item.val

    # Remove empty parentheses
    remainder = re.sub(r"[\[{(]+[]})]+", "", remainder)
    return remainder.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def parse_info_specifier(p: Parser):
    """Handle info-specifier tokens such as 'volume'/'vol'/'v' and 'of'.

    Word numbers ('book three') are converted via t2d/t2do.  'of' is only
    meaningful inside brackets, where it sets issue_count or volume/volume_count
    depending on what the preceding number was.  Returns the next state
    function for the driver.
    """
    item = p.input[p.pos]
    index = p.pos

    if p.peek().typ == filenamelexer.ItemType.Space:
        p.get()

    # Handles 'book 3' and 'book three'
    if p.peek().typ == filenamelexer.ItemType.Number or (
        p.peek().typ == filenamelexer.ItemType.Text and t2d.convert(p.peek().val).isnumeric()
    ):

        number = p.get()
        if item.val.lower() in ["volume", "vol", "vol.", "v"]:
            p.filename_info["volume"] = t2do.convert(number.val)
            p.used_items.append(item)
            p.used_items.append(number)

        # 'of' is only special if it is inside a parenthesis.
        elif item.val.lower() == "of":
            # The number this 'of <count>' belongs to, e.g. 03 in '03 (of 6)'.
            i = get_number(p, index)
            if p.in_something > 0:
                if p.issue_number_at is None:
                    # TODO: Figure out what to do here if it ever happens
                    p.filename_info["issue_count"] = str(int(t2do.convert(number.val)))
                    p.used_items.append(item)
                    p.used_items.append(number)

                # This is definitely the issue number
                # NOTE(review): get_number() can return None; `i.pos` below would
                # then raise AttributeError -- confirm a recorded issue number
                # always implies a preceding Number token here.
                elif p.issue_number_at == i.pos:
                    p.filename_info["issue_count"] = str(int(t2do.convert(number.val)))
                    p.used_items.append(item)
                    p.used_items.append(number)

                # This is not for the issue number it is not in either the issue or the title, assume it is the volume number and count
                elif p.issue_number_at != i.pos and i not in p.series_parts and i not in p.title_parts:
                    p.filename_info["volume"] = i.val
                    p.filename_info["volume_count"] = str(int(t2do.convert(number.val)))
                    p.used_items.append(i)
                    p.used_items.append(item)
                    p.used_items.append(number)
                else:
                    # TODO: Figure out what to do here if it ever happens
                    pass
            else:
                # Lets 'The Wrath of Foobar-Man, Part 1 of 2' parse correctly as the title
                if i is not None:
                    p.pos = [ind for ind, x in enumerate(p.input) if x == i][0]

    if not p.in_something:
        return parse_series
    return parse
|
||||||
|
|
||||||
|
|
||||||
|
# Gets 03 in '03 of 6'
def get_number(p: Parser, index: int):
    """Scan backwards from *index* for the number an 'of <count>' refers to.

    Opening brackets and spaces are skipped over; the first Number item
    encountered is returned.  Any other token type means there is no such
    number, so None is returned.  E.g. finds 03 in '03 (of 6)' or
    '008 title 03 (of 6)'.
    """
    skippable = (
        filenamelexer.ItemType.LeftParen,
        filenamelexer.ItemType.LeftBrace,
        filenamelexer.ItemType.LeftSBrace,
        filenamelexer.ItemType.Space,
    )
    # Walk the already-consumed input in reverse order.
    for candidate in reversed(p.input[:index]):
        if candidate.typ in skippable:
            # Irrelevant punctuation/whitespace between the count and its number.
            continue
        if candidate.typ == filenamelexer.ItemType.Number:
            # Found the number this count belongs to.
            return candidate
        # Anything else: give up looking for the number this count belongs to.
        break

    return None
|
||||||
|
|
||||||
|
|
||||||
|
def join_title(lst: list[filenamelexer.Item]):
    """Concatenate item values into a display string.

    A single space is inserted between items except: after an operator,
    after an honorific followed by a dot, before an operator or symbol, and
    at the very end.  A trailing comma item is dropped entirely.
    """
    pieces: list[str] = []
    last = len(lst) - 1
    for idx, itm in enumerate(lst):
        # We ignore commas on the end
        if idx == last and itm.val == ",":
            continue
        pieces.append(itm.val)  # Add the next item

        # No space after operators
        if itm.typ == filenamelexer.ItemType.Operator:
            continue
        # No trailing space
        if idx == last:
            continue
        nxt = lst[idx + 1]
        # No space after honorifics with a dot
        if itm.typ == filenamelexer.ItemType.Honorific and nxt.typ == filenamelexer.ItemType.Dot:
            continue
        # No space if the next item is an operator or symbol
        if nxt.typ in (filenamelexer.ItemType.Operator, filenamelexer.ItemType.Symbol):
            continue

        # Add a space
        pieces.append(" ")

    return "".join(pieces)
|
||||||
|
|
||||||
|
|
||||||
|
def Parse(
    lexer_result: list[filenamelexer.Item],
    first_is_alt=False,
    remove_c2c=False,
    remove_fcbd=False,
    remove_publisher=False,
):
    """Run the filename parser over *lexer_result* and return the finished Parser.

    The flags are forwarded verbatim to the Parser: first_is_alt treats the
    leading number as an alternate issue; the remove_* flags strip C2C/FCBD
    markers and known publishers from the parsed parts.
    """
    parser = Parser(
        lexer_result=lexer_result,
        first_is_alt=first_is_alt,
        remove_c2c=remove_c2c,
        remove_fcbd=remove_fcbd,
        remove_publisher=remove_publisher,
    )
    parser.run()
    return parser
|
||||||
|
Loading…
x
Reference in New Issue
Block a user