215 lines
5.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Parsing regexes."""
import re
from types import MappingProxyType
def re_compile(exp, parenthify=False):
"""Compile regex with options."""
if parenthify:
exp = r"\(" + exp + r"\)"
return re.compile(exp, flags=re.IGNORECASE)
PUBLISHERS_UNAMBIGUOUS = (
r"Abrams ComicArts",
r"BOOM! Studios",
r"DC(\sComics)?",
r"Dark Horse Comics",
r"Drawn & Quarterly",
r"Dynamite Entertainment",
r"IDW Publishing",
r"Icon Comics",
r"Kodansha",
r"Oni Press",
r"Pantheon Books",
r"SLG Publishing",
r"SelfMadeHero",
r"Titan Comics",
)
PUBLISHERS_AMBIGUOUS = (
r"Marvel",
r"Heavy Metal",
r"Epic",
r"Image",
r"Mirage",
)
ORIGINAL_FORMAT_PATTERNS = (
r"Anthology",
r"(One|1)[-\s]Shot",
r"Annual",
r"Annotation[s]?",
r"Box[-\s]Set",
r"Digital",
r"Director[']?s\sCut", # noqa: RUF001
r"Giant([-\s]Size(d)?)?",
r"Graphic\sNovel",
r"GN",
r"Hard[-\s]?Cover",
r"HC",
r"HD-Upscaled",
r"King[-\s]Size(d)?",
r"Magazine",
r"Manga?",
r"Omnibus",
r"PDF([-\s]Rip)?",
r"Preview",
r"Prologue",
r"Scanlation",
r"Script",
r"Sketch",
r"TPB",
r"Trade[-\s]Paper[-\s]?Back",
r"Web([-\s]?(Comic|Rip))?",
)
MONTHS = (
r"Jan(uary)?",
r"Feb(ruary)?",
r"Mar(ch)?",
r"Apr(il)?",
r"May",
r"Jun(e)?",
r"Jul(y)?",
r"Aug(ust)?",
r"Sep(tember)?",
r"Oct(ober)?",
r"Nov(ember)?",
r"Dec(ember)?",
)
TOKEN_DELIMETER = r"/"
# CLEAN
_TOKEN_DIVIDERS_RE = re_compile(r":")
_SPACE_EQUIVALENT_RE = re_compile(r"_")
_EXTRA_SPACES_RE = re_compile(r"\s\s+")
_LEFT_PAREN_EQUIVALENT_RE = re_compile(r"\[")
_RIGHT_PAREN_EQUIVALENT_RE = re_compile(r"\]")
_DOUBLE_UNDERSCORE_RE = re_compile(r"__(.*)__")
REGEX_SUBS: MappingProxyType[re.Pattern, tuple[str, int]] = MappingProxyType(
{
_DOUBLE_UNDERSCORE_RE: (r"(\1)", 0),
_TOKEN_DIVIDERS_RE: (TOKEN_DELIMETER, 1),
_SPACE_EQUIVALENT_RE: (r" ", 0),
_EXTRA_SPACES_RE: (r" ", 0),
_LEFT_PAREN_EQUIVALENT_RE: (r"(", 0),
_RIGHT_PAREN_EQUIVALENT_RE: (r")", 0),
}
)
### DATES
_YEAR_RE_EXP = r"(?P<year>[12]\d{3})"
_MONTH_ALPHA_RE_EXP = r"(" + "(?P<alpha_month>" + r"|".join(MONTHS) + r")\.?" r")"
_MONTH_NUMERIC_RE_EXP = r"(?P<month>0?\d|1[0-2]?)"
_MONTH_RE_EXP = r"(" + _MONTH_ALPHA_RE_EXP + r"|" + _MONTH_NUMERIC_RE_EXP + r")"
_ALPHA_MONTH_RANGE = (
r"\b"
+ r"("
+ r"|".join(MONTHS)
+ r")"
+ r"("
+ r"\.?-"
+ r"("
+ r"|".join(MONTHS)
+ r")"
+ r")\b"
)
print(_ALPHA_MONTH_RANGE)
ALPHA_MONTH_RANGE_RE = re_compile(_ALPHA_MONTH_RANGE)
_DAY_RE_EXP = r"(?P<day>([0-2]?\d|(3)[0-1]))"
_DATE_DELIM = r"[-\s]+"
_MONTH_FIRST_DATE_RE_EXP = (
r"((\b|\(?)"
# Month
+ _MONTH_RE_EXP
# Day
+ r"("
+ _DATE_DELIM
+ _DAY_RE_EXP
+ r")?"
# Year
+ r"[,]?"
+ _DATE_DELIM
+ _YEAR_RE_EXP
+ r"(\)?|\b))"
)
_YEAR_FIRST_DATE_RE_EXP = (
r"(\b\(?"
+ _YEAR_RE_EXP
+ _DATE_DELIM
+ _MONTH_RE_EXP
+ _DATE_DELIM
+ _DAY_RE_EXP
+ r"\b\)?)"
)
MONTH_FIRST_DATE_RE = re_compile(_MONTH_FIRST_DATE_RE_EXP)
YEAR_FIRST_DATE_RE = re_compile(_YEAR_FIRST_DATE_RE_EXP)
YEAR_TOKEN_RE = re_compile(_YEAR_RE_EXP, parenthify=True)
YEAR_END_RE = re_compile(_YEAR_RE_EXP + r"\/|$")
# PAREN GROUPS
_OF_PATTERNS = r"|".join(ORIGINAL_FORMAT_PATTERNS)
_ORIGINAL_FORMAT_RE_EXP = r"(?P<original_format>" + _OF_PATTERNS + r")"
_SCAN_INFO_RE_EXP = r"(?P<scan_info>[^()]*)"
_ORIGINAL_FORMAT_SCAN_INFO_RE_EXP = (
_ORIGINAL_FORMAT_RE_EXP + r"\s*[\(:-]" + _SCAN_INFO_RE_EXP # + r")?"
)
ORIGINAL_FORMAT_SCAN_INFO_RE = re_compile(
_ORIGINAL_FORMAT_SCAN_INFO_RE_EXP, parenthify=True
)
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE = re_compile(
r"\(" + _ORIGINAL_FORMAT_RE_EXP + r"\).*\(" + _SCAN_INFO_RE_EXP + r"\)"
)
SCAN_INFO_SECONDARY_RE = re_compile(r"\b(?P<secondary_scan_info>c2c)\b")
# ISSUE
_ISSUE_RE_EXP = r"(?P<issue>\w*(½|\d+)[\.\d+]*\w*)"
_ISSUE_COUNT_RE_EXP = r"\(of\s*(?P<issue_count>\d+)\)"
ISSUE_NUMBER_RE = re_compile(
r"(\(?#" + _ISSUE_RE_EXP + r"\)?)" + r"(\W*" + _ISSUE_COUNT_RE_EXP + r")?"
)
ISSUE_WITH_COUNT_RE = re_compile(
r"(\(?" + _ISSUE_RE_EXP + r"\)?" + r"\W*" + _ISSUE_COUNT_RE_EXP + r")"
)
ISSUE_END_RE = re_compile(r"([\/\s]\(?" + _ISSUE_RE_EXP + r"\)?(\/|$))")
ISSUE_BEGIN_RE = re_compile(r"((^|\/)\(?" + _ISSUE_RE_EXP + r"\)?[\/|\s])")
# TODO unused
ISSUE_ANYWHERE_RE = re_compile(r"\b(\(?" + _ISSUE_RE_EXP + r"\)?)\b")
# Volume
_VOLUME_COUNT_RE_EXP = r"\(of\s*(?P<volume_count>\d+)\)"
VOLUME_RE = re_compile(
r"(" + r"(?:v(?:ol(?:ume)?)?\.?)\s*(?P<volume>\d+)"
r"(\W*" + _VOLUME_COUNT_RE_EXP + r")?" + r")"
)
VOLUME_WITH_COUNT_RE = re_compile(
r"(\(?" + r"(?P<volume>\d+)" + r"\)?" + r"\W*" + _VOLUME_COUNT_RE_EXP + r")"
)
BOOK_VOLUME_RE = re_compile(r"(?P<title>" + r"book\s*(?P<volume>\d+)" + r")")
# Publisher
_PUBLISHER_UNAMBIGUOUS_RE_EXP = (
r"(\b(?P<publisher>" + r"|".join(PUBLISHERS_UNAMBIGUOUS) + r")\b)"
)
_PUBLISHER_AMBIGUOUS_RE_EXP = (
r"(\b(?P<publisher>" + r"|".join(PUBLISHERS_AMBIGUOUS) + r")\b)"
)
PUBLISHER_UNAMBIGUOUS_TOKEN_RE = re_compile(
r"(^|\/)" + _PUBLISHER_UNAMBIGUOUS_RE_EXP + r"($|\/)"
)
PUBLISHER_AMBIGUOUS_TOKEN_RE = re_compile(
r"(^|\/)" + _PUBLISHER_AMBIGUOUS_RE_EXP + r"($|\/)"
)
PUBLISHER_UNAMBIGUOUS_RE = re_compile(_PUBLISHER_UNAMBIGUOUS_RE_EXP)
PUBLISHER_AMBIGUOUS_RE = re_compile(_PUBLISHER_AMBIGUOUS_RE_EXP)
# LONG STRINGS
REMAINING_GROUP_RE = re_compile(r"^[^\(].*[^\)]")
NON_NUMBER_DOT_RE = re_compile(r"(\D)\.(\D)")