sophisticated date parsing

This commit is contained in:
AJ Slater 2024-02-20 17:14:03 -08:00
parent 93ac5760a0
commit 120feab7af
4 changed files with 147 additions and 55 deletions

View File

@ -4,6 +4,7 @@
- Titles are now parsed only if they occur after the series token AND after
either issue, year or volume.
- A more sophisticated date parser.
- Issue numbers that lead with a '#' character may start with alphabetical
characters.
- If volume is parsed, but issue number is not, the issue number is copied from

View File

@ -1,5 +1,6 @@
"""Parse comic book archive names using the simple 'parse' parser."""
from pprint import pprint
from calendar import month_abbr
from copy import copy
from pathlib import Path
from re import Pattern
@ -7,6 +8,7 @@ from typing import Any
from comicfn2dict.regex import (
NON_NUMBER_DOT_RE,
YEAR_FIRST_DATE_RE,
EXTRA_SPACES_RE,
ISSUE_ANYWHERE_RE,
ISSUE_COUNT_RE,
@ -18,14 +20,14 @@ from comicfn2dict.regex import (
ORIGINAL_FORMAT_SCAN_INFO_RE,
REMAINING_GROUP_RE,
VOLUME_RE,
YEAR_BEGIN_RE,
YEAR_END_RE,
MONTH_FIRST_DATE_RE,
YEAR_TOKEN_RE,
)
# Metadata keys for the leftover name groups.
# NOTE(review): presumably filled from REMAINING_GROUP_RE matches - confirm.
_REMAINING_GROUP_KEYS = ("series", "title")
# Metadata keys considered when checking title position.
# NOTE(review): presumably a title must follow one of these - confirm in
# _is_title_in_position.
_TITLE_PRECEDING_KEYS = ("issue", "year", "volume")
# Separator used when re-joining the remaining tokens into the unparsed path.
_TOKEN_DELIMETER = "/"
# Metadata keys that together make up a complete date; _parse_dates checks
# which of them are still missing.
_DATE_KEYS = frozenset({"year", "month", "day"})
class ComicFilenameParser:
@ -69,7 +71,7 @@ class ComicFilenameParser:
value = value.strip("'").strip('"').strip()
return value
def _parse_item(
def _parse_items(
self,
regex: Pattern,
require_all: bool = False,
@ -95,6 +97,30 @@ class ComicFilenameParser:
parts.append(token)
self._unparsed_path = _TOKEN_DELIMETER.join(parts)
def _alpha_month_to_numeric(self):
"""Translate alpha_month to numeric month."""
if alpha_month := self.metadata.get("alpha_month", ""):
alpha_month = alpha_month.capitalize() # type: ignore
for index, abbr in enumerate(month_abbr):
if abbr and alpha_month.startswith(abbr):
month = f"{index:02d}"
self.metadata["month"] = month
break
def _parse_dates(self):
    """Extract date fields, trying the most detailed schemes first.

    Order of attempts:
      1. a month-first date,
      2. a year-first date, only while some of year/month/day is missing,
      3. a bare year token, only while no year has been found.
    After each of the first two attempts an alphabetic month, if one was
    captured, is converted to its numeric form.
    """
    self._parse_items(MONTH_FIRST_DATE_RE)
    self._alpha_month_to_numeric()

    date_is_complete = _DATE_KEYS.issubset(self.metadata.keys())
    if not date_is_complete:
        self._parse_items(YEAR_FIRST_DATE_RE)
        self._alpha_month_to_numeric()

    if "year" in self.metadata:
        return
    # Last resort: a standalone year.
    self._parse_items(YEAR_TOKEN_RE)
def _is_title_in_position(self, value):
"""Does the title come after series and one other token if they exist."""
title_index = self.path.find(value)
@ -171,35 +197,28 @@ class ComicFilenameParser:
self._log_progress("CLEANED")
# Parse paren tokens
self._parse_item(ISSUE_COUNT_RE)
self._parse_item(YEAR_TOKEN_RE)
self._parse_item(
self._parse_items(ISSUE_COUNT_RE)
self._parse_dates()
self._parse_items(
ORIGINAL_FORMAT_SCAN_INFO_RE,
require_all=True,
)
if "original_format" not in self.metadata:
self._parse_item(
self._parse_items(
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
)
self._log_progress("AFTER PAREN TOKENS")
# Parse regular tokens
self._parse_item(VOLUME_RE)
self._parse_item(ISSUE_NUMBER_RE)
self._parse_items(VOLUME_RE)
self._parse_items(ISSUE_NUMBER_RE)
self._log_progress("AFTER REGULAR TOKENS")
# Pickup year if not gotten.
if "year" not in self.metadata:
self._parse_item(YEAR_BEGIN_RE)
if "year" not in self.metadata:
self._parse_item(YEAR_END_RE)
self._log_progress("AFTER YEAR PICKUP")
# Pickup issue if it's a standalone token
if "issue" not in self.metadata:
self._parse_item(ISSUE_END_RE)
self._parse_items(ISSUE_END_RE)
if "issue" not in self.metadata:
self._parse_item(ISSUE_BEGIN_RE)
self._parse_items(ISSUE_BEGIN_RE)
self._log_progress("AFTER ISSUE PICKUP")
@ -210,7 +229,7 @@ class ComicFilenameParser:
# Final try for issue number.
if "issue" not in self.metadata:
# TODO is this useful?
self._parse_item(ISSUE_ANYWHERE_RE)
self._parse_items(ISSUE_ANYWHERE_RE)
self._log_progress("AFTER ISSUE PICKUP")
# Copy volume into issue if it's all we have.

View File

@ -38,16 +38,64 @@ ORIGINAL_FORMAT_PATTERNS = (
r"Web([-\s]?(Comic|Rip))?",
)
# Regex fragments for English month names: the three-letter abbreviation with
# the rest of the name optional.
MONTHS = (
    r"Jan(uary)?",
    r"Feb(ruary)?",
    r"Mar(ch)?",
    r"Apr(il)?",
    r"May",
    r"Jun(e)?",
    r"Jul(y)?",
    r"Aug(ust)?",
    # Accept "Sep" as well as "Sept" and "September".
    r"Sep(t(ember)?)?",
    r"Oct(ober)?",
    r"Nov(ember)?",
    r"Dec(ember)?",
)
# CLEAN
# Pattern text matches a single underscore or plus sign.
NON_SPACE_DIVIDER_RE = re_compile(r"[_\+]")
# Pattern text matches a run of two or more whitespace characters.
EXTRA_SPACES_RE = re_compile(r"\s\s+")
### DATES
# Four-digit year beginning with 1 or 2.
_YEAR_RE_EXP = r"(?P<year>[12]\d{3})"
# Month name built from MONTHS, optionally followed by a period.
_MONTH_ALPHA_RE_EXP = r"(?P<alpha_month>" + r"|".join(MONTHS) + r")\.?"
# Numeric month 1-12 with an optional leading zero. Two-digit months are tried
# first; "0" and "00" are rejected because they are not months.
_MONTH_NUMERIC_RE_EXP = r"(?P<month>1[0-2]|0?[1-9])"
_MONTH_RE_EXP = r"(" + _MONTH_ALPHA_RE_EXP + r"|" + _MONTH_NUMERIC_RE_EXP + r")"
# Day of month 1-31 with an optional leading zero; "0" and "00" are rejected.
_DAY_RE_EXP = r"(?P<day>3[01]|[12]\d|0?[1-9])"
# One or more hyphens or whitespace characters between date parts.
_DATE_DELIM = r"[-\s]+"
# Month, optional day, then year; optionally wrapped in parentheses.
_MONTH_FIRST_DATE_RE_EXP = (
    # Optional opening paren, then a word boundary so the month cannot start
    # in the middle of a longer number or word (e.g. the "0" of "100 2001").
    r"(\(?\b"
    # Month
    + _MONTH_RE_EXP
    # Day
    + r"("
    + _DATE_DELIM
    + _DAY_RE_EXP
    + r")?"
    # Year
    + r"[,]?"
    + _DATE_DELIM
    + _YEAR_RE_EXP
    # The year must end on a word boundary; a closing paren is consumed too.
    + r"\b\)?)"
)
# Year, month, then day (all required); optionally wrapped in parentheses.
_YEAR_FIRST_DATE_RE_EXP = (
    # The optional paren comes before the boundary so it can actually be
    # consumed together with the closing one.
    r"(\(?\b"
    + _YEAR_RE_EXP
    + _DATE_DELIM
    + _MONTH_RE_EXP
    + _DATE_DELIM
    + _DAY_RE_EXP
    + r"\b\)?)"
)
MONTH_FIRST_DATE_RE = re_compile(_MONTH_FIRST_DATE_RE_EXP)
YEAR_FIRST_DATE_RE = re_compile(_YEAR_FIRST_DATE_RE_EXP)
YEAR_TOKEN_RE = re_compile(_YEAR_RE_EXP, parenthify=True)
# PAREN GROUPS
ISSUE_COUNT_RE = re_compile(r"of\s*(?P<issue_count>\d+)", parenthify=True)
_YEAR_RE_EXP = r"(?P<year>[12]\d{3})"
YEAR_TOKEN_RE = re_compile(_YEAR_RE_EXP, parenthify=True)
YEAR_BEGIN_RE = re_compile(r"^" + _YEAR_RE_EXP + r"\b")
YEAR_END_RE = re_compile(r"\b" + _YEAR_RE_EXP + r"$")
_OF_PATTERNS = r"|".join(ORIGINAL_FORMAT_PATTERNS)
_ORIGINAL_FORMAT_RE_EXP = r"(?P<original_format>" + _OF_PATTERNS + r")"
_SCAN_INFO_RE_EXP = r"(?P<scan_info>[^()]*)"

View File

@ -329,6 +329,30 @@ FNS.update(
"volume": "03",
"year": "2013",
},
# CT catches the year
"Marvel Previews #002 (January 2022).cbr": {
"ext": "cbr",
"issue": "002",
"series": "Marvel Previews",
"alpha_month": "January",
"month": "01",
"year": "2022",
},
"Test Numeric Year #2 2001-02-24.cbz": {
"ext": "cbz",
"issue": "2",
"series": "Test Numeric Year",
"year": "2002",
"month": "02",
"day": "24",
},
"Test Month First Date 02-24-2001.cbz": {
"ext": "cbz",
"series": "Test Month First Date",
"year": "2002",
"month": "02",
"day": "24",
},
}
)
LATER = {
@ -348,17 +372,13 @@ LATER = {
"volume": "1957",
"year": "1969",
},
}
# Not examined yet.
FNS.update(
{
# CT has extra processing to re-attach the year in this case
"Blade Runner Free Comic Book Day 2021 (2021).cbr": {
"ext": "cbr",
"series": "Blade Runner Free Comic Book Day 2021",
"year": "2021",
}, # CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series)
},
# CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series)
"Bloodshot Book 03 (2020).cbr": {
"ext": "cbr",
"issue": "03",
@ -366,7 +386,8 @@ FNS.update(
"title": "Book 03",
"volume": "03",
"year": "2020",
}, # CT checks for the following '(of 06)' after the '03' and marks it as the volume
},
# CT checks for the following '(of 06)' after the '03' and marks it as the volume
"Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021).cbr": {
"ext": "cbr",
"issue": "008",
@ -375,13 +396,13 @@ FNS.update(
"volume": "03",
"year": "2021",
"volume_count": "06",
}, # CT catches the year
"Marvel Previews #002 (January 2022).cbr": {
"ext": "cbr",
"issue": "002",
"series": "Marvel Previews",
"year": "2022",
}, # c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder
},
}
# Not examined yet.
FNS.update(
{
# c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder
"Marvel Two In One V1 #090 c2c.cbr": {
"ext": "cbr",
"issue": "090",
@ -397,20 +418,23 @@ FNS.update(
"title": "digital",
"publisher": "DC",
"year": "1951",
}, # CT notices that this is a full date, CT doesn't actually return the month or day though just removes it
},
# CT notices that this is a full date. CT doesn't actually return the month or day, though; it just removes them.
"X-Men, 2021-08-04 (#02).cbz": {
"ext": "cbz",
"issue": "02",
"series": "X-Men",
"year": "2021",
}, # CT treats ':' the same as '-' but here the ':' is attached to 'Now' which CT sees as a title separation
},
# CT treats ':' the same as '-' but here the ':' is attached to 'Now' which CT sees as a title separation
"Cory Doctorow's Futuristic Tales of the Here and Now: Anda's Game #001 (2007).cbz": {
"ext": "cbz",
"issue": "001",
"series": "Cory Doctorow's Futuristic Tales of the Here and Now",
"title": "Anda's Game",
"year": "2007",
}, # This is a contrived test case. I've never seen this I just wanted to handle it with my parser
},
# This is a contrived test case. I've never seen this; I just wanted to handle it with my parser.
"Cory Doctorow's Futuristic Tales of the Here and Now #0.0.1 (2007).cbz": {
"ext": "cbz",
"issue": "0.1",