sophisticated date parsing

This commit is contained in:
AJ Slater 2024-02-20 17:14:03 -08:00
parent 93ac5760a0
commit 120feab7af
4 changed files with 147 additions and 55 deletions

View File

@ -4,6 +4,7 @@
- Titles are now parsed only if they occur after the series token AND after - Titles are now parsed only if they occur after the series token AND after
either issue, year or volume. either issue, year or volume.
- A more sophisticated date parser.
- Issue numbers that lead with a '#' character may start with alphabetical - Issue numbers that lead with a '#' character may start with alphabetical
characters. characters.
- If volume is parsed, but issue number is not, the issue number is copied from - If volume is parsed, but issue number is not, the issue number is copied from

View File

@ -1,5 +1,6 @@
"""Parse comic book archive names using the simple 'parse' parser.""" """Parse comic book archive names using the simple 'parse' parser."""
from pprint import pprint from pprint import pprint
from calendar import month_abbr
from copy import copy from copy import copy
from pathlib import Path from pathlib import Path
from re import Pattern from re import Pattern
@ -7,6 +8,7 @@ from typing import Any
from comicfn2dict.regex import ( from comicfn2dict.regex import (
NON_NUMBER_DOT_RE, NON_NUMBER_DOT_RE,
YEAR_FIRST_DATE_RE,
EXTRA_SPACES_RE, EXTRA_SPACES_RE,
ISSUE_ANYWHERE_RE, ISSUE_ANYWHERE_RE,
ISSUE_COUNT_RE, ISSUE_COUNT_RE,
@ -18,14 +20,14 @@ from comicfn2dict.regex import (
ORIGINAL_FORMAT_SCAN_INFO_RE, ORIGINAL_FORMAT_SCAN_INFO_RE,
REMAINING_GROUP_RE, REMAINING_GROUP_RE,
VOLUME_RE, VOLUME_RE,
YEAR_BEGIN_RE, MONTH_FIRST_DATE_RE,
YEAR_END_RE,
YEAR_TOKEN_RE, YEAR_TOKEN_RE,
) )
_REMAINING_GROUP_KEYS = ("series", "title") _REMAINING_GROUP_KEYS = ("series", "title")
_TITLE_PRECEDING_KEYS = ("issue", "year", "volume") _TITLE_PRECEDING_KEYS = ("issue", "year", "volume")
_TOKEN_DELIMETER = "/" _TOKEN_DELIMETER = "/"
_DATE_KEYS = frozenset({"year", "month", "day"})
class ComicFilenameParser: class ComicFilenameParser:
@ -69,7 +71,7 @@ class ComicFilenameParser:
value = value.strip("'").strip('"').strip() value = value.strip("'").strip('"').strip()
return value return value
def _parse_item( def _parse_items(
self, self,
regex: Pattern, regex: Pattern,
require_all: bool = False, require_all: bool = False,
@ -95,6 +97,30 @@ class ComicFilenameParser:
parts.append(token) parts.append(token)
self._unparsed_path = _TOKEN_DELIMETER.join(parts) self._unparsed_path = _TOKEN_DELIMETER.join(parts)
def _alpha_month_to_numeric(self):
"""Translate alpha_month to numeric month."""
if alpha_month := self.metadata.get("alpha_month", ""):
alpha_month = alpha_month.capitalize() # type: ignore
for index, abbr in enumerate(month_abbr):
if abbr and alpha_month.startswith(abbr):
month = f"{index:02d}"
self.metadata["month"] = month
break
def _parse_dates(self):
    """Extract year, month and day by trying each date scheme in turn.

    The month-first pattern is tried before the year-first pattern, and
    any alphabetic month found is converted to its numeric form after each
    attempt. The second pattern is skipped once year, month and day are
    all present. If no year was found by either, a bare year token is
    looked for.
    """
    for date_re in (MONTH_FIRST_DATE_RE, YEAR_FIRST_DATE_RE):
        self._parse_items(date_re)
        self._alpha_month_to_numeric()
        if _DATE_KEYS.issubset(self.metadata):
            # Complete date found; no need to try the remaining scheme.
            break

    # Fall back to a standalone year token.
    if "year" not in self.metadata:
        self._parse_items(YEAR_TOKEN_RE)
def _is_title_in_position(self, value): def _is_title_in_position(self, value):
"""Does the title come after series and one other token if they exist.""" """Does the title come after series and one other token if they exist."""
title_index = self.path.find(value) title_index = self.path.find(value)
@ -171,35 +197,28 @@ class ComicFilenameParser:
self._log_progress("CLEANED") self._log_progress("CLEANED")
# Parse paren tokens # Parse paren tokens
self._parse_item(ISSUE_COUNT_RE) self._parse_items(ISSUE_COUNT_RE)
self._parse_item(YEAR_TOKEN_RE) self._parse_dates()
self._parse_item( self._parse_items(
ORIGINAL_FORMAT_SCAN_INFO_RE, ORIGINAL_FORMAT_SCAN_INFO_RE,
require_all=True, require_all=True,
) )
if "original_format" not in self.metadata: if "original_format" not in self.metadata:
self._parse_item( self._parse_items(
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE, ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
) )
self._log_progress("AFTER PAREN TOKENS") self._log_progress("AFTER PAREN TOKENS")
# Parse regular tokens # Parse regular tokens
self._parse_item(VOLUME_RE) self._parse_items(VOLUME_RE)
self._parse_item(ISSUE_NUMBER_RE) self._parse_items(ISSUE_NUMBER_RE)
self._log_progress("AFTER REGULAR TOKENS") self._log_progress("AFTER REGULAR TOKENS")
# Pickup year if not gotten.
if "year" not in self.metadata:
self._parse_item(YEAR_BEGIN_RE)
if "year" not in self.metadata:
self._parse_item(YEAR_END_RE)
self._log_progress("AFTER YEAR PICKUP")
# Pickup issue if it's a standalone token # Pickup issue if it's a standalone token
if "issue" not in self.metadata: if "issue" not in self.metadata:
self._parse_item(ISSUE_END_RE) self._parse_items(ISSUE_END_RE)
if "issue" not in self.metadata: if "issue" not in self.metadata:
self._parse_item(ISSUE_BEGIN_RE) self._parse_items(ISSUE_BEGIN_RE)
self._log_progress("AFTER ISSUE PICKUP") self._log_progress("AFTER ISSUE PICKUP")
@ -210,7 +229,7 @@ class ComicFilenameParser:
# Final try for issue number. # Final try for issue number.
if "issue" not in self.metadata: if "issue" not in self.metadata:
# TODO is this useful? # TODO is this useful?
self._parse_item(ISSUE_ANYWHERE_RE) self._parse_items(ISSUE_ANYWHERE_RE)
self._log_progress("AFTER ISSUE PICKUP") self._log_progress("AFTER ISSUE PICKUP")
# Copy volume into issue if it's all we have. # Copy volume into issue if it's all we have.

View File

@ -38,16 +38,64 @@ ORIGINAL_FORMAT_PATTERNS = (
r"Web([-\s]?(Comic|Rip))?", r"Web([-\s]?(Comic|Rip))?",
) )
# Regex fragments for English month names, each accepting the common
# abbreviation and the full name. Every entry contains at most one
# capturing group.
MONTHS = (
    r"Jan(uary)?",
    r"Feb(ruary)?",
    r"Mar(ch)?",
    r"Apr(il)?",
    r"May",
    r"Jun(e)?",
    r"Jul(y)?",
    r"Aug(ust)?",
    # "Sep" is the standard three-letter abbreviation; "Sept" is also common.
    r"Sep(t|tember)?",
    r"Oct(ober)?",
    r"Nov(ember)?",
    r"Dec(ember)?",
)
# CLEAN # CLEAN
NON_SPACE_DIVIDER_RE = re_compile(r"[_\+]") NON_SPACE_DIVIDER_RE = re_compile(r"[_\+]")
EXTRA_SPACES_RE = re_compile(r"\s\s+") EXTRA_SPACES_RE = re_compile(r"\s\s+")
### DATES
# Four-digit year beginning with 1 or 2.
_YEAR_RE_EXP = r"(?P<year>[12]\d{3})"
# Month name or abbreviation from MONTHS, optionally followed by a period.
_MONTH_ALPHA_RE_EXP = r"(?P<alpha_month>" + r"|".join(MONTHS) + r")\.?"
# Numeric month 1-12 with an optional leading zero. Zero ("0" / "00") is
# deliberately excluded so a lone "0" (e.g. an issue number) that precedes a
# year is not consumed as a month.
_MONTH_NUMERIC_RE_EXP = r"(?P<month>0?[1-9]|1[0-2])"
_MONTH_RE_EXP = r"(" + _MONTH_ALPHA_RE_EXP + r"|" + _MONTH_NUMERIC_RE_EXP + r")"
# Day of month 1-31 with an optional leading zero; zero is excluded for the
# same reason as the month. The capturing groups are kept as they were so
# numbered-group positions in the composed patterns do not shift.
_DAY_RE_EXP = r"(?P<day>(0?[1-9]|[12]\d|(3)[0-1]))"
# Date parts are separated by runs of hyphens and/or whitespace.
_DATE_DELIM = r"[-\s]+"
# Month [day][,] year -- e.g. "January 2022" or "02-24-2001".
_MONTH_FIRST_DATE_RE_EXP = (
    r"((\b|\(?)"
    # Month
    + _MONTH_RE_EXP
    # Day
    + r"("
    + _DATE_DELIM
    + _DAY_RE_EXP
    + r")?"
    # Year
    + r"[,]?"
    + _DATE_DELIM
    + _YEAR_RE_EXP
    + r"(\)?|\b))"
)
# Year month day -- e.g. "2001-02-24". All three parts are required.
_YEAR_FIRST_DATE_RE_EXP = (
    r"(\b\(?"
    + _YEAR_RE_EXP
    + _DATE_DELIM
    + _MONTH_RE_EXP
    + _DATE_DELIM
    + _DAY_RE_EXP
    + r"\b\)?)"
)
MONTH_FIRST_DATE_RE = re_compile(_MONTH_FIRST_DATE_RE_EXP)
YEAR_FIRST_DATE_RE = re_compile(_YEAR_FIRST_DATE_RE_EXP)
# NOTE(review): `parenthify` is handled by the project's re_compile helper,
# presumably wrapping the pattern in parentheses -- confirm against the helper.
YEAR_TOKEN_RE = re_compile(_YEAR_RE_EXP, parenthify=True)
# PAREN GROUPS # PAREN GROUPS
ISSUE_COUNT_RE = re_compile(r"of\s*(?P<issue_count>\d+)", parenthify=True) ISSUE_COUNT_RE = re_compile(r"of\s*(?P<issue_count>\d+)", parenthify=True)
_YEAR_RE_EXP = r"(?P<year>[12]\d{3})"
YEAR_TOKEN_RE = re_compile(_YEAR_RE_EXP, parenthify=True)
YEAR_BEGIN_RE = re_compile(r"^" + _YEAR_RE_EXP + r"\b")
YEAR_END_RE = re_compile(r"\b" + _YEAR_RE_EXP + r"$")
_OF_PATTERNS = r"|".join(ORIGINAL_FORMAT_PATTERNS) _OF_PATTERNS = r"|".join(ORIGINAL_FORMAT_PATTERNS)
_ORIGINAL_FORMAT_RE_EXP = r"(?P<original_format>" + _OF_PATTERNS + r")" _ORIGINAL_FORMAT_RE_EXP = r"(?P<original_format>" + _OF_PATTERNS + r")"
_SCAN_INFO_RE_EXP = r"(?P<scan_info>[^()]*)" _SCAN_INFO_RE_EXP = r"(?P<scan_info>[^()]*)"

View File

@ -329,6 +329,30 @@ FNS.update(
"volume": "03", "volume": "03",
"year": "2013", "year": "2013",
}, },
# CT catches the year
"Marvel Previews #002 (January 2022).cbr": {
"ext": "cbr",
"issue": "002",
"series": "Marvel Previews",
"alpha_month": "January",
"month": "01",
"year": "2022",
},
"Test Numeric Year #2 2001-02-24.cbz": {
"ext": "cbz",
"issue": "2",
"series": "Test Numeric Year",
"year": "2002",
"month": "02",
"day": "24",
},
"Test Month First Date 02-24-2001.cbz": {
"ext": "cbz",
"series": "Test Month First Date",
"year": "2002",
"month": "02",
"day": "24",
},
} }
) )
LATER = { LATER = {
@ -348,40 +372,37 @@ LATER = {
"volume": "1957", "volume": "1957",
"year": "1969", "year": "1969",
}, },
# CT has extra processing to re-attach the year in this case
"Blade Runner Free Comic Book Day 2021 (2021).cbr": {
"ext": "cbr",
"series": "Blade Runner Free Comic Book Day 2021",
"year": "2021",
},
# CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series)
"Bloodshot Book 03 (2020).cbr": {
"ext": "cbr",
"issue": "03",
"series": "Bloodshot",
"title": "Book 03",
"volume": "03",
"year": "2020",
},
# CT checks for the following '(of 06)' after the '03' and marks it as the volume
"Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021).cbr": {
"ext": "cbr",
"issue": "008",
"series": "Elephantmen 2259",
"title": "Simple Truth",
"volume": "03",
"year": "2021",
"volume_count": "06",
},
} }
# Not examined yet. # Not examined yet.
FNS.update( FNS.update(
{ {
# CT has extra processing to re-attach the year in this case # c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder
"Blade Runner Free Comic Book Day 2021 (2021).cbr": {
"ext": "cbr",
"series": "Blade Runner Free Comic Book Day 2021",
"year": "2021",
}, # CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series)
"Bloodshot Book 03 (2020).cbr": {
"ext": "cbr",
"issue": "03",
"series": "Bloodshot",
"title": "Book 03",
"volume": "03",
"year": "2020",
}, # CT checks for the following '(of 06)' after the '03' and marks it as the volume
"Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021).cbr": {
"ext": "cbr",
"issue": "008",
"series": "Elephantmen 2259",
"title": "Simple Truth",
"volume": "03",
"year": "2021",
"volume_count": "06",
}, # CT catches the year
"Marvel Previews #002 (January 2022).cbr": {
"ext": "cbr",
"issue": "002",
"series": "Marvel Previews",
"year": "2022",
}, # c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder
"Marvel Two In One V1 #090 c2c.cbr": { "Marvel Two In One V1 #090 c2c.cbr": {
"ext": "cbr", "ext": "cbr",
"issue": "090", "issue": "090",
@ -397,20 +418,23 @@ FNS.update(
"title": "digital", "title": "digital",
"publisher": "DC", "publisher": "DC",
"year": "1951", "year": "1951",
}, # CT notices that this is a full date, CT doesn't actually return the month or day though just removes it },
# CT notices that this is a full date, CT doesn't actually return the month or day though just removes it
"X-Men, 2021-08-04 (#02).cbz": { "X-Men, 2021-08-04 (#02).cbz": {
"ext": "cbz", "ext": "cbz",
"issue": "02", "issue": "02",
"series": "X-Men", "series": "X-Men",
"year": "2021", "year": "2021",
}, # CT treats ':' the same as '-' but here the ':' is attached to 'Now' which CT sees as a title separation },
# CT treats ':' the same as '-' but here the ':' is attached to 'Now' which CT sees as a title separation
"Cory Doctorow's Futuristic Tales of the Here and Now: Anda's Game #001 (2007).cbz": { "Cory Doctorow's Futuristic Tales of the Here and Now: Anda's Game #001 (2007).cbz": {
"ext": "cbz", "ext": "cbz",
"issue": "001", "issue": "001",
"series": "Cory Doctorow's Futuristic Tales of the Here and Now", "series": "Cory Doctorow's Futuristic Tales of the Here and Now",
"title": "Anda's Game", "title": "Anda's Game",
"year": "2007", "year": "2007",
}, # This is a contrived test case. I've never seen this I just wanted to handle it with my parser },
# This is a contrived test case. I've never seen this I just wanted to handle it with my parser
"Cory Doctorow's Futuristic Tales of the Here and Now #0.0.1 (2007).cbz": { "Cory Doctorow's Futuristic Tales of the Here and Now #0.0.1 (2007).cbz": {
"ext": "cbz", "ext": "cbz",
"issue": "0.1", "issue": "0.1",