sophisticated date parsing
This commit is contained in:
parent
93ac5760a0
commit
120feab7af
1
NEWS.md
1
NEWS.md
@ -4,6 +4,7 @@
|
||||
|
||||
- Titles are now parsed only if they occur after the series token AND after
|
||||
either issue, year or volume.
|
||||
- A more sophisticated date parser.
|
||||
- Issue numbers that lead with a '#' character may start with alphabetical
|
||||
characters.
|
||||
- If volume is parsed, but issue number is not, the issue number is copied from
|
||||
|
@ -1,5 +1,6 @@
|
||||
"""Parse comic book archive names using the simple 'parse' parser."""
|
||||
from pprint import pprint
|
||||
from calendar import month_abbr
|
||||
from copy import copy
|
||||
from pathlib import Path
|
||||
from re import Pattern
|
||||
@ -7,6 +8,7 @@ from typing import Any
|
||||
|
||||
from comicfn2dict.regex import (
|
||||
NON_NUMBER_DOT_RE,
|
||||
YEAR_FIRST_DATE_RE,
|
||||
EXTRA_SPACES_RE,
|
||||
ISSUE_ANYWHERE_RE,
|
||||
ISSUE_COUNT_RE,
|
||||
@ -18,14 +20,14 @@ from comicfn2dict.regex import (
|
||||
ORIGINAL_FORMAT_SCAN_INFO_RE,
|
||||
REMAINING_GROUP_RE,
|
||||
VOLUME_RE,
|
||||
YEAR_BEGIN_RE,
|
||||
YEAR_END_RE,
|
||||
MONTH_FIRST_DATE_RE,
|
||||
YEAR_TOKEN_RE,
|
||||
)
|
||||
|
||||
_REMAINING_GROUP_KEYS = ("series", "title")
|
||||
_TITLE_PRECEDING_KEYS = ("issue", "year", "volume")
|
||||
_TOKEN_DELIMETER = "/"
|
||||
_DATE_KEYS = frozenset({"year", "month", "day"})
|
||||
|
||||
|
||||
class ComicFilenameParser:
|
||||
@ -69,7 +71,7 @@ class ComicFilenameParser:
|
||||
value = value.strip("'").strip('"').strip()
|
||||
return value
|
||||
|
||||
def _parse_item(
|
||||
def _parse_items(
|
||||
self,
|
||||
regex: Pattern,
|
||||
require_all: bool = False,
|
||||
@ -95,6 +97,30 @@ class ComicFilenameParser:
|
||||
parts.append(token)
|
||||
self._unparsed_path = _TOKEN_DELIMETER.join(parts)
|
||||
|
||||
def _alpha_month_to_numeric(self):
|
||||
"""Translate alpha_month to numeric month."""
|
||||
if alpha_month := self.metadata.get("alpha_month", ""):
|
||||
alpha_month = alpha_month.capitalize() # type: ignore
|
||||
for index, abbr in enumerate(month_abbr):
|
||||
if abbr and alpha_month.startswith(abbr):
|
||||
month = f"{index:02d}"
|
||||
self.metadata["month"] = month
|
||||
break
|
||||
|
||||
def _parse_dates(self):
    """Pull date fields out of the filename, richest scheme first.

    Three passes run in order, each through ``self._parse_items``:

    1. ``MONTH_FIRST_DATE_RE``.
    2. ``YEAR_FIRST_DATE_RE``, only when at least one of the keys in
       ``_DATE_KEYS`` is still missing from ``self.metadata``.
    3. ``YEAR_TOKEN_RE``, only when no ``year`` has been found yet.

    After each of the first two passes ``_alpha_month_to_numeric`` is
    called so that a month name is also stored as a numeric ``month``.
    """
    # Pass 1: dates that lead with the month.
    self._parse_items(MONTH_FIRST_DATE_RE)
    self._alpha_month_to_numeric()

    # Pass 2: dates that lead with the year, unless the date is complete.
    have_full_date = _DATE_KEYS.issubset(self.metadata.keys())
    if not have_full_date:
        self._parse_items(YEAR_FIRST_DATE_RE)
        self._alpha_month_to_numeric()

    # Pass 3: fall back to a lone year token.
    if "year" in self.metadata:
        return
    self._parse_items(YEAR_TOKEN_RE)
|
||||
|
||||
def _is_title_in_position(self, value):
|
||||
"""Does the title come after series and one other token if they exist."""
|
||||
title_index = self.path.find(value)
|
||||
@ -171,35 +197,28 @@ class ComicFilenameParser:
|
||||
self._log_progress("CLEANED")
|
||||
|
||||
# Parse paren tokens
|
||||
self._parse_item(ISSUE_COUNT_RE)
|
||||
self._parse_item(YEAR_TOKEN_RE)
|
||||
self._parse_item(
|
||||
self._parse_items(ISSUE_COUNT_RE)
|
||||
self._parse_dates()
|
||||
self._parse_items(
|
||||
ORIGINAL_FORMAT_SCAN_INFO_RE,
|
||||
require_all=True,
|
||||
)
|
||||
if "original_format" not in self.metadata:
|
||||
self._parse_item(
|
||||
self._parse_items(
|
||||
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
|
||||
)
|
||||
self._log_progress("AFTER PAREN TOKENS")
|
||||
|
||||
# Parse regular tokens
|
||||
self._parse_item(VOLUME_RE)
|
||||
self._parse_item(ISSUE_NUMBER_RE)
|
||||
self._parse_items(VOLUME_RE)
|
||||
self._parse_items(ISSUE_NUMBER_RE)
|
||||
self._log_progress("AFTER REGULAR TOKENS")
|
||||
|
||||
# Pickup year if not gotten.
|
||||
if "year" not in self.metadata:
|
||||
self._parse_item(YEAR_BEGIN_RE)
|
||||
if "year" not in self.metadata:
|
||||
self._parse_item(YEAR_END_RE)
|
||||
self._log_progress("AFTER YEAR PICKUP")
|
||||
|
||||
# Pickup issue if it's a standalone token
|
||||
if "issue" not in self.metadata:
|
||||
self._parse_item(ISSUE_END_RE)
|
||||
self._parse_items(ISSUE_END_RE)
|
||||
if "issue" not in self.metadata:
|
||||
self._parse_item(ISSUE_BEGIN_RE)
|
||||
self._parse_items(ISSUE_BEGIN_RE)
|
||||
|
||||
self._log_progress("AFTER ISSUE PICKUP")
|
||||
|
||||
@ -210,7 +229,7 @@ class ComicFilenameParser:
|
||||
# Final try for issue number.
|
||||
if "issue" not in self.metadata:
|
||||
# TODO is this useful?
|
||||
self._parse_item(ISSUE_ANYWHERE_RE)
|
||||
self._parse_items(ISSUE_ANYWHERE_RE)
|
||||
self._log_progress("AFTER ISSUE PICKUP")
|
||||
|
||||
# Copy volume into issue if it's all we have.
|
||||
|
@ -38,16 +38,64 @@ ORIGINAL_FORMAT_PATTERNS = (
|
||||
r"Web([-\s]?(Comic|Rip))?",
|
||||
)
|
||||
|
||||
# Regex fragments for English month names in calendar order. Each accepts
# the abbreviated and the full spelling of the month.
MONTHS = (
    r"Jan(uary)?",
    r"Feb(ruary)?",
    r"Mar(ch)?",
    r"Apr(il)?",
    r"May",
    r"Jun(e)?",
    r"Jul(y)?",
    r"Aug(ust)?",
    # Accept the standard three-letter "Sep" as well as "Sept" and
    # "September". The inner group is non-capturing so the number of
    # capturing groups contributed by this entry stays at one.
    r"Sep(t(?:ember)?)?",
    r"Oct(ober)?",
    r"Nov(ember)?",
    r"Dec(ember)?",
)
|
||||
|
||||
# CLEAN
|
||||
NON_SPACE_DIVIDER_RE = re_compile(r"[_\+]")
|
||||
EXTRA_SPACES_RE = re_compile(r"\s\s+")
|
||||
|
||||
### DATES
# Four-digit year starting with 1 or 2.
_YEAR_RE_EXP = r"(?P<year>[12]\d{3})"
# Month name, optionally followed by an abbreviation dot ("Jan.").
_MONTH_ALPHA_RE_EXP = r"(?P<alpha_month>" + r"|".join(MONTHS) + r")\.?"
# Numeric month 1-12 with an optional leading zero. "0" and "00" are rejected.
# Two-digit months are tried first so "12" is not read as "1".
_MONTH_NUMERIC_RE_EXP = r"(?P<month>1[0-2]|0?[1-9])"
_MONTH_RE_EXP = r"(" + _MONTH_ALPHA_RE_EXP + r"|" + _MONTH_NUMERIC_RE_EXP + r")"
# Day of month 1-31 with an optional leading zero. "0" and "00" are rejected.
_DAY_RE_EXP = r"(?P<day>3[01]|[12]\d|0?[1-9])"
_DATE_DELIM = r"[-\s]+"
# Month-first date: month, optional day, optional comma, year.
# The optional parenthesis sits OUTSIDE the word boundary at both ends, so
# the date must start and end on a token edge and a surrounding "(" / ")"
# pair is consumed together with it.
# NOTE(review): because the day is optional and the month may be numeric,
# "2 2001" (e.g. an issue number followed by a year) also matches as
# month=2, year=2001 - confirm this is acceptable for the parser.
_MONTH_FIRST_DATE_RE_EXP = (
    r"(\(?\b"
    # Month
    + _MONTH_RE_EXP
    # Day
    + r"("
    + _DATE_DELIM
    + _DAY_RE_EXP
    + r")?"
    # Year
    + r"[,]?"
    + _DATE_DELIM
    + _YEAR_RE_EXP
    + r"\b\)?)"
)
# Year-first date: year, month, day - all three required.
# "\(?\b" (not "\b\(?") lets a leading "(" be consumed, matching the
# trailing "\b\)?" so parentheses are removed as a balanced pair.
_YEAR_FIRST_DATE_RE_EXP = (
    r"(\(?\b"
    + _YEAR_RE_EXP
    + _DATE_DELIM
    + _MONTH_RE_EXP
    + _DATE_DELIM
    + _DAY_RE_EXP
    + r"\b\)?)"
)

MONTH_FIRST_DATE_RE = re_compile(_MONTH_FIRST_DATE_RE_EXP)
YEAR_FIRST_DATE_RE = re_compile(_YEAR_FIRST_DATE_RE_EXP)
YEAR_TOKEN_RE = re_compile(_YEAR_RE_EXP, parenthify=True)
|
||||
|
||||
# PAREN GROUPS
|
||||
ISSUE_COUNT_RE = re_compile(r"of\s*(?P<issue_count>\d+)", parenthify=True)
|
||||
_YEAR_RE_EXP = r"(?P<year>[12]\d{3})"
|
||||
YEAR_TOKEN_RE = re_compile(_YEAR_RE_EXP, parenthify=True)
|
||||
YEAR_BEGIN_RE = re_compile(r"^" + _YEAR_RE_EXP + r"\b")
|
||||
YEAR_END_RE = re_compile(r"\b" + _YEAR_RE_EXP + r"$")
|
||||
_OF_PATTERNS = r"|".join(ORIGINAL_FORMAT_PATTERNS)
|
||||
_ORIGINAL_FORMAT_RE_EXP = r"(?P<original_format>" + _OF_PATTERNS + r")"
|
||||
_SCAN_INFO_RE_EXP = r"(?P<scan_info>[^()]*)"
|
||||
|
@ -329,6 +329,30 @@ FNS.update(
|
||||
"volume": "03",
|
||||
"year": "2013",
|
||||
},
|
||||
# CT catches the year
|
||||
"Marvel Previews #002 (January 2022).cbr": {
|
||||
"ext": "cbr",
|
||||
"issue": "002",
|
||||
"series": "Marvel Previews",
|
||||
"alpha_month": "January",
|
||||
"month": "01",
|
||||
"year": "2022",
|
||||
},
|
||||
"Test Numeric Year #2 2001-02-24.cbz": {
|
||||
"ext": "cbz",
|
||||
"issue": "2",
|
||||
"series": "Test Numeric Year",
|
||||
"year": "2002",
|
||||
"month": "02",
|
||||
"day": "24",
|
||||
},
|
||||
"Test Month First Date 02-24-2001.cbz": {
|
||||
"ext": "cbz",
|
||||
"series": "Test Month First Date",
|
||||
"year": "2002",
|
||||
"month": "02",
|
||||
"day": "24",
|
||||
},
|
||||
}
|
||||
)
|
||||
LATER = {
|
||||
@ -348,40 +372,37 @@ LATER = {
|
||||
"volume": "1957",
|
||||
"year": "1969",
|
||||
},
|
||||
# CT has extra processing to re-attach the year in this case
|
||||
"Blade Runner Free Comic Book Day 2021 (2021).cbr": {
|
||||
"ext": "cbr",
|
||||
"series": "Blade Runner Free Comic Book Day 2021",
|
||||
"year": "2021",
|
||||
},
|
||||
# CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series)
|
||||
"Bloodshot Book 03 (2020).cbr": {
|
||||
"ext": "cbr",
|
||||
"issue": "03",
|
||||
"series": "Bloodshot",
|
||||
"title": "Book 03",
|
||||
"volume": "03",
|
||||
"year": "2020",
|
||||
},
|
||||
# CT checks for the following '(of 06)' after the '03' and marks it as the volume
|
||||
"Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021).cbr": {
|
||||
"ext": "cbr",
|
||||
"issue": "008",
|
||||
"series": "Elephantmen 2259",
|
||||
"title": "Simple Truth",
|
||||
"volume": "03",
|
||||
"year": "2021",
|
||||
"volume_count": "06",
|
||||
},
|
||||
}
|
||||
|
||||
# Not examined yet.
|
||||
FNS.update(
|
||||
{
|
||||
# CT has extra processing to re-attach the year in this case
|
||||
"Blade Runner Free Comic Book Day 2021 (2021).cbr": {
|
||||
"ext": "cbr",
|
||||
"series": "Blade Runner Free Comic Book Day 2021",
|
||||
"year": "2021",
|
||||
}, # CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series)
|
||||
"Bloodshot Book 03 (2020).cbr": {
|
||||
"ext": "cbr",
|
||||
"issue": "03",
|
||||
"series": "Bloodshot",
|
||||
"title": "Book 03",
|
||||
"volume": "03",
|
||||
"year": "2020",
|
||||
}, # CT checks for the following '(of 06)' after the '03' and marks it as the volume
|
||||
"Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021).cbr": {
|
||||
"ext": "cbr",
|
||||
"issue": "008",
|
||||
"series": "Elephantmen 2259",
|
||||
"title": "Simple Truth",
|
||||
"volume": "03",
|
||||
"year": "2021",
|
||||
"volume_count": "06",
|
||||
}, # CT catches the year
|
||||
"Marvel Previews #002 (January 2022).cbr": {
|
||||
"ext": "cbr",
|
||||
"issue": "002",
|
||||
"series": "Marvel Previews",
|
||||
"year": "2022",
|
||||
}, # c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder
|
||||
# c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder
|
||||
"Marvel Two In One V1 #090 c2c.cbr": {
|
||||
"ext": "cbr",
|
||||
"issue": "090",
|
||||
@ -397,20 +418,23 @@ FNS.update(
|
||||
"title": "digital",
|
||||
"publisher": "DC",
|
||||
"year": "1951",
|
||||
}, # CT notices that this is a full date, CT doesn't actually return the month or day though just removes it
|
||||
},
|
||||
# CT notices that this is a full date, CT doesn't actually return the month or day though just removes it
|
||||
"X-Men, 2021-08-04 (#02).cbz": {
|
||||
"ext": "cbz",
|
||||
"issue": "02",
|
||||
"series": "X-Men",
|
||||
"year": "2021",
|
||||
}, # CT treats ':' the same as '-' but here the ':' is attached to 'Now' which CT sees as a title separation
|
||||
},
|
||||
# CT treats ':' the same as '-' but here the ':' is attached to 'Now' which CT sees as a title separation
|
||||
"Cory Doctorow's Futuristic Tales of the Here and Now: Anda's Game #001 (2007).cbz": {
|
||||
"ext": "cbz",
|
||||
"issue": "001",
|
||||
"series": "Cory Doctorow's Futuristic Tales of the Here and Now",
|
||||
"title": "Anda's Game",
|
||||
"year": "2007",
|
||||
}, # This is a contrived test case. I've never seen this I just wanted to handle it with my parser
|
||||
},
|
||||
# This is a contrived test case. I've never seen this I just wanted to handle it with my parser
|
||||
"Cory Doctorow's Futuristic Tales of the Here and Now #0.0.1 (2007).cbz": {
|
||||
"ext": "cbz",
|
||||
"issue": "0.1",
|
||||
|
Loading…
Reference in New Issue
Block a user