sophisticated date parsing
This commit is contained in:
parent
93ac5760a0
commit
120feab7af
1
NEWS.md
1
NEWS.md
@ -4,6 +4,7 @@
|
|||||||
|
|
||||||
- Titles are now parsed only if they occur after the series token AND after
|
- Titles are now parsed only if they occur after the series token AND after
|
||||||
either issue, year or volume.
|
either issue, year or volume.
|
||||||
|
- A more sophisticated date parser.
|
||||||
- Issue numbers that lead with a '#' character may start with alphabetical
|
- Issue numbers that lead with a '#' character may start with alphabetical
|
||||||
characters.
|
characters.
|
||||||
- If volume is parsed, but issue number is not, the issue number is copied from
|
- If volume is parsed, but issue number is not, the issue number is copied from
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
"""Parse comic book archive names using the simple 'parse' parser."""
|
"""Parse comic book archive names using the simple 'parse' parser."""
|
||||||
from pprint import pprint
|
from pprint import pprint
|
||||||
|
from calendar import month_abbr
|
||||||
from copy import copy
|
from copy import copy
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from re import Pattern
|
from re import Pattern
|
||||||
@ -7,6 +8,7 @@ from typing import Any
|
|||||||
|
|
||||||
from comicfn2dict.regex import (
|
from comicfn2dict.regex import (
|
||||||
NON_NUMBER_DOT_RE,
|
NON_NUMBER_DOT_RE,
|
||||||
|
YEAR_FIRST_DATE_RE,
|
||||||
EXTRA_SPACES_RE,
|
EXTRA_SPACES_RE,
|
||||||
ISSUE_ANYWHERE_RE,
|
ISSUE_ANYWHERE_RE,
|
||||||
ISSUE_COUNT_RE,
|
ISSUE_COUNT_RE,
|
||||||
@ -18,14 +20,14 @@ from comicfn2dict.regex import (
|
|||||||
ORIGINAL_FORMAT_SCAN_INFO_RE,
|
ORIGINAL_FORMAT_SCAN_INFO_RE,
|
||||||
REMAINING_GROUP_RE,
|
REMAINING_GROUP_RE,
|
||||||
VOLUME_RE,
|
VOLUME_RE,
|
||||||
YEAR_BEGIN_RE,
|
MONTH_FIRST_DATE_RE,
|
||||||
YEAR_END_RE,
|
|
||||||
YEAR_TOKEN_RE,
|
YEAR_TOKEN_RE,
|
||||||
)
|
)
|
||||||
|
|
||||||
_REMAINING_GROUP_KEYS = ("series", "title")
|
_REMAINING_GROUP_KEYS = ("series", "title")
|
||||||
_TITLE_PRECEDING_KEYS = ("issue", "year", "volume")
|
_TITLE_PRECEDING_KEYS = ("issue", "year", "volume")
|
||||||
_TOKEN_DELIMETER = "/"
|
_TOKEN_DELIMETER = "/"
|
||||||
|
_DATE_KEYS = frozenset({"year", "month", "day"})
|
||||||
|
|
||||||
|
|
||||||
class ComicFilenameParser:
|
class ComicFilenameParser:
|
||||||
@ -69,7 +71,7 @@ class ComicFilenameParser:
|
|||||||
value = value.strip("'").strip('"').strip()
|
value = value.strip("'").strip('"').strip()
|
||||||
return value
|
return value
|
||||||
|
|
||||||
def _parse_item(
|
def _parse_items(
|
||||||
self,
|
self,
|
||||||
regex: Pattern,
|
regex: Pattern,
|
||||||
require_all: bool = False,
|
require_all: bool = False,
|
||||||
@ -95,6 +97,30 @@ class ComicFilenameParser:
|
|||||||
parts.append(token)
|
parts.append(token)
|
||||||
self._unparsed_path = _TOKEN_DELIMETER.join(parts)
|
self._unparsed_path = _TOKEN_DELIMETER.join(parts)
|
||||||
|
|
||||||
|
def _alpha_month_to_numeric(self):
    """Translate a parsed alphabetic month into a zero-padded numeric month.

    Reads ``self.metadata["alpha_month"]`` and, when it starts with an
    English three-letter month abbreviation (compared after
    ``str.capitalize``), stores the matching month number as a two-digit
    string (``"01"`` .. ``"12"``) in ``self.metadata["month"]``.

    The ``alpha_month`` entry itself is left untouched. Nothing is written
    when ``alpha_month`` is missing, empty, or not a recognised month.
    """
    alpha_month = self.metadata.get("alpha_month", "")
    if not alpha_month:
        return

    # Use a fixed English table rather than calendar.month_abbr: that array
    # is localized to the current locale, whereas the month names matched
    # in filenames are English, so the lookup must not vary with the
    # process locale.
    english_abbrs = (
        "Jan",
        "Feb",
        "Mar",
        "Apr",
        "May",
        "Jun",
        "Jul",
        "Aug",
        "Sep",
        "Oct",
        "Nov",
        "Dec",
    )
    normalized = alpha_month.capitalize()  # type: ignore
    for number, abbr in enumerate(english_abbrs, start=1):
        # Prefix match so "Sept", "Sept." and "September" all map to "Sep".
        if normalized.startswith(abbr):
            self.metadata["month"] = f"{number:02d}"
            break
|
||||||
|
|
||||||
|
def _parse_dates(self):
    """Extract date fields, trying the richest date layouts first.

    The passes run in a fixed order:

    1. The month-first pattern, followed by translating any alphabetic
       month into a numeric one.
    2. The year-first pattern, attempted only while at least one of
       ``year``, ``month`` or ``day`` is still absent from
       ``self.metadata``; again followed by the alphabetic month
       translation.
    3. The bare year token pattern, attempted only when no ``year`` has
       been found by the earlier passes.
    """
    # Pass 1: month-first dates.
    self._parse_items(MONTH_FIRST_DATE_RE)
    self._alpha_month_to_numeric()

    # Pass 2: year-first dates, only if some date component is missing.
    missing_date_keys = _DATE_KEYS - self.metadata.keys()
    if missing_date_keys:
        self._parse_items(YEAR_FIRST_DATE_RE)
        self._alpha_month_to_numeric()

    # Pass 3: fall back to a lone year token.
    if "year" in self.metadata:
        return
    self._parse_items(YEAR_TOKEN_RE)
|
||||||
|
|
||||||
def _is_title_in_position(self, value):
|
def _is_title_in_position(self, value):
|
||||||
"""Does the title come after series and one other token if they exist."""
|
"""Does the title come after series and one other token if they exist."""
|
||||||
title_index = self.path.find(value)
|
title_index = self.path.find(value)
|
||||||
@ -171,35 +197,28 @@ class ComicFilenameParser:
|
|||||||
self._log_progress("CLEANED")
|
self._log_progress("CLEANED")
|
||||||
|
|
||||||
# Parse paren tokens
|
# Parse paren tokens
|
||||||
self._parse_item(ISSUE_COUNT_RE)
|
self._parse_items(ISSUE_COUNT_RE)
|
||||||
self._parse_item(YEAR_TOKEN_RE)
|
self._parse_dates()
|
||||||
self._parse_item(
|
self._parse_items(
|
||||||
ORIGINAL_FORMAT_SCAN_INFO_RE,
|
ORIGINAL_FORMAT_SCAN_INFO_RE,
|
||||||
require_all=True,
|
require_all=True,
|
||||||
)
|
)
|
||||||
if "original_format" not in self.metadata:
|
if "original_format" not in self.metadata:
|
||||||
self._parse_item(
|
self._parse_items(
|
||||||
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
|
ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE,
|
||||||
)
|
)
|
||||||
self._log_progress("AFTER PAREN TOKENS")
|
self._log_progress("AFTER PAREN TOKENS")
|
||||||
|
|
||||||
# Parse regular tokens
|
# Parse regular tokens
|
||||||
self._parse_item(VOLUME_RE)
|
self._parse_items(VOLUME_RE)
|
||||||
self._parse_item(ISSUE_NUMBER_RE)
|
self._parse_items(ISSUE_NUMBER_RE)
|
||||||
self._log_progress("AFTER REGULAR TOKENS")
|
self._log_progress("AFTER REGULAR TOKENS")
|
||||||
|
|
||||||
# Pickup year if not gotten.
|
|
||||||
if "year" not in self.metadata:
|
|
||||||
self._parse_item(YEAR_BEGIN_RE)
|
|
||||||
if "year" not in self.metadata:
|
|
||||||
self._parse_item(YEAR_END_RE)
|
|
||||||
self._log_progress("AFTER YEAR PICKUP")
|
|
||||||
|
|
||||||
# Pickup issue if it's a standalone token
|
# Pickup issue if it's a standalone token
|
||||||
if "issue" not in self.metadata:
|
if "issue" not in self.metadata:
|
||||||
self._parse_item(ISSUE_END_RE)
|
self._parse_items(ISSUE_END_RE)
|
||||||
if "issue" not in self.metadata:
|
if "issue" not in self.metadata:
|
||||||
self._parse_item(ISSUE_BEGIN_RE)
|
self._parse_items(ISSUE_BEGIN_RE)
|
||||||
|
|
||||||
self._log_progress("AFTER ISSUE PICKUP")
|
self._log_progress("AFTER ISSUE PICKUP")
|
||||||
|
|
||||||
@ -210,7 +229,7 @@ class ComicFilenameParser:
|
|||||||
# Final try for issue number.
|
# Final try for issue number.
|
||||||
if "issue" not in self.metadata:
|
if "issue" not in self.metadata:
|
||||||
# TODO is this useful?
|
# TODO is this useful?
|
||||||
self._parse_item(ISSUE_ANYWHERE_RE)
|
self._parse_items(ISSUE_ANYWHERE_RE)
|
||||||
self._log_progress("AFTER ISSUE PICKUP")
|
self._log_progress("AFTER ISSUE PICKUP")
|
||||||
|
|
||||||
# Copy volume into issue if it's all we have.
|
# Copy volume into issue if it's all we have.
|
||||||
|
@ -38,16 +38,64 @@ ORIGINAL_FORMAT_PATTERNS = (
|
|||||||
r"Web([-\s]?(Comic|Rip))?",
|
r"Web([-\s]?(Comic|Rip))?",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# English month names as regex fragments: each entry matches the three-letter
# abbreviation and the full name. Every entry except "May" keeps exactly one
# capturing group so group numbering in the combined pattern is stable.
MONTHS = (
    r"Jan(uary)?",
    r"Feb(ruary)?",
    r"Mar(ch)?",
    r"Apr(il)?",
    r"May",
    r"Jun(e)?",
    r"Jul(y)?",
    r"Aug(ust)?",
    # Accept the standard abbreviation "Sep" as well as "Sept" and
    # "September"; the inner group is non-capturing to keep one group.
    r"Sep(t(?:ember)?)?",
    r"Oct(ober)?",
    r"Nov(ember)?",
    r"Dec(ember)?",
)
|
||||||
|
|
||||||
# CLEAN
|
# CLEAN
|
||||||
NON_SPACE_DIVIDER_RE = re_compile(r"[_\+]")
|
NON_SPACE_DIVIDER_RE = re_compile(r"[_\+]")
|
||||||
EXTRA_SPACES_RE = re_compile(r"\s\s+")
|
EXTRA_SPACES_RE = re_compile(r"\s\s+")
|
||||||
|
|
||||||
|
### DATES
|
||||||
|
# Four-digit year starting with 1 or 2, captured as "year".
_YEAR_RE_EXP = r"(?P<year>[12]\d{3})"
# Month name from MONTHS with an optional trailing dot, captured as
# "alpha_month".
_MONTH_ALPHA_RE_EXP = "".join((r"(?P<alpha_month>", r"|".join(MONTHS), r")\.?"))
# One- or two-digit month number, captured as "month".
_MONTH_NUMERIC_RE_EXP = r"(?P<month>0?\d|1[0-2]?)"
# Either form of month.
_MONTH_RE_EXP = "".join((r"(", _MONTH_ALPHA_RE_EXP, r"|", _MONTH_NUMERIC_RE_EXP, r")"))

# One- or two-digit day of month, captured as "day".
_DAY_RE_EXP = r"(?P<day>([0-2]?\d|(3)[0-1]))"
# Date components are separated by runs of hyphens and/or whitespace.
_DATE_DELIM = r"[-\s]+"
# Month, optional day, then year; optionally wrapped in parentheses.
_MONTH_FIRST_DATE_RE_EXP = "".join(
    (
        r"((\b|\(?)",
        # Month
        _MONTH_RE_EXP,
        # Day (optional)
        r"(",
        _DATE_DELIM,
        _DAY_RE_EXP,
        r")?",
        # Year, optionally preceded by a comma
        r"[,]?",
        _DATE_DELIM,
        _YEAR_RE_EXP,
        r"(\)?|\b))",
    )
)
# Year, month, then day; all three components are required.
_YEAR_FIRST_DATE_RE_EXP = "".join(
    (
        r"(\b\(?",
        _YEAR_RE_EXP,
        _DATE_DELIM,
        _MONTH_RE_EXP,
        _DATE_DELIM,
        _DAY_RE_EXP,
        r"\b\)?)",
    )
)

MONTH_FIRST_DATE_RE = re_compile(_MONTH_FIRST_DATE_RE_EXP)
YEAR_FIRST_DATE_RE = re_compile(_YEAR_FIRST_DATE_RE_EXP)
# NOTE(review): parenthify presumably adapts the pattern to parenthesised
# tokens -- confirm against re_compile's definition.
YEAR_TOKEN_RE = re_compile(_YEAR_RE_EXP, parenthify=True)
|
||||||
|
|
||||||
# PAREN GROUPS
|
# PAREN GROUPS
|
||||||
ISSUE_COUNT_RE = re_compile(r"of\s*(?P<issue_count>\d+)", parenthify=True)
|
ISSUE_COUNT_RE = re_compile(r"of\s*(?P<issue_count>\d+)", parenthify=True)
|
||||||
_YEAR_RE_EXP = r"(?P<year>[12]\d{3})"
|
|
||||||
YEAR_TOKEN_RE = re_compile(_YEAR_RE_EXP, parenthify=True)
|
|
||||||
YEAR_BEGIN_RE = re_compile(r"^" + _YEAR_RE_EXP + r"\b")
|
|
||||||
YEAR_END_RE = re_compile(r"\b" + _YEAR_RE_EXP + r"$")
|
|
||||||
_OF_PATTERNS = r"|".join(ORIGINAL_FORMAT_PATTERNS)
|
_OF_PATTERNS = r"|".join(ORIGINAL_FORMAT_PATTERNS)
|
||||||
_ORIGINAL_FORMAT_RE_EXP = r"(?P<original_format>" + _OF_PATTERNS + r")"
|
_ORIGINAL_FORMAT_RE_EXP = r"(?P<original_format>" + _OF_PATTERNS + r")"
|
||||||
_SCAN_INFO_RE_EXP = r"(?P<scan_info>[^()]*)"
|
_SCAN_INFO_RE_EXP = r"(?P<scan_info>[^()]*)"
|
||||||
|
@ -329,6 +329,30 @@ FNS.update(
|
|||||||
"volume": "03",
|
"volume": "03",
|
||||||
"year": "2013",
|
"year": "2013",
|
||||||
},
|
},
|
||||||
|
# CT catches the year
|
||||||
|
"Marvel Previews #002 (January 2022).cbr": {
|
||||||
|
"ext": "cbr",
|
||||||
|
"issue": "002",
|
||||||
|
"series": "Marvel Previews",
|
||||||
|
"alpha_month": "January",
|
||||||
|
"month": "01",
|
||||||
|
"year": "2022",
|
||||||
|
},
|
||||||
|
"Test Numeric Year #2 2001-02-24.cbz": {
|
||||||
|
"ext": "cbz",
|
||||||
|
"issue": "2",
|
||||||
|
"series": "Test Numeric Year",
|
||||||
|
"year": "2002",
|
||||||
|
"month": "02",
|
||||||
|
"day": "24",
|
||||||
|
},
|
||||||
|
"Test Month First Date 02-24-2001.cbz": {
|
||||||
|
"ext": "cbz",
|
||||||
|
"series": "Test Month First Date",
|
||||||
|
"year": "2002",
|
||||||
|
"month": "02",
|
||||||
|
"day": "24",
|
||||||
|
},
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
LATER = {
|
LATER = {
|
||||||
@ -348,40 +372,37 @@ LATER = {
|
|||||||
"volume": "1957",
|
"volume": "1957",
|
||||||
"year": "1969",
|
"year": "1969",
|
||||||
},
|
},
|
||||||
|
# CT has extra processing to re-attach the year in this case
|
||||||
|
"Blade Runner Free Comic Book Day 2021 (2021).cbr": {
|
||||||
|
"ext": "cbr",
|
||||||
|
"series": "Blade Runner Free Comic Book Day 2021",
|
||||||
|
"year": "2021",
|
||||||
|
},
|
||||||
|
# CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series)
|
||||||
|
"Bloodshot Book 03 (2020).cbr": {
|
||||||
|
"ext": "cbr",
|
||||||
|
"issue": "03",
|
||||||
|
"series": "Bloodshot",
|
||||||
|
"title": "Book 03",
|
||||||
|
"volume": "03",
|
||||||
|
"year": "2020",
|
||||||
|
},
|
||||||
|
# CT checks for the following '(of 06)' after the '03' and marks it as the volume
|
||||||
|
"Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021).cbr": {
|
||||||
|
"ext": "cbr",
|
||||||
|
"issue": "008",
|
||||||
|
"series": "Elephantmen 2259",
|
||||||
|
"title": "Simple Truth",
|
||||||
|
"volume": "03",
|
||||||
|
"year": "2021",
|
||||||
|
"volume_count": "06",
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
# Not examined yet.
|
# Not examined yet.
|
||||||
FNS.update(
|
FNS.update(
|
||||||
{
|
{
|
||||||
# CT has extra processing to re-attach the year in this case
|
# c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder
|
||||||
"Blade Runner Free Comic Book Day 2021 (2021).cbr": {
|
|
||||||
"ext": "cbr",
|
|
||||||
"series": "Blade Runner Free Comic Book Day 2021",
|
|
||||||
"year": "2021",
|
|
||||||
}, # CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series)
|
|
||||||
"Bloodshot Book 03 (2020).cbr": {
|
|
||||||
"ext": "cbr",
|
|
||||||
"issue": "03",
|
|
||||||
"series": "Bloodshot",
|
|
||||||
"title": "Book 03",
|
|
||||||
"volume": "03",
|
|
||||||
"year": "2020",
|
|
||||||
}, # CT checks for the following '(of 06)' after the '03' and marks it as the volume
|
|
||||||
"Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021).cbr": {
|
|
||||||
"ext": "cbr",
|
|
||||||
"issue": "008",
|
|
||||||
"series": "Elephantmen 2259",
|
|
||||||
"title": "Simple Truth",
|
|
||||||
"volume": "03",
|
|
||||||
"year": "2021",
|
|
||||||
"volume_count": "06",
|
|
||||||
}, # CT catches the year
|
|
||||||
"Marvel Previews #002 (January 2022).cbr": {
|
|
||||||
"ext": "cbr",
|
|
||||||
"issue": "002",
|
|
||||||
"series": "Marvel Previews",
|
|
||||||
"year": "2022",
|
|
||||||
}, # c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder
|
|
||||||
"Marvel Two In One V1 #090 c2c.cbr": {
|
"Marvel Two In One V1 #090 c2c.cbr": {
|
||||||
"ext": "cbr",
|
"ext": "cbr",
|
||||||
"issue": "090",
|
"issue": "090",
|
||||||
@ -397,20 +418,23 @@ FNS.update(
|
|||||||
"title": "digital",
|
"title": "digital",
|
||||||
"publisher": "DC",
|
"publisher": "DC",
|
||||||
"year": "1951",
|
"year": "1951",
|
||||||
}, # CT notices that this is a full date, CT doesn't actually return the month or day though just removes it
|
},
|
||||||
|
# CT notices that this is a full date, CT doesn't actually return the month or day though just removes it
|
||||||
"X-Men, 2021-08-04 (#02).cbz": {
|
"X-Men, 2021-08-04 (#02).cbz": {
|
||||||
"ext": "cbz",
|
"ext": "cbz",
|
||||||
"issue": "02",
|
"issue": "02",
|
||||||
"series": "X-Men",
|
"series": "X-Men",
|
||||||
"year": "2021",
|
"year": "2021",
|
||||||
}, # CT treats ':' the same as '-' but here the ':' is attached to 'Now' which CT sees as a title separation
|
},
|
||||||
|
# CT treats ':' the same as '-' but here the ':' is attached to 'Now' which CT sees as a title separation
|
||||||
"Cory Doctorow's Futuristic Tales of the Here and Now: Anda's Game #001 (2007).cbz": {
|
"Cory Doctorow's Futuristic Tales of the Here and Now: Anda's Game #001 (2007).cbz": {
|
||||||
"ext": "cbz",
|
"ext": "cbz",
|
||||||
"issue": "001",
|
"issue": "001",
|
||||||
"series": "Cory Doctorow's Futuristic Tales of the Here and Now",
|
"series": "Cory Doctorow's Futuristic Tales of the Here and Now",
|
||||||
"title": "Anda's Game",
|
"title": "Anda's Game",
|
||||||
"year": "2007",
|
"year": "2007",
|
||||||
}, # This is a contrived test case. I've never seen this I just wanted to handle it with my parser
|
},
|
||||||
|
# This is a contrived test case. I've never seen this I just wanted to handle it with my parser
|
||||||
"Cory Doctorow's Futuristic Tales of the Here and Now #0.0.1 (2007).cbz": {
|
"Cory Doctorow's Futuristic Tales of the Here and Now #0.0.1 (2007).cbz": {
|
||||||
"ext": "cbz",
|
"ext": "cbz",
|
||||||
"issue": "0.1",
|
"issue": "0.1",
|
||||||
|
Loading…
Reference in New Issue
Block a user