From 120feab7af886fa8f0323efc21edb14120627394 Mon Sep 17 00:00:00 2001 From: AJ Slater Date: Tue, 20 Feb 2024 17:14:03 -0800 Subject: [PATCH] sophisticated date parsing --- NEWS.md | 1 + comicfn2dict/parse.py | 57 +++++++++++++++++--------- comicfn2dict/regex.py | 56 +++++++++++++++++++++++-- tests/comic_filenames.py | 88 +++++++++++++++++++++++++--------------- 4 files changed, 147 insertions(+), 55 deletions(-) diff --git a/NEWS.md b/NEWS.md index 4037888..f002b1d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,7 @@ - Titles are now parsed only if they occur after the series token AND after either issue, year or volume. +- A more sophisticated date parser. - Issue numbers that lead with a '#' character may start with alphabetical characters. - If volume is parsed, but issue number is not, the issue number is copied from diff --git a/comicfn2dict/parse.py b/comicfn2dict/parse.py index 77c8d5a..0628ca4 100644 --- a/comicfn2dict/parse.py +++ b/comicfn2dict/parse.py @@ -1,5 +1,6 @@ """Parse comic book archive names using the simple 'parse' parser.""" from pprint import pprint +from calendar import month_abbr from copy import copy from pathlib import Path from re import Pattern @@ -7,6 +8,7 @@ from typing import Any from comicfn2dict.regex import ( NON_NUMBER_DOT_RE, + YEAR_FIRST_DATE_RE, EXTRA_SPACES_RE, ISSUE_ANYWHERE_RE, ISSUE_COUNT_RE, @@ -18,14 +20,14 @@ from comicfn2dict.regex import ( ORIGINAL_FORMAT_SCAN_INFO_RE, REMAINING_GROUP_RE, VOLUME_RE, - YEAR_BEGIN_RE, - YEAR_END_RE, + MONTH_FIRST_DATE_RE, YEAR_TOKEN_RE, ) _REMAINING_GROUP_KEYS = ("series", "title") _TITLE_PRECEDING_KEYS = ("issue", "year", "volume") _TOKEN_DELIMETER = "/" +_DATE_KEYS = frozenset({"year", "month", "day"}) class ComicFilenameParser: @@ -69,7 +71,7 @@ class ComicFilenameParser: value = value.strip("'").strip('"').strip() return value - def _parse_item( + def _parse_items( self, regex: Pattern, require_all: bool = False, @@ -95,6 +97,30 @@ class ComicFilenameParser: parts.append(token) 
self._unparsed_path = _TOKEN_DELIMETER.join(parts) + def _alpha_month_to_numeric(self): + """Translate alpha_month to numeric month.""" + if alpha_month := self.metadata.get("alpha_month", ""): + alpha_month = alpha_month.capitalize() # type: ignore + for index, abbr in enumerate(month_abbr): + if abbr and alpha_month.startswith(abbr): + month = f"{index:02d}" + self.metadata["month"] = month + break + + def _parse_dates(self): + """Parse date schemes.""" + # Month first date + self._parse_items(MONTH_FIRST_DATE_RE) + self._alpha_month_to_numeric() + + # Year first date + if _DATE_KEYS - self.metadata.keys(): + self._parse_items(YEAR_FIRST_DATE_RE) + self._alpha_month_to_numeric() + + if "year" not in self.metadata: + self._parse_items(YEAR_TOKEN_RE) + def _is_title_in_position(self, value): """Does the title come after series and one other token if they exist.""" title_index = self.path.find(value) @@ -171,35 +197,28 @@ class ComicFilenameParser: self._log_progress("CLEANED") # Parse paren tokens - self._parse_item(ISSUE_COUNT_RE) - self._parse_item(YEAR_TOKEN_RE) - self._parse_item( + self._parse_items(ISSUE_COUNT_RE) + self._parse_dates() + self._parse_items( ORIGINAL_FORMAT_SCAN_INFO_RE, require_all=True, ) if "original_format" not in self.metadata: - self._parse_item( + self._parse_items( ORIGINAL_FORMAT_SCAN_INFO_SEPARATE_RE, ) self._log_progress("AFTER PAREN TOKENS") # Parse regular tokens - self._parse_item(VOLUME_RE) - self._parse_item(ISSUE_NUMBER_RE) + self._parse_items(VOLUME_RE) + self._parse_items(ISSUE_NUMBER_RE) self._log_progress("AFTER REGULAR TOKENS") - # Pickup year if not gotten. 
- if "year" not in self.metadata: - self._parse_item(YEAR_BEGIN_RE) - if "year" not in self.metadata: - self._parse_item(YEAR_END_RE) - self._log_progress("AFTER YEAR PICKUP") - # Pickup issue if it's a standalone token if "issue" not in self.metadata: - self._parse_item(ISSUE_END_RE) + self._parse_items(ISSUE_END_RE) if "issue" not in self.metadata: - self._parse_item(ISSUE_BEGIN_RE) + self._parse_items(ISSUE_BEGIN_RE) self._log_progress("AFTER ISSUE PICKUP") @@ -210,7 +229,7 @@ class ComicFilenameParser: # Final try for issue number. if "issue" not in self.metadata: # TODO is this useful? - self._parse_item(ISSUE_ANYWHERE_RE) + self._parse_items(ISSUE_ANYWHERE_RE) self._log_progress("AFTER ISSUE PICKUP") # Copy volume into issue if it's all we have. diff --git a/comicfn2dict/regex.py b/comicfn2dict/regex.py index 296fab9..5a61484 100644 --- a/comicfn2dict/regex.py +++ b/comicfn2dict/regex.py @@ -38,16 +38,64 @@ ORIGINAL_FORMAT_PATTERNS = ( r"Web([-\s]?(Comic|Rip))?", ) +MONTHS = ( + r"Jan(uary)?", + r"Feb(ruary)?", + r"Mar(ch)?", + r"Apr(il)?", + r"May", + r"Jun(e)?", + r"Jul(y)?", + r"Aug(ust)?", + r"Sept(ember)?", + r"Oct(ober)?", + r"Nov(ember)?", + r"Dec(ember)?", +) + # CLEAN NON_SPACE_DIVIDER_RE = re_compile(r"[_\+]") EXTRA_SPACES_RE = re_compile(r"\s\s+") +### DATES +_YEAR_RE_EXP = r"(?P<year>[12]\d{3})" +_MONTH_ALPHA_RE_EXP = r"(?P<alpha_month>" + r"|".join(MONTHS) + r")\.?" +_MONTH_NUMERIC_RE_EXP = r"(?P<month>0?\d|1[0-2]?)" +_MONTH_RE_EXP = r"(" + _MONTH_ALPHA_RE_EXP + r"|" + _MONTH_NUMERIC_RE_EXP + r")" + +_DAY_RE_EXP = r"(?P<day>([0-2]?\d|(3)[0-1]))" +_DATE_DELIM = r"[-\s]+" +_MONTH_FIRST_DATE_RE_EXP = ( + r"((\b|\(?)" + # Month + + _MONTH_RE_EXP + # Day + + r"(" + + _DATE_DELIM + + _DAY_RE_EXP + + r")?" + # Year + + r"[,]?" + + _DATE_DELIM + + _YEAR_RE_EXP + + r"(\)?|\b))" +) +_YEAR_FIRST_DATE_RE_EXP = ( + r"(\b\(?" 
+ + _YEAR_RE_EXP + + _DATE_DELIM + + _MONTH_RE_EXP + + _DATE_DELIM + + _DAY_RE_EXP + + r"\b\)?)" +) + +MONTH_FIRST_DATE_RE = re_compile(_MONTH_FIRST_DATE_RE_EXP) +YEAR_FIRST_DATE_RE = re_compile(_YEAR_FIRST_DATE_RE_EXP) +YEAR_TOKEN_RE = re_compile(_YEAR_RE_EXP, parenthify=True) + # PAREN GROUPS ISSUE_COUNT_RE = re_compile(r"of\s*(?P<issue_count>\d+)", parenthify=True) -_YEAR_RE_EXP = r"(?P<year>[12]\d{3})" -YEAR_TOKEN_RE = re_compile(_YEAR_RE_EXP, parenthify=True) -YEAR_BEGIN_RE = re_compile(r"^" + _YEAR_RE_EXP + r"\b") -YEAR_END_RE = re_compile(r"\b" + _YEAR_RE_EXP + r"$") _OF_PATTERNS = r"|".join(ORIGINAL_FORMAT_PATTERNS) _ORIGINAL_FORMAT_RE_EXP = r"(?P<original_format>" + _OF_PATTERNS + r")" _SCAN_INFO_RE_EXP = r"(?P<scan_info>[^()]*)" diff --git a/tests/comic_filenames.py b/tests/comic_filenames.py index db4238e..1c860dd 100644 --- a/tests/comic_filenames.py +++ b/tests/comic_filenames.py @@ -329,6 +329,30 @@ FNS.update( "volume": "03", "year": "2013", }, + # CT catches the year + "Marvel Previews #002 (January 2022).cbr": { + "ext": "cbr", + "issue": "002", + "series": "Marvel Previews", + "alpha_month": "January", + "month": "01", + "year": "2022", + }, + "Test Numeric Year #2 2001-02-24.cbz": { + "ext": "cbz", + "issue": "2", + "series": "Test Numeric Year", + "year": "2002", + "month": "02", + "day": "24", + }, + "Test Month First Date 02-24-2001.cbz": { + "ext": "cbz", + "series": "Test Month First Date", + "year": "2002", + "month": "02", + "day": "24", + }, } ) LATER = { @@ -348,40 +372,37 @@ LATER = { "volume": "1957", "year": "1969", }, + # CT has extra processing to re-attach the year in this case + "Blade Runner Free Comic Book Day 2021 (2021).cbr": { + "ext": "cbr", + "series": "Blade Runner Free Comic Book Day 2021", + "year": "2021", + }, + # CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series) + "Bloodshot Book 03 (2020).cbr": { + "ext": "cbr", + "issue": "03", + "series": "Bloodshot", + "title": "Book 03", + "volume": "03", + "year": "2020", + 
}, + # CT checks for the following '(of 06)' after the '03' and marks it as the volume + "Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021).cbr": { + "ext": "cbr", + "issue": "008", + "series": "Elephantmen 2259", + "title": "Simple Truth", + "volume": "03", + "year": "2021", + "volume_count": "06", + }, } # Not examined yet. FNS.update( { - # CT has extra processing to re-attach the year in this case - "Blade Runner Free Comic Book Day 2021 (2021).cbr": { - "ext": "cbr", - "series": "Blade Runner Free Comic Book Day 2021", - "year": "2021", - }, # CT treats book like 'v' but also adds it as the title (matches ComicVine for this particular series) - "Bloodshot Book 03 (2020).cbr": { - "ext": "cbr", - "issue": "03", - "series": "Bloodshot", - "title": "Book 03", - "volume": "03", - "year": "2020", - }, # CT checks for the following '(of 06)' after the '03' and marks it as the volume - "Elephantmen 2259 #008 - Simple Truth 03 (of 06) (2021).cbr": { - "ext": "cbr", - "issue": "008", - "series": "Elephantmen 2259", - "title": "Simple Truth", - "volume": "03", - "year": "2021", - "volume_count": "06", - }, # CT catches the year - "Marvel Previews #002 (January 2022).cbr": { - "ext": "cbr", - "issue": "002", - "series": "Marvel Previews", - "year": "2022", - }, # c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder + # c2c aka "cover to cover" is fairly common and CT moves it to scan_info/remainder "Marvel Two In One V1 #090 c2c.cbr": { "ext": "cbr", "issue": "090", @@ -397,20 +418,23 @@ FNS.update( "title": "digital", "publisher": "DC", "year": "1951", - }, # CT notices that this is a full date, CT doesn't actually return the month or day though just removes it + }, + # CT notices that this is a full date, CT doesn't actually return the month or day though just removes it "X-Men, 2021-08-04 (#02).cbz": { "ext": "cbz", "issue": "02", "series": "X-Men", "year": "2021", - }, # CT treats ':' the same as '-' but here the ':' is attached to 
'Now' which CT sees as a title separation + }, + # CT treats ':' the same as '-' but here the ':' is attached to 'Now' which CT sees as a title separation "Cory Doctorow's Futuristic Tales of the Here and Now: Anda's Game #001 (2007).cbz": { "ext": "cbz", "issue": "001", "series": "Cory Doctorow's Futuristic Tales of the Here and Now", "title": "Anda's Game", "year": "2007", - }, # This is a contrived test case. I've never seen this I just wanted to handle it with my parser + }, + # This is a contrived test case. I've never seen this I just wanted to handle it with my parser "Cory Doctorow's Futuristic Tales of the Here and Now #0.0.1 (2007).cbz": { "ext": "cbz", "issue": "0.1",