aadeb07c49
Refactor add_to_path with tests Fix type hints for titles_match Use casefold in get_language Fix using the recursive flag in cli mode Add http status code to ComicVine exceptions Fix parenthesis getting removed when renaming Add more tests
272 lines
7.6 KiB
Python
272 lines
7.6 KiB
Python
"""Some generic utilities"""
|
||
# Copyright 2012-2014 Anthony Beville
|
||
#
|
||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||
# you may not use this file except in compliance with the License.
|
||
# You may obtain a copy of the License at
|
||
#
|
||
# http://www.apache.org/licenses/LICENSE-2.0
|
||
#
|
||
# Unless required by applicable law or agreed to in writing, software
|
||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
# See the License for the specific language governing permissions and
|
||
# limitations under the License.
|
||
from __future__ import annotations
|
||
|
||
import glob
|
||
import json
|
||
import logging
|
||
import os
|
||
import pathlib
|
||
import unicodedata
|
||
from collections import defaultdict
|
||
from shutil import which # noqa: F401
|
||
from typing import Any, Mapping
|
||
|
||
import pycountry
|
||
import thefuzz.fuzz
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
class UtilsVars:
    """Holder for module-wide mutable state shared through class attributes."""

    # Starts out False; nothing in this part of the file changes it.
    # NOTE(review): presumably flipped once an encoding fix has run - confirm with callers.
    already_fixed_encoding = False
|
||
|
||
|
||
def parse_date_str(date_str: str) -> tuple[int | None, int | None, int | None]:
    """Split a dash-separated date string into its numeric components.

    The string is read as ``year[-month[-day]]``; each piece present is run
    through ``xlate(..., True)``.  Pieces that are absent stay ``None``.

    Returns:
        A ``(day, month, year)`` tuple - note the order is reversed relative
        to the input.  A falsy ``date_str`` yields ``(None, None, None)``.
    """
    if not date_str:
        return None, None, None

    fields = date_str.split("-")
    year = xlate(fields[0], True)
    month = xlate(fields[1], True) if len(fields) > 1 else None
    day = xlate(fields[2], True) if len(fields) > 2 else None
    return day, month, year
|
||
|
||
|
||
def get_recursive_filelist(pathlist: list[str]) -> list[str]:
    """Get a recursive list of all files under all path items in the list.

    Args:
        pathlist: Paths to expand.  Directories are searched recursively;
            anything that is not a directory is kept as-is (converted to
            ``str``) without checking that it exists.

    Returns:
        The collected file paths.  Non-directory entries are only appended
        when they are not already in the result.
    """
    filelist: list[str] = []
    for p in pathlist:
        if os.path.isdir(p):
            # Escape the directory itself so names containing glob
            # metacharacters (e.g. "Series [2020]") are matched literally;
            # only the trailing "**" is meant to act as a pattern.
            pattern = os.path.join(glob.escape(str(p)), "**")
            filelist.extend(x for x in glob.glob(pattern, recursive=True) if not os.path.isdir(x))
        elif str(p) not in filelist:
            filelist.append(str(p))

    return filelist
|
||
|
||
|
||
def add_to_path(dirname: str) -> None:
    """Prepend ``dirname`` to the ``PATH`` environment variable if it is missing.

    ``dirname`` is made absolute and the existing entries are normalized
    with ``os.path.normpath`` before comparing.  ``PATH`` is only rewritten
    (with the normalized entries) when ``dirname`` actually has to be added.

    Args:
        dirname: Directory to add.  An empty value is ignored.
    """
    if not dirname:
        return

    dirname = os.path.abspath(dirname)
    # PATH may be absent from the environment entirely; fall back to the
    # platform default search path instead of raising KeyError.
    current = os.environ.get("PATH", os.defpath)
    paths = [os.path.normpath(x) for x in current.split(os.pathsep)]

    if dirname not in paths:
        paths.insert(0, dirname)
        os.environ["PATH"] = os.pathsep.join(paths)
|
||
|
||
|
||
def xlate(data: Any, is_int: bool = False, is_float: bool = False) -> Any:
    """Coerce a loosely-typed value to ``str``, ``int`` or ``float``.

    Args:
        data: The value to convert.
        is_int: Convert to ``int`` (fractions are truncated via ``float``).
        is_float: Convert to ``float``; takes precedence over ``is_int``.

    Returns:
        ``None`` when ``data`` is ``None`` or ``""``, or when a numeric
        conversion is requested and cannot be performed.  Otherwise the
        converted number, or ``str(data)`` when neither flag is set.
    """
    if data is None or data == "":
        return None
    if not (is_int or is_float):
        return str(data)

    value: str | int | float
    if isinstance(data, (int, float)):
        value = data
    else:
        # Keep only ASCII digits and dots; everything else (signs, "#",
        # letters, whitespace, ...) is discarded before parsing.
        value = "".join(ch for ch in str(data) if ch in "1234567890.")
        if value == "":
            return None

    try:
        number = float(value)
        return number if is_float else int(number)
    except (ValueError, OverflowError):
        # ValueError: malformed text such as "1.2.3", or int(nan).
        # OverflowError: int(inf), e.g. from an absurdly long digit string.
        return None
|
||
|
||
|
||
# Words dropped by remove_articles(); compared against case-folded input.
_ARTICLES = frozenset(
    {
        "&",
        "a",
        "am",
        "an",
        "and",
        "as",
        "at",
        "be",
        "but",
        "by",
        "for",
        "if",
        "is",
        "issue",
        "it",
        "it's",
        "its",
        "itself",
        "of",
        "or",
        "so",
        "the",
        "with",
        "ms",
        "mrs",
        "mr",
        "dr",
    }
)


def remove_articles(text: str) -> str:
    """Case-fold ``text`` and drop common filler words from it.

    The text is split on single spaces, so runs of several spaces produce
    empty "words" that are preserved in the output.

    Returns:
        The remaining words joined by single spaces.
    """
    return " ".join(word for word in text.casefold().split(" ") if word not in _ARTICLES)
|
||
|
||
|
||
def sanitize_title(text: str, basic: bool = False) -> str:
    """Normalize a title so it can be compared loosely.

    The text is always NFKD-normalized and case-folded first (this does not
    fully reduce every character, e.g. the one-half sign keeps a fraction
    slash rather than "/").

    Args:
        text: The title to clean up.
        basic: When true, only single and double quotes are stripped.
            Otherwise every character that is not a letter, separator or
            number is removed, dash punctuation becomes a space, and the
            result is passed through ``remove_articles`` and stripped.

    Returns:
        The sanitized title.
    """
    text = unicodedata.normalize("NFKD", text).casefold()

    if basic:
        # comicvine keeps apostrophes a part of the word, so drop the quote
        # characters without introducing a word break
        return text.replace("'", "").replace('"', "")

    # comicvine ignores punctuation and accents
    kept = []
    for ch in text:
        category = unicodedata.category(ch)
        if category == "Pd":
            # dashes become spaces so that batman-superman and
            # self-proclaimed stay separate words
            kept.append(" ")
        elif category[0] in "LZN":
            # letters, separators (space) and numbers are kept verbatim
            kept.append(ch)

    # remove extra space and articles
    return remove_articles("".join(kept)).strip()
|
||
|
||
|
||
def titles_match(search_title: str, record_title: str, threshold: int = 90) -> bool:
    """Report whether two titles are similar enough to be considered equal.

    Both titles are passed through ``sanitize_title`` and compared with
    ``thefuzz.fuzz.ratio``; the comparison is logged at debug level.

    Args:
        search_title: The title being searched for.
        record_title: The candidate title to compare against.
        threshold: Minimum ratio that counts as a match.

    Returns:
        ``True`` when the ratio is at least ``threshold``.
    """
    ratio: int = thefuzz.fuzz.ratio(sanitize_title(search_title), sanitize_title(record_title))
    logger.debug(
        "search title: %s ; record title: %s ; ratio: %d ; match threshold: %d",
        search_title,
        record_title,
        ratio,
        threshold,
    )
    return ratio >= threshold
|
||
|
||
|
||
def unique_file(file_name: pathlib.Path) -> pathlib.Path:
    """Return a path derived from ``file_name`` that does not exist yet.

    If ``file_name`` is free it is returned unchanged.  Otherwise a counter
    is inserted before the file extension - ``name (1).ext``,
    ``name (2).ext``, ... - until an unused name is found, so the suffix
    (and therefore the file type) is preserved.

    Args:
        file_name: The desired path.

    Returns:
        ``file_name`` itself or a sibling path with a numeric counter.
    """
    stem, suffix = file_name.stem, file_name.suffix
    candidate = file_name
    counter = 1
    while candidate.exists():
        candidate = file_name.with_name(f"{stem} ({counter}){suffix}")
        counter += 1
    return candidate
|
||
|
||
|
||
# Lookup tables keyed by the two-letter ``alpha_2`` code, filled from
# pycountry at import time.  Both are defaultdicts whose factory returns
# None, so indexing an unknown code yields None instead of raising.

# country code -> country name
countries: dict[str | None, str | None] = defaultdict(lambda: None)
for c in pycountry.countries:
    # only records that define an alpha_2 field can be keyed
    if "alpha_2" in c._fields:
        countries[c.alpha_2] = c.name

# language code -> language name
languages: dict[str | None, str | None] = defaultdict(lambda: None)
for lng in pycountry.languages:
    if "alpha_2" in lng._fields:
        languages[lng.alpha_2] = lng.name
|
||
|
||
|
||
def get_language_from_iso(iso: str | None) -> str | None:
    """Look up a language name by its code in the ``languages`` table.

    Args:
        iso: The code to look up (the table is keyed by pycountry
            ``alpha_2`` codes).

    Returns:
        The language name, or ``None`` when the code is not in the table.
    """
    # ``languages`` is a defaultdict: indexing it with an unknown code would
    # silently insert that code with a None value into the shared table.
    # ``get`` gives the same answer without mutating it.
    return languages.get(iso)
|
||
|
||
|
||
def get_language(string: str | None) -> str | None:
    """Resolve a language code or name to a language name.

    The input is case-folded, tried first through
    ``get_language_from_iso`` and, if that finds nothing, through
    ``pycountry.languages.lookup``.

    Returns:
        The language name, or ``None`` when ``string`` is ``None`` or no
        language matches.
    """
    if string is None:
        return None

    folded = string.casefold()
    known = get_language_from_iso(folded)
    if known is not None:
        return known

    try:
        return str(pycountry.languages.lookup(folded).name)
    except LookupError:
        return None
|
||
|
||
|
||
def get_publisher(publisher: str) -> tuple[str, str]:
    """Resolve a name against the loaded publishers.

    Each entry of the module-level ``publishers`` table is queried in turn
    with the current name; the search stops at the first entry that
    reports a hit.

    Returns:
        An ``(imprint, publisher)`` tuple built from the last lookup
        result, or ``("", publisher)`` when the table is empty.
    """
    imprint = ""

    for imprint_map in publishers.values():
        imprint, publisher, found = imprint_map[publisher]
        if found:
            return (imprint, publisher)

    return (imprint, publisher)
|
||
|
||
|
||
def update_publishers(new_publishers: Mapping[str, Mapping[str, str]]) -> None:
    """Merge ``new_publishers`` into the module-level ``publishers`` table.

    Args:
        new_publishers: Maps a publisher name to a mapping of imprints.
            Publishers already in the table have their entry updated in
            place; unknown ones get a new ``ImprintDict``.
    """
    for name in new_publishers:
        imprints = new_publishers[name]
        if name not in publishers:
            publishers[name] = ImprintDict(name, imprints)
            continue
        publishers[name].update(imprints)
|
||
|
||
|
||
class ImprintDict(dict):
    """
    Dictionary of imprints that belong to a single publisher.

    It is built from a publisher name plus a dict or mapping of lowercased
    imprint names to the proper imprint name.  Indexing it returns a tuple
    of (imprint, publisher, key_exists) rather than the stored value; when
    the key is unknown, the key itself is handed back as the publisher,
    unchanged.
    """

    def __init__(self, publisher: str, mapping: tuple | Mapping = (), **kwargs: dict) -> None:
        super().__init__(mapping, **kwargs)
        self.publisher = publisher

    def __missing__(self, key: str) -> None:
        # Unknown keys resolve to None instead of raising KeyError
        return None

    def __getitem__(self, k: str) -> tuple[str, str, bool]:
        folded = k.casefold()

        # The publisher's own name matches with an empty imprint
        if folded == self.publisher.casefold():
            return ("", self.publisher, True)

        imprint = super().__getitem__(folded)
        if imprint is not None:
            return (imprint, self.publisher, True)

        # Not an imprint of this publisher: report the key back untouched
        return ("", k, False)

    def copy(self) -> ImprintDict:
        return ImprintDict(self.publisher, super().copy())
|
||
|
||
|
||
# Publisher name -> imprint table; empty until load_publishers() or
# update_publishers() fills it.
publishers: dict[str, ImprintDict] = {}


def load_publishers() -> None:
    """Populate ``publishers`` from the ``data/publishers.json`` file.

    The file is looked up next to this module and read as UTF-8.  Any
    failure is logged with its traceback and otherwise ignored, leaving the
    table as it was.
    """
    try:
        data_file = pathlib.Path(__file__).parent / "data" / "publishers.json"
        raw = data_file.read_text("utf-8")
        update_publishers(json.loads(raw))
    except Exception:
        logger.exception("Failed to load publishers.json; The are no publishers or imprints loaded")
|