"""Some generic utilities""" # Copyright 2012-2014 ComicTagger Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import annotations import json import logging import os import pathlib import platform import unicodedata from collections import defaultdict from collections.abc import Iterable, Mapping from shutil import which # noqa: F401 from typing import Any import comicapi.data from comicapi import filenamelexer, filenameparser try: import icu del icu icu_available = True except ImportError: icu_available = False logger = logging.getLogger(__name__) def _custom_key(tup): import natsort lst = [] for x in natsort.os_sort_keygen()(tup): ret = x if len(x) > 1 and isinstance(x[1], int) and isinstance(x[0], str) and x[0] == "": ret = ("a", *x[1:]) lst.append(ret) return tuple(lst) def os_sorted(lst: Iterable) -> Iterable: import natsort key = _custom_key if icu_available or platform.system() == "Windows": key = natsort.os_sort_keygen() return sorted(lst, key=key) def parse_filename( filename: str, complicated_parser: bool = False, remove_c2c: bool = False, remove_fcbd: bool = False, remove_publisher: bool = False, split_words: bool = False, allow_issue_start_with_letter: bool = False, protofolius_issue_number_scheme: bool = False, ) -> filenameparser.FilenameInfo: if not filename: return filenameparser.FilenameInfo( alternate="", annual=False, archive="", c2c=False, fcbd=False, issue="", issue_count="", publisher="", remainder="", series="", title="", volume="", volume_count="", year="", format="", ) if split_words: import wordninja filename, ext = os.path.splitext(filename) filename = " ".join(wordninja.split(filename)) + ext if complicated_parser: lex = filenamelexer.Lex(filename, allow_issue_start_with_letter) p = filenameparser.Parse( lex.items, remove_c2c=remove_c2c, remove_fcbd=remove_fcbd, remove_publisher=remove_publisher, protofolius_issue_number_scheme=protofolius_issue_number_scheme, ) return p.filename_info else: fnp = filenameparser.FileNameParser() fnp.parse_filename(filename) fni = filenameparser.FilenameInfo( alternate="", annual=False, archive="", c2c=False, fcbd=False, issue=fnp.issue, issue_count=fnp.issue_count, publisher="", remainder=fnp.remainder, series=fnp.series, title="", volume=fnp.volume, volume_count="", year=fnp.year, format="", ) return fni def combine_notes(existing_notes: str | None, new_notes: str | None, split: str) -> str: split_notes, split_str, untouched_notes = (existing_notes or "").rpartition(split) if split_notes or split_str: return (split_notes + (new_notes or "")).strip() else: return (untouched_notes + "\n" + (new_notes or "")).strip() def parse_date_str(date_str: str | None) -> tuple[int | None, int | None, int | None]: day = None month = None year = None if date_str: parts = date_str.split("-") year = xlate_int(parts[0]) if len(parts) > 1: month = xlate_int(parts[1]) if len(parts) > 2: day = xlate_int(parts[2]) return day, month, year def get_recursive_filelist(pathlist: list[str]) -> list[str]: """Get a recursive list of of all files 
def get_recursive_filelist(pathlist: list[str]) -> list[str]:
    """Get a recursive list of all files under all path items in the list"""
    filelist: list[str] = []
    for p in pathlist:
        if os.path.isdir(p):
            for root, _, files in os.walk(p):
                for f in files:
                    filelist.append(os.path.join(root, f))
        elif os.path.exists(p):
            filelist.append(p)

    return filelist


def add_to_path(dirname: str) -> None:
    if dirname:
        dirname = os.path.abspath(dirname)
        paths = [os.path.normpath(x) for x in split(os.environ["PATH"], os.pathsep)]

        if dirname not in paths:
            paths.insert(0, dirname)
            os.environ["PATH"] = os.pathsep.join(paths)


def xlate_int(data: Any) -> int | None:
    data = xlate_float(data)
    if data is None:
        return None
    return int(data)


def xlate_float(data: Any) -> float | None:
    if isinstance(data, str):
        data = data.strip()
    if data is None or data == "":
        return None
    i: str | int | float
    if isinstance(data, (int, float)):
        i = data
    else:
        # delete every character that is not a digit or "."
        i = str(data).translate(defaultdict(lambda: None, zip((ord(c) for c in "1234567890."), "1234567890.")))
    if i == "":
        return None
    try:
        return float(i)
    except ValueError:
        return None


def xlate(data: Any) -> str | None:
    if data is None or isinstance(data, str) and data.strip() == "":
        return None
    return str(data).strip()


def split(s: str | None, c: str) -> list[str]:
    s = xlate(s)
    if s:
        return [x.strip() for x in s.strip().split(c) if x.strip()]
    return []


def remove_articles(text: str) -> str:
    text = text.casefold()
    articles = [
        "&",
        "a",
        "am",
        "an",
        "and",
        "as",
        "at",
        "be",
        "but",
        "by",
        "for",
        "if",
        "is",
        "issue",
        "it",
        "it's",
        "its",
        "itself",
        "of",
        "or",
        "so",
        "the",
        "with",
    ]
    new_text = ""
    for word in text.split():
        if word not in articles:
            new_text += word + " "

    new_text = new_text[:-1]

    return new_text


def sanitize_title(text: str, basic: bool = False) -> str:
    # normalize unicode and convert to ascii. Does not work for everything, e.g. ½ becomes 1⁄2, not 1/2
    text = unicodedata.normalize("NFKD", text).casefold()
    # comicvine keeps apostrophes as part of the word
    text = text.replace("'", "")
    text = text.replace('"', "")
    if not basic:
        # comicvine ignores punctuation and accents
        # remove all characters that are not a letter, separator (space) or number
        # replace any "dash punctuation" with a space
        # makes sure that batman-superman and self-proclaimed stay separate words
        text = "".join(
            c if unicodedata.category(c)[0] not in "P" else " " for c in text if unicodedata.category(c)[0] in "LZNP"
        )
        # remove extra spaces and articles; the text is already all lower case
        text = remove_articles(text).strip()

    return text


def titles_match(search_title: str, record_title: str, threshold: int = 90) -> bool:
    import rapidfuzz.fuzz

    sanitized_search = sanitize_title(search_title)
    sanitized_record = sanitize_title(record_title)
    ratio = int(rapidfuzz.fuzz.ratio(sanitized_search, sanitized_record))
    logger.debug(
        "search title: %s ; record title: %s ; ratio: %d ; match threshold: %d",
        search_title,
        record_title,
        ratio,
        threshold,
    )
    return ratio >= threshold


def unique_file(file_name: pathlib.Path) -> pathlib.Path:
    name = file_name.stem
    counter = 1
    while True:
        if not file_name.exists():
            return file_name
        file_name = file_name.with_stem(name + " (" + str(counter) + ")")
        counter += 1
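
# Illustrative sketch, not part of the module API (the values below are hypothetical):
# xlate_float() drops every character except digits and ".", so unit or currency
# suffixes survive conversion, and xlate_int() truncates through float:
#
#   xlate_float("$1,234.50") -> 1234.5
#   xlate_int("12 pages")    -> 12
#   xlate_int("")            -> None
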
_languages: dict[str | None, str | None] = defaultdict(lambda: None)
_countries: dict[str | None, str | None] = defaultdict(lambda: None)


def countries() -> dict[str | None, str | None]:
    if not _countries:
        import isocodes

        for alpha_2, c in isocodes.countries.by_alpha_2:
            _countries[alpha_2] = c["name"]
    return _countries


def languages() -> dict[str | None, str | None]:
    if not _languages:
        import isocodes

        for alpha_2, lng in isocodes.extendend_languages._sorted_by_index(index="alpha_2"):
            _languages[alpha_2] = lng["name"]
    return _languages


def get_language_from_iso(iso: str | None) -> str | None:
    return languages()[iso]


def get_language_iso(string: str | None) -> str | None:
    if string is None:
        return None
    import isocodes

    # Return the current string if all else fails
    lang = string.casefold()
    found = None
    for lng in isocodes.extendend_languages.items:
        for x in ("alpha_2", "alpha_3", "bibliographic", "common_name", "name"):
            if x in lng and lng[x].casefold() == lang:
                found = lng
        if found:
            break
    if found:
        return found.get("alpha_2", None)
    return lang


def get_country_from_iso(iso: str | None) -> str | None:
    return countries()[iso]


def get_publisher(publisher: str) -> tuple[str, str]:
    imprint = ""

    for pub in publishers.values():
        imprint, publisher, ok = pub[publisher]
        if ok:
            break

    return imprint, publisher


def update_publishers(new_publishers: Mapping[str, Mapping[str, str]]) -> None:
    for publisher in new_publishers:
        if publisher in publishers:
            publishers[publisher].update(new_publishers[publisher])
        else:
            publishers[publisher] = ImprintDict(publisher, new_publishers[publisher])


class ImprintDict(dict):  # type: ignore
    """
    ImprintDict takes a publisher and a dict or mapping of lowercased
    imprint names to the proper imprint name. Retrieving a value from an
    ImprintDict returns a tuple of (imprint, publisher, keyExists).
    If the key does not exist, the key is returned as the publisher unchanged.
    """

    def __init__(self, publisher: str, mapping: tuple | Mapping = (), **kwargs: dict) -> None:  # type: ignore
        super().__init__(mapping, **kwargs)
        self.publisher = publisher

    def __missing__(self, key: str) -> None:
        return None

    def __getitem__(self, k: str) -> tuple[str, str, bool]:
        item = super().__getitem__(k.casefold())
        if k.casefold() == self.publisher.casefold():
            return "", self.publisher, True
        if item is None:
            return "", k, False
        else:
            return item, self.publisher, True

    def copy(self) -> ImprintDict:
        return ImprintDict(self.publisher, super().copy())


publishers: dict[str, ImprintDict] = {}


def load_publishers() -> None:
    try:
        update_publishers(json.loads((comicapi.data.data_path / "publishers.json").read_text("utf-8")))
    except Exception:
        logger.exception("Failed to load publishers.json; there are no publishers or imprints loaded")
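

if __name__ == "__main__":
    # Illustrative sketch of the ImprintDict lookup contract; the data here is
    # hypothetical and not shipped with this module.
    demo = ImprintDict("Marvel", {"epic": "Epic Comics"})
    print(demo["Epic"])    # ('Epic Comics', 'Marvel', True) - known imprint
    print(demo["marvel"])  # ('', 'Marvel', True) - the publisher itself
    print(demo["DC"])      # ('', 'DC', False) - unknown key passes through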