2c3a2566cc
I could not find a good reason for ComicIssue to exist other than that it had more attributes than GenericMetadata, so it has been replaced. New attributes for GenericMetadata: series_id: a string uniquely identifying the series to tag_origin series_aliases: alternate series names that are not the canonical name title_aliases: alternate issue titles that are not the canonical name alternate_images: a list of urls to alternate cover images Updated attributes for GenericMetadata: genre -> genres: str -> list[str] comments -> description: str -> str story_arc -> story_arcs: str -> list[str] series_group -> series_groups: str -> list[str] character -> characters: str -> list[str] team -> teams: str -> list[str] location -> locations: str -> list[str] tag_origin -> tag_origin: str -> TagOrigin (tuple[str, str]) ComicSeries has been relocated to the ComicAPI package; it currently has no usage within ComicAPI. CreditMetadata has been renamed to Credit and has replaced Credit from ComicTalker. fetch_series has been added to ComicTalker; it is currently only used in the GUI when a series is selected and does not already contain the needed fields, and this function should always be cached. A new split function has been added to ComicAPI; all uses of split on single characters have been updated to use this. cleanup_html and the corresponding setting are now only used in ComicTagger proper; for display we want any html directly from the upstream. When applying the metadata we then strip the description of any html. A new conversion has been added to the MetadataFormatter: j: joins any lists into a string with ', '. Note that this is a valid operation on strings as well; it will add ', ' between every character. parse_settings now assigns the given ComicTaggerPaths object to the result, ensuring that the correct path is always used.
336 lines
9.1 KiB
Python
336 lines
9.1 KiB
Python
"""Some generic utilities"""
|
||
# Copyright 2012-2014 ComicTagger Authors
|
||
#
|
||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||
# you may not use this file except in compliance with the License.
|
||
# You may obtain a copy of the License at
|
||
#
|
||
# http://www.apache.org/licenses/LICENSE-2.0
|
||
#
|
||
# Unless required by applicable law or agreed to in writing, software
|
||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
# See the License for the specific language governing permissions and
|
||
# limitations under the License.
|
||
from __future__ import annotations
|
||
|
||
import json
|
||
import logging
|
||
import os
|
||
import pathlib
|
||
import platform
|
||
import unicodedata
|
||
from collections import defaultdict
|
||
from collections.abc import Iterable, Mapping
|
||
from shutil import which # noqa: F401
|
||
from typing import Any
|
||
|
||
import comicapi.data
|
||
|
||
try:
|
||
import icu
|
||
|
||
del icu
|
||
icu_available = True
|
||
except ImportError:
|
||
icu_available = False
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
def _custom_key(tup):
|
||
import natsort
|
||
|
||
lst = []
|
||
for x in natsort.os_sort_keygen()(tup):
|
||
ret = x
|
||
if len(x) > 1 and isinstance(x[1], int) and isinstance(x[0], str) and x[0] == "":
|
||
ret = ("a", *x[1:])
|
||
|
||
lst.append(ret)
|
||
return tuple(lst)
|
||
|
||
|
||
def os_sorted(lst: Iterable) -> Iterable:
|
||
import natsort
|
||
|
||
key = _custom_key
|
||
if icu_available or platform.system() == "Windows":
|
||
key = natsort.os_sort_keygen()
|
||
return sorted(lst, key=key)
|
||
|
||
|
||
def combine_notes(existing_notes: str | None, new_notes: str | None, split: str) -> str:
|
||
split_notes, split_str, untouched_notes = (existing_notes or "").rpartition(split)
|
||
if split_notes or split_str:
|
||
return (split_notes + (new_notes or "")).strip()
|
||
else:
|
||
return (untouched_notes + "\n" + (new_notes or "")).strip()
|
||
|
||
|
||
def parse_date_str(date_str: str | None) -> tuple[int | None, int | None, int | None]:
|
||
day = None
|
||
month = None
|
||
year = None
|
||
if date_str:
|
||
parts = date_str.split("-")
|
||
year = xlate_int(parts[0])
|
||
if len(parts) > 1:
|
||
month = xlate_int(parts[1])
|
||
if len(parts) > 2:
|
||
day = xlate_int(parts[2])
|
||
return day, month, year
|
||
|
||
|
||
def get_recursive_filelist(pathlist: list[str]) -> list[str]:
|
||
"""Get a recursive list of of all files under all path items in the list"""
|
||
|
||
filelist: list[str] = []
|
||
for p in pathlist:
|
||
if os.path.isdir(p):
|
||
for root, _, files in os.walk(p):
|
||
for f in files:
|
||
filelist.append(os.path.join(root, f))
|
||
elif os.path.exists(p):
|
||
filelist.append(p)
|
||
|
||
return filelist
|
||
|
||
|
||
def add_to_path(dirname: str) -> None:
|
||
if dirname:
|
||
dirname = os.path.abspath(dirname)
|
||
paths = [os.path.normpath(x) for x in split(os.environ["PATH"], os.pathsep)]
|
||
|
||
if dirname not in paths:
|
||
paths.insert(0, dirname)
|
||
os.environ["PATH"] = os.pathsep.join(paths)
|
||
|
||
|
||
def xlate_int(data: Any) -> int | None:
|
||
data = xlate_float(data)
|
||
if data is None:
|
||
return None
|
||
return int(data)
|
||
|
||
|
||
def xlate_float(data: Any) -> float | None:
|
||
if isinstance(data, str):
|
||
data = data.strip()
|
||
if data is None or data == "":
|
||
return None
|
||
i: str | int | float
|
||
if isinstance(data, (int, float)):
|
||
i = data
|
||
else:
|
||
i = str(data).translate(defaultdict(lambda: None, zip((ord(c) for c in "1234567890."), "1234567890.")))
|
||
if i == "":
|
||
return None
|
||
try:
|
||
return float(i)
|
||
except ValueError:
|
||
return None
|
||
|
||
|
||
def xlate(data: Any) -> str | None:
|
||
if data is None or isinstance(data, str) and data.strip() == "":
|
||
return None
|
||
|
||
return str(data).strip()
|
||
|
||
|
||
def split(s: str | None, c: str) -> list[str]:
|
||
s = xlate(s)
|
||
if s:
|
||
return [x.strip() for x in s.strip().split(c) if x.strip()]
|
||
return []
|
||
|
||
|
||
def remove_articles(text: str) -> str:
|
||
text = text.casefold()
|
||
articles = [
|
||
"&",
|
||
"a",
|
||
"am",
|
||
"an",
|
||
"and",
|
||
"as",
|
||
"at",
|
||
"be",
|
||
"but",
|
||
"by",
|
||
"for",
|
||
"if",
|
||
"is",
|
||
"issue",
|
||
"it",
|
||
"it's",
|
||
"its",
|
||
"itself",
|
||
"of",
|
||
"or",
|
||
"so",
|
||
"the",
|
||
"the",
|
||
"with",
|
||
]
|
||
new_text = ""
|
||
for word in text.split():
|
||
if word not in articles:
|
||
new_text += word + " "
|
||
|
||
new_text = new_text[:-1]
|
||
|
||
return new_text
|
||
|
||
|
||
def sanitize_title(text: str, basic: bool = False) -> str:
|
||
# normalize unicode and convert to ascii. Does not work for everything eg ½ to 1⁄2 not 1/2
|
||
text = unicodedata.normalize("NFKD", text).casefold()
|
||
# comicvine keeps apostrophes a part of the word
|
||
text = text.replace("'", "")
|
||
text = text.replace('"', "")
|
||
if not basic:
|
||
# comicvine ignores punctuation and accents
|
||
# remove all characters that are not a letter, separator (space) or number
|
||
# replace any "dash punctuation" with a space
|
||
# makes sure that batman-superman and self-proclaimed stay separate words
|
||
text = "".join(
|
||
c if unicodedata.category(c)[0] not in "P" else " " for c in text if unicodedata.category(c)[0] in "LZNP"
|
||
)
|
||
# remove extra space and articles and all lower case
|
||
text = remove_articles(text).strip()
|
||
|
||
return text
|
||
|
||
|
||
def titles_match(search_title: str, record_title: str, threshold: int = 90) -> bool:
|
||
import rapidfuzz.fuzz
|
||
|
||
sanitized_search = sanitize_title(search_title)
|
||
sanitized_record = sanitize_title(record_title)
|
||
ratio = int(rapidfuzz.fuzz.ratio(sanitized_search, sanitized_record))
|
||
logger.debug(
|
||
"search title: %s ; record title: %s ; ratio: %d ; match threshold: %d",
|
||
search_title,
|
||
record_title,
|
||
ratio,
|
||
threshold,
|
||
)
|
||
return ratio >= threshold
|
||
|
||
|
||
def unique_file(file_name: pathlib.Path) -> pathlib.Path:
|
||
name = file_name.stem
|
||
counter = 1
|
||
while True:
|
||
if not file_name.exists():
|
||
return file_name
|
||
file_name = file_name.with_stem(name + " (" + str(counter) + ")")
|
||
counter += 1
|
||
|
||
|
||
_languages: dict[str | None, str | None] = defaultdict(lambda: None)
|
||
|
||
_countries: dict[str | None, str | None] = defaultdict(lambda: None)
|
||
|
||
|
||
def countries() -> dict[str | None, str | None]:
|
||
if not _countries:
|
||
import pycountry
|
||
|
||
for c in pycountry.countries:
|
||
if "alpha_2" in c._fields:
|
||
_countries[c.alpha_2] = c.name
|
||
return _countries
|
||
|
||
|
||
def languages() -> dict[str | None, str | None]:
|
||
if not _languages:
|
||
import pycountry
|
||
|
||
for lng in pycountry.languages:
|
||
if "alpha_2" in lng._fields:
|
||
_languages[lng.alpha_2] = lng.name
|
||
return _languages
|
||
|
||
|
||
def get_language_from_iso(iso: str | None) -> str | None:
|
||
return languages()[iso]
|
||
|
||
|
||
def get_language_iso(string: str | None) -> str | None:
|
||
if string is None:
|
||
return None
|
||
import pycountry
|
||
|
||
# Return current string if all else fails
|
||
lang = string.casefold()
|
||
|
||
try:
|
||
return getattr(pycountry.languages.lookup(string), "alpha_2", None)
|
||
except LookupError:
|
||
pass
|
||
return lang
|
||
|
||
|
||
def get_country_from_iso(iso: str | None) -> str | None:
|
||
return countries()[iso]
|
||
|
||
|
||
def get_publisher(publisher: str) -> tuple[str, str]:
|
||
imprint = ""
|
||
|
||
for pub in publishers.values():
|
||
imprint, publisher, ok = pub[publisher]
|
||
if ok:
|
||
break
|
||
|
||
return imprint, publisher
|
||
|
||
|
||
def update_publishers(new_publishers: Mapping[str, Mapping[str, str]]) -> None:
|
||
for publisher in new_publishers:
|
||
if publisher in publishers:
|
||
publishers[publisher].update(new_publishers[publisher])
|
||
else:
|
||
publishers[publisher] = ImprintDict(publisher, new_publishers[publisher])
|
||
|
||
|
||
class ImprintDict(dict): # type: ignore
|
||
"""
|
||
ImprintDict takes a publisher and a dict or mapping of lowercased
|
||
imprint names to the proper imprint name. Retrieving a value from an
|
||
ImprintDict returns a tuple of (imprint, publisher, keyExists).
|
||
if the key does not exist the key is returned as the publisher unchanged
|
||
"""
|
||
|
||
def __init__(self, publisher: str, mapping: tuple | Mapping = (), **kwargs: dict) -> None: # type: ignore
|
||
super().__init__(mapping, **kwargs)
|
||
self.publisher = publisher
|
||
|
||
def __missing__(self, key: str) -> None:
|
||
return None
|
||
|
||
def __getitem__(self, k: str) -> tuple[str, str, bool]:
|
||
item = super().__getitem__(k.casefold())
|
||
if k.casefold() == self.publisher.casefold():
|
||
return "", self.publisher, True
|
||
if item is None:
|
||
return "", k, False
|
||
else:
|
||
return item, self.publisher, True
|
||
|
||
def copy(self) -> ImprintDict:
|
||
return ImprintDict(self.publisher, super().copy())
|
||
|
||
|
||
publishers: dict[str, ImprintDict] = {}
|
||
|
||
|
||
def load_publishers() -> None:
|
||
try:
|
||
update_publishers(json.loads((comicapi.data.data_path / "publishers.json").read_text("utf-8")))
|
||
except Exception:
|
||
logger.exception("Failed to load publishers.json; The are no publishers or imprints loaded")
|