# comictagger/comicapi/utils.py

"""Some generic utilities"""

# Copyright 2012-2014 Anthony Beville

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging
import os
import pathlib
import re
import unicodedata
from collections import defaultdict
from typing import Any, Iterable, List, Optional, Union

import pycountry

logger = logging.getLogger(__name__)


class UtilsVars:
    """Namespace for mutable, module-wide state flags."""

    # Starts out False. Nothing else in the visible source reads or sets it;
    # presumably callers flip it once an encoding fix has been applied
    # (TODO confirm against the callers).
    already_fixed_encoding: bool = False


def get_recursive_filelist(pathlist: List[str]) -> List[str]:
    """Expand every entry of ``pathlist`` into a flat list of file paths.

    A directory entry is walked recursively and contributes the path of each
    file found beneath it. Any other entry (a plain file, or a path that does
    not exist) is passed through unchanged. Entries that are not ``str``
    (e.g. a Qt ``QString``) are converted with ``str()`` first.
    """
    collected: List[str] = []
    for entry in pathlist:
        path = entry if isinstance(entry, str) else str(entry)

        if not os.path.isdir(path):
            collected.append(path)
            continue

        # os.walk() on a str path only ever yields str names, so the file
        # names need no further conversion before being joined.
        for dirpath, _subdirs, filenames in os.walk(path):
            collected.extend(os.path.join(dirpath, name) for name in filenames)

    return collected


def list_to_string(lst: List[Union[str, Any]]) -> str:
    """Join the items of ``lst`` into a single ``", "``-separated string.

    Items that are not already ``str`` are converted with ``str()``, so a
    mixed list no longer raises ``TypeError``. ``None`` and an empty list
    both give ``""``.

    For compatibility with the previous implementation, empty strings at the
    *start* of the list are dropped without emitting a separator:
    ``["", "a", "", "b"]`` gives ``"a, , b"``.
    """
    if lst is None:
        return ""

    parts = [item if isinstance(item, str) else str(item) for item in lst]

    # The separator was historically only written once the result was
    # non-empty, so joining starts at the first non-empty item.
    first_filled = next((idx for idx, part in enumerate(parts) if part), len(parts))
    return ", ".join(parts[first_filled:])


def add_to_path(dirname: str) -> None:
    """Prepend ``dirname`` to the ``PATH`` environment variable if it is absent.

    Does nothing when ``dirname`` is ``None`` or empty, or when ``PATH``
    already contains ``dirname`` as a complete ``os.pathsep``-delimited entry.
    When ``PATH`` is unset or empty it is set to ``dirname`` alone, so no
    empty entry is appended.
    """
    if not dirname:
        return

    sep = os.pathsep
    current = os.environ.get("PATH", "")
    if not current:
        os.environ["PATH"] = dirname
        return

    # Padding both strings with the separator turns "is a whole entry" into a
    # plain substring test, so a prefix of an existing entry never matches.
    if f"{sep}{dirname}{sep}" in f"{sep}{current}{sep}":
        return

    os.environ["PATH"] = dirname + sep + current


def which(program: str) -> Optional[str]:
    """Return the path of the executable ``program``, or ``None`` if not found.

    If ``program`` contains a directory component it is checked as given.
    Otherwise each directory listed in ``PATH`` is searched in order and the
    first match wins. When ``PATH`` is unset the search falls back to
    ``os.defpath`` instead of raising ``KeyError``.
    """

    def is_exe(fpath: str) -> bool:
        # Must be a regular file that the current process may execute.
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

    directory, _ = os.path.split(program)
    if directory:
        return program if is_exe(program) else None

    for folder in os.environ.get("PATH", os.defpath).split(os.pathsep):
        candidate = os.path.join(folder, program)
        if is_exe(candidate):
            return candidate

    return None


def xlate(data: Any, is_int: bool = False) -> Any:
    """Normalise a raw value to ``None``, a ``str``, or (with ``is_int``) a number.

    ``None`` and ``""`` always give ``None``. Without ``is_int`` the result is
    ``str(data)``.

    With ``is_int`` every character other than the ASCII digits 0-9 is
    discarded (signs and decimal points included). Then:

    * no digits left gives ``None``
    * exactly ``"0"`` is returned as the *string* ``"0"``, not the integer
    * anything else is returned as ``int(digits)``
    """
    if data is None or data == "":
        return None

    if not is_int:
        return str(data)

    digits = "".join(ch for ch in str(data) if ch in "1234567890")
    if digits == "":
        return None
    if digits == "0":
        return "0"
    return int(digits)


# Words dropped by remove_articles(). Built once as a frozenset so each
# membership test is O(1) and the collection is not rebuilt on every call.
_ARTICLES = frozenset(
    {
        "&",
        "a",
        "am",
        "an",
        "and",
        "as",
        "at",
        "be",
        "but",
        "by",
        "for",
        "if",
        "is",
        "issue",
        "it",
        "it's",
        "its",
        "itself",
        "of",
        "or",
        "so",
        "the",
        "with",
    }
)


def remove_articles(text: str) -> str:
    """Return ``text`` lower-cased with common filler words removed.

    The text is split on single spaces and every word found in ``_ARTICLES``
    is dropped. The remaining words are re-joined with single spaces. Because
    the split is on ``" "`` exactly, runs of spaces produce empty "words" that
    are kept, so spacing between surviving words is preserved.
    """
    return " ".join(word for word in text.lower().split(" ") if word not in _ARTICLES)


def sanitize_title(text: str) -> str:
    """Reduce a title to lower-case ASCII words suitable for loose comparison.

    Steps, in order: NFKD-normalise and drop everything that has no ASCII
    form, delete single and double quotes, turn every other run of
    non-alphanumeric characters into one space, strip filler words via
    ``remove_articles``, then lower-case and trim.
    """
    # Compatibility decomposition splits accented letters into base letter +
    # combining mark; encoding with "ignore" then discards the non-ASCII part.
    # This is lossy: e.g. "½" decomposes to 1⁄2 with a non-ASCII fraction
    # slash, so it ends up as "12", and titles in non-Latin scripts
    # (Chinese, Japanese, ...) lose most or all of their characters.
    ascii_text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("ascii")

    # comicvine keeps apostrophes as part of the word, so quotes are removed
    # outright instead of being turned into a word break.
    for quote in ("'", '"'):
        ascii_text = ascii_text.replace(quote, "")

    # comicvine ignores punctuation: each remaining run of non-alphanumerics
    # collapses to a single space.
    spaced = re.sub(r"[^A-Za-z0-9]+", " ", ascii_text)

    return remove_articles(spaced).lower().strip()


def unique_file(file_name: str) -> str:
    """Return ``file_name``, or a numbered variant of it that does not exist yet.

    If nothing exists at ``file_name`` (checked with ``os.path.lexists``, so a
    broken symlink counts as existing) it is returned unchanged. Otherwise
    ``" (1)"``, ``" (2)"``, ... is inserted before the extension until an
    unused name is found.
    """
    stem, extension = os.path.splitext(file_name)
    candidate = file_name
    attempt = 0
    while os.path.lexists(candidate):
        attempt += 1
        candidate = stem + " (" + str(attempt) + ")" + extension
    return candidate


# Lookup tables from a pycountry record's ``alpha_2`` code to its ``name``.
# Both are defaultdicts whose factory returns None, so indexing an unknown key
# yields None instead of raising KeyError (and, as with any defaultdict,
# stores that key with a None value as a side effect).
languages: dict[Optional[str], Optional[str]] = defaultdict(lambda: None)

countries: dict[Optional[str], Optional[str]] = defaultdict(lambda: None)

# Both tables are filled once, at import time. Records that have no
# ``alpha_2`` field are skipped.
for c in pycountry.countries:
    if "alpha_2" in c._fields:
        countries[c.alpha_2] = c.name

for lng in pycountry.languages:
    if "alpha_2" in lng._fields:
        languages[lng.alpha_2] = lng.name


def get_language_from_iso(iso: Optional[str]) -> Optional[str]:
    """Return the language name stored in ``languages`` for the code ``iso``.

    Returns ``None`` when the code is not in the table (including when
    ``iso`` is ``None``).
    """
    # .get() rather than indexing: ``languages`` is a defaultdict, so
    # ``languages[iso]`` would store a None entry for every unknown code and
    # slowly pollute the shared table.
    return languages.get(iso)


def get_language(string: Optional[str]) -> Optional[str]:
    """Resolve ``string`` to a language name, or ``None`` if it is unknown.

    The value is first tried as a code via ``get_language_from_iso``. If that
    finds nothing it is handed to ``pycountry.languages.lookup``. ``None`` in
    gives ``None`` out.
    """
    if string is None:
        return None

    lang = get_language_from_iso(string)
    if lang is not None:
        return lang

    try:
        return str(pycountry.languages.lookup(string).name)
    except Exception:
        # Best-effort lookup: pycountry signals "not found" with LookupError,
        # and any other failure is likewise treated as "unknown language".
        # Catching Exception instead of using a bare ``except`` lets
        # KeyboardInterrupt and SystemExit propagate.
        return None


def get_publisher(publisher: str) -> tuple[str, str]:
    """Split a publisher-or-imprint name into ``(imprint, publisher)``.

    Each table in ``publishers`` is asked about the name in turn. The first
    one that recognises it decides the result. If none does, the result is
    ``("", publisher)`` with the name unchanged. ``None`` gives ``("", "")``.
    """
    if publisher is None:
        return ("", "")

    imprint = ""
    for imprint_table in publishers.values():
        # Indexing an ImprintDict yields (imprint, publisher, found); on a
        # miss the publisher element is the name that was looked up.
        imprint, publisher, found = imprint_table[publisher]
        if found:
            return (imprint, publisher)

    return (imprint, publisher)


def update_publishers(new_publishers: dict[str, dict[str, str]]) -> None:
    """Merge ``new_publishers`` into the module-level ``publishers`` registry.

    ``new_publishers`` maps a publisher name to that publisher's imprint
    mapping. Imprints of an already-registered publisher are merged into its
    existing table (new values win). An unknown publisher gets a fresh
    ``ImprintDict``.
    """
    for name, imprints in new_publishers.items():
        if name not in publishers:
            publishers[name] = ImprintDict(name, imprints)
        else:
            publishers[name].update(imprints)


class ImprintDict(dict):
    """
    Imprint table for a single publisher.

    Built from the publisher's name plus a dict or mapping of lowercased
    imprint names to the proper imprint name. Retrieving a value with
    ``d[name]`` is case-insensitive (the key is casefolded before the lookup)
    and returns a tuple of (imprint, publisher, keyExists):

    * ``name`` is the publisher itself -> ``("", publisher, True)``
    * ``name`` is a known imprint      -> ``(imprint, publisher, True)``
    * otherwise                        -> ``("", name, False)``, i.e. the key
      is handed back unchanged in the publisher position
    """

    def __init__(self, publisher: str, mapping: Iterable = (), **kwargs: Any):
        super().__init__(mapping, **kwargs)
        self.publisher = publisher

    def __missing__(self, key: str) -> None:
        # Unknown keys read as None instead of raising KeyError; nothing is
        # inserted into the dict.
        return None

    def __getitem__(self, k: str) -> tuple[str, str, bool]:
        folded = k.casefold()
        imprint = super().__getitem__(folded)

        # A match on the publisher's own name takes priority over any stored
        # imprint entry.
        if folded == self.publisher.casefold():
            return ("", self.publisher, True)
        if imprint is not None:
            return (imprint, self.publisher, True)
        return ("", k, False)

    def copy(self) -> "ImprintDict":
        # dict.copy() would return a plain dict and lose the publisher name.
        return ImprintDict(self.publisher, super().copy())


# Registry of known publishers, filled in by update_publishers(): each key is
# a publisher name and each value is the ImprintDict holding that publisher's
# imprints. Starts out empty.
publishers: dict[str, ImprintDict] = {}


def load_publishers() -> None:
    """Load ``data/publishers.json`` (next to this module) into ``publishers``.

    The file is read as UTF-8, parsed as JSON and passed to
    ``update_publishers``. Any failure is logged with its traceback and then
    swallowed, leaving ``publishers`` as it was.
    """
    try:
        data_file = pathlib.Path(__file__).parent / "data" / "publishers.json"
        parsed = json.loads(data_file.read_text("utf-8"))
        update_publishers(parsed)
    except Exception:
        logger.exception("Failed to load publishers.json; The are no publishers or imprints loaded")