From e6414fba966f7405d3c5389e735bc3649881951d Mon Sep 17 00:00:00 2001
From: Timmy Welch
Date: Sat, 9 Jul 2022 23:26:30 -0700
Subject: [PATCH] Allow non-ascii in ComicVine searches

---
 comicapi/utils.py                 | 34 +++++++++++++++++++++++--------
 comictaggerlib/comicvinetalker.py | 15 ++++----------
 2 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/comicapi/utils.py b/comicapi/utils.py
index 93e42de..8ff2355 100644
--- a/comicapi/utils.py
+++ b/comicapi/utils.py
@@ -121,19 +121,37 @@ def remove_articles(text: str) -> str:
 
 def sanitize_title(text: str, basic: bool = False) -> str:
     # normalize unicode and convert to ascii. Does not work for everything eg ½ to 1⁄2 not 1/2
-    text = unicodedata.normalize("NFKD", text)
-    # comicvine keeps apostrophes a part of the word
-    text = text.replace("'", "")
-    text = text.replace('"', "")
-    if not basic:
-        # comicvine ignores punctuation and accents, TODO: only remove punctuation accents and similar
-        text = re.sub(r"[^A-Za-z0-9]+", " ", text)
+    text = unicodedata.normalize("NFKD", text).casefold()
+    if basic:
+        # comicvine keeps apostrophes a part of the word
+        text = text.replace("'", "")
+        text = text.replace('"', "")
+    else:
+        # comicvine ignores punctuation and accents
+        # keep only letters (L*), separators (Z*) and numbers (N*); drop everything else
+        # replace any "dash punctuation" (Pd) with a space
+        # makes sure that batman-superman and self-proclaimed stay separate words
+        text = "".join(
+            " " if unicodedata.category(c) == "Pd" else c
+            for c in text
+            if unicodedata.category(c)[0] in "LZN" or unicodedata.category(c) == "Pd"
+        )
     # remove extra space and articles and all lower case
-    text = remove_articles(text).casefold().strip()
+    text = remove_articles(text).strip()
 
     return text
 
 
+def titles_match(search_title: str, record_title: str) -> bool:
+    """Return True if every sanitized search term is a substring of the sanitized record title."""
+    sanitized_search = sanitize_title(search_title)
+    sanitized_record = sanitize_title(record_title)
+    for term in sanitized_search.split():
+        if term not in sanitized_record:
+            return False
+    return True
+
+
 def unique_file(file_name: pathlib.Path) -> pathlib.Path:
     name = file_name.name
     counter = 1
diff --git a/comictaggerlib/comicvinetalker.py b/comictaggerlib/comicvinetalker.py
index c5cb236..089edb4 100644
--- a/comictaggerlib/comicvinetalker.py
+++ b/comictaggerlib/comicvinetalker.py
@@ -227,7 +227,7 @@ class ComicVineTalker:
             "format": "json",
             "resources": "volume",
             "query": search_series_name,
-            "field_list": "volume,name,id,start_year,publisher,image,description,count_of_issues",
+            "field_list": "volume,name,id,start_year,publisher,image,description,count_of_issues,aliases",
             "page": 1,
             "limit": 100,
         }
@@ -272,10 +272,7 @@ class ComicVineTalker:
 
                 # See if the last result's name has all the of the search terms.
                 # If not, break out of this, loop, we're done.
-                for term in search_series_name.split():
-                    if term not in last_result:
-                        stop_searching = True
-                        break
+                stop_searching = not utils.titles_match(search_series_name, last_result)
 
                 # Also, stop searching when the word count of last results is too much longer than our search terms list
                 if len(last_result) > result_word_count_max:
@@ -301,12 +298,8 @@ class ComicVineTalker:
         if not literal:
             # Remove any search results that don't contain all the search terms (iterate backwards for easy removal)
             for record in reversed(search_results):
-                # Sanitize the series name for comicvine searching, comicvine search ignore symbols
-                record_name = utils.sanitize_title(record["name"])
-                for term in search_series_name.split():
-                    if term not in record_name:
-                        search_results.remove(record)
-                        break
+                if not utils.titles_match(search_series_name, record["name"]):
+                    search_results.remove(record)
 
             # Cache these search results, even if it's literal we cache the results
             # The most it will cause is extra processing time