Allow non-ascii in ComicVine searches

This commit is contained in:
Timmy Welch 2022-07-09 23:26:30 -07:00
parent a00891f622
commit e6414fba96
2 changed files with 29 additions and 19 deletions

View File

@ -121,19 +121,36 @@ def remove_articles(text: str) -> str:
def sanitize_title(text: str, basic: bool = False) -> str:
    """Normalize a title so it can be compared against ComicVine results.

    The text is always NFKD-normalized and case-folded first. NFKD splits
    accented letters into a base letter plus combining marks and expands
    compatibility characters; it does not yield ASCII for everything, e.g.
    "½" becomes "1", U+2044 FRACTION SLASH, "2" rather than "1/2".

    Args:
        text: The title to sanitize.
        basic: When true, only straight apostrophes and straight double
            quotes are removed after normalization. When false, punctuation,
            symbols and combining marks are removed as well, dashes become
            spaces, and the result is passed through ``remove_articles``.

    Returns:
        The sanitized, case-folded title.
    """
    text = unicodedata.normalize("NFKD", text).casefold()

    if basic:
        # comicvine keeps apostrophes a part of the word
        return text.replace("'", "").replace('"', "")

    # comicvine ignores punctuation and accents:
    # keep only letters (L*), separators (Z*) and numbers (N*), which also
    # drops the combining marks (M*) that NFKD split off accented letters.
    kept = []
    for char in text:
        category = unicodedata.category(char)
        if category == "Pd":
            # dash punctuation becomes a space so that batman-superman and
            # self-proclaimed stay separate words
            kept.append(" ")
        elif category[0] in "LZN":
            kept.append(char)

    # remove articles and surrounding whitespace (text is already case-folded)
    return remove_articles("".join(kept)).strip()
def titles_match(search_title, record_title):
    """Report whether every term of the search title occurs in the record title.

    Both titles are passed through ``sanitize_title`` first. Each
    whitespace-separated term of the sanitized search title is then looked up
    in the sanitized record title with a substring test, so a term may match
    inside a longer word. A search title with no terms matches anything.
    """
    wanted_terms = sanitize_title(search_title).split()
    candidate = sanitize_title(record_title)
    return all(term in candidate for term in wanted_terms)
def unique_file(file_name: pathlib.Path) -> pathlib.Path:
name = file_name.name
counter = 1

View File

@ -227,7 +227,7 @@ class ComicVineTalker:
"format": "json",
"resources": "volume",
"query": search_series_name,
"field_list": "volume,name,id,start_year,publisher,image,description,count_of_issues",
"field_list": "volume,name,id,start_year,publisher,image,description,count_of_issues,aliases",
"page": 1,
"limit": 100,
}
@ -272,10 +272,7 @@ class ComicVineTalker:
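# See if the last result's name has all of the search terms.
# If not, break out of this loop; we're done.
# NOTE(review): the titles_match() assignment below sets stop_searching when the
# terms DO match, which looks inverted relative to this comment - confirm.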
for term in search_series_name.split():
if term not in last_result:
stop_searching = True
break
stop_searching = utils.titles_match(search_series_name, last_result)
# Also, stop searching when the word count of last results is too much longer than our search terms list
if len(last_result) > result_word_count_max:
@ -301,12 +298,8 @@ class ComicVineTalker:
if not literal:
# Remove any search results that don't contain all the search terms (iterate backwards for easy removal)
for record in reversed(search_results):
# Sanitize the series name for comicvine searching, comicvine search ignore symbols
record_name = utils.sanitize_title(record["name"])
for term in search_series_name.split():
if term not in record_name:
search_results.remove(record)
break
if not utils.titles_match(search_series_name, record["name"]):
search_results.remove(record)
# Cache these search results, even if it's literal we cache the results
# The most it will cause is extra processing time