Allow non-ascii in ComicVine searches
This commit is contained in:
parent
a00891f622
commit
e6414fba96
@ -121,19 +121,36 @@ def remove_articles(text: str) -> str:
|
||||
|
||||
def sanitize_title(text: str, basic: bool = False) -> str:
    """Normalize a title so it can be compared against ComicVine names.

    The text is always NFKD-normalized and case-folded.

    Args:
        text: The title to sanitize.
        basic: When true, only single and double quotes are removed.
            When false, every character that is not a letter, number or
            separator is dropped, dash punctuation becomes a space, and the
            result is passed through ``remove_articles`` and stripped.

    Returns:
        The sanitized title.
    """
    # NFKD splits accented letters into a base letter plus a combining mark.
    # It does not work for everything, e.g. "½" becomes "1⁄2" (fraction
    # slash), not "1/2".
    text = unicodedata.normalize("NFKD", text).casefold()

    if basic:
        # comicvine keeps apostrophes a part of the word
        return text.replace("'", "").replace('"', "")

    # comicvine ignores punctuation and accents
    kept = []
    for char in text:
        category = unicodedata.category(char)
        if category == "Pd":
            # Dash punctuation becomes a space so that batman-superman and
            # self-proclaimed stay separate words.
            kept.append(" ")
        elif category[0] in "LZN":
            # Keep letters, separators (spaces) and numbers; everything else,
            # including the combining accents produced by NFKD, is dropped.
            kept.append(char)

    # remove extra space and articles
    return remove_articles("".join(kept)).strip()
|
||||
|
||||
|
||||
def titles_match(search_title, record_title):
    """Report whether every search word appears in the record title.

    Both titles are passed through ``sanitize_title`` first. Each
    whitespace-separated word of the sanitized search title must occur as a
    substring of the sanitized record title. A search title with no words
    matches everything.
    """
    wanted_words = sanitize_title(search_title).split()
    candidate = sanitize_title(record_title)
    return all(word in candidate for word in wanted_words)
|
||||
|
||||
|
||||
def unique_file(file_name: pathlib.Path) -> pathlib.Path:
|
||||
name = file_name.name
|
||||
counter = 1
|
||||
|
@ -227,7 +227,7 @@ class ComicVineTalker:
|
||||
"format": "json",
|
||||
"resources": "volume",
|
||||
"query": search_series_name,
|
||||
"field_list": "volume,name,id,start_year,publisher,image,description,count_of_issues",
|
||||
"field_list": "volume,name,id,start_year,publisher,image,description,count_of_issues,aliases",
|
||||
"page": 1,
|
||||
"limit": 100,
|
||||
}
|
||||
@ -272,10 +272,7 @@ class ComicVineTalker:
|
||||
|
||||
# See if the last result's name has all of the search terms.
|
||||
# If not, break out of this loop; we're done.
|
||||
for term in search_series_name.split():
|
||||
if term not in last_result:
|
||||
stop_searching = True
|
||||
break
|
||||
stop_searching = utils.titles_match(search_series_name, last_result)
|
||||
|
||||
# Also, stop searching when the word count of last results is too much longer than our search terms list
|
||||
if len(last_result) > result_word_count_max:
|
||||
@ -301,12 +298,8 @@ class ComicVineTalker:
|
||||
if not literal:
|
||||
# Remove any search results that don't contain all the search terms (iterate backwards for easy removal)
|
||||
for record in reversed(search_results):
|
||||
# Sanitize the series name for comicvine searching, comicvine search ignore symbols
|
||||
record_name = utils.sanitize_title(record["name"])
|
||||
for term in search_series_name.split():
|
||||
if term not in record_name:
|
||||
search_results.remove(record)
|
||||
break
|
||||
if not utils.titles_match(search_series_name, record["name"]):
|
||||
search_results.remove(record)
|
||||
|
||||
# Cache these search results, even if it's literal we cache the results
|
||||
# The most it will cause is extra processing time
|
||||
|
Loading…
x
Reference in New Issue
Block a user