From e7fe5206603133aa593dc1b7fdc8d9affbd15aa2 Mon Sep 17 00:00:00 2001 From: lordwelch Date: Sun, 26 Sep 2021 17:06:30 -0700 Subject: [PATCH] Improve issue identification Move title sanitizing code to utils module Update issue identifier to compare sanitized names --- comicapi/utils.py | 16 ++++++++++++++++ comictaggerlib/comicvinetalker.py | 26 ++++++-------------------- comictaggerlib/issueidentifier.py | 6 ++++-- 3 files changed, 26 insertions(+), 22 deletions(-) diff --git a/comicapi/utils.py b/comicapi/utils.py index f82f99a..05bacc2 100644 --- a/comicapi/utils.py +++ b/comicapi/utils.py @@ -21,6 +21,7 @@ import re import platform import locale import codecs +import unicodedata class UtilsVars: @@ -151,6 +152,21 @@ def removearticles(text): return newText +def sanitize_title(text): + # normalize unicode and convert to ascii. Does not work for everything, e.g. ½ becomes 1⁄2, not 1/2 + # this will probably cause issues with titles in other character sets, e.g. Chinese, Japanese + text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii') + # comicvine treats apostrophes as part of the word, so drop them instead of splitting the word + text = text.replace("'", "") + text = text.replace("\"", "") + # comicvine ignores punctuation and accents + text = re.sub(r'[^A-Za-z0-9]+',' ', text) + # remove extra space and articles and all lower case + text = removearticles(text).lower().strip() + + return text + + def unique_file(file_name): counter = 1 # returns ('/path/file', '.ext') diff --git a/comictaggerlib/comicvinetalker.py b/comictaggerlib/comicvinetalker.py index 75f254a..c92ce69 100644 --- a/comictaggerlib/comicvinetalker.py +++ b/comictaggerlib/comicvinetalker.py @@ -21,7 +21,6 @@ import time import datetime import sys import ssl -import unicodedata #from pprint import pprint #import math @@ -204,13 +203,8 @@ class ComicVineTalker(QObject): def searchForSeries(self, series_name, callback=None, refresh_cache=False): - # normalize unicode and convert to ascii. 
Does not work for everything eg ½ to 1⁄2 not 1/2 - search_series_name = unicodedata.normalize('NFKD', series_name).encode('ascii', 'ignore').decode('ascii') - # comicvine ignores punctuation and accents - search_series_name = re.sub(r'[^A-Za-z0-9]+',' ', search_series_name) - # remove extra space and articles and all lower case - search_series_name = utils.removearticles(search_series_name).lower().strip() - + # Sanitize the series name for comicvine searching; comicvine search ignores symbols + search_series_name = utils.sanitize_title(series_name) # before we search online, look in our cache, since we might have # done this same search recently @@ -270,12 +264,8 @@ class ComicVineTalker(QObject): last_result = search_results[-1]['name'] - # normalize unicode and convert to ascii. Does not work for everything eg ½ to 1⁄2 not 1/2 - last_result = unicodedata.normalize('NFKD', last_result).encode('ascii', 'ignore').decode('ascii') - # comicvine ignores punctuation and accents - last_result = re.sub(r'[^A-Za-z0-9]+',' ', last_result) - # remove extra space and articles and all lower case - last_result = utils.removearticles(last_result).lower().strip() + # Sanitize the last result's name the same way as the search string; comicvine search ignores symbols + last_result = utils.sanitize_title(last_result) # See if the last result's name has all the of the search terms. # if not, break out of this, loop, we're done. @@ -314,13 +304,9 @@ class ComicVineTalker(QObject): # (iterate backwards for easy removal) for i in range(len(search_results) - 1, -1, -1): record = search_results[i] + # Sanitize the record's name the same way as the search string; comicvine search ignores symbols + recordName = utils.sanitize_title(record['name']) for term in search_series_name.split(): - # normalize unicode and convert to ascii. 
Does not work for everything eg ½ to 1⁄2 not 1/2 - recordName = unicodedata.normalize('NFKD', record['name']).encode('ascii', 'ignore').decode('ascii') - # comicvine ignores punctuation and accents - recordName = re.sub(r'[^A-Za-z0-9]+',' ', recordName) - # remove extra space and articles and all lower case - recordName = utils.removearticles(recordName).lower().strip() if term not in recordName: del search_results[i] diff --git a/comictaggerlib/issueidentifier.py b/comictaggerlib/issueidentifier.py index 292e84c..0c16248 100644 --- a/comictaggerlib/issueidentifier.py +++ b/comictaggerlib/issueidentifier.py @@ -435,8 +435,10 @@ class IssueIdentifier: # assume that our search name is close to the actual name, say # within ,e.g. 5 chars - shortened_key = utils.removearticles(keys['series']) - shortened_item_name = utils.removearticles(item['name']) + # sanitize both the search string and the result so that + # we are comparing the same type of data + shortened_key = utils.sanitize_title(keys['series']) + shortened_item_name = utils.sanitize_title(item['name']) if len(shortened_item_name) < ( len(shortened_key) + self.length_delta_thresh): length_approved = True