Improve issue identification

Move title sanitizing code to utils module
Update issue identifier to compare sanitized names
This commit is contained in:
lordwelch 2021-09-26 17:06:30 -07:00
parent fff28cf6ae
commit e7fe520660
3 changed files with 26 additions and 22 deletions

View File

@ -21,6 +21,7 @@ import re
import platform
import locale
import codecs
import unicodedata
class UtilsVars:
@ -151,6 +152,21 @@ def removearticles(text):
return newText
def sanitize_title(text):
    """Reduce a title to a lowercase, ASCII-only form suitable for comparison.

    Steps, in order:
      1. NFKD-normalize the text and drop every character that has no
         ASCII encoding (accents are stripped this way).
      2. Delete single and double quotes outright, so that a word
         containing an apostrophe stays one word instead of being split.
      3. Collapse every run of characters other than A-Z, a-z, 0-9 into
         a single space.
      4. Pass the result through removearticles (defined elsewhere in
         this module), then lowercase it and strip surrounding whitespace.

    Known limitations: the ASCII conversion is lossy. For example, the
    fraction character for one half comes out as "12" rather than "1/2",
    and titles written in non-Latin scripts (e.g. Chinese, Japanese)
    will probably lose most or all of their content.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')

    # Quotes are removed rather than replaced with a space: comicvine
    # treats an apostrophe as part of the word it appears in.
    for quote in ("'", "\""):
        ascii_only = ascii_only.replace(quote, "")

    # comicvine ignores punctuation, so any other symbol run becomes
    # a plain word separator.
    spaced = re.sub(r'[^A-Za-z0-9]+', ' ', ascii_only)

    return removearticles(spaced).lower().strip()
def unique_file(file_name):
counter = 1
# returns ('/path/file', '.ext')

View File

@ -21,7 +21,6 @@ import time
import datetime
import sys
import ssl
import unicodedata
#from pprint import pprint
#import math
@ -204,13 +203,8 @@ class ComicVineTalker(QObject):
def searchForSeries(self, series_name, callback=None, refresh_cache=False):
# normalize unicode and convert to ascii. Does not work for everything eg ½ to 12 not 1/2
search_series_name = unicodedata.normalize('NFKD', series_name).encode('ascii', 'ignore').decode('ascii')
# comicvine ignores punctuation and accents
search_series_name = re.sub(r'[^A-Za-z0-9]+',' ', search_series_name)
# remove extra space and articles and all lower case
search_series_name = utils.removearticles(search_series_name).lower().strip()
# Sanitize the series name for comicvine searching, comicvine search ignore symbols
search_series_name = utils.sanitize_title(series_name)
# before we search online, look in our cache, since we might have
# done this same search recently
@ -270,12 +264,8 @@ class ComicVineTalker(QObject):
last_result = search_results[-1]['name']
# normalize unicode and convert to ascii. Does not work for everything eg ½ to 12 not 1/2
last_result = unicodedata.normalize('NFKD', last_result).encode('ascii', 'ignore').decode('ascii')
# comicvine ignores punctuation and accents
last_result = re.sub(r'[^A-Za-z0-9]+',' ', last_result)
# remove extra space and articles and all lower case
last_result = utils.removearticles(last_result).lower().strip()
# Sanitize the series name for comicvine searching, comicvine search ignore symbols
last_result = utils.sanitize_title(last_result)
# See if the last result's name has all the of the search terms.
# if not, break out of this, loop, we're done.
@ -314,13 +304,9 @@ class ComicVineTalker(QObject):
# (iterate backwards for easy removal)
for i in range(len(search_results) - 1, -1, -1):
record = search_results[i]
# Sanitize the series name for comicvine searching, comicvine search ignore symbols
recordName = utils.sanitize_title(record['name'])
for term in search_series_name.split():
# normalize unicode and convert to ascii. Does not work for everything eg ½ to 12 not 1/2
recordName = unicodedata.normalize('NFKD', record['name']).encode('ascii', 'ignore').decode('ascii')
# comicvine ignores punctuation and accents
recordName = re.sub(r'[^A-Za-z0-9]+',' ', recordName)
# remove extra space and articles and all lower case
recordName = utils.removearticles(recordName).lower().strip()
if term not in recordName:
del search_results[i]

View File

@ -435,8 +435,10 @@ class IssueIdentifier:
# assume that our search name is close to the actual name, say
# within ,e.g. 5 chars
shortened_key = utils.removearticles(keys['series'])
shortened_item_name = utils.removearticles(item['name'])
# sanitize both the search string and the result so that
# we are comparing the same type of data
shortened_key = utils.sanitize_title(keys['series'])
shortened_item_name = utils.sanitize_title(item['name'])
if len(shortened_item_name) < (
len(shortened_key) + self.length_delta_thresh):
length_approved = True