Improve issue identification
Move title sanitizing code to utils module. Update issue identifier to compare sanitized names.
This commit is contained in:
parent
fff28cf6ae
commit
e7fe520660
@ -21,6 +21,7 @@ import re
|
||||
import platform
|
||||
import locale
|
||||
import codecs
|
||||
import unicodedata
|
||||
|
||||
|
||||
class UtilsVars:
|
||||
@ -151,6 +152,21 @@ def removearticles(text):
|
||||
return newText
|
||||
|
||||
|
||||
def sanitize_title(text):
    """Return a simplified, lower-case ASCII form of *text* for title matching.

    The steps, in order:
      1. NFKD-normalize and drop every character that has no ASCII form.
      2. Delete single and double quote characters outright.
      3. Collapse each run of non-alphanumeric characters into one space.
      4. Pass the result through ``removearticles`` (defined elsewhere in
         this module), lower-case it and strip surrounding whitespace.

    Known limitations: the ASCII conversion is lossy. For example, "½"
    decomposes to "1⁄2" (with a fraction slash), not "1/2", and titles
    written in other character sets (e.g. Chinese, Japanese) will probably
    not survive the conversion.
    """
    # Decompose accented characters so the base letter survives the
    # ASCII round-trip; anything that still is not ASCII is discarded.
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')

    # comicvine keeps apostrophes as part of the word, so quotes are removed
    # rather than being turned into a separating space below.
    for quote_char in ("'", "\""):
        ascii_only = ascii_only.replace(quote_char, "")

    # comicvine ignores punctuation and accents: every remaining run of
    # non-alphanumeric characters becomes a single space.
    spaced = re.sub(r'[^A-Za-z0-9]+', ' ', ascii_only)

    # Remove articles, then normalize case and trim the ends.
    return removearticles(spaced).lower().strip()
|
||||
|
||||
|
||||
def unique_file(file_name):
|
||||
counter = 1
|
||||
# returns ('/path/file', '.ext')
|
||||
|
@ -21,7 +21,6 @@ import time
|
||||
import datetime
|
||||
import sys
|
||||
import ssl
|
||||
import unicodedata
|
||||
#from pprint import pprint
|
||||
#import math
|
||||
|
||||
@ -204,13 +203,8 @@ class ComicVineTalker(QObject):
|
||||
|
||||
def searchForSeries(self, series_name, callback=None, refresh_cache=False):
|
||||
|
||||
# normalize unicode and convert to ascii. Does not work for everything eg ½ to 1⁄2 not 1/2
|
||||
search_series_name = unicodedata.normalize('NFKD', series_name).encode('ascii', 'ignore').decode('ascii')
|
||||
# comicvine ignores punctuation and accents
|
||||
search_series_name = re.sub(r'[^A-Za-z0-9]+',' ', search_series_name)
|
||||
# remove extra space and articles and all lower case
|
||||
search_series_name = utils.removearticles(search_series_name).lower().strip()
|
||||
|
||||
# Sanitize the series name for comicvine searching; comicvine search ignores symbols
|
||||
search_series_name = utils.sanitize_title(series_name)
|
||||
|
||||
# before we search online, look in our cache, since we might have
|
||||
# done this same search recently
|
||||
@ -270,12 +264,8 @@ class ComicVineTalker(QObject):
|
||||
|
||||
last_result = search_results[-1]['name']
|
||||
|
||||
# normalize unicode and convert to ascii. Does not work for everything eg ½ to 1⁄2 not 1/2
|
||||
last_result = unicodedata.normalize('NFKD', last_result).encode('ascii', 'ignore').decode('ascii')
|
||||
# comicvine ignores punctuation and accents
|
||||
last_result = re.sub(r'[^A-Za-z0-9]+',' ', last_result)
|
||||
# remove extra space and articles and all lower case
|
||||
last_result = utils.removearticles(last_result).lower().strip()
|
||||
# Sanitize the series name for comicvine searching; comicvine search ignores symbols
|
||||
last_result = utils.sanitize_title(last_result)
|
||||
|
||||
# See if the last result's name has all of the search terms.
|
||||
# If not, break out of this loop; we're done.
|
||||
@ -314,13 +304,9 @@ class ComicVineTalker(QObject):
|
||||
# (iterate backwards for easy removal)
|
||||
for i in range(len(search_results) - 1, -1, -1):
|
||||
record = search_results[i]
|
||||
# Sanitize the series name for comicvine searching; comicvine search ignores symbols
|
||||
recordName = utils.sanitize_title(record['name'])
|
||||
for term in search_series_name.split():
|
||||
# normalize unicode and convert to ascii. Does not work for everything eg ½ to 1⁄2 not 1/2
|
||||
recordName = unicodedata.normalize('NFKD', record['name']).encode('ascii', 'ignore').decode('ascii')
|
||||
# comicvine ignores punctuation and accents
|
||||
recordName = re.sub(r'[^A-Za-z0-9]+',' ', recordName)
|
||||
# remove extra space and articles and all lower case
|
||||
recordName = utils.removearticles(recordName).lower().strip()
|
||||
|
||||
if term not in recordName:
|
||||
del search_results[i]
|
||||
|
@ -435,8 +435,10 @@ class IssueIdentifier:
|
||||
|
||||
# assume that our search name is close to the actual name, say
|
||||
# within, e.g., 5 chars
|
||||
shortened_key = utils.removearticles(keys['series'])
|
||||
shortened_item_name = utils.removearticles(item['name'])
|
||||
# sanitize both the search string and the result so that
|
||||
# we are comparing the same type of data
|
||||
shortened_key = utils.sanitize_title(keys['series'])
|
||||
shortened_item_name = utils.sanitize_title(item['name'])
|
||||
if len(shortened_item_name) < (
|
||||
len(shortened_key) + self.length_delta_thresh):
|
||||
length_approved = True
|
||||
|
Loading…
Reference in New Issue
Block a user