From 7aa4e1c4edd93f05b1d4ec0b729ebba4c3af990b Mon Sep 17 00:00:00 2001
From: lordwelch <timmy@narnian.us>
Date: Thu, 13 Feb 2020 00:27:08 -0800
Subject: [PATCH] Improve searchForSeries

Refactor removearticles to only remove articles
Add normalization on the search string and the series name results

Searching now compares only ASCII a-z and 0-9; every other run of
characters is replaced with a single space. This is done to both the
search string and the result. This fixes an issue with names that are
separated by a hyphen (-) in the filename but by a slash (/) in the
Comic Vine name, and other similar issues.
---
 comicapi/utils.py                 | 10 --------
 comictaggerlib/comicvinetalker.py | 42 ++++++++++++++++++++++---------
 2 files changed, 30 insertions(+), 22 deletions(-)

diff --git a/comicapi/utils.py b/comicapi/utils.py
index 1303689..ec78d43 100644
--- a/comicapi/utils.py
+++ b/comicapi/utils.py
@@ -131,16 +131,6 @@ def removearticles(text):
 
     newText = newText[:-1]
 
-    # now get rid of some other junk
-    newText = newText.replace(":", "")
-    newText = newText.replace(",", "")
-    newText = newText.replace("-", " ")
-
-    # since the CV API changed, searches for series names with periods
-    # now explicitly require the period to be in the search key,
-    # so the line below is removed (for now)
-    #newText = newText.replace(".", "")
-
     return newText
 
 
diff --git a/comictaggerlib/comicvinetalker.py b/comictaggerlib/comicvinetalker.py
index 5a0ca85..9918e9b 100644
--- a/comictaggerlib/comicvinetalker.py
+++ b/comictaggerlib/comicvinetalker.py
@@ -21,6 +21,7 @@ import time
 import datetime
 import sys
 import ssl
+import unicodedata
 #from pprint import pprint
 #import math
 
@@ -203,8 +204,13 @@ class ComicVineTalker(QObject):
 
     def searchForSeries(self, series_name, callback=None, refresh_cache=False):
 
-        # remove cruft from the search string
-        series_name = utils.removearticles(series_name).lower().strip()
+        # normalize unicode and convert to ascii. Does not work for everything eg ½ to 1⁄2 not 1/2
+        search_series_name = unicodedata.normalize('NFKD', series_name).encode('ascii', 'ignore').decode('ascii')
+        # comicvine ignores punctuation and accents
+        search_series_name = re.sub(r'[^A-Za-z0-9]+',' ', search_series_name)
+        # remove extra space and articles and all lower case
+        search_series_name = utils.removearticles(search_series_name).lower().strip()
+
 
         # before we search online, look in our cache, since we might have
         # done this same search recently
@@ -215,14 +221,12 @@ class ComicVineTalker(QObject):
             if len(cached_search_results) > 0:
                 return cached_search_results
 
-        original_series_name = series_name
-
         params = {
             'api_key': self.api_key,
             'format': 'json',
             'resources': 'volume',
-            'query': series_name,
-            'field_list': 'name,id,start_year,publisher,image,description,count_of_issues',
+            'query': search_series_name,
+            'field_list': 'volume,name,id,start_year,publisher,image,description,count_of_issues',
             'page': 1
         }
 
@@ -245,7 +249,7 @@ class ComicVineTalker(QObject):
         # 2. Halt when not all of our search terms are present in a result
         # 3. Halt when the results contain more (plus threshold) words than
         #    our search
-        result_word_count_max = len(series_name.split()) + 3
+        result_word_count_max = len(search_series_name.split()) + 3
 
         total_result_count = min(total_result_count, max_results)
 
@@ -266,9 +270,16 @@ class ComicVineTalker(QObject):
 
             last_result = search_results[-1]['name']
 
+            # normalize unicode and convert to ascii. Does not work for everything eg ½ to 1⁄2 not 1/2
+            last_result = unicodedata.normalize('NFKD', last_result).encode('ascii', 'ignore').decode('ascii')
+            # comicvine ignores punctuation and accents
+            last_result = re.sub(r'[^A-Za-z0-9]+',' ', last_result)
+            # remove extra space and articles and all lower case
+            last_result = utils.removearticles(last_result).lower().strip()
+
             # See if the last result's name has all the of the search terms.
             # if not, break out of this, loop, we're done.
-            for term in series_name.split():
+            for term in search_series_name.split():
                 if term not in last_result.lower():
                     #print("Term '{}' not in last result. Halting search result fetching".format(term))
                     stop_searching = True
@@ -276,7 +287,7 @@ class ComicVineTalker(QObject):
 
             # Also, stop searching when the word count of last results is too much longer
             # than our search terms list
-            if len(utils.removearticles(last_result).split()) > result_word_count_max:
+            if len(last_result.split()) > result_word_count_max:
                 #print("Last result '{}' is too long. Halting search result fetching".format(last_result))
                 stop_searching = True
 
@@ -303,8 +314,15 @@ class ComicVineTalker(QObject):
         # (iterate backwards for easy removal)
         for i in range(len(search_results) - 1, -1, -1):
             record = search_results[i]
-            for term in series_name.split():
-                if term not in record['name'].lower():
+            for term in search_series_name.split():
+                # normalize unicode and convert to ascii. Does not work for everything eg ½ to 1⁄2 not 1/2
+                recordName = unicodedata.normalize('NFKD', record['name']).encode('ascii', 'ignore').decode('ascii')
+                # comicvine ignores punctuation and accents
+                recordName = re.sub(r'[^A-Za-z0-9]+',' ', recordName)
+                # remove extra space and articles and all lower case
+                recordName = utils.removearticles(recordName).lower().strip()
+
+                if term not in recordName:
                     del search_results[i]
                     break
 
@@ -315,7 +333,7 @@ class ComicVineTalker(QObject):
         #print(u"{0}: {1} ({2})".format(search_results['results'][0]['id'], search_results['results'][0]['name'] , search_results['results'][0]['start_year']))
 
         # cache these search results
-        cvc.add_search_results(original_series_name, search_results)
+        cvc.add_search_results(series_name, search_results)
 
         return search_results