From a884974a9c054fcc8c4b9ce9525921ba0e994788 Mon Sep 17 00:00:00 2001
From: lordwelch
Date: Sun, 26 Sep 2021 17:06:30 -0700
Subject: [PATCH] Improve issue identification

Move title sanitizing code to utils module
Update issue identifier to compare sanitized names
---
 comicapi/utils.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/comicapi/utils.py b/comicapi/utils.py
index f82f99a..05bacc2 100644
--- a/comicapi/utils.py
+++ b/comicapi/utils.py
@@ -21,6 +21,7 @@ import re
 import platform
 import locale
 import codecs
+import unicodedata
 
 
 class UtilsVars:
@@ -151,6 +152,33 @@ def removearticles(text):
     return newText
 
 
+def sanitize_title(text):
+    """Return text reduced to a lower-case ASCII form for title comparison.
+
+    Accents are stripped, apostrophes and double quotes are deleted, every
+    other run of non-alphanumeric characters becomes a single space, and
+    the result is passed through removearticles().
+
+    Known limitations: code points with no ASCII decomposition are dropped,
+    so text in non-Latin scripts (e.g. Chinese, Japanese) is lost entirely,
+    and the ASCII step turns U+00BD (one half) into "12" because NFKD
+    yields U+2044 FRACTION SLASH rather than "/".
+    """
+    # NFKD splits accented letters into base letter + combining mark; the
+    # ASCII encode/decode round-trip then discards every non-ASCII code point
+    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
+    # comicvine keeps apostrophes as part of the word, so quote characters
+    # are deleted rather than turned into word breaks
+    text = text.replace("'", "")
+    text = text.replace("\"", "")
+    # comicvine ignores punctuation and accents
+    text = re.sub(r'[^A-Za-z0-9]+', ' ', text)
+    # remove extra space and articles and all lower case
+    text = removearticles(text).lower().strip()
+
+    return text
+
+
 def unique_file(file_name):
     counter = 1
     # returns ('/path/file', '.ext')