From be983c61bc2e2ca1f04f95ef0ad8aa9699c5d316 Mon Sep 17 00:00:00 2001 From: Timmy Welch Date: Fri, 12 Aug 2022 07:10:36 -0700 Subject: [PATCH] Fix #353 The two primary cases fixed are: 'Ms. Marvel' and 'spider-man/deadpool'. In the first case, 'Ms.' was removed, which is a problem because many comics have series whose titles differ only in the designation/honorific. In the second case, the '/' was removed and not replaced with anything, causing a search for 'mandeadpool', which will not show useful results. Consequently, all designations/honorifics are now left untouched, and all punctuation other than apostrophes and double quotes (which are still stripped) is replaced with a space. --- comicapi/utils.py | 19 ++++++------------- tests/utils_test.py | 16 ++++++++++++++++ 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/comicapi/utils.py b/comicapi/utils.py index ae376ca..c458365 100644 --- a/comicapi/utils.py +++ b/comicapi/utils.py @@ -121,13 +121,9 @@ def remove_articles(text: str) -> str: "the", "the", "with", - "ms", - "mrs", - "mr", - "dr", ] new_text = "" - for word in text.split(" "): + for word in text.split(): if word not in articles: new_text += word + " " @@ -139,19 +135,16 @@ def remove_articles(text: str) -> str: def sanitize_title(text: str, basic: bool = False) -> str: # normalize unicode and convert to ascii. 
Does not work for everything eg ½ to 1⁄2 not 1/2 text = unicodedata.normalize("NFKD", text).casefold() - if basic: - # comicvine keeps apostrophes a part of the word - text = text.replace("'", "") - text = text.replace('"', "") - else: + # comicvine keeps apostrophes a part of the word + text = text.replace("'", "") + text = text.replace('"', "") + if not basic: # comicvine ignores punctuation and accents # remove all characters that are not a letter, separator (space) or number # replace any "dash punctuation" with a space # makes sure that batman-superman and self-proclaimed stay separate words text = "".join( - c if not unicodedata.category(c) in ("Pd",) else " " - for c in text - if unicodedata.category(c)[0] in "LZN" or unicodedata.category(c) in ("Pd",) + c if unicodedata.category(c)[0] not in "P" else " " for c in text if unicodedata.category(c)[0] in "LZNP" ) # remove extra space and articles and all lower case text = remove_articles(text).strip() diff --git a/tests/utils_test.py b/tests/utils_test.py index 74747e9..0bf7713 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -108,3 +108,19 @@ titles = [ @pytest.mark.parametrize("value, result", titles) def test_titles_match(value, result): assert comicapi.utils.titles_match(value[0], value[1]) == result + + +titles_2 = [ + ("", ""), + ("鋼の錬金術師", "鋼の錬金術師"), + ("Conan el Bárbaro", "Conan el Barbaro"), + ("The Batman's Grave", "batmans grave"), + ("A+X", "ax"), + ("ms. marvel", "ms marvel"), + ("spider-man/deadpool", "spider man deadpool"), +] + + +@pytest.mark.parametrize("value, result", titles_2) +def test_sanitize_title(value, result): + assert comicapi.utils.sanitize_title(value) == result.casefold()