Fix #353
The two primary cases fixed are: Ms. Marvel spider-man/deadpool The first issue removed 'Ms.' which is a problem as many comics have series that the only difference in the title is the designation/honorific. The second issue is that the '/' was removed and not replaced with anything causing a search for 'mandeadpool' which will not show useful results. Consequently all designations/honorifics are now untouched All punctuation is replaced with a space
This commit is contained in:
parent
77a53a6834
commit
be983c61bc
@ -121,13 +121,9 @@ def remove_articles(text: str) -> str:
|
||||
"the",
|
||||
"the",
|
||||
"with",
|
||||
"ms",
|
||||
"mrs",
|
||||
"mr",
|
||||
"dr",
|
||||
]
|
||||
new_text = ""
|
||||
for word in text.split(" "):
|
||||
for word in text.split():
|
||||
if word not in articles:
|
||||
new_text += word + " "
|
||||
|
||||
@ -139,19 +135,16 @@ def remove_articles(text: str) -> str:
|
||||
def sanitize_title(text: str, basic: bool = False) -> str:
|
||||
# normalize unicode and convert to ascii. Does not work for everything eg ½ to 1⁄2 not 1/2
|
||||
text = unicodedata.normalize("NFKD", text).casefold()
|
||||
if basic:
|
||||
# comicvine keeps apostrophes a part of the word
|
||||
text = text.replace("'", "")
|
||||
text = text.replace('"', "")
|
||||
else:
|
||||
# comicvine keeps apostrophes a part of the word
|
||||
text = text.replace("'", "")
|
||||
text = text.replace('"', "")
|
||||
if not basic:
|
||||
# comicvine ignores punctuation and accents
|
||||
# remove all characters that are not a letter, separator (space) or number
|
||||
# replace any "dash punctuation" with a space
|
||||
# makes sure that batman-superman and self-proclaimed stay separate words
|
||||
text = "".join(
|
||||
c if not unicodedata.category(c) in ("Pd",) else " "
|
||||
for c in text
|
||||
if unicodedata.category(c)[0] in "LZN" or unicodedata.category(c) in ("Pd",)
|
||||
c if unicodedata.category(c)[0] not in "P" else " " for c in text if unicodedata.category(c)[0] in "LZNP"
|
||||
)
|
||||
# remove extra space and articles and all lower case
|
||||
text = remove_articles(text).strip()
|
||||
|
@ -108,3 +108,19 @@ titles = [
|
||||
@pytest.mark.parametrize("value, result", titles)
|
||||
def test_titles_match(value, result):
|
||||
assert comicapi.utils.titles_match(value[0], value[1]) == result
|
||||
|
||||
|
||||
titles_2 = [
|
||||
("", ""),
|
||||
("鋼の錬金術師", "鋼の錬金術師"),
|
||||
("Conan el Bárbaro", "Conan el Barbaro"),
|
||||
("The Batman's Grave", "batmans grave"),
|
||||
("A+X", "ax"),
|
||||
("ms. marvel", "ms marvel"),
|
||||
("spider-man/deadpool", "spider man deadpool"),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("value, result", titles_2)
|
||||
def test_sanitize_title(value, result):
|
||||
assert comicapi.utils.sanitize_title(value) == result.casefold()
|
||||
|
Loading…
Reference in New Issue
Block a user