Add a literal search option

This commit is contained in:
Timmy Welch 2022-06-07 11:49:56 -07:00
parent 470b5c0a17
commit db37ec7204
5 changed files with 90 additions and 51 deletions

View File

@ -130,17 +130,17 @@ def remove_articles(text: str) -> str:
return new_text
def sanitize_title(text: str) -> str:
def sanitize_title(text: str, basic: bool = False) -> str:
# normalize unicode and convert to ascii. Does not work for everything eg ½ to 12 not 1/2
# this will probably cause issues with titles in other character sets, e.g. Chinese, Japanese
text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("ascii")
text = unicodedata.normalize("NFKD", text)
# comicvine keeps apostrophes as part of the word
text = text.replace("'", "")
text = text.replace('"', "")
# comicvine ignores punctuation and accents
text = re.sub(r"[^A-Za-z0-9]+", " ", text)
# remove extra space and articles and all lower case
text = remove_articles(text).lower().strip()
if not basic:
# comicvine ignores punctuation and accents, TODO: only remove punctuation accents and similar
text = re.sub(r"[^A-Za-z0-9]+", " ", text)
# remove extra space and articles and all lower case
text = remove_articles(text).casefold().strip()
return text

View File

@ -200,15 +200,21 @@ class ComicVineTalker:
raise ComicVineTalkerException(ComicVineTalkerException.Unknown, "Error on Comic Vine server")
def search_for_series(
self, series_name: str, callback: Callable[[int, int], None] | None = None, refresh_cache: bool = False
self,
series_name: str,
callback: Callable[[int, int], None] | None = None,
refresh_cache: bool = False,
literal: bool = False,
) -> list[CVVolumeResults]:
# Sanitize the series name for comicvine searching; comicvine search ignores symbols
search_series_name = utils.sanitize_title(series_name)
search_series_name = utils.sanitize_title(series_name, literal)
logger.info("Searching: %s", search_series_name)
# before we search online, look in our cache, since we might have done this same search recently
# Before we search online, look in our cache, since we might have done this same search recently
# For literal searches always retrieve from online
cvc = ComicVineCacher()
if not refresh_cache:
if not refresh_cache and not literal:
cached_search_results = cvc.get_search_results(series_name)
if len(cached_search_results) > 0:
@ -258,25 +264,24 @@ class ComicVineTalker:
stop_searching = False
while current_result_count < total_result_count:
last_result = search_results[-1]["name"]
if not literal:
# Sanitize the series name for comicvine searching; comicvine search ignores symbols
last_result = utils.sanitize_title(search_results[-1]["name"])
# Sanitize the series name for comicvine searching; comicvine search ignores symbols
last_result = utils.sanitize_title(last_result)
# See if the last result's name has all of the search terms.
# If not, break out of this loop; we're done.
for term in search_series_name.split():
if term not in last_result:
stop_searching = True
break
# See if the last result's name has all of the search terms.
# If not, break out of this loop; we're done.
for term in search_series_name.split():
if term not in last_result.lower():
# Also, stop searching when the word count of last results is too much longer than our search terms list
if len(last_result) > result_word_count_max:
stop_searching = True
if stop_searching:
break
# Also, stop searching when the word count of last results is too much longer than our search terms list
if len(last_result) > result_word_count_max:
stop_searching = True
if stop_searching:
break
if callback is None:
self.write_log(f"getting another page of results {current_result_count} of {total_result_count}...\n")
page += 1
@ -290,18 +295,19 @@ class ComicVineTalker:
if callback is not None:
callback(current_result_count, total_result_count)
# Remove any search results that don't contain all the search terms (iterate backwards for easy removal)
for i in range(len(search_results) - 1, -1, -1):
record = search_results[i]
# Sanitize the series name for comicvine searching; comicvine search ignores symbols
record_name = utils.sanitize_title(record["name"])
for term in search_series_name.split():
# Literal searches simply return the matches; no extra processing is done
if not literal:
# Remove any search results that don't contain all the search terms (iterate backwards for easy removal)
for record in reversed(search_results):
# Sanitize the series name for comicvine searching; comicvine search ignores symbols
record_name = utils.sanitize_title(record["name"])
for term in search_series_name.split():
if term not in record_name:
search_results.remove(record)
break
if term not in record_name:
del search_results[i]
break
# cache these search results
# Cache these search results, even if it's literal we cache the results
# The most it will cause is extra processing time
cvc.add_search_results(series_name, search_results)
return search_results

View File

@ -385,6 +385,8 @@ Have fun!
self.actionAutoIdentify.setShortcut("Ctrl+I")
self.actionAutoIdentify.triggered.connect(self.auto_identify_search)
self.actionLiteralSearch.triggered.connect(self.literal_search)
self.actionApplyCBLTransform.setShortcut("Ctrl+L")
self.actionApplyCBLTransform.setStatusTip("Modify tags specifically for CBL format")
self.actionApplyCBLTransform.triggered.connect(self.apply_cbl_transform)
@ -424,6 +426,7 @@ Have fun!
self.actionParse_Filename.setIcon(QtGui.QIcon(ComicTaggerSettings.get_graphic("parse.png")))
self.actionParse_Filename_split_words.setIcon(QtGui.QIcon(ComicTaggerSettings.get_graphic("parse.png")))
self.actionSearchOnline.setIcon(QtGui.QIcon(ComicTaggerSettings.get_graphic("search.png")))
self.actionLiteralSearch.setIcon(QtGui.QIcon(ComicTaggerSettings.get_graphic("search.png")))
self.actionAutoIdentify.setIcon(QtGui.QIcon(ComicTaggerSettings.get_graphic("auto.png")))
self.actionAutoTag.setIcon(QtGui.QIcon(ComicTaggerSettings.get_graphic("autotag.png")))
self.actionAutoImprint.setIcon(QtGui.QIcon(ComicTaggerSettings.get_graphic("autotag.png")))
@ -434,6 +437,7 @@ Have fun!
self.toolBar.addAction(self.actionLoadFolder)
self.toolBar.addAction(self.actionWrite_Tags)
self.toolBar.addAction(self.actionSearchOnline)
self.toolBar.addAction(self.actionLiteralSearch)
self.toolBar.addAction(self.actionAutoIdentify)
self.toolBar.addAction(self.actionAutoTag)
self.toolBar.addAction(self.actionClearEntryForm)
@ -1015,7 +1019,10 @@ Have fun!
self.query_online(autoselect=True)
def query_online(self, autoselect: bool = False) -> None:
def literal_search(self) -> None:
    """Start an online series search in literal mode.

    Slot connected to ``actionLiteralSearch.triggered``. It delegates to
    ``query_online`` with ``literal=True`` and with auto-selection
    explicitly disabled, so the user always picks the series manually.
    """
    self.query_online(autoselect=False, literal=True)
def query_online(self, autoselect: bool = False, literal: bool = False) -> None:
issue_number = str(self.leIssueNum.text()).strip()
@ -1046,6 +1053,7 @@ Have fun!
cast(ComicArchive, self.comic_archive),
self.settings,
autoselect,
literal,
)
selector.setWindowTitle(f"Search: '{series_name}' - Select Series")

View File

@ -1236,6 +1236,7 @@
<addaction name="actionSearchOnline"/>
<addaction name="actionAutoIdentify"/>
<addaction name="actionAutoImprint"/>
<addaction name="actionLiteralSearch"/>
<addaction name="separator"/>
<addaction name="actionApplyCBLTransform"/>
<addaction name="actionReCalcPageDims"/>
@ -1461,6 +1462,14 @@
<string>Show Log Window</string>
</property>
</action>
<action name="actionLiteralSearch">
<property name="text">
<string>Literal Search</string>
</property>
<property name="toolTip">
<string>perform a literal search on the series and return the first 50 results</string>
</property>
</action>
</widget>
<layoutdefault spacing="6" margin="11"/>
<resources/>

View File

@ -15,7 +15,9 @@
# limitations under the License.
from __future__ import annotations
import itertools
import logging
from collections import deque
from PyQt5 import QtCore, QtWidgets, uic
from PyQt5.QtCore import pyqtSignal
@ -40,20 +42,21 @@ class SearchThread(QtCore.QThread):
searchComplete = pyqtSignal()
progressUpdate = pyqtSignal(int, int)
def __init__(self, series_name: str, refresh: bool) -> None:
def __init__(self, series_name: str, refresh: bool, literal: bool = False) -> None:
QtCore.QThread.__init__(self)
self.series_name = series_name
self.refresh: bool = refresh
self.error_code: int | None = None
self.cv_error = False
self.cv_search_results: list[CVVolumeResults] = []
self.literal = literal
def run(self) -> None:
comic_vine = ComicVineTalker()
try:
self.cv_error = False
self.cv_search_results = comic_vine.search_for_series(
self.series_name, callback=self.prog_callback, refresh_cache=self.refresh
self.series_name, self.prog_callback, self.refresh, self.literal
)
except ComicVineTalkerException as e:
self.cv_search_results = []
@ -101,6 +104,7 @@ class VolumeSelectionWindow(QtWidgets.QDialog):
comic_archive: ComicArchive,
settings: ComicTaggerSettings,
autoselect: bool = False,
literal: bool = False,
) -> None:
super().__init__(parent)
@ -132,6 +136,7 @@ class VolumeSelectionWindow(QtWidgets.QDialog):
self.immediate_autoselect = autoselect
self.cover_index_list = cover_index_list
self.cv_search_results: list[CVVolumeResults] = []
self.literal = literal
self.ii: IssueIdentifier | None = None
self.iddialog: IDProgressWindow | None = None
self.id_thread: IdentifyThread | None = None
@ -155,7 +160,7 @@ class VolumeSelectionWindow(QtWidgets.QDialog):
self.twList.selectRow(0)
def update_buttons(self) -> None:
enabled = bool(self.cv_search_results and len(self.cv_search_results) > 0)
enabled = bool(self.cv_search_results)
self.btnRequery.setEnabled(enabled)
self.btnIssues.setEnabled(enabled)
@ -305,7 +310,7 @@ class VolumeSelectionWindow(QtWidgets.QDialog):
self.progdialog.canceled.connect(self.search_canceled)
self.progdialog.setModal(True)
self.progdialog.setMinimumDuration(300)
self.search_thread = SearchThread(self.series_name, refresh)
self.search_thread = SearchThread(self.series_name, refresh, self.literal)
self.search_thread.searchComplete.connect(self.search_complete)
self.search_thread.progressUpdate.connect(self.search_progress_update)
self.search_thread.start()
@ -382,14 +387,25 @@ class VolumeSelectionWindow(QtWidgets.QDialog):
# move sanitized matches to the front
if self.settings.exact_series_matches_first:
try:
sanitized = utils.sanitize_title(self.series_name)
exact_matches = list(
filter(lambda d: utils.sanitize_title(str(d["name"])) in sanitized, self.cv_search_results)
)
non_matches = list(
filter(lambda d: utils.sanitize_title(str(d["name"])) not in sanitized, self.cv_search_results)
)
self.cv_search_results = exact_matches + non_matches
sanitized = utils.sanitize_title(self.series_name, False).casefold()
sanitized_no_articles = utils.sanitize_title(self.series_name, True).casefold()
deques: list[deque[CVVolumeResults]] = [deque(), deque(), deque()]
def categorize(result):
# We don't remove anything on this one so that we only get exact matches
if utils.sanitize_title(result["name"], True).casefold() == sanitized_no_articles:
return 0
# this ensures that 'The Joker' is near the top even if you search 'Joker'
if utils.sanitize_title(result["name"], False).casefold() in sanitized:
return 1
return 2
for comic in self.cv_search_results:
deques[categorize(comic)].append(comic)
logger.info("Length: %d, %d, %d", len(deques[0]), len(deques[1]), len(deques[2]))
self.cv_search_results = list(itertools.chain.from_iterable(deques))
except Exception:
logger.exception("bad data error filtering exact/near matches")
@ -436,12 +452,12 @@ class VolumeSelectionWindow(QtWidgets.QDialog):
self.twList.selectRow(0)
self.twList.resizeColumnsToContents()
if len(self.cv_search_results) == 0:
if not self.cv_search_results:
QtCore.QCoreApplication.processEvents()
QtWidgets.QMessageBox.information(self, "Search Result", "No matches found!")
QtCore.QTimer.singleShot(200, self.close_me)
if self.immediate_autoselect and len(self.cv_search_results) > 0:
if self.immediate_autoselect and self.cv_search_results:
# defer the immediate autoselect so this dialog has time to pop up
QtCore.QCoreApplication.processEvents()
QtCore.QTimer.singleShot(10, self.do_immediate_autoselect)