diff --git a/comicapi/utils.py b/comicapi/utils.py index 8ff2355..8ac9287 100644 --- a/comicapi/utils.py +++ b/comicapi/utils.py @@ -26,6 +26,7 @@ from shutil import which # noqa: F401 from typing import Any, Mapping import pycountry +import thefuzz.fuzz logger = logging.getLogger(__name__) @@ -108,6 +109,10 @@ def remove_articles(text: str) -> str: "the", "the", "with", + "ms", + "mrs", + "mr", + "dr", ] new_text = "" for word in text.split(" "): @@ -142,13 +147,18 @@ def sanitize_title(text: str, basic: bool = False) -> str: return text -def titles_match(search_title, record_title): +def titles_match(search_title: str, record_title: str, threshold: int = 90): sanitized_search = sanitize_title(search_title) sanitized_record = sanitize_title(record_title) - for term in sanitized_search.split(): - if term not in sanitized_record: - return False - return True + ratio = thefuzz.fuzz.ratio(sanitized_search, sanitized_record) + logger.debug( + "search title: %s ; record title: %s ; ratio: %d ; match threshold: %d", + search_title, + record_title, + ratio, + threshold, + ) + return ratio >= threshold def unique_file(file_name: pathlib.Path) -> pathlib.Path: diff --git a/comictaggerlib/autotagstartwindow.py b/comictaggerlib/autotagstartwindow.py index 492187b..2c790ee 100644 --- a/comictaggerlib/autotagstartwindow.py +++ b/comictaggerlib/autotagstartwindow.py @@ -17,7 +17,7 @@ from __future__ import annotations import logging -from PyQt5 import QtCore, QtGui, QtWidgets, uic +from PyQt5 import QtCore, QtWidgets, uic from comictaggerlib.settings import ComicTaggerSettings @@ -39,7 +39,7 @@ class AutoTagStartWindow(QtWidgets.QDialog): self.cbxSpecifySearchString.setChecked(False) self.cbxSplitWords.setChecked(False) - self.leNameLengthMatchTolerance.setText(str(self.settings.id_length_delta_thresh)) + self.sbNameMatchSearchThresh.setValue(self.settings.id_series_match_identify_thresh) self.leSearchString.setEnabled(False) self.cbxSaveOnLowConfidence.setChecked(self.settings.save_on_low_confidence) @@ -50,13 +50,12 @@ class AutoTagStartWindow(QtWidgets.QDialog): self.cbxWaitForRateLimit.setChecked(self.settings.wait_and_retry_on_rate_limit) self.cbxAutoImprint.setChecked(self.settings.auto_imprint) - nlmt_tip = """ The Name Length Match Tolerance is for eliminating automatic - search matches that are too long compared to your series name search. The higher + nlmt_tip = """The Name Match Ratio Threshold: Auto-Identify is for eliminating automatic + search matches that are too long compared to your series name search. The lower it is, the more likely to have a good match, but each search will take longer and - use more bandwidth. Too low, and only the very closest lexical matches will be - explored.""" + use more bandwidth. Too high, and only the very closest matches will be explored.""" - self.leNameLengthMatchTolerance.setToolTip(nlmt_tip) + self.sbNameMatchSearchThresh.setToolTip(nlmt_tip) ss_tip = """ The series search string specifies the search string to be used for all selected archives. @@ -66,9 +65,6 @@ class AutoTagStartWindow(QtWidgets.QDialog): self.leSearchString.setToolTip(ss_tip) self.cbxSpecifySearchString.setToolTip(ss_tip) - validator = QtGui.QIntValidator(0, 99, self) - self.leNameLengthMatchTolerance.setValidator(validator) - self.cbxSpecifySearchString.stateChanged.connect(self.search_string_toggle) self.auto_save_on_low = False @@ -78,7 +74,7 @@ class AutoTagStartWindow(QtWidgets.QDialog): self.remove_after_success = False self.wait_and_retry_on_rate_limit = False self.search_string = "" - self.name_length_match_tolerance = self.settings.id_length_delta_thresh + self.name_length_match_tolerance = self.settings.id_series_match_search_thresh self.split_words = self.cbxSplitWords.isChecked() def search_string_toggle(self) -> None: @@ -93,7 +89,7 @@ class AutoTagStartWindow(QtWidgets.QDialog): self.assume_issue_one = self.cbxAssumeIssueOne.isChecked() self.ignore_leading_digits_in_filename = self.cbxIgnoreLeadingDigitsInFilename.isChecked() self.remove_after_success = self.cbxRemoveAfterSuccess.isChecked() - self.name_length_match_tolerance = int(self.leNameLengthMatchTolerance.text()) + self.name_length_match_tolerance = int(self.leNameMatchThresh.text()) self.wait_and_retry_on_rate_limit = self.cbxWaitForRateLimit.isChecked() self.split_words = self.cbxSplitWords.isChecked() diff --git a/comictaggerlib/cli.py b/comictaggerlib/cli.py index e7a4068..f95cf45 100644 --- a/comictaggerlib/cli.py +++ b/comictaggerlib/cli.py @@ -42,7 +42,7 @@ def actual_issue_data_fetch( ) -> GenericMetadata: # now get the particular issue data try: - comic_vine = ComicVineTalker() + comic_vine = ComicVineTalker(settings.id_series_match_search_thresh) comic_vine.wait_for_rate_limit = opts.wait_on_cv_rate_limit cv_md = comic_vine.fetch_issue_data(match["volume_id"], match["issue_number"], settings) except ComicVineTalkerException: @@ -375,7 +375,7 @@ def process_file_cli( if opts.issue_id is not None: # we were given the actual ID to search with try: - comic_vine = ComicVineTalker() + comic_vine = ComicVineTalker(settings.id_series_match_search_thresh) comic_vine.wait_for_rate_limit = opts.wait_on_cv_rate_limit cv_md = comic_vine.fetch_issue_data_by_issue_id(opts.issue_id, settings) except ComicVineTalkerException: diff --git a/comictaggerlib/comicvinetalker.py b/comictaggerlib/comicvinetalker.py index 4db18ae..c938909 100644 --- a/comictaggerlib/comicvinetalker.py +++ b/comictaggerlib/comicvinetalker.py @@ -91,11 +91,12 @@ class ComicVineTalker: return "Comic Vine rate limit exceeded. Please wait a bit." - def __init__(self) -> None: + def __init__(self, series_match_thresh: int = 90) -> None: # Identity name for the information source self.source_name = "comicvine" self.wait_for_rate_limit = False + self.series_match_thresh = series_match_thresh # key that is registered to comictagger default_api_key = "27431e6787042105bd3e47e169a624521f89f3a4" @@ -245,10 +246,8 @@ class ComicVineTalker: # ORed together, and we get thousands of results. Good news is the # results are sorted by relevance, so we can be smart about halting the search. # 1. Don't fetch more than some sane amount of pages. - max_results = 500 - # 2. Halt when not all of our search terms are present in a result - # 3. Halt when the results contain more (plus threshold) words than our search - result_word_count_max = len(search_series_name.split()) + 3 + # 2. Halt when any result on the current page is less than or equal to a set ratio using thefuzz + max_results = 500 # 5 pages total_result_count = min(total_result_count, max_results) @@ -267,16 +266,11 @@ class ComicVineTalker: while current_result_count < total_result_count: if not literal: - # Sanitize the series name for comicvine searching, comicvine search ignore symbols - last_result = utils.sanitize_title(search_results[-1]["name"]) - - # See if the last result's name has all the of the search terms. - # If not, break out of this, loop, we're done. - stop_searching = utils.titles_match(search_series_name, last_result) - - # Also, stop searching when the word count of last results is too much longer than our search terms list - if len(last_result) > result_word_count_max: - stop_searching = True + # Stop searching once any entry falls below the threshold + stop_searching = any( + not utils.titles_match(search_series_name, volume["name"], self.series_match_thresh) + for volume in cast(list[CVVolumeResults], cv_response["results"]) + ) if stop_searching: break @@ -294,23 +288,6 @@ class ComicVineTalker: if callback is not None: callback(current_result_count, total_result_count) - # Literal searches simply return the matches no extra processing is doneo - if not literal: - # Remove any search results that don't contain all the search terms (iterate backwards for easy removal) - for record in reversed(search_results): - matched = False - aliases = [] - if record["aliases"]: - aliases = record["aliases"].split("\n") - aliases.append(record["name"]) - - for name in aliases: - if utils.titles_match(search_series_name, name): - matched = True - break - if not matched: - search_results.remove(record) - # Cache these search results, even if it's literal we cache the results # The most it will cause is extra processing time cvc.add_search_results(self.source_name, series_name, search_results) diff --git a/comictaggerlib/issueidentifier.py b/comictaggerlib/issueidentifier.py index 1fce66b..7e8f7ce 100644 --- a/comictaggerlib/issueidentifier.py +++ b/comictaggerlib/issueidentifier.py @@ -95,7 +95,7 @@ class IssueIdentifier: # used to eliminate series names that are too long based on our search # string - self.length_delta_thresh = settings.id_length_delta_thresh + self.series_match_thresh = settings.id_series_match_identify_thresh # used to eliminate unlikely publishers self.publisher_filter = [s.strip().casefold() for s in settings.id_publisher_filter.split(",")] @@ -120,8 +120,8 @@ class IssueIdentifier: def set_additional_metadata(self, md: GenericMetadata) -> None: self.additional_metadata = md - def set_name_length_delta_threshold(self, delta: int) -> None: - self.length_delta_thresh = delta + def set_name_series_match_threshold(self, delta: int) -> None: + self.series_match_thresh = delta def set_publisher_filter(self, flt: list[str]) -> None: self.publisher_filter = flt @@ -398,7 +398,7 @@ class IssueIdentifier: if keys["month"] is not None: self.log_msg("\tMonth: " + str(keys["month"])) - comic_vine = ComicVineTalker() + comic_vine = ComicVineTalker(self.settings.id_series_match_search_thresh) comic_vine.wait_for_rate_limit = self.wait_and_retry_on_rate_limit comic_vine.set_log_func(self.output_function) @@ -433,15 +433,13 @@ class IssueIdentifier: if int(keys["year"]) < int(item["start_year"]): date_approved = False - # assume that our search name is close to the actual name, say - # within ,e.g. 5 chars - # sanitize both the search string and the result so that - # we are comparing the same type of data - shortened_key = utils.sanitize_title(keys["series"]) - shortened_item_name = utils.sanitize_title(item["name"]) - if len(shortened_item_name) < (len(shortened_key) + self.length_delta_thresh): - length_approved = True - + aliases = [] + if item["aliases"]: + aliases = item["aliases"].split("\n") + for name in [item["name"], *aliases]: + if utils.titles_match(keys["series"], name, self.series_match_thresh): + length_approved = True + break # remove any series from publishers on the filter if item["publisher"] is not None: publisher = item["publisher"]["name"] diff --git a/comictaggerlib/issueselectionwindow.py b/comictaggerlib/issueselectionwindow.py index 1e773b9..678ea5f 100644 --- a/comictaggerlib/issueselectionwindow.py +++ b/comictaggerlib/issueselectionwindow.py @@ -97,7 +97,7 @@ class IssueSelectionWindow(QtWidgets.QDialog): QtWidgets.QApplication.setOverrideCursor(QtGui.QCursor(QtCore.Qt.CursorShape.WaitCursor)) try: - comic_vine = ComicVineTalker() + comic_vine = ComicVineTalker(self.settings.id_series_match_search_thresh) comic_vine.fetch_volume_data(self.series_id) self.issue_list = comic_vine.fetch_issues_by_volume(self.series_id) except ComicVineTalkerException as e: diff --git a/comictaggerlib/settings.py b/comictaggerlib/settings.py index a81cf4c..0ce6b65 100644 --- a/comictaggerlib/settings.py +++ b/comictaggerlib/settings.py @@ -78,7 +78,8 @@ class ComicTaggerSettings: self.last_filelist_sorted_order = 0 # identifier settings - self.id_length_delta_thresh = 5 + self.id_series_match_search_thresh = 90 + self.id_series_match_identify_thresh = 91 self.id_publisher_filter = "Panini Comics, Abril, Planeta DeAgostini, Editorial Televisa, Dino Comics" # Show/ask dialog flags @@ -217,8 +218,10 @@ class ComicTaggerSettings: if self.config.has_option("auto", "last_filelist_sorted_order"): self.last_filelist_sorted_order = self.config.getint("auto", "last_filelist_sorted_order") - if self.config.has_option("identifier", "id_length_delta_thresh"): - self.id_length_delta_thresh = self.config.getint("identifier", "id_length_delta_thresh") + if self.config.has_option("identifier", "id_series_match_search_thresh"): + self.id_series_match_search_thresh = self.config.getint("identifier", "id_series_match_search_thresh") + if self.config.has_option("identifier", "id_series_match_identify_thresh"): + self.id_series_match_identify_thresh = self.config.getint("identifier", "id_series_match_identify_thresh") if self.config.has_option("identifier", "id_publisher_filter"): self.id_publisher_filter = self.config.get("identifier", "id_publisher_filter") @@ -352,7 +355,8 @@ class ComicTaggerSettings: if not self.config.has_section("identifier"): self.config.add_section("identifier") - self.config.set("identifier", "id_length_delta_thresh", self.id_length_delta_thresh) + self.config.set("identifier", "id_series_match_search_thresh", self.id_series_match_search_thresh) + self.config.set("identifier", "id_series_match_identify_thresh", self.id_series_match_identify_thresh) self.config.set("identifier", "id_publisher_filter", self.id_publisher_filter) if not self.config.has_section("dialogflags"): diff --git a/comictaggerlib/settingswindow.py b/comictaggerlib/settingswindow.py index 8597af6..314edee 100644 --- a/comictaggerlib/settingswindow.py +++ b/comictaggerlib/settingswindow.py @@ -155,13 +155,16 @@ class SettingsWindow(QtWidgets.QDialog): self.lblDefaultSettings.setText("Revert to default " + self.name.casefold()) self.btnResetSettings.setText("Default " + self.name) - nldt_tip = """The Default Name Length Match Tolerance is for eliminating automatic - search matches that are too long compared to your series name search. The higher + nmit_tip = """The Name Match Ratio Threshold: Auto-Identify is for eliminating automatic + search matches that are too long compared to your series name search. The lower it is, the more likely to have a good match, but each search will take longer and - use more bandwidth. Too low, and only the very closest lexical matches will be - explored.""" + use more bandwidth. Too high, and only the very closest matches will be explored.""" + nmst_tip = """The Name Match Ratio Threshold: Search is for reducing the total + number of results that are returned from a search. The lower it is, the more pages will + be returned (max 5 pages or 500 results)""" - self.leNameLengthDeltaThresh.setToolTip(nldt_tip) + self.sbNameMatchIdentifyThresh.setToolTip(nmit_tip) + self.sbNameMatchSearchThresh.setToolTip(nmst_tip) pbl_tip = """ The Publisher Filter is for eliminating automatic matches to certain publishers @@ -173,9 +176,6 @@ class SettingsWindow(QtWidgets.QDialog): validator = QtGui.QIntValidator(1, 4, self) self.leIssueNumPadding.setValidator(validator) - validator = QtGui.QIntValidator(0, 99, self) - self.leNameLengthDeltaThresh.setValidator(validator) - self.leRenameTemplate.setToolTip(f"
{html.escape(template_tooltip)}") self.settings_to_form() self.rename_error: Exception | None = None @@ -225,7 +225,8 @@ class SettingsWindow(QtWidgets.QDialog): def settings_to_form(self) -> None: # Copy values from settings to form self.leRarExePath.setText(self.settings.rar_exe_path) - self.leNameLengthDeltaThresh.setText(str(self.settings.id_length_delta_thresh)) + self.sbNameMatchIdentifyThresh.setValue(self.settings.id_series_match_identify_thresh) + self.sbNameMatchSearchThresh.setValue(self.settings.id_series_match_search_thresh) self.tePublisherFilter.setPlainText(self.settings.id_publisher_filter) self.cbxCheckForNewVersion.setChecked(self.settings.check_for_new_version) @@ -287,15 +288,13 @@ class SettingsWindow(QtWidgets.QDialog): if self.settings.rar_exe_path: utils.add_to_path(os.path.dirname(self.settings.rar_exe_path)) - if not str(self.leNameLengthDeltaThresh.text()).isdigit(): - self.leNameLengthDeltaThresh.setText("0") - if not str(self.leIssueNumPadding.text()).isdigit(): self.leIssueNumPadding.setText("0") self.settings.check_for_new_version = self.cbxCheckForNewVersion.isChecked() - self.settings.id_length_delta_thresh = int(self.leNameLengthDeltaThresh.text()) + self.settings.id_series_match_identify_thresh = self.sbNameMatchIdentifyThresh.value() + self.settings.id_series_match_search_thresh = self.sbNameMatchSearchThresh.value() self.settings.id_publisher_filter = str(self.tePublisherFilter.toPlainText()) self.settings.complicated_parser = self.cbxComplicatedParser.isChecked() diff --git a/comictaggerlib/taggerwindow.py b/comictaggerlib/taggerwindow.py index 977118f..95a845b 100644 --- a/comictaggerlib/taggerwindow.py +++ b/comictaggerlib/taggerwindow.py @@ -1064,7 +1064,7 @@ Have fun! self.form_to_metadata() try: - comic_vine = ComicVineTalker() + comic_vine = ComicVineTalker(self.settings.id_series_match_search_thresh) new_metadata = comic_vine.fetch_issue_data(selector.volume_id, selector.issue_number, self.settings) except ComicVineTalkerException as e: QtWidgets.QApplication.restoreOverrideCursor() @@ -1674,7 +1674,7 @@ Have fun! QtWidgets.QApplication.setOverrideCursor(QtGui.QCursor(QtCore.Qt.CursorShape.WaitCursor)) try: - comic_vine = ComicVineTalker() + comic_vine = ComicVineTalker(self.settings.id_series_match_search_thresh) comic_vine.wait_for_rate_limit = self.settings.wait_and_retry_on_rate_limit cv_md = comic_vine.fetch_issue_data(match["volume_id"], match["issue_number"], self.settings) except ComicVineTalkerException: @@ -1743,7 +1743,7 @@ Have fun! ii.cover_page_index = md.get_cover_page_index_list()[0] if self.atprogdialog is not None: ii.set_cover_url_callback(self.atprogdialog.set_test_image) - ii.set_name_length_delta_threshold(dlg.name_length_match_tolerance) + ii.set_name_series_match_threshold(dlg.name_length_match_tolerance) matches: list[IssueResult] = ii.search() diff --git a/comictaggerlib/ui/autotagstartwindow.ui b/comictaggerlib/ui/autotagstartwindow.ui index 09d8450..dcf341d 100644 --- a/comictaggerlib/ui/autotagstartwindow.ui +++ b/comictaggerlib/ui/autotagstartwindow.ui @@ -44,6 +44,16 @@