From c3f5badc7d3d071b842965b9b3a73ef50671e892 Mon Sep 17 00:00:00 2001 From: Mizaki Date: Tue, 11 Feb 2025 01:03:12 +0000 Subject: [PATCH 1/3] Use source hashes for cover matching --- comicapi/genericmetadata.py | 10 ++++++- comictaggerlib/cli.py | 2 +- comictaggerlib/issueidentifier.py | 39 +++++++++++++++++--------- comictaggerlib/issueselectionwindow.py | 5 +++- 4 files changed, 40 insertions(+), 16 deletions(-) diff --git a/comicapi/genericmetadata.py b/comicapi/genericmetadata.py index 6713e8e..7315195 100644 --- a/comicapi/genericmetadata.py +++ b/comicapi/genericmetadata.py @@ -136,6 +136,14 @@ class MetadataOrigin(NamedTuple): return self.name +class ImageHash(NamedTuple): + Hash: int + Kind: str # ahash, phash + + def __str__(self) -> str: + return str(self.Hash) + ": " + self.Kind + + @dataclasses.dataclass class GenericMetadata: writer_synonyms = ("writer", "plotter", "scripter", "script") @@ -202,7 +210,7 @@ class GenericMetadata: last_mark: str | None = None # urls to cover image, not generally part of the metadata - _cover_image: str | None = None + _cover_image: str | ImageHash | None = None _alternate_images: list[str] = dataclasses.field(default_factory=list) def __post_init__(self) -> None: diff --git a/comictaggerlib/cli.py b/comictaggerlib/cli.py index 94965a4..0cbf876 100644 --- a/comictaggerlib/cli.py +++ b/comictaggerlib/cli.py @@ -639,7 +639,7 @@ class CLI: month=ct_md.month, year=ct_md.year, publisher=None, - image_url=ct_md._cover_image or "", + image_url=str(ct_md._cover_image) or "", alt_image_urls=[], description=ct_md.description or "", ) diff --git a/comictaggerlib/issueidentifier.py b/comictaggerlib/issueidentifier.py index 8605d20..bf9bc3d 100644 --- a/comictaggerlib/issueidentifier.py +++ b/comictaggerlib/issueidentifier.py @@ -25,7 +25,7 @@ from typing_extensions import NotRequired, TypedDict from comicapi import utils from comicapi.comicarchive import ComicArchive -from comicapi.genericmetadata import ComicSeries, GenericMetadata +from comicapi.genericmetadata import ComicSeries, GenericMetadata, ImageHash from comicapi.issuestring import IssueString from comictaggerlib.ctsettings import ct_ns from comictaggerlib.imagefetcher import ImageFetcher, ImageFetcherException @@ -132,13 +132,13 @@ class IssueIdentifier: def set_cover_url_callback(self, cb_func: Callable[[bytes], None]) -> None: self.cover_url_callback = cb_func - def calculate_hash(self, image_data: bytes) -> int: + def calculate_hash(self, image_data: bytes = b"", image: Image = None) -> int: if self.image_hasher == 3: - return ImageHasher(data=image_data).p_hash() + return ImageHasher(data=image_data, image=image).p_hash() if self.image_hasher == 2: - return -1 # ImageHasher(data=image_data).average_hash2() + return -1 # ImageHasher(data=image_data, image=image).average_hash2() - return ImageHasher(data=image_data).average_hash() + return ImageHasher(data=image_data, image=image).average_hash() def log_msg(self, msg: Any) -> None: msg = str(msg) @@ -306,7 +306,7 @@ class IssueIdentifier: def _get_issue_cover_match_score( self, - primary_img_url: str, + primary_img_url: str | ImageHash, alt_urls: list[str], local_hashes: list[tuple[str, int]], use_alt_urls: bool = False, @@ -320,12 +320,15 @@ class IssueIdentifier: self._user_canceled() - urls = [primary_img_url] - if use_alt_urls: - urls.extend(alt_urls) - self.log_msg(f"[{len(alt_urls)} alt. covers]") + if isinstance(primary_img_url, ImageHash): + remote_hashes = [("0", primary_img_url.Hash)] + else: + urls = [primary_img_url] + if use_alt_urls: + urls.extend(alt_urls) + self.log_msg(f"[{len(alt_urls)} alt. covers]") - remote_hashes = self._get_remote_hashes(urls) + remote_hashes = self._get_remote_hashes(urls) score_list = [] done = False @@ -490,7 +493,7 @@ class IssueIdentifier: def _calculate_hashes(self, images: list[tuple[str, Image.Image]]) -> list[tuple[str, int]]: hashes = [] for name, image in images: - hashes.append((name, ImageHasher(image=image).average_hash())) + hashes.append((name, self.calculate_hash(image=image))) return hashes def _match_covers( @@ -536,7 +539,7 @@ class IssueIdentifier: month=issue.month, year=issue.year, publisher=None, - image_url=image_url, + image_url=str(image_url), alt_image_urls=alt_urls, description=issue.description or "", ) @@ -620,6 +623,16 @@ class IssueIdentifier: extra_images: list[tuple[str, Image.Image]], issues: list[tuple[ComicSeries, GenericMetadata]], ) -> list[IssueResult]: + # Set hashing kind, will presume all hashes are of the same kind + for series, issue in issues: + if isinstance(issue._cover_image, ImageHash): + if issue._cover_image.Kind == "phash": + self.image_hasher = 3 + break + elif issue._cover_image == "ahash": + self.image_hasher = 1 # Set to 1 on init but might as well be sure + break + cover_matching_1 = self._match_covers(terms, images, issues, use_alternates=False) if len(cover_matching_1) == 0: diff --git a/comictaggerlib/issueselectionwindow.py b/comictaggerlib/issueselectionwindow.py index c2b14ae..1ef2b2c 100644 --- a/comictaggerlib/issueselectionwindow.py +++ b/comictaggerlib/issueselectionwindow.py @@ -221,7 +221,10 @@ class IssueSelectionWindow(QtWidgets.QDialog): QtWidgets.QApplication.restoreOverrideCursor() self.issue_number = issue.issue or "" - self.coverWidget.set_issue_details(self.issue_id, [issue._cover_image or "", *issue._alternate_images]) + cover_image = "" + if isinstance(issue._cover_image, str): + cover_image = issue._cover_image + self.coverWidget.set_issue_details(self.issue_id, [cover_image, *issue._alternate_images]) if issue.description is None: self.set_description(self.teDescription, "") else: From d2499f6bae10f5011a08a824bffda48544bfdb7b Mon Sep 17 00:00:00 2001 From: Mizaki Date: Wed, 12 Feb 2025 23:15:19 +0000 Subject: [PATCH 2/3] Add ImageHash support for alternate_urls --- comicapi/genericmetadata.py | 2 +- comictaggerlib/issueidentifier.py | 30 +++++++++++++++++--------- comictaggerlib/issueselectionwindow.py | 8 +++---- 3 files changed, 25 insertions(+), 15 deletions(-) diff --git a/comicapi/genericmetadata.py b/comicapi/genericmetadata.py index 7315195..0bea82c 100644 --- a/comicapi/genericmetadata.py +++ b/comicapi/genericmetadata.py @@ -211,7 +211,7 @@ class GenericMetadata: # urls to cover image, not generally part of the metadata _cover_image: str | ImageHash | None = None - _alternate_images: list[str] = dataclasses.field(default_factory=list) + _alternate_images: list[str | ImageHash] = dataclasses.field(default_factory=list) def __post_init__(self) -> None: for key, value in self.__dict__.items(): diff --git a/comictaggerlib/issueidentifier.py b/comictaggerlib/issueidentifier.py index bf9bc3d..7702c3a 100644 --- a/comictaggerlib/issueidentifier.py +++ b/comictaggerlib/issueidentifier.py @@ -307,7 +307,7 @@ class IssueIdentifier: def _get_issue_cover_match_score( self, primary_img_url: str | ImageHash, - alt_urls: list[str], + alt_urls: list[str | ImageHash], local_hashes: list[tuple[str, int]], use_alt_urls: bool = False, ) -> Score: @@ -316,17 +316,23 @@ class IssueIdentifier: # If there is no URL return 100 if not primary_img_url: - return Score(score=100, url="", remote_hash=0) + return Score(score=100, url="", remote_hash=0, local_hash=0, local_hash_name="0") self._user_canceled() + remote_hashes = [] + # If the cover is ImageHash and the alternate covers are URLs, the alts will not be hashed/checked currently if isinstance(primary_img_url, ImageHash): - remote_hashes = [("0", primary_img_url.Hash)] + # ImageHash doesn't have a url so we just give it an empty string + remote_hashes.append(("", primary_img_url.Hash)) + if use_alt_urls and alt_urls: + remote_hashes.extend(("", alt_hash.Hash) for alt_hash in alt_urls if isinstance(alt_hash, ImageHash)) else: urls = [primary_img_url] if use_alt_urls: - urls.extend(alt_urls) - self.log_msg(f"[{len(alt_urls)} alt. covers]") + only_urls = [url for url in alt_urls if isinstance(url, str)] + urls.extend(only_urls) + self.log_msg(f"[{len(only_urls)} alt. covers]") remote_hashes = self._get_remote_hashes(urls) @@ -519,10 +525,14 @@ class IssueIdentifier: ) try: - image_url = issue._cover_image or "" - alt_urls = issue._alternate_images + image_url = issue._cover_image if isinstance(issue._cover_image, str) else "" + # We only include urls in the IssueResult so we don't have to deal with it down the line + # TODO: display the hash to the user so they know a direct hash was used instead of downloading an image + alt_urls: list[str] = [url for url in issue._alternate_images if isinstance(url, str)] - score_item = self._get_issue_cover_match_score(image_url, alt_urls, hashes, use_alt_urls=use_alternates) + score_item = self._get_issue_cover_match_score( + image_url, issue._alternate_images, hashes, use_alt_urls=use_alternates + ) except Exception: logger.exception(f"Scoring series{alternate} covers failed") return [] @@ -539,7 +549,7 @@ class IssueIdentifier: month=issue.month, year=issue.year, publisher=None, - image_url=str(image_url), + image_url=image_url, alt_image_urls=alt_urls, description=issue.description or "", ) @@ -629,7 +639,7 @@ class IssueIdentifier: if issue._cover_image.Kind == "phash": self.image_hasher = 3 break - elif issue._cover_image == "ahash": + elif issue._cover_image.Kind == "ahash": self.image_hasher = 1 # Set to 1 on init but might as well be sure break diff --git a/comictaggerlib/issueselectionwindow.py b/comictaggerlib/issueselectionwindow.py index 1ef2b2c..01712e4 100644 --- a/comictaggerlib/issueselectionwindow.py +++ b/comictaggerlib/issueselectionwindow.py @@ -221,10 +221,10 @@ class IssueSelectionWindow(QtWidgets.QDialog): QtWidgets.QApplication.restoreOverrideCursor() self.issue_number = issue.issue or "" - cover_image = "" - if isinstance(issue._cover_image, str): - cover_image = issue._cover_image - self.coverWidget.set_issue_details(self.issue_id, [cover_image, *issue._alternate_images]) + # We don't currently have a way to display hashes to the user + # TODO: display the hash to the user so they know it will be used for cover matching + alt_images = [url for url in issue._alternate_images if isinstance(url, str)] + self.coverWidget.set_issue_details(self.issue_id, [str(issue._cover_image) or "", *alt_images]) if issue.description is None: self.set_description(self.teDescription, "") else: From 085b599bc43bf02101b562887fcc75e5088bd758 Mon Sep 17 00:00:00 2001 From: Mizaki Date: Fri, 14 Feb 2025 00:59:10 +0000 Subject: [PATCH 3/3] Parametrise cover match test and add ImageHash data --- testing/comicdata.py | 73 +++++++++++++++++++++++++++++++++++ tests/issueidentifier_test.py | 24 ++++++------ 2 files changed, 86 insertions(+), 11 deletions(-) diff --git a/testing/comicdata.py b/testing/comicdata.py index cafbe16..3832b11 100644 --- a/testing/comicdata.py +++ b/testing/comicdata.py @@ -288,3 +288,76 @@ metadata_prepared = ( ), ), ) + +issueidentifier_score = ( + ( + ( + comicapi.genericmetadata.ImageHash( + Hash=0, # Force using the alternate, since the alternate is a url it will be ignored + Kind="ahash", + ), + ["https://comicvine.gamespot.com/cory-doctorows-futuristic-tales-of-the-here-and-no/4000-140529/"], + True, + ), + { + "remote_hash": 0, + "score": 31, + "url": "", + "local_hash": 212201432349720, + "local_hash_name": "Cover 1", + }, + ), + ( + ( + comicapi.genericmetadata.ImageHash( + Hash=0, + Kind="ahash", + ), + [ + comicapi.genericmetadata.ImageHash( + Hash=212201432349720, + Kind="ahash", + ), + ], + True, + ), + { + "remote_hash": 212201432349720, + "score": 0, + "url": "", + "local_hash": 212201432349720, + "local_hash_name": "Cover 1", + }, + ), + ( + ( + comicapi.genericmetadata.ImageHash( + Hash=212201432349720, + Kind="ahash", + ), + ["https://comicvine.gamespot.com/cory-doctorows-futuristic-tales-of-the-here-and-no/4000-140529/"], + False, + ), + { + "remote_hash": 212201432349720, + "score": 0, + "url": "", + "local_hash": 212201432349720, + "local_hash_name": "Cover 1", + }, + ), + ( + ( + "https://comicvine.gamespot.com/a/uploads/scale_large/0/574/585444-109004_20080707014047_large.jpg", + ["https://comicvine.gamespot.com/cory-doctorows-futuristic-tales-of-the-here-and-no/4000-140529/"], + False, + ), + { + "remote_hash": 212201432349720, + "score": 0, + "url": "https://comicvine.gamespot.com/a/uploads/scale_large/0/574/585444-109004_20080707014047_large.jpg", + "local_hash": 212201432349720, + "local_hash_name": "Cover 1", + }, + ), +) diff --git a/tests/issueidentifier_test.py b/tests/issueidentifier_test.py index bec1f80..9e96a3a 100644 --- a/tests/issueidentifier_test.py +++ b/tests/issueidentifier_test.py @@ -9,6 +9,7 @@ import comictaggerlib.imagehasher import comictaggerlib.issueidentifier import testing.comicdata import testing.comicvine +from comicapi.genericmetadata import ImageHash from comictaggerlib.resulttypes import IssueResult @@ -36,21 +37,22 @@ def test_get_search_keys(cbz, config, additional_md, expected, comicvine_api): assert expected == ii._get_search_keys(additional_md) -def test_get_issue_cover_match_score(cbz, config, comicvine_api): +@pytest.mark.parametrize("data, expected", testing.comicdata.issueidentifier_score) +def test_get_issue_cover_match_score( + cbz, + config, + comicvine_api, + data: tuple[str | ImageHash, list[str | ImageHash], bool], + expected: comictaggerlib.issueidentifier.Score, +): config, definitions = config ii = comictaggerlib.issueidentifier.IssueIdentifier(cbz, config, comicvine_api) score = ii._get_issue_cover_match_score( - "https://comicvine.gamespot.com/a/uploads/scale_large/0/574/585444-109004_20080707014047_large.jpg", - ["https://comicvine.gamespot.com/cory-doctorows-futuristic-tales-of-the-here-and-no/4000-140529/"], - [("Cover 1", ii.calculate_hash(cbz.get_page(0)))], + primary_img_url=data[0], + alt_urls=data[1], + local_hashes=[("Cover 1", ii.calculate_hash(cbz.get_page(0)))], + use_alt_urls=data[2], ) - expected = { - "remote_hash": 212201432349720, - "score": 0, - "url": "https://comicvine.gamespot.com/a/uploads/scale_large/0/574/585444-109004_20080707014047_large.jpg", - "local_hash": 212201432349720, - "local_hash_name": "Cover 1", - } assert expected == score