comictagger/comictaggerlib/issueidentifier.py
lordwelch 4608b97e23 v1.2.0
Separate comicapi into it's own package
Add support for tar files
Insert standard gitignore
Use suggested _version from setuptools-scm
Cleanup setup.py

Fix formatting in the rename template help
2021-08-05 22:42:13 -07:00

646 lines
25 KiB
Python

"""A class to automatically identify a comic archive"""
# Copyright 2012-2014 Anthony Beville
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import sys
from . import utils
from .comicvinetalker import ComicVineTalker, ComicVineTalkerException
from .genericmetadata import GenericMetadata
from .imagefetcher import ImageFetcher, ImageFetcherException
from .imagehasher import ImageHasher
from .issuestring import IssueString
# PIL is optional at import time; record its availability so search() can
# bail out gracefully instead of crashing on a missing dependency.
try:
    from PIL import Image, WebPImagePlugin

    pil_available = True
except ImportError:
    pil_available = False
class IssueIdentifierNetworkError(Exception):
    """Raised when a network problem prevents cover fetching or searching."""

    pass
class IssueIdentifierCancelled(Exception):
    """Raised when the user cancels an in-progress identification search."""

    pass
class IssueIdentifier:
    """Automatically identify a comic archive's issue by matching cover-image
    hashes and metadata-derived search keys against Comic Vine.
    """

    # Possible values of self.search_result after search() completes:
    ResultNoMatches = 0
    ResultFoundMatchButBadCoverScore = 1
    ResultFoundMatchButNotFirstPage = 2
    ResultMultipleMatchesWithBadImageScores = 3
    ResultOneGoodMatch = 4
    ResultMultipleGoodMatches = 5
    def __init__(self, comic_archive, settings):
        """Prepare an identifier for *comic_archive*, taking tuning values
        (name-length delta threshold, publisher blacklist) from *settings*.
        """
        self.comic_archive = comic_archive
        # hashing algorithm selector; 1 matches neither "2" nor "3", so
        # calculateHash() falls through to the plain average hash by default
        self.image_hasher = 1
        # when True, getSearchKeys() uses self.additional_metadata exclusively
        self.onlyUseAdditionalMetaData = False
        # a decent hamming score, good enough to call it a match
        self.min_score_thresh = 16
        # for alternate covers, be more stringent, since we're a bit more
        # scattershot in comparisons
        self.min_alternate_score_thresh = 12
        # the min distance a hamming score must be to separate itself from
        # closest neighbor
        self.min_score_distance = 4
        # a very strong hamming score, almost certainly the same image
        self.strong_score_thresh = 8
        # used to eliminate series names that are too long based on our search
        # string
        self.length_delta_thresh = settings.id_length_delta_thresh
        # used to eliminate unlikely publishers (stored lowercased)
        self.publisher_blacklist = [s.strip().lower() for s in settings.id_publisher_blacklist.split(",")]
        self.additional_metadata = GenericMetadata()
        self.output_function = IssueIdentifier.defaultWriteOutput
        # optional progress callback, called as callback(current, total)
        self.callback = None
        # optional hook that receives raw cover image data as it is fetched
        self.coverUrlCallback = None
        self.search_result = self.ResultNoMatches
        # archive page index used as the cover for hashing
        self.cover_page_index = 0
        # set True (e.g. from another thread) to abort an in-progress search
        self.cancel = False
        self.waitAndRetryOnRateLimit = False
    def setScoreMinThreshold(self, thresh):
        """Set the hamming distance below which a cover counts as a match."""
        self.min_score_thresh = thresh
    def setScoreMinDistance(self, distance):
        """Set the minimum score gap required to keep a match over the rest."""
        self.min_score_distance = distance
    def setAdditionalMetadata(self, md):
        """Supply extra metadata that takes priority when building search keys."""
        self.additional_metadata = md
    def setNameLengthDeltaThreshold(self, delta):
        """Set how many characters longer than the search term a series name
        may be before it is filtered out."""
        self.length_delta_thresh = delta
    def setPublisherBlackList(self, blacklist):
        """Set the publisher blacklist.

        NOTE(review): entries are compared against lowercased publisher
        names in search(), so they should be lowercase strings.
        """
        self.publisher_blacklist = blacklist
def setHasherAlgorithm(self, algo):
self.image_hasher = algo
pass
def setOutputFunction(self, func):
self.output_function = func
pass
def calculateHash(self, image_data):
if self.image_hasher == "3":
return ImageHasher(data=image_data).dct_average_hash()
elif self.image_hasher == "2":
return ImageHasher(data=image_data).average_hash2()
else:
return ImageHasher(data=image_data).average_hash()
def getAspectRatio(self, image_data):
try:
im = Image.open(io.StringIO(image_data))
w, h = im.size
return float(h) / float(w)
except:
return 1.5
def cropCover(self, image_data):
im = Image.open(io.StringIO(image_data))
w, h = im.size
try:
cropped_im = im.crop((int(w / 2), 0, w, h))
except Exception as e:
print("cropCover() error:", e)
return None
output = io.StringIO()
cropped_im.save(output, format="PNG")
cropped_image_data = output.getvalue()
output.close()
return cropped_image_data
    def setProgressCallback(self, cb_func):
        """Register a progress callback, invoked as cb_func(current, total)."""
        self.callback = cb_func
    def setCoverURLCallback(self, cb_func):
        """Register a callback that receives raw cover image data as each
        remote cover is fetched (used to alert a GUI)."""
        self.coverUrlCallback = cb_func
def getSearchKeys(self):
ca = self.comic_archive
search_keys = dict()
search_keys["series"] = None
search_keys["issue_number"] = None
search_keys["month"] = None
search_keys["year"] = None
search_keys["issue_count"] = None
if ca is None:
return
if self.onlyUseAdditionalMetaData:
search_keys["series"] = self.additional_metadata.series
search_keys["issue_number"] = self.additional_metadata.issue
search_keys["year"] = self.additional_metadata.year
search_keys["month"] = self.additional_metadata.month
search_keys["issue_count"] = self.additional_metadata.issueCount
return search_keys
# see if the archive has any useful meta data for searching with
if ca.hasCIX():
internal_metadata = ca.readCIX()
elif ca.hasCBI():
internal_metadata = ca.readCBI()
else:
internal_metadata = ca.readCBI()
# try to get some metadata from filename
md_from_filename = ca.metadataFromFilename()
# preference order:
# 1. Additional metadata
# 1. Internal metadata
# 1. Filename metadata
if self.additional_metadata.series is not None:
search_keys["series"] = self.additional_metadata.series
elif internal_metadata.series is not None:
search_keys["series"] = internal_metadata.series
else:
search_keys["series"] = md_from_filename.series
if self.additional_metadata.issue is not None:
search_keys["issue_number"] = self.additional_metadata.issue
elif internal_metadata.issue is not None:
search_keys["issue_number"] = internal_metadata.issue
else:
search_keys["issue_number"] = md_from_filename.issue
if self.additional_metadata.year is not None:
search_keys["year"] = self.additional_metadata.year
elif internal_metadata.year is not None:
search_keys["year"] = internal_metadata.year
else:
search_keys["year"] = md_from_filename.year
if self.additional_metadata.month is not None:
search_keys["month"] = self.additional_metadata.month
elif internal_metadata.month is not None:
search_keys["month"] = internal_metadata.month
else:
search_keys["month"] = md_from_filename.month
if self.additional_metadata.issueCount is not None:
search_keys["issue_count"] = self.additional_metadata.issueCount
elif internal_metadata.issueCount is not None:
search_keys["issue_count"] = internal_metadata.issueCount
else:
search_keys["issue_count"] = md_from_filename.issueCount
return search_keys
@staticmethod
def defaultWriteOutput(text):
sys.stdout.write(text)
sys.stdout.flush()
def log_msg(self, msg, newline=True):
self.output_function(msg)
if newline:
self.output_function("\n")
    def getIssueCoverMatchScore(
        self, comicVine, issue_id, primary_img_url, primary_thumb_url, page_url, localCoverHashList, useRemoteAlternates=False, useLog=True
    ):
        """Score the local cover hashes against this issue's cover(s) on
        Comic Vine.

        localCoverHashList is a list of pre-calculated hashes.
        useRemoteAlternates indicates to also fetch alternate covers from CV.
        Returns the best-scoring dict with "score" (hamming distance; lower
        is better), "url" and "hash" of the remote cover it matched.
        Raises IssueIdentifierNetworkError on fetch failure and
        IssueIdentifierCancelled when self.cancel is set mid-fetch.
        """
        try:
            url_image_data = ImageFetcher().fetch(primary_thumb_url, blocking=True)
        except ImageFetcherException:
            self.log_msg("Network issue while fetching cover image from Comic Vine. Aborting...")
            raise IssueIdentifierNetworkError

        if self.cancel:
            raise IssueIdentifierCancelled

        # alert the GUI, if needed
        if self.coverUrlCallback is not None:
            self.coverUrlCallback(url_image_data)

        # build the list of remote covers to compare against, starting with
        # the primary cover
        remote_cover_list = []
        item = dict()
        item["url"] = primary_img_url
        item["hash"] = self.calculateHash(url_image_data)
        remote_cover_list.append(item)

        if self.cancel:
            raise IssueIdentifierCancelled

        if useRemoteAlternates:
            alt_img_url_list = comicVine.fetchAlternateCoverURLs(issue_id, page_url)
            for alt_url in alt_img_url_list:
                try:
                    alt_url_image_data = ImageFetcher().fetch(alt_url, blocking=True)
                except ImageFetcherException:
                    self.log_msg("Network issue while fetching alt. cover image from Comic Vine. Aborting...")
                    raise IssueIdentifierNetworkError

                if self.cancel:
                    raise IssueIdentifierCancelled

                # alert the GUI, if needed
                if self.coverUrlCallback is not None:
                    self.coverUrlCallback(alt_url_image_data)

                item = dict()
                item["url"] = alt_url
                item["hash"] = self.calculateHash(alt_url_image_data)
                remote_cover_list.append(item)

                if self.cancel:
                    raise IssueIdentifierCancelled

        if useLog and useRemoteAlternates:
            self.log_msg("[{0} alt. covers]".format(len(remote_cover_list) - 1), False)
        if useLog:
            self.log_msg("[ ", False)

        # compare every local hash against every remote hash, stopping early
        # on a very strong (low) score
        score_list = []
        done = False
        for local_cover_hash in localCoverHashList:
            for remote_cover_item in remote_cover_list:
                score = ImageHasher.hamming_distance(local_cover_hash, remote_cover_item["hash"])
                score_item = dict()
                score_item["score"] = score
                score_item["url"] = remote_cover_item["url"]
                score_item["hash"] = remote_cover_item["hash"]
                score_list.append(score_item)
                if useLog:
                    self.log_msg("{0}".format(score), False)

                if score <= self.strong_score_thresh:
                    # such a good score, we can quit now, since for sure we
                    # have a winner
                    done = True
                    break
            if done:
                break

        if useLog:
            self.log_msg(" ]", False)

        best_score_item = min(score_list, key=lambda x: x["score"])
        return best_score_item
# def validate(self, issue_id):
# create hash list
# score = self.getIssueMatchScore(issue_id, hash_list, useRemoteAlternates = True)
# if score < 20:
# return True
# else:
# return False
def search(self):
ca = self.comic_archive
self.match_list = []
self.cancel = False
self.search_result = self.ResultNoMatches
if not pil_available:
self.log_msg("Python Imaging Library (PIL) is not available and is needed for issue identification.")
return self.match_list
if not ca.seemsToBeAComicArchive():
self.log_msg("Sorry, but " + opts.filename + " is not a comic archive!")
return self.match_list
cover_image_data = ca.getPage(self.cover_page_index)
cover_hash = self.calculateHash(cover_image_data)
# check the aspect ratio
# if it's wider than it is high, it's probably a two page spread
# if so, crop it and calculate a second hash
narrow_cover_hash = None
aspect_ratio = self.getAspectRatio(cover_image_data)
if aspect_ratio < 1.0:
right_side_image_data = self.cropCover(cover_image_data)
if right_side_image_data is not None:
narrow_cover_hash = self.calculateHash(right_side_image_data)
# self.log_msg("Cover hash = {0:016x}".format(cover_hash))
keys = self.getSearchKeys()
# normalize the issue number
keys["issue_number"] = IssueString(keys["issue_number"]).asString()
# we need, at minimum, a series and issue number
if keys["series"] is None or keys["issue_number"] is None:
self.log_msg("Not enough info for a search!")
return []
self.log_msg("Going to search for:")
self.log_msg("\tSeries: " + keys["series"])
self.log_msg("\tIssue: " + keys["issue_number"])
if keys["issue_count"] is not None:
self.log_msg("\tCount: " + str(keys["issue_count"]))
if keys["year"] is not None:
self.log_msg("\tYear: " + str(keys["year"]))
if keys["month"] is not None:
self.log_msg("\tMonth: " + str(keys["month"]))
# self.log_msg("Publisher Blacklist: " + str(self.publisher_blacklist))
comicVine = ComicVineTalker()
comicVine.wait_for_rate_limit = self.waitAndRetryOnRateLimit
comicVine.setLogFunc(self.output_function)
# self.log_msg(("Searching for " + keys['series'] + "...")
self.log_msg("Searching for {0} #{1} ...".format(keys["series"], keys["issue_number"]))
try:
cv_search_results = comicVine.searchForSeries(keys["series"])
except ComicVineTalkerException:
self.log_msg("Network issue while searching for series. Aborting...")
return []
# self.log_msg("Found " + str(len(cv_search_results)) + " initial results")
if self.cancel:
return []
if cv_search_results is None:
return []
series_second_round_list = []
# self.log_msg("Removing results with too long names, banned publishers, or future start dates")
for item in cv_search_results:
length_approved = False
publisher_approved = True
date_approved = True
# remove any series that starts after the issue year
if keys["year"] is not None and str(keys["year"]).isdigit() and item["start_year"] is not None and str(item["start_year"]).isdigit():
if int(keys["year"]) < int(item["start_year"]):
date_approved = False
# assume that our search name is close to the actual name, say
# within ,e.g. 5 chars
shortened_key = utils.removearticles(keys["series"])
shortened_item_name = utils.removearticles(item["name"])
if len(shortened_item_name) < (len(shortened_key) + self.length_delta_thresh):
length_approved = True
# remove any series from publishers on the blacklist
if item["publisher"] is not None:
publisher = item["publisher"]["name"]
if publisher is not None and publisher.lower() in self.publisher_blacklist:
publisher_approved = False
if length_approved and publisher_approved and date_approved:
series_second_round_list.append(item)
self.log_msg("Searching in " + str(len(series_second_round_list)) + " series")
if self.callback is not None:
self.callback(0, len(series_second_round_list))
# now sort the list by name length
series_second_round_list.sort(key=lambda x: len(x["name"]), reverse=False)
# build a list of volume IDs
volume_id_list = list()
for series in series_second_round_list:
volume_id_list.append(series["id"])
try:
issue_list = comicVine.fetchIssuesByVolumeIssueNumAndYear(volume_id_list, keys["issue_number"], keys["year"])
except ComicVineTalkerException:
self.log_msg("Network issue while searching for series details. Aborting...")
return []
if issue_list is None:
return []
shortlist = list()
# now re-associate the issues and volumes
for issue in issue_list:
for series in series_second_round_list:
if series["id"] == issue["volume"]["id"]:
shortlist.append((series, issue))
break
if keys["year"] is None:
self.log_msg("Found {0} series that have an issue #{1}".format(len(shortlist), keys["issue_number"]))
else:
self.log_msg("Found {0} series that have an issue #{1} from {2}".format(len(shortlist), keys["issue_number"], keys["year"]))
# now we have a shortlist of volumes with the desired issue number
# Do first round of cover matching
counter = len(shortlist)
for series, issue in shortlist:
if self.callback is not None:
self.callback(counter, len(shortlist) * 3)
counter += 1
self.log_msg("Examining covers for ID: {0} {1} ({2}) ...".format(series["id"], series["name"], series["start_year"]), newline=False)
# parse out the cover date
day, month, year = comicVine.parseDateStr(issue["cover_date"])
# Now check the cover match against the primary image
hash_list = [cover_hash]
if narrow_cover_hash is not None:
hash_list.append(narrow_cover_hash)
try:
image_url = issue["image"]["super_url"]
thumb_url = issue["image"]["thumb_url"]
page_url = issue["site_detail_url"]
score_item = self.getIssueCoverMatchScore(
comicVine, issue["id"], image_url, thumb_url, page_url, hash_list, useRemoteAlternates=False
)
except:
self.match_list = []
return self.match_list
match = dict()
match["series"] = "{0} ({1})".format(series["name"], series["start_year"])
match["distance"] = score_item["score"]
match["issue_number"] = keys["issue_number"]
match["cv_issue_count"] = series["count_of_issues"]
match["url_image_hash"] = score_item["hash"]
match["issue_title"] = issue["name"]
match["issue_id"] = issue["id"]
match["volume_id"] = series["id"]
match["month"] = month
match["year"] = year
match["publisher"] = None
if series["publisher"] is not None:
match["publisher"] = series["publisher"]["name"]
match["image_url"] = image_url
match["thumb_url"] = thumb_url
match["page_url"] = page_url
match["description"] = issue["description"]
self.match_list.append(match)
self.log_msg(" --> {0}".format(match["distance"]), newline=False)
self.log_msg("")
if len(self.match_list) == 0:
self.log_msg(":-(no matches!")
self.search_result = self.ResultNoMatches
return self.match_list
# sort list by image match scores
self.match_list.sort(key=lambda k: k["distance"])
l = []
for i in self.match_list:
l.append(i["distance"])
self.log_msg("Compared to covers in {0} issue(s):".format(len(self.match_list)), newline=False)
self.log_msg(str(l))
def print_match(item):
self.log_msg(
"-----> {0} #{1} {2} ({3}/{4}) -- score: {5}".format(
item["series"], item["issue_number"], item["issue_title"], item["month"], item["year"], item["distance"]
)
)
best_score = self.match_list[0]["distance"]
if best_score >= self.min_score_thresh:
# we have 1 or more low-confidence matches (all bad cover scores)
# look at a few more pages in the archive, and also alternate
# covers online
self.log_msg("Very weak scores for the cover. Analyzing alternate pages and covers...")
hash_list = [cover_hash]
if narrow_cover_hash is not None:
hash_list.append(narrow_cover_hash)
for i in range(1, min(3, ca.getNumberOfPages())):
image_data = ca.getPage(i)
page_hash = self.calculateHash(image_data)
hash_list.append(page_hash)
second_match_list = []
counter = 2 * len(self.match_list)
for m in self.match_list:
if self.callback is not None:
self.callback(counter, len(self.match_list) * 3)
counter += 1
self.log_msg("Examining alternate covers for ID: {0} {1} ...".format(m["volume_id"], m["series"]), newline=False)
try:
score_item = self.getIssueCoverMatchScore(
comicVine, m["issue_id"], m["image_url"], m["thumb_url"], m["page_url"], hash_list, useRemoteAlternates=True
)
except:
self.match_list = []
return self.match_list
self.log_msg("--->{0}".format(score_item["score"]))
self.log_msg("")
if score_item["score"] < self.min_alternate_score_thresh:
second_match_list.append(m)
m["distance"] = score_item["score"]
if len(second_match_list) == 0:
if len(self.match_list) == 1:
self.log_msg("No matching pages in the issue.")
self.log_msg("--------------------------------------------------------------------------")
print_match(self.match_list[0])
self.log_msg("--------------------------------------------------------------------------")
self.search_result = self.ResultFoundMatchButBadCoverScore
else:
self.log_msg("--------------------------------------------------------------------------")
self.log_msg("Multiple bad cover matches! Need to use other info...")
self.log_msg("--------------------------------------------------------------------------")
self.search_result = self.ResultMultipleMatchesWithBadImageScores
return self.match_list
else:
# We did good, found something!
self.log_msg("Success in secondary/alternate cover matching!")
self.match_list = second_match_list
# sort new list by image match scores
self.match_list.sort(key=lambda k: k["distance"])
best_score = self.match_list[0]["distance"]
self.log_msg("[Second round cover matching: best score = {0}]".format(best_score))
# now drop down into the rest of the processing
if self.callback is not None:
self.callback(99, 100)
# now pare down list, remove any item more than specified distant from
# the top scores
for item in reversed(self.match_list):
if item["distance"] > best_score + self.min_score_distance:
self.match_list.remove(item)
# One more test for the case choosing limited series first issue vs a trade with the same cover:
# if we have a given issue count > 1 and the volume from CV has
# count==1, remove it from match list
if len(self.match_list) >= 2 and keys["issue_count"] is not None and keys["issue_count"] != 1:
new_list = list()
for match in self.match_list:
if match["cv_issue_count"] != 1:
new_list.append(match)
else:
self.log_msg("Removing volume {0} [{1}] from consideration (only 1 issue)".format(match["series"], match["volume_id"]))
if len(new_list) > 0:
self.match_list = new_list
if len(self.match_list) == 1:
self.log_msg("--------------------------------------------------------------------------")
print_match(self.match_list[0])
self.log_msg("--------------------------------------------------------------------------")
self.search_result = self.ResultOneGoodMatch
elif len(self.match_list) == 0:
self.log_msg("--------------------------------------------------------------------------")
self.log_msg("No matches found :(")
self.log_msg("--------------------------------------------------------------------------")
self.search_result = self.ResultNoMatches
else:
# we've got multiple good matches:
self.log_msg("More than one likely candidate.")
self.search_result = self.ResultMultipleGoodMatches
self.log_msg("--------------------------------------------------------------------------")
for item in self.match_list:
print_match(item)
self.log_msg("--------------------------------------------------------------------------")
return self.match_list