comictagger/comictaggerlib/issueidentifier.py

655 lines
25 KiB
Python
Raw Normal View History

"""
A python class to automatically identify a comic archive
"""
"""
Copyright 2012-2014 Anthony Beville
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import sys
import math
import urllib2, urllib
import StringIO
try:
from PIL import Image
from PIL import WebPImagePlugin
pil_available = True
except ImportError:
pil_available = False
from settings import ComicTaggerSettings
from comicvinecacher import ComicVineCacher
from genericmetadata import GenericMetadata
from comicvinetalker import ComicVineTalker, ComicVineTalkerException
from imagehasher import ImageHasher
from imagefetcher import ImageFetcher, ImageFetcherException
from issuestring import IssueString
import utils
class IssueIdentifierNetworkError(Exception):
pass
class IssueIdentifierCancelled(Exception):
pass
class IssueIdentifier:
ResultNoMatches = 0
ResultFoundMatchButBadCoverScore = 1
ResultFoundMatchButNotFirstPage = 2
ResultMultipleMatchesWithBadImageScores = 3
ResultOneGoodMatch = 4
ResultMultipleGoodMatches = 5
def __init__(self, comic_archive, settings):
self.comic_archive = comic_archive
self.image_hasher = 1
self.onlyUseAdditionalMetaData = False
# a decent hamming score, good enough to call it a match
self.min_score_thresh = 16
# for alternate covers, be more stringent, since we're a bit more scattershot in comparisons
self.min_alternate_score_thresh = 12
# the min distance a hamming score must be to separate itself from closest neighbor
self.min_score_distance = 4
# a very strong hamming score, almost certainly the same image
self.strong_score_thresh = 8
# used to eliminate series names that are too long based on our search string
self.length_delta_thresh = settings.id_length_delta_thresh
# used to eliminate unlikely publishers
self.publisher_blacklist = [ s.strip().lower() for s in settings.id_publisher_blacklist.split(',') ]
self.additional_metadata = GenericMetadata()
self.output_function = IssueIdentifier.defaultWriteOutput
self.callback = None
self.coverUrlCallback = None
self.search_result = self.ResultNoMatches
self.cover_page_index = 0
self.cancel = False
self.waitAndRetryOnRateLimit = False
def setScoreMinThreshold(self, thresh):
self.min_score_thresh = thresh
def setScoreMinDistance(self, distance):
self.min_score_distance = distance
def setAdditionalMetadata(self, md):
self.additional_metadata = md
def setNameLengthDeltaThreshold(self, delta):
self.length_delta_thresh = delta
def setPublisherBlackList(self, blacklist):
self.publisher_blacklist = blacklist
def setHasherAlgorithm(self, algo):
self.image_hasher = algo
pass
def setOutputFunction(self, func):
self.output_function = func
pass
def calculateHash(self, image_data):
if self.image_hasher == '3':
return ImageHasher(data=image_data).dct_average_hash()
elif self.image_hasher == '2':
return ImageHasher(data=image_data).average_hash2()
else:
return ImageHasher(data=image_data).average_hash()
def getAspectRatio(self, image_data):
try:
im = Image.open(StringIO.StringIO(image_data))
w,h = im.size
return float(h)/float(w)
except:
return 1.5
def cropCover(self, image_data):
im = Image.open(StringIO.StringIO(image_data))
w,h = im.size
try:
cropped_im = im.crop((int(w/2), 0, w, h))
except Exception as e:
sys.exc_clear()
print "cropCover() error:", e
return None
output = StringIO.StringIO()
cropped_im.save(output, format="PNG")
cropped_image_data = output.getvalue()
output.close()
return cropped_image_data
def setProgressCallback(self, cb_func):
self.callback = cb_func
def setCoverURLCallback(self, cb_func):
self.coverUrlCallback = cb_func
def getSearchKeys(self):
ca = self.comic_archive
search_keys = dict()
search_keys['series'] = None
search_keys['issue_number'] = None
search_keys['month'] = None
search_keys['year'] = None
search_keys['issue_count'] = None
if ca is None:
return
if self.onlyUseAdditionalMetaData:
search_keys['series'] = self.additional_metadata.series
search_keys['issue_number'] = self.additional_metadata.issue
search_keys['year'] = self.additional_metadata.year
search_keys['month'] = self.additional_metadata.month
search_keys['issue_count'] = self.additional_metadata.issueCount
return search_keys
# see if the archive has any useful meta data for searching with
if ca.hasCIX():
internal_metadata = ca.readCIX()
elif ca.hasCBI():
internal_metadata = ca.readCBI()
else:
internal_metadata = ca.readCBI()
# try to get some metadata from filename
md_from_filename = ca.metadataFromFilename()
# preference order:
#1. Additional metadata
#1. Internal metadata
#1. Filename metadata
if self.additional_metadata.series is not None:
search_keys['series'] = self.additional_metadata.series
elif internal_metadata.series is not None:
search_keys['series'] = internal_metadata.series
else:
search_keys['series'] = md_from_filename.series
if self.additional_metadata.issue is not None:
search_keys['issue_number'] = self.additional_metadata.issue
elif internal_metadata.issue is not None:
search_keys['issue_number'] = internal_metadata.issue
else:
search_keys['issue_number'] = md_from_filename.issue
if self.additional_metadata.year is not None:
search_keys['year'] = self.additional_metadata.year
elif internal_metadata.year is not None:
search_keys['year'] = internal_metadata.year
else:
search_keys['year'] = md_from_filename.year
if self.additional_metadata.month is not None:
search_keys['month'] = self.additional_metadata.month
elif internal_metadata.month is not None:
search_keys['month'] = internal_metadata.month
else:
search_keys['month'] = md_from_filename.month
if self.additional_metadata.issueCount is not None:
search_keys['issue_count'] = self.additional_metadata.issueCount
elif internal_metadata.issueCount is not None:
search_keys['issue_count'] = internal_metadata.issueCount
else:
search_keys['issue_count'] = md_from_filename.issueCount
return search_keys
@staticmethod
def defaultWriteOutput(text):
sys.stdout.write(text)
sys.stdout.flush()
def log_msg(self, msg , newline=True):
self.output_function(msg)
if newline:
self.output_function("\n")
def getIssueCoverMatchScore(self, comicVine, issue_id, primary_img_url, primary_thumb_url, page_url, localCoverHashList, useRemoteAlternates = False , useLog=True):
# localHashes is a list of pre-calculated hashs.
# useRemoteAlternates - indicates to use alternate covers from CV
try:
url_image_data = ImageFetcher().fetch(primary_thumb_url, blocking=True)
except ImageFetcherException:
self.log_msg("Network issue while fetching cover image from ComicVine. Aborting...")
raise IssueIdentifierNetworkError
if self.cancel == True:
raise IssueIdentifierCancelled
# alert the GUI, if needed
if self.coverUrlCallback is not None:
self.coverUrlCallback(url_image_data)
remote_cover_list = []
item = dict()
item['url'] = primary_img_url
item['hash'] = self.calculateHash(url_image_data)
remote_cover_list.append(item)
if self.cancel == True:
raise IssueIdentifierCancelled
if useRemoteAlternates:
alt_img_url_list = comicVine.fetchAlternateCoverURLs(issue_id, page_url)
for alt_url in alt_img_url_list:
try:
alt_url_image_data = ImageFetcher().fetch(alt_url, blocking=True)
except ImageFetcherException:
self.log_msg("Network issue while fetching alt. cover image from ComicVine. Aborting...")
raise IssueIdentifierNetworkError
if self.cancel == True:
raise IssueIdentifierCancelled
# alert the GUI, if needed
if self.coverUrlCallback is not None:
self.coverUrlCallback(alt_url_image_data)
item = dict()
item['url'] = alt_url
item['hash'] = self.calculateHash(alt_url_image_data)
remote_cover_list.append(item)
if self.cancel == True:
raise IssueIdentifierCancelled
if useLog and useRemoteAlternates:
self.log_msg("[{0} alt. covers]".format(len(remote_cover_list)-1), False)
if useLog:
self.log_msg("[ ", False)
score_list = []
done = False
for local_cover_hash in localCoverHashList:
for remote_cover_item in remote_cover_list:
score = ImageHasher.hamming_distance(local_cover_hash, remote_cover_item['hash'])
score_item = dict()
score_item['score'] = score
score_item['url'] = remote_cover_item['url']
score_item['hash'] = remote_cover_item['hash']
score_list.append(score_item)
if useLog:
self.log_msg("{0} ".format(score), False)
if score <= self.strong_score_thresh:
# such a good score, we can quit now, since for sure we have a winner
done = True
break
if done:
break
if useLog:
self.log_msg(" ]", False)
best_score_item = min(score_list, key=lambda x:x['score'])
return best_score_item
"""
def validate(self, issue_id):
# create hash list
score = self.getIssueMatchScore(issue_id, hash_list, useRemoteAlternates = True)
if score < 20:
return True
else:
return False
"""
def search(self):
ca = self.comic_archive
self.match_list = []
self.cancel = False
self.search_result = self.ResultNoMatches
if not pil_available:
self.log_msg("Python Imaging Library (PIL) is not available and is needed for issue identification.")
return self.match_list
if not ca.seemsToBeAComicArchive():
self.log_msg("Sorry, but "+ opts.filename + " is not a comic archive!")
return self.match_list
cover_image_data = ca.getPage(self.cover_page_index)
cover_hash = self.calculateHash(cover_image_data)
#check the apect ratio
# if it's wider than it is high, it's probably a two page spread
# if so, crop it and calculate a second hash
narrow_cover_hash = None
aspect_ratio = self.getAspectRatio(cover_image_data)
if aspect_ratio < 1.0:
right_side_image_data = self.cropCover(cover_image_data)
if right_side_image_data is not None:
narrow_cover_hash = self.calculateHash(right_side_image_data)
#self.log_msg("Cover hash = {0:016x}".format(cover_hash))
keys = self.getSearchKeys()
#normalize the issue number
keys['issue_number'] = IssueString(keys['issue_number']).asString()
# we need, at minimum, a series and issue number
if keys['series'] is None or keys['issue_number'] is None:
self.log_msg("Not enough info for a search!")
return []
self.log_msg("Going to search for:")
self.log_msg("\tSeries: " + keys['series'])
self.log_msg("\tIssue : " + keys['issue_number'])
if keys['issue_count'] is not None:
self.log_msg("\tCount : " + str(keys['issue_count']))
if keys['year'] is not None:
self.log_msg("\tYear : " + str(keys['year']))
if keys['month'] is not None:
self.log_msg("\tMonth : " + str(keys['month']))
#self.log_msg("Publisher Blacklist: " + str(self.publisher_blacklist))
comicVine = ComicVineTalker()
comicVine.wait_for_rate_limit = self.waitAndRetryOnRateLimit
comicVine.setLogFunc(self.output_function)
#self.log_msg(("Searching for " + keys['series'] + "...")
self.log_msg(u"Searching for {0} #{1} ...".format(keys['series'], keys['issue_number']))
try:
cv_search_results = comicVine.searchForSeries(keys['series'])
except ComicVineTalkerException:
self.log_msg("Network issue while searching for series. Aborting...")
return []
#self.log_msg("Found " + str(len(cv_search_results)) + " initial results")
if self.cancel == True:
return []
if cv_search_results == None:
return []
series_second_round_list = []
#self.log_msg("Removing results with too long names, banned publishers, or future start dates")
for item in cv_search_results:
length_approved = False
publisher_approved = True
date_approved = True
# remove any series that starts after the issue year
if keys['year'] is not None and str(keys['year']).isdigit() and item['start_year'] is not None and str(item['start_year']).isdigit():
if int(keys['year']) < int(item['start_year']):
date_approved = False
#assume that our search name is close to the actual name, say within ,e.g. 5 chars
shortened_key = utils.removearticles(keys['series'])
shortened_item_name = utils.removearticles(item['name'])
if len(shortened_item_name) < (len(shortened_key) + self.length_delta_thresh) :
length_approved = True
# remove any series from publishers on the blacklist
if item['publisher'] is not None:
publisher = item['publisher']['name']
if publisher is not None and publisher.lower() in self.publisher_blacklist:
publisher_approved = False
if length_approved and publisher_approved and date_approved:
series_second_round_list.append(item)
self.log_msg("Searching in " + str(len(series_second_round_list)) +" series")
if self.callback is not None:
self.callback(0, len(series_second_round_list))
# now sort the list by name length
series_second_round_list.sort(key=lambda x: len(x['name']), reverse=False)
#build a list of volume IDs
volume_id_list = list()
for series in series_second_round_list:
volume_id_list.append(series['id'])
try:
issue_list = comicVine.fetchIssuesByVolumeIssueNumAndYear(volume_id_list,
keys['issue_number'],
keys['year'])
except ComicVineTalkerException:
self.log_msg("Network issue while searching for series details. Aborting...")
return []
if issue_list is None:
return []
shortlist = list()
#now re-associate the issues and volumes
for issue in issue_list:
for series in series_second_round_list:
if series['id'] == issue['volume']['id']:
shortlist.append((series, issue))
break
if keys['year'] is None:
self.log_msg(u"Found {0} series that have an issue #{1}".format(len(shortlist), keys['issue_number']))
else:
self.log_msg(u"Found {0} series that have an issue #{1} from {2}".format(len(shortlist), keys['issue_number'], keys['year']))
# now we have a shortlist of volumes with the desired issue number
# Do first round of cover matching
counter = len(shortlist)
for series, issue in shortlist:
if self.callback is not None:
self.callback(counter, len(shortlist)*3)
counter += 1
self.log_msg(u"Examining covers for ID: {0} {1} ({2}) ...".format(
series['id'],
series['name'],
series['start_year']), newline=False)
# parse out the cover date
day, month, year = comicVine.parseDateStr(issue['cover_date'])
# Now check the cover match against the primary image
hash_list = [ cover_hash ]
if narrow_cover_hash is not None:
hash_list.append(narrow_cover_hash)
try:
image_url = issue['image']['super_url']
thumb_url = issue['image']['thumb_url']
page_url = issue['site_detail_url']
score_item = self.getIssueCoverMatchScore(comicVine, issue['id'], image_url, thumb_url, page_url, hash_list, useRemoteAlternates = False)
except:
self.match_list = []
return self.match_list
match = dict()
match['series'] = u"{0} ({1})".format(series['name'], series['start_year'])
match['distance'] = score_item['score']
match['issue_number'] = keys['issue_number']
match['cv_issue_count'] = series['count_of_issues']
match['url_image_hash'] = score_item['hash']
match['issue_title'] = issue['name']
match['issue_id'] = issue['id']
match['volume_id'] = series['id']
match['month'] = month
match['year'] = year
match['publisher'] = None
if series['publisher'] is not None:
match['publisher'] = series['publisher']['name']
match['image_url'] = image_url
match['thumb_url'] = thumb_url
match['page_url'] = page_url
match['description'] = issue['description']
self.match_list.append(match)
self.log_msg(" --> {0}".format(match['distance']), newline=False)
self.log_msg("")
if len(self.match_list) == 0:
self.log_msg(":-(no matches!")
self.search_result = self.ResultNoMatches
return self.match_list
# sort list by image match scores
self.match_list.sort(key=lambda k: k['distance'])
l = []
for i in self.match_list:
l.append(i['distance'])
self.log_msg("Compared to covers in {0} issue(s):".format(len(self.match_list)), newline=False)
self.log_msg(str(l))
def print_match(item):
self.log_msg(u"-----> {0} #{1} {2} ({3}/{4}) -- score: {5}".format(
item['series'],
item['issue_number'],
item['issue_title'],
item['month'],
item['year'],
item['distance']))
best_score = self.match_list[0]['distance']
if best_score >= self.min_score_thresh:
# we have 1 or more low-confidence matches (all bad cover scores)
# look at a few more pages in the archive, and also alternate covers online
self.log_msg("Very weak scores for the cover. Analyzing alternate pages and covers...")
hash_list = [ cover_hash ]
if narrow_cover_hash is not None:
hash_list.append(narrow_cover_hash)
for i in range(1, min(3, ca.getNumberOfPages())):
image_data = ca.getPage(i)
page_hash = self.calculateHash(image_data)
hash_list.append(page_hash)
second_match_list = []
counter = 2*len(self.match_list)
for m in self.match_list:
if self.callback is not None:
self.callback(counter, len(self.match_list)*3)
counter += 1
self.log_msg(u"Examining alternate covers for ID: {0} {1} ...".format(
m['volume_id'],
m['series']), newline=False)
try:
score_item = self.getIssueCoverMatchScore(comicVine, m['issue_id'], m['image_url'], m['thumb_url'], m['page_url'], hash_list, useRemoteAlternates = True)
except:
self.match_list = []
return self.match_list
self.log_msg("--->{0}".format(score_item['score']))
self.log_msg("")
if score_item['score'] < self.min_alternate_score_thresh:
second_match_list.append(m)
m['distance'] = score_item['score']
if len(second_match_list) == 0:
if len(self.match_list) == 1:
self.log_msg("No matching pages in the issue.")
self.log_msg(u"--------------------------------------------------")
print_match(self.match_list[0])
self.log_msg(u"--------------------------------------------------")
self.search_result = self.ResultFoundMatchButBadCoverScore
else:
self.log_msg(u"--------------------------------------------------")
self.log_msg(u"Multiple bad cover matches! Need to use other info...")
self.log_msg(u"--------------------------------------------------")
self.search_result = self.ResultMultipleMatchesWithBadImageScores
return self.match_list
else:
# We did good, found something!
self.log_msg("Success in secondary/alternate cover matching!")
self.match_list = second_match_list
# sort new list by image match scores
self.match_list.sort(key=lambda k: k['distance'])
best_score = self.match_list[0]['distance']
self.log_msg("[Second round cover matching: best score = {0}]".format(best_score))
# now drop down into the rest of the processing
if self.callback is not None:
self.callback(99, 100)
#now pare down list, remove any item more than specified distant from the top scores
for item in reversed(self.match_list):
if item['distance'] > best_score + self.min_score_distance:
self.match_list.remove(item)
# One more test for the case choosing limited series first issue vs a trade with the same cover:
# if we have a given issue count > 1 and the volume from CV has count==1, remove it from match list
if len(self.match_list) >= 2 and keys['issue_count'] is not None and keys['issue_count'] != 1:
new_list = list()
for match in self.match_list:
if match['cv_issue_count'] != 1:
new_list.append(match)
else:
self.log_msg("Removing volume {0} [{1}] from consideration (only 1 issue)".format(match['series'], match['volume_id']))
if len(new_list) > 0:
self.match_list = new_list
if len(self.match_list) == 1:
self.log_msg(u"--------------------------------------------------")
print_match(self.match_list[0])
self.log_msg(u"--------------------------------------------------")
self.search_result = self.ResultOneGoodMatch
elif len(self.match_list) == 0:
self.log_msg(u"--------------------------------------------------")
self.log_msg("No matches found :(")
self.log_msg(u"--------------------------------------------------")
self.search_result = self.ResultNoMatches
else:
# we've got multiple good matches:
self.log_msg("More than one likley candiate.")
self.search_result = self.ResultMultipleGoodMatches
self.log_msg(u"--------------------------------------------------")
for item in self.match_list:
print_match(item)
self.log_msg(u"--------------------------------------------------")
return self.match_list