"""
A python class to automatically identify a comic archive
"""

"""
Copyright 2012 Anthony Beville

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import sys
import math
import urllib2
import urllib
import StringIO

try:
    import Image
    pil_available = True
except ImportError:
    pil_available = False

from settings import ComicTaggerSettings
from comicvinecacher import ComicVineCacher
from genericmetadata import GenericMetadata
from comicvinetalker import ComicVineTalker, ComicVineTalkerException
from imagehasher import ImageHasher
from imagefetcher import ImageFetcher, ImageFetcherException
from issuestring import IssueString

import utils


class IssueIdentifierNetworkError(Exception):
    """Raised when a cover image cannot be fetched while scoring covers."""
    pass


class IssueIdentifierCancelled(Exception):
    """Raised when the `cancel` flag is found set while scoring covers."""
    pass


class IssueIdentifier:
    """Identify a comic archive by series/issue search plus cover hashing.

    `search()` looks up candidate series, narrows them by name length,
    publisher, start year and issue number, then compares image hashes of
    the archive's cover (and, if needed, further pages and alternate
    covers) against the remote cover images.  The outcome is stored in
    `search_result` as one of the `Result*` constants below and the
    surviving candidates are returned as a list of match dicts.
    """

    # Values stored in `search_result` by search().
    ResultNoMatches = 0
    ResultFoundMatchButBadCoverScore = 1
    # NOTE(review): nothing in this file assigns the following value.
    ResultFoundMatchButNotFirstPage = 2
    ResultMultipleMatchesWithBadImageScores = 3
    ResultOneGoodMatch = 4
    ResultMultipleGoodMatches = 5

    def __init__(self, comic_archive, settings):
        """Set up default thresholds and read search limits from `settings`.

        comic_archive -- the archive object to identify.
        settings      -- object providing `id_length_delta_thresh` and
                         `id_publisher_blacklist` (a comma separated string).
        """
        self.comic_archive = comic_archive
        # Anything other than the strings '2' or '3' selects the plain
        # average hash in calculateHash().
        self.image_hasher = 1

        self.onlyUseAdditionalMetaData = False

        # a decent hamming score, good enough to call it a match
        self.min_score_thresh = 16

        # for alternate covers, be more stringent, since we're a bit more
        # scattershot in comparisons
        self.min_alternate_score_thresh = 14

        # the min distance a hamming score must be to separate itself from
        # closest neighbor
        self.min_score_distance = 4

        # a very strong hamming score, almost certainly the same image
        self.strong_score_thresh = 8

        # used to eliminate series names that are too long based on our
        # search string
        self.length_delta_thresh = settings.id_length_delta_thresh

        # used to eliminate unlikely publishers
        self.publisher_blacklist = [
            s.strip().lower()
            for s in settings.id_publisher_blacklist.split(',')
        ]

        self.additional_metadata = GenericMetadata()
        self.output_function = IssueIdentifier.defaultWriteOutput
        self.callback = None
        self.coverUrlCallback = None
        self.search_result = self.ResultNoMatches
        self.cover_page_index = 0

        # Initialised here (and reset by search()) so that
        # getIssueCoverMatchScore() can be called before search() without
        # hitting an AttributeError.
        self.cancel = False
        self.match_list = []

    def setScoreMinThreshold(self, thresh):
        """Set the hamming score at or above which a cover match is weak."""
        self.min_score_thresh = thresh

    def setScoreMinDistance(self, distance):
        """Set how far above the best score a match may be and still be kept."""
        self.min_score_distance = distance

    def setAdditionalMetadata(self, md):
        """Set metadata that takes priority over archive/filename metadata."""
        self.additional_metadata = md

    def setNameLengthDeltaThreshold(self, delta):
        """Set how much longer than the search name a series name may be."""
        self.length_delta_thresh = delta

    def setPublisherBlackList(self, blacklist):
        """Replace the publisher blacklist.

        Entries are compared against lower-cased publisher names, so they
        should be lower case.
        """
        self.publisher_blacklist = blacklist

    def setHasherAlgorithm(self, algo):
        """Select the hash algorithm; see calculateHash() for the values."""
        self.image_hasher = algo

    def setOutputFunction(self, func):
        """Set the callable that receives all log text."""
        self.output_function = func

    def calculateHash(self, image_data):
        """Return the image hash of `image_data` using the selected algorithm.

        '3' selects the DCT average hash, '2' selects average_hash2, and any
        other value selects the plain average hash.
        """
        if self.image_hasher == '3':
            return ImageHasher(data=image_data).dct_average_hash()
        elif self.image_hasher == '2':
            return ImageHasher(data=image_data).average_hash2()
        else:
            return ImageHasher(data=image_data).average_hash()

    def getAspectRatio(self, image_data):
        """Return height/width of the image, or 1.5 if it cannot be read."""
        try:
            im = Image.open(StringIO.StringIO(image_data))
            w, h = im.size
            return float(h) / float(w)
        except Exception:
            # Best effort: fall back to a portrait ratio so the caller does
            # not treat an unreadable image as a two page spread.
            return 1.5

    def cropCover(self, image_data):
        """Return JPEG data for the right half of the image.

        Returns None if the crop itself fails.
        """
        im = Image.open(StringIO.StringIO(image_data))
        w, h = im.size

        try:
            cropped_im = im.crop((int(w / 2), 0, w, h))
        except Exception as e:
            sys.exc_clear()
            # Report through the configured output function instead of
            # writing straight to stdout.
            self.log_msg("cropCover() error: {0}".format(e))
            return None

        output = StringIO.StringIO()
        cropped_im.save(output, format="JPEG")
        cropped_image_data = output.getvalue()
        output.close()

        return cropped_image_data

    def setProgressCallback(self, cb_func):
        """Set the callable invoked as cb_func(current, total) by search()."""
        self.callback = cb_func

    def setCoverURLCallback(self, cb_func):
        """Set the callable that receives each fetched remote cover's data."""
        self.coverUrlCallback = cb_func

    def getSearchKeys(self):
        """Return a dict with 'series', 'issue_number', 'month' and 'year'.

        Each value is taken from the first source that has it, in the order
        additional metadata, metadata stored in the archive, metadata parsed
        from the filename.  When `onlyUseAdditionalMetaData` is set only the
        additional metadata is used.  Without an archive every value is None.
        """
        ca = self.comic_archive
        search_keys = dict()
        search_keys['series'] = None
        search_keys['issue_number'] = None
        search_keys['month'] = None
        search_keys['year'] = None

        if ca is None:
            # Return the empty key set rather than None so callers can
            # always index the result.
            return search_keys

        if self.onlyUseAdditionalMetaData:
            search_keys['series'] = self.additional_metadata.series
            search_keys['issue_number'] = self.additional_metadata.issue
            search_keys['year'] = self.additional_metadata.year
            search_keys['month'] = self.additional_metadata.month
            return search_keys

        # see if the archive has any useful meta data for searching with
        if ca.hasCIX():
            internal_metadata = ca.readCIX()
        elif ca.hasCBI():
            internal_metadata = ca.readCBI()
        else:
            internal_metadata = ca.readCBI()

        # try to get some metadata from filename
        md_from_filename = ca.metadataFromFilename()

        # preference order:
        #   1. Additional metadata
        #   2. Internal metadata
        #   3. Filename metadata
        field_map = (
            ('series', 'series'),
            ('issue_number', 'issue'),
            ('year', 'year'),
            ('month', 'month'),
        )
        for key, attr in field_map:
            for md in (self.additional_metadata, internal_metadata):
                value = getattr(md, attr)
                if value is not None:
                    break
            else:
                # Neither preferred source had it; take the filename's value
                # even if that is None as well.
                value = getattr(md_from_filename, attr)
            search_keys[key] = value

        return search_keys

    @staticmethod
    def defaultWriteOutput(text):
        """Default output function: write `text` to stdout and flush."""
        sys.stdout.write(text)
        sys.stdout.flush()

    def log_msg(self, msg, newline=True):
        """Send `msg`, optionally followed by a newline, to the output function."""
        self.output_function(msg)
        if newline:
            self.output_function("\n")

    def _fetchRemoteCoverItem(self, item_url, fetch_url, error_msg):
        """Fetch one remote cover and return {'url': item_url, 'hash': ...}.

        Logs `error_msg` and raises IssueIdentifierNetworkError if the fetch
        fails; raises IssueIdentifierCancelled if `cancel` is set after the
        fetch or after hashing.
        """
        try:
            image_data = ImageFetcher().fetch(fetch_url, blocking=True)
        except ImageFetcherException:
            self.log_msg(error_msg)
            raise IssueIdentifierNetworkError

        if self.cancel:
            raise IssueIdentifierCancelled

        # alert the GUI, if needed
        if self.coverUrlCallback is not None:
            self.coverUrlCallback(image_data)

        item = dict()
        item['url'] = item_url
        item['hash'] = self.calculateHash(image_data)

        if self.cancel:
            raise IssueIdentifierCancelled

        return item

    def getIssueCoverMatchScore(self, comicVine, issue_id, localCoverHashList,
                                useRemoteAlternates=False, use_log=True):
        """Score the local hashes against an issue's remote cover(s).

        comicVine           -- talker used to look up the cover URLs.
        issue_id            -- id of the remote issue to compare against.
        localCoverHashList  -- list of pre-calculated local image hashes.
        useRemoteAlternates -- also compare against the alternate covers.
        use_log             -- write the individual scores to the log.

        Returns a dict with 'score' (hamming distance, lower is better),
        'url' and 'hash' for the best pairing found.  Comparison stops at
        the first score at or below `strong_score_thresh`.

        Raises IssueIdentifierNetworkError or IssueIdentifierCancelled, and
        ValueError if `localCoverHashList` is empty (nothing to score).
        """
        # first get the primary cover image; the thumbnail is what gets
        # hashed, the full size URL is what gets reported
        primary_img_url, primary_thumb_url = comicVine.fetchIssueCoverURLs(issue_id)

        remote_cover_list = [
            self._fetchRemoteCoverItem(
                primary_img_url,
                primary_thumb_url,
                "Network issue while fetching cover image from ComicVine. Aborting..."),
        ]

        if useRemoteAlternates:
            alt_img_url_list = comicVine.fetchAlternateCoverURLs(issue_id)
            for alt_url in alt_img_url_list:
                remote_cover_list.append(
                    self._fetchRemoteCoverItem(
                        alt_url,
                        alt_url,
                        "Network issue while fetching alt. cover image from ComicVine. Aborting..."))

        if use_log and useRemoteAlternates:
            self.log_msg("[{0} alt. covers]".format(len(remote_cover_list) - 1), False)
        if use_log:
            self.log_msg("[ ", False)

        score_list = []
        done = False
        for local_cover_hash in localCoverHashList:
            for remote_cover_item in remote_cover_list:
                score = ImageHasher.hamming_distance(local_cover_hash, remote_cover_item['hash'])
                score_item = dict()
                score_item['score'] = score
                score_item['url'] = remote_cover_item['url']
                score_item['hash'] = remote_cover_item['hash']
                score_list.append(score_item)
                if use_log:
                    self.log_msg("{0} ".format(score), False)

                if score <= self.strong_score_thresh:
                    # such a good score, we can quit now, since for sure we
                    # have a winner
                    done = True
                    break
            if done:
                break

        if use_log:
            self.log_msg(" ]", False)

        best_score_item = min(score_list, key=lambda x: x['score'])

        return best_score_item

    # NOTE: disabled draft kept from the original source; it refers to
    # names (getIssueMatchScore, hash_list) that do not exist in this file.
    #
    # def validate( self, issue_id ):
    #     # create hash list
    #     score = self.getIssueMatchScore( issue_id, hash_list, useRemoteAlternates = True )
    #     if score < 20:
    #         return True
    #     else:
    #         return False

    def search(self):
        """Search for the issue matching the archive and return the matches.

        Returns a list of match dicts (keys: 'series', 'distance',
        'issue_number', 'url_image_hash', 'issue_title', 'img_url',
        'issue_id', 'volume_id', 'month', 'year', 'publisher') sorted by
        'distance', and sets `search_result` to one of the `Result*`
        constants.  An empty list is returned when prerequisites are
        missing, on network errors, on cancellation, or when nothing matches.
        """
        ca = self.comic_archive
        self.match_list = []
        self.cancel = False
        self.search_result = self.ResultNoMatches

        if not pil_available:
            self.log_msg("Python Imaging Library (PIL) is not available and is needed for issue identification.")
            return self.match_list

        if not ca.seemsToBeAComicArchive():
            # NOTE(review): assumes the archive object exposes its filename
            # as `.path` -- confirm against the archive class.  The previous
            # code referenced an undefined name here and raised NameError.
            archive_name = getattr(ca, 'path', None)
            if not isinstance(archive_name, basestring):
                archive_name = "this file"
            self.log_msg("Sorry, but " + archive_name + " is not a comic archive!")
            return self.match_list

        cover_image_data = ca.getPage(self.cover_page_index)
        cover_hash = self.calculateHash(cover_image_data)

        # check the aspect ratio
        # if it's wider than it is high, it's probably a two page spread
        # if so, crop it and calculate a second hash
        narrow_cover_hash = None
        aspect_ratio = self.getAspectRatio(cover_image_data)
        if aspect_ratio < 1.0:
            right_side_image_data = self.cropCover(cover_image_data)
            if right_side_image_data is not None:
                narrow_cover_hash = self.calculateHash(right_side_image_data)
                self.log_msg(unicode(str(narrow_cover_hash)))

        #self.log_msg( "Cover hash = {0:016x}".format(cover_hash) )

        keys = self.getSearchKeys()

        # we need, at minimum, a series and issue number
        if keys['series'] is None or keys['issue_number'] is None:
            self.log_msg("Not enough info for a search!")
            return []

        self.log_msg("Going to search for:")
        self.log_msg("\tSeries: " + keys['series'])
        self.log_msg("\tIssue : " + keys['issue_number'])
        if keys['year'] is not None:
            self.log_msg("\tYear : " + str(keys['year']))
        if keys['month'] is not None:
            self.log_msg("\tMonth : " + str(keys['month']))

        #self.log_msg("Publisher Blacklist: " + str(self.publisher_blacklist))

        comicVine = ComicVineTalker()
        comicVine.setLogFunc(self.output_function)

        #self.log_msg( ( "Searching for " + keys['series'] + "...")
        self.log_msg(u"Searching for {0} #{1} ...".format(keys['series'], keys['issue_number']))
        try:
            cv_search_results = comicVine.searchForSeries(keys['series'])
        except ComicVineTalkerException:
            self.log_msg("Network issue while searching for series. Aborting...")
            return []

        #self.log_msg( "Found " + str(len(cv_search_results)) + " initial results" )
        if self.cancel:
            return []

        series_second_round_list = []

        #self.log_msg( "Removing results with too long names, banned publishers, or future start dates" )
        for item in cv_search_results:
            length_approved = False
            publisher_approved = True
            date_approved = True

            # remove any series that starts after the issue year
            if keys['year'] is not None and str(keys['year']).isdigit():
                if int(keys['year']) < item['start_year']:
                    date_approved = False

            # assume that our search name is close to the actual name, say
            # within, e.g. 5 chars
            shortened_key = utils.removearticles(keys['series'])
            shortened_item_name = utils.removearticles(item['name'])
            if len(shortened_item_name) < (len(shortened_key) + self.length_delta_thresh):
                length_approved = True

            # remove any series from publishers on the blacklist
            if item['publisher'] is not None:
                publisher = item['publisher']['name']
                if publisher is not None and publisher.lower() in self.publisher_blacklist:
                    publisher_approved = False

            if length_approved and publisher_approved and date_approved:
                series_second_round_list.append(item)

        # if we don't think it's an issue number 1, remove any series' that
        # are one-shots
        if keys['issue_number'] not in ['1', '0']:
            #self.log_msg( "Removing one-shots" )
            series_second_round_list[:] = [
                x for x in series_second_round_list
                if not x['count_of_issues'] == 1
            ]

        self.log_msg("Searching in " + str(len(series_second_round_list)) + " series")

        if self.callback is not None:
            self.callback(0, len(series_second_round_list))

        # now sort the list by name length
        series_second_round_list.sort(key=lambda x: len(x['name']), reverse=False)

        # Now we've got a list of series that we can dig into look for
        # matching issue number
        counter = 0
        shortlist = []
        for series in series_second_round_list:
            if self.callback is not None:
                # progress is reported out of three phases of equal weight
                self.callback(counter, len(series_second_round_list) * 3)
                counter += 1

            self.log_msg(u"Fetching info for ID: {0} {1} ({2}) ...".format(
                series['id'],
                series['name'],
                series['start_year']), newline=True)

            try:
                cv_series_results = comicVine.fetchVolumeData(series['id'])
            except ComicVineTalkerException:
                self.log_msg("Network issue while searching for series details. Aborting...")
                return []

            issue_list = cv_series_results['issues']
            for issue in issue_list:
                num_s = IssueString(issue['issue_number']).asString()

                # look for a matching issue number
                if num_s == keys['issue_number']:
                    # now, if we have an issue year key given, reject this
                    # one if not a match
                    month, year = comicVine.fetchIssueDate(issue['id'])
                    if keys['year'] is not None:
                        if unicode(keys['year']) != unicode(year):
                            # stop looking at this series altogether
                            break

                    # found a matching issue number! add it to short list
                    shortlist.append((series, cv_series_results, issue))

        if keys['year'] is None:
            self.log_msg("Found {0} series that have an issue #{1}".format(
                len(shortlist), keys['issue_number']))
        else:
            self.log_msg("Found {0} series that have an issue #{1} from {2}".format(
                len(shortlist), keys['issue_number'], keys['year']))

        # now we have a shortlist of volumes with the desired issue number
        # Do first round of cover matching
        first_round_hashes = [cover_hash]
        if narrow_cover_hash is not None:
            first_round_hashes.append(narrow_cover_hash)

        counter = len(shortlist)
        for series, cv_series_results, issue in shortlist:
            if self.callback is not None:
                self.callback(counter, len(shortlist) * 3)
                counter += 1

            self.log_msg(u"Examining covers for ID: {0} {1} ({2}) ...".format(
                series['id'],
                series['name'],
                series['start_year']), newline=False)

            # fetch the issue date so it can be recorded with the match
            month, year = comicVine.fetchIssueDate(issue['id'])

            # Now check the cover match against the primary image
            try:
                score_item = self.getIssueCoverMatchScore(
                    comicVine, issue['id'], first_round_hashes,
                    useRemoteAlternates=False)
            except Exception:
                # network failure or cancellation: give up with no matches
                self.match_list = []
                return self.match_list

            match = dict()
            match['series'] = u"{0} ({1})".format(series['name'], series['start_year'])
            match['distance'] = score_item['score']
            match['issue_number'] = keys['issue_number']
            match['url_image_hash'] = score_item['hash']
            match['issue_title'] = issue['name']
            match['img_url'] = score_item['url']
            match['issue_id'] = issue['id']
            match['volume_id'] = series['id']
            match['month'] = month
            match['year'] = year
            match['publisher'] = None
            if series['publisher'] is not None:
                match['publisher'] = series['publisher']['name']

            self.match_list.append(match)

            self.log_msg(" --> {0}".format(match['distance']), newline=False)
            self.log_msg("")

        if len(self.match_list) == 0:
            self.log_msg(":-( no matches!")
            self.search_result = self.ResultNoMatches
            return self.match_list

        # sort list by image match scores
        self.match_list.sort(key=lambda k: k['distance'])

        distances = [m['distance'] for m in self.match_list]

        self.log_msg("Compared to covers in {0} issue(s):".format(len(self.match_list)), newline=False)
        self.log_msg(str(distances))

        def print_match(item):
            self.log_msg(u"-----> {0} #{1} {2} ({3}/{4}) -- score: {5}".format(
                item['series'],
                item['issue_number'],
                item['issue_title'],
                item['month'],
                item['year'],
                item['distance']))

        best_score = self.match_list[0]['distance']

        if best_score >= self.min_score_thresh:
            # we have 1 or more low-confidence matches (all bad cover scores)
            # look at a few more pages in the archive, and also alternate
            # covers online
            self.log_msg("Very weak scores for the cover. Analyzing alternate pages and covers...")
            hash_list = [cover_hash]
            if narrow_cover_hash is not None:
                hash_list.append(narrow_cover_hash)
            for i in range(1, min(3, ca.getNumberOfPages())):
                image_data = ca.getPage(i)
                page_hash = self.calculateHash(image_data)
                hash_list.append(page_hash)

            second_match_list = []
            counter = 2 * len(self.match_list)
            for m in self.match_list:
                if self.callback is not None:
                    self.callback(counter, len(self.match_list) * 3)
                    counter += 1
                self.log_msg(u"Examining alternate covers for ID: {0} {1} ...".format(
                    m['volume_id'],
                    m['series']), newline=False)
                try:
                    score_item = self.getIssueCoverMatchScore(
                        comicVine, m['issue_id'], hash_list,
                        useRemoteAlternates=True)
                except Exception:
                    # network failure or cancellation: give up with no matches
                    self.match_list = []
                    return self.match_list
                self.log_msg("--->{0}".format(score_item['score']))
                self.log_msg("")

                if score_item['score'] < self.min_alternate_score_thresh:
                    second_match_list.append(m)
                    m['distance'] = score_item['score']

            if len(second_match_list) == 0:
                if len(self.match_list) == 1:
                    self.log_msg("No matching pages in the issue.")
                    self.log_msg(u"--------------------------------------------------")
                    print_match(self.match_list[0])
                    self.log_msg(u"--------------------------------------------------")
                    self.search_result = self.ResultFoundMatchButBadCoverScore
                else:
                    self.log_msg(u"--------------------------------------------------")
                    self.log_msg(u"Multiple bad cover matches! Need to use other info...")
                    self.log_msg(u"--------------------------------------------------")
                    self.search_result = self.ResultMultipleMatchesWithBadImageScores
                return self.match_list
            else:
                # We did good, found something!
                self.log_msg("Success in secondary/alternate cover matching!")

                self.match_list = second_match_list
                # sort new list by image match scores
                self.match_list.sort(key=lambda k: k['distance'])
                best_score = self.match_list[0]['distance']
                self.log_msg("[Second round cover matching: best score = {0}]".format(best_score))
                # now drop down into the rest of the processing

        if self.callback is not None:
            self.callback(99, 100)

        # now pare down list, remove any item more than specified distant
        # from the top scores (filtered in place instead of removing
        # elements from the list while iterating over it)
        cutoff = best_score + self.min_score_distance
        self.match_list[:] = [m for m in self.match_list if m['distance'] <= cutoff]

        if len(self.match_list) == 1:
            self.log_msg(u"--------------------------------------------------")
            print_match(self.match_list[0])
            self.log_msg(u"--------------------------------------------------")
            self.search_result = self.ResultOneGoodMatch

        elif len(self.match_list) == 0:
            self.log_msg(u"--------------------------------------------------")
            self.log_msg("No matches found :(")
            self.log_msg(u"--------------------------------------------------")
            self.search_result = self.ResultNoMatches
        else:
            # blank line, sent through the configured output function
            self.log_msg("")
            self.log_msg("More than one likley candiate.")
            self.search_result = self.ResultMultipleGoodMatches
            self.log_msg(u"--------------------------------------------------")
            for item in self.match_list:
                print_match(item)
            self.log_msg(u"--------------------------------------------------")

        return self.match_list