diff --git a/comictaggerlib/issueidentifier.py b/comictaggerlib/issueidentifier.py index eeea830..e923fc3 100644 --- a/comictaggerlib/issueidentifier.py +++ b/comictaggerlib/issueidentifier.py @@ -1,661 +1,661 @@ -""" -A python class to automatically identify a comic archive -""" - -""" -Copyright 2012 Anthony Beville - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -import sys -import math -import urllib2, urllib -import StringIO -try: - import Image - pil_available = True -except ImportError: - pil_available = False - -from settings import ComicTaggerSettings -from comicvinecacher import ComicVineCacher -from genericmetadata import GenericMetadata -from comicvinetalker import ComicVineTalker, ComicVineTalkerException -from imagehasher import ImageHasher -from imagefetcher import ImageFetcher, ImageFetcherException -from issuestring import IssueString - -import utils - -class IssueIdentifierNetworkError(Exception): - pass -class IssueIdentifierCancelled(Exception): - pass - -class IssueIdentifier: - - ResultNoMatches = 0 - ResultFoundMatchButBadCoverScore = 1 - ResultFoundMatchButNotFirstPage = 2 - ResultMultipleMatchesWithBadImageScores = 3 - ResultOneGoodMatch = 4 - ResultMultipleGoodMatches = 5 - - def __init__(self, comic_archive, settings ): - self.comic_archive = comic_archive - self.image_hasher = 1 - - self.onlyUseAdditionalMetaData = False - - # a decent hamming score, good enough to call it a match - self.min_score_thresh = 16 - - # for alternate covers, be more 
stringent, since we're a bit more scattershot in comparisons - self.min_alternate_score_thresh = 12 - - # the min distance a hamming score must be to separate itself from closest neighbor - self.min_score_distance = 4 - - # a very strong hamming score, almost certainly the same image - self.strong_score_thresh = 8 - - # used to eliminate series names that are too long based on our search string - self.length_delta_thresh = settings.id_length_delta_thresh - - # used to eliminate unlikely publishers - self.publisher_blacklist = [ s.strip().lower() for s in settings.id_publisher_blacklist.split(',') ] - - self.additional_metadata = GenericMetadata() - self.output_function = IssueIdentifier.defaultWriteOutput - self.callback = None - self.coverUrlCallback = None - self.search_result = self.ResultNoMatches - self.cover_page_index = 0 - self.cancel = False - - def setScoreMinThreshold( self, thresh ): - self.min_score_thresh = thresh - - def setScoreMinDistance( self, distance ): - self.min_score_distance = distance - - def setAdditionalMetadata( self, md ): - self.additional_metadata = md - - def setNameLengthDeltaThreshold( self, delta ): - self.length_delta_thresh = delta - - def setPublisherBlackList( self, blacklist ): - self.publisher_blacklist = blacklist - - def setHasherAlgorithm( self, algo ): - self.image_hasher = algo - pass - - def setOutputFunction( self, func ): - self.output_function = func - pass - - def calculateHash( self, image_data ): - if self.image_hasher == '3': - return ImageHasher( data=image_data ).dct_average_hash() - elif self.image_hasher == '2': - return ImageHasher( data=image_data ).average_hash2() - else: - return ImageHasher( data=image_data ).average_hash() - - def getAspectRatio( self, image_data ): - try: - im = Image.open(StringIO.StringIO(image_data)) - w,h = im.size - return float(h)/float(w) - except: - return 1.5 - - def cropCover( self, image_data ): - - im = Image.open(StringIO.StringIO(image_data)) - w,h = im.size - - try: - 
cropped_im = im.crop( (int(w/2), 0, w, h) ) - except Exception as e: - sys.exc_clear() - print "cropCover() error:", e - return None - - output = StringIO.StringIO() - cropped_im.save(output, format="PNG") - cropped_image_data = output.getvalue() - output.close() - - return cropped_image_data - - - def setProgressCallback( self, cb_func ): - self.callback = cb_func - - def setCoverURLCallback( self, cb_func ): - self.coverUrlCallback = cb_func - - def getSearchKeys( self ): - - ca = self.comic_archive - search_keys = dict() - search_keys['series'] = None - search_keys['issue_number'] = None - search_keys['month'] = None - search_keys['year'] = None - - if ca is None: - return - - if self.onlyUseAdditionalMetaData: - search_keys['series'] = self.additional_metadata.series - search_keys['issue_number'] = self.additional_metadata.issue - search_keys['year'] = self.additional_metadata.year - search_keys['month'] = self.additional_metadata.month - return search_keys - - # see if the archive has any useful meta data for searching with - if ca.hasCIX(): - internal_metadata = ca.readCIX() - elif ca.hasCBI(): - internal_metadata = ca.readCBI() - else: - internal_metadata = ca.readCBI() - - # try to get some metadata from filename - md_from_filename = ca.metadataFromFilename() - - # preference order: - #1. Additional metadata - #1. Internal metadata - #1. 
Filename metadata - - if self.additional_metadata.series is not None: - search_keys['series'] = self.additional_metadata.series - elif internal_metadata.series is not None: - search_keys['series'] = internal_metadata.series - else: - search_keys['series'] = md_from_filename.series - - if self.additional_metadata.issue is not None: - search_keys['issue_number'] = self.additional_metadata.issue - elif internal_metadata.issue is not None: - search_keys['issue_number'] = internal_metadata.issue - else: - search_keys['issue_number'] = md_from_filename.issue - - if self.additional_metadata.year is not None: - search_keys['year'] = self.additional_metadata.year - elif internal_metadata.year is not None: - search_keys['year'] = internal_metadata.year - else: - search_keys['year'] = md_from_filename.year - - if self.additional_metadata.month is not None: - search_keys['month'] = self.additional_metadata.month - elif internal_metadata.month is not None: - search_keys['month'] = internal_metadata.month - else: - search_keys['month'] = md_from_filename.month - - return search_keys - - @staticmethod - def defaultWriteOutput( text ): - sys.stdout.write( text ) - sys.stdout.flush() - - def log_msg( self, msg , newline=True ): - self.output_function(msg) - if newline: - self.output_function("\n") - - def getIssueCoverMatchScore( self, comicVine, issue_id, primary_img_url, primary_thumb_url, page_url, localCoverHashList, useRemoteAlternates = False , useLog=True): - # localHashes is a list of pre-calculated hashs. - # useRemoteAlternates - indicates to use alternate covers from CV - - try: - url_image_data = ImageFetcher().fetch(primary_thumb_url, blocking=True) - except ImageFetcherException: - self.log_msg( "Network issue while fetching cover image from ComicVine. 
Aborting...") - raise IssueIdentifierNetworkError - - if self.cancel == True: - raise IssueIdentifierCancelled - - # alert the GUI, if needed - if self.coverUrlCallback is not None: - self.coverUrlCallback( url_image_data ) - - remote_cover_list = [] - item = dict() - item['url'] = primary_img_url - - item['hash'] = self.calculateHash( url_image_data ) - remote_cover_list.append( item ) - - if self.cancel == True: - raise IssueIdentifierCancelled - - if useRemoteAlternates: - alt_img_url_list = comicVine.fetchAlternateCoverURLs( issue_id, page_url ) - for alt_url in alt_img_url_list: - try: - alt_url_image_data = ImageFetcher().fetch(alt_url, blocking=True) - except ImageFetcherException: - self.log_msg( "Network issue while fetching alt. cover image from ComicVine. Aborting...") - raise IssueIdentifierNetworkError - - if self.cancel == True: - raise IssueIdentifierCancelled - - # alert the GUI, if needed - if self.coverUrlCallback is not None: - self.coverUrlCallback( alt_url_image_data ) - - item = dict() - item['url'] = alt_url - item['hash'] = self.calculateHash( alt_url_image_data ) - remote_cover_list.append( item ) - - if self.cancel == True: - raise IssueIdentifierCancelled - - if useLog and useRemoteAlternates: - self.log_msg( "[{0} alt. 
covers]".format(len(remote_cover_list)-1), False ) - if useLog: - self.log_msg( "[ ", False ) - - score_list = [] - done = False - for local_cover_hash in localCoverHashList: - for remote_cover_item in remote_cover_list: - score = ImageHasher.hamming_distance(local_cover_hash, remote_cover_item['hash'] ) - score_item = dict() - score_item['score'] = score - score_item['url'] = remote_cover_item['url'] - score_item['hash'] = remote_cover_item['hash'] - score_list.append( score_item ) - if useLog: - self.log_msg( "{0} ".format(score), False ) - - if score <= self.strong_score_thresh: - # such a good score, we can quit now, since for sure we have a winner - done = True - break - if done: - break - - if useLog: - self.log_msg( " ]", False ) - - best_score_item = min(score_list, key=lambda x:x['score']) - - return best_score_item - - """ - def validate( self, issue_id ): - # create hash list - score = self.getIssueMatchScore( issue_id, hash_list, useRemoteAlternates = True ) - if score < 20: - return True - else: - return False - """ - +""" +A python class to automatically identify a comic archive +""" + +""" +Copyright 2012 Anthony Beville + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import sys +import math +import urllib2, urllib +import StringIO +try: + import Image + pil_available = True +except ImportError: + pil_available = False + +from settings import ComicTaggerSettings +from comicvinecacher import ComicVineCacher +from genericmetadata import GenericMetadata +from comicvinetalker import ComicVineTalker, ComicVineTalkerException +from imagehasher import ImageHasher +from imagefetcher import ImageFetcher, ImageFetcherException +from issuestring import IssueString + +import utils + +class IssueIdentifierNetworkError(Exception): + pass +class IssueIdentifierCancelled(Exception): + pass + +class IssueIdentifier: + + ResultNoMatches = 0 + ResultFoundMatchButBadCoverScore = 1 + ResultFoundMatchButNotFirstPage = 2 + ResultMultipleMatchesWithBadImageScores = 3 + ResultOneGoodMatch = 4 + ResultMultipleGoodMatches = 5 + + def __init__(self, comic_archive, settings ): + self.comic_archive = comic_archive + self.image_hasher = 1 + + self.onlyUseAdditionalMetaData = False + + # a decent hamming score, good enough to call it a match + self.min_score_thresh = 16 + + # for alternate covers, be more stringent, since we're a bit more scattershot in comparisons + self.min_alternate_score_thresh = 12 + + # the min distance a hamming score must be to separate itself from closest neighbor + self.min_score_distance = 4 + + # a very strong hamming score, almost certainly the same image + self.strong_score_thresh = 8 + + # used to eliminate series names that are too long based on our search string + self.length_delta_thresh = settings.id_length_delta_thresh + + # used to eliminate unlikely publishers + self.publisher_blacklist = [ s.strip().lower() for s in settings.id_publisher_blacklist.split(',') ] + + self.additional_metadata = GenericMetadata() + self.output_function = IssueIdentifier.defaultWriteOutput + self.callback = None + self.coverUrlCallback = None + self.search_result = self.ResultNoMatches + self.cover_page_index = 0 + self.cancel = False 
+ + def setScoreMinThreshold( self, thresh ): + self.min_score_thresh = thresh + + def setScoreMinDistance( self, distance ): + self.min_score_distance = distance + + def setAdditionalMetadata( self, md ): + self.additional_metadata = md + + def setNameLengthDeltaThreshold( self, delta ): + self.length_delta_thresh = delta + + def setPublisherBlackList( self, blacklist ): + self.publisher_blacklist = blacklist + + def setHasherAlgorithm( self, algo ): + self.image_hasher = algo + pass + + def setOutputFunction( self, func ): + self.output_function = func + pass + + def calculateHash( self, image_data ): + if self.image_hasher == '3': + return ImageHasher( data=image_data ).dct_average_hash() + elif self.image_hasher == '2': + return ImageHasher( data=image_data ).average_hash2() + else: + return ImageHasher( data=image_data ).average_hash() + + def getAspectRatio( self, image_data ): + try: + im = Image.open(StringIO.StringIO(image_data)) + w,h = im.size + return float(h)/float(w) + except: + return 1.5 + + def cropCover( self, image_data ): + + im = Image.open(StringIO.StringIO(image_data)) + w,h = im.size + + try: + cropped_im = im.crop( (int(w/2), 0, w, h) ) + except Exception as e: + sys.exc_clear() + print "cropCover() error:", e + return None + + output = StringIO.StringIO() + cropped_im.save(output, format="PNG") + cropped_image_data = output.getvalue() + output.close() + + return cropped_image_data + + + def setProgressCallback( self, cb_func ): + self.callback = cb_func + + def setCoverURLCallback( self, cb_func ): + self.coverUrlCallback = cb_func + + def getSearchKeys( self ): + + ca = self.comic_archive + search_keys = dict() + search_keys['series'] = None + search_keys['issue_number'] = None + search_keys['month'] = None + search_keys['year'] = None + + if ca is None: + return + + if self.onlyUseAdditionalMetaData: + search_keys['series'] = self.additional_metadata.series + search_keys['issue_number'] = self.additional_metadata.issue + 
search_keys['year'] = self.additional_metadata.year + search_keys['month'] = self.additional_metadata.month + return search_keys + + # see if the archive has any useful meta data for searching with + if ca.hasCIX(): + internal_metadata = ca.readCIX() + elif ca.hasCBI(): + internal_metadata = ca.readCBI() + else: + internal_metadata = ca.readCBI() + + # try to get some metadata from filename + md_from_filename = ca.metadataFromFilename() + + # preference order: + #1. Additional metadata + #1. Internal metadata + #1. Filename metadata + + if self.additional_metadata.series is not None: + search_keys['series'] = self.additional_metadata.series + elif internal_metadata.series is not None: + search_keys['series'] = internal_metadata.series + else: + search_keys['series'] = md_from_filename.series + + if self.additional_metadata.issue is not None: + search_keys['issue_number'] = self.additional_metadata.issue + elif internal_metadata.issue is not None: + search_keys['issue_number'] = internal_metadata.issue + else: + search_keys['issue_number'] = md_from_filename.issue + + if self.additional_metadata.year is not None: + search_keys['year'] = self.additional_metadata.year + elif internal_metadata.year is not None: + search_keys['year'] = internal_metadata.year + else: + search_keys['year'] = md_from_filename.year + + if self.additional_metadata.month is not None: + search_keys['month'] = self.additional_metadata.month + elif internal_metadata.month is not None: + search_keys['month'] = internal_metadata.month + else: + search_keys['month'] = md_from_filename.month + + return search_keys + + @staticmethod + def defaultWriteOutput( text ): + sys.stdout.write( text ) + sys.stdout.flush() + + def log_msg( self, msg , newline=True ): + self.output_function(msg) + if newline: + self.output_function("\n") + + def getIssueCoverMatchScore( self, comicVine, issue_id, primary_img_url, primary_thumb_url, page_url, localCoverHashList, useRemoteAlternates = False , useLog=True): + # 
localHashes is a list of pre-calculated hashs. + # useRemoteAlternates - indicates to use alternate covers from CV + + try: + url_image_data = ImageFetcher().fetch(primary_thumb_url, blocking=True) + except ImageFetcherException: + self.log_msg( "Network issue while fetching cover image from ComicVine. Aborting...") + raise IssueIdentifierNetworkError + + if self.cancel == True: + raise IssueIdentifierCancelled + + # alert the GUI, if needed + if self.coverUrlCallback is not None: + self.coverUrlCallback( url_image_data ) + + remote_cover_list = [] + item = dict() + item['url'] = primary_img_url + + item['hash'] = self.calculateHash( url_image_data ) + remote_cover_list.append( item ) + + if self.cancel == True: + raise IssueIdentifierCancelled + + if useRemoteAlternates: + alt_img_url_list = comicVine.fetchAlternateCoverURLs( issue_id, page_url ) + for alt_url in alt_img_url_list: + try: + alt_url_image_data = ImageFetcher().fetch(alt_url, blocking=True) + except ImageFetcherException: + self.log_msg( "Network issue while fetching alt. cover image from ComicVine. Aborting...") + raise IssueIdentifierNetworkError + + if self.cancel == True: + raise IssueIdentifierCancelled + + # alert the GUI, if needed + if self.coverUrlCallback is not None: + self.coverUrlCallback( alt_url_image_data ) + + item = dict() + item['url'] = alt_url + item['hash'] = self.calculateHash( alt_url_image_data ) + remote_cover_list.append( item ) + + if self.cancel == True: + raise IssueIdentifierCancelled + + if useLog and useRemoteAlternates: + self.log_msg( "[{0} alt. 
covers]".format(len(remote_cover_list)-1), False ) + if useLog: + self.log_msg( "[ ", False ) + + score_list = [] + done = False + for local_cover_hash in localCoverHashList: + for remote_cover_item in remote_cover_list: + score = ImageHasher.hamming_distance(local_cover_hash, remote_cover_item['hash'] ) + score_item = dict() + score_item['score'] = score + score_item['url'] = remote_cover_item['url'] + score_item['hash'] = remote_cover_item['hash'] + score_list.append( score_item ) + if useLog: + self.log_msg( "{0} ".format(score), False ) + + if score <= self.strong_score_thresh: + # such a good score, we can quit now, since for sure we have a winner + done = True + break + if done: + break + + if useLog: + self.log_msg( " ]", False ) + + best_score_item = min(score_list, key=lambda x:x['score']) + + return best_score_item + + """ + def validate( self, issue_id ): + # create hash list + score = self.getIssueMatchScore( issue_id, hash_list, useRemoteAlternates = True ) + if score < 20: + return True + else: + return False + """ + def search( self ): - - ca = self.comic_archive - self.match_list = [] - self.cancel = False - self.search_result = self.ResultNoMatches - - if not pil_available: - self.log_msg( "Python Imaging Library (PIL) is not available and is needed for issue identification." 
) - return self.match_list - - if not ca.seemsToBeAComicArchive(): - self.log_msg( "Sorry, but "+ opts.filename + " is not a comic archive!") - return self.match_list - - cover_image_data = ca.getPage( self.cover_page_index ) - cover_hash = self.calculateHash( cover_image_data ) - - #check the apect ratio - # if it's wider than it is high, it's probably a two page spread - # if so, crop it and calculate a second hash - narrow_cover_hash = None - aspect_ratio = self.getAspectRatio( cover_image_data ) - if aspect_ratio < 1.0: - right_side_image_data = self.cropCover( cover_image_data ) - if right_side_image_data is not None: - narrow_cover_hash = self.calculateHash( right_side_image_data ) - - #self.log_msg( "Cover hash = {0:016x}".format(cover_hash) ) - - keys = self.getSearchKeys() - - # we need, at minimum, a series and issue number - if keys['series'] is None or keys['issue_number'] is None: - self.log_msg("Not enough info for a search!") - return [] - - - self.log_msg( "Going to search for:" ) - self.log_msg( "\tSeries: " + keys['series'] ) - self.log_msg( "\tIssue : " + keys['issue_number'] ) - if keys['year'] is not None: - self.log_msg( "\tYear : " + str(keys['year']) ) - if keys['month'] is not None: - self.log_msg( "\tMonth : " + str(keys['month']) ) - - #self.log_msg("Publisher Blacklist: " + str(self.publisher_blacklist)) - - comicVine = ComicVineTalker( ) - comicVine.setLogFunc( self.output_function ) - - #self.log_msg( ( "Searching for " + keys['series'] + "...") - self.log_msg( u"Searching for {0} #{1} ...".format( keys['series'], keys['issue_number']) ) - try: - cv_search_results = comicVine.searchForSeries( keys['series'] ) - except ComicVineTalkerException: - self.log_msg( "Network issue while searching for series. 
Aborting...") - return [] - - #self.log_msg( "Found " + str(len(cv_search_results)) + " initial results" ) - if self.cancel == True: - return [] - - series_second_round_list = [] - - #self.log_msg( "Removing results with too long names, banned publishers, or future start dates" ) - for item in cv_search_results: - length_approved = False - publisher_approved = True - date_approved = True - - # remove any series that starts after the issue year - if keys['year'] is not None and str(keys['year']).isdigit(): - if int(keys['year']) < item['start_year']: - date_approved = False - - #assume that our search name is close to the actual name, say within ,e.g. 5 chars - shortened_key = utils.removearticles(keys['series']) - shortened_item_name = utils.removearticles(item['name']) - if len( shortened_item_name ) < ( len( shortened_key ) + self.length_delta_thresh) : - length_approved = True - - # remove any series from publishers on the blacklist - if item['publisher'] is not None: - publisher = item['publisher']['name'] - if publisher is not None and publisher.lower() in self.publisher_blacklist: - publisher_approved = False - - if length_approved and publisher_approved and date_approved: - series_second_round_list.append(item) - - # if we don't think it's an issue number 1, remove any series' that are one-shots - if keys['issue_number'] not in [ '1', '0', '0.1' ]: - #self.log_msg( "Removing one-shots" ) - series_second_round_list[:] = [x for x in series_second_round_list if not x['count_of_issues'] == 1] - - self.log_msg( "Searching in " + str(len(series_second_round_list)) +" series" ) - - if self.callback is not None: - self.callback( 0, len(series_second_round_list)) - - # now sort the list by name length - series_second_round_list.sort(key=lambda x: len(x['name']), reverse=False) - - #--------new way--------------- - #build a list of volume IDs - volume_id_list = list() - for series in series_second_round_list: - volume_id_list.append( series['id']) - - try: - 
issue_list = comicVine.fetchIssuesByVolumeIssueNumAndYear( volume_id_list, - keys['issue_number'], - keys['year']) - - except ComicVineTalkerException: - self.log_msg( "Network issue while searching for series details. Aborting...") - return [] - - shortlist = list() - #now re-associate the issues and volumes - for issue in issue_list: - for series in series_second_round_list: - if series['id'] == issue['volume']['id']: - shortlist.append( (series, issue) ) - break - - #--------new way--------------- - - """ - #--vvvv---old way--------------- - # Now we've got a list of series that we can dig into look for matching issue number - counter = 0 - shortlist = [] - for series in series_second_round_list: - if self.callback is not None: - self.callback( counter, len(series_second_round_list)*3) - counter += 1 - - self.log_msg( u"Fetching info for ID: {0} {1} ({2}) ...".format( - series['id'], - series['name'], - series['start_year']), newline=True ) - - try: - issue_list = comicVine.fetchIssuesByVolume( series['id'] ) - - except ComicVineTalkerException: - self.log_msg( "Network issue while searching for series details. Aborting...") - return [] - - for issue in issue_list: - num_s = IssueString(issue['issue_number']).asString() - - # look for a matching issue number - if num_s.lower() == keys['issue_number'].lower(): - - # now, if we have an issue year key given, reject this one if not a match - month, year = comicVine.fetchIssueDate( issue['id'] ) - if keys['year'] is not None: - if unicode(keys['year']) != unicode(year): - break - - # found a matching issue number! 
add it to short list - shortlist.append( (series, issue) ) - #--^^^^---old way--------------- - """ - - if keys['year'] is None: - self.log_msg( u"Found {0} series that have an issue #{1}".format(len(shortlist), keys['issue_number']) ) - else: - self.log_msg( u"Found {0} series that have an issue #{1} from {2}".format(len(shortlist), keys['issue_number'], keys['year'] )) - - - # now we have a shortlist of volumes with the desired issue number - # Do first round of cover matching - counter = len(shortlist) - for series, issue in shortlist: - if self.callback is not None: - self.callback( counter, len(shortlist)*3) - counter += 1 - - self.log_msg( u"Examining covers for ID: {0} {1} ({2}) ...".format( - series['id'], - series['name'], - series['start_year']), newline=False ) - - # parse out the cover date - day, month, year = comicVine.parseDateStr( issue['cover_date'] ) - - # Now check the cover match against the primary image - hash_list = [ cover_hash ] - if narrow_cover_hash is not None: - hash_list.append(narrow_cover_hash) - - try: - image_url = issue['image']['super_url'] - thumb_url = issue['image']['thumb_url'] - page_url = issue['site_detail_url'] - - score_item = self.getIssueCoverMatchScore( comicVine, issue['id'], image_url, thumb_url, page_url, hash_list, useRemoteAlternates = False ) - except: - self.match_list = [] - return self.match_list - - match = dict() - match['series'] = u"{0} ({1})".format(series['name'], series['start_year']) - match['distance'] = score_item['score'] - match['issue_number'] = keys['issue_number'] - match['url_image_hash'] = score_item['hash'] - match['issue_title'] = issue['name'] - match['issue_id'] = issue['id'] - match['volume_id'] = series['id'] - match['month'] = month - match['year'] = year - match['publisher'] = None - if series['publisher'] is not None: - match['publisher'] = series['publisher']['name'] - match['image_url'] = image_url - match['thumb_url'] = thumb_url - match['page_url'] = page_url - - 
self.match_list.append(match) - - self.log_msg( " --> {0}".format(match['distance']), newline=False ) - - self.log_msg( "" ) - - if len(self.match_list) == 0: - self.log_msg( ":-( no matches!" ) - self.search_result = self.ResultNoMatches - return self.match_list - - - # sort list by image match scores - self.match_list.sort(key=lambda k: k['distance']) - - l = [] - for i in self.match_list: - l.append( i['distance'] ) - - self.log_msg( "Compared to covers in {0} issue(s):".format(len(self.match_list)), newline=False) - self.log_msg( str(l)) - - def print_match(item): - self.log_msg( u"-----> {0} #{1} {2} ({3}/{4}) -- score: {5}".format( - item['series'], - item['issue_number'], - item['issue_title'], - item['month'], - item['year'], - item['distance']) ) - - best_score = self.match_list[0]['distance'] - - if best_score >= self.min_score_thresh: - # we have 1 or more low-confidence matches (all bad cover scores) - # look at a few more pages in the archive, and also alternate covers online - self.log_msg( "Very weak scores for the cover. Analyzing alternate pages and covers..." 
) - hash_list = [ cover_hash ] - if narrow_cover_hash is not None: - hash_list.append(narrow_cover_hash) - for i in range( 1, min(3, ca.getNumberOfPages())): - image_data = ca.getPage(i) - page_hash = self.calculateHash( image_data ) - hash_list.append( page_hash ) - - second_match_list = [] - counter = 2*len(self.match_list) - for m in self.match_list: - if self.callback is not None: - self.callback( counter, len(self.match_list)*3) - counter += 1 - self.log_msg( u"Examining alternate covers for ID: {0} {1} ...".format( - m['volume_id'], - m['series']), newline=False ) - try: - score_item = self.getIssueCoverMatchScore( comicVine, m['issue_id'], m['image_url'], m['thumb_url'], m['page_url'], hash_list, useRemoteAlternates = True ) - except: - self.match_list = [] - return self.match_list - self.log_msg("--->{0}".format(score_item['score'])) - self.log_msg( "" ) - - if score_item['score'] < self.min_alternate_score_thresh: - second_match_list.append(m) - m['distance'] = score_item['score'] - - if len( second_match_list ) == 0: - if len( self.match_list) == 1: - self.log_msg( "No matching pages in the issue." ) - self.log_msg( u"--------------------------------------------------") - print_match(self.match_list[0]) - self.log_msg( u"--------------------------------------------------") - self.search_result = self.ResultFoundMatchButBadCoverScore - else: - self.log_msg( u"--------------------------------------------------") - self.log_msg( u"Multiple bad cover matches! Need to use other info..." ) - self.log_msg( u"--------------------------------------------------") - self.search_result = self.ResultMultipleMatchesWithBadImageScores - return self.match_list - else: - # We did good, found something! - self.log_msg( "Success in secondary/alternate cover matching!" 
def search(self):
    """Identify the comic archive by matching it against Comic Vine.

    Builds a candidate list of series from a name search, narrows it by
    name length, publisher blacklist, and start year, fetches issues
    matching the issue number (and year, if known), then scores each
    candidate's cover image against hashes of the archive's cover.  If
    all first-round scores are weak, a second round compares additional
    archive pages against remote alternate covers.

    Returns:
        The final list of match dicts (also stored in self.match_list);
        self.search_result is set to one of the Result* codes.
    """

    ca = self.comic_archive
    self.match_list = []
    self.cancel = False
    self.search_result = self.ResultNoMatches

    if not pil_available:
        self.log_msg("Python Imaging Library (PIL) is not available and is needed for issue identification.")
        return self.match_list

    if not ca.seemsToBeAComicArchive():
        # BUGFIX: original referenced undefined name 'opts' here (NameError);
        # report the archive's own path instead.
        self.log_msg("Sorry, but " + str(ca.path) + " is not a comic archive!")
        return self.match_list

    cover_image_data = ca.getPage(self.cover_page_index)
    cover_hash = self.calculateHash(cover_image_data)

    # Check the aspect ratio: if the cover is wider than it is high, it is
    # probably a two-page spread; crop to the right half and hash that too.
    narrow_cover_hash = None
    aspect_ratio = self.getAspectRatio(cover_image_data)
    if aspect_ratio < 1.0:
        right_side_image_data = self.cropCover(cover_image_data)
        if right_side_image_data is not None:
            narrow_cover_hash = self.calculateHash(right_side_image_data)

    keys = self.getSearchKeys()

    # We need, at minimum, a series name and an issue number.
    if keys['series'] is None or keys['issue_number'] is None:
        self.log_msg("Not enough info for a search!")
        return []

    self.log_msg("Going to search for:")
    self.log_msg("\tSeries: " + keys['series'])
    self.log_msg("\tIssue : " + keys['issue_number'])
    if keys['year'] is not None:
        self.log_msg("\tYear :  " + str(keys['year']))
    if keys['month'] is not None:
        self.log_msg("\tMonth :  " + str(keys['month']))

    comicVine = ComicVineTalker()
    comicVine.setLogFunc(self.output_function)

    self.log_msg(u"Searching for {0} #{1} ...".format(keys['series'], keys['issue_number']))
    try:
        cv_search_results = comicVine.searchForSeries(keys['series'])
    except ComicVineTalkerException:
        self.log_msg("Network issue while searching for series. Aborting...")
        return []

    if self.cancel:
        return []

    # First-pass filter: drop series with too-long names, blacklisted
    # publishers, or start dates after the issue's year.
    series_second_round_list = []
    for item in cv_search_results:
        length_approved = False
        publisher_approved = True
        date_approved = True

        # Remove any series that starts after the issue year.
        if keys['year'] is not None and str(keys['year']).isdigit() and \
                item['start_year'] is not None and str(item['start_year']).isdigit():
            if int(keys['year']) < int(item['start_year']):
                date_approved = False

        # Assume our search name is close to the actual name, within
        # self.length_delta_thresh characters (articles ignored).
        shortened_key = utils.removearticles(keys['series'])
        shortened_item_name = utils.removearticles(item['name'])
        if len(shortened_item_name) < (len(shortened_key) + self.length_delta_thresh):
            length_approved = True

        # Remove any series from publishers on the blacklist.
        if item['publisher'] is not None:
            publisher = item['publisher']['name']
            if publisher is not None and publisher.lower() in self.publisher_blacklist:
                publisher_approved = False

        if length_approved and publisher_approved and date_approved:
            series_second_round_list.append(item)

    # If we don't think it's an issue #1, remove any series that are one-shots.
    if keys['issue_number'] not in ['1', '0', '0.1']:
        series_second_round_list[:] = [x for x in series_second_round_list
                                       if not x['count_of_issues'] == 1]

    self.log_msg("Searching in " + str(len(series_second_round_list)) + " series")

    if self.callback is not None:
        self.callback(0, len(series_second_round_list))

    # Sort the candidate series by name length (shortest first).
    series_second_round_list.sort(key=lambda x: len(x['name']), reverse=False)

    # Batch-fetch all issues matching the number/year across every
    # candidate volume in a single query.
    volume_id_list = [series['id'] for series in series_second_round_list]
    try:
        issue_list = comicVine.fetchIssuesByVolumeIssueNumAndYear(volume_id_list,
                                                                  keys['issue_number'],
                                                                  keys['year'])
    except ComicVineTalkerException:
        self.log_msg("Network issue while searching for series details. Aborting...")
        return []

    # Re-associate each fetched issue with its series record.
    shortlist = []
    for issue in issue_list:
        for series in series_second_round_list:
            if series['id'] == issue['volume']['id']:
                shortlist.append((series, issue))
                break

    if keys['year'] is None:
        self.log_msg(u"Found {0} series that have an issue #{1}".format(
            len(shortlist), keys['issue_number']))
    else:
        self.log_msg(u"Found {0} series that have an issue #{1} from {2}".format(
            len(shortlist), keys['issue_number'], keys['year']))

    # First round of cover matching against the primary cover image(s).
    counter = len(shortlist)
    for series, issue in shortlist:
        if self.callback is not None:
            self.callback(counter, len(shortlist) * 3)
            counter += 1

        self.log_msg(u"Examining covers for  ID: {0} {1} ({2}) ...".format(
            series['id'],
            series['name'],
            series['start_year']), newline=False)

        # Parse out the cover date.
        day, month, year = comicVine.parseDateStr(issue['cover_date'])

        # Check the cover match against the primary image (and the cropped
        # right half, when the cover looked like a two-page spread).
        hash_list = [cover_hash]
        if narrow_cover_hash is not None:
            hash_list.append(narrow_cover_hash)

        try:
            image_url = issue['image']['super_url']
            thumb_url = issue['image']['thumb_url']
            page_url = issue['site_detail_url']
            score_item = self.getIssueCoverMatchScore(comicVine, issue['id'],
                                                      image_url, thumb_url, page_url,
                                                      hash_list, useRemoteAlternates=False)
        except Exception:
            # Was a bare 'except:'; narrowed so Ctrl-C still propagates.
            # Best-effort: any scoring/network failure aborts with no matches.
            self.match_list = []
            return self.match_list

        match = dict()
        match['series'] = u"{0} ({1})".format(series['name'], series['start_year'])
        match['distance'] = score_item['score']
        match['issue_number'] = keys['issue_number']
        match['url_image_hash'] = score_item['hash']
        match['issue_title'] = issue['name']
        match['issue_id'] = issue['id']
        match['volume_id'] = series['id']
        match['month'] = month
        match['year'] = year
        match['publisher'] = None
        if series['publisher'] is not None:
            match['publisher'] = series['publisher']['name']
        match['image_url'] = image_url
        match['thumb_url'] = thumb_url
        match['page_url'] = page_url

        self.match_list.append(match)

        self.log_msg(" --> {0}".format(match['distance']), newline=False)
        self.log_msg("")

    if len(self.match_list) == 0:
        self.log_msg(":-(  no matches!")
        self.search_result = self.ResultNoMatches
        return self.match_list

    # Sort list by image match scores (lower distance == better match).
    self.match_list.sort(key=lambda k: k['distance'])

    l = [i['distance'] for i in self.match_list]
    self.log_msg("Compared to covers in {0} issue(s):".format(len(self.match_list)), newline=False)
    self.log_msg(str(l))

    def print_match(item):
        # Log one candidate in a compact, human-readable form.
        self.log_msg(u"-----> {0} #{1} {2} ({3}/{4}) -- score: {5}".format(
            item['series'],
            item['issue_number'],
            item['issue_title'],
            item['month'],
            item['year'],
            item['distance']))

    best_score = self.match_list[0]['distance']

    if best_score >= self.min_score_thresh:
        # All first-round scores are weak: look at a few more pages in the
        # archive, and also alternate covers online.
        self.log_msg("Very weak scores for the cover. Analyzing alternate pages and covers...")
        hash_list = [cover_hash]
        if narrow_cover_hash is not None:
            hash_list.append(narrow_cover_hash)
        for i in range(1, min(3, ca.getNumberOfPages())):
            image_data = ca.getPage(i)
            page_hash = self.calculateHash(image_data)
            hash_list.append(page_hash)

        second_match_list = []
        counter = 2 * len(self.match_list)
        for m in self.match_list:
            if self.callback is not None:
                self.callback(counter, len(self.match_list) * 3)
                counter += 1
            self.log_msg(u"Examining alternate covers for ID: {0} {1} ...".format(
                m['volume_id'],
                m['series']), newline=False)
            try:
                score_item = self.getIssueCoverMatchScore(comicVine, m['issue_id'],
                                                          m['image_url'], m['thumb_url'],
                                                          m['page_url'], hash_list,
                                                          useRemoteAlternates=True)
            except Exception:
                # Was a bare 'except:'; narrowed so Ctrl-C still propagates.
                self.match_list = []
                return self.match_list
            self.log_msg("--->{0}".format(score_item['score']))
            self.log_msg("")

            if score_item['score'] < self.min_alternate_score_thresh:
                second_match_list.append(m)
                m['distance'] = score_item['score']

        if len(second_match_list) == 0:
            if len(self.match_list) == 1:
                self.log_msg("No matching pages in the issue.")
                self.log_msg(u"--------------------------------------------------")
                print_match(self.match_list[0])
                self.log_msg(u"--------------------------------------------------")
                self.search_result = self.ResultFoundMatchButBadCoverScore
            else:
                self.log_msg(u"--------------------------------------------------")
                self.log_msg(u"Multiple bad cover matches!  Need to use other info...")
                self.log_msg(u"--------------------------------------------------")
                self.search_result = self.ResultMultipleMatchesWithBadImageScores
            return self.match_list
        else:
            # We did good, found something!
            self.log_msg("Success in secondary/alternate cover matching!")

            self.match_list = second_match_list
            # Sort the new list by image match scores.
            self.match_list.sort(key=lambda k: k['distance'])
            best_score = self.match_list[0]['distance']
            self.log_msg("[Second round cover matching: best score = {0}]".format(best_score))
            # Now drop down into the rest of the processing.

    if self.callback is not None:
        self.callback(99, 100)

    # Pare down the list: keep only items within min_score_distance of the
    # best score.  (Original removed items while iterating the same list;
    # a filtered rebuild is safer and equivalent.)
    self.match_list = [item for item in self.match_list
                       if item['distance'] <= best_score + self.min_score_distance]

    if len(self.match_list) == 1:
        self.log_msg(u"--------------------------------------------------")
        print_match(self.match_list[0])
        self.log_msg(u"--------------------------------------------------")
        self.search_result = self.ResultOneGoodMatch

    elif len(self.match_list) == 0:
        self.log_msg(u"--------------------------------------------------")
        self.log_msg("No matches found :(")
        self.log_msg(u"--------------------------------------------------")
        self.search_result = self.ResultNoMatches
    else:
        # Typo fixed in log message ("likley candiate").
        self.log_msg("More than one likely candidate.")
        self.search_result = self.ResultMultipleGoodMatches
        self.log_msg(u"--------------------------------------------------")
        for item in self.match_list:
            print_match(item)
        self.log_msg(u"--------------------------------------------------")

    return self.match_list