e1f3397960
git-svn-id: http://comictagger.googlecode.com/svn/trunk@29 6c5673fe-1810-88d6-992b-cd32ca31540c
313 lines
9.9 KiB
Python
313 lines
9.9 KiB
Python
"""
|
|
A python class to automatically identify a comic archive
|
|
"""
|
|
|
|
"""
|
|
Copyright 2012 Anthony Beville
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
"""
|
|
|
|
import sys
|
|
import math
|
|
import urllib2, urllib
|
|
|
|
from settings import ComicTaggerSettings
|
|
from comicvinecacher import ComicVineCacher
|
|
from genericmetadata import GenericMetadata
|
|
from comicvinetalker import ComicVineTalker
|
|
from imagehasher import ImageHasher
|
|
from imagefetcher import ImageFetcher
|
|
|
|
import utils
|
|
|
|
class IssueIdentifier:
    """Try to work out which online issue a comic archive corresponds to.

    Search keys (series, issue number, year, month) are collected from
    caller-supplied metadata, the archive's embedded tags and its filename.
    ``search()`` then asks ``ComicVineTalker`` for candidate series and, for
    every candidate that contains the wanted issue number, compares a hash of
    the archive's cover with a hash of that issue's cover thumbnail.
    """

    def __init__(self, comic_archive, cv_api_key):
        """Store the archive to identify and the API key used for lookups.

        comic_archive -- archive object queried by getSearchKeys()/search()
        cv_api_key    -- key handed to ComicVineTalker in search()
        """
        self.comic_archive = comic_archive
        # Hash algorithm selector; see calculateHash() for accepted values.
        self.image_hasher = 1
        # A best image distance above this value is reported as a weak match.
        self.min_score_thresh = 22
        # Matches further than this from the best distance are discarded.
        self.min_score_distance = 2
        self.additional_metadata = GenericMetadata()
        self.cv_api_key = cv_api_key
        self.output_function = IssueIdentifier.defaultWriteOutput
        self.callback = None
        # Defined up front so they exist even before search() has been run.
        self.match_list = []
        self.cancel = False

    def setScoreMinThreshold(self, thresh):
        """Set the distance above which the best match is considered weak."""
        self.min_score_thresh = thresh

    def setScoreMinDistance(self, distance):
        """Set how far from the best distance a match may be and still be kept."""
        self.min_score_distance = distance

    def setAdditionalMetadata(self, md):
        """Supply metadata that takes precedence over tags and filename."""
        self.additional_metadata = md

    def setHasherAlgorithm(self, algo):
        """Select the hash algorithm used by calculateHash()."""
        self.image_hasher = algo

    def setOutputFunction(self, func):
        """Set the callable that receives every piece of log text."""
        self.output_function = func

    def calculateHash(self, image_data):
        """Return the hash of image_data using the selected algorithm.

        A selector of 3 uses ImageHasher.dct_average_hash, 2 uses
        average_hash2, and anything else uses average_hash.  The selector is
        compared as a string so that both 2 and '2' are recognised (the
        default is the integer 1, so integers are clearly expected too).
        """
        algo = str(self.image_hasher)
        hasher = ImageHasher(data=image_data)
        if algo == '3':
            return hasher.dct_average_hash()
        if algo == '2':
            return hasher.average_hash2()
        return hasher.average_hash()

    def setProgressCallback(self, cb_func):
        """Set a callable invoked as cb_func(current, total) during search()."""
        self.callback = cb_func

    def getSearchKeys(self):
        """Return a dict with 'series', 'issue_number', 'month' and 'year'.

        Each value is taken from the first source that provides a non-None
        value, in this order of preference:
          1. Additional metadata (setAdditionalMetadata)
          2. Metadata embedded in the archive
          3. Metadata parsed from the filename
        When there is no archive every value is None.
        """
        search_keys = {
            'series': None,
            'issue_number': None,
            'month': None,
            'year': None,
        }

        ca = self.comic_archive
        if ca is None:
            # Return the empty key set rather than None so that callers can
            # always subscript the result.
            return search_keys

        # see if the archive has any useful meta data for searching with
        if ca.hasCIX():
            internal_metadata = ca.readCIX()
        else:
            # The original read the CBI tags both when hasCBI() was true and
            # when no tags were present at all.
            # NOTE(review): presumably readCBI() yields empty metadata when
            # the archive has no CBI tags -- confirm.
            internal_metadata = ca.readCBI()

        # try to get some metadata from filename
        md_from_filename = ca.metadataFromFilename()

        sources = (self.additional_metadata, internal_metadata, md_from_filename)
        fields = (
            ('series', 'series'),
            ('issue_number', 'issueNumber'),
            ('year', 'publicationYear'),
            ('month', 'publicationMonth'),
        )
        for key, attr in fields:
            for source in sources:
                # A missing source (e.g. additional metadata set to None) is
                # simply skipped.
                value = getattr(source, attr, None)
                if value is not None:
                    search_keys[key] = value
                    break

        return search_keys

    @staticmethod
    def defaultWriteOutput(text):
        """Default output function: write text to stdout and flush."""
        sys.stdout.write(text)
        sys.stdout.flush()

    def log_msg(self, msg, newline=True):
        """Send msg, optionally followed by a newline, to the output function."""
        self.output_function(msg)
        if newline:
            self.output_function("\n")

    @staticmethod
    def _formatIssueNumber(raw):
        """Normalise an issue number such as "2.00" to "2" ("2.5" stays "2.5").

        Values that cannot be read as a number are returned unchanged so that
        one odd entry does not abort the whole search.
        """
        try:
            num_f = float(raw)
            if math.floor(num_f) == num_f:
                return str(int(num_f))
            return str(num_f)
        except (TypeError, ValueError, OverflowError):
            return raw

    def _logMatch(self, item):
        """Log a one-line summary of a single match dict."""
        self.log_msg(u"-----> {0} #{1} {2} -- score: {3}".format(
            item['series'],
            item['issue_number'],
            item['issue_title'],
            item['distance']))

    def _shortlistSeries(self, cv_search_results, keys):
        """Filter and order the series results worth fetching in detail."""
        # assume that our search name is close to the actual name, say within 5 characters
        max_len = len(keys['series']) + 5
        shortlist = [item for item in cv_search_results
                     if len(utils.removearticles(item['name'])) < max_len]

        # if we don't think it's an issue number 1, remove any series' that are one-shots
        if keys['issue_number'] != '1':
            shortlist = [x for x in shortlist if x['count_of_issues'] != 1]

        return shortlist

    def search(self):
        """Search for issues matching the archive and return the match list.

        Each match is a dict with the keys 'series', 'distance',
        'issue_number', 'url_image_hash', 'issue_title', 'img_url',
        'issue_id' and 'volume_id'.  The list is also stored in
        self.match_list.  It is empty when the archive is not a comic
        archive, when series or issue number are unknown, when self.cancel
        is set during the search, or when nothing matched.
        """
        ca = self.comic_archive
        self.match_list = []
        self.cancel = False

        if not ca.seemsToBeAComicArchive():
            # The original read `opts.filename`, but no `opts` exists in this
            # module, so this path raised NameError.
            # NOTE(review): assumes the archive object may expose `path`;
            # a generic label is used when it does not -- confirm.
            archive_name = getattr(ca, 'path', None)
            if archive_name is None:
                archive_name = "this file"
            self.log_msg("Sorry, but {0} is not a comic archive!".format(archive_name))
            return []

        cover_image_data = ca.getCoverPage()
        cover_hash = self.calculateHash(cover_image_data)

        keys = self.getSearchKeys()

        # we need, at minimum, a series and issue number
        if keys['series'] is None or keys['issue_number'] is None:
            self.log_msg("Not enough info for a search!")
            return []

        comicVine = ComicVineTalker(self.cv_api_key)

        self.log_msg("Searching for {0} #{1} ...".format(keys['series'], keys['issue_number']))

        keys['series'] = utils.removearticles(keys['series'])

        cv_search_results = comicVine.searchForSeries(keys['series'])

        if self.cancel:
            return []

        series_shortlist = self._shortlistSeries(cv_search_results, keys)
        total = len(series_shortlist)

        self.log_msg("Searching in " + str(total) + " series")

        if self.callback is not None:
            self.callback(0, total)

        # now sort the list by name length
        series_shortlist.sort(key=lambda x: len(x['name']))

        # Now we've got a list of series that we can dig into,
        # and look for matching issue number, date, and cover image
        for counter, series in enumerate(series_shortlist, 1):
            if self.callback is not None:
                self.callback(counter, total)

            self.log_msg("Fetching info for ID: {0} {1} ({2}) ...".format(
                series['id'],
                series['name'],
                series['start_year']), newline=False)

            cv_series_results = comicVine.fetchVolumeData(series['id'])

            for issue in cv_series_results['issues']:
                # format the issue number string nicely, since it's usually something like "2.00"
                num_s = self._formatIssueNumber(issue['issue_number'])

                # look for a matching issue number
                if num_s != keys['issue_number']:
                    continue

                # found a matching issue number! now get the issue data
                img_url, thumb_url = comicVine.fetchIssueCoverURLs(issue['id'])
                url_image_data = ImageFetcher().fetch(thumb_url, blocking=True)

                if self.cancel:
                    self.match_list = []
                    return self.match_list

                url_image_hash = self.calculateHash(url_image_data)

                match = {
                    'series': "{0} ({1})".format(series['name'], series['start_year']),
                    'distance': ImageHasher.hamming_distance(cover_hash, url_image_hash),
                    'issue_number': num_s,
                    'url_image_hash': url_image_hash,
                    'issue_title': issue['name'],
                    'img_url': thumb_url,
                    'issue_id': issue['id'],
                    'volume_id': series['id'],
                }
                self.match_list.append(match)

                self.log_msg(" --> {0}".format(match['distance']), newline=False)

                # only the first issue with this number is considered per series
                break

            # terminate the "Fetching info ..." line started above
            self.log_msg("")

        if not self.match_list:
            self.log_msg(":-( no matches!")
            return self.match_list

        # sort list by image match scores
        self.match_list.sort(key=lambda k: k['distance'])

        distances = [m['distance'] for m in self.match_list]

        self.log_msg("Compared {0} covers".format(len(self.match_list)), newline=False)
        self.log_msg(str(distances))

        best_score = self.match_list[0]['distance']

        if len(self.match_list) == 1:
            if best_score > self.min_score_thresh:
                self.log_msg("!!!! Very weak score for the cover. Maybe it's not the cover?")
            self._logMatch(self.match_list[0])
            return self.match_list

        if best_score > self.min_score_thresh:
            self.log_msg("No good image matches! Need to use other info...")
            return self.match_list

        # now pare down list, remove any item more than specified distant from the top scores
        # (rebuilt in place instead of calling remove() while iterating)
        cutoff = best_score + self.min_score_distance
        self.match_list[:] = [m for m in self.match_list if m['distance'] <= cutoff]

        if len(self.match_list) == 1:
            self._logMatch(self.match_list[0])
        elif len(self.match_list) == 0:
            self.log_msg("No matches found :(")
        else:
            # blank separator line, routed through the configured output
            # function instead of a bare print
            self.log_msg("")
            self.log_msg("More than one likley candiate. Maybe a lexical comparison??")
            for item in self.match_list:
                self._logMatch(item)

        return self.match_list
|
|
|
|
|