Moved issue matching code into its own class
git-svn-id: http://comictagger.googlecode.com/svn/trunk@27 6c5673fe-1810-88d6-992b-cd32ca31540c
This commit is contained in:
parent
cb427f49b8
commit
d8a8355f5b
284
issueidentifier.py
Normal file
284
issueidentifier.py
Normal file
@ -0,0 +1,284 @@
|
||||
"""
|
||||
A python class to automatically identify a comic archive
|
||||
"""
|
||||
|
||||
"""
|
||||
Copyright 2012 Anthony Beville
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import math
|
||||
import urllib2, urllib
|
||||
|
||||
from settings import ComicTaggerSettings
|
||||
from comicvinecacher import ComicVineCacher
|
||||
from genericmetadata import GenericMetadata
|
||||
from comicvinetalker import ComicVineTalker
|
||||
from imagehasher import ImageHasher
|
||||
import utils
|
||||
|
||||
class IssueIdentifier:
|
||||
|
||||
def __init__(self, comic_archive, cv_api_key ):
|
||||
self.comic_archive = comic_archive
|
||||
self.image_hasher = 1
|
||||
self.additional_metadata = None
|
||||
self.min_score_thresh = 22
|
||||
self.min_score_distance = 2
|
||||
self.additional_metadata = GenericMetadata()
|
||||
self.cv_api_key = cv_api_key
|
||||
|
||||
def setScoreMinThreshold( self, thresh ):
|
||||
self.min_score_thresh = thresh
|
||||
|
||||
def setScoreMinDistance( self, distance ):
|
||||
self.min_score_distance = distance
|
||||
|
||||
def setAdditionalMetadata( self, md ):
|
||||
self.additional_metadata = md
|
||||
|
||||
def setHasherAlgorithm( self, algo ):
|
||||
self.image_hasher = algo
|
||||
pass
|
||||
|
||||
def calculateHash( self, image_data ):
|
||||
if self.image_hasher == '3':
|
||||
return ImageHasher( data=image_data ).dct_average_hash()
|
||||
elif self.image_hasher == '2':
|
||||
return ImageHasher( data=image_data ).average_hash2()
|
||||
else:
|
||||
return ImageHasher( data=image_data ).average_hash()
|
||||
|
||||
def getSearchKeys( self ):
|
||||
|
||||
ca = self.comic_archive
|
||||
search_keys = dict()
|
||||
search_keys['series'] = None
|
||||
search_keys['issue_number'] = None
|
||||
search_keys['month'] = None
|
||||
search_keys['year'] = None
|
||||
|
||||
if ca is None:
|
||||
return
|
||||
|
||||
# see if the archive has any useful meta data for searching with
|
||||
if ca.hasCIX():
|
||||
internal_metadata = ca.readCIX()
|
||||
elif ca.hasCBI():
|
||||
internal_metadata = ca.readCBI()
|
||||
else:
|
||||
internal_metadata = ca.readCBI()
|
||||
|
||||
# try to get some metadata from filename
|
||||
md_from_filename = ca.metadataFromFilename()
|
||||
|
||||
# preference order:
|
||||
#1. Additional metadata
|
||||
#1. Internal metadata
|
||||
#1. Filename metadata
|
||||
|
||||
if self.additional_metadata.series is not None:
|
||||
search_keys['series'] = self.additional_metadata.series
|
||||
elif internal_metadata.series is not None:
|
||||
search_keys['series'] = internal_metadata.series
|
||||
else:
|
||||
search_keys['series'] = md_from_filename.series
|
||||
|
||||
if self.additional_metadata.issueNumber is not None:
|
||||
search_keys['issue_number'] = self.additional_metadata.issueNumber
|
||||
elif internal_metadata.issueNumber is not None:
|
||||
search_keys['issue_number'] = internal_metadata.issueNumber
|
||||
else:
|
||||
search_keys['issue_number'] = md_from_filename.issueNumber
|
||||
|
||||
if self.additional_metadata.publicationYear is not None:
|
||||
search_keys['year'] = self.additional_metadata.publicationYear
|
||||
elif internal_metadata.publicationYear is not None:
|
||||
search_keys['year'] = internal_metadata.publicationYear
|
||||
else:
|
||||
search_keys['year'] = md_from_filename.publicationYear
|
||||
|
||||
if self.additional_metadata.publicationMonth is not None:
|
||||
search_keys['month'] = self.additional_metadata.publicationMonth
|
||||
elif internal_metadata.publicationMonth is not None:
|
||||
search_keys['month'] = internal_metadata.publicationMonth
|
||||
else:
|
||||
search_keys['month'] = md_from_filename.publicationMonth
|
||||
|
||||
return search_keys
|
||||
|
||||
@staticmethod
|
||||
def log_msg( msg , newline=True ):
|
||||
sys.stdout.write(msg)
|
||||
if newline:
|
||||
sys.stdout.write("\n")
|
||||
sys.stdout.flush()
|
||||
|
||||
def search( self ):
|
||||
|
||||
ca = self.comic_archive
|
||||
if not ca.seemsToBeAComicArchive():
|
||||
IssueIdentifier.log_msg( "Sorry, but "+ opts.filename + " is not a comic archive!")
|
||||
return
|
||||
|
||||
cover_image_data = ca.getCoverPage()
|
||||
|
||||
cover_hash = self.calculateHash( cover_image_data )
|
||||
|
||||
#IssueIdentifier.log_msg( "Cover hash = {0:016x}".format(cover_hash) )
|
||||
|
||||
keys = self.getSearchKeys()
|
||||
|
||||
# we need, at minimum, a series and issue number
|
||||
if keys['series'] is None or keys['issue_number'] is None:
|
||||
IssueIdentifier.log_msg("Not enough info for a search!")
|
||||
return None
|
||||
|
||||
"""
|
||||
IssueIdentifier.log_msg( "Going to search for:" )
|
||||
IssueIdentifier.log_msg( "Series: " + keys['series'] )
|
||||
IssueIdentifier.log_msg( "Issue : " + keys['issue_number'] )
|
||||
if keys['year'] is not None:
|
||||
IssueIdentifier.log_msg( "Year : " + keys['year'] )
|
||||
if keys['month'] is not None:
|
||||
IssueIdentifier.log_msg( "Month : " + keys['month'] )
|
||||
"""
|
||||
comicVine = ComicVineTalker( self.cv_api_key )
|
||||
|
||||
#IssueIdentifier.log_msg( ( "Searching for " + keys['series'] + "...")
|
||||
IssueIdentifier.log_msg( "Searching for {0} #{1} ...".format( keys['series'], keys['issue_number']) )
|
||||
|
||||
keys['series'] = utils.removearticles( keys['series'] )
|
||||
|
||||
cv_search_results = comicVine.searchForSeries( keys['series'] )
|
||||
|
||||
#IssueIdentifier.log_msg( "Found " + str(len(cv_search_results)) + " initial results" )
|
||||
|
||||
series_shortlist = []
|
||||
|
||||
#IssueIdentifier.log_msg( "Removing results with too long names" )
|
||||
for item in cv_search_results:
|
||||
#assume that our search name is close to the actual name, say within 5 characters
|
||||
if len( utils.removearticles(item['name'])) < len( keys['series'] ) + 5:
|
||||
series_shortlist.append(item)
|
||||
|
||||
# if we don't think it's an issue number 1, remove any series' that are one-shots
|
||||
if keys['issue_number'] != '1':
|
||||
#IssueIdentifier.log_msg( "Removing one-shots" )
|
||||
series_shortlist[:] = [x for x in series_shortlist if not x['count_of_issues'] == 1]
|
||||
|
||||
IssueIdentifier.log_msg( "Searching in " + str(len(series_shortlist)) +" series" )
|
||||
|
||||
# now sort the list by name length
|
||||
series_shortlist.sort(key=lambda x: len(x['name']), reverse=False)
|
||||
|
||||
# Now we've got a list of series that we can dig into,
|
||||
# and look for matching issue number, date, and cover image
|
||||
|
||||
match_list = []
|
||||
|
||||
IssueIdentifier.log_msg( "Fetching issue data", newline=False)
|
||||
|
||||
for series in series_shortlist:
|
||||
#IssueIdentifier.log_msg( "Fetching info for ID: {0} {1} ({2}) ...".format(
|
||||
# series['id'],
|
||||
# series['name'],
|
||||
# series['start_year']) )
|
||||
IssueIdentifier.log_msg( ".", newline=False)
|
||||
|
||||
cv_series_results = comicVine.fetchVolumeData( series['id'] )
|
||||
issue_list = cv_series_results['issues']
|
||||
for issue in issue_list:
|
||||
|
||||
# format the issue number string nicely, since it's usually something like "2.00"
|
||||
num_f = float(issue['issue_number'])
|
||||
num_s = str( int(math.floor(num_f)) )
|
||||
if math.floor(num_f) != num_f:
|
||||
num_s = str( num_f )
|
||||
|
||||
# look for a matching issue number
|
||||
if num_s == keys['issue_number']:
|
||||
# found a matching issue number! now get the issue data
|
||||
img_url, thumb_url = comicVine.fetchIssueCoverURLs( issue['id'] )
|
||||
#TODO get the image from URL, and calc hash!!
|
||||
url_image_data = urllib.urlopen(thumb_url).read()
|
||||
|
||||
url_image_hash = self.calculateHash( url_image_data )
|
||||
|
||||
match = dict()
|
||||
match['series'] = "{0} ({1})".format(series['name'], series['start_year'])
|
||||
match['distance'] = ImageHasher.hamming_distance(cover_hash, url_image_hash)
|
||||
match['issue_number'] = num_s
|
||||
match['url_image_hash'] = url_image_hash
|
||||
match['issue_title'] = issue['name']
|
||||
match['img_url'] = thumb_url
|
||||
match_list.append(match)
|
||||
|
||||
break
|
||||
IssueIdentifier.log_msg( "done!" )
|
||||
|
||||
if len(match_list) == 0:
|
||||
IssueIdentifier.log_msg( ":-( no matches!" )
|
||||
return
|
||||
|
||||
# sort list by image match scores
|
||||
match_list.sort(key=lambda k: k['distance'])
|
||||
|
||||
l = []
|
||||
for i in match_list:
|
||||
l.append( i['distance'] )
|
||||
|
||||
IssueIdentifier.log_msg( "Compared {0} covers".format(len(match_list)), newline=False)
|
||||
IssueIdentifier.log_msg( str(l))
|
||||
|
||||
def print_match(item):
|
||||
IssueIdentifier.log_msg( u"-----> {0} #{1} {2} -- score: {3}\n-------> url:{4}".format(
|
||||
item['series'],
|
||||
item['issue_number'],
|
||||
item['issue_title'],
|
||||
item['distance'],
|
||||
item['img_url']) )
|
||||
|
||||
best_score = match_list[0]['distance']
|
||||
|
||||
if len(match_list) == 1:
|
||||
if best_score > self.min_score_thresh:
|
||||
IssueIdentifier.log_msg( "!!!! Very weak score for the cover. Maybe it's not the cover?" )
|
||||
print_match(match_list[0])
|
||||
return
|
||||
|
||||
elif best_score > self.min_score_thresh and len(match_list) > 1:
|
||||
IssueIdentifier.log_msg( "No good image matches! Need to use other info..." )
|
||||
return
|
||||
|
||||
#now pare down list, remove any item more than specified distant from the top scores
|
||||
for item in reversed(match_list):
|
||||
if item['distance'] > best_score + self.min_score_distance:
|
||||
match_list.remove(item)
|
||||
|
||||
if len(match_list) == 1:
|
||||
print_match(match_list[0])
|
||||
return
|
||||
elif len(match_list) == 0:
|
||||
IssueIdentifier.log_msg( "No matches found :(" )
|
||||
return
|
||||
|
||||
else:
|
||||
print
|
||||
IssueIdentifier.log_msg( "More than one likley candiate. Maybe a lexical comparison??" )
|
||||
for item in match_list:
|
||||
print_match(item)
|
||||
|
||||
|
199
tagger.py
199
tagger.py
@ -21,27 +21,17 @@ limitations under the License.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import getopt
|
||||
import json
|
||||
import xml
|
||||
from pprint import pprint
|
||||
from PyQt4 import QtCore, QtGui
|
||||
import signal
|
||||
import os
|
||||
import math
|
||||
import urllib2, urllib
|
||||
|
||||
from PyQt4 import QtCore, QtGui
|
||||
|
||||
from settings import ComicTaggerSettings
|
||||
|
||||
from taggerwindow import TaggerWindow
|
||||
from options import Options, MetaDataStyle
|
||||
from comicarchive import ComicArchive
|
||||
from issueidentifier import IssueIdentifier
|
||||
|
||||
from comicvinetalker import ComicVineTalker
|
||||
from comicvinecacher import ComicVineCacher
|
||||
from comicinfoxml import ComicInfoXml
|
||||
from comicbookinfo import ComicBookInfo
|
||||
from imagehasher import ImageHasher
|
||||
import utils
|
||||
|
||||
#-----------------------------
|
||||
@ -52,188 +42,9 @@ def cliProcedure( opts, settings ):
|
||||
print "Sorry, but "+ opts.filename + " is not a comic archive!"
|
||||
return
|
||||
|
||||
cover_image_data = ca.getCoverPage()
|
||||
|
||||
if opts.image_hasher == '3':
|
||||
cover_hash = ImageHasher( data=cover_image_data ).dct_average_hash()
|
||||
elif opts.image_hasher == '2':
|
||||
cover_hash = ImageHasher( data=cover_image_data ).average_hash2()
|
||||
else:
|
||||
cover_hash = ImageHasher( data=cover_image_data ).average_hash()
|
||||
|
||||
#print "Cover hash = {0:016x}".format(cover_hash)
|
||||
|
||||
# see if the archive has any useful meta data for searching with
|
||||
if ca.hasCIX():
|
||||
internal_metadata = ca.readCIX()
|
||||
elif ca.hasCBI():
|
||||
internal_metadata = ca.readCBI()
|
||||
else:
|
||||
internal_metadata = ca.readCBI()
|
||||
|
||||
# try to get some metadata from filename
|
||||
md_from_filename = ca.metadataFromFilename()
|
||||
ii = IssueIdentifier( ca, settings.cv_api_key )
|
||||
ii.search()
|
||||
|
||||
# now figure out what we have to search with
|
||||
search_series = internal_metadata.series
|
||||
search_issue_number = internal_metadata.issueNumber
|
||||
search_year = internal_metadata.publicationYear
|
||||
search_month = internal_metadata.publicationMonth
|
||||
|
||||
if search_series is None:
|
||||
search_series = md_from_filename.series
|
||||
|
||||
if search_issue_number is None:
|
||||
search_issue_number = md_from_filename.issueNumber
|
||||
|
||||
if search_year is None:
|
||||
search_year = md_from_filename.publicationYear
|
||||
|
||||
# we need, at minimum, a series and issue number
|
||||
if search_series is None or search_issue_number is None:
|
||||
print "Not enough info for a search!"
|
||||
return
|
||||
|
||||
"""
|
||||
print "Going to search for:"
|
||||
print "Series: ", search_series
|
||||
print "Issue : ", search_issue_number
|
||||
if search_year is not None:
|
||||
print "Year : ", search_year
|
||||
if search_month is not None:
|
||||
print "Month : ", search_month
|
||||
"""
|
||||
comicVine = ComicVineTalker( settings.cv_api_key )
|
||||
|
||||
#print ( "Searching for " + search_series + "...")
|
||||
print "Searching for {0} #{1} ...".format( search_series, search_issue_number)
|
||||
|
||||
search_series = utils.removearticles( search_series )
|
||||
|
||||
cv_search_results = comicVine.searchForSeries( search_series )
|
||||
|
||||
#print "Found " + str(len(cv_search_results)) + " initial results"
|
||||
|
||||
series_shortlist = []
|
||||
|
||||
#print "Removing results with too long names"
|
||||
for item in cv_search_results:
|
||||
#assume that our search name is close to the actual name, say within 5 characters
|
||||
if len( utils.removearticles(item['name'])) < len( search_series ) + 5:
|
||||
series_shortlist.append(item)
|
||||
|
||||
# if we don't think it's an issue number 1, remove any series' that are one-shots
|
||||
if search_issue_number != '1':
|
||||
#print "Removing one-shots"
|
||||
series_shortlist[:] = [x for x in series_shortlist if not x['count_of_issues'] == 1]
|
||||
|
||||
print "Searching in " + str(len(series_shortlist)) +" series"
|
||||
|
||||
# now sort the list by name length
|
||||
series_shortlist.sort(key=lambda x: len(x['name']), reverse=False)
|
||||
|
||||
# Now we've got a list of series that we can dig into,
|
||||
# and look for matching issue number, date, and cover image
|
||||
|
||||
match_list = []
|
||||
|
||||
print "Fetching issue data",
|
||||
|
||||
for series in series_shortlist:
|
||||
#print "Fetching info for ID: {0} {1} ({2}) ...".format(
|
||||
# series['id'],
|
||||
# series['name'],
|
||||
# series['start_year'])
|
||||
print ".",
|
||||
sys.stdout.flush()
|
||||
|
||||
cv_series_results = comicVine.fetchVolumeData( series['id'] )
|
||||
issue_list = cv_series_results['issues']
|
||||
for issue in issue_list:
|
||||
|
||||
# format the issue number string nicely, since it's usually something like "2.00"
|
||||
num_f = float(issue['issue_number'])
|
||||
num_s = str( int(math.floor(num_f)) )
|
||||
if math.floor(num_f) != num_f:
|
||||
num_s = str( num_f )
|
||||
|
||||
# look for a matching issue number
|
||||
if num_s == search_issue_number:
|
||||
# found a matching issue number! now get the issue data
|
||||
img_url, thumb_url = comicVine.fetchIssueCoverURLs( issue['id'] )
|
||||
#TODO get the image from URL, and calc hash!!
|
||||
url_image_data = urllib.urlopen(thumb_url).read()
|
||||
|
||||
if opts.image_hasher == '3':
|
||||
url_image_hash = ImageHasher( data=url_image_data ).dct_average_hash()
|
||||
elif opts.image_hasher == '2':
|
||||
url_image_hash = ImageHasher( data=url_image_data ).average_hash2()
|
||||
else:
|
||||
url_image_hash = ImageHasher( data=url_image_data ).average_hash()
|
||||
|
||||
match = dict()
|
||||
match['series'] = "{0} ({1})".format(series['name'], series['start_year'])
|
||||
match['distance'] = ImageHasher.hamming_distance(cover_hash, url_image_hash)
|
||||
match['issue_number'] = num_s
|
||||
match['url_image_hash'] = url_image_hash
|
||||
match['issue_title'] = issue['name']
|
||||
match['img_url'] = thumb_url
|
||||
match_list.append(match)
|
||||
|
||||
break
|
||||
print "done!"
|
||||
|
||||
if len(match_list) == 0:
|
||||
print ":-( no matches!"
|
||||
return
|
||||
|
||||
# sort list by image match scores
|
||||
match_list.sort(key=lambda k: k['distance'])
|
||||
|
||||
print "Compared {0} covers".format(len(match_list)),
|
||||
|
||||
l = []
|
||||
for i in match_list:
|
||||
l.append( i['distance'] )
|
||||
print l
|
||||
|
||||
def print_match(item):
|
||||
print u"-----> {0} #{1} {2} -- score: {3}\n-------> url:{4}".format(
|
||||
item['series'],
|
||||
item['issue_number'],
|
||||
item['issue_title'],
|
||||
item['distance'],
|
||||
item['img_url'])
|
||||
|
||||
best_score = match_list[0]['distance']
|
||||
|
||||
if len(match_list) == 1:
|
||||
if best_score > 25:
|
||||
print "!!!! Very weak score for the cover. Maybe it's not the cover?"
|
||||
print_match(match_list[0])
|
||||
return
|
||||
|
||||
elif best_score > 25 and len(match_list) > 1:
|
||||
print "No good image matches! Need to use other info..."
|
||||
return
|
||||
|
||||
#now pare down list, remove any item more than 2 distant from the top scores
|
||||
for item in reversed(match_list):
|
||||
if item['distance'] > best_score + 2:
|
||||
match_list.remove(item)
|
||||
|
||||
if len(match_list) == 1:
|
||||
print_match(match_list[0])
|
||||
return
|
||||
elif len(match_list) == 0:
|
||||
print "No matches found :("
|
||||
return
|
||||
|
||||
else:
|
||||
print "More than one likley candiate. Maybe a lexical comparison??"
|
||||
for item in match_list:
|
||||
print_match(item)
|
||||
|
||||
"""
|
||||
# now get the particular issue data
|
||||
metadata = comicVine.fetchIssueData( series_id, opts.issue_number )
|
||||
|
Loading…
Reference in New Issue
Block a user