Moved issue matching code into its own class

git-svn-id: http://comictagger.googlecode.com/svn/trunk@27 6c5673fe-1810-88d6-992b-cd32ca31540c
This commit is contained in:
beville@gmail.com 2012-11-10 19:02:38 +00:00
parent cb427f49b8
commit d8a8355f5b
2 changed files with 289 additions and 194 deletions

284
issueidentifier.py Normal file
View File

@ -0,0 +1,284 @@
"""
A python class to automatically identify a comic archive
"""
"""
Copyright 2012 Anthony Beville
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import sys
import math
import urllib2, urllib
from settings import ComicTaggerSettings
from comicvinecacher import ComicVineCacher
from genericmetadata import GenericMetadata
from comicvinetalker import ComicVineTalker
from imagehasher import ImageHasher
import utils
class IssueIdentifier:
    """Identify the Comic Vine issue matching a given comic archive.

    Matching combines a text search (series name + issue number, gathered
    from user-supplied metadata, metadata embedded in the archive, or the
    filename) with perceptual hashing of the cover image to score candidates.
    """

    def __init__(self, comic_archive, cv_api_key):
        # Archive to identify; may be None (getSearchKeys guards for it).
        self.comic_archive = comic_archive
        # Hash algorithm selector; '2' and '3' select alternate hashers,
        # any other value falls through to the plain average hash.
        # See calculateHash().
        self.image_hasher = 1
        # User-supplied metadata that takes precedence over anything read
        # from the archive.  (A redundant `= None` assignment that was
        # immediately overwritten has been removed.)
        self.additional_metadata = GenericMetadata()
        # A cover-hash hamming distance above this is too weak to trust.
        self.min_score_thresh = 22
        # Candidates scoring more than this beyond the best are discarded.
        self.min_score_distance = 2
        self.cv_api_key = cv_api_key

    def setScoreMinThreshold(self, thresh):
        # Maximum acceptable hash distance for a confident match.
        self.min_score_thresh = thresh

    def setScoreMinDistance(self, distance):
        # How far beyond the best score a candidate may be and still be kept.
        self.min_score_distance = distance

    def setAdditionalMetadata(self, md):
        # Metadata set here overrides archive/filename data in getSearchKeys().
        self.additional_metadata = md

    def setHasherAlgorithm(self, algo):
        # '2' -> average_hash2, '3' -> DCT average hash, else average hash.
        # (A stray trailing `pass` statement has been removed.)
        self.image_hasher = algo

    def calculateHash(self, image_data):
        """Return the perceptual hash of image_data per the selected algorithm."""
        if self.image_hasher == '3':
            return ImageHasher(data=image_data).dct_average_hash()
        elif self.image_hasher == '2':
            return ImageHasher(data=image_data).average_hash2()
        else:
            return ImageHasher(data=image_data).average_hash()

    def getSearchKeys(self):
        """Collect series/issue_number/month/year search keys for the archive.

        Preference order for each key: additional (user) metadata, then
        metadata embedded in the archive, then metadata parsed from the
        filename.  Returns None if there is no archive to examine.
        """
        ca = self.comic_archive

        search_keys = dict()
        search_keys['series'] = None
        search_keys['issue_number'] = None
        search_keys['month'] = None
        search_keys['year'] = None

        if ca is None:
            return

        # see if the archive has any useful metadata for searching with
        if ca.hasCIX():
            internal_metadata = ca.readCIX()
        elif ca.hasCBI():
            internal_metadata = ca.readCBI()
        else:
            # NOTE(review): falls back to readCBI() even when hasCBI() is
            # False -- presumably that returns an empty metadata object;
            # confirm against ComicArchive.
            internal_metadata = ca.readCBI()

        # try to get some metadata from the filename
        md_from_filename = ca.metadataFromFilename()

        # preference order:
        #  1. additional (user) metadata
        #  2. internal (archive) metadata
        #  3. filename metadata
        if self.additional_metadata.series is not None:
            search_keys['series'] = self.additional_metadata.series
        elif internal_metadata.series is not None:
            search_keys['series'] = internal_metadata.series
        else:
            search_keys['series'] = md_from_filename.series

        if self.additional_metadata.issueNumber is not None:
            search_keys['issue_number'] = self.additional_metadata.issueNumber
        elif internal_metadata.issueNumber is not None:
            search_keys['issue_number'] = internal_metadata.issueNumber
        else:
            search_keys['issue_number'] = md_from_filename.issueNumber

        if self.additional_metadata.publicationYear is not None:
            search_keys['year'] = self.additional_metadata.publicationYear
        elif internal_metadata.publicationYear is not None:
            search_keys['year'] = internal_metadata.publicationYear
        else:
            search_keys['year'] = md_from_filename.publicationYear

        if self.additional_metadata.publicationMonth is not None:
            search_keys['month'] = self.additional_metadata.publicationMonth
        elif internal_metadata.publicationMonth is not None:
            search_keys['month'] = internal_metadata.publicationMonth
        else:
            search_keys['month'] = md_from_filename.publicationMonth

        return search_keys

    @staticmethod
    def log_msg(msg, newline=True):
        # Simple console logger; flushes so progress dots appear immediately.
        sys.stdout.write(msg)
        if newline:
            sys.stdout.write("\n")
        sys.stdout.flush()

    def search(self):
        """Search Comic Vine for the issue matching the archive and report results.

        Logs candidate matches to stdout; returns None when no search is
        possible or no matches are found.
        """
        ca = self.comic_archive

        if not ca.seemsToBeAComicArchive():
            # BUG FIX: the original referenced the undefined name `opts`
            # (`opts.filename`) here, raising NameError instead of logging.
            IssueIdentifier.log_msg("Sorry, but this file is not a comic archive!")
            return

        cover_image_data = ca.getCoverPage()
        cover_hash = self.calculateHash(cover_image_data)

        keys = self.getSearchKeys()

        # we need, at minimum, a series and issue number
        if keys['series'] is None or keys['issue_number'] is None:
            IssueIdentifier.log_msg("Not enough info for a search!")
            return None

        comicVine = ComicVineTalker(self.cv_api_key)

        IssueIdentifier.log_msg("Searching for {0} #{1} ...".format(keys['series'], keys['issue_number']))

        keys['series'] = utils.removearticles(keys['series'])
        cv_search_results = comicVine.searchForSeries(keys['series'])

        # Assume our search name is close to the actual name (within 5
        # characters); drop results with much longer names.
        series_shortlist = []
        for item in cv_search_results:
            if len(utils.removearticles(item['name'])) < len(keys['series']) + 5:
                series_shortlist.append(item)

        # if we don't think it's an issue number 1, remove any series' that are one-shots
        if keys['issue_number'] != '1':
            series_shortlist[:] = [x for x in series_shortlist if not x['count_of_issues'] == 1]

        IssueIdentifier.log_msg("Searching in " + str(len(series_shortlist)) + " series")

        # now sort the list by name length
        series_shortlist.sort(key=lambda x: len(x['name']), reverse=False)

        # Now we've got a list of series that we can dig into,
        # and look for matching issue number, date, and cover image
        match_list = []
        IssueIdentifier.log_msg("Fetching issue data", newline=False)
        for series in series_shortlist:
            IssueIdentifier.log_msg(".", newline=False)
            cv_series_results = comicVine.fetchVolumeData(series['id'])
            issue_list = cv_series_results['issues']
            for issue in issue_list:
                # format the issue number string nicely, since it's usually something like "2.00"
                num_f = float(issue['issue_number'])
                num_s = str(int(math.floor(num_f)))
                if math.floor(num_f) != num_f:
                    num_s = str(num_f)

                # look for a matching issue number
                if num_s == keys['issue_number']:
                    # found a matching issue number!  Fetch the cover image
                    # and score it against the archive's cover hash.
                    img_url, thumb_url = comicVine.fetchIssueCoverURLs(issue['id'])
                    url_image_data = urllib.urlopen(thumb_url).read()
                    url_image_hash = self.calculateHash(url_image_data)

                    match = dict()
                    match['series'] = "{0} ({1})".format(series['name'], series['start_year'])
                    match['distance'] = ImageHasher.hamming_distance(cover_hash, url_image_hash)
                    match['issue_number'] = num_s
                    match['url_image_hash'] = url_image_hash
                    match['issue_title'] = issue['name']
                    match['img_url'] = thumb_url
                    match_list.append(match)
                    # one issue number match per series is enough
                    break

        IssueIdentifier.log_msg("done!")

        if len(match_list) == 0:
            IssueIdentifier.log_msg(":-( no matches!")
            return

        # sort list by image match scores (lower distance == better match)
        match_list.sort(key=lambda k: k['distance'])

        l = []
        for i in match_list:
            l.append(i['distance'])
        IssueIdentifier.log_msg("Compared {0} covers".format(len(match_list)), newline=False)
        IssueIdentifier.log_msg(str(l))

        def print_match(item):
            IssueIdentifier.log_msg(u"-----> {0} #{1} {2} -- score: {3}\n-------> url:{4}".format(
                item['series'],
                item['issue_number'],
                item['issue_title'],
                item['distance'],
                item['img_url']))

        best_score = match_list[0]['distance']

        if len(match_list) == 1:
            if best_score > self.min_score_thresh:
                IssueIdentifier.log_msg("!!!! Very weak score for the cover. Maybe it's not the cover?")
            print_match(match_list[0])
            return
        elif best_score > self.min_score_thresh and len(match_list) > 1:
            IssueIdentifier.log_msg("No good image matches! Need to use other info...")
            return

        # now pare down list, remove any item more than specified distant from the top scores
        for item in reversed(match_list):
            if item['distance'] > best_score + self.min_score_distance:
                match_list.remove(item)

        if len(match_list) == 1:
            print_match(match_list[0])
            return
        elif len(match_list) == 0:
            IssueIdentifier.log_msg("No matches found :(")
            return
        else:
            # FIX: the original had a bare Python-2 `print` statement here to
            # emit a blank line; made explicit (and py3-safe) via log_msg.
            IssueIdentifier.log_msg("")
            IssueIdentifier.log_msg("More than one likley candiate. Maybe a lexical comparison??")
            for item in match_list:
                print_match(item)

199
tagger.py
View File

@ -21,27 +21,17 @@ limitations under the License.
"""
import sys
import getopt
import json
import xml
from pprint import pprint
from PyQt4 import QtCore, QtGui
import signal
import os
import math
import urllib2, urllib
from PyQt4 import QtCore, QtGui
from settings import ComicTaggerSettings
from taggerwindow import TaggerWindow
from options import Options, MetaDataStyle
from comicarchive import ComicArchive
from issueidentifier import IssueIdentifier
from comicvinetalker import ComicVineTalker
from comicvinecacher import ComicVineCacher
from comicinfoxml import ComicInfoXml
from comicbookinfo import ComicBookInfo
from imagehasher import ImageHasher
import utils
#-----------------------------
@ -52,188 +42,9 @@ def cliProcedure( opts, settings ):
print "Sorry, but "+ opts.filename + " is not a comic archive!"
return
cover_image_data = ca.getCoverPage()
if opts.image_hasher == '3':
cover_hash = ImageHasher( data=cover_image_data ).dct_average_hash()
elif opts.image_hasher == '2':
cover_hash = ImageHasher( data=cover_image_data ).average_hash2()
else:
cover_hash = ImageHasher( data=cover_image_data ).average_hash()
#print "Cover hash = {0:016x}".format(cover_hash)
# see if the archive has any useful meta data for searching with
if ca.hasCIX():
internal_metadata = ca.readCIX()
elif ca.hasCBI():
internal_metadata = ca.readCBI()
else:
internal_metadata = ca.readCBI()
# try to get some metadata from filename
md_from_filename = ca.metadataFromFilename()
ii = IssueIdentifier( ca, settings.cv_api_key )
ii.search()
# now figure out what we have to search with
search_series = internal_metadata.series
search_issue_number = internal_metadata.issueNumber
search_year = internal_metadata.publicationYear
search_month = internal_metadata.publicationMonth
if search_series is None:
search_series = md_from_filename.series
if search_issue_number is None:
search_issue_number = md_from_filename.issueNumber
if search_year is None:
search_year = md_from_filename.publicationYear
# we need, at minimum, a series and issue number
if search_series is None or search_issue_number is None:
print "Not enough info for a search!"
return
"""
print "Going to search for:"
print "Series: ", search_series
print "Issue : ", search_issue_number
if search_year is not None:
print "Year : ", search_year
if search_month is not None:
print "Month : ", search_month
"""
comicVine = ComicVineTalker( settings.cv_api_key )
#print ( "Searching for " + search_series + "...")
print "Searching for {0} #{1} ...".format( search_series, search_issue_number)
search_series = utils.removearticles( search_series )
cv_search_results = comicVine.searchForSeries( search_series )
#print "Found " + str(len(cv_search_results)) + " initial results"
series_shortlist = []
#print "Removing results with too long names"
for item in cv_search_results:
#assume that our search name is close to the actual name, say within 5 characters
if len( utils.removearticles(item['name'])) < len( search_series ) + 5:
series_shortlist.append(item)
# if we don't think it's an issue number 1, remove any series' that are one-shots
if search_issue_number != '1':
#print "Removing one-shots"
series_shortlist[:] = [x for x in series_shortlist if not x['count_of_issues'] == 1]
print "Searching in " + str(len(series_shortlist)) +" series"
# now sort the list by name length
series_shortlist.sort(key=lambda x: len(x['name']), reverse=False)
# Now we've got a list of series that we can dig into,
# and look for matching issue number, date, and cover image
match_list = []
print "Fetching issue data",
for series in series_shortlist:
#print "Fetching info for ID: {0} {1} ({2}) ...".format(
# series['id'],
# series['name'],
# series['start_year'])
print ".",
sys.stdout.flush()
cv_series_results = comicVine.fetchVolumeData( series['id'] )
issue_list = cv_series_results['issues']
for issue in issue_list:
# format the issue number string nicely, since it's usually something like "2.00"
num_f = float(issue['issue_number'])
num_s = str( int(math.floor(num_f)) )
if math.floor(num_f) != num_f:
num_s = str( num_f )
# look for a matching issue number
if num_s == search_issue_number:
# found a matching issue number! now get the issue data
img_url, thumb_url = comicVine.fetchIssueCoverURLs( issue['id'] )
#TODO get the image from URL, and calc hash!!
url_image_data = urllib.urlopen(thumb_url).read()
if opts.image_hasher == '3':
url_image_hash = ImageHasher( data=url_image_data ).dct_average_hash()
elif opts.image_hasher == '2':
url_image_hash = ImageHasher( data=url_image_data ).average_hash2()
else:
url_image_hash = ImageHasher( data=url_image_data ).average_hash()
match = dict()
match['series'] = "{0} ({1})".format(series['name'], series['start_year'])
match['distance'] = ImageHasher.hamming_distance(cover_hash, url_image_hash)
match['issue_number'] = num_s
match['url_image_hash'] = url_image_hash
match['issue_title'] = issue['name']
match['img_url'] = thumb_url
match_list.append(match)
break
print "done!"
if len(match_list) == 0:
print ":-( no matches!"
return
# sort list by image match scores
match_list.sort(key=lambda k: k['distance'])
print "Compared {0} covers".format(len(match_list)),
l = []
for i in match_list:
l.append( i['distance'] )
print l
def print_match(item):
print u"-----> {0} #{1} {2} -- score: {3}\n-------> url:{4}".format(
item['series'],
item['issue_number'],
item['issue_title'],
item['distance'],
item['img_url'])
best_score = match_list[0]['distance']
if len(match_list) == 1:
if best_score > 25:
print "!!!! Very weak score for the cover. Maybe it's not the cover?"
print_match(match_list[0])
return
elif best_score > 25 and len(match_list) > 1:
print "No good image matches! Need to use other info..."
return
#now pare down list, remove any item more than 2 distant from the top scores
for item in reversed(match_list):
if item['distance'] > best_score + 2:
match_list.remove(item)
if len(match_list) == 1:
print_match(match_list[0])
return
elif len(match_list) == 0:
print "No matches found :("
return
else:
print "More than one likley candiate. Maybe a lexical comparison??"
for item in match_list:
print_match(item)
"""
# now get the particular issue data
metadata = comicVine.fetchIssueData( series_id, opts.issue_number )