autoselect now uses the issue year to help filter

git-svn-id: http://comictagger.googlecode.com/svn/trunk@54 6c5673fe-1810-88d6-992b-cd32ca31540c
This commit is contained in:
beville@gmail.com 2012-11-17 03:02:04 +00:00
parent cdb22347ab
commit 9f9d9a2635
7 changed files with 126 additions and 45 deletions

View File

@ -76,6 +76,8 @@ class ComicVineCacher:
"image_hash TEXT," +
"thumb_image_url TEXT," +
"thumb_image_hash TEXT," +
"publish_month TEXT," +
"publish_year TEXT," +
"timestamp TEXT," +
"PRIMARY KEY (id ) )"
)
@ -229,7 +231,7 @@ class ComicVineCacher:
return result
def add_issue_image_url( self, issue_id, image_url, thumb_image_url ):
def add_issue_select_details( self, issue_id, image_url, thumb_image_url, publish_month, publish_year ):
con = lite.connect( self.db_file )
@ -240,25 +242,27 @@ class ComicVineCacher:
data = {
"image_url": image_url,
"thumb_image_url": thumb_image_url,
"publish_month": publish_month,
"publish_year": publish_year,
"timestamp": timestamp
}
self.upsert( cur, "issues" , "id", issue_id, data)
def get_issue_image_url( self, issue_id ):
def get_issue_select_details( self, issue_id ):
con = lite.connect( self.db_file )
with con:
cur = con.cursor()
cur.execute("SELECT image_url,thumb_image_url FROM Issues WHERE id=?", [ issue_id ])
cur.execute("SELECT image_url,thumb_image_url,publish_month,publish_year FROM Issues WHERE id=?", [ issue_id ])
row = cur.fetchone()
if row[0] is None :
return None, None
return None, None, None, None
else:
return row[0],row[1]
return row[0],row[1],row[2],row[3]
def upsert( self, cur, tablename, pkname, pkval, data):

View File

@ -250,38 +250,47 @@ class ComicVineTalker(QObject):
return newstring
def fetchIssueDate( self, issue_id ):
image_url, thumb_url, month,year = self.fetchIssueSelectDetails( issue_id )
return month, year
def fetchIssueCoverURLs( self, issue_id ):
image_url, thumb_url, month,year = self.fetchIssueSelectDetails( issue_id )
return image_url, thumb_url
def fetchIssueSelectDetails( self, issue_id ):
cached_image_url,cached_thumb_url = self.fetchCachedIssueCoverURLs( issue_id )
cached_image_url,cached_thumb_url,cached_month,cached_year = self.fetchCachedIssueSelectDetails( issue_id )
if cached_image_url is not None:
return cached_image_url,cached_thumb_url
return cached_image_url,cached_thumb_url, cached_month, cached_year
issue_url = "http://api.comicvine.com/issue/" + str(issue_id) + "/?api_key=" + self.api_key + "&format=json&field_list=image"
issue_url = "http://api.comicvine.com/issue/" + str(issue_id) + "/?api_key=" + self.api_key + "&format=json&field_list=image,publish_month,publish_year"
resp = urllib2.urlopen(issue_url)
content = resp.read()
cv_response = json.loads(content)
if cv_response[ 'status_code' ] != 1:
print ( "Comic Vine query failed with error: [{0}]. ".format( cv_response[ 'error' ] ))
return None, None
return None, None,None,None
image_url = cv_response['results']['image']['super_url']
thumb_url = cv_response['results']['image']['thumb_url']
year = cv_response['results']['publish_year']
month = cv_response['results']['publish_month']
if image_url is not None:
self.cacheIssueCoverURLs( issue_id, image_url,thumb_url )
return image_url,thumb_url
self.cacheIssueSelectDetails( issue_id, image_url,thumb_url, month, year )
return image_url,thumb_url,month,year
def fetchCachedIssueCoverURLs( self, issue_id ):
def fetchCachedIssueSelectDetails( self, issue_id ):
# before we search online, look in our cache, since we might already
# have this info
cvc = ComicVineCacher( ComicTaggerSettings.getSettingsFolder() )
return cvc.get_issue_image_url( issue_id )
return cvc.get_issue_select_details( issue_id )
def cacheIssueCoverURLs( self, issue_id, image_url,thumb_url ):
def cacheIssueSelectDetails( self, issue_id, image_url, thumb_url, month, year ):
cvc = ComicVineCacher( ComicTaggerSettings.getSettingsFolder() )
cvc.add_issue_image_url( issue_id, image_url, thumb_url )
cvc.add_issue_select_details( issue_id, image_url, thumb_url, month, year )
#---------------------------------------------------------------------------
@ -290,12 +299,12 @@ class ComicVineTalker(QObject):
def asyncFetchIssueCoverURLs( self, issue_id ):
self.issue_id = issue_id
cached_image_url,cached_thumb_url = self.fetchCachedIssueCoverURLs( issue_id )
cached_image_url,cached_thumb_url,month,year = self.fetchCachedIssueSelectDetails( issue_id )
if cached_image_url is not None:
self.urlFetchComplete.emit( cached_image_url,cached_thumb_url, self.issue_id )
return
issue_url = "http://api.comicvine.com/issue/" + str(issue_id) + "/?api_key=" + self.api_key + "&format=json&field_list=image"
issue_url = "http://api.comicvine.com/issue/" + str(issue_id) + "/?api_key=" + self.api_key + "&format=json&field_list=image,publish_month,publish_year"
self.nam = QNetworkAccessManager()
self.nam.finished.connect( self.asyncFetchIssueCoverURLComplete )
self.nam.get(QNetworkRequest(QUrl(issue_url)))
@ -311,8 +320,10 @@ class ComicVineTalker(QObject):
image_url = cv_response['results']['image']['super_url']
thumb_url = cv_response['results']['image']['thumb_url']
year = cv_response['results']['publish_year']
month = cv_response['results']['publish_month']
self.cacheIssueCoverURLs( self.issue_id, image_url, thumb_url )
self.cacheIssueSelectDetails( self.issue_id, image_url, thumb_url, month, year )
self.urlFetchComplete.emit( image_url, thumb_url, self.issue_id )

View File

@ -43,14 +43,26 @@ class IssueIdentifier:
def __init__(self, comic_archive, cv_api_key ):
self.comic_archive = comic_archive
self.image_hasher = 1
self.additional_metadata = None
self.min_score_thresh = 22
self.onlyUseAdditionalMetaData = False
# a decent hamming score, good enough to call it a match
self.min_score_thresh = 20
# the min distance a hamming score must be to separate itself from closest neighbor
self.min_score_distance = 2
# a very strong hamming score, almost certainly the same image
self.strong_score_thresh = 8
# used to eliminate series names that are too long based on our search string
self.length_delta_thresh = 3
self.additional_metadata = GenericMetadata()
self.cv_api_key = cv_api_key
self.output_function = IssueIdentifier.defaultWriteOutput
self.callback = None
self.search_result = self.ResultNoMatches
def setScoreMinThreshold( self, thresh ):
self.min_score_thresh = thresh
@ -91,7 +103,14 @@ class IssueIdentifier:
if ca is None:
return
if self.onlyUseAdditionalMetaData:
search_keys['series'] = self.additional_metadata.series
search_keys['issue_number'] = self.additional_metadata.issueNumber
search_keys['year'] = self.additional_metadata.publicationYear
search_keys['month'] = self.additional_metadata.publicationMonth
return search_keys
# see if the archive has any useful meta data for searching with
if ca.hasCIX():
internal_metadata = ca.readCIX()
@ -128,7 +147,7 @@ class IssueIdentifier:
search_keys['year'] = internal_metadata.publicationYear
else:
search_keys['year'] = md_from_filename.publicationYear
if self.additional_metadata.publicationMonth is not None:
search_keys['month'] = self.additional_metadata.publicationMonth
elif internal_metadata.publicationMonth is not None:
@ -156,7 +175,7 @@ class IssueIdentifier:
if not ca.seemsToBeAComicArchive():
self.log_msg( "Sorry, but "+ opts.filename + " is not a comic archive!")
return []
return self.ResultNoMatches, []
cover_image_data = ca.getCoverPage()
@ -171,7 +190,7 @@ class IssueIdentifier:
self.log_msg("Not enough info for a search!")
return []
"""
self.log_msg( "Going to search for:" )
self.log_msg( "Series: " + keys['series'] )
self.log_msg( "Issue : " + keys['issue_number'] )
@ -179,7 +198,7 @@ class IssueIdentifier:
self.log_msg( "Year : " + keys['year'] )
if keys['month'] is not None:
self.log_msg( "Month : " + keys['month'] )
"""
comicVine = ComicVineTalker( self.cv_api_key )
#self.log_msg( ( "Searching for " + keys['series'] + "...")
@ -195,8 +214,10 @@ class IssueIdentifier:
#self.log_msg( "Removing results with too long names" )
for item in cv_search_results:
#assume that our search name is close to the actual name, say within 5 characters
if len( utils.removearticles(item['name'])) < len( keys['series'] ) + 5:
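# assume that our search name is close to the actual name: keep a series only if its name is fewer than self.length_delta_thresh characters longer than the search key (articles removed from both)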
shortened_key = utils.removearticles(keys['series'])
shortened_item_name = utils.removearticles(item['name'])
if len( shortened_item_name ) < ( len( shortened_key ) + self.length_delta_thresh) :
series_shortlist.append(item)
# if we don't think it's an issue number 1, remove any series that are one-shots
@ -241,6 +262,17 @@ class IssueIdentifier:
if num_s == keys['issue_number']:
# found a matching issue number! now get the issue data
img_url, thumb_url = comicVine.fetchIssueCoverURLs( issue['id'] )
month, year = comicVine.fetchIssueDate( issue['id'] )
if self.cancel == True:
self.match_list = []
return self.match_list
# now, if we have an issue year key given, reject this one if not a match
if keys['year'] is not None:
if keys['year'] != year:
break
url_image_data = ImageFetcher().fetch(thumb_url, blocking=True)
if self.cancel == True:
@ -258,6 +290,8 @@ class IssueIdentifier:
match['img_url'] = thumb_url
match['issue_id'] = issue['id']
match['volume_id'] = series['id']
match['month'] = month
match['year'] = year
self.match_list.append(match)
self.log_msg( " --> {0}".format(match['distance']), newline=False )
@ -268,8 +302,10 @@ class IssueIdentifier:
if len(self.match_list) == 0:
self.log_msg( ":-( no matches!" )
self.search_result = self.ResultNoMatches
return self.match_list
# sort list by image match scores
self.match_list.sort(key=lambda k: k['distance'])
@ -281,20 +317,22 @@ class IssueIdentifier:
self.log_msg( str(l))
def print_match(item):
self.log_msg( u"-----> {0} #{1} {2} -- score: {3}".format(
self.log_msg( u"-----> {0} #{1} {2} ({3}/{4}) -- score: {5}".format(
item['series'],
item['issue_number'],
item['issue_title'],
item['month'],
item['year'],
item['distance']) )
best_score = self.match_list[0]['distance']
if len(self.match_list) == 1:
self.search_result = self.ResultOneGoodMatch
if best_score > self.min_score_thresh:
self.log_msg( "!!!! Very weak score for the cover. Maybe it's not the cover?" )
self.log_msg( "Comparing other pages now..." )
self.log_msg( "Comparing other archive pages now..." )
found = False
for i in range(ca.getNumberOfPages()):
image_data = ca.getPage(i)
@ -311,12 +349,15 @@ class IssueIdentifier:
self.log_msg( "" )
if not found:
self.log_msg( "No matching pages in the issue. Bummer" )
self.search_result = self.ResultFoundMatchButBadCoverScore
print_match(self.match_list[0])
return self.match_list
elif best_score > self.min_score_thresh and len(self.match_list) > 1:
self.log_msg( "No good image matches! Need to use other info..." )
self.search_result = self.ResultMultipleMatchesWithBadImageScores
return self.match_list
#now pare down list, remove any item more than specified distant from the top scores
@ -326,11 +367,15 @@ class IssueIdentifier:
if len(self.match_list) == 1:
print_match(self.match_list[0])
self.search_result = self.ResultOneGoodMatch
elif len(self.match_list) == 0:
self.log_msg( "No matches found :(" )
self.search_result = self.ResultNoMatches
else:
print
self.log_msg( "More than one likley candiate. Maybe a lexical comparison??" )
self.log_msg( "More than one likley candiate." )
self.search_result = self.ResultMultipleGoodMatches
for item in self.match_list:
print_match(item)

View File

@ -848,8 +848,12 @@ class TaggerWindow( QtGui.QMainWindow):
return
issue_number = str(self.leIssueNum.text()).strip()
selector = VolumeSelectionWindow( self, self.settings.cv_api_key, series_name, issue_number, self.comic_archive, self.settings, autoselect )
year = str(self.lePubYear.text()).strip()
if year == "":
year = None
selector = VolumeSelectionWindow( self, self.settings.cv_api_key, series_name, issue_number, year, self.comic_archive, self.settings, autoselect )
title = "Search: '" + series_name + "' - "
selector.setWindowTitle( title + "Select Series")

View File

@ -3,12 +3,8 @@
Features
----------------
Auto-select:
msgbox on autoselect failure, or warning
Multi-match dialog
More auto-select logic using metadata
Maybe, if only one match, but bad score, compare each page in the archive to online cover
Check aspect ratio, and maybe break cover into two parts for hashing?
Stand-alone CLI
@ -19,7 +15,7 @@ Stand-alone CLI
TaggerWindow entry fields
Special tabbed Dialogs needed for:
Special tabbed Dialog needed for:
Pages Info - maybe a custom painted widget
At minimum, preserve the page data
@ -44,7 +40,7 @@ Disable CBL for RAR
SERIOUS BUG: rebuilding zips!
http://stackoverflow.com/questions/11578443/trigger-io-errno-18-cross-device-link
MAC:
OSX:
toolbar
weird unrar complaints
Page browser sizing
@ -75,8 +71,7 @@ Image Hashes:
Filename parsing:
Concatenation of Name and Issue??
"1602"
Issue identifier - compare names with articles removed
App option to convert RAR to ZIP

View File

@ -54,6 +54,7 @@ def addtopath( dir ):
os.environ['PATH'] = dir + os.pathsep + os.environ['PATH']
def removearticles( text ):
text = text.lower()
articles = ['and', 'the', 'a', '&' ]
newText = ''
for word in text.split(' '):

View File

@ -77,12 +77,11 @@ class IdentifyThread( QtCore.QThread):
def run(self):
matches =self.identifier.search()
self.identifyComplete.emit( )
class VolumeSelectionWindow(QtGui.QDialog):
def __init__(self, parent, cv_api_key, series_name, issue_number, comic_archive, settings, autoselect=False):
def __init__(self, parent, cv_api_key, series_name, issue_number, year, comic_archive, settings, autoselect=False):
super(VolumeSelectionWindow, self).__init__(parent)
uic.loadUi(os.path.join(ComicTaggerSettings.baseDir(), 'volumeselectionwindow.ui' ), self)
@ -90,6 +89,7 @@ class VolumeSelectionWindow(QtGui.QDialog):
self.settings = settings
self.series_name = series_name
self.issue_number = issue_number
self.year = year
self.cv_api_key = cv_api_key
self.volume_id = 0
self.comic_archive = comic_archive
@ -121,7 +121,10 @@ class VolumeSelectionWindow(QtGui.QDialog):
md = GenericMetadata()
md.series = self.series_name
md.issueNumber = self.issue_number
md.publicationYear = self.year
self.ii.setAdditionalMetadata( md )
self.ii.onlyUseAdditionalMetaData = True
self.id_thread = IdentifyThread( self.ii )
self.id_thread.identifyComplete.connect( self.identifyComplete )
@ -147,7 +150,25 @@ class VolumeSelectionWindow(QtGui.QDialog):
def identifyComplete( self ):
matches = self.ii.match_list
if len(matches) == 1:
result = self.ii.search_result
found_match = False
if result == self.ii.ResultNoMatches:
QtGui.QMessageBox.information(self,"Auto-Select Result", " No matches found :-(")
elif result == self.ii.ResultFoundMatchButBadCoverScore:
QtGui.QMessageBox.information(self,"Auto-Select Result", " Found a match, but cover doesn't seem to match. Verify before commiting!")
found_match = True
elif result == self.ii.ResultFoundMatchButNotFirstPage :
QtGui.QMessageBox.information(self,"Auto-Select Result", " Found a match, but not with the first page of the archive.")
found_match = True
elif result == self.ii.ResultMultipleMatchesWithBadImageScores:
QtGui.QMessageBox.information(self,"Auto-Select Result", " Found some possibilities, but no confidence. Proceed manually.")
elif result == self.ii.ResultOneGoodMatch:
found_match = True
elif result == self.ii.ResultMultipleGoodMatches:
QtGui.QMessageBox.information(self,"Auto-Select Result", " Found multiple likely matches! Selection DIALOG TBD.")
if found_match:
self.iddialog.accept()
print "VolumeSelectionWindow found a match!!", matches[0]['volume_id'], matches[0]['issue_number']