Added a caching class to reduce hits on the CV database

git-svn-id: http://comictagger.googlecode.com/svn/trunk@18 6c5673fe-1810-88d6-992b-cd32ca31540c
This commit is contained in:
beville@gmail.com 2012-11-09 04:02:14 +00:00
parent cc9c963c5b
commit 1486cce990
3 changed files with 400 additions and 19 deletions
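In short, callers treat the new class as a read-through cache: check it before hitting the Comic Vine API, and write fresh results back after a fetch. A minimal sketch of the intended round-trip (the folder path and do_comic_vine_search are placeholders, not part of this commit):

from comicvinecacher import ComicVineCacher

cvc = ComicVineCacher( "/path/to/settings/folder" )   # placeholder path

# read-through: serve from the cache when we can
results = cvc.get_search_results( "Spawn" )
if len( results ) == 0:
    results = do_comic_vine_search( "Spawn" )   # hypothetical API call
    cvc.add_search_results( "Spawn", results )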

comicvinecacher.py (new file, 303 additions)

@ -0,0 +1,303 @@
"""
A python class to manage caching of data from Comic Vine
"""
"""
Copyright 2012 Anthony Beville
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from pprint import pprint
import sqlite3 as lite
import sys
import os
import datetime
class ComicVineCacher:
def __init__(self, settings_folder ):
self.settings_folder = settings_folder
self.db_file = os.path.join( self.settings_folder, "cv_cache.db")
if not os.path.exists( self.db_file ):
self.create_cache_db()
def create_cache_db( self ):
# this will wipe out any existing version
open( self.db_file, 'w').close()
con = lite.connect( self.db_file )
# create tables
with con:
cur = con.cursor()
#name,id,start_year,publisher,image,description,count_of_issues
cur.execute("CREATE TABLE VolumeSearchCache(" +
"search_term TEXT," +
"id INT," +
"name TEXT," +
"start_year INT," +
"publisher TEXT," +
"count_of_issues INT," +
"image_url TEXT," +
"description TEXT," +
"timestamp TEXT)"
)
cur.execute("CREATE TABLE Volumes(" +
"id INT," +
"name TEXT," +
"publisher TEXT," +
"count_of_issues INT," +
"timestamp TEXT," +
"PRIMARY KEY (id) )"
)
cur.execute("CREATE TABLE Issues(" +
"id INT," +
"volume_id INT," +
"name TEXT," +
"issue_number TEXT," +
"image_url TEXT," +
"image_hash TEXT," +
"thumb_image_url TEXT," +
"thumb_image_hash TEXT," +
"timestamp TEXT," +
"PRIMARY KEY (id ) )"
)
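# note (not part of this commit's logic): image_hash and thumb_image_hash
# are created above but nothing in this commit writes to them yet; they
# appear to be reserved for caching cover-hash results later.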
def add_search_results( self, search_term, cv_search_results ):
con = lite.connect( self.db_file )
with con:
cur = con.cursor()
# remove all previous entries with this search term (parameterized, so
# quotes in the term can't break the SQL)
cur.execute("DELETE FROM VolumeSearchCache WHERE search_term = ?", [ search_term.lower() ])
# now add in new results
for record in cv_search_results:
timestamp = datetime.datetime.now()
if record['publisher'] is None:
pub_name = ""
else:
pub_name = record['publisher']['name']
if record['image'] is None:
url = ""
else:
url = record['image']['super_url']
cur.execute("INSERT INTO VolumeSearchCache VALUES( ?, ?, ?, ?, ?, ?, ?, ?, ? )" ,
( search_term.lower(),
record['id'],
record['name'],
record['start_year'],
pub_name,
record['count_of_issues'],
url,
record['description'],
timestamp )
)
def get_search_results( self, search_term ):
results = list()
con = lite.connect( self.db_file )
with con:
cur = con.cursor()
# TODO purge stale search results ( older than a day, maybe??)
# fetch
cur.execute("SELECT * FROM VolumeSearchCache WHERE search_term=?", [ search_term.lower() ] )
rows = cur.fetchall()
# now process the results
for record in rows:
result = dict()
result['id'] = record[1]
result['name'] = record[2]
result['start_year'] = record[3]
result['publisher'] = dict()
result['publisher']['name'] = record[4]
result['count_of_issues'] = record[5]
result['image'] = dict()
result['image']['super_url'] = record[6]
result['description'] = record[7]
results.append(result)
return results
def add_volume_info( self, cv_volume_record ):
con = lite.connect( self.db_file )
with con:
cur = con.cursor()
timestamp = datetime.datetime.now()
data = {
"name": cv_volume_record['name'],
"publisher": cv_volume_record['publisher']['name'],
"count_of_issues": cv_volume_record['count_of_issues'],
"timestamp": timestamp
}
self.upsert( cur, "volumes", "id", cv_volume_record['id'], data)
# now add in issues
for issue in cv_volume_record['issues']:
data = {
"volume_id": cv_volume_record['id'],
"name": issue['name'],
"issue_number": issue['issue_number'],
"timestamp": timestamp
}
self.upsert( cur, "issues" , "id", issue['id'], data)
def get_volume_info( self, volume_id ):
result = None
con = lite.connect( self.db_file )
with con:
cur = con.cursor()
# TODO purge stale volume records ( older than a week, maybe??)
# fetch
cur.execute("SELECT id,name,publisher,count_of_issues FROM Volumes WHERE id = ?", [ volume_id ] )
row = cur.fetchone()
if row is None :
return result
result = dict()
#since ID is primary key, there is only one row
result['id'] = row[0]
result['name'] = row[1]
result['publisher'] = dict()
result['publisher']['name'] = row[2]
result['count_of_issues'] = row[3]
result['issues'] = list()
cur.execute("SELECT id,name,issue_number,image_url,image_hash FROM Issues WHERE volume_id = ?", [ volume_id ] )
rows = cur.fetchall()
# now process the results
for row in rows:
record = dict()
record['id'] = row[0]
record['name'] = row[1]
record['issue_number'] = row[2]
record['image_url'] = row[3]
record['image_hash'] = row[4]
result['issues'].append(record)
return result
def add_issue_image_url( self, issue_id, image_url ):
con = lite.connect( self.db_file )
with con:
cur = con.cursor()
timestamp = datetime.datetime.now()
data = {
"image_url": image_url,
"timestamp": timestamp
}
self.upsert( cur, "issues" , "id", issue_id, data)
def get_issue_image_url( self, issue_id ):
con = lite.connect( self.db_file )
with con:
cur = con.cursor()
cur.execute("SELECT image_url FROM Issues WHERE id=?", [ issue_id ])
row = cur.fetchone()
# guard against a missing row as well as a NULL url
if row is None or row[0] is None :
return None
else:
return row[0]
def upsert( self, cur, tablename, pkname, pkval, data):
"""
This does an insert if the given PK doesn't exist, and an update if it does
"""
# TODO - look into checking if UPDATE is needed
# TODO - should the cursor be created here, and not up the stack?
ins_count = len(data) + 1
keys = ""
vals = list()
ins_slots = ""
set_slots = ""
for key in data:
if keys != "":
keys += ", "
if ins_slots != "":
ins_slots += ", "
if set_slots != "":
set_slots += ", "
keys += key
vals.append( data[key] )
ins_slots += "?"
set_slots += key + " = ?"
keys += ", " + pkname
vals.append( pkval )
ins_slots += ", ?"
condition = pkname + " = ?"
sql_ins = ( "INSERT OR IGNORE INTO " + tablename +
" ( " + keys + " ) " +
" VALUES ( " + ins_slots + " )" )
cur.execute( sql_ins , vals )
sql_upd = ( "UPDATE " + tablename +
" SET " + set_slots + " WHERE " + condition )
cur.execute( sql_upd , vals )
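Two TODOs above note that stale rows should eventually be purged. A sketch of what that could look like as one more method on ComicVineCacher (the name purge_stale_records and the one-day/one-week TTLs are assumptions taken from the TODO comments, not part of this commit):

def purge_stale_records( self ):
    con = lite.connect( self.db_file )
    with con:
        cur = con.cursor()
        # timestamps are stored as ISO-format TEXT, so they compare
        # chronologically as plain strings
        a_day_ago = datetime.datetime.now() - datetime.timedelta(days=1)
        a_week_ago = datetime.datetime.now() - datetime.timedelta(days=7)
        cur.execute("DELETE FROM VolumeSearchCache WHERE timestamp < ?", [ a_day_ago ])
        cur.execute("DELETE FROM Volumes WHERE timestamp < ?", [ a_week_ago ])
        cur.execute("DELETE FROM Issues WHERE timestamp < ?", [ a_week_ago ])

One design note on upsert: doing INSERT OR IGNORE followed by an UPDATE (rather than INSERT OR REPLACE) means a partial write like add_issue_image_url only touches the columns it names, leaving an issue's other cached fields intact.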

comicvinetalker.py

@ -26,7 +26,8 @@ import math
import re
import utils
from settings import ComicTaggerSettings
from comicvinecacher import ComicVineCacher
from genericmetadata import GenericMetadata
class ComicVineTalker:
@ -48,6 +49,16 @@ class ComicVineTalker:
def searchForSeries( self, series_name ):
# before we search online, look in our cache, since we might have
# done this same search recently
cvc = ComicVineCacher( ComicTaggerSettings.getSettingsFolder() )
cached_search_results = cvc.get_search_results( series_name )
if len (cached_search_results) > 0:
return cached_search_results
original_series_name = series_name
series_name = urllib.quote_plus(str(series_name))
search_url = "http://api.comicvine.com/search/?api_key=" + self.api_key + "&format=json&resources=volume&query=" + series_name + "&field_list=name,id,start_year,publisher,image,description,count_of_issues&sort=start_year"
@ -95,9 +106,21 @@ class ComicVineTalker:
#print "{0}: {1} ({2})".format(search_results['results'][0]['id'], smart_str(search_results['results'][0]['name']) , search_results['results'][0]['start_year'] )
# cache these search results
cvc.add_search_results( original_series_name, search_results )
return search_results
def fetchVolumeData( self, series_id ):
# before we search online, look in our cache, since we might already
# have this info
cvc = ComicVineCacher( ComicTaggerSettings.getSettingsFolder() )
cached_volume_result = cvc.get_volume_info( series_id )
if cached_volume_result is not None:
return cached_volume_result
volume_url = "http://api.comicvine.com/volume/" + str(series_id) + "/?api_key=" + self.api_key + "&format=json"
#print "search_url = : ", volume_url
@ -113,6 +136,8 @@ class ComicVineTalker:
volume_results = cv_response['results']
cvc.add_volume_info( volume_results )
return volume_results
@ -208,6 +233,14 @@ class ComicVineTalker:
def fetchIssueCoverURL( self, issue_id ):
# before we search online, look in our cache, since we might already
# have this info
cvc = ComicVineCacher( ComicTaggerSettings.getSettingsFolder() )
cached_image_url = cvc.get_issue_image_url( issue_id )
if cached_image_url is not None:
return cached_image_url
issue_url = "http://api.comicvine.com/issue/" + str(issue_id) + "/?api_key=" + self.api_key + "&format=json&field_list=image"
resp = urllib2.urlopen(issue_url)
content = resp.read()
@ -216,6 +249,7 @@ class ComicVineTalker:
print ( "Comic Vine query failed with error: [{0}]. ".format( cv_response[ 'error' ] ))
return None
cvc.add_issue_image_url( issue_id, cv_response['results']['image']['super_url'] )
return cv_response['results']['image']['super_url']
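All three talker methods now share the same read-through shape; condensed here for illustration only (generic names, not actual code from this commit):

def cached_fetch( cache_get, cache_put, fetch, key ):
    # hypothetical helper: the commit inlines this pattern in
    # searchForSeries, fetchVolumeData, and fetchIssueCoverURL
    cached = cache_get( key )
    if cached:
        return cached
    result = fetch( key )
    cache_put( key, result )
    return result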

(CLI tagger script)

@ -38,6 +38,7 @@ from options import Options, MetaDataStyle
from comicarchive import ComicArchive
from comicvinetalker import ComicVineTalker
from comicvinecacher import ComicVineCacher
from comicinfoxml import ComicInfoXml
from comicbookinfo import ComicBookInfo
from imagehasher import ImageHasher
@ -105,6 +106,16 @@ def cliProcedure( opts, settings ):
print ( "Searching for " + search_series + "...")
cv_search_results = comicVine.searchForSeries( search_series )
#---------- TEST
#cvc = ComicVineCacher( settings.folder )
#cvc.add_search_results( search_series, cv_search_results )
#cached_search_results = cvc.get_search_results( search_series)
#for r in cached_search_results:
# print "{0}: {1} ({2})".format( r['id'], r['name'], r['start_year'])
#quit()
#---------- TEST
print "Found " + str(len(cv_search_results)) + " initial results"
@ -129,7 +140,8 @@ def cliProcedure( opts, settings ):
# Now we've got a list of series that we can dig into,
# and look for matching issue number, date, and cover image
match_list = []
for series in series_shortlist:
#print series['id'], series['name'], series['start_year'], series['count_of_issues']
print "Fetching info for ID: {0} {1} ({2}) ...".format(
@ -156,24 +168,60 @@ def cliProcedure( opts, settings ):
#url_image_hash = ImageHasher( data=url_image_data ).average_hash()
url_image_hash = ImageHasher( data=url_image_data, ).average_hash2()
#url_image_hash = ImageHasher( data=url_image_data, width=32, height=32 ).perceptual_hash()
print u"-----> ID: {0} #{1} ({2}) Hash: {3} Distance: {4}\n-------> url:{5}".format(
issue['id'], num_s, issue['name'],
url_image_hash,
ImageHasher.hamming_distance(cover_hash, url_image_hash),
img_url)
match = dict()
match['series'] = "{0} ({1})".format(series['name'], series['start_year'])
match['distance'] = ImageHasher.hamming_distance(cover_hash, url_image_hash)
match['issue_number'] = num_s
match['issue_title'] = issue['name']
match['img_url'] = img_url
match_list.append(match)
break
print "Compared covers for {0} issues".format(len(match_list))
# sort list by image match scores
match_list.sort(key=lambda k: k['distance'])
#helper
def print_match(item):
print u"-----> {0} #{1} {2} -- score: {3}\n-------> url:{4}".format(
item['series'],
item['issue_number'],
item['issue_title'],
item['distance'],
item['img_url'])
if len(match_list) == 0:
print "No matches found :("
return
# safe to index now that the list is known to be non-empty
best_score = match_list[0]['distance']
if len(match_list) == 1:
print_match(match_list[0])
return
elif best_score > 20:
print "No good image matches! Need to use other info..."
return
#now pare down the list: remove any item more than 2 distant from the top score (see illustration below)
for item in reversed(match_list):
if item['distance'] > best_score + 2:
match_list.remove(item)
if len(match_list) == 1:
print_match(match_list[0])
return
else:
print "More than one likley candiate. Maybe a lexical comparison??"
for item in match_list:
print_match(item)
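# For illustration (not part of this commit): with made-up distances
# [3, 4, 6, 11], best_score is 3 and only 3 and 4 survive the pare-down,
# since 6 and 11 are more than best_score + 2 away. Removing while
# iterating over reversed(match_list) is safe here, because each removal
# only shifts items that were already visited.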
"""
#error checking here: did we get any results?
# we will eventually want user interaction to choose the appropriate result, but for now, assume the first one
series_id = cv_search_results[0]['id']
print( "-->Auto-selecting volume ID:", cv_search_results[0]['id'] )
print(" ")
# now get the particular issue data
metadata = comicVine.fetchIssueData( series_id, opts.issue_number )
@ -182,9 +230,6 @@ def cliProcedure( opts, settings ):
ca = ComicArchive(opts.filename)
ca.writeMetadata( metadata, opts.data_style )
#debugging
ComicBookInfo().writeToExternalFile( "test.json" )
ComicBookInfo().writeToExternalFile( "test.xml" )
"""
#-----------------------------
@ -192,7 +237,6 @@ def main():
opts = Options()
opts.parseCmdLineArgs()
settings = ComicTaggerSettings()
# make sure unrar program is in the path for the UnRAR class
utils.addtopath(os.path.dirname(settings.unrar_exe_path))