Added a caching class to reduce hits on the CV database
git-svn-id: http://comictagger.googlecode.com/svn/trunk@18 6c5673fe-1810-88d6-992b-cd32ca31540c
parent cc9c963c5b, commit 1486cce990

comicvinecacher.py (new file, 303 lines)
@@ -0,0 +1,303 @@
"""
A python class to manage caching of data from Comic Vine
"""

"""
Copyright 2012 Anthony Beville

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

from pprint import pprint

import sqlite3 as lite
import sys
import os
import datetime

class ComicVineCacher:

    def __init__(self, settings_folder ):
        self.settings_folder = settings_folder
        self.db_file = os.path.join( self.settings_folder, "cv_cache.db")

        if not os.path.exists( self.db_file ):
            self.create_cache_db()
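The constructor creates the cache database lazily, on first use. A minimal usage sketch, assuming a writable settings folder (the path below is made up for illustration):

    # hypothetical example folder; any writable directory works
    cvc = ComicVineCacher( "/tmp/comictagger_settings" )
    results = cvc.get_search_results( "watchmen" )
    if len(results) == 0:
        # cache miss: a caller would query the Comic Vine API here,
        # then store the results via cvc.add_search_results()
        pass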
    def create_cache_db( self ):

        # this will wipe out any existing version
        open( self.db_file, 'w').close()

        con = lite.connect( self.db_file )

        # create tables
        with con:

            cur = con.cursor()
            #name,id,start_year,publisher,image,description,count_of_issues
            cur.execute("CREATE TABLE VolumeSearchCache(" +
                        "search_term TEXT," +
                        "id INT," +
                        "name TEXT," +
                        "start_year INT," +
                        "publisher TEXT," +
                        "count_of_issues INT," +
                        "image_url TEXT," +
                        "description TEXT," +
                        "timestamp TEXT)"
                        )

            cur.execute("CREATE TABLE Volumes(" +
                        "id INT," +
                        "name TEXT," +
                        "publisher TEXT," +
                        "count_of_issues INT," +
                        "timestamp TEXT," +
                        "PRIMARY KEY (id) )"
                        )

            cur.execute("CREATE TABLE Issues(" +
                        "id INT," +
                        "volume_id INT," +
                        "name TEXT," +
                        "issue_number TEXT," +
                        "image_url TEXT," +
                        "image_hash TEXT," +
                        "thumb_image_url TEXT," +
                        "thumb_image_hash TEXT," +
                        "timestamp TEXT," +
                        "PRIMARY KEY (id ) )"
                        )
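Because the sqlite3 connection is used as a context manager, the table creation commits automatically when the with block exits cleanly. A quick sketch to verify the resulting schema, assuming a ComicVineCacher instance named cvc:

    # list the tables actually created in the cache file
    con = lite.connect( cvc.db_file )
    cur = con.cursor()
    cur.execute("SELECT name FROM sqlite_master WHERE type='table'")
    print cur.fetchall()   # expect VolumeSearchCache, Volumes, Issues
    con.close()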
    def add_search_results( self, search_term, cv_search_results ):

        con = lite.connect( self.db_file )

        with con:

            cur = con.cursor()

            # remove all previous entries with this search term
            # (bind the term as a parameter so quotes in it can't break the SQL)
            cur.execute("DELETE FROM VolumeSearchCache WHERE search_term = ?", [ search_term.lower() ])

            # now add in new results
            for record in cv_search_results:
                timestamp = datetime.datetime.now()

                if record['publisher'] is None:
                    pub_name = ""
                else:
                    pub_name = record['publisher']['name']

                if record['image'] is None:
                    url = ""
                else:
                    url = record['image']['super_url']

                cur.execute("INSERT INTO VolumeSearchCache VALUES( ?, ?, ?, ?, ?, ?, ?, ?, ? )" ,
                            ( search_term.lower(),
                              record['id'],
                              record['name'],
                              record['start_year'],
                              pub_name,
                              record['count_of_issues'],
                              url,
                              record['description'],
                              timestamp )
                            )
    def get_search_results( self, search_term ):

        results = list()
        con = lite.connect( self.db_file )
        with con:
            cur = con.cursor()

            # TODO purge stale search results ( older than a day, maybe??)

            # fetch
            cur.execute("SELECT * FROM VolumeSearchCache WHERE search_term=?", [ search_term.lower() ] )
            rows = cur.fetchall()
            # now process the results
            for record in rows:

                result = dict()
                result['id'] = record[1]
                result['name'] = record[2]
                result['start_year'] = record[3]
                result['publisher'] = dict()
                result['publisher']['name'] = record[4]
                result['count_of_issues'] = record[5]
                result['image'] = dict()
                result['image']['super_url'] = record[6]
                result['description'] = record[7]

                results.append(result)

        return results
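The purge TODO above could lean on the stored timestamp column. A sketch, with the caveat that the cached timestamps come from datetime.datetime.now() (local time) while SQLite's datetime('now') is UTC, so a real implementation should store UTC timestamps before relying on this comparison:

    # drop cached search results older than a day (sketch)
    cur.execute("DELETE FROM VolumeSearchCache" +
                " WHERE timestamp < datetime('now','-1 day')")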
    def add_volume_info( self, cv_volume_record ):

        con = lite.connect( self.db_file )

        with con:

            cur = con.cursor()

            timestamp = datetime.datetime.now()

            data = {
                "name": cv_volume_record['name'],
                "publisher": cv_volume_record['publisher']['name'],
                "count_of_issues": cv_volume_record['count_of_issues'],
                "timestamp": timestamp
            }
            self.upsert( cur, "volumes", "id", cv_volume_record['id'], data)

            # now add in issues

            for issue in cv_volume_record['issues']:

                data = {
                    "volume_id": cv_volume_record['id'],
                    "name": issue['name'],
                    "issue_number": issue['issue_number'],
                    "timestamp": timestamp
                }
                self.upsert( cur, "issues" , "id", issue['id'], data)
    def get_volume_info( self, volume_id ):

        result = None

        con = lite.connect( self.db_file )
        with con:
            cur = con.cursor()

            # TODO purge stale volume records ( older than a week, maybe??)

            # fetch
            cur.execute("SELECT id,name,publisher,count_of_issues FROM Volumes WHERE id = ?", [ volume_id ] )

            row = cur.fetchone()

            if row is None :
                return result

            result = dict()

            # since ID is the primary key, there is only one row
            result['id'] = row[0]
            result['name'] = row[1]
            result['publisher'] = dict()
            result['publisher']['name'] = row[2]
            result['count_of_issues'] = row[3]
            result['issues'] = list()

            cur.execute("SELECT id,name,issue_number,image_url,image_hash FROM Issues WHERE volume_id = ?", [ volume_id ] )
            rows = cur.fetchall()

            # now process the results
            for row in rows:
                record = dict()
                record['id'] = row[0]
                record['name'] = row[1]
                record['issue_number'] = row[2]
                record['image_url'] = row[3]
                record['image_hash'] = row[4]

                result['issues'].append(record)

        return result
    def add_issue_image_url( self, issue_id, image_url ):

        con = lite.connect( self.db_file )

        with con:
            cur = con.cursor()
            timestamp = datetime.datetime.now()

            data = {
                "image_url": image_url,
                "timestamp": timestamp
            }
            self.upsert( cur, "issues" , "id", issue_id, data)
    def get_issue_image_url( self, issue_id ):

        con = lite.connect( self.db_file )
        with con:
            cur = con.cursor()

            cur.execute("SELECT image_url FROM Issues WHERE id=?", [ issue_id ])
            row = cur.fetchone()

            # guard against the issue not being cached at all, in which
            # case fetchone() returns None rather than a row
            if row is None or row[0] is None :
                return None
            else:
                return row[0]
    def upsert( self, cur, tablename, pkname, pkval, data):
        """
        This does an insert if the given PK doesn't exist, and an update if it does
        """

        # TODO - look into checking if UPDATE is needed
        # TODO - should the cursor be created here, and not up the stack?

        ins_count = len(data) + 1

        keys = ""
        vals = list()
        ins_slots = ""
        set_slots = ""

        for key in data:

            if keys != "":
                keys += ", "
            if ins_slots != "":
                ins_slots += ", "
            if set_slots != "":
                set_slots += ", "

            keys += key
            vals.append( data[key] )
            ins_slots += "?"
            set_slots += key + " = ?"

        keys += ", " + pkname
        vals.append( pkval )
        ins_slots += ", ?"
        condition = pkname + " = ?"

        sql_ins = ( "INSERT OR IGNORE INTO " + tablename +
                    " ( " + keys + " ) " +
                    " VALUES ( " + ins_slots + " )" )
        cur.execute( sql_ins , vals )

        sql_upd = ( "UPDATE " + tablename +
                    " SET " + set_slots + " WHERE " + condition )
        cur.execute( sql_upd , vals )
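The INSERT OR IGNORE / UPDATE pair works because id is the primary key of both tables: the insert is silently skipped when the row already exists, and the unconditional update then refreshes every column either way. For a hypothetical call like upsert(cur, "issues", "id", 1234, {"image_url": u, "timestamp": t}), the generated statements would look like this (column order follows dict iteration, so it can vary):

    INSERT OR IGNORE INTO issues ( image_url, timestamp, id )  VALUES ( ?, ?, ? )
    UPDATE issues SET image_url = ?, timestamp = ? WHERE id = ?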
comicvinetalker.py

@@ -26,7 +26,8 @@ import math
import re

import utils

from settings import ComicTaggerSettings
from comicvinecacher import ComicVineCacher
from genericmetadata import GenericMetadata

class ComicVineTalker:
@@ -48,6 +49,16 @@ class ComicVineTalker:

    def searchForSeries( self, series_name ):

        # before we search online, look in our cache, since we might have
        # done this same search recently
        cvc = ComicVineCacher( ComicTaggerSettings.getSettingsFolder() )
        cached_search_results = cvc.get_search_results( series_name )

        if len( cached_search_results ) > 0:
            return cached_search_results

        original_series_name = series_name

        series_name = urllib.quote_plus(str(series_name))
        search_url = "http://api.comicvine.com/search/?api_key=" + self.api_key + "&format=json&resources=volume&query=" + series_name + "&field_list=name,id,start_year,publisher,image,description,count_of_issues&sort=start_year"
@@ -95,9 +106,21 @@ class ComicVineTalker:

        #print "{0}: {1} ({2})".format(search_results['results'][0]['id'], smart_str(search_results['results'][0]['name']) , search_results['results'][0]['start_year'] )

        # cache these search results
        cvc.add_search_results( original_series_name, search_results )

        return search_results

    def fetchVolumeData( self, series_id ):

        # before we search online, look in our cache, since we might already
        # have this info
        cvc = ComicVineCacher( ComicTaggerSettings.getSettingsFolder() )
        cached_volume_result = cvc.get_volume_info( series_id )

        if cached_volume_result is not None:
            return cached_volume_result

        volume_url = "http://api.comicvine.com/volume/" + str(series_id) + "/?api_key=" + self.api_key + "&format=json"
        #print "search_url = : ", volume_url
@@ -113,6 +136,8 @@ class ComicVineTalker:

        volume_results = cv_response['results']

        cvc.add_volume_info( volume_results )

        return volume_results
@@ -208,6 +233,14 @@ class ComicVineTalker:

    def fetchIssueCoverURL( self, issue_id ):

        # before we search online, look in our cache, since we might already
        # have this info
        cvc = ComicVineCacher( ComicTaggerSettings.getSettingsFolder() )
        cached_image_url = cvc.get_issue_image_url( issue_id )

        if cached_image_url is not None:
            return cached_image_url

        issue_url = "http://api.comicvine.com/issue/" + str(issue_id) + "/?api_key=" + self.api_key + "&format=json&field_list=image"
        resp = urllib2.urlopen(issue_url)
        content = resp.read()
@@ -216,6 +249,7 @@ class ComicVineTalker:
            print ( "Comic Vine query failed with error: [{0}]. ".format( cv_response[ 'error' ] ))
            return None

        cvc.add_issue_image_url( issue_id, cv_response['results']['image']['super_url'] )
        return cv_response['results']['image']['super_url']
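All three fetch paths now follow the same read-through pattern: consult the cache, fall back to the network on a miss, then write the fresh result back. Distilled, with a hypothetical fetch_remote standing in for the urllib2 call:

    def fetch_cover_url_pattern( cvc, issue_id ):
        cached = cvc.get_issue_image_url( issue_id )   # 1. try the cache
        if cached is not None:
            return cached
        url = fetch_remote( issue_id )                 # 2. miss: hit the API (hypothetical helper)
        cvc.add_issue_image_url( issue_id, url )       # 3. write back for next time
        return url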
tagger.py (80 lines changed)
@@ -38,6 +38,7 @@ from options import Options, MetaDataStyle
from comicarchive import ComicArchive

from comicvinetalker import ComicVineTalker
from comicvinecacher import ComicVineCacher
from comicinfoxml import ComicInfoXml
from comicbookinfo import ComicBookInfo
from imagehasher import ImageHasher
@@ -105,6 +106,16 @@ def cliProcedure( opts, settings ):
    print ( "Searching for " + search_series + "...")

    cv_search_results = comicVine.searchForSeries( search_series )

    #---------- TEST
    #cvc = ComicVineCacher( settings.folder )
    #cvc.add_search_results( search_series, cv_search_results )
    #cached_search_results = cvc.get_search_results( search_series)
    #for r in cached_search_results:
    #    print "{0}: {1} ({2})".format( r['id'], r['name'], r['start_year'])
    #quit()
    #---------- TEST

    print "Found " + str(len(cv_search_results)) + " initial results"
@@ -129,7 +140,8 @@ def cliProcedure( opts, settings ):
    # Now we've got a list of series that we can dig into,
    # and look for matching issue number, date, and cover image

    match_list = []

    for series in series_shortlist:
        #print series['id'], series['name'], series['start_year'], series['count_of_issues']
        print "Fetching info for ID: {0} {1} ({2}) ...".format(
@@ -156,24 +168,60 @@ def cliProcedure( opts, settings ):
            #url_image_hash = ImageHasher( data=url_image_data ).average_hash()
            url_image_hash = ImageHasher( data=url_image_data ).average_hash2()
            #url_image_hash = ImageHasher( data=url_image_data, width=32, height=32 ).perceptual_hash()
            print u"-----> ID: {0} #{1} ({2}) Hash: {3} Distance: {4}\n-------> url:{5}".format(
                issue['id'], num_s, issue['name'],
                url_image_hash,
                ImageHasher.hamming_distance(cover_hash, url_image_hash),
                img_url)

            match = dict()
            match['series'] = "{0} ({1})".format(series['name'], series['start_year'])
            match['distance'] = ImageHasher.hamming_distance(cover_hash, url_image_hash)
            match['issue_number'] = num_s
            match['issue_title'] = issue['name']
            match['img_url'] = img_url
            match_list.append(match)

            break

    print "Compared covers for {0} issues".format(len(match_list))

    # sort list by image match scores
    match_list.sort(key=lambda k: k['distance'])

    # helper
    def print_match(item):
        print u"-----> {0} #{1} {2} -- score: {3}\n-------> url:{4}".format(
            item['series'],
            item['issue_number'],
            item['issue_title'],
            item['distance'],
            item['img_url'])

    # check for an empty list before indexing into it
    if len(match_list) == 0:
        print "No matches found :("
        return

    best_score = match_list[0]['distance']

    if len(match_list) == 1:
        print_match(match_list[0])
        return

    elif best_score > 20 and len(match_list) > 1:
        print "No good image matches! Need to use other info..."
        return

    # now pare down the list, removing any item more than 2 distant from the top score
    for item in reversed(match_list):
        if item['distance'] > best_score + 2:
            match_list.remove(item)

    if len(match_list) == 1:
        print_match(match_list[0])
        return

    else:
        print "More than one likely candidate. Maybe a lexical comparison??"
        for item in match_list:
            print_match(item)
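The distance being sorted on is a Hamming distance between perceptual image hashes: the number of bit positions in which the two hashes differ, so 0 means visually identical covers and anything above the threshold of 20 used here is treated as no match. A minimal sketch of the idea, not the project's actual ImageHasher implementation:

    # count differing bits between two integer image hashes
    def hamming_distance(h1, h2):
        return bin(h1 ^ h2).count("1")

    print hamming_distance(0b10110100, 0b10010110)   # prints 2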
"""
|
||||
#error checking here: did we get any results?
|
||||
|
||||
# we will eventualy want user interaction to choose the appropriate result, but for now, assume the first one
|
||||
series_id = cv_search_results[0]['id']
|
||||
|
||||
print( "-->Auto-selecting volume ID:", cv_search_results[0]['id'] )
|
||||
print(" ")
|
||||
|
||||
# now get the particular issue data
|
||||
metadata = comicVine.fetchIssueData( series_id, opts.issue_number )
|
||||
|
||||
@ -182,9 +230,6 @@ def cliProcedure( opts, settings ):
|
||||
ca = ComicArchive(opts.filename)
|
||||
ca.writeMetadata( metadata, opts.data_style )
|
||||
|
||||
#debugging
|
||||
ComicBookInfo().writeToExternalFile( "test.json" )
|
||||
ComicBookInfo().writeToExternalFile( "test.xml" )
|
||||
"""
|
||||
#-----------------------------
|
||||
|
||||
@@ -192,7 +237,6 @@ def main():
    opts = Options()
    opts.parseCmdLineArgs()
    settings = ComicTaggerSettings()

    # make sure unrar program is in the path for the UnRAR class
    utils.addtopath(os.path.dirname(settings.unrar_exe_path))