From 1486cce990895cd2f505ca1f1661525abae390c4 Mon Sep 17 00:00:00 2001
From: "beville@gmail.com"
Date: Fri, 9 Nov 2012 04:02:14 +0000
Subject: [PATCH] Added a caching class to reduce hits on the CV database

git-svn-id: http://comictagger.googlecode.com/svn/trunk@18 6c5673fe-1810-88d6-992b-cd32ca31540c
---
 comicvinecacher.py | 303 +++++++++++++++++++++++++++++++++++++++++++++
 comicvinetalker.py |  36 +++++-
 tagger.py          |  80 +++++++++---
 3 files changed, 400 insertions(+), 19 deletions(-)
 create mode 100644 comicvinecacher.py

diff --git a/comicvinecacher.py b/comicvinecacher.py
new file mode 100644
index 0000000..650b27b
--- /dev/null
+++ b/comicvinecacher.py
@@ -0,0 +1,303 @@
+"""
+A python class to manage caching of data from Comic Vine
+"""
+
+"""
+Copyright 2012 Anthony Beville
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from pprint import pprint
+
+import sqlite3 as lite
+import sys
+import os
+import datetime
+
+class ComicVineCacher:
+
+    def __init__(self, settings_folder ):
+        self.settings_folder = settings_folder
+        self.db_file = os.path.join( self.settings_folder, "cv_cache.db")
+
+        if not os.path.exists( self.db_file ):
+            self.create_cache_db()
+
+    def create_cache_db( self ):
+
+        # this will wipe out any existing version
+        open( self.db_file, 'w').close()
+
+        con = lite.connect( self.db_file )
+
+        # create tables
+        with con:
+
+            cur = con.cursor()
+            #name,id,start_year,publisher,image,description,count_of_issues
+            cur.execute("CREATE TABLE VolumeSearchCache(" +
+                            "search_term TEXT," +
+                            "id INT," +
+                            "name TEXT," +
+                            "start_year INT," +
+                            "publisher TEXT," +
+                            "count_of_issues INT," +
+                            "image_url TEXT," +
+                            "description TEXT," +
+                            "timestamp TEXT)"
+                        )
+
+            cur.execute("CREATE TABLE Volumes(" +
+                            "id INT," +
+                            "name TEXT," +
+                            "publisher TEXT," +
+                            "count_of_issues INT," +
+                            "timestamp TEXT," +
+                            "PRIMARY KEY (id) )"
+                        )
+
+            cur.execute("CREATE TABLE Issues(" +
+                            "id INT," +
+                            "volume_id INT," +
+                            "name TEXT," +
+                            "issue_number TEXT," +
+                            "image_url TEXT," +
+                            "image_hash TEXT," +
+                            "thumb_image_url TEXT," +
+                            "thumb_image_hash TEXT," +
+                            "timestamp TEXT," +
+                            "PRIMARY KEY (id) )"
+                        )
+
+    def add_search_results( self, search_term, cv_search_results ):
+
+        con = lite.connect( self.db_file )
+
+        with con:
+
+            cur = con.cursor()
+
+            # remove all previous entries with this search term; use a bound
+            # parameter so quotes in the term can't break the SQL
+            cur.execute("DELETE FROM VolumeSearchCache WHERE search_term = ?", [ search_term.lower() ])
+
+            # now add in new results
+            for record in cv_search_results:
+                timestamp = datetime.datetime.now()
+
+                if record['publisher'] is None:
+                    pub_name = ""
+                else:
+                    pub_name = record['publisher']['name']
+
+                if record['image'] is None:
+                    url = ""
+                else:
+                    url = record['image']['super_url']
+
+                cur.execute("INSERT INTO VolumeSearchCache VALUES( ?, ?, ?, ?, ?, ?, ?, ?, ? )" ,
+                    ( search_term.lower(),
+                      record['id'],
+                      record['name'],
+                      record['start_year'],
+                      pub_name,
+                      record['count_of_issues'],
+                      url,
+                      record['description'],
+                      timestamp )
+                    )
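+
+    # Search results are keyed by the lowercased search term, and the rows for
+    # a term are deleted before new ones are inserted, so get_search_results()
+    # below returns either a complete, current result set or an empty list.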
)" , + ( search_term.lower(), + record['id'], + record['name'], + record['start_year'], + pub_name, + record['count_of_issues'], + url, + record['description'], + timestamp ) + ) + + def get_search_results( self, search_term ): + + results = list() + con = lite.connect( self.db_file ) + with con: + cur = con.cursor() + + # TODO purge stale search results ( older than a day, maybe??) + + # fetch + cur.execute("SELECT * FROM VolumeSearchCache WHERE search_term=?", [ search_term.lower() ] ) + rows = cur.fetchall() + # now process the results + for record in rows: + + result = dict() + result['id'] = record[1] + result['name'] = record[2] + result['start_year'] = record[3] + result['publisher'] = dict() + result['publisher']['name'] = record[4] + result['count_of_issues'] = record[5] + result['image'] = dict() + result['image']['super_url'] = record[6] + result['description'] = record[7] + + results.append(result) + + return results + + + def add_volume_info( self, cv_volume_record ): + + con = lite.connect( self.db_file ) + + with con: + + cur = con.cursor() + + timestamp = datetime.datetime.now() + + data = { + "name": cv_volume_record['name'], + "publisher": cv_volume_record['publisher']['name'], + "count_of_issues": cv_volume_record['count_of_issues'], + "timestamp": timestamp + } + self.upsert( cur, "volumes", "id", cv_volume_record['id'], data) + + # now add in issues + + for issue in cv_volume_record['issues']: + + data = { + "volume_id": cv_volume_record['id'], + "name": issue['name'], + "issue_number": issue['issue_number'], + "timestamp": timestamp + } + self.upsert( cur, "issues" , "id", issue['id'], data) + + + def get_volume_info( self, volume_id ): + + result = None + + con = lite.connect( self.db_file ) + with con: + cur = con.cursor() + + # TODO purge stale volume records ( older than a week, maybe??) + + # fetch + cur.execute("SELECT id,name,publisher,count_of_issues FROM Volumes WHERE id = ?", [ volume_id ] ) + + row = cur.fetchone() + + if row is None : + return result + + result = dict() + + #since ID is primary key, there is only one row + result['id'] = row[0] + result['name'] = row[1] + result['publisher'] = dict() + result['publisher']['name'] = row[2] + result['count_of_issues'] = row[3] + result['issues'] = list() + + cur.execute("SELECT id,name,issue_number,image_url,image_hash FROM Issues WHERE volume_id = ?", [ volume_id ] ) + rows = cur.fetchall() + + # now process the results + for row in rows: + record = dict() + record['id'] = row[0] + record['name'] = row[1] + record['issue_number'] = row[2] + record['image_url'] = row[3] + record['image_hash'] = row[4] + + result['issues'].append(record) + + return result + + + def add_issue_image_url( self, issue_id, image_url ): + + con = lite.connect( self.db_file ) + + with con: + cur = con.cursor() + timestamp = datetime.datetime.now() + + data = { + "image_url": image_url, + "timestamp": timestamp + } + self.upsert( cur, "issues" , "id", issue_id, data) + + + + def get_issue_image_url( self, issue_id ): + + con = lite.connect( self.db_file ) + with con: + cur = con.cursor() + + cur.execute("SELECT image_url FROM Issues WHERE id=?", [ issue_id ]) + row = cur.fetchone() + + if row[0] is None : + return None + else: + return row[0] + + + def upsert( self, cur, tablename, pkname, pkval, data): + """ + This does an insert if the given PK doesn't exist, and an update it if does + """ + + # TODO - look into checking if UPDATE is needed + # TODO - should the cursor be created here, and not up the stack? 
+
+    def upsert( self, cur, tablename, pkname, pkval, data):
+        """
+        This does an insert if the given PK doesn't exist, and an update if it does
+        """
+
+        # TODO - look into checking if UPDATE is needed
+        # TODO - should the cursor be created here, and not up the stack?
+
+        keys = ""
+        vals = list()
+        ins_slots = ""
+        set_slots = ""
+
+        for key in data:
+
+            if keys != "":
+                keys += ", "
+            if ins_slots != "":
+                ins_slots += ", "
+            if set_slots != "":
+                set_slots += ", "
+
+            keys += key
+            vals.append( data[key] )
+            ins_slots += "?"
+            set_slots += key + " = ?"
+
+        keys += ", " + pkname
+        vals.append( pkval )
+        ins_slots += ", ?"
+        condition = pkname + " = ?"
+
+        sql_ins = ( "INSERT OR IGNORE INTO " + tablename +
+            " ( " + keys + " ) " +
+            " VALUES ( " + ins_slots + " )" )
+        cur.execute( sql_ins , vals )
+
+        sql_upd = ( "UPDATE " + tablename +
+            " SET " + set_slots + " WHERE " + condition )
+        cur.execute( sql_upd , vals )
diff --git a/comicvinetalker.py b/comicvinetalker.py
index 507e592..c714d4e 100644
--- a/comicvinetalker.py
+++ b/comicvinetalker.py
@@ -26,7 +26,8 @@ import math
 import re
 import utils
 
-
+from settings import ComicTaggerSettings
+from comicvinecacher import ComicVineCacher
 from genericmetadata import GenericMetadata
 
 class ComicVineTalker:
@@ -48,6 +49,16 @@ class ComicVineTalker:
 
     def searchForSeries( self, series_name ):
+
+        # before we search online, look in our cache, since we might have
+        # done this same search recently
+        cvc = ComicVineCacher( ComicTaggerSettings.getSettingsFolder() )
+        cached_search_results = cvc.get_search_results( series_name )
+
+        if len(cached_search_results) > 0:
+            return cached_search_results
+
+        original_series_name = series_name
         series_name = urllib.quote_plus(str(series_name))
         search_url = "http://api.comicvine.com/search/?api_key=" + self.api_key + "&format=json&resources=volume&query=" + series_name + "&field_list=name,id,start_year,publisher,image,description,count_of_issues&sort=start_year"
@@ -95,9 +106,21 @@ class ComicVineTalker:
 
         #print "{0}: {1} ({2})".format(search_results['results'][0]['id'], smart_str(search_results['results'][0]['name']) , search_results['results'][0]['start_year'] )
 
+        # cache these search results
+        cvc.add_search_results( original_series_name, search_results )
+
         return search_results
 
     def fetchVolumeData( self, series_id ):
+
+        # before we search online, look in our cache, since we might already
+        # have this info
+        cvc = ComicVineCacher( ComicTaggerSettings.getSettingsFolder() )
+        cached_volume_result = cvc.get_volume_info( series_id )
+
+        if cached_volume_result is not None:
+            return cached_volume_result
 
         volume_url = "http://api.comicvine.com/volume/" + str(series_id) + "/?api_key=" + self.api_key + "&format=json"
 
         #print "search_url = : ", volume_url
@@ -113,6 +136,8 @@ class ComicVineTalker:
 
         volume_results = cv_response['results']
 
+        cvc.add_volume_info( volume_results )
+
         return volume_results
 
@@ -208,6 +233,14 @@ class ComicVineTalker:
 
     def fetchIssueCoverURL( self, issue_id ):
+
+        # before we search online, look in our cache, since we might already
+        # have this info
+        cvc = ComicVineCacher( ComicTaggerSettings.getSettingsFolder() )
+        cached_image_url = cvc.get_issue_image_url( issue_id )
+
+        if cached_image_url is not None:
+            return cached_image_url
 
         issue_url = "http://api.comicvine.com/issue/" + str(issue_id) + "/?api_key=" + self.api_key + "&format=json&field_list=image"
         resp = urllib2.urlopen(issue_url)
         content = resp.read()
@@ -216,6 +249,7 @@ class ComicVineTalker:
             print ( "Comic Vine query failed with error:  [{0}]. ".format( cv_response[ 'error' ] ))
             return None
 
+        cvc.add_issue_image_url( issue_id, cv_response['results']['image']['super_url'] )
         return cv_response['results']['image']['super_url']
".format( cv_response[ 'error' ] )) return None + cvc.add_issue_image_url( issue_id, cv_response['results']['image']['super_url'] ) return cv_response['results']['image']['super_url'] diff --git a/tagger.py b/tagger.py index 628095f..ea7452e 100755 --- a/tagger.py +++ b/tagger.py @@ -38,6 +38,7 @@ from options import Options, MetaDataStyle from comicarchive import ComicArchive from comicvinetalker import ComicVineTalker +from comicvinecacher import ComicVineCacher from comicinfoxml import ComicInfoXml from comicbookinfo import ComicBookInfo from imagehasher import ImageHasher @@ -105,6 +106,16 @@ def cliProcedure( opts, settings ): print ( "Searching for " + search_series + "...") cv_search_results = comicVine.searchForSeries( search_series ) + + #---------- TEST + #cvc = ComicVineCacher( settings.folder ) + #cvc.add_search_results( search_series, cv_search_results ) + #cached_search_results = cvc.get_search_results( search_series) + #for r in cached_search_results: + # print "{0}: {1} ({2})".format( r['id'], r['name'], r['start_year']) + #quit() + #---------- TEST + print "Found " + str(len(cv_search_results)) + " initial results" @@ -129,7 +140,8 @@ def cliProcedure( opts, settings ): # Now we've got a list of series that we can dig into, # and look for matching issue number, date, and cover image - + match_list = [] + for series in series_shortlist: #print series['id'], series['name'], series['start_year'], series['count_of_issues'] print "Fetching info for ID: {0} {1} ({2}) ...".format( @@ -156,24 +168,60 @@ def cliProcedure( opts, settings ): #url_image_hash = ImageHasher( data=url_image_data ).average_hash() url_image_hash = ImageHasher( data=url_image_data, ).average_hash2() #url_image_hash = ImageHasher( data=url_image_data, width=32, height=32 ).perceptual_hash() - print u"-----> ID: {0} #{1} ({2}) Hash: {3} Distance: {4}\n-------> url:{5}".format( - issue['id'], num_s, issue['name'], - url_image_hash, - ImageHasher.hamming_distance(cover_hash, url_image_hash), - img_url) + match = dict() + match['series'] = "{0} ({1})".format(series['name'], series['start_year']) + match['distance'] = ImageHasher.hamming_distance(cover_hash, url_image_hash) + match['issue_number'] = num_s + match['issue_title'] = issue['name'] + match['img_url'] = img_url + match_list.append(match) break + print "Compared covers for {0} issues".format(len(match_list)) + + # sort list by image match scores + match_list.sort(key=lambda k: k['distance']) + + #helper + def print_match(item): + print u"-----> {0} #{1} {2} -- score: {3}\n-------> url:{4}".format( + item['series'], + item['issue_number'], + item['issue_title'], + item['distance'], + item['img_url']) + + best_score = match_list[0]['distance'] + + if len(match_list) == 0: + print "No matches found :(" + return + + if len(match_list) == 1: + print_match(match_list[0]) + return + + elif best_score > 20 and len(match_list) > 1: + print "No good image matches! Need to use other info..." + return + + #now pare down list, remove any item more than 2 distant from the top scores + for item in reversed(match_list): + if item['distance'] > best_score + 2: + match_list.remove(item) + + if len(match_list) == 1: + print_match(match_list[0]) + return + + else: + print "More than one likley candiate. Maybe a lexical comparison??" + for item in match_list: + print_match(item) + """ - #error checking here: did we get any results? 
- - # we will eventualy want user interaction to choose the appropriate result, but for now, assume the first one - series_id = cv_search_results[0]['id'] - - print( "-->Auto-selecting volume ID:", cv_search_results[0]['id'] ) - print(" ") - # now get the particular issue data metadata = comicVine.fetchIssueData( series_id, opts.issue_number ) @@ -182,9 +230,6 @@ def cliProcedure( opts, settings ): ca = ComicArchive(opts.filename) ca.writeMetadata( metadata, opts.data_style ) - #debugging - ComicBookInfo().writeToExternalFile( "test.json" ) - ComicBookInfo().writeToExternalFile( "test.xml" ) """ #----------------------------- @@ -192,7 +237,6 @@ def main(): opts = Options() opts.parseCmdLineArgs() settings = ComicTaggerSettings() - # make sure unrar program is in the path for the UnRAR class utils.addtopath(os.path.dirname(settings.unrar_exe_path))
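
A minimal usage sketch of the new cacher, for reference (illustrative only:
the settings folder and the sample record below are invented, and the tagger
normally reaches the cache through ComicVineTalker rather than directly):

    import os
    from comicvinecacher import ComicVineCacher

    folder = "/tmp/ct-settings"    # hypothetical settings folder
    if not os.path.exists( folder ):
        os.makedirs( folder )

    cvc = ComicVineCacher( folder )

    # one record shaped like a Comic Vine volume search result
    record = { 'id': 99999,
               'name': "Example Series",
               'start_year': 1986,
               'publisher': { 'name': "Example Comics" },
               'count_of_issues': 12,
               'image': None,             # add_search_results() tolerates a missing image
               'description': "" }

    cvc.add_search_results( "example series", [ record ] )

    # the same search (case-insensitive) is now served from cv_cache.db,
    # with no hit on the CV database
    for r in cvc.get_search_results( "Example Series" ):
        print "{0}: {1} ({2})".format( r['id'], r['name'], r['start_year'] )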