From c28d24481f2d242eeaae3416d3bc70b593953245 Mon Sep 17 00:00:00 2001 From: "beville@gmail.com" Date: Wed, 7 Nov 2012 17:29:45 +0000 Subject: [PATCH] First cut at image comparison using hashes git-svn-id: http://comictagger.googlecode.com/svn/trunk@14 6c5673fe-1810-88d6-992b-cd32ca31540c --- comicarchive.py | 4 +- imagehasher.py | 72 +++++++++++++++++++++++++ tagger.py | 125 +++++++++++++++++++++++++++++++++++++++--- todo.txt | 141 +++++++++++++++++++++++++----------------------- 4 files changed, 265 insertions(+), 77 deletions(-) create mode 100755 imagehasher.py diff --git a/comicarchive.py b/comicarchive.py index 732b1bf..32dd8a4 100644 --- a/comicarchive.py +++ b/comicarchive.py @@ -396,10 +396,8 @@ class ComicArchive: return True def seemsToBeAComicArchive( self ): - # TODO this will need to be fleshed out to support RAR and Folder - ext = os.path.splitext(self.path)[1].lower() - + ext = os.path.splitext(self.path)[1].lower() if ( ( ( ( self.isZip() ) and ( ext.lower() in [ '.zip', '.cbz' ] )) diff --git a/imagehasher.py b/imagehasher.py new file mode 100755 index 0000000..cfa771e --- /dev/null +++ b/imagehasher.py @@ -0,0 +1,72 @@ + +import Image +#import numpy +#import math +#import operator +import StringIO + +#from bitarray import bitarray + +class ImageHasher(object): + def __init__(self, path=None, data=None, size=8): + self.hash_size = size + + if path is None and data is None: + raise IOError + elif path is not None: + self.image = Image.open(path) + else: + self.image = Image.open(StringIO.StringIO(data)) + + def average_hash(self): + image = self.image.resize((self.hash_size, self.hash_size), Image.ANTIALIAS).convert("L") + pixels = list(image.getdata()) + avg = sum(pixels) / len(pixels) + + diff = [] + for pixel in pixels: + value = 1 if pixel > avg else 0 + diff.append(str(value)) + + #ba = bitarray("".join(diff), endian='little') + #h = ba.tobytes().encode('hex') + + # This isn't super pretty, but we avoid the bitarray inclusion. 
+ # (Build up a hex string from the binary list of bits) + hash = "" + binary_string = "".join(diff) + for i in range(0,self.hash_size**2,8): + # 8 bits at time, reverse, for little-endian + s = binary_string[i:i+8][::-1] + hash = hash + "{0:02x}".format( int(s,2)) + + return hash + + @staticmethod + def count_bits(number): + bit = 1 + count = 0 + while number >= bit: + if number & bit: + count += 1 + bit <<= 1 + return count + + #accepts 2 hash strings, and returns the hamming distance + + @staticmethod + def hamming_distance(h1, h2): + + # conver hex strings to ints + n1 = long( h1, 16) + n2 = long( h2, 16) + # xor the two numbers + n = n1 ^ n2 + + # now count the ones + return ImageHasher.count_bits( n ) + + + + + diff --git a/tagger.py b/tagger.py index 8a371e5..efb3d8b 100755 --- a/tagger.py +++ b/tagger.py @@ -28,7 +28,8 @@ from pprint import pprint from PyQt4 import QtCore, QtGui import signal import os - +import math +import urllib2, urllib from settings import ComicTaggerSettings @@ -39,17 +40,125 @@ from comicarchive import ComicArchive from comicvinetalker import ComicVineTalker from comicinfoxml import ComicInfoXml from comicbookinfo import ComicBookInfo +from imagehasher import ImageHasher import utils #----------------------------- -def cliProcedure( opts ): +def cliProcedure( opts, settings ): - pass + ca = ComicArchive(opts.filename) + if not ca.seemsToBeAComicArchive(): + print "Sorry, but "+ opts.filename + " is not a comic archive!" 
+ return + + cover_image_data = ca.getCoverPage() + cover_hash = ImageHasher( data=cover_image_data ).average_hash() + print "Cover hash = ",cover_hash + + # see if the archive has any useful meta data for searching with + if ca.hasCIX(): + internal_metadata = ca.readCIX() + elif ca.hasCBI(): + internal_metadata = ca.readCBI() + else: + internal_metadata = ca.readCBI() + + # try to get some metadata from filename + md_from_filename = ca.metadataFromFilename() + + # now figure out what we have to search with + search_series = internal_metadata.series + search_issue_number = internal_metadata.issueNumber + search_year = internal_metadata.publicationYear + search_month = internal_metadata.publicationMonth + + if search_series is None: + search_series = md_from_filename.series + + if search_issue_number is None: + search_issue_number = md_from_filename.issueNumber + + if search_year is None: + search_year = md_from_filename.publicationYear + + # we need, at minimum, a series and issue number + if search_series is None or search_issue_number is None: + print "Not enough info for a search!" 
+ return + + print ( "Going to search for:" ) + print ( "Series: ", search_series ) + print ( "Issue : ", search_issue_number ) + if search_year is not None: + print ( "Year : ", search_year ) + if search_month is not None: + print ( "Month : ", search_month ) + + + comicVine = ComicVineTalker( settings.cv_api_key ) + + print ( "Searching for " + search_series + "...") + + cv_search_results = comicVine.searchForSeries( search_series ) + + print "Found " + str(len(cv_search_results)) + " initial results" + + series_shortlist = [] + + print "Removing results with too long names" + for item in cv_search_results: + #assume that our search name is close to the actual name, say within 8 characters + if len( item['name']) < len( search_series ) + 8: + series_shortlist.append(item) + + # if we don't think it's an issue number 1, remove any series' that are one-shots + if search_issue_number != '1': + print "Removing one-shots" + series_shortlist[:] = [x for x in series_shortlist if not x['count_of_issues'] == 1] + + print "Finally, searching in " + str(len(series_shortlist)) +" series" + + # now sort the list by name length + series_shortlist.sort(key=lambda x: len(x['name']), reverse=False) + + # Now we've got a list of series that we can dig into, + # and look for matching issue number, date, and cover image + + + for series in series_shortlist: + #print series['id'], series['name'], series['start_year'], series['count_of_issues'] + print "Fetching info for ID: {0} {1} ({2}) ...".format( + series['id'], + series['name'], + series['start_year']) + + cv_series_results = comicVine.fetchVolumeData( series['id'] ) + issue_list = cv_series_results['issues'] + for issue in issue_list: + + # format the issue number string nicely, since it's usually something like "2.00" + num_f = float(issue['issue_number']) + num_s = str( int(math.floor(num_f)) ) + if math.floor(num_f) != num_f: + num_s = str( num_f ) + + # look for a matching issue number + if num_s == search_issue_number: + # 
found a matching issue number! now get the issue data + img_url = comicVine.fetchIssueCoverURL( issue['id'] ) + #TODO get the URL, and calc hash!! + url_image_data = urllib.urlopen(img_url).read() + url_image_hash = ImageHasher( data=url_image_data ).average_hash() + print "-----> ID: {0} #{1} ({2}) Hash: {3} Distance: {4}\n-------> url:{5}".format( + issue['id'], num_s, issue['name'], + url_image_hash, + ImageHasher.hamming_distance(cover_hash, url_image_hash), + img_url) + + + break + """ - comicVine = ComicVineTalker() - - cv_search_results = comicVine.searchForSeries( opts.series_name ) - #error checking here: did we get any results? # we will eventualy want user interaction to choose the appropriate result, but for now, assume the first one @@ -84,7 +193,7 @@ def main(): if opts.no_gui: - cliProcedure( opts ) + cliProcedure( opts, settings ) else: diff --git a/todo.txt b/todo.txt index 957fc17..c6a8b9d 100644 --- a/todo.txt +++ b/todo.txt @@ -1,66 +1,75 @@ - -Add License/Copyright headers - -Toolbar icons - -Consolidate Credit Roles for english variants? : Penciler vs Penciller - -Stand-alone CLI - -TaggerWindow entry fields - General layout - Special Dialogs needed for: - Pages Info - Color changing stuff need more work - - Indicate credits for CR style - - CR has editable dropdowns/comboboxes for Format, Publisher, Imprint ------------ - -Form type validation Ints vs strings for month, year. 
etc - -Check all HTTP responses for errors - -Lots of error checking - -Archive function to detect tag blocks out of sync - -Hourglass popup, or whatever, for when busy - -Idea: Support only CBI or CIX for any given file, and not both - If user selects different one, warn about potential loss/re-arranging of data - -Longer term: - Think about mass tagging and (semi) automatic volume selection - -Maybe: keep a history of tagged volumes IDs from CV, and present those first - -Other settings possibilities: - Last tag style - Last "Open" folder (include dragged) - Keep a history of queries somewhere?? - -Content Hashes!! - - -App option to covert RAR to ZIP - -If no unrar in path, then filter out CBR/RAR from open dialog - -"Select Issues" dialog request cover URLs in background -"Select Issues" dialog cache cover images - ----------------------------------------------- -COMIC RACK Questions - -Missing from XML as enterable in ComicRack: - Main Character or Team - Review - User Rating - -Some that seem library only: - "Series Complete" - Tags - Proposed Values - Community Rating - + +Toolbar icons + +Page Browser + +Stand-alone CLI + +TaggerWindow entry fields + General layout + Special Dialogs needed for: + Pages Info + Color changing stuff need more work + - Indicate credits for CR style + + CR has editable dropdowns/comboboxes for Format, Publisher, Imprint +----------- + +Form type validation Ints vs strings for month, year. 
etc + +Check all HTTP responses for errors + +Lots of error checking + +Archive function to detect tag blocks out of sync + +Hourglass popup, or whatever, for when busy + +Idea: Support only CBI or CIX for any given file, and not both + If user selects different one, warn about potential loss/re-arranging of data + +Longer term: + Think about mass tagging and (semi) automatic volume selection + +Maybe: keep a history of tagged volume IDs from CV, and present those first + +Other settings possibilities: + Last tag style + Last "Open" folder (include dragged) + Keep a history of queries somewhere?? + +Content Hashes!! + + +App option to convert RAR to ZIP + +If no unrar in path, then filter out CBR/RAR from open dialog + +"Select Issues" dialog request cover URLs in background +"Select Issues" dialog cache cover images + +Wizard for converting between tag styles + +Auto search: + 1st search local SQL tables that are built on the fly in the next step + search certain table: series: CV info + then search table issues: Id, title, number + URL, image hash, series ID + cache hash and URL as needed + if not found search CV directly caching results in tables + + + +---------------------------------------------- +COMIC RACK Questions + +Missing from XML as enterable in ComicRack: + Main Character or Team + Review + User Rating + +Some that seem library only: + "Series Complete" + Tags + Proposed Values + Community Rating +