First cut at image comparison using hashes

git-svn-id: http://comictagger.googlecode.com/svn/trunk@14 6c5673fe-1810-88d6-992b-cd32ca31540c
This commit is contained in:
beville@gmail.com 2012-11-07 17:29:45 +00:00
parent a971bc27e5
commit c28d24481f
4 changed files with 265 additions and 77 deletions

View File

@ -396,10 +396,8 @@ class ComicArchive:
return True
def seemsToBeAComicArchive( self ):
# TODO this will need to be fleshed out to support RAR and Folder
ext = os.path.splitext(self.path)[1].lower()
ext = os.path.splitext(self.path)[1].lower()
if (
( ( ( self.isZip() ) and
( ext.lower() in [ '.zip', '.cbz' ] ))

72
imagehasher.py Executable file
View File

@ -0,0 +1,72 @@
import Image
#import numpy
#import math
#import operator
import StringIO
#from bitarray import bitarray
class ImageHasher(object):
    """Compute an "average hash" fingerprint of an image and compare hashes.

    The image is opened once, at construction time, either from a path or
    from encoded image data held in memory.
    """

    def __init__(self, path=None, data=None, size=8):
        """Open the image that will be hashed.

        path -- handed to Image.open; wins over data when both are given
        data -- encoded image file contents, used only when path is None
        size -- edge length of the square thumbnail that gets hashed

        Raises IOError when neither path nor data is supplied.
        """
        self.hash_size = size

        if path is None and data is None:
            raise IOError("ImageHasher needs either a path or image data")

        if path is not None:
            self.image = Image.open(path)
        else:
            self.image = Image.open(StringIO.StringIO(data))

    def average_hash(self):
        """Return the average hash of the image as a lowercase hex string.

        The image is resized to hash_size x hash_size and converted to
        mode "L"; every pixel then contributes one bit, set when the pixel
        value is above the mean of all pixels.
        """
        side = self.hash_size
        thumb = self.image.resize((side, side), Image.ANTIALIAS).convert("L")
        pixels = list(thumb.getdata())

        # Floor division is spelled out so the threshold stays an integer
        # regardless of how the interpreter treats "/" on ints.
        avg = sum(pixels) // len(pixels)

        bits = "".join("1" if pixel > avg else "0" for pixel in pixels)

        # This isn't super pretty, but we avoid the bitarray inclusion:
        # build the hex string straight from the bit string, 8 bits at a
        # time, each group reversed for little-endian bit order.
        return "".join(
            "{0:02x}".format(int(bits[start:start + 8][::-1], 2))
            for start in range(0, side ** 2, 8)
        )

    @staticmethod
    def count_bits(number):
        """Return the number of set bits in number.

        Zero and negative values yield 0.
        """
        count = 0
        while number > 0:
            # n & (n - 1) clears the lowest set bit, so the loop runs once
            # per set bit instead of once per bit position.
            number &= number - 1
            count += 1
        return count

    @staticmethod
    def hamming_distance(h1, h2):
        """Accept two hex hash strings and return their Hamming distance."""
        # Convert the hex strings to integers.  int() is used instead of
        # long(): it parses arbitrarily large values just the same and also
        # exists on interpreters that no longer have long().
        n1 = int(h1, 16)
        n2 = int(h2, 16)

        # XOR leaves a 1 exactly where the two hashes differ; count those.
        return ImageHasher.count_bits(n1 ^ n2)

125
tagger.py
View File

@ -28,7 +28,8 @@ from pprint import pprint
from PyQt4 import QtCore, QtGui
import signal
import os
import math
import urllib2, urllib
from settings import ComicTaggerSettings
@ -39,17 +40,125 @@ from comicarchive import ComicArchive
from comicvinetalker import ComicVineTalker
from comicinfoxml import ComicInfoXml
from comicbookinfo import ComicBookInfo
from imagehasher import ImageHasher
import utils
#-----------------------------
def _format_issue_number( raw ):
    # Turn an issue number as delivered by the API (usually something like
    # "2.00") into the plain string used for matching: "2" for whole
    # numbers, "2.5" otherwise.  Returns None when raw is not numeric.
    try:
        num_f = float(raw)
        if math.floor(num_f) == num_f:
            return str( int(num_f) )
        return str( num_f )
    except (TypeError, ValueError, OverflowError):
        return None


def _fetch_url( url ):
    # Download url and return the response body, closing the handle even
    # when the read fails.
    response = urllib.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


def cliProcedure( opts, settings ):
    """Command-line identification pass for a single comic archive.

    opts     -- parsed options; opts.filename names the archive to examine
    settings -- application settings; settings.cv_api_key is handed to
                ComicVineTalker

    Hashes the archive's cover, works out series / issue search terms from
    the embedded tags (falling back to the file name), searches Comic Vine
    for candidate series, and for the first issue in each candidate whose
    number matches prints the hash of its cover and the Hamming distance to
    the local cover hash.  Everything is reported on stdout; nothing is
    returned.
    """
    # Every print below is a single parenthesised expression so that it
    # produces the same text whether print is a statement or a function.
    # %-formatting is used for values that may come back from the API as
    # unicode (assumption -- confirm), since str.format would push those
    # through the ASCII codec.

    ca = ComicArchive(opts.filename)
    if not ca.seemsToBeAComicArchive():
        print("Sorry, but " + opts.filename + " is not a comic archive!")
        return

    cover_image_data = ca.getCoverPage()
    cover_hash = ImageHasher( data=cover_image_data ).average_hash()
    print("Cover hash =  %s" % (cover_hash,))

    # see if the archive has any useful meta data for searching with
    if ca.hasCIX():
        internal_metadata = ca.readCIX()
    elif ca.hasCBI():
        internal_metadata = ca.readCBI()
    else:
        # NOTE(review): with no tag block at all this still calls readCBI();
        # presumably that yields an empty metadata object -- confirm.
        internal_metadata = ca.readCBI()

    # try to get some metadata from filename
    md_from_filename = ca.metadataFromFilename()

    # now figure out what we have to search with; embedded tags win and the
    # file name only fills the gaps
    search_series = internal_metadata.series
    search_issue_number = internal_metadata.issueNumber
    search_year = internal_metadata.publicationYear
    search_month = internal_metadata.publicationMonth

    if search_series is None:
        search_series = md_from_filename.series
    if search_issue_number is None:
        search_issue_number = md_from_filename.issueNumber
    if search_year is None:
        search_year = md_from_filename.publicationYear

    # we need, at minimum, a series and issue number
    if search_series is None or search_issue_number is None:
        print("Not enough info for a search!")
        return

    print("Going to search for:")
    print("Series: %s" % (search_series,))
    print("Issue : %s" % (search_issue_number,))
    if search_year is not None:
        print("Year :  %s" % (search_year,))
    if search_month is not None:
        print("Month : %s" % (search_month,))

    comicVine = ComicVineTalker( settings.cv_api_key )

    print("Searching for " + search_series + "...")
    cv_search_results = comicVine.searchForSeries( search_series )
    print("Found " + str(len(cv_search_results)) + " initial results")

    # assume that our search name is close to the actual name, say within
    # 8 characters
    print("Removing results with too long names")
    max_name_length = len( search_series ) + 8
    series_shortlist = [ item for item in cv_search_results
                         if len( item['name'] ) < max_name_length ]

    # if we don't think it's an issue number 1, remove any series that are
    # one-shots
    if search_issue_number != '1':
        print("Removing one-shots")
        series_shortlist = [ item for item in series_shortlist
                             if item['count_of_issues'] != 1 ]

    print("Finally, searching in " + str(len(series_shortlist)) + " series")

    # now sort the list by name length, shortest first
    series_shortlist.sort( key=lambda item: len(item['name']) )

    # Now we've got a list of series that we can dig into,
    # and look for matching issue number, date, and cover image
    for series in series_shortlist:
        print("Fetching info for ID: %s %s (%s) ..." % (
                series['id'],
                series['name'],
                series['start_year']))

        cv_series_results = comicVine.fetchVolumeData( series['id'] )

        for issue in cv_series_results['issues']:
            # issue numbers that are not numeric format to None and so can
            # never match
            num_s = _format_issue_number( issue['issue_number'] )
            if num_s != search_issue_number:
                continue

            # found a matching issue number!  now get its cover and hash it
            img_url = comicVine.fetchIssueCoverURL( issue['id'] )
            url_image_hash = ImageHasher( data=_fetch_url(img_url) ).average_hash()

            print("-----> ID: %s #%s (%s) Hash: %s Distance: %s\n-------> url:%s" % (
                    issue['id'], num_s, issue['name'],
                    url_image_hash,
                    ImageHasher.hamming_distance(cover_hash, url_image_hash),
                    img_url))
            # only the first matching issue of each series is examined
            break
"""
comicVine = ComicVineTalker()
cv_search_results = comicVine.searchForSeries( opts.series_name )
#error checking here: did we get any results?
# we will eventualy want user interaction to choose the appropriate result, but for now, assume the first one
@ -84,7 +193,7 @@ def main():
if opts.no_gui:
cliProcedure( opts )
cliProcedure( opts, settings )
else:

141
todo.txt
View File

@ -1,66 +1,75 @@
Add License/Copyright headers
Toolbar icons
Consolidate Credit Roles for english variants? : Penciler vs Penciller
Stand-alone CLI
TaggerWindow entry fields
General layout
Special Dialogs needed for:
Pages Info
Color changing stuff needs more work
- Indicate credits for CR style
CR has editable dropdowns/comboboxes for Format, Publisher, Imprint
-----------
Form type validation Ints vs strings for month, year. etc
Check all HTTP responses for errors
Lots of error checking
Archive function to detect tag blocks out of sync
Hourglass popup, or whatever, for when busy
Idea: Support only CBI or CIX for any given file, and not both
If user selects different one, warn about potential loss/re-arranging of data
Longer term:
Think about mass tagging and (semi) automatic volume selection
Maybe: keep a history of tagged volumes IDs from CV, and present those first
Other settings possibilities:
Last tag style
Last "Open" folder (include dragged)
Keep a history of queries somewhere??
Content Hashes!!
App option to convert RAR to ZIP
If no unrar in path, then filter out CBR/RAR from open dialog
"Select Issues" dialog request cover URLs in background
"Select Issues" dialog cache cover images
----------------------------------------------
COMIC RACK Questions
Missing from XML as enterable in ComicRack:
Main Character or Team
Review
User Rating
Some that seem library only:
"Series Complete"
Tags
Proposed Values
Community Rating
Toolbar icons
Page Browser
Stand-alone CLI
TaggerWindow entry fields
General layout
Special Dialogs needed for:
Pages Info
Color changing stuff needs more work
- Indicate credits for CR style
CR has editable dropdowns/comboboxes for Format, Publisher, Imprint
-----------
Form type validation Ints vs strings for month, year. etc
Check all HTTP responses for errors
Lots of error checking
Archive function to detect tag blocks out of sync
Hourglass popup, or whatever, for when busy
Idea: Support only CBI or CIX for any given file, and not both
If user selects different one, warn about potential loss/re-arranging of data
Longer term:
Think about mass tagging and (semi) automatic volume selection
Maybe: keep a history of tagged volumes IDs from CV, and present those first
Other settings possibilities:
Last tag style
Last "Open" folder (include dragged)
Keep a history of queries somewhere??
Content Hashes!!
App option to convert RAR to ZIP
If no unrar in path, then filter out CBR/RAR from open dialog
"Select Issues" dialog request cover URLs in background
"Select Issues" dialog cache cover images
Wizard for converting between tag styles
Auto search:
1st search local SQL tables that are built on the fly in the next step
search certain table: series: CV info
then search table issues: Id, title, number + URL, image hash, series ID
cache hash and URL as needed
if not found search CV directly caching results in tables
----------------------------------------------
COMIC RACK Questions
Missing from XML as enterable in ComicRack:
Main Character or Team
Review
User Rating
Some that seem library only:
"Series Complete"
Tags
Proposed Values
Community Rating