Took a whack at perceptual image hashing. Ugh.

git-svn-id: http://comictagger.googlecode.com/svn/trunk@17 6c5673fe-1810-88d6-992b-cd32ca31540c
beville@gmail.com 2012-11-08 06:25:29 +00:00
parent 9b67d70dde
commit cc9c963c5b
3 changed files with 143 additions and 16 deletions

View File

@@ -1,15 +1,18 @@
import Image
#import numpy
#import math
#import operator
import StringIO
import numpy
import scipy.signal
import scipy.fftpack   # needed for the DCT in perceptual_hash()
#from bitarray import bitarray

class ImageHasher(object):
    def __init__(self, path=None, data=None, size=8):
        self.hash_size = size
    def __init__(self, path=None, data=None, width=8, height=8):
        #self.hash_size = size
        self.width = width
        self.height = height

        if path is None and data is None:
            raise IOError
@@ -19,7 +22,8 @@ class ImageHasher(object):
            self.image = Image.open(StringIO.StringIO(data))

    def average_hash(self):
        image = self.image.resize((self.hash_size, self.hash_size), Image.ANTIALIAS).convert("L")
        #image = self.image.resize((self.hash_size, self.hash_size), Image.ANTIALIAS).convert("L")
        image = self.image.resize((self.width, self.height), Image.ANTIALIAS).convert("L")
        pixels = list(image.getdata())
        avg = sum(pixels) / len(pixels)
@@ -35,13 +39,115 @@ class ImageHasher(object):
        # (Build up a hex string from the binary list of bits)
        hash = ""
        binary_string = "".join(diff)
        for i in range(0,self.hash_size**2,8):
        for i in range(0, self.width*self.height, 8):
            # 8 bits at a time, reversed, for little-endian
            s = binary_string[i:i+8][::-1]
            hash = hash + "{0:02x}".format( int(s,2))
        return hash

    def average_hash2( self ):
        im = self.image.resize((self.width, self.height), Image.ANTIALIAS).convert('L')
        in_data = numpy.array(im.getdata()).reshape(self.width, self.height)
        filt = numpy.array([[0,1,0],[1,-4,1],[0,1,0]])   # 3x3 Laplacian kernel
        filt_data = scipy.signal.convolve2d(in_data,filt,mode='same',boundary='symm').flatten()
        # pack one bit per filtered pixel (1 where the value is non-negative) into a long
        result = reduce(lambda x, (y, z): x | (z << y),
                        enumerate(map(lambda i: 0 if i < 0 else 1, filt_data)),
                        long(0))   # seed with long(0) so hamming_distance sees a long
        return result
    def perceptual_hash(self):
        """
        Algorithm source: http://syntaxcandy.blogspot.com/2012/08/perceptual-hash.html

        1. Reduce size. Like Average Hash, pHash starts with a small image.
        However, the image is larger than 8x8; 32x32 is a good size. This
        is really done to simplify the DCT computation and not because it
        is needed to reduce the high frequencies.

        2. Reduce color. The image is reduced to a grayscale just to further
        simplify the number of computations.

        3. Compute the DCT. The DCT separates the image into a collection of
        frequencies and scalars. While JPEG uses an 8x8 DCT, this algorithm
        uses a 32x32 DCT.

        4. Reduce the DCT. This is the magic step. While the DCT is 32x32,
        just keep the top-left 8x8. Those represent the lowest frequencies in
        the picture.

        5. Compute the average value. Like the Average Hash, compute the mean DCT
        value (using only the 8x8 DCT low-frequency values and excluding the first
        term, since the DC coefficient can be significantly different from the other
        values and will throw off the average). Thanks to David Starkweather for the
        added information about pHash. He wrote: "the dct hash is based on the low 2D
        DCT coefficients starting at the second from lowest, leaving out the first DC
        term. This excludes completely flat image information (i.e. solid colors) from
        being included in the hash description."

        6. Further reduce the DCT. This is the magic step. Set the 64 hash bits to 0 or
        1 depending on whether each of the 64 DCT values is above or below the average
        value. The result doesn't tell us the actual low frequencies; it just tells us
        the very rough relative scale of the frequencies to the mean. The result will not
        vary as long as the overall structure of the image remains the same; this can
        survive gamma and color histogram adjustments without a problem.

        7. Construct the hash. Set the 64 bits into a 64-bit integer. The order does not
        matter, just as long as you are consistent.
        """
        # Steps 1, 2
        im = self.image.resize((32, 32), Image.ANTIALIAS).convert("L")
        in_data = numpy.array(im.getdata(), dtype=numpy.dtype('float')).reshape(32, 32)   # 32x32 to match the resize above
        #print len(im.getdata())
        #print in_data

        # Step 3
        dct = scipy.fftpack.dct( in_data )   # note: default axis gives a 1D DCT along each row, not a full 2D DCT

        # Step 4
        # NO! -- lofreq_dct = dct[:8,:8].flatten()
        # NO? -- lofreq_dct = dct[24:32, 24:32].flatten()
        lofreq_dct = dct[:8, 24:32].flatten()
        #print dct[:8, 24:32]
        # NO! -- lofreq_dct = dct[24:32, :8 ].flatten()

        #omit = 0
        #omit = 7
        #omit = 56
        #omit = 63

        # Step 5
        #avg = ( lofreq_dct.sum() - lofreq_dct[omit] ) / ( lofreq_dct.size - 1 )
        avg = ( lofreq_dct.sum() ) / ( lofreq_dct.size )
        #print lofreq_dct.sum()
        #print lofreq_dct[0]
        #print avg, lofreq_dct.size

        # Step 6
        def compare_value_to_avg(i):
            if i > avg:
                return 1
            else:
                return 0
        bitlist = map(compare_value_to_avg, lofreq_dct)

        # Step 7
        def accumulate( accumulator, (idx, val) ):
            return (accumulator | (val << idx))
        result = reduce(accumulate, enumerate(bitlist), long(0))

        print "{0:016x}".format(result)
        return result

    @staticmethod
    def count_bits(number):
        bit = 1
@@ -52,14 +158,18 @@ class ImageHasher(object):
            bit <<= 1
        return count

    #accepts 2 hash strings, and returns the hamming distance
    #accepts 2 hashes (long or hex strings) and returns the hamming distance
    @staticmethod
    def hamming_distance(h1, h2):
        # convert hex strings to ints
        n1 = long( h1, 16)
        n2 = long( h2, 16)
        if type(h1) == long:
            n1 = h1
            n2 = h2
        else:
            # convert hex strings to ints
            n1 = long( h1, 16)
            n2 = long( h2, 16)

        # xor the two numbers
        n = n1 ^ n2
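
For reference, a minimal self-contained sketch of the docstring's seven steps, kept apart from the experimental slicing still in perceptual_hash() above. It assumes PIL, numpy, and scipy.fftpack, and builds the 2D DCT from two 1D passes (over columns, then rows); the function name phash_sketch is illustrative, not part of the module.

import Image
import numpy
import scipy.fftpack

def phash_sketch(image):
    # Steps 1-2: shrink to 32x32 and reduce to grayscale
    small = image.resize((32, 32), Image.ANTIALIAS).convert("L")
    pixels = numpy.array(small.getdata(), dtype=numpy.float64).reshape(32, 32)
    # Step 3: separable 2D DCT-II -- a 1D DCT over columns, then over rows
    dct = scipy.fftpack.dct(scipy.fftpack.dct(pixels, axis=0), axis=1)
    # Step 4: keep the top-left 8x8 block, the lowest frequencies
    lofreq = dct[:8, :8].flatten()
    # Step 5: mean of the 63 AC terms, excluding the DC term at index 0
    avg = (lofreq.sum() - lofreq[0]) / (lofreq.size - 1)
    # Steps 6-7: one bit per coefficient, packed into a 64-bit long
    result = long(0)
    for idx, val in enumerate(lofreq):
        if val > avg:
            result |= long(1) << idx
    return result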

View File

@@ -52,9 +52,14 @@ def cliProcedure( opts, settings ):
        return

    cover_image_data = ca.getCoverPage()
    cover_hash = ImageHasher( data=cover_image_data ).average_hash()
    print "Cover hash = ",cover_hash
    #cover_hash = ImageHasher( data=cover_image_data ).average_hash()
    #print "Cover hash = ",cover_hash
    cover_hash = ImageHasher( data=cover_image_data ).average_hash2()
    #print "Cover hash = ",cover_hash
    #cover_hash = ImageHasher( data=cover_image_data , width=32, height=32 ).perceptual_hash()

    # see if the archive has any useful meta data for searching with
    if ca.hasCIX():
        internal_metadata = ca.readCIX()
@@ -148,8 +153,10 @@ def cliProcedure( opts, settings ):
        img_url = comicVine.fetchIssueCoverURL( issue['id'] )

        #TODO get the URL, and calc hash!!
        url_image_data = urllib.urlopen(img_url).read()
        url_image_hash = ImageHasher( data=url_image_data ).average_hash()
        print "-----> ID: {0} #{1} ({2}) Hash: {3} Distance: {4}\n-------> url:{5}".format(
        #url_image_hash = ImageHasher( data=url_image_data ).average_hash()
        url_image_hash = ImageHasher( data=url_image_data ).average_hash2()
        #url_image_hash = ImageHasher( data=url_image_data, width=32, height=32 ).perceptual_hash()
        print u"-----> ID: {0} #{1} ({2}) Hash: {3} Distance: {4}\n-------> url:{5}".format(
            issue['id'], num_s, issue['name'],
            url_image_hash,
            ImageHasher.hamming_distance(cover_hash, url_image_hash),
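
A hypothetical end-to-end use of the pieces above: hash the local cover once, hash each fetched candidate, and keep the issue with the smallest hamming distance. The names issue_list, best_id, and best_dist are illustrative; urllib, comicVine, and ImageHasher are used as in the surrounding code.

cover_hash = ImageHasher( data=cover_image_data ).average_hash2()
best_id, best_dist = None, 65   # 64-bit hashes differ by at most 64 bits
for issue in issue_list:
    img_url = comicVine.fetchIssueCoverURL( issue['id'] )
    url_image_data = urllib.urlopen(img_url).read()
    url_image_hash = ImageHasher( data=url_image_data ).average_hash2()
    d = ImageHasher.hamming_distance(cover_hash, url_image_hash)
    if d < best_dist:
        best_id, best_dist = issue['id'], d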

View File

@@ -40,6 +40,16 @@ Other settings possibilities:
Content Hashes!!

Image Hashes:
    Failures of average hash:
        Thor 600 Wrap-around w/ different aspect ratio
        Bone 3 - Variant Cover
        Old Avengers -- Best match, but high difference

Filename parsing:
    Concatenation of Name and Issue??
    "1602"

App option to convert RAR to ZIP