First cut at image comparison using hashes
git-svn-id: http://comictagger.googlecode.com/svn/trunk@14 6c5673fe-1810-88d6-992b-cd32ca31540c
This commit is contained in:
parent
a971bc27e5
commit
c28d24481f
@ -396,10 +396,8 @@ class ComicArchive:
|
||||
return True
|
||||
|
||||
def seemsToBeAComicArchive( self ):
|
||||
# TODO this will need to be fleshed out to support RAR and Folder
|
||||
|
||||
ext = os.path.splitext(self.path)[1].lower()
|
||||
|
||||
ext = os.path.splitext(self.path)[1].lower()
|
||||
if (
|
||||
( ( ( self.isZip() ) and
|
||||
( ext.lower() in [ '.zip', '.cbz' ] ))
|
||||
|
72
imagehasher.py
Executable file
72
imagehasher.py
Executable file
@ -0,0 +1,72 @@
|
||||
|
||||
import Image
|
||||
#import numpy
|
||||
#import math
|
||||
#import operator
|
||||
import StringIO
|
||||
|
||||
#from bitarray import bitarray
|
||||
|
||||
class ImageHasher(object):
    """Compute and compare "average hash" fingerprints of an image.

    The image is opened once at construction time, either from a file
    path or from encoded image bytes already held in memory.
    """

    def __init__(self, path=None, data=None, size=8):
        """Open the image that will be hashed.

        path -- file to open; used whenever it is not None.
        data -- encoded image bytes; used only when ``path`` is None.
        size -- edge length of the square thumbnail that gets hashed, so
                the resulting hash carries ``size * size`` bits.

        Raises IOError when neither ``path`` nor ``data`` is given.
        """
        self.hash_size = size

        if path is None and data is None:
            raise IOError("ImageHasher needs either a path or image data")
        elif path is not None:
            self.image = Image.open(path)
        else:
            self.image = Image.open(StringIO.StringIO(data))

    def average_hash(self):
        """Return the average hash of the image as a lowercase hex string.

        The image is shrunk to ``hash_size`` x ``hash_size`` and converted
        to greyscale; every pixel then contributes one bit, set when the
        pixel is brighter than the mean of all pixels.
        """
        image = self.image.resize(
            (self.hash_size, self.hash_size), Image.ANTIALIAS).convert("L")
        pixels = list(image.getdata())
        avg = sum(pixels) / len(pixels)

        # One character per pixel: "1" when above the mean, otherwise "0".
        binary_string = "".join(
            "1" if pixel > avg else "0" for pixel in pixels)

        # Build the hex string by hand rather than pulling in bitarray.
        hex_bytes = []
        for i in range(0, self.hash_size ** 2, 8):
            # Take 8 bits at a time and reverse them, so the first pixel
            # of each group ends up in the least significant bit
            # (little-endian bit order within each byte).
            s = binary_string[i:i + 8][::-1]
            hex_bytes.append("{0:02x}".format(int(s, 2)))

        return "".join(hex_bytes)

    @staticmethod
    def count_bits(number):
        """Return the number of set bits in ``number``.

        Zero and negative values yield 0.
        """
        if number <= 0:
            return 0
        # bin() gives e.g. "0b1011"; the "0b" prefix contains no "1".
        return bin(number).count("1")

    @staticmethod
    def hamming_distance(h1, h2):
        """Return the Hamming distance between two hex hash strings.

        h1, h2 -- hashes as hexadecimal strings, such as the values
                  returned by average_hash().
        """
        # Convert hex strings to ints. int() is used instead of long() so
        # this also runs on Python 3; on Python 2 it promotes to long
        # automatically when the value is large.
        n1 = int(h1, 16)
        n2 = int(h2, 16)

        # XOR leaves a 1 in every position where the hashes differ;
        # counting those ones gives the distance.
        return ImageHasher.count_bits(n1 ^ n2)
|
||||
|
||||
|
||||
|
||||
|
||||
|
125
tagger.py
125
tagger.py
@ -28,7 +28,8 @@ from pprint import pprint
|
||||
from PyQt4 import QtCore, QtGui
|
||||
import signal
|
||||
import os
|
||||
|
||||
import math
|
||||
import urllib2, urllib
|
||||
|
||||
from settings import ComicTaggerSettings
|
||||
|
||||
@ -39,17 +40,125 @@ from comicarchive import ComicArchive
|
||||
from comicvinetalker import ComicVineTalker
|
||||
from comicinfoxml import ComicInfoXml
|
||||
from comicbookinfo import ComicBookInfo
|
||||
from imagehasher import ImageHasher
|
||||
import utils
|
||||
|
||||
#-----------------------------
|
||||
def cliProcedure( opts ):
|
||||
def cliProcedure( opts, settings ):
|
||||
|
||||
pass
|
||||
ca = ComicArchive(opts.filename)
|
||||
if not ca.seemsToBeAComicArchive():
|
||||
print "Sorry, but "+ opts.filename + " is not a comic archive!"
|
||||
return
|
||||
|
||||
cover_image_data = ca.getCoverPage()
|
||||
cover_hash = ImageHasher( data=cover_image_data ).average_hash()
|
||||
print "Cover hash = ",cover_hash
|
||||
|
||||
# see if the archive has any useful meta data for searching with
|
||||
if ca.hasCIX():
|
||||
internal_metadata = ca.readCIX()
|
||||
elif ca.hasCBI():
|
||||
internal_metadata = ca.readCBI()
|
||||
else:
|
||||
internal_metadata = ca.readCBI()
|
||||
|
||||
# try to get some metadata from filename
|
||||
md_from_filename = ca.metadataFromFilename()
|
||||
|
||||
# now figure out what we have to search with
|
||||
search_series = internal_metadata.series
|
||||
search_issue_number = internal_metadata.issueNumber
|
||||
search_year = internal_metadata.publicationYear
|
||||
search_month = internal_metadata.publicationMonth
|
||||
|
||||
if search_series is None:
|
||||
search_series = md_from_filename.series
|
||||
|
||||
if search_issue_number is None:
|
||||
search_issue_number = md_from_filename.issueNumber
|
||||
|
||||
if search_year is None:
|
||||
search_year = md_from_filename.publicationYear
|
||||
|
||||
# we need, at minimum, a series and issue number
|
||||
if search_series is None or search_issue_number is None:
|
||||
print "Not enough info for a search!"
|
||||
return
|
||||
|
||||
print ( "Going to search for:" )
|
||||
print ( "Series: ", search_series )
|
||||
print ( "Issue : ", search_issue_number )
|
||||
if search_year is not None:
|
||||
print ( "Year : ", search_year )
|
||||
if search_month is not None:
|
||||
print ( "Month : ", search_month )
|
||||
|
||||
|
||||
comicVine = ComicVineTalker( settings.cv_api_key )
|
||||
|
||||
print ( "Searching for " + search_series + "...")
|
||||
|
||||
cv_search_results = comicVine.searchForSeries( search_series )
|
||||
|
||||
print "Found " + str(len(cv_search_results)) + " initial results"
|
||||
|
||||
series_shortlist = []
|
||||
|
||||
print "Removing results with too long names"
|
||||
for item in cv_search_results:
|
||||
#assume that our search name is close to the actual name, say within 8 characters
|
||||
if len( item['name']) < len( search_series ) + 8:
|
||||
series_shortlist.append(item)
|
||||
|
||||
# if we don't think it's an issue number 1, remove any series' that are one-shots
|
||||
if search_issue_number != '1':
|
||||
print "Removing one-shots"
|
||||
series_shortlist[:] = [x for x in series_shortlist if not x['count_of_issues'] == 1]
|
||||
|
||||
print "Finally, searching in " + str(len(series_shortlist)) +" series"
|
||||
|
||||
# now sort the list by name length
|
||||
series_shortlist.sort(key=lambda x: len(x['name']), reverse=False)
|
||||
|
||||
# Now we've got a list of series that we can dig into,
|
||||
# and look for matching issue number, date, and cover image
|
||||
|
||||
|
||||
for series in series_shortlist:
|
||||
#print series['id'], series['name'], series['start_year'], series['count_of_issues']
|
||||
print "Fetching info for ID: {0} {1} ({2}) ...".format(
|
||||
series['id'],
|
||||
series['name'],
|
||||
series['start_year'])
|
||||
|
||||
cv_series_results = comicVine.fetchVolumeData( series['id'] )
|
||||
issue_list = cv_series_results['issues']
|
||||
for issue in issue_list:
|
||||
|
||||
# format the issue number string nicely, since it's usually something like "2.00"
|
||||
num_f = float(issue['issue_number'])
|
||||
num_s = str( int(math.floor(num_f)) )
|
||||
if math.floor(num_f) != num_f:
|
||||
num_s = str( num_f )
|
||||
|
||||
# look for a matching issue number
|
||||
if num_s == search_issue_number:
|
||||
# found a matching issue number! now get the issue data
|
||||
img_url = comicVine.fetchIssueCoverURL( issue['id'] )
|
||||
#TODO get the URL, and calc hash!!
|
||||
url_image_data = urllib.urlopen(img_url).read()
|
||||
url_image_hash = ImageHasher( data=url_image_data ).average_hash()
|
||||
print "-----> ID: {0} #{1} ({2}) Hash: {3} Distance: {4}\n-------> url:{5}".format(
|
||||
issue['id'], num_s, issue['name'],
|
||||
url_image_hash,
|
||||
ImageHasher.hamming_distance(cover_hash, url_image_hash),
|
||||
img_url)
|
||||
|
||||
|
||||
break
|
||||
|
||||
"""
|
||||
comicVine = ComicVineTalker()
|
||||
|
||||
cv_search_results = comicVine.searchForSeries( opts.series_name )
|
||||
|
||||
#error checking here: did we get any results?
|
||||
|
||||
# we will eventualy want user interaction to choose the appropriate result, but for now, assume the first one
|
||||
@ -84,7 +193,7 @@ def main():
|
||||
|
||||
if opts.no_gui:
|
||||
|
||||
cliProcedure( opts )
|
||||
cliProcedure( opts, settings )
|
||||
|
||||
else:
|
||||
|
||||
|
141
todo.txt
141
todo.txt
@ -1,66 +1,75 @@
|
||||
|
||||
Add License/Copyright headers
|
||||
|
||||
Toolbar icons
|
||||
|
||||
Consolidate Credit Roles for English spelling variants? (e.g. Penciler vs. Penciller)
|
||||
|
||||
Stand-alone CLI
|
||||
|
||||
TaggerWindow entry fields
|
||||
General layout
|
||||
Special Dialogs needed for:
|
||||
Pages Info
|
||||
Color changing stuff needs more work
|
||||
- Indicate credits for CR style
|
||||
|
||||
CR has editable dropdowns/comboboxes for Format, Publisher, Imprint
|
||||
-----------
|
||||
|
||||
Form type validation: ints vs. strings for month, year, etc.
|
||||
|
||||
Check all HTTP responses for errors
|
||||
|
||||
Lots of error checking
|
||||
|
||||
Archive function to detect tag blocks out of sync
|
||||
|
||||
Hourglass popup, or whatever, for when busy
|
||||
|
||||
Idea: Support only CBI or CIX for any given file, and not both
|
||||
If user selects different one, warn about potential loss/re-arranging of data
|
||||
|
||||
Longer term:
|
||||
Think about mass tagging and (semi) automatic volume selection
|
||||
|
||||
Maybe: keep a history of tagged volumes IDs from CV, and present those first
|
||||
|
||||
Other settings possibilities:
|
||||
Last tag style
|
||||
Last "Open" folder (include dragged)
|
||||
Keep a history of queries somewhere??
|
||||
|
||||
Content Hashes!!
|
||||
|
||||
|
||||
App option to convert RAR to ZIP
|
||||
|
||||
If no unrar in path, then filter out CBR/RAR from open dialog
|
||||
|
||||
"Select Issues" dialog request cover URLs in background
|
||||
"Select Issues" dialog cache cover images
|
||||
|
||||
----------------------------------------------
|
||||
COMIC RACK Questions
|
||||
|
||||
Missing from XML as enterable in ComicRack:
|
||||
Main Character or Team
|
||||
Review
|
||||
User Rating
|
||||
|
||||
Some that seem library only:
|
||||
"Series Complete"
|
||||
Tags
|
||||
Proposed Values
|
||||
Community Rating
|
||||
|
||||
|
||||
Toolbar icons
|
||||
|
||||
Page Browser
|
||||
|
||||
Stand-alone CLI
|
||||
|
||||
TaggerWindow entry fields
|
||||
General layout
|
||||
Special Dialogs needed for:
|
||||
Pages Info
|
||||
Color changing stuff needs more work
|
||||
- Indicate credits for CR style
|
||||
|
||||
CR has editable dropdowns/comboboxes for Format, Publisher, Imprint
|
||||
-----------
|
||||
|
||||
Form type validation: ints vs. strings for month, year, etc.
|
||||
|
||||
Check all HTTP responses for errors
|
||||
|
||||
Lots of error checking
|
||||
|
||||
Archive function to detect tag blocks out of sync
|
||||
|
||||
Hourglass popup, or whatever, for when busy
|
||||
|
||||
Idea: Support only CBI or CIX for any given file, and not both
|
||||
If user selects different one, warn about potential loss/re-arranging of data
|
||||
|
||||
Longer term:
|
||||
Think about mass tagging and (semi) automatic volume selection
|
||||
|
||||
Maybe: keep a history of tagged volumes IDs from CV, and present those first
|
||||
|
||||
Other settings possibilities:
|
||||
Last tag style
|
||||
Last "Open" folder (include dragged)
|
||||
Keep a history of queries somewhere??
|
||||
|
||||
Content Hashes!!
|
||||
|
||||
|
||||
App option to convert RAR to ZIP
|
||||
|
||||
If no unrar in path, then filter out CBR/RAR from open dialog
|
||||
|
||||
"Select Issues" dialog request cover URLs in background
|
||||
"Select Issues" dialog cache cover images
|
||||
|
||||
Wizard for converting between tag styles
|
||||
|
||||
Auto search:
|
||||
1st search local SQL tables that are built on the fly in the next step
|
||||
search certain table: series: CV info
|
||||
then search table issues: Id, title, number + URL, image hash, series ID
|
||||
cache hash and URL as needed
|
||||
if not found search CV directly caching results in tables
|
||||
|
||||
|
||||
|
||||
----------------------------------------------
|
||||
COMIC RACK Questions
|
||||
|
||||
Missing from XML as enterable in ComicRack:
|
||||
Main Character or Team
|
||||
Review
|
||||
User Rating
|
||||
|
||||
Some that seem library only:
|
||||
"Series Complete"
|
||||
Tags
|
||||
Proposed Values
|
||||
Community Rating
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user