From a04d8055f369b3e08263041cc9407df9e41695f0 Mon Sep 17 00:00:00 2001 From: "beville@gmail.com" Date: Fri, 9 Nov 2012 21:04:33 +0000 Subject: [PATCH] More work on automated processing git-svn-id: http://comictagger.googlecode.com/svn/trunk@22 6c5673fe-1810-88d6-992b-cd32ca31540c --- tagger.py | 93 ++++++++++++++++++++++++++++++------------------------- todo.txt | 25 +++++++++------ utils.py | 16 ++++++++++ 3 files changed, 83 insertions(+), 51 deletions(-) diff --git a/tagger.py b/tagger.py index ea7452e..ac1f1a9 100755 --- a/tagger.py +++ b/tagger.py @@ -53,13 +53,15 @@ def cliProcedure( opts, settings ): return cover_image_data = ca.getCoverPage() - #cover_hash = ImageHasher( data=cover_image_data ).average_hash() - #print "Cover hash = ",cover_hash - cover_hash = ImageHasher( data=cover_image_data ).average_hash2() - #print "Cover hash = ",cover_hash + if opts.image_hasher == '3': + cover_hash = ImageHasher( data=cover_image_data ).dct_average_hash() + elif opts.image_hasher == '2': + cover_hash = ImageHasher( data=cover_image_data ).average_hash2() + else: + cover_hash = ImageHasher( data=cover_image_data ).average_hash() - #cover_hash = ImageHasher( data=cover_image_data , width=32, height=32 ).perceptual_hash() + #print "Cover hash = {0:016x}".format(cover_hash) # see if the archive has any useful meta data for searching with if ca.hasCIX(): @@ -91,48 +93,41 @@ def cliProcedure( opts, settings ): if search_series is None or search_issue_number is None: print "Not enough info for a search!" 
return - - print ( "Going to search for:" ) - print ( "Series: ", search_series ) - print ( "Issue : ", search_issue_number ) + + """ + print "Going to search for:" + print "Series: ", search_series + print "Issue : ", search_issue_number if search_year is not None: - print ( "Year : ", search_year ) + print "Year : ", search_year if search_month is not None: - print ( "Month : ", search_month ) - - + print "Month : ", search_month + """ comicVine = ComicVineTalker( settings.cv_api_key ) - print ( "Searching for " + search_series + "...") + #print ( "Searching for " + search_series + "...") + print "Searching for {0} #{1} ...".format( search_series, search_issue_number) + search_series = utils.removearticles( search_series ) + cv_search_results = comicVine.searchForSeries( search_series ) - #---------- TEST - #cvc = ComicVineCacher( settings.folder ) - #cvc.add_search_results( search_series, cv_search_results ) - #cached_search_results = cvc.get_search_results( search_series) - #for r in cached_search_results: - # print "{0}: {1} ({2})".format( r['id'], r['name'], r['start_year']) - #quit() - #---------- TEST - - - print "Found " + str(len(cv_search_results)) + " initial results" + #print "Found " + str(len(cv_search_results)) + " initial results" series_shortlist = [] - print "Removing results with too long names" + #print "Removing results with too long names" for item in cv_search_results: - #assume that our search name is close to the actual name, say within 8 characters - if len( item['name']) < len( search_series ) + 8: + #assume that our search name is close to the actual name, say within 5 characters + if len( utils.removearticles(item['name'])) < len( search_series ) + 5: series_shortlist.append(item) # if we don't think it's an issue number 1, remove any series' that are one-shots if search_issue_number != '1': - print "Removing one-shots" + #print "Removing one-shots" series_shortlist[:] = [x for x in series_shortlist if not x['count_of_issues'] == 1] - 
print "Finally, searching in " + str(len(series_shortlist)) +" series" + print "Searching in " + str(len(series_shortlist)) +" series" # now sort the list by name length series_shortlist.sort(key=lambda x: len(x['name']), reverse=False) @@ -142,12 +137,15 @@ def cliProcedure( opts, settings ): match_list = [] + print "Fetching issue data", + for series in series_shortlist: - #print series['id'], series['name'], series['start_year'], series['count_of_issues'] - print "Fetching info for ID: {0} {1} ({2}) ...".format( - series['id'], - series['name'], - series['start_year']) + #print "Fetching info for ID: {0} {1} ({2}) ...".format( + # series['id'], + # series['name'], + # series['start_year']) + print ".", + sys.stdout.flush() cv_series_results = comicVine.fetchVolumeData( series['id'] ) issue_list = cv_series_results['issues'] @@ -165,26 +163,36 @@ def cliProcedure( opts, settings ): img_url = comicVine.fetchIssueCoverURL( issue['id'] ) #TODO get the URL, and calc hash!! url_image_data = urllib.urlopen(img_url).read() - #url_image_hash = ImageHasher( data=url_image_data ).average_hash() - url_image_hash = ImageHasher( data=url_image_data, ).average_hash2() - #url_image_hash = ImageHasher( data=url_image_data, width=32, height=32 ).perceptual_hash() + + if opts.image_hasher == '3': + url_image_hash = ImageHasher( data=url_image_data ).dct_average_hash() + elif opts.image_hasher == '2': + url_image_hash = ImageHasher( data=url_image_data ).average_hash2() + else: + url_image_hash = ImageHasher( data=url_image_data ).average_hash() match = dict() match['series'] = "{0} ({1})".format(series['name'], series['start_year']) match['distance'] = ImageHasher.hamming_distance(cover_hash, url_image_hash) match['issue_number'] = num_s + match['url_image_hash'] = url_image_hash match['issue_title'] = issue['name'] match['img_url'] = img_url match_list.append(match) break - - print "Compared covers for {0} issues".format(len(match_list)) + print "done!" 
# sort list by image match scores match_list.sort(key=lambda k: k['distance']) + + print "Compared {0} covers".format(len(match_list)), + + l = [] + for i in match_list: + l.append( i['distance'] ) + print l - #helper def print_match(item): print u"-----> {0} #{1} {2} -- score: {3}\n-------> url:{4}".format( item['series'], @@ -203,7 +211,7 @@ def cliProcedure( opts, settings ): print_match(match_list[0]) return - elif best_score > 20 and len(match_list) > 1: + elif best_score > 25 and len(match_list) > 1: print "No good image matches! Need to use other info..." return @@ -236,6 +244,7 @@ def cliProcedure( opts, settings ): def main(): opts = Options() opts.parseCmdLineArgs() + settings = ComicTaggerSettings() # make sure unrar program is in the path for the UnRAR class utils.addtopath(os.path.dirname(settings.unrar_exe_path)) diff --git a/todo.txt b/todo.txt index 0418315..5d5adc0 100644 --- a/todo.txt +++ b/todo.txt @@ -13,6 +13,8 @@ TaggerWindow entry fields - Indicate credits for CR style CR has editable dropdowns/comboboxes for Format, Publisher, Imprint + +GUI to handle mutliple files or folders ----------- Form type validation Ints vs strings for month, year. etc @@ -44,7 +46,8 @@ Image Hashes: Failures of average hash: Thor 600 Wrap-around w/ different aspect ratio Bone 3 - Variant Cover, - Old Avengers -- Best match, but high difference + Avengers #1, #13, #81 + Filename parsing: Concatenation of Name and Issue?? 
# NOTE: this file is a whitespace-collapsed git patch.  The line that carried
# removearticles() also held the todo.txt hunk and the utils.py diff header
# and context.  That text is kept verbatim below (re-wrapped, otherwise
# unchanged) so that no patch content is lost:
#
#   @@ -60,15 +63,19 @@ If no unrar in path, then filter out CBR/RAR from open
#   dialog Wizard for converting between tag styles + +Remove stale data from
#   cache DB +SQLite chokes on "Batman\ Li'l Gotham 001.cbr" name + + Auto
#   search: - 1st search local SQL tables that are built on the fly in the
#   next step - search certain table: series: CV info - then search table
#   issues: Id, title, number + URL, image hash, series ID - cache hash and
#   URL as needed - if not found search CV directly caching results in tables
#   - - - + Logging class + Choosing with pub year + Lexical analysis + Maybe
#   remove "the" and leading "A" before searching and matching for closer +
#   Searching w/o issue #? + + ----------------------------------------------
#   COMIC RACK Questions diff --git a/utils.py b/utils.py index
#   360cb6f..7343f6f 100644 --- a/utils.py +++ b/utils.py @@ -52,7 +52,23 @@
#   def addtopath( dir ): # TODO only add if not there already if dir is not
#   None and dir != "": os.environ['PATH'] = dir + os.pathsep +
#   os.environ['PATH'] +


def removearticles( text ):
    """Return text with filler words and some punctuation stripped.

    The tagger.py part of this patch calls this on series names before
    searching and before comparing name lengths.

    The text is split on single spaces and every word equal to 'and',
    'the', 'a' or '&' is dropped.  The comparison ignores case, so
    "The Avengers" and "the Avengers" both become "Avengers".  After that
    every ':', '.' and ',' character is deleted.

    text -- the string to clean; must be a string (None is not accepted).

    Returns the cleaned string, which is empty when nothing is left.
    """
    articles = ('and', 'the', 'a', '&')

    # split(' ') rather than split(): runs of spaces in the input survive
    # exactly as they did before, only the article words disappear.
    kept = [word for word in text.split(' ') if word.lower() not in articles]
    newText = ' '.join(kept)

    # now get rid of some other junk
    for junk in (':', '.', ','):
        newText = newText.replace(junk, '')

    return newText


# Trailing patch context that followed the function on the same line, verbatim:
#   # -o- coding: utf-8 -o- # ISO639 python dict