From af4b3af14e529ba694b9fa0d1214b87265ec95ac Mon Sep 17 00:00:00 2001 From: lordwelch Date: Sat, 7 Aug 2021 21:54:29 -0700 Subject: [PATCH 1/3] Cleanup metadata handling Mainly corrects for consistency in most situations CoMet is not touched as there is no support in the gui and has an odd requirements on attributes --- comicapi/comicbookinfo.py | 86 +++++++++++-------------- comicapi/comicinfoxml.py | 85 ++++++++++++------------- comicapi/utils.py | 17 +++++ comictaggerlib/comicvinetalker.py | 20 +++--- comictaggerlib/taggerwindow.py | 101 +++++++++++++++--------------- 5 files changed, 152 insertions(+), 157 deletions(-) diff --git a/comicapi/comicbookinfo.py b/comicapi/comicbookinfo.py index cb2d9e2..e55fac9 100644 --- a/comicapi/comicbookinfo.py +++ b/comicapi/comicbookinfo.py @@ -24,39 +24,33 @@ from . import utils class ComicBookInfo: - def metadataFromString(self, string): - + class Default(dict): + def __missing__(self, key): + return None cbi_container = json.loads(str(string, 'utf-8')) metadata = GenericMetadata() - cbi = cbi_container['ComicBookInfo/1.0'] + cbi = Default(cbi_container['ComicBookInfo/1.0']) - # helper func - # If item is not in CBI, return None - def xlate(cbi_entry): - if cbi_entry in cbi: - return cbi[cbi_entry] - else: - return None + metadata.series = utils.xlate(cbi['series']) + metadata.title = utils.xlate(cbi['title']) + metadata.issue = utils.xlate(cbi['issue']) + metadata.publisher = utils.xlate(cbi['publisher']) + metadata.month = utils.xlate(cbi['publicationMonth'], True) + metadata.year = utils.xlate(cbi['publicationYear'], True) + metadata.issueCount = utils.xlate(cbi['numberOfIssues'], True) + metadata.comments = utils.xlate(cbi['comments']) + metadata.genre = utils.xlate(cbi['genre']) + metadata.volume = utils.xlate(cbi['volume'], True) + metadata.volumeCount = utils.xlate(cbi['numberOfVolumes'], True) + metadata.language = utils.xlate(cbi['language']) + metadata.country = utils.xlate(cbi['country']) + 
metadata.criticalRating = utils.xlate(cbi['rating']) - metadata.series = xlate('series') - metadata.title = xlate('title') - metadata.issue = xlate('issue') - metadata.publisher = xlate('publisher') - metadata.month = xlate('publicationMonth') - metadata.year = xlate('publicationYear') - metadata.issueCount = xlate('numberOfIssues') - metadata.comments = xlate('comments') - metadata.credits = xlate('credits') - metadata.genre = xlate('genre') - metadata.volume = xlate('volume') - metadata.volumeCount = xlate('numberOfVolumes') - metadata.language = xlate('language') - metadata.country = xlate('country') - metadata.criticalRating = xlate('rating') - metadata.tags = xlate('tags') + metadata.credits = cbi['credits'] + metadata.tags = cbi['tags'] # make sure credits and tags are at least empty lists and not None if metadata.credits is None: @@ -103,33 +97,23 @@ class ComicBookInfo: # helper func def assign(cbi_entry, md_entry): - if md_entry is not None: + if md_entry is not None or isinstance(md_entry, str) and md_entry != "": cbi[cbi_entry] = md_entry - # helper func - def toInt(s): - i = None - if type(s) in [str, str, int]: - try: - i = int(s) - except ValueError: - pass - return i - - assign('series', metadata.series) - assign('title', metadata.title) - assign('issue', metadata.issue) - assign('publisher', metadata.publisher) - assign('publicationMonth', toInt(metadata.month)) - assign('publicationYear', toInt(metadata.year)) - assign('numberOfIssues', toInt(metadata.issueCount)) - assign('comments', metadata.comments) - assign('genre', metadata.genre) - assign('volume', toInt(metadata.volume)) - assign('numberOfVolumes', toInt(metadata.volumeCount)) - assign('language', utils.getLanguageFromISO(metadata.language)) - assign('country', metadata.country) - assign('rating', metadata.criticalRating) + assign('series', utils.xlate(metadata.series)) + assign('title', utils.xlate(metadata.title)) + assign('issue', utils.xlate(metadata.issue)) + assign('publisher', 
utils.xlate(metadata.publisher)) + assign('publicationMonth', utils.xlate(metadata.month, True)) + assign('publicationYear', utils.xlate(metadata.year, True)) + assign('numberOfIssues', utils.xlate(metadata.issueCount, True)) + assign('comments', utils.xlate(metadata.comments)) + assign('genre', utils.xlate(metadata.genre)) + assign('volume', utils.xlate(metadata.volume, True)) + assign('numberOfVolumes', utils.xlate(metadata.volumeCount, True)) + assign('language', utils.xlate(utils.getLanguageFromISO(metadata.language))) + assign('country', utils.xlate(metadata.country)) + assign('rating', utils.xlate(metadata.criticalRating)) assign('credits', metadata.credits) assign('tags', metadata.tags) diff --git a/comicapi/comicinfoxml.py b/comicapi/comicinfoxml.py index 757fa46..5c38902 100644 --- a/comicapi/comicinfoxml.py +++ b/comicapi/comicinfoxml.py @@ -20,6 +20,7 @@ import xml.etree.ElementTree as ET #import zipfile from .genericmetadata import GenericMetadata +from .issuestring import IssueString from . 
import utils @@ -206,48 +207,44 @@ class ComicInfoXml: raise 1 return None - metadata = GenericMetadata() - md = metadata - - # Helper function - def xlate(tag): - node = root.find(tag) - if node is not None: - return node.text - else: + def get(name): + tag = root.find(name) + if tag is None: return None + return tag.text - md.series = xlate('Series') - md.title = xlate('Title') - md.issue = xlate('Number') - md.issueCount = xlate('Count') - md.volume = xlate('Volume') - md.alternateSeries = xlate('AlternateSeries') - md.alternateNumber = xlate('AlternateNumber') - md.alternateCount = xlate('AlternateCount') - md.comments = xlate('Summary') - md.notes = xlate('Notes') - md.year = xlate('Year') - md.month = xlate('Month') - md.day = xlate('Day') - md.publisher = xlate('Publisher') - md.imprint = xlate('Imprint') - md.genre = xlate('Genre') - md.webLink = xlate('Web') - md.language = xlate('LanguageISO') - md.format = xlate('Format') - md.manga = xlate('Manga') - md.characters = xlate('Characters') - md.teams = xlate('Teams') - md.locations = xlate('Locations') - md.pageCount = xlate('PageCount') - md.scanInfo = xlate('ScanInformation') - md.storyArc = xlate('StoryArc') - md.seriesGroup = xlate('SeriesGroup') - md.maturityRating = xlate('AgeRating') + md = GenericMetadata() - tmp = xlate('BlackAndWhite') - md.blackAndWhite = False + md.series = utils.xlate(get('Series')) + md.title = utils.xlate(get('Title')) + md.issue = IssueString(utils.xlate(get('Number'))).asString() + md.issueCount = utils.xlate(get('Count'), True) + md.volume = utils.xlate(get('Volume'), True) + md.alternateSeries = utils.xlate(get('AlternateSeries')) + md.alternateNumber = IssueString(utils.xlate(get('AlternateNumber'))).asString() + md.alternateCount = utils.xlate(get('AlternateCount'), True) + md.comments = utils.xlate(get('Summary')) + md.notes = utils.xlate(get('Notes')) + md.year = utils.xlate(get('Year'), True) + md.month = utils.xlate(get('Month'), True) + md.day = 
utils.xlate(get('Day'), True) + md.publisher = utils.xlate(get('Publisher')) + md.imprint = utils.xlate(get('Imprint')) + md.genre = utils.xlate(get('Genre')) + md.webLink = utils.xlate(get('Web')) + md.language = utils.xlate(get('LanguageISO')) + md.format = utils.xlate(get('Format')) + md.manga = utils.xlate(get('Manga')) + md.characters = utils.xlate(get('Characters')) + md.teams = utils.xlate(get('Teams')) + md.locations = utils.xlate(get('Locations')) + md.pageCount = utils.xlate(get('PageCount'), True) + md.scanInfo = utils.xlate(get('ScanInformation')) + md.storyArc = utils.xlate(get('StoryArc')) + md.seriesGroup = utils.xlate(get('SeriesGroup')) + md.maturityRating = utils.xlate(get('AgeRating')) + + tmp = utils.xlate(get('BlackAndWhite')) if tmp is not None and tmp.lower() in ["yes", "true", "1"]: md.blackAndWhite = True # Now extract the credit info @@ -261,23 +258,23 @@ class ComicInfoXml: ): if n.text is not None: for name in n.text.split(','): - metadata.addCredit(name.strip(), n.tag) + md.addCredit(name.strip(), n.tag) if n.tag == 'CoverArtist': if n.text is not None: for name in n.text.split(','): - metadata.addCredit(name.strip(), "Cover") + md.addCredit(name.strip(), "Cover") # parse page data now pages_node = root.find("Pages") if pages_node is not None: for page in pages_node: - metadata.pages.append(page.attrib) + md.pages.append(page.attrib) # print page.attrib - metadata.isEmpty = False + md.isEmpty = False - return metadata + return md def writeToExternalFile(self, filename, metadata): diff --git a/comicapi/utils.py b/comicapi/utils.py index 1303689..aeec166 100644 --- a/comicapi/utils.py +++ b/comicapi/utils.py @@ -121,6 +121,23 @@ def which(program): return None +def xlate(data, isInt=False): + class Default(dict): + def __missing__(self, key): + return None + if data is None or data == "": + return None + if isInt: + i = str(data).translate(Default(zip((ord(c) for c in "1234567890"),"1234567890"))) + if i == "0": + return "0" + if i is "": 
+ return None + return int(i) + else: + return str(data) + + def removearticles(text): text = text.lower() articles = ['and', 'a', '&', 'issue', 'the'] diff --git a/comictaggerlib/comicvinetalker.py b/comictaggerlib/comicvinetalker.py index 119f144..5e2a88c 100644 --- a/comictaggerlib/comicvinetalker.py +++ b/comictaggerlib/comicvinetalker.py @@ -124,11 +124,11 @@ class ComicVineTalker(QObject): year = None if date_str is not None: parts = date_str.split('-') - year = parts[0] + year = utils.xlate(parts[0], True) if len(parts) > 1: - month = parts[1] + month = utils.xlate(parts[1], True) if len(parts) > 2: - day = parts[2] + day = utils.xlate(parts[2], True) return day, month, year def testKey(self, key): @@ -497,15 +497,13 @@ class ComicVineTalker(QObject): # Now, map the Comic Vine data to generic metadata metadata = GenericMetadata() - metadata.series = issue_results['volume']['name'] + metadata.series = utils.xlate(issue_results['volume']['name']) + metadata.issue = IssueString(issue_results['issue_number']).asString() + metadata.title = utils.xlate(issue_results['name']) - num_s = IssueString(issue_results['issue_number']).asString() - metadata.issue = num_s - metadata.title = issue_results['name'] - - metadata.publisher = volume_results['publisher']['name'] - metadata.day, metadata.month, metadata.year = self.parseDateStr( - issue_results['cover_date']) + if volume_results['publisher'] is not None: + metadata.publisher = utils.xlate(volume_results['publisher']['name']) + metadata.day, metadata.month, metadata.year = self.parseDateStr(issue_results['cover_date']) #metadata.issueCount = volume_results['count_of_issues'] metadata.comments = self.cleanup_html( diff --git a/comictaggerlib/taggerwindow.py b/comictaggerlib/taggerwindow.py index 726b860..be4ab8f 100644 --- a/comictaggerlib/taggerwindow.py +++ b/comictaggerlib/taggerwindow.py @@ -51,6 +51,7 @@ from .cbltransformer import CBLTransformer from .renamewindow import RenameWindow from .exportwindow import 
ExportWindow, ExportConflictOpts from .issueidentifier import IssueIdentifier +from .issuestring import IssueString from .autotagstartwindow import AutoTagStartWindow from .autotagprogresswindow import AutoTagProgressWindow from .autotagmatchwindow import AutoTagMatchWindow @@ -761,14 +762,12 @@ class TaggerWindow(QtWidgets.QMainWindow): for child in widget.children(): self.clearChildren(child) + # Copy all of the metadata object into to the form. + # Merging of metadata should be done via the overlay function def metadataToForm(self): - # copy the the metadata object into to the form - - # helper func def assignText(field, value): if value is not None: field.setText(str(value)) - md = self.metadata assignText(self.leSeries, md.series) @@ -810,23 +809,33 @@ class TaggerWindow(QtWidgets.QMainWindow): self.cbMaturityRating.setEditText(md.maturityRating) else: self.cbMaturityRating.setCurrentIndex(i) + else: + self.cbMaturityRating.setCurrentIndex(0) if md.language is not None: i = self.cbLanguage.findData(md.language) self.cbLanguage.setCurrentIndex(i) + else: + self.cbLanguage.setCurrentIndex(0) if md.country is not None: i = self.cbCountry.findText(md.country) self.cbCountry.setCurrentIndex(i) + else: + self.cbCountry.setCurrentIndex(0) if md.manga is not None: i = self.cbManga.findData(md.manga) self.cbManga.setCurrentIndex(i) + else: + self.cbManga.setCurrentIndex(0) - if md.blackAndWhite is not None and md.blackAndWhite: + if md.blackAndWhite: self.cbBW.setChecked(True) + else: + self.cbBW.setChecked(False) - assignText(self.teTags, utils.listToString(md.tags)) + self.teTags.setText(utils.listToString(md.tags)) # !!! Should we clear the credits table or just avoid duplicates? 
while self.twCredits.rowCount() > 0: @@ -885,58 +894,47 @@ class TaggerWindow(QtWidgets.QMainWindow): return False def formToMetadata(self): - - # helper func - def xlate(data, type_str): - s = "{0}".format(data).strip() - if s == "": - return None - elif type_str == "str": - return s - else: - return int(s) - # copy the data from the form into the metadata - md = self.metadata - md.series = xlate(self.leSeries.text(), "str") - md.issue = xlate(self.leIssueNum.text(), "str") - md.issueCount = xlate(self.leIssueCount.text(), "int") - md.volume = xlate(self.leVolumeNum.text(), "int") - md.volumeCount = xlate(self.leVolumeCount.text(), "int") - md.title = xlate(self.leTitle.text(), "str") - md.publisher = xlate(self.lePublisher.text(), "str") - md.month = xlate(self.lePubMonth.text(), "int") - md.year = xlate(self.lePubYear.text(), "int") - md.day = xlate(self.lePubDay.text(), "int") - md.genre = xlate(self.leGenre.text(), "str") - md.imprint = xlate(self.leImprint.text(), "str") - md.comments = xlate(self.teComments.toPlainText(), "str") - md.notes = xlate(self.teNotes.toPlainText(), "str") - md.criticalRating = xlate(self.leCriticalRating.text(), "int") - md.maturityRating = xlate(self.cbMaturityRating.currentText(), "str") + md = GenericMetadata() + md.isEmpty = False + md.alternateNumber = IssueString(self.leAltIssueNum.text()).asString() + md.issue = IssueString(self.leIssueNum.text()).asString() + md.issueCount = utils.xlate(self.leIssueCount.text(), True) + md.volume = utils.xlate(self.leVolumeNum.text(), True) + md.volumeCount = utils.xlate(self.leVolumeCount.text(), True) + md.month = utils.xlate(self.lePubMonth.text(), True) + md.year = utils.xlate(self.lePubYear.text(), True) + md.day = utils.xlate(self.lePubDay.text(), True) + md.criticalRating = utils.xlate(self.leCriticalRating.text(), True) + md.alternateCount = utils.xlate(self.leAltIssueCount.text(), True) - md.storyArc = xlate(self.leStoryArc.text(), "str") - md.scanInfo = 
xlate(self.leScanInfo.text(), "str") - md.seriesGroup = xlate(self.leSeriesGroup.text(), "str") - md.alternateSeries = xlate(self.leAltSeries.text(), "str") - md.alternateNumber = xlate(self.leAltIssueNum.text(), "int") - md.alternateCount = xlate(self.leAltIssueCount.text(), "int") - md.webLink = xlate(self.leWebLink.text(), "str") - md.characters = xlate(self.teCharacters.toPlainText(), "str") - md.teams = xlate(self.teTeams.toPlainText(), "str") - md.locations = xlate(self.teLocations.toPlainText(), "str") + md.series = self.leSeries.text() + md.title = self.leTitle.text() + md.publisher = self.lePublisher.text() + md.genre = self.leGenre.text() + md.imprint = self.leImprint.text() + md.comments = self.teComments.toPlainText() + md.notes = self.teNotes.toPlainText() + md.maturityRating = self.cbMaturityRating.currentText() - md.format = xlate(self.cbFormat.currentText(), "str") - md.country = xlate(self.cbCountry.currentText(), "str") + md.storyArc = self.leStoryArc.text() + md.scanInfo = self.leScanInfo.text() + md.seriesGroup = self.leSeriesGroup.text() + md.alternateSeries = self.leAltSeries.text() + md.webLink = self.leWebLink.text() + md.characters = self.teCharacters.toPlainText() + md.teams = self.teTeams.toPlainText() + md.locations = self.teLocations.toPlainText() - langiso = self.cbLanguage.itemData(self.cbLanguage.currentIndex()) - md.language = xlate(langiso, "str") + md.format = self.cbFormat.currentText() + md.country = self.cbCountry.currentText() - manga_code = self.cbManga.itemData(self.cbManga.currentIndex()) - md.manga = xlate(manga_code, "str") + md.language = utils.xlate(self.cbLanguage.itemData(self.cbLanguage.currentIndex())) + + md.manga = utils.xlate(self.cbManga.itemData(self.cbManga.currentIndex())) # Make a list from the coma delimited tags string - tmp = xlate(self.teTags.toPlainText(), "str") + tmp = self.teTags.toPlainText() if tmp is not None: def striplist(l): return([x.strip() for x in l]) @@ -960,6 +958,7 @@ class 
TaggerWindow(QtWidgets.QMainWindow): row += 1 md.pages = self.pageListEditor.getPageList() + self.metadata = md def useFilename(self): if self.comic_archive is not None: From 11bf5a9709e27dc5b0a7f7eece2aceb5422f5930 Mon Sep 17 00:00:00 2001 From: lordwelch Date: Wed, 12 Feb 2020 23:30:04 -0800 Subject: [PATCH 2/3] Move to python requests module Add requests to requirements.txt Requests is much simpler and fixes all ssl errors. Comic Vine now requires a unique useragent string --- comictaggerlib/comicvinetalker.py | 184 +++++++++++++++--------------- comictaggerlib/imagefetcher.py | 11 +- comictaggerlib/issueidentifier.py | 3 - comictaggerlib/versionchecker.py | 37 +++--- requirements.txt | 1 + 5 files changed, 115 insertions(+), 121 deletions(-) diff --git a/comictaggerlib/comicvinetalker.py b/comictaggerlib/comicvinetalker.py index 5e2a88c..0acd808 100644 --- a/comictaggerlib/comicvinetalker.py +++ b/comictaggerlib/comicvinetalker.py @@ -15,8 +15,7 @@ # limitations under the License. import json -import urllib.request, urllib.error, urllib.parse -import urllib.request, urllib.parse, urllib.error +import requests import re import time import datetime @@ -104,9 +103,6 @@ class ComicVineTalker(QObject): self.log_func = None - # always use a tls context for urlopen - self.ssl = ssl.SSLContext(ssl.PROTOCOL_TLS) - def setLogFunc(self, log_func): self.log_func = log_func @@ -134,13 +130,10 @@ class ComicVineTalker(QObject): def testKey(self, key): try: - test_url = self.api_base_url + "/issue/1/?api_key=" + \ - key + "&format=json&field_list=name" - resp = urllib.request.urlopen(test_url, context=self.ssl) - content = resp.read() - - cv_response = json.loads(content.decode('utf-8')) - + test_url = self.api_base_url + "/issue/1/?api_key=" + key + "&format=json&field_list=name" + + cv_response = requests.get(test_url, headers={'user-agent': 'comictagger/' + ctversion.version}).json() + # Bogus request, but if the key is wrong, you get error 100: "Invalid # API Key" return 
cv_response['status_code'] != 100 @@ -152,14 +145,13 @@ class ComicVineTalker(QObject): sleep for a bit and retry. """ - def getCVContent(self, url): + def getCVContent(self, url, params): total_time_waited = 0 limit_wait_time = 1 counter = 0 wait_times = [1, 2, 3, 4] while True: - content = self.getUrlContent(url) - cv_response = json.loads(content.decode('utf-8')) + cv_response = self.getUrlContent(url, params) if self.wait_for_rate_limit and cv_response[ 'status_code'] == ComicVineTalkerException.RateLimit: self.writeLog( @@ -184,25 +176,24 @@ class ComicVineTalker(QObject): break return cv_response - def getUrlContent(self, url): + def getUrlContent(self, url, params): # connect to server: # if there is a 500 error, try a few more times before giving up # any other error, just bail #print("---", url) for tries in range(3): try: - resp = urllib.request.urlopen(url, context=self.ssl) - return resp.read() - except urllib.error.HTTPError as e: - if e.getcode() == 500: + resp = requests.get(url, params=params, headers={'user-agent': 'comictagger/' + ctversion.version}) + if resp.status_code == 200: + return resp.json() + if resp.status_code == 500: self.writeLog("Try #{0}: ".format(tries + 1)) time.sleep(1) - self.writeLog(str(e) + "\n") - - if e.getcode() != 500: + self.writeLog(str(resp.status_code) + "\n") + else: break - except Exception as e: + except requests.exceptions.RequestException as e: self.writeLog(str(e) + "\n") raise ComicVineTalkerException( ComicVineTalkerException.Network, "Network Error!") @@ -226,17 +217,16 @@ class ComicVineTalker(QObject): original_series_name = series_name - # Split and rejoin to remove extra internal spaces - query_word_list = series_name.split() - query_string = " ".join( query_word_list ).strip() - #print ("Query string = ", query_string) + params = { + 'api_key': self.api_key, + 'format': 'json', + 'resources': 'volume', + 'query': series_name, + 'field_list': 
'name,id,start_year,publisher,image,description,count_of_issues', + 'page': 1 + } - query_string = urllib.parse.quote_plus(query_string.encode("utf-8")) - - search_url = self.api_base_url + "/search/?api_key=" + self.api_key + "&format=json&resources=volume&query=" + \ - query_string + \ - "&field_list=name,id,start_year,publisher,image,description,count_of_issues&limit=100" - cv_response = self.getCVContent(search_url + "&page=1") + cv_response = self.getCVContent(self.api_base_url + "/search", params) search_results = list() @@ -249,15 +239,15 @@ class ComicVineTalker(QObject): # 8 Dec 2018 - Comic Vine changed query results again. Terms are now # ORed together, and we get thousands of results. Good news is the # results are sorted by relevance, so we can be smart about halting - # the search. + # the search. # 1. Don't fetch more than some sane amount of pages. - max_results = 500 + max_results = 500 # 2. Halt when not all of our search terms are present in a result # 3. Halt when the results contain more (plus threshold) words than # our search - result_word_count_max = len(query_word_list) + 3 + result_word_count_max = len(series_name.split()) + 3 - total_result_count = min(total_result_count, max_results) + total_result_count = min(total_result_count, max_results) if callback is None: self.writeLog( @@ -278,15 +268,14 @@ class ComicVineTalker(QObject): # See if the last result's name has all the of the search terms. # if not, break out of this, loop, we're done. - #print("Searching for {} in '{}'".format(query_word_list, last_result)) - for term in query_word_list: + for term in series_name.split(): if term not in last_result.lower(): #print("Term '{}' not in last result. 
Halting search result fetching".format(term)) stop_searching = True break # Also, stop searching when the word count of last results is too much longer - # than our search terms list + # than our search terms list if len(utils.removearticles(last_result).split()) > result_word_count_max: #print("Last result '{}' is too long. Halting search result fetching".format(last_result)) stop_searching = True @@ -301,7 +290,8 @@ class ComicVineTalker(QObject): total_result_count)) page += 1 - cv_response = self.getCVContent(search_url + "&page=" + str(page)) + params['page'] = page + cv_response = self.getCVContent(self.api_base_url + "/search", params) search_results.extend(cv_response['results']) current_result_count += cv_response['number_of_page_results'] @@ -313,7 +303,7 @@ class ComicVineTalker(QObject): # (iterate backwards for easy removal) for i in range(len(search_results) - 1, -1, -1): record = search_results[i] - for term in query_word_list: + for term in series_name.split(): if term not in record['name'].lower(): del search_results[i] break @@ -339,11 +329,14 @@ class ComicVineTalker(QObject): if cached_volume_result is not None: return cached_volume_result - volume_url = self.api_base_url + "/volume/" + CVTypeID.Volume + "-" + \ - str(series_id) + "/?api_key=" + self.api_key + \ - "&field_list=name,id,start_year,publisher,count_of_issues&format=json" + volume_url = self.api_base_url + "/volume/" + CVTypeID.Volume + "-" + str(series_id) - cv_response = self.getCVContent(volume_url) + params = { + 'api_key': self.api_key, + 'format': 'json', + 'field_list': 'name,id,start_year,publisher,count_of_issues' + } + cv_response = self.getCVContent(volume_url, params) volume_results = cv_response['results'] @@ -361,11 +354,13 @@ class ComicVineTalker(QObject): if cached_volume_issues_result is not None: return cached_volume_issues_result - #--------------------------------- - issues_url = self.api_base_url + "/issues/" + "?api_key=" + self.api_key + "&filter=volume:" + \ 
- str(series_id) + \ - "&field_list=id,volume,issue_number,name,image,cover_date,site_detail_url,description&format=json" - cv_response = self.getCVContent(issues_url) + params = { + 'api_key': self.api_key, + 'filter': 'volume:' + str(series_id), + 'format': 'json', + 'field_list': 'id,volume,issue_number,name,image,cover_date,site_detail_url,description' + } + cv_response = self.getCVContent(self.api_base_url + "/issues/", params) #------------------------------------ @@ -385,9 +380,8 @@ class ComicVineTalker(QObject): page += 1 offset += cv_response['number_of_page_results'] - # print issues_url+ "&offset="+str(offset) - cv_response = self.getCVContent( - issues_url + "&offset=" + str(offset)) + params['offset'] = offset + cv_response = self.getCVContent(self.api_base_url + "/issues/", params) volume_issues_result.extend(cv_response['results']) current_result_count += cv_response['number_of_page_results'] @@ -398,26 +392,24 @@ class ComicVineTalker(QObject): return volume_issues_result - def fetchIssuesByVolumeIssueNumAndYear( - self, volume_id_list, issue_number, year): - volume_filter = "volume:" + def fetchIssuesByVolumeIssueNumAndYear(self, volume_id_list, issue_number, year): + volume_filter = "" for vid in volume_id_list: volume_filter += str(vid) + "|" + filter = "volume:{},issue_number:{}".format(volume_filter, issue_number) - year_filter = "" - if year is not None and str(year).isdigit(): - year_filter = ",cover_date:{0}-1-1|{1}-1-1".format( - year, int(year) + 1) + intYear = utils.xlate(year, True) + if intYear is not None: + filter += ",cover_date:{}-1-1|{}-1-1".format(intYear, intYear + 1) - issue_number = urllib.parse.quote_plus(str(issue_number).encode("utf-8")) + params = { + 'api_key': self.api_key, + 'format': 'json', + 'field_list': 'id,volume,issue_number,name,image,cover_date,site_detail_url,description', + 'filter': filter + } - filter = "&filter=" + volume_filter + \ - year_filter + ",issue_number:" + issue_number - - issues_url = 
self.api_base_url + "/issues/" + "?api_key=" + self.api_key + filter + \ - "&field_list=id,volume,issue_number,name,image,cover_date,site_detail_url,description&format=json" - - cv_response = self.getCVContent(issues_url) + cv_response = self.getCVContent(self.api_base_url + "/issues", params) #------------------------------------ @@ -437,9 +429,8 @@ class ComicVineTalker(QObject): page += 1 offset += cv_response['number_of_page_results'] - # print issues_url+ "&offset="+str(offset) - cv_response = self.getCVContent( - issues_url + "&offset=" + str(offset)) + params['offset'] = offset + cv_response = self.getCVContent(self.api_base_url + "/issues/", params) filtered_issues_result.extend(cv_response['results']) current_result_count += cv_response['number_of_page_results'] @@ -463,11 +454,12 @@ class ComicVineTalker(QObject): break if (found): - issue_url = self.api_base_url + "/issue/" + CVTypeID.Issue + "-" + \ - str(record['id']) + "/?api_key=" + \ - self.api_key + "&format=json" - - cv_response = self.getCVContent(issue_url) + issue_url = self.api_base_url + "/issue/" + CVTypeID.Issue + "-" + str(record['id']) + params = { + 'api_key': self.api_key, + 'format': 'json' + } + cv_response = self.getCVContent(issue_url, params) issue_results = cv_response['results'] else: @@ -479,9 +471,12 @@ class ComicVineTalker(QObject): def fetchIssueDataByIssueID(self, issue_id, settings): - issue_url = self.api_base_url + "/issue/" + CVTypeID.Issue + "-" + \ - str(issue_id) + "/?api_key=" + self.api_key + "&format=json" - cv_response = self.getCVContent(issue_url) + issue_url = self.api_base_url + "/issue/" + CVTypeID.Issue + "-" + str(issue_id) + params = { + 'api_key': self.api_key, + 'format': 'json' + } + cv_response = self.getCVContent(issue_url, params) issue_results = cv_response['results'] @@ -670,9 +665,15 @@ class ComicVineTalker(QObject): if cached_details['image_url'] is not None: return cached_details - issue_url = self.api_base_url + "/issue/" + CVTypeID.Issue + 
"-" + \ - str(issue_id) + "/?api_key=" + self.api_key + \ - "&format=json&field_list=image,cover_date,site_detail_url" + issue_url = self.api_base_url + "/issue/" + CVTypeID.Issue + "-" + str(issue_id) + + params = { + 'api_key': self.api_key, + 'format': 'json', + 'field_list': 'image,cover_date,site_detail_url' + } + + cv_response = self.getCVContent(issue_url, params) details = dict() details['image_url'] = None @@ -680,8 +681,6 @@ class ComicVineTalker(QObject): details['cover_date'] = None details['site_detail_url'] = None - cv_response = self.getCVContent(issue_url) - details['image_url'] = cv_response['results']['image']['super_url'] details['thumb_image_url'] = cv_response[ 'results']['image']['thumb_url'] @@ -716,8 +715,7 @@ class ComicVineTalker(QObject): return url_list # scrape the CV issue page URL to get the alternate cover URLs - resp = urllib.request.urlopen(issue_page_url, context=self.ssl) - content = resp.read() + content = requests.get(issue_page_url, headers={'user-agent': 'comictagger/' + ctversion.version}).text alt_cover_url_list = self.parseOutAltCoverUrls(content) # cache this alt cover URL list @@ -727,9 +725,9 @@ class ComicVineTalker(QObject): def parseOutAltCoverUrls(self, page_html): soup = BeautifulSoup(page_html, "html.parser") - + alt_cover_url_list = [] - + # Using knowledge of the layout of the Comic Vine issue page here: # look for the divs that are in the classes 'imgboxart' and # 'issue-cover' @@ -738,15 +736,15 @@ class ComicVineTalker(QObject): for d in div_list: if 'class' in d.attrs: c = d['class'] - if ('imgboxart' in c and + if ('imgboxart' in c and 'issue-cover' in c and d.img['src'].startswith("http") ): - + covers_found += 1 if covers_found != 1: alt_cover_url_list.append(d.img['src']) - + return alt_cover_url_list def fetchCachedAlternateCoverURLs(self, issue_id): diff --git a/comictaggerlib/imagefetcher.py b/comictaggerlib/imagefetcher.py index 85a784e..5dbd215 100644 --- a/comictaggerlib/imagefetcher.py +++ 
b/comictaggerlib/imagefetcher.py @@ -19,9 +19,7 @@ import os import datetime import shutil import tempfile -import urllib.request, urllib.parse, urllib.error -import ssl -#import urllib2 +import requests try: from PyQt5.QtNetwork import QNetworkAccessManager, QNetworkRequest @@ -46,6 +44,7 @@ except ImportError: pass from .settings import ComicTaggerSettings +from . import ctversion class ImageFetcherException(Exception): @@ -66,9 +65,6 @@ class ImageFetcher(QObject): if not os.path.exists(self.db_file): self.create_image_db() - # always use a tls context for urlopen - self.ssl = ssl.SSLContext(ssl.PROTOCOL_TLS) - def clearCache(self): os.unlink(self.db_file) if os.path.isdir(self.cache_folder): @@ -90,7 +86,8 @@ class ImageFetcher(QObject): if blocking: if image_data is None: try: - image_data = urllib.request.urlopen(url, context=self.ssl).read() + print(url) + image_data = requests.get(url, headers={'user-agent': 'comictagger/' + ctversion.version}).content except Exception as e: print(e) raise ImageFetcherException("Network Error!") diff --git a/comictaggerlib/issueidentifier.py b/comictaggerlib/issueidentifier.py index 47b51af..292e84c 100644 --- a/comictaggerlib/issueidentifier.py +++ b/comictaggerlib/issueidentifier.py @@ -16,9 +16,6 @@ import sys import io -#import math -#import urllib2 -#import urllib try: from PIL import Image diff --git a/comictaggerlib/versionchecker.py b/comictaggerlib/versionchecker.py index 32b3684..e1fbad5 100644 --- a/comictaggerlib/versionchecker.py +++ b/comictaggerlib/versionchecker.py @@ -16,9 +16,9 @@ import sys import platform -import urllib.request, urllib.error, urllib.parse +import requests +import urllib.parse #import os -#import urllib try: from PyQt5.QtNetwork import QNetworkAccessManager, QNetworkRequest, QNetworkReply @@ -47,28 +47,30 @@ class VersionChecker(QObject): base_url = "http://comictagger1.appspot.com/latest" args = "" - + params = dict() if use_stats: + params = { + 'uuid': uuid, + 'version': 
ctversion.version + } if platform.system() == "Windows": - plat = "win" + params['platform'] = "win" elif platform.system() == "Linux": - plat = "lin" + params['platform'] = "lin" elif platform.system() == "Darwin": - plat = "mac" + params['platform'] = "mac" else: - plat = "other" - args = "?uuid={0}&platform={1}&version={2}".format( - uuid, plat, ctversion.version) - if not getattr(sys, 'frozen', None): - args += "&src=T" + params['platform'] = "other" - return base_url + args + if not getattr(sys, 'frozen', None): + params['src'] = 'T' + + return (base_url, params) def getLatestVersion(self, uuid, use_stats=True): - try: - resp = urllib.request.urlopen(self.getRequestUrl(uuid, use_stats)) - new_version = resp.read() + url, params = self.getRequestUrl(uuid, use_stats) + new_version = requests.get(url, params=params).text except Exception as e: return None @@ -79,12 +81,11 @@ class VersionChecker(QObject): versionRequestComplete = pyqtSignal(str) def asyncGetLatestVersion(self, uuid, use_stats): - - url = self.getRequestUrl(uuid, use_stats) + url, params = self.getRequestUrl(uuid, use_stats) self.nam = QNetworkAccessManager() self.nam.finished.connect(self.asyncGetLatestVersionComplete) - self.nam.get(QNetworkRequest(QUrl(str(url)))) + self.nam.get(QNetworkRequest(QUrl(str(url + '?' 
+ urllib.parse.urlencode(params))))) def asyncGetLatestVersionComplete(self, reply): if (reply.error() != QNetworkReply.NoError): diff --git a/requirements.txt b/requirements.txt index da4343a..02d04d3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ configparser +requests beautifulsoup4 >= 4.1 natsort==3.5.2 PyPDF2==1.24 From fff28cf6ae0ea84ab9a6fdfd362791aa0402e11a Mon Sep 17 00:00:00 2001 From: lordwelch Date: Thu, 13 Feb 2020 00:27:08 -0800 Subject: [PATCH 3/3] Improve searchForSeries Refactor removearticles to only remove articles Add normalization on the search string and the series name results Searching now only compares ASCII a-z and 0-9 and all other characters are replaced with a single space, this is done to both the search string and the result. This fixes an issue with names that are separated by a hyphen (-) in the filename but in the Comic Vine name are separated by a slash (/) and other similar issues. --- comicapi/utils.py | 10 -------- comictaggerlib/comicvinetalker.py | 42 ++++++++++++++++++++++--------- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/comicapi/utils.py b/comicapi/utils.py index aeec166..f82f99a 100644 --- a/comicapi/utils.py +++ b/comicapi/utils.py @@ -148,16 +148,6 @@ def removearticles(text): newText = newText[:-1] - # now get rid of some other junk - newText = newText.replace(":", "") - newText = newText.replace(",", "") - newText = newText.replace("-", " ") - - # since the CV API changed, searches for series names with periods - # now explicitly require the period to be in the search key, - # so the line below is removed (for now) - #newText = newText.replace(".", "") - return newText diff --git a/comictaggerlib/comicvinetalker.py b/comictaggerlib/comicvinetalker.py index 0acd808..75f254a 100644 --- a/comictaggerlib/comicvinetalker.py +++ b/comictaggerlib/comicvinetalker.py @@ -21,6 +21,7 @@ import time import datetime import sys import ssl +import unicodedata #from pprint import pprint #import 
math @@ -203,8 +204,13 @@ class ComicVineTalker(QObject): def searchForSeries(self, series_name, callback=None, refresh_cache=False): - # remove cruft from the search string - series_name = utils.removearticles(series_name).lower().strip() + # normalize unicode and convert to ascii. Does not work for everything eg ½ to 1⁄2 not 1/2 + search_series_name = unicodedata.normalize('NFKD', series_name).encode('ascii', 'ignore').decode('ascii') + # comicvine ignores punctuation and accents + search_series_name = re.sub(r'[^A-Za-z0-9]+',' ', search_series_name) + # remove extra space and articles and all lower case + search_series_name = utils.removearticles(search_series_name).lower().strip() + # before we search online, look in our cache, since we might have # done this same search recently @@ -215,14 +221,12 @@ class ComicVineTalker(QObject): if len(cached_search_results) > 0: return cached_search_results - original_series_name = series_name - params = { 'api_key': self.api_key, 'format': 'json', 'resources': 'volume', - 'query': series_name, - 'field_list': 'name,id,start_year,publisher,image,description,count_of_issues', + 'query': search_series_name, + 'field_list': 'volume,name,id,start_year,publisher,image,description,count_of_issues', 'page': 1 } @@ -245,7 +249,7 @@ class ComicVineTalker(QObject): # 2. Halt when not all of our search terms are present in a result # 3. Halt when the results contain more (plus threshold) words than # our search - result_word_count_max = len(series_name.split()) + 3 + result_word_count_max = len(search_series_name.split()) + 3 total_result_count = min(total_result_count, max_results) @@ -266,9 +270,16 @@ class ComicVineTalker(QObject): last_result = search_results[-1]['name'] + # normalize unicode and convert to ascii. 
Does not work for everything eg ½ to 1⁄2 not 1/2 + last_result = unicodedata.normalize('NFKD', last_result).encode('ascii', 'ignore').decode('ascii') + # comicvine ignores punctuation and accents + last_result = re.sub(r'[^A-Za-z0-9]+',' ', last_result) + # remove extra space and articles and all lower case + last_result = utils.removearticles(last_result).lower().strip() + # See if the last result's name has all the of the search terms. # if not, break out of this, loop, we're done. - for term in series_name.split(): + for term in search_series_name.split(): if term not in last_result.lower(): #print("Term '{}' not in last result. Halting search result fetching".format(term)) stop_searching = True @@ -276,7 +287,7 @@ class ComicVineTalker(QObject): # Also, stop searching when the word count of last results is too much longer # than our search terms list - if len(utils.removearticles(last_result).split()) > result_word_count_max: + if len(last_result) > result_word_count_max: #print("Last result '{}' is too long. Halting search result fetching".format(last_result)) stop_searching = True @@ -303,8 +314,15 @@ class ComicVineTalker(QObject): # (iterate backwards for easy removal) for i in range(len(search_results) - 1, -1, -1): record = search_results[i] - for term in series_name.split(): - if term not in record['name'].lower(): + for term in search_series_name.split(): + # normalize unicode and convert to ascii. 
Does not work for everything eg ½ to 1⁄2 not 1/2 + recordName = unicodedata.normalize('NFKD', record['name']).encode('ascii', 'ignore').decode('ascii') + # comicvine ignores punctuation and accents + recordName = re.sub(r'[^A-Za-z0-9]+',' ', recordName) + # remove extra space and articles and all lower case + recordName = utils.removearticles(recordName).lower().strip() + + if term not in recordName: del search_results[i] break @@ -315,7 +333,7 @@ class ComicVineTalker(QObject): #print(u"{0}: {1} ({2})".format(search_results['results'][0]['id'], search_results['results'][0]['name'] , search_results['results'][0]['start_year'])) # cache these search results - cvc.add_search_results(original_series_name, search_results) + cvc.add_search_results(series_name, search_results) return search_results