Switch to API data for alt images, remove unneeded functions, and drop the async methods since the new approach no longer needs them. See the comments about fetch_partial_volume_data

This commit is contained in:
Mizaki 2022-10-26 00:29:30 +01:00
parent cab69a32be
commit 4514ae80d0
7 changed files with 59 additions and 198 deletions
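A note on the approach: alternate cover URLs previously came from scraping the Comic Vine issue page; they are now read from the associated_images field of the issues API response and flattened into a comma-separated string on each ComicIssue. A minimal sketch of that extraction, assuming the response shape given by the CVAltImages TypedDict added below:

def extract_alt_image_urls(record: dict) -> str:
    # Collect the original_url of each associated image and
    # flatten the list to a comma-separated string for caching.
    alt_images_list = []
    for alt in record.get("associated_images", []):
        url = alt.get("original_url")
        if url:
            alt_images_list.append(url)
    return ",".join(alt_images_list)

# Shaped like the CVIssuesResults entries in this diff:
record = {"associated_images": [{"original_url": "https://example.com/a.jpg", "id": 1}]}
print(extract_alt_image_urls(record))  # https://example.com/a.jpg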

View File

@@ -204,7 +204,7 @@ class CoverImageWidget(QtWidgets.QWidget):
self.imageCount = len(self.url_list)
self.update_content()
# defer the alt cover search
# TODO No need to search for alt covers as they are now in ComicIssue data
if self.talker_api.talker.source_details.static_options.has_alt_covers:
QtCore.QTimer.singleShot(1, self.start_alt_cover_search)
@@ -216,7 +216,7 @@ class CoverImageWidget(QtWidgets.QWidget):
# page URL should already be cached, so no need to defer
ComicTalker.alt_url_list_fetch_complete = self.sig.emit_list
self.talker_api.talker.async_fetch_alternate_cover_urls(utils.xlate(self.issue_id))
self.talker_api.talker.fetch_alternate_cover_urls(utils.xlate(self.issue_id))
def alt_cover_url_list_fetch_complete(self, url_list: list[str]) -> None:
if url_list:
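With the async path removed, the widget defers the now cache-backed lookup with QTimer.singleShot and hands the result to a callback, as above. A rough stand-alone illustration of that defer-then-callback pattern (the widget and fetch callable are stand-ins, not the real classes):

import sys

from PyQt5 import QtCore, QtWidgets


class CoverWidget(QtWidgets.QWidget):
    # Stand-in for CoverImageWidget: paint first, fetch a moment later.
    def __init__(self, fetch_urls):
        super().__init__()
        self.fetch_urls = fetch_urls
        QtCore.QTimer.singleShot(1, self.start_alt_cover_search)

    def start_alt_cover_search(self):
        urls = self.fetch_urls()  # synchronous; the cache keeps this cheap
        print("alt covers:", urls)
        QtCore.QCoreApplication.quit()


app = QtWidgets.QApplication(sys.argv)
w = CoverWidget(lambda: ["https://example.com/alt1.jpg"])
w.show()
sys.exit(app.exec_())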

View File

@@ -96,16 +96,6 @@ class ComicCacher:
+ "PRIMARY KEY (id, source_name))"
)
cur.execute(
"CREATE TABLE AltCovers("
+ "issue_id INT NOT NULL,"
+ "url_list TEXT,"
+ "timestamp DATE DEFAULT (datetime('now','localtime')), "
+ "source_name TEXT NOT NULL,"
+ "aliases TEXT," # Newline separated
+ "PRIMARY KEY (issue_id, source_name))"
)
cur.execute(
"CREATE TABLE Issues("
+ "id INT NOT NULL,"
@@ -120,6 +110,7 @@ class ComicCacher:
+ "timestamp DATE DEFAULT (datetime('now','localtime')), "
+ "source_name TEXT NOT NULL,"
+ "aliases TEXT," # Newline separated
+ "alt_images_url TEXT," # Comma separated URLs
+ "PRIMARY KEY (id, source_name))"
)
@@ -186,47 +177,6 @@ class ComicCacher:
return results
def add_alt_covers(self, source_name: str, issue_id: int, url_list: list[str]) -> None:
con = lite.connect(self.db_file)
with con:
con.text_factory = str
cur = con.cursor()
# remove all previous entries with this search term
cur.execute("DELETE FROM AltCovers WHERE issue_id=? AND source_name=?", [issue_id, source_name])
url_list_str = ",".join(url_list)
# now add in new record
cur.execute(
"INSERT INTO AltCovers (source_name, issue_id, url_list) VALUES(?, ?, ?)",
(source_name, issue_id, url_list_str),
)
def get_alt_covers(self, source_name: str, issue_id: int) -> list[str]:
con = lite.connect(self.db_file)
with con:
cur = con.cursor()
con.text_factory = str
# purge stale issue info - probably issue data won't change
# much....
a_month_ago = datetime.datetime.today() - datetime.timedelta(days=30)
cur.execute("DELETE FROM AltCovers WHERE timestamp < ?", [str(a_month_ago)])
cur.execute("SELECT url_list FROM AltCovers WHERE issue_id=? AND source_name=?", [issue_id, source_name])
row = cur.fetchone()
if row is None:
return []
url_list_str = row[0]
if not url_list_str:
return []
url_list = str(url_list_str).split(",")
return url_list
def add_volume_info(self, source_name: str, volume_record: ComicVolume) -> None:
con = lite.connect(self.db_file)
@@ -272,6 +222,7 @@ class ComicCacher:
"description": issue["description"],
"timestamp": timestamp,
"aliases": issue["aliases"],
"alt_images_url": issue["alt_images_url"],
}
self.upsert(cur, "issues", data)
@@ -331,7 +282,7 @@ class ComicCacher:
cur.execute(
(
"SELECT source_name,id,name,issue_number,site_detail_url,cover_date,image_url,thumb_url,description,aliases"
"SELECT source_name,id,name,issue_number,site_detail_url,cover_date,image_url,thumb_url,description,aliases,alt_images_url"
" FROM Issues WHERE volume_id=? AND source_name=?"
),
[volume_id, source_name],
@@ -350,6 +301,7 @@ class ComicCacher:
description=row[8],
volume=volume,
aliases=row[9],
alt_images_url=row[10],
)
results.append(record)
@@ -369,7 +321,7 @@ class ComicCacher:
cur.execute(
(
"SELECT source_name,id,name,issue_number,site_detail_url,cover_date,image_url,thumb_url,description,aliases,volume_id"
"SELECT source_name,id,name,issue_number,site_detail_url,cover_date,image_url,thumb_url,description,aliases,volume_id,alt_images_url"
" FROM Issues WHERE id=? AND source_name=?"
),
[issue_id, source_name],
@@ -391,9 +343,11 @@ class ComicCacher:
site_detail_url=row[4],
cover_date=row[5],
image_url=row[6],
image_thumb_url=row[7],
description=row[8],
volume=volume,
aliases=row[9],
alt_images_url=row[11],
)
return record
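The new alt_images_url TEXT column holds the flattened list; reading it back needs the empty-string guard used later in this commit, because "".split(",") yields ['']. A small sqlite3 sketch of the round trip (table and column names follow the diff, the rest is illustrative):

import sqlite3

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE Issues(id INT, source_name TEXT, alt_images_url TEXT)")

urls = ["https://example.com/a.jpg", "https://example.com/b.jpg"]
con.execute(
    "INSERT INTO Issues VALUES(?, ?, ?)",
    (140529, "comicvine", ",".join(urls)),  # hypothetical issue id
)

row = con.execute("SELECT alt_images_url FROM Issues WHERE id=?", (140529,)).fetchone()
restored = [x for x in row[0].split(",") if x]  # guard: '' -> [] rather than ['']
assert restored == urls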

View File

@@ -25,3 +25,4 @@ class ComicIssue(TypedDict, total=False):
name: Required[str]
site_detail_url: str
volume: ComicVolume
alt_images_url: str # Comma separated URLs
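Since ComicIssue is declared with total=False, the new field is optional while name stays Required; a trimmed sketch of how that plays out (only the fields shown in this hunk):

from typing import TypedDict

try:  # Required is in typing from Python 3.11; typing_extensions backports it
    from typing import Required
except ImportError:
    from typing_extensions import Required


class ComicIssue(TypedDict, total=False):
    name: Required[str]  # must always be present
    site_detail_url: str
    alt_images_url: str  # optional; comma-separated URLs


issue: ComicIssue = {"name": "Example Issue"}  # valid: optional fields omitted
issue["alt_images_url"] = "https://example.com/a.jpg"  # filled from API data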

View File

@@ -196,7 +196,7 @@ class TalkerBase:
"""This function is expected to handle a few possibilities:
1. Only series_id. Retrieve the SERIES/VOLUME information only.
2. series_id and issue_number. Retrieve the ISSUE information.
3. Only issue_id. Used solely by the CLI to retrieve the ISSUE information."""
3. Only issue_id. Retrieve the ISSUE information."""
raise NotImplementedError
def fetch_alternate_cover_urls(self, issue_id: int) -> list[str]:
@@ -206,6 +206,3 @@ class TalkerBase:
self, volume_id_list: list[int], issue_number: str, year: str | int | None
) -> list[ComicIssue]:
raise NotImplementedError
def async_fetch_alternate_cover_urls(self, issue_id: int) -> None:
raise NotImplementedError
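The docstring above lists three call modes for the issue lookup; a hypothetical sketch of that dispatch (the signature is invented for illustration, not the real TalkerBase API):

from __future__ import annotations


def fetch_issue_data(series_id: int | None = None,
                     issue_number: str | None = None,
                     issue_id: int | None = None) -> str:
    # Illustrative dispatch over the three documented call modes.
    if issue_id is not None:
        return f"issue info for issue {issue_id}"  # mode 3
    if series_id is not None and issue_number is not None:
        return f"issue {issue_number} of series {series_id}"  # mode 2
    if series_id is not None:
        return f"series/volume info for {series_id}"  # mode 1
    raise ValueError("need series_id, series_id + issue_number, or issue_id")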

View File

@@ -46,13 +46,6 @@ from comictalker.talkerbase import (
logger = logging.getLogger(__name__)
try:
from PyQt5 import QtCore, QtNetwork
qt_available = True
except ImportError:
qt_available = False
class CVTypeID:
Volume = "4050"
@@ -72,6 +65,13 @@ class CVImage(TypedDict, total=False):
image_tags: str
class CVAltImages(TypedDict):
original_url: str
id: int
caption: str
image_tags: str
class CVPublisher(TypedDict, total=False):
api_detail_url: str
id: int
@@ -140,6 +140,7 @@ class CVIssuesResults(TypedDict):
name: str
site_detail_url: str
volume: CVVolume
associated_images: list[CVAltImages]
class CVVolumeFullResult(TypedDict):
@@ -295,10 +296,6 @@ class ComicVineTalker(TalkerBase):
self.series_match_thresh = series_match_thresh
# Used for async cover loading etc.
if qt_available:
self.nam = QtNetwork.QNetworkAccessManager()
def check_api_key(self, key: str, url: str) -> bool:
if not url:
url = self.api_base_url
@@ -430,6 +427,15 @@ class ComicVineTalker(TalkerBase):
image_url = record["image"].get("super_url", "")
image_thumb_url = record["image"].get("thumb_url", "")
alt_images_list = []
for alt in record["associated_images"]:
# Convert to comma separated
for k, v in alt.items():
if k == "original_url":
alt_images_list.append(v)
alt_images_url = ",".join(alt_images_list)
formatted_results.append(
ComicIssue(
aliases=record["aliases"],
@@ -442,6 +448,7 @@
name=record["name"],
site_detail_url=record.get("site_detail_url", ""),
volume=cast(ComicVolume, record["volume"]),
alt_images_url=alt_images_url,
)
)
@@ -544,6 +551,7 @@ class ComicVineTalker(TalkerBase):
# To support volume only searching
def fetch_volume_data(self, series_id: int) -> GenericMetadata:
# TODO New cache table or expand current? Makes sense to cache as multiple chapters will want the same data
# How to format people in cache? characters and locations can be csv.
volume_url = urljoin(self.api_base_url, f"volume/{CVTypeID.Volume}-{series_id}")
@@ -573,7 +581,8 @@ class ComicVineTalker(TalkerBase):
return comic_data
def fetch_partial_volume_data(self, series_id: int) -> ComicVolume:
# This is still required over fetch_volume_data because this can use the cache whereas fetch_volume_data always
# requires an API call
# before we search online, look in our cache, since we might already have this info
cvc = ComicCacher()
cached_volume_result = cvc.get_volume_info(series_id, self.source_name)
@@ -598,31 +607,6 @@ class ComicVineTalker(TalkerBase):
return formatted_volume_results[0]
def fetch_partial_issue_data(self, issue_id: int) -> ComicIssue:
# before we search online, look in our cache, since we might already have this info
cvc = ComicCacher()
cached_issue_result = cvc.get_issue_info(issue_id, self.source_name)
if cached_issue_result is not None:
return cached_issue_result
params = {
"api_key": self.api_key,
"filter": f"id:{issue_id}",
"format": "json",
"field_list": "id,volume,issue_number,name,image,cover_date,site_detail_url,description,aliases",
"offset": 0,
}
cv_response = self.get_cv_content(urljoin(self.api_base_url, "issues/"), params)
issue_result = cast(CVIssuesResults, cv_response["results"])
formatted_issue_results = self.format_issue_results([issue_result])
if formatted_issue_results:
cvc.add_volume_issues_info(self.source_name, formatted_issue_results)
return formatted_issue_results[0]
def fetch_issues_by_volume(self, series_id: int) -> list[ComicIssue]:
# before we search online, look in our cache, since we might already have this info
cvc = ComicCacher()
@@ -635,7 +619,7 @@ class ComicVineTalker(TalkerBase):
"api_key": self.api_key,
"filter": f"volume:{series_id}",
"format": "json",
"field_list": "id,volume,issue_number,name,image,cover_date,site_detail_url,description,aliases",
"field_list": "id,volume,issue_number,name,image,cover_date,site_detail_url,description,aliases,associated_images",
"offset": 0,
}
cv_response = self.get_cv_content(urljoin(self.api_base_url, "issues/"), params)
@@ -682,7 +666,7 @@ class ComicVineTalker(TalkerBase):
params: dict[str, str | int] = {
"api_key": self.api_key,
"format": "json",
"field_list": "id,volume,issue_number,name,image,cover_date,site_detail_url,description,aliases",
"field_list": "id,volume,issue_number,name,image,cover_date,site_detail_url,description,aliases,associated_images",
"filter": flt,
}
@@ -739,14 +723,14 @@ class ComicVineTalker(TalkerBase):
return self.map_cv_data_to_metadata(volume_results, issue_results)
def fetch_issue_data_by_issue_id(self, issue_id: int) -> GenericMetadata:
# Cache will always be missing data (person, character, location) so a call to the API is required
issue_url = urljoin(self.api_base_url, f"issue/{CVTypeID.Issue}-{issue_id}")
params = {"api_key": self.api_key, "format": "json"}
cv_response = self.get_cv_content(issue_url, params)
issue_results = cast(CVIssueDetailResults, cv_response["results"])
volume_results = self.fetch_partial_volume_data(issue_results["volume"]["id"])
volume_results = self.fetch_partial_volume_data(int(issue_results["volume"]["id"]))
# Now, map the Comic Vine data to generic metadata
md = self.map_cv_data_to_metadata(volume_results, issue_results)
@@ -963,105 +947,35 @@ class ComicVineTalker(TalkerBase):
return newstring
def fetch_alternate_cover_urls(self, issue_id: int) -> list[str]:
url_list = self.fetch_cached_alternate_cover_urls(issue_id)
if url_list:
return url_list
issue_info = self.fetch_partial_issue_data(issue_id)
issue_url = issue_info["site_detail_url"]
# scrape the CV issue page URL to get the alternate cover URLs
content = requests.get(issue_url, headers={"user-agent": "comictagger/" + ctversion.version}).text
alt_cover_url_list = self.parse_out_alt_cover_urls(content)
# cache this alt cover URL list
self.cache_alternate_cover_urls(issue_id, alt_cover_url_list)
return alt_cover_url_list
def parse_out_alt_cover_urls(self, page_html: str) -> list[str]:
soup = BeautifulSoup(page_html, "html.parser")
alt_cover_url_list = []
# Using knowledge of the layout of the Comic Vine issue page here:
# look for the divs that are in the classes 'imgboxart' and 'issue-cover'
div_list = soup.find_all("div")
covers_found = 0
for d in div_list:
if "class" in d.attrs:
c = d["class"]
if "imgboxart" in c and "issue-cover" in c:
if d.img["src"].startswith("http"):
covers_found += 1
if covers_found != 1:
alt_cover_url_list.append(d.img["src"])
elif d.img["data-src"].startswith("http"):
covers_found += 1
if covers_found != 1:
alt_cover_url_list.append(d.img["data-src"])
return alt_cover_url_list
def fetch_cached_alternate_cover_urls(self, issue_id: int) -> list[str]:
# before we search online, look in our cache, since we might already have this info
# Should always be in cache
cvc = ComicCacher()
url_list = cvc.get_alt_covers(self.source_name, issue_id)
issue = cvc.get_issue_info(issue_id, self.source_name)
return url_list
if issue:
url_list = [x for x in issue["alt_images_url"].split(",") if x] # Prevent an empty string producing ['']
else:
# On the off chance it is not in cache
params = {
"api_key": self.api_key,
"filter": f"id:{issue_id}",
"format": "json",
"field_list": "id,volume,issue_number,name,image,cover_date,site_detail_url,description,aliases,associated_images",
"offset": 0,
}
cv_response = self.get_cv_content(urljoin(self.api_base_url, "issues/"), params)
def cache_alternate_cover_urls(self, issue_id: int, url_list: list[str]) -> None:
cvc = ComicCacher()
cvc.add_alt_covers(self.source_name, issue_id, url_list)
issue_result = cast(CVIssuesResults, cv_response["results"])
def async_fetch_issue_cover_url_complete(self, reply: QtNetwork.QNetworkReply) -> None:
# read in the response
data = reply.readAll()
# Format to expected output
formatted_volume_issues_result = self.format_issue_results(issue_result)
try:
cv_response = cast(CVResult, json.loads(bytes(data)))
except Exception:
logger.exception("Comic Vine query failed to get JSON data\n%s", str(data))
return
cvc.add_volume_issues_info(self.source_name, formatted_volume_issues_result)
if cv_response["status_code"] != 1:
logger.error("Comic Vine query failed with error: [%s]. ", cv_response["error"])
return
url_list = [x for x in issue["alt_images_url"].split(",") if x]
result = cast(CVIssuesResults, cv_response["results"])
image_url = result["image"]["super_url"]
thumb_url = result["image"]["thumb_url"]
ComicTalker.url_fetch_complete(image_url, thumb_url)
def async_fetch_alternate_cover_urls(self, issue_id: int) -> None:
# bypass async for now
url_list = self.fetch_alternate_cover_urls(issue_id)
ComicTalker.alt_url_list_fetch_complete(url_list)
return
# This async version requires the issue page url to be provided!
self.issue_id = issue_id
url_list = self.fetch_cached_alternate_cover_urls(issue_id)
if url_list:
ComicTalker.alt_url_list_fetch_complete(url_list)
issue_info = self.fetch_partial_issue_data(issue_id)
issue_url = issue_info["site_detail_url"]
self.nam.finished.connect(self.async_fetch_alternate_cover_urls_complete)
self.nam.get(QtNetwork.QNetworkRequest(QtCore.QUrl(str(issue_url))))
def async_fetch_alternate_cover_urls_complete(self, reply: QtNetwork.QNetworkReply) -> None:
# read in the response
html = str(reply.readAll())
alt_cover_url_list = self.parse_out_alt_cover_urls(html)
# cache this alt cover URL list
self.cache_alternate_cover_urls(cast(int, self.issue_id), alt_cover_url_list)
ComicTalker.alt_url_list_fetch_complete(alt_cover_url_list)
# CLI expects a list returned
return url_list
def repair_urls(self, issue_list: list[CVIssuesResults] | list[CVIssueDetailResults]) -> None:
# make sure there are URLs for the image fields
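The comment on fetch_partial_volume_data and the rewritten fetch_alternate_cover_urls share one cache-first shape: consult the ComicCacher, fall back to a single API call on a miss, and write the result back. A generic sketch of that pattern (the callables are placeholders, not the real classes):

from __future__ import annotations

from typing import Callable


def cache_first(issue_id: int,
                read_cache: Callable[[int], list[str] | None],
                call_api: Callable[[int], list[str]],
                write_cache: Callable[[int, list[str]], None]) -> list[str]:
    # Return cached data when present; otherwise hit the API once and cache it.
    cached = read_cache(issue_id)
    if cached is not None:
        return cached
    result = call_api(issue_id)  # single network round trip on a miss
    write_cache(issue_id, result)
    return result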

View File

@@ -3,7 +3,7 @@ from __future__ import annotations
import pytest
import comictalker.comiccacher
from testing.comicdata import alt_covers, search_results
from testing.comicdata import search_results
def test_create_cache(settings):
@@ -20,12 +20,6 @@ def test_search_results(comic_cache):
assert search_results == comic_cache.get_search_results("test", "test search")
@pytest.mark.parametrize("alt_cover", alt_covers)
def test_alt_covers(comic_cache, alt_cover):
comic_cache.add_alt_covers(**alt_cover, source_name="test")
assert alt_cover["url_list"] == comic_cache.get_alt_covers(issue_id=alt_cover["issue_id"], source_name="test")
@pytest.mark.parametrize("volume_info", search_results)
def test_volume_info(comic_cache, volume_info):
comic_cache.add_volume_info(volume_record=volume_info, source_name="test")

View File

@@ -54,6 +54,7 @@ def test_fetch_issues_by_volume_issue_num_and_year(comicvine_api):
for r, e in zip(results, [cv_expected]):
del r["image_thumb_url"]
del r["image_url"]
del r["alt_images_url"]
assert r == e
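With the AltCovers table and its test gone, the flatten/split behaviour of alt_images_url is the piece worth covering; a hypothetical pytest sketch in the style above (not part of this commit):

import pytest


@pytest.mark.parametrize(
    "alt_images_url",
    ["", "https://example.com/a.jpg,https://example.com/b.jpg"],
)
def test_alt_images_url_round_trip(alt_images_url):
    # Splitting with the empty-string guard and re-joining must be lossless.
    urls = [x for x in alt_images_url.split(",") if x]
    assert ",".join(urls) == alt_images_url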