Move comic-issue-to-metadata mapping into talker utils, along with the HTML cleanup helper. Update tests accordingly.

This commit is contained in:
Mizaki 2022-11-05 16:49:59 +00:00
parent a724fd8430
commit 67be086638
6 changed files with 94 additions and 169 deletions

View File

@ -30,8 +30,8 @@ from comictaggerlib.imagefetcher import ImageFetcher, ImageFetcherException
from comictaggerlib.imagehasher import ImageHasher
from comictaggerlib.resulttypes import IssueResult
from comictaggerlib.settings import ComicTaggerSettings
from comictalker.talker_utils import parse_date_str
from comictalker.talkerbase import ComicTalker, TalkerError
from comictalker.utils import parse_date_str
logger = logging.getLogger(__name__)

View File

@ -17,14 +17,69 @@ from __future__ import annotations
import logging
import re
from datetime import datetime
from bs4 import BeautifulSoup
from comicapi import utils
from comicapi.genericmetadata import GenericMetadata
from comicapi.issuestring import IssueString
from comictaggerlib import ctversion
from comictalker.talkerbase import ComicIssue
logger = logging.getLogger(__name__)
def map_comic_issue_to_metadata(
    issue_results: ComicIssue, source: str, remove_html_tables: bool = False, use_year_volume: bool = False
) -> GenericMetadata:
    """Map a talker ComicIssue record onto a GenericMetadata object.

    Args:
        issue_results: The ComicIssue record to convert.
        source: Friendly name of the data source, used in the tagging note.
        remove_html_tables: Strip HTML tables from the description instead of
            rendering them as text.
        use_year_volume: Use the volume's start year as the volume number.

    Returns:
        A populated GenericMetadata instance (is_empty is set False).
    """
    metadata = GenericMetadata()
    metadata.is_empty = False

    # Guard every field lookup uniformly: records from different talkers or
    # the cache may be partial, so a missing/empty value leaves the field
    # unset instead of raising KeyError.  (The original mixed guarded .get()
    # with bare subscripts for cover_date/description/site_detail_url/credits.)
    volume = issue_results.get("volume") or {}
    if volume.get("name"):
        metadata.series = utils.xlate(volume["name"])
    if issue_results.get("issue_number"):
        metadata.issue = IssueString(issue_results["issue_number"]).as_string()
    if issue_results.get("name"):
        metadata.title = utils.xlate(issue_results["name"])
    if issue_results.get("image_url"):
        metadata.cover_image = issue_results["image_url"]
    if volume.get("publisher"):
        metadata.publisher = utils.xlate(volume["publisher"])

    # NOTE(review): assumes parse_date_str tolerates a None/missing date —
    # confirm; cleanup_html explicitly returns "" for None input.
    metadata.day, metadata.month, metadata.year = utils.parse_date_str(issue_results.get("cover_date"))
    metadata.comments = cleanup_html(issue_results.get("description"), remove_html_tables)
    if use_year_volume:
        metadata.volume = volume.get("start_year")

    metadata.notes = (
        f"Tagged with ComicTagger {ctversion.version} using info from {source} on"
        f" {datetime.now():%Y-%m-%d %H:%M:%S}. [Issue ID {issue_results.get('id')}]"
    )
    metadata.web_link = issue_results.get("site_detail_url")

    for person in issue_results.get("credits", []):
        if "role" in person:
            # One credit entry may carry several comma-separated roles.
            for role in person["role"].split(","):
                # can we determine 'primary' from CV??
                metadata.add_credit(person["name"], role.title().strip(), False)

    if issue_results.get("characters"):
        metadata.characters = ", ".join(issue_results["characters"])
    if issue_results.get("teams"):
        metadata.teams = ", ".join(issue_results["teams"])
    if issue_results.get("locations"):
        metadata.locations = ", ".join(issue_results["locations"])
    if issue_results.get("story_arcs"):
        metadata.story_arc = ", ".join(issue_results["story_arcs"])

    return metadata
def parse_date_str(date_str: str) -> tuple[int | None, int | None, int | None]:
day = None
month = None

View File

@ -17,16 +17,15 @@ from __future__ import annotations
import json
import logging
import re
import time
from datetime import datetime
from typing import Any, Callable, cast
from urllib.parse import urljoin, urlsplit
import requests
from bs4 import BeautifulSoup
from typing_extensions import Required, TypedDict
import comictalker.talker_utils as talker_utils
from comicapi import utils
from comicapi.genericmetadata import GenericMetadata
from comicapi.issuestring import IssueString
@ -722,7 +721,12 @@ class ComicVineTalker(ComicTalker):
if f_record and f_record["complete"]:
# Cache had full record
return self.map_cv_data_to_metadata(f_record)
return talker_utils.map_comic_issue_to_metadata(
f_record,
self.source_name_friendly,
self.settings_options["remove_html_tables"]["value"],
self.settings_options["use_series_start_as_volume"]["value"],
)
if f_record is not None:
issue_url = urljoin(self.api_base_url, f"issue/{CVTypeID.Issue}-{f_record['id']}")
@ -742,7 +746,12 @@ class ComicVineTalker(ComicTalker):
else:
return GenericMetadata()
return self.map_cv_data_to_metadata(formatted_issues_result[0])
return talker_utils.map_comic_issue_to_metadata(
formatted_issues_result[0],
self.source_name_friendly,
self.settings_options["remove_html_tables"]["value"],
self.settings_options["use_series_start_as_volume"]["value"],
)
def fetch_issue_data_by_issue_id(self, issue_id: int) -> GenericMetadata:
# before we search online, look in our cache, since we might already have this info
@ -750,7 +759,12 @@ class ComicVineTalker(ComicTalker):
cached_issues_result = cvc.get_issue_info(issue_id, self.source_name)
if cached_issues_result and cached_issues_result["complete"]:
return self.map_cv_data_to_metadata(cached_issues_result)
return talker_utils.map_comic_issue_to_metadata(
cached_issues_result,
self.source_name_friendly,
self.settings_options["remove_html_tables"]["value"],
self.settings_options["use_series_start_as_volume"]["value"],
)
issue_url = urljoin(self.api_base_url, f"issue/{CVTypeID.Issue}-{issue_id}")
params = {"api_key": self.api_key, "format": "json"}
@ -768,9 +782,14 @@ class ComicVineTalker(ComicTalker):
cvc.add_volume_issues_info(self.source_name, formatted_issues_result)
# Now, map the ComicIssue data to generic metadata
return self.map_cv_data_to_metadata(formatted_issues_result[0])
return talker_utils.map_comic_issue_to_metadata(
formatted_issues_result[0],
self.source_name_friendly,
self.settings_options["remove_html_tables"]["value"],
self.settings_options["use_series_start_as_volume"]["value"],
)
# To support volume only searching. For testing only.
# To support volume only searching. For testing only. # TODO Delete or create ComicIssue to then map
def map_cv_volume_data_to_metadata(self, volume_results: CVVolumeFullResult) -> GenericMetadata:
# Now, map the Comic Vine data to generic metadata
@ -783,7 +802,7 @@ class ComicVineTalker(ComicTalker):
metadata.publisher = utils.xlate(volume_results["publisher"]["name"])
metadata.year = utils.xlate(volume_results["start_year"], True)
metadata.comments = self.cleanup_html(
metadata.comments = talker_utils.cleanup_html(
volume_results["description"], self.settings_options["remove_html_tables"]["value"]
)
if self.settings_options["use_series_start_as_volume"]["value"]:
@ -821,142 +840,6 @@ class ComicVineTalker(ComicTalker):
return metadata
def map_cv_data_to_metadata(self, issue_results: ComicIssue) -> GenericMetadata:
    """Translate a Comic Vine ComicIssue record into GenericMetadata."""
    # TODO As this now takes ComicIssue, move to utils so other talkers can use it?
    md = GenericMetadata()
    md.is_empty = False

    volume = issue_results["volume"]
    md.series = utils.xlate(volume["name"])
    md.issue = IssueString(issue_results["issue_number"]).as_string()
    md.title = utils.xlate(issue_results["name"])
    md.cover_image = issue_results["image_url"]
    if volume.get("publisher") is not None:
        md.publisher = utils.xlate(volume["publisher"])
    md.day, md.month, md.year = utils.parse_date_str(issue_results["cover_date"])

    md.comments = self.cleanup_html(
        issue_results["description"], self.settings_options["remove_html_tables"]["value"]
    )
    if self.settings_options["use_series_start_as_volume"]["value"]:
        md.volume = volume["start_year"]

    md.notes = (
        f"Tagged with ComicTagger {ctversion.version} using info from {self.source_name_friendly} on"
        f" {datetime.now():%Y-%m-%d %H:%M:%S}. [Issue ID {issue_results['id']}]"
    )
    md.web_link = issue_results["site_detail_url"]

    for credit in issue_results["credits"]:
        if "role" in credit:
            # A single credit entry may list several comma-separated roles.
            for role in credit["role"].split(","):
                # can we determine 'primary' from CV??
                md.add_credit(credit["name"], role.title().strip(), False)

    md.characters = ", ".join(issue_results["characters"])
    md.teams = ", ".join(issue_results["teams"])
    md.locations = ", ".join(issue_results["locations"])
    md.story_arc = ", ".join(issue_results["story_arcs"])
    return md
# TODO Move to utils?
def cleanup_html(self, string: str, remove_html_tables: bool) -> str:
    """Convert an HTML description into readable plain text.

    Replaces common block-level tags with newlines / heading markers,
    strips every remaining tag, and either drops HTML tables outright or
    re-renders them as aligned monospaced text columns.

    Args:
        string: The raw HTML description; None yields "".
        remove_html_tables: When True, tables (and Comic Vine's cover-list
            heading) are deleted; when False, tables are rebuilt as text.

    Returns:
        The cleaned plain-text string.
    """
    if string is None:
        return ""

    # Collect the tables up front so they can be re-rendered after the
    # rest of the markup has been stripped.
    soup = BeautifulSoup(string, "html.parser")
    tables = soup.findAll("table")

    # Remove the source's own newlines first, then reintroduce ours at
    # structural tag boundaries.
    string = string.replace("\n", "")
    string = string.replace("<br>", "\n")
    string = string.replace("</li>", "\n")
    string = string.replace("</p>", "\n\n")
    # Render <h1>..</h6> headings as *Heading* lines (same effect as the
    # twelve literal replaces this consolidates).
    string = re.sub(r"<h[1-6]>", "*", string)
    string = re.sub(r"</h[1-6]>", "*\n", string)

    # Remove the tables, leaving "{}" placeholders when they are to be
    # rebuilt as text below.
    p = re.compile(r"<table[^<]*?>.*?</table>")
    if remove_html_tables:
        string = p.sub("", string)
        string = string.replace("*List of covers and their creators:*", "")
    else:
        string = p.sub("{}", string)

    # Now strip all other tags and decode the common entities.
    p = re.compile(r"<[^<]*?>")
    newstring = p.sub("", string)
    newstring = newstring.replace("&nbsp;", " ")
    newstring = newstring.replace("&amp;", "&")
    newstring = newstring.strip()

    if not remove_html_tables:
        # Rebuild the tables into text from BeautifulSoup's parse.
        try:
            table_strings = []
            for table in tables:
                rows = []
                hdrs = []
                col_widths = []
                for hdr in table.findAll("th"):
                    item = hdr.string.strip()
                    hdrs.append(item)
                    col_widths.append(len(item))
                rows.append(hdrs)
                for row in table.findAll("tr"):
                    cols = []
                    for i, c in enumerate(row.findAll("td")):
                        item = c.string.strip()
                        cols.append(item)
                        # Widen the column to fit the longest cell.
                        if len(item) > col_widths[i]:
                            col_widths[i] = len(item)
                    if len(cols) != 0:
                        rows.append(cols)
                # Now we have the data: build a row format string sized to
                # the column widths, with a separator under the header row.
                fmtstr = ""
                for w in col_widths:
                    fmtstr += f" {{:{w + 1}}}|"
                width = sum(col_widths) + len(col_widths) * 2
                table_text = ""
                counter = 0
                for row in rows:
                    table_text += fmtstr.format(*row) + "\n"
                    if counter == 0 and len(hdrs) != 0:
                        table_text += "-" * width + "\n"
                    counter += 1
                table_strings.append(table_text)
            newstring = newstring.format(*table_strings)
        except Exception:
            # Table rebuild failed: bail and remove the "{}" placeholders.
            # BUG FIX: str.replace returns a new string; the original
            # discarded the result, leaving "{}" in the output.
            logger.exception("table parse error")
            newstring = newstring.replace("{}", "")
    return newstring
def repair_urls(self, issue_list: list[CVIssueDetailResults]) -> None:
# make sure there are URLs for the image fields
for issue in issue_list:

View File

@ -3,8 +3,8 @@ from __future__ import annotations
from typing import Any
import comicapi.genericmetadata
import comictalker.talkers.comicvine
from comicapi import utils
from comictalker.talker_utils import cleanup_html
def filter_field_list(cv_result, kwargs):
@ -190,9 +190,7 @@ cv_md = comicapi.genericmetadata.GenericMetadata(
volume=None,
genre=None,
language=None,
comments=comictalker.talkers.comicvine.ComicVineTalker().cleanup_html(
cv_issue_result["results"]["description"], False
),
comments=cleanup_html(cv_issue_result["results"]["description"], False),
volume_count=None,
critical_rating=None,
country=None,
@ -200,19 +198,19 @@ cv_md = comicapi.genericmetadata.GenericMetadata(
alternate_number=None,
alternate_count=None,
imprint=None,
notes="Tagged with ComicTagger 1.4.4a9.dev20 using info from Comic Vine on 2022-07-11 17:42:41. [Issue ID 140529]",
notes=None,
web_link=cv_issue_result["results"]["site_detail_url"],
format=None,
manga=None,
black_and_white=None,
page_count=None,
maturity_rating=None,
story_arc="",
story_arc=None,
series_group=None,
scan_info=None,
characters="",
teams="",
locations="",
characters=None,
teams=None,
locations=None,
credits=[
comicapi.genericmetadata.CreditMetadata(person=x["name"], role=x["role"].title(), primary=False)
for x in cv_issue_result["results"]["person_credits"]

View File

@ -45,9 +45,10 @@ def test_fetch_issues_by_volume(comicvine_api, comic_cache):
assert results == cache_issues
def test_fetch_issue_data_by_issue_id(comicvine_api, settings, mock_now, mock_version):
def test_fetch_issue_data_by_issue_id(comicvine_api, settings, mock_version):
ct = comictalker.talkers.comicvine.ComicVineTalker()
result = ct.fetch_comic_data(140529)
result.notes = None
assert result == testing.comicvine.cv_md
@ -81,7 +82,8 @@ cv_issue = [
@pytest.mark.parametrize("volume_id, issue_number, expected", cv_issue)
def test_fetch_issue_data(comicvine_api, settings, mock_now, mock_version, volume_id, issue_number, expected):
def test_fetch_issue_data(comicvine_api, settings, mock_version, volume_id, issue_number, expected):
ct = comictalker.talkers.comicvine.ComicVineTalker()
results = ct.fetch_issue_data(volume_id, issue_number)
results.notes = None
assert results == expected

View File

@ -1,7 +1,6 @@
from __future__ import annotations
import copy
import datetime
import io
import shutil
import unittest.mock
@ -117,18 +116,6 @@ def comicvine_api(monkeypatch, cbz, comic_cache) -> comictalker.talkers.comicvin
return cv
@pytest.fixture
def mock_now(monkeypatch):
    """Freeze comicvine's view of datetime so tagging notes are reproducible."""
    frozen = datetime.datetime(2022, 7, 11, 17, 42, 41)

    class _FixedDatetime:
        # Same surface as the original stand-in: a `time` attribute and a
        # classmethod `now()` returning the frozen instant.
        time = frozen

        @classmethod
        def now(cls):
            return cls.time

    monkeypatch.setattr(comictalker.talkers.comicvine, "datetime", _FixedDatetime)
@pytest.fixture
def mock_version(monkeypatch):
version = "1.4.4a9.dev20"