"""Generic sources utils to format API data and the like."""
# Copyright 2012-2014 Anthony Beville
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations

import logging
import re

from bs4 import BeautifulSoup

from comicapi import utils

logger = logging.getLogger(__name__)


def parse_date_str(date_str: str) -> tuple[int | None, int | None, int | None]:
    """Split a dash-separated date string into its components.

    The string is split on "-" and the pieces are read in year, month, day
    order; trailing pieces may be missing (e.g. "2014" or "2014-03").

    Args:
        date_str: The date text to parse. An empty or ``None`` value yields
            three ``None`` results.

    Returns:
        A ``(day, month, year)`` tuple -- note the order is the reverse of the
        order in the input string. Each piece is whatever
        ``utils.xlate(piece, True)`` returns (presumably an int, or ``None``
        when the piece is not numeric -- confirm against ``comicapi.utils``).
    """
    day = None
    month = None
    year = None
    if date_str:
        parts = date_str.split("-")
        year = utils.xlate(parts[0], True)
        if len(parts) > 1:
            month = utils.xlate(parts[1], True)
            if len(parts) > 2:
                day = utils.xlate(parts[2], True)
    return day, month, year


def cleanup_html(string: str, remove_html_tables: bool) -> str:
    """Convert an HTML fragment into plain text.

    Line breaks, list items and paragraphs become newlines, headings are
    wrapped in ``*``, every remaining tag is dropped and the ``&nbsp;`` /
    ``&amp;`` entities are decoded.

    Args:
        string: The HTML text to clean. ``None`` is tolerated and gives "".
        remove_html_tables: When true, tables are deleted outright (together
            with the "*List of covers and their creators:*" heading). When
            false, each table is re-rendered as a fixed-width text table in
            the position it occupied in the HTML.

    Returns:
        The plain-text rendering. If rebuilding the tables fails for any
        reason the error is logged and the text is returned without tables.
    """
    if string is None:
        return ""

    # find any tables (done first, while the markup is still intact)
    soup = BeautifulSoup(string, "html.parser")
    tables = soup.find_all("table")

    # remove all newlines first
    string = string.replace("\n", "")

    # put in our own
    # NOTE(review): the tag/entity literals in this function were restored
    # from context after an HTML-stripping pass emptied them -- confirm
    # against the upstream copy of this file.
    string = string.replace("<br>", "\n")
    string = string.replace("</li>", "\n")
    string = string.replace("</p>", "\n\n")
    for level in range(1, 7):
        # headings are rendered as *Heading* on a line of their own
        string = string.replace(f"<h{level}>", "*")
        string = string.replace(f"</h{level}>", "*\n")

    # remove the tables
    p = re.compile(r"<table[^<]*?>.*?</table>")
    if remove_html_tables:
        string = p.sub("", string)
        string = string.replace("*List of covers and their creators:*", "")
    else:
        # leave a str.format placeholder where each table used to be
        string = p.sub("{}", string)

    # now strip all other tags
    p = re.compile(r"<[^<]*?>")
    newstring = p.sub("", string)

    newstring = newstring.replace("&nbsp;", " ")
    newstring = newstring.replace("&amp;", "&")

    newstring = newstring.strip()

    if not remove_html_tables:
        # now rebuild the tables into text from BSoup
        try:
            table_strings = []
            for table in tables:
                rows = []
                hdrs = []
                col_widths = []
                for hdr in table.find_all("th"):
                    # .string is None for cells holding nested markup; the
                    # resulting AttributeError lands in the fallback below.
                    item = hdr.string.strip()
                    hdrs.append(item)
                    col_widths.append(len(item))
                rows.append(hdrs)

                for row in table.find_all("tr"):
                    cols = []
                    for i, cell in enumerate(row.find_all("td")):
                        item = cell.string.strip()
                        cols.append(item)
                        # widen the column to fit its longest entry
                        if len(item) > col_widths[i]:
                            col_widths[i] = len(item)

                    if cols:
                        rows.append(cols)

                # now we have the data, make it into text
                fmtstr = "".join(f" {{:{w + 1}}}|" for w in col_widths)
                width = sum(col_widths) + len(col_widths) * 2
                lines = []
                for counter, row in enumerate(rows):
                    lines.append(fmtstr.format(*row) + "\n")
                    # underline the header row, when there is one
                    if counter == 0 and hdrs:
                        lines.append("-" * width + "\n")

                table_strings.append("".join(lines))

            newstring = newstring.format(*table_strings)
        except Exception:
            # we caught an error rebuilding the table.
            # just bail and remove the formatting
            logger.exception("table parse error")
            # str.replace returns a new string; keep the result so the
            # "{}" placeholders do not leak into the returned text.
            newstring = newstring.replace("{}", "")

    return newstring