diff --git a/comicapi/_url.py b/comicapi/_url.py
new file mode 100644
index 0000000..2278829
--- /dev/null
+++ b/comicapi/_url.py
@@ -0,0 +1,468 @@
+# mypy: disable-error-code="no-redef"
+from __future__ import annotations
+
+try:
+    from urllib3.exceptions import HTTPError, LocationParseError, LocationValueError
+    from urllib3.util import Url, parse_url
+except ImportError:
+
+    import re
+    import typing
+
+    class HTTPError(Exception):
+        """Base exception used by this module."""
+
+    class LocationValueError(ValueError, HTTPError):
+        """Raised when there is something wrong with a given URL input."""
+
+    class LocationParseError(LocationValueError):
+        """Raised when get_host or similar fails to parse the URL input."""
+
+        def __init__(self, location: str) -> None:
+            message = f"Failed to parse: {location}"
+            super().__init__(message)
+
+            self.location = location
+
+    def to_str(x: str | bytes, encoding: str | None = None, errors: str | None = None) -> str:
+        if isinstance(x, str):
+            return x
+        elif not isinstance(x, bytes):
+            raise TypeError(f"not expecting type {type(x).__name__}")
+        if encoding or errors:
+            return x.decode(encoding or "utf-8", errors=errors or "strict")
+        return x.decode()
+
+    # We only want to normalize urls with an HTTP(S) scheme.
+    # urllib3 infers URLs without a scheme (None) to be http.
+    _NORMALIZABLE_SCHEMES = ("http", "https", None)
+
+    # Almost all of these patterns were derived from the
+    # 'rfc3986' module: https://github.com/python-hyper/rfc3986
+    _PERCENT_RE = re.compile(r"%[a-fA-F0-9]{2}")
+    _SCHEME_RE = re.compile(r"^(?:[a-zA-Z][a-zA-Z0-9+-]*:|/)")
+    _URI_RE = re.compile(
+        r"^(?:([a-zA-Z][a-zA-Z0-9+.-]*):)?"
+        r"(?://([^\\/?#]*))?"
+        r"([^?#]*)"
+        r"(?:\?([^#]*))?"
+        r"(?:#(.*))?$",
+        re.UNICODE | re.DOTALL,
+    )
+
+    _IPV4_PAT = r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}"
+    _HEX_PAT = "[0-9A-Fa-f]{1,4}"
+    _LS32_PAT = "(?:{hex}:{hex}|{ipv4})".format(hex=_HEX_PAT, ipv4=_IPV4_PAT)
+    _subs = {"hex": _HEX_PAT, "ls32": _LS32_PAT}
+    _variations = [
+        # 6( h16 ":" ) ls32
+        "(?:%(hex)s:){6}%(ls32)s",
+        # "::" 5( h16 ":" ) ls32
+        "::(?:%(hex)s:){5}%(ls32)s",
+        # [ h16 ] "::" 4( h16 ":" ) ls32
+        "(?:%(hex)s)?::(?:%(hex)s:){4}%(ls32)s",
+        # [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
+        "(?:(?:%(hex)s:)?%(hex)s)?::(?:%(hex)s:){3}%(ls32)s",
+        # [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
+        "(?:(?:%(hex)s:){0,2}%(hex)s)?::(?:%(hex)s:){2}%(ls32)s",
+        # [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
+        "(?:(?:%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s",
+        # [ *4( h16 ":" ) h16 ] "::" ls32
+        "(?:(?:%(hex)s:){0,4}%(hex)s)?::%(ls32)s",
+        # [ *5( h16 ":" ) h16 ] "::" h16
+        "(?:(?:%(hex)s:){0,5}%(hex)s)?::%(hex)s",
+        # [ *6( h16 ":" ) h16 ] "::"
+        "(?:(?:%(hex)s:){0,6}%(hex)s)?::",
+    ]
+
+    _UNRESERVED_PAT = r"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._\-~"
+    _IPV6_PAT = "(?:" + "|".join([x % _subs for x in _variations]) + ")"
+    _ZONE_ID_PAT = "(?:%25|%)(?:[" + _UNRESERVED_PAT + "]|%[a-fA-F0-9]{2})+"
+    _IPV6_ADDRZ_PAT = r"\[" + _IPV6_PAT + r"(?:" + _ZONE_ID_PAT + r")?\]"
+    _REG_NAME_PAT = r"(?:[^\[\]%:/?#]|%[a-fA-F0-9]{2})*"
+    _TARGET_RE = re.compile(r"^(/[^?#]*)(?:\?([^#]*))?(?:#.*)?$")
+
+    _IPV4_RE = re.compile("^" + _IPV4_PAT + "$")
+    _IPV6_RE = re.compile("^" + _IPV6_PAT + "$")
+    _IPV6_ADDRZ_RE = re.compile("^" + _IPV6_ADDRZ_PAT + "$")
+    _BRACELESS_IPV6_ADDRZ_RE = re.compile("^" + _IPV6_ADDRZ_PAT[2:-2] + "$")
+    _ZONE_ID_RE = re.compile("(" + _ZONE_ID_PAT + r")\]$")
+
+    _HOST_PORT_PAT = ("^(%s|%s|%s)(?::0*?(|0|[1-9][0-9]{0,4}))?$") % (
+        _REG_NAME_PAT,
+        _IPV4_PAT,
+        _IPV6_ADDRZ_PAT,
+    )
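+    # Illustrative examples (not from urllib3): _HOST_PORT_RE splits
+    # 'example.com:8080' into ('example.com', '8080'), '[::1]:80' into
+    # ('[::1]', '80'), and leaves the port group None for a bare 'example.com'.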
+    _HOST_PORT_RE = re.compile(_HOST_PORT_PAT, re.UNICODE | re.DOTALL)
+
+    _UNRESERVED_CHARS = set("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._-~")
+    _SUB_DELIM_CHARS = set("!$&'()*+,;=")
+    _USERINFO_CHARS = _UNRESERVED_CHARS | _SUB_DELIM_CHARS | {":"}
+    _PATH_CHARS = _USERINFO_CHARS | {"@", "/"}
+    _QUERY_CHARS = _FRAGMENT_CHARS = _PATH_CHARS | {"?"}
+
+    class Url(
+        typing.NamedTuple(
+            "Url",
+            [
+                ("scheme", typing.Optional[str]),
+                ("auth", typing.Optional[str]),
+                ("host", typing.Optional[str]),
+                ("port", typing.Optional[int]),
+                ("path", typing.Optional[str]),
+                ("query", typing.Optional[str]),
+                ("fragment", typing.Optional[str]),
+            ],
+        )
+    ):
+        """
+        Data structure for representing an HTTP URL. Used as a return value for
+        :func:`parse_url`. Both the scheme and host are normalized as they are
+        both case-insensitive according to RFC 3986.
+        """
+
+        def __new__(  # type: ignore[no-untyped-def]
+            cls,
+            scheme: str | None = None,
+            auth: str | None = None,
+            host: str | None = None,
+            port: int | None = None,
+            path: str | None = None,
+            query: str | None = None,
+            fragment: str | None = None,
+        ):
+            if path and not path.startswith("/"):
+                path = "/" + path
+            if scheme is not None:
+                scheme = scheme.lower()
+            return super().__new__(cls, scheme, auth, host, port, path, query, fragment)
+
+        @property
+        def hostname(self) -> str | None:
+            """For backwards-compatibility with urlparse. We're nice like that."""
+            return self.host
+
+        @property
+        def request_uri(self) -> str:
+            """Absolute path including the query string."""
+            uri = self.path or "/"
+
+            if self.query is not None:
+                uri += "?" + self.query
+
+            return uri
+
+        @property
+        def authority(self) -> str | None:
+            """
+            Authority component as defined in RFC 3986 3.2.
+            This includes userinfo (auth), host and port.
+
+            i.e.
+                userinfo@host:port
+            """
+            userinfo = self.auth
+            netloc = self.netloc
+            if netloc is None or userinfo is None:
+                return netloc
+            else:
+                return f"{userinfo}@{netloc}"
+
+        @property
+        def netloc(self) -> str | None:
+            """
+            Network location including host and port.
+
+            If you need the equivalent of urllib.parse's ``netloc``,
+            use the ``authority`` property instead.
+            """
+            if self.host is None:
+                return None
+            if self.port:
+                return f"{self.host}:{self.port}"
+            return self.host
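+
+        # Illustrative contrast: for parse_url("http://user:pw@example.com:80/"),
+        # .netloc is "example.com:80" while .authority is "user:pw@example.com:80".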
+
+        @property
+        def url(self) -> str:
+            """
+            Convert self into a url
+
+            This function should more or less round-trip with :func:`.parse_url`. The
+            returned url may not be exactly the same as the url inputted to
+            :func:`.parse_url`, but it should be equivalent by the RFC (e.g., urls
+            with a blank port will have : removed).
+
+            Example:
+
+            .. code-block:: python
+
+                import urllib3
+
+                U = urllib3.util.parse_url("https://google.com/mail/")
+
+                print(U.url)
+                # "https://google.com/mail/"
+
+                print(
+                    urllib3.util.Url(
+                        "https", "username:password", "host.com", 80, "/path", "query", "fragment"
+                    ).url
+                )
+                # "https://username:password@host.com:80/path?query#fragment"
+            """
+            scheme, auth, host, port, path, query, fragment = self
+            url = ""
+
+            # We use "is not None" because we want things to happen with empty strings (or 0 port)
+            if scheme is not None:
+                url += scheme + "://"
+            if auth is not None:
+                url += auth + "@"
+            if host is not None:
+                url += host
+            if port is not None:
+                url += ":" + str(port)
+            if path is not None:
+                url += path
+            if query is not None:
+                url += "?" + query
+            if fragment is not None:
+                url += "#" + fragment
+
+            return url
+
+        def __str__(self) -> str:
+            return self.url
+
+    @typing.overload
+    def _encode_invalid_chars(component: str, allowed_chars: typing.Container[str]) -> str:  # Abstract
+        ...
+
+    @typing.overload
+    def _encode_invalid_chars(component: None, allowed_chars: typing.Container[str]) -> None:  # Abstract
+        ...
+
+    def _encode_invalid_chars(component: str | None, allowed_chars: typing.Container[str]) -> str | None:
+        """Percent-encodes a URI component without reapplying
+        onto an already percent-encoded component.
+        """
+        if component is None:
+            return component
+
+        component = to_str(component)
+
+        # Normalize existing percent-encoded bytes.
+        # Try to see if the component we're encoding is already percent-encoded
+        # so we can skip all '%' characters but still encode all others.
+        component, percent_encodings = _PERCENT_RE.subn(lambda match: match.group(0).upper(), component)
+
+        uri_bytes = component.encode("utf-8", "surrogatepass")
+        is_percent_encoded = percent_encodings == uri_bytes.count(b"%")
+        encoded_component = bytearray()
+
+        for i in range(0, len(uri_bytes)):
+            # Will return a single character bytestring
+            byte = uri_bytes[i : i + 1]
+            byte_ord = ord(byte)
+            if (is_percent_encoded and byte == b"%") or (byte_ord < 128 and byte.decode() in allowed_chars):
+                encoded_component += byte
+                continue
+            encoded_component.extend(b"%" + (hex(byte_ord)[2:].encode().zfill(2).upper()))
+
+        return encoded_component.decode()
+
+    def _remove_path_dot_segments(path: str) -> str:
+        # See http://tools.ietf.org/html/rfc3986#section-5.2.4 for pseudo-code
+        segments = path.split("/")  # Turn the path into a list of segments
+        output = []  # Initialize the variable to use to store output
+
+        for segment in segments:
+            # '.' is the current directory, so ignore it, it is superfluous
+            if segment == ".":
+                continue
+            # Anything other than '..', should be appended to the output
+            if segment != "..":
+                output.append(segment)
+            # In this case segment == '..', if we can, we should pop the last
+            # element
+            elif output:
+                output.pop()
+
+        # If the path starts with '/' and the output is empty or the first string
+        # is non-empty
+        if path.startswith("/") and (not output or output[0]):
+            output.insert(0, "")
+
+        # If the path starts with '/.' or '/..' ensure we add one more empty
+        # string to add a trailing '/'
+        if path.endswith(("/.", "/..")):
+            output.append("")
+
+        return "/".join(output)
+
+    @typing.overload
+    def _normalize_host(host: None, scheme: str | None) -> None: ...
+
+    @typing.overload
+    def _normalize_host(host: str, scheme: str | None) -> str: ...
+
+    def _normalize_host(host: str | None, scheme: str | None) -> str | None:
+        if host:
+            if scheme in _NORMALIZABLE_SCHEMES:
+                is_ipv6 = _IPV6_ADDRZ_RE.match(host)
+                if is_ipv6:
+                    # IPv6 hosts of the form 'a::b%zone' are encoded in a URL as
+                    # such per RFC 6874: 'a::b%25zone'. Unquote the ZoneID
+                    # separator as necessary to return a valid RFC 4007 scoped IP.
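+                    # e.g. '[fe80::1%25eth0]' comes back as '[fe80::1%eth0]'
+                    # (illustrative; assumes the zone id needs no further encoding).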
+                    match = _ZONE_ID_RE.search(host)
+                    if match:
+                        start, end = match.span(1)
+                        zone_id = host[start:end]
+
+                        if zone_id.startswith("%25") and zone_id != "%25":
+                            zone_id = zone_id[3:]
+                        else:
+                            zone_id = zone_id[1:]
+                        zone_id = _encode_invalid_chars(zone_id, _UNRESERVED_CHARS)
+                        return f"{host[:start].lower()}%{zone_id}{host[end:]}"
+                    else:
+                        return host.lower()
+                elif not _IPV4_RE.match(host):
+                    return to_str(
+                        b".".join([_idna_encode(label) for label in host.split(".")]),
+                        "ascii",
+                    )
+        return host
+
+    def _idna_encode(name: str) -> bytes:
+        if not name.isascii():
+            try:
+                import idna
+            except ImportError:
+                raise LocationParseError("Unable to parse URL without the 'idna' module") from None
+
+            try:
+                return idna.encode(name.lower(), strict=True, std3_rules=True)
+            except idna.IDNAError:
+                raise LocationParseError(f"Name '{name}' is not a valid IDNA label") from None
+
+        return name.lower().encode("ascii")
+
+    def _encode_target(target: str) -> str:
+        """Percent-encodes a request target so that there are no invalid characters
+
+        Pre-condition for this function is that 'target' must start with '/'.
+        If that is the case then _TARGET_RE will always produce a match.
+        """
+        match = _TARGET_RE.match(target)
+        if not match:  # Defensive:
+            raise LocationParseError(f"{target!r} is not a valid request URI")
+
+        path, query = match.groups()
+        encoded_target = _encode_invalid_chars(path, _PATH_CHARS)
+        if query is not None:
+            query = _encode_invalid_chars(query, _QUERY_CHARS)
+            encoded_target += "?" + query
+        return encoded_target
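+
+    # Illustrative: _encode_target("/a path?q=a b") returns "/a%20path?q=a%20b",
+    # while an already percent-encoded "/a%20path" passes through unchanged.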
+ """ + if not url: + # Empty + return Url() + + source_url = url + if not _SCHEME_RE.search(url): + url = "//" + url + + scheme: str | None + authority: str | None + auth: str | None + host: str | None + port: str | None + port_int: int | None + path: str | None + query: str | None + fragment: str | None + + try: + scheme, authority, path, query, fragment = _URI_RE.match(url).groups() # type: ignore[union-attr] + normalize_uri = scheme is None or scheme.lower() in _NORMALIZABLE_SCHEMES + + if scheme: + scheme = scheme.lower() + + if authority: + auth, _, host_port = authority.rpartition("@") + auth = auth or None + host, port = _HOST_PORT_RE.match(host_port).groups() # type: ignore[union-attr] + if auth and normalize_uri: + auth = _encode_invalid_chars(auth, _USERINFO_CHARS) + if port == "": + port = None + else: + auth, host, port = None, None, None + + if port is not None: + port_int = int(port) + if not (0 <= port_int <= 65535): + raise LocationParseError(url) + else: + port_int = None + + host = _normalize_host(host, scheme) + + if normalize_uri and path: + path = _remove_path_dot_segments(path) + path = _encode_invalid_chars(path, _PATH_CHARS) + if normalize_uri and query: + query = _encode_invalid_chars(query, _QUERY_CHARS) + if normalize_uri and fragment: + fragment = _encode_invalid_chars(fragment, _FRAGMENT_CHARS) + + except (ValueError, AttributeError) as e: + raise LocationParseError(source_url) from e + + # For the sake of backwards compatibility we put empty + # string values for path if there are any defined values + # beyond the path in the URL. + # TODO: Remove this when we break backwards compatibility. + if not path: + if query is not None or fragment is not None: + path = "" + else: + path = None + + return Url( + scheme=scheme, + auth=auth, + host=host, + port=port_int, + path=path, + query=query, + fragment=fragment, + ) + + +__all__ = ("Url", "parse_url", "HTTPError", "LocationParseError", "LocationValueError") diff --git a/comicapi/genericmetadata.py b/comicapi/genericmetadata.py index 35c68cf..8767d9e 100644 --- a/comicapi/genericmetadata.py +++ b/comicapi/genericmetadata.py @@ -31,6 +31,8 @@ from typing_extensions import NamedTuple, Required from comicapi import utils +from ._url import Url, parse_url + logger = logging.getLogger(__name__) @@ -133,7 +135,7 @@ class GenericMetadata: year: int | None = None language: str | None = None # 2 letter iso code country: str | None = None - web_link: str | None = None + web_links: list[Url] = dataclasses.field(default_factory=list) format: str | None = None manga: str | None = None black_and_white: bool | None = None @@ -253,7 +255,7 @@ class GenericMetadata: assign("year", new_md.year) assign("language", new_md.language) assign("country", new_md.country) - assign("web_link", new_md.web_link) + assign("web_links", new_md.web_links) assign("format", new_md.format) assign("manga", new_md.manga) assign("black_and_white", new_md.black_and_white) @@ -487,7 +489,9 @@ md_test: GenericMetadata = GenericMetadata( alternate_count=7, imprint="craphound.com", notes="Tagged with ComicTagger 1.3.2a5 using info from Comic Vine on 2022-04-16 15:52:26. 
+
+
+__all__ = ("Url", "parse_url", "HTTPError", "LocationParseError", "LocationValueError")
diff --git a/comicapi/genericmetadata.py b/comicapi/genericmetadata.py
index 35c68cf..8767d9e 100644
--- a/comicapi/genericmetadata.py
+++ b/comicapi/genericmetadata.py
@@ -31,6 +31,8 @@ from typing_extensions import NamedTuple, Required
 
 from comicapi import utils
 
+from ._url import Url, parse_url
+
 logger = logging.getLogger(__name__)
 
 
@@ -133,7 +135,7 @@ class GenericMetadata:
     year: int | None = None
     language: str | None = None  # 2 letter iso code
     country: str | None = None
-    web_link: str | None = None
+    web_links: list[Url] = dataclasses.field(default_factory=list)
     format: str | None = None
     manga: str | None = None
     black_and_white: bool | None = None
@@ -253,7 +255,7 @@ class GenericMetadata:
         assign("year", new_md.year)
         assign("language", new_md.language)
         assign("country", new_md.country)
-        assign("web_link", new_md.web_link)
+        assign("web_links", new_md.web_links)
         assign("format", new_md.format)
         assign("manga", new_md.manga)
         assign("black_and_white", new_md.black_and_white)
@@ -487,7 +489,9 @@ md_test: GenericMetadata = GenericMetadata(
     alternate_count=7,
     imprint="craphound.com",
     notes="Tagged with ComicTagger 1.3.2a5 using info from Comic Vine on 2022-04-16 15:52:26. [Issue ID 140529]",
-    web_link="https://comicvine.gamespot.com/cory-doctorows-futuristic-tales-of-the-here-and-no/4000-140529/",
+    web_links=[
+        parse_url("https://comicvine.gamespot.com/cory-doctorows-futuristic-tales-of-the-here-and-no/4000-140529/")
+    ],
     format="Series",
     manga="No",
     black_and_white=None,
@@ -551,3 +555,15 @@ md_test: GenericMetadata = GenericMetadata(
     last_mark=None,
     _cover_image=None,
 )
+
+
+__all__ = (
+    "Url",
+    "parse_url",
+    "PageType",
+    "ImageMetadata",
+    "Credit",
+    "ComicSeries",
+    "TagOrigin",
+    "GenericMetadata",
+)
diff --git a/comicapi/metadata/comicrack.py b/comicapi/metadata/comicrack.py
index b9782d6..1b02826 100644
--- a/comicapi/metadata/comicrack.py
+++ b/comicapi/metadata/comicrack.py
@@ -57,7 +57,7 @@ class ComicRack(Metadata):
         "month",
         "year",
         "language",
-        "web_link",
+        "web_links",
         "format",
         "manga",
         "black_and_white",
@@ -229,7 +229,7 @@ class ComicRack(Metadata):
         assign("Month", md.month)
         assign("Year", md.year)
         assign("LanguageISO", md.language)
-        assign("Web", md.web_link)
+        assign("Web", " ".join(u.url for u in md.web_links))
         assign("Format", md.format)
         assign("Manga", md.manga)
         assign("BlackAndWhite", "Yes" if md.black_and_white else None)
@@ -313,7 +313,7 @@ class ComicRack(Metadata):
         md.month = utils.xlate_int(get("Month"))
         md.year = utils.xlate_int(get("Year"))
         md.language = utils.xlate(get("LanguageISO"))
-        md.web_link = utils.xlate(get("Web"))
+        md.web_links = utils.split_urls(utils.xlate(get("Web")))
         md.format = utils.xlate(get("Format"))
         md.manga = utils.xlate(get("Manga"))
         md.maturity_rating = utils.xlate(get("AgeRating"))
diff --git a/comicapi/utils.py b/comicapi/utils.py
index cb44cfe..9364c7d 100644
--- a/comicapi/utils.py
+++ b/comicapi/utils.py
@@ -29,6 +29,8 @@ from typing import Any, TypeVar, cast
 
 import comicapi.data
 from comicapi import filenamelexer, filenameparser
 
+from ._url import Url, parse_url
+
 try:
     import icu
@@ -275,6 +277,24 @@ def split(s: str | None, c: str) -> list[str]:
     return []
 
 
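+# Behavior sketch (illustrative): split_urls("https://a.example/x https://b.example/y")
+# parses both links; splitting on " http" leaves the second piece as
+# "s://b.example/y", and the loop below re-attaches the "http" prefix before parsing.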
+def split_urls(s: str | None) -> list[Url]:
+    if s is None:
+        return []
+    # Find occurrences of ' http'
+    if s.count("http") > 1 and s.count(" http") >= 1:
+        urls = []
+        # Split the urls apart on ' http'
+        url_strings = split(s, " http")
+        # Re-attach the 'http' prefix consumed by the split, then parse the url
+        for i, url_string in enumerate(url_strings):
+            if not url_string.startswith("http"):
+                url_string = "http" + url_string
+            urls.append(parse_url(url_string))
+        return urls
+    else:
+        return [parse_url(s)]
+
+
 def remove_articles(text: str) -> str:
     text = text.casefold()
     articles = [
diff --git a/comictaggerlib/cbltransformer.py b/comictaggerlib/cbltransformer.py
index f5674ef..0fe3256 100644
--- a/comictaggerlib/cbltransformer.py
+++ b/comictaggerlib/cbltransformer.py
@@ -78,12 +78,13 @@ class CBLTransformer:
             self.metadata.description += self.metadata.notes
 
         if self.config.Comic_Book_Lover__copy_weblink_to_comments:
-            if self.metadata.web_link is not None:
-                if self.metadata.description is None:
-                    self.metadata.description = ""
+            for web_link in self.metadata.web_links:
+                temp_desc = self.metadata.description
+                if temp_desc is None:
+                    temp_desc = ""
                 else:
-                    self.metadata.description += "\n\n"
-                if self.metadata.web_link not in self.metadata.description:
-                    self.metadata.description += self.metadata.web_link
+                    temp_desc += "\n\n"
+                if web_link.url and web_link.url not in temp_desc:
+                    self.metadata.description = temp_desc + web_link.url
 
         return self.metadata
diff --git a/comictaggerlib/taggerwindow.py b/comictaggerlib/taggerwindow.py
index 9e31465..4f0f8ec 100644
--- a/comictaggerlib/taggerwindow.py
+++ b/comictaggerlib/taggerwindow.py
@@ -844,7 +844,7 @@ class TaggerWindow(QtWidgets.QMainWindow):
         assign_text(self.leAltSeries, md.alternate_series)
         assign_text(self.leAltIssueNum, md.alternate_number)
         assign_text(self.leAltIssueCount, md.alternate_count)
-        assign_text(self.leWebLink, md.web_link)
+        assign_text(self.leWebLink, " ".join(u.url for u in md.web_links))
         assign_text(self.teCharacters, "\n".join(md.characters))
         assign_text(self.teTeams, "\n".join(md.teams))
         assign_text(self.teLocations, "\n".join(md.locations))
@@ -967,7 +967,7 @@ class TaggerWindow(QtWidgets.QMainWindow):
         md.scan_info = utils.xlate(self.leScanInfo.text())
         md.series_groups = utils.split(self.leSeriesGroup.text(), ",")
         md.alternate_series = self.leAltSeries.text()
-        md.web_link = utils.xlate(self.leWebLink.text())
+        md.web_links = utils.split_urls(utils.xlate(self.leWebLink.text()))
         md.characters = set(utils.split(self.teCharacters.toPlainText(), "\n"))
         md.teams = set(utils.split(self.teTeams.toPlainText(), "\n"))
         md.locations = set(utils.split(self.teLocations.toPlainText(), "\n"))
diff --git a/comictalker/talkers/comicvine.py b/comictalker/talkers/comicvine.py
index 7611dd9..813b80a 100644
--- a/comictalker/talkers/comicvine.py
+++ b/comictalker/talkers/comicvine.py
@@ -29,6 +29,8 @@ import requests
 import settngs
 from pyrate_limiter import Limiter, RequestRate
 from typing_extensions import Required, TypedDict
+from urllib3.exceptions import LocationParseError
+from urllib3.util import parse_url
 
 from comicapi import utils
 from comicapi.genericmetadata import ComicSeries, GenericMetadata, TagOrigin
@@ -643,10 +645,15 @@ class ComicVineTalker(ComicTalker):
             format=utils.xlate(series.format),
             volume_count=utils.xlate_int(series.count_of_volumes),
             title=utils.xlate(issue.get("name")),
-            web_link=utils.xlate(issue.get("site_detail_url")),
             series=utils.xlate(series.name),
             series_aliases=series.aliases,
         )
+        url = utils.xlate(issue.get("site_detail_url"))
+        if url:
+            try:
+                md.web_links = [parse_url(url)]
+            except LocationParseError:
+                # Skip an unparseable link rather than failing the whole issue
+                ...
         if issue.get("image") is None:
             md._cover_image = ""
         else:
diff --git a/testing/comicvine.py b/testing/comicvine.py
index 1edad87..72e79c7 100644
--- a/testing/comicvine.py
+++ b/testing/comicvine.py
@@ -185,7 +185,7 @@ comic_issue_result = comicapi.genericmetadata.GenericMetadata(
     issue=cv_issue_result["results"]["issue_number"],
     volume=None,
     title=cv_issue_result["results"]["name"],
-    web_link=cv_issue_result["results"]["site_detail_url"],
+    web_links=[comicapi.genericmetadata.parse_url(cv_issue_result["results"]["site_detail_url"])],
 )
 
 cv_md = comicapi.genericmetadata.GenericMetadata(
@@ -213,7 +213,7 @@ cv_md = comicapi.genericmetadata.GenericMetadata(
     alternate_count=None,
     imprint=None,
     notes=None,
-    web_link=cv_issue_result["results"]["site_detail_url"],
+    web_links=[comicapi.genericmetadata.parse_url(cv_issue_result["results"]["site_detail_url"])],
     format=None,
     manga=None,
     black_and_white=None,