add support for file scheme

split url parsing into separate function
This commit is contained in:
Matthew Welch 2024-12-06 11:31:09 -08:00
parent 57e1fae538
commit 015e46134e
3 changed files with 260 additions and 29 deletions

View File

@ -1,31 +1,61 @@
import os.path
import socket import socket
import ssl import ssl
import sys
# URL schemes accepted by parse_url, which rejects any scheme not listed here.
SUPPORTED_SCHEMES = [
"http",
"https",
"file",
]
# File that load() opens (resolved with os.path.abspath) when its URL argument fails to parse.
DEFAULT_FILE = "default.html"
class URL: class URL:
def __init__(self, url: str): def __init__(self, url_string: str | None = None):
self.scheme, url = url.split("://", 1) self.scheme: str = ""
assert self.scheme in ["http", "https"] self.host: str = ""
if "/" not in url: self.port: int = -1
url = url + "/" self.path: str = ""
self.host, url = url.split("/", 1) self.query: str = ""
self.path = "/" + url self.fragment: str = ""
self.default_port = False
if url_string is not None:
parse_url(url_string, self)
if self.scheme == "http": def to_string(self):
self.port = 80 url_string = self.scheme + ":"
elif self.scheme == "https": if self.host != "":
self.port = 443 url_string += "//" + self.host
if self.port != -1 and not self.default_port:
url_string += f":{self.port}"
url_string += self.path
if self.query != "":
url_string += "?" + self.query
if self.fragment != "":
url_string += "#" + self.fragment
return url_string
if ":" in self.host: def __str__(self):
self.host, port = self.host.split(":", 1) return self.to_string()
self.port = int(port)
def __repr__(self):
return f"<URL {self.to_string()}>"
class Request:
def __init__(self, url: URL, method: str = "GET"):
    """Prepare an empty request aimed at ``url``.

    Args:
        url: Parsed target; its ``scheme``, ``host`` and ``port`` are read
            here to build the initial ``Host`` header.
        method: Request method stored on the instance (``"GET"`` by default).
    """
    self.url = url
    self.method = method
    self.request_string = ""
    self.http_version = "HTTP/1.1"

    # The Host header has to repeat the authority of the target URL, which
    # includes the port whenever it is not the scheme's well-known one.
    # A port of -1 means the URL carries no port at all.
    host_header = url.host
    well_known_port = {"http": 80, "https": 443}.get(url.scheme)
    if url.port != -1 and url.port != well_known_port:
        host_header = f"{url.host}:{url.port}"
    self.headers = {"Host": host_header}
def add_request_line(self, method):
    """Append the request line for ``method`` to ``self.request_string``.

    The line is made of the method, the path of ``self.url`` and
    ``self.http_version``, terminated by CRLF.
    """
    request_line = f"{method} {self.url.path} {self.http_version}\r\n"
    self.request_string = self.request_string + request_line
def add_header(self, key, value):
    """Store ``value`` under ``key`` in ``self.headers``, replacing any previous value."""
    self.headers.update({key: value})
@ -44,7 +74,13 @@ class URL:
self.request_string += f"{key}: {value}\r\n" self.request_string += f"{key}: {value}\r\n"
self.request_string += "\r\n" self.request_string += "\r\n"
def send_request(self, *args, **kwargs):
    """Fetch the resource named by ``self.url`` and return its content.

    ``http`` and ``https`` URLs are handed to ``http_request`` together with
    any positional/keyword arguments; ``file`` URLs are read through
    ``file_request``, which takes no arguments, so any that were passed are
    ignored.

    Raises:
        ValueError: If the URL's scheme is none of the above.  Previously
            this case fell through and returned ``None`` without notice.
    """
    scheme = self.url.scheme
    if scheme in ("http", "https"):
        return self.http_request(*args, **kwargs)
    if scheme == "file":
        return self.file_request()
    raise ValueError(f"unsupported URL scheme: {scheme!r}")
def http_request(self, method: str = "GET", headers: dict = None) -> str:
if headers is not None: if headers is not None:
self.add_headers(headers) self.add_headers(headers)
s = socket.socket( s = socket.socket(
@ -52,10 +88,10 @@ class URL:
type=socket.SOCK_STREAM, type=socket.SOCK_STREAM,
proto=socket.IPPROTO_TCP, proto=socket.IPPROTO_TCP,
) )
s.connect((self.host, self.port)) s.connect((self.url.host, self.url.port))
if self.scheme == "https": if self.url.scheme == "https":
context = ssl.SSLContext(ssl.PROTOCOL_TLS) context = ssl.SSLContext(ssl.PROTOCOL_TLS)
s = context.wrap_socket(s, server_hostname=self.host) s = context.wrap_socket(s, server_hostname=self.url.host)
self.add_request_line(method) self.add_request_line(method)
self.add_default_headers() self.add_default_headers()
@ -81,6 +117,82 @@ class URL:
s.close() s.close()
return content return content
def file_request(self):
    """Return the text of the local file at ``self.url.path``.

    The file is decoded as UTF-8 explicitly; without an ``encoding``
    argument ``open`` uses a platform-dependent default, so the same page
    could decode differently from one machine to the next.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(self.url.path, encoding="utf-8") as f:
        return f.read()
def _split_before(text: str, delimiters: str) -> tuple[str, str]:
    """Split ``text`` just before its first character found in ``delimiters``.

    Returns ``(head, tail)``; ``tail`` is empty when no delimiter occurs.
    """
    for index, char in enumerate(text):
        if char in delimiters:
            return text[:index], text[index:]
    return text, ""


def parse_url(url_string: str, url: URL | None = None) -> tuple[URL, bool]:
    """Split ``url_string`` into its components and store them on a URL.

    The string is read as ``scheme ":" ["//" authority] path ["?" query]
    ["#" fragment]``.  The authority is stored as ``host`` and, when it
    contains a colon, split into host and numeric port.

    Args:
        url_string: Text to parse.
        url: Object to fill in.  A new ``URL`` is created when omitted.

    Returns:
        ``(url, success)``.  ``success`` is ``False`` when:

        * there is no ``:`` separating a scheme, or ``url_string`` is not a
          string (``url`` is left untouched);
        * the scheme is not in ``SUPPORTED_SCHEMES`` (only ``scheme`` is set);
        * both authority and path are empty;
        * a ``file`` URL has an empty path or just ``/``;
        * the port is not a decimal number in the range 0-65535.

        No exception is raised and nothing is printed for these cases; the
        flag is the only failure signal.

    Notes:
        * ``http``/``https`` URLs get port 80/443 with ``default_port`` set,
          and an empty path becomes ``/``.  An explicit port replaces the
          implied one and ``default_port`` then records whether the two are
          equal.
        * On Windows the leading ``/`` of a ``file`` path that contains a
          ``:`` (as in ``/c:/dir``) is removed.
        * Parsed components are assigned, not appended, so passing in an
          already-populated ``url`` does not concatenate old and new values.
    """
    if url is None:
        url = URL()
    if not isinstance(url_string, str):
        return url, False

    scheme, colon, rest = url_string.partition(":")
    if not colon:
        return url, False
    url.scheme = scheme
    # Explicit check rather than ``assert``, which is removed under ``-O``.
    if scheme not in SUPPORTED_SCHEMES:
        return url, False

    authority = ""
    if rest.startswith("//"):
        authority, rest = _split_before(rest[2:], "/?#")
    # ``rest`` must be advanced past the path, otherwise the query and
    # fragment checks below never see their leading "?" / "#".
    path, rest = _split_before(rest, "?#")
    url.host = authority
    url.path = path
    if authority == "" and path == "":
        return url, False

    query = ""
    fragment = ""
    if rest.startswith("?"):
        query, rest = _split_before(rest[1:], "#")
    if rest.startswith("#"):
        fragment = rest[1:]
    url.query = query
    url.fragment = fragment

    well_known_port = {"http": 80, "https": 443}.get(scheme)
    if well_known_port is not None:
        url.port = well_known_port
        url.default_port = True
        if url.path == "":
            url.path = "/"

    if scheme == "file":
        # "/c:/dir/file" -> "c:/dir/file" so the path can be opened directly.
        if sys.platform == "win32" and url.path.startswith("/") and ":" in url.path:
            url.path = url.path[1:]
        if url.path in ("", "/"):
            return url, False

    if ":" in url.host:
        url.host, port_text = url.host.split(":", 1)
        if not (port_text.isascii() and port_text.isdigit()):
            return url, False
        try:
            port = int(port_text)
        except ValueError:
            # Absurdly long digit strings exceed int()'s conversion limit.
            return url, False
        if port > 65535:
            return url, False
        url.port = port
        url.default_port = port == well_known_port

    return url, True
def show(body: str) -> None: def show(body: str) -> None:
in_tag = False in_tag = False
@ -93,11 +205,17 @@ def show(body: str) -> None:
print(char, end="") print(char, end="")
def load(url_string: str):
    """Fetch ``url_string`` and print the page through ``show``.

    When ``url_string`` cannot be parsed, the file named by ``DEFAULT_FILE``
    (resolved against the current working directory) is shown instead.

    Args:
        url_string: URL text, e.g. taken from the command line.
    """
    url, success = parse_url(url_string)
    if not success:
        # Build the fallback URL directly instead of formatting a
        # "file://..." string and re-parsing it: the absolute path is not
        # escaped, so a "?" or "#" in a directory name would otherwise be
        # taken as the start of a query/fragment and truncate the path.
        url = URL()
        url.scheme = "file"
        url.path = os.path.abspath(DEFAULT_FILE)
    body = Request(url).send_request()
    show(body)
if __name__ == '__main__':
    # With no command-line argument, pass an empty string: it fails to parse,
    # so load() shows the default page instead of this line raising IndexError.
    load(sys.argv[1] if len(sys.argv) > 1 else "")

10
default.html Normal file
View File

@ -0,0 +1,10 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Title</title>
</head>
<body>
<p>Test</p>
</body>
</html>

View File

@ -1,9 +1,11 @@
import pytest import pytest
from browser import URL from browser import URL
from browser import parse_url
from browser import Request
@pytest.mark.parametrize( @pytest.mark.parametrize(
"url_string,scheme,host,port,path", "url_string,scheme,host,port,path,query,fragment,parse_success",
[ [
( (
"http://example.com", "http://example.com",
@ -11,6 +13,9 @@ from browser import URL
"example.com", "example.com",
80, 80,
"/", "/",
"",
"",
True,
), ),
( (
"http://example.com/", "http://example.com/",
@ -18,6 +23,9 @@ from browser import URL
"example.com", "example.com",
80, 80,
"/", "/",
"",
"",
True,
), ),
( (
"https://example.com/", "https://example.com/",
@ -25,6 +33,9 @@ from browser import URL
"example.com", "example.com",
443, 443,
"/", "/",
"",
"",
True,
), ),
( (
"http://example.com:5000/", "http://example.com:5000/",
@ -32,6 +43,9 @@ from browser import URL
"example.com", "example.com",
5000, 5000,
"/", "/",
"",
"",
True,
), ),
( (
"http://example.com:5000/test/example", "http://example.com:5000/test/example",
@ -39,6 +53,9 @@ from browser import URL
"example.com", "example.com",
5000, 5000,
"/test/example", "/test/example",
"",
"",
True,
), ),
( (
"https://example.com:5000/test/example", "https://example.com:5000/test/example",
@ -46,15 +63,101 @@ from browser import URL
"example.com", "example.com",
5000, 5000,
"/test/example", "/test/example",
"",
"",
True,
),
(
"file:///test.html",
"file",
"",
-1,
"/test.html",
"",
"",
True,
),
(
"file://file_host/test.html",
"file",
"file_host",
-1,
"/test.html",
"",
"",
True,
),
(
"file:///c:/test.txt",
"file",
"",
-1,
"c:/test.txt",
"",
"",
True,
),
(
r"file:///c:\test.txt",
"file",
"",
-1,
r"c:\test.txt",
"",
"",
True,
),
(
"file://file_host/",
"file",
"file_host",
-1,
"/",
"",
"",
False,
),
(
"htp://example.com/",
"htp",
"",
-1,
"",
"",
"",
False,
),
(
"file:test.txt",
"file",
"",
-1,
"test.txt",
"",
"",
True,
),
(
"file:/test.txt",
"file",
"",
-1,
"/test.txt",
"",
"",
True,
), ),
], ],
) )
def test_url_parsing(url_string, scheme, host, port, path): def test_url_parsing(url_string, scheme, host, port, path, query, fragment, parse_success):
url = URL(url_string) url, success = parse_url(url_string)
assert url.scheme == scheme assert url.scheme == scheme
assert url.host == host assert url.host == host
assert url.port == port assert url.port == port
assert url.path == path assert url.path == path
assert url.query == query
assert url.fragment == fragment
assert success == parse_success
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -67,7 +170,7 @@ def test_url_parsing(url_string, scheme, host, port, path):
) )
def test_http_request(http_server, url_string): def test_http_request(http_server, url_string):
url = URL(url_string) url = URL(url_string)
assert url.request() == "test" assert Request(url).send_request() == "test"
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -80,4 +183,4 @@ def test_http_request(http_server, url_string):
) )
def test_https_request(https_server, url_string): def test_https_request(https_server, url_string):
url = URL(url_string) url = URL(url_string)
assert url.request() == "test" assert Request(url).send_request() == "test"