Add URL parsing tests from WHATWG (#3188)
Co-authored-by: Kar Petrosyan <92274156+karpetrosyan@users.noreply.github.com>
This commit is contained in:
parent
92e9dfb399
commit
db9072f998
@ -253,22 +253,27 @@ def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:
|
||||
parsed_userinfo != "" or parsed_host != "" or parsed_port is not None
|
||||
)
|
||||
validate_path(path, has_scheme=has_scheme, has_authority=has_authority)
|
||||
if has_authority:
|
||||
if has_scheme or has_authority:
|
||||
path = normalize_path(path)
|
||||
|
||||
# The GEN_DELIMS set is... : / ? # [ ] @
|
||||
# These do not need to be percent-quoted unless they serve as delimiters for the
|
||||
# specific component.
|
||||
WHATWG_SAFE = '`{}%|^\\"'
|
||||
|
||||
# For 'path' we need to drop ? and # from the GEN_DELIMS set.
|
||||
parsed_path: str = quote(path, safe=SUB_DELIMS + ":/[]@")
|
||||
parsed_path: str = quote(path, safe=SUB_DELIMS + WHATWG_SAFE + ":/[]@")
|
||||
# For 'query' we need to drop '#' from the GEN_DELIMS set.
|
||||
parsed_query: str | None = (
|
||||
None if query is None else quote(query, safe=SUB_DELIMS + ":/?[]@")
|
||||
None
|
||||
if query is None
|
||||
else quote(query, safe=SUB_DELIMS + WHATWG_SAFE + ":/?[]@")
|
||||
)
|
||||
# For 'fragment' we can include all of the GEN_DELIMS set.
|
||||
parsed_fragment: str | None = (
|
||||
None if fragment is None else quote(fragment, safe=SUB_DELIMS + ":/?#[]@")
|
||||
None
|
||||
if fragment is None
|
||||
else quote(fragment, safe=SUB_DELIMS + WHATWG_SAFE + ":/?#[]@")
|
||||
)
|
||||
|
||||
# The parsed ASCII bytestrings are our canonical form.
|
||||
@ -321,7 +326,8 @@ def encode_host(host: str) -> str:
|
||||
# From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
|
||||
#
|
||||
# reg-name = *( unreserved / pct-encoded / sub-delims )
|
||||
return quote(host.lower(), safe=SUB_DELIMS)
|
||||
WHATWG_SAFE = '"`{}%|\\'
|
||||
return quote(host.lower(), safe=SUB_DELIMS + WHATWG_SAFE)
|
||||
|
||||
# IDNA hostnames
|
||||
try:
|
||||
@ -369,19 +375,17 @@ def validate_path(path: str, has_scheme: bool, has_authority: bool) -> None:
|
||||
# must either be empty or begin with a slash ("/") character."
|
||||
if path and not path.startswith("/"):
|
||||
raise InvalidURL("For absolute URLs, path must be empty or begin with '/'")
|
||||
else:
|
||||
|
||||
if not has_scheme and not has_authority:
|
||||
# If a URI does not contain an authority component, then the path cannot begin
|
||||
# with two slash characters ("//").
|
||||
if path.startswith("//"):
|
||||
raise InvalidURL(
|
||||
"URLs with no authority component cannot have a path starting with '//'"
|
||||
)
|
||||
raise InvalidURL("Relative URLs cannot have a path starting with '//'")
|
||||
|
||||
# In addition, a URI reference (Section 4.1) may be a relative-path reference,
|
||||
# in which case the first path segment cannot contain a colon (":") character.
|
||||
if path.startswith(":") and not has_scheme:
|
||||
raise InvalidURL(
|
||||
"URLs with no scheme component cannot have a path starting with ':'"
|
||||
)
|
||||
if path.startswith(":"):
|
||||
raise InvalidURL("Relative URLs cannot have a path starting with ':'")
|
||||
|
||||
|
||||
def normalize_path(path: str) -> str:
|
||||
|
||||
@ -230,8 +230,8 @@ def test_url_normalized_host():
|
||||
|
||||
|
||||
def test_url_percent_escape_host():
|
||||
url = httpx.URL("https://exam%le.com/")
|
||||
assert url.host == "exam%25le.com"
|
||||
url = httpx.URL("https://exam le.com/")
|
||||
assert url.host == "exam%20le.com"
|
||||
|
||||
|
||||
def test_url_ipv4_like_host():
|
||||
@ -415,17 +415,11 @@ def test_urlparse_with_invalid_path():
|
||||
|
||||
with pytest.raises(httpx.InvalidURL) as exc:
|
||||
httpx.URL(path="//abc")
|
||||
assert (
|
||||
str(exc.value)
|
||||
== "URLs with no authority component cannot have a path starting with '//'"
|
||||
)
|
||||
assert str(exc.value) == "Relative URLs cannot have a path starting with '//'"
|
||||
|
||||
with pytest.raises(httpx.InvalidURL) as exc:
|
||||
httpx.URL(path=":abc")
|
||||
assert (
|
||||
str(exc.value)
|
||||
== "URLs with no scheme component cannot have a path starting with ':'"
|
||||
)
|
||||
assert str(exc.value) == "Relative URLs cannot have a path starting with ':'"
|
||||
|
||||
|
||||
def test_url_with_relative_path():
|
||||
|
||||
52
tests/models/test_whatwg.py
Normal file
52
tests/models/test_whatwg.py
Normal file
@ -0,0 +1,52 @@
|
||||
# The WHATWG have various tests that can be used to validate the URL parsing.
|
||||
#
|
||||
# https://url.spec.whatwg.org/
|
||||
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from httpx._urlparse import urlparse
|
||||
|
||||
# URL test cases from...
|
||||
# https://github.com/web-platform-tests/wpt/blob/master/url/resources/urltestdata.json
|
||||
with open("tests/models/whatwg.json", "r") as input:
|
||||
test_cases = json.load(input)
|
||||
test_cases = [
|
||||
item
|
||||
for item in test_cases
|
||||
if not isinstance(item, str) and not item.get("failure")
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("test_case", test_cases)
|
||||
def test_urlparse(test_case):
|
||||
if test_case["href"] in ("a: foo.com", "lolscheme:x x#x%20x"):
|
||||
# Skip these two test cases.
|
||||
# These are WHATWG cases that do not use percent-encoding for the space character.
|
||||
# Anyone know what's going on here?
|
||||
return
|
||||
|
||||
p = urlparse(test_case["href"])
|
||||
|
||||
# Test cases include the protocol with the trailing ":"
|
||||
protocol = p.scheme + ":"
|
||||
# Include the square brackets for IPv6 addresses.
|
||||
hostname = f"[{p.host}]" if ":" in p.host else p.host
|
||||
# The test cases use a string representation of the port.
|
||||
port = "" if p.port is None else str(p.port)
|
||||
# The path is compared as-is, with no adjustment.
|
||||
path = p.path
|
||||
# The 'search' and 'hash' components in the WHATWG tests are semantic, not literal.
|
||||
# Our parsing differentiates between no query/hash and empty-string query/hash.
|
||||
search = "" if p.query in (None, "") else "?" + str(p.query)
|
||||
hash = "" if p.fragment in (None, "") else "#" + str(p.fragment)
|
||||
|
||||
# URL hostnames are case-insensitive.
|
||||
# We normalize these, unlike the WHATWG test cases.
|
||||
assert protocol == test_case["protocol"]
|
||||
assert hostname.lower() == test_case["hostname"].lower()
|
||||
assert port == test_case["port"]
|
||||
assert path == test_case["pathname"]
|
||||
assert search == test_case["search"]
|
||||
assert hash == test_case["hash"]
|
||||
9746
tests/models/whatwg.json
Normal file
9746
tests/models/whatwg.json
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user