Review URL percent escaping sets, from whatwg. (#3371)

This commit is contained in:
Tom Christie 2024-10-28 15:06:10 +00:00 committed by GitHub
parent 489fef48ba
commit d293374b66
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 71 additions and 27 deletions

View File

@ -36,6 +36,66 @@ SUB_DELIMS = "!$&'()*+,;="
PERCENT_ENCODED_REGEX = re.compile("%[A-Fa-f0-9]{2}")
# https://url.spec.whatwg.org/#percent-encoded-bytes
# The fragment percent-encode set is the C0 control percent-encode set
# and U+0020 SPACE, U+0022 ("), U+003C (<), U+003E (>), and U+0060 (`).
FRAG_SAFE = "".join(
    # Printable ASCII, minus the five characters the fragment set escapes:
    # space, double quote, less-than, greater-than and backtick.
    char
    for char in map(chr, range(0x20, 0x7F))
    if char not in ' "<>`'
)
# The query percent-encode set is the C0 control percent-encode set
# and U+0020 SPACE, U+0022 ("), U+0023 (#), U+003C (<), and U+003E (>).
QUERY_SAFE = "".join(
    # Printable ASCII, skipping the code points the query set escapes:
    # space, double quote, hash, less-than and greater-than.
    chr(codepoint)
    for codepoint in range(0x20, 0x7F)
    if codepoint not in {0x20, 0x22, 0x23, 0x3C, 0x3E}
)
# The path percent-encode set is the query percent-encode set
# and U+003F (?), U+0060 (`), U+007B ({), and U+007D (}).
PATH_SAFE = "".join(
    map(
        chr,
        # Start from every printable ASCII code point, subtract the query
        # escapes and then the path-only escapes; sorting restores ASCII order.
        sorted(
            set(range(0x20, 0x7F))
            - {0x20, 0x22, 0x23, 0x3C, 0x3E}
            - {0x3F, 0x60, 0x7B, 0x7D}
        ),
    )
)
# The userinfo percent-encode set is the path percent-encode set
# and U+002F (/), U+003A (:), U+003B (;), U+003D (=), U+0040 (@),
# U+005B ([) to U+005E (^), inclusive, and U+007C (|).
USERNAME_SAFE = "".join(map(chr, range(0x20, 0x7F))).translate(
    # Delete, from the printable ASCII run, every character that must be
    # escaped: the query escapes, the path escapes, then the userinfo escapes.
    str.maketrans("", "", ' "#<>' + "?`{}" + "/:;=@[\\]^|")
)
PASSWORD_SAFE = "".join(
    # Printable ASCII, dropping the query escapes, the path escapes and the
    # userinfo escapes (which include ':' and '@').
    char
    for char in map(chr, range(0x20, 0x7F))
    if char not in (' "#<>' + "?`{}" + "/:;=@[\\]^|")
)
# Note... The terminology 'userinfo' percent-encode set in the WHATWG document
# is used for the username and password quoting. For the joint userinfo component
# we remove U+003A (:) from the percent-encode set, so that it is left unescaped.
USERINFO_SAFE = "".join(
    chr(code)
    for code in range(0x20, 0x7F)
    # 0x3A (':') is deliberately absent from the escaped code points below,
    # so a colon in the combined userinfo string is left unescaped.
    if code
    not in (
        {0x20, 0x22, 0x23, 0x3C, 0x3E}
        | {0x3F, 0x60, 0x7B, 0x7D}
        | {0x2F, 0x3B, 0x3D, 0x40, 0x5B, 0x5C, 0x5D, 0x5E, 0x7C}
    )
)
# {scheme}: (optional)
# //{authority} (optional)
@ -182,8 +242,8 @@ def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:
# Replace "username" and/or "password" with "userinfo".
if "username" in kwargs or "password" in kwargs:
username = quote(kwargs.pop("username", "") or "")
password = quote(kwargs.pop("password", "") or "")
username = quote(kwargs.pop("username", "") or "", safe=USERNAME_SAFE)
password = quote(kwargs.pop("password", "") or "", safe=PASSWORD_SAFE)
kwargs["userinfo"] = f"{username}:{password}" if password else username
# Replace "raw_path" with "path" and "query".
@ -238,7 +298,7 @@ def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:
authority = kwargs.get("authority", url_dict["authority"]) or ""
path = kwargs.get("path", url_dict["path"]) or ""
query = kwargs.get("query", url_dict["query"])
fragment = kwargs.get("fragment", url_dict["fragment"])
frag = kwargs.get("fragment", url_dict["fragment"])
# The AUTHORITY_REGEX will always match, but may have empty components.
authority_match = AUTHORITY_REGEX.match(authority)
@ -255,7 +315,7 @@ def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:
# We end up with a parsed representation of the URL,
# with components that are plain ASCII bytestrings.
parsed_scheme: str = scheme.lower()
parsed_userinfo: str = quote(userinfo, safe=SUB_DELIMS + ":")
parsed_userinfo: str = quote(userinfo, safe=USERINFO_SAFE)
parsed_host: str = encode_host(host)
parsed_port: int | None = normalize_port(port, scheme)
@ -267,25 +327,9 @@ def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:
if has_scheme or has_authority:
path = normalize_path(path)
# The GEN_DELIMS set is... : / ? # [ ] @
# These do not need to be percent-quoted unless they serve as delimiters for the
# specific component.
WHATWG_SAFE = '`{}%|^\\"'
# For 'path' we need to drop ? and # from the GEN_DELIMS set.
parsed_path: str = quote(path, safe=SUB_DELIMS + WHATWG_SAFE + ":/[]@")
# For 'query' we need to drop '#' from the GEN_DELIMS set.
parsed_query: str | None = (
None
if query is None
else quote(query, safe=SUB_DELIMS + WHATWG_SAFE + ":/?[]@")
)
# For 'fragment' we can include all of the GEN_DELIMS set.
parsed_fragment: str | None = (
None
if fragment is None
else quote(fragment, safe=SUB_DELIMS + WHATWG_SAFE + ":/?#[]@")
)
parsed_path: str = quote(path, safe=PATH_SAFE)
parsed_query: str | None = None if query is None else quote(query, safe=QUERY_SAFE)
parsed_frag: str | None = None if frag is None else quote(frag, safe=FRAG_SAFE)
# The parsed ASCII bytestrings are our canonical form.
# All properties of the URL are derived from these.
@ -296,7 +340,7 @@ def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:
parsed_port,
parsed_path,
parsed_query,
parsed_fragment,
parsed_frag,
)

View File

@ -294,7 +294,7 @@ def test_param_with_space():
assert str(url) == "http://webservice?u=with%20spaces"
def test_param_does_not_require_encoding():
def test_param_requires_encoding():
# Params passed as form key-value pairs should be escaped.
url = httpx.URL("http://webservice", params={"u": "%"})
assert str(url) == "http://webservice?u=%25"
@ -614,10 +614,10 @@ def test_url_copywith_userinfo_subcomponents():
}
url = httpx.URL("https://example.org")
new = url.copy_with(**copy_with_kwargs)
assert str(new) == "https://tom%40example.org:abc123%40%20%25@example.org"
assert str(new) == "https://tom%40example.org:abc123%40%20%@example.org"
assert new.username == "tom@example.org"
assert new.password == "abc123@ %"
assert new.userinfo == b"tom%40example.org:abc123%40%20%25"
assert new.userinfo == b"tom%40example.org:abc123%40%20%"
def test_url_copywith_invalid_component():