Review URL percent escaping sets, from whatwg. (#3371)
This commit is contained in:
parent
489fef48ba
commit
d293374b66
@ -36,6 +36,66 @@ SUB_DELIMS = "!$&'()*+,;="
|
|||||||
|
|
||||||
PERCENT_ENCODED_REGEX = re.compile("%[A-Fa-f0-9]{2}")
|
PERCENT_ENCODED_REGEX = re.compile("%[A-Fa-f0-9]{2}")
|
||||||
|
|
||||||
|
# https://url.spec.whatwg.org/#percent-encoded-bytes
|
||||||
|
|
||||||
|
# The fragment percent-encode set is the C0 control percent-encode set
|
||||||
|
# and U+0020 SPACE, U+0022 ("), U+003C (<), U+003E (>), and U+0060 (`).
|
||||||
|
FRAG_SAFE = "".join(
|
||||||
|
[chr(i) for i in range(0x20, 0x7F) if i not in (0x20, 0x22, 0x3C, 0x3E, 0x60)]
|
||||||
|
)
|
||||||
|
|
||||||
|
# The query percent-encode set is the C0 control percent-encode set
|
||||||
|
# and U+0020 SPACE, U+0022 ("), U+0023 (#), U+003C (<), and U+003E (>).
|
||||||
|
QUERY_SAFE = "".join(
|
||||||
|
[chr(i) for i in range(0x20, 0x7F) if i not in (0x20, 0x22, 0x23, 0x3C, 0x3E)]
|
||||||
|
)
|
||||||
|
|
||||||
|
# The path percent-encode set is the query percent-encode set
|
||||||
|
# and U+003F (?), U+0060 (`), U+007B ({), and U+007D (}).
|
||||||
|
PATH_SAFE = "".join(
|
||||||
|
[
|
||||||
|
chr(i)
|
||||||
|
for i in range(0x20, 0x7F)
|
||||||
|
if i not in (0x20, 0x22, 0x23, 0x3C, 0x3E) + (0x3F, 0x60, 0x7B, 0x7D)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
# The userinfo percent-encode set is the path percent-encode set
|
||||||
|
# and U+002F (/), U+003A (:), U+003B (;), U+003D (=), U+0040 (@),
|
||||||
|
# U+005B ([) to U+005E (^), inclusive, and U+007C (|).
|
||||||
|
USERNAME_SAFE = "".join(
|
||||||
|
[
|
||||||
|
chr(i)
|
||||||
|
for i in range(0x20, 0x7F)
|
||||||
|
if i
|
||||||
|
not in (0x20, 0x22, 0x23, 0x3C, 0x3E)
|
||||||
|
+ (0x3F, 0x60, 0x7B, 0x7D)
|
||||||
|
+ (0x2F, 0x3A, 0x3B, 0x3D, 0x40, 0x5B, 0x5C, 0x5D, 0x5E, 0x7C)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
PASSWORD_SAFE = "".join(
|
||||||
|
[
|
||||||
|
chr(i)
|
||||||
|
for i in range(0x20, 0x7F)
|
||||||
|
if i
|
||||||
|
not in (0x20, 0x22, 0x23, 0x3C, 0x3E)
|
||||||
|
+ (0x3F, 0x60, 0x7B, 0x7D)
|
||||||
|
+ (0x2F, 0x3A, 0x3B, 0x3D, 0x40, 0x5B, 0x5C, 0x5D, 0x5E, 0x7C)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
# Note: the term 'userinfo percent-encode set' in the WHATWG document
|
||||||
|
# is used for the username and password quoting. For the joint userinfo component
|
||||||
|
# we remove U+003A (:) from the percent-encode set, i.e. ':' stays in the safe set, since it separates the username from the password.
|
||||||
|
USERINFO_SAFE = "".join(
|
||||||
|
[
|
||||||
|
chr(i)
|
||||||
|
for i in range(0x20, 0x7F)
|
||||||
|
if i
|
||||||
|
not in (0x20, 0x22, 0x23, 0x3C, 0x3E)
|
||||||
|
+ (0x3F, 0x60, 0x7B, 0x7D)
|
||||||
|
+ (0x2F, 0x3B, 0x3D, 0x40, 0x5B, 0x5C, 0x5D, 0x5E, 0x7C)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
# {scheme}: (optional)
|
# {scheme}: (optional)
|
||||||
# //{authority} (optional)
|
# //{authority} (optional)
|
||||||
@ -182,8 +242,8 @@ def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:
|
|||||||
|
|
||||||
# Replace "username" and/or "password" with "userinfo".
|
# Replace "username" and/or "password" with "userinfo".
|
||||||
if "username" in kwargs or "password" in kwargs:
|
if "username" in kwargs or "password" in kwargs:
|
||||||
username = quote(kwargs.pop("username", "") or "")
|
username = quote(kwargs.pop("username", "") or "", safe=USERNAME_SAFE)
|
||||||
password = quote(kwargs.pop("password", "") or "")
|
password = quote(kwargs.pop("password", "") or "", safe=PASSWORD_SAFE)
|
||||||
kwargs["userinfo"] = f"{username}:{password}" if password else username
|
kwargs["userinfo"] = f"{username}:{password}" if password else username
|
||||||
|
|
||||||
# Replace "raw_path" with "path" and "query".
|
# Replace "raw_path" with "path" and "query".
|
||||||
@ -238,7 +298,7 @@ def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:
|
|||||||
authority = kwargs.get("authority", url_dict["authority"]) or ""
|
authority = kwargs.get("authority", url_dict["authority"]) or ""
|
||||||
path = kwargs.get("path", url_dict["path"]) or ""
|
path = kwargs.get("path", url_dict["path"]) or ""
|
||||||
query = kwargs.get("query", url_dict["query"])
|
query = kwargs.get("query", url_dict["query"])
|
||||||
fragment = kwargs.get("fragment", url_dict["fragment"])
|
frag = kwargs.get("fragment", url_dict["fragment"])
|
||||||
|
|
||||||
# The AUTHORITY_REGEX will always match, but may have empty components.
|
# The AUTHORITY_REGEX will always match, but may have empty components.
|
||||||
authority_match = AUTHORITY_REGEX.match(authority)
|
authority_match = AUTHORITY_REGEX.match(authority)
|
||||||
@ -255,7 +315,7 @@ def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:
|
|||||||
# We end up with a parsed representation of the URL,
|
# We end up with a parsed representation of the URL,
|
||||||
# with components that are plain ASCII bytestrings.
|
# with components that are plain ASCII bytestrings.
|
||||||
parsed_scheme: str = scheme.lower()
|
parsed_scheme: str = scheme.lower()
|
||||||
parsed_userinfo: str = quote(userinfo, safe=SUB_DELIMS + ":")
|
parsed_userinfo: str = quote(userinfo, safe=USERINFO_SAFE)
|
||||||
parsed_host: str = encode_host(host)
|
parsed_host: str = encode_host(host)
|
||||||
parsed_port: int | None = normalize_port(port, scheme)
|
parsed_port: int | None = normalize_port(port, scheme)
|
||||||
|
|
||||||
@ -267,25 +327,9 @@ def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:
|
|||||||
if has_scheme or has_authority:
|
if has_scheme or has_authority:
|
||||||
path = normalize_path(path)
|
path = normalize_path(path)
|
||||||
|
|
||||||
# The GEN_DELIMS set is... : / ? # [ ] @
|
parsed_path: str = quote(path, safe=PATH_SAFE)
|
||||||
# These do not need to be percent-quoted unless they serve as delimiters for the
|
parsed_query: str | None = None if query is None else quote(query, safe=QUERY_SAFE)
|
||||||
# specific component.
|
parsed_frag: str | None = None if frag is None else quote(frag, safe=FRAG_SAFE)
|
||||||
WHATWG_SAFE = '`{}%|^\\"'
|
|
||||||
|
|
||||||
# For 'path' we need to drop ? and # from the GEN_DELIMS set.
|
|
||||||
parsed_path: str = quote(path, safe=SUB_DELIMS + WHATWG_SAFE + ":/[]@")
|
|
||||||
# For 'query' we need to drop '#' from the GEN_DELIMS set.
|
|
||||||
parsed_query: str | None = (
|
|
||||||
None
|
|
||||||
if query is None
|
|
||||||
else quote(query, safe=SUB_DELIMS + WHATWG_SAFE + ":/?[]@")
|
|
||||||
)
|
|
||||||
# For 'fragment' we can include all of the GEN_DELIMS set.
|
|
||||||
parsed_fragment: str | None = (
|
|
||||||
None
|
|
||||||
if fragment is None
|
|
||||||
else quote(fragment, safe=SUB_DELIMS + WHATWG_SAFE + ":/?#[]@")
|
|
||||||
)
|
|
||||||
|
|
||||||
# The parsed ASCII bytestrings are our canonical form.
|
# The parsed ASCII bytestrings are our canonical form.
|
||||||
# All properties of the URL are derived from these.
|
# All properties of the URL are derived from these.
|
||||||
@ -296,7 +340,7 @@ def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:
|
|||||||
parsed_port,
|
parsed_port,
|
||||||
parsed_path,
|
parsed_path,
|
||||||
parsed_query,
|
parsed_query,
|
||||||
parsed_fragment,
|
parsed_frag,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -294,7 +294,7 @@ def test_param_with_space():
|
|||||||
assert str(url) == "http://webservice?u=with%20spaces"
|
assert str(url) == "http://webservice?u=with%20spaces"
|
||||||
|
|
||||||
|
|
||||||
def test_param_does_not_require_encoding():
|
def test_param_requires_encoding():
|
||||||
# Params passed as form key-value pairs should be escaped.
|
# Params passed as form key-value pairs should be escaped.
|
||||||
url = httpx.URL("http://webservice", params={"u": "%"})
|
url = httpx.URL("http://webservice", params={"u": "%"})
|
||||||
assert str(url) == "http://webservice?u=%25"
|
assert str(url) == "http://webservice?u=%25"
|
||||||
@ -614,10 +614,10 @@ def test_url_copywith_userinfo_subcomponents():
|
|||||||
}
|
}
|
||||||
url = httpx.URL("https://example.org")
|
url = httpx.URL("https://example.org")
|
||||||
new = url.copy_with(**copy_with_kwargs)
|
new = url.copy_with(**copy_with_kwargs)
|
||||||
assert str(new) == "https://tom%40example.org:abc123%40%20%25@example.org"
|
assert str(new) == "https://tom%40example.org:abc123%40%20%@example.org"
|
||||||
assert new.username == "tom@example.org"
|
assert new.username == "tom@example.org"
|
||||||
assert new.password == "abc123@ %"
|
assert new.password == "abc123@ %"
|
||||||
assert new.userinfo == b"tom%40example.org:abc123%40%20%25"
|
assert new.userinfo == b"tom%40example.org:abc123%40%20%"
|
||||||
|
|
||||||
|
|
||||||
def test_url_copywith_invalid_component():
|
def test_url_copywith_invalid_component():
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user