Differentiate between 'url.host' and 'url.raw_host' (#1590)

* Differentiate between 'url.host' and 'url.raw_host'
This commit is contained in:
Tom Christie 2021-04-23 11:00:53 +01:00 committed by GitHub
parent d98e9e7ae7
commit 39d8ee619e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 114 additions and 40 deletions

View File

@ -8,6 +8,7 @@ from collections.abc import MutableMapping
from http.cookiejar import Cookie, CookieJar
from urllib.parse import parse_qsl, quote, unquote, urlencode
import idna
import rfc3986
import rfc3986.exceptions
@ -60,15 +61,16 @@ from ._utils import (
class URL:
"""
url = httpx.URL("HTTPS://jo%40email.com:a%20secret@example.com:1234/pa%20th?search=ab#anchorlink")
url = httpx.URL("HTTPS://jo%40email.com:a%20secret@müller.de:1234/pa%20th?search=ab#anchorlink")
assert url.scheme == "https"
assert url.username == "jo@email.com"
assert url.password == "a secret"
assert url.userinfo == b"jo%40email.com:a%20secret"
assert url.host == "example.com"
assert url.host == "müller.de"
assert url.raw_host == b"xn--mller-kva.de"
assert url.port == 1234
assert url.netloc == "example.com:1234"
assert url.netloc == b"xn--mller-kva.de:1234"
assert url.path == "/pa th"
assert url.query == b"?search=ab"
assert url.raw_path == b"/pa%20th?search=ab"
@ -76,17 +78,28 @@ class URL:
The components of a URL are broken down like this:
https://jo%40email.com:a%20secret@example.com:1234/pa%20th?search=ab#anchorlink
[scheme][ username ] [password] [ host ][port][ path ] [ query ] [fragment]
[ userinfo ] [ netloc ][ raw_path ]
https://jo%40email.com:a%20secret@müller.de:1234/pa%20th?search=ab#anchorlink
[scheme] [ username ] [password] [ host ][port][ path ] [ query ] [fragment]
[ userinfo ] [ netloc ][ raw_path ]
Note that:
* `url.scheme` is normalized to always be lowercased.
* `url.host` is normalized to always be lowercased, and is IDNA encoded. For instance:
url = httpx.URL("http://中国.icom.museum")
assert url.host == "xn--fiqs8s.icom.museum"
* `url.host` is normalized to always be lowercased. Internationalized domain
names are represented in unicode, without IDNA encoding applied. For instance:
url = httpx.URL("http://中国.icom.museum")
assert url.host == "中国.icom.museum"
url = httpx.URL("http://xn--fiqs8s.icom.museum")
assert url.host == "中国.icom.museum"
* `url.raw_host` is normalized to always be lowercased, and is IDNA encoded. For instance:
url = httpx.URL("http://中国.icom.museum")
assert url.raw_host == b"xn--fiqs8s.icom.museum"
url = httpx.URL("http://xn--fiqs8s.icom.museum")
assert url.raw_host == b"xn--fiqs8s.icom.museum"
* `url.userinfo` is raw bytes, with any URL escaping left in place (it is not decoded).
Usually you'll want to work with `url.username` and `url.password` instead, which
handle the URL escaping.
@ -150,6 +163,14 @@ class URL:
"""
return self._uri_reference.scheme or ""
@property
def raw_scheme(self) -> bytes:
    """
    The URL scheme as raw bytes, such as b"http", b"https".
    Always normalized to lowercase.

    This is `self.scheme` encoded as ASCII; a URL without a scheme
    yields b"" if `self.scheme` is the empty string.
    """
    scheme_text = self.scheme
    return scheme_text.encode("ascii")
@property
def userinfo(self) -> bytes:
"""
@ -181,7 +202,7 @@ class URL:
def host(self) -> str:
"""
The URL host as a string.
Always normlized to lowercase, and IDNA encoded.
Always normalized to lowercase, with IDNA hosts decoded into unicode.
Examples:
@ -189,18 +210,52 @@ class URL:
assert url.host == "www.example.org"
url = httpx.URL("http://中国.icom.museum")
assert url.host == "xn--fiqs8s.icom.museum"
assert url.host == "中国.icom.museum"
url = httpx.URL("http://xn--fiqs8s.icom.museum")
assert url.host == "中国.icom.museum"
url = httpx.URL("https://[::ffff:192.168.0.1]")
assert url.host == "::ffff:192.168.0.1"
"""
host: str = self._uri_reference.host
host: str = self._uri_reference.host or ""
if host and ":" in host and host[0] == "[":
# it's an IPv6 address
host = host.lstrip("[").rstrip("]")
return host or ""
if host.startswith("xn--"):
host = idna.decode(host)
return host
@property
def raw_host(self) -> bytes:
    """
    The URL host as raw bytes.
    Always normalized to lowercase, and IDNA encoded.

    Examples:

    url = httpx.URL("http://www.EXAMPLE.org")
    assert url.raw_host == b"www.example.org"

    url = httpx.URL("http://中国.icom.museum")
    assert url.raw_host == b"xn--fiqs8s.icom.museum"

    url = httpx.URL("http://xn--fiqs8s.icom.museum")
    assert url.raw_host == b"xn--fiqs8s.icom.museum"

    url = httpx.URL("https://[::ffff:192.168.0.1]")
    assert url.raw_host == b"::ffff:192.168.0.1"

    NOTE(review): this property only removes IPv6 brackets and
    ASCII-encodes the stored host; the lowercasing and IDNA encoding are
    presumably applied when the URL is parsed -- confirm against __init__.
    """
    hostname: str = self._uri_reference.host or ""

    # A bracketed literal such as "[::1]" is an IPv6 address; the brackets
    # are URL syntax rather than part of the host itself.
    looks_like_ipv6 = ":" in hostname and hostname.startswith("[")
    if looks_like_ipv6:
        hostname = hostname.lstrip("[").rstrip("]")

    return hostname.encode("ascii")
@property
def port(self) -> typing.Optional[int]:
@ -211,14 +266,17 @@ class URL:
return int(port) if port else None
@property
def netloc(self) -> str:
def netloc(self) -> bytes:
"""
Either `<host>` or `<host>:<port>` as a string.
Always normlized to lowercase, and IDNA encoded.
Either `<host>` or `<host>:<port>` as bytes.
Always normalized to lowercase, and IDNA encoded.
"""
host = self._uri_reference.host or ""
port = self._uri_reference.port
return host if port is None else f"{host}:{port}"
netloc = host.encode("ascii")
if port:
netloc = netloc + b":" + str(port).encode("ascii")
return netloc
@property
def path(self) -> str:
@ -277,8 +335,8 @@ class URL:
Provides the (scheme, host, port, target) for the outgoing request.
"""
return (
self.scheme.encode("ascii"),
self.host.encode("ascii"),
self.raw_scheme,
self.raw_host,
self.port,
self.raw_path,
)
@ -293,7 +351,7 @@ class URL:
# URLs with a fragment portion as not absolute.
# What we actually care about is if the URL provides
# a scheme and hostname to which connections should be made.
return bool(self.scheme and self.host)
return bool(self._uri_reference.scheme and self._uri_reference.host)
@property
def is_relative_url(self) -> bool:
@ -321,7 +379,7 @@ class URL:
"userinfo": bytes,
"host": str,
"port": int,
"netloc": str,
"netloc": bytes,
"path": str,
"query": bytes,
"raw_path": bytes,
@ -354,12 +412,16 @@ class URL:
# it's an IPv6 address, so it should be hidden under bracket
host = f"[{host}]"
kwargs["netloc"] = f"{host}:{port}" if port is not None else host
kwargs["netloc"] = (
f"{host}:{port}".encode("ascii")
if port is not None
else host.encode("ascii")
)
if "userinfo" in kwargs or "netloc" in kwargs:
# Consolidate userinfo and netloc into authority.
userinfo = (kwargs.pop("userinfo", self.userinfo) or b"").decode("ascii")
netloc = kwargs.pop("netloc", self.netloc) or ""
netloc = (kwargs.pop("netloc", self.netloc) or b"").decode("ascii")
authority = f"{userinfo}@{netloc}" if userinfo else netloc
kwargs["authority"] = authority
@ -848,11 +910,10 @@ class Request:
)
if not has_host and self.url.host:
default_port = {"http": 80, "https": 443}.get(self.url.scheme)
if self.url.port is None or self.url.port == default_port:
host_header = self.url.host.encode("ascii")
else:
host_header = self.url.netloc.encode("ascii")
# Build the Host header from the bytes netloc ("host" or "host:port") and
# drop the port when it is the default one for the scheme.
# Schemes with no known default map to b"" and must be skipped explicitly:
# every bytes value ends with b"", and `value[: -0]` is `value[:0]`, so an
# unguarded strip would reduce the Host header to an empty value.
default_port = {"http": b":80", "https": b":443"}.get(self.url.scheme, b"")
host_header = self.url.netloc
if default_port and host_header.endswith(default_port):
    host_header = host_header[: -len(default_port)]
auto_headers.append((b"Host", host_header))
if not has_content_length and self.method in ("POST", "PUT", "PATCH"):
auto_headers.append((b"Content-Length", b"0"))

View File

@ -4,41 +4,53 @@ import httpx
@pytest.mark.parametrize(
"given,idna,host,scheme,port",
"given,idna,host,raw_host,scheme,port",
[
(
"http://中国.icom.museum:80/",
"http://xn--fiqs8s.icom.museum:80/",
"xn--fiqs8s.icom.museum",
"中国.icom.museum",
b"xn--fiqs8s.icom.museum",
"http",
80,
),
(
"http://Königsgäßchen.de",
"http://xn--knigsgchen-b4a3dun.de",
"xn--knigsgchen-b4a3dun.de",
"königsgäßchen.de",
b"xn--knigsgchen-b4a3dun.de",
"http",
None,
),
("https://faß.de", "https://xn--fa-hia.de", "xn--fa-hia.de", "https", None),
(
"https://faß.de",
"https://xn--fa-hia.de",
"faß.de",
b"xn--fa-hia.de",
"https",
None,
),
(
"https://βόλος.com:443",
"https://xn--nxasmm1c.com:443",
"xn--nxasmm1c.com",
"βόλος.com",
b"xn--nxasmm1c.com",
"https",
443,
),
(
"http://ශ්‍රී.com:444",
"http://xn--10cl1a0b660p.com:444",
"xn--10cl1a0b660p.com",
"ශ්‍රී.com",
b"xn--10cl1a0b660p.com",
"http",
444,
),
(
"https://نامه‌ای.com:4433",
"https://xn--mgba3gch31f060k.com:4433",
"xn--mgba3gch31f060k.com",
"نامه‌ای.com",
b"xn--mgba3gch31f060k.com",
"https",
4433,
),
@ -52,10 +64,11 @@ import httpx
"https_with_custom_port",
],
)
def test_idna_url(given, idna, host, scheme, port):
def test_idna_url(given, idna, host, raw_host, scheme, port):
url = httpx.URL(given)
assert url == httpx.URL(idna)
assert url.host == host
assert url.raw_host == raw_host
assert url.scheme == scheme
assert url.port == port
@ -197,7 +210,7 @@ def test_url_copywith_authority_subcomponents():
def test_url_copywith_netloc():
copy_with_kwargs = {
"netloc": "example.net:444",
"netloc": b"example.net:444",
}
url = httpx.URL("https://example.org")
new = url.copy_with(**copy_with_kwargs)
@ -301,7 +314,7 @@ def test_ipv6_url():
url = httpx.URL("http://[::ffff:192.168.0.1]:5678/")
assert url.host == "::ffff:192.168.0.1"
assert url.netloc == "[::ffff:192.168.0.1]:5678"
assert url.netloc == b"[::ffff:192.168.0.1]:5678"
@pytest.mark.parametrize(
@ -317,7 +330,7 @@ def test_ipv6_url_copy_with_host(url_str, new_host):
url = httpx.URL(url_str).copy_with(host=new_host)
assert url.host == "::ffff:192.168.0.1"
assert url.netloc == "[::ffff:192.168.0.1]:1234"
assert url.netloc == b"[::ffff:192.168.0.1]:1234"
assert str(url) == "http://[::ffff:192.168.0.1]:1234"
@ -327,5 +340,5 @@ def test_ipv6_url_from_raw_url(host):
url = httpx.URL(raw_url)
assert url.host == "::ffff:192.168.0.1"
assert url.netloc == "[::ffff:192.168.0.1]:443"
assert url.netloc == b"[::ffff:192.168.0.1]:443"
assert str(url) == "https://[::ffff:192.168.0.1]:443/"