Supported text codecs should handle available aliases

This commit is contained in:
Tom Christie 2023-10-19 12:23:39 +01:00
parent 8e8ef6e9c6
commit cff58c91db

View File

@ -25,46 +25,49 @@ _HTML5_FORM_ENCODING_RE = re.compile(
r"|".join([re.escape(c) for c in _HTML5_FORM_ENCODING_REPLACEMENTS.keys()])
)
# Text codecs as supported by Chromium, Oct. 2023.
# For our supported text codecs, we start with the text codecs as supported by Chromium, Oct. 2023.
# https://chromium.googlesource.com/chromium/chromium/+/refs/heads/trunk/chrome/browser/character_encoding.cc#36
#
# Then limit those to any which are documented as included by CPython,
# which drops "windows-874", "iso-8859-8-i".
#
# Then make sure we're referencing them with the canonical name as used by the Python codecs.
# Whitelist of text codec names.
#
# Entries are grouped by codec family. Within a group the hyphenated label
# (the spelling used in the Chromium-derived list) comes first, followed by
# the Python codec name for the same codec whenever the two spellings differ.
# "windows-874" and "iso-8859-8-i" appear only as labels; no Python codec
# name is listed for them.
SUPPORTED_CODECS = {
    # --- Unicode ---
    "utf-8",
    "utf-16le",
    "utf-16-le",  # Python name for utf-16le
    # --- Chinese ---
    "gbk",
    "gb18030",
    "big5",
    "big5-hkscs",
    "big5hkscs",  # Python name for big5-hkscs
    # --- Korean ---
    "euc-kr",
    "euc_kr",  # Python name for euc-kr
    # --- Japanese ---
    "shift-jis",
    "shift_jis",  # Python name for shift-jis
    "euc-jp",
    "euc_jp",  # Python name for euc-jp
    "iso-2022-jp",
    "iso2022_jp",  # Python name for iso-2022-jp
    # --- Mac OS Roman ---
    "macintosh",
    "mac-roman",  # Python name for macintosh
    # --- KOI8 ---
    "koi8-r",
    "koi8-u",
    # --- Windows code pages: labels ---
    "windows-874",
    "windows-1250",
    "windows-1251",
    "windows-1252",
    "windows-1253",
    "windows-1254",
    "windows-1255",
    "windows-1256",
    "windows-1257",
    "windows-1258",
    # --- Windows code pages: Python names ---
    "cp1250",  # windows-1250
    "cp1251",  # windows-1251
    "cp1252",  # windows-1252
    "cp1253",  # windows-1253
    "cp1254",  # windows-1254
    "cp1255",  # windows-1255
    "cp1256",  # windows-1256
    "cp1257",  # windows-1257
    "cp1258",  # windows-1258
    # --- ISO-8859 family: labels ---
    "iso-8859-1",
    "iso-8859-2",
    "iso-8859-3",
    "iso-8859-4",
    "iso-8859-5",
    "iso-8859-6",
    "iso-8859-7",
    "iso-8859-8",
    "iso-8859-8-i",
    "iso-8859-10",
    "iso-8859-13",
    "iso-8859-14",
    "iso-8859-15",
    "iso-8859-16",
    # --- ISO-8859 family: Python names ---
    "iso8859-1",  # iso-8859-1
    "iso8859-2",  # iso-8859-2
    "iso8859-3",  # iso-8859-3
    "iso8859-4",  # iso-8859-4
    "iso8859-5",  # iso-8859-5
    "iso8859-6",  # iso-8859-6
    "iso8859-7",  # iso-8859-7
    "iso8859-8",  # iso-8859-8
    "iso8859-10",  # iso-8859-10
    "iso8859-13",  # iso-8859-13
    "iso8859-14",  # iso-8859-14
    "iso8859-15",  # iso-8859-15
    "iso8859-16",  # iso-8859-16
}
@ -112,22 +115,14 @@ def primitive_value_to_str(value: "PrimitiveData") -> str:
def is_known_encoding(encoding: str) -> bool:
    """
    Return `True` if `encoding` is a supported text codec.

    The name is first resolved through the Python codec registry, so any
    alias the registry knows is accepted. The codec's canonical name, as
    reported by `codecs.lookup(...).name`, is then checked against
    `SUPPORTED_CODECS`.

    Returns `False` for names the registry cannot resolve, and for codecs
    that resolve but fall outside the supported set.
    """
    try:
        codec = codecs.lookup(encoding)
    except (LookupError, ValueError):
        # LookupError: no codec is registered under this name.
        # ValueError: the registry rejected the name itself (for example
        # an embedded NUL character), which also means "not known".
        return False
    # Compare the resolved canonical name rather than the caller's spelling,
    # so that every alias of a supported codec is treated the same way.
    return codec.name in SUPPORTED_CODECS
def format_form_param(name: str, value: str) -> bytes: