Supported text codecs should handle available aliases
This commit is contained in:
parent
8e8ef6e9c6
commit
cff58c91db
@ -25,46 +25,49 @@ _HTML5_FORM_ENCODING_RE = re.compile(
|
||||
r"|".join([re.escape(c) for c in _HTML5_FORM_ENCODING_REPLACEMENTS.keys()])
|
||||
)
|
||||
|
||||
# Text codecs as supported by Chromium, Oct. 2023.
|
||||
# For our supported text codecs, we start with the text codecs as supported by Chromium, Oct. 2023.
|
||||
# https://chromium.googlesource.com/chromium/chromium/+/refs/heads/trunk/chrome/browser/character_encoding.cc#36
|
||||
#
|
||||
# Then limit those to any which are documented as included by cpython,
|
||||
# which drops "windows-874", "iso-8859-8-i".
|
||||
#
|
||||
# Then make sure we're referencing them with the canonical name as used by the Python codecs.
|
||||
SUPPORTED_CODECS = {
|
||||
"utf-8",
|
||||
"utf-16le",
|
||||
"iso-8859-1",
|
||||
"windows-1252",
|
||||
"gbk",
|
||||
"gb18030",
|
||||
"big5",
|
||||
"big5-hkscs",
|
||||
"euc-kr",
|
||||
"shift-jis",
|
||||
"euc-jp",
|
||||
"iso-2022-jp",
|
||||
"windows-874",
|
||||
"iso-8859-15",
|
||||
"macintosh",
|
||||
"iso-8859-2",
|
||||
"windows-1250",
|
||||
"iso-8859-5",
|
||||
"windows-1251",
|
||||
"koi8-r",
|
||||
"koi8-u",
|
||||
"iso-8859-7",
|
||||
"windows-1253",
|
||||
"windows-1254",
|
||||
"windows-1256",
|
||||
"iso-8859-6",
|
||||
"windows-1255",
|
||||
"iso-8859-8-i",
|
||||
"iso-8859-8",
|
||||
"windows-1258",
|
||||
"iso-8859-4",
|
||||
"iso-8859-13",
|
||||
"windows-1257",
|
||||
"iso-8859-3",
|
||||
"iso-8859-10",
|
||||
"iso-8859-14",
|
||||
"iso-8859-16",
|
||||
"big5", # big5
|
||||
"big5hkscs", # big5-hkscs
|
||||
"cp1250", # windows-1250
|
||||
"cp1251", # windows-1251
|
||||
"cp1252", # windows-1252
|
||||
"cp1253", # windows-1253
|
||||
"cp1254", # windows-1254
|
||||
"cp1255", # windows-1255
|
||||
"cp1256", # windows-1256
|
||||
"cp1257", # windows-1257
|
||||
"cp1258", # windows-1258
|
||||
"euc_jp", # euc-jp
|
||||
"euc_kr", # euc-kr
|
||||
"gb18030", # gb18030
|
||||
"gbk", # gbk
|
||||
"iso2022_jp", # iso-2022-jp
|
||||
"iso8859-1", # iso-8859-1
|
||||
"iso8859-2", # iso-8859-2
|
||||
"iso8859-3", # iso-8859-3
|
||||
"iso8859-4", # iso-8859-4
|
||||
"iso8859-5", # iso-8859-5
|
||||
"iso8859-6", # iso-8859-6
|
||||
"iso8859-7", # iso-8859-7
|
||||
"iso8859-8", # iso-8859-8
|
||||
"iso8859-10", # iso-8859-10
|
||||
"iso8859-13", # iso-8859-13
|
||||
"iso8859-14", # iso-8859-14
|
||||
"iso8859-15", # iso-8859-15
|
||||
"iso8859-16", # iso-8859-16
|
||||
"koi8-r", # koi8-r
|
||||
"koi8-u", # koi8-u
|
||||
"mac-roman", # macintosh
|
||||
"shift_jis", # shift-jis
|
||||
"utf-8", # utf-8
|
||||
"utf-16-le", # utf-16le
|
||||
}
|
||||
|
||||
|
||||
@ -112,22 +115,14 @@ def primitive_value_to_str(value: "PrimitiveData") -> str:
|
||||
|
||||
def is_known_encoding(encoding: str) -> bool:
|
||||
"""
|
||||
Return `True` if `encoding` is a known codec.
|
||||
Return `True` if `encoding` is a supported text codec.
|
||||
"""
|
||||
# Only allow text codecs within our supported range.
|
||||
if encoding.lower().replace("_", "-") not in SUPPORTED_CODECS:
|
||||
return False
|
||||
|
||||
# Also ensure that the codec is actually available.
|
||||
# At the point of writing this was true for all the SUPPORTED_CODECS
|
||||
# except "windows-874", "iso-8859-8-i", when using cpython.
|
||||
# But there *could* feasibly be a different set of codecs available
|
||||
# under some installations.
|
||||
try:
|
||||
codecs.lookup(encoding)
|
||||
codec = codecs.lookup(encoding)
|
||||
except LookupError:
|
||||
return False
|
||||
return True
|
||||
|
||||
return codec.name in SUPPORTED_CODECS
|
||||
|
||||
|
||||
def format_form_param(name: str, value: str) -> bytes:
|
||||
|
||||
Loading…
Reference in New Issue
Block a user