PYTHON-721 - Add unicode_decode_error_handler to CodecOptions.
This commit is contained in:
parent
70be0d8911
commit
39308c4b2f
@ -98,10 +98,11 @@ def _get_int(data, position, dummy0, dummy1):
|
||||
return _UNPACK_INT(data[position:end])[0], end
|
||||
|
||||
|
||||
def _get_c_string(data, position):
|
||||
def _get_c_string(data, position, opts):
|
||||
"""Decode a BSON 'C' string to python unicode string."""
|
||||
end = data.index(b"\x00", position)
|
||||
return _utf_8_decode(data[position:end], None, True)[0], end + 1
|
||||
return _utf_8_decode(data[position:end],
|
||||
opts.unicode_decode_error_handler, True)[0], end + 1
|
||||
|
||||
|
||||
def _get_float(data, position, dummy0, dummy1):
|
||||
@ -110,7 +111,7 @@ def _get_float(data, position, dummy0, dummy1):
|
||||
return _UNPACK_FLOAT(data[position:end])[0], end
|
||||
|
||||
|
||||
def _get_string(data, position, obj_end, dummy):
|
||||
def _get_string(data, position, obj_end, opts):
|
||||
"""Decode a BSON string to python unicode string."""
|
||||
length = _UNPACK_INT(data[position:position + 4])[0]
|
||||
position += 4
|
||||
@ -119,7 +120,8 @@ def _get_string(data, position, obj_end, dummy):
|
||||
end = position + length - 1
|
||||
if data[end:end + 1] != b"\x00":
|
||||
raise InvalidBSON("invalid end of string")
|
||||
return _utf_8_decode(data[position:end], None, True)[0], end + 1
|
||||
return _utf_8_decode(data[position:end],
|
||||
opts.unicode_decode_error_handler, True)[0], end + 1
|
||||
|
||||
|
||||
def _get_object(data, position, obj_end, opts):
|
||||
@ -235,10 +237,10 @@ def _get_code_w_scope(data, position, obj_end, opts):
|
||||
return Code(code, scope), position
|
||||
|
||||
|
||||
def _get_regex(data, position, dummy0, dummy1):
|
||||
def _get_regex(data, position, dummy0, opts):
|
||||
"""Decode a BSON regex to bson.regex.Regex or a python pattern object."""
|
||||
pattern, position = _get_c_string(data, position)
|
||||
bson_flags, position = _get_c_string(data, position)
|
||||
pattern, position = _get_c_string(data, position, opts)
|
||||
bson_flags, position = _get_c_string(data, position, opts)
|
||||
bson_re = Regex(pattern, bson_flags)
|
||||
return bson_re, position
|
||||
|
||||
@ -295,7 +297,7 @@ def _element_to_dict(data, position, obj_end, opts):
|
||||
"""Decode a single key, value pair."""
|
||||
element_type = data[position:position + 1]
|
||||
position += 1
|
||||
element_name, position = _get_c_string(data, position)
|
||||
element_name, position = _get_c_string(data, position, opts)
|
||||
value, position = _ELEMENT_GETTER[element_type](data,
|
||||
position, obj_end, opts)
|
||||
return element_name, value, position
|
||||
|
||||
@ -118,10 +118,12 @@ _downcast_and_check(Py_ssize_t size, int extra) {
|
||||
*/
|
||||
int convert_codec_options(PyObject* options_obj, void* p) {
|
||||
codec_options_t* options = (codec_options_t*)p;
|
||||
if (!PyArg_ParseTuple(options_obj, "Obb",
|
||||
options->unicode_decode_error_handler = NULL;
|
||||
if (!PyArg_ParseTuple(options_obj, "Obbz",
|
||||
&options->document_class,
|
||||
&options->tz_aware,
|
||||
&options->uuid_rep)) {
|
||||
&options->uuid_rep,
|
||||
&options->unicode_decode_error_handler)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -137,6 +139,7 @@ void default_codec_options(codec_options_t* options) {
|
||||
// TODO: set to "1". PYTHON-526, setting tz_aware=True by default.
|
||||
options->tz_aware = 0;
|
||||
options->uuid_rep = PYTHON_LEGACY;
|
||||
options->unicode_decode_error_handler = NULL;
|
||||
}
|
||||
|
||||
void destroy_codec_options(codec_options_t* options) {
|
||||
@ -1560,7 +1563,9 @@ static PyObject* get_value(PyObject* self, const char* buffer,
|
||||
if (buffer[*position + value_length - 1]) {
|
||||
goto invalid;
|
||||
}
|
||||
value = PyUnicode_DecodeUTF8(buffer + *position, value_length - 1, "strict");
|
||||
value = PyUnicode_DecodeUTF8(
|
||||
buffer + *position, value_length - 1,
|
||||
options->unicode_decode_error_handler);
|
||||
if (!value) {
|
||||
goto invalid;
|
||||
}
|
||||
@ -1916,7 +1921,9 @@ static PyObject* get_value(PyObject* self, const char* buffer,
|
||||
if (pattern_length > BSON_MAX_SIZE || max < pattern_length) {
|
||||
goto invalid;
|
||||
}
|
||||
pattern = PyUnicode_DecodeUTF8(buffer + *position, pattern_length, "strict");
|
||||
pattern = PyUnicode_DecodeUTF8(
|
||||
buffer + *position, pattern_length,
|
||||
options->unicode_decode_error_handler);
|
||||
if (!pattern) {
|
||||
goto invalid;
|
||||
}
|
||||
@ -1980,8 +1987,9 @@ static PyObject* get_value(PyObject* self, const char* buffer,
|
||||
goto invalid;
|
||||
}
|
||||
|
||||
collection = PyUnicode_DecodeUTF8(buffer + *position,
|
||||
coll_length - 1, "strict");
|
||||
collection = PyUnicode_DecodeUTF8(
|
||||
buffer + *position, coll_length - 1,
|
||||
options->unicode_decode_error_handler);
|
||||
if (!collection) {
|
||||
goto invalid;
|
||||
}
|
||||
@ -2026,7 +2034,9 @@ static PyObject* get_value(PyObject* self, const char* buffer,
|
||||
if (buffer[*position + value_length - 1]) {
|
||||
goto invalid;
|
||||
}
|
||||
code = PyUnicode_DecodeUTF8(buffer + *position, value_length - 1, "strict");
|
||||
code = PyUnicode_DecodeUTF8(
|
||||
buffer + *position, value_length - 1,
|
||||
options->unicode_decode_error_handler);
|
||||
if (!code) {
|
||||
goto invalid;
|
||||
}
|
||||
@ -2068,7 +2078,9 @@ static PyObject* get_value(PyObject* self, const char* buffer,
|
||||
if (buffer[*position + code_size - 1]) {
|
||||
goto invalid;
|
||||
}
|
||||
code = PyUnicode_DecodeUTF8(buffer + *position, code_size - 1, "strict");
|
||||
code = PyUnicode_DecodeUTF8(
|
||||
buffer + *position, code_size - 1,
|
||||
options->unicode_decode_error_handler);
|
||||
if (!code) {
|
||||
goto invalid;
|
||||
}
|
||||
@ -2261,8 +2273,29 @@ static PyObject* _elements_to_dict(PyObject* self, const char* string,
|
||||
Py_DECREF(dict);
|
||||
return NULL;
|
||||
}
|
||||
name = PyUnicode_DecodeUTF8(string + position, name_length, "strict");
|
||||
name = PyUnicode_DecodeUTF8(
|
||||
string + position, name_length,
|
||||
options->unicode_decode_error_handler);
|
||||
if (!name) {
|
||||
/* If NULL is returned then wrap the UnicodeDecodeError
|
||||
in an InvalidBSON error */
|
||||
PyObject *etype, *evalue, *etrace;
|
||||
PyObject *InvalidBSON;
|
||||
|
||||
PyErr_Fetch(&etype, &evalue, &etrace);
|
||||
InvalidBSON = _error("InvalidBSON");
|
||||
if (InvalidBSON) {
|
||||
Py_DECREF(etype);
|
||||
etype = InvalidBSON;
|
||||
|
||||
if (evalue) {
|
||||
PyObject *msg = PyObject_Str(evalue);
|
||||
Py_DECREF(evalue);
|
||||
evalue = msg;
|
||||
}
|
||||
PyErr_NormalizeException(&etype, &evalue, &etrace);
|
||||
}
|
||||
PyErr_Restore(etype, evalue, etrace);
|
||||
Py_DECREF(dict);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -56,6 +56,7 @@ typedef struct codec_options_t {
|
||||
PyObject* document_class;
|
||||
unsigned char tz_aware;
|
||||
unsigned char uuid_rep;
|
||||
char* unicode_decode_error_handler;
|
||||
} codec_options_t;
|
||||
|
||||
/* C API functions */
|
||||
|
||||
@ -16,13 +16,15 @@
|
||||
|
||||
from collections import MutableMapping, namedtuple
|
||||
|
||||
from bson.py3compat import string_type
|
||||
from bson.binary import (ALL_UUID_REPRESENTATIONS,
|
||||
PYTHON_LEGACY,
|
||||
UUID_REPRESENTATION_NAMES)
|
||||
|
||||
|
||||
_options_base = namedtuple(
|
||||
'CodecOptions', ('document_class', 'tz_aware', 'uuid_representation'))
|
||||
'CodecOptions', ('document_class', 'tz_aware', 'uuid_representation',
|
||||
'unicode_decode_error_handler'))
|
||||
|
||||
|
||||
class CodecOptions(_options_base):
|
||||
@ -38,10 +40,21 @@ class CodecOptions(_options_base):
|
||||
- `uuid_representation`: The BSON representation to use when encoding
|
||||
and decoding instances of :class:`~uuid.UUID`. Defaults to
|
||||
:data:`~bson.binary.PYTHON_LEGACY`.
|
||||
- `unicode_decode_error_handler`: The error handler to use when decoding
|
||||
an invalid BSON string. Valid options include 'strict', 'replace', and
|
||||
'ignore'. Defaults to 'strict'.
|
||||
|
||||
.. warning:: Care must be taken when changing
|
||||
`unicode_decode_error_handler` from its default value ('strict').
|
||||
The 'replace' and 'ignore' modes should not be used when documents
|
||||
retrieved from the server will be modified in the client application
|
||||
and stored back to the server.
|
||||
|
||||
"""
|
||||
|
||||
def __new__(cls, document_class=dict,
|
||||
tz_aware=False, uuid_representation=PYTHON_LEGACY):
|
||||
tz_aware=False, uuid_representation=PYTHON_LEGACY,
|
||||
unicode_decode_error_handler="strict"):
|
||||
if not issubclass(document_class, MutableMapping):
|
||||
raise TypeError("document_class must be dict, bson.son.SON, or "
|
||||
"another subclass of collections.MutableMapping")
|
||||
@ -50,9 +63,12 @@ class CodecOptions(_options_base):
|
||||
if uuid_representation not in ALL_UUID_REPRESENTATIONS:
|
||||
raise ValueError("uuid_representation must be a value "
|
||||
"from bson.binary.ALL_UUID_REPRESENTATIONS")
|
||||
|
||||
if not isinstance(unicode_decode_error_handler, (string_type, None)):
|
||||
raise ValueError("unicode_decode_error_handler must be a string "
|
||||
"or None")
|
||||
return tuple.__new__(
|
||||
cls, (document_class, tz_aware, uuid_representation))
|
||||
cls, (document_class, tz_aware, uuid_representation,
|
||||
unicode_decode_error_handler))
|
||||
|
||||
def __repr__(self):
|
||||
document_class_repr = (
|
||||
@ -64,7 +80,9 @@ class CodecOptions(_options_base):
|
||||
|
||||
return (
|
||||
'CodecOptions(document_class=%s, tz_aware=%r, uuid_representation='
|
||||
'%s)' % (document_class_repr, self.tz_aware, uuid_rep_repr))
|
||||
'%s, unicode_decode_error_handler=%r)' %
|
||||
(document_class_repr, self.tz_aware, uuid_rep_repr,
|
||||
self.unicode_decode_error_handler))
|
||||
|
||||
|
||||
DEFAULT_CODEC_OPTIONS = CodecOptions()
|
||||
@ -78,4 +96,6 @@ def _parse_codec_options(options):
|
||||
tz_aware=options.get(
|
||||
'tz_aware', DEFAULT_CODEC_OPTIONS.tz_aware),
|
||||
uuid_representation=options.get(
|
||||
'uuidrepresentation', DEFAULT_CODEC_OPTIONS.uuid_representation))
|
||||
'uuidrepresentation', DEFAULT_CODEC_OPTIONS.uuid_representation),
|
||||
unicode_decode_error_handler=options.get(
|
||||
'unicode_decode_error_handler', "strict"))
|
||||
|
||||
@ -785,8 +785,9 @@ class TestCodecOptions(unittest.TestCase):
|
||||
self.assertRaises(ValueError, CodecOptions, uuid_representation=2)
|
||||
|
||||
def test_codec_options_repr(self):
|
||||
r = ('CodecOptions(document_class=dict, tz_aware=False, '
|
||||
'uuid_representation=PYTHON_LEGACY)')
|
||||
r = ("CodecOptions(document_class=dict, tz_aware=False, "
|
||||
"uuid_representation=PYTHON_LEGACY, "
|
||||
"unicode_decode_error_handler='strict')")
|
||||
self.assertEqual(r, repr(CodecOptions()))
|
||||
|
||||
def test_decode_all_defaults(self):
|
||||
@ -803,6 +804,71 @@ class TestCodecOptions(unittest.TestCase):
|
||||
self.assertEqual(decoded['uuid'], doc['uuid'])
|
||||
self.assertIsNone(decoded['dt'].tzinfo)
|
||||
|
||||
def test_unicode_decode_error_handler(self):
|
||||
enc = BSON.encode({"keystr": "foobar"})
|
||||
|
||||
# Test handling of bad key value.
|
||||
invalid_key = BSON(enc[:7] + b'\xe9' + enc[8:])
|
||||
replaced_key = b'ke\xef\xbf\xbdstr'.decode('utf-8')
|
||||
|
||||
dec = BSON.decode(invalid_key, CodecOptions(
|
||||
unicode_decode_error_handler="replace"))
|
||||
self.assertEqual(dec, {replaced_key: u("foobar")})
|
||||
|
||||
dec = BSON.decode(invalid_key, CodecOptions(
|
||||
unicode_decode_error_handler="ignore"))
|
||||
self.assertEqual(dec, {"kestr": "foobar"})
|
||||
|
||||
self.assertRaises(InvalidBSON, BSON.decode, invalid_key, CodecOptions(
|
||||
unicode_decode_error_handler="strict"))
|
||||
self.assertRaises(InvalidBSON, BSON.decode, invalid_key,
|
||||
CodecOptions())
|
||||
self.assertRaises(InvalidBSON, BSON.decode, invalid_key)
|
||||
|
||||
# Test handing of bad string value.
|
||||
invalid_val = BSON(enc[:18] + b'\xe9' + enc[19:])
|
||||
replaced_val = b'fo\xef\xbf\xbdbar'.decode('utf-8')
|
||||
|
||||
dec = BSON.decode(invalid_val, CodecOptions(
|
||||
unicode_decode_error_handler="replace"))
|
||||
self.assertEqual(dec, {"keystr": replaced_val})
|
||||
|
||||
dec = BSON.decode(invalid_val, CodecOptions(
|
||||
unicode_decode_error_handler="ignore"))
|
||||
self.assertEqual(dec, {"keystr": "fobar"})
|
||||
|
||||
self.assertRaises(InvalidBSON, BSON.decode, invalid_val, CodecOptions(
|
||||
unicode_decode_error_handler="strict"))
|
||||
self.assertRaises(InvalidBSON, BSON.decode, invalid_val,
|
||||
CodecOptions())
|
||||
self.assertRaises(InvalidBSON, BSON.decode, invalid_val)
|
||||
|
||||
# Test handing bad key + bad value.
|
||||
invalid_both = BSON(
|
||||
enc[:7] + b'\xe9' + enc[8:18] + b'\xe9' + enc[19:])
|
||||
|
||||
dec = BSON.decode(invalid_both, CodecOptions(
|
||||
unicode_decode_error_handler="replace"))
|
||||
self.assertEqual(dec, {replaced_key: replaced_val})
|
||||
|
||||
dec = BSON.decode(invalid_both, CodecOptions(
|
||||
unicode_decode_error_handler="ignore"))
|
||||
self.assertEqual(dec, {"kestr": "fobar"})
|
||||
|
||||
self.assertRaises(InvalidBSON, BSON.decode, invalid_both, CodecOptions(
|
||||
unicode_decode_error_handler="strict"))
|
||||
self.assertRaises(InvalidBSON, BSON.decode, invalid_both,
|
||||
CodecOptions())
|
||||
self.assertRaises(InvalidBSON, BSON.decode, invalid_both)
|
||||
|
||||
# Test handling bad error mode.
|
||||
dec = BSON.decode(enc, CodecOptions(
|
||||
unicode_decode_error_handler="junk"))
|
||||
self.assertEqual(dec, {"keystr": "foobar"})
|
||||
|
||||
self.assertRaises(InvalidBSON, BSON.decode, invalid_both,
|
||||
CodecOptions(unicode_decode_error_handler="junk"))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
Loading…
Reference in New Issue
Block a user