PYTHON-721 - Add unicode_decode_error_handler to CodecOptions.

This commit is contained in:
aherlihy 2015-07-09 15:08:02 -04:00
parent 70be0d8911
commit 39308c4b2f
5 changed files with 147 additions and 25 deletions

View File

@ -98,10 +98,11 @@ def _get_int(data, position, dummy0, dummy1):
return _UNPACK_INT(data[position:end])[0], end
def _get_c_string(data, position):
def _get_c_string(data, position, opts):
"""Decode a BSON 'C' string to python unicode string."""
end = data.index(b"\x00", position)
return _utf_8_decode(data[position:end], None, True)[0], end + 1
return _utf_8_decode(data[position:end],
opts.unicode_decode_error_handler, True)[0], end + 1
def _get_float(data, position, dummy0, dummy1):
@ -110,7 +111,7 @@ def _get_float(data, position, dummy0, dummy1):
return _UNPACK_FLOAT(data[position:end])[0], end
def _get_string(data, position, obj_end, dummy):
def _get_string(data, position, obj_end, opts):
"""Decode a BSON string to python unicode string."""
length = _UNPACK_INT(data[position:position + 4])[0]
position += 4
@ -119,7 +120,8 @@ def _get_string(data, position, obj_end, dummy):
end = position + length - 1
if data[end:end + 1] != b"\x00":
raise InvalidBSON("invalid end of string")
return _utf_8_decode(data[position:end], None, True)[0], end + 1
return _utf_8_decode(data[position:end],
opts.unicode_decode_error_handler, True)[0], end + 1
def _get_object(data, position, obj_end, opts):
@ -235,10 +237,10 @@ def _get_code_w_scope(data, position, obj_end, opts):
return Code(code, scope), position
def _get_regex(data, position, dummy0, dummy1):
def _get_regex(data, position, dummy0, opts):
"""Decode a BSON regex to bson.regex.Regex or a python pattern object."""
pattern, position = _get_c_string(data, position)
bson_flags, position = _get_c_string(data, position)
pattern, position = _get_c_string(data, position, opts)
bson_flags, position = _get_c_string(data, position, opts)
bson_re = Regex(pattern, bson_flags)
return bson_re, position
@ -295,7 +297,7 @@ def _element_to_dict(data, position, obj_end, opts):
"""Decode a single key, value pair."""
element_type = data[position:position + 1]
position += 1
element_name, position = _get_c_string(data, position)
element_name, position = _get_c_string(data, position, opts)
value, position = _ELEMENT_GETTER[element_type](data,
position, obj_end, opts)
return element_name, value, position

View File

@ -118,10 +118,12 @@ _downcast_and_check(Py_ssize_t size, int extra) {
*/
int convert_codec_options(PyObject* options_obj, void* p) {
codec_options_t* options = (codec_options_t*)p;
if (!PyArg_ParseTuple(options_obj, "Obb",
options->unicode_decode_error_handler = NULL;
if (!PyArg_ParseTuple(options_obj, "Obbz",
&options->document_class,
&options->tz_aware,
&options->uuid_rep)) {
&options->uuid_rep,
&options->unicode_decode_error_handler)) {
return 0;
}
@ -137,6 +139,7 @@ void default_codec_options(codec_options_t* options) {
// TODO: set to "1". PYTHON-526, setting tz_aware=True by default.
options->tz_aware = 0;
options->uuid_rep = PYTHON_LEGACY;
options->unicode_decode_error_handler = NULL;
}
void destroy_codec_options(codec_options_t* options) {
@ -1560,7 +1563,9 @@ static PyObject* get_value(PyObject* self, const char* buffer,
if (buffer[*position + value_length - 1]) {
goto invalid;
}
value = PyUnicode_DecodeUTF8(buffer + *position, value_length - 1, "strict");
value = PyUnicode_DecodeUTF8(
buffer + *position, value_length - 1,
options->unicode_decode_error_handler);
if (!value) {
goto invalid;
}
@ -1916,7 +1921,9 @@ static PyObject* get_value(PyObject* self, const char* buffer,
if (pattern_length > BSON_MAX_SIZE || max < pattern_length) {
goto invalid;
}
pattern = PyUnicode_DecodeUTF8(buffer + *position, pattern_length, "strict");
pattern = PyUnicode_DecodeUTF8(
buffer + *position, pattern_length,
options->unicode_decode_error_handler);
if (!pattern) {
goto invalid;
}
@ -1980,8 +1987,9 @@ static PyObject* get_value(PyObject* self, const char* buffer,
goto invalid;
}
collection = PyUnicode_DecodeUTF8(buffer + *position,
coll_length - 1, "strict");
collection = PyUnicode_DecodeUTF8(
buffer + *position, coll_length - 1,
options->unicode_decode_error_handler);
if (!collection) {
goto invalid;
}
@ -2026,7 +2034,9 @@ static PyObject* get_value(PyObject* self, const char* buffer,
if (buffer[*position + value_length - 1]) {
goto invalid;
}
code = PyUnicode_DecodeUTF8(buffer + *position, value_length - 1, "strict");
code = PyUnicode_DecodeUTF8(
buffer + *position, value_length - 1,
options->unicode_decode_error_handler);
if (!code) {
goto invalid;
}
@ -2068,7 +2078,9 @@ static PyObject* get_value(PyObject* self, const char* buffer,
if (buffer[*position + code_size - 1]) {
goto invalid;
}
code = PyUnicode_DecodeUTF8(buffer + *position, code_size - 1, "strict");
code = PyUnicode_DecodeUTF8(
buffer + *position, code_size - 1,
options->unicode_decode_error_handler);
if (!code) {
goto invalid;
}
@ -2261,8 +2273,29 @@ static PyObject* _elements_to_dict(PyObject* self, const char* string,
Py_DECREF(dict);
return NULL;
}
name = PyUnicode_DecodeUTF8(string + position, name_length, "strict");
name = PyUnicode_DecodeUTF8(
string + position, name_length,
options->unicode_decode_error_handler);
if (!name) {
/* If NULL is returned then wrap the UnicodeDecodeError
in an InvalidBSON error */
PyObject *etype, *evalue, *etrace;
PyObject *InvalidBSON;
PyErr_Fetch(&etype, &evalue, &etrace);
InvalidBSON = _error("InvalidBSON");
if (InvalidBSON) {
Py_DECREF(etype);
etype = InvalidBSON;
if (evalue) {
PyObject *msg = PyObject_Str(evalue);
Py_DECREF(evalue);
evalue = msg;
}
PyErr_NormalizeException(&etype, &evalue, &etrace);
}
PyErr_Restore(etype, evalue, etrace);
Py_DECREF(dict);
return NULL;
}

View File

@ -56,6 +56,7 @@ typedef struct codec_options_t {
PyObject* document_class;
unsigned char tz_aware;
unsigned char uuid_rep;
char* unicode_decode_error_handler;
} codec_options_t;
/* C API functions */

View File

@ -16,13 +16,15 @@
from collections import MutableMapping, namedtuple
from bson.py3compat import string_type
from bson.binary import (ALL_UUID_REPRESENTATIONS,
PYTHON_LEGACY,
UUID_REPRESENTATION_NAMES)
_options_base = namedtuple(
'CodecOptions', ('document_class', 'tz_aware', 'uuid_representation'))
'CodecOptions', ('document_class', 'tz_aware', 'uuid_representation',
'unicode_decode_error_handler'))
class CodecOptions(_options_base):
@ -38,10 +40,21 @@ class CodecOptions(_options_base):
- `uuid_representation`: The BSON representation to use when encoding
and decoding instances of :class:`~uuid.UUID`. Defaults to
:data:`~bson.binary.PYTHON_LEGACY`.
- `unicode_decode_error_handler`: The error handler to use when decoding
an invalid BSON string. Valid options include 'strict', 'replace', and
'ignore'. Defaults to 'strict'.
.. warning:: Care must be taken when changing
`unicode_decode_error_handler` from its default value ('strict').
The 'replace' and 'ignore' modes should not be used when documents
retrieved from the server will be modified in the client application
and stored back to the server.
"""
def __new__(cls, document_class=dict,
tz_aware=False, uuid_representation=PYTHON_LEGACY):
tz_aware=False, uuid_representation=PYTHON_LEGACY,
unicode_decode_error_handler="strict"):
if not issubclass(document_class, MutableMapping):
raise TypeError("document_class must be dict, bson.son.SON, or "
"another subclass of collections.MutableMapping")
@ -50,9 +63,12 @@ class CodecOptions(_options_base):
if uuid_representation not in ALL_UUID_REPRESENTATIONS:
raise ValueError("uuid_representation must be a value "
"from bson.binary.ALL_UUID_REPRESENTATIONS")
if not isinstance(unicode_decode_error_handler, (string_type, None)):
raise ValueError("unicode_decode_error_handler must be a string "
"or None")
return tuple.__new__(
cls, (document_class, tz_aware, uuid_representation))
cls, (document_class, tz_aware, uuid_representation,
unicode_decode_error_handler))
def __repr__(self):
document_class_repr = (
@ -64,7 +80,9 @@ class CodecOptions(_options_base):
return (
'CodecOptions(document_class=%s, tz_aware=%r, uuid_representation='
'%s)' % (document_class_repr, self.tz_aware, uuid_rep_repr))
'%s, unicode_decode_error_handler=%r)' %
(document_class_repr, self.tz_aware, uuid_rep_repr,
self.unicode_decode_error_handler))
DEFAULT_CODEC_OPTIONS = CodecOptions()
@ -78,4 +96,6 @@ def _parse_codec_options(options):
tz_aware=options.get(
'tz_aware', DEFAULT_CODEC_OPTIONS.tz_aware),
uuid_representation=options.get(
'uuidrepresentation', DEFAULT_CODEC_OPTIONS.uuid_representation))
'uuidrepresentation', DEFAULT_CODEC_OPTIONS.uuid_representation),
unicode_decode_error_handler=options.get(
'unicode_decode_error_handler', "strict"))

View File

@ -785,8 +785,9 @@ class TestCodecOptions(unittest.TestCase):
self.assertRaises(ValueError, CodecOptions, uuid_representation=2)
def test_codec_options_repr(self):
r = ('CodecOptions(document_class=dict, tz_aware=False, '
'uuid_representation=PYTHON_LEGACY)')
r = ("CodecOptions(document_class=dict, tz_aware=False, "
"uuid_representation=PYTHON_LEGACY, "
"unicode_decode_error_handler='strict')")
self.assertEqual(r, repr(CodecOptions()))
def test_decode_all_defaults(self):
@ -803,6 +804,71 @@ class TestCodecOptions(unittest.TestCase):
self.assertEqual(decoded['uuid'], doc['uuid'])
self.assertIsNone(decoded['dt'].tzinfo)
def test_unicode_decode_error_handler(self):
enc = BSON.encode({"keystr": "foobar"})
# Test handling of bad key value.
invalid_key = BSON(enc[:7] + b'\xe9' + enc[8:])
replaced_key = b'ke\xef\xbf\xbdstr'.decode('utf-8')
dec = BSON.decode(invalid_key, CodecOptions(
unicode_decode_error_handler="replace"))
self.assertEqual(dec, {replaced_key: u("foobar")})
dec = BSON.decode(invalid_key, CodecOptions(
unicode_decode_error_handler="ignore"))
self.assertEqual(dec, {"kestr": "foobar"})
self.assertRaises(InvalidBSON, BSON.decode, invalid_key, CodecOptions(
unicode_decode_error_handler="strict"))
self.assertRaises(InvalidBSON, BSON.decode, invalid_key,
CodecOptions())
self.assertRaises(InvalidBSON, BSON.decode, invalid_key)
# Test handing of bad string value.
invalid_val = BSON(enc[:18] + b'\xe9' + enc[19:])
replaced_val = b'fo\xef\xbf\xbdbar'.decode('utf-8')
dec = BSON.decode(invalid_val, CodecOptions(
unicode_decode_error_handler="replace"))
self.assertEqual(dec, {"keystr": replaced_val})
dec = BSON.decode(invalid_val, CodecOptions(
unicode_decode_error_handler="ignore"))
self.assertEqual(dec, {"keystr": "fobar"})
self.assertRaises(InvalidBSON, BSON.decode, invalid_val, CodecOptions(
unicode_decode_error_handler="strict"))
self.assertRaises(InvalidBSON, BSON.decode, invalid_val,
CodecOptions())
self.assertRaises(InvalidBSON, BSON.decode, invalid_val)
# Test handing bad key + bad value.
invalid_both = BSON(
enc[:7] + b'\xe9' + enc[8:18] + b'\xe9' + enc[19:])
dec = BSON.decode(invalid_both, CodecOptions(
unicode_decode_error_handler="replace"))
self.assertEqual(dec, {replaced_key: replaced_val})
dec = BSON.decode(invalid_both, CodecOptions(
unicode_decode_error_handler="ignore"))
self.assertEqual(dec, {"kestr": "fobar"})
self.assertRaises(InvalidBSON, BSON.decode, invalid_both, CodecOptions(
unicode_decode_error_handler="strict"))
self.assertRaises(InvalidBSON, BSON.decode, invalid_both,
CodecOptions())
self.assertRaises(InvalidBSON, BSON.decode, invalid_both)
# Test handling bad error mode.
dec = BSON.decode(enc, CodecOptions(
unicode_decode_error_handler="junk"))
self.assertEqual(dec, {"keystr": "foobar"})
self.assertRaises(InvalidBSON, BSON.decode, invalid_both,
CodecOptions(unicode_decode_error_handler="junk"))
if __name__ == "__main__":
unittest.main()