From 39308c4b2f0d518ebbd20f94383131adcf788abc Mon Sep 17 00:00:00 2001 From: aherlihy Date: Thu, 9 Jul 2015 15:08:02 -0400 Subject: [PATCH] PYTHON-721 - Add unicode_decode_error_handler to CodecOptions. --- bson/__init__.py | 18 ++++++----- bson/_cbsonmodule.c | 51 +++++++++++++++++++++++++------ bson/_cbsonmodule.h | 1 + bson/codec_options.py | 32 ++++++++++++++++---- test/test_bson.py | 70 +++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 147 insertions(+), 25 deletions(-) diff --git a/bson/__init__.py b/bson/__init__.py index 77bae030a..e3b00b109 100644 --- a/bson/__init__.py +++ b/bson/__init__.py @@ -98,10 +98,11 @@ def _get_int(data, position, dummy0, dummy1): return _UNPACK_INT(data[position:end])[0], end -def _get_c_string(data, position): +def _get_c_string(data, position, opts): """Decode a BSON 'C' string to python unicode string.""" end = data.index(b"\x00", position) - return _utf_8_decode(data[position:end], None, True)[0], end + 1 + return _utf_8_decode(data[position:end], + opts.unicode_decode_error_handler, True)[0], end + 1 def _get_float(data, position, dummy0, dummy1): @@ -110,7 +111,7 @@ def _get_float(data, position, dummy0, dummy1): return _UNPACK_FLOAT(data[position:end])[0], end -def _get_string(data, position, obj_end, dummy): +def _get_string(data, position, obj_end, opts): """Decode a BSON string to python unicode string.""" length = _UNPACK_INT(data[position:position + 4])[0] position += 4 @@ -119,7 +120,8 @@ def _get_string(data, position, obj_end, dummy): end = position + length - 1 if data[end:end + 1] != b"\x00": raise InvalidBSON("invalid end of string") - return _utf_8_decode(data[position:end], None, True)[0], end + 1 + return _utf_8_decode(data[position:end], + opts.unicode_decode_error_handler, True)[0], end + 1 def _get_object(data, position, obj_end, opts): @@ -235,10 +237,10 @@ def _get_code_w_scope(data, position, obj_end, opts): return Code(code, scope), position -def _get_regex(data, position, dummy0, dummy1): +def _get_regex(data, position, dummy0, opts): """Decode a BSON regex to bson.regex.Regex or a python pattern object.""" - pattern, position = _get_c_string(data, position) - bson_flags, position = _get_c_string(data, position) + pattern, position = _get_c_string(data, position, opts) + bson_flags, position = _get_c_string(data, position, opts) bson_re = Regex(pattern, bson_flags) return bson_re, position @@ -295,7 +297,7 @@ def _element_to_dict(data, position, obj_end, opts): """Decode a single key, value pair.""" element_type = data[position:position + 1] position += 1 - element_name, position = _get_c_string(data, position) + element_name, position = _get_c_string(data, position, opts) value, position = _ELEMENT_GETTER[element_type](data, position, obj_end, opts) return element_name, value, position diff --git a/bson/_cbsonmodule.c b/bson/_cbsonmodule.c index f00c6f3c5..10460916c 100644 --- a/bson/_cbsonmodule.c +++ b/bson/_cbsonmodule.c @@ -118,10 +118,12 @@ _downcast_and_check(Py_ssize_t size, int extra) { */ int convert_codec_options(PyObject* options_obj, void* p) { codec_options_t* options = (codec_options_t*)p; - if (!PyArg_ParseTuple(options_obj, "Obb", + options->unicode_decode_error_handler = NULL; + if (!PyArg_ParseTuple(options_obj, "Obbz", &options->document_class, &options->tz_aware, - &options->uuid_rep)) { + &options->uuid_rep, + &options->unicode_decode_error_handler)) { return 0; } @@ -137,6 +139,7 @@ void default_codec_options(codec_options_t* options) { // TODO: set to "1". PYTHON-526, setting tz_aware=True by default. options->tz_aware = 0; options->uuid_rep = PYTHON_LEGACY; + options->unicode_decode_error_handler = NULL; } void destroy_codec_options(codec_options_t* options) { @@ -1560,7 +1563,9 @@ static PyObject* get_value(PyObject* self, const char* buffer, if (buffer[*position + value_length - 1]) { goto invalid; } - value = PyUnicode_DecodeUTF8(buffer + *position, value_length - 1, "strict"); + value = PyUnicode_DecodeUTF8( + buffer + *position, value_length - 1, + options->unicode_decode_error_handler); if (!value) { goto invalid; } @@ -1916,7 +1921,9 @@ static PyObject* get_value(PyObject* self, const char* buffer, if (pattern_length > BSON_MAX_SIZE || max < pattern_length) { goto invalid; } - pattern = PyUnicode_DecodeUTF8(buffer + *position, pattern_length, "strict"); + pattern = PyUnicode_DecodeUTF8( + buffer + *position, pattern_length, + options->unicode_decode_error_handler); if (!pattern) { goto invalid; } @@ -1980,8 +1987,9 @@ static PyObject* get_value(PyObject* self, const char* buffer, goto invalid; } - collection = PyUnicode_DecodeUTF8(buffer + *position, - coll_length - 1, "strict"); + collection = PyUnicode_DecodeUTF8( + buffer + *position, coll_length - 1, + options->unicode_decode_error_handler); if (!collection) { goto invalid; } @@ -2026,7 +2034,9 @@ static PyObject* get_value(PyObject* self, const char* buffer, if (buffer[*position + value_length - 1]) { goto invalid; } - code = PyUnicode_DecodeUTF8(buffer + *position, value_length - 1, "strict"); + code = PyUnicode_DecodeUTF8( + buffer + *position, value_length - 1, + options->unicode_decode_error_handler); if (!code) { goto invalid; } @@ -2068,7 +2078,9 @@ static PyObject* get_value(PyObject* self, const char* buffer, if (buffer[*position + code_size - 1]) { goto invalid; } - code = PyUnicode_DecodeUTF8(buffer + *position, code_size - 1, "strict"); + code = PyUnicode_DecodeUTF8( + buffer + *position, code_size - 1, + options->unicode_decode_error_handler); if (!code) { goto invalid; } @@ -2261,8 +2273,29 @@ static PyObject* _elements_to_dict(PyObject* self, const char* string, Py_DECREF(dict); return NULL; } - name = PyUnicode_DecodeUTF8(string + position, name_length, "strict"); + name = PyUnicode_DecodeUTF8( + string + position, name_length, + options->unicode_decode_error_handler); if (!name) { + /* If NULL is returned then wrap the UnicodeDecodeError + in an InvalidBSON error */ + PyObject *etype, *evalue, *etrace; + PyObject *InvalidBSON; + + PyErr_Fetch(&etype, &evalue, &etrace); + InvalidBSON = _error("InvalidBSON"); + if (InvalidBSON) { + Py_DECREF(etype); + etype = InvalidBSON; + + if (evalue) { + PyObject *msg = PyObject_Str(evalue); + Py_DECREF(evalue); + evalue = msg; + } + PyErr_NormalizeException(&etype, &evalue, &etrace); + } + PyErr_Restore(etype, evalue, etrace); Py_DECREF(dict); return NULL; } diff --git a/bson/_cbsonmodule.h b/bson/_cbsonmodule.h index 080705e7b..fe0d89f99 100644 --- a/bson/_cbsonmodule.h +++ b/bson/_cbsonmodule.h @@ -56,6 +56,7 @@ typedef struct codec_options_t { PyObject* document_class; unsigned char tz_aware; unsigned char uuid_rep; + char* unicode_decode_error_handler; } codec_options_t; /* C API functions */ diff --git a/bson/codec_options.py b/bson/codec_options.py index 002f3f4aa..e055c6c58 100644 --- a/bson/codec_options.py +++ b/bson/codec_options.py @@ -16,13 +16,15 @@ from collections import MutableMapping, namedtuple +from bson.py3compat import string_type from bson.binary import (ALL_UUID_REPRESENTATIONS, PYTHON_LEGACY, UUID_REPRESENTATION_NAMES) _options_base = namedtuple( - 'CodecOptions', ('document_class', 'tz_aware', 'uuid_representation')) + 'CodecOptions', ('document_class', 'tz_aware', 'uuid_representation', + 'unicode_decode_error_handler')) class CodecOptions(_options_base): @@ -38,10 +40,21 @@ class CodecOptions(_options_base): - `uuid_representation`: The BSON representation to use when encoding and decoding instances of :class:`~uuid.UUID`. Defaults to :data:`~bson.binary.PYTHON_LEGACY`. + - `unicode_decode_error_handler`: The error handler to use when decoding + an invalid BSON string. Valid options include 'strict', 'replace', and + 'ignore'. Defaults to 'strict'. + + .. warning:: Care must be taken when changing + `unicode_decode_error_handler` from its default value ('strict'). + The 'replace' and 'ignore' modes should not be used when documents + retrieved from the server will be modified in the client application + and stored back to the server. + """ def __new__(cls, document_class=dict, - tz_aware=False, uuid_representation=PYTHON_LEGACY): + tz_aware=False, uuid_representation=PYTHON_LEGACY, + unicode_decode_error_handler="strict"): if not issubclass(document_class, MutableMapping): raise TypeError("document_class must be dict, bson.son.SON, or " "another subclass of collections.MutableMapping") @@ -50,9 +63,12 @@ class CodecOptions(_options_base): if uuid_representation not in ALL_UUID_REPRESENTATIONS: raise ValueError("uuid_representation must be a value " "from bson.binary.ALL_UUID_REPRESENTATIONS") - + if not isinstance(unicode_decode_error_handler, (string_type, None)): + raise ValueError("unicode_decode_error_handler must be a string " + "or None") return tuple.__new__( - cls, (document_class, tz_aware, uuid_representation)) + cls, (document_class, tz_aware, uuid_representation, + unicode_decode_error_handler)) def __repr__(self): document_class_repr = ( @@ -64,7 +80,9 @@ class CodecOptions(_options_base): return ( 'CodecOptions(document_class=%s, tz_aware=%r, uuid_representation=' - '%s)' % (document_class_repr, self.tz_aware, uuid_rep_repr)) + '%s, unicode_decode_error_handler=%r)' % + (document_class_repr, self.tz_aware, uuid_rep_repr, + self.unicode_decode_error_handler)) DEFAULT_CODEC_OPTIONS = CodecOptions() @@ -78,4 +96,6 @@ def _parse_codec_options(options): tz_aware=options.get( 'tz_aware', DEFAULT_CODEC_OPTIONS.tz_aware), uuid_representation=options.get( - 'uuidrepresentation', DEFAULT_CODEC_OPTIONS.uuid_representation)) + 'uuidrepresentation', DEFAULT_CODEC_OPTIONS.uuid_representation), + unicode_decode_error_handler=options.get( + 'unicode_decode_error_handler', "strict")) diff --git a/test/test_bson.py b/test/test_bson.py index b4b676c5c..7aee358a1 100644 --- a/test/test_bson.py +++ b/test/test_bson.py @@ -785,8 +785,9 @@ class TestCodecOptions(unittest.TestCase): self.assertRaises(ValueError, CodecOptions, uuid_representation=2) def test_codec_options_repr(self): - r = ('CodecOptions(document_class=dict, tz_aware=False, ' - 'uuid_representation=PYTHON_LEGACY)') + r = ("CodecOptions(document_class=dict, tz_aware=False, " + "uuid_representation=PYTHON_LEGACY, " + "unicode_decode_error_handler='strict')") self.assertEqual(r, repr(CodecOptions())) def test_decode_all_defaults(self): @@ -803,6 +804,71 @@ class TestCodecOptions(unittest.TestCase): self.assertEqual(decoded['uuid'], doc['uuid']) self.assertIsNone(decoded['dt'].tzinfo) + def test_unicode_decode_error_handler(self): + enc = BSON.encode({"keystr": "foobar"}) + + # Test handling of bad key value. + invalid_key = BSON(enc[:7] + b'\xe9' + enc[8:]) + replaced_key = b'ke\xef\xbf\xbdstr'.decode('utf-8') + + dec = BSON.decode(invalid_key, CodecOptions( + unicode_decode_error_handler="replace")) + self.assertEqual(dec, {replaced_key: u("foobar")}) + + dec = BSON.decode(invalid_key, CodecOptions( + unicode_decode_error_handler="ignore")) + self.assertEqual(dec, {"kestr": "foobar"}) + + self.assertRaises(InvalidBSON, BSON.decode, invalid_key, CodecOptions( + unicode_decode_error_handler="strict")) + self.assertRaises(InvalidBSON, BSON.decode, invalid_key, + CodecOptions()) + self.assertRaises(InvalidBSON, BSON.decode, invalid_key) + + # Test handing of bad string value. + invalid_val = BSON(enc[:18] + b'\xe9' + enc[19:]) + replaced_val = b'fo\xef\xbf\xbdbar'.decode('utf-8') + + dec = BSON.decode(invalid_val, CodecOptions( + unicode_decode_error_handler="replace")) + self.assertEqual(dec, {"keystr": replaced_val}) + + dec = BSON.decode(invalid_val, CodecOptions( + unicode_decode_error_handler="ignore")) + self.assertEqual(dec, {"keystr": "fobar"}) + + self.assertRaises(InvalidBSON, BSON.decode, invalid_val, CodecOptions( + unicode_decode_error_handler="strict")) + self.assertRaises(InvalidBSON, BSON.decode, invalid_val, + CodecOptions()) + self.assertRaises(InvalidBSON, BSON.decode, invalid_val) + + # Test handing bad key + bad value. + invalid_both = BSON( + enc[:7] + b'\xe9' + enc[8:18] + b'\xe9' + enc[19:]) + + dec = BSON.decode(invalid_both, CodecOptions( + unicode_decode_error_handler="replace")) + self.assertEqual(dec, {replaced_key: replaced_val}) + + dec = BSON.decode(invalid_both, CodecOptions( + unicode_decode_error_handler="ignore")) + self.assertEqual(dec, {"kestr": "fobar"}) + + self.assertRaises(InvalidBSON, BSON.decode, invalid_both, CodecOptions( + unicode_decode_error_handler="strict")) + self.assertRaises(InvalidBSON, BSON.decode, invalid_both, + CodecOptions()) + self.assertRaises(InvalidBSON, BSON.decode, invalid_both) + + # Test handling bad error mode. + dec = BSON.decode(enc, CodecOptions( + unicode_decode_error_handler="junk")) + self.assertEqual(dec, {"keystr": "foobar"}) + + self.assertRaises(InvalidBSON, BSON.decode, invalid_both, + CodecOptions(unicode_decode_error_handler="junk")) + if __name__ == "__main__": unittest.main()