diff --git a/bson/__init__.py b/bson/__init__.py index b4e6aecdc..1efb1f7ff 100644 --- a/bson/__init__.py +++ b/bson/__init__.py @@ -69,7 +69,7 @@ from collections import abc as _abc from bson.binary import (Binary, UuidRepresentation, ALL_UUID_SUBTYPES, OLD_UUID_SUBTYPE, - JAVA_LEGACY, CSHARP_LEGACY, + JAVA_LEGACY, CSHARP_LEGACY, STANDARD, UUID_SUBTYPE) from bson.code import Code from bson.codec_options import ( @@ -265,20 +265,14 @@ def _get_binary(data, view, position, obj_end, opts, dummy1): raise InvalidBSON('bad binary object length') # Convert UUID subtypes to native UUIDs. - # TODO: PYTHON-2245 Decoding should follow UUID spec in PyMongo 4.0+ if subtype in ALL_UUID_SUBTYPES: - uuid_representation = opts.uuid_representation + uuid_rep = opts.uuid_representation binary_value = Binary(data[position:end], subtype) - if uuid_representation == UuidRepresentation.UNSPECIFIED: + if ((uuid_rep == UuidRepresentation.UNSPECIFIED) or + (subtype == UUID_SUBTYPE and uuid_rep != STANDARD) or + (subtype == OLD_UUID_SUBTYPE and uuid_rep == STANDARD)): return binary_value, end - if subtype == UUID_SUBTYPE: - # Legacy behavior: use STANDARD with binary subtype 4. - uuid_representation = UuidRepresentation.STANDARD - elif uuid_representation == UuidRepresentation.STANDARD: - # subtype == OLD_UUID_SUBTYPE - # Legacy behavior: STANDARD is the same as PYTHON_LEGACY. - uuid_representation = UuidRepresentation.PYTHON_LEGACY - return binary_value.as_uuid(uuid_representation), end + return binary_value.as_uuid(uuid_rep), end # Decode subtype 0 to 'bytes'. 
if subtype == 0: diff --git a/bson/_cbsonmodule.c b/bson/_cbsonmodule.c index 67ee01c72..93610f7c5 100644 --- a/bson/_cbsonmodule.c +++ b/bson/_cbsonmodule.c @@ -1760,8 +1760,7 @@ static PyObject* get_value(PyObject* self, PyObject* name, const char* buffer, if (!data) { goto invalid; } - /* Encode as UUID or Binary based on options->uuid_rep - * TODO: PYTHON-2245 Decoding should follow UUID spec in PyMongo 4.0 */ + /* Encode as UUID or Binary based on options->uuid_rep */ if (subtype == 3 || subtype == 4) { PyObject* binary_type = NULL; PyObject* binary_value = NULL; @@ -1782,15 +1781,12 @@ static PyObject* get_value(PyObject* self, PyObject* name, const char* buffer, goto uuiderror; } - if (uuid_rep == UNSPECIFIED) { + if ((uuid_rep == UNSPECIFIED) || + (subtype == 4 && uuid_rep != STANDARD) || + (subtype == 3 && uuid_rep == STANDARD)) { value = binary_value; Py_INCREF(value); } else { - if (subtype == 4) { - uuid_rep = STANDARD; - } else if (uuid_rep == STANDARD) { - uuid_rep = PYTHON_LEGACY; - } value = PyObject_CallMethod(binary_value, "as_uuid", "(i)", uuid_rep); } diff --git a/bson/binary.py b/bson/binary.py index 39bc69c04..dd12f56e2 100644 --- a/bson/binary.py +++ b/bson/binary.py @@ -39,7 +39,10 @@ OLD_UUID_SUBTYPE = 3 """Old BSON binary subtype for a UUID. :class:`uuid.UUID` instances will automatically be encoded -by :mod:`bson` using this subtype. +by :mod:`bson` using this subtype when using +:data:`UuidRepresentation.PYTHON_LEGACY`, +:data:`UuidRepresentation.JAVA_LEGACY`, or +:data:`UuidRepresentation.CSHARP_LEGACY`. .. versionadded:: 2.1 """ @@ -47,8 +50,10 @@ by :mod:`bson` using this subtype. UUID_SUBTYPE = 4 """BSON binary subtype for a UUID. -This is the new BSON binary subtype for UUIDs. The -current default is :data:`OLD_UUID_SUBTYPE`. +This is the standard BSON binary subtype for UUIDs. +:class:`uuid.UUID` instances will automatically be encoded +by :mod:`bson` using this subtype when using +:data:`UuidRepresentation.STANDARD`. 
""" diff --git a/bson/codec_options.py b/bson/codec_options.py index 9ce772427..6fcffcc17 100644 --- a/bson/codec_options.py +++ b/bson/codec_options.py @@ -219,7 +219,7 @@ class CodecOptions(_options_base): naive. Defaults to ``False``. - `uuid_representation`: The BSON representation to use when encoding and decoding instances of :class:`~uuid.UUID`. Defaults to - :data:`~bson.binary.UuidRepresentation.PYTHON_LEGACY`. New + :data:`~bson.binary.UuidRepresentation.UNSPECIFIED`. New applications should consider setting this to :data:`~bson.binary.UuidRepresentation.STANDARD` for cross language compatibility. See :ref:`handling-uuid-data-example` for details. @@ -233,6 +233,11 @@ class CodecOptions(_options_base): - `type_registry`: Instance of :class:`TypeRegistry` used to customize encoding and decoding behavior. + .. versionchanged:: 4.0 + The default for `uuid_representation` was changed from + :const:`~bson.binary.UuidRepresentation.PYTHON_LEGACY` to + :const:`~bson.binary.UuidRepresentation.UNSPECIFIED`. + .. versionadded:: 3.8 `type_registry` attribute. 
@@ -245,7 +250,7 @@ class CodecOptions(_options_base): def __new__(cls, document_class=dict, tz_aware=False, - uuid_representation=None, + uuid_representation=UuidRepresentation.UNSPECIFIED, unicode_decode_error_handler="strict", tzinfo=None, type_registry=None): if not (issubclass(document_class, _MutableMapping) or @@ -255,9 +260,7 @@ class CodecOptions(_options_base): "sublass of collections.abc.MutableMapping") if not isinstance(tz_aware, bool): raise TypeError("tz_aware must be True or False") - if uuid_representation is None: - uuid_representation = UuidRepresentation.PYTHON_LEGACY - elif uuid_representation not in ALL_UUID_REPRESENTATIONS: + if uuid_representation not in ALL_UUID_REPRESENTATIONS: raise ValueError("uuid_representation must be a value " "from bson.binary.UuidRepresentation") if not isinstance(unicode_decode_error_handler, (str, None)): @@ -327,21 +330,18 @@ class CodecOptions(_options_base): return CodecOptions(**opts) -DEFAULT_CODEC_OPTIONS = CodecOptions( - uuid_representation=UuidRepresentation.PYTHON_LEGACY) +DEFAULT_CODEC_OPTIONS = CodecOptions() def _parse_codec_options(options): """Parse BSON codec options.""" - return CodecOptions( - document_class=options.get( - 'document_class', DEFAULT_CODEC_OPTIONS.document_class), - tz_aware=options.get( - 'tz_aware', DEFAULT_CODEC_OPTIONS.tz_aware), - uuid_representation=options.get('uuidrepresentation'), - unicode_decode_error_handler=options.get( - 'unicode_decode_error_handler', - DEFAULT_CODEC_OPTIONS.unicode_decode_error_handler), - tzinfo=options.get('tzinfo', DEFAULT_CODEC_OPTIONS.tzinfo), - type_registry=options.get( - 'type_registry', DEFAULT_CODEC_OPTIONS.type_registry)) + kwargs = {} + for k in set(options) & {'document_class', 'tz_aware', + 'uuidrepresentation', + 'unicode_decode_error_handler', 'tzinfo', + 'type_registry'}: + if k == 'uuidrepresentation': + kwargs['uuid_representation'] = options[k] + else: + kwargs[k] = options[k] + return CodecOptions(**kwargs) diff --git 
a/bson/json_util.py b/bson/json_util.py index fe0bfe069..557388a51 100644 --- a/bson/json_util.py +++ b/bson/json_util.py @@ -222,7 +222,7 @@ class JSONOptions(CodecOptions): :class:`collections.MutableMapping`. Defaults to :class:`dict`. - `uuid_representation`: The :class:`~bson.binary.UuidRepresentation` to use when encoding and decoding instances of :class:`uuid.UUID`. - Defaults to :const:`~bson.binary.UuidRepresentation.PYTHON_LEGACY`. + Defaults to :const:`~bson.binary.UuidRepresentation.UNSPECIFIED`. - `tz_aware`: If ``True``, MongoDB Extended JSON's *Strict mode* type `Date` will be decoded to timezone aware instances of :class:`datetime.datetime`. Otherwise they will be naive. Defaults @@ -238,6 +238,9 @@ class JSONOptions(CodecOptions): .. versionchanged:: 4.0 The default for `json_mode` was changed from :const:`JSONMode.LEGACY` to :const:`JSONMode.RELAXED`. + The default for `uuid_representation` was changed from + :const:`~bson.binary.UuidRepresentation.PYTHON_LEGACY` to + :const:`~bson.binary.UuidRepresentation.UNSPECIFIED`. .. versionchanged:: 3.5 Accepts the optional parameter `json_mode`. diff --git a/doc/changelog.rst b/doc/changelog.rst index b7a082443..6adff2e71 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -17,6 +17,13 @@ Breaking Changes in 4.0 ....................... - Removed support for Python 2.7, 3.4, and 3.5. Python 3.6+ is now required. +- The default uuid_representation for :class:`~bson.codec_options.CodecOptions`, + :class:`~bson.json_util.JSONOptions`, and + :class:`~pymongo.mongo_client.MongoClient` has been changed from + :data:`bson.binary.UuidRepresentation.PYTHON_LEGACY` to + :data:`bson.binary.UuidRepresentation.UNSPECIFIED`. Attempting to encode a + :class:`uuid.UUID` instance to BSON or JSON now produces an error by default. + See :ref:`handling-uuid-data-example` for details. 
- Removed the ``waitQueueMultiple`` keyword argument to :class:`~pymongo.mongo_client.MongoClient` and removed :exc:`pymongo.errors.ExceededMaxWaiters`. diff --git a/doc/examples/uuid.rst b/doc/examples/uuid.rst index 4d55fbeae..d4a77d403 100644 --- a/doc/examples/uuid.rst +++ b/doc/examples/uuid.rst @@ -147,16 +147,15 @@ Consider the following situation:: collection.insert_one({'_id': 'foo', 'uuid': input_uuid}) assert collection.find_one({'uuid': Binary(input_uuid.bytes, 3)})['_id'] == 'foo' - # Retrieving this document using UuidRepresentation.STANDARD returns a native UUID + # Retrieving this document using UuidRepresentation.STANDARD returns a Binary instance std_opts = CodecOptions(uuid_representation=UuidRepresentation.STANDARD) std_collection = client.testdb.get_collection('test', codec_options=std_opts) doc = std_collection.find_one({'_id': 'foo'}) - assert doc['uuid'] == input_uuid + assert isinstance(doc['uuid'], Binary) - # Round-tripping the retrieved document silently changes the Binary subtype to 4 + # Round-tripping the retrieved document yields the exact same document std_collection.replace_one({'_id': 'foo'}, doc) - assert collection.find_one({'uuid': Binary(input_uuid.bytes, 3)}) is None - round_tripped_doc = collection.find_one({'uuid': Binary(input_uuid.bytes, 4)}) + round_tripped_doc = collection.find_one({'uuid': Binary(input_uuid.bytes, 3)}) assert doc == round_tripped_doc @@ -230,7 +229,7 @@ Applications can set the UUID representation in one of the following ways: #. 
At the ``MongoClient`` level using the ``uuidRepresentation`` URI option, e.g.:: - client = MongoClient("mongodb://a:27107/?uuidRepresentation=javaLegacy") + client = MongoClient("mongodb://a:27107/?uuidRepresentation=standard") Valid values are: @@ -240,6 +239,12 @@ Applications can set the UUID representation in one of the following ways: * - Value - UUID Representation + * - ``unspecified`` + - :ref:`unspecified-representation-details` + + * - ``standard`` + - :ref:`standard-representation-details` + * - ``pythonLegacy`` - :ref:`python-legacy-representation-details` @@ -249,17 +254,11 @@ Applications can set the UUID representation in one of the following ways: * - ``csharpLegacy`` - :ref:`csharp-legacy-representation-details` - * - ``standard`` - - :ref:`standard-representation-details` - - * - ``unspecified`` - - :ref:`unspecified-representation-details` - #. At the ``MongoClient`` level using the ``uuidRepresentation`` kwarg option, e.g.:: from bson.binary import UuidRepresentation - client = MongoClient(uuidRepresentation=UuidRepresentation.PYTHON_LEGACY) + client = MongoClient(uuidRepresentation=UuidRepresentation.STANDARD) #. 
At the ``Database`` or ``Collection`` level by supplying a suitable :class:`~bson.codec_options.CodecOptions` instance, e.g.:: @@ -288,186 +287,51 @@ Supported UUID Representations - Decode :class:`~bson.binary.Binary` subtype 4 to - Decode :class:`~bson.binary.Binary` subtype 3 to - * - :ref:`python-legacy-representation-details` - - Yes, in PyMongo>=2.9,<4 - - :class:`~bson.binary.Binary` subtype 3 with standard byte-order - - :class:`uuid.UUID` in PyMongo<4; :class:`~bson.binary.Binary` subtype 4 in PyMongo>=4 - - :class:`uuid.UUID` - - * - :ref:`java-legacy-representation-details` - - No - - :class:`~bson.binary.Binary` subtype 3 with Java legacy byte-order - - :class:`uuid.UUID` in PyMongo<4; :class:`~bson.binary.Binary` subtype 4 in PyMongo>=4 - - :class:`uuid.UUID` - - * - :ref:`csharp-legacy-representation-details` - - No - - :class:`~bson.binary.Binary` subtype 3 with C# legacy byte-order - - :class:`uuid.UUID` in PyMongo<4; :class:`~bson.binary.Binary` subtype 4 in PyMongo>=4 - - :class:`uuid.UUID` - * - :ref:`standard-representation-details` - No - :class:`~bson.binary.Binary` subtype 4 - :class:`uuid.UUID` - - :class:`uuid.UUID` in PyMongo<4; :class:`~bson.binary.Binary` subtype 3 in PyMongo>=4 + - :class:`~bson.binary.Binary` subtype 3 * - :ref:`unspecified-representation-details` - Yes, in PyMongo>=4 - Raise :exc:`ValueError` - :class:`~bson.binary.Binary` subtype 4 - - :class:`uuid.UUID` in PyMongo<4; :class:`~bson.binary.Binary` subtype 3 in PyMongo>=4 + - :class:`~bson.binary.Binary` subtype 3 + + * - :ref:`python-legacy-representation-details` + - No + - :class:`~bson.binary.Binary` subtype 3 with standard byte-order + - :class:`~bson.binary.Binary` subtype 4 + - :class:`uuid.UUID` + + * - :ref:`java-legacy-representation-details` + - No + - :class:`~bson.binary.Binary` subtype 3 with Java legacy byte-order + - :class:`~bson.binary.Binary` subtype 4 + - :class:`uuid.UUID` + + * - :ref:`csharp-legacy-representation-details` + - No + - 
:class:`~bson.binary.Binary` subtype 3 with C# legacy byte-order + - :class:`~bson.binary.Binary` subtype 4 + - :class:`uuid.UUID` We now detail the behavior and use-case for each supported UUID representation. -.. _python-legacy-representation-details: - -``PYTHON_LEGACY`` -^^^^^^^^^^^^^^^^^ - -.. attention:: This uuid representation should be used when reading UUIDs - generated by existing applications that use the Python driver - but **don't** explicitly set a UUID representation. - -.. attention:: :data:`~bson.binary.UuidRepresentation.PYTHON_LEGACY` - has been the default uuid representation since PyMongo 2.9. - -The :data:`~bson.binary.UuidRepresentation.PYTHON_LEGACY` representation -corresponds to the legacy representation of UUIDs used by PyMongo. This -representation conforms with -`RFC 4122 Section 4.1.2 `_. - -The following example illustrates the use of this representation:: - - from bson.codec_options import CodecOptions, DEFAULT_CODEC_OPTIONS - from bson.binary import UuidRepresentation - - # No configured UUID representation - collection = client.python_legacy.get_collection('test', codec_options=DEFAULT_CODEC_OPTIONS) - - # Using UuidRepresentation.PYTHON_LEGACY - pylegacy_opts = CodecOptions(uuid_representation=UuidRepresentation.PYTHON_LEGACY) - pylegacy_collection = client.python_legacy.get_collection('test', codec_options=pylegacy_opts) - - # UUIDs written by PyMongo with no UuidRepresentation configured can be queried using PYTHON_LEGACY - uuid_1 = uuid4() - collection.insert_one({'uuid': uuid_1}) - document = pylegacy_collection.find_one({'uuid': uuid_1}) - - # UUIDs written using PYTHON_LEGACY can be read by PyMongo with no UuidRepresentation configured - uuid_2 = uuid4() - pylegacy_collection.insert_one({'uuid': uuid_2}) - document = collection.find_one({'uuid': uuid_2}) - -``PYTHON_LEGACY`` encodes native :class:`uuid.UUID` objects to -:class:`~bson.binary.Binary` subtype 3 objects, preserving the same -byte-order as 
:attr:`~uuid.UUID.bytes`:: - - from bson.binary import Binary - - document = collection.find_one({'uuid': Binary(uuid_2.bytes, subtype=3)}) - assert document['uuid'] == uuid_2 - -.. _java-legacy-representation-details: - -``JAVA_LEGACY`` -^^^^^^^^^^^^^^^ - -.. attention:: This UUID representation should be used when reading UUIDs - written to MongoDB by the legacy applications (i.e. applications that don't - use the ``STANDARD`` representation) using the Java driver. - -The :data:`~bson.binary.UuidRepresentation.JAVA_LEGACY` representation -corresponds to the legacy representation of UUIDs used by the MongoDB Java -Driver. - -.. note:: The ``JAVA_LEGACY`` representation reverses the order of bytes 0-7, - and bytes 8-15. - -As an example, consider the same UUID described in :ref:`example-legacy-uuid`. -Let us assume that an application used the Java driver without an explicitly -specified UUID representation to insert the example UUID -``00112233-4455-6677-8899-aabbccddeeff`` into MongoDB. If we try to read this -value using PyMongo with no UUID representation specified, we end up with an -entirely different UUID:: - - UUID('77665544-3322-1100-ffee-ddccbbaa9988') - -However, if we explicitly set the representation to -:data:`~bson.binary.UuidRepresentation.JAVA_LEGACY`, we get the correct result:: - - UUID('00112233-4455-6677-8899-aabbccddeeff') - -PyMongo uses the specified UUID representation to reorder the BSON bytes and -load them correctly. ``JAVA_LEGACY`` encodes native :class:`uuid.UUID` objects -to :class:`~bson.binary.Binary` subtype 3 objects, while performing the same -byte-reordering as the legacy Java driver's UUID to BSON encoder. - -.. _csharp-legacy-representation-details: - -``CSHARP_LEGACY`` -^^^^^^^^^^^^^^^^^ - -.. attention:: This UUID representation should be used when reading UUIDs - written to MongoDB by the legacy applications (i.e. applications that don't - use the ``STANDARD`` representation) using the C# driver. 
- -The :data:`~bson.binary.UuidRepresentation.CSHARP_LEGACY` representation -corresponds to the legacy representation of UUIDs used by the MongoDB Java -Driver. - -.. note:: The ``CSHARP_LEGACY`` representation reverses the order of bytes 0-3, - bytes 4-5, and bytes 6-7. - -As an example, consider the same UUID described in :ref:`example-legacy-uuid`. -Let us assume that an application used the C# driver without an explicitly -specified UUID representation to insert the example UUID -``00112233-4455-6677-8899-aabbccddeeff`` into MongoDB. If we try to read this -value using PyMongo with no UUID representation specified, we end up with an -entirely different UUID:: - - UUID('33221100-5544-7766-8899-aabbccddeeff') - -However, if we explicitly set the representation to -:data:`~bson.binary.UuidRepresentation.CSHARP_LEGACY`, we get the correct result:: - - UUID('00112233-4455-6677-8899-aabbccddeeff') - -PyMongo uses the specified UUID representation to reorder the BSON bytes and -load them correctly. ``CSHARP_LEGACY`` encodes native :class:`uuid.UUID` -objects to :class:`~bson.binary.Binary` subtype 3 objects, while performing -the same byte-reordering as the legacy C# driver's UUID to BSON encoder. - -.. _standard-representation-details: - -``STANDARD`` -^^^^^^^^^^^^ - -.. attention:: This UUID representation should be used by new applications - that have never stored UUIDs in MongoDB. - -The :data:`~bson.binary.UuidRepresentation.STANDARD` representation -enables cross-language compatibility by ensuring the same byte-ordering -when encoding UUIDs from all drivers. UUIDs written by a driver with this -representation configured will be handled correctly by every other provided -it is also configured with the ``STANDARD`` representation. - -``STANDARD`` encodes native :class:`uuid.UUID` objects to -:class:`~bson.binary.Binary` subtype 4 objects. - .. _unspecified-representation-details: ``UNSPECIFIED`` ^^^^^^^^^^^^^^^ .. 
attention:: Starting in PyMongo 4.0, - :data:`~bson.binary.UuidRepresentation.UNSPECIFIED` will be the default + :data:`~bson.binary.UuidRepresentation.UNSPECIFIED` is the default UUID representation used by PyMongo. The :data:`~bson.binary.UuidRepresentation.UNSPECIFIED` representation prevents the incorrect interpretation of UUID bytes by stopping short of -automatically converting UUID fields in BSON to native UUID types. Loading +automatically converting UUID fields in BSON to native UUID types. Decoding a UUID when using this representation returns a :class:`~bson.binary.Binary` object instead. If required, users can coerce the decoded :class:`~bson.binary.Binary` objects into native UUIDs using the @@ -513,5 +377,135 @@ Instead, applications using :data:`~bson.binary.UuidRepresentation.UNSPECIFIED` must explicitly coerce a native UUID using the :meth:`~bson.binary.Binary.from_uuid` method:: - explicit_binary = Binary.from_uuid(uuid4(), UuidRepresentation.PYTHON_LEGACY) + explicit_binary = Binary.from_uuid(uuid4(), UuidRepresentation.STANDARD) unspec_collection.insert_one({'_id': 'bar', 'uuid': explicit_binary}) + +.. _standard-representation-details: + +``STANDARD`` +^^^^^^^^^^^^ + +.. attention:: This UUID representation should be used by new applications or + applications that are encoding and/or decoding UUIDs in MongoDB for the + first time. + +The :data:`~bson.binary.UuidRepresentation.STANDARD` representation +enables cross-language compatibility by ensuring the same byte-ordering +when encoding UUIDs from all drivers. UUIDs written by a driver with this +representation configured will be handled correctly by every other provided +it is also configured with the ``STANDARD`` representation. + +``STANDARD`` encodes native :class:`uuid.UUID` objects to +:class:`~bson.binary.Binary` subtype 4 objects. + +.. _python-legacy-representation-details: + +``PYTHON_LEGACY`` +^^^^^^^^^^^^^^^^^ + +.. 
attention:: This uuid representation should be used when reading UUIDs + generated by existing applications that use the Python driver + but **don't** explicitly set a UUID representation. + +.. attention:: :data:`~bson.binary.UuidRepresentation.PYTHON_LEGACY` + was the default uuid representation in PyMongo 3. + +The :data:`~bson.binary.UuidRepresentation.PYTHON_LEGACY` representation +corresponds to the legacy representation of UUIDs used by PyMongo. This +representation conforms with +`RFC 4122 Section 4.1.2 <https://tools.ietf.org/html/rfc4122#section-4.1.2>`_. + +The following example illustrates the use of this representation:: + + from bson.codec_options import CodecOptions, DEFAULT_CODEC_OPTIONS + from bson.binary import Binary, UuidRepresentation + + # No configured UUID representation + collection = client.python_legacy.get_collection('test', codec_options=DEFAULT_CODEC_OPTIONS) + + # Using UuidRepresentation.PYTHON_LEGACY + pylegacy_opts = CodecOptions(uuid_representation=UuidRepresentation.PYTHON_LEGACY) + pylegacy_collection = client.python_legacy.get_collection('test', codec_options=pylegacy_opts) + + # UUIDs written by PyMongo 3 with no UuidRepresentation configured + # (or PyMongo 4.0 with PYTHON_LEGACY) can be queried using PYTHON_LEGACY + uuid_1 = uuid4() + pylegacy_collection.insert_one({'uuid': uuid_1}) + document = pylegacy_collection.find_one({'uuid': uuid_1}) + +``PYTHON_LEGACY`` encodes native :class:`uuid.UUID` objects to +:class:`~bson.binary.Binary` subtype 3 objects, preserving the same +byte-order as :attr:`~uuid.UUID.bytes`:: + + from bson.binary import Binary + + document = pylegacy_collection.find_one({'uuid': Binary(uuid_1.bytes, subtype=3)}) + assert document['uuid'] == uuid_1 + +.. _java-legacy-representation-details: + +``JAVA_LEGACY`` +^^^^^^^^^^^^^^^ + +.. attention:: This UUID representation should be used when reading UUIDs + written to MongoDB by the legacy applications (i.e. applications that don't + use the ``STANDARD`` representation) using the Java driver. 
+ +The :data:`~bson.binary.UuidRepresentation.JAVA_LEGACY` representation +corresponds to the legacy representation of UUIDs used by the MongoDB Java +Driver. + +.. note:: The ``JAVA_LEGACY`` representation reverses the order of bytes 0-7, + and bytes 8-15. + +As an example, consider the same UUID described in :ref:`example-legacy-uuid`. +Let us assume that an application used the Java driver without an explicitly +specified UUID representation to insert the example UUID +``00112233-4455-6677-8899-aabbccddeeff`` into MongoDB. If we try to read this +value using ``PYTHON_LEGACY``, we end up with an entirely different UUID:: + + UUID('77665544-3322-1100-ffee-ddccbbaa9988') + +However, if we explicitly set the representation to +:data:`~bson.binary.UuidRepresentation.JAVA_LEGACY`, we get the correct result:: + + UUID('00112233-4455-6677-8899-aabbccddeeff') + +PyMongo uses the specified UUID representation to reorder the BSON bytes and +load them correctly. ``JAVA_LEGACY`` encodes native :class:`uuid.UUID` objects +to :class:`~bson.binary.Binary` subtype 3 objects, while performing the same +byte-reordering as the legacy Java driver's UUID to BSON encoder. + +.. _csharp-legacy-representation-details: + +``CSHARP_LEGACY`` +^^^^^^^^^^^^^^^^^ + +.. attention:: This UUID representation should be used when reading UUIDs + written to MongoDB by the legacy applications (i.e. applications that don't + use the ``STANDARD`` representation) using the C# driver. + +The :data:`~bson.binary.UuidRepresentation.CSHARP_LEGACY` representation +corresponds to the legacy representation of UUIDs used by the MongoDB C# +Driver. + +.. note:: The ``CSHARP_LEGACY`` representation reverses the order of bytes 0-3, + bytes 4-5, and bytes 6-7. + +As an example, consider the same UUID described in :ref:`example-legacy-uuid`. 
+Let us assume that an application used the C# driver without an explicitly +specified UUID representation to insert the example UUID +``00112233-4455-6677-8899-aabbccddeeff`` into MongoDB. If we try to read this +value using ``PYTHON_LEGACY``, we end up with an entirely different UUID:: + + UUID('33221100-5544-7766-8899-aabbccddeeff') + +However, if we explicitly set the representation to +:data:`~bson.binary.UuidRepresentation.CSHARP_LEGACY`, we get the correct result:: + + UUID('00112233-4455-6677-8899-aabbccddeeff') + +PyMongo uses the specified UUID representation to reorder the BSON bytes and +load them correctly. ``CSHARP_LEGACY`` encodes native :class:`uuid.UUID` +objects to :class:`~bson.binary.Binary` subtype 3 objects, while performing +the same byte-reordering as the legacy C# driver's UUID to BSON encoder. diff --git a/doc/faq.rst b/doc/faq.rst index 5454db448..c2a6fc7f7 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -266,7 +266,7 @@ collection, configured to use :class:`~bson.son.SON` instead of dict: >>> opts CodecOptions(document_class=<class 'bson.son.SON'>, tz_aware=False, - uuid_representation=UuidRepresentation.PYTHON_LEGACY, + uuid_representation=UuidRepresentation.UNSPECIFIED, unicode_decode_error_handler='strict', tzinfo=None, type_registry=TypeRegistry(type_codecs=[], fallback_encoder=None)) diff --git a/doc/migrate-to-pymongo4.rst b/doc/migrate-to-pymongo4.rst index fe10a6087..2baac8eee 100644 --- a/doc/migrate-to-pymongo4.rst +++ b/doc/migrate-to-pymongo4.rst @@ -774,3 +774,14 @@ subdocument containing a ``$ref`` field would be decoded as a :class:`~bson.dbref.DBRef`. .. _DBRef specification: https://github.com/mongodb/specifications/blob/5a8c8d7/source/dbref.rst + +Encoding a UUID raises an error by default +.......................................... 
+ +The default uuid_representation for :class:`~bson.codec_options.CodecOptions`, +:class:`~bson.json_util.JSONOptions`, and +:class:`~pymongo.mongo_client.MongoClient` has been changed from +:data:`bson.binary.UuidRepresentation.PYTHON_LEGACY` to +:data:`bson.binary.UuidRepresentation.UNSPECIFIED`. Attempting to encode a +:class:`uuid.UUID` instance to BSON or JSON now produces an error by default. +See :ref:`handling-uuid-data-example` for details. diff --git a/pymongo/mongo_client.py b/pymongo/mongo_client.py index adfb6834e..cce5f14d0 100644 --- a/pymongo/mongo_client.py +++ b/pymongo/mongo_client.py @@ -325,8 +325,8 @@ class MongoClient(common.BaseObject): speed. 9 is best compression. Defaults to -1. - `uuidRepresentation`: The BSON representation to use when encoding from and decoding to instances of :class:`~uuid.UUID`. Valid - values are `pythonLegacy` (the default), `javaLegacy`, - `csharpLegacy`, `standard` and `unspecified`. New applications + values are `pythonLegacy`, `javaLegacy`, `csharpLegacy`, `standard` + and `unspecified` (the default). New applications should consider setting this to `standard` for cross language compatibility. See :ref:`handling-uuid-data-example` for details. - `unicode_decode_error_handler`: The error handler to apply when @@ -501,6 +501,8 @@ class MongoClient(common.BaseObject): .. versionchanged:: 4.0 Removed the ``waitQueueMultiple`` and ``socketKeepAlive`` keyword arguments. + The default for `uuidRepresentation` was changed from + ``pythonLegacy`` to ``unspecified``. .. versionchanged:: 3.12 Added the ``server_api`` keyword argument. 
diff --git a/test/test_binary.py b/test/test_binary.py index 440c04667..e6b681fc5 100644 --- a/test/test_binary.py +++ b/test/test_binary.py @@ -145,15 +145,18 @@ class TestBinary(unittest.TestCase): self.assertEqual(hash(Binary(b"hello world", 42)), hash(two)) def test_uuid_subtype_4(self): - """uuid_representation should be ignored when decoding subtype 4 for - all UuidRepresentation values except UNSPECIFIED.""" + """Only STANDARD should decode subtype 4 as native uuid.""" expected_uuid = uuid.uuid4() - doc = {"uuid": Binary(expected_uuid.bytes, 4)} + expected_bin = Binary(expected_uuid.bytes, 4) + doc = {"uuid": expected_bin} encoded = encode(doc) - for uuid_representation in (set(ALL_UUID_REPRESENTATIONS) - - {UuidRepresentation.UNSPECIFIED}): - options = CodecOptions(uuid_representation=uuid_representation) - self.assertEqual(expected_uuid, decode(encoded, options)["uuid"]) + for uuid_rep in (UuidRepresentation.PYTHON_LEGACY, + UuidRepresentation.JAVA_LEGACY, + UuidRepresentation.CSHARP_LEGACY): + opts = CodecOptions(uuid_representation=uuid_rep) + self.assertEqual(expected_bin, decode(encoded, opts)["uuid"]) + opts = CodecOptions(uuid_representation=UuidRepresentation.STANDARD) + self.assertEqual(expected_uuid, decode(encoded, opts)["uuid"]) def test_legacy_java_uuid(self): # Test decoding @@ -522,29 +525,25 @@ class TestUuidSpecImplicitCoding(IntegrationTest): # Implicit decoding prose test #1 def test_decoding_1(self): - # TODO: these assertions will change after PYTHON-2245. Specifically, - # the 'standard' field will be decoded as a Binary subtype 4. 
- binary_value = Binary.from_uuid( - self.uuid, UuidRepresentation.PYTHON_LEGACY) + standard_binary = Binary.from_uuid( + self.uuid, UuidRepresentation.STANDARD) self._test_decoding( "javaLegacy", UuidRepresentation.JAVA_LEGACY, - self.uuid, self.uuid) + standard_binary, self.uuid) self._test_decoding( "csharpLegacy", UuidRepresentation.CSHARP_LEGACY, - self.uuid, self.uuid) + standard_binary, self.uuid) self._test_decoding( "pythonLegacy", UuidRepresentation.PYTHON_LEGACY, - self.uuid, self.uuid) + standard_binary, self.uuid) # Implicit decoding pose test #2 def test_decoding_2(self): - # TODO: these assertions will change after PYTHON-2245. Specifically, - # the 'legacy' field will be decoded as a Binary subtype 3. - binary_value = Binary.from_uuid( + legacy_binary = Binary.from_uuid( self.uuid, UuidRepresentation.PYTHON_LEGACY) self._test_decoding( "standard", UuidRepresentation.PYTHON_LEGACY, - self.uuid, binary_value.as_uuid(UuidRepresentation.PYTHON_LEGACY)) + self.uuid, legacy_binary) # Implicit decoding pose test #3 def test_decoding_3(self): diff --git a/test/test_bson.py b/test/test_bson.py index 89f4a1117..5c0f163bb 100644 --- a/test/test_bson.py +++ b/test/test_bson.py @@ -638,8 +638,13 @@ class TestBSON(unittest.TestCase): def test_uuid(self): id = uuid.uuid4() - transformed_id = decode(encode({"id": id}))["id"] + # The default uuid_representation is UNSPECIFIED + with self.assertRaisesRegex(ValueError, 'cannot encode native uuid'): + bson.decode_all(encode({'uuid': id})) + opts = CodecOptions(uuid_representation=UuidRepresentation.STANDARD) + transformed_id = decode(encode({"id": id}, codec_options=opts), + codec_options=opts)["id"] self.assertTrue(isinstance(transformed_id, uuid.UUID)) self.assertEqual(id, transformed_id) self.assertNotEqual(uuid.uuid4(), transformed_id) @@ -648,8 +653,9 @@ class TestBSON(unittest.TestCase): id = uuid.uuid4() legacy = Binary.from_uuid(id, UuidRepresentation.PYTHON_LEGACY) self.assertEqual(3, legacy.subtype) - 
transformed = decode(encode({"uuid": legacy}))["uuid"] - self.assertTrue(isinstance(transformed, uuid.UUID)) + bin = decode(encode({"uuid": legacy}))["uuid"] + self.assertTrue(isinstance(bin, Binary)) + transformed = bin.as_uuid(UuidRepresentation.PYTHON_LEGACY) self.assertEqual(id, transformed) # The C extension was segfaulting on unicode RegExs, so we have this test @@ -965,7 +971,7 @@ class TestCodecOptions(unittest.TestCase): def test_codec_options_repr(self): r = ("CodecOptions(document_class=dict, tz_aware=False, " - "uuid_representation=UuidRepresentation.PYTHON_LEGACY, " + "uuid_representation=UuidRepresentation.UNSPECIFIED, " "unicode_decode_error_handler='strict', " "tzinfo=None, type_registry=TypeRegistry(type_codecs=[], " "fallback_encoder=None))") @@ -973,17 +979,16 @@ class TestCodecOptions(unittest.TestCase): def test_decode_all_defaults(self): # Test decode_all()'s default document_class is dict and tz_aware is - # False. The default uuid_representation is PYTHON_LEGACY but this - # decodes same as STANDARD, so all this test proves about UUID decoding - # is that it's not CSHARP_LEGACY or JAVA_LEGACY. + # False. 
doc = {'sub_document': {}, - 'uuid': uuid.uuid4(), 'dt': datetime.datetime.utcnow()} decoded = bson.decode_all(bson.encode(doc))[0] self.assertIsInstance(decoded['sub_document'], dict) - self.assertEqual(decoded['uuid'], doc['uuid']) self.assertIsNone(decoded['dt'].tzinfo) + # The default uuid_representation is UNSPECIFIED + with self.assertRaisesRegex(ValueError, 'cannot encode native uuid'): + bson.decode_all(bson.encode({'uuid': uuid.uuid4()})) def test_unicode_decode_error_handler(self): enc = encode({"keystr": "foobar"}) diff --git a/test/test_change_stream.py b/test/test_change_stream.py index e891e1403..a49f6972b 100644 --- a/test/test_change_stream.py +++ b/test/test_change_stream.py @@ -896,7 +896,8 @@ class TestDatabaseChangeStream(TestChangeStreamBase, APITestsMixin): with self.change_stream() as change_stream: for collname in collnames: self._insert_and_check( - change_stream, collname, {'_id': uuid.uuid4()}) + change_stream, collname, + {'_id': Binary.from_uuid(uuid.uuid4())}) def test_isolation(self): # Ensure inserts to other dbs don't show up in our ChangeStream. 
@@ -905,9 +906,11 @@ class TestDatabaseChangeStream(TestChangeStreamBase, APITestsMixin): other_db, self.db, msg="Isolation must be tested on separate DBs") collname = self.id() with self.change_stream() as change_stream: - other_db[collname].insert_one({'_id': uuid.uuid4()}) + other_db[collname].insert_one( + {'_id': Binary.from_uuid(uuid.uuid4())}) self._insert_and_check( - change_stream, collname, {'_id': uuid.uuid4()}) + change_stream, collname, + {'_id': Binary.from_uuid(uuid.uuid4())}) self.client.drop_database(other_db) diff --git a/test/test_common.py b/test/test_common.py index 87b5ce4c9..dcd618c50 100644 --- a/test/test_common.py +++ b/test/test_common.py @@ -19,7 +19,7 @@ import uuid sys.path[0:0] = [""] -from bson.binary import Binary, PYTHON_LEGACY, STANDARD +from bson.binary import Binary, PYTHON_LEGACY, STANDARD, UuidRepresentation from bson.codec_options import CodecOptions from bson.objectid import ObjectId from pymongo.errors import OperationFailure @@ -40,12 +40,15 @@ class TestCommon(IntegrationTest): coll.drop() # Test property - self.assertEqual(PYTHON_LEGACY, + self.assertEqual(UuidRepresentation.UNSPECIFIED, coll.codec_options.uuid_representation) # Test basic query uu = uuid.uuid4() # Insert as binary subtype 3 + coll = self.db.get_collection( + "uuid", CodecOptions(uuid_representation=PYTHON_LEGACY)) + legacy_opts = coll.codec_options coll.insert_one({'uu': uu}) self.assertEqual(uu, coll.find_one({'uu': uu})['uu']) coll = self.db.get_collection( @@ -53,7 +56,7 @@ class TestCommon(IntegrationTest): self.assertEqual(STANDARD, coll.codec_options.uuid_representation) self.assertEqual(None, coll.find_one({'uu': uu})) uul = Binary.from_uuid(uu, PYTHON_LEGACY) - self.assertEqual(uu, coll.find_one({'uu': uul})['uu']) + self.assertEqual(uul, coll.find_one({'uu': uul})['uu']) # Test count_documents self.assertEqual(0, coll.count_documents({'uu': uu})) @@ -98,9 +101,10 @@ class TestCommon(IntegrationTest): self.assertEqual(5, coll.find_one({'_id': 
uu})['i']) # Test command - self.assertEqual(5, self.db.command('findAndModify', 'uuid', - update={'$set': {'i': 6}}, - query={'_id': uu})['value']['i']) + self.assertEqual(5, self.db.command( + 'findAndModify', 'uuid', + update={'$set': {'i': 6}}, + query={'_id': uu}, codec_options=legacy_opts)['value']['i']) self.assertEqual(6, self.db.command( 'findAndModify', 'uuid', update={'$set': {'i': 7}}, diff --git a/test/test_json_util.py b/test/test_json_util.py index c20c793d3..791e3de18 100644 --- a/test/test_json_util.py +++ b/test/test_json_util.py @@ -298,9 +298,22 @@ class TestJsonUtil(unittest.TestCase): self.assertEqual(dct, rtdct) self.assertEqual('{"ts": {"$timestamp": {"t": 4, "i": 13}}}', res) + def test_uuid_default(self): + # Cannot directly encode native UUIDs with the default + # uuid_representation. + doc = {'uuid': uuid.UUID('f47ac10b-58cc-4372-a567-0e02b2c3d479')} + with self.assertRaisesRegex(ValueError, 'cannot encode native uuid'): + json_util.dumps(doc) + legacy_jsn = '{"uuid": {"$uuid": "f47ac10b58cc4372a5670e02b2c3d479"}}' + expected = {'uuid': Binary( + b'\xf4z\xc1\x0bX\xccCr\xa5g\x0e\x02\xb2\xc3\xd4y', 4)} + self.assertEqual(json_util.loads(legacy_jsn), expected) + def test_uuid(self): doc = {'uuid': uuid.UUID('f47ac10b-58cc-4372-a567-0e02b2c3d479')} - self.round_trip(doc) + uuid_legacy_opts = LEGACY_JSON_OPTIONS.with_options( + uuid_representation=UuidRepresentation.PYTHON_LEGACY) + self.round_trip(doc, json_options=uuid_legacy_opts) self.assertEqual( '{"uuid": {"$uuid": "f47ac10b58cc4372a5670e02b2c3d479"}}', json_util.dumps(doc, json_options=LEGACY_JSON_OPTIONS)) @@ -308,7 +321,8 @@ class TestJsonUtil(unittest.TestCase): '{"uuid": ' '{"$binary": "9HrBC1jMQ3KlZw4CssPUeQ==", "$type": "03"}}', json_util.dumps( - doc, json_options=STRICT_JSON_OPTIONS)) + doc, json_options=STRICT_JSON_OPTIONS.with_options( + uuid_representation=UuidRepresentation.PYTHON_LEGACY))) self.assertEqual( '{"uuid": ' '{"$binary": "9HrBC1jMQ3KlZw4CssPUeQ==", "$type": 
"04"}}', @@ -319,7 +333,8 @@ class TestJsonUtil(unittest.TestCase): self.assertEqual( doc, json_util.loads( '{"uuid": ' - '{"$binary": "9HrBC1jMQ3KlZw4CssPUeQ==", "$type": "03"}}')) + '{"$binary": "9HrBC1jMQ3KlZw4CssPUeQ==", "$type": "03"}}', + json_options=uuid_legacy_opts)) for uuid_representation in (set(ALL_UUID_REPRESENTATIONS) - {UuidRepresentation.UNSPECIFIED}): options = JSONOptions( diff --git a/test/test_raw_bson.py b/test/test_raw_bson.py index 229382f3d..7fb53c6da 100644 --- a/test/test_raw_bson.py +++ b/test/test_raw_bson.py @@ -19,7 +19,7 @@ import uuid sys.path[0:0] = [""] from bson import decode, encode -from bson.binary import Binary, JAVA_LEGACY +from bson.binary import Binary, JAVA_LEGACY, UuidRepresentation from bson.codec_options import CodecOptions from bson.errors import InvalidBSON from bson.raw_bson import RawBSONDocument, DEFAULT_RAW_BSON_OPTIONS @@ -92,7 +92,12 @@ class TestRawBSONDocument(IntegrationTest): 'bin3': Binary(uid.bytes, 3)} raw = RawBSONDocument(encode(doc)) coll.insert_one(raw) - self.assertEqual(coll.find_one(), {'_id': 1, 'bin4': uid, 'bin3': uid}) + self.assertEqual(coll.find_one(), doc) + uuid_coll = coll.with_options( + codec_options=coll.codec_options.with_options( + uuid_representation=UuidRepresentation.STANDARD)) + self.assertEqual(uuid_coll.find_one(), + {'_id': 1, 'bin4': uid, 'bin3': Binary(uid.bytes, 3)}) # Test that the raw bytes haven't changed. raw_coll = coll.with_options(codec_options=DEFAULT_RAW_BSON_OPTIONS) @@ -185,3 +190,7 @@ class TestRawBSONDocument(IntegrationTest): for rkey, elt in zip(rawdoc, keyvaluepairs): self.assertEqual(rkey, elt[0]) + + +if __name__ == "__main__": + unittest.main()