From f67e9ae2072c95bb4f1b342a58b427850d0ff27d Mon Sep 17 00:00:00 2001 From: Noah Stapp Date: Thu, 11 Jan 2024 16:44:38 -0800 Subject: [PATCH] PYTHON-1374 Optimize json_util performance (#1460) --- bson/json_util.py | 141 +++++++++++++++++++++++++--------------------- doc/changelog.rst | 1 + 2 files changed, 79 insertions(+), 63 deletions(-) diff --git a/bson/json_util.py b/bson/json_util.py index 3ef15fd2c..ada6e89c5 100644 --- a/bson/json_util.py +++ b/bson/json_util.py @@ -137,7 +137,7 @@ from bson.max_key import MaxKey from bson.min_key import MinKey from bson.objectid import ObjectId from bson.regex import Regex -from bson.son import RE_TYPE, SON +from bson.son import RE_TYPE from bson.timestamp import Timestamp from bson.tz_util import utc @@ -505,7 +505,7 @@ def _json_convert(obj: Any, json_options: JSONOptions = DEFAULT_JSON_OPTIONS) -> converted into json. """ if hasattr(obj, "items"): - return SON(((k, _json_convert(v, json_options)) for k, v in obj.items())) + return {k: _json_convert(v, json_options) for k, v in obj.items()} elif hasattr(obj, "__iter__") and not isinstance(obj, (str, bytes)): return [_json_convert(v, json_options) for v in obj] try: @@ -826,20 +826,83 @@ def _parse_canonical_maxkey(doc: Any) -> MaxKey: def _encode_binary(data: bytes, subtype: int, json_options: JSONOptions) -> Any: if json_options.json_mode == JSONMode.LEGACY: - return SON([("$binary", base64.b64encode(data).decode()), ("$type", "%02x" % subtype)]) - return { - "$binary": SON([("base64", base64.b64encode(data).decode()), ("subType", "%02x" % subtype)]) - } + return {"$binary": base64.b64encode(data).decode(), "$type": "%02x" % subtype} + return {"$binary": {"base64": base64.b64encode(data).decode(), "subType": "%02x" % subtype}} + + +def _encode_datetimems(obj: Any, json_options: JSONOptions) -> dict: + if ( + json_options.datetime_representation == DatetimeRepresentation.ISO8601 + and 0 <= int(obj) <= _max_datetime_ms() + ): + return default(obj.as_datetime(), json_options) + elif json_options.datetime_representation == DatetimeRepresentation.LEGACY: + return {"$date": str(int(obj))} + return {"$date": {"$numberLong": str(int(obj))}} + + +def _encode_code(obj: Code, json_options: JSONOptions) -> dict: + if obj.scope is None: + return {"$code": str(obj)} + else: + return {"$code": str(obj), "$scope": _json_convert(obj.scope, json_options)} + + +def _encode_int64(obj: Int64, json_options: JSONOptions) -> Any: + if json_options.strict_number_long: + return {"$numberLong": str(obj)} + else: + return obj + + +# Encoders for BSON types +_encoders = { + 5: lambda obj, json_options: _encode_binary(obj, obj.subtype, json_options), # Binary + 7: lambda obj, json_options: {"$oid": str(obj)}, # noqa: ARG005 ObjectId + 9: _encode_datetimems, # DatetimeMS + 13: _encode_code, # Code + 17: lambda obj, json_options: {"$timestamp": {"t": obj.time, "i": obj.inc}}, # noqa: ARG005 Timestamp + 18: _encode_int64, # Int64 + 19: lambda obj, json_options: {"$numberDecimal": str(obj)}, # noqa: ARG005 Decimal128 + 100: lambda obj, json_options: _json_convert(obj.as_doc(), json_options=json_options), # DBRef + 127: lambda obj, json_options: {"$maxKey": 1}, # noqa: ARG005 MaxKey + 255: lambda obj, json_options: {"$minKey": 1}, # noqa: ARG005 MinKey +} def default(obj: Any, json_options: JSONOptions = DEFAULT_JSON_OPTIONS) -> Any: # We preserve key order when rendering SON, DBRef, etc. as JSON by # returning a SON for those types instead of a dict. - if isinstance(obj, ObjectId): - return {"$oid": str(obj)} - if isinstance(obj, DBRef): - return _json_convert(obj.as_doc(), json_options=json_options) - if isinstance(obj, datetime.datetime): + if isinstance(obj, bool): + return obj + elif isinstance(obj, (RE_TYPE, Regex)): + flags = "" + if obj.flags & re.IGNORECASE: + flags += "i" + if obj.flags & re.LOCALE: + flags += "l" + if obj.flags & re.MULTILINE: + flags += "m" + if obj.flags & re.DOTALL: + flags += "s" + if obj.flags & re.UNICODE: + flags += "u" + if obj.flags & re.VERBOSE: + flags += "x" + if isinstance(obj.pattern, str): + pattern = obj.pattern + else: + pattern = obj.pattern.decode("utf-8") + if json_options.json_mode == JSONMode.LEGACY: + return {"$regex": pattern, "$options": flags} + return {"$regularExpression": {"pattern": pattern, "options": flags}} + elif hasattr(obj, "_type_marker"): + type_marker = obj._type_marker + try: + return _encoders[type_marker](obj, json_options) # type: ignore[no-untyped-call] + except KeyError: + raise TypeError("%r is not JSON serializable" % obj) from None + elif isinstance(obj, datetime.datetime): if json_options.datetime_representation == DatetimeRepresentation.ISO8601: if not obj.tzinfo: obj = obj.replace(tzinfo=utc) @@ -860,67 +923,19 @@ def default(obj: Any, json_options: JSONOptions = DEFAULT_JSON_OPTIONS) -> Any: if json_options.datetime_representation == DatetimeRepresentation.LEGACY: return {"$date": millis} return {"$date": {"$numberLong": str(millis)}} - if isinstance(obj, DatetimeMS): - if ( - json_options.datetime_representation == DatetimeRepresentation.ISO8601 - and 0 <= int(obj) <= _max_datetime_ms() - ): - return default(obj.as_datetime(), json_options) - elif json_options.datetime_representation == DatetimeRepresentation.LEGACY: - return {"$date": str(int(obj))} - return {"$date": {"$numberLong": str(int(obj))}} - if json_options.strict_number_long and isinstance(obj, Int64): - return {"$numberLong": str(obj)} - if isinstance(obj, (RE_TYPE, Regex)): - flags = "" - if obj.flags & re.IGNORECASE: - flags += "i" - if obj.flags & re.LOCALE: - flags += "l" - if obj.flags & re.MULTILINE: - flags += "m" - if obj.flags & re.DOTALL: - flags += "s" - if obj.flags & re.UNICODE: - flags += "u" - if obj.flags & re.VERBOSE: - flags += "x" - if isinstance(obj.pattern, str): - pattern = obj.pattern - else: - pattern = obj.pattern.decode("utf-8") - if json_options.json_mode == JSONMode.LEGACY: - return SON([("$regex", pattern), ("$options", flags)]) - return {"$regularExpression": SON([("pattern", pattern), ("options", flags)])} - if isinstance(obj, MinKey): - return {"$minKey": 1} - if isinstance(obj, MaxKey): - return {"$maxKey": 1} - if isinstance(obj, Timestamp): - return {"$timestamp": SON([("t", obj.time), ("i", obj.inc)])} - if isinstance(obj, Code): - if obj.scope is None: - return {"$code": str(obj)} - return SON([("$code", str(obj)), ("$scope", _json_convert(obj.scope, json_options))]) - if isinstance(obj, Binary): - return _encode_binary(obj, obj.subtype, json_options) - if isinstance(obj, bytes): + elif isinstance(obj, bytes): return _encode_binary(obj, 0, json_options) - if isinstance(obj, uuid.UUID): + elif isinstance(obj, uuid.UUID): if json_options.strict_uuid: binval = Binary.from_uuid(obj, uuid_representation=json_options.uuid_representation) return _encode_binary(binval, binval.subtype, json_options) else: return {"$uuid": obj.hex} - if isinstance(obj, Decimal128): - return {"$numberDecimal": str(obj)} - if isinstance(obj, bool): - return obj - if json_options.json_mode == JSONMode.CANONICAL and isinstance(obj, int): + elif json_options.json_mode == JSONMode.CANONICAL and isinstance(obj, int): if -(2**31) <= obj < 2**31: return {"$numberInt": str(obj)} return {"$numberLong": str(obj)} - if json_options.json_mode != JSONMode.LEGACY and isinstance(obj, float): + elif json_options.json_mode != JSONMode.LEGACY and isinstance(obj, float): if math.isnan(obj): return {"$numberDouble": "NaN"} elif math.isinf(obj): diff --git a/doc/changelog.rst b/doc/changelog.rst index 799bf8c41..98f535245 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -13,6 +13,7 @@ PyMongo 4.7 brings a number of improvements including: - Fixed a bug where inflating a :class:`~bson.raw_bson.RawBSONDocument` containing a :class:`~bson.code.Code` would cause an error. - Replaced usage of :class:`bson.son.SON` on all internal classes and commands to dict, :attr:`options.pool_options.metadata` is now of type ``dict`` as opposed to :class:`bson.son.SON`. +- Significantly improved the performance of encoding BSON documents to JSON. Changes in Version 4.6.1 ------------------------