From 14002a5a0d294cae8c2c5349e1a92364381dfd4d Mon Sep 17 00:00:00 2001 From: Ben Warner Date: Wed, 27 Jul 2022 16:53:52 -0700 Subject: [PATCH] PYTHON-1824 Allow encoding/decoding out-of-range datetimes via DatetimeMS and datetime_conversion (#981) https://jira.mongodb.org/browse/PYTHON-1824 Co-authored-by: Ben Warner --- bson/__init__.py | 47 ++++------ bson/_cbsonmodule.c | 138 +++++++++++++++++++++++++++- bson/_cbsonmodule.h | 1 + bson/codec_options.py | 27 +++++- bson/codec_options.pyi | 8 ++ bson/datetime_ms.py | 157 ++++++++++++++++++++++++++++++++ bson/json_util.py | 43 +++++++-- doc/api/bson/datetime_ms.rst | 4 + doc/api/bson/index.rst | 1 + doc/examples/datetimes.rst | 54 +++++++++++ pymongo/common.py | 18 +++- pymongo/mongo_client.py | 8 ++ test/test_bson.py | 170 ++++++++++++++++++++++++++++++++++- test/test_client.py | 23 ++++- test/test_json_util.py | 68 +++++++++++++- 15 files changed, 721 insertions(+), 46 deletions(-) create mode 100644 bson/datetime_ms.py create mode 100644 doc/api/bson/datetime_ms.rst diff --git a/bson/__init__.py b/bson/__init__.py index 2db1fb5d0..4283faf7d 100644 --- a/bson/__init__.py +++ b/bson/__init__.py @@ -54,7 +54,6 @@ bytes [#bytes]_ binary both subtype 0. It will be decoded back to bytes. 
""" -import calendar import datetime import itertools import re @@ -100,9 +99,18 @@ from bson.code import Code from bson.codec_options import ( DEFAULT_CODEC_OPTIONS, CodecOptions, + DatetimeConversionOpts, _DocumentType, _raw_document_class, ) +from bson.datetime_ms import ( + EPOCH_AWARE, + EPOCH_NAIVE, + DatetimeMS, + _datetime_to_millis, + _millis_to_datetime, + utc, +) from bson.dbref import DBRef from bson.decimal128 import Decimal128 from bson.errors import InvalidBSON, InvalidDocument, InvalidStringData @@ -113,7 +121,6 @@ from bson.objectid import ObjectId from bson.regex import Regex from bson.son import RE_TYPE, SON from bson.timestamp import Timestamp -from bson.tz_util import utc # Import some modules for type-checking only. if TYPE_CHECKING: @@ -187,12 +194,10 @@ __all__ = [ "is_valid", "BSON", "has_c", + "DatetimeConversionOpts", + "DatetimeMS", ] -EPOCH_AWARE = datetime.datetime.fromtimestamp(0, utc) -EPOCH_NAIVE = datetime.datetime.utcfromtimestamp(0) - - BSONNUM = b"\x01" # Floating point BSONSTR = b"\x02" # UTF-8 string BSONOBJ = b"\x03" # Embedded document @@ -413,7 +418,7 @@ def _get_boolean( def _get_date( data: Any, view: Any, position: int, dummy0: int, opts: CodecOptions, dummy1: Any -) -> Tuple[datetime.datetime, int]: +) -> Tuple[Union[datetime.datetime, DatetimeMS], int]: """Decode a BSON datetime to python datetime.datetime.""" return _millis_to_datetime(_UNPACK_LONG_FROM(data, position)[0], opts), position + 8 @@ -724,6 +729,12 @@ def _encode_datetime(name: bytes, value: datetime.datetime, dummy0: Any, dummy1: return b"\x09" + name + _PACK_LONG(millis) +def _encode_datetime_ms(name: bytes, value: DatetimeMS, dummy0: Any, dummy1: Any) -> bytes: + """Encode datetime.datetime.""" + millis = int(value) + return b"\x09" + name + _PACK_LONG(millis) + + def _encode_none(name: bytes, dummy0: Any, dummy1: Any, dummy2: Any) -> bytes: """Encode python None.""" return b"\x0A" + name @@ -814,6 +825,7 @@ _ENCODERS = { bool: _encode_bool, bytes: 
_encode_bytes, datetime.datetime: _encode_datetime, + DatetimeMS: _encode_datetime_ms, dict: _encode_mapping, float: _encode_float, int: _encode_int, @@ -948,27 +960,6 @@ if _USE_C: _dict_to_bson = _cbson._dict_to_bson # noqa: F811 -def _millis_to_datetime(millis: int, opts: CodecOptions) -> datetime.datetime: - """Convert milliseconds since epoch UTC to datetime.""" - diff = ((millis % 1000) + 1000) % 1000 - seconds = (millis - diff) // 1000 - micros = diff * 1000 - if opts.tz_aware: - dt = EPOCH_AWARE + datetime.timedelta(seconds=seconds, microseconds=micros) - if opts.tzinfo: - dt = dt.astimezone(opts.tzinfo) - return dt - else: - return EPOCH_NAIVE + datetime.timedelta(seconds=seconds, microseconds=micros) - - -def _datetime_to_millis(dtm: datetime.datetime) -> int: - """Convert datetime to milliseconds since epoch UTC.""" - if dtm.utcoffset() is not None: - dtm = dtm - dtm.utcoffset() # type: ignore - return int(calendar.timegm(dtm.timetuple()) * 1000 + dtm.microsecond // 1000) - - _CODEC_OPTIONS_TYPE_ERROR = TypeError("codec_options must be an instance of CodecOptions") diff --git a/bson/_cbsonmodule.c b/bson/_cbsonmodule.c index da6a5cbda..019f049bb 100644 --- a/bson/_cbsonmodule.c +++ b/bson/_cbsonmodule.c @@ -52,6 +52,9 @@ struct module_state { PyObject* BSONInt64; PyObject* Decimal128; PyObject* Mapping; + PyObject* DatetimeMS; + PyObject* _min_datetime_ms; + PyObject* _max_datetime_ms; }; #define GETSTATE(m) ((struct module_state*)PyModule_GetState(m)) @@ -72,6 +75,12 @@ struct module_state { /* The smallest possible BSON document, i.e. "{}" */ #define BSON_MIN_SIZE 5 +/* Datetime codec options */ +#define DATETIME 1 +#define DATETIME_CLAMP 2 +#define DATETIME_MS 3 +#define DATETIME_AUTO 4 + /* Get an error class from the bson.errors module. 
* * Returns a new ref */ @@ -179,6 +188,45 @@ static long long millis_from_datetime(PyObject* datetime) { return millis; } +/* Extended-range datetime, returns a DatetimeMS object with millis */ +static PyObject* datetime_ms_from_millis(PyObject* self, long long millis){ + // Allocate a new DatetimeMS object. + struct module_state *state = GETSTATE(self); + + PyObject* dt; + PyObject* ll_millis; + + if (!(ll_millis = PyLong_FromLongLong(millis))){ + return NULL; + } + dt = PyObject_CallFunctionObjArgs(state->DatetimeMS, ll_millis, NULL); + Py_DECREF(ll_millis); + return dt; +} + +/* Extended-range datetime, takes a DatetimeMS object and extracts the long long value. */ +static int millis_from_datetime_ms(PyObject* dt, long long* out){ + PyObject* ll_millis; + long long millis; + + if (!(ll_millis = PyNumber_Long(dt))){ + if (PyErr_Occurred()) { // TypeError + return 0; + } + } + + if ((millis = PyLong_AsLongLong(ll_millis)) == -1){ + if (PyErr_Occurred()) { /* Overflow */ + PyErr_SetString(PyExc_OverflowError, + "MongoDB datetimes can only handle up to 8-byte ints"); + return 0; + } + } + Py_DECREF(ll_millis); + *out = millis; + return 1; +} + /* Just make this compatible w/ the old API. */ int buffer_write_bytes(buffer_t buffer, const char* data, int size) { if (pymongo_buffer_write(buffer, data, size)) { @@ -342,7 +390,10 @@ static int _load_python_objects(PyObject* module) { _load_object(&state->BSONInt64, "bson.int64", "Int64") || _load_object(&state->Decimal128, "bson.decimal128", "Decimal128") || _load_object(&state->UUID, "uuid", "UUID") || - _load_object(&state->Mapping, "collections.abc", "Mapping")) { + _load_object(&state->Mapping, "collections.abc", "Mapping") || + _load_object(&state->DatetimeMS, "bson.datetime_ms", "DatetimeMS") || + _load_object(&state->_min_datetime_ms, "bson.datetime_ms", "_min_datetime_ms") || + _load_object(&state->_max_datetime_ms, "bson.datetime_ms", "_max_datetime_ms")) { return 1; } /* Reload our REType hack too. 
*/ @@ -466,13 +517,14 @@ int convert_codec_options(PyObject* options_obj, void* p) { options->unicode_decode_error_handler = NULL; - if (!PyArg_ParseTuple(options_obj, "ObbzOO", + if (!PyArg_ParseTuple(options_obj, "ObbzOOb", &options->document_class, &options->tz_aware, &options->uuid_rep, &options->unicode_decode_error_handler, &options->tzinfo, - &type_registry_obj)) + &type_registry_obj, + &options->datetime_conversion)) return 0; type_marker = _type_marker(options->document_class); @@ -1049,6 +1101,13 @@ static int _write_element_to_buffer(PyObject* self, buffer_t buffer, } *(pymongo_buffer_get_buffer(buffer) + type_byte) = 0x09; return buffer_write_int64(buffer, (int64_t)millis); + } else if (PyObject_TypeCheck(value, (PyTypeObject *) state->DatetimeMS)) { + long long millis; + if (!millis_from_datetime_ms(value, &millis)) { + return 0; + } + *(pymongo_buffer_get_buffer(buffer) + type_byte) = 0x09; + return buffer_write_int64(buffer, (int64_t)millis); } else if (PyObject_TypeCheck(value, state->REType)) { return _write_regex_to_buffer(buffer, type_byte, value); } @@ -1854,8 +1913,79 @@ static PyObject* get_value(PyObject* self, PyObject* name, const char* buffer, } memcpy(&millis, buffer + *position, 8); millis = (int64_t)BSON_UINT64_FROM_LE(millis); - naive = datetime_from_millis(millis); *position += 8; + + if (options->datetime_conversion == DATETIME_MS){ + value = datetime_ms_from_millis(self, millis); + break; + } + + int dt_clamp = options->datetime_conversion == DATETIME_CLAMP; + int dt_auto = options->datetime_conversion == DATETIME_AUTO; + + + if (dt_clamp || dt_auto){ + PyObject *min_millis_fn = _get_object(state->_min_datetime_ms, "bson.datetime_ms", "_min_datetime_ms"); + PyObject *max_millis_fn = _get_object(state->_max_datetime_ms, "bson.datetime_ms", "_max_datetime_ms"); + PyObject *min_millis_fn_res; + PyObject *max_millis_fn_res; + int64_t min_millis; + int64_t max_millis; + + if (min_millis_fn == NULL || max_millis_fn == NULL) { + 
Py_XDECREF(min_millis_fn); + Py_XDECREF(max_millis_fn); + goto invalid; + } + + if (options->tz_aware){ + PyObject* tzinfo = options->tzinfo; + if (tzinfo == Py_None) { + // Default to UTC. + utc_type = _get_object(state->UTC, "bson.tz_util", "utc"); + tzinfo = utc_type; + } + min_millis_fn_res = PyObject_CallFunctionObjArgs(min_millis_fn, tzinfo, NULL); + max_millis_fn_res = PyObject_CallFunctionObjArgs(max_millis_fn, tzinfo, NULL); + } else { + min_millis_fn_res = PyObject_CallObject(min_millis_fn, NULL); + max_millis_fn_res = PyObject_CallObject(max_millis_fn, NULL); + } + + Py_DECREF(min_millis_fn); + Py_DECREF(max_millis_fn); + + if (!min_millis_fn_res || !max_millis_fn_res){ + Py_XDECREF(min_millis_fn_res); + Py_XDECREF(max_millis_fn_res); + goto invalid; + } + + min_millis = PyLong_AsLongLong(min_millis_fn_res); + max_millis = PyLong_AsLongLong(max_millis_fn_res); + + if ((min_millis == -1 || max_millis == -1) && PyErr_Occurred()) + { + // min/max_millis check + goto invalid; + } + + if (dt_clamp) { + if (millis < min_millis) { + millis = min_millis; + } else if (millis > max_millis) { + millis = max_millis; + } + // Continues from here to return a datetime. + } else if (dt_auto) { + if (millis < min_millis || millis > max_millis){ + value = datetime_ms_from_millis(self, millis); + break; // Out-of-range so done. + } + } + } + + naive = datetime_from_millis(millis); if (!options->tz_aware) { /* In the naive case, we're done here. 
*/ value = naive; break; diff --git a/bson/_cbsonmodule.h b/bson/_cbsonmodule.h index 12a2c8ac6..6ff453b8f 100644 --- a/bson/_cbsonmodule.h +++ b/bson/_cbsonmodule.h @@ -62,6 +62,7 @@ typedef struct codec_options_t { char* unicode_decode_error_handler; PyObject* tzinfo; type_registry_t type_registry; + unsigned char datetime_conversion; PyObject* options_obj; unsigned char is_raw_bson; } codec_options_t; diff --git a/bson/codec_options.py b/bson/codec_options.py index 4eaff59ea..a29c87892 100644 --- a/bson/codec_options.py +++ b/bson/codec_options.py @@ -16,6 +16,7 @@ import abc import datetime +import enum from collections.abc import MutableMapping as _MutableMapping from typing import ( Any, @@ -198,6 +199,16 @@ class TypeRegistry(object): ) +class DatetimeConversionOpts(enum.IntEnum): + DATETIME = 1 + DATETIME_CLAMP = 2 + DATETIME_MS = 3 + DATETIME_AUTO = 4 + + def __repr__(self): + return f"{self.value}" + + class _BaseCodecOptions(NamedTuple): document_class: Type[Mapping[str, Any]] tz_aware: bool @@ -205,6 +216,7 @@ class _BaseCodecOptions(NamedTuple): unicode_decode_error_handler: str tzinfo: Optional[datetime.tzinfo] type_registry: TypeRegistry + datetime_conversion: Optional[DatetimeConversionOpts] class CodecOptions(_BaseCodecOptions): @@ -268,7 +280,13 @@ class CodecOptions(_BaseCodecOptions): encoded/decoded. - `type_registry`: Instance of :class:`TypeRegistry` used to customize encoding and decoding behavior. - + - `datetime_conversion`: Specifies how UTC datetimes should be decoded + within BSON. Valid options include 'datetime_ms' to return as a + DatetimeMS, 'datetime' to return as a datetime.datetime and + raising a ValueError for out-of-range values, 'datetime_auto' to + return DatetimeMS objects when the underlying datetime is + out-of-range and 'datetime_clamp' to clamp to the minimum and + maximum possible datetimes. Defaults to 'datetime'. .. 
versionchanged:: 4.0 The default for `uuid_representation` was changed from :const:`~bson.binary.UuidRepresentation.PYTHON_LEGACY` to @@ -292,6 +310,7 @@ class CodecOptions(_BaseCodecOptions): unicode_decode_error_handler: str = "strict", tzinfo: Optional[datetime.tzinfo] = None, type_registry: Optional[TypeRegistry] = None, + datetime_conversion: Optional[DatetimeConversionOpts] = DatetimeConversionOpts.DATETIME, ) -> "CodecOptions": doc_class = document_class or dict # issubclass can raise TypeError for generic aliases like SON[str, Any]. @@ -336,6 +355,7 @@ class CodecOptions(_BaseCodecOptions): unicode_decode_error_handler, tzinfo, type_registry, + datetime_conversion, ), ) @@ -350,7 +370,7 @@ class CodecOptions(_BaseCodecOptions): return ( "document_class=%s, tz_aware=%r, uuid_representation=%s, " "unicode_decode_error_handler=%r, tzinfo=%r, " - "type_registry=%r" + "type_registry=%r, datetime_conversion=%r" % ( document_class_repr, self.tz_aware, @@ -358,6 +378,7 @@ class CodecOptions(_BaseCodecOptions): self.unicode_decode_error_handler, self.tzinfo, self.type_registry, + self.datetime_conversion, ) ) @@ -371,6 +392,7 @@ class CodecOptions(_BaseCodecOptions): "unicode_decode_error_handler": self.unicode_decode_error_handler, "tzinfo": self.tzinfo, "type_registry": self.type_registry, + "datetime_conversion": self.datetime_conversion, } def __repr__(self): @@ -406,6 +428,7 @@ def _parse_codec_options(options: Any) -> CodecOptions: "unicode_decode_error_handler", "tzinfo", "type_registry", + "datetime_conversion", }: if k == "uuidrepresentation": kwargs["uuid_representation"] = options[k] diff --git a/bson/codec_options.pyi b/bson/codec_options.pyi index 9d5f5c265..260407524 100644 --- a/bson/codec_options.pyi +++ b/bson/codec_options.pyi @@ -21,6 +21,7 @@ you get the error: "TypeError: 'type' object is not subscriptable". 
import datetime import abc +import enum from typing import Tuple, Generic, Optional, Mapping, Any, TypeVar, Type, Dict, Iterable, Tuple, MutableMapping, Callable, Union @@ -54,6 +55,11 @@ class TypeRegistry: _DocumentType = TypeVar("_DocumentType", bound=Mapping[str, Any]) +class DatetimeConversionOpts(int, enum.Enum): + DATETIME = ... + DATETIME_CLAMP = ... + DATETIME_MS = ... + DATETIME_AUTO = ... class CodecOptions(Tuple, Generic[_DocumentType]): document_class: Type[_DocumentType] @@ -62,6 +68,7 @@ class CodecOptions(Tuple, Generic[_DocumentType]): unicode_decode_error_handler: Optional[str] tzinfo: Optional[datetime.tzinfo] type_registry: TypeRegistry + datetime_conversion: Optional[int] def __new__( cls: Type[CodecOptions], @@ -71,6 +78,7 @@ class CodecOptions(Tuple, Generic[_DocumentType]): unicode_decode_error_handler: Optional[str] = ..., tzinfo: Optional[datetime.tzinfo] = ..., type_registry: Optional[TypeRegistry] = ..., + datetime_conversion: Optional[int] = ..., ) -> CodecOptions[_DocumentType]: ... # CodecOptions API diff --git a/bson/datetime_ms.py b/bson/datetime_ms.py new file mode 100644 index 000000000..f3e25ed05 --- /dev/null +++ b/bson/datetime_ms.py @@ -0,0 +1,157 @@ +# Copyright 2022-present MongoDB, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you +# may not use this file except in compliance with the License. You +# may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. 
+ +"""Tools for representing the BSON datetime type.""" + +import calendar +import datetime +import functools +from typing import Any, Union, cast + +from bson.codec_options import ( + DEFAULT_CODEC_OPTIONS, + CodecOptions, + DatetimeConversionOpts, +) +from bson.tz_util import utc + +EPOCH_AWARE = datetime.datetime.fromtimestamp(0, utc) +EPOCH_NAIVE = datetime.datetime.utcfromtimestamp(0) + + +class DatetimeMS: + __slots__ = ("_value",) + + def __init__(self, value: Union[int, datetime.datetime]): + """Represents a BSON UTC datetime. + + BSON UTC datetimes are defined as an int64 of milliseconds since the Unix + epoch. The principal use of DatetimeMS is to represent datetimes outside + the range of the Python builtin :class:`~datetime.datetime` class when + encoding/decoding BSON. + + To decode UTC datetimes as a ``DatetimeMS``,`datetime_conversion` in + :class:`~bson.CodecOptions` must be set to 'datetime_ms' or + 'datetime_auto'. See :ref:`handling-out-of-range-datetimes` for details. + + :Parameters: + - `value`: An instance of :class:`datetime.datetime` to be + represented as milliseconds since the Unix epoch, or int of + milliseconds since the Unix epoch. + + .. 
versionadded:: 4.3 + """ + if isinstance(value, int): + if not (-(2**63) <= value <= 2**63 - 1): + raise OverflowError("Must be a 64-bit integer of milliseconds") + self._value = value + elif isinstance(value, datetime.datetime): + self._value = _datetime_to_millis(value) + else: + raise TypeError(f"{type(value)} is not a valid type for DatetimeMS") + + def __hash__(self) -> int: + return hash(self._value) + + def __repr__(self) -> str: + return type(self).__name__ + "(" + str(self._value) + ")" + + def __lt__(self, other: Union["DatetimeMS", int]) -> bool: + return self._value < other + + def __le__(self, other: Union["DatetimeMS", int]) -> bool: + return self._value <= other + + def __eq__(self, other: Any) -> bool: + if isinstance(other, DatetimeMS): + return self._value == other._value + return False + + def __ne__(self, other: Any) -> bool: + if isinstance(other, DatetimeMS): + return self._value != other._value + return True + + def __gt__(self, other: Union["DatetimeMS", int]) -> bool: + return self._value > other + + def __ge__(self, other: Union["DatetimeMS", int]) -> bool: + return self._value >= other + + _type_marker = 9 + + def as_datetime(self, codec_options: CodecOptions = DEFAULT_CODEC_OPTIONS) -> datetime.datetime: + """Create a Python :class:`~datetime.datetime` from this DatetimeMS object. + + :Parameters: + - `codec_options`: A CodecOptions instance for specifying how the + resulting DatetimeMS object will be formatted using ``tz_aware`` + and ``tz_info``. Defaults to + :const:`~bson.codec_options.DEFAULT_CODEC_OPTIONS`. + """ + return cast(datetime.datetime, _millis_to_datetime(self._value, codec_options)) + + def __int__(self) -> int: + return self._value + + +# Inclusive and exclusive min and max for timezones. +# Timezones are hashed by their offset, which is a timedelta +# and therefore there are more than 24 possible timezones. 
+@functools.lru_cache(maxsize=None) +def _min_datetime_ms(tz=datetime.timezone.utc): + return _datetime_to_millis(datetime.datetime.min.replace(tzinfo=tz)) + + +@functools.lru_cache(maxsize=None) +def _max_datetime_ms(tz=datetime.timezone.utc): + return _datetime_to_millis(datetime.datetime.max.replace(tzinfo=tz)) + + +def _millis_to_datetime(millis: int, opts: CodecOptions) -> Union[datetime.datetime, DatetimeMS]: + """Convert milliseconds since epoch UTC to datetime.""" + if ( + opts.datetime_conversion == DatetimeConversionOpts.DATETIME + or opts.datetime_conversion == DatetimeConversionOpts.DATETIME_CLAMP + or opts.datetime_conversion == DatetimeConversionOpts.DATETIME_AUTO + ): + tz = opts.tzinfo or datetime.timezone.utc + if opts.datetime_conversion == DatetimeConversionOpts.DATETIME_CLAMP: + millis = max(_min_datetime_ms(tz), min(millis, _max_datetime_ms(tz))) + elif opts.datetime_conversion == DatetimeConversionOpts.DATETIME_AUTO: + if not (_min_datetime_ms(tz) <= millis <= _max_datetime_ms(tz)): + return DatetimeMS(millis) + + diff = ((millis % 1000) + 1000) % 1000 + seconds = (millis - diff) // 1000 + micros = diff * 1000 + + if opts.tz_aware: + dt = EPOCH_AWARE + datetime.timedelta(seconds=seconds, microseconds=micros) + if opts.tzinfo: + dt = dt.astimezone(tz) + return dt + else: + return EPOCH_NAIVE + datetime.timedelta(seconds=seconds, microseconds=micros) + elif opts.datetime_conversion == DatetimeConversionOpts.DATETIME_MS: + return DatetimeMS(millis) + else: + raise ValueError("datetime_conversion must be an element of DatetimeConversionOpts") + + +def _datetime_to_millis(dtm: datetime.datetime) -> int: + """Convert datetime to milliseconds since epoch UTC.""" + if dtm.utcoffset() is not None: + dtm = dtm - dtm.utcoffset() # type: ignore + return int(calendar.timegm(dtm.timetuple()) * 1000 + dtm.microsecond // 1000) diff --git a/bson/json_util.py b/bson/json_util.py index 369c3d5f4..0b5494e85 100644 --- a/bson/json_util.py +++ b/bson/json_util.py 
@@ -94,11 +94,16 @@ import re import uuid from typing import Any, Dict, Mapping, Optional, Sequence, Tuple, Type, Union, cast -import bson -from bson import EPOCH_AWARE from bson.binary import ALL_UUID_SUBTYPES, UUID_SUBTYPE, Binary, UuidRepresentation from bson.code import Code -from bson.codec_options import CodecOptions +from bson.codec_options import CodecOptions, DatetimeConversionOpts +from bson.datetime_ms import ( + EPOCH_AWARE, + DatetimeMS, + _datetime_to_millis, + _max_datetime_ms, + _millis_to_datetime, +) from bson.dbref import DBRef from bson.decimal128 import Decimal128 from bson.int64 import Int64 @@ -228,6 +233,14 @@ class JSONOptions(CodecOptions): - `tzinfo`: A :class:`datetime.tzinfo` subclass that specifies the timezone from which :class:`~datetime.datetime` objects should be decoded. Defaults to :const:`~bson.tz_util.utc`. + - `datetime_conversion`: Specifies how UTC datetimes should be decoded + within BSON. Valid options include 'datetime_ms' to return as a + DatetimeMS, 'datetime' to return as a datetime.datetime and + raising a ValueError for out-of-range values, 'datetime_auto' to + return DatetimeMS objects when the underlying datetime is + out-of-range and 'datetime_clamp' to clamp to the minimum and + maximum possible datetimes. Defaults to 'datetime'. See + :ref:`handling-out-of-range-datetimes` for details. 
- `args`: arguments to :class:`~bson.codec_options.CodecOptions` - `kwargs`: arguments to :class:`~bson.codec_options.CodecOptions` @@ -594,7 +607,9 @@ def _parse_canonical_binary(doc: Any, json_options: JSONOptions) -> Union[Binary return _binary_or_uuid(data, int(subtype, 16), json_options) -def _parse_canonical_datetime(doc: Any, json_options: JSONOptions) -> datetime.datetime: +def _parse_canonical_datetime( + doc: Any, json_options: JSONOptions +) -> Union[datetime.datetime, DatetimeMS]: """Decode a JSON datetime to python datetime.datetime.""" dtm = doc["$date"] if len(doc) != 1: @@ -647,10 +662,15 @@ def _parse_canonical_datetime(doc: Any, json_options: JSONOptions) -> datetime.d if json_options.tz_aware: if json_options.tzinfo: aware = aware.astimezone(json_options.tzinfo) + if json_options.datetime_conversion == DatetimeConversionOpts.DATETIME_MS: + return DatetimeMS(aware) return aware else: - return aware.replace(tzinfo=None) - return bson._millis_to_datetime(int(dtm), json_options) + aware_tzinfo_none = aware.replace(tzinfo=None) + if json_options.datetime_conversion == DatetimeConversionOpts.DATETIME_MS: + return DatetimeMS(aware_tzinfo_none) + return aware_tzinfo_none + return _millis_to_datetime(int(dtm), json_options) def _parse_canonical_oid(doc: Any) -> ObjectId: @@ -806,10 +826,19 @@ def default(obj: Any, json_options: JSONOptions = DEFAULT_JSON_OPTIONS) -> Any: "$date": "%s%s%s" % (obj.strftime("%Y-%m-%dT%H:%M:%S"), fracsecs, tz_string) } - millis = bson._datetime_to_millis(obj) + millis = _datetime_to_millis(obj) if json_options.datetime_representation == DatetimeRepresentation.LEGACY: return {"$date": millis} return {"$date": {"$numberLong": str(millis)}} + if isinstance(obj, DatetimeMS): + if ( + json_options.datetime_representation == DatetimeRepresentation.ISO8601 + and 0 <= int(obj) <= _max_datetime_ms() + ): + return default(obj.as_datetime(), json_options) + elif json_options.datetime_representation == DatetimeRepresentation.LEGACY: + 
return {"$date": str(int(obj))} + return {"$date": {"$numberLong": str(int(obj))}} if json_options.strict_number_long and isinstance(obj, Int64): return {"$numberLong": str(obj)} if isinstance(obj, (RE_TYPE, Regex)): diff --git a/doc/api/bson/datetime_ms.rst b/doc/api/bson/datetime_ms.rst new file mode 100644 index 000000000..254f115eb --- /dev/null +++ b/doc/api/bson/datetime_ms.rst @@ -0,0 +1,4 @@ +:mod:`datetime_ms` -- Support for BSON UTC Datetime +=================================================== +.. automodule:: bson.datetime_ms + :members: diff --git a/doc/api/bson/index.rst b/doc/api/bson/index.rst index 5f15ed99e..72baae68a 100644 --- a/doc/api/bson/index.rst +++ b/doc/api/bson/index.rst @@ -13,6 +13,7 @@ Sub-modules: binary code codec_options + datetime_ms dbref decimal128 errors diff --git a/doc/examples/datetimes.rst b/doc/examples/datetimes.rst index d712ce613..b9c509e07 100644 --- a/doc/examples/datetimes.rst +++ b/doc/examples/datetimes.rst @@ -102,3 +102,57 @@ out of MongoDB in US/Pacific time: >>> result = aware_times.find_one() datetime.datetime(2002, 10, 27, 6, 0, # doctest: +NORMALIZE_WHITESPACE tzinfo=) + +.. _handling-out-of-range-datetimes: + +Handling out of range datetimes +------------------------------- + +Python's :class:`~datetime.datetime` can only represent datetimes within the +range allowed by +:attr:`~datetime.datetime.min` and :attr:`~datetime.datetime.max`, whereas +the range of datetimes allowed in BSON can represent any 64-bit number +of milliseconds from the Unix epoch. To deal with this, we can use the +:class:`bson.datetime_ms.DatetimeMS` object, which is a wrapper for the +:class:`int` built-in. + +To decode UTC datetime values as :class:`~bson.datetime_ms.DatetimeMS`, +:class:`~bson.codec_options.CodecOptions` should have its +``datetime_conversion`` parameter set to one of the options available in +:class:`bson.datetime_ms.DatetimeConversionOpts`. 
These include +:attr:`~bson.codec_options.DatetimeConversionOpts.DATETIME`, +:attr:`~bson.codec_options.DatetimeConversionOpts.DATETIME_MS`, +:attr:`~bson.codec_options.DatetimeConversionOpts.DATETIME_AUTO`, +:attr:`~bson.codec_options.DatetimeConversionOpts.DATETIME_CLAMP`. +:attr:`~bson.codec_options.DatetimeConversionOpts.DATETIME` is the default +option and has the behavior of raising an exception upon attempting to +decode an out-of-range date. +:attr:`~bson.codec_options.DatetimeConversionOpts.DATETIME_MS` will only return +:class:`~bson.datetime_ms.DatetimeMS` objects, regardless of whether the +represented datetime is in- or out-of-range. +:attr:`~bson.codec_options.DatetimeConversionOpts.DATETIME_AUTO` will return +:class:`~datetime.datetime` if the underlying UTC datetime is within range, +or :class:`~bson.datetime_ms.DatetimeMS` if the underlying datetime +cannot be represented using the builtin Python :class:`~datetime.datetime`. +:attr:`~bson.codec_options.DatetimeConversionOpts.DATETIME_CLAMP` will clamp +resulting :class:`~datetime.datetime` objects to be within +:attr:`~datetime.datetime.min` and :attr:`~datetime.datetime.max` +(trimmed to `999000` microseconds). + +An example of encoding and decoding using `DATETIME_MS` is as follows: + +.. doctest:: + >>> from datetime import datetime + >>> from bson import encode, decode + >>> from bson.datetime_ms import DatetimeMS + >>> from bson.codec_options import CodecOptions,DatetimeConversionOpts + >>> x = encode({"x": datetime(1970, 1, 1)}) + >>> x + b'\x10\x00\x00\x00\tx\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' + >>> decode(x, codec_options=CodecOptions(datetime_conversion=DatetimeConversionOpts.DATETIME_MS)) + {'x': DatetimeMS(0)} + +:class:`~bson.datetime_ms.DatetimeMS` objects have support for rich comparison +methods against other instances of :class:`~bson.datetime_ms.DatetimeMS`. +They can also be converted to :class:`~datetime.datetime` objects with +:meth:`~bson.datetime_ms.DatetimeMS.as_datetime()`. 
diff --git a/pymongo/common.py b/pymongo/common.py index 6ffc97f2a..319b07193 100644 --- a/pymongo/common.py +++ b/pymongo/common.py @@ -36,7 +36,7 @@ from urllib.parse import unquote_plus from bson import SON from bson.binary import UuidRepresentation -from bson.codec_options import CodecOptions, TypeRegistry +from bson.codec_options import CodecOptions, DatetimeConversionOpts, TypeRegistry from bson.raw_bson import RawBSONDocument from pymongo.auth import MECHANISMS from pymongo.compression_support import ( @@ -620,6 +620,21 @@ def validate_auto_encryption_opts_or_none(option: Any, value: Any) -> Optional[A return value +def validate_datetime_conversion(option: Any, value: Any) -> Optional[DatetimeConversionOpts]: + """Validate a DatetimeConversionOpts string.""" + if value is None: + return DatetimeConversionOpts.DATETIME + + if isinstance(value, str): + if value.isdigit(): + return DatetimeConversionOpts(int(value)) + return DatetimeConversionOpts[value] + elif isinstance(value, int): + return DatetimeConversionOpts(value) + + raise TypeError("%s must be a str or int representing DatetimeConversionOpts" % (option,)) + + # Dictionary where keys are the names of public URI options, and values # are lists of aliases for that option. 
URI_OPTIONS_ALIAS_MAP: Dict[str, List[str]] = { @@ -684,6 +699,7 @@ NONSPEC_OPTIONS_VALIDATOR_MAP: Dict[str, Callable[[Any, Any], Any]] = { "uuidrepresentation": validate_uuid_representation, "waitqueuemultiple": validate_non_negative_integer_or_none, "waitqueuetimeoutms": validate_timeout_or_none, + "datetime_conversion": validate_datetime_conversion, } # Dictionary where keys are the names of keyword-only options for the diff --git a/pymongo/mongo_client.py b/pymongo/mongo_client.py index 080ae8757..fd4c0e84b 100644 --- a/pymongo/mongo_client.py +++ b/pymongo/mongo_client.py @@ -239,6 +239,14 @@ class MongoClient(common.BaseObject, Generic[_DocumentType]): - `type_registry` (optional): instance of :class:`~bson.codec_options.TypeRegistry` to enable encoding and decoding of custom types. + - `datetime_conversion`: Specifies how UTC datetimes should be decoded + within BSON. Valid options include 'datetime_ms' to return as a + DatetimeMS, 'datetime' to return as a datetime.datetime and + raising a ValueError for out-of-range values, 'datetime_auto' to + return DatetimeMS objects when the underlying datetime is + out-of-range and 'datetime_clamp' to clamp to the minimum and + maximum possible datetimes. Defaults to 'datetime'. See + :ref:`handling-out-of-range-datetimes` for details. 
| **Other optional parameters can be passed as keyword arguments:** diff --git a/test/test_bson.py b/test/test_bson.py index aa77954fa..0893000c0 100644 --- a/test/test_bson.py +++ b/test/test_bson.py @@ -38,7 +38,9 @@ import bson from bson import ( BSON, EPOCH_AWARE, + DatetimeMS, Regex, + _datetime_to_millis, decode, decode_all, decode_file_iter, @@ -48,7 +50,7 @@ from bson import ( ) from bson.binary import Binary, UuidRepresentation from bson.code import Code -from bson.codec_options import CodecOptions +from bson.codec_options import CodecOptions, DatetimeConversionOpts from bson.dbref import DBRef from bson.errors import InvalidBSON, InvalidDocument from bson.int64 import Int64 @@ -978,7 +980,7 @@ class TestCodecOptions(unittest.TestCase): "uuid_representation=UuidRepresentation.UNSPECIFIED, " "unicode_decode_error_handler='strict', " "tzinfo=None, type_registry=TypeRegistry(type_codecs=[], " - "fallback_encoder=None))" + "fallback_encoder=None), datetime_conversion=1)" ) self.assertEqual(r, repr(CodecOptions())) @@ -1153,5 +1155,169 @@ class TestCodecOptions(unittest.TestCase): self.assertTrue(decoded["_id"].generation_time) +class TestDatetimeConversion(unittest.TestCase): + def test_comps(self): + # Tests other timestamp formats. + # Test each of the rich comparison methods. + pairs = [ + (DatetimeMS(-1), DatetimeMS(1)), + (DatetimeMS(0), DatetimeMS(0)), + (DatetimeMS(1), DatetimeMS(-1)), + ] + + comp_ops = ["__lt__", "__le__", "__eq__", "__ne__", "__gt__", "__ge__"] + for lh, rh in pairs: + for op in comp_ops: + self.assertEqual(getattr(lh, op)(rh), getattr(lh._value, op)(rh._value)) + + def test_class_conversions(self): + # Test class conversions. + dtr1 = DatetimeMS(1234) + dt1 = dtr1.as_datetime() + self.assertEqual(dtr1, DatetimeMS(dt1)) + + dt2 = datetime.datetime(1969, 1, 1) + dtr2 = DatetimeMS(dt2) + self.assertEqual(dtr2.as_datetime(), dt2) + + # Test encode and decode without codec options. 
Expect: DatetimeMS => datetime + dtr1 = DatetimeMS(0) + enc1 = encode({"x": dtr1}) + dec1 = decode(enc1) + self.assertEqual(dec1["x"], datetime.datetime(1970, 1, 1)) + self.assertNotEqual(type(dtr1), type(dec1["x"])) + + # Test encode and decode with codec options. Expect: DatetimeMS => DatetimeMS + opts1 = CodecOptions(datetime_conversion=DatetimeConversionOpts.DATETIME_MS) + enc1 = encode({"x": dtr1}) + dec1 = decode(enc1, opts1) + self.assertEqual(type(dtr1), type(dec1["x"])) + self.assertEqual(dtr1, dec1["x"]) + + # Expect: datetime => DatetimeMS + opts1 = CodecOptions(datetime_conversion=DatetimeConversionOpts.DATETIME_MS) + dt1 = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc) + enc1 = encode({"x": dt1}) + dec1 = decode(enc1, opts1) + self.assertEqual(dec1["x"], DatetimeMS(0)) + self.assertNotEqual(dt1, type(dec1["x"])) + + def test_clamping(self): + # Test clamping from below and above. + opts1 = CodecOptions( + datetime_conversion=DatetimeConversionOpts.DATETIME_CLAMP, + tz_aware=True, + tzinfo=datetime.timezone.utc, + ) + below = encode({"x": DatetimeMS(_datetime_to_millis(datetime.datetime.min) - 1)}) + dec_below = decode(below, opts1) + self.assertEqual( + dec_below["x"], datetime.datetime.min.replace(tzinfo=datetime.timezone.utc) + ) + + above = encode({"x": DatetimeMS(_datetime_to_millis(datetime.datetime.max) + 1)}) + dec_above = decode(above, opts1) + self.assertEqual( + dec_above["x"], + datetime.datetime.max.replace(tzinfo=datetime.timezone.utc, microsecond=999000), + ) + + def test_tz_clamping(self): + # Naive clamping to local tz.
+ opts1 = CodecOptions( + datetime_conversion=DatetimeConversionOpts.DATETIME_CLAMP, tz_aware=False + ) + below = encode({"x": DatetimeMS(_datetime_to_millis(datetime.datetime.min) - 24 * 60 * 60)}) + + dec_below = decode(below, opts1) + self.assertEqual(dec_below["x"], datetime.datetime.min) + + above = encode({"x": DatetimeMS(_datetime_to_millis(datetime.datetime.max) + 24 * 60 * 60)}) + dec_above = decode(above, opts1) + self.assertEqual( + dec_above["x"], + datetime.datetime.max.replace(microsecond=999000), + ) + + # Aware clamping. + opts2 = CodecOptions( + datetime_conversion=DatetimeConversionOpts.DATETIME_CLAMP, tz_aware=True + ) + below = encode({"x": DatetimeMS(_datetime_to_millis(datetime.datetime.min) - 24 * 60 * 60)}) + dec_below = decode(below, opts2) + self.assertEqual( + dec_below["x"], datetime.datetime.min.replace(tzinfo=datetime.timezone.utc) + ) + + above = encode({"x": DatetimeMS(_datetime_to_millis(datetime.datetime.max) + 24 * 60 * 60)}) + dec_above = decode(above, opts2) + self.assertEqual( + dec_above["x"], + datetime.datetime.max.replace(tzinfo=datetime.timezone.utc, microsecond=999000), + ) + + def test_datetime_auto(self): + # Naive auto, in range. + opts1 = CodecOptions(datetime_conversion=DatetimeConversionOpts.DATETIME_AUTO) + inr = encode({"x": datetime.datetime(1970, 1, 1)}, codec_options=opts1) + dec_inr = decode(inr) + self.assertEqual(dec_inr["x"], datetime.datetime(1970, 1, 1)) + + # Naive auto, below range. + below = encode({"x": DatetimeMS(_datetime_to_millis(datetime.datetime.min) - 24 * 60 * 60)}) + dec_below = decode(below, opts1) + self.assertEqual( + dec_below["x"], DatetimeMS(_datetime_to_millis(datetime.datetime.min) - 24 * 60 * 60) + ) + + # Naive auto, above range. 
+ above = encode({"x": DatetimeMS(_datetime_to_millis(datetime.datetime.max) + 24 * 60 * 60)}) + dec_above = decode(above, opts1) + self.assertEqual( + dec_above["x"], + DatetimeMS(_datetime_to_millis(datetime.datetime.max) + 24 * 60 * 60), + ) + + # Aware auto, in range. + opts2 = CodecOptions( + datetime_conversion=DatetimeConversionOpts.DATETIME_AUTO, + tz_aware=True, + tzinfo=datetime.timezone.utc, + ) + inr = encode({"x": datetime.datetime(1970, 1, 1)}, codec_options=opts2) + dec_inr = decode(inr) + self.assertEqual(dec_inr["x"], datetime.datetime(1970, 1, 1)) + + # Aware auto, below range. + below = encode({"x": DatetimeMS(_datetime_to_millis(datetime.datetime.min) - 24 * 60 * 60)}) + dec_below = decode(below, opts2) + self.assertEqual( + dec_below["x"], DatetimeMS(_datetime_to_millis(datetime.datetime.min) - 24 * 60 * 60) + ) + + # Aware auto, above range. + above = encode({"x": DatetimeMS(_datetime_to_millis(datetime.datetime.max) + 24 * 60 * 60)}) + dec_above = decode(above, opts2) + self.assertEqual( + dec_above["x"], + DatetimeMS(_datetime_to_millis(datetime.datetime.max) + 24 * 60 * 60), + ) + + def test_millis_from_datetime_ms(self): + # Test 65+ bit integer conversion, expect OverflowError. + big_ms = 2**65 + with self.assertRaises(OverflowError): + encode({"x": DatetimeMS(big_ms)}) + + # Subclass of DatetimeMS w/ __int__ override, expect an Error. 
+ class DatetimeMSOverride(DatetimeMS): + def __int__(self): + return float(self._value) + + float_ms = DatetimeMSOverride(2) + with self.assertRaises(TypeError): + encode({"x": float_ms}) + + if __name__ == "__main__": unittest.main() diff --git a/test/test_client.py b/test/test_client.py index 3630cec06..f520043ec 100644 --- a/test/test_client.py +++ b/test/test_client.py @@ -65,7 +65,12 @@ from test.utils import ( import pymongo from bson import encode -from bson.codec_options import CodecOptions, TypeEncoder, TypeRegistry +from bson.codec_options import ( + CodecOptions, + DatetimeConversionOpts, + TypeEncoder, + TypeRegistry, +) from bson.son import SON from bson.tz_util import utc from pymongo import event_loggers, message, monitoring @@ -386,14 +391,17 @@ class ClientUnitTest(unittest.TestCase): # Ensure codec options are passed in correctly uuid_representation_label = "javaLegacy" unicode_decode_error_handler = "ignore" + datetime_conversion = "DATETIME_CLAMP" uri = ( "mongodb://%s:%d/foo?tz_aware=true&uuidrepresentation=" "%s&unicode_decode_error_handler=%s" + "&datetime_conversion=%s" % ( client_context.host, client_context.port, uuid_representation_label, unicode_decode_error_handler, + datetime_conversion, ) ) c = MongoClient(uri, connect=False) @@ -403,6 +411,19 @@ class ClientUnitTest(unittest.TestCase): c.codec_options.uuid_representation, _UUID_REPRESENTATIONS[uuid_representation_label] ) self.assertEqual(c.codec_options.unicode_decode_error_handler, unicode_decode_error_handler) + self.assertEqual( + c.codec_options.datetime_conversion, DatetimeConversionOpts[datetime_conversion] + ) + + # Change the passed datetime_conversion to a number and re-assert. 
+ uri = uri.replace( + datetime_conversion, f"{int(DatetimeConversionOpts[datetime_conversion])}" + ) + c = MongoClient(uri, connect=False) + + self.assertEqual( + c.codec_options.datetime_conversion, DatetimeConversionOpts[datetime_conversion] + ) def test_uri_option_precedence(self): # Ensure kwarg options override connection string options. diff --git a/test/test_json_util.py b/test/test_json_util.py index ee5b7abb4..576746e86 100644 --- a/test/test_json_util.py +++ b/test/test_json_util.py @@ -21,11 +21,13 @@ import sys import uuid from typing import Any, List, MutableMapping +from bson.codec_options import CodecOptions, DatetimeConversionOpts + sys.path[0:0] = [""] from test import IntegrationTest, unittest -from bson import EPOCH_AWARE, EPOCH_NAIVE, SON, json_util +from bson import EPOCH_AWARE, EPOCH_NAIVE, SON, DatetimeMS, json_util from bson.binary import ( ALL_UUID_REPRESENTATIONS, MD5_SUBTYPE, @@ -35,6 +37,7 @@ from bson.binary import ( UuidRepresentation, ) from bson.code import Code +from bson.datetime_ms import _max_datetime_ms from bson.dbref import DBRef from bson.int64 import Int64 from bson.json_util import ( @@ -241,6 +244,69 @@ class TestJsonUtil(unittest.TestCase): ), ) + def test_datetime_ms(self): + # Test ISO8601 in-range + dat_min = {"x": DatetimeMS(0)} + dat_max = {"x": DatetimeMS(_max_datetime_ms())} + opts = JSONOptions(datetime_representation=DatetimeRepresentation.ISO8601) + + self.assertEqual( + dat_min["x"].as_datetime(CodecOptions(tz_aware=False)), + json_util.loads(json_util.dumps(dat_min))["x"], + ) + self.assertEqual( + dat_max["x"].as_datetime(CodecOptions(tz_aware=False)), + json_util.loads(json_util.dumps(dat_max))["x"], + ) + + # Test ISO8601 out-of-range + dat_min = {"x": DatetimeMS(-1)} + dat_max = {"x": DatetimeMS(_max_datetime_ms() + 1)} + + self.assertEqual('{"x": {"$date": {"$numberLong": "-1"}}}', json_util.dumps(dat_min)) + self.assertEqual( + '{"x": {"$date": {"$numberLong": "' + str(int(dat_max["x"])) + '"}}}', + 
json_util.dumps(dat_max), + ) + # Test legacy. + opts = JSONOptions( + datetime_representation=DatetimeRepresentation.LEGACY, json_mode=JSONMode.LEGACY + ) + self.assertEqual('{"x": {"$date": "-1"}}', json_util.dumps(dat_min, json_options=opts)) + self.assertEqual( + '{"x": {"$date": "' + str(int(dat_max["x"])) + '"}}', + json_util.dumps(dat_max, json_options=opts), + ) + + # Test regular. + opts = JSONOptions( + datetime_representation=DatetimeRepresentation.NUMBERLONG, json_mode=JSONMode.LEGACY + ) + self.assertEqual( + '{"x": {"$date": {"$numberLong": "-1"}}}', json_util.dumps(dat_min, json_options=opts) + ) + self.assertEqual( + '{"x": {"$date": {"$numberLong": "' + str(int(dat_max["x"])) + '"}}}', + json_util.dumps(dat_max, json_options=opts), + ) + + # Test decode from datetime.datetime to DatetimeMS + dat_min = {"x": datetime.datetime.min} + dat_max = {"x": DatetimeMS(_max_datetime_ms()).as_datetime(CodecOptions(tz_aware=False))} + opts = JSONOptions( + datetime_representation=DatetimeRepresentation.ISO8601, + datetime_conversion=DatetimeConversionOpts.DATETIME_MS, + ) + + self.assertEqual( + DatetimeMS(dat_min["x"]), + json_util.loads(json_util.dumps(dat_min), json_options=opts)["x"], + ) + self.assertEqual( + DatetimeMS(dat_max["x"]), + json_util.loads(json_util.dumps(dat_max), json_options=opts)["x"], + ) + def test_regex_object_hook(self): # Extended JSON format regular expression. pat = "a*b"