[DRIVERS-2926] [PYTHON-4577] BSON Binary Vector Subtype Support (#1813)
Co-authored-by: Steven Silvester <steve.silvester@mongodb.com> Co-authored-by: Steven Silvester <steven.silvester@ieee.org>
This commit is contained in:
parent
545b88cbd3
commit
ae6cfd6d10
@ -76,6 +76,9 @@ do
|
||||
atlas-data-lake-testing|data_lake)
|
||||
cpjson atlas-data-lake-testing/tests/ data_lake
|
||||
;;
|
||||
bson-binary-vector|bson_binary_vector)
|
||||
cpjson bson-binary-vector/tests/ bson_binary_vector
|
||||
;;
|
||||
bson-corpus|bson_corpus)
|
||||
cpjson bson-corpus/tests/ bson_corpus
|
||||
;;
|
||||
|
||||
152
bson/binary.py
152
bson/binary.py
@ -13,7 +13,10 @@
|
||||
# limitations under the License.
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Any, Tuple, Type, Union
|
||||
import struct
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import TYPE_CHECKING, Any, Sequence, Tuple, Type, Union
|
||||
from uuid import UUID
|
||||
|
||||
"""Tools for representing BSON binary data.
|
||||
@ -191,21 +194,75 @@ SENSITIVE_SUBTYPE = 8
|
||||
"""
|
||||
|
||||
|
||||
VECTOR_SUBTYPE = 9
|
||||
"""**(BETA)** BSON binary subtype for densely packed vector data.
|
||||
|
||||
.. versionadded:: 4.10
|
||||
"""
|
||||
|
||||
|
||||
USER_DEFINED_SUBTYPE = 128
|
||||
"""BSON binary subtype for any user defined structure.
|
||||
"""
|
||||
|
||||
|
||||
class BinaryVectorDtype(Enum):
    """**(BETA)** Datatypes of vector subtype.

    :param FLOAT32: (0x27) Pack list of :class:`float` as float32
    :param INT8: (0x03) Pack list of :class:`int` in [-128, 127] as signed int8
    :param PACKED_BIT: (0x10) Pack list of :class:`int` in [0, 255] as unsigned uint8

    The `PACKED_BIT` value represents a special case where vector values themselves
    can only be of two values (0 or 1) but these are packed together into groups of 8,
    a byte. In Python, these are displayed as ints in range [0, 255].

    Each member's value is of type :class:`bytes` with a length of one.

    .. versionadded:: 4.10
    """

    # Signed 8-bit integers, one element per byte.
    INT8 = b"\x03"
    # 32-bit floats, four bytes per element.
    FLOAT32 = b"\x27"
    # Single bits, eight elements per byte.
    PACKED_BIT = b"\x10"
|
||||
|
||||
|
||||
@dataclass
class BinaryVector:
    """**(BETA)** Vector of numbers along with metadata for binary interoperability.

    Two instances compare equal when they are of the same class and their
    ``data``, ``dtype`` and ``padding`` are all equal.

    .. versionadded:: 4.10
    """

    __slots__ = ("data", "dtype", "padding")

    def __init__(self, data: Sequence[float | int], dtype: BinaryVectorDtype, padding: int = 0):
        """
        :param data: Sequence of numbers representing the mathematical vector.
        :param dtype: The data type stored in binary
        :param padding: The number of bits in the final byte that are to be ignored
          when a vector element's size is less than a byte
          and the length of the vector is not a multiple of 8.
        """
        self.data = data
        self.dtype = dtype
        self.padding = padding

    # NOTE: the class declares no annotated fields, so the ``__repr__`` and
    # ``__eq__`` that ``@dataclass`` would generate cover zero fields: every
    # instance would print as ``BinaryVector()`` and compare equal to every
    # other instance. ``@dataclass`` never replaces methods defined in the
    # class body, so the explicit versions below take effect.

    def __repr__(self) -> str:
        """Return a representation that shows all three attributes."""
        return (
            f"{type(self).__name__}(data={self.data!r}, "
            f"dtype={self.dtype!r}, padding={self.padding!r})"
        )

    def __eq__(self, other: object) -> bool:
        """Compare attribute by attribute; defer to ``other`` for foreign types."""
        if other.__class__ is not self.__class__:
            return NotImplemented
        return (self.data, self.dtype, self.padding) == (
            other.data,
            other.dtype,
            other.padding,
        )
|
||||
|
||||
|
||||
class Binary(bytes):
|
||||
"""Representation of BSON binary data.
|
||||
|
||||
This is necessary because we want to represent Python strings as
|
||||
the BSON string type. We need to wrap binary data so we can tell
|
||||
We want to represent Python strings as the BSON string type.
|
||||
We need to wrap binary data so that we can tell
|
||||
the difference between what should be considered binary data and
|
||||
what should be considered a string when we encode to BSON.
|
||||
|
||||
Raises TypeError if `data` is not an instance of :class:`bytes`
|
||||
or `subtype` is not an instance of :class:`int`.
|
||||
**(BETA)** Subtype 9 provides a space-efficient representation of 1-dimensional vector data.
|
||||
Its data is prepended with two bytes of metadata.
|
||||
The first (dtype) describes its data type, such as float32 or int8.
|
||||
The second (padding) prescribes the number of bits to ignore in the final byte.
|
||||
This is relevant when the element size of the dtype is not a multiple of 8.
|
||||
|
||||
Raises TypeError if `subtype` is not an instance of :class:`int`.
|
||||
Raises ValueError if `subtype` is not in [0, 256).
|
||||
|
||||
.. note::
|
||||
@ -218,7 +275,10 @@ class Binary(bytes):
|
||||
to use
|
||||
|
||||
.. versionchanged:: 3.9
|
||||
Support any bytes-like type that implements the buffer protocol.
|
||||
Support any bytes-like type that implements the buffer protocol.
|
||||
|
||||
.. versionchanged:: 4.10
|
||||
**(BETA)** Addition of vector subtype.
|
||||
"""
|
||||
|
||||
_type_marker = 5
|
||||
@ -337,6 +397,86 @@ class Binary(bytes):
|
||||
f"cannot decode subtype {self.subtype} to {UUID_REPRESENTATION_NAMES[uuid_representation]}"
|
||||
)
|
||||
|
||||
@classmethod
def from_vector(
    cls: Type[Binary],
    vector: Sequence[float | int],
    dtype: BinaryVectorDtype,
    padding: int = 0,
) -> Binary:
    """**(BETA)** Create a BSON :class:`~bson.binary.Binary` of Vector subtype from a list of Numbers.

    To interpret the representation of the numbers, a data type must be included.
    See :class:`~bson.binary.BinaryVectorDtype` for available types and descriptions.

    The dtype and padding are prepended to the binary data's value.
    The payload is always packed little-endian, independent of the host
    platform's native byte order.

    :param vector: List of values
    :param dtype: Data type of the values
    :param padding: For fractional bytes, number of bits to ignore at end of vector.
      Only meaningful for ``PACKED_BIT``, where it must be in [0, 7].
    :return: Binary packed data identified by dtype and padding.
    :raises ValueError: if `padding` is non-zero for ``INT8`` or ``FLOAT32``,
      is outside [0, 7] for ``PACKED_BIT``, or is non-zero for an empty vector.
    :raises struct.error: if a value cannot be packed as the requested dtype.
    :raises NotImplementedError: if `dtype` is not one of the supported types.

    .. versionadded:: 4.10
    """
    if dtype == BinaryVectorDtype.INT8:  # pack ints in [-128, 127] as signed int8
        format_str = "b"
        if padding:
            raise ValueError(f"padding does not apply to {dtype=}")
    elif dtype == BinaryVectorDtype.PACKED_BIT:  # pack ints in [0, 255] as unsigned uint8
        format_str = "B"
        # Padding counts ignored bits within the final byte, so at most 7.
        if not 0 <= padding <= 7:
            raise ValueError(f"{padding=}. It must be in [0, 7].")
        # There is no final byte to ignore bits in when the vector is empty.
        if padding and len(vector) == 0:
            raise ValueError("Empty vector with non-zero padding.")
    elif dtype == BinaryVectorDtype.FLOAT32:  # pack floats as float32
        format_str = "f"
        if padding:
            raise ValueError(f"padding does not apply to {dtype=}")
    else:
        raise NotImplementedError("%s not yet supported" % dtype)

    metadata = struct.pack("<sB", dtype.value, padding)
    # Explicit "<": without a prefix struct uses native byte order, which
    # would emit big-endian float32 values on big-endian hosts.
    data = struct.pack(f"<{len(vector)}{format_str}", *vector)
    return cls(metadata + data, subtype=VECTOR_SUBTYPE)
|
||||
|
||||
def as_vector(self) -> BinaryVector:
    """**(BETA)** From the Binary, create a list of numbers, along with dtype and padding.

    The first two bytes are read as metadata (dtype, padding); the remaining
    bytes are unpacked little-endian according to the dtype. For
    ``PACKED_BIT`` the bytes are returned as ints in [0, 255] and the
    padding is reported but not applied.

    :return: BinaryVector
    :raises ValueError: if this Binary is not of the vector subtype, if the
      dtype byte is not a :class:`~bson.binary.BinaryVectorDtype` value, or
      if a float32 payload's length is not a multiple of 4.
    :raises struct.error: if fewer than two metadata bytes are present.

    .. versionadded:: 4.10
    """

    if self.subtype != VECTOR_SUBTYPE:
        raise ValueError(f"Cannot decode subtype {self.subtype} as a vector.")

    position = 0
    dtype, padding = struct.unpack_from("<sB", self, position)
    position += 2
    dtype = BinaryVectorDtype(dtype)
    n_bytes = len(self) - position

    # Every payload format below carries an explicit "<" so decoding does
    # not depend on the host's native byte order.
    if dtype == BinaryVectorDtype.INT8:
        vector = list(struct.unpack_from(f"<{n_bytes}b", self, position))
        return BinaryVector(vector, dtype, padding)

    elif dtype == BinaryVectorDtype.FLOAT32:
        if n_bytes % 4:
            raise ValueError(
                "Corrupt data. N bytes for a float32 vector must be a multiple of 4."
            )
        n_values = n_bytes // 4
        vector = list(struct.unpack_from(f"<{n_values}f", self, position))
        return BinaryVector(vector, dtype, padding)

    elif dtype == BinaryVectorDtype.PACKED_BIT:
        # data packed as uint8
        unpacked_uint8s = list(struct.unpack_from(f"<{n_bytes}B", self, position))
        return BinaryVector(unpacked_uint8s, dtype, padding)

    else:
        raise NotImplementedError("Binary Vector dtype %s not yet supported" % dtype.name)
|
||||
|
||||
@property
|
||||
def subtype(self) -> int:
|
||||
"""Subtype of this binary data."""
|
||||
|
||||
@ -21,6 +21,14 @@
|
||||
.. autoclass:: UuidRepresentation
|
||||
:members:
|
||||
|
||||
.. autoclass:: BinaryVectorDtype
|
||||
:members:
|
||||
:show-inheritance:
|
||||
|
||||
.. autoclass:: BinaryVector
|
||||
:members:
|
||||
|
||||
|
||||
.. autoclass:: Binary(data, subtype=BINARY_SUBTYPE)
|
||||
:members:
|
||||
:show-inheritance:
|
||||
|
||||
@ -19,7 +19,6 @@ in this release.
|
||||
|
||||
.. _PyMongo 4.10 release notes in JIRA: https://jira.mongodb.org/secure/ReleaseNote.jspa?projectId=10004&version=40553
|
||||
|
||||
|
||||
Changes in Version 4.9.0
|
||||
-------------------------
|
||||
|
||||
|
||||
42
test/bson_binary_vector/float32.json
Normal file
42
test/bson_binary_vector/float32.json
Normal file
@ -0,0 +1,42 @@
|
||||
{
|
||||
"description": "Tests of Binary subtype 9, Vectors, with dtype FLOAT32",
|
||||
"test_key": "vector",
|
||||
"tests": [
|
||||
{
|
||||
"description": "Simple Vector FLOAT32",
|
||||
"valid": true,
|
||||
"vector": [127.0, 7.0],
|
||||
"dtype_hex": "0x27",
|
||||
"dtype_alias": "FLOAT32",
|
||||
"padding": 0,
|
||||
"canonical_bson": "1C00000005766563746F72000A0000000927000000FE420000E04000"
|
||||
},
|
||||
{
|
||||
"description": "Empty Vector FLOAT32",
|
||||
"valid": true,
|
||||
"vector": [],
|
||||
"dtype_hex": "0x27",
|
||||
"dtype_alias": "FLOAT32",
|
||||
"padding": 0,
|
||||
"canonical_bson": "1400000005766563746F72000200000009270000"
|
||||
},
|
||||
{
|
||||
"description": "Infinity Vector FLOAT32",
|
||||
"valid": true,
|
||||
"vector": ["-inf", 0.0, "inf"],
|
||||
"dtype_hex": "0x27",
|
||||
"dtype_alias": "FLOAT32",
|
||||
"padding": 0,
|
||||
"canonical_bson": "2000000005766563746F72000E000000092700000080FF000000000000807F00"
|
||||
},
|
||||
{
|
||||
"description": "FLOAT32 with padding",
|
||||
"valid": false,
|
||||
"vector": [127.0, 7.0],
|
||||
"dtype_hex": "0x27",
|
||||
"dtype_alias": "FLOAT32",
|
||||
"padding": 3
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
57
test/bson_binary_vector/int8.json
Normal file
57
test/bson_binary_vector/int8.json
Normal file
@ -0,0 +1,57 @@
|
||||
{
|
||||
"description": "Tests of Binary subtype 9, Vectors, with dtype INT8",
|
||||
"test_key": "vector",
|
||||
"tests": [
|
||||
{
|
||||
"description": "Simple Vector INT8",
|
||||
"valid": true,
|
||||
"vector": [127, 7],
|
||||
"dtype_hex": "0x03",
|
||||
"dtype_alias": "INT8",
|
||||
"padding": 0,
|
||||
"canonical_bson": "1600000005766563746F7200040000000903007F0700"
|
||||
},
|
||||
{
|
||||
"description": "Empty Vector INT8",
|
||||
"valid": true,
|
||||
"vector": [],
|
||||
"dtype_hex": "0x03",
|
||||
"dtype_alias": "INT8",
|
||||
"padding": 0,
|
||||
"canonical_bson": "1400000005766563746F72000200000009030000"
|
||||
},
|
||||
{
|
||||
"description": "Overflow Vector INT8",
|
||||
"valid": false,
|
||||
"vector": [128],
|
||||
"dtype_hex": "0x03",
|
||||
"dtype_alias": "INT8",
|
||||
"padding": 0
|
||||
},
|
||||
{
|
||||
"description": "Underflow Vector INT8",
|
||||
"valid": false,
|
||||
"vector": [-129],
|
||||
"dtype_hex": "0x03",
|
||||
"dtype_alias": "INT8",
|
||||
"padding": 0
|
||||
},
|
||||
{
|
||||
"description": "INT8 with padding",
|
||||
"valid": false,
|
||||
"vector": [127, 7],
|
||||
"dtype_hex": "0x03",
|
||||
"dtype_alias": "INT8",
|
||||
"padding": 3
|
||||
},
|
||||
{
|
||||
"description": "INT8 with float inputs",
|
||||
"valid": false,
|
||||
"vector": [127.77, 7.77],
|
||||
"dtype_hex": "0x03",
|
||||
"dtype_alias": "INT8",
|
||||
"padding": 0
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
50
test/bson_binary_vector/packed_bit.json
Normal file
50
test/bson_binary_vector/packed_bit.json
Normal file
@ -0,0 +1,50 @@
|
||||
{
|
||||
"description": "Tests of Binary subtype 9, Vectors, with dtype PACKED_BIT",
|
||||
"test_key": "vector",
|
||||
"tests": [
|
||||
{
|
||||
"description": "Simple Vector PACKED_BIT",
|
||||
"valid": true,
|
||||
"vector": [127, 7],
|
||||
"dtype_hex": "0x10",
|
||||
"dtype_alias": "PACKED_BIT",
|
||||
"padding": 0,
|
||||
"canonical_bson": "1600000005766563746F7200040000000910007F0700"
|
||||
},
|
||||
{
|
||||
"description": "Empty Vector PACKED_BIT",
|
||||
"valid": true,
|
||||
"vector": [],
|
||||
"dtype_hex": "0x10",
|
||||
"dtype_alias": "PACKED_BIT",
|
||||
"padding": 0,
|
||||
"canonical_bson": "1400000005766563746F72000200000009100000"
|
||||
},
|
||||
{
|
||||
"description": "PACKED_BIT with padding",
|
||||
"valid": true,
|
||||
"vector": [127, 7],
|
||||
"dtype_hex": "0x10",
|
||||
"dtype_alias": "PACKED_BIT",
|
||||
"padding": 3,
|
||||
"canonical_bson": "1600000005766563746F7200040000000910037F0700"
|
||||
},
|
||||
{
|
||||
"description": "Overflow Vector PACKED_BIT",
|
||||
"valid": false,
|
||||
"vector": [256],
|
||||
"dtype_hex": "0x10",
|
||||
"dtype_alias": "PACKED_BIT",
|
||||
"padding": 0
|
||||
},
|
||||
{
|
||||
"description": "Underflow Vector PACKED_BIT",
|
||||
"valid": false,
|
||||
"vector": [-1],
|
||||
"dtype_hex": "0x10",
|
||||
"dtype_alias": "PACKED_BIT",
|
||||
"padding": 0
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
@ -74,6 +74,36 @@
|
||||
"description": "$type query operator (conflicts with legacy $binary form with $type field)",
|
||||
"canonical_bson": "180000000378001000000010247479706500020000000000",
|
||||
"canonical_extjson": "{\"x\" : { \"$type\" : {\"$numberInt\": \"2\"}}}"
|
||||
},
|
||||
{
|
||||
"description": "subtype 0x09 Vector FLOAT32",
|
||||
"canonical_bson": "170000000578000A0000000927000000FE420000E04000",
|
||||
"canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"JwAAAP5CAADgQA==\", \"subType\": \"09\"}}}"
|
||||
},
|
||||
{
|
||||
"description": "subtype 0x09 Vector INT8",
|
||||
"canonical_bson": "11000000057800040000000903007F0700",
|
||||
"canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"AwB/Bw==\", \"subType\": \"09\"}}}"
|
||||
},
|
||||
{
|
||||
"description": "subtype 0x09 Vector PACKED_BIT",
|
||||
"canonical_bson": "11000000057800040000000910007F0700",
|
||||
"canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"EAB/Bw==\", \"subType\": \"09\"}}}"
|
||||
},
|
||||
{
|
||||
"description": "subtype 0x09 Vector (Zero-length) FLOAT32",
|
||||
"canonical_bson": "0F0000000578000200000009270000",
|
||||
"canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"JwA=\", \"subType\": \"09\"}}}"
|
||||
},
|
||||
{
|
||||
"description": "subtype 0x09 Vector (Zero-length) INT8",
|
||||
"canonical_bson": "0F0000000578000200000009030000",
|
||||
"canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"AwA=\", \"subType\": \"09\"}}}"
|
||||
},
|
||||
{
|
||||
"description": "subtype 0x09 Vector (Zero-length) PACKED_BIT",
|
||||
"canonical_bson": "0F0000000578000200000009100000",
|
||||
"canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"EAA=\", \"subType\": \"09\"}}}"
|
||||
}
|
||||
],
|
||||
"decodeErrors": [
|
||||
|
||||
@ -49,8 +49,9 @@ from bson import (
|
||||
decode_iter,
|
||||
encode,
|
||||
is_valid,
|
||||
json_util,
|
||||
)
|
||||
from bson.binary import USER_DEFINED_SUBTYPE, Binary, UuidRepresentation
|
||||
from bson.binary import USER_DEFINED_SUBTYPE, Binary, BinaryVectorDtype, UuidRepresentation
|
||||
from bson.code import Code
|
||||
from bson.codec_options import CodecOptions, DatetimeConversion
|
||||
from bson.datetime_ms import _DATETIME_ERROR_SUGGESTION
|
||||
@ -148,6 +149,9 @@ class TestBSON(unittest.TestCase):
|
||||
helper({"a binary": Binary(b"test", 128)})
|
||||
helper({"a binary": Binary(b"test", 254)})
|
||||
helper({"another binary": Binary(b"test", 2)})
|
||||
helper({"binary packed bit vector": Binary(b"\x10\x00\x7f\x07", 9)})
|
||||
helper({"binary int8 vector": Binary(b"\x03\x00\x7f\x07", 9)})
|
||||
helper({"binary float32 vector": Binary(b"'\x00\x00\x00\xfeB\x00\x00\xe0@", 9)})
|
||||
helper(SON([("test dst", datetime.datetime(1993, 4, 4, 2))]))
|
||||
helper(SON([("test negative dst", datetime.datetime(1, 1, 1, 1, 1, 1))]))
|
||||
helper({"big float": float(10000000000)})
|
||||
@ -447,6 +451,20 @@ class TestBSON(unittest.TestCase):
|
||||
encode({"test": Binary(b"test", 128)}),
|
||||
b"\x14\x00\x00\x00\x05\x74\x65\x73\x74\x00\x04\x00\x00\x00\x80\x74\x65\x73\x74\x00",
|
||||
)
|
||||
self.assertEqual(
|
||||
encode({"vector_int8": Binary.from_vector([-128, -1, 127], BinaryVectorDtype.INT8)}),
|
||||
b"\x1c\x00\x00\x00\x05vector_int8\x00\x05\x00\x00\x00\t\x03\x00\x80\xff\x7f\x00",
|
||||
)
|
||||
self.assertEqual(
|
||||
encode({"vector_bool": Binary.from_vector([1, 127], BinaryVectorDtype.PACKED_BIT)}),
|
||||
b"\x1b\x00\x00\x00\x05vector_bool\x00\x04\x00\x00\x00\t\x10\x00\x01\x7f\x00",
|
||||
)
|
||||
self.assertEqual(
|
||||
encode(
|
||||
{"vector_float32": Binary.from_vector([-1.1, 1.1e10], BinaryVectorDtype.FLOAT32)}
|
||||
),
|
||||
b"$\x00\x00\x00\x05vector_float32\x00\n\x00\x00\x00\t'\x00\xcd\xcc\x8c\xbf\xac\xe9#P\x00",
|
||||
)
|
||||
self.assertEqual(encode({"test": None}), b"\x0B\x00\x00\x00\x0A\x74\x65\x73\x74\x00\x00")
|
||||
self.assertEqual(
|
||||
encode({"date": datetime.datetime(2007, 1, 8, 0, 30, 11)}),
|
||||
@ -711,9 +729,66 @@ class TestBSON(unittest.TestCase):
|
||||
transformed = bin.as_uuid(UuidRepresentation.PYTHON_LEGACY)
|
||||
self.assertEqual(id, transformed)
|
||||
|
||||
# The C extension was segfaulting on unicode RegExs, so we have this test
|
||||
# that doesn't really test anything but the lack of a segfault.
|
||||
def test_vector(self):
    """Tests of subtype 9"""
    # We start with valid cases, across the 3 dtypes implemented.
    # Work with a simple vector that can be interpreted as int8, float32, or ubyte
    list_vector = [127, 7]
    # As INT8, vector has length 2
    binary_vector = Binary.from_vector(list_vector, BinaryVectorDtype.INT8)
    vector = binary_vector.as_vector()
    self.assertEqual(vector.data, list_vector)
    # test encoding roundtrip
    self.assertEqual({"vector": binary_vector}, decode(encode({"vector": binary_vector})))
    # test json roundtrip
    self.assertEqual(binary_vector, json_util.loads(json_util.dumps(binary_vector)))

    # For vectors of bits, aka PACKED_BIT type, vector has length 8 * 2
    packed_bit_binary = Binary.from_vector(list_vector, BinaryVectorDtype.PACKED_BIT)
    packed_bit_vec = packed_bit_binary.as_vector()
    self.assertEqual(packed_bit_vec.data, list_vector)

    # A padding parameter permits vectors of length that aren't divisible by 8
    # The following ignores the last 3 bits in list_vector,
    # hence its length is 8 * len(list_vector) - padding
    padding = 3
    padded_vec = Binary.from_vector(list_vector, BinaryVectorDtype.PACKED_BIT, padding=padding)
    self.assertEqual(padded_vec.as_vector().data, list_vector)
    # To visualize how this looks as a binary vector..
    uncompressed = "".join(format(val, "08b") for val in list_vector)
    self.assertEqual(uncompressed[:-padding], "0111111100000")

    # It is worthwhile explicitly showing the values encoded to BSON
    padded_doc = {"padded_vec": padded_vec}
    self.assertEqual(
        encode(padded_doc),
        b"\x1a\x00\x00\x00\x05padded_vec\x00\x04\x00\x00\x00\t\x10\x03\x7f\x07\x00",
    )
    # and dumped to json
    self.assertEqual(
        json_util.dumps(padded_doc),
        '{"padded_vec": {"$binary": {"base64": "EAN/Bw==", "subType": "09"}}}',
    )

    # FLOAT32 is also implemented
    float_binary = Binary.from_vector(list_vector, BinaryVectorDtype.FLOAT32)
    self.assertTrue(all(isinstance(d, float) for d in float_binary.as_vector().data))

    # Now some invalid cases: the values just outside the uint8 range [0, 255].
    for x in (-1, 256):
        with self.subTest(value=x), self.assertRaises(struct.error):
            Binary.from_vector([x], BinaryVectorDtype.PACKED_BIT)
|
||||
|
||||
def test_unicode_regex(self):
    """Round-trip a compiled regex containing a non-ASCII character.

    The C extension used to segfault on such patterns, so finishing the
    encode/decode cycle without a crash is the whole success criterion.
    """
    pattern = re.compile("revisi\xf3n")
    encoded = encode({"regex": pattern})
    decode(encoded)
|
||||
|
||||
|
||||
105
test/test_bson_binary_vector.py
Normal file
105
test/test_bson_binary_vector.py
Normal file
@ -0,0 +1,105 @@
|
||||
# Copyright 2024-present MongoDB, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import binascii
|
||||
import codecs
|
||||
import json
|
||||
import struct
|
||||
from pathlib import Path
|
||||
from test import unittest
|
||||
|
||||
from bson import decode, encode
|
||||
from bson.binary import Binary, BinaryVectorDtype
|
||||
|
||||
_TEST_PATH = Path(__file__).parent / "bson_binary_vector"
|
||||
|
||||
|
||||
class TestBSONBinaryVector(unittest.TestCase):
    """Runs Binary Vector subtype tests.

    Follows the style of the BSON corpus specification tests.

    The class body is intentionally empty: test methods are attached on
    import by `create_tests`, one per json file found in _TEST_PATH.
    The actual checks are defined in the inner function `run_test`
    of the test generator `create_test`.
    """
|
||||
|
||||
|
||||
def create_test(case_spec):
    """Create standard test given specification in json.

    We use the naming convention expected (exp) and observed (obs)
    to differentiate what is in the json (expected or suffix _exp)
    from what is produced by the API (observed or suffix _obs)

    :param case_spec: Parsed json document with a "test_key" naming the
        document field that holds the vector, and a list of "tests".
    :return: A function taking the TestCase instance, suitable for
        attaching to a TestCase class as a test method.
    """
    test_key = case_spec.get("test_key")

    def run_test(self):
        for test_case in case_spec.get("tests", []):
            description = test_case["description"]
            # One sub-test per case, so a failing case is reported by its
            # description and does not prevent the remaining cases running.
            with self.subTest(description=description):
                vector_exp = test_case["vector"]
                dtype_hex_exp = test_case["dtype_hex"]
                dtype_alias_exp = test_case.get("dtype_alias")
                padding_exp = test_case.get("padding", 0)
                canonical_bson_exp = test_case.get("canonical_bson")
                # Convert dtype hex string into bytes
                dtype_exp = BinaryVectorDtype(
                    int(dtype_hex_exp, 16).to_bytes(1, byteorder="little")
                )

                if test_case["valid"]:
                    # Convert bson string to bytes
                    cB_exp = binascii.unhexlify(canonical_bson_exp.encode("utf8"))
                    decoded_doc = decode(cB_exp)
                    binary_obs = decoded_doc[test_key]
                    # Handle special float cases like '-inf'
                    if dtype_exp in [BinaryVectorDtype.FLOAT32]:
                        vector_exp = [float(x) for x in vector_exp]

                    # Test round-tripping canonical bson.
                    self.assertEqual(encode(decoded_doc), cB_exp, description)

                    # Test BSON to Binary Vector
                    vector_obs = binary_obs.as_vector()
                    self.assertEqual(vector_obs.dtype, dtype_exp, description)
                    if dtype_alias_exp:
                        self.assertEqual(
                            vector_obs.dtype, BinaryVectorDtype[dtype_alias_exp], description
                        )
                    self.assertEqual(vector_obs.data, vector_exp, description)
                    self.assertEqual(vector_obs.padding, padding_exp, description)

                    # Test Binary Vector to BSON
                    binary_from_exp = Binary.from_vector(vector_exp, dtype_exp, padding_exp)
                    cB_obs = binascii.hexlify(encode({test_key: binary_from_exp})).decode().upper()
                    self.assertEqual(cB_obs, canonical_bson_exp, description)

                else:
                    with self.assertRaises((struct.error, ValueError), msg=description):
                        Binary.from_vector(vector_exp, dtype_exp, padding_exp)

    return run_test
|
||||
|
||||
|
||||
def create_tests():
    """Attach one generated test method per json file in _TEST_PATH.

    Each file is parsed and handed to `create_test`; the resulting function
    is set on TestBSONBinaryVector as ``test_<file stem>``.
    """
    for filename in _TEST_PATH.glob("*.json"):
        # Path.open replaces the legacy codecs.open, which is deprecated
        # as of Python 3.14.
        with filename.open(encoding="utf-8") as test_file:
            test_method = create_test(json.load(test_file))
        # The file is fully parsed by now, so register outside the context.
        setattr(TestBSONBinaryVector, "test_" + filename.stem, test_method)
|
||||
|
||||
|
||||
create_tests()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
Loading…
Reference in New Issue
Block a user