[DRIVERS-2926] [PYTHON-4577] BSON Binary Vector Subtype Support (#1813)

Co-authored-by: Steven Silvester <steve.silvester@mongodb.com>
Co-authored-by: Steven Silvester <steven.silvester@ieee.org>
This commit is contained in:
Casey Clements 2024-09-30 22:13:09 -04:00 committed by GitHub
parent 545b88cbd3
commit ae6cfd6d10
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 519 additions and 10 deletions

View File

@ -76,6 +76,9 @@ do
atlas-data-lake-testing|data_lake)
cpjson atlas-data-lake-testing/tests/ data_lake
;;
bson-binary-vector|bson_binary_vector)
cpjson bson-binary-vector/tests/ bson_binary_vector
;;
bson-corpus|bson_corpus)
cpjson bson-corpus/tests/ bson_corpus
;;

View File

@ -13,7 +13,10 @@
# limitations under the License.
from __future__ import annotations
from typing import TYPE_CHECKING, Any, Tuple, Type, Union
import struct
from dataclasses import dataclass
from enum import Enum
from typing import TYPE_CHECKING, Any, Sequence, Tuple, Type, Union
from uuid import UUID
"""Tools for representing BSON binary data.
@ -191,21 +194,75 @@ SENSITIVE_SUBTYPE = 8
"""
VECTOR_SUBTYPE = 9
"""**(BETA)** BSON binary subtype for densely packed vector data.
.. versionadded:: 4.10
"""
USER_DEFINED_SUBTYPE = 128
"""BSON binary subtype for any user defined structure.
"""
class BinaryVectorDtype(Enum):
    """**(BETA)** Element data types available for the BSON vector subtype.

    Every member's value is a :class:`bytes` object of length one. It is the
    byte written as the first metadata byte of a vector's binary payload.

    :param INT8: (0x03) Pack list of :class:`int` in [-128, 127] as signed int8
    :param FLOAT32: (0x27) Pack list of :class:`float` as float32
    :param PACKED_BIT: (0x10) Pack list of :class:`int` in [0, 255] as unsigned uint8

    `PACKED_BIT` is a special case: the logical vector elements are single
    bits (0 or 1), stored eight to a byte. In Python each of those bytes is
    shown as an int in the range [0, 255].

    .. versionadded:: 4.10
    """

    # Member order is part of the enum's iteration order; keep it stable.
    INT8 = b"\x03"
    FLOAT32 = b"\x27"
    PACKED_BIT = b"\x10"
@dataclass
class BinaryVector:
    """**(BETA)** Vector of numbers along with metadata for binary interoperability.

    Instances compare equal when their ``dtype``, ``padding`` and ``data``
    are all equal.

    .. versionadded:: 4.10
    """

    # The attributes are declared through __slots__ and a hand-written
    # __init__, so @dataclass sees no annotated fields. Its generated
    # __eq__/__repr__ would therefore treat every instance as identical;
    # both are defined explicitly below (dataclass keeps explicit methods).
    __slots__ = ("data", "dtype", "padding")

    def __init__(self, data: Sequence[float | int], dtype: BinaryVectorDtype, padding: int = 0):
        """
        :param data: Sequence of numbers representing the mathematical vector.
        :param dtype:  The data type stored in binary
        :param padding: The number of bits in the final byte that are to be ignored
          when a vector element's size is less than a byte
          and the length of the vector is not a multiple of 8.
        """
        self.data = data
        self.dtype = dtype
        self.padding = padding

    def __repr__(self) -> str:
        """Return a representation showing dtype, padding and data."""
        return (
            f"{type(self).__name__}"
            f"(dtype={self.dtype}, padding={self.padding}, data={self.data})"
        )

    def __eq__(self, other: Any) -> bool:
        """Compare dtype, padding and data; defer to ``other`` for foreign types."""
        if not isinstance(other, BinaryVector):
            return NotImplemented
        return (
            self.dtype == other.dtype
            and self.padding == other.padding
            and self.data == other.data
        )
class Binary(bytes):
"""Representation of BSON binary data.
This is necessary because we want to represent Python strings as
the BSON string type. We need to wrap binary data so we can tell
We want to represent Python strings as the BSON string type.
We need to wrap binary data so that we can tell
the difference between what should be considered binary data and
what should be considered a string when we encode to BSON.
Raises TypeError if `data` is not an instance of :class:`bytes`
or `subtype` is not an instance of :class:`int`.
**(BETA)** Subtype 9 provides a space-efficient representation of 1-dimensional vector data.
Its data is prepended with two bytes of metadata.
The first (dtype) describes its data type, such as float32 or int8.
The second (padding) prescribes the number of bits to ignore in the final byte.
This is relevant when the element size of the dtype is not a multiple of 8 bits (for example, PACKED_BIT).
Raises TypeError if `subtype` is not an instance of :class:`int`.
Raises ValueError if `subtype` is not in [0, 256).
.. note::
@ -218,7 +275,10 @@ class Binary(bytes):
to use
.. versionchanged:: 3.9
Support any bytes-like type that implements the buffer protocol.
Support any bytes-like type that implements the buffer protocol.
.. versionchanged:: 4.10
**(BETA)** Addition of vector subtype.
"""
_type_marker = 5
@ -337,6 +397,86 @@ class Binary(bytes):
f"cannot decode subtype {self.subtype} to {UUID_REPRESENTATION_NAMES[uuid_representation]}"
)
@classmethod
def from_vector(
    cls: Type[Binary],
    vector: Sequence[float | int],
    dtype: BinaryVectorDtype,
    padding: int = 0,
) -> Binary:
    """**(BETA)** Create a BSON :class:`~bson.binary.Binary` of Vector subtype from a list of Numbers.

    To interpret the representation of the numbers, a data type must be included.
    See :class:`~bson.binary.BinaryVectorDtype` for available types and descriptions.

    The dtype and padding are prepended to the binary data's value.
    All values are packed little-endian, independent of the host platform.

    :param vector: List of values
    :param dtype: Data type of the values
    :param padding: For fractional bytes, number of bits to ignore at end of vector.
    :return: Binary packed data identified by dtype and padding.
    :raises ValueError: if `padding` is non-zero for INT8 or FLOAT32, or is
        outside [0, 7] for PACKED_BIT.
    :raises struct.error: if a value cannot be packed as the requested dtype.
    :raises NotImplementedError: if `dtype` is not one of the supported types.

    .. versionadded:: 4.10
    """
    if dtype == BinaryVectorDtype.INT8:  # pack ints in [-128, 127] as signed int8
        format_str = "b"
        if padding:
            raise ValueError(f"padding does not apply to {dtype=}")
    elif dtype == BinaryVectorDtype.PACKED_BIT:  # pack ints in [0, 255] as unsigned uint8
        format_str = "B"
        # Padding counts ignored bits of the final byte, so it cannot reach 8.
        if not 0 <= padding <= 7:
            raise ValueError(f"padding must be in [0, 7] for {dtype=}, got {padding}")
    elif dtype == BinaryVectorDtype.FLOAT32:  # pack floats as float32
        format_str = "f"
        if padding:
            raise ValueError(f"padding does not apply to {dtype=}")
    else:
        raise NotImplementedError("%s not yet supported" % dtype)

    metadata = struct.pack("<sB", dtype.value, padding)
    # An explicit "<" is required: without a prefix, struct uses the host's
    # native byte order, which would emit big-endian floats on such machines.
    data = struct.pack(f"<{len(vector)}{format_str}", *vector)
    return cls(metadata + data, subtype=VECTOR_SUBTYPE)
def as_vector(self) -> BinaryVector:
    """**(BETA)** From the Binary, create a list of numbers, along with dtype and padding.

    The first two bytes are read as metadata (dtype byte, then padding);
    the remaining bytes are decoded little-endian according to the dtype.

    :return: BinaryVector
    :raises ValueError: if this Binary's subtype is not the vector subtype,
        or the number of payload bytes of a FLOAT32 vector is not a multiple of 4.
    :raises NotImplementedError: if the dtype has no decoder here.

    .. versionadded:: 4.10
    """
    if self.subtype != VECTOR_SUBTYPE:
        raise ValueError(f"Cannot decode subtype {self.subtype} as a vector.")

    # Metadata header: one dtype byte followed by one padding byte.
    header_size = 2
    dtype_byte, padding = struct.unpack_from("<sB", self, 0)
    dtype = BinaryVectorDtype(dtype_byte)
    n_bytes = len(self) - header_size

    # "<" pins little-endian decoding; a bare format would use the host's
    # native byte order and misread float32 data on big-endian machines.
    if dtype == BinaryVectorDtype.INT8:
        values = struct.unpack_from(f"<{n_bytes}b", self, header_size)
    elif dtype == BinaryVectorDtype.FLOAT32:
        if n_bytes % 4:
            raise ValueError(
                "Corrupt data. N bytes for a float32 vector must be a multiple of 4."
            )
        values = struct.unpack_from(f"<{n_bytes // 4}f", self, header_size)
    elif dtype == BinaryVectorDtype.PACKED_BIT:
        # data packed as uint8
        values = struct.unpack_from(f"<{n_bytes}B", self, header_size)
    else:
        raise NotImplementedError("Binary Vector dtype %s not yet supported" % dtype.name)

    return BinaryVector(list(values), dtype, padding)
@property
def subtype(self) -> int:
"""Subtype of this binary data."""

View File

@ -21,6 +21,14 @@
.. autoclass:: UuidRepresentation
:members:
.. autoclass:: BinaryVectorDtype
:members:
:show-inheritance:
.. autoclass:: BinaryVector
:members:
.. autoclass:: Binary(data, subtype=BINARY_SUBTYPE)
:members:
:show-inheritance:

View File

@ -19,7 +19,6 @@ in this release.
.. _PyMongo 4.10 release notes in JIRA: https://jira.mongodb.org/secure/ReleaseNote.jspa?projectId=10004&version=40553
Changes in Version 4.9.0
-------------------------

View File

@ -0,0 +1,42 @@
{
"description": "Tests of Binary subtype 9, Vectors, with dtype FLOAT32",
"test_key": "vector",
"tests": [
{
"description": "Simple Vector FLOAT32",
"valid": true,
"vector": [127.0, 7.0],
"dtype_hex": "0x27",
"dtype_alias": "FLOAT32",
"padding": 0,
"canonical_bson": "1C00000005766563746F72000A0000000927000000FE420000E04000"
},
{
"description": "Empty Vector FLOAT32",
"valid": true,
"vector": [],
"dtype_hex": "0x27",
"dtype_alias": "FLOAT32",
"padding": 0,
"canonical_bson": "1400000005766563746F72000200000009270000"
},
{
"description": "Infinity Vector FLOAT32",
"valid": true,
"vector": ["-inf", 0.0, "inf"],
"dtype_hex": "0x27",
"dtype_alias": "FLOAT32",
"padding": 0,
"canonical_bson": "2000000005766563746F72000E000000092700000080FF000000000000807F00"
},
{
"description": "FLOAT32 with padding",
"valid": false,
"vector": [127.0, 7.0],
"dtype_hex": "0x27",
"dtype_alias": "FLOAT32",
"padding": 3
}
]
}

View File

@ -0,0 +1,57 @@
{
"description": "Tests of Binary subtype 9, Vectors, with dtype INT8",
"test_key": "vector",
"tests": [
{
"description": "Simple Vector INT8",
"valid": true,
"vector": [127, 7],
"dtype_hex": "0x03",
"dtype_alias": "INT8",
"padding": 0,
"canonical_bson": "1600000005766563746F7200040000000903007F0700"
},
{
"description": "Empty Vector INT8",
"valid": true,
"vector": [],
"dtype_hex": "0x03",
"dtype_alias": "INT8",
"padding": 0,
"canonical_bson": "1400000005766563746F72000200000009030000"
},
{
"description": "Overflow Vector INT8",
"valid": false,
"vector": [128],
"dtype_hex": "0x03",
"dtype_alias": "INT8",
"padding": 0
},
{
"description": "Underflow Vector INT8",
"valid": false,
"vector": [-129],
"dtype_hex": "0x03",
"dtype_alias": "INT8",
"padding": 0
},
{
"description": "INT8 with padding",
"valid": false,
"vector": [127, 7],
"dtype_hex": "0x03",
"dtype_alias": "INT8",
"padding": 3
},
{
"description": "INT8 with float inputs",
"valid": false,
"vector": [127.77, 7.77],
"dtype_hex": "0x03",
"dtype_alias": "INT8",
"padding": 0
}
]
}

View File

@ -0,0 +1,50 @@
{
"description": "Tests of Binary subtype 9, Vectors, with dtype PACKED_BIT",
"test_key": "vector",
"tests": [
{
"description": "Simple Vector PACKED_BIT",
"valid": true,
"vector": [127, 7],
"dtype_hex": "0x10",
"dtype_alias": "PACKED_BIT",
"padding": 0,
"canonical_bson": "1600000005766563746F7200040000000910007F0700"
},
{
"description": "Empty Vector PACKED_BIT",
"valid": true,
"vector": [],
"dtype_hex": "0x10",
"dtype_alias": "PACKED_BIT",
"padding": 0,
"canonical_bson": "1400000005766563746F72000200000009100000"
},
{
"description": "PACKED_BIT with padding",
"valid": true,
"vector": [127, 7],
"dtype_hex": "0x10",
"dtype_alias": "PACKED_BIT",
"padding": 3,
"canonical_bson": "1600000005766563746F7200040000000910037F0700"
},
{
"description": "Overflow Vector PACKED_BIT",
"valid": false,
"vector": [256],
"dtype_hex": "0x10",
"dtype_alias": "PACKED_BIT",
"padding": 0
},
{
"description": "Underflow Vector PACKED_BIT",
"valid": false,
"vector": [-1],
"dtype_hex": "0x10",
"dtype_alias": "PACKED_BIT",
"padding": 0
}
]
}

View File

@ -74,6 +74,36 @@
"description": "$type query operator (conflicts with legacy $binary form with $type field)",
"canonical_bson": "180000000378001000000010247479706500020000000000",
"canonical_extjson": "{\"x\" : { \"$type\" : {\"$numberInt\": \"2\"}}}"
},
{
"description": "subtype 0x09 Vector FLOAT32",
"canonical_bson": "170000000578000A0000000927000000FE420000E04000",
"canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"JwAAAP5CAADgQA==\", \"subType\": \"09\"}}}"
},
{
"description": "subtype 0x09 Vector INT8",
"canonical_bson": "11000000057800040000000903007F0700",
"canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"AwB/Bw==\", \"subType\": \"09\"}}}"
},
{
"description": "subtype 0x09 Vector PACKED_BIT",
"canonical_bson": "11000000057800040000000910007F0700",
"canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"EAB/Bw==\", \"subType\": \"09\"}}}"
},
{
"description": "subtype 0x09 Vector (Zero-length) FLOAT32",
"canonical_bson": "0F0000000578000200000009270000",
"canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"JwA=\", \"subType\": \"09\"}}}"
},
{
"description": "subtype 0x09 Vector (Zero-length) INT8",
"canonical_bson": "0F0000000578000200000009030000",
"canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"AwA=\", \"subType\": \"09\"}}}"
},
{
"description": "subtype 0x09 Vector (Zero-length) PACKED_BIT",
"canonical_bson": "0F0000000578000200000009100000",
"canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"EAA=\", \"subType\": \"09\"}}}"
}
],
"decodeErrors": [

View File

@ -49,8 +49,9 @@ from bson import (
decode_iter,
encode,
is_valid,
json_util,
)
from bson.binary import USER_DEFINED_SUBTYPE, Binary, UuidRepresentation
from bson.binary import USER_DEFINED_SUBTYPE, Binary, BinaryVectorDtype, UuidRepresentation
from bson.code import Code
from bson.codec_options import CodecOptions, DatetimeConversion
from bson.datetime_ms import _DATETIME_ERROR_SUGGESTION
@ -148,6 +149,9 @@ class TestBSON(unittest.TestCase):
helper({"a binary": Binary(b"test", 128)})
helper({"a binary": Binary(b"test", 254)})
helper({"another binary": Binary(b"test", 2)})
helper({"binary packed bit vector": Binary(b"\x10\x00\x7f\x07", 9)})
helper({"binary int8 vector": Binary(b"\x03\x00\x7f\x07", 9)})
helper({"binary float32 vector": Binary(b"'\x00\x00\x00\xfeB\x00\x00\xe0@", 9)})
helper(SON([("test dst", datetime.datetime(1993, 4, 4, 2))]))
helper(SON([("test negative dst", datetime.datetime(1, 1, 1, 1, 1, 1))]))
helper({"big float": float(10000000000)})
@ -447,6 +451,20 @@ class TestBSON(unittest.TestCase):
encode({"test": Binary(b"test", 128)}),
b"\x14\x00\x00\x00\x05\x74\x65\x73\x74\x00\x04\x00\x00\x00\x80\x74\x65\x73\x74\x00",
)
self.assertEqual(
encode({"vector_int8": Binary.from_vector([-128, -1, 127], BinaryVectorDtype.INT8)}),
b"\x1c\x00\x00\x00\x05vector_int8\x00\x05\x00\x00\x00\t\x03\x00\x80\xff\x7f\x00",
)
self.assertEqual(
encode({"vector_bool": Binary.from_vector([1, 127], BinaryVectorDtype.PACKED_BIT)}),
b"\x1b\x00\x00\x00\x05vector_bool\x00\x04\x00\x00\x00\t\x10\x00\x01\x7f\x00",
)
self.assertEqual(
encode(
{"vector_float32": Binary.from_vector([-1.1, 1.1e10], BinaryVectorDtype.FLOAT32)}
),
b"$\x00\x00\x00\x05vector_float32\x00\n\x00\x00\x00\t'\x00\xcd\xcc\x8c\xbf\xac\xe9#P\x00",
)
self.assertEqual(encode({"test": None}), b"\x0B\x00\x00\x00\x0A\x74\x65\x73\x74\x00\x00")
self.assertEqual(
encode({"date": datetime.datetime(2007, 1, 8, 0, 30, 11)}),
@ -711,9 +729,66 @@ class TestBSON(unittest.TestCase):
transformed = bin.as_uuid(UuidRepresentation.PYTHON_LEGACY)
self.assertEqual(id, transformed)
# The C extension was segfaulting on unicode RegExs, so we have this test
# that doesn't really test anything but the lack of a segfault.
def test_vector(self):
    """Tests of subtype 9"""
    # We start with valid cases, across the 3 dtypes implemented.
    # Work with a simple vector that can be interpreted as int8, float32, or ubyte
    list_vector = [127, 7]

    # As INT8, vector has length 2
    binary_vector = Binary.from_vector(list_vector, BinaryVectorDtype.INT8)
    vector = binary_vector.as_vector()
    self.assertEqual(vector.data, list_vector)
    # test encoding roundtrip
    self.assertEqual({"vector": binary_vector}, decode(encode({"vector": binary_vector})))
    # test json roundtrip
    self.assertEqual(binary_vector, json_util.loads(json_util.dumps(binary_vector)))

    # For vectors of bits, aka PACKED_BIT type, vector has length 8 * 2
    packed_bit_binary = Binary.from_vector(list_vector, BinaryVectorDtype.PACKED_BIT)
    packed_bit_vec = packed_bit_binary.as_vector()
    self.assertEqual(packed_bit_vec.data, list_vector)

    # A padding parameter permits vectors of length that aren't divisible by 8
    # The following ignores the last 3 bits in list_vector,
    # hence its length is 8 * len(list_vector) - padding
    padding = 3
    padded_vec = Binary.from_vector(list_vector, BinaryVectorDtype.PACKED_BIT, padding=padding)
    self.assertEqual(padded_vec.as_vector().data, list_vector)
    # To visualize how this looks as a binary vector..
    uncompressed = "".join(format(val, "08b") for val in list_vector)
    self.assertEqual(uncompressed[:-padding], "0111111100000")

    # It is worthwhile explicitly showing the values encoded to BSON
    padded_doc = {"padded_vec": padded_vec}
    self.assertEqual(
        encode(padded_doc),
        b"\x1a\x00\x00\x00\x05padded_vec\x00\x04\x00\x00\x00\t\x10\x03\x7f\x07\x00",
    )
    # and dumped to json
    self.assertEqual(
        json_util.dumps(padded_doc),
        '{"padded_vec": {"$binary": {"base64": "EAN/Bw==", "subType": "09"}}}',
    )

    # FLOAT32 is also implemented
    float_binary = Binary.from_vector(list_vector, BinaryVectorDtype.FLOAT32)
    self.assertTrue(all(isinstance(d, float) for d in float_binary.as_vector().data))

    # Now some invalid cases: values just outside the uint8 range [0, 255].
    for x in [-1, 256, 257]:
        with self.subTest(value=x), self.assertRaises(struct.error):
            Binary.from_vector([x], BinaryVectorDtype.PACKED_BIT)
def test_unicode_regex(self):
    """Encode and decode a compiled regex containing a non-ASCII character.

    Guards against a segfault previously seen in the C extension; the test
    passes as long as the round trip completes.
    """
    pattern = re.compile("revisi\xf3n")
    document = {"regex": pattern}
    decode(encode(document))

View File

@ -0,0 +1,105 @@
# Copyright 2024-present MongoDB, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import binascii
import codecs
import json
import struct
from pathlib import Path
from test import unittest
from bson import decode, encode
from bson.binary import Binary, BinaryVectorDtype
_TEST_PATH = Path(__file__).parent / "bson_binary_vector"
class TestBSONBinaryVector(unittest.TestCase):
    """Container for the Binary Vector subtype tests.

    Modelled on the BSON corpus specification tests. The class body is
    intentionally empty: at import time `create_tests` reads every json file
    in _TEST_PATH and attaches one ``test_<file stem>`` method per file.

    Each attached method is the inner function `run_test` returned by the
    test generator `create_test`.
    """
def create_test(case_spec):
    """Create standard test given specification in json.

    We use the naming convention expected (exp) and observed (obs)
    to differentiate what is in the json (expected or suffix _exp)
    from what is produced by the API (observed or suffix _obs)

    :param case_spec: Parsed json document with a "test_key" and a list of "tests".
    :return: A test method (taking ``self``) that runs every case in the spec.
    """
    test_key = case_spec.get("test_key")

    def run_test(self):
        for test_case in case_spec.get("tests", []):
            description = test_case["description"]
            # One sub-test per case so a failure does not hide later cases.
            with self.subTest(description=description):
                vector_exp = test_case["vector"]
                dtype_hex_exp = test_case["dtype_hex"]
                dtype_alias_exp = test_case.get("dtype_alias")
                padding_exp = test_case.get("padding", 0)
                canonical_bson_exp = test_case.get("canonical_bson")
                # Convert dtype hex string into bytes
                dtype_exp = BinaryVectorDtype(
                    int(dtype_hex_exp, 16).to_bytes(1, byteorder="little")
                )

                if not test_case["valid"]:
                    # Invalid cases only need to be rejected on construction.
                    with self.assertRaises((struct.error, ValueError), msg=description):
                        Binary.from_vector(vector_exp, dtype_exp, padding_exp)
                    continue

                # Convert bson string to bytes
                cB_exp = binascii.unhexlify(canonical_bson_exp.encode("utf8"))
                decoded_doc = decode(cB_exp)
                binary_obs = decoded_doc[test_key]
                # Handle special float cases like '-inf'
                if dtype_exp == BinaryVectorDtype.FLOAT32:
                    vector_exp = [float(x) for x in vector_exp]

                # Test round-tripping canonical bson.
                self.assertEqual(encode(decoded_doc), cB_exp, description)

                # Test BSON to Binary Vector
                vector_obs = binary_obs.as_vector()
                self.assertEqual(vector_obs.dtype, dtype_exp, description)
                if dtype_alias_exp:
                    self.assertEqual(
                        vector_obs.dtype, BinaryVectorDtype[dtype_alias_exp], description
                    )
                self.assertEqual(vector_obs.data, vector_exp, description)
                self.assertEqual(vector_obs.padding, padding_exp, description)

                # Test Binary Vector to BSON
                binary_exp = Binary.from_vector(vector_exp, dtype_exp, padding_exp)
                cB_obs = binascii.hexlify(encode({test_key: binary_exp})).decode().upper()
                self.assertEqual(cB_obs, canonical_bson_exp, description)

    return run_test
def create_tests():
    """Attach one generated test method per json file found in _TEST_PATH.

    Each file is parsed as UTF-8 json and passed to `create_test`; the
    resulting method is set on TestBSONBinaryVector as ``test_<file stem>``.
    """
    for filename in _TEST_PATH.glob("*.json"):
        # Built-in open() with an explicit encoding replaces codecs.open(),
        # which is deprecated in recent Python releases.
        with open(filename, encoding="utf-8") as test_file:
            test_method = create_test(json.load(test_file))
        setattr(TestBSONBinaryVector, "test_" + filename.stem, test_method)
create_tests()
if __name__ == "__main__":
unittest.main()