Merge branch 'master' of github.com:mongodb/mongo-python-driver

This commit is contained in:
Steven Silvester 2025-04-09 10:45:55 -05:00
commit 50d9b26cfd
No known key found for this signature in database
GPG Key ID: B1BF5EC3A8B32F91
4 changed files with 58 additions and 23 deletions

View File

@ -462,6 +462,10 @@ class Binary(bytes):
raise ValueError(f"{padding=}. It must be in [0,1, ..7].")
if padding and not vector:
raise ValueError("Empty vector with non-zero padding.")
if padding and not (vector[-1] & ((1 << padding) - 1)) == 0: # type: ignore
raise ValueError(
"If padding p is provided, all bits in the final byte lower than p must be 0."
)
elif dtype == BinaryVectorDtype.FLOAT32: # pack floats as float32
format_str = "f"
if padding:
@ -490,6 +494,11 @@ class Binary(bytes):
dtype = BinaryVectorDtype(dtype)
n_values = len(self) - position
if padding and dtype != BinaryVectorDtype.PACKED_BIT:
raise ValueError(
f"Corrupt data. Padding ({padding}) must be 0 for all but PACKED_BIT dtypes. ({dtype=})"
)
if dtype == BinaryVectorDtype.INT8:
dtype_format = "b"
format_string = f"<{n_values}{dtype_format}"
@ -513,6 +522,12 @@ class Binary(bytes):
dtype_format = "B"
format_string = f"<{n_values}{dtype_format}"
unpacked_uint8s = list(struct.unpack_from(format_string, self, position))
if padding and not n_values:
raise ValueError("Corrupt data. Vector has a padding P, but no data.")
if padding and n_values and not (unpacked_uint8s[-1] & ((1 << padding) - 1)) == 0:
raise ValueError(
"Corrupt data. Vector has a padding P, but bits in the final byte lower than P are non-zero."
)
return BinaryVector(unpacked_uint8s, dtype, padding)
else:

View File

@ -20,6 +20,24 @@
"padding": 0,
"canonical_bson": "1600000005766563746F7200040000000910007F0700"
},
{
"description": "PACKED_BIT with padding",
"valid": true,
"vector": [127, 8],
"dtype_hex": "0x10",
"dtype_alias": "PACKED_BIT",
"padding": 3,
"canonical_bson": "1600000005766563746F7200040000000910037F0800"
},
{
"description": "PACKED_BIT with inconsistent padding",
"valid": false,
"vector": [127, 7],
"dtype_hex": "0x10",
"dtype_alias": "PACKED_BIT",
"padding": 3,
"canonical_bson": "1600000005766563746F7200040000000910037F0700"
},
{
"description": "Empty Vector PACKED_BIT",
"valid": true,
@ -29,15 +47,6 @@
"padding": 0,
"canonical_bson": "1400000005766563746F72000200000009100000"
},
{
"description": "PACKED_BIT with padding",
"valid": true,
"vector": [127, 7],
"dtype_hex": "0x10",
"dtype_alias": "PACKED_BIT",
"padding": 3,
"canonical_bson": "1600000005766563746F7200040000000910037F0700"
},
{
"description": "Overflow Vector PACKED_BIT",
"valid": false,

View File

@ -739,7 +739,7 @@ class TestBSON(unittest.TestCase):
"""Tests of subtype 9"""
# We start with valid cases, across the 3 dtypes implemented.
# Work with a simple vector that can be interpreted as int8, float32, or ubyte
list_vector = [127, 7]
list_vector = [127, 8]
# As INT8, vector has length 2
binary_vector = Binary.from_vector(list_vector, BinaryVectorDtype.INT8)
vector = binary_vector.as_vector()
@ -764,18 +764,18 @@ class TestBSON(unittest.TestCase):
uncompressed = ""
for val in list_vector:
uncompressed += format(val, "08b")
assert uncompressed[:-padding] == "0111111100000"
assert uncompressed[:-padding] == "0111111100001"
# It is worthwhile explicitly showing the values encoded to BSON
padded_doc = {"padded_vec": padded_vec}
assert (
encode(padded_doc)
== b"\x1a\x00\x00\x00\x05padded_vec\x00\x04\x00\x00\x00\t\x10\x03\x7f\x07\x00"
== b"\x1a\x00\x00\x00\x05padded_vec\x00\x04\x00\x00\x00\t\x10\x03\x7f\x08\x00"
)
# and dumped to json
assert (
json_util.dumps(padded_doc)
== '{"padded_vec": {"$binary": {"base64": "EAN/Bw==", "subType": "09"}}}'
== '{"padded_vec": {"$binary": {"base64": "EAN/CA==", "subType": "09"}}}'
)
# FLOAT32 is also implemented

View File

@ -48,11 +48,11 @@ def create_test(case_spec):
def run_test(self):
for test_case in case_spec.get("tests", []):
description = test_case["description"]
vector_exp = test_case.get("vector", [])
vector_exp = test_case.get("vector", None)
dtype_hex_exp = test_case["dtype_hex"]
dtype_alias_exp = test_case.get("dtype_alias")
padding_exp = test_case.get("padding", 0)
canonical_bson_exp = test_case.get("canonical_bson")
canonical_bson_exp = test_case.get("canonical_bson", None)
# Convert dtype hex string into bytes
dtype_exp = BinaryVectorDtype(int(dtype_hex_exp, 16).to_bytes(1, byteorder="little"))
@ -85,14 +85,25 @@ def create_test(case_spec):
self.assertEqual(cB_obs, canonical_bson_exp, description)
else:
with self.assertRaises((struct.error, ValueError), msg=description):
# Tests Binary.from_vector
Binary.from_vector(vector_exp, dtype_exp, padding_exp)
# Tests Binary.as_vector
cB_exp = binascii.unhexlify(canonical_bson_exp.encode("utf8"))
decoded_doc = decode(cB_exp)
binary_obs = decoded_doc[test_key]
binary_obs.as_vector()
"""
#### To prove correct in an invalid case (`valid:false`), one MUST
- if the vector field is present, raise an exception when attempting to encode a document from the numeric values,
dtype, and padding.
- if the canonical_bson field is present, raise an exception when attempting to deserialize it into the corresponding
numeric values, as the field contains corrupted data.
"""
# Tests Binary.from_vector()
if vector_exp is not None:
with self.assertRaises((struct.error, ValueError), msg=description):
Binary.from_vector(vector_exp, dtype_exp, padding_exp)
# Tests Binary.as_vector()
if canonical_bson_exp is not None:
with self.assertRaises((struct.error, ValueError), msg=description):
cB_exp = binascii.unhexlify(canonical_bson_exp.encode("utf8"))
decoded_doc = decode(cB_exp)
binary_obs = decoded_doc[test_key]
binary_obs.as_vector()
return run_test