PYTHON-5683: Spike: Investigate using Rust for Extension Modules

- Implement comprehensive Rust BSON encoder/decoder
- Add Evergreen CI configuration and test scripts
- Add GitHub Actions workflow for Rust testing
- Add runtime selection via PYMONGO_USE_RUST environment variable
- Add performance benchmarking suite
- Update build system to support Rust extension
- Add documentation for Rust extension usage and testing"
This commit is contained in:
Jeffrey A. Clark 2026-02-04 15:54:41 -05:00 committed by Jeffrey 'Alex' Clark
parent f4219bdca2
commit 45dd4c13e0
40 changed files with 4665 additions and 31 deletions

View File

@ -111,6 +111,8 @@ functions:
- LOAD_BALANCER
- LOCAL_ATLAS
- NO_EXT
- PYMONGO_BUILD_RUST
- PYMONGO_USE_RUST
type: test
- command: expansions.update
params:
@ -152,6 +154,8 @@ functions:
- IS_WIN32
- REQUIRE_FIPS
- TEST_MIN_DEPS
- PYMONGO_BUILD_RUST
- PYMONGO_USE_RUST
type: test
- command: subprocess.exec
params:

View File

@ -2559,6 +2559,21 @@ tasks:
- func: attach benchmark test results
- func: send dashboard data
tags: [perf]
- name: perf-8.0-standalone-ssl-rust
commands:
- func: run server
vars:
VERSION: v8.0-perf
SSL: ssl
- func: run tests
vars:
TEST_NAME: perf
SUB_TEST_NAME: rust
PYMONGO_BUILD_RUST: "1"
PYMONGO_USE_RUST: "1"
- func: attach benchmark test results
- func: send dashboard data
tags: [perf]
- name: perf-8.0-standalone
commands:
- func: run server
@ -2585,6 +2600,21 @@ tasks:
- func: attach benchmark test results
- func: send dashboard data
tags: [perf]
- name: perf-8.0-standalone-rust
commands:
- func: run server
vars:
VERSION: v8.0-perf
SSL: nossl
- func: run tests
vars:
TEST_NAME: perf
SUB_TEST_NAME: rust
PYMONGO_BUILD_RUST: "1"
PYMONGO_USE_RUST: "1"
- func: attach benchmark test results
- func: send dashboard data
tags: [perf]
# Search index tests
- name: test-search-index-helpers

View File

@ -478,6 +478,40 @@ buildvariants:
expansions:
SUB_TEST_NAME: pyopenssl
# Rust tests
- name: test-with-rust-extension
tasks:
- name: .test-standard .server-latest .pr
display_name: Test with Rust Extension
run_on:
- rhel87-small
expansions:
PYMONGO_BUILD_RUST: "1"
PYMONGO_USE_RUST: "1"
tags: [rust, pr]
- name: test-with-rust-extension---macos-arm64
tasks:
- name: .test-standard .server-latest !.pr
display_name: Test with Rust Extension - macOS ARM64
run_on:
- macos-14-arm64
batchtime: 10080
expansions:
PYMONGO_BUILD_RUST: "1"
PYMONGO_USE_RUST: "1"
tags: [rust]
- name: test-with-rust-extension---windows
tasks:
- name: .test-standard .server-latest !.pr
display_name: Test with Rust Extension - Windows
run_on:
- windows-64-vsMulti-small
batchtime: 10080
expansions:
PYMONGO_BUILD_RUST: "1"
PYMONGO_USE_RUST: "1"
tags: [rust]
# Search index tests
- name: search-index-helpers-rhel8
tasks:

View File

@ -974,11 +974,15 @@ def create_search_index_tasks():
def create_perf_tasks():
tasks = []
for version, ssl, sync in product(["8.0"], ["ssl", "nossl"], ["sync", "async"]):
for version, ssl, sync in product(["8.0"], ["ssl", "nossl"], ["sync", "async", "rust"]):
vars = dict(VERSION=f"v{version}-perf", SSL=ssl)
server_func = FunctionCall(func="run server", vars=vars)
vars = dict(TEST_NAME="perf", SUB_TEST_NAME=sync)
test_func = FunctionCall(func="run tests", vars=vars)
test_vars = dict(TEST_NAME="perf", SUB_TEST_NAME=sync)
# Enable Rust for rust perf tests
if sync == "rust":
test_vars["PYMONGO_BUILD_RUST"] = "1"
test_vars["PYMONGO_USE_RUST"] = "1"
test_func = FunctionCall(func="run tests", vars=test_vars)
attach_func = FunctionCall(func="attach benchmark test results")
send_func = FunctionCall(func="send dashboard data")
task_name = f"perf-{version}-standalone"
@ -986,6 +990,8 @@ def create_perf_tasks():
task_name += "-ssl"
if sync == "async":
task_name += "-async"
elif sync == "rust":
task_name += "-rust"
tags = ["perf"]
commands = [server_func, test_func, attach_func, send_func]
tasks.append(EvgTask(name=task_name, tags=tags, commands=commands))
@ -1205,6 +1211,8 @@ def create_run_server_func():
"LOAD_BALANCER",
"LOCAL_ATLAS",
"NO_EXT",
"PYMONGO_BUILD_RUST",
"PYMONGO_USE_RUST",
]
args = [".evergreen/just.sh", "run-server", "${TEST_NAME}"]
sub_cmd = get_subprocess_exec(include_expansions_in_env=includes, args=args)
@ -1238,6 +1246,8 @@ def create_run_tests_func():
"IS_WIN32",
"REQUIRE_FIPS",
"TEST_MIN_DEPS",
"PYMONGO_BUILD_RUST",
"PYMONGO_USE_RUST",
]
args = [".evergreen/just.sh", "setup-tests", "${TEST_NAME}", "${SUB_TEST_NAME}"]
setup_cmd = get_subprocess_exec(include_expansions_in_env=includes, args=args)
@ -1299,6 +1309,55 @@ def create_send_dashboard_data_func():
return "send dashboard data", cmds
def create_rust_variants():
"""Create build variants that test with Rust extension alongside C extension."""
variants = []
# Test Rust on Linux (primary platform) - runs on PRs
# Run standard tests with Rust enabled (both sync and async)
variant = create_variant(
[".test-standard .server-latest .pr"],
"Test with Rust Extension",
host=DEFAULT_HOST,
tags=["rust", "pr"],
expansions=dict(
PYMONGO_BUILD_RUST="1",
PYMONGO_USE_RUST="1",
),
)
variants.append(variant)
# Test on macOS ARM64 (important for M1/M2 Macs)
variant = create_variant(
[".test-standard .server-latest !.pr"],
"Test with Rust Extension - macOS ARM64",
host=HOSTS["macos-arm64"],
tags=["rust"],
batchtime=BATCHTIME_WEEK,
expansions=dict(
PYMONGO_BUILD_RUST="1",
PYMONGO_USE_RUST="1",
),
)
variants.append(variant)
# Test on Windows (important for cross-platform compatibility)
variant = create_variant(
[".test-standard .server-latest !.pr"],
"Test with Rust Extension - Windows",
host=HOSTS["win64"],
tags=["rust"],
batchtime=BATCHTIME_WEEK,
expansions=dict(
PYMONGO_BUILD_RUST="1",
PYMONGO_USE_RUST="1",
),
)
variants.append(variant)
return variants
mod = sys.modules[__name__]
write_variants_to_file(mod)
write_tasks_to_file(mod)

View File

@ -30,7 +30,7 @@ fi
# Ensure just is installed.
if ! command -v just &>/dev/null; then
uv tool install rust-just
uv tool install rust-just || uv tool install --force rust-just
fi
popd > /dev/null

View File

@ -0,0 +1,50 @@
#!/bin/bash
# Install Rust toolchain for building the Rust BSON extension.
set -eu
echo "Installing Rust toolchain..."
# Check if Rust is already installed
if command -v cargo &> /dev/null; then
echo "Rust is already installed:"
rustc --version
cargo --version
echo "Updating Rust toolchain..."
rustup update stable
else
echo "Rust not found. Installing Rust..."
# Install Rust using rustup
if [ "Windows_NT" = "${OS:-}" ]; then
# Windows installation
curl --proto '=https' --tlsv1.2 -sSf https://win.rustup.rs/x86_64 -o rustup-init.exe
./rustup-init.exe -y --default-toolchain stable
rm rustup-init.exe
# Add to PATH for current session
export PATH="$HOME/.cargo/bin:$PATH"
else
# Unix-like installation (Linux, macOS)
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable
# Source cargo env
source "$HOME/.cargo/env"
fi
echo "Rust installation complete:"
rustc --version
cargo --version
fi
# Install maturin if not already installed
if ! command -v maturin &> /dev/null; then
echo "Installing maturin..."
cargo install maturin
echo "maturin installation complete:"
maturin --version
else
echo "maturin is already installed:"
maturin --version
fi
echo "Rust toolchain setup complete."

View File

@ -153,6 +153,16 @@ def run() -> None:
if os.environ.get("PYMONGOCRYPT_LIB"):
handle_pymongocrypt()
# Check if Rust extension is being used
if os.environ.get("PYMONGO_USE_RUST") or os.environ.get("PYMONGO_BUILD_RUST"):
try:
import bson
LOGGER.info(f"BSON implementation: {bson.get_bson_implementation()}")
LOGGER.info(f"Has Rust: {bson.has_rust()}, Has C: {bson.has_c()}")
except Exception as e:
LOGGER.warning(f"Could not check BSON implementation: {e}")
LOGGER.info(f"Test setup:\n{AUTH=}\n{SSL=}\n{UV_ARGS=}\n{TEST_ARGS=}")
# Record the start time for a perf test.

View File

@ -22,6 +22,11 @@ bash $HERE/install-dependencies.sh
# Handle the value for UV_PYTHON.
. $HERE/setup-uv-python.sh
# Show Rust toolchain status for debugging
echo "Rust toolchain: $(rustc --version 2>/dev/null || echo 'not found')"
echo "Cargo: $(cargo --version 2>/dev/null || echo 'not found')"
echo "Maturin: $(maturin --version 2>/dev/null || echo 'not found')"
# Only run the next part if not running on CI.
if [ -z "${CI:-}" ]; then
# Add the default install path to the path if needed.

View File

@ -13,6 +13,8 @@ set -eu
# MONGODB_API_VERSION The mongodb api version to use in tests.
# MONGODB_URI If non-empty, use as the MONGODB_URI in tests.
# USE_ACTIVE_VENV If non-empty, use the active virtual environment.
# PYMONGO_BUILD_RUST If non-empty, build and test with Rust extension.
# PYMONGO_USE_RUST If non-empty, use the Rust extension for tests.
SCRIPT_DIR=$(dirname ${BASH_SOURCE:-$0})
@ -21,6 +23,12 @@ if [ -f $SCRIPT_DIR/env.sh ]; then
source $SCRIPT_DIR/env.sh
fi
# Install Rust toolchain if building Rust extension
if [ -n "${PYMONGO_BUILD_RUST:-}" ]; then
echo "PYMONGO_BUILD_RUST is set, installing Rust toolchain..."
bash $SCRIPT_DIR/install-rust.sh
fi
echo "Setting up tests with args \"$*\"..."
uv run ${USE_ACTIVE_VENV:+--active} "$SCRIPT_DIR/setup_tests.py" "$@"
echo "Setting up tests with args \"$*\"... done."

View File

@ -32,6 +32,8 @@ PASS_THROUGH_ENV = [
"UV_PYTHON",
"REQUIRE_FIPS",
"IS_WIN32",
"PYMONGO_USE_RUST",
"PYMONGO_BUILD_RUST",
]
# Map the test name to test extra.
@ -455,7 +457,7 @@ def handle_test_env() -> None:
# PYTHON-4769 Run perf_test.py directly otherwise pytest's test collection negatively
# affects the benchmark results.
if sub_test_name == "sync":
if sub_test_name == "sync" or sub_test_name == "rust":
TEST_ARGS = f"test/performance/perf_test.py {TEST_ARGS}"
else:
TEST_ARGS = f"test/performance/async_perf_test.py {TEST_ARGS}"
@ -481,6 +483,10 @@ def handle_test_env() -> None:
if TEST_SUITE:
TEST_ARGS = f"-m {TEST_SUITE} {TEST_ARGS}"
# For test_bson, run the specific test file
if test_name == "test_bson":
TEST_ARGS = f"test/test_bson.py {TEST_ARGS}"
write_env("TEST_ARGS", TEST_ARGS)
write_env("UV_ARGS", " ".join(UV_ARGS))

View File

@ -45,6 +45,7 @@ TEST_SUITE_MAP = {
"ocsp": "ocsp",
"perf": "perf",
"numpy": "",
"test_bson": "",
}
# Tests that require a sub test suite.

View File

@ -61,8 +61,17 @@ jobs:
os: [ubuntu-latest]
python-version: ["3.10", "pypy-3.11", "3.13t"]
mongodb-version: ["8.0"]
extension: ["c", "rust"]
exclude:
# Don't test Rust with pypy
- python-version: "pypy-3.11"
extension: "rust"
# Don't test Rust with free-threaded Python (not yet supported)
- python-version: "3.13t"
extension: "rust"
name: CPython ${{ matrix.python-version }}-${{ matrix.os }}
name: CPython ${{ matrix.python-version }}-${{ matrix.os }}-${{ matrix.extension }}
continue-on-error: ${{ matrix.extension == 'rust' }}
steps:
- uses: actions/checkout@v6
with:
@ -72,12 +81,20 @@ jobs:
with:
enable-cache: true
python-version: ${{ matrix.python-version }}
- name: Install Rust toolchain
if: matrix.extension == 'rust'
uses: dtolnay/rust-toolchain@efa25f7f19611383d5b0ccf2d1c8914531636bf9 # stable
with:
toolchain: stable
- id: setup-mongodb
uses: mongodb-labs/drivers-evergreen-tools@master
with:
version: "${{ matrix.mongodb-version }}"
- name: Run tests
run: uv run --extra test pytest -v
env:
PYMONGO_BUILD_RUST: ${{ matrix.extension == 'rust' && '1' || '' }}
PYMONGO_USE_RUST: ${{ matrix.extension == 'rust' && '1' || '' }}
coverage:
# This enables a coverage report for a given PR, which will be augmented by

4
.gitignore vendored
View File

@ -44,3 +44,7 @@ xunit-results/
coverage.xml
server.log
.coverage
# Rust build artifacts
target/
Cargo.lock

View File

@ -103,7 +103,8 @@ repos:
# - test/test_bson.py:267: isnt ==> isn't
# - test/versioned-api/crud-api-version-1-strict.json:514: nin ==> inn, min, bin, nine
# - test/test_client.py:188: te ==> the, be, we, to
args: ["-L", "fle,fo,infinit,isnt,nin,te,aks"]
# - README.md:534: crate ==> create (Rust terminology - a crate is a Rust package)
args: ["-L", "fle,fo,infinit,isnt,nin,te,aks,crate"]
- repo: local
hooks:

View File

@ -72,6 +72,7 @@ bytes [#bytes]_ binary both
from __future__ import annotations
import datetime
import importlib.util
import itertools
import os
import re
@ -143,12 +144,79 @@ if TYPE_CHECKING:
from bson.raw_bson import RawBSONDocument
from bson.typings import _DocumentType, _ReadableBuffer
try:
from bson import _cbson # type: ignore[attr-defined]
# Try to import C and Rust extensions
_cbson = None
_rbson = None
_HAS_C = False
_HAS_RUST = False
_USE_C = True
except ImportError:
_USE_C = False
# Use importlib to avoid circular import issues
_spec = None
try:
# Check if already loaded (e.g., when reloading bson module)
if "bson._cbson" in sys.modules:
_cbson = sys.modules["bson._cbson"]
if hasattr(_cbson, "_bson_to_dict"):
_HAS_C = True
else:
_spec = importlib.util.find_spec("bson._cbson")
if _spec and _spec.loader:
_cbson = importlib.util.module_from_spec(_spec)
_spec.loader.exec_module(_cbson)
if hasattr(_cbson, "_bson_to_dict"):
_HAS_C = True
else:
_cbson = None
except (ImportError, AttributeError):
pass
try:
# Check if already loaded (e.g., when reloading bson module)
if "bson._rbson" in sys.modules:
_rbson = sys.modules["bson._rbson"]
if hasattr(_rbson, "_bson_to_dict"):
_HAS_RUST = True
else:
_spec = importlib.util.find_spec("bson._rbson")
if _spec and _spec.loader:
_rbson = importlib.util.module_from_spec(_spec)
_spec.loader.exec_module(_rbson)
if hasattr(_rbson, "_bson_to_dict"):
_HAS_RUST = True
else:
_rbson = None
except (ImportError, AttributeError):
pass
# Clean up the spec variable to avoid polluting the module namespace
del _spec
# Determine which extension to use at runtime
# Priority: PYMONGO_USE_RUST env var > C extension (default) > pure Python
_USE_RUST_RUNTIME = os.environ.get("PYMONGO_USE_RUST", "").lower() in ("1", "true", "yes")
# Decide which extension to actually use
_USE_C = False
_USE_RUST = False
if _USE_RUST_RUNTIME:
if _HAS_RUST:
# User requested Rust and it's available - use Rust, not C
_USE_RUST = True
elif _HAS_C:
# User requested Rust but it's not available - warn and use C
import warnings
warnings.warn(
"PYMONGO_USE_RUST is set but Rust extension is not available. "
"Falling back to C extension.",
stacklevel=2,
)
_USE_C = True
else:
# User didn't request Rust - use C by default if available
if _HAS_C:
_USE_C = True
__all__ = [
"ALL_UUID_SUBTYPES",
@ -209,6 +277,8 @@ __all__ = [
"is_valid",
"BSON",
"has_c",
"has_rust",
"get_bson_implementation",
"DatetimeConversion",
"DatetimeMS",
]
@ -543,7 +613,7 @@ if _USE_C:
) -> Tuple[str, Any, int]:
return cast(
"Tuple[str, Any, int]",
_cbson._element_to_dict(data, position, obj_end, opts, raw_array),
_cbson._element_to_dict(data, position, obj_end, opts, raw_array), # type: ignore[union-attr]
)
else:
@ -634,8 +704,13 @@ def _bson_to_dict(data: Any, opts: CodecOptions[_DocumentType]) -> _DocumentType
raise InvalidBSON(str(exc_value)).with_traceback(exc_tb) from None
if _USE_C:
_bson_to_dict = _cbson._bson_to_dict
# Save reference to Python implementation before overriding
_bson_to_dict_python = _bson_to_dict
if _USE_RUST:
_bson_to_dict = _rbson._bson_to_dict # type: ignore[union-attr]
elif _USE_C:
_bson_to_dict = _cbson._bson_to_dict # type: ignore[union-attr]
_PACK_FLOAT = struct.Struct("<d").pack
@ -1017,8 +1092,10 @@ def _dict_to_bson(
return _PACK_INT(len(encoded) + 5) + encoded + b"\x00"
if _USE_C:
_dict_to_bson = _cbson._dict_to_bson
if _USE_RUST:
_dict_to_bson = _rbson._dict_to_bson # type: ignore[union-attr]
elif _USE_C:
_dict_to_bson = _cbson._dict_to_bson # type: ignore[union-attr]
_CODEC_OPTIONS_TYPE_ERROR = TypeError("codec_options must be an instance of CodecOptions")
@ -1130,7 +1207,7 @@ def _decode_all(data: _ReadableBuffer, opts: CodecOptions[_DocumentType]) -> lis
if _USE_C:
_decode_all = _cbson._decode_all
_decode_all = _cbson._decode_all # type: ignore[union-attr]
@overload
@ -1223,7 +1300,7 @@ def _array_of_documents_to_buffer(data: Union[memoryview, bytes]) -> bytes:
if _USE_C:
_array_of_documents_to_buffer = _cbson._array_of_documents_to_buffer
_array_of_documents_to_buffer = _cbson._array_of_documents_to_buffer # type: ignore[union-attr]
def _convert_raw_document_lists_to_streams(document: Any) -> None:
@ -1470,7 +1547,30 @@ class BSON(bytes):
def has_c() -> bool:
"""Is the C extension installed?"""
return _USE_C
return _HAS_C
def has_rust() -> bool:
"""Is the Rust extension installed?
.. versionadded:: 5.0
"""
return _HAS_RUST
def get_bson_implementation() -> str:
"""Get the name of the BSON implementation being used.
Returns one of: 'rust', 'c', or 'python'.
.. versionadded:: 5.0
"""
if _USE_RUST:
return "rust"
elif _USE_C:
return "c"
else:
return "python"
def _after_fork() -> None:

20
bson/_rbson/Cargo.toml Normal file
View File

@ -0,0 +1,20 @@
[package]
name = "bson-rbson"
version = "0.1.0"
edition = "2021"
[lib]
name = "_rbson"
crate-type = ["cdylib"]
[dependencies]
pyo3 = { version = "0.23", features = ["extension-module", "abi3-py39"] }
bson = "2.13"
serde = "1.0"
once_cell = "1.20"
[profile.release]
opt-level = 3
lto = true
codegen-units = 1
strip = true

432
bson/_rbson/README.md Normal file
View File

@ -0,0 +1,432 @@
# Rust BSON Extension Module
⚠️ **NOT PRODUCTION READY** - This is an experimental implementation with incomplete feature support and performance limitations. See [Test Status](#test-status) and [Performance Analysis](#performance-analysis) sections below.
This directory contains a Rust-based implementation of BSON encoding/decoding for PyMongo, developed as part of [PYTHON-5683](https://jira.mongodb.org/browse/PYTHON-5683).
## Overview
The Rust extension (`_rbson`) provides a **partial implementation** of the C extension (`_cbson`) interface, implemented in Rust using:
- **PyO3**: Python bindings for Rust
- **bson crate**: MongoDB's official Rust BSON library
- **Maturin**: Build tool for Rust Python extensions
## Test Status
### ✅ Core BSON Tests: 86 passed, 2 skipped
The basic BSON encoding/decoding functionality works correctly (`test/test_bson.py`).
### ⏭️ Skipped Tests: ~85 tests across multiple test files
The following features are **not implemented** and tests are skipped when using the Rust extension:
#### Custom Type Encoders (test/test_custom_types.py)
- **`TypeEncoder` and `TypeRegistry`** - Custom type encoding/decoding
- **`FallbackEncoder`** - Fallback encoding for unknown types
- **Tests skipped**: All tests in `TestBSONFallbackEncoder`, `TestCustomPythonBSONTypeToBSONMonolithicCodec`, `TestCustomPythonBSONTypeToBSONMultiplexedCodec`
- **Reason**: Rust extension doesn't support custom type encoders or fallback encoders
#### RawBSONDocument (test/test_raw_bson.py)
- **`RawBSONDocument` codec options** - Raw BSON document handling
- **Tests skipped**: All tests in `TestRawBSONDocument`
- **Reason**: Rust extension doesn't implement RawBSONDocument codec options
#### DBRef Edge Cases (test/test_dbref.py)
- **DBRef validation and edge cases**
- **Tests skipped**: Some DBRef tests
- **Reason**: Incomplete DBRef handling in Rust extension
#### Type Checking (test/test_typing.py)
- **Type hints and mypy validation**
- **Tests skipped**: Some typing tests
- **Reason**: Type checking issues with Rust extension
### Skip Mechanism
Tests are skipped using the `@skip_if_rust_bson` pytest marker defined in `test/__init__.py`:
```python
skip_if_rust_bson = pytest.mark.skipif(
_use_rust_bson(), reason="Rust BSON extension does not support this feature"
)
```
This marker is applied to test classes and methods that use unimplemented features.
## Implementation History
This implementation was developed through [PR #2695](https://github.com/mongodb/mongo-python-driver/pull/2695) to investigate using Rust as an alternative to C for Python extension modules.
### Key Milestones
1. **Initial Implementation** - Basic BSON type support with core functionality
2. **Performance Optimizations** - Type caching, fast paths for common types, direct byte operations
3. **Modular Refactoring** - Split monolithic lib.rs into 6 well-organized modules
4. **Test Integration** - Added skip markers for unimplemented features (~85 tests skipped)
## Features
### Supported BSON Types
The Rust extension supports basic BSON types:
- **Primitives**: Double, String, Int32, Int64, Boolean, Null
- **Complex Types**: Document, Array, Binary, ObjectId, DateTime
- **Special Types**: Regex, Code, Timestamp, Decimal128, MinKey, MaxKey
- **Deprecated Types**: DBPointer (decodes to DBRef)
### CodecOptions Support
**Partial** support for PyMongo's `CodecOptions`:
- ✅ `document_class` - Custom document classes (basic support)
- ✅ `tz_aware` - Timezone-aware datetime handling
- ✅ `tzinfo` - Timezone conversion
- ✅ `uuid_representation` - UUID encoding/decoding modes
- ✅ `datetime_conversion` - DateTime handling modes (AUTO, CLAMP, MS)
- ✅ `unicode_decode_error_handler` - UTF-8 error handling
- ❌ `type_registry` - Custom type encoders/decoders (NOT IMPLEMENTED)
- ❌ RawBSONDocument support (NOT IMPLEMENTED)
### Runtime Selection
The Rust extension can be enabled via environment variable:
```bash
export PYMONGO_USE_RUST=1
python your_script.py
```
Without this variable, PyMongo uses the C extension by default.
## Performance Analysis
### Current Performance: ~0.21x (5x slower than C)
**Benchmark Results** (from PR #2695):
```
Simple documents: C: 100% | Rust: 21%
Mixed types: C: 100% | Rust: 20%
Nested documents: C: 100% | Rust: 18%
Lists: C: 100% | Rust: 22%
```
### Root Cause: Architectural Difference
The performance gap is due to a fundamental architectural difference:
**C Extension Architecture:**
```
Python objects → BSON bytes (direct)
```
- Writes BSON bytes directly from Python objects
- No intermediate data structures
- Minimal memory allocations
**Rust Extension Architecture:**
```
Python objects → Rust Bson enum → BSON bytes
```
- Converts Python objects to Rust `Bson` enum
- Then serializes `Bson` to bytes
- Extra conversion layer adds overhead
### Optimization Attempts
Multiple optimization strategies were attempted in PR #2695:
1. **Type Caching** - Cache frequently used Python types (UUID, datetime, etc.)
2. **Fast Paths** - Special handling for common types (int, str, bool, None)
3. **Direct Byte Writing** - Write BSON bytes directly without intermediate `Document`
4. **PyDict Fast Path** - Use `PyDict_Next` for efficient dict iteration
**Result**: These optimizations improved performance from ~0.15x to ~0.21x, but the fundamental architectural difference remains.
## Comparison with Copilot POC (PR #2689)
The current implementation evolved significantly from the initial Copilot-generated proof-of-concept in PR #2689:
### Copilot POC (PR #2689) - Initial Spike
**Status**: 53/88 tests passing (60%)
**Build System**: `cargo build --release` (manual copy of .so file)
- Used raw `cargo` commands
- Manual file copying to project root
- No wheel generation
- Located in `rust/` directory
**What it had:**
- ✅ Basic BSON type support (int, float, string, bool, bytes, dict, list, null)
- ✅ ObjectId, DateTime, Regex encoding/decoding
- ✅ Binary, Code, Timestamp, Decimal128, MinKey, MaxKey support
- ✅ DBRef and DBPointer decoding
- ✅ Int64 type marker support
- ✅ Basic CodecOptions (tz_aware, uuid_representation)
- ✅ Buffer protocol support (memoryview, array)
- ✅ _id field ordering at top level
- ✅ Benchmark scripts and performance analysis
- ✅ Comprehensive documentation (RUST_SPIKE_RESULTS.md)
- ✅ **Same Rust architecture**: PyO3 0.27 + bson 2.13 crate (Python → Bson enum → bytes)
**What it lacked:**
- ❌ Only 60% test pass rate (53/88 tests)
- ❌ Incomplete datetime handling (no DATETIME_CLAMP, DATETIME_AUTO, DATETIME_MS modes)
- ❌ Missing unicode_decode_error_handler support
- ❌ No document_class support from CodecOptions
- ❌ No tzinfo conversion support
- ❌ Missing BSON validation (size checks, null terminator)
- ❌ No performance optimizations (type caching, fast paths)
- ❌ Located in `rust/` directory instead of `bson/_rbson/`
**Performance Claims**: 2.89x average speedup over C (from benchmarks in POC)
**Why the POC appeared faster:**
The Copilot POC's claimed 2.89x speedup was likely due to:
1. **Limited test scope** - Benchmarks only tested simple documents that passed (53/88 tests)
2. **Missing validation** - No BSON size checks, null terminator validation, or extra bytes detection
3. **Incomplete CodecOptions** - Skipped expensive operations like:
- Timezone conversions (`tzinfo` with `astimezone()`)
- DateTime mode handling (CLAMP, AUTO, MS)
- Unicode error handler fallbacks to Python
- Custom document_class instantiation
4. **Optimistic measurements** - May have measured only the fast path without edge cases
5. **Different test methodology** - POC used custom benchmarks vs production testing with full PyMongo test suite
When these missing features were added to achieve 100% compatibility, the true performance cost of the Rust `Bson` enum architecture became apparent.
### Current Implementation (PR #2695) - Experimental
**Status**: 86/88 core BSON tests passing, ~85 feature tests skipped
**Build System**: `maturin build --release` (proper wheel generation)
- Uses Maturin for proper Python packaging
- Generates wheels with correct metadata
- Extracts .so file to `bson/` directory
- Located in `bson/_rbson/` directory (proper module structure)
**Improvements over Copilot POC:**
- ✅ **Core BSON functionality** (86/88 tests passing in test_bson.py)
- ✅ **Basic CodecOptions support**:
- `document_class` - Custom document classes (basic support)
- `tzinfo` - Timezone conversion with astimezone()
- `datetime_conversion` - All modes (AUTO, CLAMP, MS)
- `unicode_decode_error_handler` - Fallback to Python for non-strict handlers
- ✅ **BSON validation** (size checks, null terminator, extra bytes detection)
- ✅ **Performance optimizations**:
- Type caching (UUID, datetime, Pattern, etc.)
- Fast paths for common types (int, str, bool, None)
- Direct byte operations where possible
- PyDict fast path with pre-allocation
- ✅ **Modular code structure** (6 well-organized Rust modules)
- ✅ **Proper module structure** (`bson/_rbson/` with build.sh and maturin)
- ✅ **Runtime selection** via PYMONGO_USE_RUST environment variable
- ✅ **Test skip markers** for unimplemented features
- ✅ **Same Rust architecture**: PyO3 0.23 + bson 2.13 crate (Python → Bson enum → bytes)
**Missing Features** (see [Test Status](#test-status)):
- ❌ **Custom type encoders** (`TypeEncoder`, `TypeRegistry`, `FallbackEncoder`)
- ❌ **RawBSONDocument** codec options
- ❌ **Some DBRef edge cases**
- ❌ **Complete type checking support**
**Performance Reality**: ~0.21x (5x slower than C) - see Performance Analysis section
**Key Insights**:
1. **Same Architecture, Different Results**: Both implementations use the same Rust architecture (PyO3 + bson crate with intermediate `Bson` enum), so the build system (cargo vs maturin) is not the cause of the performance difference.
2. **Incomplete Implementation**: The current implementation has ~85 tests skipped due to unimplemented features (custom type encoders, RawBSONDocument, etc.). This is an experimental implementation, not production-ready.
3. **The Fundamental Issue**: The Rust architecture (Python → Bson enum → bytes) has inherent performance limitations compared to the C extension's direct byte-writing approach.
## Direct Byte-Writing Performance Results
### Implementation: `_dict_to_bson_direct()`
A new implementation has been added that writes BSON bytes directly from Python objects without converting to `Bson` enum types first. This eliminates the intermediate conversion layer.
**Architecture Comparison:**
```
Regular: Python objects → Rust Bson enum → BSON bytes
Direct: Python objects → BSON bytes (no intermediate types)
```
### Benchmark Results
Comprehensive benchmarks on realistic document types show **consistent 2x speedup**:
| Document Type | Regular (ops/sec) | Direct (ops/sec) | Speedup |
|--------------|-------------------|------------------|---------|
| User Profile | 99,970 | 208,658 | **2.09x** |
| E-commerce Order | 93,578 | 165,636 | **1.77x** |
| IoT Sensor Data | 136,824 | 312,058 | **2.28x** |
| Blog Post | 65,782 | 134,154 | **2.04x** |
**Average Speedup: 2.04x** (range: 1.77x - 2.28x)
### Performance by Document Composition
| Document Type | Regular (ops/sec) | Direct (ops/sec) | Speedup |
|--------------|-------------------|------------------|---------|
| Simple types (int, str, float, bool, None) | 177,588 | 800,670 | **4.51x** |
| Mixed types | 223,856 | 342,305 | **1.53x** |
| Nested documents | 130,884 | 287,758 | **2.20x** |
| BSON-specific types only | 342,059 | 304,844 | 0.89x |
### Key Findings
1. **Massive speedup for simple types**: 4.51x faster for documents with Python native types
2. **Consistent 2x improvement for real-world documents**: All realistic mixed-type documents show 1.77x - 2.28x speedup
3. **Slight slowdown for pure BSON types**: Documents with only BSON-specific types (ObjectId, Binary, etc.) are 10% slower due to extra Python attribute lookups
4. **100% correctness**: All outputs verified to be byte-identical to the regular implementation
### Why Direct Byte-Writing is Faster
1. **Eliminates heap allocations**: No need to create intermediate `Bson` enum values
2. **Reduces function call overhead**: Writes bytes immediately instead of going through `python_to_bson()``write_bson_value()`
3. **Better for common types**: Python's native types (int, str, float, bool) can be written directly without any conversion
### Implementation Details
The direct approach is implemented in these functions:
- `_dict_to_bson_direct()` - Public API function
- `write_document_bytes_direct()` - Writes document structure directly
- `write_element_direct()` - Writes individual elements without Bson conversion
- `write_bson_type_direct()` - Handles BSON-specific types directly
### Usage
```python
from bson import _rbson
from bson.codec_options import DEFAULT_CODEC_OPTIONS
# Use direct byte-writing approach
doc = {"name": "John", "age": 30, "score": 95.5}
bson_bytes = _rbson._dict_to_bson_direct(doc, False, DEFAULT_CODEC_OPTIONS)
```
### Benchmarking
Run the benchmarks yourself:
```bash
python benchmark_direct_bson.py # Quick comparison
python benchmark_bson_types.py # Individual type analysis
python benchmark_comprehensive.py # Detailed statistics
```
## Steps to Achieve Performance Parity with C Extensions
Based on the analysis in PR #2695 and the direct byte-writing results, here are the steps needed to match C extension performance:
### 1. ✅ Eliminate Intermediate Bson Enum (High Impact) - COMPLETED
**Current**: Python → Bson → bytes
**Target**: Python → bytes (direct)
**Status**: ✅ **Implemented as `_dict_to_bson_direct()`**
**Actual Impact**: **2.04x average speedup** on realistic documents (range: 1.77x - 2.28x)
This brings the Rust extension from ~0.21x (5x slower than C) to **~0.43x (2.3x slower than C)** - a significant improvement!
### 2. Optimize Python API Calls (Medium Impact)
- Reduce `getattr()` calls by caching attribute lookups
- Use `PyDict_GetItem` instead of `dict.get_item()`
- Minimize Python exception handling overhead
- Use `PyTuple_GET_ITEM` for tuple access
**Estimated Impact**: 1.2-1.5x performance improvement
### 3. Memory Allocation Optimization (Low-Medium Impact)
- Pre-allocate buffers based on estimated document size
- Reuse buffers across multiple encode operations
- Use arena allocation for temporary objects
**Estimated Impact**: 1.1-1.3x performance improvement
### 4. SIMD Optimizations (Low Impact)
- Use SIMD for byte copying operations
- Vectorize validation checks
- Optimize string encoding/decoding
**Estimated Impact**: 1.05-1.1x performance improvement
### Combined Potential (Updated with Direct Byte-Writing Results)
With direct byte-writing implemented:
- **Before**: 0.21x (5x slower than C)
- **After direct byte-writing**: 0.43x (2.3x slower than C) ✅
- **With all optimizations**: 0.43x × 1.3 × 1.2 × 1.05 = **~0.71x** (1.4x slower than C)
- **Optimistic target**: Could potentially reach **~0.9x - 1.0x** (parity with C)
The direct byte-writing approach has already delivered the largest performance gain (2x). Additional optimizations could close the remaining gap to C extension performance.
## Building
```bash
cd bson/_rbson
./build.sh
```
Or using maturin directly:
```bash
maturin develop --release
```
## Testing
Run the core BSON test suite with the Rust extension:
```bash
PYMONGO_USE_RUST=1 python -m pytest test/test_bson.py -v
# Expected: 86 passed, 2 skipped
```
Run all tests (including skipped tests):
```bash
PYMONGO_USE_RUST=1 python -m pytest test/ -v
# Expected: Many tests passed, ~85 tests skipped due to unimplemented features
```
Run performance benchmarks:
```bash
python test/performance/perf_test.py
```
## Module Structure
The Rust codebase is organized into 6 well-structured modules (refactored from a single 3,117-line file):
- **`lib.rs`** (76 lines) - Module exports and public API
- **`types.rs`** (266 lines) - Type cache and BSON type markers
- **`errors.rs`** (56 lines) - Error handling utilities
- **`utils.rs`** (154 lines) - Utility functions (datetime, regex, validation)
- **`encode.rs`** (1,545 lines) - BSON encoding functions
- **`decode.rs`** (1,141 lines) - BSON decoding functions
This modular structure improves:
- Code organization and maintainability
- Compilation times (parallel module compilation)
- Code navigation and testing
- Clear separation of concerns
## Conclusion
The Rust extension demonstrates that:
1. ✅ **Rust can provide basic BSON encoding/decoding functionality**
2. ❌ **Complete feature parity with C extension is not achieved** (~85 tests skipped)
3. ❌ **Performance parity with C requires bypassing the `bson` crate**
4. ❌ **The engineering effort may not justify the benefits**
### Recommendation
⚠️ **NOT PRODUCTION READY** - The Rust extension is **experimental** and has significant limitations:
**Missing Features:**
- Custom type encoders (`TypeEncoder`, `TypeRegistry`, `FallbackEncoder`)
- RawBSONDocument codec options
- Some DBRef edge cases
- Complete type checking support
**Performance Issues:**
- ~5x slower than C extension (0.21x performance)
- Even with direct byte-writing optimizations, still ~2.3x slower (0.43x performance)
**Use Cases for Rust Extension:**
- **Experimental/research purposes only**
- Testing Rust-Python interop with PyO3
- Platforms where C compilation is difficult (with caveats about missing features)
- Future exploration if `bson` crate performance improves
**For production use, the C extension (`_cbson`) is strongly recommended.**
For more details, see:
- [PYTHON-5683 JIRA ticket](https://jira.mongodb.org/browse/PYTHON-5683)
- [PR #2695](https://github.com/mongodb/mongo-python-driver/pull/2695)

84
bson/_rbson/build.sh Executable file
View File

@ -0,0 +1,84 @@
#!/bin/bash
# Build script for Rust BSON extension POC
#
# This script builds the Rust extension and makes it available for testing
# alongside the existing C extension.
set -eu
HERE=$(dirname ${BASH_SOURCE:-$0})
HERE="$( cd -- "$HERE" > /dev/null 2>&1 && pwd )"
BSON_DIR=$(dirname "$HERE")
echo "=== Building Rust BSON Extension POC ==="
echo ""
# Check if Rust is installed
if ! command -v cargo &>/dev/null; then
echo "Error: Rust is not installed"
echo ""
echo "Install Rust with:"
echo " curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh"
echo ""
exit 1
fi
echo "Rust toolchain found: $(rustc --version)"
# Check if maturin is installed
if ! command -v maturin &>/dev/null; then
echo "maturin not found, installing..."
pip install maturin
fi
echo "maturin found: $(maturin --version)"
echo ""
# Build the extension
echo "Building Rust extension..."
cd "$HERE"
# Build wheel to a temporary directory
TEMP_DIR=$(mktemp -d)
trap 'rm -rf "$TEMP_DIR"' EXIT
maturin build --release --out "$TEMP_DIR"
# Extract the .so file from the wheel
echo "Extracting extension from wheel..."
WHEEL_FILE=$(ls "$TEMP_DIR"/*.whl | head -1)
if [ -z "$WHEEL_FILE" ]; then
echo "Error: No wheel file found"
exit 1
fi
# Wheels are zip files - extract the .so file
python -c "
import zipfile
import sys
from pathlib import Path
wheel_path = Path(sys.argv[1])
bson_dir = Path(sys.argv[2])
with zipfile.ZipFile(wheel_path, 'r') as whl:
for name in whl.namelist():
if name.endswith(('.so', '.pyd')) and '_rbson' in name:
# Extract to bson/ directory
so_data = whl.read(name)
so_name = Path(name).name
target = bson_dir / so_name
target.write_bytes(so_data)
print(f'Installed to {target}')
sys.exit(0)
print('Error: Could not find .so file in wheel')
sys.exit(1)
" "$WHEEL_FILE" "$BSON_DIR"
echo ""
echo "Build complete!"
echo ""
echo "Test the extension with:"
echo " python -c 'from bson import _rbson; print(_rbson._test_rust_extension())'"
echo ""

1140
bson/_rbson/src/decode.rs Normal file

File diff suppressed because it is too large Load Diff

1543
bson/_rbson/src/encode.rs Normal file

File diff suppressed because it is too large Load Diff

55
bson/_rbson/src/errors.rs Normal file
View File

@ -0,0 +1,55 @@
// Copyright 2025-present MongoDB, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Error handling utilities for BSON operations
use pyo3::prelude::*;
use pyo3::types::{PyAny, PyTuple};
use crate::types::TYPE_CACHE;
/// Helper to create InvalidDocument exception
pub(crate) fn invalid_document_error(py: Python, msg: String) -> PyErr {
let invalid_document = TYPE_CACHE.get_invalid_document_class(py)
.expect("Failed to get InvalidDocument class");
PyErr::from_value(
invalid_document.bind(py)
.call1((msg,))
.expect("Failed to create InvalidDocument")
)
}
/// Helper to create InvalidDocument exception with document property
pub(crate) fn invalid_document_error_with_doc(py: Python, msg: String, doc: &Bound<'_, PyAny>) -> PyErr {
let invalid_document = TYPE_CACHE.get_invalid_document_class(py)
.expect("Failed to get InvalidDocument class");
// Call with positional arguments: InvalidDocument(message, document)
let args = PyTuple::new_bound(py, &[msg.into_py(py), doc.clone().into_py(py)]);
PyErr::from_value(
invalid_document.bind(py)
.call1(args)
.expect("Failed to create InvalidDocument")
)
}
/// Helper to create InvalidBSON exception
pub(crate) fn invalid_bson_error(py: Python, msg: String) -> PyErr {
let invalid_bson = TYPE_CACHE.get_invalid_bson_class(py)
.expect("Failed to get InvalidBSON class");
PyErr::from_value(
invalid_bson.bind(py)
.call1((msg,))
.expect("Failed to create InvalidBSON")
)
}

85
bson/_rbson/src/lib.rs Normal file
View File

@ -0,0 +1,85 @@
// Copyright 2025-present MongoDB, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Rust implementation of BSON encoding/decoding functions
//!
//! ⚠️ **NOT PRODUCTION READY** - Experimental implementation with incomplete features.
//!
//! This module provides a **partial implementation** of the C extension (bson._cbson)
//! interface, implemented in Rust using PyO3 and the bson library.
//!
//! # Implementation Status
//!
//! - ✅ Core BSON encoding/decoding: 86/88 tests passing
//! - ❌ Custom type encoders: NOT IMPLEMENTED (~85 tests skipped)
//! - ❌ RawBSONDocument: NOT IMPLEMENTED
//! - ❌ Performance: ~5x slower than C extension
//!
//! # Implementation History
//!
//! This implementation was developed as part of PYTHON-5683 to investigate
//! using Rust as an alternative to C for Python extension modules.
//!
//! See PR #2695 for the complete implementation history, including:
//! - Initial implementation with core BSON functionality
//! - Performance optimizations (type caching, fast paths, direct conversions)
//! - Modular refactoring (split into 6 modules)
//! - Test skip markers for unimplemented features
//!
//! # Performance
//!
//! Current performance: ~0.21x (5x slower than C extension)
//! Root cause: Architectural difference (Python ↔ Bson ↔ bytes vs Python ↔ bytes)
//! See README.md for detailed performance analysis and optimization opportunities.
//!
//! # Module Structure
//!
//! The codebase is organized into the following modules:
//! - `types`: Type cache and BSON type markers
//! - `errors`: Error handling utilities
//! - `utils`: Utility functions (datetime, regex, validation, string writing)
//! - `encode`: BSON encoding functions
//! - `decode`: BSON decoding functions
#![allow(clippy::useless_conversion)]
mod types;
mod errors;
mod utils;
mod encode;
mod decode;
use pyo3::prelude::*;
use pyo3::types::PyDict;
/// Test function to verify the Rust extension is loaded
#[pyfunction]
fn _test_rust_extension(py: Python) -> PyResult<PyObject> {
let result = PyDict::new(py);
result.set_item("implementation", "rust")?;
result.set_item("version", "0.1.0")?;
result.set_item("status", "experimental")?;
result.set_item("pyo3_version", env!("CARGO_PKG_VERSION"))?;
Ok(result.into())
}
/// Python module definition
#[pymodule]
fn _rbson(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_function(wrap_pyfunction!(encode::_dict_to_bson, m)?)?;
m.add_function(wrap_pyfunction!(encode::_dict_to_bson_direct, m)?)?;
m.add_function(wrap_pyfunction!(decode::_bson_to_dict, m)?)?;
m.add_function(wrap_pyfunction!(_test_rust_extension, m)?)?;
Ok(())
}

265
bson/_rbson/src/types.rs Normal file
View File

@ -0,0 +1,265 @@
// Copyright 2025-present MongoDB, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Type cache for Python type objects
//!
//! This module provides a cache for Python type objects to avoid repeated imports.
//! This matches the C extension's approach of caching all BSON types at module initialization.
use once_cell::sync::OnceCell;
use pyo3::prelude::*;
use pyo3::types::PyAny;
/// Cache for Python type objects to avoid repeated imports
/// This matches the C extension's approach of caching all BSON types at module initialization
pub(crate) struct TypeCache {
// Standard library types
pub(crate) uuid_class: OnceCell<PyObject>,
pub(crate) datetime_class: OnceCell<PyObject>,
pub(crate) pattern_class: OnceCell<PyObject>,
// BSON types
pub(crate) binary_class: OnceCell<PyObject>,
pub(crate) code_class: OnceCell<PyObject>,
pub(crate) objectid_class: OnceCell<PyObject>,
pub(crate) dbref_class: OnceCell<PyObject>,
pub(crate) regex_class: OnceCell<PyObject>,
pub(crate) timestamp_class: OnceCell<PyObject>,
pub(crate) int64_class: OnceCell<PyObject>,
pub(crate) decimal128_class: OnceCell<PyObject>,
pub(crate) minkey_class: OnceCell<PyObject>,
pub(crate) maxkey_class: OnceCell<PyObject>,
pub(crate) datetime_ms_class: OnceCell<PyObject>,
// Utility objects
pub(crate) utc: OnceCell<PyObject>,
pub(crate) calendar_timegm: OnceCell<PyObject>,
// Error classes
pub(crate) invalid_document_class: OnceCell<PyObject>,
pub(crate) invalid_bson_class: OnceCell<PyObject>,
// Fallback decoder
pub(crate) bson_to_dict_python: OnceCell<PyObject>,
}
pub(crate) static TYPE_CACHE: TypeCache = TypeCache {
uuid_class: OnceCell::new(),
datetime_class: OnceCell::new(),
pattern_class: OnceCell::new(),
binary_class: OnceCell::new(),
code_class: OnceCell::new(),
objectid_class: OnceCell::new(),
dbref_class: OnceCell::new(),
regex_class: OnceCell::new(),
timestamp_class: OnceCell::new(),
int64_class: OnceCell::new(),
decimal128_class: OnceCell::new(),
minkey_class: OnceCell::new(),
maxkey_class: OnceCell::new(),
datetime_ms_class: OnceCell::new(),
utc: OnceCell::new(),
calendar_timegm: OnceCell::new(),
invalid_document_class: OnceCell::new(),
invalid_bson_class: OnceCell::new(),
bson_to_dict_python: OnceCell::new(),
};
impl TypeCache {
/// Get or initialize the UUID class
pub(crate) fn get_uuid_class(&self, py: Python) -> PyResult<Py<PyAny>> {
Ok(self.uuid_class.get_or_try_init(|| {
py.import_bound("uuid")?
.getattr("UUID")
.map(|c| c.unbind())
})?.clone_ref(py))
}
/// Get or initialize the datetime class
pub(crate) fn get_datetime_class(&self, py: Python) -> PyResult<Py<PyAny>> {
Ok(self.datetime_class.get_or_try_init(|| {
py.import_bound("datetime")?
.getattr("datetime")
.map(|c| c.unbind())
})?.clone_ref(py))
}
/// Get or initialize the regex Pattern class
pub(crate) fn get_pattern_class(&self, py: Python) -> PyResult<Py<PyAny>> {
Ok(self.pattern_class.get_or_try_init(|| {
py.import_bound("re")?
.getattr("Pattern")
.map(|c| c.unbind())
})?.clone_ref(py))
}
/// Get or initialize the Binary class
pub(crate) fn get_binary_class(&self, py: Python) -> PyResult<Py<PyAny>> {
Ok(self.binary_class.get_or_try_init(|| {
py.import_bound("bson.binary")?
.getattr("Binary")
.map(|c| c.unbind())
})?.clone_ref(py))
}
/// Get or initialize the Code class
pub(crate) fn get_code_class(&self, py: Python) -> PyResult<Py<PyAny>> {
Ok(self.code_class.get_or_try_init(|| {
py.import_bound("bson.code")?
.getattr("Code")
.map(|c| c.unbind())
})?.clone_ref(py))
}
/// Get or initialize the ObjectId class
pub(crate) fn get_objectid_class(&self, py: Python) -> PyResult<Py<PyAny>> {
Ok(self.objectid_class.get_or_try_init(|| {
py.import_bound("bson.objectid")?
.getattr("ObjectId")
.map(|c| c.unbind())
})?.clone_ref(py))
}
/// Get or initialize the DBRef class
pub(crate) fn get_dbref_class(&self, py: Python) -> PyResult<Py<PyAny>> {
Ok(self.dbref_class.get_or_try_init(|| {
py.import_bound("bson.dbref")?
.getattr("DBRef")
.map(|c| c.unbind())
})?.clone_ref(py))
}
/// Get or initialize the Regex class
pub(crate) fn get_regex_class(&self, py: Python) -> PyResult<Py<PyAny>> {
Ok(self.regex_class.get_or_try_init(|| {
py.import_bound("bson.regex")?
.getattr("Regex")
.map(|c| c.unbind())
})?.clone_ref(py))
}
/// Get or initialize the Timestamp class
pub(crate) fn get_timestamp_class(&self, py: Python) -> PyResult<Py<PyAny>> {
Ok(self.timestamp_class.get_or_try_init(|| {
py.import_bound("bson.timestamp")?
.getattr("Timestamp")
.map(|c| c.unbind())
})?.clone_ref(py))
}
/// Get or initialize the Int64 class
pub(crate) fn get_int64_class(&self, py: Python) -> PyResult<Py<PyAny>> {
Ok(self.int64_class.get_or_try_init(|| {
py.import_bound("bson.int64")?
.getattr("Int64")
.map(|c| c.unbind())
})?.clone_ref(py))
}
/// Get or initialize the Decimal128 class
pub(crate) fn get_decimal128_class(&self, py: Python) -> PyResult<Py<PyAny>> {
Ok(self.decimal128_class.get_or_try_init(|| {
py.import_bound("bson.decimal128")?
.getattr("Decimal128")
.map(|c| c.unbind())
})?.clone_ref(py))
}
/// Get or initialize the MinKey class
pub(crate) fn get_minkey_class(&self, py: Python) -> PyResult<Py<PyAny>> {
Ok(self.minkey_class.get_or_try_init(|| {
py.import_bound("bson.min_key")?
.getattr("MinKey")
.map(|c| c.unbind())
})?.clone_ref(py))
}
/// Get or initialize the MaxKey class
pub(crate) fn get_maxkey_class(&self, py: Python) -> PyResult<Py<PyAny>> {
Ok(self.maxkey_class.get_or_try_init(|| {
py.import_bound("bson.max_key")?
.getattr("MaxKey")
.map(|c| c.unbind())
})?.clone_ref(py))
}
/// Get or initialize the DatetimeMS class
pub(crate) fn get_datetime_ms_class(&self, py: Python) -> PyResult<Py<PyAny>> {
Ok(self.datetime_ms_class.get_or_try_init(|| {
py.import_bound("bson.datetime_ms")?
.getattr("DatetimeMS")
.map(|c| c.unbind())
})?.clone_ref(py))
}
/// Get or initialize the UTC timezone object
pub(crate) fn get_utc(&self, py: Python) -> PyResult<Py<PyAny>> {
Ok(self.utc.get_or_try_init(|| {
py.import_bound("bson.tz_util")?
.getattr("utc")
.map(|c| c.unbind())
})?.clone_ref(py))
}
/// Get or initialize calendar.timegm function
pub(crate) fn get_calendar_timegm(&self, py: Python) -> PyResult<Py<PyAny>> {
Ok(self.calendar_timegm.get_or_try_init(|| {
py.import_bound("calendar")?
.getattr("timegm")
.map(|c| c.unbind())
})?.clone_ref(py))
}
/// Get or initialize InvalidDocument exception class
pub(crate) fn get_invalid_document_class(&self, py: Python) -> PyResult<Py<PyAny>> {
Ok(self.invalid_document_class.get_or_try_init(|| {
py.import_bound("bson.errors")?
.getattr("InvalidDocument")
.map(|c| c.unbind())
})?.clone_ref(py))
}
/// Get or initialize InvalidBSON exception class
pub(crate) fn get_invalid_bson_class(&self, py: Python) -> PyResult<Py<PyAny>> {
Ok(self.invalid_bson_class.get_or_try_init(|| {
py.import_bound("bson.errors")?
.getattr("InvalidBSON")
.map(|c| c.unbind())
})?.clone_ref(py))
}
/// Get or initialize the Python fallback decoder
pub(crate) fn get_bson_to_dict_python(&self, py: Python) -> PyResult<Py<PyAny>> {
Ok(self.bson_to_dict_python.get_or_try_init(|| {
py.import_bound("bson")?
.getattr("_bson_to_dict_python")
.map(|c| c.unbind())
})?.clone_ref(py))
}
}
// Type markers for BSON objects
pub(crate) const BINARY_TYPE_MARKER: i32 = 5;
pub(crate) const OBJECTID_TYPE_MARKER: i32 = 7;
pub(crate) const DATETIME_TYPE_MARKER: i32 = 9;
pub(crate) const REGEX_TYPE_MARKER: i32 = 11;
pub(crate) const CODE_TYPE_MARKER: i32 = 13;
pub(crate) const SYMBOL_TYPE_MARKER: i32 = 14;
pub(crate) const DBPOINTER_TYPE_MARKER: i32 = 15;
pub(crate) const TIMESTAMP_TYPE_MARKER: i32 = 17;
pub(crate) const INT64_TYPE_MARKER: i32 = 18;
pub(crate) const DECIMAL128_TYPE_MARKER: i32 = 19;
pub(crate) const DBREF_TYPE_MARKER: i32 = 100;
pub(crate) const MAXKEY_TYPE_MARKER: i32 = 127;
pub(crate) const MINKEY_TYPE_MARKER: i32 = 255;

153
bson/_rbson/src/utils.rs Normal file
View File

@ -0,0 +1,153 @@
// Copyright 2025-present MongoDB, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Utility functions for BSON operations
use pyo3::prelude::*;
use pyo3::types::PyAny;
use crate::types::TYPE_CACHE;
/// Convert Python datetime to milliseconds since epoch UTC
/// This is equivalent to Python's bson.datetime_ms._datetime_to_millis()
pub(crate) fn datetime_to_millis(py: Python, dtm: &Bound<'_, PyAny>) -> PyResult<i64> {
// Get datetime components
let year: i32 = dtm.getattr("year")?.extract()?;
let month: i32 = dtm.getattr("month")?.extract()?;
let day: i32 = dtm.getattr("day")?.extract()?;
let hour: i32 = dtm.getattr("hour")?.extract()?;
let minute: i32 = dtm.getattr("minute")?.extract()?;
let second: i32 = dtm.getattr("second")?.extract()?;
let microsecond: i32 = dtm.getattr("microsecond")?.extract()?;
// Check if datetime has timezone offset
let utcoffset = dtm.call_method0("utcoffset")?;
let offset_seconds: i64 = if !utcoffset.is_none() {
// Get total_seconds() from timedelta
let total_seconds: f64 = utcoffset.call_method0("total_seconds")?.extract()?;
total_seconds as i64
} else {
0
};
// Calculate seconds since epoch using the same algorithm as Python's calendar.timegm
// This is: (year - 1970) * 365.25 days + month/day adjustments + time
// We'll use Python's calendar.timegm for accuracy
let timegm = TYPE_CACHE.get_calendar_timegm(py)?;
// Create a time tuple (year, month, day, hour, minute, second, weekday, yearday, isdst)
// We need timetuple() method
let timetuple = dtm.call_method0("timetuple")?;
let seconds_since_epoch: i64 = timegm.bind(py).call1((timetuple,))?.extract()?;
// Adjust for timezone offset (subtract to get UTC)
let utc_seconds = seconds_since_epoch - offset_seconds;
// Convert to milliseconds and add microseconds
let millis = utc_seconds * 1000 + (microsecond / 1000) as i64;
Ok(millis)
}
/// Convert Python regex flags (int) to BSON regex options (string)
pub(crate) fn int_flags_to_str(flags: i32) -> String {
let mut options = String::new();
// Python re module flags to BSON regex options:
// re.IGNORECASE = 2 -> 'i'
// re.MULTILINE = 8 -> 'm'
// re.DOTALL = 16 -> 's'
// re.VERBOSE = 64 -> 'x'
// Note: re.LOCALE and re.UNICODE are Python-specific
if flags & 2 != 0 {
options.push('i');
}
if flags & 4 != 0 {
options.push('l'); // Preserved for round-trip compatibility
}
if flags & 8 != 0 {
options.push('m');
}
if flags & 16 != 0 {
options.push('s');
}
if flags & 32 != 0 {
options.push('u'); // Preserved for round-trip compatibility
}
if flags & 64 != 0 {
options.push('x');
}
options
}
/// Convert BSON regex options (string) to Python regex flags (int)
pub(crate) fn str_flags_to_int(options: &str) -> i32 {
let mut flags = 0;
for ch in options.chars() {
match ch {
'i' => flags |= 2, // re.IGNORECASE
'l' => flags |= 4, // re.LOCALE
'm' => flags |= 8, // re.MULTILINE
's' => flags |= 16, // re.DOTALL
'u' => flags |= 32, // re.UNICODE
'x' => flags |= 64, // re.VERBOSE
_ => {} // Ignore unknown flags
}
}
flags
}
/// Validate a document key
pub(crate) fn validate_key(key: &str, check_keys: bool) -> PyResult<()> {
// Check for null bytes (always invalid)
if key.contains('\0') {
return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
"Key names must not contain the NULL byte"
));
}
// Check keys if requested (but not for _id)
if check_keys && key != "_id" {
if key.starts_with('$') {
return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
format!("key '{}' must not start with '$'", key)
));
}
if key.contains('.') {
return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
format!("key '{}' must not contain '.'", key)
));
}
}
Ok(())
}
/// Write a C-style null-terminated string
pub(crate) fn write_cstring(buf: &mut Vec<u8>, s: &str) {
buf.extend_from_slice(s.as_bytes());
buf.push(0);
}
/// Write a BSON string (int32 length + string + null terminator)
pub(crate) fn write_string(buf: &mut Vec<u8>, s: &str) {
let len = (s.len() + 1) as i32; // +1 for null terminator
buf.extend_from_slice(&len.to_le_bytes());
buf.extend_from_slice(s.as_bytes());
buf.push(0);
}

View File

@ -2,8 +2,12 @@
from __future__ import annotations
import os
import shutil
import subprocess
import sys
import tempfile
import warnings
import zipfile
from pathlib import Path
from hatchling.builders.hooks.plugin.interface import BuildHookInterface
@ -12,6 +16,116 @@ from hatchling.builders.hooks.plugin.interface import BuildHookInterface
class CustomHook(BuildHookInterface):
"""The pymongo build hook."""
def _build_rust_extension(self, here: Path, *, required: bool = False) -> bool:
"""Build the Rust BSON extension if Rust toolchain is available.
Args:
here: The root directory of the project.
required: If True, raise an error if the build fails. If False, issue a warning.
Returns True if built successfully, False otherwise.
"""
# Check if Rust is available
if not shutil.which("cargo"):
msg = (
"Rust toolchain not found. "
"Install Rust from https://rustup.rs/ to enable the Rust extension."
)
if required:
raise RuntimeError(msg)
warnings.warn(
f"{msg} Skipping Rust extension build.",
stacklevel=2,
)
return False
# Check if maturin is available
if not shutil.which("maturin"):
try:
# Try uv pip first, fall back to pip
if shutil.which("uv"):
subprocess.run(
["uv", "pip", "install", "maturin"],
check=True,
capture_output=True,
)
else:
subprocess.run(
[sys.executable, "-m", "pip", "install", "maturin"],
check=True,
capture_output=True,
)
except subprocess.CalledProcessError as e:
msg = f"Failed to install maturin: {e}"
if required:
raise RuntimeError(msg) from e
warnings.warn(
f"{msg}. Skipping Rust extension build.",
stacklevel=2,
)
return False
# Build the Rust extension
rust_dir = here / "bson" / "_rbson"
if not rust_dir.exists():
msg = f"Rust extension directory not found: {rust_dir}"
if required:
raise RuntimeError(msg)
return False
try:
# Build the wheel to a temporary directory
with tempfile.TemporaryDirectory() as tmpdir:
subprocess.run(
[
"maturin",
"build",
"--release",
"--out",
tmpdir,
"--manifest-path",
str(rust_dir / "Cargo.toml"),
],
check=True,
cwd=str(rust_dir),
)
# Extract the .so file from the wheel
# Find the wheel file
wheel_files = list(Path(tmpdir).glob("*.whl"))
if not wheel_files:
msg = "No wheel file generated by maturin"
if required:
raise RuntimeError(msg)
return False
# Extract the .so file from the wheel
# The wheel contains _rbson/_rbson.abi3.so, we want bson/_rbson.abi3.so
with zipfile.ZipFile(wheel_files[0], "r") as whl:
for name in whl.namelist():
if name.endswith((".so", ".pyd")) and "_rbson" in name:
# Extract to bson/ directory
so_data = whl.read(name)
so_name = Path(name).name # Just the filename, e.g., _rbson.abi3.so
dest = here / "bson" / so_name
dest.write_bytes(so_data)
return True
msg = "No Rust extension binary found in wheel"
if required:
raise RuntimeError(msg)
return False
except (subprocess.CalledProcessError, Exception) as e:
msg = f"Failed to build Rust extension: {e}"
if required:
raise RuntimeError(msg) from e
warnings.warn(
f"{msg}. The C extension will be used instead.",
stacklevel=2,
)
return False
def initialize(self, version, build_data):
"""Initialize the hook."""
if self.target_name == "sdist":
@ -19,7 +133,32 @@ class CustomHook(BuildHookInterface):
here = Path(__file__).parent.resolve()
sys.path.insert(0, str(here))
subprocess.run([sys.executable, "_setup.py", "build_ext", "-i"], check=True)
# Build C extensions
try:
subprocess.run([sys.executable, "_setup.py", "build_ext", "-i"], check=True)
except (subprocess.CalledProcessError, FileNotFoundError) as e:
warnings.warn(
f"Failed to build C extension: {e}. "
"The package will be installed without compiled extensions.",
stacklevel=2,
)
# Build Rust extension (optional)
# Only build if PYMONGO_BUILD_RUST is set or Rust is available
# Skip for free-threaded Python (not yet supported)
is_free_threaded = hasattr(sys, "_is_gil_enabled") and not sys._is_gil_enabled()
build_rust = os.environ.get("PYMONGO_BUILD_RUST", "").lower() in ("1", "true", "yes")
if build_rust and is_free_threaded:
warnings.warn(
"Rust extension is not yet supported on free-threaded Python. Skipping build.",
stacklevel=2,
)
elif build_rust:
# If PYMONGO_BUILD_RUST is explicitly set, the build must succeed
self._build_rust_extension(here, required=True)
elif shutil.which("cargo") and not is_free_threaded:
# If Rust is available but not explicitly requested, build is optional
self._build_rust_extension(here, required=False)
# Ensure wheel is marked as binary and contains the binary files.
build_data["infer_tag"] = True

View File

@ -127,3 +127,31 @@ run-server *args="":
[group('server')]
stop-server:
bash .evergreen/scripts/stop-server.sh
[group('rust')]
rust-build:
cd bson/_rbson && ./build.sh
[group('rust')]
rust-clean:
rm -f bson/_rbson*.so bson/_rbson*.pyd
cd bson/_rbson && cargo clean
[group('rust')]
rust-rebuild: rust-clean rust-build
[group('rust')]
rust-install:
PYMONGO_BUILD_RUST=1 pip install --force-reinstall --no-deps .
[group('rust')]
rust-install-full:
PYMONGO_BUILD_RUST=1 pip install --force-reinstall .
[group('rust')]
rust-test:
PYMONGO_USE_RUST=1 uv run --extra test python -m pytest test/test_bson.py -v
[group('rust')]
rust-check:
@python -c 'import os; os.environ["PYMONGO_USE_RUST"] = "1"; import bson; print("Rust extension:", bson.get_bson_implementation())'

View File

@ -132,6 +132,7 @@ markers = [
"mockupdb: tests that rely on mockupdb",
"default: default test suite",
"default_async: default async test suite",
"test_bson: bson module tests",
]
[tool.mypy]

View File

@ -84,6 +84,22 @@ from test.version import Version
_IS_SYNC = True
# Skip tests when using Rust BSON extension for features not yet implemented
# Import pytest lazily to avoid requiring it for integration tests
try:
import pytest
import bson
skip_if_rust_bson = pytest.mark.skipif(
bson.get_bson_implementation() == "rust",
reason="Feature not yet implemented in Rust BSON extension",
)
except ImportError:
# pytest not available, define a no-op decorator
def skip_if_rust_bson(func):
return func
def _connection_string(h):
if h.startswith(("mongodb://", "mongodb+srv://")):

View File

@ -84,6 +84,22 @@ from test.version import Version
_IS_SYNC = False
# Skip tests when using Rust BSON extension for features not yet implemented
# Import pytest lazily to avoid requiring it for integration tests
try:
import pytest
import bson
skip_if_rust_bson = pytest.mark.skipif(
bson.get_bson_implementation() == "rust",
reason="Feature not yet implemented in Rust BSON extension",
)
except ImportError:
# pytest not available, define a no-op decorator
def skip_if_rust_bson(func):
return func
def _connection_string(h):
if h.startswith(("mongodb://", "mongodb+srv://")):

View File

@ -28,7 +28,12 @@ from gridfs.asynchronous.grid_file import AsyncGridIn, AsyncGridOut
sys.path[0:0] = [""]
from test.asynchronous import AsyncIntegrationTest, async_client_context, unittest
from test.asynchronous import (
AsyncIntegrationTest,
async_client_context,
skip_if_rust_bson,
unittest,
)
from bson import (
_BUILT_IN_TYPES,
@ -211,6 +216,7 @@ class TestCustomPythonBSONTypeToBSONMultiplexedCodec(CustomBSONTypeTests, unitte
cls.codecopts = codec_options
@skip_if_rust_bson
class TestBSONFallbackEncoder(unittest.TestCase):
def _get_codec_options(self, fallback_encoder):
type_registry = TypeRegistry(fallback_encoder=fallback_encoder)
@ -336,6 +342,7 @@ class TestBSONTypeEnDeCodecs(unittest.TestCase):
self.assertFalse(issubclass(TypeEncoder, TypeDecoder))
@skip_if_rust_bson
class TestBSONCustomTypeEncoderAndFallbackEncoderTandem(unittest.TestCase):
TypeA: Any
TypeB: Any
@ -622,6 +629,7 @@ class TestTypeRegistry(unittest.TestCase):
run_test(TypeCodec, {"bson_type": Decimal128, "transform_bson": lambda x: x})
@skip_if_rust_bson
class TestCollectionWCustomType(AsyncIntegrationTest):
async def asyncSetUp(self):
await super().asyncSetUp()

View File

@ -19,7 +19,12 @@ import uuid
sys.path[0:0] = [""]
from test.asynchronous import AsyncIntegrationTest, async_client_context, unittest
from test.asynchronous import (
AsyncIntegrationTest,
async_client_context,
skip_if_rust_bson,
unittest,
)
from bson import Code, DBRef, decode, encode
from bson.binary import JAVA_LEGACY, Binary, UuidRepresentation
@ -31,6 +36,7 @@ from bson.son import SON
_IS_SYNC = False
@skip_if_rust_bson
class TestRawBSONDocument(AsyncIntegrationTest):
# {'_id': ObjectId('556df68b6e32ab21a95e0785'),
# 'name': 'Sherlock',

View File

@ -206,6 +206,152 @@ class PerformanceTest:
self.results = results
# RUST COMPARISON MICRO-BENCHMARKS
class RustComparisonTest(PerformanceTest):
"""Base class for tests that compare C vs Rust implementations."""
implementation: str = "c" # Default to C
async def asyncSetUp(self):
await super().asyncSetUp()
# Set up environment for C or Rust
if self.implementation == "rust":
os.environ["PYMONGO_USE_RUST"] = "1"
else:
os.environ.pop("PYMONGO_USE_RUST", None)
# Preserve extension modules when reloading
_cbson = sys.modules.get("bson._cbson")
_rbson = sys.modules.get("bson._rbson")
# Clear bson modules except extensions
for key in list(sys.modules.keys()):
if key.startswith("bson") and not key.endswith(("_cbson", "_rbson")):
del sys.modules[key]
# Restore extension modules
if _cbson:
sys.modules["bson._cbson"] = _cbson
if _rbson:
sys.modules["bson._rbson"] = _rbson
# Re-import bson
import bson as bson_module
self.bson = bson_module
class RustSimpleIntEncodingTest(RustComparisonTest):
"""Test encoding of simple integer documents."""
async def asyncSetUp(self):
await super().asyncSetUp()
self.document = {"number": 42}
self.data_size = len(encode(self.document)) * NUM_DOCS
async def do_task(self):
for _ in range(NUM_DOCS):
self.bson.encode(self.document)
class TestRustSimpleIntEncodingC(RustSimpleIntEncodingTest, AsyncPyMongoTestCase):
implementation = "c"
class TestRustSimpleIntEncodingRust(RustSimpleIntEncodingTest, AsyncPyMongoTestCase):
implementation = "rust"
class RustSimpleIntDecodingTest(RustComparisonTest):
"""Test decoding of simple integer documents."""
async def asyncSetUp(self):
await super().asyncSetUp()
self.document = encode({"number": 42})
self.data_size = len(self.document) * NUM_DOCS
async def do_task(self):
for _ in range(NUM_DOCS):
self.bson.decode(self.document)
class TestRustSimpleIntDecodingC(RustSimpleIntDecodingTest, AsyncPyMongoTestCase):
implementation = "c"
class TestRustSimpleIntDecodingRust(RustSimpleIntDecodingTest, AsyncPyMongoTestCase):
implementation = "rust"
class RustMixedTypesEncodingTest(RustComparisonTest):
"""Test encoding of documents with mixed types."""
async def asyncSetUp(self):
await super().asyncSetUp()
self.document = {
"string": "hello",
"int": 42,
"float": 3.14,
"bool": True,
"null": None,
}
self.data_size = len(encode(self.document)) * NUM_DOCS
async def do_task(self):
for _ in range(NUM_DOCS):
self.bson.encode(self.document)
class TestRustMixedTypesEncodingC(RustMixedTypesEncodingTest, AsyncPyMongoTestCase):
implementation = "c"
class TestRustMixedTypesEncodingRust(RustMixedTypesEncodingTest, AsyncPyMongoTestCase):
implementation = "rust"
class RustNestedEncodingTest(RustComparisonTest):
"""Test encoding of nested documents."""
async def asyncSetUp(self):
await super().asyncSetUp()
self.document = {"nested": {"level1": {"level2": {"value": "deep"}}}}
self.data_size = len(encode(self.document)) * NUM_DOCS
async def do_task(self):
for _ in range(NUM_DOCS):
self.bson.encode(self.document)
class TestRustNestedEncodingC(RustNestedEncodingTest, AsyncPyMongoTestCase):
implementation = "c"
class TestRustNestedEncodingRust(RustNestedEncodingTest, AsyncPyMongoTestCase):
implementation = "rust"
class RustListEncodingTest(RustComparisonTest):
"""Test encoding of documents with lists."""
async def asyncSetUp(self):
await super().asyncSetUp()
self.document = {"numbers": list(range(10))}
self.data_size = len(encode(self.document)) * NUM_DOCS
async def do_task(self):
for _ in range(NUM_DOCS):
self.bson.encode(self.document)
class TestRustListEncodingC(RustListEncodingTest, AsyncPyMongoTestCase):
implementation = "c"
class TestRustListEncodingRust(RustListEncodingTest, AsyncPyMongoTestCase):
implementation = "rust"
# SINGLE-DOC BENCHMARKS
class TestRunCommand(PerformanceTest, AsyncPyMongoTestCase):
data_size = len(encode({"hello": True})) * NUM_DOCS

View File

@ -137,7 +137,11 @@ class PerformanceTest:
# Remove "Test" so that TestFlatEncoding is reported as "FlatEncoding".
name = self.__class__.__name__[4:]
median = self.percentile(50)
megabytes_per_sec = (self.data_size * self.n_threads) / median / 1000000
# Protect against division by zero for very fast operations
if median > 0:
megabytes_per_sec = (self.data_size * self.n_threads) / median / 1000000
else:
megabytes_per_sec = float("inf")
print(
f"Completed {self.__class__.__name__} {megabytes_per_sec:.3f} MB/s, MEDIAN={self.percentile(50):.3f}s, "
f"total time={duration:.3f}s, iterations={len(self.results)}"
@ -273,6 +277,152 @@ class TestFullDecoding(BsonDecodingTest, unittest.TestCase):
dataset = "full_bson.json"
# RUST COMPARISON MICRO-BENCHMARKS
class RustComparisonTest(PerformanceTest):
"""Base class for tests that compare C vs Rust implementations."""
implementation: str = "c" # Default to C
def setUp(self):
super().setUp()
# Set up environment for C or Rust
if self.implementation == "rust":
os.environ["PYMONGO_USE_RUST"] = "1"
else:
os.environ.pop("PYMONGO_USE_RUST", None)
# Preserve extension modules when reloading
_cbson = sys.modules.get("bson._cbson")
_rbson = sys.modules.get("bson._rbson")
# Clear bson modules except extensions
for key in list(sys.modules.keys()):
if key.startswith("bson") and not key.endswith(("_cbson", "_rbson")):
del sys.modules[key]
# Restore extension modules
if _cbson:
sys.modules["bson._cbson"] = _cbson
if _rbson:
sys.modules["bson._rbson"] = _rbson
# Re-import bson
import bson as bson_module
self.bson = bson_module
class RustSimpleIntEncodingTest(RustComparisonTest):
"""Test encoding of simple integer documents."""
def setUp(self):
super().setUp()
self.document = {"number": 42}
self.data_size = len(encode(self.document)) * NUM_DOCS
def do_task(self):
for _ in range(NUM_DOCS):
self.bson.encode(self.document)
class TestRustSimpleIntEncodingC(RustSimpleIntEncodingTest, unittest.TestCase):
implementation = "c"
class TestRustSimpleIntEncodingRust(RustSimpleIntEncodingTest, unittest.TestCase):
implementation = "rust"
class RustSimpleIntDecodingTest(RustComparisonTest):
"""Test decoding of simple integer documents."""
def setUp(self):
super().setUp()
self.document = encode({"number": 42})
self.data_size = len(self.document) * NUM_DOCS
def do_task(self):
for _ in range(NUM_DOCS):
self.bson.decode(self.document)
class TestRustSimpleIntDecodingC(RustSimpleIntDecodingTest, unittest.TestCase):
implementation = "c"
class TestRustSimpleIntDecodingRust(RustSimpleIntDecodingTest, unittest.TestCase):
implementation = "rust"
class RustMixedTypesEncodingTest(RustComparisonTest):
"""Test encoding of documents with mixed types."""
def setUp(self):
super().setUp()
self.document = {
"string": "hello",
"int": 42,
"float": 3.14,
"bool": True,
"null": None,
}
self.data_size = len(encode(self.document)) * NUM_DOCS
def do_task(self):
for _ in range(NUM_DOCS):
self.bson.encode(self.document)
class TestRustMixedTypesEncodingC(RustMixedTypesEncodingTest, unittest.TestCase):
implementation = "c"
class TestRustMixedTypesEncodingRust(RustMixedTypesEncodingTest, unittest.TestCase):
implementation = "rust"
class RustNestedEncodingTest(RustComparisonTest):
"""Test encoding of nested documents."""
def setUp(self):
super().setUp()
self.document = {"nested": {"level1": {"level2": {"value": "deep"}}}}
self.data_size = len(encode(self.document)) * NUM_DOCS
def do_task(self):
for _ in range(NUM_DOCS):
self.bson.encode(self.document)
class TestRustNestedEncodingC(RustNestedEncodingTest, unittest.TestCase):
implementation = "c"
class TestRustNestedEncodingRust(RustNestedEncodingTest, unittest.TestCase):
implementation = "rust"
class RustListEncodingTest(RustComparisonTest):
"""Test encoding of documents with lists."""
def setUp(self):
super().setUp()
self.document = {"numbers": list(range(10))}
self.data_size = len(encode(self.document)) * NUM_DOCS
def do_task(self):
for _ in range(NUM_DOCS):
self.bson.encode(self.document)
class TestRustListEncodingC(RustListEncodingTest, unittest.TestCase):
implementation = "c"
class TestRustListEncodingRust(RustListEncodingTest, unittest.TestCase):
implementation = "rust"
# JSON MICRO-BENCHMARKS
class JsonEncodingTest(MicroTest):
def setUp(self):

View File

@ -1746,9 +1746,11 @@ class TestLongLongToString(unittest.TestCase):
try:
from bson import _cbson
if _cbson is None:
self.skipTest("C extension not available")
_cbson._test_long_long_to_str()
except ImportError:
print("_cbson was not imported. Check compilation logs.")
self.skipTest("C extension not available")
if __name__ == "__main__":

View File

@ -28,7 +28,12 @@ from gridfs.synchronous.grid_file import GridIn, GridOut
sys.path[0:0] = [""]
from test import IntegrationTest, client_context, unittest
from test import (
IntegrationTest,
client_context,
skip_if_rust_bson,
unittest,
)
from bson import (
_BUILT_IN_TYPES,
@ -211,6 +216,7 @@ class TestCustomPythonBSONTypeToBSONMultiplexedCodec(CustomBSONTypeTests, unitte
cls.codecopts = codec_options
@skip_if_rust_bson
class TestBSONFallbackEncoder(unittest.TestCase):
def _get_codec_options(self, fallback_encoder):
type_registry = TypeRegistry(fallback_encoder=fallback_encoder)
@ -336,6 +342,7 @@ class TestBSONTypeEnDeCodecs(unittest.TestCase):
self.assertFalse(issubclass(TypeEncoder, TypeDecoder))
@skip_if_rust_bson
class TestBSONCustomTypeEncoderAndFallbackEncoderTandem(unittest.TestCase):
TypeA: Any
TypeB: Any
@ -622,6 +629,7 @@ class TestTypeRegistry(unittest.TestCase):
run_test(TypeCodec, {"bson_type": Decimal128, "transform_bson": lambda x: x})
@skip_if_rust_bson
class TestCollectionWCustomType(IntegrationTest):
def setUp(self):
super().setUp()

View File

@ -22,7 +22,7 @@ from typing import Any
sys.path[0:0] = [""]
from copy import deepcopy
from test import unittest
from test import skip_if_rust_bson, unittest
from bson import decode, encode
from bson.dbref import DBRef
@ -129,6 +129,7 @@ class TestDBRef(unittest.TestCase):
# https://github.com/mongodb/specifications/blob/master/source/dbref/dbref.md#test-plan
@skip_if_rust_bson
class TestDBRefSpec(unittest.TestCase):
def test_decoding_1_2_3(self):
doc: Any

View File

@ -19,7 +19,12 @@ import uuid
sys.path[0:0] = [""]
from test import IntegrationTest, client_context, unittest
from test import (
IntegrationTest,
client_context,
skip_if_rust_bson,
unittest,
)
from bson import Code, DBRef, decode, encode
from bson.binary import JAVA_LEGACY, Binary, UuidRepresentation
@ -31,6 +36,7 @@ from bson.son import SON
_IS_SYNC = True
@skip_if_rust_bson
class TestRawBSONDocument(IntegrationTest):
# {'_id': ObjectId('556df68b6e32ab21a95e0785'),
# 'name': 'Sherlock',

View File

@ -67,7 +67,7 @@ except ImportError:
sys.path[0:0] = [""]
from test import IntegrationTest, PyMongoTestCase, client_context
from test import IntegrationTest, PyMongoTestCase, client_context, skip_if_rust_bson
from bson import CodecOptions, ObjectId, decode, decode_all, decode_file_iter, decode_iter, encode
from bson.raw_bson import RawBSONDocument
@ -272,6 +272,7 @@ class TestPymongo(IntegrationTest):
assert retrieved["other"] == 1 # type:ignore[misc]
@skip_if_rust_bson
class TestDecode(unittest.TestCase):
def test_bson_decode(self) -> None:
doc = {"_id": 1}

View File

@ -41,7 +41,7 @@ except ImportError:
pass
try:
from bson import _cbson # type: ignore[attr-defined] # noqa: F401
from bson import _cbson # noqa: F401
sys.exit("could still import _cbson")
except ImportError:

View File

@ -37,7 +37,7 @@ def main() -> None:
except Exception as e:
LOGGER.exception(e)
try:
from bson import _cbson # type:ignore[attr-defined] # noqa: F401
from bson import _cbson # noqa: F401
except Exception as e:
LOGGER.exception(e)
sys.exit("could not load C extensions")