PYTHON-346 - Optimize decoding of most types.

This change provides perf improvements for decoding
most types in pure python. Like the previous changes
for encoding, the biggest improvements are seen decoding
BSON arrays to python lists - over 150% using pypy.
This commit is contained in:
Bernie Hackett 2014-08-22 11:40:49 -07:00
parent adb37e2802
commit 3e059f76d7

View File

@ -24,6 +24,8 @@ import struct
import sys
import uuid
from codecs import utf_8_decode as _utf_8_decode
from bson.binary import (Binary, OLD_UUID_SUBTYPE,
JAVA_LEGACY, CSHARP_LEGACY)
from bson.bsonint64 import BSONInt64
@ -58,8 +60,6 @@ EPOCH_AWARE = datetime.datetime.fromtimestamp(0, utc)
EPOCH_NAIVE = datetime.datetime.utcfromtimestamp(0)
EMPTY = b""
ZERO = b"\x00"
ONE = b"\x01"
BSONNUM = b"\x01" # Floating point
BSONSTR = b"\x02" # UTF-8 string
@ -83,54 +83,49 @@ BSONMIN = b"\xFF" # Min key
BSONMAX = b"\x7F" # Max key
_UNPACK_FLOAT = struct.Struct("<d").unpack
_UNPACK_INT = struct.Struct("<i").unpack
_UNPACK_LENGTH_SUBTYPE = struct.Struct("<iB").unpack
_UNPACK_LONG = struct.Struct("<q").unpack
_UNPACK_TIMESTAMP = struct.Struct("<II").unpack
def _get_int(data, position, dummy):
"""Decode a BSON int32 to python int."""
try:
value = struct.unpack("<i", data[position:position + 4])[0]
except struct.error:
raise InvalidBSON()
position += 4
return value, position
end = position + 4
return _UNPACK_INT(data[position:end])[0], end
def _get_c_string(data, position, length=None):
def _get_c_string(data, position):
"""Decode a BSON 'C' string to python unicode string."""
if length is None:
try:
end = data.index(ZERO, position)
except ValueError:
raise InvalidBSON()
else:
end = position + length
value = data[position:end].decode("utf-8")
position = end + 1
return value, position
end = data.index(b"\x00", position)
return _utf_8_decode(data[position:end])[0], end + 1
def _get_float(data, position, dummy):
"""Decode a BSON double to python float."""
num = struct.unpack("<d", data[position:position + 8])[0]
position += 8
return num, position
end = position + 8
return _UNPACK_FLOAT(data[position:end])[0], end
def _get_string(data, position, dummy):
"""Decode a BSON string to python unicode string."""
length = struct.unpack("<i", data[position:position + 4])[0]
if length <= 0 or (len(data) - position - 4) < length:
raise InvalidBSON("invalid string length")
length = _UNPACK_INT(data[position:position + 4])[0]
position += 4
if data[position + length - 1:position + length] != ZERO:
if length <= 0 or (len(data) - position) < length:
raise InvalidBSON("invalid string length")
end = position + length - 1
if data[end:end + 1] != b"\x00":
raise InvalidBSON("invalid end of string")
return _get_c_string(data, position, length - 1)
return _utf_8_decode(data[position:end])[0], end + 1
def _get_object(data, position, opts):
"""Decode a BSON subdocument to as_class or bson.dbref.DBRef."""
obj_size = struct.unpack("<i", data[position:position + 4])[0]
if data[position + obj_size - 1:position + obj_size] != ZERO:
if data[position + obj_size - 1:position + obj_size] != b"\x00":
raise InvalidBSON("bad eoo")
# TODO - Eliminate this copy.
encoded = data[position + 4:position + obj_size - 1]
obj = _elements_to_dict(encoded, opts)
@ -143,76 +138,83 @@ def _get_object(data, position, opts):
def _get_array(data, position, opts):
"""Decode a BSON array to python list."""
obj, position = _get_object(data, position, opts)
size = _UNPACK_INT(data[position:position + 4])[0]
end = position + size - 1
if data[end:end + 1] != b"\x00":
raise InvalidBSON("bad eoo")
position += 4
end -= 1
result = []
i = 0
while True:
try:
result.append(obj[str(i)])
i += 1
except KeyError:
break
return result, position
# Avoid doing global and attibute lookups in the loop.
append = result.append
index = data.index
getter = _ELEMENT_GETTER
while position < end:
element_type = data[position:position + 1]
# Just skip the keys.
position = index(b'\x00', position) + 1
value, position = getter[element_type](data, position, opts)
append(value)
return result, position + 1
def _get_binary(data, position, opts):
"""Decode a BSON binary to bson.binary.Binary or python UUID."""
length, position = _get_int(data, position, opts)
subtype = ord(data[position:position + 1])
position += 1
length, subtype = _UNPACK_LENGTH_SUBTYPE(data[position:position + 5])
position += 5
if subtype == 2:
length2, position = _get_int(data, position, opts)
length2 = _UNPACK_INT(data[position:position + 4])[0]
position += 4
if length2 != length - 4:
raise InvalidBSON("invalid binary (st 2) - lengths don't match!")
length = length2
end = position + length
if subtype in (3, 4):
# Java Legacy
uuid_subtype = opts[2]
if uuid_subtype == JAVA_LEGACY:
java = data[position:position + length]
java = data[position:end]
value = uuid.UUID(bytes=java[0:8][::-1] + java[8:16][::-1])
# C# legacy
elif uuid_subtype == CSHARP_LEGACY:
value = uuid.UUID(bytes_le=data[position:position + length])
value = uuid.UUID(bytes_le=data[position:end])
# Python
else:
value = uuid.UUID(bytes=data[position:position + length])
position += length
return (value, position)
value = uuid.UUID(bytes=data[position:end])
return value, end
# Python3 special case. Decode subtype 0 to 'bytes'.
if PY3 and subtype == 0:
value = data[position:position + length]
value = data[position:end]
else:
value = Binary(data[position:position + length], subtype)
position += length
return value, position
value = Binary(data[position:end], subtype)
return value, end
def _get_oid(data, position, dummy):
"""Decode a BSON ObjectId to bson.objectid.ObjectId."""
value = ObjectId(data[position:position + 12])
position += 12
return value, position
end = position + 12
return ObjectId(data[position:end]), end
def _get_boolean(data, position, dummy):
"""Decode a BSON true/false to python True/False."""
value = data[position:position + 1] == ONE
position += 1
return value, position
end = position + 1
return data[position:end] == b"\x01", end
def _get_date(data, position, opts):
"""Decode a BSON datetime to python datetime.datetime."""
millis = struct.unpack("<q", data[position:position + 8])[0]
end = position + 8
millis = _UNPACK_LONG(data[position:end])[0]
diff = millis % 1000
seconds = (millis - diff) / 1000
position += 8
if opts[1]:
dtime = EPOCH_AWARE + datetime.timedelta(seconds=seconds)
else:
dtime = EPOCH_NAIVE + datetime.timedelta(seconds=seconds)
return dtime.replace(microsecond=diff * 1000), position
return dtime.replace(microsecond=diff * 1000), end
def _get_code(data, position, opts):
@ -223,8 +225,7 @@ def _get_code(data, position, opts):
def _get_code_w_scope(data, position, opts):
"""Decode a BSON code_w_scope to bson.code.Code."""
_, position = _get_int(data, position, opts)
code, position = _get_string(data, position, opts)
code, position = _get_string(data, position + 4, opts)
scope, position = _get_object(data, position, opts)
return Code(code, scope), position
@ -249,15 +250,15 @@ def _get_ref(data, position, opts):
def _get_timestamp(data, position, dummy):
"""Decode a BSON timestamp to bson.timestamp.Timestamp."""
inc, timestamp = struct.unpack("<II", data[position:position + 8])
return Timestamp(timestamp, inc), position + 8
end = position + 8
inc, timestamp = _UNPACK_TIMESTAMP(data[position:end])
return Timestamp(timestamp, inc), end
def _get_int64(data, position, dummy):
"""Decode a BSON int64 to bson.bsonint64.BSONInt64."""
value = BSONInt64(struct.unpack("<q", data[position:position + 8])[0])
position += 8
return value, position
end = position + 8
return BSONInt64(_UNPACK_LONG(data[position:end])[0]), end
_ELEMENT_GETTER = {
@ -266,17 +267,17 @@ _ELEMENT_GETTER = {
BSONOBJ: _get_object,
BSONARR: _get_array,
BSONBIN: _get_binary,
BSONUND: lambda x, y, z: (None, y), # undefined
BSONUND: lambda x, y, z: (None, y), # Deprecated undefined
BSONOID: _get_oid,
BSONBOO: _get_boolean,
BSONDAT: _get_date,
BSONNUL: lambda x, y, z: (None, y),
BSONRGX: _get_regex,
BSONREF: _get_ref,
BSONCOD: _get_code, # code
BSONSYM: _get_string, # symbol
BSONREF: _get_ref, # Deprecated DBPointer
BSONCOD: _get_code,
BSONSYM: _get_string, # Deprecated symbol
BSONCWS: _get_code_w_scope,
BSONINT: _get_int, # number_int
BSONINT: _get_int,
BSONTIM: _get_timestamp,
BSONLON: _get_int64,
BSONMIN: lambda x, y, z: (MinKey(), y),
@ -305,26 +306,34 @@ def _elements_to_dict(data, opts):
def _bson_to_dict(data, as_class, tz_aware, uuid_subtype, compile_re):
"""Decode a BSON string to as_class."""
obj_size = struct.unpack("<i", data[:4])[0]
obj_size = _UNPACK_INT(data[:4])[0]
length = len(data)
if length < obj_size:
raise InvalidBSON("objsize too large")
if obj_size != length or data[obj_size - 1:obj_size] != ZERO:
if obj_size != length or data[obj_size - 1:obj_size] != b"\x00":
raise InvalidBSON("bad eoo")
# TODO - Eliminate this copy.
elements = data[4:obj_size - 1]
dct = _elements_to_dict(elements,
(as_class, tz_aware, uuid_subtype, compile_re))
return dct, data[obj_size:]
try:
dct = _elements_to_dict(elements,
(as_class, tz_aware, uuid_subtype, compile_re))
except InvalidBSON:
raise
except Exception:
# Change exception type to InvalidBSON but preserve traceback.
_, exc_value, exc_tb = sys.exc_info()
reraise(InvalidBSON, exc_value, exc_tb)
# TODO - Get rid of the second return value here and in the _cbson.
return dct, None
if _USE_C:
_bson_to_dict = _cbson._bson_to_dict
_PACK_FLOAT = struct.Struct("<d").pack
_PACK_INT = struct.Struct("<i").pack
_PACK_INT_SUB = struct.Struct("<iB").pack
_PACK_LENGTH_SUBTYPE = struct.Struct("<iB").pack
_PACK_LONG = struct.Struct("<q").pack
_PACK_INC_TIME = struct.Struct("<II").pack
_PACK_TIMESTAMP = struct.Struct("<II").pack
_LIST_NAMES = tuple(b(str(i)) + b"\x00" for i in range(1000))
@ -449,7 +458,7 @@ def _encode_binary(name, value, dummy0, dummy1):
subtype = value.subtype
if subtype == 2:
value = _PACK_INT(len(value)) + value
return b"\x05" + name + _PACK_INT_SUB(len(value), subtype) + value
return b"\x05" + name + _PACK_LENGTH_SUBTYPE(len(value), subtype) + value
def _encode_uuid(name, value, dummy, uuid_subtype):
@ -546,7 +555,7 @@ def _encode_int(name, value, dummy0, dummy1):
def _encode_timestamp(name, value, dummy0, dummy1):
"""Encode bson.timestamp.Timestamp."""
return b"\x11" + name + _PACK_INC_TIME(value.inc, value.time)
return b"\x11" + name + _PACK_TIMESTAMP(value.inc, value.time)
def _encode_long(name, value, dummy0, dummy1):
@ -684,15 +693,16 @@ def decode_all(data, as_class=dict,
end = len(data) - 1
try:
while position < end:
obj_size = struct.unpack("<i", data[position:position + 4])[0]
obj_size = _UNPACK_INT(data[position:position + 4])[0]
if len(data) - position < obj_size:
raise InvalidBSON("objsize too large")
if data[position + obj_size - 1:position + obj_size] != ZERO:
if data[position + obj_size - 1:position + obj_size] != b"\x00":
raise InvalidBSON("bad eoo")
# TODO - Eliminate this copy.
elements = data[position + 4:position + obj_size - 1]
position += obj_size
docs.append(_elements_to_dict(elements,
(as_class, tz_aware, uuid_subtype, compile_re)))
docs.append(_elements_to_dict(
elements, (as_class, tz_aware, uuid_subtype, compile_re)))
return docs
except InvalidBSON:
raise
@ -720,8 +730,8 @@ def is_valid(bson):
raise TypeError("BSON data must be an instance of a subclass of bytes")
try:
(_, remainder) = _bson_to_dict(bson, dict, True, OLD_UUID_SUBTYPE, True)
return remainder == EMPTY
_bson_to_dict(bson, dict, True, OLD_UUID_SUBTYPE, True)
return True
except Exception:
return False