Python 3 bson changes PYTHON-84

2012-03-20 13:56:15 -07:00 · 2012-03-20 13:56:15 -07:00 · 7474f5cde8
commit 7474f5cde8
parent 459b579c0b
7 changed files with 220 additions and 102 deletions
--- a/bson/__init__.py
+++ b/bson/__init__.py
@ -19,6 +19,7 @@ import calendar
 import datetime
 import re
 import struct
+import sys
 import warnings

 from bson.binary import Binary, OLD_UUID_SUBTYPE
@ -30,13 +31,14 @@ from bson.errors import (InvalidBSON,
 from bson.max_key import MaxKey
 from bson.min_key import MinKey
 from bson.objectid import ObjectId
+from bson.py3compat import b, binary_type
 from bson.son import SON
 from bson.timestamp import Timestamp
 from bson.tz_util import utc


 try:
-    import _cbson
+    from bson import _cbson
    _use_c = True
 except ImportError:
    _use_c = False
@ -47,6 +49,8 @@ try:
 except ImportError:
    _use_uuid = False

+PY3 = sys.version_info[0] == 3
+

 # This sort of sucks, but seems to be as good as it gets...
 RE_TYPE = type(re.compile(""))
@ -59,6 +63,31 @@ MIN_INT64 = -9223372036854775808
 EPOCH_AWARE = datetime.datetime.fromtimestamp(0, utc)
 EPOCH_NAIVE = datetime.datetime.utcfromtimestamp(0)

+EMPTY = b("")
+ZERO  = b("\x00")
+ONE   = b("\x01")
+
+BSONNUM = b("\x01") # Floating point
+BSONSTR = b("\x02") # UTF-8 string
+BSONOBJ = b("\x03") # Embedded document
+BSONARR = b("\x04") # Array
+BSONBIN = b("\x05") # Binary
+BSONUND = b("\x06") # Undefined
+BSONOID = b("\x07") # ObjectId
+BSONBOO = b("\x08") # Boolean
+BSONDAT = b("\x09") # UTC Datetime
+BSONNUL = b("\x0A") # Null
+BSONRGX = b("\x0B") # Regex
+BSONREF = b("\x0C") # DBRef
+BSONCOD = b("\x0D") # Javascript code
+BSONSYM = b("\x0E") # Symbol
+BSONCWS = b("\x0F") # Javascript code with scope
+BSONINT = b("\x10") # 32bit int
+BSONTIM = b("\x11") # Timestamp
+BSONLON = b("\x12") # 64bit int
+BSONMIN = b("\xFF") # Min key
+BSONMAX = b("\x7F") # Max key
+

 def _get_int(data, position, as_class=None, tz_aware=False, unsigned=False):
    format = unsigned and "I" or "i"
@ -73,27 +102,30 @@ def _get_int(data, position, as_class=None, tz_aware=False, unsigned=False):
 def _get_c_string(data, position, length=None):
    if length is None:
        try:
-            end = data.index("\x00", position)
+            end = data.index(ZERO, position)
        except ValueError:
            raise InvalidBSON()
    else:
        end = position + length
-    value = unicode(data[position:end], "utf-8")
+    value = data[position:end].decode("utf-8")
    position = end + 1

    return value, position


 def _make_c_string(string, check_null=False):
-    if check_null and "\x00" in string:
-        raise InvalidDocument("BSON keys / regex patterns must not "
-                              "contain a NULL character")
    if isinstance(string, unicode):
-        return string.encode("utf-8") + "\x00"
+        if check_null and "\x00" in string:
+            raise InvalidDocument("BSON keys / regex patterns must not "
+                                  "contain a NULL character")
+        return string.encode("utf-8") + ZERO
    else:
+        if check_null and ZERO in string:
+            raise InvalidDocument("BSON keys / regex patterns must not "
+                                  "contain a NULL character")
        try:
            string.decode("utf-8")
-            return string + "\x00"
+            return string + ZERO
        except UnicodeError:
            raise InvalidStringData("strings in documents must be valid "
                                    "UTF-8: %r" % string)
@ -137,7 +169,7 @@ def _get_array(data, position, as_class, tz_aware):

 def _get_binary(data, position, as_class, tz_aware):
    length, position = _get_int(data, position)
-    subtype = ord(data[position])
+    subtype = ord(data[position:position + 1])
    position += 1
    if subtype == 2:
        length2, position = _get_int(data, position)
@ -148,7 +180,11 @@ def _get_binary(data, position, as_class, tz_aware):
        value = uuid.UUID(bytes=data[position:position + length])
        position += length
        return (value, position)
-    value = Binary(data[position:position + length], subtype)
+    # Python3 special case. Decode subtype 0 to 'bytes'.
+    if PY3 and subtype == 0:
+        value = data[position:position + length]
+    else:
+        value = Binary(data[position:position + length], subtype)
    position += length
    return value, position

@ -160,7 +196,7 @@ def _get_oid(data, position, as_class, tz_aware):


 def _get_boolean(data, position, as_class, tz_aware):
-    value = data[position] == "\x01"
+    value = data[position:position + 1] == ONE
    position += 1
    return value, position

@ -223,36 +259,38 @@ def _get_timestamp(data, position, as_class, tz_aware):

 def _get_long(data, position, as_class, tz_aware):
    # Have to cast to long; on 32-bit unpack may return an int.
+    # 2to3 will change long to int. That's fine since long doesn't
+    # exist in python3.
    value = long(struct.unpack("<q", data[position:position + 8])[0])
    position += 8
    return value, position


 _element_getter = {
-    "\x01": _get_number,
-    "\x02": _get_string,
-    "\x03": _get_object,
-    "\x04": _get_array,
-    "\x05": _get_binary,
-    "\x06": _get_null,  # undefined
-    "\x07": _get_oid,
-    "\x08": _get_boolean,
-    "\x09": _get_date,
-    "\x0A": _get_null,
-    "\x0B": _get_regex,
-    "\x0C": _get_ref,
-    "\x0D": _get_code,  # code
-    "\x0E": _get_string,  # symbol
-    "\x0F": _get_code_w_scope,
-    "\x10": _get_int,  # number_int
-    "\x11": _get_timestamp,
-    "\x12": _get_long,
-    "\xFF": lambda w, x, y, z: (MinKey(), x),
-    "\x7F": lambda w, x, y, z: (MaxKey(), x)}
+    BSONNUM: _get_number,
+    BSONSTR: _get_string,
+    BSONOBJ: _get_object,
+    BSONARR: _get_array,
+    BSONBIN: _get_binary,
+    BSONUND: _get_null,  # undefined
+    BSONOID: _get_oid,
+    BSONBOO: _get_boolean,
+    BSONDAT: _get_date,
+    BSONNUL: _get_null,
+    BSONRGX: _get_regex,
+    BSONREF: _get_ref,
+    BSONCOD: _get_code,  # code
+    BSONSYM: _get_string,  # symbol
+    BSONCWS: _get_code_w_scope,
+    BSONINT: _get_int,  # number_int
+    BSONTIM: _get_timestamp,
+    BSONLON: _get_long, # Same as _get_int after 2to3 runs.
+    BSONMIN: lambda w, x, y, z: (MinKey(), x),
+    BSONMAX: lambda w, x, y, z: (MaxKey(), x)}


 def _element_to_dict(data, position, as_class, tz_aware):
-    element_type = data[position]
+    element_type = data[position:position + 1]
    position += 1
    element_name, position = _get_c_string(data, position)
    value, position = _element_getter[element_type](data, position,
@ -273,7 +311,7 @@ def _bson_to_dict(data, as_class, tz_aware):
    obj_size = struct.unpack("<i", data[:4])[0]
    if len(data) < obj_size:
        raise InvalidBSON("objsize too large")
-    if data[obj_size - 1] != "\x00":
+    if data[obj_size - 1:obj_size] != ZERO:
        raise InvalidBSON("bad eoo")
    elements = data[4:obj_size - 1]
    return (_elements_to_dict(elements, as_class, tz_aware), data[obj_size:])
@ -294,71 +332,78 @@ def _element_to_bson(key, value, check_keys, uuid_subtype):

    name = _make_c_string(key, True)
    if isinstance(value, float):
-        return "\x01" + name + struct.pack("<d", value)
+        return BSONNUM + name + struct.pack("<d", value)

-    # Use Binary w/ subtype 3 for UUID instances
    if _use_uuid:
        if isinstance(value, uuid.UUID):
-            value = Binary(value.bytes, subtype=uuid_subtype)
+            # Python 3.0(.1) returns a bytearray instance for bytes (3.1 and
+            # newer just return a bytes instance). Convert that to binary_type
+            # for compatibility.
+            value = Binary(binary_type(value.bytes), subtype=uuid_subtype)

    if isinstance(value, Binary):
        subtype = value.subtype
        if subtype == 2:
            value = struct.pack("<i", len(value)) + value
-        return "\x05%s%s%s%s" % (name, struct.pack("<i", len(value)),
-                                 chr(subtype), value)
+        return (BSONBIN + name +
+                struct.pack("<i", len(value)) + b(chr(subtype)) + value)
    if isinstance(value, Code):
        cstring = _make_c_string(value)
        if not value.scope:
            length = struct.pack("<i", len(cstring))
-            return "\x0D" + name + length + cstring
+            return BSONCOD + name + length + cstring
        scope = _dict_to_bson(value.scope, False, uuid_subtype, False)
        full_length = struct.pack("<i", 8 + len(cstring) + len(scope))
        length = struct.pack("<i", len(cstring))
-        return "\x0F" + name + full_length + length + cstring + scope
-    if isinstance(value, str):
+        return BSONCWS + name + full_length + length + cstring + scope
+    if isinstance(value, binary_type):
+        if PY3:
+            # Python3 special case. Store 'bytes' as BSON binary subtype 0.
+            return (BSONBIN + name +
+                    struct.pack("<i", len(value)) + ZERO + value)
        cstring = _make_c_string(value)
        length = struct.pack("<i", len(cstring))
-        return "\x02" + name + length + cstring
+        return BSONSTR + name + length + cstring
    if isinstance(value, unicode):
        cstring = _make_c_string(value)
        length = struct.pack("<i", len(cstring))
-        return "\x02" + name + length + cstring
+        return BSONSTR + name + length + cstring
    if isinstance(value, dict):
-        return "\x03" + name + _dict_to_bson(value, check_keys, uuid_subtype, False)
+        return BSONOBJ + name + _dict_to_bson(value, check_keys, uuid_subtype, False)
    if isinstance(value, (list, tuple)):
        as_dict = SON(zip([str(i) for i in range(len(value))], value))
-        return "\x04" + name + _dict_to_bson(as_dict, check_keys, uuid_subtype, False)
+        return BSONARR + name + _dict_to_bson(as_dict, check_keys, uuid_subtype, False)
    if isinstance(value, ObjectId):
-        return "\x07" + name + value.binary
+        return BSONOID + name + value.binary
    if value is True:
-        return "\x08" + name + "\x01"
+        return BSONBOO + name + ONE
    if value is False:
-        return "\x08" + name + "\x00"
+        return BSONBOO + name + ZERO
    if isinstance(value, int):
        # TODO this is an ugly way to check for this...
        if value > MAX_INT64 or value < MIN_INT64:
            raise OverflowError("BSON can only handle up to 8-byte ints")
        if value > MAX_INT32 or value < MIN_INT32:
-            return "\x12" + name + struct.pack("<q", value)
-        return "\x10" + name + struct.pack("<i", value)
+            return BSONLON + name + struct.pack("<q", value)
+        return BSONINT + name + struct.pack("<i", value)
+    # 2to3 will convert long to int here since there is no long in python3.
+    # That's OK. The previous if block will match instead.
    if isinstance(value, long):
-        # XXX No long type in Python 3
        if value > MAX_INT64 or value < MIN_INT64:
            raise OverflowError("BSON can only handle up to 8-byte ints")
-        return "\x12" + name + struct.pack("<q", value)
+        return BSONLON + name + struct.pack("<q", value)
    if isinstance(value, datetime.datetime):
        if value.utcoffset() is not None:
            value = value - value.utcoffset()
        millis = int(calendar.timegm(value.timetuple()) * 1000 +
                     value.microsecond / 1000)
-        return "\x09" + name + struct.pack("<q", millis)
+        return BSONDAT + name + struct.pack("<q", millis)
    if isinstance(value, Timestamp):
        time = struct.pack("<I", value.time)
        inc = struct.pack("<I", value.inc)
-        return "\x11" + name + inc + time
+        return BSONTIM + name + inc + time
    if value is None:
-        return "\x0A" + name
+        return BSONNUL + name
    if isinstance(value, RE_TYPE):
        pattern = value.pattern
        flags = ""
@ -374,14 +419,14 @@ def _element_to_bson(key, value, check_keys, uuid_subtype):
            flags += "u"
        if value.flags & re.VERBOSE:
            flags += "x"
-        return "\x0B" + name + _make_c_string(pattern, True) + \
+        return BSONRGX + name + _make_c_string(pattern, True) + \
            _make_c_string(flags)
    if isinstance(value, DBRef):
        return _element_to_bson(key, value.as_doc(), False, uuid_subtype)
    if isinstance(value, MinKey):
-        return "\xFF" + name
+        return BSONMIN + name
    if isinstance(value, MaxKey):
-        return "\x7F" + name
+        return BSONMAX + name

    raise InvalidDocument("cannot convert value of type %s to bson" %
                          type(value))
@ -398,9 +443,9 @@ def _dict_to_bson(dict, check_keys, uuid_subtype, top_level=True):
    except AttributeError:
        raise TypeError("encoder expected a mapping type but got: %r" % dict)

-    encoded = ''.join(elements)
+    encoded = EMPTY.join(elements)
    length = len(encoded) + 5
-    return struct.pack("<i", length) + encoded + "\x00"
+    return struct.pack("<i", length) + encoded + ZERO
 if _use_c:
    _dict_to_bson = _cbson._dict_to_bson

@ -427,8 +472,8 @@ def decode_all(data, as_class=dict, tz_aware=True):
    while position < end:
        obj_size = struct.unpack("<i", data[position:position + 4])[0]
        if len(data) - position < obj_size:
-           raise InvalidBSON("objsize too large")
-        if data[position + obj_size - 1] != "\x00":
+            raise InvalidBSON("objsize too large")
+        if data[position + obj_size - 1:position + obj_size] != ZERO:
            raise InvalidBSON("bad eoo")
        elements = data[position + 4:position + obj_size - 1]
        position += obj_size
@ -448,17 +493,18 @@ def is_valid(bson):
    :Parameters:
      - `bson`: the data to be validated
    """
-    if not isinstance(bson, str):
-        raise TypeError("BSON data must be an instance of a subclass of str")
+    if not isinstance(bson, binary_type):
+        raise TypeError("BSON data must be an instance "
+                        "of a subclass of %s" % (binary_type.__name__,))

    try:
        (_, remainder) = _bson_to_dict(bson, dict, True)
-        return remainder == ""
+        return remainder == EMPTY
    except:
        return False


-class BSON(str):
+class BSON(binary_type):
    """BSON (Binary JSON) data.
    """

--- a/bson/binary.py
+++ b/bson/binary.py
@ -18,6 +18,8 @@ except ImportError:
    # Python2.4 doesn't have a uuid module.
    pass

+from bson.py3compat import binary_type
+
 """Tools for representing BSON binary data.
 """

@ -78,7 +80,7 @@ USER_DEFINED_SUBTYPE = 128
 """


-class Binary(str):
+class Binary(binary_type):
    """Representation of BSON binary data.

    This is necessary because we want to represent Python strings as
@ -98,13 +100,14 @@ class Binary(str):
    """

    def __new__(cls, data, subtype=BINARY_SUBTYPE):
-        if not isinstance(data, str):
-            raise TypeError("data must be an instance of str")
+        if not isinstance(data, binary_type):
+            raise TypeError("data must be an "
+                            "instance of %s" % (binary_type.__name__,))
        if not isinstance(subtype, int):
            raise TypeError("subtype must be an instance of int")
        if subtype >= 256 or subtype < 0:
            raise ValueError("subtype must be contained in [0, 256)")
-        self = str.__new__(cls, data)
+        self = binary_type.__new__(cls, data)
        self.__subtype = subtype
        return self

@ -116,7 +119,7 @@ class Binary(str):

    def __eq__(self, other):
        if isinstance(other, Binary):
-            return (self.__subtype, str(self)) == (other.subtype, str(other))
+            return (self.__subtype, binary_type(self)) == (other.subtype, binary_type(other))
        # We don't return NotImplemented here because if we did then
        # Binary("foo") == "foo" would return True, since Binary is a
        # subclass of str...
@ -126,7 +129,7 @@ class Binary(str):
        return not self == other

    def __repr__(self):
-        return "Binary(%s, %s)" % (str.__repr__(self), self.__subtype)
+        return "Binary(%s, %s)" % (binary_type.__repr__(self), self.__subtype)


 class UUIDLegacy(Binary):
@ -169,7 +172,10 @@ class UUIDLegacy(Binary):
    def __new__(cls, obj):
        if not isinstance(obj, UUID):
            raise TypeError("obj must be an instance of uuid.UUID")
-        self = Binary.__new__(cls, obj.bytes, OLD_UUID_SUBTYPE)
+        # Python 3.0(.1) returns a bytearray instance for bytes (3.1 and
+        # newer just return a bytes instance). Convert that to binary_type
+        # for compatibility.
+        self = Binary.__new__(cls, binary_type(obj.bytes), OLD_UUID_SUBTYPE)
        self.__uuid = obj
        return self

--- a/bson/code.py
+++ b/bson/code.py
@ -15,7 +15,6 @@
 """Tools for representing JavaScript code in BSON.
 """

-
 class Code(str):
    """BSON's JavaScript code type.

@ -41,8 +40,10 @@ class Code(str):
    """

    def __new__(cls, code, scope=None, **kwargs):
-        if not isinstance(code, basestring):
-            raise TypeError("code must be an instance of basestring")
+        text_type = basestring
+        if not isinstance(code, text_type):
+            raise TypeError("code must be an "
+                            "instance of %s" % (text_type.__name__,))

        self = str.__new__(cls, code)

--- a/bson/dbref.py
+++ b/bson/dbref.py
@ -14,9 +14,10 @@

 """Tools for manipulating DBRefs (references to MongoDB documents)."""

-from bson.son import SON
 from copy import deepcopy

+from bson.son import SON
+

 class DBRef(object):
    """A reference to a document stored in MongoDB.
@ -45,10 +46,12 @@ class DBRef(object):

        .. mongodoc:: dbrefs
        """
-        if not isinstance(collection, basestring):
-            raise TypeError("collection must be an instance of basestring")
-        if database is not None and not isinstance(database, basestring):
-            raise TypeError("database must be an instance of basestring")
+        text_type = basestring
+        if not isinstance(collection, text_type):
+            raise TypeError("collection must be an "
+                            "instance of %s" % (text_type.__name__,))
+        if database is not None and not isinstance(database, text_type):
+            raise TypeError("database must be an instance of %s" % (text_type.__name__,))

        self.__collection = collection
        self.__id = id
@ -110,12 +113,13 @@ class DBRef(object):
        return "DBRef(%r, %r, %r%s)" % (self.collection, self.id,
                                        self.database, extra)

-    def __cmp__(self, other):
+    def __eq__(self, other):
        if isinstance(other, DBRef):
-            return cmp([self.__database, self.__collection,
-                        self.__id, self.__kwargs],
-                       [other.__database, other.__collection,
-                        other.__id, other.__kwargs])
+            us = [self.__database, self.__collection,
+                  self.__id, self.__kwargs]
+            them = [other.__database, other.__collection,
+                    other.__id, other.__kwargs]
+            return us == them
        return NotImplemented

    def __hash__(self):
--- a/bson/objectid.py
+++ b/bson/objectid.py
@ -16,6 +16,7 @@
 <http://dochub.mongodb.org/core/objectids>`_.
 """

+import binascii
 import calendar
 import datetime
 try:
@ -31,14 +32,18 @@ import threading
 import time

 from bson.errors import InvalidId
+from bson.py3compat import (b, binary_type, text_type,
+                            bytes_from_hex, string_types)
 from bson.tz_util import utc

+EMPTY = b("")
+ZERO  = b("\x00")

 def _machine_bytes():
    """Get the machine portion of an ObjectId.
    """
    machine_hash = _md5func()
-    machine_hash.update(socket.gethostname())
+    machine_hash.update(socket.gethostname().encode())
    return machine_hash.digest()[0:3]


@ -114,13 +119,13 @@ class ObjectId(object):
        if generation_time.utcoffset() is not None:
            generation_time = generation_time - generation_time.utcoffset()
        ts = calendar.timegm(generation_time.timetuple())
-        oid = struct.pack(">i", int(ts)) + "\x00" * 8
+        oid = struct.pack(">i", int(ts)) + ZERO * 8
        return cls(oid)

    def __generate(self):
        """Generate a new value for this ObjectId.
        """
-        oid = ""
+        oid = EMPTY

        # 4 bytes current time
        oid += struct.pack(">i", int(time.time()))
@ -150,19 +155,23 @@ class ObjectId(object):
        """
        if isinstance(oid, ObjectId):
            self.__id = oid.__id
-        elif isinstance(oid, basestring):
+        elif isinstance(oid, string_types):
            if len(oid) == 12:
-                self.__id = oid
+                if isinstance(oid, binary_type):
+                    self.__id = oid
+                else:
+                    raise InvalidId("%s is not a valid ObjectId" % oid)
            elif len(oid) == 24:
                try:
-                    self.__id = oid.decode("hex")
-                except TypeError:
+                    self.__id = bytes_from_hex(oid)
+                except (TypeError, ValueError):
                    raise InvalidId("%s is not a valid ObjectId" % oid)
            else:
                raise InvalidId("%s is not a valid ObjectId" % oid)
        else:
-            raise TypeError("id must be an instance of (str, ObjectId), "
-                            "not %s" % type(oid))
+            raise TypeError("id must be an instance of (%s, %s, ObjectId), "
+                            "not %s" % (binary_type.__name__,
+                                        text_type.__name__, type(oid)))

    @property
    def binary(self):
@ -199,19 +208,27 @@ class ObjectId(object):
        # Provide backwards compatability with OIDs
        # pickled with pymongo-1.9.
        if isinstance(value, dict):
-            self.__id = value['_ObjectId__id']
+            try:
+                # Hack for unpickling ObjectIds created in python2
+                self.__id = value['_ObjectId__id'].encode('latin-1')
+            except UnicodeDecodeError:
+                self.__id = value['_ObjectId__id']
        else:
-            self.__id = value
+            try:
+                # Hack for unpickling ObjectIds created in python2
+                self.__id = value.encode('latin-1')
+            except (UnicodeDecodeError, AttributeError):
+                self.__id = value

    def __str__(self):
-        return self.__id.encode("hex")
+        return binascii.hexlify(self.__id).decode()

    def __repr__(self):
-        return "ObjectId('%s')" % self.__id.encode("hex")
+        return "ObjectId('%s')" % (str(self),)

-    def __cmp__(self, other):
+    def __eq__(self, other):
        if isinstance(other, ObjectId):
-            return cmp(self.__id, other.__id)
+            return self.__id == other.__id
        return NotImplemented

    def __hash__(self):
--- a/bson/py3compat.py
+++ b/bson/py3compat.py
@ -0,0 +1,44 @@
+# Copyright 2009-2012 10gen, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you
+# may not use this file except in compliance with the License.  You
+# may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.  See the License for the specific language governing
+# permissions and limitations under the License.
+
+"""Utility functions and definitions for python3 compatibility."""
+
+import sys
+
+PY3 = sys.version_info[0] == 3
+
+if PY3:
+    import codecs
+    def b(s):
+        return codecs.latin_1_encode(s)[0]
+
+    def bytes_from_hex(h):
+        return bytes.fromhex(h)
+
+    binary_type = bytes
+    text_type   = str
+
+else:
+    def b(s):
+        return s
+
+    def bytes_from_hex(h):
+        return h.decode('hex')
+
+    binary_type = str
+    # 2to3 will convert this to "str". That's okay
+    # since we won't ever get here under python3.
+    text_type   = unicode
+
+string_types = (binary_type, text_type)
--- a/bson/son.py
+++ b/bson/son.py
@ -130,7 +130,7 @@ class SON(dict):
        return [v for _, v in self.iteritems()]

    def items(self):
-        return list(self.iteritems())
+        return list(SON.iteritems(self))

    def clear(self):
        for key in self.keys():