diff --git a/bson/__init__.py b/bson/__init__.py index d0410ac68..cca2a233b 100644 --- a/bson/__init__.py +++ b/bson/__init__.py @@ -19,6 +19,7 @@ import calendar import datetime import re import struct +import sys import warnings from bson.binary import Binary, OLD_UUID_SUBTYPE @@ -30,13 +31,14 @@ from bson.errors import (InvalidBSON, from bson.max_key import MaxKey from bson.min_key import MinKey from bson.objectid import ObjectId +from bson.py3compat import b, binary_type from bson.son import SON from bson.timestamp import Timestamp from bson.tz_util import utc try: - import _cbson + from bson import _cbson _use_c = True except ImportError: _use_c = False @@ -47,6 +49,8 @@ try: except ImportError: _use_uuid = False +PY3 = sys.version_info[0] == 3 + # This sort of sucks, but seems to be as good as it gets... RE_TYPE = type(re.compile("")) @@ -59,6 +63,31 @@ MIN_INT64 = -9223372036854775808 EPOCH_AWARE = datetime.datetime.fromtimestamp(0, utc) EPOCH_NAIVE = datetime.datetime.utcfromtimestamp(0) +EMPTY = b("") +ZERO = b("\x00") +ONE = b("\x01") + +BSONNUM = b("\x01") # Floating point +BSONSTR = b("\x02") # UTF-8 string +BSONOBJ = b("\x03") # Embedded document +BSONARR = b("\x04") # Array +BSONBIN = b("\x05") # Binary +BSONUND = b("\x06") # Undefined +BSONOID = b("\x07") # ObjectId +BSONBOO = b("\x08") # Boolean +BSONDAT = b("\x09") # UTC Datetime +BSONNUL = b("\x0A") # Null +BSONRGX = b("\x0B") # Regex +BSONREF = b("\x0C") # DBRef +BSONCOD = b("\x0D") # Javascript code +BSONSYM = b("\x0E") # Symbol +BSONCWS = b("\x0F") # Javascript code with scope +BSONINT = b("\x10") # 32bit int +BSONTIM = b("\x11") # Timestamp +BSONLON = b("\x12") # 64bit int +BSONMIN = b("\xFF") # Min key +BSONMAX = b("\x7F") # Max key + def _get_int(data, position, as_class=None, tz_aware=False, unsigned=False): format = unsigned and "I" or "i" @@ -73,27 +102,30 @@ def _get_int(data, position, as_class=None, tz_aware=False, unsigned=False): def _get_c_string(data, position, length=None): if length is None: try: - end = data.index("\x00", position) + end = data.index(ZERO, position) except ValueError: raise InvalidBSON() else: end = position + length - value = unicode(data[position:end], "utf-8") + value = data[position:end].decode("utf-8") position = end + 1 return value, position def _make_c_string(string, check_null=False): - if check_null and "\x00" in string: - raise InvalidDocument("BSON keys / regex patterns must not " - "contain a NULL character") if isinstance(string, unicode): - return string.encode("utf-8") + "\x00" + if check_null and "\x00" in string: + raise InvalidDocument("BSON keys / regex patterns must not " + "contain a NULL character") + return string.encode("utf-8") + ZERO else: + if check_null and ZERO in string: + raise InvalidDocument("BSON keys / regex patterns must not " + "contain a NULL character") try: string.decode("utf-8") - return string + "\x00" + return string + ZERO except UnicodeError: raise InvalidStringData("strings in documents must be valid " "UTF-8: %r" % string) @@ -137,7 +169,7 @@ def _get_array(data, position, as_class, tz_aware): def _get_binary(data, position, as_class, tz_aware): length, position = _get_int(data, position) - subtype = ord(data[position]) + subtype = ord(data[position:position + 1]) position += 1 if subtype == 2: length2, position = _get_int(data, position) @@ -148,7 +180,11 @@ def _get_binary(data, position, as_class, tz_aware): value = uuid.UUID(bytes=data[position:position + length]) position += length return (value, position) - value = Binary(data[position:position + length], subtype) + # Python3 special case. Decode subtype 0 to 'bytes'. + if PY3 and subtype == 0: + value = data[position:position + length] + else: + value = Binary(data[position:position + length], subtype) position += length return value, position @@ -160,7 +196,7 @@ def _get_oid(data, position, as_class, tz_aware): def _get_boolean(data, position, as_class, tz_aware): - value = data[position] == "\x01" + value = data[position:position + 1] == ONE position += 1 return value, position @@ -223,36 +259,38 @@ def _get_timestamp(data, position, as_class, tz_aware): def _get_long(data, position, as_class, tz_aware): # Have to cast to long; on 32-bit unpack may return an int. + # 2to3 will change long to int. That's fine since long doesn't + # exist in python3. value = long(struct.unpack(" MAX_INT64 or value < MIN_INT64: raise OverflowError("BSON can only handle up to 8-byte ints") if value > MAX_INT32 or value < MIN_INT32: - return "\x12" + name + struct.pack(" MAX_INT64 or value < MIN_INT64: raise OverflowError("BSON can only handle up to 8-byte ints") - return "\x12" + name + struct.pack("= 256 or subtype < 0: raise ValueError("subtype must be contained in [0, 256)") - self = str.__new__(cls, data) + self = binary_type.__new__(cls, data) self.__subtype = subtype return self @@ -116,7 +119,7 @@ class Binary(str): def __eq__(self, other): if isinstance(other, Binary): - return (self.__subtype, str(self)) == (other.subtype, str(other)) + return (self.__subtype, binary_type(self)) == (other.subtype, binary_type(other)) # We don't return NotImplemented here because if we did then # Binary("foo") == "foo" would return True, since Binary is a # subclass of str... @@ -126,7 +129,7 @@ class Binary(str): return not self == other def __repr__(self): - return "Binary(%s, %s)" % (str.__repr__(self), self.__subtype) + return "Binary(%s, %s)" % (binary_type.__repr__(self), self.__subtype) class UUIDLegacy(Binary): @@ -169,7 +172,10 @@ class UUIDLegacy(Binary): def __new__(cls, obj): if not isinstance(obj, UUID): raise TypeError("obj must be an instance of uuid.UUID") - self = Binary.__new__(cls, obj.bytes, OLD_UUID_SUBTYPE) + # Python 3.0(.1) returns a bytearray instance for bytes (3.1 and + # newer just return a bytes instance). Convert that to binary_type + # for compatibility. + self = Binary.__new__(cls, binary_type(obj.bytes), OLD_UUID_SUBTYPE) self.__uuid = obj return self diff --git a/bson/code.py b/bson/code.py index d637e39eb..d7703597f 100644 --- a/bson/code.py +++ b/bson/code.py @@ -15,7 +15,6 @@ """Tools for representing JavaScript code in BSON. """ - class Code(str): """BSON's JavaScript code type. @@ -41,8 +40,10 @@ class Code(str): """ def __new__(cls, code, scope=None, **kwargs): - if not isinstance(code, basestring): - raise TypeError("code must be an instance of basestring") + text_type = basestring + if not isinstance(code, text_type): + raise TypeError("code must be an " + "instance of %s" % (text_type.__name__,)) self = str.__new__(cls, code) diff --git a/bson/dbref.py b/bson/dbref.py index 1f4b727b2..2611ce67e 100644 --- a/bson/dbref.py +++ b/bson/dbref.py @@ -14,9 +14,10 @@ """Tools for manipulating DBRefs (references to MongoDB documents).""" -from bson.son import SON from copy import deepcopy +from bson.son import SON + class DBRef(object): """A reference to a document stored in MongoDB. @@ -45,10 +46,12 @@ class DBRef(object): .. mongodoc:: dbrefs """ - if not isinstance(collection, basestring): - raise TypeError("collection must be an instance of basestring") - if database is not None and not isinstance(database, basestring): - raise TypeError("database must be an instance of basestring") + text_type = basestring + if not isinstance(collection, text_type): + raise TypeError("collection must be an " + "instance of %s" % (text_type.__name__,)) + if database is not None and not isinstance(database, text_type): + raise TypeError("database must be an instance of %s" % (text_type.__name__,)) self.__collection = collection self.__id = id @@ -110,12 +113,13 @@ class DBRef(object): return "DBRef(%r, %r, %r%s)" % (self.collection, self.id, self.database, extra) - def __cmp__(self, other): + def __eq__(self, other): if isinstance(other, DBRef): - return cmp([self.__database, self.__collection, - self.__id, self.__kwargs], - [other.__database, other.__collection, - other.__id, other.__kwargs]) + us = [self.__database, self.__collection, + self.__id, self.__kwargs] + them = [other.__database, other.__collection, + other.__id, other.__kwargs] + return us == them return NotImplemented def __hash__(self): diff --git a/bson/objectid.py b/bson/objectid.py index dc4107332..c4aeec9e9 100644 --- a/bson/objectid.py +++ b/bson/objectid.py @@ -16,6 +16,7 @@ `_. """ +import binascii import calendar import datetime try: @@ -31,14 +32,18 @@ import threading import time from bson.errors import InvalidId +from bson.py3compat import (b, binary_type, text_type, + bytes_from_hex, string_types) from bson.tz_util import utc +EMPTY = b("") +ZERO = b("\x00") def _machine_bytes(): """Get the machine portion of an ObjectId. """ machine_hash = _md5func() - machine_hash.update(socket.gethostname()) + machine_hash.update(socket.gethostname().encode()) return machine_hash.digest()[0:3] @@ -114,13 +119,13 @@ class ObjectId(object): if generation_time.utcoffset() is not None: generation_time = generation_time - generation_time.utcoffset() ts = calendar.timegm(generation_time.timetuple()) - oid = struct.pack(">i", int(ts)) + "\x00" * 8 + oid = struct.pack(">i", int(ts)) + ZERO * 8 return cls(oid) def __generate(self): """Generate a new value for this ObjectId. """ - oid = "" + oid = EMPTY # 4 bytes current time oid += struct.pack(">i", int(time.time())) @@ -150,19 +155,23 @@ class ObjectId(object): """ if isinstance(oid, ObjectId): self.__id = oid.__id - elif isinstance(oid, basestring): + elif isinstance(oid, string_types): if len(oid) == 12: - self.__id = oid + if isinstance(oid, binary_type): + self.__id = oid + else: + raise InvalidId("%s is not a valid ObjectId" % oid) elif len(oid) == 24: try: - self.__id = oid.decode("hex") - except TypeError: + self.__id = bytes_from_hex(oid) + except (TypeError, ValueError): raise InvalidId("%s is not a valid ObjectId" % oid) else: raise InvalidId("%s is not a valid ObjectId" % oid) else: - raise TypeError("id must be an instance of (str, ObjectId), " - "not %s" % type(oid)) + raise TypeError("id must be an instance of (%s, %s, ObjectId), " + "not %s" % (binary_type.__name__, + text_type.__name__, type(oid))) @property def binary(self): @@ -199,19 +208,27 @@ class ObjectId(object): # Provide backwards compatability with OIDs # pickled with pymongo-1.9. if isinstance(value, dict): - self.__id = value['_ObjectId__id'] + try: + # Hack for unpickling ObjectIds created in python2 + self.__id = value['_ObjectId__id'].encode('latin-1') + except UnicodeDecodeError: + self.__id = value['_ObjectId__id'] else: - self.__id = value + try: + # Hack for unpickling ObjectIds created in python2 + self.__id = value.encode('latin-1') + except (UnicodeDecodeError, AttributeError): + self.__id = value def __str__(self): - return self.__id.encode("hex") + return binascii.hexlify(self.__id).decode() def __repr__(self): - return "ObjectId('%s')" % self.__id.encode("hex") + return "ObjectId('%s')" % (str(self),) - def __cmp__(self, other): + def __eq__(self, other): if isinstance(other, ObjectId): - return cmp(self.__id, other.__id) + return self.__id == other.__id return NotImplemented def __hash__(self): diff --git a/bson/py3compat.py b/bson/py3compat.py new file mode 100644 index 000000000..d4ec4e63b --- /dev/null +++ b/bson/py3compat.py @@ -0,0 +1,44 @@ +# Copyright 2009-2012 10gen, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you +# may not use this file except in compliance with the License. You +# may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. + +"""Utility functions and definitions for python3 compatibility.""" + +import sys + +PY3 = sys.version_info[0] == 3 + +if PY3: + import codecs + def b(s): + return codecs.latin_1_encode(s)[0] + + def bytes_from_hex(h): + return bytes.fromhex(h) + + binary_type = bytes + text_type = str + +else: + def b(s): + return s + + def bytes_from_hex(h): + return h.decode('hex') + + binary_type = str + # 2to3 will convert this to "str". That's okay + # since we won't ever get here under python3. + text_type = unicode + +string_types = (binary_type, text_type) diff --git a/bson/son.py b/bson/son.py index dafe34f4e..5602d5c2c 100644 --- a/bson/son.py +++ b/bson/son.py @@ -130,7 +130,7 @@ class SON(dict): return [v for _, v in self.iteritems()] def items(self): - return list(self.iteritems()) + return list(SON.iteritems(self)) def clear(self): for key in self.keys():