diff --git a/pymongo/_cbsonmodule.c b/pymongo/_cbsonmodule.c index 7b8b78568..4102ca113 100644 --- a/pymongo/_cbsonmodule.c +++ b/pymongo/_cbsonmodule.c @@ -31,6 +31,7 @@ #include #include "time_helpers.h" +#include "encoding_helpers.h" static PyObject* InvalidName; static PyObject* InvalidDocument; @@ -185,17 +186,6 @@ static int write_string(bson_buffer* buffer, PyObject* py_string) { return 1; } -/* returns 0 on invalid ascii */ -static int validate_ascii(const char* data, int length) { - int i; - for (i = 0; i < length; i++) { - if (data[i] & 0x80) { - return 0; - } - } - return 1; -} - /* TODO our platform better be little-endian w/ 4-byte ints! */ /* Write a single value to the buffer (also write it's type_byte, for which * space has already been reserved. @@ -393,8 +383,10 @@ static int write_element_to_buffer(bson_buffer* buffer, int type_byte, PyObject* int result; *(buffer->buffer + type_byte) = 0x02; - if (!validate_ascii(PyString_AsString(value), PyString_Size(value))) { - PyErr_SetString(InvalidStringData, "strings in documents must be ASCII only"); + if (!is_legal_utf8_string((const unsigned char*)PyString_AsString(value), + PyString_Size(value))) { + PyErr_SetString(InvalidStringData, + "strings in documents must be valid UTF-8"); return 0; } result = write_string(buffer, value); @@ -659,6 +651,13 @@ static int decode_and_write_pair(bson_buffer* buffer, PyObject* key, } else if (PyString_Check(key)) { encoded = key; Py_INCREF(encoded); + + if (!is_legal_utf8_string((const unsigned char*)PyString_AsString(encoded), + PyString_Size(encoded))) { + PyErr_SetString(InvalidStringData, + "strings in documents must be valid UTF-8"); + return 0; + } } else { PyObject* errmsg = PyString_FromString("documents must have only string keys, key was "); PyObject* repr = PyObject_Repr(key); diff --git a/pymongo/bson.py b/pymongo/bson.py index 83469b272..82eed1f32 100644 --- a/pymongo/bson.py +++ b/pymongo/bson.py @@ -64,10 +64,15 @@ def _get_c_string(data): def 
_make_c_string(string): if "\x00" in string: raise InvalidStringData("BSON strings must not contain a NULL character") - try: + if isinstance(string, unicode): return string.encode("utf-8") + "\x00" - except: - raise InvalidStringData("strings in documents must be ASCII only") + else: + try: + string.decode("utf-8") + return string + "\x00" + except: + raise InvalidStringData("strings in documents must be valid " + "UTF-8: %r" % string) def _validate_number(data): diff --git a/pymongo/encoding_helpers.c b/pymongo/encoding_helpers.c new file mode 100644 index 000000000..6ec0f25d8 --- /dev/null +++ b/pymongo/encoding_helpers.c @@ -0,0 +1,107 @@ +/* + * Copyright 2009 10gen, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright 2001 Unicode, Inc. + * + * Disclaimer + * + * This source code is provided as is by Unicode, Inc. No claims are + * made as to fitness for any particular purpose. No warranties of any + * kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been + * purchased on magnetic or optical media from Unicode, Inc., the + * sole remedy for any claim will be exchange of defective media + * within 90 days of receipt. + * + * Limitations on Rights to Redistribute This Code + * + * Unicode, Inc. 
hereby grants the right to freely use the information + * supplied in this file in the creation of products supporting the + * Unicode Standard, and to make copies of this file in any form + * for internal or external distribution as long as this notice + * remains attached. + */ + +/* + * Index into the table below with the first byte of a UTF-8 sequence to + * get the number of trailing bytes that are supposed to follow it. Lead bytes 0xF8-0xFF map to 4 or 5 trailing bytes; isLegalUTF8 below rejects the resulting sequence lengths of 5 and 6 through its default case. + */ +static const char trailingBytesForUTF8[256] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 +}; + +/* --------------------------------------------------------------------- */ + +/* + * Utility routine to tell whether a sequence of bytes is legal UTF-8. + * This must be called with the length pre-determined by the first byte. + * The length can be set by: + * length = trailingBytesForUTF8[*source]+1; + * and the sequence is illegal right away if there aren't that many bytes + * available. + * If presented with a length > 4, this returns 0. The Unicode + * definition of UTF-8 goes up to 4-byte sequences. Returns 1 when the sequence passes every check and 0 otherwise; the trailing bytes are checked from the last one back to the second, then the second byte is range-checked against the lead byte, then the lead byte itself. NOTE(review): a 0xED lead byte takes the default branch of the inner switch, so second bytes 0xA0-0xBF (which would encode UTF-16 surrogates) are accepted - confirm whether that is intended. + */ +static unsigned char isLegalUTF8(const unsigned char* source, int length) { + unsigned char a; + const unsigned char* srcptr = source + length; + switch (length) { + default: return 0; + /* Everything else falls through when "true"... 
*/ + case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0; + case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0; + case 2: if ((a = (*--srcptr)) > 0xBF) return 0; + switch (*source) { + /* no fall-through in this inner switch; at this point a holds the second byte and only its upper bound (0xBF) has been checked, so each branch must enforce the lower bound itself. NOTE(review): the 0xF4 branch only rejects a > 0x8F and never rejects a < 0x80, so 0xF4 followed by an ASCII byte passes this check - confirm whether "|| a < 0x80" is needed there. */ + case 0xE0: if (a < 0xA0) return 0; break; + case 0xF0: if (a < 0x90) return 0; break; + case 0xF4: if (a > 0x8F) return 0; break; + default: if (a < 0x80) return 0; + } + case 1: if (*source >= 0x80 && *source < 0xC2) return 0; + if (*source > 0xF4) return 0; + } + return 1; +} + +/* --------------------------------------------------------------------- */ + +/* + * Return whether a string containing UTF-8 is legal. Walks the first length bytes of string one sequence at a time, taking each sequence length from trailingBytesForUTF8; returns 0 if a sequence would run past length or fails isLegalUTF8, and 1 otherwise (including when length is 0 or negative, since the loop body never runs). + */ +unsigned char is_legal_utf8_string(const unsigned char* string, const int length) { + int position = 0; + + while (position < length) { + int sequence_length = trailingBytesForUTF8[*(string + position)] + 1; + if ((position + sequence_length) > length) { + return 0; + } + if (!isLegalUTF8(string + position, sequence_length)) { + return 0; + } + position += sequence_length; + } + return 1; +} diff --git a/pymongo/encoding_helpers.h b/pymongo/encoding_helpers.h new file mode 100644 index 000000000..0c087271e --- /dev/null +++ b/pymongo/encoding_helpers.h @@ -0,0 +1,22 @@ +/* + * Copyright 2009 10gen, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef ENCODING_HELPERS_H +#define ENCODING_HELPERS_H + +unsigned char is_legal_utf8_string(const unsigned char* string, const int length); + +#endif diff --git a/pymongo/time_helpers.c b/pymongo/time_helpers.c index ec815fde3..62ecc410d 100644 --- a/pymongo/time_helpers.c +++ b/pymongo/time_helpers.c @@ -1,3 +1,19 @@ +/* + * Copyright 2009 10gen, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + /* Copyright (c) 1998-2003 Carnegie Mellon University. All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -38,6 +54,7 @@ * * */ + /* * Copyright (c) 1987, 1989, 1993 * The Regents of the University of California. All rights reserved. diff --git a/setup.py b/setup.py index 1a780aa09..8a75e8022 100755 --- a/setup.py +++ b/setup.py @@ -120,7 +120,8 @@ c_ext = Feature( ext_modules=[Extension('pymongo._cbson', include_dirs=['pymongo'], sources=['pymongo/_cbsonmodule.c', - 'pymongo/time_helpers.c'])]) + 'pymongo/time_helpers.c', + 'pymongo/encoding_helpers.c'])]) if "--no_ext" in sys.argv: sys.argv = [x for x in sys.argv if x != "--no_ext"] diff --git a/test/test_bson.py b/test/test_bson.py index 476e38c20..b43ae38a0 100644 --- a/test/test_bson.py +++ b/test/test_bson.py @@ -1,3 +1,5 @@ +# -*- coding: utf-8 -*- +# # Copyright 2009 10gen, Inc. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -250,6 +252,19 @@ class TestBSON(unittest.TestCase): def test_large_document(self): self.assertRaises(InvalidDocument, BSON.from_dict, {"key": "x"*4*1024*1024}) + def test_utf8(self): + w = {u"aéあ": u"aéあ"} + self.assertEqual(w, BSON.from_dict(w).to_dict()) + + x = {u"aéあ".encode("utf-8"): u"aéあ".encode("utf-8")} + self.assertEqual(w, BSON.from_dict(x).to_dict()) + + y = {"hello": u"aé".encode("iso-8859-1")} + self.assertRaises(InvalidStringData, BSON.from_dict, y) + + z = {u"aé".encode("iso-8859-1"): "hello"} + self.assertRaises(InvalidStringData, BSON.from_dict, z) + # TODO this test doesn't pass w/ C extension # # timegm doesn't handle years < 1900 (negative), at least on OS X