Allow any valid UTF-8 string rather than just ASCII. Using unicode is still preferred because we don't have to check it at all.

2009-12-01 13:27:44 -05:00 · 2009-12-01 13:27:44 -05:00 · e1927afdf2
commit e1927afdf2
parent 9f18a4e8df
7 changed files with 183 additions and 17 deletions
--- a/pymongo/_cbsonmodule.c
+++ b/pymongo/_cbsonmodule.c
@ -31,6 +31,7 @@
 #include <datetime.h>

 #include "time_helpers.h"
+#include "encoding_helpers.h"

 static PyObject* InvalidName;
 static PyObject* InvalidDocument;
@ -185,17 +186,6 @@ static int write_string(bson_buffer* buffer, PyObject* py_string) {
    return 1;
 }

-/* returns 0 on invalid ascii */
-static int validate_ascii(const char* data, int length) {
-    int i;
-    for (i = 0; i < length; i++) {
-        if (data[i] & 0x80) {
-            return 0;
-        }
-    }
-    return 1;
-}
-
 /* TODO our platform better be little-endian w/ 4-byte ints! */
 /* Write a single value to the buffer (also write it's type_byte, for which
 * space has already been reserved.
@ -393,8 +383,10 @@ static int write_element_to_buffer(bson_buffer* buffer, int type_byte, PyObject*
        int result;

        *(buffer->buffer + type_byte) = 0x02;
-        if (!validate_ascii(PyString_AsString(value), PyString_Size(value))) {
-            PyErr_SetString(InvalidStringData, "strings in documents must be ASCII only");
+        if (!is_legal_utf8_string((const unsigned char*)PyString_AsString(value),
+                                  PyString_Size(value))) {
+            PyErr_SetString(InvalidStringData,
+                            "strings in documents must be valid UTF-8");
            return 0;
        }
        result = write_string(buffer, value);
@ -659,6 +651,13 @@ static int decode_and_write_pair(bson_buffer* buffer, PyObject* key,
    } else if (PyString_Check(key)) {
        encoded = key;
        Py_INCREF(encoded);
+
+        if (!is_legal_utf8_string((const unsigned char*)PyString_AsString(encoded),
+                                  PyString_Size(encoded))) {
+            PyErr_SetString(InvalidStringData,
+                            "strings in documents must be valid UTF-8");
+            return 0;
+        }
    } else {
        PyObject* errmsg = PyString_FromString("documents must have only string keys, key was ");
        PyObject* repr = PyObject_Repr(key);
--- a/pymongo/bson.py
+++ b/pymongo/bson.py
@ -64,10 +64,15 @@ def _get_c_string(data):
 def _make_c_string(string):
+    """Return `string` as a NUL-terminated UTF-8 byte string.
+    unicode input is encoded to UTF-8. Byte-string input is returned
+    unchanged (plus the terminator) after checking that it decodes as
+    UTF-8.
+    Raises InvalidStringData if `string` contains a NUL character, or if
+    it is a byte string that is not valid UTF-8.
+    """
    if "\x00" in string:
        raise InvalidStringData("BSON strings must not contain a NULL character")
-    try:
+    if isinstance(string, unicode):
        return string.encode("utf-8") + "\x00"
-    except:
-        raise InvalidStringData("strings in documents must be ASCII only")
+    else:
+        # Decode only to validate; the original bytes are what get returned.
+        try:
+            string.decode("utf-8")
+            return string + "\x00"
+        except UnicodeError:
+            # Catch decoding failures only: a bare except would also turn
+            # KeyboardInterrupt/SystemExit into InvalidStringData.
+            raise InvalidStringData("strings in documents must be valid "
+                                    "UTF-8: %r" % string)


 def _validate_number(data):
--- a/pymongo/encoding_helpers.c
+++ b/pymongo/encoding_helpers.c
@ -0,0 +1,107 @@
+/*
+ * Copyright 2009 10gen, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright 2001 Unicode, Inc.
+ *
+ * Disclaimer
+ *
+ * This source code is provided as is by Unicode, Inc. No claims are
+ * made as to fitness for any particular purpose. No warranties of any
+ * kind are expressed or implied. The recipient agrees to determine
+ * applicability of information provided. If this file has been
+ * purchased on magnetic or optical media from Unicode, Inc., the
+ * sole remedy for any claim will be exchange of defective media
+ * within 90 days of receipt.
+ *
+ * Limitations on Rights to Redistribute This Code
+ *
+ * Unicode, Inc. hereby grants the right to freely use the information
+ * supplied in this file in the creation of products supporting the
+ * Unicode Standard, and to make copies of this file in any form
+ * for internal or external distribution as long as this notice
+ * remains attached.
+ */
+
+/*
+ * Index into the table below with the first byte of a UTF-8 sequence to
+ * get the number of trailing bytes that are supposed to follow it.
+ *
+ * Each row of the initializer covers 32 consecutive byte values:
+ *   0x00-0xBF -> 0  (ASCII, and also the continuation bytes 0x80-0xBF)
+ *   0xC0-0xDF -> 1
+ *   0xE0-0xEF -> 2
+ *   0xF0-0xF7 -> 3
+ *   0xF8-0xFB -> 4
+ *   0xFC-0xFF -> 5
+ * The table only yields a length; it does not say whether the lead byte
+ * itself is legal. The values 4 and 5 give sequence lengths of 5 and 6,
+ * which isLegalUTF8() rejects.
+ */
+static const char trailingBytesForUTF8[256] = {
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
+};
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * Utility routine to tell whether a sequence of bytes is legal UTF-8.
+ * This must be called with the length pre-determined by the first byte.
+ * The length can be set by:
+ *  length = trailingBytesForUTF8[*source]+1;
+ * and the sequence is illegal right away if there aren't that many bytes
+ * available.
+ * If presented with a length > 4, this returns 0.  The Unicode
+ * definition of UTF-8 goes up to 4-byte sequences.
+ *
+ * source: pointer to the first (lead) byte of one encoded sequence.
+ * length: total number of bytes in that sequence, lead byte included.
+ * Returns 1 if the sequence is legal, 0 otherwise (including any length
+ * outside 1..4).
+ *
+ * The bytes are examined from the last one back to the first: the outer
+ * switch is entered at `length` and falls through down to `case 1`.
+ *
+ * NOTE(review): the inner switch has no `case 0xED`, so a lead byte of
+ * 0xED followed by 0xA0..0xBF (the encoded form of the surrogate range
+ * U+D800..U+DFFF) is accepted here -- confirm that this is intended.
+ */
+static unsigned char isLegalUTF8(const unsigned char* source, int length) {
+    unsigned char a;
+    const unsigned char* srcptr = source + length;
+    switch (length) {
+    default: return 0; /* length < 1 or length > 4 */
+        /* Everything else falls through when "true"... */
+    case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0; /* 4th byte */
+    case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0; /* 3rd byte */
+    case 2: if ((a = (*--srcptr)) > 0xBF) return 0; /* 2nd byte: upper bound only;
+                                                    * its lower bound depends on
+                                                    * the lead byte, see below */
+        switch (*source) {
+            /* no fall-through in this inner switch */
+            case 0xE0: if (a < 0xA0) return 0; break; /* overlong 3-byte form */
+            case 0xF0: if (a < 0x90) return 0; break; /* overlong 4-byte form */
+            case 0xF4: if (a > 0x8F) return 0; break; /* above U+10FFFF */
+            default:  if (a < 0x80) return 0; /* ordinary lower bound */
+        } /* NB: `case 1` below is a label of the OUTER switch despite its
+           * indentation, so every length that gets this far runs both of
+           * the lead-byte checks that follow. */
+        case 1: if (*source >= 0x80 && *source < 0xC2) return 0; /* 0x80..0xC1 can never start a sequence */
+        if (*source > 0xF4) return 0; /* lead bytes above 0xF4 are never legal */
+    }
+    return 1;
+}
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * Check that a whole byte string is legal UTF-8.
+ *
+ * The buffer is walked one encoded sequence at a time: the lead byte
+ * gives the sequence length via trailingBytesForUTF8, and each sequence
+ * is then checked with isLegalUTF8().
+ *
+ * Returns 1 if every sequence is legal (an empty string is legal),
+ * 0 if a sequence is illegal or runs past the end of the buffer.
+ */
+unsigned char is_legal_utf8_string(const unsigned char* string, const int length) {
+    int offset;
+    int step;
+
+    for (offset = 0; offset < length; offset += step) {
+        const unsigned char* lead = string + offset;
+
+        step = trailingBytesForUTF8[*lead] + 1;
+        /* The bounds test comes first so that isLegalUTF8() never reads
+         * beyond the end of the buffer. */
+        if ((offset + step) > length || !isLegalUTF8(lead, step)) {
+            return 0;
+        }
+    }
+    return 1;
+}
--- a/pymongo/encoding_helpers.h
+++ b/pymongo/encoding_helpers.h
@ -0,0 +1,22 @@
+/*
+ * Copyright 2009 10gen, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ENCODING_HELPERS_H
+#define ENCODING_HELPERS_H
+
+unsigned char is_legal_utf8_string(const unsigned char* string, const int length);
+
+#endif
--- a/pymongo/time_helpers.c
+++ b/pymongo/time_helpers.c
@ -1,3 +1,19 @@
+/*
+ * Copyright 2009 10gen, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 /* Copyright (c) 1998-2003 Carnegie Mellon University.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
@ -38,6 +54,7 @@
 *
 *
 */
+
 /*
 * Copyright (c) 1987, 1989, 1993
 *    The Regents of the University of California.  All rights reserved.
--- a/setup.py
+++ b/setup.py
@ -120,7 +120,8 @@ c_ext = Feature(
    ext_modules=[Extension('pymongo._cbson',
                           include_dirs=['pymongo'],
                           sources=['pymongo/_cbsonmodule.c',
-                                    'pymongo/time_helpers.c'])])
+                                    'pymongo/time_helpers.c',
+                                    'pymongo/encoding_helpers.c'])])

 if "--no_ext" in sys.argv:
    sys.argv = [x for x in sys.argv if x != "--no_ext"]
--- a/test/test_bson.py
+++ b/test/test_bson.py
@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+#
 # Copyright 2009 10gen, Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@ -250,6 +252,19 @@ class TestBSON(unittest.TestCase):
    def test_large_document(self):
        self.assertRaises(InvalidDocument, BSON.from_dict, {"key": "x"*4*1024*1024})

+    def test_utf8(self):
+        """Round-trip UTF-8 keys and values; reject non-UTF-8 byte strings."""
+        text = u"aéあ"
+        utf8_bytes = text.encode("utf-8")
+        latin1_bytes = u"aé".encode("iso-8859-1")
+        expected = {text: text}

+        # unicode input and UTF-8 encoded byte strings must both come back
+        # as the same unicode document.
+        for document in (expected, {utf8_bytes: utf8_bytes}):
+            self.assertEqual(expected, BSON.from_dict(document).to_dict())

+        # ISO-8859-1 bytes must be rejected, first as a value, then as a key.
+        for document in ({"hello": latin1_bytes}, {latin1_bytes: "hello"}):
+            self.assertRaises(InvalidStringData, BSON.from_dict, document)
+
 # TODO this test doesn't pass w/ C extension
 #
 # timegm doesn't handle years < 1900 (negative), at least on OS X