allow any utf-8 string rather than just ascii. using unicode is still preferred because we don't have to check it at all

This commit is contained in:
Mike Dirolf 2009-12-01 13:27:44 -05:00
parent 9f18a4e8df
commit e1927afdf2
7 changed files with 183 additions and 17 deletions

View File

@ -31,6 +31,7 @@
#include <datetime.h>
#include "time_helpers.h"
#include "encoding_helpers.h"
static PyObject* InvalidName;
static PyObject* InvalidDocument;
@ -185,17 +186,6 @@ static int write_string(bson_buffer* buffer, PyObject* py_string) {
return 1;
}
/* returns 0 on invalid ascii */
static int validate_ascii(const char* data, int length) {
    /* View the bytes as unsigned so the high-bit test is a plain comparison. */
    const unsigned char* cursor = (const unsigned char*)data;
    int remaining;

    /* A byte above 0x7F has its top bit set and therefore is not ASCII. */
    for (remaining = length; remaining > 0; remaining--) {
        if (*cursor++ > 0x7F) {
            return 0;
        }
    }
    return 1;
}
/* TODO our platform better be little-endian w/ 4-byte ints! */
/* Write a single value to the buffer (also write its type_byte, for which
* space has already been reserved.
@ -393,8 +383,10 @@ static int write_element_to_buffer(bson_buffer* buffer, int type_byte, PyObject*
int result;
*(buffer->buffer + type_byte) = 0x02;
if (!validate_ascii(PyString_AsString(value), PyString_Size(value))) {
PyErr_SetString(InvalidStringData, "strings in documents must be ASCII only");
if (!is_legal_utf8_string((const unsigned char*)PyString_AsString(value),
PyString_Size(value))) {
PyErr_SetString(InvalidStringData,
"strings in documents must be valid UTF-8");
return 0;
}
result = write_string(buffer, value);
@ -659,6 +651,13 @@ static int decode_and_write_pair(bson_buffer* buffer, PyObject* key,
} else if (PyString_Check(key)) {
encoded = key;
Py_INCREF(encoded);
if (!is_legal_utf8_string((const unsigned char*)PyString_AsString(encoded),
PyString_Size(encoded))) {
PyErr_SetString(InvalidStringData,
"strings in documents must be valid UTF-8");
return 0;
}
} else {
PyObject* errmsg = PyString_FromString("documents must have only string keys, key was ");
PyObject* repr = PyObject_Repr(key);

View File

@ -64,10 +64,15 @@ def _get_c_string(data):
def _make_c_string(string):
if "\x00" in string:
raise InvalidStringData("BSON strings must not contain a NULL character")
try:
if isinstance(string, unicode):
return string.encode("utf-8") + "\x00"
except:
raise InvalidStringData("strings in documents must be ASCII only")
else:
try:
string.decode("utf-8")
return string + "\x00"
except:
raise InvalidStringData("strings in documents must be valid "
"UTF-8: %r" % string)
def _validate_number(data):

107
pymongo/encoding_helpers.c Normal file
View File

@ -0,0 +1,107 @@
/*
* Copyright 2009 10gen, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* Copyright 2001 Unicode, Inc.
*
* Disclaimer
*
* This source code is provided as is by Unicode, Inc. No claims are
* made as to fitness for any particular purpose. No warranties of any
* kind are expressed or implied. The recipient agrees to determine
* applicability of information provided. If this file has been
* purchased on magnetic or optical media from Unicode, Inc., the
* sole remedy for any claim will be exchange of defective media
* within 90 days of receipt.
*
* Limitations on Rights to Redistribute This Code
*
* Unicode, Inc. hereby grants the right to freely use the information
* supplied in this file in the creation of products supporting the
* Unicode Standard, and to make copies of this file in any form
* for internal or external distribution as long as this notice
* remains attached.
*/
/*
* Index into the table below with the first byte of a UTF-8 sequence to
* get the number of trailing bytes that are supposed to follow it.
*/
/* One entry per possible byte value; each row below covers 32 values. */
static const char trailingBytesForUTF8[256] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x00-0x1F */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x20-0x3F */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x40-0x5F */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x60-0x7F */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x9F: also 0; isLegalUTF8 rejects these as a first byte */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xA0-0xBF: same as the row above */
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xC0-0xDF: one trailing byte */
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5  /* 0xE0-0xEF: 2, 0xF0-0xF7: 3, 0xF8-0xFB: 4, 0xFC-0xFF: 5 (the 4 and 5 entries give lengths isLegalUTF8 refuses) */
};
/* --------------------------------------------------------------------- */
/*
* Utility routine to tell whether a sequence of bytes is legal UTF-8.
* This must be called with the length pre-determined by the first byte.
* The length can be set by:
* length = trailingBytesForUTF8[*source]+1;
* and the sequence is illegal right away if there aren't that many bytes
* available.
* If presented with a length > 4, this returns 0. The Unicode
* definition of UTF-8 goes up to 4-byte sequences.
*/
static unsigned char isLegalUTF8(const unsigned char* source, int length) {
    /*
     * source: pointer to the first (lead) byte of one UTF-8 sequence.
     * length: total size of that sequence in bytes, lead byte included.
     * Returns 1 when the sequence is legal and 0 otherwise.
     */
    unsigned char a;
    const unsigned char* srcptr;

    /* Only 1- to 4-byte sequences exist; reject anything else before
     * forming a pointer past the end of the sequence. */
    if (length < 1 || length > 4) {
        return 0;
    }
    srcptr = source + length;

    /* Bytes are inspected from the last one back towards the first. */
    switch (length) {
    default: return 0;
    /* Everything else falls through when "true"... */
    case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
    /* fall through */
    case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
    /* fall through */
    case 2: if ((a = (*--srcptr)) > 0xBF) return 0;
        /* `a` now holds the second byte; its allowed range depends on the
         * lead byte. */
        switch (*source) {
        /* no fall-through in this inner switch */
        case 0xE0: if (a < 0xA0) return 0; break;
        case 0xF0: if (a < 0x90) return 0; break;
        /* The lower bound must be checked here as well: this case breaks
         * out before the default branch, so without it a second byte
         * below 0x80 would be accepted after 0xF4. */
        case 0xF4: if (a < 0x80 || a > 0x8F) return 0; break;
        default: if (a < 0x80) return 0;
        }
        /* NOTE(review): a lead byte of 0xED followed by a second byte above
         * 0x9F (an encoded UTF-16 surrogate) is not rejected here. */
    /* fall through */
    case 1: if (*source >= 0x80 && *source < 0xC2) return 0;
        if (*source > 0xF4) return 0;
    }
    return 1;
}
/* --------------------------------------------------------------------- */
/*
* Return whether a string containing UTF-8 is legal.
*/
unsigned char is_legal_utf8_string(const unsigned char* string, const int length) {
    int offset;
    int width;

    /* Walk the buffer one UTF-8 sequence at a time. */
    for (offset = 0; offset < length; offset += width) {
        /* The lead byte dictates how many bytes this sequence occupies. */
        width = trailingBytesForUTF8[string[offset]] + 1;

        /* A sequence that would run past the end of the buffer is truncated. */
        if (width > length - offset) {
            return 0;
        }
        if (!isLegalUTF8(string + offset, width)) {
            return 0;
        }
    }
    return 1;
}

View File

@ -0,0 +1,22 @@
/*
* Copyright 2009 10gen, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Include guard: the declaration below is seen once per translation unit. */
#ifndef ENCODING_HELPERS_H
#define ENCODING_HELPERS_H
/*
 * Check whether a byte buffer is made up entirely of legal UTF-8 sequences.
 *
 * string: the bytes to inspect.
 * length: how many bytes of `string` to inspect.
 *
 * Returns 1 when every sequence in the buffer is legal (including when
 * length is 0) and 0 when a sequence is malformed or is cut off by the end
 * of the buffer.
 */
unsigned char is_legal_utf8_string(const unsigned char* string, const int length);
#endif /* ENCODING_HELPERS_H */

View File

@ -1,3 +1,19 @@
/*
* Copyright 2009 10gen, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Copyright (c) 1998-2003 Carnegie Mellon University. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -38,6 +54,7 @@
*
*
*/
/*
* Copyright (c) 1987, 1989, 1993
* The Regents of the University of California. All rights reserved.

View File

@ -120,7 +120,8 @@ c_ext = Feature(
ext_modules=[Extension('pymongo._cbson',
include_dirs=['pymongo'],
sources=['pymongo/_cbsonmodule.c',
'pymongo/time_helpers.c'])])
'pymongo/time_helpers.c',
'pymongo/encoding_helpers.c'])])
if "--no_ext" in sys.argv:
sys.argv = [x for x in sys.argv if x != "--no_ext"]

View File

@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-
#
# Copyright 2009 10gen, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@ -250,6 +252,19 @@ class TestBSON(unittest.TestCase):
def test_large_document(self):
self.assertRaises(InvalidDocument, BSON.from_dict, {"key": "x"*4*1024*1024})
def test_utf8(self):
    """Unicode and UTF-8 byte strings round-trip; non-UTF-8 byte strings are rejected."""
    # Unicode keys and values are encoded to UTF-8 and decode back unchanged.
    w = {u"aéあ": u"aéあ"}
    self.assertEqual(w, BSON.from_dict(w).to_dict())

    # Byte strings that already hold valid UTF-8 decode to the same unicode.
    x = {u"aéあ".encode("utf-8"): u"aéあ".encode("utf-8")}
    self.assertEqual(w, BSON.from_dict(x).to_dict())

    # The ISO-8859-1 encoding of a non-ASCII character is not valid UTF-8,
    # so it must be refused as a value. The literal has to contain such a
    # character: an empty string is valid UTF-8 and would never raise.
    y = {"hello": u"aé".encode("iso-8859-1")}
    self.assertRaises(InvalidStringData, BSON.from_dict, y)

    # The same bytes must be refused as a key as well.
    z = {u"aé".encode("iso-8859-1"): "hello"}
    self.assertRaises(InvalidStringData, BSON.from_dict, z)
# TODO this test doesn't pass w/ C extension
#
# timegm doesn't handle years < 1900 (negative), at least on OS X