allow any utf-8 string rather than just ascii. using unicode is still preferred because we don't have to check it at all
This commit is contained in:
parent
9f18a4e8df
commit
e1927afdf2
@ -31,6 +31,7 @@
|
||||
#include <datetime.h>
|
||||
|
||||
#include "time_helpers.h"
|
||||
#include "encoding_helpers.h"
|
||||
|
||||
static PyObject* InvalidName;
|
||||
static PyObject* InvalidDocument;
|
||||
@ -185,17 +186,6 @@ static int write_string(bson_buffer* buffer, PyObject* py_string) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
 * Check that a byte buffer holds only 7-bit ASCII.
 *
 * data:   bytes to inspect
 * length: number of bytes to inspect; nothing is read when length <= 0
 *
 * Returns 1 when none of the inspected bytes has its high bit set,
 * 0 as soon as one does.
 */
static int validate_ascii(const char* data, int length) {
    int remaining;

    for (remaining = length; remaining > 0; remaining--, data++) {
        /* Any byte with bit 7 set is outside the ASCII range. */
        if ((*data & 0x80) != 0) {
            return 0;
        }
    }
    return 1;
}
|
||||
|
||||
/* TODO our platform better be little-endian w/ 4-byte ints! */
|
||||
/* Write a single value to the buffer (also write its type_byte, for which
|
||||
* space has already been reserved.
|
||||
@ -393,8 +383,10 @@ static int write_element_to_buffer(bson_buffer* buffer, int type_byte, PyObject*
|
||||
int result;
|
||||
|
||||
*(buffer->buffer + type_byte) = 0x02;
|
||||
if (!validate_ascii(PyString_AsString(value), PyString_Size(value))) {
|
||||
PyErr_SetString(InvalidStringData, "strings in documents must be ASCII only");
|
||||
if (!is_legal_utf8_string((const unsigned char*)PyString_AsString(value),
|
||||
PyString_Size(value))) {
|
||||
PyErr_SetString(InvalidStringData,
|
||||
"strings in documents must be valid UTF-8");
|
||||
return 0;
|
||||
}
|
||||
result = write_string(buffer, value);
|
||||
@ -659,6 +651,13 @@ static int decode_and_write_pair(bson_buffer* buffer, PyObject* key,
|
||||
} else if (PyString_Check(key)) {
|
||||
encoded = key;
|
||||
Py_INCREF(encoded);
|
||||
|
||||
if (!is_legal_utf8_string((const unsigned char*)PyString_AsString(encoded),
|
||||
PyString_Size(encoded))) {
|
||||
PyErr_SetString(InvalidStringData,
|
||||
"strings in documents must be valid UTF-8");
|
||||
return 0;
|
||||
}
|
||||
} else {
|
||||
PyObject* errmsg = PyString_FromString("documents must have only string keys, key was ");
|
||||
PyObject* repr = PyObject_Repr(key);
|
||||
|
||||
@ -64,10 +64,15 @@ def _get_c_string(data):
|
||||
def _make_c_string(string):
    """Return `string` as a NUL-terminated, UTF-8 encoded byte string.

    `unicode` input is encoded to UTF-8. Byte-string (`str`) input is passed
    through unchanged after checking that it decodes as UTF-8.

    Raises InvalidStringData if `string` contains a NUL character, or if a
    byte string is not valid UTF-8.
    """
    if "\x00" in string:
        raise InvalidStringData("BSON strings must not contain a NULL character")

    if isinstance(string, unicode):
        return string.encode("utf-8") + "\x00"

    try:
        # Decode only to validate; the original bytes are what get written.
        string.decode("utf-8")
    except UnicodeError:
        # Catch decoding failures only, so unrelated exceptions (e.g.
        # KeyboardInterrupt) are not misreported as invalid string data.
        raise InvalidStringData("strings in documents must be valid "
                                "UTF-8: %r" % string)
    return string + "\x00"
|
||||
|
||||
|
||||
def _validate_number(data):
|
||||
|
||||
107
pymongo/encoding_helpers.c
Normal file
107
pymongo/encoding_helpers.c
Normal file
@ -0,0 +1,107 @@
|
||||
/*
|
||||
* Copyright 2009 10gen, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright 2001 Unicode, Inc.
|
||||
*
|
||||
* Disclaimer
|
||||
*
|
||||
* This source code is provided as is by Unicode, Inc. No claims are
|
||||
* made as to fitness for any particular purpose. No warranties of any
|
||||
* kind are expressed or implied. The recipient agrees to determine
|
||||
* applicability of information provided. If this file has been
|
||||
* purchased on magnetic or optical media from Unicode, Inc., the
|
||||
* sole remedy for any claim will be exchange of defective media
|
||||
* within 90 days of receipt.
|
||||
*
|
||||
* Limitations on Rights to Redistribute This Code
|
||||
*
|
||||
* Unicode, Inc. hereby grants the right to freely use the information
|
||||
* supplied in this file in the creation of products supporting the
|
||||
* Unicode Standard, and to make copies of this file in any form
|
||||
* for internal or external distribution as long as this notice
|
||||
* remains attached.
|
||||
*/
|
||||
|
||||
/*
 * Lookup table keyed by the first byte of a UTF-8 sequence; the value is
 * the number of trailing bytes that are supposed to follow that byte.
 * Laid out one row per 16 lead-byte values.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xD0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0 */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
|
||||
|
||||
/* --------------------------------------------------------------------- */
|
||||
|
||||
/*
 * Utility routine to tell whether a single sequence of bytes is legal UTF-8.
 *
 * source: pointer to the first (lead) byte of the sequence
 * length: number of bytes in the sequence, pre-determined by the lead byte:
 *             length = trailingBytesForUTF8[*source] + 1;
 *         The caller must already have checked that this many bytes are
 *         available; the sequence is illegal right away if they are not.
 *
 * Returns 1 if the sequence is legal, 0 otherwise. Any length outside 1..4
 * returns 0: the Unicode definition of UTF-8 goes up to 4-byte sequences.
 *
 * Sequences with lead byte 0xED and second byte above 0x9F are rejected:
 * they would encode the surrogate code points U+D800..U+DFFF, which are
 * not legal in UTF-8.
 */
static unsigned char isLegalUTF8(const unsigned char* source, int length) {
    unsigned char a;
    const unsigned char* srcptr = source + length;

    /* Bytes are checked last-to-first; each case falls through to the next. */
    switch (length) {
    default:
        return 0;
    case 4:
        if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
        /* fall through */
    case 3:
        if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
        /* fall through */
    case 2:
        /* `a` is now the second byte; its lower bound depends on the lead. */
        if ((a = (*--srcptr)) > 0xBF) return 0;

        switch (*source) {
        /* no fall-through in this inner switch */
        case 0xE0: if (a < 0xA0) return 0; break;  /* overlong 3-byte form */
        case 0xED: if (a > 0x9F) return 0; break;  /* surrogates */
        case 0xF0: if (a < 0x90) return 0; break;  /* overlong 4-byte form */
        case 0xF4: if (a > 0x8F) return 0; break;  /* above U+10FFFF */
        default:   if (a < 0x80) return 0;
        }
        /* fall through */
    case 1:
        /* 0x80..0xC1 can never start a legal sequence. */
        if (*source >= 0x80 && *source < 0xC2) return 0;
        if (*source > 0xF4) return 0;
    }
    return 1;
}
|
||||
|
||||
/* --------------------------------------------------------------------- */
|
||||
|
||||
/*
|
||||
* Return whether a string containing UTF-8 is legal.
|
||||
*/
|
||||
unsigned char is_legal_utf8_string(const unsigned char* string, const int length) {
|
||||
int position = 0;
|
||||
|
||||
while (position < length) {
|
||||
int sequence_length = trailingBytesForUTF8[*(string + position)] + 1;
|
||||
if ((position + sequence_length) > length) {
|
||||
return 0;
|
||||
}
|
||||
if (!isLegalUTF8(string + position, sequence_length)) {
|
||||
return 0;
|
||||
}
|
||||
position += sequence_length;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
22
pymongo/encoding_helpers.h
Normal file
22
pymongo/encoding_helpers.h
Normal file
@ -0,0 +1,22 @@
|
||||
/*
|
||||
* Copyright 2009 10gen, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef ENCODING_HELPERS_H
|
||||
#define ENCODING_HELPERS_H
|
||||
|
||||
unsigned char is_legal_utf8_string(const unsigned char* string, const int length);
|
||||
|
||||
#endif
|
||||
@ -1,3 +1,19 @@
|
||||
/*
|
||||
* Copyright 2009 10gen, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* Copyright (c) 1998-2003 Carnegie Mellon University. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
@ -38,6 +54,7 @@
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 1987, 1989, 1993
|
||||
* The Regents of the University of California. All rights reserved.
|
||||
|
||||
3
setup.py
3
setup.py
@ -120,7 +120,8 @@ c_ext = Feature(
|
||||
ext_modules=[Extension('pymongo._cbson',
|
||||
include_dirs=['pymongo'],
|
||||
sources=['pymongo/_cbsonmodule.c',
|
||||
'pymongo/time_helpers.c'])])
|
||||
'pymongo/time_helpers.c',
|
||||
'pymongo/encoding_helpers.c'])])
|
||||
|
||||
if "--no_ext" in sys.argv:
|
||||
sys.argv = [x for x in sys.argv if x != "--no_ext"]
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Copyright 2009 10gen, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -250,6 +252,19 @@ class TestBSON(unittest.TestCase):
|
||||
def test_large_document(self):
|
||||
self.assertRaises(InvalidDocument, BSON.from_dict, {"key": "x"*4*1024*1024})
|
||||
|
||||
def test_utf8(self):
    """UTF-8 text round-trips; non-UTF-8 byte strings are rejected."""
    text = u"aéあ"
    utf8_bytes = text.encode("utf-8")
    latin1_bytes = u"aé".encode("iso-8859-1")
    expected = {text: text}

    # unicode keys and values survive an encode/decode round trip.
    self.assertEqual(expected, BSON.from_dict(expected).to_dict())

    # UTF-8 encoded byte strings decode back to the same unicode document.
    from_bytes = BSON.from_dict({utf8_bytes: utf8_bytes})
    self.assertEqual(expected, from_bytes.to_dict())

    # Latin-1 bytes are not valid UTF-8, whether used as a value or as a key.
    self.assertRaises(InvalidStringData, BSON.from_dict,
                      {"hello": latin1_bytes})
    self.assertRaises(InvalidStringData, BSON.from_dict,
                      {latin1_bytes: "hello"})
|
||||
|
||||
# TODO this test doesn't pass w/ C extension
|
||||
#
|
||||
# timegm doesn't handle years < 1900 (negative), at least on OS X
|
||||
|
||||
Loading…
Reference in New Issue
Block a user