PYTHON-3048 Fixed bug with incorrect validation of UTF-8 regex patterns (#970)
This commit is contained in:
parent
be3008aa11
commit
3f7231a1a2
@ -71,26 +71,3 @@ OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
||||
OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
|
||||
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
3) License Notice for encoding_helpers.c
|
||||
----------------------------------------
|
||||
|
||||
Portions Copyright 2001 Unicode, Inc.
|
||||
|
||||
Disclaimer
|
||||
|
||||
This source code is provided as is by Unicode, Inc. No claims are
|
||||
made as to fitness for any particular purpose. No warranties of any
|
||||
kind are expressed or implied. The recipient agrees to determine
|
||||
applicability of information provided. If this file has been
|
||||
purchased on magnetic or optical media from Unicode, Inc., the
|
||||
sole remedy for any claim will be exchange of defective media
|
||||
within 90 days of receipt.
|
||||
|
||||
Limitations on Rights to Redistribute This Code
|
||||
|
||||
Unicode, Inc. hereby grants the right to freely use the information
|
||||
supplied in this file in the creation of products supporting the
|
||||
Unicode Standard, and to make copies of this file in any form
|
||||
for internal or external distribution as long as this notice
|
||||
remains attached.
|
||||
|
||||
@ -26,7 +26,6 @@
|
||||
|
||||
#include "buffer.h"
|
||||
#include "time64.h"
|
||||
#include "encoding_helpers.h"
|
||||
|
||||
#define _CBSON_MODULE
|
||||
#include "_cbsonmodule.h"
|
||||
@ -553,12 +552,12 @@ static int _write_regex_to_buffer(
|
||||
PyObject* py_flags;
|
||||
PyObject* py_pattern;
|
||||
PyObject* encoded_pattern;
|
||||
PyObject* decoded_pattern;
|
||||
long int_flags;
|
||||
char flags[FLAGS_SIZE];
|
||||
char check_utf8 = 0;
|
||||
const char* pattern_data;
|
||||
int pattern_length, flags_length;
|
||||
result_t status;
|
||||
|
||||
/*
|
||||
* Both the builtin re type and our Regex class have attributes
|
||||
@ -597,18 +596,8 @@ static int _write_regex_to_buffer(
|
||||
Py_DECREF(encoded_pattern);
|
||||
return 0;
|
||||
}
|
||||
status = cbson_check_string((const unsigned char*)pattern_data,
|
||||
pattern_length, check_utf8, 1);
|
||||
if (status == NOT_UTF_8) {
|
||||
PyObject* InvalidStringData = _error("InvalidStringData");
|
||||
if (InvalidStringData) {
|
||||
PyErr_SetString(InvalidStringData,
|
||||
"regex patterns must be valid UTF-8");
|
||||
Py_DECREF(InvalidStringData);
|
||||
}
|
||||
Py_DECREF(encoded_pattern);
|
||||
return 0;
|
||||
} else if (status == HAS_NULL) {
|
||||
|
||||
if (strlen(pattern_data) != (size_t) pattern_length){
|
||||
PyObject* InvalidDocument = _error("InvalidDocument");
|
||||
if (InvalidDocument) {
|
||||
PyErr_SetString(InvalidDocument,
|
||||
@ -619,6 +608,22 @@ static int _write_regex_to_buffer(
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (check_utf8) {
|
||||
decoded_pattern = PyUnicode_DecodeUTF8(pattern_data, (Py_ssize_t) pattern_length, NULL);
|
||||
if (decoded_pattern == NULL) {
|
||||
PyErr_Clear();
|
||||
PyObject* InvalidStringData = _error("InvalidStringData");
|
||||
if (InvalidStringData) {
|
||||
PyErr_SetString(InvalidStringData,
|
||||
"regex patterns must be valid UTF-8");
|
||||
Py_DECREF(InvalidStringData);
|
||||
}
|
||||
Py_DECREF(encoded_pattern);
|
||||
return 0;
|
||||
}
|
||||
Py_DECREF(decoded_pattern);
|
||||
}
|
||||
|
||||
if (!buffer_write_bytes(buffer, pattern_data, pattern_length + 1)) {
|
||||
Py_DECREF(encoded_pattern);
|
||||
return 0;
|
||||
|
||||
@ -1,118 +0,0 @@
|
||||
/*
|
||||
* Copyright 2009-2015 MongoDB, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "encoding_helpers.h"
|
||||
|
||||
/*
|
||||
* Portions Copyright 2001 Unicode, Inc.
|
||||
*
|
||||
* Disclaimer
|
||||
*
|
||||
* This source code is provided as is by Unicode, Inc. No claims are
|
||||
* made as to fitness for any particular purpose. No warranties of any
|
||||
* kind are expressed or implied. The recipient agrees to determine
|
||||
* applicability of information provided. If this file has been
|
||||
* purchased on magnetic or optical media from Unicode, Inc., the
|
||||
* sole remedy for any claim will be exchange of defective media
|
||||
* within 90 days of receipt.
|
||||
*
|
||||
* Limitations on Rights to Redistribute This Code
|
||||
*
|
||||
* Unicode, Inc. hereby grants the right to freely use the information
|
||||
* supplied in this file in the creation of products supporting the
|
||||
* Unicode Standard, and to make copies of this file in any form
|
||||
* for internal or external distribution as long as this notice
|
||||
* remains attached.
|
||||
*/
|
||||
|
||||
/*
 * Lookup table indexed by the first (lead) byte of a UTF-8 sequence.
 * The entry is the number of trailing bytes that are supposed to follow
 * that lead byte, so the full sequence length is entry + 1.
 *
 * Entries for 0xF8..0xFF yield lengths of 5 and 6; such sequences are not
 * legal UTF-8 and are expected to be rejected by the legality check that
 * consumes this table.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 - 0xBF: no trailing bytes */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 - 0xDF: one trailing byte */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 - 0xEF: two trailing bytes */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0 - 0xF7: three; 0xF8 - 0xFB: four; 0xFC - 0xFF: five */
    3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
|
||||
|
||||
/* --------------------------------------------------------------------- */
|
||||
|
||||
/*
 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
 *
 * This must be called with the length pre-determined by the first byte.
 * The length can be set by:
 *     length = trailingBytesForUTF8[*source] + 1;
 * and the sequence is illegal right away if there aren't that many bytes
 * available.
 *
 * Parameters:
 *   source - pointer to the lead byte of the sequence.
 *   length - number of bytes in the sequence, lead byte included.
 *
 * Returns 1 when the sequence is legal and 0 otherwise. Any length outside
 * 1..4 returns 0: the Unicode definition of UTF-8 goes up to 4-byte
 * sequences. Overlong encodings, encoded UTF-16 surrogates
 * (U+D800..U+DFFF, i.e. lead byte 0xED followed by 0xA0..0xBF) and code
 * points above U+10FFFF are rejected.
 */
static unsigned char isLegalUTF8(const unsigned char* source, int length) {
    unsigned char a;
    /* Walk backwards from one past the last byte of the sequence. */
    const unsigned char* srcptr = source + length;

    switch (length) {
    default:
        return 0;
    /* Everything else falls through when "true"... */
    case 4:
        if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
        /* fall through */
    case 3:
        if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
        /* fall through */
    case 2:
        if ((a = (*--srcptr)) > 0xBF) return 0;
        /* The lower bound of the second byte depends on the lead byte. */
        switch (*source) {
        /* no fall-through in this inner switch */
        case 0xE0: if (a < 0xA0) return 0; break;               /* overlong 3-byte */
        case 0xED: if (a < 0x80 || a > 0x9F) return 0; break;   /* surrogates */
        case 0xF0: if (a < 0x90) return 0; break;               /* overlong 4-byte */
        case 0xF4: if ((a > 0x8F) || (a < 0x80)) return 0; break; /* > U+10FFFF */
        default:   if (a < 0x80) return 0;
        }
        /* fall through */
    case 1:
        /* 0x80..0xBF are continuation bytes; 0xC0/0xC1 only start overlongs. */
        if (*source >= 0x80 && *source < 0xC2) return 0;
        if (*source > 0xF4) return 0;
    }
    return 1;
}
|
||||
|
||||
result_t cbson_check_string(const unsigned char* string, const int length,
|
||||
const char check_utf8, const char check_null) {
|
||||
int position = 0;
|
||||
/* By default we go character by character. Will be different for checking
|
||||
* UTF-8 */
|
||||
int sequence_length = 1;
|
||||
|
||||
if (!check_utf8 && !check_null) {
|
||||
return VALID;
|
||||
}
|
||||
|
||||
while (position < length) {
|
||||
if (check_null && *(string + position) == 0) {
|
||||
return HAS_NULL;
|
||||
}
|
||||
if (check_utf8) {
|
||||
sequence_length = trailingBytesForUTF8[*(string + position)] + 1;
|
||||
if ((position + sequence_length) > length) {
|
||||
return NOT_UTF_8;
|
||||
}
|
||||
if (!isLegalUTF8(string + position, sequence_length)) {
|
||||
return NOT_UTF_8;
|
||||
}
|
||||
}
|
||||
position += sequence_length;
|
||||
}
|
||||
|
||||
return VALID;
|
||||
}
|
||||
@ -1,29 +0,0 @@
|
||||
/*
|
||||
* Copyright 2009-2015 MongoDB, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef ENCODING_HELPERS_H
#define ENCODING_HELPERS_H

/* Outcome of a string check performed by cbson_check_string(). */
typedef enum {
    VALID = 0,     /* no enabled check failed */
    NOT_UTF_8 = 1, /* a truncated or illegal UTF-8 sequence was found */
    HAS_NULL = 2   /* a zero byte was found */
} result_t;

/*
 * Inspect `length` bytes of `string`.
 *
 * check_utf8 - non-zero to require legal UTF-8.
 * check_null - non-zero to report embedded zero bytes.
 *
 * Returns VALID, NOT_UTF_8 or HAS_NULL as described on result_t.
 */
result_t cbson_check_string(const unsigned char* string, const int length,
                            const char check_utf8, const char check_null);

#endif
|
||||
@ -19,6 +19,8 @@ Bug fixes
|
||||
|
||||
- Fixed a bug where :meth:`~pymongo.collection.Collection.estimated_document_count`
|
||||
would fail with a "CommandNotSupportedOnView" error on views (`PYTHON-2885`_).
|
||||
- Fixed a bug where invalid UTF-8 strings could be passed as patterns for :class:`~bson.regex.Regex`
|
||||
objects (`PYTHON-3048`_). :func:`bson.encode` now correctly raises :class:`bson.errors.InvalidStringData`.
|
||||
|
||||
Unavoidable breaking changes
|
||||
............................
|
||||
@ -38,6 +40,7 @@ Issues Resolved
|
||||
See the `PyMongo 4.2 release notes in JIRA`_ for the list of resolved issues
|
||||
in this release.
|
||||
|
||||
.. _PYTHON-3048: https://jira.mongodb.org/browse/PYTHON-3048
|
||||
.. _PYTHON-2885: https://jira.mongodb.org/browse/PYTHON-2885
|
||||
.. _PYTHON-3167: https://jira.mongodb.org/browse/PYTHON-3167
|
||||
.. _PyMongo 4.2 release notes in JIRA: https://jira.mongodb.org/secure/ReleaseNote.jspa?projectId=10004&version=33196
|
||||
|
||||
7
setup.py
7
setup.py
@ -255,12 +255,7 @@ ext_modules = [
|
||||
Extension(
|
||||
"bson._cbson",
|
||||
include_dirs=["bson"],
|
||||
sources=[
|
||||
"bson/_cbsonmodule.c",
|
||||
"bson/time64.c",
|
||||
"bson/buffer.c",
|
||||
"bson/encoding_helpers.c",
|
||||
],
|
||||
sources=["bson/_cbsonmodule.c", "bson/time64.c", "bson/buffer.c"],
|
||||
),
|
||||
Extension(
|
||||
"pymongo._cmessage",
|
||||
|
||||
Loading…
Reference in New Issue
Block a user