From e4cf504559a7b9cb9e7faabf4d54be01e9a3a352 Mon Sep 17 00:00:00 2001 From: "A. Jesse Jiryu Davis" Date: Tue, 6 Aug 2013 18:36:33 -0400 Subject: [PATCH] Option to not compile BSON regexes. PYTHON-500 Add a 'compile_re' parameter to Collection.find, Collection.find_one, and json_util.loads. If it's False, regular expressions are encoded as a new class, Regex, instead of passed to re.compile(). This allows PyMongo to handle regular expressions that don't compile in Python but are valid in other contexts like MongoDB queries. --- bson/__init__.py | 127 ++++++++------ bson/_cbsonmodule.c | 297 +++++++++++++++++++-------------- bson/json_util.py | 21 ++- bson/regex.py | 83 +++++++++ bson/son.py | 51 +++--- doc/api/bson/index.rst | 1 + doc/api/bson/regex.rst | 7 + doc/api/pymongo/collection.rst | 2 +- pymongo/collection.py | 10 +- pymongo/cursor.py | 14 +- pymongo/database.py | 11 +- pymongo/helpers.py | 6 +- test/test_bson.py | 40 ++++- test/test_collection.py | 32 +++- test/test_cursor.py | 3 + test/test_database.py | 19 ++- test/test_json_util.py | 24 +++ 17 files changed, 530 insertions(+), 218 deletions(-) create mode 100644 bson/regex.py create mode 100644 doc/api/bson/regex.rst diff --git a/bson/__init__.py b/bson/__init__.py index 3ad2cd3f9..a6f993db5 100644 --- a/bson/__init__.py +++ b/bson/__init__.py @@ -32,6 +32,7 @@ from bson.max_key import MaxKey from bson.min_key import MinKey from bson.objectid import ObjectId from bson.py3compat import b, binary_type +from bson.regex import Regex from bson.son import SON, RE_TYPE from bson.timestamp import Timestamp from bson.tz_util import utc @@ -90,7 +91,8 @@ BSONMAX = b("\x7F") # Max key def _get_int(data, position, as_class=None, - tz_aware=False, uuid_subtype=OLD_UUID_SUBTYPE, unsigned=False): + tz_aware=False, uuid_subtype=OLD_UUID_SUBTYPE, + compile_re=True, unsigned=False): format = unsigned and "I" or "i" try: value = struct.unpack("<%s" % format, data[position:position + 4])[0] @@ -132,13 +134,13 @@ def 
_make_c_string(string, check_null=False): "UTF-8: %r" % string) -def _get_number(data, position, as_class, tz_aware, uuid_subtype): +def _get_number(data, position, as_class, tz_aware, uuid_subtype, compile_re): num = struct.unpack("MinKey, "bson.min_key", "MinKey") || _load_object(&state->MaxKey, "bson.max_key", "MaxKey") || _load_object(&state->UTC, "bson.tz_util", "utc") || - _load_object(&state->RECompile, "re", "compile")) { + _load_object(&state->RECompile, "re", "compile") || + _load_object(&state->Regex, "bson.regex", "Regex")) { return 1; } /* If we couldn't import uuid then we must be on 2.4. Just ignore. */ @@ -440,6 +443,130 @@ _set_cannot_encode(PyObject* value) { } } +/* + * Encode a builtin Python regular expression or our custom Regex class. + * + * Sets exception and returns 0 on failure. + */ +static int _write_regex_to_buffer( + buffer_t buffer, int type_byte, PyObject* value) { + + struct module_state *state = GETSTATE(self); + PyObject* py_flags; + PyObject* py_pattern; + PyObject* encoded_pattern; + long int_flags; + char flags[FLAGS_SIZE]; + char check_utf8 = 0; + const char* pattern_data; + int pattern_length, flags_length; + result_t status; + + /* + * Both the builtin re type and our Regex class have attributes + * "flags" and "pattern". 
+ */ + py_flags = PyObject_GetAttrString(value, "flags"); + if (!py_flags) { + return 0; + } +#if PY_MAJOR_VERSION >= 3 + int_flags = PyLong_AsLong(py_flags); +#else + int_flags = PyInt_AsLong(py_flags); +#endif + Py_DECREF(py_flags); + py_pattern = PyObject_GetAttrString(value, "pattern"); + if (!py_pattern) { + return 0; + } + + if (PyUnicode_Check(py_pattern)) { + encoded_pattern = PyUnicode_AsUTF8String(py_pattern); + Py_DECREF(py_pattern); + if (!encoded_pattern) { + return 0; + } + } else { + encoded_pattern = py_pattern; + check_utf8 = 1; + } + +#if PY_MAJOR_VERSION >= 3 + if (!(pattern_data = PyBytes_AsString(encoded_pattern))) { + Py_DECREF(encoded_pattern); + return 0; + } + if ((pattern_length = _downcast_and_check(PyBytes_Size(encoded_pattern), 0)) == -1) { + Py_DECREF(encoded_pattern); + return 0; + } +#else + if (!(pattern_data = PyString_AsString(encoded_pattern))) { + Py_DECREF(encoded_pattern); + return 0; + } + if ((pattern_length = _downcast_and_check(PyString_Size(encoded_pattern), 0)) == -1) { + Py_DECREF(encoded_pattern); + return 0; + } +#endif + status = check_string((const unsigned char*)pattern_data, + pattern_length, check_utf8, 1); + if (status == NOT_UTF_8) { + PyObject* InvalidStringData = _error("InvalidStringData"); + if (InvalidStringData) { + PyErr_SetString(InvalidStringData, + "regex patterns must be valid UTF-8"); + Py_DECREF(InvalidStringData); + } + Py_DECREF(encoded_pattern); + return 0; + } else if (status == HAS_NULL) { + PyObject* InvalidDocument = _error("InvalidDocument"); + if (InvalidDocument) { + PyErr_SetString(InvalidDocument, + "regex patterns must not contain the NULL byte"); + Py_DECREF(InvalidDocument); + } + Py_DECREF(encoded_pattern); + return 0; + } + + if (!buffer_write_bytes(buffer, pattern_data, pattern_length + 1)) { + Py_DECREF(encoded_pattern); + return 0; + } + Py_DECREF(encoded_pattern); + + flags[0] = 0; + + if (int_flags & 2) { + STRCAT(flags, FLAGS_SIZE, "i"); + } + if (int_flags & 4) { + 
STRCAT(flags, FLAGS_SIZE, "l"); + } + if (int_flags & 8) { + STRCAT(flags, FLAGS_SIZE, "m"); + } + if (int_flags & 16) { + STRCAT(flags, FLAGS_SIZE, "s"); + } + if (int_flags & 32) { + STRCAT(flags, FLAGS_SIZE, "u"); + } + if (int_flags & 64) { + STRCAT(flags, FLAGS_SIZE, "x"); + } + flags_length = (int)strlen(flags) + 1; + if (!buffer_write_bytes(buffer, flags, flags_length)) { + return 0; + } + *(buffer_get_buffer(buffer) + type_byte) = 0x0B; + return 1; +} + /* TODO our platform better be little-endian w/ 4-byte ints! */ /* Write a single value to the buffer (also write its type_byte, for which * space has already been reserved. @@ -574,6 +701,11 @@ static int _write_element_to_buffer(PyObject* self, buffer_t buffer, *(buffer_get_buffer(buffer) + type_byte) = 0x07; return 1; } + case 11: + { + /* Regex */ + return _write_regex_to_buffer(buffer, type_byte, value); + } case 13: { /* Code */ @@ -890,115 +1022,7 @@ static int _write_element_to_buffer(PyObject* self, buffer_t buffer, *(buffer_get_buffer(buffer) + type_byte) = 0x09; return buffer_write_bytes(buffer, (const char*)&millis, 8); } else if (PyObject_TypeCheck(value, state->REType)) { - PyObject* py_flags; - PyObject* py_pattern; - PyObject* encoded_pattern; - long int_flags; - char flags[FLAGS_SIZE]; - char check_utf8 = 0; - const char* pattern_data; - int pattern_length, flags_length; - result_t status; - - py_flags = PyObject_GetAttrString(value, "flags"); - if (!py_flags) { - return 0; - } -#if PY_MAJOR_VERSION >= 3 - int_flags = PyLong_AsLong(py_flags); -#else - int_flags = PyInt_AsLong(py_flags); -#endif - Py_DECREF(py_flags); - py_pattern = PyObject_GetAttrString(value, "pattern"); - if (!py_pattern) { - return 0; - } - - if (PyUnicode_Check(py_pattern)) { - encoded_pattern = PyUnicode_AsUTF8String(py_pattern); - Py_DECREF(py_pattern); - if (!encoded_pattern) { - return 0; - } - } else { - encoded_pattern = py_pattern; - check_utf8 = 1; - } - -#if PY_MAJOR_VERSION >= 3 - if (!(pattern_data = 
PyBytes_AsString(encoded_pattern))) { - Py_DECREF(encoded_pattern); - return 0; - } - if ((pattern_length = _downcast_and_check(PyBytes_Size(encoded_pattern), 0)) == -1) { - Py_DECREF(encoded_pattern); - return 0; - } -#else - if (!(pattern_data = PyString_AsString(encoded_pattern))) { - Py_DECREF(encoded_pattern); - return 0; - } - if ((pattern_length = _downcast_and_check(PyString_Size(encoded_pattern), 0)) == -1) { - Py_DECREF(encoded_pattern); - return 0; - } -#endif - status = check_string((const unsigned char*)pattern_data, - pattern_length, check_utf8, 1); - if (status == NOT_UTF_8) { - PyObject* InvalidStringData = _error("InvalidStringData"); - if (InvalidStringData) { - PyErr_SetString(InvalidStringData, - "regex patterns must be valid UTF-8"); - Py_DECREF(InvalidStringData); - } - Py_DECREF(encoded_pattern); - return 0; - } else if (status == HAS_NULL) { - PyObject* InvalidDocument = _error("InvalidDocument"); - if (InvalidDocument) { - PyErr_SetString(InvalidDocument, - "regex patterns must not contain the NULL byte"); - Py_DECREF(InvalidDocument); - } - Py_DECREF(encoded_pattern); - return 0; - } - - if (!buffer_write_bytes(buffer, pattern_data, pattern_length + 1)) { - Py_DECREF(encoded_pattern); - return 0; - } - Py_DECREF(encoded_pattern); - - flags[0] = 0; - /* TODO don't hardcode these */ - if (int_flags & 2) { - STRCAT(flags, FLAGS_SIZE, "i"); - } - if (int_flags & 4) { - STRCAT(flags, FLAGS_SIZE, "l"); - } - if (int_flags & 8) { - STRCAT(flags, FLAGS_SIZE, "m"); - } - if (int_flags & 16) { - STRCAT(flags, FLAGS_SIZE, "s"); - } - if (int_flags & 32) { - STRCAT(flags, FLAGS_SIZE, "u"); - } - if (int_flags & 64) { - STRCAT(flags, FLAGS_SIZE, "x"); - } - flags_length = (int)strlen(flags) + 1; - if (!buffer_write_bytes(buffer, flags, flags_length)) { - return 0; - } - *(buffer_get_buffer(buffer) + type_byte) = 0x0B; - return 1; + return _write_regex_to_buffer(buffer, type_byte, value); } /* @@ -1435,7 +1459,8 @@ static PyObject* 
_cbson_dict_to_bson(PyObject* self, PyObject* args) { static PyObject* get_value(PyObject* self, const char* buffer, unsigned* position, unsigned char type, unsigned max, PyObject* as_class, - unsigned char tz_aware, unsigned char uuid_subtype) { + unsigned char tz_aware, unsigned char uuid_subtype, + unsigned char compile_re) { struct module_state *state = GETSTATE(self); PyObject* value = NULL; @@ -1495,7 +1520,8 @@ static PyObject* get_value(PyObject* self, const char* buffer, unsigned* positio goto invalid; } value = elements_to_dict(self, buffer + *position + 4, - size - 5, as_class, tz_aware, uuid_subtype); + size - 5, as_class, tz_aware, uuid_subtype, + compile_re); if (!value) { return NULL; } @@ -1587,7 +1613,8 @@ static PyObject* get_value(PyObject* self, const char* buffer, unsigned* positio } to_append = get_value(self, buffer, position, bson_type, max - (unsigned)key_size, - as_class, tz_aware, uuid_subtype); + as_class, tz_aware, uuid_subtype, + compile_re); Py_LeaveRecursiveCall(); if (!to_append) { Py_DECREF(value); @@ -1850,7 +1877,18 @@ static PyObject* get_value(PyObject* self, const char* buffer, unsigned* positio } } *position += (unsigned)flags_length + 1; - if ((compile_func = _get_object(state->RECompile, "re", "compile"))) { + + /* + * Use re.compile() if we're configured to compile regular + * expressions, else create an instance of our Regex class. 
+ */ + if (compile_re) { + compile_func = _get_object(state->RECompile, "re", "compile"); + } else { + compile_func = _get_object(state->Regex, "bson.regex", "Regex"); + } + + if (compile_func) { value = PyObject_CallFunction(compile_func, "Oi", pattern, flags); Py_DECREF(compile_func); } @@ -1990,7 +2028,7 @@ static PyObject* get_value(PyObject* self, const char* buffer, unsigned* positio } scope = elements_to_dict(self, buffer + *position + 4, scope_size - 5, (PyObject*)&PyDict_Type, - tz_aware, uuid_subtype); + tz_aware, uuid_subtype, compile_re); if (!scope) { Py_DECREF(code); return NULL; @@ -2098,7 +2136,8 @@ static PyObject* get_value(PyObject* self, const char* buffer, unsigned* positio static PyObject* _elements_to_dict(PyObject* self, const char* string, unsigned max, PyObject* as_class, unsigned char tz_aware, - unsigned char uuid_subtype) { + unsigned char uuid_subtype, + unsigned char compile_re) { unsigned position = 0; PyObject* dict = PyObject_CallObject(as_class, NULL); if (!dict) { @@ -2126,7 +2165,8 @@ static PyObject* _elements_to_dict(PyObject* self, const char* string, } position += (unsigned)name_length + 1; value = get_value(self, string, &position, type, - max - position, as_class, tz_aware, uuid_subtype); + max - position, as_class, tz_aware, uuid_subtype, + compile_re); if (!value) { Py_DECREF(name); Py_DECREF(dict); @@ -2143,12 +2183,13 @@ static PyObject* _elements_to_dict(PyObject* self, const char* string, static PyObject* elements_to_dict(PyObject* self, const char* string, unsigned max, PyObject* as_class, unsigned char tz_aware, - unsigned char uuid_subtype) { + unsigned char uuid_subtype, + unsigned char compile_re) { PyObject* result; if (Py_EnterRecursiveCall(" while decoding a BSON document")) return NULL; result = _elements_to_dict(self, string, max, - as_class, tz_aware, uuid_subtype); + as_class, tz_aware, uuid_subtype, compile_re); Py_LeaveRecursiveCall(); return result; } @@ -2161,11 +2202,14 @@ static PyObject* 
_cbson_bson_to_dict(PyObject* self, PyObject* args) { PyObject* as_class; unsigned char tz_aware; unsigned char uuid_subtype; + unsigned char compile_re; + PyObject* dict; PyObject* remainder; PyObject* result; - if (!PyArg_ParseTuple(args, "OObb", &bson, &as_class, &tz_aware, &uuid_subtype)) { + if (!PyArg_ParseTuple( + args, "OObbb", &bson, &as_class, &tz_aware, &uuid_subtype, &compile_re)) { return NULL; } @@ -2231,7 +2275,7 @@ static PyObject* _cbson_bson_to_dict(PyObject* self, PyObject* args) { } dict = elements_to_dict(self, string + 4, (unsigned)size - 5, - as_class, tz_aware, uuid_subtype); + as_class, tz_aware, uuid_subtype, compile_re); if (!dict) { return NULL; } @@ -2260,8 +2304,11 @@ static PyObject* _cbson_decode_all(PyObject* self, PyObject* args) { PyObject* as_class = (PyObject*)&PyDict_Type; unsigned char tz_aware = 1; unsigned char uuid_subtype = 3; + unsigned char compile_re = 1; /* optional in "O|Obbb": default to compiling regexes */ - if (!PyArg_ParseTuple(args, "O|Obb", &bson, &as_class, &tz_aware, &uuid_subtype)) { + if (!PyArg_ParseTuple( + args, "O|Obbb", + &bson, &as_class, &tz_aware, &uuid_subtype, &compile_re)) { return NULL; } @@ -2332,7 +2379,7 @@ static PyObject* _cbson_decode_all(PyObject* self, PyObject* args) { } dict = elements_to_dict(self, string + 4, (unsigned)size - 5, - as_class, tz_aware, uuid_subtype); + as_class, tz_aware, uuid_subtype, compile_re); if (!dict) { Py_DECREF(result); return NULL; @@ -2364,6 +2411,7 @@ static int _cbson_traverse(PyObject *m, visitproc visit, void *arg) { Py_VISIT(GETSTATE(m)->ObjectId); Py_VISIT(GETSTATE(m)->DBRef); Py_VISIT(GETSTATE(m)->RECompile); + Py_VISIT(GETSTATE(m)->Regex); Py_VISIT(GETSTATE(m)->UUID); Py_VISIT(GETSTATE(m)->Timestamp); Py_VISIT(GETSTATE(m)->MinKey); @@ -2379,6 +2427,7 @@ static int _cbson_clear(PyObject *m) { Py_CLEAR(GETSTATE(m)->ObjectId); Py_CLEAR(GETSTATE(m)->DBRef); Py_CLEAR(GETSTATE(m)->RECompile); + Py_CLEAR(GETSTATE(m)->Regex); Py_CLEAR(GETSTATE(m)->UUID); Py_CLEAR(GETSTATE(m)->Timestamp);
Py_CLEAR(GETSTATE(m)->MinKey); diff --git a/bson/json_util.py b/bson/json_util.py index e5056d895..9439c4b8d 100644 --- a/bson/json_util.py +++ b/bson/json_util.py @@ -90,6 +90,7 @@ from bson.dbref import DBRef from bson.max_key import MaxKey from bson.min_key import MinKey from bson.objectid import ObjectId +from bson.regex import Regex from bson.timestamp import Timestamp from bson.py3compat import PY3, binary_type, string_types @@ -120,10 +121,20 @@ def loads(s, *args, **kwargs): """Helper function that wraps :class:`json.loads`. Automatically passes the object_hook for BSON type conversion. + + :Parameters: + - `compile_re` (optional): if ``False``, don't attempt to compile + BSON regular expressions into Python regular expressions. Return + instances of :class:`~bson.regex.Regex` instead. + + .. versionchanged:: 2.7 + Added ``compile_re`` option. """ if not json_lib: raise Exception("No json library available") - kwargs['object_hook'] = object_hook + + compile_re = kwargs.pop('compile_re', True) + kwargs['object_hook'] = lambda dct: object_hook(dct, compile_re) return json.loads(s, *args, **kwargs) @@ -141,7 +152,7 @@ def _json_convert(obj): return obj -def object_hook(dct): +def object_hook(dct, compile_re=True): if "$oid" in dct: return ObjectId(str(dct["$oid"])) if "$ref" in dct: @@ -154,7 +165,11 @@ def object_hook(dct): # PyMongo always adds $options but some other tools may not. for opt in dct.get("$options", ""): flags |= _RE_OPT_TABLE.get(opt, 0) - return re.compile(dct["$regex"], flags) + + if compile_re: + return re.compile(dct["$regex"], flags) + else: + return Regex(dct["$regex"], flags) if "$minKey" in dct: return MinKey() if "$maxKey" in dct: diff --git a/bson/regex.py b/bson/regex.py new file mode 100644 index 000000000..f790ddc3c --- /dev/null +++ b/bson/regex.py @@ -0,0 +1,83 @@ +# Copyright 2013 MongoDB, Inc.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tools for representing MongoDB regular expressions. +""" + +import re + +from bson.py3compat import string_types + + +def str_flags_to_int(str_flags): + flags = 0 + if "i" in str_flags: + flags |= re.IGNORECASE + if "l" in str_flags: + flags |= re.LOCALE + if "m" in str_flags: + flags |= re.MULTILINE + if "s" in str_flags: + flags |= re.DOTALL + if "u" in str_flags: + flags |= re.UNICODE + if "x" in str_flags: + flags |= re.VERBOSE + + return flags + + +class Regex(object): + """BSON regular expression data.""" + _type_marker = 11 + + def __init__(self, pattern, flags=0): + """BSON regular expression data. + + This class is useful to store and retrieve regular expressions that are + incompatible with Python's regular expression dialect. 
+ + :Parameters: + - `pattern`: string + - `flags`: (optional) an integer bitmask, or a string of flag + characters like "im" for IGNORECASE and MULTILINE + """ + if not isinstance(pattern, string_types): + raise TypeError("pattern must be a string, not %s" % type(pattern)) + self.pattern = pattern + + if isinstance(flags, string_types): + self.flags = str_flags_to_int(flags) + elif isinstance(flags, int): + self.flags = flags + else: + raise TypeError( + "flags must be a string or int, not %s" % type(flags)) + + def __eq__(self, other): + if isinstance(other, Regex): + return self.pattern == other.pattern and self.flags == other.flags + else: + return NotImplemented + + def __ne__(self, other): + return not self == other + + def __repr__(self): + return "Regex(%r, %r)" % (self.pattern, self.flags) + + def compile(self): + """Compile this ``Regex`` as a Python regular expression. + """ + return re.compile(self.pattern, self.flags) diff --git a/bson/son.py b/bson/son.py index 7390e30db..29ada2ffb 100644 --- a/bson/son.py +++ b/bson/son.py @@ -35,29 +35,29 @@ class SON(dict): The mapping from Python types to BSON types is as follows: - =================================== ============= =================== - Python Type BSON Type Supported Direction - =================================== ============= =================== - None null both - bool boolean both - int [#int]_ int32 / int64 py -> bson - long int64 both - float number (real) both - string string py -> bson - unicode string both - list array both - dict / `SON` object both - datetime.datetime [#dt]_ [#dt2]_ date both - compiled re regex both - `bson.binary.Binary` binary both - `bson.objectid.ObjectId` oid both - `bson.dbref.DBRef` dbref both - None undefined bson -> py - unicode code bson -> py - `bson.code.Code` code py -> bson - unicode symbol bson -> py - bytes (Python 3) [#bytes]_ binary both - =================================== ============= =================== + =======================================
============= =================== + Python Type BSON Type Supported Direction + ======================================= ============= =================== + None null both + bool boolean both + int [#int]_ int32 / int64 py -> bson + long int64 both + float number (real) both + string string py -> bson + unicode string both + list array both + dict / `SON` object both + datetime.datetime [#dt]_ [#dt2]_ date both + `bson.regex.Regex` / compiled re [#re]_ regex both + `bson.binary.Binary` binary both + `bson.objectid.ObjectId` oid both + `bson.dbref.DBRef` dbref both + None undefined bson -> py + unicode code bson -> py + `bson.code.Code` code py -> bson + unicode symbol bson -> py + bytes (Python 3) [#bytes]_ binary both + ======================================= ============= =================== Note that to save binary data it must be wrapped as an instance of `bson.binary.Binary`. Otherwise it will be saved as a BSON string @@ -71,6 +71,11 @@ class SON(dict): millisecond when saved .. [#dt2] all datetime.datetime instances are treated as *naive*. clients should always use UTC. + .. [#re] :class:`~bson.regex.Regex` instances and regular expression + objects from ``re.compile()`` are both saved as BSON regular expressions. + BSON regular expressions are decoded as Python regular expressions by + default, or as :class:`~bson.regex.Regex` instances if the ``compile_re`` + option is set to ``False``. .. [#bytes] The bytes type from Python 3.x is encoded as BSON binary with subtype 0. In Python 3.x it will be decoded back to bytes. 
In Python 2.x it will be decoded to an instance of :class:`~bson.binary.Binary` with diff --git a/doc/api/bson/index.rst b/doc/api/bson/index.rst index 745dc0dd6..8b8a90504 100644 --- a/doc/api/bson/index.rst +++ b/doc/api/bson/index.rst @@ -11,6 +11,7 @@ Sub-modules: :maxdepth: 2 binary + regex code dbref errors diff --git a/doc/api/bson/regex.rst b/doc/api/bson/regex.rst new file mode 100644 index 000000000..3a5603774 --- /dev/null +++ b/doc/api/bson/regex.rst @@ -0,0 +1,7 @@ +:mod:`regex` -- Tools for representing MongoDB regular expressions +================================================================== +.. versionadded:: 2.7 + +.. automodule:: bson.regex + :synopsis: Tools for representing MongoDB regular expressions + :members: diff --git a/doc/api/pymongo/collection.rst b/doc/api/pymongo/collection.rst index 3288d8916..ea6c915b3 100644 --- a/doc/api/pymongo/collection.rst +++ b/doc/api/pymongo/collection.rst @@ -33,7 +33,7 @@ .. automethod:: update(spec, document[, upsert=False[, manipulate=False[, safe=None[, multi=False[, check_keys=True[, **kwargs]]]]]]) .. automethod:: remove([spec_or_id=None[, safe=None[, **kwargs]]]) .. automethod:: drop - .. automethod:: find([spec=None[, fields=None[, skip=0[, limit=0[, timeout=True[, snapshot=False[, tailable=False[, sort=None[, max_scan=None[, as_class=None[, slave_okay=False[, await_data=False[, partial=False[, manipulate=True[, read_preference=ReadPreference.PRIMARY[, exhaust=False[,**kwargs]]]]]]]]]]]]]]]]]) + .. automethod:: find([spec=None[, fields=None[, skip=0[, limit=0[, timeout=True[, snapshot=False[, tailable=False[, sort=None[, max_scan=None[, as_class=None[, slave_okay=False[, await_data=False[, partial=False[, manipulate=True[, read_preference=ReadPreference.PRIMARY[, exhaust=False[, compile_re=True[, **kwargs]]]]]]]]]]]]]]]]]]) .. automethod:: find_one([spec_or_id=None[, *args[, **kwargs]]]) .. automethod:: count ..
automethod:: create_index diff --git a/pymongo/collection.py b/pymongo/collection.py index 466095bf5..ab87c4682 100644 --- a/pymongo/collection.py +++ b/pymongo/collection.py @@ -690,6 +690,9 @@ class Collection(common.BaseObject): the nearest member may accept reads. Default 15 milliseconds. **Ignored by mongos** and must be configured on the command line. See the localThreshold_ option for more information. + - `compile_re` (optional): if ``False``, don't attempt to compile + BSON regex objects into Python regexes. Return instances of + :class:`~bson.regex.Regex` instead. - `exhaust` (optional): If ``True`` create an "exhaust" cursor. MongoDB will stream batched results to the client without waiting for the client to request each batch, reducing latency. @@ -717,12 +720,15 @@ class Collection(common.BaseObject): 5. The `network_timeout` option is ignored when using the `exhaust` option. - .. note:: The `manipulate` parameter may default to False in - a future release. + .. note:: The `manipulate` and `compile_re` parameters may default to + False in future releases. .. note:: The `max_scan` parameter requires server version **>= 1.5.1** + .. versionadded:: 2.7 + The ``compile_re`` parameter. + .. versionadded:: 2.3 The `tag_sets` and `secondary_acceptable_latency_ms` parameters. diff --git a/pymongo/cursor.py b/pymongo/cursor.py index de999af06..7e145bc1b 100644 --- a/pymongo/cursor.py +++ b/pymongo/cursor.py @@ -69,8 +69,8 @@ class Cursor(object): await_data=False, partial=False, manipulate=True, read_preference=ReadPreference.PRIMARY, tag_sets=[{}], secondary_acceptable_latency_ms=None, - exhaust=False, _must_use_master=False, _uuid_subtype=None, - _first_batch=None, _cursor_id=None, + exhaust=False, compile_re=True, _must_use_master=False, + _uuid_subtype=None, _first_batch=None, _cursor_id=None, **kwargs): """Create a new cursor. 
@@ -152,6 +152,7 @@ class Cursor(object): self.__tag_sets = tag_sets self.__secondary_acceptable_latency_ms = secondary_acceptable_latency_ms self.__tz_aware = collection.database.connection.tz_aware + self.__compile_re = compile_re self.__must_use_master = _must_use_master self.__uuid_subtype = _uuid_subtype or collection.uuid_subtype @@ -225,8 +226,8 @@ class Cursor(object): "batch_size", "max_scan", "as_class", "slave_okay", "manipulate", "read_preference", "tag_sets", "secondary_acceptable_latency_ms", - "must_use_master", "uuid_subtype", "query_flags", - "kwargs") + "must_use_master", "uuid_subtype", "compile_re", + "query_flags", "kwargs") data = dict((k, v) for k, v in self.__dict__.iteritems() if k.startswith('_Cursor__') and k[9:] in values_to_clone) if deepcopy: @@ -667,6 +668,7 @@ class Cursor(object): r = database.command("count", self.__collection.name, allowable_errors=["ns missing"], uuid_subtype=self.__uuid_subtype, + compile_re=self.__compile_re, **command) if r.get("errmsg", "") == "ns missing": return 0 @@ -718,6 +720,7 @@ class Cursor(object): return database.command("distinct", self.__collection.name, uuid_subtype=self.__uuid_subtype, + compile_re=self.__compile_re, **options)["values"] def explain(self): @@ -829,7 +832,8 @@ class Cursor(object): response = helpers._unpack_response(response, self.__id, self.__as_class, self.__tz_aware, - self.__uuid_subtype) + self.__uuid_subtype, + self.__compile_re) except AutoReconnect: # Don't send kill cursors to another server after a "not master" # error. It's completely pointless. diff --git a/pymongo/database.py b/pymongo/database.py index ae26dee4c..5832f1d09 100644 --- a/pymongo/database.py +++ b/pymongo/database.py @@ -273,7 +273,7 @@ class Database(common.BaseObject): def command(self, command, value=1, check=True, allowable_errors=[], - uuid_subtype=OLD_UUID_SUBTYPE, **kwargs): + uuid_subtype=OLD_UUID_SUBTYPE, compile_re=True, **kwargs): """Issue a MongoDB command. 
Send command `command` to the database and return the @@ -318,6 +318,12 @@ class Database(common.BaseObject): in this list will be ignored by error-checking - `uuid_subtype` (optional): The BSON binary subtype to use for a UUID used in this command. + - `compile_re` (optional): if ``False``, don't attempt to compile + BSON regular expressions into Python regular expressions. Return + instances of :class:`~bson.regex.Regex` instead. Can avoid + :exc:`~bson.errors.InvalidBSON` errors when receiving + Python-incompatible regular expressions, for example from + ``currentOp`` - `read_preference`: The read preference for this connection. See :class:`~pymongo.read_preferences.ReadPreference` for available options. @@ -337,6 +343,8 @@ class Database(common.BaseObject): .. note:: ``command`` ignores the ``network_timeout`` parameter. + .. versionchanged:: 2.7 + Added ``compile_re`` option. .. versionchanged:: 2.3 Added `tag_sets` and `secondary_acceptable_latency_ms` options. .. versionchanged:: 2.2 @@ -390,6 +398,7 @@ class Database(common.BaseObject): extra_opts['secondary_acceptable_latency_ms'] = kwargs.pop( 'secondary_acceptable_latency_ms', self.secondary_acceptable_latency_ms) + extra_opts['compile_re'] = compile_re fields = kwargs.get('fields') if fields is not None and not isinstance(fields, dict): diff --git a/pymongo/helpers.py b/pymongo/helpers.py index 38d88b904..904c5913b 100644 --- a/pymongo/helpers.py +++ b/pymongo/helpers.py @@ -73,7 +73,8 @@ def _index_document(index_list): def _unpack_response(response, cursor_id=None, as_class=dict, - tz_aware=False, uuid_subtype=OLD_UUID_SUBTYPE): + tz_aware=False, uuid_subtype=OLD_UUID_SUBTYPE, + compile_re=True): """Unpack a response from the database. 
Check the response for errors and unpack, returning a dictionary @@ -108,7 +109,8 @@ def _unpack_response(response, cursor_id=None, as_class=dict, result["starting_from"] = struct.unpack("= 2.1.0") + + db = self.client.pymongo_test + db.test.drop() + db.test.insert({'r': re.compile('.*')}) + + result = db.test.aggregate([]) + self.assertTrue(isinstance(result['result'][0]['r'], RE_TYPE)) + result = db.test.aggregate([], compile_re=False) + self.assertTrue(isinstance(result['result'][0]['r'], Regex)) + def test_aggregation_cursor_validation(self): if not version.at_least(self.db.connection, (2, 5, 1)): raise SkipTest("Aggregation cursor requires MongoDB >= 2.5.1") @@ -2148,6 +2162,22 @@ class TestCollection(unittest.TestCase): self.assertEqual(2, c.find_one(manipulate=True)['foo']) c.remove({}) + def test_compile_re(self): + c = self.client.pymongo_test.test + c.drop() + c.insert({'r': re.compile('.*')}) + + # Test find_one with compile_re. + self.assertTrue(isinstance(c.find_one()['r'], RE_TYPE)) + self.assertTrue(isinstance(c.find_one(compile_re=False)['r'], Regex)) + + # Test find with compile_re. 
+ for doc in c.find(): + self.assertTrue(isinstance(doc['r'], RE_TYPE)) + + for doc in c.find(compile_re=False): + self.assertTrue(isinstance(doc['r'], Regex)) + if __name__ == "__main__": unittest.main() diff --git a/test/test_cursor.py b/test/test_cursor.py index f02b326c3..8e9901ccf 100644 --- a/test/test_cursor.py +++ b/test/test_cursor.py @@ -552,6 +552,7 @@ class TestCursor(unittest.TestCase): await_data=True, partial=True, manipulate=False, + compile_re=False, fields={'_id': False}).limit(2) cursor.add_option(128) @@ -565,6 +566,8 @@ class TestCursor(unittest.TestCase): cursor2._Cursor__slave_okay) self.assertEqual(cursor._Cursor__manipulate, cursor2._Cursor__manipulate) + self.assertEqual(cursor._Cursor__compile_re, + cursor2._Cursor__compile_re) self.assertEqual(cursor._Cursor__query_flags, cursor2._Cursor__query_flags) diff --git a/test/test_database.py b/test/test_database.py index b3fdf9bda..d69cc6738 100644 --- a/test/test_database.py +++ b/test/test_database.py @@ -16,6 +16,7 @@ import datetime import os +import re import sys import warnings @@ -25,9 +26,10 @@ import unittest from nose.plugins.skip import SkipTest from bson.code import Code +from bson.regex import Regex from bson.dbref import DBRef from bson.objectid import ObjectId -from bson.son import SON +from bson.son import SON, RE_TYPE from pymongo import (ALL, auth, OFF, @@ -303,6 +305,21 @@ class TestDatabase(unittest.TestCase): if not is_mongos(self.client): db.command('eval', 'sleep(100)', network_timeout=0.001) + def test_command_with_compile_re(self): + # Using 'aggregate' as our example command, since it's an easy way to + # retrieve a BSON regex from a collection using a command. 
+ if not version.at_least(self.client, (2, 1, 0)): + raise SkipTest('Need aggregation to test compile_re') + + db = self.client.pymongo_test + db.test.drop() + db.test.insert({'r': re.compile('.*')}) + + result = db.command('aggregate', 'test', pipeline=[]) + self.assertTrue(isinstance(result['result'][0]['r'], RE_TYPE)) + result = db.command('aggregate', 'test', pipeline=[], compile_re=False) + self.assertTrue(isinstance(result['result'][0]['r'], Regex)) + def test_last_status(self): db = self.client.pymongo_test diff --git a/test/test_json_util.py b/test/test_json_util.py index 6a21ba154..cc299e28b 100644 --- a/test/test_json_util.py +++ b/test/test_json_util.py @@ -32,6 +32,8 @@ from bson.dbref import DBRef from bson.max_key import MaxKey from bson.min_key import MinKey from bson.objectid import ObjectId +from bson.regex import Regex +from bson.son import RE_TYPE from bson.timestamp import Timestamp from bson.tz_util import utc @@ -74,6 +76,22 @@ class TestJsonUtil(unittest.TestCase): self.round_trip({"date": datetime.datetime(2009, 12, 9, 15, 49, 45, 191000, utc)}) + def test_regex_object_hook(self): + import json + + # Extended JSON format regular expression. 
+ pat = 'a*b' + json_re = '{"$regex": "%s", "$options": "u"}' % pat + loaded = json_util.object_hook(json.loads(json_re)) + self.assertTrue(isinstance(loaded, RE_TYPE)) + self.assertEqual(pat, loaded.pattern) + self.assertEqual(re.U, loaded.flags) + + loaded = json_util.object_hook(json.loads(json_re), compile_re=False) + self.assertTrue(isinstance(loaded, Regex)) + self.assertEqual(pat, loaded.pattern) + self.assertEqual(re.U, loaded.flags) + def test_regex(self): res = self.round_tripped({"r": re.compile("a*b", re.IGNORECASE)})["r"] self.assertEqual("a*b", res.pattern) @@ -95,6 +113,12 @@ class TestJsonUtil(unittest.TestCase): expected_flags = re.U self.assertEqual(expected_flags, res.flags) + self.assertEqual( + Regex('.*', 'ilm'), + json_util.loads( + '{"r": {"$regex": ".*", "$options": "ilm"}}', + compile_re=False)['r']) + def test_minkey(self): self.round_trip({"m": MinKey()})