Automatically split batch inserts PYTHON-414

With this change PyMongo will automatically
split large batch inserts into multiple messages
based on client.max_message_size (the largest
message mongod/s will accept).
This commit is contained in:
behackett 2013-08-13 10:53:02 -07:00
parent 55dd328df1
commit 3afd684224
8 changed files with 502 additions and 22 deletions

View File

@ -140,3 +140,7 @@ int buffer_get_position(buffer_t buffer) {
char* buffer_get_buffer(buffer_t buffer) {
return buffer->buffer;
}
void buffer_update_position(buffer_t buffer, buffer_position new_position) {
buffer->position = new_position;
}

View File

@ -51,5 +51,6 @@ int buffer_write_at_position(buffer_t buffer, buffer_position position, const ch
* since they break the abstraction. */
buffer_position buffer_get_position(buffer_t buffer);
char* buffer_get_buffer(buffer_t buffer);
void buffer_update_position(buffer_t buffer, buffer_position new_position);
#endif

View File

@ -131,10 +131,36 @@ static int add_last_error(PyObject* self, buffer_t buffer,
return 1;
}
static int init_insert_buffer(buffer_t buffer, int request_id, int options,
const char* coll_name, int coll_name_len) {
/* Save space for message length */
int length_location = buffer_save_space(buffer, 4);
if (length_location == -1) {
PyErr_NoMemory();
return length_location;
}
if (!buffer_write_bytes(buffer, (const char*)&request_id, 4) ||
!buffer_write_bytes(buffer,
"\x00\x00\x00\x00"
"\xd2\x07\x00\x00",
8) ||
!buffer_write_bytes(buffer, (const char*)&options, 4) ||
!buffer_write_bytes(buffer,
coll_name,
coll_name_len + 1)) {
return -1;
}
return length_location;
}
static PyObject* _cbson_insert_message(PyObject* self, PyObject* args) {
/* NOTE just using a random number as the request_id */
/* Note: As of PyMongo 2.6, this function is no longer used. It
* is being kept (with tests) for backwards compatibility with 3rd
* party libraries that may currently be using it, but will likely
* be removed in a future release. */
struct module_state *state = GETSTATE(self);
/* NOTE just using a random number as the request_id */
int request_id = rand();
char* collection_name = NULL;
int collection_name_length;
@ -172,23 +198,13 @@ static PyObject* _cbson_insert_message(PyObject* self, PyObject* args) {
return NULL;
}
// save space for message length
length_location = buffer_save_space(buffer, 4);
length_location = init_insert_buffer(buffer,
request_id,
options,
collection_name,
collection_name_length);
if (length_location == -1) {
PyMem_Free(collection_name);
PyErr_NoMemory();
return NULL;
}
if (!buffer_write_bytes(buffer, (const char*)&request_id, 4) ||
!buffer_write_bytes(buffer,
"\x00\x00\x00\x00"
"\xd2\x07\x00\x00",
8) ||
!buffer_write_bytes(buffer, (const char*)&options, 4) ||
!buffer_write_bytes(buffer,
collection_name,
collection_name_length + 1)) {
PyMem_Free(collection_name);
buffer_free(buffer);
return NULL;
}
@ -259,6 +275,14 @@ static PyObject* _cbson_insert_message(PyObject* self, PyObject* args) {
return result;
}
PyDoc_STRVAR(_cbson_insert_message_doc,
"Create an insert message to be sent to MongoDB\n\
\n\
Note: As of PyMongo 2.6, this function is no longer used. It\n\
is being kept (with tests) for backwards compatibility with 3rd\n\
party libraries that may currently be using it, but will likely\n\
be removed in a future release.");
static PyObject* _cbson_update_message(PyObject* self, PyObject* args) {
/* NOTE just using a random number as the request_id */
struct module_state *state = GETSTATE(self);
@ -512,15 +536,290 @@ static PyObject* _cbson_get_more_message(PyObject* self, PyObject* args) {
return result;
}
static PyObject* _cbson_do_batched_insert(PyObject* self, PyObject* args) {
struct module_state *state = GETSTATE(self);
/* NOTE just using a random number as the request_id */
int request_id = rand();
int options = 0, max_size = 0;
int length_location, message_length;
int collection_name_length;
char* collection_name = NULL;
PyObject* docs;
PyObject* doc;
PyObject* iterator;
PyObject* client;
PyObject* last_error_args;
PyObject* result;
unsigned char check_keys;
unsigned char safe;
unsigned char continue_on_error;
unsigned char uuid_subtype;
long max_bson_size;
long max_message_size;
buffer_t buffer;
PyObject *exc_type = NULL, *exc_value = NULL, *exc_trace = NULL;
if (!PyArg_ParseTuple(args, "et#ObbObbO",
"utf-8",
&collection_name,
&collection_name_length,
&docs, &check_keys, &safe,
&last_error_args,
&continue_on_error,
&uuid_subtype, &client)) {
return NULL;
}
if (continue_on_error) {
options += 1;
}
#if PY_MAJOR_VERSION >= 3
max_bson_size = PyLong_AsLong(
PyObject_GetAttrString(client, "max_bson_size"));
#else
max_bson_size = PyInt_AsLong(
PyObject_GetAttrString(client, "max_bson_size"));
#endif
if (max_bson_size == -1) {
PyMem_Free(collection_name);
return NULL;
}
#if PY_MAJOR_VERSION >= 3
max_message_size = PyLong_AsLong(
PyObject_GetAttrString(client, "max_message_size"));
#else
max_message_size = PyInt_AsLong(
PyObject_GetAttrString(client, "max_message_size"));
#endif
if (max_message_size == -1) {
PyMem_Free(collection_name);
return NULL;
}
buffer = buffer_new();
if (!buffer) {
PyErr_NoMemory();
PyMem_Free(collection_name);
return NULL;
}
length_location = init_insert_buffer(buffer,
request_id,
options,
collection_name,
collection_name_length);
if (length_location == -1) {
goto insertfail;
}
iterator = PyObject_GetIter(docs);
if (iterator == NULL) {
PyObject* InvalidOperation = _error("InvalidOperation");
if (InvalidOperation) {
PyErr_SetString(InvalidOperation, "input is not iterable");
Py_DECREF(InvalidOperation);
}
goto insertfail;
}
while ((doc = PyIter_Next(iterator)) != NULL) {
int before = buffer_get_position(buffer);
int cur_size;
if (!write_dict(state->_cbson, buffer, doc, check_keys, uuid_subtype, 1)) {
Py_DECREF(doc);
goto iterfail;
}
Py_DECREF(doc);
cur_size = buffer_get_position(buffer) - before;
max_size = (cur_size > max_size) ? cur_size : max_size;
if (cur_size > max_bson_size) {
PyObject* InvalidDocument = _error("InvalidDocument");
if (InvalidDocument) {
const char* msg = ("BSON document too large (%ld bytes)"
" - the connected server supports"
" BSON document sizes up to %ld bytes.");
#if PY_MAJOR_VERSION >= 3
PyObject* error = PyUnicode_FromFormat(msg,
cur_size, max_bson_size);
#else
PyObject* error = PyString_FromFormat(msg,
cur_size, max_bson_size);
#endif
if (error) {
PyErr_SetObject(InvalidDocument, error);
Py_DECREF(error);
}
Py_DECREF(InvalidDocument);
}
goto iterfail;
}
/* We have enough data, send this batch. */
if (buffer_get_position(buffer) > max_message_size) {
int new_request_id = rand();
int message_start;
PyObject* send_gle = Py_False;
buffer_t new_buffer = buffer_new();
if (!new_buffer) {
PyErr_NoMemory();
goto iterfail;
}
message_start = init_insert_buffer(new_buffer,
new_request_id,
options,
collection_name,
collection_name_length);
if (message_start == -1) {
buffer_free(new_buffer);
goto iterfail;
}
/* Copy the overflow encoded document into the new buffer. */
if (!buffer_write_bytes(new_buffer,
(const char*)buffer_get_buffer(buffer) + before, cur_size)) {
buffer_free(new_buffer);
goto iterfail;
}
/* Roll back to the beginning of this document. */
buffer_update_position(buffer, before);
message_length = buffer_get_position(buffer) - length_location;
memcpy(buffer_get_buffer(buffer) + length_location, &message_length, 4);
/* If we are doing unacknowledged writes *and* continue_on_error
* is True it's pointless (and slower) to send GLE. */
if (safe || !continue_on_error) {
send_gle = Py_True;
if (!add_last_error(self, buffer, request_id, collection_name,
collection_name_length, last_error_args)) {
buffer_free(new_buffer);
goto iterfail;
}
}
/* Objectify buffer */
result = Py_BuildValue("i" BYTES_FORMAT_STRING, request_id,
buffer_get_buffer(buffer),
buffer_get_position(buffer));
buffer_free(buffer);
buffer = new_buffer;
request_id = new_request_id;
length_location = message_start;
if (!PyObject_CallMethod(client,
"_send_message", "NO", result, send_gle)) {
PyObject *etype = NULL, *evalue = NULL, *etrace = NULL;
PyErr_Fetch(&etype, &evalue, &etrace);
PyObject* OperationFailure = _error("OperationFailure");
if (OperationFailure) {
if (PyErr_GivenExceptionMatches(etype, OperationFailure)) {
if (!safe || continue_on_error) {
Py_DECREF(OperationFailure);
if (!safe) {
/* We're doing unacknowledged writes and
* continue_on_error is False. Just return. */
Py_DECREF(etype);
Py_DECREF(evalue);
Py_DECREF(etrace);
Py_DECREF(iterator);
buffer_free(buffer);
PyMem_Free(collection_name);
Py_RETURN_NONE;
}
/* continue_on_error is True, store the error
* details to re-raise after the final batch */
Py_XDECREF(exc_type);
Py_XDECREF(exc_value);
Py_XDECREF(exc_trace);
exc_type = etype;
exc_value = evalue;
exc_trace = etrace;
continue;
}
}
Py_DECREF(OperationFailure);
}
/* This isn't OperationFailure, we couldn't
* import OperationFailure, or we are doing
* acknowledged writes. Re-raise immediately. */
PyErr_Restore(etype, evalue, etrace);
goto iterfail;
}
}
}
Py_DECREF(iterator);
if (PyErr_Occurred()) {
goto insertfail;
}
if (!max_size) {
PyObject* InvalidOperation = _error("InvalidOperation");
if (InvalidOperation) {
PyErr_SetString(InvalidOperation, "cannot do an empty bulk insert");
Py_DECREF(InvalidOperation);
}
goto insertfail;
}
message_length = buffer_get_position(buffer) - length_location;
memcpy(buffer_get_buffer(buffer) + length_location, &message_length, 4);
if (safe) {
if (!add_last_error(self, buffer, request_id, collection_name,
collection_name_length, last_error_args)) {
goto insertfail;
}
}
PyMem_Free(collection_name);
/* objectify buffer */
result = Py_BuildValue("i" BYTES_FORMAT_STRING, request_id,
buffer_get_buffer(buffer),
buffer_get_position(buffer));
buffer_free(buffer);
/* Send the last (or only) batch */
if (!PyObject_CallMethod(client, "_send_message", "NN",
result, PyBool_FromLong((long)safe))) {
Py_XDECREF(exc_type);
Py_XDECREF(exc_value);
Py_XDECREF(exc_trace);
return NULL;
}
if (exc_type) {
/* Re-raise any previously stored exception
* due to continue_on_error being True */
PyErr_Restore(exc_type, exc_value, exc_trace);
return NULL;
}
Py_RETURN_NONE;
iterfail:
Py_DECREF(iterator);
insertfail:
Py_XDECREF(exc_type);
Py_XDECREF(exc_value);
Py_XDECREF(exc_trace);
buffer_free(buffer);
PyMem_Free(collection_name);
return NULL;
}
static PyMethodDef _CMessageMethods[] = {
{"_insert_message", _cbson_insert_message, METH_VARARGS,
"create an insert message to be sent to MongoDB"},
_cbson_insert_message_doc},
{"_update_message", _cbson_update_message, METH_VARARGS,
"create an update message to be sent to MongoDB"},
{"_query_message", _cbson_query_message, METH_VARARGS,
"create a query message to be sent to MongoDB"},
{"_get_more_message", _cbson_get_more_message, METH_VARARGS,
"create a get more message to be sent to MongoDB"},
{"_do_batched_insert", _cbson_do_batched_insert, METH_VARARGS,
"insert a batch of documents, splitting the batch as needed"},
{NULL, NULL, 0, NULL}
};

View File

@ -351,10 +351,10 @@ class Collection(common.BaseObject):
docs = [self.__database._fix_incoming(doc, self) for doc in docs]
safe, options = self._get_write_mode(safe, **kwargs)
self.__database.connection._send_message(
message.insert(self.__full_name, docs,
check_keys, safe, options,
continue_on_error, self.__uuid_subtype), safe)
message._do_batched_insert(self.__full_name, docs,
check_keys, safe, options,
continue_on_error, self.__uuid_subtype,
self.database.connection)
ids = [doc.get("_id", None) for doc in docs]
if return_one:

View File

@ -34,7 +34,7 @@ try:
_use_c = True
except ImportError:
_use_c = False
from pymongo.errors import InvalidOperation
from pymongo.errors import InvalidDocument, InvalidOperation, OperationFailure
__ZERO = b("\x00\x00\x00\x00")
@ -70,6 +70,12 @@ def __pack_message(operation, data):
def insert(collection_name, docs, check_keys,
safe, last_error_args, continue_on_error, uuid_subtype):
"""Get an **insert** message.
.. note:: As of PyMongo 2.6, this function is no longer used. It
is being kept (with tests) for backwards compatibility with 3rd
party libraries that may currently be using it, but will likely
be removed in a future release.
"""
options = 0
if continue_on_error:
@ -181,3 +187,68 @@ def kill_cursors(cursor_ids):
for cursor_id in cursor_ids:
data += struct.pack("<q", cursor_id)
return __pack_message(2007, data)
def _do_batched_insert(collection_name, docs, check_keys,
safe, last_error_args, continue_on_error, uuid_subtype, client):
"""Insert `docs` using multiple batches.
"""
def _insert_message(insert_message, send_safe):
"""Build the insert message with header and GLE.
"""
request_id, final_message = __pack_message(2002, insert_message)
if send_safe:
request_id, error_message, _ = __last_error(collection_name,
last_error_args)
final_message += error_message
return request_id, final_message
if not docs:
raise InvalidOperation("cannot do an empty bulk insert")
last_error = None
begin = struct.pack("<i", int(continue_on_error))
begin += bson._make_c_string(collection_name)
message_length = len(begin)
data = [begin]
for doc in docs:
encoded = bson.BSON.encode(doc, check_keys, uuid_subtype)
encoded_length = len(encoded)
if encoded_length > client.max_bson_size:
raise InvalidDocument("BSON document too large (%d bytes)"
" - the connected server supports"
" BSON document sizes up to %d"
" bytes." %
(encoded_length, client.max_bson_size))
message_length += encoded_length
if message_length < client.max_message_size:
data.append(encoded)
continue
# We have enough data, send this message.
send_safe = safe or not continue_on_error
try:
client._send_message(_insert_message(EMPTY.join(data),
send_safe), send_safe)
# Exception type could be OperationFailure or a subtype
# (e.g. DuplicateKeyError)
except OperationFailure, exc:
# Like it says, continue on error...
if continue_on_error:
# Store exception details to re-raise after the final batch.
last_error = exc
# With unacknowledged writes just return at the first error.
elif not safe:
return
# With acknowledged writes raise immediately.
else:
raise
message_length = len(begin) + encoded_length
data = [begin, encoded]
client._send_message(_insert_message(EMPTY.join(data), safe), safe)
# Re-raise any exception stored due to continue_on_error
if last_error is not None:
raise last_error
if _use_c:
_do_batched_insert = _cmessage._do_batched_insert

View File

@ -580,6 +580,15 @@ class MongoClient(common.BaseObject):
"""
return self.__max_bson_size
@property
def max_message_size(self):
"""Return the maximum message size the connected server
accepts in bytes.
.. versionadded:: 2.6
"""
return self.__max_message_size
def __simple_command(self, sock_info, dbname, spec):
"""Send a command to the server.
"""
@ -622,6 +631,10 @@ class MongoClient(common.BaseObject):
if "maxBsonObjectSize" in response:
self.__max_bson_size = response["maxBsonObjectSize"]
if "maxMessageSizeBytes" in response:
self.__max_message_size = response["maxMessageSizeBytes"]
else:
self.__max_message_size = 2 * self.max_bson_size
# Replica Set?
if not self.__direct:

View File

@ -413,6 +413,8 @@ class Member(object):
self.tags = ismaster_response.get('tags', {})
self.max_bson_size = ismaster_response.get(
'maxBsonObjectSize', MAX_BSON_SIZE)
self.max_message_size = ismaster_response.get(
'maxMessageSizeBytes', 2 * self.max_bson_size)
def clone_with(self, ismaster_response, ping_time_sample):
"""Get a clone updated with ismaster response and a single ping time.
@ -985,6 +987,16 @@ class MongoReplicaSetClient(common.BaseObject):
return rs_state.primary_member.max_bson_size
return 0
@property
def max_message_size(self):
"""Returns the maximum message size the connected primary
accepts in bytes. Returns 0 if no primary is available.
"""
rs_state = self.__rs_state
if rs_state.primary_member:
return rs_state.primary_member.max_message_size
return 0
@property
def auto_start_request(self):
"""Is auto_start_request enabled?

View File

@ -36,6 +36,7 @@ from bson.py3compat import b
from bson.son import SON
from pymongo import (ASCENDING, DESCENDING, GEO2D,
GEOHAYSTACK, GEOSPHERE, HASHED)
from pymongo import message as message_module
from pymongo.collection import Collection
from pymongo.cursor import Cursor
from pymongo.son_manipulator import SONManipulator
@ -1674,6 +1675,85 @@ class TestCollection(unittest.TestCase):
{"bar": "x"}, {"bar": "x" * (max_size - 14)})
self.db.test.update({"bar": "x"}, {"bar": "x" * (max_size - 15)})
def test_insert_large_batch(self):
max_bson_size = self.db.connection.max_bson_size
big_string = 'x' * (max_bson_size - 100)
self.db.test.drop()
self.assertEqual(0, self.db.test.count())
# Batch insert that requires 2 batches
batch = [{'x': big_string}, {'x': big_string},
{'x': big_string}, {'x': big_string}]
self.assertTrue(self.db.test.insert(batch, w=1))
self.assertEqual(4, self.db.test.count())
batch[1]['_id'] = batch[0]['_id']
# Test that inserts fail after first error, acknowledged.
self.db.test.drop()
self.assertRaises(DuplicateKeyError, self.db.test.insert, batch, w=1)
self.assertEqual(1, self.db.test.count())
# Test that inserts fail after first error, unacknowledged.
self.db.test.drop()
self.assertTrue(self.db.test.insert(batch, w=0))
self.assertEqual(1, self.db.test.count())
# 2 batches, 2 errors, acknowledged, continue on error
self.db.test.drop()
batch[3]['_id'] = batch[2]['_id']
try:
self.db.test.insert(batch, continue_on_error=True, w=1)
except OperationFailure, e:
# Make sure we report the last error, not the first.
self.assertTrue(str(batch[2]['_id']) in str(e))
else:
self.fail('OpreationFailure not raised.')
# Only the first and third documents should be inserted.
self.assertEqual(2, self.db.test.count())
# 2 batches, 2 errors, unacknowledged, continue on error
self.db.test.drop()
self.assertTrue(self.db.test.insert(batch, continue_on_error=True, w=0))
# Only the first and third documents should be inserted.
self.assertEqual(2, self.db.test.count())
# Starting in PyMongo 2.6 we no longer use message.insert for inserts, but
# message.insert is part of the public API. Do minimal testing here; there
# isn't really a better place.
def test_insert_message_creation(self):
send = self.db.connection._send_message
name = "%s.%s" % (self.db.name, "test")
def do_insert(args):
send(message_module.insert(*args), args[3])
self.db.drop_collection("test")
self.db.test.insert({'_id': 0}, w=1)
self.assertTrue(1, self.db.test.count())
simple_args = (name, [{'_id': 0}], True, False, {}, False, 3)
gle_args = (name, [{'_id': 0}], True, True, {'w': 1}, False, 3)
coe_args = (name, [{'_id': 0}, {'_id': 1}],
True, True, {'w': 1}, True, 3)
self.assertEqual(None, do_insert(simple_args))
self.assertTrue(1, self.db.test.count())
self.assertRaises(DuplicateKeyError, do_insert, gle_args)
self.assertTrue(1, self.db.test.count())
self.assertRaises(DuplicateKeyError, do_insert, coe_args)
self.assertTrue(2, self.db.test.count())
if have_uuid:
doc = {'_id': 2, 'uuid': uuid.uuid4()}
uuid_sub_args = (name, [doc],
True, True, {'w': 1}, True, 6)
do_insert(uuid_sub_args)
coll = self.db.test
self.assertNotEqual(doc, coll.find_one({'_id': 2}))
coll.uuid_subtype = 6
self.assertEqual(doc, coll.find_one({'_id': 2}))
def test_map_reduce(self):
if not version.at_least(self.db.connection, (1, 1, 1)):
raise SkipTest("mapReduce command requires MongoDB >= 1.1.1")