SERVER-107819 Concurrent createIndex+collMod can leave behind inconsistent collection options (#42830)
GitOrigin-RevId: 28a8bc1dbaabe13692042f2bb3e628da2207d0c6
This commit is contained in:
parent
989e1357f1
commit
30bb760be0
@ -406,6 +406,8 @@ last-continuous:
|
||||
ticket: SERVER-99618
|
||||
- test_file: jstests/concurrency/fsm_workloads/ddl/collMod/collMod.js
|
||||
ticket: SERVER-99618
|
||||
- test_file: jstests/concurrency/fsm_workloads/ddl/collMod/collMod_options_with_index_ops.js
|
||||
ticket: SERVER-107819
|
||||
- test_file: jstests/sharding/check_metadata_consistency.js
|
||||
ticket: SERVER-98664
|
||||
- test_file: jstests/core/timeseries/query/bucket_unpacking_with_limit.js
|
||||
@ -943,6 +945,8 @@ last-lts:
|
||||
ticket: SERVER-99618
|
||||
- test_file: jstests/concurrency/fsm_workloads/ddl/collMod/collMod.js
|
||||
ticket: SERVER-99618
|
||||
- test_file: jstests/concurrency/fsm_workloads/ddl/collMod/collMod_options_with_index_ops.js
|
||||
ticket: SERVER-107819
|
||||
- test_file: jstests/sharding/check_metadata_consistency.js
|
||||
ticket: SERVER-98664
|
||||
- test_file: jstests/core/timeseries/query/bucket_unpacking_with_limit.js
|
||||
|
||||
@ -0,0 +1,59 @@
|
||||
/**
|
||||
* Repeatedly run collMod on the collection options along with creating and dropping indexes of a
|
||||
* collection.
|
||||
*
|
||||
* @tags: [
|
||||
* requires_sharding,
|
||||
* ]
|
||||
*/
|
||||
|
||||
import {
|
||||
uniformDistTransitions
|
||||
} from "jstests/concurrency/fsm_workload_helpers/state_transition_utils.js";
|
||||
|
||||
export const $config = (function() {
|
||||
const states = {
|
||||
init: function(db, collName) {
|
||||
const data = [];
|
||||
for (let i = 0; i < 500; i++) {
|
||||
data.push({x: Random.randInt(100)});
|
||||
}
|
||||
db[collName].insertMany(data);
|
||||
},
|
||||
collMod: function(db, collName) {
|
||||
const collModOp =
|
||||
Random.randInt(2) == 0 ? {validator: {y: {$exists: false}}} : {validator: {}};
|
||||
assert.commandWorkedOrFailedWithCode(db.runCommand({collMod: collName, ...collModOp}), [
|
||||
// Conflicted with another thread's collMod command
|
||||
ErrorCodes.ConflictingOperationInProgress,
|
||||
]);
|
||||
},
|
||||
createIndex: function(db, collName) {
|
||||
assert.commandWorkedOrFailedWithCode(db[collName].createIndex({x: 1}), [
|
||||
ErrorCodes.IndexBuildAlreadyInProgress,
|
||||
// The index was dropped while it was being built
|
||||
// TODO SERVER-75675: Remove once createIndex serializes with dropIndex
|
||||
ErrorCodes.IndexBuildAborted,
|
||||
]);
|
||||
},
|
||||
dropIndex: function(db, collName) {
|
||||
assert.commandWorkedOrFailedWithCode(db[collName].dropIndex({x: 1}), [
|
||||
// TODO SERVER-107420: Remove IndexNotFound from acceptable dropIndexes errors
|
||||
// once 9.0 becomes LTS
|
||||
ErrorCodes.IndexNotFound,
|
||||
]);
|
||||
},
|
||||
checkMetadataConsistency: function(db, collName) {
|
||||
const inconsistencies = db[collName].checkMetadataConsistency().toArray();
|
||||
assert.eq(0, inconsistencies.length, tojson(inconsistencies));
|
||||
},
|
||||
};
|
||||
|
||||
return {
|
||||
threadCount: 5,
|
||||
iterations: 200,
|
||||
states: states,
|
||||
startState: "init",
|
||||
transitions: uniformDistTransitions(states),
|
||||
};
|
||||
})();
|
||||
116
jstests/noPassthrough/ddl/coll_mod_concurrent_index_build.js
Normal file
116
jstests/noPassthrough/ddl/coll_mod_concurrent_index_build.js
Normal file
@ -0,0 +1,116 @@
|
||||
/**
|
||||
* Regression test for SERVER-107819. Tests running a collMod operation while an index build is
|
||||
* in progress in a non-DB-primary shard.
|
||||
*/
|
||||
import {getTimeseriesCollForDDLOps} from "jstests/core/timeseries/libs/viewless_timeseries_util.js";
|
||||
import {Thread} from "jstests/libs/parallelTester.js";
|
||||
import {ShardingTest} from "jstests/libs/shardingtest.js";
|
||||
import {IndexBuildTest} from "jstests/noPassthrough/libs/index_builds/index_build.js";
|
||||
import {CreateShardedCollectionUtil} from "jstests/sharding/libs/create_sharded_collection_util.js";
|
||||
|
||||
const st = new ShardingTest({shards: 2, config: 1});
|
||||
const db = st.s.getDB("test");
|
||||
assert.commandWorked(
|
||||
st.s.adminCommand({enableSharding: db.getName(), primaryShard: st.shard0.shardName}));
|
||||
|
||||
function runCollModWithConcurrentIndexBuild(coll, collModOptions, {permitIndexBuildAbort} = {}) {
|
||||
// Force a two-phase index build to hang on the non DB primary shard.
|
||||
const shardConn = st.rs1.getPrimary();
|
||||
IndexBuildTest.pauseIndexBuilds(shardConn);
|
||||
const awaitIndexBuild = IndexBuildTest.startIndexBuild(
|
||||
st.s,
|
||||
coll.getFullName(),
|
||||
{a: 1},
|
||||
{} /* options */,
|
||||
permitIndexBuildAbort ? [ErrorCodes.IndexBuildAborted] : [],
|
||||
);
|
||||
IndexBuildTest.waitForIndexBuildToScanCollection(
|
||||
shardConn.getDB(db.getName()), coll.getName(), "a_1");
|
||||
|
||||
// Use collMod to change the granularity on the collection.
|
||||
const collModThread = new Thread(
|
||||
function(host, dbName, collName, collModOptions) {
|
||||
const db = new Mongo(host).getDB(dbName);
|
||||
assert.commandWorked(db.runCommand({collMod: collName, ...collModOptions}));
|
||||
},
|
||||
db.getMongo().host,
|
||||
db.getName(),
|
||||
coll.getName(),
|
||||
collModOptions,
|
||||
);
|
||||
collModThread.start();
|
||||
|
||||
// Wait some time for the collMod to likely run into the index build.
|
||||
sleep(1000);
|
||||
|
||||
// Let everything finish and assert collection options are still consistent.
|
||||
IndexBuildTest.resumeIndexBuilds(shardConn);
|
||||
|
||||
collModThread.join();
|
||||
awaitIndexBuild();
|
||||
|
||||
const inconsistencies = coll.getDB().checkMetadataConsistency().toArray();
|
||||
assert.eq(0, inconsistencies.length, tojson(inconsistencies));
|
||||
}
|
||||
|
||||
{
|
||||
// Regular unsharded collection outside DB primary.
|
||||
const coll = db.getCollection("unsharded");
|
||||
coll.insertOne({x: 123});
|
||||
assert.commandWorked(
|
||||
st.s.adminCommand({moveCollection: coll.getFullName(), toShard: st.shard1.shardName}));
|
||||
runCollModWithConcurrentIndexBuild(coll, {validator: {x: 123}});
|
||||
}
|
||||
|
||||
{
|
||||
// Regular sharded collection, data on both shards.
|
||||
const coll = db.getCollection("sharded");
|
||||
CreateShardedCollectionUtil.shardCollectionWithChunks(coll, {x: 1}, [
|
||||
{min: {x: MinKey}, max: {x: 0}, shard: st.shard1.shardName},
|
||||
{min: {x: 0}, max: {x: MaxKey}, shard: st.shard1.shardName},
|
||||
]);
|
||||
coll.insertMany([{x: -321}, {x: 123}]);
|
||||
runCollModWithConcurrentIndexBuild(coll, {validator: {x: 123}});
|
||||
}
|
||||
|
||||
{
|
||||
// Timeseries collection sharded on meta, data only on non-primary shard.
|
||||
const coll = db.getCollection("ts_sharded_on_meta");
|
||||
CreateShardedCollectionUtil.shardCollectionWithChunks(
|
||||
coll,
|
||||
{m: 1},
|
||||
[{min: {meta: MinKey}, max: {meta: MaxKey}, shard: st.shard1.shardName}],
|
||||
{timeseries: {timeField: "t", metaField: "m"}},
|
||||
);
|
||||
coll.insertOne({t: ISODate(), m: 123, temp: 42});
|
||||
// Since timeseries granularity changes block CRUD operations, the index build may be aborted to
|
||||
// unblock them.
|
||||
runCollModWithConcurrentIndexBuild(
|
||||
coll, {timeseries: {granularity: "minutes"}}, {permitIndexBuildAbort: true});
|
||||
}
|
||||
|
||||
{
|
||||
// Timeseries collection sharded on time, data only on non-primary shard.
|
||||
const coll = db.getCollection("ts_sharded_on_time");
|
||||
assert.commandWorked(
|
||||
st.s.adminCommand({
|
||||
shardCollection: coll.getFullName(),
|
||||
key: {t: 1},
|
||||
timeseries: {timeField: "t"},
|
||||
}),
|
||||
);
|
||||
assert.commandWorked(
|
||||
st.s.adminCommand({
|
||||
moveChunk: getTimeseriesCollForDDLOps(db, coll).getFullName(),
|
||||
find: {"control.min.t": 0},
|
||||
to: st.shard1.shardName,
|
||||
}),
|
||||
);
|
||||
coll.insertOne({t: ISODate(), temp: 42});
|
||||
// Since timeseries granularity changes block CRUD operations, the index build may be aborted to
|
||||
// unblock them.
|
||||
runCollModWithConcurrentIndexBuild(
|
||||
coll, {timeseries: {granularity: "minutes"}}, {permitIndexBuildAbort: true});
|
||||
}
|
||||
|
||||
st.stop();
|
||||
@ -1,6 +1,6 @@
|
||||
// Helper functions for testing index builds.
|
||||
|
||||
import {getTimeseriesCollForDDLOps} from "jstests/core/timeseries/libs/viewless_timeseries_util.js";
|
||||
import {getTimeseriesBucketsColl} from "jstests/core/timeseries/libs/viewless_timeseries_util.js";
|
||||
import {configureFailPoint} from "jstests/libs/fail_point_util.js";
|
||||
import {getCollectionNameFromFullNamespace} from "jstests/libs/namespace_utils.js";
|
||||
import {funWithArgs} from "jstests/libs/parallel_shell_helpers.js";
|
||||
@ -1123,9 +1123,8 @@ export const ResumableIndexBuildTest = class {
|
||||
* TODO(SERVER-101609): Remove this function and use `coll` directly
|
||||
*/
|
||||
function getTargetCollForIndexBuilds(coll) {
|
||||
if (coll.getMetadata()?.type === 'timeseries') {
|
||||
return getTimeseriesCollForDDLOps(coll.getDB(), coll);
|
||||
}
|
||||
|
||||
return coll;
|
||||
// For sharded legacy timeseries collections, the `coll` view only exists on the primary shard,
|
||||
// so we won't find it when directly targeting a shard. Check for `system.buckets.coll` instead.
|
||||
const bucketsColl = getTimeseriesBucketsColl(coll);
|
||||
return bucketsColl.exists() ? bucketsColl : coll;
|
||||
}
|
||||
|
||||
@ -45,6 +45,7 @@
|
||||
#include "mongo/db/concurrency/lock_manager_defs.h"
|
||||
#include "mongo/db/database_name.h"
|
||||
#include "mongo/db/dbdirectclient.h"
|
||||
#include "mongo/db/index_builds/index_builds_coordinator.h"
|
||||
#include "mongo/db/namespace_string.h"
|
||||
#include "mongo/db/operation_context.h"
|
||||
#include "mongo/db/s/collection_sharding_runtime.h"
|
||||
@ -53,6 +54,7 @@
|
||||
#include "mongo/db/s/sharding_recovery_service.h"
|
||||
#include "mongo/db/s/sharding_state.h"
|
||||
#include "mongo/db/service_context.h"
|
||||
#include "mongo/db/shard_role.h"
|
||||
#include "mongo/db/timeseries/catalog_helper.h"
|
||||
#include "mongo/db/timeseries/timeseries_collmod.h"
|
||||
#include "mongo/db/transaction/transaction_participant.h"
|
||||
@ -67,6 +69,7 @@
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <tuple>
|
||||
|
||||
#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kSharding
|
||||
|
||||
@ -105,6 +108,58 @@ void releaseCriticalSectionInEmptySession(OperationContext* opCtx,
|
||||
}
|
||||
}
|
||||
|
||||
// If collMod command fails with BackgroundOperationInProgressForNamespace (index build in progress,
|
||||
// non retriable error), transforms it to an error that the collMod coordinator will retry.
|
||||
// This prevents it from failing in a midway state (SERVER-107819).
|
||||
// TODO SERVER-75675: Remove this once collMod serializes with index builds.
|
||||
template <typename T>
|
||||
auto retryOnBackgroundOperationInProgressForNamespace(OperationContext* opCtx,
|
||||
const NamespaceString& ns,
|
||||
bool abortIndexBuilds,
|
||||
T&& func) {
|
||||
if (abortIndexBuilds) {
|
||||
try {
|
||||
return func();
|
||||
} catch (const ExceptionFor<ErrorCodes::BackgroundOperationInProgressForNamespace>& ex) {
|
||||
LOGV2(10781900,
|
||||
"collMod DDL participant failed due to a background operation. "
|
||||
"Aborting in-progress index builds.",
|
||||
"ex"_attr = redact(ex));
|
||||
|
||||
// TODO SERVER-105548 switch back to acquireCollection once 9.0 becomes last LTS
|
||||
auto [translatedNs, uuid] = [&]() {
|
||||
auto [collAcq, _] = timeseries::acquireCollectionWithBucketsLookup(
|
||||
opCtx,
|
||||
CollectionAcquisitionRequest::fromOpCtx(
|
||||
opCtx, ns, AcquisitionPrerequisites::OperationType::kRead),
|
||||
LockMode::MODE_IS);
|
||||
tassert(10768102,
|
||||
"Collection not found in collMod after index build error",
|
||||
collAcq.exists());
|
||||
return std::make_tuple(collAcq.nss(), collAcq.uuid());
|
||||
}();
|
||||
|
||||
IndexBuildsCoordinator::get(opCtx)->abortCollectionIndexBuilds(
|
||||
opCtx, translatedNs, uuid, "ShardSvrCollModParticipantCommand");
|
||||
}
|
||||
|
||||
// Fall-through and immediately retry. If it fails again due to a newly started index build,
|
||||
// we will still bubble the error up to the coordinator, which will retry after a backoff.
|
||||
}
|
||||
|
||||
try {
|
||||
return func();
|
||||
} catch (const ExceptionFor<ErrorCodes::BackgroundOperationInProgressForNamespace>& ex) {
|
||||
// We can not wait for index builds here, since we have a session checked out.
|
||||
// Bubble the error up to the DDL coordinator so it retries later, with a backoff.
|
||||
LOGV2(10781901,
|
||||
"collMod DDL participant failed due to a background operation. "
|
||||
"Re-throwing as a retriable error for the DDL coordinator to retry.",
|
||||
"ex"_attr = redact(ex));
|
||||
uasserted(ErrorCodes::LockBusy, "collMod failed due to a background operation");
|
||||
}
|
||||
}
|
||||
|
||||
class ShardSvrCollModParticipantCommand final
|
||||
: public TypedCommand<ShardSvrCollModParticipantCommand> {
|
||||
public:
|
||||
@ -189,10 +244,16 @@ public:
|
||||
// This flag is set from the collMod coordinator. We do not allow view definition change
|
||||
// on non-primary shards since it's not in the view catalog.
|
||||
auto performViewChange = request().getPerformViewChange();
|
||||
uassertStatusOK(timeseries::processCollModCommandWithTimeSeriesTranslation(
|
||||
opCtx, ns(), cmd, performViewChange, &builder));
|
||||
auto collmodReply =
|
||||
CollModReply::parse(IDLParserContext("CollModReply"), builder.obj());
|
||||
// If this collMod required blocking CRUD operations, prefer aborting the index builds
|
||||
// in progress in order to not delay unblocking of CRUD operations in other shards.
|
||||
// Otherwise, prefer waiting for the index builds to complete.
|
||||
auto abortIndexBuilds = request().getNeedsUnblock();
|
||||
auto collmodReply = retryOnBackgroundOperationInProgressForNamespace(
|
||||
opCtx, ns(), abortIndexBuilds, [&]() {
|
||||
uassertStatusOK(timeseries::processCollModCommandWithTimeSeriesTranslation(
|
||||
opCtx, ns(), cmd, performViewChange, &builder));
|
||||
return CollModReply::parse(IDLParserContext("CollModReply"), builder.obj());
|
||||
});
|
||||
|
||||
// Since no write that generated a retryable write oplog entry with this sessionId
|
||||
// and txnNumber happened, we need to make a dummy write so that the session gets
|
||||
|
||||
Loading…
Reference in New Issue
Block a user