diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml index a976569e716..1d0bf8a8b67 100644 --- a/etc/backports_required_for_multiversion_tests.yml +++ b/etc/backports_required_for_multiversion_tests.yml @@ -406,6 +406,8 @@ last-continuous: ticket: SERVER-99618 - test_file: jstests/concurrency/fsm_workloads/ddl/collMod/collMod.js ticket: SERVER-99618 + - test_file: jstests/concurrency/fsm_workloads/ddl/collMod/collMod_options_with_index_ops.js + ticket: SERVER-107819 - test_file: jstests/sharding/check_metadata_consistency.js ticket: SERVER-98664 - test_file: jstests/core/timeseries/query/bucket_unpacking_with_limit.js @@ -943,6 +945,8 @@ last-lts: ticket: SERVER-99618 - test_file: jstests/concurrency/fsm_workloads/ddl/collMod/collMod.js ticket: SERVER-99618 + - test_file: jstests/concurrency/fsm_workloads/ddl/collMod/collMod_options_with_index_ops.js + ticket: SERVER-107819 - test_file: jstests/sharding/check_metadata_consistency.js ticket: SERVER-98664 - test_file: jstests/core/timeseries/query/bucket_unpacking_with_limit.js diff --git a/jstests/concurrency/fsm_workloads/ddl/collMod/collMod_options_with_index_ops.js b/jstests/concurrency/fsm_workloads/ddl/collMod/collMod_options_with_index_ops.js new file mode 100644 index 00000000000..98c4f59789d --- /dev/null +++ b/jstests/concurrency/fsm_workloads/ddl/collMod/collMod_options_with_index_ops.js @@ -0,0 +1,59 @@ +/** + * Repeatedly run collMod on the collection options along with creating and dropping indexes of a + * collection. + * + * @tags: [ + * requires_sharding, + * ] + */ + +import { + uniformDistTransitions +} from "jstests/concurrency/fsm_workload_helpers/state_transition_utils.js"; + +export const $config = (function() { + const states = { + init: function(db, collName) { + const data = []; + for (let i = 0; i < 500; i++) { + data.push({x: Random.randInt(100)}); + } + db[collName].insertMany(data); + }, + collMod: function(db, collName) { + const collModOp = + Random.randInt(2) == 0 ? {validator: {y: {$exists: false}}} : {validator: {}}; + assert.commandWorkedOrFailedWithCode(db.runCommand({collMod: collName, ...collModOp}), [ + // Conflicted with another thread's collMod command + ErrorCodes.ConflictingOperationInProgress, + ]); + }, + createIndex: function(db, collName) { + assert.commandWorkedOrFailedWithCode(db[collName].createIndex({x: 1}), [ + ErrorCodes.IndexBuildAlreadyInProgress, + // The index was dropped while it was being built + // TODO SERVER-75675: Remove once createIndex serializes with dropIndex + ErrorCodes.IndexBuildAborted, + ]); + }, + dropIndex: function(db, collName) { + assert.commandWorkedOrFailedWithCode(db[collName].dropIndex({x: 1}), [ + // TODO SERVER-107420: Remove IndexNotFound from acceptable dropIndexes errors + // once 9.0 becomes LTS + ErrorCodes.IndexNotFound, + ]); + }, + checkMetadataConsistency: function(db, collName) { + const inconsistencies = db[collName].checkMetadataConsistency().toArray(); + assert.eq(0, inconsistencies.length, tojson(inconsistencies)); + }, + }; + + return { + threadCount: 5, + iterations: 200, + states: states, + startState: "init", + transitions: uniformDistTransitions(states), + }; +})(); diff --git a/jstests/noPassthrough/ddl/coll_mod_concurrent_index_build.js b/jstests/noPassthrough/ddl/coll_mod_concurrent_index_build.js new file mode 100644 index 00000000000..b5f80163089 --- /dev/null +++ b/jstests/noPassthrough/ddl/coll_mod_concurrent_index_build.js @@ -0,0 +1,116 @@ +/** + * Regression test for SERVER-107819. Tests running a collMod operation while an index build is + * in progress in a non-DB-primary shard. + */ +import {getTimeseriesCollForDDLOps} from "jstests/core/timeseries/libs/viewless_timeseries_util.js"; +import {Thread} from "jstests/libs/parallelTester.js"; +import {ShardingTest} from "jstests/libs/shardingtest.js"; +import {IndexBuildTest} from "jstests/noPassthrough/libs/index_builds/index_build.js"; +import {CreateShardedCollectionUtil} from "jstests/sharding/libs/create_sharded_collection_util.js"; + +const st = new ShardingTest({shards: 2, config: 1}); +const db = st.s.getDB("test"); +assert.commandWorked( + st.s.adminCommand({enableSharding: db.getName(), primaryShard: st.shard0.shardName})); + +function runCollModWithConcurrentIndexBuild(coll, collModOptions, {permitIndexBuildAbort} = {}) { + // Force a two-phase index build to hang on the non DB primary shard. + const shardConn = st.rs1.getPrimary(); + IndexBuildTest.pauseIndexBuilds(shardConn); + const awaitIndexBuild = IndexBuildTest.startIndexBuild( + st.s, + coll.getFullName(), + {a: 1}, + {} /* options */, + permitIndexBuildAbort ? [ErrorCodes.IndexBuildAborted] : [], + ); + IndexBuildTest.waitForIndexBuildToScanCollection( + shardConn.getDB(db.getName()), coll.getName(), "a_1"); + + // Use collMod to change the granularity on the collection. + const collModThread = new Thread( + function(host, dbName, collName, collModOptions) { + const db = new Mongo(host).getDB(dbName); + assert.commandWorked(db.runCommand({collMod: collName, ...collModOptions})); + }, + db.getMongo().host, + db.getName(), + coll.getName(), + collModOptions, + ); + collModThread.start(); + + // Wait some time for the collMod to likely run into the index build. + sleep(1000); + + // Let everything finish and assert collection options are still consistent. + IndexBuildTest.resumeIndexBuilds(shardConn); + + collModThread.join(); + awaitIndexBuild(); + + const inconsistencies = coll.getDB().checkMetadataConsistency().toArray(); + assert.eq(0, inconsistencies.length, tojson(inconsistencies)); +} + +{ + // Regular unsharded collection outside DB primary. + const coll = db.getCollection("unsharded"); + coll.insertOne({x: 123}); + assert.commandWorked( + st.s.adminCommand({moveCollection: coll.getFullName(), toShard: st.shard1.shardName})); + runCollModWithConcurrentIndexBuild(coll, {validator: {x: 123}}); +} + +{ + // Regular sharded collection, data on both shards. + const coll = db.getCollection("sharded"); + CreateShardedCollectionUtil.shardCollectionWithChunks(coll, {x: 1}, [ + {min: {x: MinKey}, max: {x: 0}, shard: st.shard1.shardName}, + {min: {x: 0}, max: {x: MaxKey}, shard: st.shard1.shardName}, + ]); + coll.insertMany([{x: -321}, {x: 123}]); + runCollModWithConcurrentIndexBuild(coll, {validator: {x: 123}}); +} + +{ + // Timeseries collection sharded on meta, data only on non-primary shard. + const coll = db.getCollection("ts_sharded_on_meta"); + CreateShardedCollectionUtil.shardCollectionWithChunks( + coll, + {m: 1}, + [{min: {meta: MinKey}, max: {meta: MaxKey}, shard: st.shard1.shardName}], + {timeseries: {timeField: "t", metaField: "m"}}, + ); + coll.insertOne({t: ISODate(), m: 123, temp: 42}); + // Since timeseries granularity changes block CRUD operations, the index build may be aborted to + // unblock them. + runCollModWithConcurrentIndexBuild( + coll, {timeseries: {granularity: "minutes"}}, {permitIndexBuildAbort: true}); +} + +{ + // Timeseries collection sharded on time, data only on non-primary shard. + const coll = db.getCollection("ts_sharded_on_time"); + assert.commandWorked( + st.s.adminCommand({ + shardCollection: coll.getFullName(), + key: {t: 1}, + timeseries: {timeField: "t"}, + }), + ); + assert.commandWorked( + st.s.adminCommand({ + moveChunk: getTimeseriesCollForDDLOps(db, coll).getFullName(), + find: {"control.min.t": 0}, + to: st.shard1.shardName, + }), + ); + coll.insertOne({t: ISODate(), temp: 42}); + // Since timeseries granularity changes block CRUD operations, the index build may be aborted to + // unblock them. + runCollModWithConcurrentIndexBuild( + coll, {timeseries: {granularity: "minutes"}}, {permitIndexBuildAbort: true}); +} + +st.stop(); diff --git a/jstests/noPassthrough/libs/index_builds/index_build.js b/jstests/noPassthrough/libs/index_builds/index_build.js index 02f0f38bd5f..dffa0f4820a 100644 --- a/jstests/noPassthrough/libs/index_builds/index_build.js +++ b/jstests/noPassthrough/libs/index_builds/index_build.js @@ -1,6 +1,6 @@ // Helper functions for testing index builds. -import {getTimeseriesCollForDDLOps} from "jstests/core/timeseries/libs/viewless_timeseries_util.js"; +import {getTimeseriesBucketsColl} from "jstests/core/timeseries/libs/viewless_timeseries_util.js"; import {configureFailPoint} from "jstests/libs/fail_point_util.js"; import {getCollectionNameFromFullNamespace} from "jstests/libs/namespace_utils.js"; import {funWithArgs} from "jstests/libs/parallel_shell_helpers.js"; @@ -1123,9 +1123,8 @@ export const ResumableIndexBuildTest = class { * TODO(SERVER-101609): Remove this function and use `coll` directly */ function getTargetCollForIndexBuilds(coll) { - if (coll.getMetadata()?.type === 'timeseries') { - return getTimeseriesCollForDDLOps(coll.getDB(), coll); - } - - return coll; + // For sharded legacy timeseries collections, the `coll` view only exists on the primary shard, + // so we won't find it when directly targeting a shard. Check for `system.buckets.coll` instead. + const bucketsColl = getTimeseriesBucketsColl(coll); + return bucketsColl.exists() ? bucketsColl : coll; } diff --git a/src/mongo/db/s/shardsvr_collmod_participant_command.cpp b/src/mongo/db/s/shardsvr_collmod_participant_command.cpp index df45f991371..7928d178d57 100644 --- a/src/mongo/db/s/shardsvr_collmod_participant_command.cpp +++ b/src/mongo/db/s/shardsvr_collmod_participant_command.cpp @@ -45,6 +45,7 @@ #include "mongo/db/concurrency/lock_manager_defs.h" #include "mongo/db/database_name.h" #include "mongo/db/dbdirectclient.h" +#include "mongo/db/index_builds/index_builds_coordinator.h" #include "mongo/db/namespace_string.h" #include "mongo/db/operation_context.h" #include "mongo/db/s/collection_sharding_runtime.h" @@ -53,6 +54,7 @@ #include "mongo/db/s/sharding_recovery_service.h" #include "mongo/db/s/sharding_state.h" #include "mongo/db/service_context.h" +#include "mongo/db/shard_role.h" #include "mongo/db/timeseries/catalog_helper.h" #include "mongo/db/timeseries/timeseries_collmod.h" #include "mongo/db/transaction/transaction_participant.h" @@ -67,6 +69,7 @@ #include #include +#include #define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kSharding @@ -105,6 +108,58 @@ void releaseCriticalSectionInEmptySession(OperationContext* opCtx, } } +// If collMod command fails with BackgroundOperationInProgressForNamespace (index build in progress, +// non retriable error), transforms it to an error that the collMod coordinator will retry. +// This prevents it from failing in a midway state (SERVER-107819). +// TODO SERVER-75675: Remove this once collMod serializes with index builds. +template +auto retryOnBackgroundOperationInProgressForNamespace(OperationContext* opCtx, + const NamespaceString& ns, + bool abortIndexBuilds, + T&& func) { + if (abortIndexBuilds) { + try { + return func(); + } catch (const ExceptionFor& ex) { + LOGV2(10781900, + "collMod DDL participant failed due to a background operation. " + "Aborting in-progress index builds.", + "ex"_attr = redact(ex)); + + // TODO SERVER-105548 switch back to acquireCollection once 9.0 becomes last LTS + auto [translatedNs, uuid] = [&]() { + auto [collAcq, _] = timeseries::acquireCollectionWithBucketsLookup( + opCtx, + CollectionAcquisitionRequest::fromOpCtx( + opCtx, ns, AcquisitionPrerequisites::OperationType::kRead), + LockMode::MODE_IS); + tassert(10768102, + "Collection not found in collMod after index build error", + collAcq.exists()); + return std::make_tuple(collAcq.nss(), collAcq.uuid()); + }(); + + IndexBuildsCoordinator::get(opCtx)->abortCollectionIndexBuilds( + opCtx, translatedNs, uuid, "ShardSvrCollModParticipantCommand"); + } + + // Fall-through and immediately retry. If it fails again due to a newly started index build, + // we will still bubble the error up to the coordinator, which will retry after a backoff. + } + + try { + return func(); + } catch (const ExceptionFor& ex) { + // We can not wait for index builds here, since we have a session checked out. + // Bubble the error up to the DDL coordinator so it retries later, with a backoff. + LOGV2(10781901, + "collMod DDL participant failed due to a background operation. " + "Re-throwing as a retriable error for the DDL coordinator to retry.", + "ex"_attr = redact(ex)); + uasserted(ErrorCodes::LockBusy, "collMod failed due to a background operation"); + } +} + class ShardSvrCollModParticipantCommand final : public TypedCommand { public: @@ -189,10 +244,16 @@ public: // This flag is set from the collMod coordinator. We do not allow view definition change // on non-primary shards since it's not in the view catalog. auto performViewChange = request().getPerformViewChange(); - uassertStatusOK(timeseries::processCollModCommandWithTimeSeriesTranslation( - opCtx, ns(), cmd, performViewChange, &builder)); - auto collmodReply = - CollModReply::parse(IDLParserContext("CollModReply"), builder.obj()); + // If this collMod required blocking CRUD operations, prefer aborting the index builds + // in progress in order to not delay unblocking of CRUD operations in other shards. + // Otherwise, prefer waiting for the index builds to complete. + auto abortIndexBuilds = request().getNeedsUnblock(); + auto collmodReply = retryOnBackgroundOperationInProgressForNamespace( + opCtx, ns(), abortIndexBuilds, [&]() { + uassertStatusOK(timeseries::processCollModCommandWithTimeSeriesTranslation( + opCtx, ns(), cmd, performViewChange, &builder)); + return CollModReply::parse(IDLParserContext("CollModReply"), builder.obj()); + }); // Since no write that generated a retryable write oplog entry with this sessionId // and txnNumber happened, we need to make a dummy write so that the session gets