SERVER-117612 Prevent convertToCapped to hold the critical section after failing to clean up (#53234)

GitOrigin-RevId: 15931a285de918ab317afa0016a3a41821130add
This commit is contained in:
Marcos Grillo 2026-05-07 20:24:07 +02:00 committed by MongoDB Bot
parent 59ff43bce4
commit e842bbac7f
2 changed files with 21 additions and 34 deletions

View File

@ -1,10 +1,14 @@
/**
* Tests the convertToCapped coordinator behavior when there is an hypothetical failure during the
* local convertToCapped operation.
* @tags: [
* # TODO SERVER-126244: Remove once 9.0 becomes last LTS.
* # In the newest version convertToCapped will also retry errors in untracked collections.
* multiversion_incompatible,
* ]
*/
import {configureFailPoint} from "jstests/libs/fail_point_util.js";
import {FixtureHelpers} from "jstests/libs/fixture_helpers.js";
import {funWithArgs} from "jstests/libs/parallel_shell_helpers.js";
import {ShardingTest} from "jstests/libs/shardingtest.js";
@ -29,27 +33,13 @@ assert.commandWorked(st.s.adminCommand({enableSharding: db.getName(), primarySha
const waitForConvertToCapped = startParallelShell(
funWithArgs(
function (dbName, collName, isTracked) {
function (dbName, collName) {
const testDB = db.getSiblingDB(dbName);
if (isTracked) {
// The coordinator will eventually succeed since once the collection has been
// locally capped, the sharding catalog has to be updated.
assert.commandWorked(testDB.runCommand({convertToCapped: collName, size: 1000}));
} else {
// The coordinator will fail the first time because it doesn't retry on error if the
// collection is not tracked (since the sharding catalog doesn't need to be
// updated). So let's retry the operation once the failpoint has been disabled to
// make sure it eventually succeeds.
assert.commandFailedWithCode(
testDB.runCommand({convertToCapped: collName, size: 1000}),
ErrorCodes.InternalError,
);
assert.commandWorked(testDB.runCommand({convertToCapped: collName, size: 1000}));
}
// An error on the coordinator right after the collection is locally capped should trigger the retry logic, so the operation eventually succeeds.
assert.commandWorked(testDB.runCommand({convertToCapped: collName, size: 1000}));
},
db.getName(),
coll.getName(),
FixtureHelpers.isTracked(coll),
),
st.s.port,
);

View File

@ -522,16 +522,14 @@ ExecutorFuture<void> ConvertToCappedCoordinator::_runImpl(
}
}
// If the coordinator succeeded to convert the collection to capped and the
// collection is tracked, the sharding catalog must be updated. Thus throw the error
// and retry relying on _mustAlwaysMakeProgress that will always be true reached
// this phase.
if (_mustAlwaysMakeProgress() || _isRetriableErrorForDDLCoordinator(status)) {
// Retry the operation.
return status;
}
if (_doc.getPhase() >= Phase::kAcquireCriticalSectionOnCoordinator) {
// Trigger cleanup for non-retriable errors in phases where the critical section is
// held but the collection hasn't been capped yet — the operation can safely be
// aborted. _mustAlwaysMakeProgress() returning true from
// kAcquireCriticalSectionOnCoordinator ensures that if triggerCleanup() fails
// transiently, the base class retries and this block is reached again.
if (!_isRetriableErrorForDDLCoordinator(status) &&
_doc.getPhase() >= Phase::kAcquireCriticalSectionOnCoordinator &&
_doc.getPhase() < Phase::kConvertCollectionToCappedOnDataShard) {
triggerCleanup(opCtx, status);
MONGO_UNREACHABLE_TASSERT(10083519);
}
@ -546,12 +544,11 @@ bool ConvertToCappedCoordinator::isInCriticalSection(Phase phase) const {
}
bool ConvertToCappedCoordinator::_mustAlwaysMakeProgress() {
// If the collection was originally tracked on the sharding catalog, the coodinator must always
// make forward progress after converting the collection to capped in order to align local and
// sharding catalog.
const bool isCollectionTrackedOnTheShardingCatalog = _doc.getOriginalCollection().has_value();
return isCollectionTrackedOnTheShardingCatalog &&
_doc.getPhase() >= Phase::kConvertCollectionToCappedOnDataShard;
// Once the critical section is acquired, the coordinator must always make forward progress
// so cleanup (critical section release) cannot be skipped. _mustAlwaysMakeProgress()
// returning true ensures that even if triggerCleanup() fails transiently, the base class
// retries _runImpl and calls triggerCleanup again.
return _doc.getPhase() >= Phase::kAcquireCriticalSectionOnCoordinator;
}
ExecutorFuture<void> ConvertToCappedCoordinator::_cleanupOnAbort(