SERVER-117612 Prevent convertToCapped to hold the critical section after failing to clean up (#53234)

GitOrigin-RevId: 15931a285de918ab317afa0016a3a41821130add
2026-05-07 20:24:07 +02:00 · 2026-05-07 20:24:07 +02:00 · e842bbac7f
commit e842bbac7f
parent 59ff43bce4
2 changed files with 21 additions and 34 deletions
--- a/jstests/sharding/capped_collections/convert_to_capped_failure.js
+++ b/jstests/sharding/capped_collections/convert_to_capped_failure.js
@ -1,10 +1,14 @@
 /**
 * Tests the convertToCapped coordinator behavior when there is an hypothetical failure during the
 * local convertToCapped operation.
+ * @tags: [
+ *  # TODO SERVER-126244: Remove once 9.0 becomes last LTS.
+ *  # In the newest version convertToCapped will also retry errors in untracked collections.
+ *  multiversion_incompatible,
+ * ]
 */

 import {configureFailPoint} from "jstests/libs/fail_point_util.js";
-import {FixtureHelpers} from "jstests/libs/fixture_helpers.js";
 import {funWithArgs} from "jstests/libs/parallel_shell_helpers.js";
 import {ShardingTest} from "jstests/libs/shardingtest.js";

@ -29,27 +33,13 @@ assert.commandWorked(st.s.adminCommand({enableSharding: db.getName(), primarySha

    const waitForConvertToCapped = startParallelShell(
        funWithArgs(
-            function (dbName, collName, isTracked) {
+            function (dbName, collName) {
                const testDB = db.getSiblingDB(dbName);
-                if (isTracked) {
-                    // The coordinator will eventually succeed since once the collection has been
-                    // locally capped, the sharding catalog has to be updated.
-                    assert.commandWorked(testDB.runCommand({convertToCapped: collName, size: 1000}));
-                } else {
-                    // The coordinator will fail the first time because it doesn't retry on error if the
-                    // collection is not tracked (since the sharding catalog doesn't need to be
-                    // updated). So let's retry the operation once the failpoint has been disabled to
-                    // make sure it eventually succeeds.
-                    assert.commandFailedWithCode(
-                        testDB.runCommand({convertToCapped: collName, size: 1000}),
-                        ErrorCodes.InternalError,
-                    );
-                    assert.commandWorked(testDB.runCommand({convertToCapped: collName, size: 1000}));
-                }
+                // An error on the coordinator right after the collection is locally capped should trigger the retry logic, so the operation eventually succeeds.
+                assert.commandWorked(testDB.runCommand({convertToCapped: collName, size: 1000}));
            },
            db.getName(),
            coll.getName(),
-            FixtureHelpers.isTracked(coll),
        ),
        st.s.port,
    );
--- a/src/mongo/db/global_catalog/ddl/convert_to_capped_coordinator.cpp
+++ b/src/mongo/db/global_catalog/ddl/convert_to_capped_coordinator.cpp
@ -522,16 +522,14 @@ ExecutorFuture<void> ConvertToCappedCoordinator::_runImpl(
                    }
                }

-                // If the coordinator succeeded to convert the collection to capped and the
-                // collection is tracked, the sharding catalog must be updated. Thus throw the error
-                // and retry relying on _mustAlwaysMakeProgress that will always be true reached
-                // this phase.
-                if (_mustAlwaysMakeProgress() || _isRetriableErrorForDDLCoordinator(status)) {
-                    // Retry the operation.
-                    return status;
-                }
-
-                if (_doc.getPhase() >= Phase::kAcquireCriticalSectionOnCoordinator) {
+                // Trigger cleanup for non-retriable errors in phases where the critical section is
+                // held but the collection hasn't been capped yet — the operation can safely be
+                // aborted. _mustAlwaysMakeProgress() returning true from
+                // kAcquireCriticalSectionOnCoordinator ensures that if triggerCleanup() fails
+                // transiently, the base class retries and this block is reached again.
+                if (!_isRetriableErrorForDDLCoordinator(status) &&
+                    _doc.getPhase() >= Phase::kAcquireCriticalSectionOnCoordinator &&
+                    _doc.getPhase() < Phase::kConvertCollectionToCappedOnDataShard) {
                    triggerCleanup(opCtx, status);
                    MONGO_UNREACHABLE_TASSERT(10083519);
                }
@ -546,12 +544,11 @@ bool ConvertToCappedCoordinator::isInCriticalSection(Phase phase) const {
 }

 bool ConvertToCappedCoordinator::_mustAlwaysMakeProgress() {
-    // If the collection was originally tracked on the sharding catalog, the coodinator must always
-    // make forward progress after converting the collection to capped in order to align local and
-    // sharding catalog.
-    const bool isCollectionTrackedOnTheShardingCatalog = _doc.getOriginalCollection().has_value();
-    return isCollectionTrackedOnTheShardingCatalog &&
-        _doc.getPhase() >= Phase::kConvertCollectionToCappedOnDataShard;
+    // Once the critical section is acquired, the coordinator must always make forward progress
+    // so cleanup (critical section release) cannot be skipped. _mustAlwaysMakeProgress()
+    // returning true ensures that even if triggerCleanup() fails transiently, the base class
+    // retries _runImpl and calls triggerCleanup again.
+    return _doc.getPhase() >= Phase::kAcquireCriticalSectionOnCoordinator;
 }

 ExecutorFuture<void> ConvertToCappedCoordinator::_cleanupOnAbort(