mongo/jstests/replsets/initial_sync_survives_restart.js
Zac 591928c619 SERVER-108478 JS formatted by prettier and remove clang-format (#39656)
GitOrigin-RevId: 6c8f6aded47f260aa4f7c231b17dae3302cb1e04
2025-08-21 17:27:09 +00:00

128 lines
5.3 KiB
JavaScript

/**
* Tests that initial sync survives a restart during each stage of the cloning process.
* @tags: [
* requires_persistence,
* ]
*/
import {configureFailPoint, kDefaultWaitForFailPointTimeout} from "jstests/libs/fail_point_util.js";
import {ReplSetTest} from "jstests/libs/replsettest.js";
const testName = "initial_sync_survives_restart";
const rst = new ReplSetTest({name: testName, nodes: 1});
rst.startSet();
rst.initiate(null, null, {initiateWithDefaultElectionTimeout: true});
const primary = rst.getPrimary();
const primaryDb = primary.getDB("test");
// The default WC is majority and this test can't satisfy majority writes.
assert.commandWorked(
primary.adminCommand({setDefaultRWConcern: 1, defaultWriteConcern: {w: 1}, writeConcern: {w: "majority"}}),
);
// Add some data to be cloned.
assert.commandWorked(primaryDb.test.insert([{a: 1}, {b: 2}, {c: 3}]));
jsTest.log("Adding a new node to the replica set");
const secondary = rst.add({
rsConfig: {priority: 0, votes: 0},
setParameter: {
"failpoint.initialSyncHangBeforeCopyingDatabases": tojson({mode: "alwaysOn"}),
// This test is specifically testing that the cloners stop, so we turn off the
// oplog fetcher to ensure that we don't inadvertently test that instead.
"failpoint.hangBeforeStartingOplogFetcher": tojson({mode: "alwaysOn"}),
"numInitialSyncAttempts": 1,
},
});
rst.reInitiate();
rst.waitForState(secondary, ReplSetTest.State.STARTUP_2);
function retryStage(rst, {cloner, stage, extraData}) {
const nRetries = 2;
let primary = rst.getPrimary();
const secondary = rst.getSecondary();
const secondaryDb = secondary.getDB("test");
const failPointData = Object.merge(extraData || {}, {cloner: cloner, stage: stage});
// Set us up to stop right before the given stage.
const beforeStageFailPoint = configureFailPoint(secondaryDb, "hangBeforeClonerStage", failPointData);
// Set us up to stop after the given stage. This will also release the failpoint for the
// previous stage, if it was set.
const afterStageFailPoint = configureFailPoint(secondaryDb, "hangAfterClonerStage", failPointData);
// Release the initial failpoint if set.
assert.commandWorked(
secondaryDb.adminCommand({configureFailPoint: "initialSyncHangBeforeCopyingDatabases", mode: "off"}),
);
beforeStageFailPoint.wait();
jsTestLog("Testing restart of sync source in cloner " + cloner + " stage " + stage);
// We stop the node and wait for it, then start it separately, to avoid the clone completing
// before the node actually stops.
rst.stop(primary, null, null, {forRestart: true, waitPid: true});
// Release the syncing node fail point to allow retries while the node is down and restarting.
beforeStageFailPoint.off();
// Make sure some retries happen while the sync source is completely down.
let beforeRetryFailPoint = configureFailPoint(secondaryDb, "hangBeforeRetryingClonerStage", failPointData, {
skip: nRetries,
});
beforeRetryFailPoint.wait();
beforeRetryFailPoint.off();
// Turning on hangBeforeFinishRecovery prevents the sync source from coming
// out of RECOVERING so we can ensure the syncing node does some retries
// while the sync source is not ready.
const options = {
setParameter: {"failpoint.hangBeforeFinishRecovery": tojson({mode: "alwaysOn"})},
waitForConnect: true,
};
primary = rst.start(primary, options, true /* restart */);
// Wait for the sync source to be in RECOVERING.
assert.commandWorked(
primary.adminCommand({
waitForFailPoint: "hangBeforeFinishRecovery",
timesEntered: 1,
maxTimeMS: kDefaultWaitForFailPointTimeout,
}),
);
rst.waitForState(primary, ReplSetTest.State.RECOVERING);
// Make sure some retries happen while the sync source is available and in "RECOVERING"
beforeRetryFailPoint = configureFailPoint(secondaryDb, "hangBeforeRetryingClonerStage", failPointData, {
skip: nRetries,
});
beforeRetryFailPoint.wait();
beforeRetryFailPoint.off();
// Now let the sync source finish recovering and keep retrying.
assert.commandWorked(primary.adminCommand({configureFailPoint: "hangBeforeFinishRecovery", mode: "off"}));
afterStageFailPoint.wait();
jsTestLog("Cloner " + cloner + " stage " + stage + " complete.");
return afterStageFailPoint;
}
retryStage(rst, {cloner: "AllDatabaseCloner", stage: "connect"});
retryStage(rst, {cloner: "AllDatabaseCloner", stage: "listDatabases"});
retryStage(rst, {cloner: "DatabaseCloner", stage: "listCollections", extraData: {database: "test"}});
retryStage(rst, {cloner: "CollectionCloner", stage: "count", extraData: {nss: "test.test"}});
retryStage(rst, {cloner: "CollectionCloner", stage: "listIndexes", extraData: {nss: "test.test"}});
const afterStageFailPoint = retryStage(rst, {
cloner: "CollectionCloner",
stage: "query",
extraData: {nss: "test.test"},
});
jsTestLog("Releasing the oplog fetcher failpoint.");
assert.commandWorked(
secondary.getDB("test").adminCommand({configureFailPoint: "hangBeforeStartingOplogFetcher", mode: "off"}),
);
jsTestLog("Releasing the final cloner failpoint.");
afterStageFailPoint.off();
jsTestLog("Waiting for initial sync to complete.");
// Wait for initial sync to complete.
rst.awaitSecondaryNodes(null, [secondary]);
rst.stopSet();