SERVER-113405: Don't transition to STARTUP2 until _initAndListen finishes (#47447)
Co-authored-by: Denis Trailin <denis.trailin@mongodb.com> Co-authored-by: Billy Donahue <BillyDonahue@users.noreply.github.com> GitOrigin-RevId: ae74dc8adef59af5a5f8a0ee041cf992aed6e2a6
This commit is contained in:
parent
9343c350ae
commit
0f8f221459
52
jstests/replsets/init_and_listen_blocks_startup2.js
Normal file
52
jstests/replsets/init_and_listen_blocks_startup2.js
Normal file
@ -0,0 +1,52 @@
|
||||
/**
|
||||
* Tests that a replica set node does not transition from STARTUP to STARTUP2
|
||||
* until _initAndListen completes. See SERVER-113405.
|
||||
* @tags: [
|
||||
* requires_fcv_83,
|
||||
* requires_persistence,
|
||||
* ]
|
||||
*/
|
||||
|
||||
import {ReplSetTest} from "jstests/libs/replsettest.js";
|
||||
|
||||
// Name of the server fail point that pauses ServiceContext::notifyStorageStartupRecoveryComplete()
// before it marks startup as complete.
const kFailPointName = "hangBeforeNotifyStorageStartupRecoveryComplete";

// Fragment of the structured log line emitted when the member state changes to STARTUP2.
const kStartup2LogFragment = '"newState":"STARTUP2"';

// Bring up a single-node replica set and write one document before restarting the node.
const rst = new ReplSetTest({name: jsTestName(), nodes: 1});
rst.startSet();
rst.initiate();

const primary = rst.getPrimary();
assert.commandWorked(primary.getDB("test").coll.insert({x: 1}));

// Restart the node with the fail point enabled from process start, so that the
// "startup complete" notification is held back.
const restartOptions = {
    setParameter: "failpoint." + kFailPointName + "=" + tojson({mode: "alwaysOn"}),
};
const node = rst.restart(0, restartOptions);

// Returns true if the restarted node's log already reports the transition to STARTUP2.
const hasLoggedStartup2 = () => checkLog.checkContainsOnce(node, kStartup2LogFragment);

// Block until the node is actually paused on the fail point. The node must already accept
// commands at this point, even though the notification has not been delivered yet.
const waitResult = node.adminCommand({
    waitForFailPoint: kFailPointName,
    timesEntered: 1,
    maxTimeMS: 60000,
});
assert.commandWorked(waitResult);

// While the fail point is held, the node must not have moved on to STARTUP2.
assert(!hasLoggedStartup2(), "Node should not transition to STARTUP2 until _initAndListen completes");

jsTestLog("Verified: STARTUP2 blocked while failpoint active");

// Release the fail point so the notification is delivered and the waiting code can proceed.
assert.commandWorked(node.adminCommand({configureFailPoint: kFailPointName, mode: "off"}));

// Once released, the STARTUP2 transition is expected to show up in the log.
assert.soon(hasLoggedStartup2, "Node should transition to STARTUP2 after signal is sent");

// Finally the node should settle as either PRIMARY or SECONDARY before the set is torn down.
rst.waitForState(node, [ReplSetTest.State.PRIMARY, ReplSetTest.State.SECONDARY]);

rst.stopSet();
|
||||
@ -778,6 +778,14 @@ void ReplicationCoordinatorImpl::_finishLoadLocalConfig(
|
||||
_externalState->setGlobalTimestamp(getServiceContext(), lastOpTime.getTimestamp());
|
||||
|
||||
auto opCtx = cc().makeOperationContext();
|
||||
|
||||
// Wait until _initAndListen completes before allowing transition to STARTUP2.
|
||||
// _finishLoadLocalConfig runs asynchronously after startup() returns, and _setCurrentRSConfig
|
||||
// below can trigger initial sync (via _startDataReplication). Initial sync takes locks that
|
||||
// could conflict with operations still running in _initAndListen after startup() returns,
|
||||
// potentially causing a livelock.
|
||||
getServiceContext()->waitForStartupComplete();
|
||||
|
||||
stdx::unique_lock lock(_mutex);
|
||||
invariant(_rsConfigState == kConfigStartingUp);
|
||||
const PostMemberStateUpdateAction action =
|
||||
@ -5138,15 +5146,6 @@ void ReplicationCoordinatorImpl::_setStableTimestampForStorage(WithLock lk) {
|
||||
}
|
||||
|
||||
void ReplicationCoordinatorImpl::finishRecoveryIfEligible(OperationContext* opCtx) {
|
||||
// It doesn't make sense to become a secondary before _initAndListen
|
||||
// finishes. Perhaps more importantly, we need to take the Global lock
|
||||
// several times in _initAndListen, and we don't want to reacquire (and not
|
||||
// yield) the Global lock below if we race with taking the Global lock in
|
||||
// _initAndListen.
|
||||
LOGV2(
|
||||
6295104,
|
||||
"Starting ReplicationCoordinatorImpl::finishRecoveryIfEligible after startup completes...");
|
||||
opCtx->getServiceContext()->waitForStartupComplete();
|
||||
LOGV2(6295105, "Starting ReplicationCoordinatorImpl::finishRecoveryIfEligible");
|
||||
if (MONGO_unlikely(hangBeforeFinishRecovery.shouldFail())) {
|
||||
hangBeforeFinishRecovery.pauseWhileSet(opCtx);
|
||||
|
||||
@ -49,6 +49,7 @@
|
||||
#include "mongo/transport/session.h"
|
||||
#include "mongo/transport/transport_layer_manager.h"
|
||||
#include "mongo/util/assert_util.h"
|
||||
#include "mongo/util/fail_point.h"
|
||||
#include "mongo/util/observable_mutex_registry.h"
|
||||
#include "mongo/util/processinfo.h"
|
||||
#include "mongo/util/scopeguard.h"
|
||||
@ -68,6 +69,8 @@ namespace {
|
||||
|
||||
ServiceContext* globalServiceContext = nullptr;
|
||||
|
||||
MONGO_FAIL_POINT_DEFINE(hangBeforeNotifyStorageStartupRecoveryComplete);
|
||||
|
||||
} // namespace
|
||||
|
||||
ClientLock::ClientLock(Client* client) : service_context_detail::ObjectLock<Client>(client) {}
|
||||
@ -500,6 +503,10 @@ void ServiceContext::waitForStartupComplete() {
|
||||
}
|
||||
|
||||
void ServiceContext::notifyStorageStartupRecoveryComplete() {
|
||||
if (MONGO_unlikely(hangBeforeNotifyStorageStartupRecoveryComplete.shouldFail())) {
|
||||
LOGV2(11340502, "Pausing at fail point hangBeforeNotifyStorageStartupRecoveryComplete");
|
||||
hangBeforeNotifyStorageStartupRecoveryComplete.pauseWhileSet();
|
||||
}
|
||||
{
|
||||
stdx::lock_guard lk(_mutex);
|
||||
_startupComplete = true;
|
||||
|
||||
Loading…
Reference in New Issue
Block a user