SERVER-102527 Check for connection from replicaset to config server (#35024)

GitOrigin-RevId: 5359405fb4cf03659bc20dfa930bfe128f7f8479
This commit is contained in:
wolfee 2025-04-17 20:32:52 +02:00 committed by MongoDB Bot
parent 6c745f1111
commit 19bac3d8eb
15 changed files with 193 additions and 13 deletions

1
.github/CODEOWNERS vendored
View File

@ -2263,6 +2263,7 @@ WORKSPACE.bazel @10gen/devprod-build @svc-auto-approve-bot
/src/mongo/db/s/**/sharding_server_status.cpp @10gen/query-integration-observability @svc-auto-approve-bot
/src/mongo/db/s/**/*remove_shard* @10gen/server-catalog-and-routing @svc-auto-approve-bot
/src/mongo/db/s/**/*topology_change* @10gen/server-catalog-and-routing @svc-auto-approve-bot
/src/mongo/db/s/**/*check_can_connect_to* @10gen/server-catalog-and-routing @svc-auto-approve-bot
# The following patterns are parsed from ./src/mongo/db/s/balancer/OWNERS.yml
/src/mongo/db/s/balancer/**/* @10gen/server-catalog-and-routing @svc-auto-approve-bot

View File

@ -613,6 +613,10 @@ const internalCommandsMap = {
testname: "_shardsvrGetStatsForBalancing",
command: {_shardsvrGetStatsForBalancing: "ns", collections: [], scaleFactor: 1},
},
_shardsvrCheckCanConnectToConfigServer: {
testname: "_shardsvrCheckCanConnectToConfigServer",
command: {_shardsvrCheckCanConnectToConfigServer: "host:20022"},
},
_shardsvrJoinMigrations: {
testname: "_shardsvrJoinMigrations",
command: {_shardsvrJoinMigrations: 1},

View File

@ -181,6 +181,7 @@ let viewsCommandTests = {
_shardsvrDropDatabase: {skip: isAnInternalCommand},
_shardsvrDropDatabaseParticipant: {skip: isAnInternalCommand},
_shardsvrGetStatsForBalancing: {skip: isAnInternalCommand},
_shardsvrCheckCanConnectToConfigServer: {skip: isAnInternalCommand},
_shardsvrJoinMigrations: {skip: isAnInternalCommand},
_shardsvrMergeAllChunksOnShard: {skip: isAnInternalCommand},
_shardsvrMovePrimary: {skip: isAnInternalCommand},

View File

@ -174,6 +174,7 @@ const wcCommandsTests = {
_shardsvrDropDatabaseParticipant: {skip: "internal command"},
_shardsvrEndMigrationBlockingOperation: {skip: "internal command"},
_shardsvrGetStatsForBalancing: {skip: "internal command"},
_shardsvrCheckCanConnectToConfigServer: {skip: "internal command"},
_shardsvrJoinDDLCoordinators: {skip: "internal command"},
_shardsvrJoinMigrations: {skip: "internal command"},
_shardsvrMergeAllChunksOnShard: {skip: "internal command"},
@ -3266,6 +3267,7 @@ const wcTimeseriesViewsCommandsTests = {
_shardsvrEndMigrationBlockingOperation: {skip: "internal command"},
_shardsvrFetchCollMetadata: {skip: "internal command"},
_shardsvrGetStatsForBalancing: {skip: "internal command"},
_shardsvrCheckCanConnectToConfigServer: {skip: "internal command"},
_shardsvrJoinDDLCoordinators: {skip: "internal command"},
_shardsvrJoinMigrations: {skip: "internal command"},
_shardsvrMergeAllChunksOnShard: {skip: "internal command"},

View File

@ -58,6 +58,7 @@ const allCommands = {
_configsvrDropIndexCatalogEntry: {skip: isAnInternalCommand},
_configsvrEnsureChunkVersionIsGreaterThan: {skip: isAnInternalCommand},
_configsvrGetHistoricalPlacement: {skip: isAnInternalCommand},
_configsvrHelloMe: {skip: isAnInternalCommand},
_configsvrMoveRange: {skip: isAnInternalCommand},
_configsvrRemoveChunks: {skip: isAnInternalCommand},
_configsvrRemoveShard: {skip: isAnInternalCommand},
@ -119,6 +120,7 @@ const allCommands = {
_shardsvrCoordinateMultiUpdate: {skip: isAnInternalCommand},
_shardsvrEndMigrationBlockingOperation: {skip: isAnInternalCommand},
_shardsvrGetStatsForBalancing: {skip: isAnInternalCommand},
_shardsvrCheckCanConnectToConfigServer: {skip: isAnInternalCommand},
_shardsvrJoinDDLCoordinators: {skip: isAnInternalCommand},
_shardsvrJoinMigrations: {skip: isAnInternalCommand},
_shardsvrMergeAllChunksOnShard: {skip: isAnInternalCommand},

View File

@ -107,6 +107,7 @@ const allCommands = {
_shardsvrDropIndexes: {skip: isAnInternalCommand},
_shardsvrCreateCollectionParticipant: {skip: isPrimaryOnly},
_shardsvrGetStatsForBalancing: {skip: isPrimaryOnly},
_shardsvrCheckCanConnectToConfigServer: {skip: isPrimaryOnly},
_shardsvrJoinMigrations: {skip: isAnInternalCommand},
_shardsvrJoinDDLCoordinators: {skip: isPrimaryOnly},
_shardsvrMergeAllChunksOnShard: {skip: isPrimaryOnly},

View File

@ -114,6 +114,7 @@ const allCommands = {
_shardsvrCoordinateMultiUpdate: {skip: isAnInternalCommand},
_shardsvrEndMigrationBlockingOperation: {skip: isAnInternalCommand},
_shardsvrGetStatsForBalancing: {skip: isAnInternalCommand},
_shardsvrCheckCanConnectToConfigServer: {skip: isAnInternalCommand},
_shardsvrJoinDDLCoordinators: {skip: isAnInternalCommand},
_shardsvrJoinMigrations: {skip: isAnInternalCommand},
_shardsvrMergeAllChunksOnShard: {skip: isAnInternalCommand},

View File

@ -178,6 +178,7 @@ let testCases = {
_shardsvrDropDatabaseParticipant: {skip: "internal command"},
_shardsvrEndMigrationBlockingOperation: {skip: "internal command"},
_shardsvrGetStatsForBalancing: {skip: "internal command"},
_shardsvrCheckCanConnectToConfigServer: {skip: "internal command"},
_shardsvrJoinDDLCoordinators: {skip: "internal command"},
_shardsvrJoinMigrations: {skip: "internal command"},
_shardsvrMergeAllChunksOnShard: {skip: "internal command"},

View File

@ -23,3 +23,4 @@ mixedShardTest(allowTLS, preferTLS, true);
print("=== Testing allowTLS/requireTLS cluster - SHOULD FAIL ===");
mixedShardTest(allowTLS, requireTLS, false);
mixedShardTest(requireTLS, allowTLS, false);

View File

@ -15,6 +15,7 @@ print("=== Testing disabled cluster ===");
mixedShardTest(disabled, disabled, true);
print("=== Testing disabled/preferTLS cluster - SHOULD FAIL ===");
mixedShardTest(disabled, preferTLS, false);
mixedShardTest(preferTLS, disabled, false);
print("=== Testing allowTLS/disabled cluster ===");

View File

@ -1387,6 +1387,7 @@ mongo_cc_library(
"shardsvr_abort_reshard_collection_command.cpp",
"shardsvr_add_shard_cmd.cpp",
"shardsvr_change_primary_command.cpp",
"shardsvr_check_can_connect_to_config_server_cmd.cpp",
"shardsvr_check_metadata_consistency_command.cpp",
"shardsvr_check_metadata_consistency_participant_command.cpp",
"shardsvr_cleanup_reshard_collection_command.cpp",

View File

@ -320,3 +320,6 @@ filters:
- "*topology_change*":
approvers:
- 10gen/server-catalog-and-routing
- "*check_can_connect_to*":
approvers:
- 10gen/server-catalog-and-routing

View File

@ -36,6 +36,7 @@
#include "mongo/db/s/topology_change_helpers.h"
#include "mongo/db/s/user_writes_critical_section_document_gen.h"
#include "mongo/db/vector_clock_mutable.h"
#include "mongo/s/request_types/add_shard_gen.h"
#include "mongo/util/assert_util.h"
#include "src/mongo/db/list_collections_gen.h"
@ -87,9 +88,8 @@ ExecutorFuture<void> AddShardCoordinator::_runImpl(
executor,
token);
} catch (const DBException&) {
// if we are not able to validate the host as a shard after
// multiple try, we don't want to continue, so we remove
// the replicaset monitor and give up.
// If we are not able to validate the host as a shard after multiple tries, we
// don't want to continue, so we remove the replicaset monitor and give up.
topology_change_helpers::removeReplicaSetMonitor(opCtx,
_doc.getConnectionString());
_completeOnError = true;
@ -111,6 +111,44 @@ ExecutorFuture<void> AddShardCoordinator::_runImpl(
_checkExistingDataOnShard(opCtx, targeter, **executor);
}
// (Generic FCV reference): These FCV checks should exist across LTS binary
// versions.
const auto currentFCV =
serverGlobalParams.featureCompatibility.acquireFCVSnapshot().getVersion();
invariant(currentFCV == multiversion::GenericFCV::kLatest ||
currentFCV == multiversion::GenericFCV::kLastContinuous ||
currentFCV == multiversion::GenericFCV::kLastLTS);
_setFCVOnReplicaSet(opCtx, currentFCV, **executor);
const auto host = uassertStatusOK(
Grid::get(opCtx)->shardRegistry()->getConfigShard()->getTargeter()->findHost(
opCtx, ReadPreferenceSetting{ReadPreference::PrimaryOnly}));
try {
_runWithRetries(
[&]() {
ShardsvrCheckCanConnectToConfigServer cmd(host);
cmd.setDbName(DatabaseName::kAdmin);
uassertStatusOK(
topology_change_helpers::runCommandForAddShard(opCtx,
_getTargeter(opCtx),
DatabaseName::kAdmin,
cmd.toBSON(),
**executor)
.commandStatus);
},
executor,
token);
} catch (const DBException&) {
// If the replica set is not able to contact us after multiple tries, we don't
// want to continue, so we remove the replicaset monitor and give up.
topology_change_helpers::removeReplicaSetMonitor(opCtx,
_doc.getConnectionString());
_completeOnError = true;
throw;
}
std::string shardName =
topology_change_helpers::createShardName(opCtx,
_getTargeter(opCtx),
@ -126,16 +164,6 @@ ExecutorFuture<void> AddShardCoordinator::_runImpl(
[this, _ = shared_from_this(), executor](auto* opCtx) {
auto& targeter = _getTargeter(opCtx);
// (Generic FCV reference): These FCV checks should exist across LTS binary
// versions.
const auto currentFCV =
serverGlobalParams.featureCompatibility.acquireFCVSnapshot().getVersion();
invariant(currentFCV == multiversion::GenericFCV::kLatest ||
currentFCV == multiversion::GenericFCV::kLastContinuous ||
currentFCV == multiversion::GenericFCV::kLastLTS);
_setFCVOnReplicaSet(opCtx, currentFCV, **executor);
_dropSessionsCollection(opCtx, **executor);
topology_change_helpers::getClusterTimeKeysFromReplicaSet(
@ -416,6 +444,9 @@ void AddShardCoordinator::_runWithRetries(std::function<void()>&& function,
if (status.isOK()) {
return true;
}
if (!_isRetriableErrorForDDLCoordinator(status)) {
return true;
}
failCounter++;
if (failCounter >= kMaxFailedRetryCount) {
return true;

View File

@ -0,0 +1,118 @@
/**
* Copyright (C) 2025-present MongoDB, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the Server Side Public License, version 1,
* as published by MongoDB, Inc.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* Server Side Public License for more details.
*
* You should have received a copy of the Server Side Public License
* along with this program. If not, see
* <http://www.mongodb.com/licensing/server-side-public-license>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the Server Side Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/
#include "mongo/db/auth/authorization_session.h"
#include "mongo/db/commands.h"
#include "mongo/db/repl/hello/hello_gen.h"
#include "mongo/executor/async_rpc.h"
#include "mongo/executor/network_interface_factory.h"
#include "mongo/executor/thread_pool_task_executor.h"
#include "mongo/s/request_types/add_shard_gen.h"
#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kSharding
namespace mongo {
namespace {

/**
 * Implements the internal `_shardsvrCheckCanConnectToConfigServer` command.
 *
 * The command parameter is a host; the invocation sends a `hello` command to that host and
 * fails if the request fails.
 */
class ShardsvrCheckCanConnectToConfigServerCommand
    : public TypedCommand<ShardsvrCheckCanConnectToConfigServerCommand> {
public:
    using Request = ShardsvrCheckCanConnectToConfigServer;

    ShardsvrCheckCanConnectToConfigServerCommand() : TypedCommand(Request::kCommandName) {}

    std::string help() const override {
        return "Internal command, that tries to contact the host specified in the parameter.";
    }

    bool adminOnly() const override {
        return true;
    }

    AllowedOnSecondary secondaryAllowed(ServiceContext*) const override {
        return AllowedOnSecondary::kNever;
    }

    bool skipApiVersionCheck() const override {
        // Server-to-server internal command.
        return true;
    }

    class Invocation final : public InvocationBase {
    public:
        using InvocationBase::InvocationBase;

        /**
         * Sends a `hello` command (on the admin database) to the host named by the command
         * parameter and blocks until a reply or an error is received.
         *
         * Throws if the request fails. A RemoteCommandExecutionError is unpacked first, so the
         * exception carries the status returned by async_rpc::unpackRPCStatus.
         */
        void typedRun(OperationContext* opCtx) {
            opCtx->setAlwaysInterruptAtStepDownOrUp_UNSAFE();

            HelloCommand hello;
            hello.setDbName(DatabaseName::kAdmin);

            ConnectionString target(request().getCommandParameter());

            // A task executor created just for this invocation carries the outbound request.
            auto helloExecutor = executor::ThreadPoolTaskExecutor::create(
                std::make_unique<ThreadPool>(ThreadPool::Options{}),
                executor::makeNetworkInterface("HelloMe-TaskExecutor"));
            helloExecutor->startup();

            auto rpcOptions = std::make_shared<async_rpc::AsyncRPCOptions<HelloCommand>>(
                helloExecutor, opCtx->getCancellationToken(), hello);

            try {
                async_rpc::sendCommand<HelloCommand>(rpcOptions, opCtx, target).get();
            } catch (const ExceptionFor<ErrorCodes::RemoteCommandExecutionError>& ex) {
                // Unpack the wrapped RPC status and raise that instead of the wrapper.
                const auto unpackedStatus = async_rpc::unpackRPCStatus(ex.toStatus());
                uassertStatusOK(unpackedStatus);
            }
        }

    private:
        // Only callers holding the `internal` action on the cluster resource may run this.
        void doCheckAuthorization(OperationContext* opCtx) const override {
            auto authSession = AuthorizationSession::get(opCtx->getClient());
            const auto clusterResource =
                ResourcePattern::forClusterResource(request().getDbName().tenantId());
            uassert(ErrorCodes::Unauthorized,
                    "Unauthorized",
                    authSession->isAuthorizedForActionsOnResource(clusterResource,
                                                                  ActionType::internal));
        }

        // The command parameter is a host, not a collection name, so the namespace reported
        // here is built from the request's database name only.
        NamespaceString ns() const override {
            return NamespaceString(request().getDbName());
        }

        bool supportsWriteConcern() const override {
            return false;
        }
    };
};
MONGO_REGISTER_COMMAND(ShardsvrCheckCanConnectToConfigServerCommand).forShard();

}  // namespace
}  // namespace mongo

View File

@ -32,6 +32,7 @@ global:
imports:
- "mongo/db/basic_types.idl"
- "mongo/db/s/type_shard_identity.idl"
- "mongo/util/net/hostandport.idl"
structs:
AddShardRequestBase:
@ -88,3 +89,14 @@ commands:
shardIdentity:
description: "Identity metadata for the new shard"
type: ShardIdentity
# Internal command with which the config server asks the shard to send a hello command back to
# the config server. This is used to verify that the two can reach each other in both
# directions (i.e. that there are no network or security issues).
_shardsvrCheckCanConnectToConfigServer:
command_name: _shardsvrCheckCanConnectToConfigServer
cpp_name: ShardsvrCheckCanConnectToConfigServer
description: "Internal command where the config server expects the shard to send a hello command to the config server. This is used to find out if the contact could be done both ways (without network or security issue)"
namespace: type
type: HostAndPort
api_version: ""