SERVER-102527 Check for connection from replicaset to config server (#35024)
GitOrigin-RevId: 5359405fb4cf03659bc20dfa930bfe128f7f8479
This commit is contained in:
parent
6c745f1111
commit
19bac3d8eb
1
.github/CODEOWNERS
vendored
1
.github/CODEOWNERS
vendored
@ -2263,6 +2263,7 @@ WORKSPACE.bazel @10gen/devprod-build @svc-auto-approve-bot
|
||||
/src/mongo/db/s/**/sharding_server_status.cpp @10gen/query-integration-observability @svc-auto-approve-bot
|
||||
/src/mongo/db/s/**/*remove_shard* @10gen/server-catalog-and-routing @svc-auto-approve-bot
|
||||
/src/mongo/db/s/**/*topology_change* @10gen/server-catalog-and-routing @svc-auto-approve-bot
|
||||
/src/mongo/db/s/**/*check_can_connect_to* @10gen/server-catalog-and-routing @svc-auto-approve-bot
|
||||
|
||||
# The following patterns are parsed from ./src/mongo/db/s/balancer/OWNERS.yml
|
||||
/src/mongo/db/s/balancer/**/* @10gen/server-catalog-and-routing @svc-auto-approve-bot
|
||||
|
||||
@ -613,6 +613,10 @@ const internalCommandsMap = {
|
||||
testname: "_shardsvrGetStatsForBalancing",
|
||||
command: {_shardsvrGetStatsForBalancing: "ns", collections: [], scaleFactor: 1},
|
||||
},
|
||||
_shardsvrCheckCanConnectToConfigServer: {
|
||||
testname: "_shardsvrCheckCanConnectToConfigServer",
|
||||
command: {_shardsvrCheckCanConnectToConfigServer: "host:20022"},
|
||||
},
|
||||
_shardsvrJoinMigrations: {
|
||||
testname: "_shardsvrJoinMigrations",
|
||||
command: {_shardsvrJoinMigrations: 1},
|
||||
|
||||
@ -181,6 +181,7 @@ let viewsCommandTests = {
|
||||
_shardsvrDropDatabase: {skip: isAnInternalCommand},
|
||||
_shardsvrDropDatabaseParticipant: {skip: isAnInternalCommand},
|
||||
_shardsvrGetStatsForBalancing: {skip: isAnInternalCommand},
|
||||
_shardsvrCheckCanConnectToConfigServer: {skip: isAnInternalCommand},
|
||||
_shardsvrJoinMigrations: {skip: isAnInternalCommand},
|
||||
_shardsvrMergeAllChunksOnShard: {skip: isAnInternalCommand},
|
||||
_shardsvrMovePrimary: {skip: isAnInternalCommand},
|
||||
|
||||
@ -174,6 +174,7 @@ const wcCommandsTests = {
|
||||
_shardsvrDropDatabaseParticipant: {skip: "internal command"},
|
||||
_shardsvrEndMigrationBlockingOperation: {skip: "internal command"},
|
||||
_shardsvrGetStatsForBalancing: {skip: "internal command"},
|
||||
_shardsvrCheckCanConnectToConfigServer: {skip: "internal command"},
|
||||
_shardsvrJoinDDLCoordinators: {skip: "internal command"},
|
||||
_shardsvrJoinMigrations: {skip: "internal command"},
|
||||
_shardsvrMergeAllChunksOnShard: {skip: "internal command"},
|
||||
@ -3266,6 +3267,7 @@ const wcTimeseriesViewsCommandsTests = {
|
||||
_shardsvrEndMigrationBlockingOperation: {skip: "internal command"},
|
||||
_shardsvrFetchCollMetadata: {skip: "internal command"},
|
||||
_shardsvrGetStatsForBalancing: {skip: "internal command"},
|
||||
_shardsvrCheckCanConnectToConfigServer: {skip: "internal command"},
|
||||
_shardsvrJoinDDLCoordinators: {skip: "internal command"},
|
||||
_shardsvrJoinMigrations: {skip: "internal command"},
|
||||
_shardsvrMergeAllChunksOnShard: {skip: "internal command"},
|
||||
|
||||
@ -58,6 +58,7 @@ const allCommands = {
|
||||
_configsvrDropIndexCatalogEntry: {skip: isAnInternalCommand},
|
||||
_configsvrEnsureChunkVersionIsGreaterThan: {skip: isAnInternalCommand},
|
||||
_configsvrGetHistoricalPlacement: {skip: isAnInternalCommand},
|
||||
_configsvrHelloMe: {skip: isAnInternalCommand},
|
||||
_configsvrMoveRange: {skip: isAnInternalCommand},
|
||||
_configsvrRemoveChunks: {skip: isAnInternalCommand},
|
||||
_configsvrRemoveShard: {skip: isAnInternalCommand},
|
||||
@ -119,6 +120,7 @@ const allCommands = {
|
||||
_shardsvrCoordinateMultiUpdate: {skip: isAnInternalCommand},
|
||||
_shardsvrEndMigrationBlockingOperation: {skip: isAnInternalCommand},
|
||||
_shardsvrGetStatsForBalancing: {skip: isAnInternalCommand},
|
||||
_shardsvrCheckCanConnectToConfigServer: {skip: isAnInternalCommand},
|
||||
_shardsvrJoinDDLCoordinators: {skip: isAnInternalCommand},
|
||||
_shardsvrJoinMigrations: {skip: isAnInternalCommand},
|
||||
_shardsvrMergeAllChunksOnShard: {skip: isAnInternalCommand},
|
||||
|
||||
@ -107,6 +107,7 @@ const allCommands = {
|
||||
_shardsvrDropIndexes: {skip: isAnInternalCommand},
|
||||
_shardsvrCreateCollectionParticipant: {skip: isPrimaryOnly},
|
||||
_shardsvrGetStatsForBalancing: {skip: isPrimaryOnly},
|
||||
_shardsvrCheckCanConnectToConfigServer: {skip: isPrimaryOnly},
|
||||
_shardsvrJoinMigrations: {skip: isAnInternalCommand},
|
||||
_shardsvrJoinDDLCoordinators: {skip: isPrimaryOnly},
|
||||
_shardsvrMergeAllChunksOnShard: {skip: isPrimaryOnly},
|
||||
|
||||
@ -114,6 +114,7 @@ const allCommands = {
|
||||
_shardsvrCoordinateMultiUpdate: {skip: isAnInternalCommand},
|
||||
_shardsvrEndMigrationBlockingOperation: {skip: isAnInternalCommand},
|
||||
_shardsvrGetStatsForBalancing: {skip: isAnInternalCommand},
|
||||
_shardsvrCheckCanConnectToConfigServer: {skip: isAnInternalCommand},
|
||||
_shardsvrJoinDDLCoordinators: {skip: isAnInternalCommand},
|
||||
_shardsvrJoinMigrations: {skip: isAnInternalCommand},
|
||||
_shardsvrMergeAllChunksOnShard: {skip: isAnInternalCommand},
|
||||
|
||||
@ -178,6 +178,7 @@ let testCases = {
|
||||
_shardsvrDropDatabaseParticipant: {skip: "internal command"},
|
||||
_shardsvrEndMigrationBlockingOperation: {skip: "internal command"},
|
||||
_shardsvrGetStatsForBalancing: {skip: "internal command"},
|
||||
_shardsvrCheckCanConnectToConfigServer: {skip: "internal command"},
|
||||
_shardsvrJoinDDLCoordinators: {skip: "internal command"},
|
||||
_shardsvrJoinMigrations: {skip: "internal command"},
|
||||
_shardsvrMergeAllChunksOnShard: {skip: "internal command"},
|
||||
|
||||
@ -23,3 +23,4 @@ mixedShardTest(allowTLS, preferTLS, true);
|
||||
|
||||
print("=== Testing allowTLS/requireTLS cluster - SHOULD FAIL ===");
|
||||
mixedShardTest(allowTLS, requireTLS, false);
|
||||
mixedShardTest(requireTLS, allowTLS, false);
|
||||
|
||||
@ -15,6 +15,7 @@ print("=== Testing disabled cluster ===");
|
||||
mixedShardTest(disabled, disabled, true);
|
||||
|
||||
print("=== Testing disabled/preferTLS cluster - SHOULD FAIL ===");
|
||||
mixedShardTest(disabled, preferTLS, false);
|
||||
mixedShardTest(preferTLS, disabled, false);
|
||||
|
||||
print("=== Testing allowTLS/disabled cluster ===");
|
||||
|
||||
@ -1387,6 +1387,7 @@ mongo_cc_library(
|
||||
"shardsvr_abort_reshard_collection_command.cpp",
|
||||
"shardsvr_add_shard_cmd.cpp",
|
||||
"shardsvr_change_primary_command.cpp",
|
||||
"shardsvr_check_can_connect_to_config_server_cmd.cpp",
|
||||
"shardsvr_check_metadata_consistency_command.cpp",
|
||||
"shardsvr_check_metadata_consistency_participant_command.cpp",
|
||||
"shardsvr_cleanup_reshard_collection_command.cpp",
|
||||
|
||||
@ -320,3 +320,6 @@ filters:
|
||||
- "*topology_change*":
|
||||
approvers:
|
||||
- 10gen/server-catalog-and-routing
|
||||
- "*check_can_connect_to*":
|
||||
approvers:
|
||||
- 10gen/server-catalog-and-routing
|
||||
|
||||
@ -36,6 +36,7 @@
|
||||
#include "mongo/db/s/topology_change_helpers.h"
|
||||
#include "mongo/db/s/user_writes_critical_section_document_gen.h"
|
||||
#include "mongo/db/vector_clock_mutable.h"
|
||||
#include "mongo/s/request_types/add_shard_gen.h"
|
||||
#include "mongo/util/assert_util.h"
|
||||
#include "src/mongo/db/list_collections_gen.h"
|
||||
|
||||
@ -87,9 +88,8 @@ ExecutorFuture<void> AddShardCoordinator::_runImpl(
|
||||
executor,
|
||||
token);
|
||||
} catch (const DBException&) {
|
||||
// if we are not able to validate the host as a shard after
|
||||
// multiple try, we don't want to continue, so we remove
|
||||
// the replicaset monitor and give up.
|
||||
// If we are not able to validate the host as a shard after multiple tries, we
|
||||
// don't want to continue, so we remove the replicaset monitor and give up.
|
||||
topology_change_helpers::removeReplicaSetMonitor(opCtx,
|
||||
_doc.getConnectionString());
|
||||
_completeOnError = true;
|
||||
@ -111,6 +111,44 @@ ExecutorFuture<void> AddShardCoordinator::_runImpl(
|
||||
_checkExistingDataOnShard(opCtx, targeter, **executor);
|
||||
}
|
||||
|
||||
// (Generic FCV reference): These FCV checks should exist across LTS binary
|
||||
// versions.
|
||||
const auto currentFCV =
|
||||
serverGlobalParams.featureCompatibility.acquireFCVSnapshot().getVersion();
|
||||
invariant(currentFCV == multiversion::GenericFCV::kLatest ||
|
||||
currentFCV == multiversion::GenericFCV::kLastContinuous ||
|
||||
currentFCV == multiversion::GenericFCV::kLastLTS);
|
||||
|
||||
_setFCVOnReplicaSet(opCtx, currentFCV, **executor);
|
||||
|
||||
const auto host = uassertStatusOK(
|
||||
Grid::get(opCtx)->shardRegistry()->getConfigShard()->getTargeter()->findHost(
|
||||
opCtx, ReadPreferenceSetting{ReadPreference::PrimaryOnly}));
|
||||
|
||||
try {
|
||||
_runWithRetries(
|
||||
[&]() {
|
||||
ShardsvrCheckCanConnectToConfigServer cmd(host);
|
||||
cmd.setDbName(DatabaseName::kAdmin);
|
||||
uassertStatusOK(
|
||||
topology_change_helpers::runCommandForAddShard(opCtx,
|
||||
_getTargeter(opCtx),
|
||||
DatabaseName::kAdmin,
|
||||
cmd.toBSON(),
|
||||
**executor)
|
||||
.commandStatus);
|
||||
},
|
||||
executor,
|
||||
token);
|
||||
} catch (const DBException&) {
|
||||
// If the replica set is not able to contact us after multiple tries, we don't
|
||||
// want to continue, so we remove the replicaset monitor and give up.
|
||||
topology_change_helpers::removeReplicaSetMonitor(opCtx,
|
||||
_doc.getConnectionString());
|
||||
_completeOnError = true;
|
||||
throw;
|
||||
}
|
||||
|
||||
std::string shardName =
|
||||
topology_change_helpers::createShardName(opCtx,
|
||||
_getTargeter(opCtx),
|
||||
@ -126,16 +164,6 @@ ExecutorFuture<void> AddShardCoordinator::_runImpl(
|
||||
[this, _ = shared_from_this(), executor](auto* opCtx) {
|
||||
auto& targeter = _getTargeter(opCtx);
|
||||
|
||||
// (Generic FCV reference): These FCV checks should exist across LTS binary
|
||||
// versions.
|
||||
const auto currentFCV =
|
||||
serverGlobalParams.featureCompatibility.acquireFCVSnapshot().getVersion();
|
||||
invariant(currentFCV == multiversion::GenericFCV::kLatest ||
|
||||
currentFCV == multiversion::GenericFCV::kLastContinuous ||
|
||||
currentFCV == multiversion::GenericFCV::kLastLTS);
|
||||
|
||||
_setFCVOnReplicaSet(opCtx, currentFCV, **executor);
|
||||
|
||||
_dropSessionsCollection(opCtx, **executor);
|
||||
|
||||
topology_change_helpers::getClusterTimeKeysFromReplicaSet(
|
||||
@ -416,6 +444,9 @@ void AddShardCoordinator::_runWithRetries(std::function<void()>&& function,
|
||||
if (status.isOK()) {
|
||||
return true;
|
||||
}
|
||||
if (!_isRetriableErrorForDDLCoordinator(status)) {
|
||||
return true;
|
||||
}
|
||||
failCounter++;
|
||||
if (failCounter >= kMaxFailedRetryCount) {
|
||||
return true;
|
||||
|
||||
@ -0,0 +1,118 @@
|
||||
/**
|
||||
* Copyright (C) 2025-present MongoDB, Inc.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the Server Side Public License, version 1,
|
||||
* as published by MongoDB, Inc.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* Server Side Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the Server Side Public License
|
||||
* along with this program. If not, see
|
||||
* <http://www.mongodb.com/licensing/server-side-public-license>.
|
||||
*
|
||||
* As a special exception, the copyright holders give permission to link the
|
||||
* code of portions of this program with the OpenSSL library under certain
|
||||
* conditions as described in each individual source file and distribute
|
||||
* linked combinations including the program with the OpenSSL library. You
|
||||
* must comply with the Server Side Public License in all respects for
|
||||
* all of the code used other than as permitted herein. If you modify file(s)
|
||||
* with this exception, you may extend this exception to your version of the
|
||||
* file(s), but you are not obligated to do so. If you do not wish to do so,
|
||||
* delete this exception statement from your version. If you delete this
|
||||
* exception statement from all source files in the program, then also delete
|
||||
* it in the license file.
|
||||
*/
|
||||
|
||||
#include "mongo/db/auth/authorization_session.h"
|
||||
#include "mongo/db/commands.h"
|
||||
#include "mongo/db/repl/hello/hello_gen.h"
|
||||
#include "mongo/executor/async_rpc.h"
|
||||
#include "mongo/executor/network_interface_factory.h"
|
||||
#include "mongo/executor/thread_pool_task_executor.h"
|
||||
#include "mongo/s/request_types/add_shard_gen.h"
|
||||
|
||||
#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kSharding
|
||||
|
||||
namespace mongo {
|
||||
namespace {
|
||||
|
||||
class ShardsvrCheckCanConnectToConfigServerCommand
|
||||
: public TypedCommand<ShardsvrCheckCanConnectToConfigServerCommand> {
|
||||
public:
|
||||
using Request = ShardsvrCheckCanConnectToConfigServer;
|
||||
|
||||
ShardsvrCheckCanConnectToConfigServerCommand() : TypedCommand(Request::kCommandName) {}
|
||||
|
||||
class Invocation final : public InvocationBase {
|
||||
public:
|
||||
using InvocationBase::InvocationBase;
|
||||
|
||||
void typedRun(OperationContext* opCtx) {
|
||||
opCtx->setAlwaysInterruptAtStepDownOrUp_UNSAFE();
|
||||
|
||||
HelloCommand helloCmd;
|
||||
helloCmd.setDbName(DatabaseName::kAdmin);
|
||||
|
||||
ConnectionString cstr(request().getCommandParameter());
|
||||
|
||||
auto taskExecutor = executor::ThreadPoolTaskExecutor::create(
|
||||
std::make_unique<ThreadPool>(ThreadPool::Options{}),
|
||||
executor::makeNetworkInterface("HelloMe-TaskExecutor"));
|
||||
taskExecutor->startup();
|
||||
|
||||
auto options = std::make_shared<async_rpc::AsyncRPCOptions<HelloCommand>>(
|
||||
taskExecutor, opCtx->getCancellationToken(), helloCmd);
|
||||
|
||||
try {
|
||||
async_rpc::sendCommand<HelloCommand>(options, opCtx, cstr).get();
|
||||
} catch (const ExceptionFor<ErrorCodes::RemoteCommandExecutionError>& ex) {
|
||||
uassertStatusOK(async_rpc::unpackRPCStatus(ex.toStatus()));
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
bool supportsWriteConcern() const override {
|
||||
return false;
|
||||
}
|
||||
|
||||
// The command parameter happens to be string so it's historically been interpreted
|
||||
// by parseNs as a collection. Continuing to do so here for unexamined compatibility.
|
||||
NamespaceString ns() const override {
|
||||
return NamespaceString(request().getDbName());
|
||||
}
|
||||
|
||||
void doCheckAuthorization(OperationContext* opCtx) const override {
|
||||
uassert(ErrorCodes::Unauthorized,
|
||||
"Unauthorized",
|
||||
AuthorizationSession::get(opCtx->getClient())
|
||||
->isAuthorizedForActionsOnResource(
|
||||
ResourcePattern::forClusterResource(request().getDbName().tenantId()),
|
||||
ActionType::internal));
|
||||
}
|
||||
};
|
||||
|
||||
bool skipApiVersionCheck() const override {
|
||||
// Internal command (server to server).
|
||||
return true;
|
||||
}
|
||||
|
||||
std::string help() const override {
|
||||
return "Internal command, that tries to contact the host specified in the parameter.";
|
||||
}
|
||||
|
||||
AllowedOnSecondary secondaryAllowed(ServiceContext*) const override {
|
||||
return AllowedOnSecondary::kNever;
|
||||
}
|
||||
|
||||
bool adminOnly() const override {
|
||||
return true;
|
||||
}
|
||||
};
|
||||
MONGO_REGISTER_COMMAND(ShardsvrCheckCanConnectToConfigServerCommand).forShard();
|
||||
|
||||
} // namespace
|
||||
} // namespace mongo
|
||||
@ -32,6 +32,7 @@ global:
|
||||
imports:
|
||||
- "mongo/db/basic_types.idl"
|
||||
- "mongo/db/s/type_shard_identity.idl"
|
||||
- "mongo/util/net/hostandport.idl"
|
||||
|
||||
structs:
|
||||
AddShardRequestBase:
|
||||
@ -88,3 +89,14 @@ commands:
|
||||
shardIdentity:
|
||||
description: "Identity metadata for the new shard"
|
||||
type: ShardIdentity
|
||||
|
||||
# Internal command where the config server expects the shard to send a hello command to the
|
||||
# config server. This is used to find out if the contact could be done both ways (without
|
||||
# network or security issues).
|
||||
_shardsvrCheckCanConnectToConfigServer:
|
||||
command_name: _shardsvrCheckCanConnectToConfigServer
|
||||
cpp_name: ShardsvrCheckCanConnectToConfigServer
|
||||
description: "Internal command where the config server expects the shard to send a hello command to the config server. This is used to find out if the contact could be done both ways (without network or security issue)"
|
||||
namespace: type
|
||||
type: HostAndPort
|
||||
api_version: ""
|
||||
|
||||
Loading…
Reference in New Issue
Block a user