SERVER-110412 Add errorCode parameter to killOp command (#44110)

Co-authored-by: Gabriel Marks <gabriel.marks@mongodb.com>
GitOrigin-RevId: d7418ad53a3de814981229e45cb933b6736554ba
This commit is contained in:
Ivan Fefer 2025-11-21 11:16:57 +01:00 committed by MongoDB Bot
parent 7b9c390b62
commit 10a2eed9b3
15 changed files with 270 additions and 44 deletions

View File

@ -0,0 +1,90 @@
/**
* Test that a user may only override killOp error code if they have the proper privileges.
*
* @tags: [requires_fcv_83, requires_sharding]
*/
import {configureFailPoint} from "jstests/libs/fail_point_util.js";
import {FixtureHelpers} from "jstests/libs/fixture_helpers.js";
import {ShardingTest} from "jstests/libs/shardingtest.js";
function runTest(m, failPointName) {
const db = m.getDB("foo");
const admin = m.getDB("admin");
admin.createUser({user: "admin", pwd: "password", roles: jsTest.adminUserRoles});
admin.auth("admin", "password");
const logReader = {db: "admin", role: "clusterMonitor"};
db.createUser({user: "reader", pwd: "reader", roles: [{db: "foo", role: "read"}, logReader]});
admin.createRole({
role: "opAdmin",
roles: [],
privileges: [{resource: {cluster: true}, actions: ["inprog", "killop"]}],
});
db.createUser({user: "opAdmin", pwd: "opAdmin", roles: [{role: "opAdmin", db: "admin"}]});
const t = db.killop_error_code;
t.insertOne({x: 1});
if (!FixtureHelpers.isMongos(db)) {
assert.commandWorked(db.adminCommand({setParameter: 1, internalQueryExecYieldIterations: 1}));
}
admin.logout();
// Only used for nice error messages.
function getAllLocalOps() {
return admin.aggregate([{$currentOp: {allUsers: true, localOps: true}}]).toArray();
}
function getExpectedOpIds() {
return admin
.aggregate([{$currentOp: {localOps: true}}])
.toArray()
.filter((op) => op.command.comment === "killop_error_code")
.map((op) => op.opid);
}
let queryAsReader =
'db = db.getSiblingDB("foo"); db.auth("reader", "reader"); assert.commandFailedWithCode(db.runCommand({find: "killop_error_code", comment: "killop_error_code"}), ErrorCodes.InterruptedDueToOverload);';
jsTest.log.info("Starting long-running operation");
db.auth("reader", "reader");
const failpoint = configureFailPoint(m, failPointName);
const query = startParallelShell(queryAsReader, m.port);
jsTest.log.info("Finding ops in $currentOp output");
assert.soon(
() => getExpectedOpIds().length === 1,
() => tojson(getAllLocalOps()),
);
const current_op_id = getExpectedOpIds()[0];
jsTest.log.info("Checking that the user cannot kill the op with a custom error code");
assert.commandFailedWithCode(
db.adminCommand({killOp: 1, op: current_op_id, errorCode: ErrorCodes.InterruptedDueToOverload}),
ErrorCodes.Unauthorized,
);
db.logout();
db.auth("opAdmin", "opAdmin");
jsTest.log.info("Checking that an administrative user can kill the op only with a valid custom error code");
assert.commandFailedWithCode(
db.adminCommand({killOp: 1, op: current_op_id, errorCode: ErrorCodes.DuplicateKey}),
ErrorCodes.Unauthorized,
);
assert.commandWorked(
db.adminCommand({killOp: 1, op: current_op_id, errorCode: ErrorCodes.InterruptedDueToOverload}),
);
db.logout();
failpoint.off();
query();
}
let conn = MongoRunner.runMongod({auth: ""});
runTest(conn, "setYieldAllLocksHang");
MongoRunner.stopMongod(conn);
let st = new ShardingTest({shards: 1, keyFile: "jstests/libs/key1"});
runTest(st.s, "waitInFindBeforeMakingBatch");
st.stop();

View File

@ -5756,6 +5756,58 @@ export const authCommandsLib = {
{runOnDb: secondDbName, roles: {}},
],
},
{
testname: "killOpWrongErrorCode",
command: {killOp: 1, op: 123, errorCode: ErrorCodes.DuplicateKey},
skipSharded: true,
skipTest: (conn) => !isFeatureEnabled(conn, "featureFlagKillOpErrorCodeOverride"),
testcases: [
{
runOnDb: adminDbName,
roles: {},
},
],
},
{
testname: "killOpWrongErrorCode",
command: {killOp: 1, op: "shard1:123", errorCode: ErrorCodes.DuplicateKey},
skipUnlessSharded: true,
skipTest: (conn) => !isFeatureEnabled(conn, "featureFlagKillOpErrorCodeOverride"),
testcases: [
{
runOnDb: adminDbName,
roles: {},
expectFail: true, // we won't be able to find the shardId
},
],
},
{
testname: "killOpErrorCode",
command: {killOp: 1, op: 123, errorCode: ErrorCodes.InterruptedDueToOverload},
skipSharded: true,
skipTest: (conn) => !isFeatureEnabled(conn, "featureFlagKillOpErrorCodeOverride"),
testcases: [
{
runOnDb: adminDbName,
roles: roles_hostManager,
privileges: [{resource: {cluster: true}, actions: ["killop"]}],
},
],
},
{
testname: "killOpErrorCode",
command: {killOp: 1, op: "shard1:123", errorCode: ErrorCodes.InterruptedDueToOverload},
skipUnlessSharded: true,
skipTest: (conn) => !isFeatureEnabled(conn, "featureFlagKillOpErrorCodeOverride"),
testcases: [
{
runOnDb: adminDbName,
roles: roles_hostManager,
privileges: [{resource: {cluster: true}, actions: ["killop"]}],
expectFail: true, // we won't be able to find the shardId
},
],
},
// The rest of kill sessions auth testing is in the kill_sessions fixture (because calling
// the commands logged in as a different user needs to have different results). These tests
// merely verify that the hostManager is the only role with killAnySession.

View File

@ -4,6 +4,7 @@
import {waitForCurOpByFailPointNoNS} from "jstests/libs/curop_helpers.js";
import {configureFailPoint} from "jstests/libs/fail_point_util.js";
import {ShardingTest} from "jstests/libs/shardingtest.js";
import {FeatureFlagUtil} from "jstests/libs/feature_flag_util.js";
const st = new ShardingTest({shards: 2});
const conn = st.s;
@ -13,38 +14,50 @@ const db = conn.getDB("killOp");
const coll = db.test;
assert.commandWorked(db.getCollection(coll.getName()).insert({x: 1}));
const kFailPointName = "waitInFindBeforeMakingBatch";
const fp = configureFailPoint(conn, kFailPointName);
function runKillOpTest(expectedErrorCode) {
const kFailPointName = "waitInFindBeforeMakingBatch";
const fp = configureFailPoint(conn, kFailPointName);
const queryToKill =
`assert.commandFailedWithCode(db.getSiblingDB('${db.getName()}')` +
`.runCommand({find: '${coll.getName()}', filter: {x: 1}}), ErrorCodes.Interrupted);`;
const awaitShell = startParallelShell(queryToKill, conn.port);
const queryToKill =
`assert.commandFailedWithCode(db.getSiblingDB('${db.getName()}')` +
`.runCommand({find: '${coll.getName()}', filter: {x: 1}}), ${expectedErrorCode});`;
const awaitShell = startParallelShell(queryToKill, conn.port);
const curOpFilter = {
ns: coll.getFullName(),
"command.filter": {x: 1},
};
const curOpFilter = {
ns: coll.getFullName(),
"command.filter": {x: 1},
};
// Wait for the operation to start.
const curOps = waitForCurOpByFailPointNoNS(db, kFailPointName, curOpFilter, {localOps: true});
const opId = curOps[0].opid;
// Wait for the operation to start.
const curOps = waitForCurOpByFailPointNoNS(db, kFailPointName, curOpFilter, {localOps: true});
const opId = curOps[0].opid;
// Kill the operation.
assert.commandWorked(db.killOp(opId));
// Kill the operation.
const command = {killOp: 1, op: opId};
if (expectedErrorCode !== ErrorCodes.Interrupted) {
command.errorCode = expectedErrorCode;
}
assert.commandWorked(adminDB.runCommand(command));
// Ensure that the operation gets marked kill pending while it's still hanging.
let result = adminDB.aggregate([{$currentOp: {localOps: true}}, {$match: curOpFilter}]).toArray();
assert(result.length === 1, tojson(result));
assert(result[0].hasOwnProperty("killPending"));
assert.eq(true, result[0].killPending);
// Ensure that the operation gets marked kill pending while it's still hanging.
let result = adminDB.aggregate([{$currentOp: {localOps: true}}, {$match: curOpFilter}]).toArray();
assert(result.length === 1, tojson(result));
assert(result[0].hasOwnProperty("killPending"));
assert.eq(true, result[0].killPending);
// Release the failpoint. The operation should check for interrupt and then finish.
fp.off();
// Release the failpoint. The operation should check for interrupt and then finish.
fp.off();
awaitShell();
awaitShell();
result = adminDB.aggregate([{$currentOp: {localOps: true}}, {$match: curOpFilter}]).toArray();
assert(result.length === 0, tojson(result));
result = adminDB.aggregate([{$currentOp: {localOps: true}}, {$match: curOpFilter}]).toArray();
assert(result.length === 0, tojson(result));
}
runKillOpTest(ErrorCodes.Interrupted);
if (FeatureFlagUtil.isPresentAndEnabled(conn, "KillOpErrorCodeOverride")) {
runKillOpTest(ErrorCodes.InterruptedDueToOverload);
}
st.stop();

View File

@ -980,6 +980,11 @@ error_codes:
# Bad BSON column format found during decompression
- {code: 472, name: InvalidBSONColumn, categories: [ValidationError]}
- {
code: 473,
name: InterruptedDueToOverload,
categories: [Interruption, SystemOverloadedError],
}
# Error codes 4000-8999 are reserved.

View File

@ -48,11 +48,12 @@ public:
const BSONObj& cmdObj,
BSONObjBuilder& result) final {
long long opId = KillOpCmdBase::parseOpId(cmdObj);
ErrorCodes::Error errorCode = KillOpCmdBase::parseErrorCode(opCtx, cmdObj);
// Used by tests to check if auth checks passed.
result.append("info", "attempting to kill op");
LOGV2(20482, "Going to kill op", "opId"_attr = opId);
KillOpCmdBase::killLocalOperation(opCtx, opId);
KillOpCmdBase::killLocalOperation(opCtx, opId, errorCode);
reportSuccessfulCompletion(opCtx, dbName, cmdObj);
// killOp always reports success once past the auth check.

View File

@ -36,6 +36,7 @@
#include "mongo/db/auth/user_name.h"
#include "mongo/db/client.h"
#include "mongo/db/operation_killer.h"
#include "mongo/db/server_feature_flags_gen.h"
#include "mongo/logv2/attribute_storage.h"
#include "mongo/logv2/log.h"
#include "mongo/rpc/metadata/client_metadata.h"
@ -50,9 +51,17 @@
#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kCommand
namespace mongo {
namespace {
const static absl::flat_hash_set<ErrorCodes::Error> kAllowedKillOpErrorCodes = {
ErrorCodes::Interrupted,
ErrorCodes::InterruptedDueToOverload,
};
} // namespace
void KillOpCmdBase::reportSuccessfulCompletion(OperationContext* opCtx,
const DatabaseName& dbName,
const BSONObj& cmdObj) {
@ -91,9 +100,17 @@ Status KillOpCmdBase::checkAuthForOperation(OperationContext* workerOpCtx,
auto* worker = workerOpCtx->getClient();
auto opKiller = OperationKiller(worker);
ErrorCodes::Error errorCode = parseErrorCode(workerOpCtx, cmdObj);
if (!kAllowedKillOpErrorCodes.contains(errorCode)) {
return Status(ErrorCodes::Unauthorized, "Unauthorized");
}
if (opKiller.isGenerallyAuthorizedToKill()) {
return Status::OK();
}
// Only generally authorized to kill users can specify a custom error code.
if (errorCode != kDefaultErrorCode) {
return Status(ErrorCodes::Unauthorized, "Unauthorized");
}
if (isKillingLocalOp(cmdObj.getField("op"))) {
// Look up the OperationContext and see if we have permission to kill it. This is done once
@ -102,7 +119,7 @@ Status KillOpCmdBase::checkAuthForOperation(OperationContext* workerOpCtx,
long long opId = parseOpId(cmdObj);
auto target = worker->getServiceContext()->getLockedClient(opId);
if (OperationKiller(worker).isAuthorizedToKill(target)) {
if (opKiller.isAuthorizedToKill(target)) {
// We were authorized to interact with the target Client
return Status::OK();
}
@ -111,8 +128,10 @@ Status KillOpCmdBase::checkAuthForOperation(OperationContext* workerOpCtx,
return Status(ErrorCodes::Unauthorized, "Unauthorized");
}
void KillOpCmdBase::killLocalOperation(OperationContext* opCtx, OperationId opToKill) {
OperationKiller(opCtx->getClient()).killOperation(opToKill);
void KillOpCmdBase::killLocalOperation(OperationContext* opCtx,
OperationId opToKill,
ErrorCodes::Error killCode) {
OperationKiller(opCtx->getClient()).killOperation(opToKill, killCode);
}
bool KillOpCmdBase::isKillingLocalOp(const BSONElement& opElem) {
@ -130,4 +149,15 @@ unsigned int KillOpCmdBase::parseOpId(const BSONObj& cmdObj) {
return static_cast<unsigned int>(op);
}
ErrorCodes::Error KillOpCmdBase::parseErrorCode(OperationContext* opCtx, const BSONObj& cmdObj) {
if (gFeatureFlagKillOpErrorCodeOverride.isEnabledUseLatestFCVWhenUninitialized(
VersionContext::getDecoration(opCtx),
serverGlobalParams.featureCompatibility.acquireFCVSnapshot())) {
if (auto bsonErrorCode = cmdObj.getField("errorCode")) {
return ErrorCodes::Error{bsonErrorCode.numberInt()};
}
}
return kDefaultErrorCode;
}
} // namespace mongo

View File

@ -68,10 +68,14 @@ public:
const BSONObj& cmdObj) const final;
protected:
static constexpr ErrorCodes::Error kDefaultErrorCode = ErrorCodes::Interrupted;
/**
* Kill an operation running on this instance of mongod or mongos.
*/
static void killLocalOperation(OperationContext* opCtx, OperationId opToKill);
static void killLocalOperation(OperationContext* opCtx,
OperationId opToKill,
ErrorCodes::Error killCode);
/**
* Extract the "op" field from 'cmdObj' and convert the value to unsigned int. Since BSON only
@ -81,6 +85,12 @@ protected:
*/
static unsigned int parseOpId(const BSONObj& cmdObj);
/**
* Extract the "errorCode" field from 'cmdObj' and convert the value to ErrorCodes::Error. If
* the field is missing, will return the default kill code for the command.
*/
static ErrorCodes::Error parseErrorCode(OperationContext* opCtx, const BSONObj& cmdObj);
static void reportSuccessfulCompletion(OperationContext* opCtx,
const DatabaseName& dbName,
const BSONObj& cmdObj);

View File

@ -2051,6 +2051,8 @@ private:
opCtx);
} catch (const ExceptionFor<ErrorCodes::Interrupted>&) {
throw;
} catch (const ExceptionFor<ErrorCodes::InterruptedDueToOverload>&) {
throw;
} catch (const DBException& ex) {
uasserted(ErrorCodes::TemporarilyUnavailable,
str::stream()

View File

@ -617,8 +617,8 @@ bool ShardingDDLCoordinator::_isRetriableErrorForDDLCoordinator(const Status& st
status.isA<ErrorCategory::CancellationError>() ||
status.isA<ErrorCategory::ExceededTimeLimitError>() ||
status.isA<ErrorCategory::WriteConcernError>() ||
status == ErrorCodes::FailedToSatisfyReadPreference || status == ErrorCodes::Interrupted ||
status == ErrorCodes::LockBusy || status == ErrorCodes::CommandNotFound;
status == ErrorCodes::FailedToSatisfyReadPreference || status == ErrorCodes::LockBusy ||
status == ErrorCodes::CommandNotFound;
}
ShardingDDLCoordinatorExternalState* ShardingDDLCoordinator::_getExternalState() {

View File

@ -65,7 +65,7 @@ bool OperationKiller::isAuthorizedToKill(const ClientLock& target) const {
return false;
}
void OperationKiller::killOperation(OperationId opId) {
void OperationKiller::killOperation(OperationId opId, ErrorCodes::Error killCode) {
auto serviceContext = _myClient->getServiceContext();
auto target = serviceContext->getLockedClient(opId);
@ -79,12 +79,12 @@ void OperationKiller::killOperation(OperationId opId) {
return;
}
serviceContext->killOperation(target, target->getOperationContext());
serviceContext->killOperation(target, target->getOperationContext(), killCode);
LOGV2(20884, "Killed operation", "opId"_attr = opId);
}
void OperationKiller::killOperation(const OperationKey& opKey) {
void OperationKiller::killOperation(const OperationKey& opKey, ErrorCodes::Error killCode) {
auto opId = OperationKeyManager::get(_myClient).at(opKey);
if (!opId) {
@ -92,7 +92,7 @@ void OperationKiller::killOperation(const OperationKey& opKey) {
return;
}
killOperation(*opId);
killOperation(*opId, killCode);
}
} // namespace mongo

View File

@ -30,7 +30,6 @@
#pragma once
#include "mongo/db/client.h"
#include "mongo/db/operation_context.h"
#include "mongo/db/operation_id.h"
#include "mongo/db/service_context.h"
@ -61,8 +60,9 @@ public:
/**
* Kill an operation running on this instance of mongod or mongos.
*/
void killOperation(OperationId opId);
void killOperation(const OperationKey& opKey);
void killOperation(OperationId opId, ErrorCodes::Error killCode = ErrorCodes::Interrupted);
void killOperation(const OperationKey& opKey,
ErrorCodes::Error killCode = ErrorCodes::Interrupted);
private:
Client* const _myClient;

View File

@ -282,7 +282,8 @@ void CursorManager::unpin(OperationContext* opCtx,
// interesting in proactively cleaning up that cursor's resources. In these cases, we
// proactively delete the cursor. In other cases we preserve the error code so that the client
// will see the reason the cursor was killed when asking for the next batch.
if (interruptStatus == ErrorCodes::Interrupted || cursor->isKillPending()) {
if (interruptStatus == ErrorCodes::Interrupted ||
interruptStatus == ErrorCodes::InterruptedDueToOverload || cursor->isKillPending()) {
LOGV2(20530,
"Removing cursor after completing batch",
"cursorId"_attr = cursor->cursorid(),

View File

@ -216,3 +216,9 @@ feature_flags:
cpp_varname: gFeatureFlagDedicatedPortForMaintenanceOperations
default: false
fcv_gated: false
featureFlagKillOpErrorCodeOverride:
description: "Feature flag to enable specifying custom error codes when killing operations"
cpp_varname: gFeatureFlagKillOpErrorCodeOverride
default: true
version: 8.3
fcv_gated: true

View File

@ -1022,6 +1022,9 @@ void StorageEngineImpl::TimestampMonitor::_startup() {
LOGV2(6183601,
"Timestamp monitor got interrupted due to repl state change, retrying");
return;
} catch (const ExceptionFor<ErrorCodes::InterruptedDueToOverload>&) {
LOGV2(6183602, "Timestamp monitor got interrupted due to overload, retrying");
return;
} catch (const ExceptionFor<ErrorCodes::InterruptedAtShutdown>& ex) {
if (_shuttingDown) {
return;

View File

@ -67,16 +67,18 @@ public:
BSONElement element = cmdObj.getField("op");
uassert(50759, "Did not provide \"op\" field", element.ok());
ErrorCodes::Error errorCode = KillOpCmdBase::parseErrorCode(opCtx, cmdObj);
if (isKillingLocalOp(element)) {
const unsigned int opId = KillOpCmdBase::parseOpId(cmdObj);
killLocalOperation(opCtx, opId);
killLocalOperation(opCtx, opId, errorCode);
reportSuccessfulCompletion(opCtx, dbName, cmdObj);
// killOp always reports success once past the auth check.
return true;
} else if (element.type() == BSONType::string) {
// It's a string. Should be of the form shardid:opid.
if (_killShardOperation(opCtx, element.str(), result)) {
if (_killShardOperation(opCtx, element.str(), errorCode, result)) {
reportSuccessfulCompletion(opCtx, dbName, cmdObj);
return true;
} else {
@ -91,6 +93,7 @@ public:
private:
static bool _killShardOperation(OperationContext* opCtx,
const std::string& opToKill,
ErrorCodes::Error errorCode,
BSONObjBuilder& result) {
// The format of op is shardid:opid
// This is different than the format passed to the mongod killOp command.
@ -119,7 +122,7 @@ private:
result.append("shard", shardIdent);
result.append("shardid", opId);
auto cmdToSend = BSON("killOp" << 1 << "op" << opId);
BSONObj cmdToSend = _makeShardCommand(opId, errorCode);
shard
->runCommandWithIndefiniteRetries(opCtx,
ReadPreferenceSetting{ReadPreference::PrimaryOnly},
@ -133,6 +136,16 @@ private:
// whether the shard reported success or not.
return true;
}
static BSONObj _makeShardCommand(int opId, ErrorCodes::Error errorCode) {
BSONObjBuilder builder;
builder.append("killOp", 1);
builder.append("op", opId);
if (errorCode != kDefaultErrorCode) {
builder.append("errorCode", errorCode);
}
return builder.obj();
};
};
MONGO_REGISTER_COMMAND(ClusterKillOpCommand).forRouter();