SERVER-119393 Implement aliveness check (#53952)

GitOrigin-RevId: 24809ae7cd7c4117aa884be3b38fba10ecb2c396
This commit is contained in:
Henri Nikku 2026-05-21 09:20:26 +01:00 committed by MongoDB Bot
parent daec0b9e75
commit 1ba8dd0ddf
3 changed files with 413 additions and 1 deletions

View File

@ -36,8 +36,10 @@
#include "mongo/db/pipeline/document_source_single_document_transformation.h"
#include "mongo/db/pipeline/pipeline.h"
#include "mongo/db/query/compiler/dependency_analysis/document_transformation_helpers.h"
#include "mongo/util/dynamic_bitset.h"
#include "mongo/util/string_map.h"
#include <algorithm>
#include <utility>
#include <vector>
@ -537,6 +539,28 @@ struct CanPathBeArrayForNss {
};
} // namespace
namespace {
/// Scope guard that appends one dotted path component to 'buf' on construction and truncates
/// 'buf' back to its previous length on destruction. A '.' separator is inserted only when 'buf'
/// is non-empty. Modeled after FieldRef::FieldRefTempAppend.
///
/// The guard keeps a reference to 'buf', so 'buf' must outlive it. It is neither copyable nor
/// movable: a second instance would truncate the same buffer a second time.
class ScopedPathComponent {
public:
    ScopedPathComponent(std::string& buf, StringData component) : _buf(buf), _prevLen(buf.size()) {
        if (!_buf.empty()) {
            _buf.push_back('.');
        }
        _buf.append(component.data(), component.size());
    }

    ~ScopedPathComponent() {
        // Drops the component (and its separator, if one was added).
        _buf.resize(_prevLen);
    }

    ScopedPathComponent(const ScopedPathComponent&) = delete;
    ScopedPathComponent& operator=(const ScopedPathComponent&) = delete;

private:
    std::string& _buf;
    /// Length of '_buf' before this guard appended to it.
    const size_t _prevLen;
};
} // namespace
class DependencyGraph::Impl {
public:
explicit Impl(const DocumentSourceContainer& container,
@ -753,13 +777,134 @@ public:
BSONObj toBSON() const;
/// Implements DependencyGraph::getDeadFields. Sub-pipelines are not analyzed here.
void collectDeadFields(std::vector<DeadField>& out) const {
if (_stages.empty()) {
return;
}
auto alive = computeAliveFields();
std::string pathBuf;
for (StageId stageId{0}; stageId < _stages.getNextId(); stageId.value++) {
if (const auto& stage = _stages[stageId]; stage.isSingleDocumentTransformation) {
walkPotentiallyDeadFields(stage.scope, alive, pathBuf, out);
}
}
}
private:
using ParsedPath = boost::container::small_vector<StringPool::Id, 8>;
using ParsedPathView = std::span<StringPool::Id>;
using FieldList = boost::container::small_vector<FieldId, 8>;
using Bitset = DynamicBitset<size_t, 1>;
class Serializer;
/**
 * Computes a bitset indexed by FieldId value in which a set bit means the field's value is
 * needed: it is read directly by some stage, it is part of the final stage's output scope, or an
 * already-alive field lists it as a dependency.
 *
 * Two phases:
 *  1. Seed: mark the dependencies of every stage and every field of the last stage's scope.
 *  2. Propagate: pop newly-marked fields off a worklist and mark their dependencies in turn,
 *     until nothing new is marked.
 *
 * Returns an all-clear bitset when there are no stages.
 */
Bitset computeAliveFields() const {
    Bitset liveBits(_fields.size());
    if (_stages.empty()) {
        return liveBits;
    }

    // Fields marked alive whose own dependencies have not been visited yet.
    std::vector<FieldId> pending;

    // Marks 'id' and queues it for propagation; invalid or already-marked ids are ignored.
    const auto revive = [&](FieldId id) {
        if (!id || liveBits.test(id.value)) {
            return;
        }
        liveBits.set(id.value);
        pending.push_back(id);
    };

    // Marks every field registered in 'scope', plus the scope's 'missing' field, which is
    // always treated as alive.
    const auto reviveScope = [&](ScopeId scope) {
        if (!scope) {
            return;
        }
        const auto& scopeInfo = _scopes[scope];
        for (auto&& [name, id] : scopeInfo.fields) {
            revive(id);
        }
        revive(scopeInfo.missingField);
    };

    // Marks everything 'deps' refers to, where 'reader' is the stage doing the reading.
    const auto reviveDependencies = [&](const FieldDependencies& deps, StageId reader) {
        if (!deps.dependsOnWholeDocument()) {
            for (FieldId dep : deps) {
                revive(dep);
            }
            return;
        }
        // A whole-document read sees the scope produced by the stage before the reader. The
        // first stage reads the base document, whose fields have no FieldIds, so there is
        // nothing to mark in that case.
        if (reader > StageId(0)) {
            reviveScope(_stages[StageId{reader.value - 1}].scope);
        }
    };

    // Seed with what each stage reads directly, then with the pipeline's final output scope.
    for (StageId reader{0}; reader.value < static_cast<int32_t>(_stages.size());
         reader.value++) {
        reviveDependencies(_stages[reader].dependencies, reader);
    }
    reviveScope(_stages.back().scope);

    // Propagate: an alive field keeps its own dependencies alive. Those are resolved relative
    // to the stage that owns the field's declaring scope.
    while (!pending.empty()) {
        const FieldId current = pending.back();
        pending.pop_back();
        const auto& fieldInfo = _fields[current];
        reviveDependencies(fieldInfo.dependencies, _scopes[fieldInfo.declaringScope].stage);
    }
    return liveBits;
}
/// Appends to 'out' one DeadField per leaf field that was declared by the stage owning 'scopeId'
/// (directly or inside one of that stage's embedded scopes) and whose bit in 'alive' is clear.
/// Fields declared by a different stage are skipped. 'pathBuf' holds the dotted prefix of the
/// scope being visited; it is extended in place during recursion and restored before returning,
/// so a single string buffer serves the whole walk.
void walkPotentiallyDeadFields(ScopeId scopeId,
                               const Bitset& alive,
                               std::string& pathBuf,
                               std::vector<DeadField>& out) const {
    const auto& scopeInfo = _scopes[scopeId];
    const StageId owner = scopeInfo.stage;

    for (auto&& [nameId, fieldId] : scopeInfo.fields) {
        if (!fieldId) {
            continue;
        }
        const auto& fieldInfo = _fields[fieldId];
        if (_scopes[fieldInfo.declaringScope].stage != owner) {
            // Declared by some other stage; not this stage's field to report.
            continue;
        }

        const bool ownsNestedScope = fieldInfo.embeddedScope != ScopeId::none() &&
            _scopes[fieldInfo.embeddedScope].stage == owner;

        if (ownsNestedScope) {
            // An intermediate path component is never reported itself; only its leaves are.
            ScopedPathComponent nested{pathBuf, _strings.get(nameId)};
            walkPotentiallyDeadFields(fieldInfo.embeddedScope, alive, pathBuf, out);
            continue;
        }

        if (alive.test(fieldId.value)) {
            continue;
        }

        ScopedPathComponent leaf{pathBuf, _strings.get(nameId)};
        out.push_back(DeadField{_stages[owner].documentSource,
                                FieldPath{pathBuf,
                                          /*precomputeHashes*/ false,
                                          /*validateFieldNames*/ false}});
    }
}
/**
* Declares a scope (or embedded scope), which is defined by the given state and
@ -1690,7 +1835,6 @@ private:
absl::erase_if(_constants,
[invalidField](const auto& entry) { return entry.first >= invalidField; });
// Clean up for invalidated stages.
size_t subpipelinesToRemove = 0;
for (auto sid = invalidStage; sid < _stages.getNextId(); sid.value++) {
@ -1830,6 +1974,12 @@ std::string DependencyGraph::toDebugString() const {
return tojson(bson, ExtendedRelaxedV2_0_0, true /*pretty*/);
}
// Delegates to the implementation object, which appends its findings to the vector returned here.
std::vector<DeadField> DependencyGraph::getDeadFields() const {
    std::vector<DeadField> deadFields;
    _impl->collectDeadFields(deadFields);
    return deadFields;
}
class DependencyGraph::Impl::Serializer {
public:
Serializer(const DependencyGraph::Impl& graph) : _graph(graph) {}

View File

@ -33,6 +33,7 @@
#include "mongo/base/string_data.h"
#include "mongo/db/exec/document_value/value.h"
#include "mongo/db/pipeline/document_source.h"
#include "mongo/db/pipeline/field_path.h"
#include "mongo/util/modules.h"
#include <cstddef>
@ -62,6 +63,18 @@ struct DeclaringStageResult {
bool fromSubpipeline = false;
};
/**
* A "dead" field detected by the aliveness analysis, i.e. a field path that was introduced by a
* pipeline stage but whose value is never used by any downstream stage and does not appear
* in the pipeline's final output.
*
* A read by any downstream stage currently counts as a use, even if that reader's own output is
* dead (tracked by SERVER-127211).
*/
struct DeadField {
/// The stage which introduced this path.
boost::intrusive_ptr<mongo::DocumentSource> stage;
/// The field path that was introduced. Nested fields use the dotted form, e.g. "a.b".
FieldPath path;
};
/**
* Represents dependencies between fields and stages in a pipeline. Can be partially rebuilt when
* the pipeline changes.
@ -187,6 +200,19 @@ public:
*/
void resize(DocumentSourceContainer::const_iterator newEndIt);
/**
* Returns the set of "dead" fields introduced by single-document transformation stages
* that are guaranteed to never affect the pipeline output.
*
* The result is conservative; the order of the returned entries is not specified here.
*
* TODO(SERVER-127211): a field that is read by an intermediate stage is currently treated as
* alive even when everything that stage produces from it is itself dead. Such fields should
* eventually be reported as dead too.
*
* TODO(SERVER-127212): also walk sub-pipelines and return their dead fields. For now,
* sub-pipelines are not analyzed; call getSubpipelineGraph(stage)->getDeadFields() to
* inspect a sub-pipeline.
*/
std::vector<DeadField> getDeadFields() const;
std::string toDebugString() const;
BSONObj toBSON() const;

View File

@ -49,8 +49,10 @@
#include "mongo/dbtests/dbtests.h" // IWYU pragma: keep
#include "mongo/unittest/unittest.h"
#include <algorithm>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "absl/strings/str_split.h"
@ -92,6 +94,21 @@ protected:
setPipeline(std::move(p));
}
/**
 * Asserts that 'graph->getDeadFields()' reports exactly the given (stage, dotted path) pairs.
 * Both sides are sorted before comparison, so the order of 'expected' does not matter.
 */
void assertDeadFieldsEq(std::vector<std::pair<const DocumentSource*, std::string>> expected) {
    using StageAndPath = std::pair<const DocumentSource*, std::string>;

    const auto deadFields = graph->getDeadFields();

    std::vector<StageAndPath> actual;
    actual.reserve(deadFields.size());
    for (const auto& [stage, path] : deadFields) {
        actual.emplace_back(stage.get(), path.fullPath());
    }

    // Normalize ordering on both sides before comparing.
    std::sort(expected.begin(), expected.end());
    std::sort(actual.begin(), actual.end());
    ASSERT_EQ(actual, expected);
}
/**
* Runs the given assertions after rebuilding the graph from every stage.
*
@ -2777,5 +2794,224 @@ TEST_F(PipelineDependencyGraphTest, CanPathBeArrayDeepScalarShadow) {
});
}
TEST_F(PipelineDependencyGraphTest, AlivenessFlagsUnreadAddFields) {
    // 'foo' is added by the first stage, but the $group that follows only reads 'bar'.
    setPipeline("[{$addFields: {foo: 1}}, {$group: {_id: '$bar'}}]");
    runTest([&] {
        const DocumentSource* addFields = stages[0].get();
        assertDeadFieldsEq({{addFields, "foo"}});
    });
}
TEST_F(PipelineDependencyGraphTest, AlivenessKeepsAliveAddFields) {
    // Here $group reads the added 'foo', so nothing should be reported.
    setPipeline("[{$addFields: {foo: 1}}, {$group: {_id: '$foo'}}]");
    runTest([&] {
        assertDeadFieldsEq({});
    });
}
TEST_F(PipelineDependencyGraphTest, AlivenessPartialAddFields) {
    // Of the two added fields only 'bar' is read downstream; 'foo' is the one expected back.
    setPipeline("[{$addFields: {foo: 1, bar: 2}}, {$group: {_id: '$bar'}}]");
    runTest([&] {
        const DocumentSource* addFields = stages[0].get();
        assertDeadFieldsEq({{addFields, "foo"}});
    });
}
TEST_F(PipelineDependencyGraphTest, AlivenessUsedInMatchThenDropped) {
    // A $match on 'foo' followed by an inclusion projection of '_id': no dead fields expected.
    setPipeline("[{$match: {foo: 5}}, {$project: {_id: 1}}]");
    runTest([&] {
        assertDeadFieldsEq({});
    });
}
TEST_F(PipelineDependencyGraphTest, AlivenessOverwriteMakesEarlierDead) {
    // The second $set overwrites 'foo' before $group reads it, so only the first write is
    // expected to be reported.
    setPipeline("[{$set: {foo: 1}}, {$set: {foo: 2}}, {$group: {_id: '$foo'}}]");
    runTest([&] {
        const DocumentSource* firstSet = stages[0].get();
        assertDeadFieldsEq({{firstSet, "foo"}});
    });
}
TEST_F(PipelineDependencyGraphTest, AlivenessRenameTargetUnreadIsDead) {
    // 'bar' copies 'foo', but $group reads 'foo' and not 'bar', so only 'bar' is expected back.
    setPipeline("[{$set: {foo: 1}}, {$set: {bar: '$foo'}}, {$group: {_id: '$foo'}}]");
    runTest([&] {
        const DocumentSource* renameStage = stages[1].get();
        assertDeadFieldsEq({{renameStage, "bar"}});
    });
}
TEST_F(PipelineDependencyGraphTest, AlivenessRenameTargetAliveKeepsSource) {
    // $group reads 'bar', which is computed from 'foo'; neither field should be reported.
    setPipeline("[{$set: {foo: 1}}, {$set: {bar: '$foo'}}, {$group: {_id: '$bar'}}]");
    runTest([&] {
        assertDeadFieldsEq({});
    });
}
TEST_F(PipelineDependencyGraphTest, AlivenessFieldOnlyUsedByDeadFieldIsDead) {
    // 'foo' is read only to compute 'bar', and 'bar' itself is never read.
    setPipeline("[{$set: {foo: 1}}, {$set: {bar: '$foo'}}, {$group: {_id: '$baz'}}]");
    runTest([&] {
        // TODO(SERVER-127211): this should also include "foo" from the first stage.
        const DocumentSource* renameStage = stages[1].get();
        assertDeadFieldsEq({{renameStage, "bar"}});
    });
}
TEST_F(PipelineDependencyGraphTest, AlivenessFinalScopePreservation) {
    // No stage reads 'foo', but it is part of the pipeline's output and therefore alive.
    setPipeline("[{$set: {foo: 1}}]");
    runTest([&] {
        assertDeadFieldsEq({});
    });
}
TEST_F(PipelineDependencyGraphTest, AlivenessNestedPath) {
    // A dead nested field is reported under its full dotted path.
    setPipeline("[{$set: {'a.b': 1}}, {$group: {_id: '$x'}}]");
    runTest([&] {
        const DocumentSource* setStage = stages[0].get();
        assertDeadFieldsEq({{setStage, "a.b"}});
    });
}
TEST_F(PipelineDependencyGraphTest, AlivenessUnsetReported) {
    // The 'foo' entry produced by $unset is expected to be reported when the following
    // projection only keeps 'bar'.
    setPipeline("[{$unset: 'foo'}, {$project: {bar: 1}}]");
    runTest([&] {
        const DocumentSource* unsetStage = stages[0].get();
        assertDeadFieldsEq({{unsetStage, "foo"}});
    });
}
TEST_F(PipelineDependencyGraphTest, AlivenessExclusionProjectionReported) {
    // Same expectation as for $unset, with the removal spelled as an exclusion projection.
    setPipeline("[{$project: {foo: 0}}, {$project: {bar: 1}}]");
    runTest([&] {
        const DocumentSource* exclusion = stages[0].get();
        assertDeadFieldsEq({{exclusion, "foo"}});
    });
}
TEST_F(PipelineDependencyGraphTest, AlivenessUnwindNotReported) {
    // Nothing is expected to be reported for a $unwind whose output is discarded by $group.
    setPipeline("[{$unwind: '$arr'}, {$group: {_id: 1}}]");
    runTest([&] {
        assertDeadFieldsEq({});
    });
}
TEST_F(PipelineDependencyGraphTest, AlivenessGroupIdRenameNotReported) {
    // $group emits the _id rename via kAllExcept. Since $group is not a single-document
    // transformation, its _id is not reported as dead even though the projection drops it.
    setPipeline("[{$group: {_id: '$foo'}}, {$project: {bar: 1}}]");
    runTest([&] {
        assertDeadFieldsEq({});
    });
}
TEST_F(PipelineDependencyGraphTest, AlivenessReturnsAllDeadAtOnce) {
    // Each $set writes an unrelated field that nothing reads; one call reports both.
    setPipeline("[{$set: {foo: 1}}, {$set: {bar: 2}}, {$group: {_id: '$baz'}}]");
    runTest([&] {
        const DocumentSource* firstSet = stages[0].get();
        const DocumentSource* secondSet = stages[1].get();
        assertDeadFieldsEq({{firstSet, "foo"}, {secondSet, "bar"}});
    });
}
TEST_F(PipelineDependencyGraphTest, AlivenessSubpipelineNotAnalyzedAtTopLevel) {
// 'deadInner' is written and then dropped by $group inside the sub-pipeline, but the
// top-level analysis does not recurse into it.
setPipeline(R"([{$lookup: {
from: "coll_b",
localField: "a",
foreignField: "b",
as: "docs",
let: {},
pipeline: [
{$set: {deadInner: 1}},
{$group: {_id: "$other"}}
]
}}])");
runTest([&] {
// The outer graph reports nothing for the $lookup stage itself.
assertDeadFieldsEq({});
// The dead field only shows up when the sub-pipeline's own graph is queried.
const auto* subGraph = graph->getSubpipelineGraph(stages[0].get());
ASSERT_NOT_EQUALS(subGraph, nullptr);
auto dead = subGraph->getDeadFields();
ASSERT_EQ(dead.size(), 1u);
ASSERT_EQ(dead[0].path.fullPath(), "deadInner");
});
}
TEST_F(PipelineDependencyGraphTest, AlivenessSubpipelineAliveFieldIsNotReported) {
// 'extra' survives to the sub-pipeline's final scope, so it is alive in the sub-graph too.
setPipeline(R"([{$lookup: {
from: "coll_b",
localField: "a",
foreignField: "b",
as: "docs",
let: {},
pipeline: [{$set: {extra: 1}}]
}}])");
runTest([&] {
// Neither the outer graph nor the sub-pipeline's graph should report anything.
assertDeadFieldsEq({});
const auto* subGraph = graph->getSubpipelineGraph(stages[0].get());
ASSERT_NOT_EQUALS(subGraph, nullptr);
ASSERT_EQ(subGraph->getDeadFields().size(), 0u);
});
}
TEST_F(PipelineDependencyGraphTest, AlivenessRenameFromBaseCollectionFieldAliveIsNotDead) {
    // 'bar' is a rename of the base-collection field 'foo', and $group reads 'bar', so no
    // field is dead.
    setPipeline("[{$set: {bar: '$foo'}}, {$group: {_id: '$bar'}}]");
    runTest([&] {
        assertDeadFieldsEq({});
    });
}
TEST_F(PipelineDependencyGraphTest, AlivenessRenameFromBaseCollectionFieldUnreadIsDead) {
    // 'bar' is a rename of the base-collection field 'foo', but nothing downstream reads 'bar'.
    setPipeline("[{$set: {bar: '$foo'}}, {$group: {_id: '$baz'}}]");
    runTest([&] {
        const DocumentSource* renameStage = stages[0].get();
        assertDeadFieldsEq({{renameStage, "bar"}});
    });
}
TEST_F(PipelineDependencyGraphTest, AlivenessRenameFromExplicitlyMissingFieldIsNotDead) {
    // The inclusion projection leaves only 'x', so every other field is known to be missing and
    // '$z' resolves to the explicitly-missing field. The rename target 'y' is read by $group
    // and must therefore stay alive.
    setPipeline("[{$project: {x: 1}}, {$set: {y: '$z'}}, {$group: {_id: '$y'}}]");
    runTest([&] {
        assertDeadFieldsEq({});
    });
}
TEST_F(PipelineDependencyGraphTest, AlivenessRenameFromExplicitlyMissingFieldIsDead) {
    // 'y' is again a rename of the explicitly-missing 'z', but this time $group reads 'x', so
    // 'y' is reported as dead.
    setPipeline("[{$project: {x: 1}}, {$set: {y: '$z'}}, {$group: {_id: '$x'}}]");
    runTest([&] {
        const DocumentSource* renameStage = stages[1].get();
        assertDeadFieldsEq({{renameStage, "y"}});
    });
}
TEST_F(PipelineDependencyGraphTest, AlivenessExpressionWithDepsTargetAlive) {
    // 'c' is computed from two upstream fields and is itself read by $group, so nothing in the
    // chain is dead.
    setPipeline(
        "[{$set: {a: 1, b: 2}}, {$set: {c: {$add: ['$a', '$b']}}}, "
        "{$group: {_id: '$c'}}]");
    runTest([&] {
        assertDeadFieldsEq({});
    });
}
TEST_F(PipelineDependencyGraphTest, AlivenessExpressionWithDepsTargetDead) {
    // 'c' is never read, so it is dead. 'a' is read by $group. 'b' is read only by the
    // expression that computes the dead 'c', yet it is not reported: a field read by an
    // intermediate stage currently counts as alive (see TODO(SERVER-127211)).
    setPipeline(
        "[{$set: {a: 1, b: 2}}, {$set: {c: {$add: ['$a', '$b']}}}, "
        "{$group: {_id: '$a'}}]");
    runTest([&] {
        const DocumentSource* addStage = stages[1].get();
        assertDeadFieldsEq({{addStage, "c"}});
    });
}
TEST_F(PipelineDependencyGraphTest, AlivenessExpressionDepsKeepBaseFieldsAlive) {
    // 'c' mixes a pipeline-defined field with the base field 'base' and is read by $group;
    // no dead fields are expected.
    pathArrayness->addPath("base", {}, false);
    setPipeline(
        "[{$set: {a: 1}}, {$set: {c: {$add: ['$a', '$base']}}}, "
        "{$group: {_id: '$c'}}]");
    runTest([&] {
        assertDeadFieldsEq({});
    });
}
TEST_F(PipelineDependencyGraphTest, AlivenessRenameOfRenameOuterDead) {
    // 'b' renames the base field 'a' and 'c' renames 'b'. Nothing reads 'c', so it is dead.
    // 'b' still counts as alive because the (itself dead) rename in stage 1 reads it.
    // TODO(SERVER-127211): a field referenced only by a dead stage should also be reported as
    // dead.
    pathArrayness->addPath("a", {}, false);
    setPipeline("[{$set: {b: '$a'}}, {$set: {c: '$b'}}, {$group: {_id: '$other'}}]");
    runTest([&] {
        const DocumentSource* outerRename = stages[1].get();
        assertDeadFieldsEq({{outerRename, "c"}});
    });
}
TEST_F(PipelineDependencyGraphTest, AlivenessRenameOfRenameTargetAliveKeepsChain) {
    // 'a' -> 'b' -> 'c' rename chain with 'c' read by $group: every link stays alive.
    pathArrayness->addPath("a", {}, false);
    setPipeline("[{$set: {b: '$a'}}, {$set: {c: '$b'}}, {$group: {_id: '$c'}}]");
    runTest([&] {
        assertDeadFieldsEq({});
    });
}
TEST_F(PipelineDependencyGraphTest, AlivenessRenameOfRenameOnlyMiddleAlive) {
    // 'a' -> 'b' -> 'c' rename chain where $group reads the middle link: 'b' is alive and the
    // unread 'c' is dead.
    pathArrayness->addPath("a", {}, false);
    setPipeline("[{$set: {b: '$a'}}, {$set: {c: '$b'}}, {$group: {_id: '$b'}}]");
    runTest([&] {
        const DocumentSource* outerRename = stages[1].get();
        assertDeadFieldsEq({{outerRename, "c"}});
    });
}
TEST_F(PipelineDependencyGraphTest, AlivenessThreeLevelNestedPathDead) {
    // An unread leaf three levels deep is reported under its full dotted path.
    setPipeline("[{$set: {'a.b.c': 1}}, {$group: {_id: '$x'}}]");
    runTest([&] {
        const DocumentSource* setStage = stages[0].get();
        assertDeadFieldsEq({{setStage, "a.b.c"}});
    });
}
TEST_F(PipelineDependencyGraphTest, AlivenessThreeLevelNestedPathAlive) {
    // The same leaf, now read by $group through its full path: nothing is dead.
    setPipeline("[{$set: {'a.b.c': 1}}, {$group: {_id: '$a.b.c'}}]");
    runTest([&] {
        assertDeadFieldsEq({});
    });
}
TEST_F(PipelineDependencyGraphTest, AlivenessThreeLevelNestedPathSiblingDead) {
    // Two leaves share the 'a.b' prefix; $group reads 'a.b.c' only, so 'a.b.d' is the dead one.
    setPipeline("[{$set: {'a.b.c': 1, 'a.b.d': 2}}, {$group: {_id: '$a.b.c'}}]");
    runTest([&] {
        const DocumentSource* setStage = stages[0].get();
        assertDeadFieldsEq({{setStage, "a.b.d"}});
    });
}
} // namespace
} // namespace mongo::pipeline::dependency_graph