SERVER-124332 Populate sampling information into explain (#54416)

GitOrigin-RevId: 97cc09b511347d9c11759e623f061c710d68dd9f
This commit is contained in:
Carlos Alonso 2026-05-26 17:24:26 +02:00 committed by MongoDB Bot
parent f54d4c0e40
commit 9bb4967995
22 changed files with 326 additions and 129 deletions

View File

@ -25,7 +25,7 @@ for (let i = 0; i < docsPerColl; i++) {
assert.commandWorked(collC.insert({c: i, val: 10 - i, groupKey: i}));
}
const executionStatsIngoredFields = [
const executionStatsIgnoredFields = [
"executionTimeMillis",
"executionTimeMillisEstimate",
"saveState",
@ -36,10 +36,17 @@ const executionStatsIngoredFields = [
"numKeysEstimate",
];
const stagesIgnoredFields = ["slots", "optimizationTimeMillis", "planCacheKey", "querySettings", "isCached"];
const stagesIgnoredFields = [
"slots",
"optimizationTimeMillis",
"planCacheKey",
"querySettings",
"isCached",
"ceSamplingMetadata",
];
const mongosIgnoredFields = ["works", "needTime", "queryHash", "planCacheShapeHash", "optimizationTimeMillis"].concat(
executionStatsIngoredFields,
executionStatsIgnoredFields,
stagesIgnoredFields,
);
@ -108,7 +115,7 @@ function assertExplainEq(getUnion, getRegular) {
documentEqWithIgnoredFields(
unionStats.executionStages,
regularStats.executionStages,
executionStatsIngoredFields,
executionStatsIgnoredFields,
),
buildErrorString(unionStats, regularStats, "executionStages"),
);
@ -122,7 +129,7 @@ function assertExplainEq(getUnion, getRegular) {
assert(
arrayEqWithIgnoredFields(union, regular.stages, [
...stagesIgnoredFields,
...executionStatsIngoredFields,
...executionStatsIgnoredFields,
]),
buildErrorString(union, regular, "stages with executionStats"),
);

View File

@ -141,12 +141,17 @@ function dropSamplesCollection() {
});
}
function getWinningPlanCE(query) {
function getWinningPlanMetadata(query) {
const explain = coll.find(query).explain();
const plan = getWinningPlanFromExplain(explain);
assert(isCollscan(db, plan), `expected a COLLSCAN plan: ${tojson(plan)}`);
assert.eq(plan.estimatesMetadata.ceSource, "Sampling", plan);
return plan.cardinalityEstimate;
assert(isCollscan(db, plan), "expected a COLLSCAN plan", {plan});
assert.eq(plan.estimatesMetadata.ceSource, "Sampling", "expected Sampling CE source", {plan});
const ceSamplingMetadata = explain.queryPlanner.ceSamplingMetadata;
assert(ceSamplingMetadata, "expected ceSamplingMetadata in queryPlanner", {explain});
const ns = coll.getFullName();
const meta = ceSamplingMetadata[ns];
assert(meta, "expected ceSamplingMetadata entry for namespace " + ns, {ceSamplingMetadata});
return meta;
}
const prevCBRConfig = getCBRConfig(db);
@ -172,42 +177,27 @@ try {
}),
);
// The test detects whether the persistent sample was used via CE as an indirect signal.
// Persistent sample docs are all tagged `kPersistentTag`; source collection docs are all
// tagged "from_source". Running `find({tag: kPersistentTag})` yields:
// - Hit (persistent sample loaded): every sampled doc matches → selectivity 1.0
// → CE = collCard = kSourceSize.
// - Miss (on-the-fly sample used): no source doc carries kPersistentTag → selectivity 0
// → CE = 1. See CardinalityEstimator::clampZeroEstimates().
// This approach is fragile: any change to zero-estimate clamping will break these assertions.
// TODO SERVER-124332: Replace with a direct check of the sampling source from explain output.
const kPersistentTag = "from_persistent";
const kHitCE = kSourceSize;
const kMissCE = 1;
const kSampleDocs = Array.from({length: kSampleSize}, (_, i) => ({_id: i, a: i}));
{
jsTest.log.info("Testing random sampling technique with a persistent sample hit");
assert.commandWorked(db.adminCommand({setParameter: 1, internalQuerySamplingCEMethod: "random"}));
resetCollections();
const persistedDocs = [];
for (let i = 0; i < kSampleSize; i++) {
persistedDocs.push({_id: i, tag: kPersistentTag});
}
insertPersistedSample(
buildPersistentSampleDoc({
collectionUuid: getCollectionUuidString(),
method: "random",
sampleSize: kSampleSize,
docs: persistedDocs,
docs: kSampleDocs,
}),
);
const foundCE = getWinningPlanCE({tag: kPersistentTag});
assert.eq(
foundCE,
kHitCE,
`Unexpected CE for random technique when persistent sample is hit: ${foundCE}. Expected ${kHitCE}`,
);
const meta = getWinningPlanMetadata({a: {$gte: 0}});
assert.eq(meta.sampleSource, "persisted", "expected persisted sample on hit", {meta});
assert.eq(meta.sampleTechnique, "random", "expected random technique", {meta});
assert.eq(meta.sampleDocCount, kSampleSize, "expected docCount to match persisted sample size", {meta});
assert.eq(meta.sampleRequestedDocCount, kSampleSize, "expected requestedDocCount to match", {meta});
assert(!meta.hasOwnProperty("sampleNumChunks"), "random technique should not have numChunks", {meta});
}
{
@ -215,12 +205,11 @@ try {
assert.commandWorked(db.adminCommand({setParameter: 1, internalQuerySamplingCEMethod: "random"}));
resetCollections();
const foundCE = getWinningPlanCE({tag: kPersistentTag});
assert.eq(
foundCE,
kMissCE,
`Unexpected CE for random technique when no persistent sample exists: ${foundCE}. Expected ${kMissCE}`,
);
const meta = getWinningPlanMetadata({a: {$gte: 0}});
assert.eq(meta.sampleSource, "onTheFly", "expected on-the-fly sample on miss", {meta});
assert.eq(meta.sampleTechnique, "random", "expected random technique", {meta});
assert.eq(meta.sampleRequestedDocCount, kSampleSize, "expected requestedDocCount to match", {meta});
assert(!meta.hasOwnProperty("sampleNumChunks"), "random technique should not have numChunks", {meta});
}
{
@ -228,26 +217,21 @@ try {
assert.commandWorked(db.adminCommand({setParameter: 1, internalQuerySamplingCEMethod: "chunk"}));
resetCollections();
const persistedDocs = [];
for (let i = 0; i < kSampleSize; i++) {
persistedDocs.push({_id: i, tag: kPersistentTag});
}
insertPersistedSample(
buildPersistentSampleDoc({
collectionUuid: getCollectionUuidString(),
method: "chunk",
sampleSize: kSampleSize,
docs: persistedDocs,
docs: kSampleDocs,
numChunks: kNumChunks,
}),
);
const foundCE = getWinningPlanCE({tag: kPersistentTag});
assert.eq(
foundCE,
kHitCE,
`Unexpected CE for chunk technique when persistent sample is hit: ${foundCE}. Expected ${kHitCE}`,
);
const meta = getWinningPlanMetadata({a: {$gte: 0}});
assert.eq(meta.sampleSource, "persisted", "expected persisted sample on hit", {meta});
assert.eq(meta.sampleTechnique, "chunk", "expected chunk technique", {meta});
assert.eq(meta.sampleNumChunks, kNumChunks, "expected numChunks to match", {meta});
assert.eq(meta.sampleDocCount, kSampleSize, "expected docCount to match persisted sample size", {meta});
assert.eq(meta.sampleRequestedDocCount, kSampleSize, "expected requestedDocCount to match", {meta});
}
{
@ -255,12 +239,11 @@ try {
assert.commandWorked(db.adminCommand({setParameter: 1, internalQuerySamplingCEMethod: "chunk"}));
resetCollections();
const foundCE = getWinningPlanCE({tag: kPersistentTag});
assert.eq(
foundCE,
kMissCE,
`Unexpected CE for chunk technique when no persistent sample exists: ${foundCE}. Expected ${kMissCE}`,
);
const meta = getWinningPlanMetadata({a: {$gte: 0}});
assert.eq(meta.sampleSource, "onTheFly", "expected on-the-fly sample on miss", {meta});
assert.eq(meta.sampleTechnique, "chunk", "expected chunk technique", {meta});
assert.eq(meta.sampleNumChunks, kNumChunks, "expected numChunks to match", {meta});
assert.eq(meta.sampleRequestedDocCount, kSampleSize, "expected requestedDocCount to match", {meta});
}
} finally {
setCBRConfig(db, prevCBRConfig);

View File

@ -280,7 +280,7 @@ Status SubplanStage::pickBestPlan(const QueryPlannerParams& plannerParams,
samplingEstimator.get(),
exactCardinality.get(),
std::move(branchResult->solutions),
_query->getExplain().has_value());
*_query);
if (!statusWithCBRSolns.isOK()) {
str::stream ss;
ss << "Can't plan for subchild " << branchResult->canonicalQuery->toString() << " "

View File

@ -47,6 +47,7 @@ using TopLevelFieldsProjection = StringSet;
using ProjectionParams = std::variant<NoProjection, TopLevelFieldsProjection>;
using CardinalityEstimate = mongo::cost_based_ranker::CardinalityEstimate;
using SamplingMetadata = mongo::cost_based_ranker::SamplingMetadata;
class SamplingEstimator {
public:
@ -130,6 +131,11 @@ public:
virtual double getCollCard() const = 0;
virtual size_t getSampleSize() const = 0;
/**
* Returns metadata about the sample used for cardinality estimation.
*/
virtual SamplingMetadata getSamplingMetadata() const = 0;
};
} // namespace mongo::ce

View File

@ -557,7 +557,11 @@ void SamplingEstimatorImpl::generateChunkSample() {
}
void SamplingEstimatorImpl::generateSample(ce::ProjectionParams projectionParams) {
_isSampleGenerated = true;
tassert(12433201, "SamplingEstimatorImpl must not be reused", !_isSampleGenerated);
// The final sample size (_sampleSize) may not be exactly the requested one
// (_requestedSampleSize). Capturing here the requested sample size before it gets updated.
_requestedSampleSize = _sampleSize;
if (auto topLevelSampleFieldNames =
std::get_if<ce::TopLevelFieldsProjection>(&projectionParams)) {
validateTopLevelSampleFieldNames(*topLevelSampleFieldNames);
@ -570,17 +574,25 @@ void SamplingEstimatorImpl::generateSample(ce::ProjectionParams projectionParams
if (internalQuerySamplingBySequentialScan.load()) {
// This is only used for testing purposes when a repeatable sample is needed.
_usedSamplingTechnique = cost_based_ranker::SamplingTechnique::kSeqScan;
generateSampleBySeqScanningForTesting();
} else if (_sampleSize >= _collectionCard.cardinality().v()) {
// If the required sample is larger than the collection, the sample is generated from all
// the documents on the collection.
_usedSamplingTechnique = cost_based_ranker::SamplingTechnique::kFullCollScan;
generateFullCollScanSample();
} else if (_samplingStyle == SamplingCEMethodEnum::kRandom) {
_usedSamplingTechnique = cost_based_ranker::SamplingTechnique::kRandom;
generateRandomSample();
} else {
tassert(9372901, "The number of chunks should be positive.", _numChunks && *_numChunks > 0);
_usedSamplingTechnique = cost_based_ranker::SamplingTechnique::kChunk;
generateChunkSample();
}
if (!_wasSamplePersisted) {
_sampleCreatedAt = Date_t::now();
}
_isSampleGenerated = true;
}
void SamplingEstimatorImpl::generateSampleBySeqScanningForTesting() {
@ -994,10 +1006,35 @@ Status SamplingEstimatorImpl::tryLoadPersistentSample(SamplingCEMethodEnum metho
_sample = parsed.getValue().getDocs();
_sampleSize = _sample.size();
_uniqueDocCount = boost::none;
_isSampleGenerated = true;
_wasSamplePersisted = true;
_sampleCreatedAt = parsed.getValue().getCreatedAt();
return Status::OK();
}
SamplingMetadata SamplingEstimatorImpl::getSamplingMetadata() const {
    // Metadata only makes sense once a sample exists.
    tassert(
        12433200, "getSamplingMetadata() called before sample was generated", _isSampleGenerated);

    // The footprint estimate has two parts. First, the container itself: the vector object plus
    // one BSONObj slot for every element of reserved capacity.
    const size_t containerBytes =
        sizeof(std::vector<BSONObj>) + sizeof(BSONObj) * _sample.capacity();

    // Second, the buffer behind each sampled document: the BSON bytes plus the
    // SharedBuffer::Holder ref-count header.
    size_t bufferBytes = 0;
    for (const auto& sampledDoc : _sample) {
        tassert(12433202, "Sample documents must be owned BSONObjs", sampledDoc.isOwned());
        // TODO SERVER-126975. Read this from the persisted doc.
        bufferBytes += SharedBuffer::kHolderSize + static_cast<size_t>(sampledDoc.objsize());
    }

    const auto technique = *_usedSamplingTechnique;

    SamplingMetadata result;
    result.isPersisted = _wasSamplePersisted;
    result.docCount = _sample.size();
    result.requestedDocCount = _requestedSampleSize;
    result.memorySizeBytes = containerBytes + bufferBytes;
    result.technique = technique;
    // The chunk count is only reported for chunk-based sampling.
    if (technique == cost_based_ranker::SamplingTechnique::kChunk) {
        result.numChunks = _numChunks;
    }
    result.createdAt = _sampleCreatedAt;
    return result;
}
SamplingEstimatorImpl::~SamplingEstimatorImpl() {}
CardinalityEstimate SamplingEstimatorImpl::estimateNDV(

View File

@ -198,6 +198,17 @@ public:
return _sampleSize;
}
/**
 * Returns the sampling metadata for the generated sample, which includes:
 * - the sampling technique (and the number of chunks when the technique is chunk-based)
 * - the requested sample size
 * - the actual sample size
 * - the memory size of the sample in bytes
 * - the sampling source (persistent vs on-the-fly)
 * - the date and time when the sample was generated
 *
 * Must only be called once a sample has been generated; the implementation tasserts otherwise.
 */
SamplingMetadata getSamplingMetadata() const final;
/**
* For each document in a given sample, this helper calculates the number of
* index keys which satisfy 'bounds', which may be >1 in the case of multi-key
@ -390,6 +401,22 @@ private:
// 'analyze' constructs its estimator with kOnTheFlySample so it always collects a fresh sample
// (otherwise a refresh would just re-read the sample it's about to replace).
SamplingSourceEnum _samplingSource;
// Set to true when tryLoadPersistentSample() successfully loads a sample from the stats
// collection. Used to populate SamplingMetadata for explain output.
bool _wasSamplePersisted = false;
// The timestamp when the sample was created. For persisted samples this is read from the
// stored document; for on-the-fly samples it is set to Date_t::now() at the end of
// generateSample(). Always valid after generateSample() completes.
boost::optional<Date_t> _sampleCreatedAt;
// The number of documents requested when generateSample() was called. May differ from the
// actual sample size (_sampleSize) in the following cases:
// 1. The collection is smaller than the requested sample size (full collection scan used).
// 2. Chunk-based sampling: if a random cursor lands on the last document in the collection,
// no full chunk can be collected for that cursor, so the actual sample is smaller.
size_t _requestedSampleSize = 0;
// The actual sampling strategy used. Set by generateSample() before dispatch.
boost::optional<cost_based_ranker::SamplingTechnique> _usedSamplingTechnique;
};
} // namespace mongo::ce

View File

@ -2246,54 +2246,6 @@ TEST_F(SamplingEstimatorTest, ChunkSamplingSkipsPersistentSampleWhenFeatureFlagD
}
}
TEST_F(SamplingEstimatorTest, LoadPersistentSampleResetsUniqueDocCountCache) {
// _uniqueDocCount is a lazy cache of countUniqueDocuments(_sample), populated on the first
// estimateNDV call after a sample is loaded. When tryLoadPersistentSample replaces _sample it
// must also clear the cache — otherwise a stale count from a previous sample would be used.
// TODO SERVER-112627: Remove once featureFlagPersistentStats is enabled by default.
RAIIServerParameterControllerForTest persistentStatsFlag{"featureFlagPersistentStats", true};
insertDocuments(kTestNss, {BSON("_id" << 1 << "tag" << "not_persisted")});
const UUID uuid = [&] {
auto srcColl = acquireCollection(operationContext(), kTestNss);
return srcColl.getCollectionPtr()->uuid();
}();
std::vector<BSONObj> persistedDocs{BSON("_id" << 2 << "tag" << "persisted"),
BSON("_id" << 3 << "tag" << "persisted"),
BSON("_id" << 4 << "tag" << "persisted")};
createCollAndInsertDocuments(
operationContext(),
NamespaceStringUtil::deserialize(kTestNss.dbName(), kSamplesCollectionName),
{buildPersistentSampleDoc(
uuid, SamplingCEMethodEnum::kRandom, persistedDocs.size(), persistedDocs)});
auto coll = acquireCollection(operationContext(), kTestNss);
auto colls = MultipleCollectionAccessor(coll, {}, false);
SamplingEstimatorForTesting estimator(operationContext(),
colls,
kTestNss,
PlanYieldPolicy::YieldPolicy::YIELD_AUTO,
persistedDocs.size(),
SamplingCEMethodEnum::kRandom,
numChunks,
makeCardinalityEstimate(100));
estimator.generateSample(ce::NoProjection{});
ASSERT_FALSE(estimator.getUniqueDocCountForTesting().has_value());
for (const auto& doc : estimator.getSample()) {
ASSERT_EQUALS(doc.getStringField("tag"), "persisted");
}
// Simulate the cache being populated by a prior estimateNDV call.
estimator.setUniqueDocCountForTesting(99);
ASSERT_TRUE(estimator.getUniqueDocCountForTesting().has_value());
// A second generateSample via the persistent path must clear the cache.
estimator.generateSample(ce::NoProjection{});
ASSERT_FALSE(estimator.getUniqueDocCountForTesting().has_value());
for (const auto& doc : estimator.getSample()) {
ASSERT_EQUALS(doc.getStringField("tag"), "persisted");
}
}
TEST_F(SamplingEstimatorTest, MalformedPersistentSampleFallsBackToOnTheFly) {
// A doc with the correct _id key exists in system.stats.samples but is malformed (sampleSize
// field disagrees with the docs array length). tryLoadPersistentSample must log the error and
@ -2352,4 +2304,28 @@ TEST_F(SamplingEstimatorTest, MalformedPersistentSampleFallsBackToOnTheFly) {
}
}
DEATH_TEST_F(SamplingEstimatorTestDeathTest,
             GenerateSampleAssertsOnReuse,
             "SamplingEstimatorImpl must not be reused") {
    // createSamplingEstimatorForTesting() already invokes generateSample() once, so the
    // estimator it hands back has been sampled.
    auto alreadySampled =
        createSamplingEstimatorForTesting(10 /* collCard */, kSampleSize, ce::NoProjection{});

    // Asking the same estimator for another sample is the reuse that must trip the assertion.
    alreadySampled.generateSample(ce::NoProjection{});
}
DEATH_TEST_F(SamplingEstimatorTestDeathTest,
             GetSamplingMetadataAssertsOnNonOwnedDoc,
             "Sample documents must be owned BSONObjs") {
    // 'backing' owns its BSON buffer, whereas 'borrowedView' is only a raw-pointer view into
    // that buffer and therefore is not an owned BSONObj.
    auto backing = BSON("a" << 1);
    BSONObj borrowedView(backing.objdata());

    auto estimator =
        createSamplingEstimatorForTesting(10 /* collCard */, kSampleSize, ce::NoProjection{});

    // Swap the estimator's sample for the non-owned view, then request the metadata: computing
    // the sample's memory footprint has to reject the non-owned document.
    estimator.setSampleForTesting({borrowedView});
    estimator.getSamplingMetadata();
}
} // namespace mongo::ce

View File

@ -29,17 +29,16 @@
#include "mongo/bson/json.h"
#include "mongo/db/matcher/expression.h"
#include "mongo/db/query/canonical_query.h"
#include "mongo/db/query/compiler/ce/sampling/sampling_estimator.h"
#include "mongo/db/query/compiler/ce/sampling/sampling_test_utils.h"
#include "mongo/db/query/compiler/optimizer/cost_based_ranker/cbr_rewrites.h"
#include "mongo/db/query/compiler/optimizer/cost_based_ranker/cbr_test_utils.h"
#include "mongo/db/query/compiler/optimizer/index_bounds_builder/index_bounds_builder.h"
#include "mongo/db/query/compiler/rewrites/matcher/expression_optimizer.h"
#include "mongo/unittest/unittest.h"
#include <limits>
#include <gmock/gmock.h>
namespace mongo::cost_based_ranker {
namespace {
@ -1054,6 +1053,9 @@ public:
size_t getSampleSize() const override {
return 100;
}
// Sampling metadata is not exercised through this test double, so reaching this override is
// treated as a programming error.
ce::SamplingMetadata getSamplingMetadata() const override {
MONGO_UNREACHABLE;
}
};
// Build IndexBounds with a single-point OIL on "a" and 'bIntervalCount' point intervals on "b".
@ -1471,6 +1473,9 @@ public:
size_t getSampleSize() const override {
MONGO_UNIMPLEMENTED;
}
// Sampling metadata is not exercised through this test double, so reaching this override is
// treated as a programming error.
ce::SamplingMetadata getSamplingMetadata() const override {
MONGO_UNREACHABLE;
}
private:
double _ndv;

View File

@ -576,6 +576,29 @@ CardinalityEstimate operator*(const SelectivityEstimate& s, const CardinalityEst
CardinalityEstimate operator*(const CardinalityEstimate& ce, const SelectivityEstimate& s);
/**
 * The actual strategy used to generate the sample. This may differ from the configured sampling
 * method: the estimator falls back to a full collection scan when the requested sample is at
 * least as large as the collection, and a test-only knob can force a sequential scan.
 */
enum class SamplingTechnique {
    // Random sampling; reported as "random" in explain output.
    kRandom,
    // Chunk-based sampling (requires a positive number of chunks); reported as "chunk".
    kChunk,
    // The whole collection is used as the sample because the requested sample size is greater
    // than or equal to the collection cardinality; reported as "fullCollScan".
    kFullCollScan,
    // Sequential-scan sample, used only for testing when a repeatable sample is needed; reported
    // as "seqScan".
    kSeqScan,
};
/**
 * Metadata about the sample used when 'ceSource == Sampling'.
 *
 * Every scalar member carries a default member initializer so that a default-constructed
 * instance never exposes indeterminate values, even if a producer forgets to assign a field.
 * The struct remains an aggregate.
 */
struct SamplingMetadata {
    // True when the sample was loaded from the persisted samples collection, false when it was
    // generated on the fly.
    bool isPersisted{false};
    // Number of documents in the sample.
    size_t docCount{0};
    // Number of documents originally requested.
    size_t requestedDocCount{0};
    // Estimated in-memory footprint of the sample, in bytes.
    size_t memorySizeBytes{0};
    // The strategy that actually produced the sample.
    SamplingTechnique technique{SamplingTechnique::kRandom};
    // Number of chunks; only populated for chunk-based sampling.
    boost::optional<int> numChunks;
    // When the sample was created. Explain generation requires this to be set.
    boost::optional<Date_t> createdAt;
};
/**
* The optimizer's estimate of a single QSN in the physical plan.
*/

View File

@ -170,12 +170,8 @@ const QuerySolution* bestCBRPlan(CanonicalQuery* cq,
double generateSampleTimeMS = generateSampleTimer.elapsed().count() / 1000.0;
Timer planningTimer;
auto statusWithCBRSolns =
QueryPlanner::planWithCostBasedRanking(plannerParams,
samplingEstimator.get(),
nullptr,
std::move(statusWithMultiPlanSolns),
cq->getExplain().has_value());
auto statusWithCBRSolns = QueryPlanner::planWithCostBasedRanking(
plannerParams, samplingEstimator.get(), nullptr, std::move(statusWithMultiPlanSolns), *cq);
double planTimeMS = planningTimer.elapsed().count() / 1000.0;
if (timeProfile.has_value()) {

View File

@ -322,8 +322,7 @@ StatusWith<JoinReorderedExecutorResult> getJoinReorderedExecutor(
// Select access plans for each table in the join.
auto yieldPolicy = PlanYieldPolicy::YieldPolicy::YIELD_AUTO;
SamplingEstimatorMap samplingEstimators = makeSamplingEstimators(mca, model.graph, yieldPolicy);
auto swAccessPlans = singleTableAccessPlans(
opCtx, mca, model.graph, samplingEstimators, expCtx->getExplain().has_value());
auto swAccessPlans = singleTableAccessPlans(opCtx, mca, model.graph, samplingEstimators);
if (!swAccessPlans.isOK()) {
return swAccessPlans.getStatus();
}

View File

@ -82,8 +82,7 @@ StatusWith<SingleTableAccessPlansResult> singleTableAccessPlans(
OperationContext* opCtx,
const MultipleCollectionAccessor& collections,
const JoinGraph& graph,
const SamplingEstimatorMap& samplingEstimators,
bool isExplain) {
const SamplingEstimatorMap& samplingEstimators) {
const auto numNodes = graph.numNodes();
QuerySolutionMap solns;
cost_based_ranker::EstimateMap estimates;
@ -148,7 +147,7 @@ StatusWith<SingleTableAccessPlansResult> singleTableAccessPlans(
samplingEstimator.get(),
nullptr /*exactCardinality*/,
std::move(swSolns.getValue()),
isExplain);
*node.accessPath);
// Return bad status if CBR is unable to produce a plan
if (!swCbrResult.isOK()) {
return swCbrResult.getStatus();

View File

@ -56,7 +56,6 @@ StatusWith<SingleTableAccessPlansResult> singleTableAccessPlans(
OperationContext* opCtx,
const MultipleCollectionAccessor& collections,
const JoinGraph& model,
const SamplingEstimatorMap& samplingEstimators,
bool isExplain);
const SamplingEstimatorMap& samplingEstimators);
} // namespace mongo::join_ordering

View File

@ -97,7 +97,7 @@ TEST_F(SingleTableAccessTestFixture, EstimatesPopulated) {
ASSERT(node2);
JoinGraph graph(std::move(mgraph));
auto swRes = singleTableAccessPlans(opCtx, mca, graph, estimators, false);
auto swRes = singleTableAccessPlans(opCtx, mca, graph, estimators);
ASSERT_OK(swRes);
auto& res = swRes.getValue();

View File

@ -232,6 +232,10 @@ public:
MONGO_UNREACHABLE;
}
// Sampling metadata is not exercised through this test double, so reaching this override is
// treated as a programming error.
ce::SamplingMetadata getSamplingMetadata() const override {
MONGO_UNREACHABLE;
}
private:
CardinalityEstimate _collCard;
stdx::unordered_map<std::vector<FieldPath>, CardinalityEstimate> _fakeEstimates;

View File

@ -172,6 +172,41 @@ void generatePlannerInfo(PlanExecutor* exec,
}
auto&& explainer = exec->getPlanExplainer();
if (const auto ceSamplingMeta = explainer.getCeSamplingMetadata(); ceSamplingMeta.has_value()) {
BSONObjBuilder ceSamplingMetaBob(plannerBob.subobjStart("ceSamplingMetadata"));
for (const auto& [ns, meta] : ceSamplingMeta.value()) {
BSONObjBuilder nsMetaBob(ceSamplingMetaBob.subobjStart(ns));
nsMetaBob.append("sampleSource", meta.isPersisted ? "persisted" : "onTheFly");
static constexpr auto techniqueToStr =
[](cost_based_ranker::SamplingTechnique t) -> StringData {
switch (t) {
case cost_based_ranker::SamplingTechnique::kRandom:
return "random"_sd;
case cost_based_ranker::SamplingTechnique::kChunk:
return "chunk"_sd;
case cost_based_ranker::SamplingTechnique::kFullCollScan:
return "fullCollScan"_sd;
case cost_based_ranker::SamplingTechnique::kSeqScan:
return "seqScan"_sd;
}
MONGO_UNREACHABLE;
};
nsMetaBob.append("sampleTechnique", techniqueToStr(meta.technique));
if (meta.technique == cost_based_ranker::SamplingTechnique::kChunk && meta.numChunks) {
nsMetaBob.appendNumber("sampleNumChunks", *meta.numChunks);
}
nsMetaBob.appendNumber("sampleRequestedDocCount",
static_cast<long long>(meta.requestedDocCount));
nsMetaBob.appendNumber("sampleDocCount", static_cast<long long>(meta.docCount));
nsMetaBob.appendNumber("sampleMemorySizeBytes",
static_cast<long long>(meta.memorySizeBytes));
tassert(12433203,
"SamplingMetadata::createdAt must be set before explain is generated",
meta.createdAt.has_value());
nsMetaBob.appendDate("sampleCreatedAt", meta.createdAt.value());
}
}
auto&& enumeratorInfo = explainer.getEnumeratorInfo();
plannerBob.append("maxIndexedOrSolutionsReached", enumeratorInfo.hitIndexedOrLimit);
plannerBob.append("maxIndexedAndSolutionsReached", enumeratorInfo.hitIndexedAndLimit);

View File

@ -39,6 +39,7 @@
#include "mongo/db/query/stage_builder/classic_stage_builder.h"
#include "mongo/util/duration.h"
#include "mongo/util/modules.h"
#include "mongo/util/string_map.h"
namespace mongo {
@ -59,6 +60,9 @@ struct PlanExplainerData {
boost::optional<double> multiPlannerWinningPlanScore;
stage_builder::PlanStageToQsnMap planStageQsnMap;
cost_based_ranker::EstimateMap estimates;
// Namespace-keyed map of sampling metadata emitted under queryPlanner.ceSamplingMetadata.
// Populated on the explain path when CBR used a sampling estimator.
StringMap<cost_based_ranker::SamplingMetadata> ceSamplingMetadata;
bool fromPlanCache = false;
};
@ -70,6 +74,12 @@ inline PlanExplainerData& operator<<(PlanExplainerData& lhs, PlanExplainerData&&
for (auto& [k, v] : rhs.estimates) {
lhs.estimates.insert_or_assign(k, std::move(v));
}
for (auto& [ns, meta] : rhs.ceSamplingMetadata) {
tassert(12433204,
"ceSamplingMetadata already has an entry for namespace during merge",
!lhs.ceSamplingMetadata.contains(ns));
lhs.ceSamplingMetadata.emplace(ns, std::move(meta));
}
return lhs;
}
@ -195,6 +205,16 @@ public:
_solution = qs;
}
/**
 * Returns the per-collection sampling metadata to be emitted under
 * queryPlanner.ceSamplingMetadata in explain output. Returns boost::none if no sampling
 * metadata is available (e.g., CBR was not used, or this is not a classic-engine plan).
 *
 * The map is keyed by namespace string and is returned by value. This default implementation
 * always returns boost::none; explainers that carry sampling metadata are expected to override
 * it.
 */
virtual boost::optional<StringMap<cost_based_ranker::SamplingMetadata>> getCeSamplingMetadata()
const {
return boost::none;
}
protected:
const QuerySolution* _solution{nullptr};
PlanEnumeratorExplainInfo _enumeratorExplainInfo;

View File

@ -77,6 +77,14 @@ public:
std::vector<PlanStatsDetails> getCachedPlanStats(const plan_cache_debug_info::DebugInfo&,
ExplainOptions::Verbosity) const;
boost::optional<StringMap<cost_based_ranker::SamplingMetadata>> getCeSamplingMetadata()
    const override {
    // Hand back a copy of the collected per-namespace metadata, or boost::none when nothing
    // was collected, so that explain omits the section entirely.
    const auto& collectedMetadata = _explainData.ceSamplingMetadata;
    if (!collectedMetadata.empty()) {
        return collectedMetadata;
    }
    return boost::none;
}
private:
/**
* A helper that formats the plan stats into a BSON object and collects summary stats.

View File

@ -34,6 +34,7 @@
#include "mongo/db/pipeline/pipeline_d.h"
#include "mongo/db/query/canonical_query.h"
#include "mongo/db/query/compiler/physical_model/query_solution/query_solution.h"
#include "mongo/db/query/explain.h"
#include "mongo/db/query/explain_diagnostic_printer.h"
#include "mongo/db/query/get_executor.h"
#include "mongo/db/query/multiple_collection_accessor.h"
@ -661,5 +662,69 @@ TEST_F(PlanExplainerTest, PlanExplainerDataMergeFull) {
ASSERT_EQ(data1.estimates.size(), 2);
}
TEST_F(PlanExplainerTest, CBRSamplingMetadataSerializedInExplain) {
    // Verify that when CBR uses sampling CE, the 'ceSamplingMetadata' section appears in the
    // queryPlanner explain output and contains the expected fields for each collection.
    RAIIServerParameterControllerForTest samplingController("internalQueryCBRCEMode", "samplingCE");

    const auto verbosity = ExplainOptions::Verbosity::kQueryPlanner;
    expCtx->setExplain(verbosity);

    auto coll = acquireCollection(
        operationContext(),
        CollectionAcquisitionRequest::fromOpCtx(
            operationContext(), kNss, AcquisitionPrerequisites::OperationType::kRead),
        MODE_IS);
    MultipleCollectionAccessor colls{coll};

    auto findCommand = std::make_unique<FindCommandRequest>(kNss);
    findCommand->setFilter(fromjson("{a: {$gte: 0}, b: {$gte: 0}}"));
    auto cq = std::make_unique<CanonicalQuery>(CanonicalQueryParams{
        .expCtx = expCtx,
        .parsedFind = ParsedFindCommandParams{.findCommand = std::move(findCommand)}});

    // Register the 'find' command details on CurOp before building the executor.
    Command* cmd = CommandHelpers::findCommand(operationContext(), "find");
    {
        std::lock_guard<Client> clientLock(*operationContext()->getClient());
        CurOp::get(operationContext())
            ->setGenericOpRequestDetails(clientLock, kNss, cmd, BSONObj(), NetworkOp::dbQuery);
    }

    auto swExec = getExecutorFind(
        operationContext(), colls, std::move(cq), PlanYieldPolicy::YieldPolicy::INTERRUPT_ONLY);
    ASSERT_OK(swExec);

    BSONObjBuilder bob;
    Explain::explainStages(swExec.getValue().get(),
                           colls,
                           verbosity,
                           Status::OK(),
                           boost::none,
                           BSONObj(),
                           SerializationContext::stateCommandReply(),
                           BSONObj(),
                           &bob);
    const BSONObj explained = bob.obj();

    auto queryPlanner = explained["queryPlanner"];
    ASSERT(queryPlanner.isABSONObj()) << "Missing queryPlanner in: " << explained;
    auto ceSamplingMeta = queryPlanner["ceSamplingMetadata"];
    ASSERT(ceSamplingMeta.isABSONObj())
        << "Missing ceSamplingMetadata in queryPlanner: " << queryPlanner;

    // Exactly one namespace entry expected.
    ASSERT_EQ(ceSamplingMeta.Obj().nFields(), 1) << ceSamplingMeta;
    const BSONElement nsElem = ceSamplingMeta.Obj().firstElement();
    ASSERT_EQ(nsElem.type(), BSONType::object);
    const BSONObj nsMeta = nsElem.Obj();

    // No persisted sample was set up for this collection, so the sample is built on the fly.
    ASSERT_EQ(nsMeta["sampleSource"].String(), "onTheFly");
    ASSERT(nsMeta.hasField("sampleTechnique")) << nsMeta;
    ASSERT(nsMeta.hasField("sampleDocCount")) << nsMeta;
    ASSERT(nsMeta.hasField("sampleRequestedDocCount")) << nsMeta;
    ASSERT(nsMeta.hasField("sampleMemorySizeBytes")) << nsMeta;
    // Explain emits the creation timestamp unconditionally, so it must be present as well.
    ASSERT(nsMeta.hasField("sampleCreatedAt")) << nsMeta;
}
} // namespace
} // namespace mongo

View File

@ -36,6 +36,7 @@
#include "mongo/db/query/compiler/ce/exact/exact_cardinality_impl.h"
#include "mongo/db/query/compiler/ce/sampling/sampling_estimator.h"
#include "mongo/db/query/compiler/ce/sampling/sampling_estimator_impl.h"
#include "mongo/db/query/compiler/optimizer/cost_based_ranker/estimates.h"
#include "mongo/db/query/planner_analysis.h"
#include "mongo/db/stats/counters.h"
@ -211,7 +212,7 @@ StatusWith<PlanRankingResult> CBRPlanRankingStrategy::rankPlans(
samplingEstimator.get(),
exactCardinality.get(),
std::move(statusWithMultiPlanSolns),
query.getExplain().has_value());
query);
// Calculate duration for server status metrics
auto durationMicros = tickSource->ticksTo<Microseconds>(tickSource->getTicks() - startTicks);

View File

@ -1768,7 +1768,7 @@ StatusWith<PlanRankingResult> QueryPlanner::planWithCostBasedRanking(
ce::SamplingEstimator* samplingEstimator,
const ce::ExactCardinalityEstimator* exactCardinality,
StatusWith<std::vector<std::unique_ptr<QuerySolution>>> statusWithMultiPlanSolns,
bool isExplain) {
const CanonicalQuery& query) {
using namespace cost_based_ranker;
auto cbrMode = params.planRankerMode;
EstimateMap estimates;
@ -1849,7 +1849,7 @@ StatusWith<PlanRankingResult> QueryPlanner::planWithCostBasedRanking(
PlanRankingResult{.solutions = std::move(acceptedSoln),
.maybeExplainData = PlanExplainerData{.estimates = std::move(estimates)},
.needsWorksMeasuredForPlanCache = successfullyChoseWinner};
if (isExplain) {
if (query.getExplain()) {
std::vector<SolutionWithPlanStage> rejectedSolnWithStages;
rejectedSolnWithStages.reserve(rejectedSoln.size());
std::transform(std::make_move_iterator(rejectedSoln.begin()),
@ -1860,6 +1860,12 @@ StatusWith<PlanRankingResult> QueryPlanner::planWithCostBasedRanking(
});
planRankingResult.maybeExplainData->rejectedPlansWithStages =
std::move(rejectedSolnWithStages);
if (samplingEstimator) {
planRankingResult.maybeExplainData->ceSamplingMetadata.emplace(
NamespaceStringUtil::serialize(query.nss(),
query.getExpCtx()->getSerializationContext()),
samplingEstimator->getSamplingMetadata());
}
}
return std::move(planRankingResult);
}

View File

@ -129,14 +129,15 @@ public:
* estimation (CE) and costing modules. The return value contains a list of plans that were
* rejected on the basis of cost, as well as any non-rejected plans from which the caller can
* select a winner.
* If isExplain is true, collect and return planExplainerData as part of the PlanRankingResult.
* If query.getExplain().has_value(), collect and return planExplainerData as part of the
* PlanRankingResult.
*/
static StatusWith<PlanRankingResult> planWithCostBasedRanking(
const QueryPlannerParams& params,
ce::SamplingEstimator* samplingEstimator,
const ce::ExactCardinalityEstimator* exactCardinality,
StatusWith<std::vector<std::unique_ptr<QuerySolution>>> statusWithMultiPlanSolns,
bool isExplain);
const CanonicalQuery& query);
/**
* Generates and returns a query solution, given data retrieved from the plan cache.