SERVER-124332 Populate sampling information into explain (#54416)
GitOrigin-RevId: 97cc09b511347d9c11759e623f061c710d68dd9f
This commit is contained in:
parent
f54d4c0e40
commit
9bb4967995
@ -25,7 +25,7 @@ for (let i = 0; i < docsPerColl; i++) {
|
||||
assert.commandWorked(collC.insert({c: i, val: 10 - i, groupKey: i}));
|
||||
}
|
||||
|
||||
const executionStatsIngoredFields = [
|
||||
const executionStatsIgnoredFields = [
|
||||
"executionTimeMillis",
|
||||
"executionTimeMillisEstimate",
|
||||
"saveState",
|
||||
@ -36,10 +36,17 @@ const executionStatsIngoredFields = [
|
||||
"numKeysEstimate",
|
||||
];
|
||||
|
||||
const stagesIgnoredFields = ["slots", "optimizationTimeMillis", "planCacheKey", "querySettings", "isCached"];
|
||||
const stagesIgnoredFields = [
|
||||
"slots",
|
||||
"optimizationTimeMillis",
|
||||
"planCacheKey",
|
||||
"querySettings",
|
||||
"isCached",
|
||||
"ceSamplingMetadata",
|
||||
];
|
||||
|
||||
const mongosIgnoredFields = ["works", "needTime", "queryHash", "planCacheShapeHash", "optimizationTimeMillis"].concat(
|
||||
executionStatsIngoredFields,
|
||||
executionStatsIgnoredFields,
|
||||
stagesIgnoredFields,
|
||||
);
|
||||
|
||||
@ -108,7 +115,7 @@ function assertExplainEq(getUnion, getRegular) {
|
||||
documentEqWithIgnoredFields(
|
||||
unionStats.executionStages,
|
||||
regularStats.executionStages,
|
||||
executionStatsIngoredFields,
|
||||
executionStatsIgnoredFields,
|
||||
),
|
||||
buildErrorString(unionStats, regularStats, "executionStages"),
|
||||
);
|
||||
@ -122,7 +129,7 @@ function assertExplainEq(getUnion, getRegular) {
|
||||
assert(
|
||||
arrayEqWithIgnoredFields(union, regular.stages, [
|
||||
...stagesIgnoredFields,
|
||||
...executionStatsIngoredFields,
|
||||
...executionStatsIgnoredFields,
|
||||
]),
|
||||
buildErrorString(union, regular, "stages with executionStats"),
|
||||
);
|
||||
|
||||
@ -141,12 +141,17 @@ function dropSamplesCollection() {
|
||||
});
|
||||
}
|
||||
|
||||
function getWinningPlanCE(query) {
|
||||
function getWinningPlanMetadata(query) {
|
||||
const explain = coll.find(query).explain();
|
||||
const plan = getWinningPlanFromExplain(explain);
|
||||
assert(isCollscan(db, plan), `expected a COLLSCAN plan: ${tojson(plan)}`);
|
||||
assert.eq(plan.estimatesMetadata.ceSource, "Sampling", plan);
|
||||
return plan.cardinalityEstimate;
|
||||
assert(isCollscan(db, plan), "expected a COLLSCAN plan", {plan});
|
||||
assert.eq(plan.estimatesMetadata.ceSource, "Sampling", "expected Sampling CE source", {plan});
|
||||
const ceSamplingMetadata = explain.queryPlanner.ceSamplingMetadata;
|
||||
assert(ceSamplingMetadata, "expected ceSamplingMetadata in queryPlanner", {explain});
|
||||
const ns = coll.getFullName();
|
||||
const meta = ceSamplingMetadata[ns];
|
||||
assert(meta, "expected ceSamplingMetadata entry for namespace " + ns, {ceSamplingMetadata});
|
||||
return meta;
|
||||
}
|
||||
|
||||
const prevCBRConfig = getCBRConfig(db);
|
||||
@ -172,42 +177,27 @@ try {
|
||||
}),
|
||||
);
|
||||
|
||||
// The test detects whether the persistent sample was used via CE as an indirect signal.
|
||||
// Persistent sample docs are all tagged `kPersistentTag`; source collection docs are all
|
||||
// tagged "from_source". Running `find({tag: kPersistentTag})` yields:
|
||||
// - Hit (persistent sample loaded): every sampled doc matches → selectivity 1.0
|
||||
// → CE = collCard = kSourceSize.
|
||||
// - Miss (on-the-fly sample used): no source doc carries kPersistentTag → selectivity 0
|
||||
// → CE = 1. See CardinalityEstimator::clampZeroEstimates().
|
||||
// This approach is fragile: any change to zero-estimate clamping will break these assertions.
|
||||
// TODO SERVER-124332: Replace with a direct check of the sampling source from explain output.
|
||||
const kPersistentTag = "from_persistent";
|
||||
const kHitCE = kSourceSize;
|
||||
const kMissCE = 1;
|
||||
const kSampleDocs = Array.from({length: kSampleSize}, (_, i) => ({_id: i, a: i}));
|
||||
|
||||
{
|
||||
jsTest.log.info("Testing random sampling technique with a persistent sample hit");
|
||||
assert.commandWorked(db.adminCommand({setParameter: 1, internalQuerySamplingCEMethod: "random"}));
|
||||
resetCollections();
|
||||
|
||||
const persistedDocs = [];
|
||||
for (let i = 0; i < kSampleSize; i++) {
|
||||
persistedDocs.push({_id: i, tag: kPersistentTag});
|
||||
}
|
||||
insertPersistedSample(
|
||||
buildPersistentSampleDoc({
|
||||
collectionUuid: getCollectionUuidString(),
|
||||
method: "random",
|
||||
sampleSize: kSampleSize,
|
||||
docs: persistedDocs,
|
||||
docs: kSampleDocs,
|
||||
}),
|
||||
);
|
||||
const foundCE = getWinningPlanCE({tag: kPersistentTag});
|
||||
assert.eq(
|
||||
foundCE,
|
||||
kHitCE,
|
||||
`Unexpected CE for random technique when persistent sample is hit: ${foundCE}. Expected ${kHitCE}`,
|
||||
);
|
||||
const meta = getWinningPlanMetadata({a: {$gte: 0}});
|
||||
assert.eq(meta.sampleSource, "persisted", "expected persisted sample on hit", {meta});
|
||||
assert.eq(meta.sampleTechnique, "random", "expected random technique", {meta});
|
||||
assert.eq(meta.sampleDocCount, kSampleSize, "expected docCount to match persisted sample size", {meta});
|
||||
assert.eq(meta.sampleRequestedDocCount, kSampleSize, "expected requestedDocCount to match", {meta});
|
||||
assert(!meta.hasOwnProperty("sampleNumChunks"), "random technique should not have numChunks", {meta});
|
||||
}
|
||||
|
||||
{
|
||||
@ -215,12 +205,11 @@ try {
|
||||
assert.commandWorked(db.adminCommand({setParameter: 1, internalQuerySamplingCEMethod: "random"}));
|
||||
resetCollections();
|
||||
|
||||
const foundCE = getWinningPlanCE({tag: kPersistentTag});
|
||||
assert.eq(
|
||||
foundCE,
|
||||
kMissCE,
|
||||
`Unexpected CE for random technique when no persistent sample exists: ${foundCE}. Expected ${kMissCE}`,
|
||||
);
|
||||
const meta = getWinningPlanMetadata({a: {$gte: 0}});
|
||||
assert.eq(meta.sampleSource, "onTheFly", "expected on-the-fly sample on miss", {meta});
|
||||
assert.eq(meta.sampleTechnique, "random", "expected random technique", {meta});
|
||||
assert.eq(meta.sampleRequestedDocCount, kSampleSize, "expected requestedDocCount to match", {meta});
|
||||
assert(!meta.hasOwnProperty("sampleNumChunks"), "random technique should not have numChunks", {meta});
|
||||
}
|
||||
|
||||
{
|
||||
@ -228,26 +217,21 @@ try {
|
||||
assert.commandWorked(db.adminCommand({setParameter: 1, internalQuerySamplingCEMethod: "chunk"}));
|
||||
resetCollections();
|
||||
|
||||
const persistedDocs = [];
|
||||
for (let i = 0; i < kSampleSize; i++) {
|
||||
persistedDocs.push({_id: i, tag: kPersistentTag});
|
||||
}
|
||||
insertPersistedSample(
|
||||
buildPersistentSampleDoc({
|
||||
collectionUuid: getCollectionUuidString(),
|
||||
method: "chunk",
|
||||
sampleSize: kSampleSize,
|
||||
docs: persistedDocs,
|
||||
docs: kSampleDocs,
|
||||
numChunks: kNumChunks,
|
||||
}),
|
||||
);
|
||||
|
||||
const foundCE = getWinningPlanCE({tag: kPersistentTag});
|
||||
assert.eq(
|
||||
foundCE,
|
||||
kHitCE,
|
||||
`Unexpected CE for chunk technique when persistent sample is hit: ${foundCE}. Expected ${kHitCE}`,
|
||||
);
|
||||
const meta = getWinningPlanMetadata({a: {$gte: 0}});
|
||||
assert.eq(meta.sampleSource, "persisted", "expected persisted sample on hit", {meta});
|
||||
assert.eq(meta.sampleTechnique, "chunk", "expected chunk technique", {meta});
|
||||
assert.eq(meta.sampleNumChunks, kNumChunks, "expected numChunks to match", {meta});
|
||||
assert.eq(meta.sampleDocCount, kSampleSize, "expected docCount to match persisted sample size", {meta});
|
||||
assert.eq(meta.sampleRequestedDocCount, kSampleSize, "expected requestedDocCount to match", {meta});
|
||||
}
|
||||
|
||||
{
|
||||
@ -255,12 +239,11 @@ try {
|
||||
assert.commandWorked(db.adminCommand({setParameter: 1, internalQuerySamplingCEMethod: "chunk"}));
|
||||
resetCollections();
|
||||
|
||||
const foundCE = getWinningPlanCE({tag: kPersistentTag});
|
||||
assert.eq(
|
||||
foundCE,
|
||||
kMissCE,
|
||||
`Unexpected CE for chunk technique when no persistent sample exists: ${foundCE}. Expected ${kMissCE}`,
|
||||
);
|
||||
const meta = getWinningPlanMetadata({a: {$gte: 0}});
|
||||
assert.eq(meta.sampleSource, "onTheFly", "expected on-the-fly sample on miss", {meta});
|
||||
assert.eq(meta.sampleTechnique, "chunk", "expected chunk technique", {meta});
|
||||
assert.eq(meta.sampleNumChunks, kNumChunks, "expected numChunks to match", {meta});
|
||||
assert.eq(meta.sampleRequestedDocCount, kSampleSize, "expected requestedDocCount to match", {meta});
|
||||
}
|
||||
} finally {
|
||||
setCBRConfig(db, prevCBRConfig);
|
||||
|
||||
@ -280,7 +280,7 @@ Status SubplanStage::pickBestPlan(const QueryPlannerParams& plannerParams,
|
||||
samplingEstimator.get(),
|
||||
exactCardinality.get(),
|
||||
std::move(branchResult->solutions),
|
||||
_query->getExplain().has_value());
|
||||
*_query);
|
||||
if (!statusWithCBRSolns.isOK()) {
|
||||
str::stream ss;
|
||||
ss << "Can't plan for subchild " << branchResult->canonicalQuery->toString() << " "
|
||||
|
||||
@ -47,6 +47,7 @@ using TopLevelFieldsProjection = StringSet;
|
||||
using ProjectionParams = std::variant<NoProjection, TopLevelFieldsProjection>;
|
||||
|
||||
using CardinalityEstimate = mongo::cost_based_ranker::CardinalityEstimate;
|
||||
using SamplingMetadata = mongo::cost_based_ranker::SamplingMetadata;
|
||||
|
||||
class SamplingEstimator {
|
||||
public:
|
||||
@ -130,6 +131,11 @@ public:
|
||||
virtual double getCollCard() const = 0;
|
||||
|
||||
virtual size_t getSampleSize() const = 0;
|
||||
|
||||
/**
|
||||
* Returns metadata about the sample used for cardinality estimation.
|
||||
*/
|
||||
virtual SamplingMetadata getSamplingMetadata() const = 0;
|
||||
};
|
||||
|
||||
} // namespace mongo::ce
|
||||
|
||||
@ -557,7 +557,11 @@ void SamplingEstimatorImpl::generateChunkSample() {
|
||||
}
|
||||
|
||||
void SamplingEstimatorImpl::generateSample(ce::ProjectionParams projectionParams) {
|
||||
_isSampleGenerated = true;
|
||||
tassert(12433201, "SamplingEstimatorImpl must not be reused", !_isSampleGenerated);
|
||||
// The final sample size (_sampleSize) may not be exactly the requested one
|
||||
// (_requestedSampleSize). Capturing here the requested sample size before it gets updated.
|
||||
_requestedSampleSize = _sampleSize;
|
||||
|
||||
if (auto topLevelSampleFieldNames =
|
||||
std::get_if<ce::TopLevelFieldsProjection>(&projectionParams)) {
|
||||
validateTopLevelSampleFieldNames(*topLevelSampleFieldNames);
|
||||
@ -570,17 +574,25 @@ void SamplingEstimatorImpl::generateSample(ce::ProjectionParams projectionParams
|
||||
|
||||
if (internalQuerySamplingBySequentialScan.load()) {
|
||||
// This is only used for testing purposes when a repeatable sample is needed.
|
||||
_usedSamplingTechnique = cost_based_ranker::SamplingTechnique::kSeqScan;
|
||||
generateSampleBySeqScanningForTesting();
|
||||
} else if (_sampleSize >= _collectionCard.cardinality().v()) {
|
||||
// If the required sample is larger than the collection, the sample is generated from all
|
||||
// the documents on the collection.
|
||||
_usedSamplingTechnique = cost_based_ranker::SamplingTechnique::kFullCollScan;
|
||||
generateFullCollScanSample();
|
||||
} else if (_samplingStyle == SamplingCEMethodEnum::kRandom) {
|
||||
_usedSamplingTechnique = cost_based_ranker::SamplingTechnique::kRandom;
|
||||
generateRandomSample();
|
||||
} else {
|
||||
tassert(9372901, "The number of chunks should be positive.", _numChunks && *_numChunks > 0);
|
||||
_usedSamplingTechnique = cost_based_ranker::SamplingTechnique::kChunk;
|
||||
generateChunkSample();
|
||||
}
|
||||
if (!_wasSamplePersisted) {
|
||||
_sampleCreatedAt = Date_t::now();
|
||||
}
|
||||
_isSampleGenerated = true;
|
||||
}
|
||||
|
||||
void SamplingEstimatorImpl::generateSampleBySeqScanningForTesting() {
|
||||
@ -994,10 +1006,35 @@ Status SamplingEstimatorImpl::tryLoadPersistentSample(SamplingCEMethodEnum metho
|
||||
_sample = parsed.getValue().getDocs();
|
||||
_sampleSize = _sample.size();
|
||||
_uniqueDocCount = boost::none;
|
||||
_isSampleGenerated = true;
|
||||
_wasSamplePersisted = true;
|
||||
_sampleCreatedAt = parsed.getValue().getCreatedAt();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Builds a SamplingMetadata value describing the sample currently held by this estimator:
// whether it was loaded from a persisted sample, its requested and actual document counts, an
// estimate of its memory footprint, the sampling technique, and its creation time.
//
// Trips a tassert if no sample has been generated or loaded yet (_isSampleGenerated is false),
// or if any sampled document is not an owned BSONObj.
SamplingMetadata SamplingEstimatorImpl::getSamplingMetadata() const {
|
||||
tassert(
|
||||
12433200, "getSamplingMetadata() called before sample was generated", _isSampleGenerated);
|
||||
// Account for: vector metadata, BSONObj object overhead per slot, and per-document
|
||||
// buffer allocation (BSON data + SharedBuffer::Holder ref-count header).
|
||||
// Note that capacity() is used rather than size(), so reserved-but-unused slots are counted.
size_t memorySizeBytes = sizeof(std::vector<BSONObj>) + sizeof(BSONObj) * _sample.capacity();
|
||||
for (const auto& doc : _sample) {
|
||||
tassert(12433202, "Sample documents must be owned BSONObjs", doc.isOwned());
|
||||
// TODO SERVER-126975. Read this from the persisted doc.
|
||||
memorySizeBytes += SharedBuffer::kHolderSize + static_cast<size_t>(doc.objsize());
|
||||
}
|
||||
SamplingMetadata meta;
|
||||
meta.isPersisted = _wasSamplePersisted;
|
||||
// The actual number of sampled documents; may differ from the requested count below.
meta.docCount = _sample.size();
|
||||
meta.requestedDocCount = _requestedSampleSize;
|
||||
meta.memorySizeBytes = memorySizeBytes;
|
||||
// NOTE(review): the optional is dereferenced without a has_value() check. generateSample()
// assigns it before dispatching to a sampling routine; confirm that the persisted-sample load
// path also assigns it before this function can be reached.
meta.technique = *_usedSamplingTechnique;
|
||||
// The chunk count is only reported when chunk-based sampling was actually used.
if (*_usedSamplingTechnique == cost_based_ranker::SamplingTechnique::kChunk) {
|
||||
meta.numChunks = _numChunks;
|
||||
}
|
||||
meta.createdAt = _sampleCreatedAt;
|
||||
return meta;
|
||||
}
|
||||
|
||||
SamplingEstimatorImpl::~SamplingEstimatorImpl() {}
|
||||
|
||||
CardinalityEstimate SamplingEstimatorImpl::estimateNDV(
|
||||
|
||||
@ -198,6 +198,17 @@ public:
|
||||
return _sampleSize;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the sampling metadata for the generated sample, which includes:
|
||||
* - the sampling technique
|
||||
* - the number of chunks (only when chunk-based sampling was used)
* - the requested sample size
|
||||
* - the actual sample size
|
||||
* - the memory size of the sample in bytes
|
||||
* - the sampling source (persistent vs on-the-fly)
|
||||
* - the date and time when the sample was generated
|
||||
*
* Must only be called after a sample has been generated or loaded; the implementation
* tasserts otherwise.
*/
|
||||
SamplingMetadata getSamplingMetadata() const final;
|
||||
|
||||
/**
|
||||
* For each document in a given sample, this helper calculates the number of
|
||||
* index keys which satisfy 'bounds', which may be >1 in the case of multi-key
|
||||
@ -390,6 +401,22 @@ private:
|
||||
// 'analyze' constructs its estimator with kOnTheFlySample so it always collects a fresh sample
|
||||
// (otherwise a refresh would just re-read the sample it's about to replace).
|
||||
SamplingSourceEnum _samplingSource;
|
||||
|
||||
// Set to true when tryLoadPersistentSample() successfully loads a sample from the stats
|
||||
// collection. Used to populate SamplingMetadata for explain output.
|
||||
bool _wasSamplePersisted = false;
|
||||
// The timestamp when the sample was created. For persisted samples this is read from the
|
||||
// stored document; for on-the-fly samples it is set to Date_t::now() at the end of
|
||||
// generateSample(). Always valid after generateSample() completes.
|
||||
boost::optional<Date_t> _sampleCreatedAt;
|
||||
// The number of documents requested when generateSample() was called. May differ from the
|
||||
// actual sample size (_sampleSize) in the following cases:
|
||||
// 1. The collection is smaller than the requested sample size (full collection scan used).
|
||||
// 2. Chunk-based sampling: if a random cursor lands on the last document in the collection,
|
||||
// no full chunk can be collected for that cursor, so the actual sample is smaller.
|
||||
size_t _requestedSampleSize = 0;
|
||||
// The actual sampling strategy used. Set by generateSample() before dispatch.
|
||||
boost::optional<cost_based_ranker::SamplingTechnique> _usedSamplingTechnique;
|
||||
};
|
||||
|
||||
} // namespace mongo::ce
|
||||
|
||||
@ -2246,54 +2246,6 @@ TEST_F(SamplingEstimatorTest, ChunkSamplingSkipsPersistentSampleWhenFeatureFlagD
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(SamplingEstimatorTest, LoadPersistentSampleResetsUniqueDocCountCache) {
|
||||
// _uniqueDocCount is a lazy cache of countUniqueDocuments(_sample), populated on the first
|
||||
// estimateNDV call after a sample is loaded. When tryLoadPersistentSample replaces _sample it
|
||||
// must also clear the cache — otherwise a stale count from a previous sample would be used.
|
||||
// TODO SERVER-112627: Remove once featureFlagPersistentStats is enabled by default.
|
||||
RAIIServerParameterControllerForTest persistentStatsFlag{"featureFlagPersistentStats", true};
|
||||
insertDocuments(kTestNss, {BSON("_id" << 1 << "tag" << "not_persisted")});
|
||||
const UUID uuid = [&] {
|
||||
auto srcColl = acquireCollection(operationContext(), kTestNss);
|
||||
return srcColl.getCollectionPtr()->uuid();
|
||||
}();
|
||||
std::vector<BSONObj> persistedDocs{BSON("_id" << 2 << "tag" << "persisted"),
|
||||
BSON("_id" << 3 << "tag" << "persisted"),
|
||||
BSON("_id" << 4 << "tag" << "persisted")};
|
||||
createCollAndInsertDocuments(
|
||||
operationContext(),
|
||||
NamespaceStringUtil::deserialize(kTestNss.dbName(), kSamplesCollectionName),
|
||||
{buildPersistentSampleDoc(
|
||||
uuid, SamplingCEMethodEnum::kRandom, persistedDocs.size(), persistedDocs)});
|
||||
|
||||
auto coll = acquireCollection(operationContext(), kTestNss);
|
||||
auto colls = MultipleCollectionAccessor(coll, {}, false);
|
||||
SamplingEstimatorForTesting estimator(operationContext(),
|
||||
colls,
|
||||
kTestNss,
|
||||
PlanYieldPolicy::YieldPolicy::YIELD_AUTO,
|
||||
persistedDocs.size(),
|
||||
SamplingCEMethodEnum::kRandom,
|
||||
numChunks,
|
||||
makeCardinalityEstimate(100));
|
||||
estimator.generateSample(ce::NoProjection{});
|
||||
ASSERT_FALSE(estimator.getUniqueDocCountForTesting().has_value());
|
||||
for (const auto& doc : estimator.getSample()) {
|
||||
ASSERT_EQUALS(doc.getStringField("tag"), "persisted");
|
||||
}
|
||||
|
||||
// Simulate the cache being populated by a prior estimateNDV call.
|
||||
estimator.setUniqueDocCountForTesting(99);
|
||||
ASSERT_TRUE(estimator.getUniqueDocCountForTesting().has_value());
|
||||
|
||||
// A second generateSample via the persistent path must clear the cache.
|
||||
estimator.generateSample(ce::NoProjection{});
|
||||
ASSERT_FALSE(estimator.getUniqueDocCountForTesting().has_value());
|
||||
for (const auto& doc : estimator.getSample()) {
|
||||
ASSERT_EQUALS(doc.getStringField("tag"), "persisted");
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(SamplingEstimatorTest, MalformedPersistentSampleFallsBackToOnTheFly) {
|
||||
// A doc with the correct _id key exists in system.stats.samples but is malformed (sampleSize
|
||||
// field disagrees with the docs array length). tryLoadPersistentSample must log the error and
|
||||
@ -2352,4 +2304,28 @@ TEST_F(SamplingEstimatorTest, MalformedPersistentSampleFallsBackToOnTheFly) {
|
||||
}
|
||||
}
|
||||
|
||||
// Death test: calling generateSample() on an estimator that already holds a generated sample
// must fail with the "SamplingEstimatorImpl must not be reused" assertion message.
DEATH_TEST_F(SamplingEstimatorTestDeathTest,
|
||||
GenerateSampleAssertsOnReuse,
|
||||
"SamplingEstimatorImpl must not be reused") {
|
||||
auto estimator =
|
||||
createSamplingEstimatorForTesting(10 /* collCard */, kSampleSize, ce::NoProjection{});
|
||||
// This is indeed the 2nd call since createSamplingEstimatorForTesting() calls generateSample()
|
||||
// once already.
|
||||
estimator.generateSample(ce::NoProjection{});
|
||||
}
|
||||
|
||||
// Death test: getSamplingMetadata() must fail with the "Sample documents must be owned BSONObjs"
// assertion message when the sample contains a BSONObj that does not own its buffer.
DEATH_TEST_F(SamplingEstimatorTestDeathTest,
|
||||
GetSamplingMetadataAssertsOnNonOwnedDoc,
|
||||
"Sample documents must be owned BSONObjs") {
|
||||
auto estimator =
|
||||
createSamplingEstimatorForTesting(10 /* collCard */, kSampleSize, ce::NoProjection{});
|
||||
|
||||
// Create a non-owned BSONObj (raw-pointer view into an existing buffer).
|
||||
// 'owned' stays in scope for the rest of the test, so the view's underlying buffer remains
// alive while 'unowned' is used.
auto owned = BSON("a" << 1);
|
||||
BSONObj unowned(owned.objdata());
|
||||
|
||||
// Replace the estimator's sample with the single non-owned document, then request the
// metadata; the ownership check on the sample documents is expected to fire.
estimator.setSampleForTesting({unowned});
|
||||
estimator.getSamplingMetadata();
|
||||
}
|
||||
|
||||
} // namespace mongo::ce
|
||||
|
||||
@ -29,17 +29,16 @@
|
||||
|
||||
#include "mongo/bson/json.h"
|
||||
#include "mongo/db/matcher/expression.h"
|
||||
#include "mongo/db/query/canonical_query.h"
|
||||
#include "mongo/db/query/compiler/ce/sampling/sampling_estimator.h"
|
||||
#include "mongo/db/query/compiler/ce/sampling/sampling_test_utils.h"
|
||||
#include "mongo/db/query/compiler/optimizer/cost_based_ranker/cbr_rewrites.h"
|
||||
#include "mongo/db/query/compiler/optimizer/cost_based_ranker/cbr_test_utils.h"
|
||||
#include "mongo/db/query/compiler/optimizer/index_bounds_builder/index_bounds_builder.h"
|
||||
#include "mongo/db/query/compiler/rewrites/matcher/expression_optimizer.h"
|
||||
#include "mongo/unittest/unittest.h"
|
||||
|
||||
#include <limits>
|
||||
|
||||
#include <gmock/gmock.h>
|
||||
|
||||
namespace mongo::cost_based_ranker {
|
||||
namespace {
|
||||
|
||||
@ -1054,6 +1053,9 @@ public:
|
||||
size_t getSampleSize() const override {
|
||||
return 100;
|
||||
}
|
||||
// Required by the SamplingEstimator interface. This test double is not expected to be asked
// for sampling metadata, so the body is marked unreachable.
ce::SamplingMetadata getSamplingMetadata() const override {
|
||||
MONGO_UNREACHABLE;
|
||||
}
|
||||
};
|
||||
|
||||
// Build IndexBounds with a single-point OIL on "a" and 'bIntervalCount' point intervals on "b".
|
||||
@ -1471,6 +1473,9 @@ public:
|
||||
size_t getSampleSize() const override {
|
||||
MONGO_UNIMPLEMENTED;
|
||||
}
|
||||
// Required by the SamplingEstimator interface. This test double is not expected to be asked
// for sampling metadata, so the body is marked unreachable.
ce::SamplingMetadata getSamplingMetadata() const override {
|
||||
MONGO_UNREACHABLE;
|
||||
}
|
||||
|
||||
private:
|
||||
double _ndv;
|
||||
|
||||
@ -576,6 +576,29 @@ CardinalityEstimate operator*(const SelectivityEstimate& s, const CardinalityEst
|
||||
|
||||
CardinalityEstimate operator*(const CardinalityEstimate& ce, const SelectivityEstimate& s);
|
||||
|
||||
/**
|
||||
* The actual strategy used to generate the sample.
|
||||
*/
|
||||
enum class SamplingTechnique {
|
||||
// Random sampling; chosen by SamplingEstimatorImpl::generateSample() when the configured
// sampling style is kRandom. Reported as "random" in explain output.
kRandom,
|
||||
// Chunk-based sampling; requires a positive number of chunks. Reported as "chunk" in explain
// output.
kChunk,
|
||||
// The sample consists of all documents in the collection; chosen when the requested sample
// size is at least the collection cardinality. Reported as "fullCollScan" in explain output.
kFullCollScan,
|
||||
// Sequential-scan sampling, used only for testing when a repeatable sample is needed.
// Reported as "seqScan" in explain output.
kSeqScan,
|
||||
};
|
||||
|
||||
/**
|
||||
* Metadata about the sample used when 'ceSource == Sampling'.
|
||||
*/
|
||||
// NOTE(review): the non-optional members below have no default member initializers, so a
// default-initialized SamplingMetadata holds indeterminate values until every field is assigned.
struct SamplingMetadata {
|
||||
// True when the sample was loaded from a persisted sample rather than generated on the fly.
// Explain reports this as sampleSource "persisted" vs "onTheFly".
bool isPersisted;
|
||||
size_t docCount; // number of documents in the sample
|
||||
size_t requestedDocCount; // number of documents originally requested
|
||||
// Estimated in-memory size of the sample, in bytes.
size_t memorySizeBytes;
|
||||
// The sampling strategy that was actually used to produce the sample.
SamplingTechnique technique;
|
||||
// Number of chunks; SamplingEstimatorImpl only populates this for SamplingTechnique::kChunk.
boost::optional<int> numChunks;
|
||||
// When the sample was created. Explain generation tasserts that this is set.
boost::optional<Date_t> createdAt;
|
||||
};
|
||||
|
||||
/**
|
||||
* The optimizer's estimate of a single QSN in the physical plan.
|
||||
*/
|
||||
|
||||
@ -170,12 +170,8 @@ const QuerySolution* bestCBRPlan(CanonicalQuery* cq,
|
||||
double generateSampleTimeMS = generateSampleTimer.elapsed().count() / 1000.0;
|
||||
|
||||
Timer planningTimer;
|
||||
auto statusWithCBRSolns =
|
||||
QueryPlanner::planWithCostBasedRanking(plannerParams,
|
||||
samplingEstimator.get(),
|
||||
nullptr,
|
||||
std::move(statusWithMultiPlanSolns),
|
||||
cq->getExplain().has_value());
|
||||
auto statusWithCBRSolns = QueryPlanner::planWithCostBasedRanking(
|
||||
plannerParams, samplingEstimator.get(), nullptr, std::move(statusWithMultiPlanSolns), *cq);
|
||||
double planTimeMS = planningTimer.elapsed().count() / 1000.0;
|
||||
|
||||
if (timeProfile.has_value()) {
|
||||
|
||||
@ -322,8 +322,7 @@ StatusWith<JoinReorderedExecutorResult> getJoinReorderedExecutor(
|
||||
// Select access plans for each table in the join.
|
||||
auto yieldPolicy = PlanYieldPolicy::YieldPolicy::YIELD_AUTO;
|
||||
SamplingEstimatorMap samplingEstimators = makeSamplingEstimators(mca, model.graph, yieldPolicy);
|
||||
auto swAccessPlans = singleTableAccessPlans(
|
||||
opCtx, mca, model.graph, samplingEstimators, expCtx->getExplain().has_value());
|
||||
auto swAccessPlans = singleTableAccessPlans(opCtx, mca, model.graph, samplingEstimators);
|
||||
if (!swAccessPlans.isOK()) {
|
||||
return swAccessPlans.getStatus();
|
||||
}
|
||||
|
||||
@ -82,8 +82,7 @@ StatusWith<SingleTableAccessPlansResult> singleTableAccessPlans(
|
||||
OperationContext* opCtx,
|
||||
const MultipleCollectionAccessor& collections,
|
||||
const JoinGraph& graph,
|
||||
const SamplingEstimatorMap& samplingEstimators,
|
||||
bool isExplain) {
|
||||
const SamplingEstimatorMap& samplingEstimators) {
|
||||
const auto numNodes = graph.numNodes();
|
||||
QuerySolutionMap solns;
|
||||
cost_based_ranker::EstimateMap estimates;
|
||||
@ -148,7 +147,7 @@ StatusWith<SingleTableAccessPlansResult> singleTableAccessPlans(
|
||||
samplingEstimator.get(),
|
||||
nullptr /*exactCardinality*/,
|
||||
std::move(swSolns.getValue()),
|
||||
isExplain);
|
||||
*node.accessPath);
|
||||
// Return bad status if CBR is unable to produce a plan
|
||||
if (!swCbrResult.isOK()) {
|
||||
return swCbrResult.getStatus();
|
||||
|
||||
@ -56,7 +56,6 @@ StatusWith<SingleTableAccessPlansResult> singleTableAccessPlans(
|
||||
OperationContext* opCtx,
|
||||
const MultipleCollectionAccessor& collections,
|
||||
const JoinGraph& model,
|
||||
const SamplingEstimatorMap& samplingEstimators,
|
||||
bool isExplain);
|
||||
const SamplingEstimatorMap& samplingEstimators);
|
||||
|
||||
} // namespace mongo::join_ordering
|
||||
|
||||
@ -97,7 +97,7 @@ TEST_F(SingleTableAccessTestFixture, EstimatesPopulated) {
|
||||
ASSERT(node2);
|
||||
|
||||
JoinGraph graph(std::move(mgraph));
|
||||
auto swRes = singleTableAccessPlans(opCtx, mca, graph, estimators, false);
|
||||
auto swRes = singleTableAccessPlans(opCtx, mca, graph, estimators);
|
||||
ASSERT_OK(swRes);
|
||||
|
||||
auto& res = swRes.getValue();
|
||||
|
||||
@ -232,6 +232,10 @@ public:
|
||||
MONGO_UNREACHABLE;
|
||||
}
|
||||
|
||||
// Required by the SamplingEstimator interface. This test double is not expected to be asked
// for sampling metadata, so the body is marked unreachable.
ce::SamplingMetadata getSamplingMetadata() const override {
|
||||
MONGO_UNREACHABLE;
|
||||
}
|
||||
|
||||
private:
|
||||
CardinalityEstimate _collCard;
|
||||
stdx::unordered_map<std::vector<FieldPath>, CardinalityEstimate> _fakeEstimates;
|
||||
|
||||
@ -172,6 +172,41 @@ void generatePlannerInfo(PlanExecutor* exec,
|
||||
}
|
||||
|
||||
auto&& explainer = exec->getPlanExplainer();
|
||||
|
||||
if (const auto ceSamplingMeta = explainer.getCeSamplingMetadata(); ceSamplingMeta.has_value()) {
|
||||
BSONObjBuilder ceSamplingMetaBob(plannerBob.subobjStart("ceSamplingMetadata"));
|
||||
for (const auto& [ns, meta] : ceSamplingMeta.value()) {
|
||||
BSONObjBuilder nsMetaBob(ceSamplingMetaBob.subobjStart(ns));
|
||||
nsMetaBob.append("sampleSource", meta.isPersisted ? "persisted" : "onTheFly");
|
||||
static constexpr auto techniqueToStr =
|
||||
[](cost_based_ranker::SamplingTechnique t) -> StringData {
|
||||
switch (t) {
|
||||
case cost_based_ranker::SamplingTechnique::kRandom:
|
||||
return "random"_sd;
|
||||
case cost_based_ranker::SamplingTechnique::kChunk:
|
||||
return "chunk"_sd;
|
||||
case cost_based_ranker::SamplingTechnique::kFullCollScan:
|
||||
return "fullCollScan"_sd;
|
||||
case cost_based_ranker::SamplingTechnique::kSeqScan:
|
||||
return "seqScan"_sd;
|
||||
}
|
||||
MONGO_UNREACHABLE;
|
||||
};
|
||||
nsMetaBob.append("sampleTechnique", techniqueToStr(meta.technique));
|
||||
if (meta.technique == cost_based_ranker::SamplingTechnique::kChunk && meta.numChunks) {
|
||||
nsMetaBob.appendNumber("sampleNumChunks", *meta.numChunks);
|
||||
}
|
||||
nsMetaBob.appendNumber("sampleRequestedDocCount",
|
||||
static_cast<long long>(meta.requestedDocCount));
|
||||
nsMetaBob.appendNumber("sampleDocCount", static_cast<long long>(meta.docCount));
|
||||
nsMetaBob.appendNumber("sampleMemorySizeBytes",
|
||||
static_cast<long long>(meta.memorySizeBytes));
|
||||
tassert(12433203,
|
||||
"SamplingMetadata::createdAt must be set before explain is generated",
|
||||
meta.createdAt.has_value());
|
||||
nsMetaBob.appendDate("sampleCreatedAt", meta.createdAt.value());
|
||||
}
|
||||
}
|
||||
auto&& enumeratorInfo = explainer.getEnumeratorInfo();
|
||||
plannerBob.append("maxIndexedOrSolutionsReached", enumeratorInfo.hitIndexedOrLimit);
|
||||
plannerBob.append("maxIndexedAndSolutionsReached", enumeratorInfo.hitIndexedAndLimit);
|
||||
|
||||
@ -39,6 +39,7 @@
|
||||
#include "mongo/db/query/stage_builder/classic_stage_builder.h"
|
||||
#include "mongo/util/duration.h"
|
||||
#include "mongo/util/modules.h"
|
||||
#include "mongo/util/string_map.h"
|
||||
|
||||
namespace mongo {
|
||||
|
||||
@ -59,6 +60,9 @@ struct PlanExplainerData {
|
||||
boost::optional<double> multiPlannerWinningPlanScore;
|
||||
stage_builder::PlanStageToQsnMap planStageQsnMap;
|
||||
cost_based_ranker::EstimateMap estimates;
|
||||
// Namespace-keyed map of sampling metadata emitted under queryPlanner.ceSamplingMetadata.
|
||||
// Populated on the explain path when CBR used a sampling estimator.
|
||||
StringMap<cost_based_ranker::SamplingMetadata> ceSamplingMetadata;
|
||||
bool fromPlanCache = false;
|
||||
};
|
||||
|
||||
@ -70,6 +74,12 @@ inline PlanExplainerData& operator<<(PlanExplainerData& lhs, PlanExplainerData&&
|
||||
for (auto& [k, v] : rhs.estimates) {
|
||||
lhs.estimates.insert_or_assign(k, std::move(v));
|
||||
}
|
||||
for (auto& [ns, meta] : rhs.ceSamplingMetadata) {
|
||||
tassert(12433204,
|
||||
"ceSamplingMetadata already has an entry for namespace during merge",
|
||||
!lhs.ceSamplingMetadata.contains(ns));
|
||||
lhs.ceSamplingMetadata.emplace(ns, std::move(meta));
|
||||
}
|
||||
return lhs;
|
||||
}
|
||||
|
||||
@ -195,6 +205,16 @@ public:
|
||||
_solution = qs;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the per-collection sampling metadata to be emitted under
|
||||
* queryPlanner.ceSamplingMetadata in explain output. Returns boost::none if no sampling
|
||||
* metadata is available (e.g., CBR was not used, or this is not a classic-engine plan).
|
||||
*
* This base implementation always returns boost::none; explainers that track sampling
* metadata override it.
*/
|
||||
virtual boost::optional<StringMap<cost_based_ranker::SamplingMetadata>> getCeSamplingMetadata()
|
||||
const {
|
||||
return boost::none;
|
||||
}
|
||||
|
||||
protected:
|
||||
const QuerySolution* _solution{nullptr};
|
||||
PlanEnumeratorExplainInfo _enumeratorExplainInfo;
|
||||
|
||||
@ -77,6 +77,14 @@ public:
|
||||
std::vector<PlanStatsDetails> getCachedPlanStats(const plan_cache_debug_info::DebugInfo&,
|
||||
ExplainOptions::Verbosity) const;
|
||||
|
||||
// Returns a copy of the namespace-keyed sampling metadata held in '_explainData', or
// boost::none when that map is empty, so callers can test has_value() to decide whether to
// emit a 'ceSamplingMetadata' section at all.
boost::optional<StringMap<cost_based_ranker::SamplingMetadata>> getCeSamplingMetadata()
|
||||
const override {
|
||||
if (_explainData.ceSamplingMetadata.empty()) {
|
||||
return boost::none;
|
||||
}
|
||||
return _explainData.ceSamplingMetadata;
|
||||
}
|
||||
|
||||
private:
|
||||
/**
|
||||
* A helper that formats the plan stats into a BSON object and collects summary stats.
|
||||
|
||||
@ -34,6 +34,7 @@
|
||||
#include "mongo/db/pipeline/pipeline_d.h"
|
||||
#include "mongo/db/query/canonical_query.h"
|
||||
#include "mongo/db/query/compiler/physical_model/query_solution/query_solution.h"
|
||||
#include "mongo/db/query/explain.h"
|
||||
#include "mongo/db/query/explain_diagnostic_printer.h"
|
||||
#include "mongo/db/query/get_executor.h"
|
||||
#include "mongo/db/query/multiple_collection_accessor.h"
|
||||
@ -661,5 +662,69 @@ TEST_F(PlanExplainerTest, PlanExplainerDataMergeFull) {
|
||||
ASSERT_EQ(data1.estimates.size(), 2);
|
||||
}
|
||||
|
||||
TEST_F(PlanExplainerTest, CBRSamplingMetadataSerializedInExplain) {
|
||||
// Verify that when CBR uses sampling CE, the 'ceSamplingMetadata' section appears in the
|
||||
// queryPlanner explain output and contains the expected fields for each collection.
|
||||
RAIIServerParameterControllerForTest samplingController("internalQueryCBRCEMode", "samplingCE");
|
||||
|
||||
// Mark the expression context as an explain at queryPlanner verbosity before the query is
// canonicalized and planned.
const auto verbosity = ExplainOptions::Verbosity::kQueryPlanner;
|
||||
expCtx->setExplain(verbosity);
|
||||
|
||||
auto coll = acquireCollection(
|
||||
operationContext(),
|
||||
CollectionAcquisitionRequest::fromOpCtx(
|
||||
operationContext(), kNss, AcquisitionPrerequisites::OperationType::kRead),
|
||||
MODE_IS);
|
||||
MultipleCollectionAccessor colls{coll};
|
||||
|
||||
// Build a canonical find query with range predicates on both 'a' and 'b'.
auto findCommand = std::make_unique<FindCommandRequest>(kNss);
|
||||
findCommand->setFilter(fromjson("{a: {$gte: 0}, b: {$gte: 0}}"));
|
||||
auto cq = std::make_unique<CanonicalQuery>(CanonicalQueryParams{
|
||||
.expCtx = expCtx,
|
||||
.parsedFind = ParsedFindCommandParams{.findCommand = std::move(findCommand)}});
|
||||
|
||||
// Record the 'find' command and namespace on the operation's CurOp, under the client lock.
Command* cmd = CommandHelpers::findCommand(operationContext(), "find");
|
||||
{
|
||||
std::lock_guard<Client> clientLock(*operationContext()->getClient());
|
||||
CurOp::get(operationContext())
|
||||
->setGenericOpRequestDetails(clientLock, kNss, cmd, BSONObj(), NetworkOp::dbQuery);
|
||||
}
|
||||
|
||||
auto swExec = getExecutorFind(
|
||||
operationContext(), colls, std::move(cq), PlanYieldPolicy::YieldPolicy::INTERRUPT_ONLY);
|
||||
ASSERT_OK(swExec);
|
||||
|
||||
// Serialize the explain output for the executor into 'bob'.
BSONObjBuilder bob;
|
||||
Explain::explainStages(swExec.getValue().get(),
|
||||
colls,
|
||||
verbosity,
|
||||
Status::OK(),
|
||||
boost::none,
|
||||
BSONObj(),
|
||||
SerializationContext::stateCommandReply(),
|
||||
BSONObj(),
|
||||
&bob);
|
||||
const BSONObj explained = bob.obj();
|
||||
|
||||
auto queryPlanner = explained["queryPlanner"];
|
||||
ASSERT(queryPlanner.isABSONObj()) << "Missing queryPlanner in: " << explained;
|
||||
|
||||
auto ceSamplingMeta = queryPlanner["ceSamplingMetadata"];
|
||||
ASSERT(ceSamplingMeta.isABSONObj())
|
||||
<< "Missing ceSamplingMetadata in queryPlanner: " << queryPlanner;
|
||||
|
||||
// Exactly one namespace entry expected.
|
||||
ASSERT_EQ(ceSamplingMeta.Obj().nFields(), 1) << ceSamplingMeta;
|
||||
const BSONElement nsElem = ceSamplingMeta.Obj().firstElement();
|
||||
ASSERT_EQ(nsElem.type(), BSONType::object);
|
||||
const BSONObj nsMeta = nsElem.Obj();
|
||||
|
||||
// This test inserts no persisted sample itself, so the sample source is expected to be
// on-the-fly. For the remaining fields only their presence is checked, not their values.
// NOTE(review): explain also emits 'sampleCreatedAt', which is not checked here.
ASSERT_EQ(nsMeta["sampleSource"].String(), "onTheFly");
|
||||
ASSERT(nsMeta.hasField("sampleTechnique")) << nsMeta;
|
||||
ASSERT(nsMeta.hasField("sampleDocCount")) << nsMeta;
|
||||
ASSERT(nsMeta.hasField("sampleRequestedDocCount")) << nsMeta;
|
||||
ASSERT(nsMeta.hasField("sampleMemorySizeBytes")) << nsMeta;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace mongo
|
||||
|
||||
@ -36,6 +36,7 @@
|
||||
#include "mongo/db/query/compiler/ce/exact/exact_cardinality_impl.h"
|
||||
#include "mongo/db/query/compiler/ce/sampling/sampling_estimator.h"
|
||||
#include "mongo/db/query/compiler/ce/sampling/sampling_estimator_impl.h"
|
||||
#include "mongo/db/query/compiler/optimizer/cost_based_ranker/estimates.h"
|
||||
#include "mongo/db/query/planner_analysis.h"
|
||||
#include "mongo/db/stats/counters.h"
|
||||
|
||||
@ -211,7 +212,7 @@ StatusWith<PlanRankingResult> CBRPlanRankingStrategy::rankPlans(
|
||||
samplingEstimator.get(),
|
||||
exactCardinality.get(),
|
||||
std::move(statusWithMultiPlanSolns),
|
||||
query.getExplain().has_value());
|
||||
query);
|
||||
|
||||
// Calculate duration for server status metrics
|
||||
auto durationMicros = tickSource->ticksTo<Microseconds>(tickSource->getTicks() - startTicks);
|
||||
|
||||
@ -1768,7 +1768,7 @@ StatusWith<PlanRankingResult> QueryPlanner::planWithCostBasedRanking(
|
||||
ce::SamplingEstimator* samplingEstimator,
|
||||
const ce::ExactCardinalityEstimator* exactCardinality,
|
||||
StatusWith<std::vector<std::unique_ptr<QuerySolution>>> statusWithMultiPlanSolns,
|
||||
bool isExplain) {
|
||||
const CanonicalQuery& query) {
|
||||
using namespace cost_based_ranker;
|
||||
auto cbrMode = params.planRankerMode;
|
||||
EstimateMap estimates;
|
||||
@ -1849,7 +1849,7 @@ StatusWith<PlanRankingResult> QueryPlanner::planWithCostBasedRanking(
|
||||
PlanRankingResult{.solutions = std::move(acceptedSoln),
|
||||
.maybeExplainData = PlanExplainerData{.estimates = std::move(estimates)},
|
||||
.needsWorksMeasuredForPlanCache = successfullyChoseWinner};
|
||||
if (isExplain) {
|
||||
if (query.getExplain()) {
|
||||
std::vector<SolutionWithPlanStage> rejectedSolnWithStages;
|
||||
rejectedSolnWithStages.reserve(rejectedSoln.size());
|
||||
std::transform(std::make_move_iterator(rejectedSoln.begin()),
|
||||
@ -1860,6 +1860,12 @@ StatusWith<PlanRankingResult> QueryPlanner::planWithCostBasedRanking(
|
||||
});
|
||||
planRankingResult.maybeExplainData->rejectedPlansWithStages =
|
||||
std::move(rejectedSolnWithStages);
|
||||
if (samplingEstimator) {
|
||||
planRankingResult.maybeExplainData->ceSamplingMetadata.emplace(
|
||||
NamespaceStringUtil::serialize(query.nss(),
|
||||
query.getExpCtx()->getSerializationContext()),
|
||||
samplingEstimator->getSamplingMetadata());
|
||||
}
|
||||
}
|
||||
return std::move(planRankingResult);
|
||||
}
|
||||
|
||||
@ -129,14 +129,15 @@ public:
|
||||
* estimation (CE) and costing modules. The return value contains a list of plans that were
|
||||
* rejected on the basis of cost, as well as any non-rejected plans from which the caller can
|
||||
* select a winner.
|
||||
* If isExplain is true, collect and return planExplainerData as part of the PlanRankingResult.
|
||||
* If query.getExplain().has_value(), collect and return planExplainerData as part of the
|
||||
* PlanRankingResult.
|
||||
*/
|
||||
static StatusWith<PlanRankingResult> planWithCostBasedRanking(
|
||||
const QueryPlannerParams& params,
|
||||
ce::SamplingEstimator* samplingEstimator,
|
||||
const ce::ExactCardinalityEstimator* exactCardinality,
|
||||
StatusWith<std::vector<std::unique_ptr<QuerySolution>>> statusWithMultiPlanSolns,
|
||||
bool isExplain);
|
||||
const CanonicalQuery& query);
|
||||
|
||||
/**
|
||||
* Generates and returns a query solution, given data retrieved from the plan cache.
|
||||
|
||||
Loading…
Reference in New Issue
Block a user