diff --git a/jstests/aggregation/sources/unionWith/unionWith_explain.js b/jstests/aggregation/sources/unionWith/unionWith_explain.js index 62b052ffadd..8c5d790c4bc 100644 --- a/jstests/aggregation/sources/unionWith/unionWith_explain.js +++ b/jstests/aggregation/sources/unionWith/unionWith_explain.js @@ -25,7 +25,7 @@ for (let i = 0; i < docsPerColl; i++) { assert.commandWorked(collC.insert({c: i, val: 10 - i, groupKey: i})); } -const executionStatsIngoredFields = [ +const executionStatsIgnoredFields = [ "executionTimeMillis", "executionTimeMillisEstimate", "saveState", @@ -36,10 +36,17 @@ const executionStatsIngoredFields = [ "numKeysEstimate", ]; -const stagesIgnoredFields = ["slots", "optimizationTimeMillis", "planCacheKey", "querySettings", "isCached"]; +const stagesIgnoredFields = [ + "slots", + "optimizationTimeMillis", + "planCacheKey", + "querySettings", + "isCached", + "ceSamplingMetadata", +]; const mongosIgnoredFields = ["works", "needTime", "queryHash", "planCacheShapeHash", "optimizationTimeMillis"].concat( - executionStatsIngoredFields, + executionStatsIgnoredFields, stagesIgnoredFields, ); @@ -108,7 +115,7 @@ function assertExplainEq(getUnion, getRegular) { documentEqWithIgnoredFields( unionStats.executionStages, regularStats.executionStages, - executionStatsIngoredFields, + executionStatsIgnoredFields, ), buildErrorString(unionStats, regularStats, "executionStages"), ); @@ -122,7 +129,7 @@ function assertExplainEq(getUnion, getRegular) { assert( arrayEqWithIgnoredFields(union, regular.stages, [ ...stagesIgnoredFields, - ...executionStatsIngoredFields, + ...executionStatsIgnoredFields, ]), buildErrorString(union, regular, "stages with executionStats"), ); diff --git a/jstests/noPassthroughWithMongod/query/cbr/cbr_persistent_sample.js b/jstests/noPassthroughWithMongod/query/cbr/cbr_persistent_sample.js index 861813d6ee2..2f7ba19eab5 100644 --- a/jstests/noPassthroughWithMongod/query/cbr/cbr_persistent_sample.js +++ b/jstests/noPassthroughWithMongod/query/cbr/cbr_persistent_sample.js @@ -141,12 +141,17 @@ function dropSamplesCollection() { }); } -function getWinningPlanCE(query) { +function getWinningPlanMetadata(query) { const explain = coll.find(query).explain(); const plan = getWinningPlanFromExplain(explain); - assert(isCollscan(db, plan), `expected a COLLSCAN plan: ${tojson(plan)}`); - assert.eq(plan.estimatesMetadata.ceSource, "Sampling", plan); - return plan.cardinalityEstimate; + assert(isCollscan(db, plan), "expected a COLLSCAN plan", {plan}); + assert.eq(plan.estimatesMetadata.ceSource, "Sampling", "expected Sampling CE source", {plan}); + const ceSamplingMetadata = explain.queryPlanner.ceSamplingMetadata; + assert(ceSamplingMetadata, "expected ceSamplingMetadata in queryPlanner", {explain}); + const ns = coll.getFullName(); + const meta = ceSamplingMetadata[ns]; + assert(meta, "expected ceSamplingMetadata entry for namespace " + ns, {ceSamplingMetadata}); + return meta; } const prevCBRConfig = getCBRConfig(db); @@ -172,42 +177,27 @@ try { }), ); - // The test detects whether the persistent sample was used via CE as an indirect signal. - // Persistent sample docs are all tagged `kPersistentTag`; source collection docs are all - // tagged "from_source". Running `find({tag: kPersistentTag})` yields: - // - Hit (persistent sample loaded): every sampled doc matches → selectivity 1.0 - // → CE = collCard = kSourceSize. - // - Miss (on-the-fly sample used): no source doc carries kPersistentTag → selectivity 0 - // → CE = 1. See CardinalityEstimator::clampZeroEstimates(). - // This approach is fragile: any change to zero-estimate clamping will break these assertions. - // TODO SERVER-124332: Replace with a direct check of the sampling source from explain output. - const kPersistentTag = "from_persistent"; - const kHitCE = kSourceSize; - const kMissCE = 1; + const kSampleDocs = Array.from({length: kSampleSize}, (_, i) => ({_id: i, a: i})); { jsTest.log.info("Testing random sampling technique with a persistent sample hit"); assert.commandWorked(db.adminCommand({setParameter: 1, internalQuerySamplingCEMethod: "random"})); resetCollections(); - const persistedDocs = []; - for (let i = 0; i < kSampleSize; i++) { - persistedDocs.push({_id: i, tag: kPersistentTag}); - } insertPersistedSample( buildPersistentSampleDoc({ collectionUuid: getCollectionUuidString(), method: "random", sampleSize: kSampleSize, - docs: persistedDocs, + docs: kSampleDocs, }), ); - const foundCE = getWinningPlanCE({tag: kPersistentTag}); - assert.eq( - foundCE, - kHitCE, - `Unexpected CE for random technique when persistent sample is hit: ${foundCE}. Expected ${kHitCE}`, - ); + const meta = getWinningPlanMetadata({a: {$gte: 0}}); + assert.eq(meta.sampleSource, "persisted", "expected persisted sample on hit", {meta}); + assert.eq(meta.sampleTechnique, "random", "expected random technique", {meta}); + assert.eq(meta.sampleDocCount, kSampleSize, "expected docCount to match persisted sample size", {meta}); + assert.eq(meta.sampleRequestedDocCount, kSampleSize, "expected requestedDocCount to match", {meta}); + assert(!meta.hasOwnProperty("sampleNumChunks"), "random technique should not have numChunks", {meta}); } { @@ -215,12 +205,11 @@ try { assert.commandWorked(db.adminCommand({setParameter: 1, internalQuerySamplingCEMethod: "random"})); resetCollections(); - const foundCE = getWinningPlanCE({tag: kPersistentTag}); - assert.eq( - foundCE, - kMissCE, - `Unexpected CE for random technique when no persistent sample exists: ${foundCE}. Expected ${kMissCE}`, - ); + const meta = getWinningPlanMetadata({a: {$gte: 0}}); + assert.eq(meta.sampleSource, "onTheFly", "expected on-the-fly sample on miss", {meta}); + assert.eq(meta.sampleTechnique, "random", "expected random technique", {meta}); + assert.eq(meta.sampleRequestedDocCount, kSampleSize, "expected requestedDocCount to match", {meta}); + assert(!meta.hasOwnProperty("sampleNumChunks"), "random technique should not have numChunks", {meta}); } { @@ -228,26 +217,21 @@ try { assert.commandWorked(db.adminCommand({setParameter: 1, internalQuerySamplingCEMethod: "chunk"})); resetCollections(); - const persistedDocs = []; - for (let i = 0; i < kSampleSize; i++) { - persistedDocs.push({_id: i, tag: kPersistentTag}); - } insertPersistedSample( buildPersistentSampleDoc({ collectionUuid: getCollectionUuidString(), method: "chunk", sampleSize: kSampleSize, - docs: persistedDocs, + docs: kSampleDocs, numChunks: kNumChunks, }), ); - - const foundCE = getWinningPlanCE({tag: kPersistentTag}); - assert.eq( - foundCE, - kHitCE, - `Unexpected CE for chunk technique when persistent sample is hit: ${foundCE}. Expected ${kHitCE}`, - ); + const meta = getWinningPlanMetadata({a: {$gte: 0}}); + assert.eq(meta.sampleSource, "persisted", "expected persisted sample on hit", {meta}); + assert.eq(meta.sampleTechnique, "chunk", "expected chunk technique", {meta}); + assert.eq(meta.sampleNumChunks, kNumChunks, "expected numChunks to match", {meta}); + assert.eq(meta.sampleDocCount, kSampleSize, "expected docCount to match persisted sample size", {meta}); + assert.eq(meta.sampleRequestedDocCount, kSampleSize, "expected requestedDocCount to match", {meta}); } { @@ -255,12 +239,11 @@ try { assert.commandWorked(db.adminCommand({setParameter: 1, internalQuerySamplingCEMethod: "chunk"})); resetCollections(); - const foundCE = getWinningPlanCE({tag: kPersistentTag}); - assert.eq( - foundCE, - kMissCE, - `Unexpected CE for chunk technique when no persistent sample exists: ${foundCE}. Expected ${kMissCE}`, - ); + const meta = getWinningPlanMetadata({a: {$gte: 0}}); + assert.eq(meta.sampleSource, "onTheFly", "expected on-the-fly sample on miss", {meta}); + assert.eq(meta.sampleTechnique, "chunk", "expected chunk technique", {meta}); + assert.eq(meta.sampleNumChunks, kNumChunks, "expected numChunks to match", {meta}); + assert.eq(meta.sampleRequestedDocCount, kSampleSize, "expected requestedDocCount to match", {meta}); } } finally { setCBRConfig(db, prevCBRConfig); diff --git a/src/mongo/db/exec/classic/subplan.cpp b/src/mongo/db/exec/classic/subplan.cpp index 47016fafb5f..f44927ff942 100644 --- a/src/mongo/db/exec/classic/subplan.cpp +++ b/src/mongo/db/exec/classic/subplan.cpp @@ -280,7 +280,7 @@ Status SubplanStage::pickBestPlan(const QueryPlannerParams& plannerParams, samplingEstimator.get(), exactCardinality.get(), std::move(branchResult->solutions), - _query->getExplain().has_value()); + *_query); if (!statusWithCBRSolns.isOK()) { str::stream ss; ss << "Can't plan for subchild " << branchResult->canonicalQuery->toString() << " " diff --git a/src/mongo/db/query/compiler/ce/sampling/sampling_estimator.h b/src/mongo/db/query/compiler/ce/sampling/sampling_estimator.h index 49d24e916fd..fb27e93497f 100644 --- a/src/mongo/db/query/compiler/ce/sampling/sampling_estimator.h +++ b/src/mongo/db/query/compiler/ce/sampling/sampling_estimator.h @@ -47,6 +47,7 @@ using TopLevelFieldsProjection = StringSet; using ProjectionParams = std::variant; using CardinalityEstimate = mongo::cost_based_ranker::CardinalityEstimate; +using SamplingMetadata = mongo::cost_based_ranker::SamplingMetadata; class SamplingEstimator { public: @@ -130,6 +131,11 @@ public: virtual double getCollCard() const = 0; virtual size_t getSampleSize() const = 0; + + /** + * Returns metadata about the sample used for cardinality estimation. + */ + virtual SamplingMetadata getSamplingMetadata() const = 0; }; } // namespace mongo::ce diff --git a/src/mongo/db/query/compiler/ce/sampling/sampling_estimator_impl.cpp b/src/mongo/db/query/compiler/ce/sampling/sampling_estimator_impl.cpp index b6418a52385..df22817c605 100644 --- a/src/mongo/db/query/compiler/ce/sampling/sampling_estimator_impl.cpp +++ b/src/mongo/db/query/compiler/ce/sampling/sampling_estimator_impl.cpp @@ -557,7 +557,11 @@ void SamplingEstimatorImpl::generateChunkSample() { } void SamplingEstimatorImpl::generateSample(ce::ProjectionParams projectionParams) { - _isSampleGenerated = true; + tassert(12433201, "SamplingEstimatorImpl must not be reused", !_isSampleGenerated); + // The final sample size (_sampleSize) may not be exactly the requested one + // (_requestedSampleSize). Capturing here the requested sample size before it gets updated. + _requestedSampleSize = _sampleSize; + if (auto topLevelSampleFieldNames = std::get_if(&projectionParams)) { validateTopLevelSampleFieldNames(*topLevelSampleFieldNames); @@ -570,17 +574,25 @@ void SamplingEstimatorImpl::generateSample(ce::ProjectionParams projectionParams if (internalQuerySamplingBySequentialScan.load()) { // This is only used for testing purposes when a repeatable sample is needed. + _usedSamplingTechnique = cost_based_ranker::SamplingTechnique::kSeqScan; generateSampleBySeqScanningForTesting(); } else if (_sampleSize >= _collectionCard.cardinality().v()) { // If the required sample is larger than the collection, the sample is generated from all // the documents on the collection. + _usedSamplingTechnique = cost_based_ranker::SamplingTechnique::kFullCollScan; generateFullCollScanSample(); } else if (_samplingStyle == SamplingCEMethodEnum::kRandom) { + _usedSamplingTechnique = cost_based_ranker::SamplingTechnique::kRandom; generateRandomSample(); } else { tassert(9372901, "The number of chunks should be positive.", _numChunks && *_numChunks > 0); + _usedSamplingTechnique = cost_based_ranker::SamplingTechnique::kChunk; generateChunkSample(); } + if (!_wasSamplePersisted) { + _sampleCreatedAt = Date_t::now(); + } + _isSampleGenerated = true; } void SamplingEstimatorImpl::generateSampleBySeqScanningForTesting() { @@ -994,10 +1006,35 @@ Status SamplingEstimatorImpl::tryLoadPersistentSample(SamplingCEMethodEnum metho _sample = parsed.getValue().getDocs(); _sampleSize = _sample.size(); _uniqueDocCount = boost::none; - _isSampleGenerated = true; + _wasSamplePersisted = true; + _sampleCreatedAt = parsed.getValue().getCreatedAt(); return Status::OK(); } +SamplingMetadata SamplingEstimatorImpl::getSamplingMetadata() const { + tassert( + 12433200, "getSamplingMetadata() called before sample was generated", _isSampleGenerated); + // Account for: vector metadata, BSONObj object overhead per slot, and per-document + // buffer allocation (BSON data + SharedBuffer::Holder ref-count header). + size_t memorySizeBytes = sizeof(std::vector) + sizeof(BSONObj) * _sample.capacity(); + for (const auto& doc : _sample) { + tassert(12433202, "Sample documents must be owned BSONObjs", doc.isOwned()); + // TODO SERVER-126975. Read this from the persisted doc. + memorySizeBytes += SharedBuffer::kHolderSize + static_cast(doc.objsize()); + } + SamplingMetadata meta; + meta.isPersisted = _wasSamplePersisted; + meta.docCount = _sample.size(); + meta.requestedDocCount = _requestedSampleSize; + meta.memorySizeBytes = memorySizeBytes; + meta.technique = *_usedSamplingTechnique; + if (*_usedSamplingTechnique == cost_based_ranker::SamplingTechnique::kChunk) { + meta.numChunks = _numChunks; + } + meta.createdAt = _sampleCreatedAt; + return meta; +} + SamplingEstimatorImpl::~SamplingEstimatorImpl() {} CardinalityEstimate SamplingEstimatorImpl::estimateNDV( diff --git a/src/mongo/db/query/compiler/ce/sampling/sampling_estimator_impl.h b/src/mongo/db/query/compiler/ce/sampling/sampling_estimator_impl.h index 2472fff7ad4..28cd052e662 100644 --- a/src/mongo/db/query/compiler/ce/sampling/sampling_estimator_impl.h +++ b/src/mongo/db/query/compiler/ce/sampling/sampling_estimator_impl.h @@ -198,6 +198,17 @@ public: return _sampleSize; } + /* + * Returns the sampling metadata for the generated sample, which includes: + * - the sampling technique + * - the requested sample size + * - the actual sample size + * - the memory size of the sample in bytes + * - the sampling source (persistent vs on-the-fly) + * - the date and time when the sample was generated + */ + SamplingMetadata getSamplingMetadata() const final; + /** * For each document in a given sample, this helper calculates the number of * index keys which satisfy 'bounds', which may be >1 in the case of multi-key @@ -390,6 +401,22 @@ private: // 'analyze' constructs its estimator with kOnTheFlySample so it always collects a fresh sample // (otherwise a refresh would just re-read the sample it's about to replace). SamplingSourceEnum _samplingSource; + + // Set to true when tryLoadPersistentSample() successfully loads a sample from the stats + // collection. Used to populate SamplingMetadata for explain output. + bool _wasSamplePersisted = false; + // The timestamp when the sample was created. For persisted samples this is read from the + // stored document; for on-the-fly samples it is set to Date_t::now() at the end of + // generateSample(). Always valid after generateSample() completes. + boost::optional _sampleCreatedAt; + // The number of documents requested when generateSample() was called. May differ from the + // actual sample size (_sampleSize) in the following cases: + // 1. The collection is smaller than the requested sample size (full collection scan used). + // 2. Chunk-based sampling: if a random cursor lands on the last document in the collection, + // no full chunk can be collected for that cursor, so the actual sample is smaller. + size_t _requestedSampleSize = 0; + // The actual sampling strategy used. Set by generateSample() before dispatch. + boost::optional _usedSamplingTechnique; }; } // namespace mongo::ce diff --git a/src/mongo/db/query/compiler/ce/sampling/sampling_estimator_test.cpp b/src/mongo/db/query/compiler/ce/sampling/sampling_estimator_test.cpp index 66653a0793b..9474fb861bb 100644 --- a/src/mongo/db/query/compiler/ce/sampling/sampling_estimator_test.cpp +++ b/src/mongo/db/query/compiler/ce/sampling/sampling_estimator_test.cpp @@ -2246,54 +2246,6 @@ TEST_F(SamplingEstimatorTest, ChunkSamplingSkipsPersistentSampleWhenFeatureFlagD } } -TEST_F(SamplingEstimatorTest, LoadPersistentSampleResetsUniqueDocCountCache) { - // _uniqueDocCount is a lazy cache of countUniqueDocuments(_sample), populated on the first - // estimateNDV call after a sample is loaded. When tryLoadPersistentSample replaces _sample it - // must also clear the cache — otherwise a stale count from a previous sample would be used. - // TODO SERVER-112627: Remove once featureFlagPersistentStats is enabled by default. - RAIIServerParameterControllerForTest persistentStatsFlag{"featureFlagPersistentStats", true}; - insertDocuments(kTestNss, {BSON("_id" << 1 << "tag" << "not_persisted")}); - const UUID uuid = [&] { - auto srcColl = acquireCollection(operationContext(), kTestNss); - return srcColl.getCollectionPtr()->uuid(); - }(); - std::vector persistedDocs{BSON("_id" << 2 << "tag" << "persisted"), - BSON("_id" << 3 << "tag" << "persisted"), - BSON("_id" << 4 << "tag" << "persisted")}; - createCollAndInsertDocuments( - operationContext(), - NamespaceStringUtil::deserialize(kTestNss.dbName(), kSamplesCollectionName), - {buildPersistentSampleDoc( - uuid, SamplingCEMethodEnum::kRandom, persistedDocs.size(), persistedDocs)}); - - auto coll = acquireCollection(operationContext(), kTestNss); - auto colls = MultipleCollectionAccessor(coll, {}, false); - SamplingEstimatorForTesting estimator(operationContext(), - colls, - kTestNss, - PlanYieldPolicy::YieldPolicy::YIELD_AUTO, - persistedDocs.size(), - SamplingCEMethodEnum::kRandom, - numChunks, - makeCardinalityEstimate(100)); - estimator.generateSample(ce::NoProjection{}); - ASSERT_FALSE(estimator.getUniqueDocCountForTesting().has_value()); - for (const auto& doc : estimator.getSample()) { - ASSERT_EQUALS(doc.getStringField("tag"), "persisted"); - } - - // Simulate the cache being populated by a prior estimateNDV call. - estimator.setUniqueDocCountForTesting(99); - ASSERT_TRUE(estimator.getUniqueDocCountForTesting().has_value()); - - // A second generateSample via the persistent path must clear the cache. - estimator.generateSample(ce::NoProjection{}); - ASSERT_FALSE(estimator.getUniqueDocCountForTesting().has_value()); - for (const auto& doc : estimator.getSample()) { - ASSERT_EQUALS(doc.getStringField("tag"), "persisted"); - } -} - TEST_F(SamplingEstimatorTest, MalformedPersistentSampleFallsBackToOnTheFly) { // A doc with the correct _id key exists in system.stats.samples but is malformed (sampleSize // field disagrees with the docs array length). tryLoadPersistentSample must log the error and @@ -2352,4 +2304,28 @@ TEST_F(SamplingEstimatorTest, MalformedPersistentSampleFallsBackToOnTheFly) { } } +DEATH_TEST_F(SamplingEstimatorTestDeathTest, + GenerateSampleAssertsOnReuse, + "SamplingEstimatorImpl must not be reused") { + auto estimator = + createSamplingEstimatorForTesting(10 /* collCard */, kSampleSize, ce::NoProjection{}); + // This is indeed the 2nd call since createSamplingEstimatorForTesting() calls generateSample() + // once already. + estimator.generateSample(ce::NoProjection{}); +} + +DEATH_TEST_F(SamplingEstimatorTestDeathTest, + GetSamplingMetadataAssertsOnNonOwnedDoc, + "Sample documents must be owned BSONObjs") { + auto estimator = + createSamplingEstimatorForTesting(10 /* collCard */, kSampleSize, ce::NoProjection{}); + + // Create a non-owned BSONObj (raw-pointer view into an existing buffer). + auto owned = BSON("a" << 1); + BSONObj unowned(owned.objdata()); + + estimator.setSampleForTesting({unowned}); + estimator.getSamplingMetadata(); +} + } // namespace mongo::ce diff --git a/src/mongo/db/query/compiler/optimizer/cost_based_ranker/cardinality_estimator_test.cpp b/src/mongo/db/query/compiler/optimizer/cost_based_ranker/cardinality_estimator_test.cpp index fc6fbb5f898..a45a62225b0 100644 --- a/src/mongo/db/query/compiler/optimizer/cost_based_ranker/cardinality_estimator_test.cpp +++ b/src/mongo/db/query/compiler/optimizer/cost_based_ranker/cardinality_estimator_test.cpp @@ -29,17 +29,16 @@ #include "mongo/bson/json.h" #include "mongo/db/matcher/expression.h" -#include "mongo/db/query/canonical_query.h" #include "mongo/db/query/compiler/ce/sampling/sampling_estimator.h" #include "mongo/db/query/compiler/ce/sampling/sampling_test_utils.h" -#include "mongo/db/query/compiler/optimizer/cost_based_ranker/cbr_rewrites.h" #include "mongo/db/query/compiler/optimizer/cost_based_ranker/cbr_test_utils.h" #include "mongo/db/query/compiler/optimizer/index_bounds_builder/index_bounds_builder.h" -#include "mongo/db/query/compiler/rewrites/matcher/expression_optimizer.h" #include "mongo/unittest/unittest.h" #include +#include + namespace mongo::cost_based_ranker { namespace { @@ -1054,6 +1053,9 @@ public: size_t getSampleSize() const override { return 100; } + ce::SamplingMetadata getSamplingMetadata() const override { + MONGO_UNREACHABLE; + } }; // Build IndexBounds with a single-point OIL on "a" and 'bIntervalCount' point intervals on "b". @@ -1471,6 +1473,9 @@ public: size_t getSampleSize() const override { MONGO_UNIMPLEMENTED; } + ce::SamplingMetadata getSamplingMetadata() const override { + MONGO_UNREACHABLE; + } private: double _ndv; diff --git a/src/mongo/db/query/compiler/optimizer/cost_based_ranker/estimates.h b/src/mongo/db/query/compiler/optimizer/cost_based_ranker/estimates.h index b8197b53e28..0a945b0d004 100644 --- a/src/mongo/db/query/compiler/optimizer/cost_based_ranker/estimates.h +++ b/src/mongo/db/query/compiler/optimizer/cost_based_ranker/estimates.h @@ -576,6 +576,29 @@ CardinalityEstimate operator*(const SelectivityEstimate& s, const CardinalityEst CardinalityEstimate operator*(const CardinalityEstimate& ce, const SelectivityEstimate& s); +/** + * The actual strategy used to generate the sample. + */ +enum class SamplingTechnique { + kRandom, + kChunk, + kFullCollScan, + kSeqScan, +}; + +/** + * Metadata about the sample used when 'ceSource == Sampling'. + */ +struct SamplingMetadata { + bool isPersisted; + size_t docCount; // number of documents in the sample + size_t requestedDocCount; // number of documents originally requested + size_t memorySizeBytes; + SamplingTechnique technique; + boost::optional numChunks; + boost::optional createdAt; +}; + /** * The optimizer's estimate of a single QSN in the physical plan. */ diff --git a/src/mongo/db/query/compiler/optimizer/cost_based_ranker/plan_ranking_utils.cpp b/src/mongo/db/query/compiler/optimizer/cost_based_ranker/plan_ranking_utils.cpp index f71879eac87..e95f66ec5a4 100644 --- a/src/mongo/db/query/compiler/optimizer/cost_based_ranker/plan_ranking_utils.cpp +++ b/src/mongo/db/query/compiler/optimizer/cost_based_ranker/plan_ranking_utils.cpp @@ -170,12 +170,8 @@ const QuerySolution* bestCBRPlan(CanonicalQuery* cq, double generateSampleTimeMS = generateSampleTimer.elapsed().count() / 1000.0; Timer planningTimer; - auto statusWithCBRSolns = - QueryPlanner::planWithCostBasedRanking(plannerParams, - samplingEstimator.get(), - nullptr, - std::move(statusWithMultiPlanSolns), - cq->getExplain().has_value()); + auto statusWithCBRSolns = QueryPlanner::planWithCostBasedRanking( + plannerParams, samplingEstimator.get(), nullptr, std::move(statusWithMultiPlanSolns), *cq); double planTimeMS = planningTimer.elapsed().count() / 1000.0; if (timeProfile.has_value()) { diff --git a/src/mongo/db/query/compiler/optimizer/join/executor.cpp b/src/mongo/db/query/compiler/optimizer/join/executor.cpp index 1373172efca..ad580b9a64d 100644 --- a/src/mongo/db/query/compiler/optimizer/join/executor.cpp +++ b/src/mongo/db/query/compiler/optimizer/join/executor.cpp @@ -322,8 +322,7 @@ StatusWith getJoinReorderedExecutor( // Select access plans for each table in the join. auto yieldPolicy = PlanYieldPolicy::YieldPolicy::YIELD_AUTO; SamplingEstimatorMap samplingEstimators = makeSamplingEstimators(mca, model.graph, yieldPolicy); - auto swAccessPlans = singleTableAccessPlans( - opCtx, mca, model.graph, samplingEstimators, expCtx->getExplain().has_value()); + auto swAccessPlans = singleTableAccessPlans(opCtx, mca, model.graph, samplingEstimators); if (!swAccessPlans.isOK()) { return swAccessPlans.getStatus(); } diff --git a/src/mongo/db/query/compiler/optimizer/join/single_table_access.cpp b/src/mongo/db/query/compiler/optimizer/join/single_table_access.cpp index 3f1147ece4f..179c7555188 100644 --- a/src/mongo/db/query/compiler/optimizer/join/single_table_access.cpp +++ b/src/mongo/db/query/compiler/optimizer/join/single_table_access.cpp @@ -82,8 +82,7 @@ StatusWith singleTableAccessPlans( OperationContext* opCtx, const MultipleCollectionAccessor& collections, const JoinGraph& graph, - const SamplingEstimatorMap& samplingEstimators, - bool isExplain) { + const SamplingEstimatorMap& samplingEstimators) { const auto numNodes = graph.numNodes(); QuerySolutionMap solns; cost_based_ranker::EstimateMap estimates; @@ -148,7 +147,7 @@ StatusWith singleTableAccessPlans( samplingEstimator.get(), nullptr /*exactCardinality*/, std::move(swSolns.getValue()), - isExplain); + *node.accessPath); // Return bad status if CBR is unable to produce a plan if (!swCbrResult.isOK()) { return swCbrResult.getStatus(); diff --git a/src/mongo/db/query/compiler/optimizer/join/single_table_access.h b/src/mongo/db/query/compiler/optimizer/join/single_table_access.h index 8ba37a00f4e..d991d87878f 100644 --- a/src/mongo/db/query/compiler/optimizer/join/single_table_access.h +++ b/src/mongo/db/query/compiler/optimizer/join/single_table_access.h @@ -56,7 +56,6 @@ StatusWith singleTableAccessPlans( OperationContext* opCtx, const MultipleCollectionAccessor& collections, const JoinGraph& model, - const SamplingEstimatorMap& samplingEstimators, - bool isExplain); + const SamplingEstimatorMap& samplingEstimators); } // namespace mongo::join_ordering diff --git a/src/mongo/db/query/compiler/optimizer/join/single_table_access_test.cpp b/src/mongo/db/query/compiler/optimizer/join/single_table_access_test.cpp index 50f8c7a6eba..6979c2c6a1c 100644 --- a/src/mongo/db/query/compiler/optimizer/join/single_table_access_test.cpp +++ b/src/mongo/db/query/compiler/optimizer/join/single_table_access_test.cpp @@ -97,7 +97,7 @@ TEST_F(SingleTableAccessTestFixture, EstimatesPopulated) { ASSERT(node2); JoinGraph graph(std::move(mgraph)); - auto swRes = singleTableAccessPlans(opCtx, mca, graph, estimators, false); + auto swRes = singleTableAccessPlans(opCtx, mca, graph, estimators); ASSERT_OK(swRes); auto& res = swRes.getValue(); diff --git a/src/mongo/db/query/compiler/optimizer/join/unit_test_helpers.h b/src/mongo/db/query/compiler/optimizer/join/unit_test_helpers.h index 3753f2b0bf3..26e3edf4e3a 100644 --- a/src/mongo/db/query/compiler/optimizer/join/unit_test_helpers.h +++ b/src/mongo/db/query/compiler/optimizer/join/unit_test_helpers.h @@ -232,6 +232,10 @@ public: MONGO_UNREACHABLE; } + ce::SamplingMetadata getSamplingMetadata() const override { + MONGO_UNREACHABLE; + } + private: CardinalityEstimate _collCard; stdx::unordered_map, CardinalityEstimate> _fakeEstimates; diff --git a/src/mongo/db/query/explain.cpp b/src/mongo/db/query/explain.cpp index e429b88c2e5..a62d9e2bc0a 100644 --- a/src/mongo/db/query/explain.cpp +++ b/src/mongo/db/query/explain.cpp @@ -172,6 +172,41 @@ void generatePlannerInfo(PlanExecutor* exec, } auto&& explainer = exec->getPlanExplainer(); + + if (const auto ceSamplingMeta = explainer.getCeSamplingMetadata(); ceSamplingMeta.has_value()) { + BSONObjBuilder ceSamplingMetaBob(plannerBob.subobjStart("ceSamplingMetadata")); + for (const auto& [ns, meta] : ceSamplingMeta.value()) { + BSONObjBuilder nsMetaBob(ceSamplingMetaBob.subobjStart(ns)); + nsMetaBob.append("sampleSource", meta.isPersisted ? "persisted" : "onTheFly"); + static constexpr auto techniqueToStr = + [](cost_based_ranker::SamplingTechnique t) -> StringData { + switch (t) { + case cost_based_ranker::SamplingTechnique::kRandom: + return "random"_sd; + case cost_based_ranker::SamplingTechnique::kChunk: + return "chunk"_sd; + case cost_based_ranker::SamplingTechnique::kFullCollScan: + return "fullCollScan"_sd; + case cost_based_ranker::SamplingTechnique::kSeqScan: + return "seqScan"_sd; + } + MONGO_UNREACHABLE; + }; + nsMetaBob.append("sampleTechnique", techniqueToStr(meta.technique)); + if (meta.technique == cost_based_ranker::SamplingTechnique::kChunk && meta.numChunks) { + nsMetaBob.appendNumber("sampleNumChunks", *meta.numChunks); + } + nsMetaBob.appendNumber("sampleRequestedDocCount", + static_cast(meta.requestedDocCount)); + nsMetaBob.appendNumber("sampleDocCount", static_cast(meta.docCount)); + nsMetaBob.appendNumber("sampleMemorySizeBytes", + static_cast(meta.memorySizeBytes)); + tassert(12433203, + "SamplingMetadata::createdAt must be set before explain is generated", + meta.createdAt.has_value()); + nsMetaBob.appendDate("sampleCreatedAt", meta.createdAt.value()); + } + } auto&& enumeratorInfo = explainer.getEnumeratorInfo(); plannerBob.append("maxIndexedOrSolutionsReached", enumeratorInfo.hitIndexedOrLimit); plannerBob.append("maxIndexedAndSolutionsReached", enumeratorInfo.hitIndexedAndLimit); diff --git a/src/mongo/db/query/plan_explainer.h b/src/mongo/db/query/plan_explainer.h index 8bd872050af..15be30e12ff 100644 --- a/src/mongo/db/query/plan_explainer.h +++ b/src/mongo/db/query/plan_explainer.h @@ -39,6 +39,7 @@ #include "mongo/db/query/stage_builder/classic_stage_builder.h" #include "mongo/util/duration.h" #include "mongo/util/modules.h" +#include "mongo/util/string_map.h" namespace mongo { @@ -59,6 +60,9 @@ struct PlanExplainerData { boost::optional multiPlannerWinningPlanScore; stage_builder::PlanStageToQsnMap planStageQsnMap; cost_based_ranker::EstimateMap estimates; + // Namespace-keyed map of sampling metadata emitted under queryPlanner.ceSamplingMetadata. + // Populated on the explain path when CBR used a sampling estimator. + StringMap ceSamplingMetadata; bool fromPlanCache = false; }; @@ -70,6 +74,12 @@ inline PlanExplainerData& operator<<(PlanExplainerData& lhs, PlanExplainerData&& for (auto& [k, v] : rhs.estimates) { lhs.estimates.insert_or_assign(k, std::move(v)); } + for (auto& [ns, meta] : rhs.ceSamplingMetadata) { + tassert(12433204, + "ceSamplingMetadata already has an entry for namespace during merge", + !lhs.ceSamplingMetadata.contains(ns)); + lhs.ceSamplingMetadata.emplace(ns, std::move(meta)); + } return lhs; } @@ -195,6 +205,16 @@ public: _solution = qs; } + /** + * Returns the per-collection sampling metadata to be emitted under + * queryPlanner.ceSamplingMetadata in explain output. Returns boost::none if no sampling + * metadata is available (e.g., CBR was not used, or this is not a classic-engine plan). + */ + virtual boost::optional> getCeSamplingMetadata() + const { + return boost::none; + } + protected: const QuerySolution* _solution{nullptr}; PlanEnumeratorExplainInfo _enumeratorExplainInfo; diff --git a/src/mongo/db/query/plan_explainer_impl.h b/src/mongo/db/query/plan_explainer_impl.h index bf0d5cff64b..8a706510783 100644 --- a/src/mongo/db/query/plan_explainer_impl.h +++ b/src/mongo/db/query/plan_explainer_impl.h @@ -77,6 +77,14 @@ public: std::vector getCachedPlanStats(const plan_cache_debug_info::DebugInfo&, ExplainOptions::Verbosity) const; + boost::optional> getCeSamplingMetadata() + const override { + if (_explainData.ceSamplingMetadata.empty()) { + return boost::none; + } + return _explainData.ceSamplingMetadata; + } + private: /** * A helper that formats the plan stats into a BSON object and collects summary stats. diff --git a/src/mongo/db/query/plan_explainer_test.cpp b/src/mongo/db/query/plan_explainer_test.cpp index 6fb9fec65ee..6c5f8fd65cb 100644 --- a/src/mongo/db/query/plan_explainer_test.cpp +++ b/src/mongo/db/query/plan_explainer_test.cpp @@ -34,6 +34,7 @@ #include "mongo/db/pipeline/pipeline_d.h" #include "mongo/db/query/canonical_query.h" #include "mongo/db/query/compiler/physical_model/query_solution/query_solution.h" +#include "mongo/db/query/explain.h" #include "mongo/db/query/explain_diagnostic_printer.h" #include "mongo/db/query/get_executor.h" #include "mongo/db/query/multiple_collection_accessor.h" @@ -661,5 +662,69 @@ TEST_F(PlanExplainerTest, PlanExplainerDataMergeFull) { ASSERT_EQ(data1.estimates.size(), 2); } +TEST_F(PlanExplainerTest, CBRSamplingMetadataSerializedInExplain) { + // Verify that when CBR uses sampling CE, the 'ceSamplingMetadata' section appears in the + // queryPlanner explain output and contains the expected fields for each collection. + RAIIServerParameterControllerForTest samplingController("internalQueryCBRCEMode", "samplingCE"); + + const auto verbosity = ExplainOptions::Verbosity::kQueryPlanner; + expCtx->setExplain(verbosity); + + auto coll = acquireCollection( + operationContext(), + CollectionAcquisitionRequest::fromOpCtx( + operationContext(), kNss, AcquisitionPrerequisites::OperationType::kRead), + MODE_IS); + MultipleCollectionAccessor colls{coll}; + + auto findCommand = std::make_unique(kNss); + findCommand->setFilter(fromjson("{a: {$gte: 0}, b: {$gte: 0}}")); + auto cq = std::make_unique(CanonicalQueryParams{ + .expCtx = expCtx, + .parsedFind = ParsedFindCommandParams{.findCommand = std::move(findCommand)}}); + + Command* cmd = CommandHelpers::findCommand(operationContext(), "find"); + { + std::lock_guard clientLock(*operationContext()->getClient()); + CurOp::get(operationContext()) + ->setGenericOpRequestDetails(clientLock, kNss, cmd, BSONObj(), NetworkOp::dbQuery); + } + + auto swExec = getExecutorFind( + operationContext(), colls, std::move(cq), PlanYieldPolicy::YieldPolicy::INTERRUPT_ONLY); + ASSERT_OK(swExec); + + BSONObjBuilder bob; + Explain::explainStages(swExec.getValue().get(), + colls, + verbosity, + Status::OK(), + boost::none, + BSONObj(), + SerializationContext::stateCommandReply(), + BSONObj(), + &bob); + const BSONObj explained = bob.obj(); + + auto queryPlanner = explained["queryPlanner"]; + ASSERT(queryPlanner.isABSONObj()) << "Missing queryPlanner in: " << explained; + + auto ceSamplingMeta = queryPlanner["ceSamplingMetadata"]; + ASSERT(ceSamplingMeta.isABSONObj()) + << "Missing ceSamplingMetadata in queryPlanner: " << queryPlanner; + + // Exactly one namespace entry expected. + ASSERT_EQ(ceSamplingMeta.Obj().nFields(), 1) << ceSamplingMeta; + const BSONElement nsElem = ceSamplingMeta.Obj().firstElement(); + ASSERT_EQ(nsElem.type(), BSONType::object); + const BSONObj nsMeta = nsElem.Obj(); + + ASSERT_EQ(nsMeta["sampleSource"].String(), "onTheFly"); + ASSERT(nsMeta.hasField("sampleTechnique")) << nsMeta; + ASSERT(nsMeta.hasField("sampleDocCount")) << nsMeta; + ASSERT(nsMeta.hasField("sampleRequestedDocCount")) << nsMeta; + ASSERT(nsMeta.hasField("sampleMemorySizeBytes")) << nsMeta; +} + } // namespace } // namespace mongo diff --git a/src/mongo/db/query/plan_ranking/cbr_plan_ranking.cpp b/src/mongo/db/query/plan_ranking/cbr_plan_ranking.cpp index e271905dea1..49ee9400552 100644 --- a/src/mongo/db/query/plan_ranking/cbr_plan_ranking.cpp +++ b/src/mongo/db/query/plan_ranking/cbr_plan_ranking.cpp @@ -36,6 +36,7 @@ #include "mongo/db/query/compiler/ce/exact/exact_cardinality_impl.h" #include "mongo/db/query/compiler/ce/sampling/sampling_estimator.h" #include "mongo/db/query/compiler/ce/sampling/sampling_estimator_impl.h" +#include "mongo/db/query/compiler/optimizer/cost_based_ranker/estimates.h" #include "mongo/db/query/planner_analysis.h" #include "mongo/db/stats/counters.h" @@ -211,7 +212,7 @@ StatusWith CBRPlanRankingStrategy::rankPlans( samplingEstimator.get(), exactCardinality.get(), std::move(statusWithMultiPlanSolns), - query.getExplain().has_value()); + query); // Calculate duration for server status metrics auto durationMicros = tickSource->ticksTo(tickSource->getTicks() - startTicks); diff --git a/src/mongo/db/query/query_planner.cpp b/src/mongo/db/query/query_planner.cpp index b328de94626..1bce368bfb9 100644 --- a/src/mongo/db/query/query_planner.cpp +++ b/src/mongo/db/query/query_planner.cpp @@ -1768,7 +1768,7 @@ StatusWith QueryPlanner::planWithCostBasedRanking( ce::SamplingEstimator* samplingEstimator, const ce::ExactCardinalityEstimator* exactCardinality, StatusWith>> statusWithMultiPlanSolns, - bool isExplain) { + const CanonicalQuery& query) { using namespace cost_based_ranker; auto cbrMode = params.planRankerMode; EstimateMap estimates; @@ -1849,7 +1849,7 @@ StatusWith QueryPlanner::planWithCostBasedRanking( PlanRankingResult{.solutions = std::move(acceptedSoln), .maybeExplainData = PlanExplainerData{.estimates = std::move(estimates)}, .needsWorksMeasuredForPlanCache = successfullyChoseWinner}; - if (isExplain) { + if (query.getExplain()) { std::vector rejectedSolnWithStages; rejectedSolnWithStages.reserve(rejectedSoln.size()); std::transform(std::make_move_iterator(rejectedSoln.begin()), @@ -1860,6 +1860,12 @@ StatusWith QueryPlanner::planWithCostBasedRanking( }); planRankingResult.maybeExplainData->rejectedPlansWithStages = std::move(rejectedSolnWithStages); + if (samplingEstimator) { + planRankingResult.maybeExplainData->ceSamplingMetadata.emplace( + NamespaceStringUtil::serialize(query.nss(), + query.getExpCtx()->getSerializationContext()), + samplingEstimator->getSamplingMetadata()); + } } return std::move(planRankingResult); } diff --git a/src/mongo/db/query/query_planner.h b/src/mongo/db/query/query_planner.h index 6db683928a2..cd9d9e974f5 100644 --- a/src/mongo/db/query/query_planner.h +++ b/src/mongo/db/query/query_planner.h @@ -129,14 +129,15 @@ public: * estimation (CE) and costing modules. The return value contains a list of plans that were * rejected on the basis of cost, as well as any non-rejected plans from which the caller can * select a winner. - * If isExplain is true, collect and return planExplainerData as part of the PlanRankingResult. + * If query.getExplain().has_value(), collect and return planExplainerData as part of the + * PlanRankingResult. */ static StatusWith planWithCostBasedRanking( const QueryPlannerParams& params, ce::SamplingEstimator* samplingEstimator, const ce::ExactCardinalityEstimator* exactCardinality, StatusWith>> statusWithMultiPlanSolns, - bool isExplain); + const CanonicalQuery& query); /** * Generates and returns a query solution, given data retrieved from the plan cache.