SERVER-107879: BACKPORT-25612: [v8.2] Best-effort ban hybrid search on timeseries collections inside of $lookup and $unionWiths (#39303)

GitOrigin-RevId: d1a6b73fdd6d8a880b2cfe4f2ccba5e1f189f156
This commit is contained in:
Finley Lau 2025-07-30 12:35:28 -05:00 committed by MongoDB Bot
parent 812035b596
commit 91e5dcf79c
8 changed files with 369 additions and 8 deletions

View File

@ -0,0 +1,221 @@
/*
* Tests that hybrid search stages ($scoreFusion and $rankFusion) are rejected when they appear
* inside of $unionWith or $lookup subpipelines that run on timeseries collections.
*
* This test can only run on unsharded collections because we cannot deterministically ban hybrid
* search on timeseries collections in the sharded collections case.
*
* TODO SERVER-108218 Ban hybrid search on sharded collections and remove the
* assumes_unsharded_collection tag.
*
* @tags: [ requires_timeseries, assumes_unsharded_collection, featureFlagSearchHybridScoringFull,
* requires_fcv_82 ]
*/
// Fixture: one timeseries collection and one regular collection, each holding a single document,
// with names derived from the test name.
const timeseriesCollName = `${jsTestName()}_timeseries`;
assert.commandWorked(db.createCollection(timeseriesCollName, {
    timeseries: {timeField: "t", metaField: "m"},
}));
const timeseriesColl = db[timeseriesCollName];
assert.commandWorked(timeseriesColl.insert({t: new Date(), m: 1, a: 42, b: 17}));

const nonTimeseriesCollName = `${jsTestName()}_nontimeseries`;
assert.commandWorked(db.createCollection(nonTimeseriesCollName));
const nonTimeseriesColl = db[nonTimeseriesCollName];
assert.commandWorked(nonTimeseriesColl.insert({a: 50, b: 20}));

// Minimal single-input hybrid search pipelines reused by every test case below.
let rankFusionPipeline = [
    {
        $rankFusion: {
            input: {pipelines: {sortPipeline: [{$sort: {a: 1}}]}},
        },
    },
];
let scoreFusionPipeline = [
    {
        $scoreFusion: {
            input: {
                pipelines: {scorePipeline: [{$score: {score: "$a"}}]},
                normalization: "none",
            },
        },
    },
];
/**
 * Issues an aggregate command with the given pipeline against 'collName' and returns the raw
 * command response, so callers can assert on success or on specific error codes.
 */
function runPipeline(pipeline, collName) {
    const aggregateCmd = {aggregate: collName, pipeline: pipeline, cursor: {}};
    return db.runCommand(aggregateCmd);
}
(function testHybridSearchRejected() {
    // A top-level hybrid search stage run directly against the timeseries collection must fail
    // with one of the two accepted error codes.
    for (const hybridSearchPipeline of [rankFusionPipeline, scoreFusionPipeline]) {
        const response = runPipeline(hybridSearchPipeline, timeseriesCollName);
        assert.commandFailedWithCode(response, [10557301, ErrorCodes.OptionNotSupportedOnView]);
    }
})();
// TODO SERVER-108117 Enable these tests.
// Checks that a $unionWith spec carrying a user-supplied $_internalIsHybridSearch field is rejected
// with code 5491300, whether the flag is true or false. The function expression is deliberately
// not invoked (there is no trailing "()") until the TODO above is resolved.
(function testUnionWithRejectsIsHybridSearchFlagFromUser() {
    let badUnionWithStageWithIsHybridSearchTrue = {
        $unionWith: {
            coll: timeseriesCollName,
            pipeline: [{$sort: {_id: 1}}],
            $_internalIsHybridSearch: true
        }
    };
    assert.commandFailedWithCode(
        runPipeline([badUnionWithStageWithIsHybridSearchTrue], timeseriesCollName), 5491300);

    // Apart from the internal flag, the spec only uses fields that $unionWith accepts ("coll" and
    // "pipeline"). "as" belongs to $lookup, and leaving it here would let the command fail for an
    // unrelated reason instead of exercising the internal-flag check.
    let badUnionWithStageWithIsHybridSearchFalse = {
        $unionWith: {
            coll: timeseriesCollName,
            pipeline: [{$sort: {_id: 1}}],
            $_internalIsHybridSearch: false
        }
    };
    assert.commandFailedWithCode(
        runPipeline([badUnionWithStageWithIsHybridSearchFalse], timeseriesCollName), 5491300);
});
// TODO SERVER-108117 Enable these tests.
// Checks that a $lookup spec carrying a user-supplied $_internalIsHybridSearch field is rejected
// with code 5491300, whether the flag is true or false. The function expression is deliberately
// not invoked (there is no trailing "()") until the TODO above is resolved.
(function testLookupRejectsIsHybridSearchFlagFromUser() {
    // Both specs are otherwise well-formed $lookup stages (including the "as" output field), so
    // the internal flag is the only thing that can cause the rejection.
    let badLookupStageWithIsHybridSearchTrue = {
        $lookup: {
            from: timeseriesCollName,
            pipeline: [{$sort: {_id: 1}}],
            as: "out",
            $_internalIsHybridSearch: true
        }
    };
    assert.commandFailedWithCode(
        runPipeline([badLookupStageWithIsHybridSearchTrue], timeseriesCollName), 5491300);

    let badLookupStageWithIsHybridSearchFalse = {
        $lookup: {
            from: timeseriesCollName,
            pipeline: [{$sort: {_id: 1}}],
            as: "out",
            $_internalIsHybridSearch: false
        }
    };
    assert.commandFailedWithCode(
        runPipeline([badLookupStageWithIsHybridSearchFalse], timeseriesCollName), 5491300);
});
// Hybrid search cannot be exercised through a collectionless $unionWith: such a $unionWith has to
// begin with $documents, whereas hybrid search stages have to be the first stages in the
// pipeline. Only the collection-targeting form is therefore covered.
(function testHybridSearchRejectedOnUnionWithPipeline() {
    for (const hybridSearchPipeline of [rankFusionPipeline, scoreFusionPipeline]) {
        const unionWithStage = {
            $unionWith: {coll: timeseriesCollName, pipeline: hybridSearchPipeline},
        };
        const response = runPipeline([unionWithStage], timeseriesCollName);
        assert.commandFailedWithCode(response, [10787900, 10787901]);
    }
})();
(function testHybridSearchOnUnionWithOnNonTimeseriesCollectionInsideTimeseriesQuery() {
    // Hybrid search is valid on a non-timeseries collection no matter which collection the outer
    // query targets, so both commands are expected to succeed.
    for (const hybridSearchPipeline of [rankFusionPipeline, scoreFusionPipeline]) {
        const unionWithStage = {
            $unionWith: {coll: nonTimeseriesCollName, pipeline: hybridSearchPipeline},
        };
        assert.commandWorked(runPipeline([unionWithStage], timeseriesCollName));
    }
})();
(function testHybridSearchOnUnionWithOnTimeseriesCollectionInsideNonTimeseriesQuery() {
    // Hybrid search is invalid on a timeseries collection no matter which collection the outer
    // query targets, so both commands are expected to fail.
    for (const hybridSearchPipeline of [rankFusionPipeline, scoreFusionPipeline]) {
        const unionWithStage = {
            $unionWith: {coll: timeseriesCollName, pipeline: hybridSearchPipeline},
        };
        const response = runPipeline([unionWithStage], nonTimeseriesCollName);
        assert.commandFailedWithCode(response, [10787900, 10787901]);
    }
})();
(function testHybridSearchOnUnionWithOnTimeseriesCollectionInsideNonTimeseriesQueryNested() {
    // The offending $unionWith (hybrid search on the timeseries collection) is wrapped in an
    // outer $unionWith on the regular collection; the command must still fail.
    for (const hybridSearchPipeline of [rankFusionPipeline, scoreFusionPipeline]) {
        const innerUnionWithStage = {
            $unionWith: {coll: timeseriesCollName, pipeline: hybridSearchPipeline},
        };
        const outerUnionWithStage = {
            $unionWith: {coll: nonTimeseriesCollName, pipeline: [innerUnionWithStage]},
        };
        const response = runPipeline([outerUnionWithStage], nonTimeseriesCollName);
        assert.commandFailedWithCode(response, [10787900, 10787901]);
    }
})();
(function testHybridSearchRejectedOnLookupPipeline() {
    // A $lookup whose subpipeline is a hybrid search on the timeseries collection must fail.
    for (const hybridSearchPipeline of [rankFusionPipeline, scoreFusionPipeline]) {
        const lookupStage = {
            $lookup: {from: timeseriesCollName, pipeline: hybridSearchPipeline, as: "out"},
        };
        const response = runPipeline([lookupStage], timeseriesCollName);
        assert.commandFailedWithCode(response, [10787900, 10787901]);
    }
})();
(function testHybridSearchOnLookupOnNonTimeseriesCollectionInsideTimeseriesQuery() {
    // The subpipeline runs against a non-timeseries collection, so both commands are expected to
    // succeed even though the outer query targets the timeseries collection.
    for (const hybridSearchPipeline of [rankFusionPipeline, scoreFusionPipeline]) {
        const lookupStage = {
            $lookup: {from: nonTimeseriesCollName, pipeline: hybridSearchPipeline, as: "out"},
        };
        assert.commandWorked(runPipeline([lookupStage], timeseriesCollName));
    }
})();
(function testHybridSearchOnLookupOnTimeseriesCollectionInsideNonTimeseriesQuery() {
    // The outer query targets the regular collection, but the $lookup subpipeline runs a hybrid
    // search on the timeseries collection, so both commands are expected to fail.
    for (const hybridSearchPipeline of [rankFusionPipeline, scoreFusionPipeline]) {
        const lookupStage = {
            $lookup: {from: timeseriesCollName, pipeline: hybridSearchPipeline, as: "out"},
        };
        const response = runPipeline([lookupStage], nonTimeseriesCollName);
        assert.commandFailedWithCode(response, [10787900, 10787901]);
    }
})();
(function testHybridSearchOnLookupOnTimeseriesCollectionInsideNonTimeseriesQueryNested() {
    // The offending $lookup (hybrid search on the timeseries collection) is wrapped in an outer
    // $lookup on the regular collection; the command must still fail.
    for (const hybridSearchPipeline of [rankFusionPipeline, scoreFusionPipeline]) {
        const innerLookupStage = {
            $lookup: {from: timeseriesCollName, pipeline: hybridSearchPipeline, as: "out"},
        };
        const outerLookupStage = {
            $lookup: {from: nonTimeseriesCollName, pipeline: [innerLookupStage], as: "out"},
        };
        const response = runPipeline([outerLookupStage], nonTimeseriesCollName);
        assert.commandFailedWithCode(response, [10787900, 10787901]);
    }
})();

View File

@ -189,11 +189,36 @@ assert.commandFailedWithCode(
normalization: "sigmoid"
},
}
}
},
]
},
normalization: "none"
}
}
}]),
// TODO SERVER-104725 Change this to the error code from LiteParsedPipeline::validate().
10170100);
// A $scoreFusion whose "nested" input pipeline starts with another $scoreFusion and is followed by
// a {$score: 10} stage must be rejected with code 10473003.
assert.commandFailedWithCode(
runPipeline([{
$scoreFusion: {
input: {
pipelines: {
nested: [
{
$scoreFusion: {
input: {
pipelines: {simple: [{$score: {score: "$score_50"}}]},
normalization: "sigmoid"
},
}
},
{$score: 10},
]
},
normalization: "none"
}
}
}]),
// TODO SERVER-104725 Change this to the error code from LiteParsedPipeline::validate().
10473003);

View File

@ -51,6 +51,8 @@
#include <fmt/ranges.h>
#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kQuery
namespace mongo::hybrid_scoring_util {
bool isScoreStage(const boost::intrusive_ptr<DocumentSource>& stage) {
@ -380,8 +382,6 @@ Status isScoredPipeline(const std::vector<BSONObj>& bsonPipeline,
}
bool isHybridSearchPipeline(const std::vector<BSONObj>& bsonPipeline) {
tassert(10473000, "Input pipeline must not be empty.", !bsonPipeline.empty());
// Please keep the following in alphabetical order.
static const std::set<StringData> hybridScoringStages{
DocumentSourceRankFusion::kStageName,
@ -398,6 +398,41 @@ bool isHybridSearchPipeline(const std::vector<BSONObj>& bsonPipeline) {
return false;
}
// Checks the internal-only hybrid search flag on a stage spec. A spec without the flag is accepted
// as-is; when the flag is present, the decision is delegated to assertAllowedInternalIfRequired()
// with AllowedWithClientType::kInternal.
void validateIsHybridSearchNotSetByUser(boost::intrusive_ptr<ExpressionContext> expCtx,
                                        const BSONObj& spec) {
    const bool flagPresent = spec.hasField(kIsHybridSearchFlagFieldName);
    if (!flagPresent) {
        return;
    }

    assertAllowedInternalIfRequired(expCtx->getOperationContext(),
                                    kIsHybridSearchFlagFieldName,
                                    AllowedWithClientType::kInternal);
}
// Best-effort check that 'nss' does not name a timeseries collection. The namespace is looked up
// first as a collection (uassert 10787900), then as a view (uassert 10787901). When the catalog
// knows neither, nothing is thrown and a log line records that the check could not be performed.
void assertForeignCollectionIsNotTimeseries(const NamespaceString& nss,
                                            const boost::intrusive_ptr<ExpressionContext>& expCtx) {
    const auto opCtx = expCtx->getOperationContext();
    const auto catalog = CollectionCatalog::get(opCtx);

    if (const auto foreignColl = catalog->lookupCollectionByNamespace(opCtx, nss)) {
        uassert(10787900,
                "$rankFusion and $scoreFusion are unsupported on timeseries collections",
                !foreignColl->isTimeseriesCollection());
        return;
    }

    if (const auto foreignView = catalog->lookupView(opCtx, nss)) {
        uassert(10787901,
                "$rankFusion and $scoreFusion are unsupported on timeseries collections",
                !foreignView->timeseries());
        return;
    }

    // Note that we try our best to ban timeseries collections on hybrid search.
    // However, in a sharded collections environment, a mongod shard might not know the
    // information about the timeseries collection (if it is owned by another shard). In
    // that case, it is non-trivial to ban the timeseries query.
    // TODO SERVER-108218 Ban hybrid search inside of subpipelines on time series collections.
    LOGV2(10787902,
          "$rankFusion and $scoreFusion are unsupported on timeseries collections, but not "
          "enough information is available to determine if a subpipeline is running on a "
          "timeseries collection.");
}
namespace score_details {
std::pair<std::string, BSONObj> constructScoreDetailsForGrouping(const std::string pipelineName) {

View File

@ -36,6 +36,8 @@
namespace mongo::hybrid_scoring_util {
/**
 * Name of the internal field attached to serialized $lookup and $unionWith specs to record that
 * the subpipeline is a hybrid search.
 */
static constexpr StringData kIsHybridSearchFlagFieldName = "$_internalIsHybridSearch"_sd;
/**
* Checks if this stage is a $score stage, where it has been desugared to $setMetadata with the meta
* type MetaType::kScore.
@ -114,6 +116,27 @@ bool pipelineContainsScoreStage(const std::vector<BSONObj>& bsonPipeline);
*/
bool isHybridSearchPipeline(const std::vector<BSONObj>& bsonPipeline);
/**
* Validates that the provided spec does not have the internal-use-only $_internalIsHybridSearch
* flag set.
*
* A spec without the field is always accepted. When the field is present, the check is delegated
* to assertAllowedInternalIfRequired() with AllowedWithClientType::kInternal, using the operation
* context obtained from 'expCtx'.
*
* TODO SERVER-108117 This is currently not called because the validation is broken when running an
* explain on a view in a sharded collection. In that scenario, the router desugars the subpipeline,
* adds $_internalIsHybridSearch to the serialized BSON, and sends it to the shards. The shards
* respond with an error that the view must be executed on the router, and then the router tries
* executing the fully-desugared pipeline. However, on this retry, the internal client flag is not
* set, and the router fails the explain due to this assertion.
*/
void validateIsHybridSearchNotSetByUser(boost::intrusive_ptr<ExpressionContext> expCtx,
const BSONObj& spec);
/**
* Validates that a given collection/view namespace is not a timeseries collection for hybrid
* search.
*
* Throws a uassert with code 10787900 if 'nss' resolves to a timeseries collection, or with code
* 10787901 if it resolves to a timeseries view. The check is best-effort: if 'nss' is found
* neither as a collection nor as a view in the collection catalog, nothing is thrown and a
* message is logged instead.
*/
void assertForeignCollectionIsNotTimeseries(const NamespaceString& nss,
const boost::intrusive_ptr<ExpressionContext>& expCtx);
namespace score_details {
/**
* Construct the scoreDetails field name and obj (ex: name_scoreDetails: {$mergeObjects:

View File

@ -56,6 +56,7 @@
#include "mongo/db/pipeline/document_path_support.h"
#include "mongo/db/pipeline/document_source.h"
#include "mongo/db/pipeline/document_source_documents.h"
#include "mongo/db/pipeline/document_source_hybrid_scoring_util.h"
#include "mongo/db/pipeline/document_source_merge_gen.h"
#include "mongo/db/pipeline/document_source_queue.h"
#include "mongo/db/pipeline/document_source_sequential_document_cache.h"
@ -1312,6 +1313,13 @@ void DocumentSourceLookUp::serializeToArray(std::vector<Value>& array,
output[getSourceName()]["let"] = Value(exprList.freeze());
output[getSourceName()]["pipeline"] = Value(serializedPipeline);
if (!opts.isSerializingForExplain() &&
hybrid_scoring_util::isHybridSearchPipeline(
_userPipeline.value_or(std::vector<BSONObj>()))) {
output[getSourceName()][hybrid_scoring_util::kIsHybridSearchFlagFieldName] =
Value(true);
}
}
if (opts.isSerializingForExplain()) {
@ -1552,8 +1560,10 @@ boost::intrusive_ptr<DocumentSource> DocumentSourceLookUp::createFromBson(
bool hasPipeline = false;
bool hasLet = false;
auto lookupSpec = DocumentSourceLookupSpec::parse(IDLParserContext(kStageName), elem.Obj());
// TODO SERVER-108117 Validate that the isHybridSearch flag is only set internally. See helper
// hybrid_scoring_util::validateIsHybridSearchNotSetByUser to handle this.
auto lookupSpec = DocumentSourceLookupSpec::parse(IDLParserContext(kStageName), elem.Obj());
if (lookupSpec.getFrom().has_value()) {
fromNs = parseLookupFromAndResolveNamespace(lookupSpec.getFrom().value().getElement(),
@ -1582,6 +1592,17 @@ boost::intrusive_ptr<DocumentSource> DocumentSourceLookUp::createFromBson(
fromNs =
NamespaceString::makeCollectionlessAggregateNSS(pExpCtx->getNamespaceString().dbName());
}
if (lookupSpec.getIsHybridSearch() || hybrid_scoring_util::isHybridSearchPipeline(pipeline)) {
// If there is a hybrid search stage in our pipeline, then we should validate that we
// are not running on a timeseries collection.
//
// If the hybrid search flag is set to true, this request may have
// come from a mongos that does not know if the collection is a valid collection for
// hybrid search. Therefore, we must validate it here.
hybrid_scoring_util::assertForeignCollectionIsNotTimeseries(fromNs, pExpCtx);
}
boost::intrusive_ptr<DocumentSourceLookUp> lookupStage = nullptr;
if (hasPipeline) {
if (localField.empty() && foreignField.empty()) {

View File

@ -67,3 +67,11 @@ structs:
description: The foreign field used to perform equality match with localField
type: string
optional: true
# When a hybrid search stage is de-sugared, then serialized into sub-pipeline BSON to be sent
# across the wire (i.e. from mongos to mongod), it's unclear from inspecting the BSON that the
# original query was a hybrid search, so this internal field preserves that information.
$_internalIsHybridSearch:
description: An optional internal field specifying if the subpipeline is a hybrid search.
type: optionalBool
stability: internal
cpp_name: isHybridSearch

View File

@ -37,11 +37,13 @@
#include "mongo/db/exec/agg/pipeline_builder.h"
#include "mongo/db/exec/document_value/document.h"
#include "mongo/db/pipeline/document_source_documents.h"
#include "mongo/db/pipeline/document_source_hybrid_scoring_util.h"
#include "mongo/db/pipeline/document_source_match.h"
#include "mongo/db/pipeline/document_source_queue.h"
#include "mongo/db/pipeline/document_source_single_document_transformation.h"
#include "mongo/db/pipeline/document_source_union_with_gen.h"
#include "mongo/db/pipeline/process_interface/mongo_process_interface.h"
#include "mongo/db/pipeline/search/search_helper.h"
#include "mongo/db/query/allowed_contexts.h"
#include "mongo/db/query/plan_summary_stats.h"
#include "mongo/db/views/resolved_view.h"
@ -260,6 +262,8 @@ boost::intrusive_ptr<DocumentSource> DocumentSourceUnionWith::createFromBson(
unionNss = NamespaceStringUtil::deserialize(expCtx->getNamespaceString().dbName(),
elem.valueStringData());
} else {
// TODO SERVER-108117 Validate that the isHybridSearch flag is only set internally. See
// helper hybrid_scoring_util::validateIsHybridSearchNotSetByUser to handle this.
auto unionWithSpec =
UnionWithSpec::parse(IDLParserContext(kStageName), elem.embeddedObject());
if (unionWithSpec.getColl()) {
@ -272,6 +276,16 @@ boost::intrusive_ptr<DocumentSource> DocumentSourceUnionWith::createFromBson(
expCtx->getNamespaceString().dbName());
}
pipeline = unionWithSpec.getPipeline().value_or(std::vector<BSONObj>{});
if (unionWithSpec.getIsHybridSearch() ||
hybrid_scoring_util::isHybridSearchPipeline(pipeline)) {
// If there is a hybrid search stage in our pipeline, then we should validate that we
// are not running on a timeseries collection.
//
// If the hybrid search flag is set to true, this request may have
// come from a mongos that does not know if the collection is a valid collection for
// hybrid search. Therefore, we must validate it here.
hybrid_scoring_util::assertForeignCollectionIsNotTimeseries(unionNss, expCtx);
}
}
return make_intrusive<DocumentSourceUnionWith>(
expCtx, std::move(unionNss), std::move(pipeline));
@ -541,10 +555,16 @@ Value DocumentSourceUnionWith::serialize(const SerializationOptions& opts) const
return _pipeline->serializeToBson(opts);
}();
auto spec = collectionless ? DOC("pipeline" << serializedPipeline)
: DOC("coll" << opts.serializeIdentifier(_userNss.coll())
<< "pipeline" << serializedPipeline);
return Value(DOC(getSourceName() << spec));
bool isHybridSearch = hybrid_scoring_util::isHybridSearchPipeline(_userPipeline);
MutableDocument spec;
if (!collectionless) {
spec["coll"] = Value(opts.serializeIdentifier(_userNss.coll()));
}
spec["pipeline"] = Value(serializedPipeline);
if (isHybridSearch) {
spec[hybrid_scoring_util::kIsHybridSearchFlagFieldName] = Value(isHybridSearch);
}
return Value(DOC(getSourceName() << spec.freezeToValue()));
}
}

View File

@ -46,3 +46,11 @@ structs:
description: An optional pipeline to apply to the collection being unioned.
optional: true
type: pipeline
# When a hybrid search stage is de-sugared, then serialized into sub-pipeline BSON to be sent
# across the wire (i.e. from mongos to mongod), it's unclear from inspecting the BSON that the
# original query was a hybrid search, so this internal field preserves that information.
$_internalIsHybridSearch:
description: An optional internal field specifying if the subpipeline is a hybrid search.
type: optionalBool
stability: internal
cpp_name: isHybridSearch