SERVER-119091 Fix lookup hash table strings with collation spilling (#47779)

GitOrigin-RevId: aea49593e08fd39df872af0e0cbb8efb71b6da0b
This commit is contained in:
Ivan Fefer 2026-02-11 12:00:43 +01:00 committed by MongoDB Bot
parent 88fd06161a
commit ff55682afe
2 changed files with 104 additions and 3 deletions

View File

@ -384,4 +384,101 @@ TEST_F(HashLookupStageTest, ForceSpillTest) {
lookupStage->close();
}
TEST_F(HashLookupStageTest, SpillLargeStringWithCollationTest) {
constexpr size_t kStringLength = 64;
constexpr size_t kStringCount = 26;
RAIIServerParameterControllerForTest maxMemoryLimit(
"internalQuerySlotBasedExecutionHashLookupApproxMemoryUseInBytesBeforeSpill",
static_cast<long long>(kStringCount * kStringLength / 2));
std::vector<std::string> innerKeys;
std::vector<std::string> outerKeys;
innerKeys.reserve(kStringCount);
outerKeys.reserve(kStringCount);
for (size_t i = 0; i < kStringCount; ++i) {
const std::array<char, 2> chars{static_cast<char>('a' + i), static_cast<char>('A' + i)};
std::string inner, outer;
inner.reserve(kStringLength);
outer.reserve(kStringLength);
for (size_t j = 0; j < kStringLength; ++j) {
inner.push_back(chars[j % 2]);
outer.push_back(chars[1 - j % 2]);
}
innerKeys.push_back(std::move(inner));
outerKeys.push_back(std::move(outer));
}
BSONArrayBuilder innerJoin;
BSONArrayBuilder outerJoin;
for (size_t i = 0; i < kStringCount; ++i) {
BSONObj obj = BSON("_id" << static_cast<long long>(i));
innerJoin.append(BSON_ARRAY(obj << innerKeys[i]));
outerJoin.append(BSON_ARRAY(obj << outerKeys[i]));
}
auto [innerScanSlots, innerScanStage] = generateVirtualScanMulti(2, innerJoin.arr());
auto [outerScanSlots, outerScanStage] = generateVirtualScanMulti(2, outerJoin.arr());
auto ctx = makeCompileCtx();
auto collatorSlot = generateSlotId();
auto collator =
std::make_unique<CollatorInterfaceMock>(CollatorInterfaceMock::MockType::kToLowerString);
value::OwnedValueAccessor collatorAccessor;
ctx->pushCorrelated(collatorSlot, &collatorAccessor);
collatorAccessor.reset(value::TypeTags::collator,
value::bitcastFrom<CollatorInterface*>(collator.release()));
value::SlotId lookupStageOutputSlot = generateSlotId();
SlotExprPair agg = std::make_pair(
lookupStageOutputSlot, makeFunction("addToArray", makeE<EVariable>(innerScanSlots[0])));
auto lookupStage = makeS<HashLookupStage>(std::move(outerScanStage),
std::move(innerScanStage),
outerScanSlots[1],
innerScanSlots[1],
innerScanSlots[0],
std::move(agg),
collatorSlot,
kEmptyPlanNodeId);
value::SlotVector lookupSlots;
lookupSlots.reserve(2);
lookupSlots.push_back(outerScanSlots[0]);
lookupSlots.push_back(lookupStageOutputSlot);
auto resultAccessors = prepareTree(ctx.get(), lookupStage.get(), lookupSlots);
std::vector<std::vector<std::pair<value::TypeTags, value::Value>>> actualResultView;
std::vector<std::pair<value::TypeTags, value::Value>> ownedValues;
while (lookupStage->getNext() == PlanState::ADVANCED) {
std::vector<std::pair<value::TypeTags, value::Value>> results{};
results.reserve(resultAccessors.size());
for (size_t i = 0; i < resultAccessors.size(); ++i) {
ownedValues.emplace_back(resultAccessors[i]->getCopyOfValue());
results.emplace_back(ownedValues.back());
}
actualResultView.emplace_back(std::move(results));
}
ValueVectorGuard resultsGuard{ownedValues};
lookupStage->close();
ASSERT_EQ(actualResultView.size(), kStringCount);
for (size_t i = 0; i < kStringCount; ++i) {
const auto& result = actualResultView[i];
ASSERT_EQ(result.size(), 2);
const BSONObj input = BSON("_id" << static_cast<long long>(i));
assertValuesEqual(result[0].first,
result[0].second,
value::TypeTags::bsonObject,
value::bitcastFrom<const char*>(input.objdata()));
const BSONArray output = BSON_ARRAY(input);
assertValuesEqual(result[1].first,
result[1].second,
value::TypeTags::bsonArray,
value::bitcastFrom<const char*>(output.objdata()));
}
}
} // namespace mongo::sbe

View File

@ -56,8 +56,10 @@ void LookupHashTableIter::initSearchArray() {
hashTableMatchIter->second.end());
} else if (_hashTable._recordStoreHt) {
// The key wasn't in memory. Check the '_hashTable._recordStoreHt' disk spill.
auto [_, tagElemCollView, valElemCollView] =
auto [owned, tagElemCollView, valElemCollView] =
_hashTable.normalizeStringIfCollator(tagElemView, valElemView);
value::ValueGuard elemGuard{owned, tagElemCollView, valElemCollView};
boost::optional<std::vector<size_t>> indicesFromRS =
_hashTable.readIndicesFromRecordStore(
_hashTable._recordStoreHt.get(), tagElemCollView, valElemCollView);
@ -82,8 +84,10 @@ void LookupHashTableIter::initSearchScalar() {
_hashTableMatchVectorIdx = 0;
} else if (_hashTable._recordStoreHt) {
// The key wasn't in memory. Check the '_hashTable._recordStoreHt' disk spill.
auto [_, tagKeyCollView, valKeyCollView] =
auto [owned, tagKeyCollView, valKeyCollView] =
_hashTable.normalizeStringIfCollator(_outerKeyTag, _outerKeyVal);
value::ValueGuard keyGuard{owned, tagKeyCollView, valKeyCollView};
boost::optional<std::vector<size_t>> indicesFromRS = _hashTable.readIndicesFromRecordStore(
_hashTable._recordStoreHt.get(), tagKeyCollView, valKeyCollView);
if (indicesFromRS) {
@ -330,7 +334,7 @@ void LookupHashTable::spillIndicesToRecordStore(SpillingStore* rs,
}
auto [owned, tagKeyColl, valKeyColl] = normalizeStringIfCollator(tagKey, valKey);
_htProbeKey.reset(0, owned, tagKeyColl, valKeyColl);
value::ValueGuard keyGuard{owned, tagKeyColl, valKeyColl};
auto valFromRs = readIndicesFromRecordStore(rs, tagKeyColl, valKeyColl);