SERVER-127045 Balancer must observe jumbo flag updates (#53995)

GitOrigin-RevId: 31a0acf24a5c7ec249aa055679c8c8b4c4daedaa
This commit is contained in:
Silvia Surroca 2026-05-19 17:18:04 +02:00 committed by MongoDB Bot
parent 85a3200abb
commit e66aa75621
3 changed files with 220 additions and 35 deletions

View File

@ -0,0 +1,113 @@
/**
* Verifies that the balancer's routing cache observes the cleared jumbo flag immediately after
* clearJumboFlag, without requiring a placement-version bump.
*
* @tags: [
* requires_sharding,
* requires_fcv_90,
* ]
*/
// Direct writes to config.chunks (used to stamp the jumbo flag) bypass shard filtering metadata
// consistency checks.
TestData.skipCheckShardFilteringMetadata = true;

import {ShardingTest} from "jstests/libs/shardingtest.js";
import {findChunksUtil} from "jstests/sharding/libs/find_chunks_util.js";

// Two-shard cluster with the balancer initially disabled so the setup below can finish before
// any balancing round runs.
const st = new ShardingTest({
    shards: 2,
    other: {
        chunkSize: 1,
        enableBalancer: false,
    },
});

const dbName = jsTestName();
const collName = "coll";
const ns = dbName + "." + collName;
const adminDB = st.s.getDB("admin");
const configDB = st.s.getDB("config");

// Log messages from the balancer are emitted on the config server primary.
const csrsPrimary = st.configRS.getPrimary();

// --------------------------------------------------------------------------
// Setup:
// There are two shards. Zone "Z" is assigned only to shard1 and covers the range [0, MaxKey).
// The chunk [0, MaxKey) is assigned to zone "Z", but it initially resides on shard0, creating a
// zone violation.
// --------------------------------------------------------------------------
assert.commandWorked(adminDB.runCommand({enableSharding: dbName, primaryShard: st.shard0.shardName}));
assert.commandWorked(adminDB.runCommand({addShardToZone: st.shard1.shardName, zone: "Z"}));
assert.commandWorked(adminDB.runCommand({shardCollection: ns, key: {x: 1}}));
assert.commandWorked(adminDB.runCommand({split: ns, middle: {x: 0}}));
assert.commandWorked(adminDB.runCommand({updateZoneKeyRange: ns, min: {x: 0}, max: {x: MaxKey}, zone: "Z"}));

// Move [0, MaxKey) to shard0 if it was placed elsewhere during sharding.
const chunkBeforeSetup = findChunksUtil.findOneChunkByNs(configDB, ns, {min: {x: 0}});
assert(chunkBeforeSetup, "expected chunk with min {x:0}");
if (chunkBeforeSetup.shard !== st.shard0.shardName) {
    assert.commandWorked(
        adminDB.runCommand({moveRange: ns, min: {x: 0}, max: {x: MaxKey}, toShard: st.shard0.shardName}),
    );
}

// Stamp the chunk as jumbo directly on config.chunks — the balancer will see it as unmovable
// until clearJumboFlag clears it.
assert.commandWorked(configDB.chunks.updateOne({uuid: chunkBeforeSetup.uuid, min: {x: 0}}, {$set: {jumbo: true}}));
const jumboChunkBefore = findChunksUtil.findOneChunkByNs(configDB, ns, {min: {x: 0}});
assert(jumboChunkBefore, "expected chunk with min {x:0} after stamping the jumbo flag");
assert(jumboChunkBefore.jumbo, "jumbo flag not set on config.chunks");

// Remove the balancer's migration throttling delay on every config server.
st.forEachConfigServer((conn) =>
    assert.commandWorked(conn.adminCommand({setParameter: 1, balancerMigrationsThrottlingMs: 0})),
);

// --------------------------------------------------------------------------
// Round 1: the balancer must skip the chunk because it is jumbo and emit
// log 21891 ("Chunk violates zone, but it is jumbo and cannot be moved").
// --------------------------------------------------------------------------
st.startBalancer();
st.awaitBalancerRound();

const chunkAfterRound1 = findChunksUtil.findOneChunkByNs(configDB, ns, {min: {x: 0}});
assert(chunkAfterRound1, "expected chunk with min {x:0} after round 1");
assert.eq(st.shard0.shardName, chunkAfterRound1.shard, "jumbo chunk should not have moved during round 1");

// Confirm the balancer logged the jumbo-skip warning.
checkLog.containsJson(csrsPrimary, 21891, {namespace: ns});
jsTestLog("Round 1 complete: balancer correctly skipped the jumbo chunk (log 21891 confirmed).");

// --------------------------------------------------------------------------
// clearJumboFlag: mutates the in-memory ChunkInfo and clears the flag on disk.
// The placement version must NOT be bumped.
// --------------------------------------------------------------------------
assert.commandWorked(adminDB.runCommand({clearJumboFlag: ns, find: {x: 0}}));

const chunkAfterClear = findChunksUtil.findOneChunkByNs(configDB, ns, {min: {x: 0}});
assert(chunkAfterClear, "expected chunk with min {x:0} after clearJumboFlag");
assert(!chunkAfterClear.jumbo, "jumbo flag should be cleared on disk after clearJumboFlag");

// 'lastmod' is a Timestamp; getTime() alone covers only one of its two components, so the
// increment is compared as well. Otherwise a bump confined to the increment would go unnoticed.
assert.eq(
    jumboChunkBefore.lastmod.getTime(),
    chunkAfterClear.lastmod.getTime(),
    "clearJumboFlag must not bump the placement version",
);
assert.eq(
    jumboChunkBefore.lastmod.getInc(),
    chunkAfterClear.lastmod.getInc(),
    "clearJumboFlag must not bump the placement version (increment component)",
);
jsTestLog("clearJumboFlag done: disk cleared, version unchanged.");

// --------------------------------------------------------------------------
// Round 2: the balancer reads the cleared flag from the in-memory routing
// cache and migrates the chunk to shard1 in this round.
// --------------------------------------------------------------------------
assert.soon(
    () => {
        const chunk = findChunksUtil.findOneChunkByNs(configDB, ns, {min: {x: 0}});
        return chunk && chunk.shard === st.shard1.shardName;
    },
    "balancer did not migrate the chunk to shard1 after clearJumboFlag; " +
        "the in-memory routing-cache update may not be working",
);

st.stopBalancer();
jsTestLog("Round 2 complete: balancer migrated the chunk to shard1 immediately after clearJumboFlag.");

st.stop();

View File

@ -1821,8 +1821,6 @@ void ShardingCatalogManager::clearJumboFlag(OperationContext* opCtx,
// under the exclusive _kChunkOpLock happen on the same term.
opCtx->setAlwaysInterruptAtStepDownOrUp_UNSAFE();
auto cm = uassertStatusOK(
RoutingInformationCache::get(opCtx)->getCollectionPlacementInfoWithRefresh(opCtx, nss));
// Take _kChunkOpLock in exclusive mode to serialise with concurrent chunk modifications.
Lock::ExclusiveLock lk(opCtx, _kChunkOpLock);
@ -1879,24 +1877,7 @@ void ShardingCatalogManager::clearJumboFlag(OperationContext* opCtx,
return;
}
// Best-effort update of the in-memory routing cache so the balancer's next iteration observes
// the cleared flag without waiting for a refresh. This mirrors the asymmetric pattern used by
// splitOrMarkJumbo when it sets the flag: the persisted write below intentionally does not
// bump the chunk version, so an incremental routing-cache refresh would not pick up the
// change. The balancer is the only consumer of the jumbo flag, so updating the configsvr's
// own routing entry in place is sufficient. Stale jumbo:true entries on mongos and shard
// routing caches are benign because no router or shard reads the field for routing or
// filtering decisions. If the cache cannot be obtained or doesn't contain the chunk, the
// persisted write below is still correct; the balancer will observe the change on its next
// refresh.
if (cm.isSharded()) {
auto inMemoryChunk = cm.findIntersectingChunkWithSimpleCollation(chunk.getMin());
if (inMemoryChunk.getMin().woCompare(chunk.getMin()) == 0 &&
inMemoryChunk.getMax().woCompare(chunk.getMax()) == 0) {
inMemoryChunk.setJumbo(false);
}
}
// Persist the cleared flag.
BSONObj chunkQuery(BSON(ChunkType::min(chunk.getMin())
<< ChunkType::max(chunk.getMax()) << ChunkType::collectionUUID
<< coll.getUuid()));
@ -1913,6 +1894,32 @@ void ShardingCatalogManager::clearJumboFlag(OperationContext* opCtx,
str::stream() << "failed to clear jumbo flag due to " << chunkQuery
<< " not matching any existing chunks",
didUpdate);
// Patch the in-memory routing cache so the balancer's next iteration observes the cleared
// flag without having to wait for an unrelated placement-version bump. The persisted write
// above intentionally does not bump the chunk version, so an incremental routing-cache
// refresh would otherwise reuse the existing RoutingTableHistory (and the existing
// ChunkInfo) and keep observing jumbo=true. The balancer is the only consumer of this
// flag, so updating the configsvr's own routing entry in place is sufficient. Stale
// jumbo:true entries on mongos and shard routing caches are benign because no router or
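// shard reads the field for routing or filtering decisions. If the cached routing table
// doesn't contain the chunk, the persisted write above is still correct; the balancer will
// observe the change on its next refresh.
// NOTE(review): the refresh below is wrapped in uassertStatusOK, so a failure to obtain the
// cache looks like it would surface as an error to the caller even though the write above
// has already been persisted — confirm whether this step should be best-effort instead.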
//
// Held under _kChunkOpLock so that no concurrent chunk modification can bump the placement
// version between the persist above and this patch — otherwise the routing cache would be
// rebuilt and `inMemoryChunk` would refer to a now-orphaned ChunkInfo, losing the in-memory
// update.
auto cm = uassertStatusOK(
RoutingInformationCache::get(opCtx)->getCollectionPlacementInfoWithRefresh(opCtx, nss));
if (cm.isSharded()) {
auto inMemoryChunk = cm.findIntersectingChunkWithSimpleCollation(chunk.getMin());
if (inMemoryChunk.getMin().woCompare(chunk.getMin()) == 0 &&
inMemoryChunk.getMax().woCompare(chunk.getMax()) == 0) {
inMemoryChunk.setJumbo(false);
}
}
}
void ShardingCatalogManager::ensureChunkVersionIsGreaterThan(OperationContext* opCtx,
@ -2158,11 +2165,7 @@ void ShardingCatalogManager::splitOrMarkJumbo(OperationContext* opCtx,
if (splitPoints.empty()) {
LOGV2(21873, "Marking chunk as jumbo", "chunk"_attr = redact(chunk.toString()));
// Take _kChunkOpLock in exclusive mode to prevent concurrent chunk modifications. Note
// that the operation below doesn't increment the chunk marked as jumbo's version, which
// means that a subsequent incremental refresh will not see it. However, it is being
// marked in memory through the call to 'markAsJumbo' above so subsequent balancer
// iterations will not consider it for migration.
// Serialize with concurrent chunk modifications.
Lock::ExclusiveLock lk(opCtx, _kChunkOpLock);
const auto findCollResponse = uassertStatusOK(_localConfigShard->exhaustiveFindOnConfig(
@ -2179,16 +2182,7 @@ void ShardingCatalogManager::splitOrMarkJumbo(OperationContext* opCtx,
!findCollResponse.docs.empty());
const CollectionType coll(findCollResponse.docs[0]);
// Best-effort update of the in-memory routing cache so the balancer's next iteration
// observes the flag update without waiting for a refresh.
// The persisted write below intentionally does not bump the chunk version, so an
// incremental routing-cache refresh would not pick up the change. The balancer is the
// only consumer of the jumbo flag, so updating the configsvr's own routing entry in
// place is sufficient.
// If the cache cannot be obtained or doesn't contain the chunk, the persisted write
// below is still correct; the balancer will observe the change on its next refresh.
chunk.setJumbo(true);
// Persist the jumbo flag.
const auto chunkQuery = BSON(ChunkType::collectionUUID()
<< coll.getUuid() << ChunkType::min(chunk.getMin()));
@ -2205,6 +2199,30 @@ void ShardingCatalogManager::splitOrMarkJumbo(OperationContext* opCtx,
"namespace"_attr = redact(toStringForLogging(nss)),
"minKey"_attr = redact(chunk.getMin()),
"error"_attr = redact(status.getStatus()));
return;
}
// Patch the in-memory routing cache so the balancer's next iteration observes the
// flag update without having to wait for an unrelated placement-version bump. The
// persisted write above intentionally does not bump the chunk version, so an
// incremental routing-cache refresh would otherwise reuse the existing
// RoutingTableHistory (and the existing ChunkInfo) and keep observing jumbo=false.
// The balancer is the only consumer of this flag, so updating the configsvr's own
// routing entry in place is sufficient.
//
// Re-fetch the chunk manager and the chunk under _kChunkOpLock: the `cm` / `chunk`
// captured at the top of the function were obtained without the lock, so a concurrent
// chunk modification may have already bumped the placement version and replaced the
// RoutingTableHistory. Looking the chunk up again here guarantees that we patch the
// ChunkInfo currently installed in the routing cache, and the lock prevents any
// further version bump from racing with us between the persist above and this patch.
auto refreshedCm = uassertStatusOK(
RoutingInformationCache::get(opCtx)->getCollectionPlacementInfoWithRefresh(opCtx,
nss));
if (refreshedCm.isSharded()) {
auto inMemoryChunk =
refreshedCm.findIntersectingChunkWithSimpleCollation(chunk.getMin());
inMemoryChunk.setJumbo(true);
}
return;

View File

@ -34,11 +34,13 @@
#include "mongo/bson/bsontypes.h"
#include "mongo/bson/oid.h"
#include "mongo/bson/timestamp.h"
#include "mongo/db/global_catalog/chunk_manager.h"
#include "mongo/db/global_catalog/ddl/sharding_catalog_manager.h"
#include "mongo/db/global_catalog/type_chunk.h"
#include "mongo/db/global_catalog/type_shard.h"
#include "mongo/db/keypattern.h"
#include "mongo/db/namespace_string.h"
#include "mongo/db/router_role/routing_cache/routing_information_cache.h"
#include "mongo/db/sharding_environment/config_server_test_fixture.h"
#include "mongo/db/versioning_protocol/chunk_version.h"
#include "mongo/unittest/unittest.h"
@ -169,5 +171,57 @@ TEST_F(ClearJumboFlagTest, AssertsIfChunkCantBeFound) {
test(_nss2, Timestamp(42));
}
// Shows that a reader going through the configsvr's RoutingInformationCache (the path the
// balancer uses) sees the jumbo flag as cleared after clearJumboFlag, even though clearJumboFlag
// leaves the chunk's placement version untouched. The observation therefore has to come from the
// in-memory ChunkInfo mutation done by clearJumboFlag: an incremental cache refresh would not pick
// the change up on its own because the loader filters chunks by {lastmod: {$gte: sinceVersion}}.
TEST_F(ClearJumboFlagTest, BalancerObservesClearedJumboFlagViaRoutingCache) {
    const auto collTimestamp = Timestamp(42);
    const auto collUuid = UUID::gen();
    const auto collEpoch = OID::gen();
    makeCollection(_nss2, collUuid, collEpoch, collTimestamp);

    // Returns the jumbo flag of the cached chunk whose min key equals jumboChunk().getMin(),
    // read via RoutingInformationCache::getCollectionPlacementInfoWithRefresh, i.e. the same
    // entry point the balancer goes through.
    const auto readJumboFromRoutingCache = [&]() {
        auto* const opCtx = operationContext();
        auto placementInfo = uassertStatusOK(
            RoutingInformationCache::get(opCtx)->getCollectionPlacementInfoWithRefresh(opCtx,
                                                                                       _nss2));
        ASSERT_TRUE(placementInfo.isSharded());

        bool chunkSeen = false;
        bool jumboFlag = false;
        placementInfo.forEachChunk([&](const auto& candidate) {
            if (candidate.getMin().woCompare(jumboChunk().getMin()) != 0) {
                // Not the chunk we are after: keep iterating.
                return true;
            }
            chunkSeen = true;
            jumboFlag = candidate.isJumbo();
            // Target located: stop the iteration.
            return false;
        });

        ASSERT_TRUE(chunkSeen);
        return jumboFlag;
    };

    // Warm up the routing cache. The chunk is jumbo on disk, so the cached ChunkInfo has to
    // report it as jumbo too.
    ASSERT_TRUE(readJumboFromRoutingCache());

    ShardingCatalogManager::get(operationContext())
        ->clearJumboFlag(operationContext(), _nss2, collEpoch, jumboChunk());

    // On disk the flag is gone while the placement version is exactly what it was before, so a
    // version-driven incremental refresh cannot be what makes the cache observe the change.
    auto persistedChunk = uassertStatusOK(
        getChunkDoc(operationContext(), collUuid, jumboChunk().getMin(), collEpoch, collTimestamp));
    ASSERT_FALSE(persistedChunk.getJumbo());
    ASSERT_EQ(ChunkVersion({collEpoch, collTimestamp}, {12, 7}), persistedChunk.getVersion());

    // The next read through the routing cache (what the balancer would do) sees the flag cleared.
    ASSERT_FALSE(readJumboFromRoutingCache());
}
} // namespace
} // namespace mongo