mongo/jstests/replsets/oplog_fetch_lag_metric.js
Denis Trailin c188373261 SERVER-119766: fix race in OplogFetcher metric (#48163)
GitOrigin-RevId: 81ba037c77427f79ba9aae33312466ba1c791454
2026-02-20 18:57:02 +00:00

81 lines
2.9 KiB
JavaScript

/**
* Tests the oplogFetcherLagSeconds metric, which tracks how far behind the secondary's oplog
* fetcher is from the primary. Verifies that the metric is ~0 when caught up, shows positive
* lag when the secondary falls behind, and returns to ~0 after catching up.
*
*/
import {configureFailPoint} from "jstests/libs/fail_point_util.js";
import {ReplSetTest} from "jstests/libs/replsettest.js";
// Bring up a two-node replica set. Node 1 is given priority 0; it is the node the rest of this
// test restarts and inspects as the secondary.
const nodeOptions = [{}, {rsConfig: {priority: 0}}];
const rst = new ReplSetTest({nodes: nodeOptions});
rst.startSet();
rst.initiate();
rst.awaitSecondaryNodes(60000);

// Only after the set has been initiated do we restart the secondary with a fetcher batch size
// of 1, so that it fetches one oplog entry at a time. Applying the setting from the very start
// would slow down initial sync.
const singleEntryBatchOptions = {setParameter: {bgSyncOplogFetcherBatchSize: 1}};
const secondary = rst.restart(1, singleEntryBatchOptions);
rst.awaitSecondaryNodes(60000);

// Handles used by the rest of the test: writes go through the primary's "test" database, and
// serverStatus is read from the secondary's "admin" database.
const primary = rst.getPrimary();
const testDB = primary.getDB("test");
const secondaryAdminDB = secondary.getDB("admin");
/**
 * Runs serverStatus against the secondary's admin database and extracts the oplog fetcher lag
 * metric from the reply.
 *
 * @returns the value found at metrics.repl.network.oplogFetcherLagSeconds (undefined if the
 *     field is absent from the network section).
 */
function getoplogFetcherLagSeconds() {
    const {metrics} = secondaryAdminDB.serverStatus();
    const {oplogFetcherLagSeconds} = metrics.repl.network;
    return oplogFetcherLagSeconds;
}
// Phase 1: baseline. Seed the collection and let replication settle before sampling the metric.
assert.commandWorked(testDB.coll.insert({initial: true}));
rst.awaitReplication(60000);

// The metric has to be present in the network section of the secondary's serverStatus output.
const networkMetrics = secondaryAdminDB.serverStatus().metrics.repl.network;
assert(
    networkMetrics.hasOwnProperty("oplogFetcherLagSeconds"),
    "oplogFetcherLagSeconds metric should exist in serverStatus",
);

// While caught up, the reported lag must be non-negative and no larger than 1 second.
const caughtUpLag = getoplogFetcherLagSeconds();
assert.gte(caughtUpLag, 0, "oplogFetcherLagSeconds should be non-negative");
assert.lte(caughtUpLag, 1, "oplogFetcherLagSeconds should be ~0 when caught up");

// Phase 2: induce fetch lag. Stop the secondary's fetcher, write on the primary while it is
// stopped, then let it resume.
const stopFetcher = configureFailPoint(secondary, "stopReplProducer");
stopFetcher.wait({maxTimeMS: 60000});

const bulk = testDB.coll.initializeUnorderedBulkOp();
for (const docIndex of Array(20).keys()) {
    bulk.insert({lag_test: docIndex});
}
assert.commandWorked(bulk.execute({w: 1}));

// Let 3 seconds pass so the timestamps are clearly separated, then write once more so that the
// primary's lastApplied lands in a new second.
sleep(3000);
assert.commandWorked(testDB.coll.insert({final_write: true}, {writeConcern: {w: 1}}));

// Arm the hang point before releasing the fetcher, so the fetcher stops at it after fetching a
// batch and the metric can be sampled while it is held there.
const hangAfterMetric = configureFailPoint(secondary, "hangOplogFetcherBeforeAdvancingLastFetched");
stopFetcher.off();
hangAfterMetric.wait({maxTimeMS: 60000});

// Sample the metric while the fetcher is hung, and release the hang before asserting so the
// fetcher is not left blocked if the assertion fails.
const lagWithHang = getoplogFetcherLagSeconds();
hangAfterMetric.off();
const lagTooSmallMsg = "Expected to observe fetch lag of at least 2 seconds, got: " + lagWithHang;
assert.gte(lagWithHang, 2, lagTooSmallMsg);

// Phase 3: recovery. Once the secondary has replicated everything, the lag should be ~0 again.
rst.awaitReplication(60000);
const lagAfterCatchUp = getoplogFetcherLagSeconds();
assert.lte(lagAfterCatchUp, 1, "oplogFetcherLagSeconds should return to ~0 after catchup");

rst.stopSet();