SERVER-106302, SERVER-112636: Provide visibility into the sizes of serverStatus sections if the aggregate FTDC object is too large (#43954)
Co-authored-by: Didier Nadeau <didier.nadeau@mongodb.com> GitOrigin-RevId: 6463d8b8f906909e741dcae0b67528d16c7f7882
This commit is contained in:
parent
74692da3cc
commit
8dad976ba2
@ -1,4 +1,11 @@
|
||||
import {ShardingTest} from "jstests/libs/shardingtest.js";
|
||||
|
||||
/**
 * Asserts that the log of the node behind 'conn' does not contain the text "FTDC Entry".
 *
 * @param conn connection to the node whose log is inspected; its 'host' property is used in the
 *     failure message.
 */
function checkNoFTDCEntryLogs(conn) {
    // Look the text up first so the assertion below reads as a plain expectation.
    const sawFTDCEntry = checkLog.checkContainsOnce(conn, "FTDC Entry");
    assert.eq(false, sawFTDCEntry, "Found FTDC Entry log line on " + conn.host);
}
|
||||
|
||||
function rotate(conn, path, rotateCount) {
|
||||
sleep(2000);
|
||||
for (let i = 1; i <= rotateCount; ++i) {
|
||||
@ -20,12 +27,14 @@ function rotate(conn, path, rotateCount) {
|
||||
MongoRunner.runMongod({setParameter: {diagnosticDataCollectionDirectoryPath: singlePath}});
|
||||
rotate(singleStandalone, singlePath, 1);
|
||||
|
||||
checkNoFTDCEntryLogs(singleStandalone);
|
||||
MongoRunner.stopMongod(singleStandalone);
|
||||
|
||||
const multiStandalone =
|
||||
MongoRunner.runMongod({setParameter: {diagnosticDataCollectionDirectoryPath: multiPath}});
|
||||
rotate(multiStandalone, multiPath, 25);
|
||||
|
||||
checkNoFTDCEntryLogs(multiStandalone);
|
||||
MongoRunner.stopMongod(multiStandalone);
|
||||
}
|
||||
|
||||
@ -40,5 +49,10 @@ function rotate(conn, path, rotateCount) {
|
||||
|
||||
rotate(st.s, path, 5);
|
||||
|
||||
// Check logs for all mongod processes in each shard and the mongos before shutdown
|
||||
st.rs0.nodes.forEach((node) => checkNoFTDCEntryLogs(node));
|
||||
st.rs1.nodes.forEach((node) => checkNoFTDCEntryLogs(node));
|
||||
checkNoFTDCEntryLogs(st.s);
|
||||
|
||||
st.stop();
|
||||
}
|
||||
|
||||
@ -99,7 +99,9 @@ public:
|
||||
|
||||
void collect() {
|
||||
auto client = getService()->makeClient("collectionClient");
|
||||
auto result = _collectorCollection->collect(client.get(), UseMultiServiceSchema{false});
|
||||
std::vector<std::pair<std::string, int>> sectionSizes;
|
||||
auto result =
|
||||
_collectorCollection->collect(client.get(), UseMultiServiceSchema{false}, sectionSizes);
|
||||
LOGV2(11113101, "Collected FTDC sample", "obj"_attr = std::get<0>(result));
|
||||
}
|
||||
|
||||
|
||||
@ -64,7 +64,6 @@
|
||||
#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kFTDC
|
||||
|
||||
namespace mongo {
|
||||
|
||||
namespace {
|
||||
|
||||
static constexpr auto roles = std::to_array<std::pair<ClusterRole::Value, StringData>>({
|
||||
@ -111,7 +110,9 @@ bool FTDCCollectorCollection::empty() {
|
||||
}
|
||||
|
||||
std::tuple<BSONObj, Date_t> FTDCCollectorCollection::collect(
|
||||
Client* client, UseMultiServiceSchema multiServiceSchema) {
|
||||
Client* client,
|
||||
UseMultiServiceSchema multiServiceSchema,
|
||||
std::vector<std::pair<std::string, int>>& sectionSizes) {
|
||||
BSONObjBuilder builder;
|
||||
// If there are no collectors, just return an empty BSONObj so that the caller knows we did
|
||||
// not collect anything
|
||||
@ -152,7 +153,7 @@ std::tuple<BSONObj, Date_t> FTDCCollectorCollection::collect(
|
||||
scopedRouterService.emplace(opCtx.get());
|
||||
}
|
||||
|
||||
_collect(opCtx.get(), role.first, parent);
|
||||
_collect(opCtx.get(), role.first, parent, sectionSizes);
|
||||
|
||||
if (multiServiceSchema) {
|
||||
maybeSubBuilder->appendDate(kFTDCCollectEndField, getCurrentDate(opCtx.get()));
|
||||
@ -337,9 +338,11 @@ void AsyncFTDCCollectorCollection::add(std::unique_ptr<FTDCCollectorInterface> c
|
||||
getSet(role).addCollector(std::move(collector), role);
|
||||
}
|
||||
|
||||
void AsyncFTDCCollectorCollection::_collect(OperationContext* opCtx,
|
||||
ClusterRole role,
|
||||
BSONObjBuilder* builder) {
|
||||
void AsyncFTDCCollectorCollection::_collect(
|
||||
OperationContext* opCtx,
|
||||
ClusterRole role,
|
||||
BSONObjBuilder* builder,
|
||||
std::vector<std::pair<std::string, int>>& sectionSizes) {
|
||||
getSet(role).collect(opCtx, builder);
|
||||
}
|
||||
|
||||
@ -359,7 +362,8 @@ void SyncFTDCCollectorCollection::add(std::unique_ptr<FTDCCollectorInterface> co
|
||||
|
||||
void SyncFTDCCollectorCollection::_collect(OperationContext* opCtx,
|
||||
ClusterRole role,
|
||||
BSONObjBuilder* builder) {
|
||||
BSONObjBuilder* builder,
|
||||
std::vector<std::pair<std::string, int>>& sectionSizes) {
|
||||
auto& collectorVector = _collectors[role];
|
||||
for (auto& collector : collectorVector) {
|
||||
// Skip collection if this collector has no data to return
|
||||
@ -369,17 +373,19 @@ void SyncFTDCCollectorCollection::_collect(OperationContext* opCtx,
|
||||
|
||||
try {
|
||||
BSONObjBuilder subObjBuilder(builder->subobjStart(collector->name()));
|
||||
|
||||
// Add a Date_t before and after each BSON is collected so that we can track timing of
|
||||
// the collector.
|
||||
subObjBuilder.appendDate(kFTDCCollectStartField, getCurrentDate(opCtx));
|
||||
collector->collect(opCtx, subObjBuilder);
|
||||
subObjBuilder.appendDate(kFTDCCollectEndField, getCurrentDate(opCtx));
|
||||
sectionSizes.emplace_back(collector->name(), subObjBuilder.len());
|
||||
} catch (...) {
|
||||
LOGV2_ERROR(9761500,
|
||||
"Collector threw an error",
|
||||
"error"_attr = exceptionToStatus(),
|
||||
"collector"_attr = collector->name());
|
||||
"collector"_attr = collector->name(),
|
||||
"size"_attr = builder->len());
|
||||
sectionSizes.emplace_back(collector->name(), builder->len());
|
||||
throw;
|
||||
}
|
||||
|
||||
|
||||
@ -168,13 +168,18 @@ public:
|
||||
* "end" : Date_t, <- Time at which all collecting ended
|
||||
* }
|
||||
*/
|
||||
std::tuple<BSONObj, Date_t> collect(Client* client, UseMultiServiceSchema multiServiceSchema);
|
||||
std::tuple<BSONObj, Date_t> collect(Client* client,
|
||||
UseMultiServiceSchema multiServiceSchema,
|
||||
std::vector<std::pair<std::string, int>>& sectionSizes);
|
||||
|
||||
protected:
|
||||
FTDCCollectorCollection() = default;
|
||||
|
||||
private:
|
||||
virtual void _collect(OperationContext* opCtx, ClusterRole role, BSONObjBuilder* builder) = 0;
|
||||
virtual void _collect(OperationContext* opCtx,
|
||||
ClusterRole role,
|
||||
BSONObjBuilder* builder,
|
||||
std::vector<std::pair<std::string, int>>& sectionSizes) = 0;
|
||||
};
|
||||
|
||||
class SampleCollectorCache {
|
||||
@ -325,7 +330,10 @@ public:
|
||||
}
|
||||
|
||||
private:
|
||||
void _collect(OperationContext* opCtx, ClusterRole role, BSONObjBuilder* builder) override;
|
||||
void _collect(OperationContext* opCtx,
|
||||
ClusterRole role,
|
||||
BSONObjBuilder* builder,
|
||||
std::vector<std::pair<std::string, int>>& sectionsSize) override;
|
||||
|
||||
void _forEach(std::function<void(AsyncFTDCCollectorCollectionSet&)> f);
|
||||
|
||||
@ -371,7 +379,10 @@ public:
|
||||
}
|
||||
|
||||
private:
|
||||
void _collect(OperationContext* opCtx, ClusterRole role, BSONObjBuilder* builder) override;
|
||||
void _collect(OperationContext* opCtx,
|
||||
ClusterRole role,
|
||||
BSONObjBuilder* builder,
|
||||
std::vector<std::pair<std::string, int>>& sectionsSize) override;
|
||||
|
||||
private:
|
||||
// collection of collectors
|
||||
|
||||
@ -306,6 +306,7 @@ void FTDCController::doLoop(Service* service) try {
|
||||
// reset to _config.metadataCaptureFrequency and countdown starts again.
|
||||
std::uint64_t metadataCaptureFrequencyCountdown = 1;
|
||||
|
||||
std::vector<std::pair<std::string, int>> sectionSizes;
|
||||
while (true) {
|
||||
_env->onStartLoop();
|
||||
|
||||
@ -362,27 +363,48 @@ void FTDCController::doLoop(Service* service) try {
|
||||
iassert(_mgr->rotate(client));
|
||||
}
|
||||
|
||||
auto collectSample = feature_flags::gFeatureFlagGaplessFTDC.isEnabled()
|
||||
? _asyncPeriodicCollectors->collect(client, _multiServiceSchema)
|
||||
: _periodicCollectors.collect(client, _multiServiceSchema);
|
||||
sectionSizes.clear();
|
||||
try {
|
||||
auto collectSample = feature_flags::gFeatureFlagGaplessFTDC.isEnabled()
|
||||
? _asyncPeriodicCollectors->collect(client, _multiServiceSchema, sectionSizes)
|
||||
: _periodicCollectors.collect(client, _multiServiceSchema, sectionSizes);
|
||||
Status s = _mgr->writeSampleAndRotateIfNeeded(
|
||||
client, std::get<0>(collectSample), std::get<1>(collectSample));
|
||||
|
||||
Status s = _mgr->writeSampleAndRotateIfNeeded(
|
||||
client, std::get<0>(collectSample), std::get<1>(collectSample));
|
||||
uassertStatusOK(s);
|
||||
|
||||
uassertStatusOK(s);
|
||||
|
||||
// Store a reference to the most recent document from the periodic collectors
|
||||
{
|
||||
stdx::lock_guard<stdx::mutex> lock(_mutex);
|
||||
_mostRecentPeriodicDocument = std::get<0>(collectSample);
|
||||
// Store a reference to the most recent document from the periodic collectors
|
||||
{
|
||||
stdx::lock_guard<stdx::mutex> lock(_mutex);
|
||||
_mostRecentPeriodicDocument = std::get<0>(collectSample);
|
||||
}
|
||||
} catch (...) {
|
||||
for (const auto& entry : sectionSizes) {
|
||||
LOGV2_INFO(
|
||||
10630200, "FTDC Entry", "name"_attr = entry.first, "size"_attr = entry.second);
|
||||
}
|
||||
throw;
|
||||
}
|
||||
|
||||
if (--metadataCaptureFrequencyCountdown == 0) {
|
||||
metadataCaptureFrequencyCountdown = _config.metadataCaptureFrequency;
|
||||
auto collectSample = _periodicMetadataCollectors.collect(client, _multiServiceSchema);
|
||||
Status s = _mgr->writePeriodicMetadataSampleAndRotateIfNeeded(
|
||||
client, std::get<0>(collectSample), std::get<1>(collectSample));
|
||||
iassert(s);
|
||||
sectionSizes.clear();
|
||||
try {
|
||||
auto collectSample =
|
||||
_periodicMetadataCollectors.collect(client, _multiServiceSchema, sectionSizes);
|
||||
Status s = _mgr->writePeriodicMetadataSampleAndRotateIfNeeded(
|
||||
client, std::get<0>(collectSample), std::get<1>(collectSample));
|
||||
iassert(s);
|
||||
|
||||
} catch (const ExceptionFor<ErrorCodes::BSONObjectTooLarge>&) {
|
||||
for (const auto& entry : sectionSizes) {
|
||||
LOGV2_INFO(10630202,
|
||||
"FTDC Entry",
|
||||
"name"_attr = entry.first,
|
||||
"size"_attr = entry.second);
|
||||
}
|
||||
throw;
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (...) {
|
||||
|
||||
@ -246,6 +246,18 @@ std::vector<BSONObj> insertNewSchemaDocuments(const std::vector<BSONObj>& docs,
|
||||
return newDocs;
|
||||
}
|
||||
|
||||
class MockLargeDataCollector : public MockCollector {
|
||||
public:
|
||||
MockLargeDataCollector(int32_t largeDataSize) : _largeDataSize(largeDataSize) {}
|
||||
|
||||
void generateDocument(BSONObjBuilder& builder, std::uint32_t counter) final {
|
||||
builder.append("testingDataLarge", std::string(_largeDataSize, 'a'));
|
||||
}
|
||||
|
||||
private:
|
||||
int32_t _largeDataSize = 0;
|
||||
};
|
||||
|
||||
/**
|
||||
* Used to sync the flow of the FTDCController with its test. FTDCController calls onStartLoop() at
|
||||
* the start of each collection loop and it will block until the test calls
|
||||
@ -356,7 +368,7 @@ void FTDCControllerTest::testPeriodicCollector(UseMultiServiceSchema multiServic
|
||||
}
|
||||
_checkpoint->wait();
|
||||
|
||||
// Wait for numCollections samples to have occured
|
||||
// Wait for numCollections samples to have occurred
|
||||
LOGV2_DEBUG(9129201, 0, "Collecting");
|
||||
auto collectUntilDocCount = [&](auto& collectorPtr, size_t docs) {
|
||||
while (collectorPtr->getDocs().size() < docs)
|
||||
@ -516,7 +528,7 @@ DEATH_TEST_REGEX_F(FTDCControllerTest,
|
||||
|
||||
DEATH_TEST_REGEX_F(FTDCControllerTest,
|
||||
LogAndTerminateWhenExceptionThrown,
|
||||
"9761500.*MockFailCollector") {
|
||||
"9761500.*MockFailCollector.*size") {
|
||||
FTDCConfig config;
|
||||
config.period = Milliseconds(100);
|
||||
setUpControllerAndCheckpoint(config);
|
||||
@ -530,5 +542,24 @@ DEATH_TEST_REGEX_F(FTDCControllerTest,
|
||||
doCollection();
|
||||
}
|
||||
|
||||
// Registers three periodic collectors that each emit a very large string field and runs one
// collection. The death-test regex requires that log id 10630200 ("FTDC Entry") with "name" and
// "size" attributes is emitted before the process terminates.
// NOTE(review): presumably the combined sample exceeds the maximum BSON object size - confirm.
DEATH_TEST_REGEX_F(FTDCControllerTest,
                   LogAndTerminateWhenLargeDataCollectionFails,
                   "10630200.*FTDC Entry.*name.*size") {
    FTDCConfig config;
    config.period = Milliseconds(100);
    setUpControllerAndCheckpoint(config);

    // Payload sizes, in MiB, of the collectors to register, in registration order.
    constexpr int kPayloadSizesMiB[] = {50, 60, 70};
    for (const int sizeMiB : kPayloadSizesMiB) {
        controller()->addPeriodicCollector(
            std::make_unique<MockLargeDataCollector>(sizeMiB * 1024 * 1024), ClusterRole::None);
    }

    startController();

    doCollection();
}
|
||||
|
||||
} // namespace
|
||||
} // namespace mongo
|
||||
|
||||
@ -208,17 +208,25 @@ Status FTDCFileManager::openArchiveFile(
|
||||
}
|
||||
}
|
||||
|
||||
// After the system restarts or a new file has been started,
|
||||
// collect one-time information
|
||||
// This is appened after the file is opened to ensure a user can determine which bson objects
|
||||
// where collected from which server instance.
|
||||
auto sample = _rotateCollectors->collect(client, _multiServiceSchema);
|
||||
if (!std::get<0>(sample).isEmpty()) {
|
||||
Status s = _writer.writeMetadata(std::get<0>(sample), std::get<1>(sample));
|
||||
// After the system restarts or a new file has been started, collect one-time information. This
|
||||
// is appended after the file is opened to ensure a user can determine which bson objects were
|
||||
// collected from which server instance.
|
||||
std::vector<std::pair<std::string, int>> sectionSizes;
|
||||
try {
|
||||
auto sample = _rotateCollectors->collect(client, _multiServiceSchema, sectionSizes);
|
||||
if (!std::get<0>(sample).isEmpty()) {
|
||||
Status s = _writer.writeMetadata(std::get<0>(sample), std::get<1>(sample));
|
||||
|
||||
if (!s.isOK()) {
|
||||
return s;
|
||||
if (!s.isOK()) {
|
||||
return s;
|
||||
}
|
||||
}
|
||||
} catch (const ExceptionFor<ErrorCodes::BSONObjectTooLarge>&) {
|
||||
for (const auto& entry : sectionSizes) {
|
||||
LOGV2_INFO(
|
||||
10630203, "FTDC Entry", "name"_attr = entry.first, "size"_attr = entry.second);
|
||||
}
|
||||
throw;
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
|
||||
Loading…
Reference in New Issue
Block a user