SERVER-106302 Provide visibility into the sizes of serverStatus sections if the aggregate FTDC object is too large (#42716)

GitOrigin-RevId: 2bce77d5edd8c9221568ee297c5fdb99c123c52f
This commit is contained in:
Cole Harbeck 2025-10-16 17:18:00 -04:00 committed by MongoDB Bot
parent 31b06936c6
commit 829bda19ce
5 changed files with 123 additions and 38 deletions

View File

@ -63,7 +63,6 @@
#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kFTDC
namespace mongo {
namespace {
static constexpr auto roles = std::to_array<std::pair<ClusterRole::Value, StringData>>({
@ -110,7 +109,9 @@ bool FTDCCollectorCollection::empty() {
}
std::tuple<BSONObj, Date_t> FTDCCollectorCollection::collect(
Client* client, UseMultiServiceSchema multiServiceSchema) {
Client* client,
UseMultiServiceSchema multiServiceSchema,
std::vector<std::pair<std::string, int>>& sectionSizes) {
BSONObjBuilder builder;
// If there are no collectors, just return an empty BSONObj so that the caller knows we did
// not collect anything
@ -151,7 +152,7 @@ std::tuple<BSONObj, Date_t> FTDCCollectorCollection::collect(
scopedRouterService.emplace(opCtx.get());
}
_collect(opCtx.get(), role.first, parent);
_collect(opCtx.get(), role.first, parent, sectionSizes);
if (multiServiceSchema) {
maybeSubBuilder->appendDate(kFTDCCollectEndField, getCurrentDate(opCtx.get()));
@ -333,9 +334,11 @@ void AsyncFTDCCollectorCollection::add(std::unique_ptr<FTDCCollectorInterface> c
getSet(role).addCollector(std::move(collector), role);
}
void AsyncFTDCCollectorCollection::_collect(OperationContext* opCtx,
ClusterRole role,
BSONObjBuilder* builder) {
// Collects for `role` by delegating to the collector set registered for that role, appending
// into `builder`.
// NOTE(review): `sectionSizes` is never written in this function, so callers receive no
// per-section size entries from this implementation -- confirm whether that is intended.
void AsyncFTDCCollectorCollection::_collect(
OperationContext* opCtx,
ClusterRole role,
BSONObjBuilder* builder,
std::vector<std::pair<std::string, int>>& sectionSizes) {
getSet(role).collect(opCtx, builder);
}
@ -355,7 +358,8 @@ void SyncFTDCCollectorCollection::add(std::unique_ptr<FTDCCollectorInterface> co
void SyncFTDCCollectorCollection::_collect(OperationContext* opCtx,
ClusterRole role,
BSONObjBuilder* builder) {
BSONObjBuilder* builder,
std::vector<std::pair<std::string, int>>& sectionSizes) {
auto& collectorVector = _collectors[role];
for (auto& collector : collectorVector) {
// Skip collection if this collector has no data to return
@ -365,17 +369,19 @@ void SyncFTDCCollectorCollection::_collect(OperationContext* opCtx,
try {
BSONObjBuilder subObjBuilder(builder->subobjStart(collector->name()));
// Add a Date_t before and after each BSON is collected so that we can track timing of
// the collector.
subObjBuilder.appendDate(kFTDCCollectStartField, getCurrentDate(opCtx));
collector->collect(opCtx, subObjBuilder);
subObjBuilder.appendDate(kFTDCCollectEndField, getCurrentDate(opCtx));
sectionSizes.emplace_back(collector->name(), subObjBuilder.len());
} catch (...) {
LOGV2_ERROR(9761500,
"Collector threw an error",
"error"_attr = exceptionToStatus(),
"collector"_attr = collector->name());
"collector"_attr = collector->name(),
"size"_attr = builder->len());
sectionSizes.emplace_back(collector->name(), builder->len());
throw;
}

View File

@ -168,13 +168,18 @@ public:
* "end" : Date_t, <- Time at which all collecting ended
* }
*/
std::tuple<BSONObj, Date_t> collect(Client* client, UseMultiServiceSchema multiServiceSchema);
// `sectionSizes` is a caller-owned output vector that is forwarded to the per-role _collect()
// implementations, which may append (collector name, size) entries to it.
std::tuple<BSONObj, Date_t> collect(Client* client,
UseMultiServiceSchema multiServiceSchema,
std::vector<std::pair<std::string, int>>& sectionSizes);
protected:
FTDCCollectorCollection() = default;
private:
virtual void _collect(OperationContext* opCtx, ClusterRole role, BSONObjBuilder* builder) = 0;
// Role-specific collection hook implemented by subclasses. `sectionSizes` is the caller-owned
// vector passed through from collect() in which an implementation can record (name, size)
// entries for the sections it appended to `builder`.
virtual void _collect(OperationContext* opCtx,
ClusterRole role,
BSONObjBuilder* builder,
std::vector<std::pair<std::string, int>>& sectionSizes) = 0;
};
class SampleCollectorCache {
@ -325,7 +330,10 @@ public:
}
private:
void _collect(OperationContext* opCtx, ClusterRole role, BSONObjBuilder* builder) override;
void _collect(OperationContext* opCtx,
ClusterRole role,
BSONObjBuilder* builder,
std::vector<std::pair<std::string, int>>& sectionsSize) override;
void _forEach(std::function<void(AsyncFTDCCollectorCollectionSet&)> f);
@ -371,7 +379,10 @@ public:
}
private:
void _collect(OperationContext* opCtx, ClusterRole role, BSONObjBuilder* builder) override;
void _collect(OperationContext* opCtx,
ClusterRole role,
BSONObjBuilder* builder,
std::vector<std::pair<std::string, int>>& sectionsSize) override;
private:
// collection of collectors

View File

@ -306,6 +306,7 @@ void FTDCController::doLoop(Service* service) try {
// reset to _config.metadataCaptureFrequency and countdown starts again.
std::uint64_t metadataCaptureFrequencyCountdown = 1;
std::vector<std::pair<std::string, int>> sectionSizes;
while (true) {
_env->onStartLoop();
@ -362,27 +363,55 @@ void FTDCController::doLoop(Service* service) try {
iassert(_mgr->rotate(client));
}
auto collectSample = feature_flags::gFeatureFlagGaplessFTDC.isEnabled()
? _asyncPeriodicCollectors->collect(client, _multiServiceSchema)
: _periodicCollectors.collect(client, _multiServiceSchema);
sectionSizes.clear();
try {
auto collectSample = feature_flags::gFeatureFlagGaplessFTDC.isEnabled()
? _asyncPeriodicCollectors->collect(client, _multiServiceSchema, sectionSizes)
: _periodicCollectors.collect(client, _multiServiceSchema, sectionSizes);
Status s = _mgr->writeSampleAndRotateIfNeeded(
client, std::get<0>(collectSample), std::get<1>(collectSample));
Status s = _mgr->writeSampleAndRotateIfNeeded(
client, std::get<0>(collectSample), std::get<1>(collectSample));
uassertStatusOK(s);
uassertStatusOK(s);
// Store a reference to the most recent document from the periodic collectors
{
stdx::lock_guard<stdx::mutex> lock(_mutex);
_mostRecentPeriodicDocument = std::get<0>(collectSample);
// Store a reference to the most recent document from the periodic collectors
{
stdx::lock_guard<stdx::mutex> lock(_mutex);
_mostRecentPeriodicDocument = std::get<0>(collectSample);
}
} catch (...) {
for (const auto& entry : sectionSizes) {
LOGV2_INFO(
10630200, "FTDC Entry", "name"_attr = entry.first, "size"_attr = entry.second);
}
throw;
}
if (--metadataCaptureFrequencyCountdown == 0) {
metadataCaptureFrequencyCountdown = _config.metadataCaptureFrequency;
auto collectSample = _periodicMetadataCollectors.collect(client, _multiServiceSchema);
Status s = _mgr->writePeriodicMetadataSampleAndRotateIfNeeded(
client, std::get<0>(collectSample), std::get<1>(collectSample));
iassert(s);
sectionSizes.clear();
try {
auto collectSample =
_periodicMetadataCollectors.collect(client, _multiServiceSchema, sectionSizes);
Status s = _mgr->writePeriodicMetadataSampleAndRotateIfNeeded(
client, std::get<0>(collectSample), std::get<1>(collectSample));
iassert(s);
for (const auto& entry : sectionSizes) {
LOGV2_INFO(10630201,
"FTDC Entry",
"name"_attr = entry.first,
"size"_attr = entry.second);
}
} catch (const ExceptionFor<ErrorCodes::BSONObjectTooLarge>&) {
for (const auto& entry : sectionSizes) {
LOGV2_INFO(10630202,
"FTDC Entry",
"name"_attr = entry.first,
"size"_attr = entry.second);
}
throw;
}
}
}
} catch (...) {

View File

@ -246,6 +246,18 @@ std::vector<BSONObj> insertNewSchemaDocuments(const std::vector<BSONObj>& docs,
return newDocs;
}
/**
 * Mock collector whose generated document holds a single string field, "testingDataLarge",
 * made of `largeDataSize` copies of the character 'a'.
 */
class MockLargeDataCollector : public MockCollector {
public:
    // Explicit so that an integer is never implicitly converted into a collector.
    explicit MockLargeDataCollector(int32_t largeDataSize) : _largeDataSize(largeDataSize) {}

    // Appends the large string payload to `builder`; the per-call counter is intentionally unused.
    void generateDocument(BSONObjBuilder& builder, std::uint32_t /*counter*/) final {
        builder.append("testingDataLarge", std::string(_largeDataSize, 'a'));
    }

private:
    // Number of characters in the generated payload string.
    int32_t _largeDataSize = 0;
};
/**
* Used to sync the flow of the FTDCController with its test. FTDCController calls onStartLoop() at
* the start of each collection loop and it will block until the test calls
@ -356,7 +368,7 @@ void FTDCControllerTest::testPeriodicCollector(UseMultiServiceSchema multiServic
}
_checkpoint->wait();
// Wait for numCollections samples to have occured
// Wait for numCollections samples to have occurred
LOGV2_DEBUG(9129201, 0, "Collecting");
auto collectUntilDocCount = [&](auto& collectorPtr, size_t docs) {
while (collectorPtr->getDocs().size() < docs)
@ -516,7 +528,7 @@ DEATH_TEST_REGEX_F(FTDCControllerTest,
DEATH_TEST_REGEX_F(FTDCControllerTest,
LogAndTerminateWhenExceptionThrown,
"9761500.*MockFailCollector") {
"9761500.*MockFailCollector.*size") {
FTDCConfig config;
config.period = Milliseconds(100);
setUpControllerAndCheckpoint(config);
@ -530,5 +542,24 @@ DEATH_TEST_REGEX_F(FTDCControllerTest,
doCollection();
}
// Death test: registers three collectors that each emit one very large string field, runs a
// collection, and expects the death output to match log id 10630200 ("FTDC Entry" carrying the
// "name" and "size" attributes).
DEATH_TEST_REGEX_F(FTDCControllerTest,
                   LogAndTerminateWhenLargeDataCollectionFails,
                   "10630200.*FTDC Entry.*name.*size") {
    FTDCConfig config;
    config.period = Milliseconds(100);
    setUpControllerAndCheckpoint(config);

    // Payload sizes in MiB, registered in this order.
    constexpr int kPayloadMiB[] = {50, 60, 70};
    for (const int payloadMiB : kPayloadMiB) {
        controller()->addPeriodicCollector(
            std::make_unique<MockLargeDataCollector>(payloadMiB * 1024 * 1024),
            ClusterRole::None);
    }

    startController();
    doCollection();
}
} // namespace
} // namespace mongo

View File

@ -208,17 +208,25 @@ Status FTDCFileManager::openArchiveFile(
}
}
// After the system restarts or a new file has been started,
// collect one-time information
// This is appened after the file is opened to ensure a user can determine which bson objects
// where collected from which server instance.
auto sample = _rotateCollectors->collect(client, _multiServiceSchema);
if (!std::get<0>(sample).isEmpty()) {
Status s = _writer.writeMetadata(std::get<0>(sample), std::get<1>(sample));
// After the system restarts or a new file has been started, collect one-time information. This
// is appended after the file is opened to ensure a user can determine which bson objects were
// collected from which server instance.
std::vector<std::pair<std::string, int>> sectionSizes;
try {
auto sample = _rotateCollectors->collect(client, _multiServiceSchema, sectionSizes);
if (!std::get<0>(sample).isEmpty()) {
Status s = _writer.writeMetadata(std::get<0>(sample), std::get<1>(sample));
if (!s.isOK()) {
return s;
if (!s.isOK()) {
return s;
}
}
} catch (const ExceptionFor<ErrorCodes::BSONObjectTooLarge>&) {
for (const auto& entry : sectionSizes) {
LOGV2_INFO(
10630203, "FTDC Entry", "name"_attr = entry.first, "size"_attr = entry.second);
}
throw;
}
return Status::OK();