SERVER-106302 Provide visibility into the sizes of serverStatus sections if the aggregate FTDC object is too large (#42716)
GitOrigin-RevId: 2bce77d5edd8c9221568ee297c5fdb99c123c52f
This commit is contained in:
parent
31b06936c6
commit
829bda19ce
@ -63,7 +63,6 @@
|
||||
#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kFTDC
|
||||
|
||||
namespace mongo {
|
||||
|
||||
namespace {
|
||||
|
||||
static constexpr auto roles = std::to_array<std::pair<ClusterRole::Value, StringData>>({
|
||||
@ -110,7 +109,9 @@ bool FTDCCollectorCollection::empty() {
|
||||
}
|
||||
|
||||
std::tuple<BSONObj, Date_t> FTDCCollectorCollection::collect(
|
||||
Client* client, UseMultiServiceSchema multiServiceSchema) {
|
||||
Client* client,
|
||||
UseMultiServiceSchema multiServiceSchema,
|
||||
std::vector<std::pair<std::string, int>>& sectionSizes) {
|
||||
BSONObjBuilder builder;
|
||||
// If there are no collectors, just return an empty BSONObj so that the caller knows we did
|
||||
// not collect anything
|
||||
@ -151,7 +152,7 @@ std::tuple<BSONObj, Date_t> FTDCCollectorCollection::collect(
|
||||
scopedRouterService.emplace(opCtx.get());
|
||||
}
|
||||
|
||||
_collect(opCtx.get(), role.first, parent);
|
||||
_collect(opCtx.get(), role.first, parent, sectionSizes);
|
||||
|
||||
if (multiServiceSchema) {
|
||||
maybeSubBuilder->appendDate(kFTDCCollectEndField, getCurrentDate(opCtx.get()));
|
||||
@ -333,9 +334,11 @@ void AsyncFTDCCollectorCollection::add(std::unique_ptr<FTDCCollectorInterface> c
|
||||
getSet(role).addCollector(std::move(collector), role);
|
||||
}
|
||||
|
||||
void AsyncFTDCCollectorCollection::_collect(OperationContext* opCtx,
|
||||
ClusterRole role,
|
||||
BSONObjBuilder* builder) {
|
||||
/**
 * Collects a sample for `role` by delegating to the collector set registered for that role.
 *
 * `sectionSizes` is accepted to satisfy the shared `_collect` interface; this implementation does
 * not add any entries to it.
 */
void AsyncFTDCCollectorCollection::_collect(
    OperationContext* opCtx,
    ClusterRole role,
    BSONObjBuilder* builder,
    std::vector<std::pair<std::string, int>>& sectionSizes) {
    // Pick the role-specific set first, then let it write its results into the builder.
    auto&& roleCollectors = getSet(role);
    roleCollectors.collect(opCtx, builder);
}
|
||||
|
||||
@ -355,7 +358,8 @@ void SyncFTDCCollectorCollection::add(std::unique_ptr<FTDCCollectorInterface> co
|
||||
|
||||
void SyncFTDCCollectorCollection::_collect(OperationContext* opCtx,
|
||||
ClusterRole role,
|
||||
BSONObjBuilder* builder) {
|
||||
BSONObjBuilder* builder,
|
||||
std::vector<std::pair<std::string, int>>& sectionSizes) {
|
||||
auto& collectorVector = _collectors[role];
|
||||
for (auto& collector : collectorVector) {
|
||||
// Skip collection if this collector has no data to return
|
||||
@ -365,17 +369,19 @@ void SyncFTDCCollectorCollection::_collect(OperationContext* opCtx,
|
||||
|
||||
try {
|
||||
BSONObjBuilder subObjBuilder(builder->subobjStart(collector->name()));
|
||||
|
||||
// Add a Date_t before and after each BSON is collected so that we can track timing of
|
||||
// the collector.
|
||||
subObjBuilder.appendDate(kFTDCCollectStartField, getCurrentDate(opCtx));
|
||||
collector->collect(opCtx, subObjBuilder);
|
||||
subObjBuilder.appendDate(kFTDCCollectEndField, getCurrentDate(opCtx));
|
||||
sectionSizes.emplace_back(collector->name(), subObjBuilder.len());
|
||||
} catch (...) {
|
||||
LOGV2_ERROR(9761500,
|
||||
"Collector threw an error",
|
||||
"error"_attr = exceptionToStatus(),
|
||||
"collector"_attr = collector->name());
|
||||
"collector"_attr = collector->name(),
|
||||
"size"_attr = builder->len());
|
||||
sectionSizes.emplace_back(collector->name(), builder->len());
|
||||
throw;
|
||||
}
|
||||
|
||||
|
||||
@ -168,13 +168,18 @@ public:
|
||||
* "end" : Date_t, <- Time at which all collecting ended
|
||||
* }
|
||||
*/
|
||||
std::tuple<BSONObj, Date_t> collect(Client* client, UseMultiServiceSchema multiServiceSchema);
|
||||
std::tuple<BSONObj, Date_t> collect(Client* client,
|
||||
UseMultiServiceSchema multiServiceSchema,
|
||||
std::vector<std::pair<std::string, int>>& sectionSizes);
|
||||
|
||||
protected:
|
||||
FTDCCollectorCollection() = default;
|
||||
|
||||
private:
|
||||
virtual void _collect(OperationContext* opCtx, ClusterRole role, BSONObjBuilder* builder) = 0;
|
||||
// Role-specific collection hook implemented by the concrete collector collections.
// Invoked by collect() once per role with the builder that receives that role's output.
// `sectionSizes` is an out-parameter: an implementation may append (collector name, byte length)
// pairs to it so that callers can report per-section sizes when a sample cannot be written.
virtual void _collect(OperationContext* opCtx,
|
||||
ClusterRole role,
|
||||
BSONObjBuilder* builder,
|
||||
std::vector<std::pair<std::string, int>>& sectionSizes) = 0;
|
||||
};
|
||||
|
||||
class SampleCollectorCache {
|
||||
@ -325,7 +330,10 @@ public:
|
||||
}
|
||||
|
||||
private:
|
||||
void _collect(OperationContext* opCtx, ClusterRole role, BSONObjBuilder* builder) override;
|
||||
void _collect(OperationContext* opCtx,
|
||||
ClusterRole role,
|
||||
BSONObjBuilder* builder,
|
||||
std::vector<std::pair<std::string, int>>& sectionsSize) override;
|
||||
|
||||
void _forEach(std::function<void(AsyncFTDCCollectorCollectionSet&)> f);
|
||||
|
||||
@ -371,7 +379,10 @@ public:
|
||||
}
|
||||
|
||||
private:
|
||||
void _collect(OperationContext* opCtx, ClusterRole role, BSONObjBuilder* builder) override;
|
||||
void _collect(OperationContext* opCtx,
|
||||
ClusterRole role,
|
||||
BSONObjBuilder* builder,
|
||||
std::vector<std::pair<std::string, int>>& sectionsSize) override;
|
||||
|
||||
private:
|
||||
// collection of collectors
|
||||
|
||||
@ -306,6 +306,7 @@ void FTDCController::doLoop(Service* service) try {
|
||||
// reset to _config.metadataCaptureFrequency and countdown starts again.
|
||||
std::uint64_t metadataCaptureFrequencyCountdown = 1;
|
||||
|
||||
std::vector<std::pair<std::string, int>> sectionSizes;
|
||||
while (true) {
|
||||
_env->onStartLoop();
|
||||
|
||||
@ -362,27 +363,55 @@ void FTDCController::doLoop(Service* service) try {
|
||||
iassert(_mgr->rotate(client));
|
||||
}
|
||||
|
||||
auto collectSample = feature_flags::gFeatureFlagGaplessFTDC.isEnabled()
|
||||
? _asyncPeriodicCollectors->collect(client, _multiServiceSchema)
|
||||
: _periodicCollectors.collect(client, _multiServiceSchema);
|
||||
sectionSizes.clear();
|
||||
try {
|
||||
auto collectSample = feature_flags::gFeatureFlagGaplessFTDC.isEnabled()
|
||||
? _asyncPeriodicCollectors->collect(client, _multiServiceSchema, sectionSizes)
|
||||
: _periodicCollectors.collect(client, _multiServiceSchema, sectionSizes);
|
||||
|
||||
Status s = _mgr->writeSampleAndRotateIfNeeded(
|
||||
client, std::get<0>(collectSample), std::get<1>(collectSample));
|
||||
Status s = _mgr->writeSampleAndRotateIfNeeded(
|
||||
client, std::get<0>(collectSample), std::get<1>(collectSample));
|
||||
|
||||
uassertStatusOK(s);
|
||||
uassertStatusOK(s);
|
||||
|
||||
// Store a reference to the most recent document from the periodic collectors
|
||||
{
|
||||
stdx::lock_guard<stdx::mutex> lock(_mutex);
|
||||
_mostRecentPeriodicDocument = std::get<0>(collectSample);
|
||||
// Store a reference to the most recent document from the periodic collectors
|
||||
{
|
||||
stdx::lock_guard<stdx::mutex> lock(_mutex);
|
||||
_mostRecentPeriodicDocument = std::get<0>(collectSample);
|
||||
}
|
||||
} catch (...) {
|
||||
for (const auto& entry : sectionSizes) {
|
||||
LOGV2_INFO(
|
||||
10630200, "FTDC Entry", "name"_attr = entry.first, "size"_attr = entry.second);
|
||||
}
|
||||
throw;
|
||||
}
|
||||
|
||||
if (--metadataCaptureFrequencyCountdown == 0) {
|
||||
metadataCaptureFrequencyCountdown = _config.metadataCaptureFrequency;
|
||||
auto collectSample = _periodicMetadataCollectors.collect(client, _multiServiceSchema);
|
||||
Status s = _mgr->writePeriodicMetadataSampleAndRotateIfNeeded(
|
||||
client, std::get<0>(collectSample), std::get<1>(collectSample));
|
||||
iassert(s);
|
||||
sectionSizes.clear();
|
||||
try {
|
||||
auto collectSample =
|
||||
_periodicMetadataCollectors.collect(client, _multiServiceSchema, sectionSizes);
|
||||
Status s = _mgr->writePeriodicMetadataSampleAndRotateIfNeeded(
|
||||
client, std::get<0>(collectSample), std::get<1>(collectSample));
|
||||
iassert(s);
|
||||
|
||||
for (const auto& entry : sectionSizes) {
|
||||
LOGV2_INFO(10630201,
|
||||
"FTDC Entry",
|
||||
"name"_attr = entry.first,
|
||||
"size"_attr = entry.second);
|
||||
}
|
||||
} catch (const ExceptionFor<ErrorCodes::BSONObjectTooLarge>&) {
|
||||
for (const auto& entry : sectionSizes) {
|
||||
LOGV2_INFO(10630202,
|
||||
"FTDC Entry",
|
||||
"name"_attr = entry.first,
|
||||
"size"_attr = entry.second);
|
||||
}
|
||||
throw;
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (...) {
|
||||
|
||||
@ -246,6 +246,18 @@ std::vector<BSONObj> insertNewSchemaDocuments(const std::vector<BSONObj>& docs,
|
||||
return newDocs;
|
||||
}
|
||||
|
||||
class MockLargeDataCollector : public MockCollector {
|
||||
public:
|
||||
MockLargeDataCollector(int32_t largeDataSize) : _largeDataSize(largeDataSize) {}
|
||||
|
||||
void generateDocument(BSONObjBuilder& builder, std::uint32_t counter) final {
|
||||
builder.append("testingDataLarge", std::string(_largeDataSize, 'a'));
|
||||
}
|
||||
|
||||
private:
|
||||
int32_t _largeDataSize = 0;
|
||||
};
|
||||
|
||||
/**
|
||||
* Used to sync the flow of the FTDCController with its test. FTDCController calls onStartLoop() at
|
||||
* the start of each collection loop and it will block until the test calls
|
||||
@ -356,7 +368,7 @@ void FTDCControllerTest::testPeriodicCollector(UseMultiServiceSchema multiServic
|
||||
}
|
||||
_checkpoint->wait();
|
||||
|
||||
// Wait for numCollections samples to have occured
|
||||
// Wait for numCollections samples to have occurred
|
||||
LOGV2_DEBUG(9129201, 0, "Collecting");
|
||||
auto collectUntilDocCount = [&](auto& collectorPtr, size_t docs) {
|
||||
while (collectorPtr->getDocs().size() < docs)
|
||||
@ -516,7 +528,7 @@ DEATH_TEST_REGEX_F(FTDCControllerTest,
|
||||
|
||||
DEATH_TEST_REGEX_F(FTDCControllerTest,
|
||||
LogAndTerminateWhenExceptionThrown,
|
||||
"9761500.*MockFailCollector") {
|
||||
"9761500.*MockFailCollector.*size") {
|
||||
FTDCConfig config;
|
||||
config.period = Milliseconds(100);
|
||||
setUpControllerAndCheckpoint(config);
|
||||
@ -530,5 +542,24 @@ DEATH_TEST_REGEX_F(FTDCControllerTest,
|
||||
doCollection();
|
||||
}
|
||||
|
||||
/**
 * Registers three periodic collectors that emit 50, 60 and 70 MiB string payloads, runs one
 * collection, and expects the process to die after logging "FTDC Entry" lines (log id 10630200)
 * that carry the section name and size.
 * NOTE(review): presumably the combined sample exceeds the maximum BSON object size, which is what
 * triggers the failure -- confirm against the controller's collect/write path.
 */
DEATH_TEST_REGEX_F(FTDCControllerTest,
                   LogAndTerminateWhenLargeDataCollectionFails,
                   "10630200.*FTDC Entry.*name.*size") {
    FTDCConfig config;
    config.period = Milliseconds(100);
    setUpControllerAndCheckpoint(config);

    // Helper that registers one large-payload collector of the requested size in MiB.
    auto registerLargeCollector = [&](int mebibytes) {
        controller()->addPeriodicCollector(
            std::make_unique<MockLargeDataCollector>(mebibytes * 1024 * 1024), ClusterRole::None);
    };
    registerLargeCollector(50);
    registerLargeCollector(60);
    registerLargeCollector(70);

    startController();

    doCollection();
}
|
||||
|
||||
} // namespace
|
||||
} // namespace mongo
|
||||
|
||||
@ -208,17 +208,25 @@ Status FTDCFileManager::openArchiveFile(
|
||||
}
|
||||
}
|
||||
|
||||
// After the system restarts or a new file has been started,
|
||||
// collect one-time information
|
||||
// This is appened after the file is opened to ensure a user can determine which bson objects
|
||||
// where collected from which server instance.
|
||||
auto sample = _rotateCollectors->collect(client, _multiServiceSchema);
|
||||
if (!std::get<0>(sample).isEmpty()) {
|
||||
Status s = _writer.writeMetadata(std::get<0>(sample), std::get<1>(sample));
|
||||
// After the system restarts or a new file has been started, collect one-time information. This
|
||||
// is appended after the file is opened to ensure a user can determine which bson objects were
|
||||
// collected from which server instance.
|
||||
std::vector<std::pair<std::string, int>> sectionSizes;
|
||||
try {
|
||||
auto sample = _rotateCollectors->collect(client, _multiServiceSchema, sectionSizes);
|
||||
if (!std::get<0>(sample).isEmpty()) {
|
||||
Status s = _writer.writeMetadata(std::get<0>(sample), std::get<1>(sample));
|
||||
|
||||
if (!s.isOK()) {
|
||||
return s;
|
||||
if (!s.isOK()) {
|
||||
return s;
|
||||
}
|
||||
}
|
||||
} catch (const ExceptionFor<ErrorCodes::BSONObjectTooLarge>&) {
|
||||
for (const auto& entry : sectionSizes) {
|
||||
LOGV2_INFO(
|
||||
10630203, "FTDC Entry", "name"_attr = entry.first, "size"_attr = entry.second);
|
||||
}
|
||||
throw;
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
|
||||
Loading…
Reference in New Issue
Block a user