SERVER-85737 add latest TCMalloc from google as allocator option (#18870)

GitOrigin-RevId: 23c89085da2424a0fb91913c42c5d356b6a860df
This commit is contained in:
Daniel Moody 2024-02-16 13:47:20 -06:00 committed by MongoDB Bot
parent bd01a61df2
commit 13401a4bfe
324 changed files with 77664 additions and 360 deletions

View File

@ -3,6 +3,7 @@ src/third_party/grpc
src/third_party/abseil-cpp
src/third_party/protobuf
src/third_party/re2
src/third_party/tcmalloc
# Ignore node_modules due to the following error
# ERROR: in verify_node_modules_ignored:

View File

@ -105,7 +105,7 @@ def use_system_version_of_library(name):
# add a new C++ library dependency that may be shimmed out to the system, add it to the below
# list.
def using_system_version_of_cxx_libraries():
cxx_library_names = ["tcmalloc", "boost"]
cxx_library_names = ["tcmalloc-google", "boost", "tcmalloc-gperf"]
return True in [use_system_version_of_library(x) for x in cxx_library_names]
@ -415,7 +415,7 @@ add_option(
add_option(
'allocator',
choices=["auto", "system", "tcmalloc", "tcmalloc-experimental"],
choices=["auto", "system", "tcmalloc-google", "tcmalloc-gperf"],
default=build_profile.allocator,
help='allocator to use (use "auto" for best choice for current platform)',
type='choice',
@ -485,7 +485,8 @@ for pack in [
('protobuf', "Protocol Buffers"),
('snappy', ),
('stemmer', ),
('tcmalloc', ),
('tcmalloc-google', ),
('tcmalloc-gperf', ),
('libunwind', ),
('valgrind', ),
('wiredtiger', ),
@ -2124,17 +2125,48 @@ env['TARGET_OS_FAMILY'] = 'posix' if env.TargetOSIs('posix') else env.GetTargetO
# would be nicer to use SetOption here, but you can't reset user
# options for some strange reason in SCons. Instead, we store this
# option as a new variable in the environment.
# Derive the kernel "major.minor" numbers from platform.release(). When the
# release string cannot be parsed, report it and fall back to 0 for both.
try:
    kernel_version = platform.release().split(".")
    kernel_major, kernel_minor = int(kernel_version[0]), int(kernel_version[1])
except (ValueError, IndexError):
    print(
        f"Failed to extract kernel major and minor versions, tcmalloc-google will not be available for use: {kernel_version}"
    )
    kernel_major = kernel_minor = 0
if get_option('allocator') == "auto":
# using an allocator besides system on android would require either fixing or disabling
# gperftools on android
if env.TargetOSIs('windows') or \
env.TargetOSIs('linux') and not env.TargetOSIs('android'):
env['MONGO_ALLOCATOR'] = "tcmalloc"
if env.TargetOSIs('linux') and env['TARGET_ARCH'] in ('x86_64', 'aarch64'):
# TODO SERVER-86472 make bazel support both tcmalloc implementations
if env.get("BAZEL_BUILD_ENABLED"):
env['MONGO_ALLOCATOR'] = "tcmalloc-gperf"
else:
env['MONGO_ALLOCATOR'] = "tcmalloc-google"
# Google's tcmalloc uses the membarrier() system call, which was added in Linux 4.3,
# so fall back to the gperf implementation for older kernels
if kernel_major < 4 or (kernel_major == 4 and kernel_minor < 3):
env['MONGO_ALLOCATOR'] = "tcmalloc-gperf"
elif env.TargetOSIs('windows') or (env.TargetOSIs('linux')
and env['TARGET_ARCH'] in ('ppc64le', 's390x')):
env['MONGO_ALLOCATOR'] = "tcmalloc-gperf"
else:
env['MONGO_ALLOCATOR'] = "system"
else:
env['MONGO_ALLOCATOR'] = get_option('allocator')
# tcmalloc-google is rejected with a configuration error when the detected
# kernel version is below 4.3.
if env['MONGO_ALLOCATOR'] == "tcmalloc-google":
    if kernel_major < 4 or (kernel_major == 4 and kernel_minor < 3):
        env.ConfError(
            f"tcmalloc-google allocator only supported on linux kernel 4.3 or greater: kernel version={platform.release()}"
        )
if env['MONGO_ALLOCATOR'] == "tcmalloc-google":
env.Append(CPPDEFINES=["ABSL_ALLOCATOR_NOTHROW"])
if has_option("cache"):
if has_option("gcov"):
env.FatalError("Mixing --cache and --gcov doesn't work correctly yet. See SERVER-11084")
@ -2445,6 +2477,13 @@ if not env.TargetOSIs('windows'):
env["LINKCOM"] = env["LINKCOM"].replace("$LINKFLAGS", "$PROGLINKFLAGS")
env["PROGLINKFLAGS"] = ['$LINKFLAGS']
# CPPFLAGS is used for assembler commands; the condition below assumes assembler files
# will only be assembled directly into libraries and not programs
if link_model.startswith("dynamic"):
env.Append(CPPFLAGS=["-fPIC"])
else:
env.Append(CPPFLAGS=["-fPIE"])
# When it is necessary to supply additional SHLINKFLAGS without modifying the toolset default,
# following appends contents of SHLINKFLAGS_EXTRA variable to the linker command
env.AppendUnique(SHLINKFLAGS=['$SHLINKFLAGS_EXTRA'])
@ -3070,7 +3109,9 @@ if env.TargetOSIs('posix'):
# If runtime hardening is requested, then build anything
# destined for an executable with the necessary flags for PIE.
env.AppendUnique(
PROGCFLAGS=['-fPIE'],
PROGCCFLAGS=['-fPIE'],
PROGCXXFLAGS=['-fPIE'],
PROGLINKFLAGS=['-pie'],
)
@ -3102,7 +3143,8 @@ if env.TargetOSIs('posix'):
# For debug builds with tcmalloc, we need the frame pointer so it can
# record the stack of allocations.
can_nofp &= not (debugBuild and (env['MONGO_ALLOCATOR'] == 'tcmalloc'))
can_nofp &= not (debugBuild and
(env['MONGO_ALLOCATOR'] in ['tcmalloc-google', 'tcmalloc-gperf']))
# Only disable frame pointers if requested
can_nofp &= ("nofp" in selected_experimental_optimizations)
@ -4116,6 +4158,10 @@ def doConfigure(myenv):
if not myenv.ToolchainIs('clang', 'gcc'):
env.FatalError('sanitize is only supported with clang or gcc')
# sanitizer libs may inject undefined refs (for hooks) at link time, but
# the symbols will be available at runtime via the compiler runtime lib.
env.Append(LINKFLAGS='-Wl,--allow-shlib-undefined')
if myenv.ToolchainIs('gcc'):
# GCC's implementation of ASAN depends on libdl.
env.Append(LIBS=['dl'])
@ -4157,11 +4203,14 @@ def doConfigure(myenv):
get_san_lib_path(sanitizer) for sanitizer in sanitizer_list
]
if 'thread' not in sanitizer_list:
env.Append(LINKFLAGS=['-rtlib=compiler-rt', '-unwindlib=libgcc'])
if using_lsan:
env.FatalError("Please use --sanitize=address instead of --sanitize=leak")
if (using_asan
or using_msan) and env['MONGO_ALLOCATOR'] in ['tcmalloc', 'tcmalloc-experimental']:
or using_msan) and env['MONGO_ALLOCATOR'] in ['tcmalloc-google', 'tcmalloc-gperf']:
# There are multiply defined symbols between the sanitizer and
# our vendorized tcmalloc.
env.FatalError("Cannot use --sanitize=address or --sanitize=memory with tcmalloc")
@ -4236,7 +4285,7 @@ def doConfigure(myenv):
else:
myenv.ConfError('Failed to enable sanitizers with flag: {0}', sanitizer_option)
if get_option('shared-libsan') == 'on':
if get_option("shared-libsan") == "on":
shared_libsan_option = '-shared-libsan'
if myenv.AddToCCFLAGSIfSupported(shared_libsan_option):
myenv.Append(LINKFLAGS=[shared_libsan_option])
@ -5279,13 +5328,16 @@ def doConfigure(myenv):
# 'tcmalloc' needs to be the last library linked. Please, add new libraries before this
# point.
if myenv['MONGO_ALLOCATOR'] == 'tcmalloc':
if use_system_version_of_library('tcmalloc'):
conf.FindSysLibDep("tcmalloc", ["tcmalloc"])
elif myenv['MONGO_ALLOCATOR'] in ['system', 'tcmalloc-experimental']:
if myenv['MONGO_ALLOCATOR'] == 'tcmalloc-google':
if use_system_version_of_library('tcmalloc-google'):
conf.FindSysLibDep("tcmalloc-google", ["tcmalloc"])
elif myenv['MONGO_ALLOCATOR'] == 'tcmalloc-gperf':
if use_system_version_of_library('tcmalloc-gperf'):
conf.FindSysLibDep("tcmalloc-gperf", ["tcmalloc"])
elif myenv['MONGO_ALLOCATOR'] in ['system']:
pass
else:
myenv.FatalError("Invalid --allocator parameter: $MONGO_ALLOCATOR")
myenv.FatalError(f"Invalid --allocator parameter: {env['MONGO_ALLOCATOR']}")
def CheckStdAtomic(context, base_type, extra_message):
test_body = """

View File

@ -348,6 +348,7 @@ buildvariants:
archive-mongocryptd-debug
lang_environment: LANG=C
san_options: *ubsan_options
# TODO SERVER-86610 set --allocator=tcmalloc-google
compile_flags: >-
--variables-files=etc/scons/mongodbtoolchain_stable_clang.vars
--dbg=on
@ -355,6 +356,7 @@ buildvariants:
--sanitize=undefined
--ssl
--ocsp-stapling=off
--allocator=tcmalloc-gperf
-j$(grep -c ^processor /proc/cpuinfo)
--link-model=dynamic
--use-diagnostic-latches=on

View File

@ -469,12 +469,14 @@ buildvariants:
archive-mongocryptd-debug
lang_environment: LANG=C
san_options: *ubsan_options
# TODO SERVER-86610 add tcmalloc-google as the allocator for ubsan
compile_flags: >-
--variables-files=etc/scons/mongodbtoolchain_${toolchain_version}_clang.vars
--dbg=on
--opt=on
--sanitize=undefined
--ssl
--allocator=tcmalloc-gperf
--ocsp-stapling=off
-j$(grep -c ^processor /proc/cpuinfo)
--use-diagnostic-latches=on

View File

@ -522,9 +522,13 @@ def generate(env: SCons.Environment.Environment) -> None:
else:
build_mode = f"opt_{mongo_generators.get_opt_options(env)}" # one of "on", "size", "off"
# Deprecate tcmalloc-experimental
allocator = "tcmalloc" if env.GetOption(
"allocator") == "tcmalloc-experimental" else env.GetOption("allocator")
# TODO SERVER-86472 make bazel support both tcmalloc implementations
if env.GetOption("allocator") == "tcmalloc-google":
env.ConfError("Bazel build currently does not support tcmalloc-google allocator.")
if env.GetOption("allocator") == "tcmalloc-gperf":
allocator = "tcmalloc"
else:
allocator = env.GetOption("allocator")
bazel_internal_flags = [
f'--//bazel/config:compiler_type={env.ToolchainName()}',

View File

@ -38,6 +38,7 @@ env['LINK'] = [f'{base_toolchain_bin}/g++']
env['SHLINK'] = [f'{base_toolchain_bin}/g++']
env['CPPPATH'] = [str(tidy_include)]
env['LIBPATH'] = []
env['CPPFLAGS'] = []
env['CCFLAGS'] = [
'-DGTEST_HAS_RTTI=0',
'-D_GNU_SOURCE',

View File

@ -37,7 +37,7 @@
#include <src/core/lib/security/credentials/ssl/ssl_credentials.h>
#include <src/core/lib/security/security_connector/ssl_utils.h>
#include <src/core/tsi/ssl_transport_security.cc>
#include <src/core/tsi/ssl_transport_security.h>
#include <src/cpp/server/secure_server_credentials.h>
#include "mongo/base/error_codes.h"

View File

@ -361,20 +361,38 @@ env.CppUnitTest(
],
)
if env['MONGO_ALLOCATOR'] in ['tcmalloc', 'tcmalloc-experimental']:
tcmspEnv = env.Clone()
if not use_system_version_of_library('tcmalloc'):
# Add in the include path for our vendored tcmalloc.
tcmspEnv.InjectThirdParty('gperftools')
tcmallocAttrs = None
for impl in [
{
# Modern standalone tcmalloc (not gperftools)
'options': ['tcmalloc-google'],
'sys_name': 'tcmalloc-google',
'inject': 'tcmalloc',
'cppdefs': ['MONGO_HAVE_GOOGLE_TCMALLOC'],
},
{
# Old gperftools tcmalloc
'options': ['tcmalloc-gperf'],
'sys_name':
'tcmalloc-gperf',
'inject':
'gperftools',
# If our changes to tcmalloc are ever upstreamed, this should become set based on a top
# level configure check, though its effects should still be scoped just to these files.
tcmspEnv.Append(CPPDEFINES=[
'cppdefs': [
'MONGO_HAVE_GPERF_TCMALLOC',
'MONGO_HAVE_GPERFTOOLS_GET_THREAD_CACHE_SIZE',
'MONGO_HAVE_GPERFTOOLS_SIZE_CLASS_STATS',
])
],
},
]:
if env['MONGO_ALLOCATOR'] in impl['options']:
tcmallocAttrs = impl
if tcmallocAttrs:
tcmspEnv = env.Clone()
if not use_system_version_of_library(tcmallocAttrs['sys_name']):
tcmspEnv.InjectThirdParty(tcmallocAttrs['inject'])
tcmspEnv.Append(CPPDEFINES=tcmallocAttrs['cppdefs'])
if not use_system_version_of_library('valgrind'):
# Include valgrind since tcmalloc disables itself while running under valgrind

View File

@ -27,8 +27,6 @@
* it in the license file.
*/
#include <gperftools/malloc_hook.h>
#include <absl/hash/hash.h>
// IWYU pragma: no_include "cxxabi.h"
@ -60,6 +58,7 @@
#include "mongo/logv2/log.h"
#include "mongo/logv2/log_attr.h"
#include "mongo/logv2/log_component.h"
#include "mongo/stdx/unordered_map.h"
#include "mongo/util/stacktrace.h"
#include "mongo/util/tcmalloc_parameters_gen.h"
@ -67,11 +66,54 @@
#include <unistd.h>
#endif
#include <MurmurHash3.h>
#ifdef MONGO_HAVE_GPERF_TCMALLOC
#include <gperftools/malloc_hook.h>
#endif
#ifdef MONGO_HAVE_GOOGLE_TCMALLOC
#include <absl/debugging/symbolize.h>
#include <tcmalloc/malloc_extension.h>
#endif
#if defined(_POSIX_VERSION) && defined(MONGO_CONFIG_HAVE_EXECINFO_BACKTRACE)
#include <dlfcn.h>
#include <execinfo.h>
#endif
#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kDefault
// for dlfcn.h and backtrace
#if defined(_POSIX_VERSION) && defined(MONGO_CONFIG_HAVE_EXECINFO_BACKTRACE)
#if defined(_POSIX_VERSION) && defined(MONGO_CONFIG_HAVE_EXECINFO_BACKTRACE) && \
(defined(MONGO_HAVE_GPERF_TCMALLOC) || defined(MONGO_HAVE_GOOGLE_TCMALLOC))
namespace mongo {
namespace {
/**
 * Simple wrapper for the demangler, particularly its buffer space.
 *
 * The buffer handed to abi::__cxa_demangle is kept between calls and released with free() when
 * the object is destroyed. The object is the sole owner of that buffer, so it is neither
 * copy-constructible nor copy-assignable.
 */
class Demangler {
public:
    Demangler() = default;
    Demangler(const Demangler&) = delete;
    // Deleted together with the copy constructor: an implicit member-wise copy assignment
    // would leave two objects calling free() on the same buffer.
    Demangler& operator=(const Demangler&) = delete;

    ~Demangler() {
        free(_buf);
    }

    /**
     * Demangles `sym`. Returns the demangled name, or nullptr when abi::__cxa_demangle fails.
     * The returned pointer is the internally held buffer: it is reused by the next call and
     * freed when this object is destroyed.
     */
    char* operator()(const char* sym) {
        char* dm = abi::__cxa_demangle(sym, _buf, &_bufSize, &_status);
        if (dm)
            _buf = dm;  // keep whatever buffer the demangler handed back for reuse and cleanup
        return dm;
    }

private:
    size_t _bufSize = 0;
    char* _buf = nullptr;
    int _status = 0;
};
//
// Sampling heap profiler
@ -149,33 +191,6 @@
// and acceptable size overhead for the hash tables.
//
namespace mongo {
namespace {
// Simple wrapper for the demangler, particularly its buffer space.
class Demangler {
public:
Demangler() = default;
Demangler(const Demangler&) = delete;
~Demangler() {
free(_buf);
}
char* operator()(const char* sym) {
char* dm = abi::__cxa_demangle(sym, _buf, &_bufSize, &_status);
if (dm)
_buf = dm;
return dm;
}
private:
size_t _bufSize = 0;
char* _buf = nullptr;
int _status = 0;
};
// TODO SERVER-44010: Consider replacing this custom implementation with a generic one.
//
// Simple hash table maps Key->Value.
@ -201,29 +216,6 @@ using Hash = size_t;
template <class Key, class Value>
class HashTable {
HashTable(const HashTable&) = delete;
HashTable& operator=(const HashTable&) = delete;
private:
struct Entry {
Key key{};
Value value{};
std::atomic<Entry*> next{nullptr}; // NOLINT
std::atomic<bool> valid{false}; // NOLINT
Entry() {}
};
const size_t maxEntries; // we allocate storage for this many entries on creation
std::atomic_size_t numEntries; // number of entries currently in use NOLINT
size_t numBuckets; // number of buckets, computed as numEntries * loadFactor
// pre-allocate buckets and entries
std::unique_ptr<std::atomic<Entry*>[]> buckets; // NOLINT
std::unique_ptr<Entry[]> entries;
std::atomic_size_t nextEntry; // first entry that's never been used NOLINT
Entry* freeEntry; // linked list of entries returned to us by removeEntry
public:
HashTable(size_t maxEntries, int loadFactor)
: maxEntries(maxEntries),
@ -246,17 +238,15 @@ public:
} else if (nextEntry < maxEntries) {
entry = &entries[nextEntry++];
}
if (entry) {
entry->next = buckets[hash].load();
buckets[hash] = entry;
entry->key = key;
entry->value = value;
entry->valid = true; // signal that the entry is well-formed and may be traversed
numEntries++;
return &entry->value;
} else {
if (!entry)
return nullptr;
}
entry->next = buckets[hash].load();
buckets[hash] = entry;
entry->key = key;
entry->value = value;
entry->valid = true; // signal that the entry is well-formed and may be traversed
++numEntries;
return &entry->value;
}
// Find the entry containing Key in the specified hash bucket.
@ -279,7 +269,7 @@ public:
entry->valid = false; // first signal entry is invalid as it may get reused
entry->next = freeEntry;
freeEntry = entry;
numEntries--;
--numEntries;
break;
}
}
@ -292,76 +282,105 @@ public:
// Note however it is not guaranteed to provide snapshot semantics wrt the set of entries,
// and caller must ensure safety wrt concurrent updates to the Value of an entry
template <typename F>
void forEach(F f) {
void forEach(const F& f) {
for (size_t i = 0; i < nextEntry; i++) {
Entry& entry = entries[i];
if (entry.valid) // only traverse well-formed entries
f(entry.key, entry.value);
Entry& e = entries[i];
if (e.valid) // only traverse well-formed entries
f(e.key, e.value);
}
}
// Determines whether the specified hash bucket is empty. May be called concurrently with
// insert() and remove(). Concurrent visibility on other threads is guaranteed because
// buckets[hash] is atomic.
bool isEmptyBucket(Hash hash) {
hash %= numBuckets;
return buckets[hash] == nullptr;
bool isEmptyBucket(Hash hash) const {
return !buckets[hash % numBuckets];
}
// Number of entries.
size_t size() {
size_t size() const {
return numEntries;
}
// Highwater mark of number of entries used, for reporting stats.
size_t maxSizeSeen() {
size_t maxSizeSeen() const {
return nextEntry;
}
// Returns total allocated size of the hash table, for reporting stats.
size_t memorySizeBytes() {
size_t memorySizeBytes() const {
return numBuckets * sizeof(buckets[0]) + maxEntries * sizeof(entries[0]);
}
private:
struct Entry {
Key key{};
Value value{};
std::atomic<Entry*> next{nullptr}; // NOLINT
std::atomic<bool> valid{false}; // NOLINT
};
const size_t maxEntries; // we allocate storage for this many entries on creation
std::atomic_size_t numEntries; // number of entries currently in use NOLINT
size_t numBuckets; // number of buckets, computed as numEntries * loadFactor
// pre-allocate buckets and entries
std::unique_ptr<std::atomic<Entry*>[]> buckets; // NOLINT
std::unique_ptr<Entry[]> entries;
std::atomic_size_t nextEntry; // first entry that's never been used NOLINT
Entry* freeEntry; // linked list of entries returned to us by removeEntry
};
namespace heap_profiler_detail_gperf_tcmalloc {
class HeapProfiler {
public:
static inline HeapProfiler* heapProfiler;
HeapProfiler() {
// Set sample interval from the parameter.
sampleIntervalBytes = HeapProfilingSampleIntervalBytes;
// This is our only allocator dependency - ifdef and change as
// appropriate for other allocators, using hooks or shims.
// For tcmalloc we skip two frames that are internal to the allocator
// so that the top frame is the public tc_* function.
skipStartFrames = 2;
skipEndFrames = 0;
#ifdef MONGO_HAVE_GPERF_TCMALLOC
MallocHook::AddNewHook(+[](const void* p, size_t sz) { heapProfiler->_alloc(p, sz); });
MallocHook::AddDeleteHook(+[](const void* p) { heapProfiler->_free(p); });
#endif
}
static void generateServerStatusSection(BSONObjBuilder& builder) {
if (heapProfiler)
heapProfiler->_generateServerStatusSection(builder);
}
static void start() {
heapProfiler = new HeapProfiler();
}
private:
// 0: sampling internally disabled
// 1: sample every allocation - byte accurate but slow and big
// >1: sample every sampleIntervalBytes bytes allocated - less accurate but fast and small
std::atomic_size_t sampleIntervalBytes; // NOLINT
// guards updates to both object and stack hash tables
stdx::mutex hashtable_mutex; // NOLINT
// guards against races updating the StackInfo bson representation
stdx::mutex stackinfo_mutex; // NOLINT
// cumulative bytes allocated - determines when samples are taken
std::atomic_size_t bytesAllocated{0}; // NOLINT
// estimated currently active bytes - sum of activeBytes for all stacks
size_t totalActiveBytes = 0;
//
// Hash table of stacks
//
using FrameInfo = void*; // per-frame information is just the IP
static const int kMaxStackInfos = 20000; // max number of unique call sites we handle
static const int kStackHashTableLoadFactor = 2; // keep loading <50%
static const size_t kMaxFramesPerStack = 100; // max depth of stack
static const int kMaxObjInfos = 1024 * 1024; // maximum tracked allocations
static const int kObjHashTableLoadFactor = 4; // keep hash table loading <25%
static const int kMaxImportantSamples = 4 * 3600; // reset every 4 hours at 1Hz
// stack HashTable Key
struct Stack {
size_t numFrames = 0;
std::array<FrameInfo, kMaxFramesPerStack> frames;
Stack() {}
Stack() = default;
bool operator==(const Stack& that) {
return this->numFrames == that.numFrames &&
std::equal(frames.begin(), frames.begin() + numFrames, that.frames.begin());
friend bool operator==(const Stack& a, const Stack& b) {
return a.numFrames == b.numFrames &&
std::equal(a.frames.begin(), a.frames.begin() + a.numFrames, b.frames.begin());
}
Hash hash() {
@ -371,61 +390,51 @@ private:
numFrames * sizeof(FrameInfo)};
return absl::HashOf(dataRange);
}
size_t numFrames = 0;
std::array<FrameInfo, kMaxFramesPerStack> frames;
};
// Stack HashTable Value.
struct StackInfo {
StackInfo() = default;
explicit StackInfo(int stackNum) : stackNum(stackNum) {}
int stackNum = 0; // used for stack short name
size_t activeBytes = 0; // number of live allocated bytes charged to this stack
bool logged = false; // true when stack has been logged once.
explicit StackInfo(int stackNum) : stackNum(stackNum) {}
StackInfo() {}
};
// The stack HashTable itself.
HashTable<Stack, StackInfo> stackHashTable{kMaxStackInfos, kStackHashTableLoadFactor};
struct ByPointeeStackNum {
bool operator()(const StackInfo* a, const StackInfo* b) const {
return a->stackNum < b->stackNum;
}
};
// frames to skip at top and bottom of backtrace when reporting stacks
size_t skipStartFrames = 0;
size_t skipEndFrames = 0;
//
// Hash table of allocated objects.
//
static const int kMaxObjInfos = 1024 * 1024; // maximum tracked allocations
static const int kObjHashTableLoadFactor = 4; // keep hash table loading <25%
// Obj HashTable Key.
struct Obj {
const void* objPtr = nullptr;
Obj() = default;
explicit Obj(const void* objPtr) : objPtr(objPtr) {}
Obj() {}
bool operator==(const Obj& that) {
return this->objPtr == that.objPtr;
friend bool operator==(const Obj& a, const Obj& b) {
return a.objPtr == b.objPtr;
}
Hash hash() {
return absl::HashOf(objPtr);
}
const void* objPtr = nullptr;
};
// Obj HashTable Value.
struct ObjInfo {
size_t accountedLen = 0;
StackInfo* stackInfo = nullptr;
ObjInfo() = default;
ObjInfo(size_t accountedLen, StackInfo* stackInfo)
: accountedLen(accountedLen), stackInfo(stackInfo) {}
ObjInfo() {}
size_t accountedLen = 0;
StackInfo* stackInfo = nullptr;
};
// The obj HashTable itself.
HashTable<Obj, ObjInfo> objHashTable{kMaxObjInfos, kObjHashTableLoadFactor};
// If we encounter an error that doesn't allow us to proceed, for
// example out of space for new hash table entries, we internally
// disable profiling and then log an error message.
@ -562,26 +571,6 @@ private:
"stackObj"_attr = builder.done());
}
//
// Generate serverStatus section.
//
bool logGeneralStats = true; // first time only
// In order to reduce load on ftdc we track the stacks we deem important enough to emit
// once a stack is deemed "important" it remains important from that point on.
// "Important" is a sticky quality to improve the stability of the set of stacks we emit,
// and we always emit them in stackNum order, greatly improving ftdc compression efficiency.
struct ImportantStacksOrder {
bool operator()(const StackInfo* a, const StackInfo* b) const {
return a->stackNum < b->stackNum;
}
};
std::set<const StackInfo*, ImportantStacksOrder> importantStacks;
int numImportantSamples = 0; // samples currently included in importantStacks
const int kMaxImportantSamples = 4 * 3600; // reset every 4 hours at default 1 sample / sec
void _generateServerStatusSection(BSONObjBuilder& builder) {
// compute and log some informational stats first time through
if (logGeneralStats) {
@ -681,44 +670,218 @@ private:
}
}
//
// Static hooks to give to the allocator.
//
// 0: sampling internally disabled
// 1: sample every allocation - byte accurate but slow and big
// >1: sample every sampleIntervalBytes bytes allocated - less accurate but fast and small
std::atomic_size_t sampleIntervalBytes; // NOLINT
static void alloc(const void* obj, size_t objLen) {
heapProfiler->_alloc(obj, objLen);
}
// guards updates to both object and stack hash tables
stdx::mutex hashtable_mutex; // NOLINT
// guards against races updating the StackInfo bson representation
stdx::mutex stackinfo_mutex; // NOLINT
static void free(const void* obj) {
heapProfiler->_free(obj);
}
// cumulative bytes allocated - determines when samples are taken
std::atomic_size_t bytesAllocated{0}; // NOLINT
// estimated currently active bytes - sum of activeBytes for all stacks
size_t totalActiveBytes = 0;
// The stack HashTable itself.
HashTable<Stack, StackInfo> stackHashTable{kMaxStackInfos, kStackHashTableLoadFactor};
// frames to skip at top and bottom of backtrace when reporting stacks
size_t skipStartFrames = 0;
size_t skipEndFrames = 0;
// The obj HashTable itself.
HashTable<Obj, ObjInfo> objHashTable{kMaxObjInfos, kObjHashTableLoadFactor};
bool logGeneralStats = true; // first time only
// In order to reduce load on ftdc we track the stacks we deem important enough to emit;
// once a stack is deemed "important" it remains important from that point on.
// "Important" is a sticky quality to improve the stability of the set of stacks we emit,
// and we always emit them in stackNum order, greatly improving ftdc compression efficiency.
std::set<const StackInfo*, ByPointeeStackNum> importantStacks;
int numImportantSamples = 0; // samples currently included in importantStacks
};
} // namespace heap_profiler_detail_gperf_tcmalloc
namespace heap_profiler_detail_tcmalloc {
#ifdef MONGO_HAVE_GOOGLE_TCMALLOC
class HeapProfiler {
public:
static HeapProfiler* heapProfiler;
static const int kMaxImportantSamples = 4 * 3600; // reset every 4 hours at 1Hz
static inline HeapProfiler* heapProfiler;
HeapProfiler() {
// Set sample interval from the parameter.
sampleIntervalBytes = HeapProfilingSampleIntervalBytes;
// This is our only allocator dependency - ifdef and change as
// appropriate for other allocators, using hooks or shims.
// For tcmalloc we skip two frames that are internal to the allocator
// so that the top frame is the public tc_* function.
skipStartFrames = 2;
skipEndFrames = 0;
MallocHook::AddNewHook(alloc);
MallocHook::AddDeleteHook(free);
tcmalloc::MallocExtension::SetProfileSamplingRate(sampleIntervalBytes);
auto profileToken = tcmalloc::MallocExtension::StartAllocationProfiling();
profileTokens.push_back(std::move(profileToken));
}
static void generateServerStatusSection(BSONObjBuilder& builder) {
if (heapProfiler)
heapProfiler->_generateServerStatusSection(builder);
}
};
//
// serverStatus section
//
static void start() {
heapProfiler = new HeapProfiler();
}
private:
/**
 * Per-stack bookkeeping. Constructing one symbolizes every frame of the sample and logs the
 * resulting array once, tagged with the given stack number.
 */
struct StackInfo {
    StackInfo(const tcmalloc::Profile::Sample& stackSample, int id)
        : stackNum(id), numFrames(stackSample.depth) {
        // One string per frame: the symbol name when absl::Symbolize succeeds, otherwise the
        // formatted frame address.
        BSONArrayBuilder builder;
        for (int idx = 0; idx < stackSample.depth; ++idx) {
            char symbol[256];
            std::string frameName;
            if (absl::Symbolize(stackSample.stack[idx], symbol, sizeof(symbol))) {
                frameName.assign(symbol);
            } else {
                frameName = fmt::format("{}", stackSample.stack[idx]);
            }
            builder.append(frameName);
        }

        LOGV2(8592501,
              "heapProfile stack",
              "stackNum"_attr = stackNum,
              "stackObj"_attr = builder.obj());
    }

    int stackNum = 0;  // used for stack short name
    BSONObj stackObj;  // symbolized representation (not assigned by the constructor)
    int numFrames = 0;
    uint64_t activeBytes = 0;
};
struct ByStackNum {
bool operator()(StackInfo* a, StackInfo* b) const {
return a->stackNum < b->stackNum;
}
};
// Hashes the sample's frame addresses (depth * sizeof(void*) bytes) with MurmurHash3_x86_32,
// seed 0, and returns the 32-bit result.
uint32_t StackHash(const tcmalloc::Profile::Sample& stackSample) {
    uint32_t digest;
    MurmurHash3_x86_32(stackSample.stack, stackSample.depth * sizeof(void*), 0, &digest);
    return digest;
}
// Appends the heap profiler's serverStatus content to `builder`:
//   - "stats": totalActiveBytes, bytesAllocated and numStacks
//   - "stacks": one "stack<N>" subobject (with activeBytes) per entry of importantStacks
// Each call also stops the current allocation-profiling token and starts a new one.
void _generateServerStatusSection(BSONObjBuilder& builder) {
// First call only: log the configured sample interval and a stack trace.
if (logGeneralStats) {
LOGV2(8592504,
"Generating heap profiler serverStatus",
"heapProfilingSampleIntervalBytes"_attr = HeapProfilingSampleIntervalBytes);
LOGV2(8592503, "Following stack trace is for heap profiler informational purposes");
printStackTrace();
logGeneralStats = false;
}
// Get a live snapshot profile of the current heap usage.
// stackInfos collects each distinct stack seen in this snapshot once; activeStacks is the
// membership set used to detect a stack that appears in more than one sample.
int64_t totalActiveBytes = 0;
std::vector<StackInfo*> stackInfos;
std::set<StackInfo*, ByStackNum> activeStacks;
tcmalloc::MallocExtension::SnapshotCurrent(tcmalloc::ProfileType::kHeap)
.Iterate([&](const auto& sample) {
totalActiveBytes += sample.sum;
// Compute backtrace hash of sample stack
// NOTE(review): a stack is identified solely by this 32-bit hash, so two different
// stacks with equal hashes would share one StackInfo.
uint32_t stackHash = StackHash(sample);
StackInfo* stackInfo = stackInfoMap[stackHash];
// If this is a new stack, store in our stack map
// NOTE(review): if operator[] above default-inserts the key (as std::unordered_map does),
// size() below already counts the new entry - confirm the intended stack numbering.
if (!stackInfo) {
stackInfo = new StackInfo(sample, stackInfoMap.size());
stackInfoMap[stackHash] = stackInfo;
}
// First sample for this stack in the snapshot resets activeBytes; later samples add to it.
auto activeStackSearch = activeStacks.find(stackInfo);
if (activeStackSearch != activeStacks.end()) {
stackInfo->activeBytes += sample.sum;
} else {
activeStacks.insert(stackInfo);
stackInfos.push_back(stackInfo);
stackInfo->activeBytes = sample.sum;
}
});
// Get the series of allocation samples to this point
auto currentToken = std::move(profileTokens.back());
profileTokens.pop_back();
auto allocProfile = std::move(currentToken).Stop();
// Start a new allocation profile session for the next invocation
auto newToken = tcmalloc::MallocExtension::StartAllocationProfiling();
profileTokens.push_back(std::move(newToken));
// Sum all the allocations performed (of what we sampled)
// and fold them into the running sampleBytesAllocated total.
int64_t allocatedBytes = 0;
allocProfile.Iterate(
[&](const tcmalloc::Profile::Sample& sample) { allocatedBytes += sample.sum; });
sampleBytesAllocated += allocatedBytes;
BSONObjBuilder(builder.subobjStart("stats"))
.appendNumber("totalActiveBytes", static_cast<long long>(totalActiveBytes))
.appendNumber("bytesAllocated", static_cast<long long>(sampleBytesAllocated))
.appendNumber("numStacks", static_cast<long long>(stackInfoMap.size()));
// Sort the stacks by descending activeBytes and take stacks from the front until their
// cumulative activeBytes exceeds 99% of the total; every stack taken is added to
// importantStacks.
std::stable_sort(stackInfos.begin(), stackInfos.end(), [](StackInfo* a, StackInfo* b) {
return a->activeBytes > b->activeBytes;
});
size_t threshold = totalActiveBytes * 0.99;
size_t cumulative = 0;
for (auto&& stackInfo : stackInfos) {
importantStacks.insert(stackInfo);
cumulative += stackInfo->activeBytes;
if (cumulative > threshold)
break;
}
// Build the stacks subsection by emitting every stack currently in importantStacks.
// NOTE(review): activeBytes is only rewritten for stacks present in this snapshot, so an
// important stack that is absent from it appears to report its previous value - confirm.
{
BSONObjBuilder stacks(builder.subobjStart("stacks"));
for (auto&& stackInfo : importantStacks)
BSONObjBuilder{stacks.subobjStart(fmt::format("stack{}", stackInfo->stackNum))}
.appendNumber("activeBytes", static_cast<long long>(stackInfo->activeBytes));
}
// importantStacks grows monotonically and can accumulate unneeded stacks,
// so we clear it after kMaxImportantSamples calls.
if (++numImportantSamples >= kMaxImportantSamples) {
LOGV2(8592502, "Clearing importantStacks");
importantStacks.clear();
numImportantSamples = 0;
}
}
std::vector<tcmalloc::MallocExtension::AllocationProfilingToken> profileTokens;
std::atomic_size_t sampleIntervalBytes;
std::atomic_size_t sampleBytesAllocated{0};
bool logGeneralStats = true; // first time only
stdx::unordered_map<uint32_t, StackInfo*> stackInfoMap;
// In order to reduce load on ftdc we track the stacks we deem important enough to emit;
// once a stack is deemed "important" it remains important from that point on.
// "Important" is a sticky quality to improve the stability of the set of stacks we emit,
// and we always emit them in stackNum order, greatly improving ftdc compression efficiency.
std::set<StackInfo*, ByStackNum> importantStacks;
int numImportantSamples = 0; // samples currently included in importantStacks
};
#endif // MONGO_HAVE_GOOGLE_TCMALLOC
} // namespace heap_profiler_detail_tcmalloc
#if defined(MONGO_HAVE_GOOGLE_TCMALLOC)
using heap_profiler_detail_tcmalloc::HeapProfiler;
#elif defined(MONGO_HAVE_GPERF_TCMALLOC)
using heap_profiler_detail_gperf_tcmalloc::HeapProfiler;
#endif
class HeapProfilerServerStatusSection final : public ServerStatusSection {
public:
@ -728,27 +891,26 @@ public:
return HeapProfilingEnabled;
}
BSONObj generateSection(OperationContext* opCtx,
const BSONElement& configElement) const override {
BSONObj generateSection(OperationContext*, const BSONElement&) const override {
BSONObjBuilder builder;
HeapProfiler::generateServerStatusSection(builder);
return builder.obj();
}
} heapProfilerServerStatusSection;
};
//
// startup
//
#ifdef MONGO_HAVE_HEAP_PROFILER
HeapProfiler* HeapProfiler::heapProfiler;
HeapProfilerServerStatusSection heapProfilerServerStatusSection;
MONGO_INITIALIZER_GENERAL(StartHeapProfiling, ("EndStartupOptionHandling"), ("default"))
(InitializerContext* context) {
(InitializerContext*) {
if (HeapProfilingEnabled)
HeapProfiler::heapProfiler = new HeapProfiler();
HeapProfiler::start();
}
#endif // MONGO_HAVE_HEAP_PROFILER
} // namespace
} // namespace mongo
#endif // MONGO_HAVE_HEAP_PROFILER
#endif //_POSIX_VERSION

View File

@ -28,15 +28,17 @@
*/
#include "mongo/base/string_data_comparator.h"
#ifdef _WIN32
#define NVALGRIND
#endif
#include <cstddef>
#include <gperftools/malloc_extension.h>
#include <memory>
#include <utility>
#include <valgrind/valgrind.h>
#include <boost/optional/optional.hpp>
#include "mongo/base/error_codes.h"
@ -51,6 +53,14 @@
#include "mongo/db/tenant_id.h"
#include "mongo/util/tcmalloc_parameters_gen.h"
#ifdef MONGO_HAVE_GOOGLE_TCMALLOC
#include <tcmalloc/malloc_extension.h>
auto static tcmallocProperties = tcmalloc::MallocExtension::GetProperties();
#elif defined(MONGO_HAVE_GPERF_TCMALLOC)
#include <gperftools/malloc_extension.h>
auto static mallocExtensionAPI = MallocExtension::instance();
#endif
#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kDefault
@ -93,74 +103,89 @@ public:
BSONObjBuilder builder;
#ifdef MONGO_HAVE_GOOGLE_TCMALLOC
// Take a fresh snapshot of the allocator properties for this invocation. The file-scope
// `tcmallocProperties` map is populated once during static initialization, so reading from it
// would report the values captured at that time instead of the current ones.
const auto currentProperties = tcmalloc::MallocExtension::GetProperties();
#endif
// Looks up a numeric allocator property by its full name (e.g. "generic.heap_size").
// Returns the value when the active allocator reports the property, boost::none otherwise
// (including when no supported tcmalloc flavor is compiled in).
auto getValueIfExists = [&](StringData property) -> boost::optional<size_t> {
#ifdef MONGO_HAVE_GOOGLE_TCMALLOC
    if (auto it = currentProperties.find(property.toString());
        it != currentProperties.end()) {
        return {it->second.value};
    }
#elif defined(MONGO_HAVE_GPERF_TCMALLOC)
    size_t value = 0;
    // GetNumericProperty() takes a C string; go through toString() so the name is
    // NUL-terminated even if `property` views into the middle of a larger buffer.
    if (mallocExtensionAPI->GetNumericProperty(property.toString().c_str(), &value)) {
        return {value};
    }
#endif
    return boost::none;
};
auto tryAppend = [&](BSONObjBuilder& builder, StringData bsonName, StringData property) {
if (auto value = getValueIfExists(property); !!value) {
builder.appendNumber(bsonName, static_cast<long long>(*value));
}
};
auto tryStat = [&](BSONObjBuilder& builder, StringData topic, StringData base) {
tryAppend(builder, base, fmt::format("{}.{}", topic, base));
};
// For a list of properties see the "Generic Tcmalloc Status" section of
// http://google-perftools.googlecode.com/svn/trunk/doc/tcmalloc.html and
// http://code.google.com/p/gperftools/source/browse/src/gperftools/malloc_extension.h
{
BSONObjBuilder sub(builder.subobjStart("generic"));
appendNumericPropertyIfAvailable(
sub, "current_allocated_bytes", "generic.current_allocated_bytes");
appendNumericPropertyIfAvailable(sub, "heap_size", "generic.heap_size");
tryStat(sub, "generic", "current_allocated_bytes");
tryStat(sub, "generic", "heap_size");
}
{
BSONObjBuilder sub(builder.subobjStart("tcmalloc"));
auto tryTc = [&](StringData key) {
tryStat(sub, "tcmalloc", key);
};
appendNumericPropertyIfAvailable(
sub, "pageheap_free_bytes", "tcmalloc.pageheap_free_bytes");
appendNumericPropertyIfAvailable(
sub, "pageheap_unmapped_bytes", "tcmalloc.pageheap_unmapped_bytes");
appendNumericPropertyIfAvailable(
sub, "max_total_thread_cache_bytes", "tcmalloc.max_total_thread_cache_bytes");
appendNumericPropertyIfAvailable(sub,
"current_total_thread_cache_bytes",
"tcmalloc.current_total_thread_cache_bytes");
// Not including tcmalloc.slack_bytes since it is deprecated.
tryTc("pageheap_free_bytes");
tryTc("pageheap_unmapped_bytes");
tryTc("max_total_thread_cache_bytes");
tryTc("current_total_thread_cache_bytes");
// Calculate total free bytes, *excluding the page heap*
size_t central;
size_t transfer;
size_t thread;
if (MallocExtension::instance()->GetNumericProperty("tcmalloc.central_cache_free_bytes",
&central) &&
MallocExtension::instance()->GetNumericProperty(
"tcmalloc.transfer_cache_free_bytes", &transfer) &&
MallocExtension::instance()->GetNumericProperty("tcmalloc.thread_cache_free_bytes",
&thread)) {
sub.appendNumber("total_free_bytes",
static_cast<long long>(central) +
static_cast<long long>(transfer) +
static_cast<long long>(thread));
{
long long total = 0;
if (auto central = getValueIfExists("tcmalloc.central_cache_free"); !!central) {
sub.appendNumber("central_cache_free_bytes", static_cast<long long>(*central));
total += *central;
}
if (auto transfer = getValueIfExists("tcmalloc.transfer_cache_free"); !!transfer) {
sub.appendNumber("transfer_cache_free_bytes",
static_cast<long long>(*transfer));
total += *transfer;
}
if (auto thread = getValueIfExists("tcmalloc.thread_cache_free"); !!thread) {
sub.appendNumber("thread_cache_free_bytes", static_cast<long long>(*thread));
total += *thread;
}
if (auto cpu = getValueIfExists("tcmalloc.cpu_free"); !!cpu) {
sub.appendNumber("cpu_cache_free_bytes", static_cast<long long>(*cpu));
total += *cpu;
}
sub.appendNumber("total_free_bytes", total);
}
appendNumericPropertyIfAvailable(
sub, "central_cache_free_bytes", "tcmalloc.central_cache_free_bytes");
appendNumericPropertyIfAvailable(
sub, "transfer_cache_free_bytes", "tcmalloc.transfer_cache_free_bytes");
appendNumericPropertyIfAvailable(
sub, "thread_cache_free_bytes", "tcmalloc.thread_cache_free_bytes");
appendNumericPropertyIfAvailable(
sub, "aggressive_memory_decommit", "tcmalloc.aggressive_memory_decommit");
appendNumericPropertyIfAvailable(
sub, "pageheap_committed_bytes", "tcmalloc.pageheap_committed_bytes");
appendNumericPropertyIfAvailable(
sub, "pageheap_scavenge_count", "tcmalloc.pageheap_scavenge_count");
appendNumericPropertyIfAvailable(
sub, "pageheap_commit_count", "tcmalloc.pageheap_commit_count");
appendNumericPropertyIfAvailable(
sub, "pageheap_total_commit_bytes", "tcmalloc.pageheap_total_commit_bytes");
appendNumericPropertyIfAvailable(
sub, "pageheap_decommit_count", "tcmalloc.pageheap_decommit_count");
appendNumericPropertyIfAvailable(
sub, "pageheap_total_decommit_bytes", "tcmalloc.pageheap_total_decommit_bytes");
appendNumericPropertyIfAvailable(
sub, "pageheap_reserve_count", "tcmalloc.pageheap_reserve_count");
appendNumericPropertyIfAvailable(
sub, "pageheap_total_reserve_bytes", "tcmalloc.pageheap_total_reserve_bytes");
appendNumericPropertyIfAvailable(
sub, "spinlock_total_delay_ns", "tcmalloc.spinlock_total_delay_ns");
tryTc("aggressive_memory_decommit");
auto tcmallocReleaseRate = MallocExtension::instance()->GetMemoryReleaseRate();
sub.appendNumber("release_rate", tcmallocReleaseRate);
tryTc("pageheap_committed_bytes");
tryTc("pageheap_scavenge_count");
tryTc("pageheap_commit_count");
tryTc("pageheap_total_commit_bytes");
tryTc("pageheap_decommit_count");
tryTc("pageheap_total_decommit_bytes");
tryTc("pageheap_reserve_count");
tryTc("pageheap_total_reserve_bytes");
tryTc("spinlock_total_delay_ns");
#ifdef MONGO_HAVE_GOOGLE_TCMALLOC
sub.appendNumber(
"release_rate",
static_cast<long long>(tcmalloc::MallocExtension::GetBackgroundReleaseRate()));
#endif
#if MONGO_HAVE_GPERFTOOLS_SIZE_CLASS_STATS
if (verbosity >= 2) {
@ -170,31 +195,25 @@ public:
// Size classes and page heap info is dumped in 1 call so that the performance
// sensitive tcmalloc page heap lock is only taken once
MallocExtension::instance()->SizeClasses(
&builders, appendSizeClassInfo, appendPageHeapInfo);
mallocExtensionAPI->SizeClasses(&builders, appendSizeClassInfo, appendPageHeapInfo);
builders.first.done();
builder.append("page_heap", builders.second.arr());
}
#endif
#ifdef MONGO_HAVE_GOOGLE_TCMALLOC
builder.append("formattedString", tcmalloc::MallocExtension::GetStats());
#elif defined(MONGO_HAVE_GPERF_TCMALLOC)
char buffer[4096];
MallocExtension::instance()->GetStats(buffer, sizeof buffer);
mallocExtensionAPI->GetStats(buffer, sizeof buffer);
builder.append("formattedString", buffer);
#endif
}
return builder.obj();
}
private:
static void appendNumericPropertyIfAvailable(BSONObjBuilder& builder,
StringData bsonName,
const char* property) {
size_t value;
if (MallocExtension::instance()->GetNumericProperty(property, &value))
builder.appendNumber(bsonName, static_cast<long long>(value));
}
#if MONGO_HAVE_GPERFTOOLS_SIZE_CLASS_STATS
static void appendSizeClassInfo(void* bsonarr_builder, const base::MallocSizeClass* stats) {
BSONArrayBuilder& builder =

View File

@ -33,7 +33,6 @@
#include <algorithm>
#include <cstdlib>
#include <gperftools/malloc_extension.h>
#include <limits>
#include <string>
#include <valgrind/valgrind.h>
@ -58,11 +57,23 @@
#include "mongo/util/str.h"
#include "mongo/util/tcmalloc_parameters_gen.h"
#ifdef MONGO_HAVE_GOOGLE_TCMALLOC
#include <tcmalloc/malloc_extension.h>
#elif defined(MONGO_HAVE_GPERF_TCMALLOC)
#include <gperftools/malloc_extension.h>
#endif
namespace mongo {
namespace {
// Converts a StringData into an absl::string_view covering the same bytes (pointer and
// length are forwarded unchanged; no copy is made).
constexpr absl::string_view toStringView(StringData s) {
    return absl::string_view(s.rawData(), s.size());
}
constexpr auto kMaxTotalThreadCacheBytesPropertyName = "tcmalloc.max_total_thread_cache_bytes"_sd;
constexpr auto kAggressiveMemoryDecommitPropertyName = "tcmalloc.aggressive_memory_decommit"_sd;
#if defined(MONGO_HAVE_GPERF_TCMALLOC)
StatusWith<size_t> getProperty(StringData propname) {
size_t value;
if (!MallocExtension::instance()->GetNumericProperty(propname.toString().c_str(), &value)) {
@ -81,6 +92,66 @@ Status setProperty(StringData propname, size_t value) {
}
return Status::OK();
}
#endif
// Applies `cacheSize` as the allocator's "max total thread cache bytes" limit.
// With Google tcmalloc the dedicated MallocExtension setter is called; with gperftools
// the value is written through setProperty() and a failure is raised via uassertStatusOK().
// When neither allocator macro is defined this is a no-op.
void setMaxTotalThreadCacheBytes(size_t cacheSize) {
#ifdef MONGO_HAVE_GOOGLE_TCMALLOC
    tcmalloc::MallocExtension::SetMaxTotalThreadCacheBytes(cacheSize);
#elif defined(MONGO_HAVE_GPERF_TCMALLOC)
    const Status outcome = setProperty(kMaxTotalThreadCacheBytesPropertyName, cacheSize);
    uassertStatusOK(outcome);
#endif
}
#ifdef MONGO_HAVE_GOOGLE_TCMALLOC
// Implement abstraction for the differences between gperftools and new tcmalloc.
// Out-parameter wrapper around tcmalloc::MallocExtension::GetNumericProperty().
// Stores the property value in `*val` and returns true when the allocator reports `key`;
// returns false and leaves `*val` untouched otherwise.
bool getNumericProperty(absl::string_view key, size_t* val) {
    if (auto found = tcmalloc::MallocExtension::GetNumericProperty(key)) {
        *val = *found;
        return true;
    }
    return false;
}
// Reads the numeric tcmalloc property named `propname`.
// Returns the value on success, or an InternalError status when getNumericProperty()
// reports that the property could not be read.
StatusWith<size_t> getProperty(StringData propname) {
    size_t value = 0;
    // Pass a view over the caller's bytes instead of materializing a std::string and
    // re-scanning it as a C string.
    if (!getNumericProperty(toStringView(propname), &value)) {
        return {ErrorCodes::InternalError,
                str::stream() << "Failed to retreive tcmalloc prop: " << propname};
    }
    return value;
}
// Sets a numeric allocator property by name. The only name handled here is the
// max-total-thread-cache-bytes property, which is forwarded to
// setMaxTotalThreadCacheBytes(); any other key is rejected by returning false.
bool setNumericProperty(absl::string_view key, size_t val) {
    if (key != toStringView(kMaxTotalThreadCacheBytesPropertyName)) {
        return false;
    }
    setMaxTotalThreadCacheBytes(val);
    return true;
}
// Sets the numeric tcmalloc property `propname` to `value`.
// When running under Valgrind nothing is set and OK is returned. Otherwise returns OK on
// success, or an InternalError status when setNumericProperty() rejects the property.
Status setProperty(StringData propname, size_t value) {
    if (RUNNING_ON_VALGRIND) {  // NOLINT
        return Status::OK();
    }
    // Pass a view over the caller's bytes instead of materializing a std::string and
    // re-scanning it as a C string.
    if (!setNumericProperty(toStringView(propname), value)) {
        return {ErrorCodes::InternalError,
                str::stream() << "Failed to set internal tcmalloc property " << propname};
    }
    return Status::OK();
}
// Returns tcmalloc's current background release rate as a plain integer.
// The value comes from MallocExtension::GetBackgroundReleaseRate() and is converted
// explicitly to the declared return type.
long long getMemoryReleaseRate() {
    return static_cast<long long>(tcmalloc::MallocExtension::GetBackgroundReleaseRate());
}
bool setMemoryReleaseRate(size_t val) {
tcmalloc::MallocExtension::SetBackgroundReleaseRate(
tcmalloc::MallocExtension::BytesPerSecond{val});
return true;
}
#endif
StatusWith<size_t> validateTCMallocValue(StringData name, const BSONElement& newValueElement) {
if (!newValueElement.isNumber()) {
@ -152,7 +223,20 @@ MONGO_INITIALIZER_GENERAL(TcmallocConfigurationDefaults, (), ("BeginStartupOptio
(systemMemorySizeMB / 8) * 1024 * 1024; // 1/8 of system memory in bytes
size_t cacheSize = std::min(defaultTcMallocCacheSize, derivedTcMallocCacheSize);
uassertStatusOK(setProperty(kMaxTotalThreadCacheBytesPropertyName, cacheSize));
setMaxTotalThreadCacheBytes(cacheSize);
#if defined(MONGO_HAVE_GOOGLE_TCMALLOC)
size_t numCores = pi.getNumAvailableCores();
// 1024MB in bytes spread across cores.
size_t defaultTcMallocPerCPUCacheSize = (1024 * 1024 * 1024) / numCores;
size_t derivedTcMallocPerCPUCacheSize =
((systemMemorySizeMB / 8) * 2 * 1024 * 1024) / numCores; // 1/4 of system memory in bytes
size_t perCPUCacheSize =
std::min(defaultTcMallocPerCPUCacheSize, derivedTcMallocPerCPUCacheSize);
tcmalloc::MallocExtension::SetMaxPerCpuCacheSize(perCPUCacheSize);
#endif // MONGO_HAVE_GOOGLE_TCMALLOC
}
} // namespace
@ -162,7 +246,11 @@ void TCMallocReleaseRateServerParameter::append(OperationContext*,
BSONObjBuilder* builder,
StringData fieldName,
const boost::optional<TenantId>&) {
#if defined(MONGO_HAVE_GOOGLE_TCMALLOC)
auto value = getMemoryReleaseRate();
#elif defined(MONGO_HAVE_GPERF_TCMALLOC)
auto value = MallocExtension::instance()->GetMemoryReleaseRate();
#endif
builder->append(fieldName, value);
}
@ -178,8 +266,11 @@ Status TCMallocReleaseRateServerParameter::setFromString(StringData tcmalloc_rel
str::stream() << "tcmallocReleaseRate cannot be negative: "
<< tcmalloc_release_rate};
}
#if defined(MONGO_HAVE_GOOGLE_TCMALLOC)
setMemoryReleaseRate(value);
#elif defined(MONGO_HAVE_GPERF_TCMALLOC)
MallocExtension::instance()->SetMemoryReleaseRate(value);
#endif
return Status::OK();
}

View File

@ -83,10 +83,15 @@ def injectMozJS(thisEnv):
env.AddMethod(injectMozJS, 'InjectMozJS')
if not use_system_version_of_library('tcmalloc'):
if not use_system_version_of_library('tcmalloc-gperf'):
# GPerftools does this slightly differently than the others.
thirdPartyEnvironmentModifications['gperftools'] = {}
if not use_system_version_of_library('tcmalloc-google'):
thirdPartyEnvironmentModifications['tcmalloc'] = {
'CPPPATH': ['#/src/third_party/tcmalloc/dist'],
}
if not use_system_version_of_library('pcre2'):
thirdPartyEnvironmentModifications['pcre2'] = {
'CPPPATH': ['#/src/third_party/pcre2/src'],
@ -422,6 +427,12 @@ boostEnv.ShimLibrary(name="boost")
abseilDirectory = 'abseil-cpp'
abseilEnv = env.Clone()
# We can't depend on the allocator if we are using tcmalloc, as it depends
# on us (abseil-cpp)
if abseilEnv['MONGO_ALLOCATOR'] in ['tcmalloc-google']:
abseilEnv = abseilEnv.Clone(LIBDEPS_NO_INHERIT=['$BUILD_DIR/third_party/shim_allocator'])
abseilEnv.InjectThirdParty(libraries=['abseil-cpp'])
abseilEnv.SConscript(abseilDirectory + '/SConscript', exports={'env': abseilEnv})
abseilEnv = abseilEnv.Clone(LIBDEPS_INTERFACE=[
@ -510,17 +521,34 @@ if "tom" in env["MONGO_CRYPTO"]:
tomcryptEnv.ShimLibrary(name="tomcrypt", )
gperftoolsEnv = env.Clone(LIBDEPS_NO_INHERIT=[
# tcmallocEnv implements this shim, so it rejects the implicit dependency.
tcmallocEnv = env.Clone(LIBDEPS_NO_INHERIT=[
'$BUILD_DIR/third_party/shim_allocator',
], )
if gperftoolsEnv['MONGO_ALLOCATOR'] in ["tcmalloc", "tcmalloc-experimental"]:
if use_system_version_of_library("tcmalloc"):
gperftoolsEnv = gperftoolsEnv.Clone(SYSLIBDEPS=[
if tcmallocEnv['MONGO_ALLOCATOR'] in ["tcmalloc-google"]:
if use_system_version_of_library("tcmalloc-google"):
tcmallocEnv = tcmallocEnv.Clone(SYSLIBDEPS=[
env['LIBDEPS_TCMALLOC_SYSLIBDEP'],
])
else:
gperftoolsEnv = gperftoolsEnv.Clone()
gperftoolsEnv.InjectThirdParty(libraries=['gperftools'])
tcmallocDirectory = 'tcmalloc'
tcmallocEnv = tcmallocEnv.Clone()
tcmallocEnv.InjectThirdParty(libraries=['tcmalloc'])
tcmallocEnv.SConscript(
tcmallocDirectory + '/SConscript',
exports={'env': tcmallocEnv},
)
tcmallocEnv = tcmallocEnv.Clone(LIBDEPS_INTERFACE=[
'tcmalloc/tcmalloc',
])
elif tcmallocEnv['MONGO_ALLOCATOR'] in ["tcmalloc-gperf"]:
if use_system_version_of_library("tcmalloc-gperf"):
tcmallocEnv = tcmallocEnv.Clone(SYSLIBDEPS=[
env['LIBDEPS_TCMALLOC_SYSLIBDEP'],
])
else:
tcmallocEnv = tcmallocEnv.Clone()
tcmallocEnv.InjectThirdParty(libraries=['gperftools'])
# Allow gperftools to determine its own consumer-side include/ dirs.
# Needed because those are in a platform-specific subdirectory.
@ -528,16 +556,16 @@ if gperftoolsEnv['MONGO_ALLOCATOR'] in ["tcmalloc", "tcmalloc-experimental"]:
for k, v in kwargs.items():
thirdPartyEnvironmentModifications['gperftools'][k] = v
gperftoolsEnv.AddMethod(registerConsumerModifications, 'RegisterConsumerModifications')
gperftoolsEnv.SConscript(
tcmallocEnv.AddMethod(registerConsumerModifications, 'RegisterConsumerModifications')
tcmallocEnv.SConscript(
'gperftools' + '/SConscript',
exports={'env': gperftoolsEnv},
exports={'env': tcmallocEnv},
)
gperftoolsEnv = gperftoolsEnv.Clone(LIBDEPS_INTERFACE=[
tcmallocEnv = tcmallocEnv.Clone(LIBDEPS_INTERFACE=[
'gperftools/tcmalloc_minimal',
])
gperftoolsEnv.ShimLibrary(
tcmallocEnv.ShimLibrary(
name="allocator",
LIBDEPS_TAGS=[
# TODO: Remove when SERVER-48291 is merged into stable build tools.

View File

@ -10,27 +10,6 @@ if env.ToolchainIs('msvc'):
CCFLAGS=[],
)
if env.GetOption('sanitize') and 'undefined' in env.GetOption('sanitize').split(','):
# UBSAN causes the __muloti4 reference to be in the library. This is not defined in libgcc, so
# we will just opt out of this check in this third party library. Related issues below:
#
# abseil issue showing the commit it was introduced
# https://github.com/abseil/abseil-cpp/issues/841
#
# GCC bug saying the symbol is missing
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103034
#
# LLVM bug saying the symbol requires extra linkage
# https://bugs.llvm.org/show_bug.cgi?id=16404
env.Append(
CCFLAGS=[
'-fno-sanitize=signed-integer-overflow',
],
LINKFLAGS=[
'-fno-sanitize=signed-integer-overflow',
],
)
if env.ToolchainIs('gcc'):
env.Append(CCFLAGS=[
'-Wno-error=ignored-attributes',

View File

@ -335,11 +335,7 @@ static_assert(ABSL_INTERNAL_INLINE_NAMESPACE_STR[0] != 'h' ||
#if (defined(__clang__) && !defined(_WIN32)) || \
(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ >= 9) || \
(defined(__GNUC__) && !defined(__clang__) && !defined(__CUDACC__))
#if !ABSL_HAVE_FEATURE(address_sanitizer) && !ABSL_HAVE_FEATURE(memory_sanitizer) && \
!ABSL_HAVE_FEATURE(thread_sanitizer) && !ABSL_HAVE_FEATURE(undefined_behavior_sanitizer)
#define ABSL_HAVE_INTRINSIC_INT128 1
#endif // !ABSL_HAVE_FEATURE(address_sanitizer) && !ABSL_HAVE_FEATURE(memory_sanitizer) &&
// !ABSL_HAVE_FEATURE(thread_sanitizer) && !ABSL_HAVE_FEATURE(undefined_behavior_sanitizer)
#elif defined(__CUDACC__)
// __CUDACC_VER__ is a full version number before CUDA 9, and is defined to a
// string explaining that it has been removed starting with CUDA 9. We use

View File

@ -8,7 +8,7 @@ IFS=$'\n\t'
set -vx
NAME=abseil-cpp
REVISION="20230802.1-mongo-20240205"
REVISION="20230802.1-SERVER-85737"
VERSION="20230802.1"
DEST_DIR=$(git rev-parse --show-toplevel)/src/third_party/abseil-cpp

View File

@ -96,27 +96,6 @@ if env.ToolchainIs('msvc'):
CCFLAGS=[],
)
if env.GetOption('sanitize') and 'undefined' in env.GetOption('sanitize').split(','):
# UBSAN causes the __muloti4 reference to be in the library. This is not defined in libgcc, so
# we will just opt out of this check in this third party library. Related issues below:
#
# abseil issue showing the commit it was introduced
# https://github.com/abseil/abseil-cpp/issues/841
#
# GCC bug saying the symbol is missing
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103034
#
# LLVM bug saying the symbol requires extra linkage
# https://bugs.llvm.org/show_bug.cgi?id=16404
env.Append(
CCFLAGS=[
'-fno-sanitize=signed-integer-overflow',
],
LINKFLAGS=[
'-fno-sanitize=signed-integer-overflow',
],
)
if env.ToolchainIs('gcc'):
env.Append(
CCFLAGS=[

View File

@ -152,13 +152,6 @@ upb_env.Library(
],
)
upb_generated_protobuf_descriptor_obj = upb_env.LibraryObject(
target="upb_generated_protobuf_descriptor",
source=[
"dist/src/core/ext/upb-generated/google/protobuf/descriptor.upb.c",
],
)[0]
upb_env.Library(
target="upb_wire",
source=[
@ -230,7 +223,7 @@ upb_env.Library(
"dist/third_party/upb/upb/reflection/method_def.c",
"dist/third_party/upb/upb/reflection/oneof_def.c",
"dist/third_party/upb/upb/reflection/service_def.c",
upb_generated_protobuf_descriptor_obj,
"dist/src/core/ext/upb-generated/google/protobuf/descriptor.upb.c",
],
LIBDEPS=[
"upb_collections",
@ -1231,7 +1224,6 @@ grpc_env.Library(
"dist/src/core/tsi/ssl_transport_security_utils.cc",
"dist/src/core/tsi/transport_security.cc",
"dist/src/core/tsi/transport_security_grpc.cc",
upb_generated_protobuf_descriptor_obj,
],
OBJPREFIX=env.get('OBJPREFIX', '') + 'grpc_',
LIBDEPS=[

173
src/third_party/tcmalloc/SConscript vendored Normal file
View File

@ -0,0 +1,173 @@
# Project: com_google_tcmalloc
import json
import re
import sys
import SCons
Import("env")
Import("has_option")
env = env.Clone(
# Building with hidden visibility interferes with intercepting the
# libc allocation functions.
DISALLOW_VISHIDDEN=True,
NINJA_GENSOURCE_INDEPENDENT=True,
)
if env.Verbose():
def tcmalloc_scons_print(msg, *args, **kwargs):
print("[TCMALLOC_TO_SCONS]: " + msg, *args, **kwargs)
else:
def tcmalloc_scons_print(msg, *args, **kwargs):
pass
# manually switch this for all the debugging
tcmalloc_extra_debug = False
if tcmalloc_extra_debug:
def tcmalloc_scons_debug(msg, *args, **kwargs):
print("[TCMALLOC_TO_SCONS][DEBUG]: " + msg, *args, **kwargs)
else:
def tcmalloc_scons_debug(msg, *args, **kwargs):
pass
_bazelToSconsMap = dict(
(f'@com_google_absl//absl/{k}', [f'$BUILD_DIR/third_party/abseil-cpp/absl_{ve}' for ve in v])
for k, v in {
'algorithm:container': [],
'base:config': [],
'base:core_headers': [],
'base:dynamic_annotations': [],
'container:btree': [],
'container:fixed_array': [],
'container:flat_hash_map': ['raw_hash_set'],
'debugging:leak_check': [],
'debugging:stacktrace': ['stacktrace'],
'debugging:symbolize': [],
'functional:function_ref': [],
'base:malloc_internal': ['malloc_internal'],
'memory': [],
'numeric:bits': [],
'numeric:int128': [],
'strings:str_format': [],
'types:optional': [],
'types:span': [],
}.items())
sys.path.append(env.Dir('scripts/site-scons').srcnode().abspath)
from bazel_to_scons import BazelEnv, Label
def dumpBazelLibs(baz, target):
    """Write the parsed Bazel library table to `target` as JSON (debug builds only).

    Does nothing unless `tcmalloc_extra_debug` is enabled.

    Args:
        baz: library table to dump; it is stored under the top-level "libraries" key.
        target: node whose `abspath` names the output file.
    """
    if not tcmalloc_extra_debug:
        return

    tcmalloc_scons_debug(f"Dumping tcmalloc deps to: '{target}'", file=sys.stderr)
    with open(target.abspath, 'w') as dump:
        # Write the document directly: routing it through tcmalloc_scons_debug() would
        # prepend the "[TCMALLOC_TO_SCONS][DEBUG]: " log prefix and corrupt the JSON.
        json.dump({'libraries': baz}, dump, sort_keys=True, indent=4)
        dump.write('\n')
def _remapAbseilDep(label: Label) -> 'list[str]':
    """Translate an abseil Bazel label into the SCons library names to depend on.

    Labels listed in `_bazelToSconsMap` use that explicit mapping (which may be empty).
    Any other label is turned into a single `$BUILD_DIR/third_party/abseil-cpp/...` name
    built from its package and target with '/' replaced by '_'.
    """
    tcmalloc_scons_print(f'Remap abseilDep {label}', file=sys.stderr)
    if str(label) in _bazelToSconsMap:
        out = _bazelToSconsMap[str(label)]
        tcmalloc_scons_print(f'Remap {label} to {out}', file=sys.stderr)
        return out

    pkg_path = label.package()
    pkg = pkg_path.replace('/', '_')
    tgt = label.target()
    # bazel expands //foo/bar => //foo/bar:bar implicitly. Use short form.
    # The comparison must use the slash-separated package path: once '/' has been
    # replaced by '_' a suffix test against '/' + tgt can never succeed.
    if tgt and not (pkg_path == tgt or pkg_path.endswith('/' + tgt)):
        tgt = "_" + tgt.replace('/', '_')
    else:
        tgt = ''
    return [f'$BUILD_DIR/third_party/abseil-cpp/{pkg}{tgt}']
def findAbslLibs():
    """Collect the library targets declared in the abseil-cpp SConscript.

    Scans the SConscript line by line for `target='...'`, entries and returns the
    matching names, each prefixed with `$BUILD_DIR/third_party/abseil-cpp/`, in sorted order.
    """
    sconscript_path = env.File('$BUILD_DIR/third_party/abseil-cpp/SConscript').srcnode().abspath
    tcmalloc_scons_debug(f'abslSconscript={sconscript_path}', file=sys.stderr)

    with open(sconscript_path) as handle:
        stripped_lines = [raw.strip() for raw in handle.readlines()]

    target_pattern = re.compile(r"\s*target=['\"](.*)['\"],")
    found = []
    for line in stripped_lines:
        hit = target_pattern.match(line)
        if not hit:
            continue
        fq = f'$BUILD_DIR/third_party/abseil-cpp/{hit[1]}'
        tcmalloc_scons_debug(f"found {fq} in {line}", file=sys.stderr)
        found.append(fq)

    found.sort()
    return found
def _mapDepToScons(lab: str, base: str = '') -> 'list[str]':
    """Map a Bazel dependency label to a list of SCons library names.

    Abseil labels (`@com_google_absl//...`) are delegated to `_remapAbseilDep`. Any other
    label is flattened into a single name: a leading ':' is expanded with the package of
    `base`, the leading '//' is dropped, and ':' and '/' become '_'.

    Args:
        lab: Bazel label of the dependency.
        base: label of the library that declares the dependency; supplies the package
            for package-relative (':name') labels.

    Returns:
        A list of SCons library names (a one-element list for non-abseil labels).
    """
    if re.match(r'^@com_google_absl//', lab):
        return _remapAbseilDep(Label(lab))
    lab = re.sub(r'^:', f'//{Label(base).package()}:', lab)
    lab = re.sub(r'^//', '', lab)
    lab = re.sub(r'(.*):(.*)', r'\1_\2', lab)
    lab = lab.replace("/", "_")
    return [lab]
def slurpBlaze(target, source, exports, env):
    """Read the Bazel build description under `dist` and declare matching SCons libraries.

    The Bazel libraries are loaded and pruned via `BazelEnv`, optionally dumped to `target`
    by `dumpBazelLibs`, and the dependencies of `exports` are resolved. Every resolved
    local library that has a Bazel definition is then emitted as an `env.Library(...)` call.

    Args:
        target: node passed to `dumpBazelLibs` as the dump destination.
        source: not used by this function.
        exports: Bazel labels whose dependency closure should be rendered.
        env: SCons environment used to locate `dist` and to declare the libraries.

    Returns:
        0
    """
    bazel = BazelEnv(env, env.Dir("dist").srcnode().abspath, debug=tcmalloc_scons_debug)
    bazel.run()
    bazel.pruneTestOnlyLibraries()
    bazel.eliminateHeadersFromSources()
    bazel.eliminateSourcelessDeps()

    bzl = bazel.libraries()
    dumpBazelLibs(bzl, target)

    resolved = bazel.resolveDeps(exports)

    # Record how unresolved abseil labels would be remapped (debug output only).
    abslImports = {}
    unresolved_names = sorted(name for name in resolved if 'unknown' in resolved[name])
    for name in unresolved_names:
        parsed = Label(name)
        if parsed.remote() == 'com_google_absl':
            abslImports[str(parsed)] = _remapAbseilDep(parsed)
    tcmalloc_scons_debug(f"{json.dumps({'abslImports': abslImports}, indent=4)}", file=sys.stderr)

    tcmalloc_scons_print('Final render into env.Library calls', file=sys.stderr)
    for libName in sorted(resolved.keys()):
        # Skip remote labels, labels handled by the explicit abseil map, and labels
        # without a Bazel definition.
        is_renderable = (not Label(libName).remote() and libName not in _bazelToSconsMap
                         and libName in bzl)
        if not is_renderable:
            continue

        libDef = bzl[libName]
        sconsName = _mapDepToScons(libName)[0]
        tcmalloc_scons_debug(f'libName: {libName:60s} => {sconsName}', file=sys.stderr)
        tcmalloc_scons_debug(f'  {json.dumps(list(libDef), indent=4)}', file=sys.stderr)

        # Keys are added in the order target, source, LIBDEPS, CCFLAGS and only when the
        # Bazel definition provides at least one entry for them.
        kwargs = {'target': sconsName}

        for src in libDef.get('srcs', []):
            src = f'dist/{Label(libName).package()}/{src}'
            tcmalloc_scons_debug(f'srcs for lib={libName} -> src={src}', file=sys.stderr)
            if 'source' not in kwargs:
                kwargs['source'] = []
            kwargs['source'].append(src)

        for dep in libDef.get('deps', set()):
            scons_deps = _mapDepToScons(dep, base=libName)
            tcmalloc_scons_debug(f'lib={libName}: dep={dep} => {scons_deps}', file=sys.stderr)
            if 'LIBDEPS' not in kwargs:
                kwargs['LIBDEPS'] = []
            kwargs['LIBDEPS'].extend(scons_deps)
        if 'LIBDEPS' in kwargs:
            # De-duplicate and order the dependency list.
            kwargs['LIBDEPS'] = sorted(set(kwargs['LIBDEPS']))

        for cf in libDef.get('copts', []):
            if 'CCFLAGS' not in kwargs:
                # Start from a copy of the environment's flags, then add the Bazel copts.
                kwargs['CCFLAGS'] = list(env.get('CCFLAGS', []))
            kwargs['CCFLAGS'].append(cf)

        tcmalloc_scons_print(f'env.Library(**{json.dumps(kwargs, indent=4)})', file=sys.stderr)
        env.Library(**kwargs)

    return 0
env = env.Clone()
env.InjectThirdParty(libraries=['abseil-cpp'])
slurpBlaze(
target=env.File('tcmalloc_deps.json').srcnode(), source=[],
exports=['//tcmalloc', '//tcmalloc:tcmalloc_extension'], env=env)

View File

@ -0,0 +1,74 @@
# How to Contribute to TCMalloc
We'd love to accept your patches and contributions to this project. There are
just a few small guidelines you need to follow.
NOTE: If you are new to GitHub, please start by reading [Pull Request
howto](https://help.github.com/articles/about-pull-requests/)
## Contributor License Agreement
Contributions to this project must be accompanied by a Contributor License
Agreement. You (or your employer) retain the copyright to your contribution;
this simply gives us permission to use and redistribute your contributions as
part of the project. Head over to <https://cla.developers.google.com/> to see
your current agreements on file or to sign a new one.
You generally only need to submit a CLA once, so if you've already submitted one
(even if it was for a different project), you probably don't need to do it
again.
## Guidelines for Pull Requests
* All submissions, including submissions by project members, require review.
We use GitHub pull requests for this purpose. Consult
[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
information on using pull requests.
* If you are a Googler, it is preferable to first create an internal CL and
have it reviewed and submitted. The code propagation process will deliver
the change to GitHub.
* Create **small PRs** that are narrowly focused on **addressing a single concern**.
When PRs try to fix several things at a time, if only one fix is considered
acceptable, nothing gets merged and both the author's and the reviewer's time is wasted.
Create more PRs to address different concerns and everyone will be happy.
* Provide a good **PR description** as a record of **what** change is being
made and **why** it was made. Link to a GitHub issue if it exists.
* Don't fix code style and formatting unless you are already changing that line
to address an issue. Formatting of modified lines may be done using
`git clang-format`. PRs with irrelevant changes won't be merged. If you do
want to fix formatting or style, do that in a separate PR.
* Unless your PR is trivial, you should expect there will be reviewer comments
that you'll need to address before merging. We expect you to be reasonably
responsive to those comments, otherwise the PR will be closed after 2-3 weeks
of inactivity.
* Maintain **clean commit history** and use **meaningful commit messages**.
PRs with messy commit history are difficult to review and won't be merged.
Use `rebase -i upstream/master` to curate your commit history and/or to
bring in latest changes from master (but avoid rebasing in the middle of a
code review).
* Keep your PR up to date with upstream/master (if there are merge conflicts,
we can't really merge your change).
* **All tests need to be passing** before your change can be merged. We
recommend you **run tests locally** (see below)
* Exceptions to the rules can be made if there's a compelling reason for doing
so. That is - the rules are here to serve us, not the other way around, and
the rules need to be serving their intended purpose to be valuable.
## TCMalloc Committers
The current members of the TCMalloc engineering team are the only committers at
present.
## Community Guidelines
This project follows
[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).

202
src/third_party/tcmalloc/dist/LICENSE vendored Normal file
View File

@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
https://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

45
src/third_party/tcmalloc/dist/README.md vendored Normal file
View File

@ -0,0 +1,45 @@
# TCMalloc
This repository contains the TCMalloc C++ code.
TCMalloc is Google's customized implementation of C's `malloc()` and C++'s
`operator new` used for memory allocation within our C and C++ code. TCMalloc is
a fast, multi-threaded malloc implementation.
## Building TCMalloc
[Bazel](https://bazel.build) is the official build system for TCMalloc.
The [TCMalloc Platforms Guide](docs/platforms.md) contains information on
platform support for TCMalloc.
## Documentation
All users of TCMalloc should consult the following documentation resources:
* The [TCMalloc Quickstart](docs/quickstart.md) covers downloading,
installing, building, and testing TCMalloc, including incorporating within
your codebase.
* The [TCMalloc Overview](docs/overview.md) covers the basic architecture of
TCMalloc, and how that may affect configuration choices.
* The [TCMalloc Reference](docs/reference.md) covers the C and C++ TCMalloc
API endpoints.
Users with more advanced TCMalloc use cases may find the following documentation useful:
* The [TCMalloc Tuning Guide](docs/tuning.md) covers the configuration
choices in more depth, and also illustrates other ways to customize
TCMalloc. This also covers important operating system-level properties for
improving TCMalloc performance.
* The [TCMalloc Design Doc](docs/design.md) covers how TCMalloc works
underneath the hood, and why certain design choices were made. Most
developers will not need this level of implementation detail.
* The [TCMalloc Compatibility Guide](docs/compatibility.md) documents our
expectations for how our APIs are used.
## License
The TCMalloc library is licensed under the terms of the Apache license. See
LICENSE for more information.
Disclaimer: This is not an officially supported Google product.

111
src/third_party/tcmalloc/dist/WORKSPACE vendored Normal file
View File

@ -0,0 +1,111 @@
# Copyright 2019 The TCMalloc Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Bazel WORKSPACE for TCMalloc: names the workspace and declares the external
# repositories it downloads. Every http_archive below is pinned with a sha256
# checksum.
workspace(name = "com_google_tcmalloc")
# http_archive is the repository rule used for every download in this file.
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
# Load a recent version of skylib in case our dependencies have obsolete
# versions. This is needed for bazel 6 compatibility.
http_archive(
name = "bazel_skylib", # 2022-09-01
urls = ["https://github.com/bazelbuild/bazel-skylib/archive/refs/tags/1.3.0.zip"],
strip_prefix = "bazel-skylib-1.3.0",
sha256 = "4756ab3ec46d94d99e5ed685d2d24aece484015e45af303eb3a11cab3cdc2e71",
)
# Abseil
http_archive(
name = "com_google_absl",
urls = ["https://github.com/abseil/abseil-cpp/archive/b3162b1da62711c663d0025e2eabeb83fd1f2728.zip"],
strip_prefix = "abseil-cpp-b3162b1da62711c663d0025e2eabeb83fd1f2728",
sha256 = "d5c91248c33269fcc7ab35897315a45cfa2c37abb4c6d4ed36cb5c82f366367a",
)
# GoogleTest/GoogleMock framework. Used by most unit-tests.
http_archive(
name = "com_google_googletest", # 2021-05-19T20:10:13Z
urls = ["https://github.com/google/googletest/archive/aa9b44a18678dfdf57089a5ac22c1edb69f35da5.zip"],
strip_prefix = "googletest-aa9b44a18678dfdf57089a5ac22c1edb69f35da5",
sha256 = "8cf4eaab3a13b27a95b7e74c58fb4c0788ad94d1f7ec65b20665c4caf1d245e8",
)
# Google benchmark.
http_archive(
name = "com_github_google_benchmark",
urls = ["https://github.com/google/benchmark/archive/0baacde3618ca617da95375e0af13ce1baadea47.zip"],
strip_prefix = "benchmark-0baacde3618ca617da95375e0af13ce1baadea47",
sha256 = "62e2f2e6d8a744d67e4bbc212fcfd06647080de4253c97ad5c6749e09faf2cb0",
)
# C++ rules for Bazel.
http_archive(
name = "rules_cc", # 2021-05-14T14:51:14Z
urls = ["https://github.com/bazelbuild/rules_cc/archive/68cb652a71e7e7e2858c50593e5a9e3b94e5b9a9.zip"],
strip_prefix = "rules_cc-68cb652a71e7e7e2858c50593e5a9e3b94e5b9a9",
sha256 = "1e19e9a3bc3d4ee91d7fcad00653485ee6c798efbbf9588d40b34cbfbded143d",
)
# Python rules
#
# This is explicitly added to work around
# https://github.com/bazelbuild/rules_fuzzing/issues/207
# and https://github.com/google/tcmalloc/issues/127
http_archive(
name = "rules_python",
urls = ["https://github.com/bazelbuild/rules_python/archive/refs/tags/0.11.0.tar.gz"],
sha256 = "c03246c11efd49266e8e41e12931090b613e12a59e6f55ba2efd29a7cb8b4258",
strip_prefix = "rules_python-0.11.0",
)
# Proto rules for Bazel and Protobuf
http_archive(
name = "com_google_protobuf",
urls = ["https://github.com/protocolbuffers/protobuf/archive/13d559beb6967033a467a7517c35d8ad970f8afb.zip"],
strip_prefix = "protobuf-13d559beb6967033a467a7517c35d8ad970f8afb",
sha256 = "9ca59193fcfe52c54e4c2b4584770acd1a6528fc35efad363f8513c224490c50",
)
# Run the setup macro exported by the protobuf repository declared above.
load("@com_google_protobuf//:protobuf_deps.bzl", "protobuf_deps")
protobuf_deps()
# rules_proto release 4.0.0; the mirror.bazel.build URL is listed ahead of the
# GitHub URL.
http_archive(
name = "rules_proto",
sha256 = "66bfdf8782796239d3875d37e7de19b1d94301e8972b3cbd2446b332429b4df1",
strip_prefix = "rules_proto-4.0.0",
urls = [
"https://mirror.bazel.build/github.com/bazelbuild/rules_proto/archive/refs/tags/4.0.0.tar.gz",
"https://github.com/bazelbuild/rules_proto/archive/refs/tags/4.0.0.tar.gz",
],
)
# Run the rules_proto setup macros: dependencies first, then toolchains.
load("@rules_proto//proto:repositories.bzl", "rules_proto_dependencies", "rules_proto_toolchains")
rules_proto_dependencies()
rules_proto_toolchains()
# Fuzzing
http_archive(
name = "rules_fuzzing",
sha256 = "a5734cb42b1b69395c57e0bbd32ade394d5c3d6afbfe782b24816a96da24660d",
strip_prefix = "rules_fuzzing-0.1.1",
urls = ["https://github.com/bazelbuild/rules_fuzzing/archive/v0.1.1.zip"],
)
# Run the rules_fuzzing setup macros: dependencies first, then init.
load("@rules_fuzzing//fuzzing:repositories.bzl", "rules_fuzzing_dependencies")
rules_fuzzing_dependencies()
load("@rules_fuzzing//fuzzing:init.bzl", "rules_fuzzing_init")
rules_fuzzing_init()

View File

@ -0,0 +1,58 @@
# TCMalloc
This repository contains the TCMalloc C++ code.
TCMalloc is Google's customized implementation of C's `malloc()` and C++'s
`operator new` used for memory allocation within our C and C++ code. TCMalloc is
a fast, multi-threaded malloc implementation.
## Building TCMalloc
[Bazel](https://bazel.build) is the official build system for TCMalloc.
The [TCMalloc Platforms Guide](platforms.md) contains information on platform
support for TCMalloc.
## Documentation
All users of TCMalloc should consult the following documentation resources:
* The [TCMalloc Quickstart](quickstart.md) covers downloading, installing,
building, and testing TCMalloc, including incorporating within your
codebase.
* The [TCMalloc Overview](overview.md) covers the basic architecture of
TCMalloc, and how that may affect configuration choices.
* The [TCMalloc Reference](reference.md) covers the C and C++ TCMalloc API
endpoints.
Users with more advanced TCMalloc use cases may find the following documentation useful:
* The [TCMalloc Tuning Guide](tuning.md) covers the configuration choices in
more depth, and also illustrates other ways to customize TCMalloc.
* The [TCMalloc Design Doc](design.md) covers how TCMalloc works underneath
the hood, and why certain design choices were made. Most developers will not
need this level of implementation detail.
* The [TCMalloc Compatibility Guide](compatibility.md) documents our
expectations for how our APIs are used.
* The [history and differences](gperftools.md) between this repository and
gperftools.
## Publications
We've published several papers relating to TCMalloc optimizations:
* ["Beyond malloc efficiency to fleet efficiency: a hugepage-aware memory
allocator" (OSDI 2021)](https://research.google/pubs/pub50370/) relating to
the development and rollout of [Temeraire](temeraire.md), TCMalloc's
hugepage-aware page heap implementation.
* ["Adaptive Hugepage Subrelease for Non-moving Memory Allocators in
Warehouse-Scale Computers" (ISMM
2021)](https://research.google/pubs/pub50436/) relating to optimizations for
releasing partial hugepages to the operating system.
## License
The TCMalloc library is licensed under the terms of the Apache license. See
LICENSE for more information.
Disclaimer: This is not an officially supported Google product.

View File

@ -0,0 +1,44 @@
# TCMalloc Compatibility Guidelines
This document details what we expect from well-behaved users. Any usage of
TCMalloc libraries outside of these technical boundaries may result in breakage
when upgrading to newer versions of TCMalloc.
Put another way: don't do things that make TCMalloc API maintenance tasks
harder. If you misuse TCMalloc APIs, you're on your own.
Additionally, because TCMalloc depends on Abseil, Abseil's
[compatibility guidelines](https://abseil.io/about/compatibility) also apply.
## What Users Must (And Must Not) Do
* **Do not depend on a compiled representation of TCMalloc.** We do not
promise any ABI compatibility &mdash; we intend for TCMalloc to be built
from source, hopefully from head. The internal layout of our types may
change at any point, without notice. Building TCMalloc in the presence of
different C++ standard library types may change Abseil types, especially for
pre-adopted types (`string_view`, `variant`, etc) &mdash; these will become
typedefs and their ABI will change accordingly.
* **Do not rely on dynamic loading/unloading.** TCMalloc does not support
dynamic loading and unloading.
* **You may not open namespace `tcmalloc`.** You are not allowed to define
additional names in namespace `tcmalloc`, nor are you allowed to specialize
anything we provide.
* **You may not depend on the signatures of TCMalloc APIs.** You cannot take
the address of APIs in TCMalloc (that would prevent us from adding overloads
without breaking you). You cannot use metaprogramming tricks to depend on
those signatures either. (This is also similar to the restrictions in the
C++ standard.)
* **You may not forward declare TCMalloc APIs.** This is actually a sub-point
of "do not depend on the signatures of TCMalloc APIs" as well as "do not
open namespace `tcmalloc`", but can be surprising. Any refactoring that
changes template parameters, default parameters, or namespaces will be a
breaking change in the face of forward-declarations.
* **Do not depend upon internal details.** This should go without saying: if
something is in a namespace or filename/path that includes the word
"internal", you are not allowed to depend upon it. It's an implementation
detail. You cannot friend it, you cannot include it, you cannot mention it
or refer to it in any way.
* **Include What You Use.** We may make changes to the internal `#include`
graph for TCMalloc headers - if you use an API, please include the relevant
header file directly.

View File

@ -0,0 +1,470 @@
# TCMalloc : Thread-Caching Malloc
## Motivation
TCMalloc is a memory allocator designed as an alternative to the system default
allocator that has the following characteristics:
* Fast, uncontended allocation and deallocation for most objects. Objects are
cached, depending on mode, either per-thread, or per-logical-CPU. Most
allocations do not need to take locks, so there is low contention and good
scaling for multi-threaded applications.
* Flexible use of memory, so freed memory can be reused for different object
sizes, or returned to the OS.
* Low per-object memory overhead by allocating "pages" of objects of the same
size, leading to a space-efficient representation of small objects.
* Low-overhead sampling, enabling detailed insight into an application's memory
usage.
## Usage
You use TCMalloc by specifying it as the `malloc` attribute on your binary rules in Bazel.
## Overview
The following block diagram shows the rough internal structure of TCMalloc:
![Diagram of TCMalloc internal structure](images/tcmalloc_internals.png "TCMalloc internal structure")
We can break TCMalloc into three components. The front-end, middle-end, and
back-end. We will discuss these in more detail in the following sections. A
rough breakdown of responsibilities is:
* The front-end is a cache that provides fast allocation and deallocation of
memory to the application.
* The middle-end is responsible for refilling the front-end cache.
* The back-end handles fetching memory from the OS.
Note that the front-end can be run in either per-CPU or legacy per-thread mode,
and the back-end can support either the hugepage aware pageheap or the legacy
pageheap.
## The TCMalloc Front-end
The front-end handles a request for memory of a particular size. The front-end
has a cache of memory that it can use for allocation or to hold free memory.
This cache is only accessible by a single thread at a time, so it does not
require any locks, hence most allocations and deallocations are fast.
The front-end will satisfy any request if it has cached memory of the
appropriate size. If the cache for that particular size is empty, the front-end
will request a batch of memory from the middle-end to refill the cache. The
middle-end comprises the CentralFreeList and the TransferCache.
If the middle-end is exhausted, or if the requested size is greater than the
maximum size that the front-end caches handle, a request will go to the back-end
to either satisfy the large allocation, or to refill the caches in the
middle-end. The back-end is also referred to as the PageHeap.
There are two implementations of the TCMalloc front-end:
* Originally it supported per-thread caches of objects (hence the name Thread
Caching Malloc). However, this resulted in memory footprints that scaled
with the number of threads. Modern applications can have large thread
counts, which result in either large amounts of aggregate per-thread memory,
or many threads having minuscule per-thread caches.
* More recently TCMalloc has supported per-CPU mode. In this mode each logical
CPU in the system has its own cache from which to allocate memory. Note: On
x86 a logical CPU is equivalent to a hyperthread.
The differences between per-thread and per-CPU modes are entirely confined to
the implementations of malloc/new and free/delete.
## Small and Large Object Allocation
Allocations of "small" objects are mapped onto one of
[60-80 allocatable size-classes](https://github.com/google/tcmalloc/blob/master/tcmalloc/size_classes.cc).
For example, an allocation of 12 bytes will get rounded up to the 16 byte
size-class. The size-classes are designed to minimize the amount of memory that
is wasted when rounding to the next largest size-class.
When compiled with `__STDCPP_DEFAULT_NEW_ALIGNMENT__ <= 8`, we use a set of
sizes aligned to 8 bytes for raw storage allocated with `::operator new`. This
smaller alignment minimizes wasted memory for many common allocation sizes (24,
40, etc.) which are otherwise rounded up to a multiple of 16 bytes. On many
compilers, this behavior is controlled by the `-fnew-alignment=...` flag.
When
`__STDCPP_DEFAULT_NEW_ALIGNMENT__` is not specified (or is larger than 8 bytes),
we use standard 16 byte alignments for `::operator new`. However, for
allocations under 16 bytes, we may return an object with a lower alignment, as
no object with a larger alignment requirement can be allocated in the space.
When an object of a given size is requested, that request is mapped to a request
of a particular size-class using the
[`SizeMap::GetSizeClass()` function](https://github.com/google/tcmalloc/blob/master/tcmalloc/common.h),
and the returned memory is from that size-class. This means that the returned
memory is at least as large as the requested size. Allocations from size-classes
are handled by the front-end.
Objects of size greater than the limit defined by
[`kMaxSize`](https://github.com/google/tcmalloc/blob/master/tcmalloc/common.h)
are allocated directly from the [backend](#tcmalloc-backend). As such they are
not cached in either the front or middle ends. Allocation requests for large
object sizes are rounded up to the [TCMalloc page size](#tcmalloc-page-sizes).
## Deallocation
When an object is deallocated, the compiler will provide the size of the object
if it is known at compile time. If the size is not known, it will be looked up
in the [pagemap](#pagemap). If the object is small it will be put back into the
front-end cache. If the object is larger than kMaxSize it is returned directly
to the pageheap.
### Per-CPU Mode
In per-CPU mode a single large block of memory is allocated. The following
diagram shows how this slab of memory is divided between CPUs and how each CPU
uses a part of the slab to hold metadata as well as pointers to available
objects.
![Memory layout of per-cpu data structures](images/per-cpu-cache-internals.png "Memory layout of per-cpu data structures")
Each logical CPU is assigned a section of this memory to hold metadata and
pointers to available objects of particular size-classes. The metadata comprises
one *header* block per size-class. The header has a pointer to the start of the
per-size-class array of pointers to objects, as well as a pointer to the
current, dynamic, maximum capacity and the current position within that array
segment. The static maximum capacity of each per-size-class array of pointers is
[determined at start time](https://github.com/google/tcmalloc/blob/master/tcmalloc/internal/percpu_tcmalloc.h)
by the difference between the start of the array for this size-class and the
start of the array for the next size-class.
At runtime the maximum number of items of a particular size-class that can be
stored in the per-cpu block will vary, but it can never exceed the statically
determined maximum capacity assigned at start up.
When an object of a particular size-class is requested, it is removed from this
array; when the object is freed, it is added to the array. If the array is
[exhausted](https://github.com/google/tcmalloc/blob/master/tcmalloc/cpu_cache.h)
the array is refilled using a batch of objects from the middle-end. If the array
would
[overflow](https://github.com/google/tcmalloc/blob/master/tcmalloc/cpu_cache.h),
a batch of objects are removed from the array and returned to the middle-end.
The amount of memory that can be cached is limited per-cpu by the parameter
`MallocExtension::SetMaxPerCpuCacheSize`. This means that the total amount of
cached memory depends on the number of active per-cpu caches. Consequently
machines with higher CPU counts can cache more memory.
To avoid holding memory on CPUs where the application no longer runs,
`MallocExtension::ReleaseCpuMemory` frees objects held in a specified CPU's
caches.
Within a CPU, the distribution of memory is managed across all the size-classes
so as to keep the maximum amount of cached memory below the limit. Notice that
it is managing the maximum amount that can be cached, and not the amount that is
currently cached. On average the amount actually cached should be about half the
limit.
The maximum capacity is increased when a size-class
[runs out of objects](https://github.com/google/tcmalloc/blob/master/tcmalloc/cpu_cache.h),
and when fetching more objects, it also considers
[increasing the capacity](https://github.com/google/tcmalloc/blob/master/tcmalloc/cpu_cache.h)
of the size-class. It can increase the capacity of the size-class up until the
total memory (for all size-classes) that the cache could hold reaches the
per-cpu limit or until the capacity of that size-class reaches the hard-coded
size limit for that size-class. If the size-class has not reached the hard-coded
limit, then in order to increase the capacity it can
[steal](https://github.com/google/tcmalloc/blob/master/tcmalloc/cpu_cache.h)
capacity from another size-class on the same CPU.
### Restartable Sequences and Per-CPU TCMalloc
To work correctly, per-CPU mode relies on restartable sequences (man rseq(2)). A
restartable sequence is just a block of (assembly language) instructions,
largely like a typical function. A restriction of restartable sequences is that
they cannot write partial state to memory, the final instruction must be a
single write of the updated state. The idea of restartable sequences is that if
a thread is removed from a CPU (e.g. context switched) while it is executing a
restartable sequence, the sequence will be restarted from the top. Hence the
sequence will either complete without interruption, or be repeatedly restarted
until it completes without interruption. This is achieved without using any
locking or atomic instructions, thereby avoiding any contention in the sequence
itself.
The practical implication of this for TCMalloc is that the code can use a
restartable sequence like
[TcmallocSlab_Internal_Push](https://github.com/google/tcmalloc/blob/master/tcmalloc/internal/percpu_tcmalloc.h)
to fetch from or return an element to a per-CPU array without needing locking.
The restartable sequence ensures that either the array is updated without the
thread being interrupted, or the sequence is restarted if the thread was
interrupted (for example, by a context switch that enables a different thread to
run on that CPU).
Additional information about the design choices and implementation are discussed
in a specific [design doc](rseq.md) for it.
### Legacy Per-Thread mode
In per-thread mode, TCMalloc assigns each thread a thread-local cache. Small
allocations are satisfied from this thread-local cache. Objects are moved
between the middle-end and the thread-local cache as needed.
A thread cache contains one singly linked list of free objects per size-class
(so if there are N size-classes, there will be N corresponding linked lists), as
shown in the following diagram.
![Structure of per-thread cache](images/per-thread-structure.png "Structure of per-thread cache")
On allocation an object is removed from the appropriate size-class of the
per-thread caches. On deallocation, the object is prepended to the appropriate
size-class. Underflow and overflow are handled by accessing the middle-end to
either fetch more objects, or to return some objects.
The maximum capacity of the per-thread caches is set by the parameter
`MallocExtension::SetMaxTotalThreadCacheBytes`.
However it is possible for the
total size to exceed that limit as each per-thread cache has a minimum size
[KMinThreadCacheSize](https://github.com/google/tcmalloc/blob/master/tcmalloc/common.h)
which is usually 512KiB. In the event that a thread wishes to increase its
capacity, it needs to
[scavenge](https://github.com/google/tcmalloc/blob/master/tcmalloc/thread_cache.cc)
capacity from other threads.
When threads exit, their cached memory is
[returned](https://github.com/google/tcmalloc/blob/master/tcmalloc/thread_cache.cc)
to the middle-end.
### Runtime Sizing of Front-end Caches
It is important for the size of the front-end cache free lists to adjust
optimally. If the free list is too small, we'll need to go to the central free
list too often. If the free list is too big, we'll waste memory as objects sit
idle in there.
Note that the caches are just as important for deallocation as they are for
allocation. Without a cache, each deallocation would require moving the memory
to the central free list.
Per-CPU and per-thread modes have different implementations of a dynamic cache
sizing algorithm.
* In per-thread mode the maximum number of objects that can be stored is
[increased](https://github.com/google/tcmalloc/blob/master/tcmalloc/thread_cache.cc)
up to a limit whenever more objects need to be fetched from the middle-end.
Similarly the capacity is
[decreased](https://github.com/google/tcmalloc/blob/master/tcmalloc/thread_cache.cc)
when we find that we have cached too many objects. The size of the cache is
also
[reduced](https://github.com/google/tcmalloc/blob/master/tcmalloc/thread_cache.cc)
should the total size of the cached objects exceed the per-thread limit.
* In per-CPU mode the
[capacity](https://github.com/google/tcmalloc/blob/master/tcmalloc/cpu_cache.h)
of the free list is increased depending on whether we are alternating
between underflows and overflows (indicating that a larger cache might stop
this alternation). The capacity is
[reduced](https://github.com/google/tcmalloc/blob/master/tcmalloc/cpu_cache.h)
when it has not been grown for a time and may therefore be over capacity.
## TCMalloc Middle-end
The middle-end is responsible for providing memory to the front-end and
returning memory to the back-end. The middle-end comprises the Transfer cache
and the Central free list. Although these are often referred to as singular,
there is one transfer cache and one central free list per size-class. These
caches are each protected by a mutex lock - so there is a serialization cost to
accessing them.
### Transfer Cache
When the front-end requests memory, or returns memory, it will reach out to the
transfer cache.
The transfer cache holds an array of pointers to free memory, and it is quick to
move objects into this array, or fetch objects from this array on behalf of the
front-end.
The transfer cache gets its name from situations where one CPU (or thread) is
allocating memory that is deallocated by another CPU (or thread). The transfer
cache allows memory to rapidly flow between two different CPUs (or threads).
If the transfer cache is unable to satisfy the memory request, or has
insufficient space to hold the returned objects, it will access the central free
list.
### Central Free List
The central free list manages memory in "[spans](#spans)"; a span is a
collection of one or more "[TCMalloc pages](#tcmalloc-page-sizes)" of memory.
These terms will be explained in the next couple of sections.
A request for one or more objects is satisfied by the central free list by
[extracting](https://github.com/google/tcmalloc/blob/master/tcmalloc/central_freelist.cc)
objects from spans until the request is satisfied. If there are insufficient
available objects in the spans, more spans are requested from the back-end.
When objects are
[returned to the central free list](https://github.com/google/tcmalloc/blob/master/tcmalloc/central_freelist.cc),
each object is mapped to the span to which it belongs (using the
[pagemap](#pagemap-and-spans)) and then released into that span. If all the
objects that reside in a particular span are returned to it, the entire span
gets returned to the back-end.
### Pagemap and Spans
The heap managed by TCMalloc is divided into [pages](#tcmalloc-page-sizes) of a
compile-time determined size. A run of contiguous pages is represented by a
`Span` object. A span can be used to manage a large object that has been handed
off to the application, or a run of pages that have been split up into a
sequence of small objects. If the span manages small objects, the size-class of
the objects is recorded in the span.
The pagemap is used to look up the span to which an object belongs, or to
identify the size-class for a given object.
TCMalloc uses a 2-level or 3-level
[radix tree](https://github.com/google/tcmalloc/blob/master/tcmalloc/pagemap.h)
in order to map all possible memory locations onto spans.
The following diagram shows how a radix-2 pagemap is used to map the address of
objects onto the spans that control the pages where the objects reside. In the
diagram **span A** covers two pages, and **span B** covers 3 pages.
![The pagemap maps objects to spans.](images/pagemap.png "The pagemap maps objects to spans.")
Spans are used in the middle-end to determine where to place returned objects,
and in the back-end to manage the handling of page ranges.
### Storing Small Objects in Spans
A span contains a pointer to the base of the TCMalloc pages that the span
controls. For small objects those pages are divided into at most 2<sup>16</sup>
objects. This value is selected so that within the span we can refer to objects
by a two-byte index.
This means that we can use an
[unrolled linked list](https://en.wikipedia.org/wiki/Unrolled_linked_list) to
hold the objects. For example, if we have eight byte objects we can store the
indexes of three ready-to-use objects, and use the fourth slot to store the index
of the next object in the chain. This data structure reduces cache misses over a
fully linked list.
The other advantage of using two byte indexes is that we're able to use spare
capacity in the span itself to
[cache four objects](https://github.com/google/tcmalloc/blob/master/tcmalloc/span.h).
When we have
[no available objects](https://github.com/google/tcmalloc/blob/master/tcmalloc/central_freelist.cc)
for a size-class, we need to fetch a new span from the pageheap and
[populate](https://github.com/google/tcmalloc/blob/master/tcmalloc/central_freelist.cc)
it.
## TCMalloc Page Sizes
TCMalloc can be built with various
["page sizes"](https://github.com/google/tcmalloc/blob/master/tcmalloc/common.h).
Note that these do not correspond to the page size used in the TLB of the
underlying hardware. These TCMalloc page sizes are currently 4KiB, 8KiB, 32KiB,
and 256KiB.
A TCMalloc page either holds multiple objects of a particular size, or is used
as part of a group to hold an object of size greater than a single page. If an
entire page becomes free it will be returned to the back-end (the pageheap) and
can later be repurposed to hold objects of a different size (or returned to the
OS).
Small pages are better able to handle the memory requirements of the application
with less overhead. For example, a half-used 4KiB page will have 2KiB left over
versus a 32KiB page which would have 16KiB. Small pages are also more likely to
become free. For example, a 4KiB page can hold eight 512-byte objects versus 64
objects on a 32KiB page; and there is much less chance of 64 objects being free
at the same time than there is of eight becoming free.
Large pages result in less need to fetch and return memory from the back-end. A
single 32KiB page can hold eight times the objects of a 4KiB page, and this can
result in the costs of managing the larger pages being smaller. It also takes
fewer large pages to map the entire virtual address space. TCMalloc has a
[pagemap](https://github.com/google/tcmalloc/blob/master/tcmalloc/pagemap.h)
which maps a virtual address onto the structures that manage the objects in that
address range. Larger pages mean that the pagemap needs fewer entries and is
therefore smaller.
Consequently, it makes sense for applications with small memory footprints, or
that are sensitive to memory footprint size to use smaller TCMalloc page sizes.
Applications with large memory footprints are likely to benefit from larger
TCMalloc page sizes.
## TCMalloc Backend
The back-end of TCMalloc has three jobs:
* It manages large chunks of unused memory.
* It is responsible for fetching memory from the OS when there is no suitably
sized memory available to fulfill an allocation request.
* It is responsible for returning unneeded memory back to the OS.
There are two backends for TCMalloc:
* The Legacy pageheap which manages memory in TCMalloc page sized chunks.
* The hugepage aware pageheap which manages memory in chunks of hugepage
sizes. Managing memory in hugepage chunks enables the allocator to improve
application performance by reducing TLB misses.
### Legacy Pageheap
The legacy pageheap is an array of free lists for particular lengths of
contiguous pages of available memory. For `k < 256`, the `k`th entry is a free
list of runs that consist of `k` TCMalloc pages. The `256`th entry is a free
list of runs that have length `>= 256` pages:
![Layout of legacy pageheap.](images/legacy_pageheap.png "Layout of legacy pageheap.")
An allocation for `k` pages is satisfied by looking in the `k`th free list. If
that free list is empty, we look in the next free list, and so forth.
Eventually, we look in the last free list if necessary. If that fails, we fetch
memory from the system `mmap`.
If an allocation for `k` pages is satisfied by a run of pages of length `> k` ,
the remainder of the run is re-inserted back into the appropriate free list in
the pageheap.
When a range of pages are returned to the pageheap, the adjacent pages are
checked to determine if they now form a contiguous region; if that is the case
then the pages are concatenated and placed into the appropriate free list.
### Hugepage Aware Allocator
The objective of the hugepage aware allocator is to hold memory in hugepage size
chunks. On x86 a hugepage is 2MiB in size. To do this the back-end has three
different caches:
* The filler cache holds hugepages which have had some memory allocated from
them. This can be considered to be similar to the legacy pageheap in that it
holds linked lists of memory of a particular number of TCMalloc pages.
Allocation requests for sizes of less than a hugepage in size are
(typically) returned from the filler cache. If the filler cache does not
have sufficient available memory it will request additional hugepages from
which to allocate.
* The region cache handles allocations of greater than a hugepage. This
cache allows allocations to straddle multiple hugepages, and packs multiple
such allocations into a contiguous region. This is particularly useful for
allocations that slightly exceed the size of a hugepage (for example, 2.1
MiB).
* The hugepage cache handles large allocations of at least a hugepage. There
is overlap in usage with the region cache, but the region cache is only
enabled when it is determined (at runtime) that the allocation pattern would
benefit from it.
Additional information about the design choices made in HPAA is discussed in a
specific [design doc](temeraire.md) for it.
## Caveats
TCMalloc will reserve some memory for metadata at start up. The amount of
metadata will grow as the heap grows. In particular the pagemap will grow with
the virtual address range that TCMalloc uses, and the spans will grow as the
number of active pages of memory grows. In per-CPU mode, TCMalloc will reserve a
slab of memory per-CPU (typically 256 KiB), which, on systems with large numbers
of logical CPUs, can lead to a multi-mebibyte footprint.
It is worth noting that TCMalloc requests memory from the OS in large chunks
(typically 1 GiB regions). The address space is reserved, but not backed by
physical memory until it is used. Because of this approach the VSS of the
application can be substantially larger than the RSS. A side effect of this is
that trying to limit an application's memory use by restricting VSS will fail
long before the application has used that much physical memory.
Don't try to load TCMalloc into a running binary (e.g., using JNI in Java
programs). The binary will have allocated some objects using the system malloc,
and may try to pass them to TCMalloc for deallocation. TCMalloc will not be able
to handle such objects.

View File

@ -0,0 +1,70 @@
# TCMalloc and gperftools
There are two projects on Github that are based on Google's internal TCMalloc:
This repository and [gperftools](https://github.com/gperftools/gperftools). Both
are fast C/C++ memory allocators designed around a fast path that avoids
synchronizing with other threads for most allocations.
This repository is Google's current implementation of TCMalloc, used by ~all of
our C++ programs in production. The code is limited to the memory allocator
implementation itself.
## History
Google open-sourced its memory allocator as part of "Google Performance Tools"
in 2005. At the time, it became easy to externalize code, but more difficult to
keep it in-sync with our internal usage, as discussed by Titus Winters in
[his 2017 CppCon Talk](https://www.youtube.com/watch?v=tISy7EJQPzI) and the
"Software Engineering at Google" book. Subsequently, our internal implementation
diverged from the code externally. This project eventually was adopted by the
community as "gperftools."
## Differences
Since
[“Profiling a Warehouse-Scale Computer” (Kanev 2015)](https://research.google/pubs/pub44271/),
we have invested in improving application productivity via optimizations to the
implementation (per-CPU caches, sized delete, fast/slow path improvements,
[hugepage-aware backend](temeraire.md)).
Because this repository reflects our day-to-day usage, we've focused on the
platforms that we regularly use and that receive extensive testing and optimization.
This implementation is based on [Abseil](https://github.com/abseil/abseil-cpp).
Like Abseil, we do not attempt to provide ABI stability. Providing a stable ABI
could require compromising performance or adding otherwise unneeded complexity
to maintain stability. These caveats are noted in our
[Compatibility Guidelines](compatibility.md).
In addition to a memory allocator, the gperftools project contains a number of
other tools:
* An All-Allocation Memory Profiler: We have found this prohibitively costly
to use regularly, and instead focus on using low-overhead, always-on
sampling profilers. This sampling based profiler is exposed in our
`malloc_extension.h`.
* A SIGPROF-based CPU Profiler: The Linux `perf` tool is decreasing our
internal need for signal-based profiling. Additionally, with restartable
sequences, signals interrupt the fastpath, leading to skew between the
observed instruction pointer and where we actually spend CPU time.
* A Heap Checker/Debug Allocator: The LeakSanitizer, AddressSanitizer, and
MemorySanitizer suite provide higher accuracy and better performance.
* A perl-based `pprof` tool: This project is now developed in Go and is
[available on Github](https://github.com/google/pprof).
## Differences From Google's Implementation of TCMalloc
The configuration on Github mirrors our production defaults, with two notable
exceptions:
* Many of our production servers start a background thread (via
`tcmalloc::MallocExtension::ProcessBackgroundActions`) to regularly call
`tcmalloc::MallocExtension::ReleaseMemoryToSystem`, while others never
release memory in favor of better CPU performance. These tradeoffs are
discussed in our [tuning page](tuning.md).
* We do not activate [GWP ASan](gwp-asan.md) by default, but it can be activated
via `MallocExtension`.
Over time, we have found that configurability carries a maintenance burden.
While a knob can provide immediate flexibility, the increased complexity can
cause subtle problems for more rarely used combinations.

View File

@ -0,0 +1,87 @@
# GWP-ASan
GWP-ASan is a low-overhead sampling-based utility for finding
heap-use-after-frees and heap-buffer-overflows in production.
GWP-ASan is a recursive acronym: "**G**WP-ASan **W**ill **P**rovide
**A**llocation **San**ity".
## Why not just use ASan?
For many cases you **should** use [ASan](https://clang.llvm.org/docs/AddressSanitizer.html)
(e.g., on your tests). However, ASan comes with average execution slowdown of 2x
(compared to `-O2`), binary size increase of 2x, and significant memory
overhead. For these reasons, ASan is generally impractical for use in production
(other than in dedicated canaries). GWP-ASan is a minimal-overhead alternative
designed for widespread use in production.
## How to use GWP-ASan
You can enable GWP-ASan by calling `tcmalloc::MallocExtension::ActivateGuardedSampling()`.
To adjust GWP-ASan's sampling rate, see
[below](#what-should-i-set-the-sampling-rate-to).
When GWP-ASan detects a heap memory error, it prints stack traces for the point
of the memory error, as well as the points where the memory was allocated and
(if applicable) freed. These stack traces can then be
symbolized offline to get file names and line
numbers.
GWP-ASan will crash after printing stack traces.
## CPU and RAM Overhead
For guarded sampling rates above 100M (the default), CPU overhead is negligible. For sampling rates as low as 8M, CPU overhead is under 0.5%.
RAM overhead is up to 512 KB on x86\_64, or 4 MB on PowerPC.
## What should I set the sampling rate to?
`tcmalloc::MallocExtension::SetGuardedSamplingRate` sets the sampling rate for
GWP-ASan. GWP-ASan will guard allocations approximately every
`GuardedSamplingRate` bytes allocated. Thus, lower values will generally
increase the chance of finding bugs but will also have higher CPU overhead.
For applications that cannot tolerate any CPU overhead, we recommend
using TCMalloc's default sampling rate. If your application can tolerate some
CPU overhead, we recommend a sampling rate of 8MB.
## Limitations
- The current version of GWP-ASan will only find bugs in allocations of 8 KB
or less. This restriction was made to limit the CPU/RAM overhead required by
GWP-ASan.
- GWP-ASan has limited diagnostic information for buffer overflows within
alignment padding, since overflows of this type will not touch a guard
page. For write-overflows,
GWP-ASan will still be able to detect the overflow during deallocation by
checking whether magic bytes have been overwritten, but the stack trace of
the overflow itself will not be available.
## FAQs
### Does GWP-ASan report false positives?
No. GWP-ASan crashes because your program accessed unmapped memory, which is
always a true bug, or a sign of hardware failure (see below).
### How do I know a GWP-ASan report isn't caused by hardware failure?
The vast majority of GWP-ASan reports we see are true bugs, but occasionally
faulty hardware will be the actual cause of the crash. In general, if you see
the same GWP-ASan crash on multiple machines, it is very likely there's a true
software bug.
### Can GWP-ASan cause queries of death (QoD) in my production?
Since GWP-ASan finds bugs with very low probability, QoD is generally not a
concern. Even if there is a reliable way to trigger a bug, GWP-ASan will only
detect it and crash on a tiny fraction of actual occurrences, allowing the other
99.9% to continue without crashing.
## Other versions of GWP-ASan
Separate implementations of GWP-ASan exist for Chromium and Android. For
GWP-ASan for Chromium see
[here](https://chromium.googlesource.com/chromium/src/+/lkgr/docs/gwp_asan.md).
For Android, see [here](https://developer.android.com/ndk/guides/gwp-asan).

Binary file not shown.

After

Width:  |  Height:  |  Size: 16 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 57 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 47 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 50 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 44 KiB

View File

@ -0,0 +1,102 @@
# Lifetime-based Memory Allocation
TCMalloc contains an experimental feature that leverages object lifetime
information for managing memory allocations. [Temeraire](temeraire.md)'s default
allocation policy binpacks medium-sized allocations into the last hugepage
associated with a large allocation. If the large allocation is short-lived, this
can cause persistent fragmentation from long-lived medium-sized allocations that
get binpacked into this region.
The lifetime-based allocator attempts to side-step this problem by predicting
the lifetime of large allocations and allocating short-lived large objects from
a special [HugeRegion](regions-are-not-optional.md) instead. Lifetimes are
treated as binary (short, long) and are predicted based on the stack trace at
the time of allocation. While the application is running, we are recording
statistics about all large allocations that we encounter and once we have enough
samples, we use these statistics to make a prediction when we encounter that
same stack trace again. If a large allocation is predicted to be short-lived, it
is placed into a special short-lived HugeRegion, otherwise it is handled as
usual. We call this region "lifetime region".
The allocator can run in two different modes:
* **Enabled**: The allocator will execute the allocation policy described
above.
* **Counterfactual**: The allocator will execute the lifetime-based policy on
the side but not affect the actual allocation behavior. Instead, it will
collect statistics about the correctness of its decisions as well as the
size of the lifetime region had the lifetime-based allocator been enabled.
The lifetime-based allocator has one configuration parameter (T), which is the
cutoff below which an object is considered short-lived (T = 0.5s by default).
Note that setting T = infinity causes all large allocations to be placed into
the separate region.
## Lifetime Profiling
Lifetime profiling is implemented through two components:
* `LifetimeDatabase`: This component stores a dictionary of lifetime
statistics, indexed by allocation stack trace. The size of the dictionary is
limited to avoid memory blow-up. Entries are managed through a combination
of LRU and reference counting. Each entry stores the number of long-lived
(lifetime > T) and short-lived objects with this allocation stack trace that
were encountered. The lifetime database does not track lifetimes itself but
gets called from other components to 1) record lifetimes, and 2) look up
lifetime predictions for a given stack trace. The latter works by looking up
the statistics associated with that stack trace and predicting the object as
long-lived if the number of long-lived allocations emanating from this stack
trace exceeds the number of short-lived allocations by a significant margin.
* `LifetimeTracker`: A lifetime tracker is a small amount of meta-data that
can be associated with an allocation and is used to track its lifetime. The
tracker (among other information) stores a pointer to the lifetime
statistics associated with this allocation, a timestamp, and a (possibly
unused) counterfactual pointer whose purpose will be explained later in this
document. Active trackers are strung together in a linked list sorted by
allocation timestamp. The timestamp associated with the tracker at the front
of this list is checked on every operation and if the lifetime of this
object exceeds T, it is classified as long-lived. In this case, all trackers
whose lifetime exceeds T are removed from the list (i.e., their trackers
become inactive) and their associated lifetime statistics are updated to
reflect that a long-lived allocation was encountered. If an object is
deallocated before its tracker becomes inactive, its tracker is removed from
the list and a short-lived allocation is recorded.
The use of trackers differs between enabled and counterfactual mode. In enabled
mode, a tracker is associated with every large allocation that is placed in the
regular hugepage-aware allocator and results in a filler donation. This tracker
is allocated with the remaining meta-data that is already associated with any
such donation. If an object is allocated in the lifetime region, its tracker is
allocated in a special meta-data region associated with the lifetime region.
This ensures that lifetimes continue to be tracked even if the allocator has
decided to treat a particular allocation site as short-lived.
![Lifetime Tracking (enabled)](images/lifetimes-enabled.png "Lifetime tracking in enabled mode")
In counterfactual mode, no actual objects are allocated in the short-lived
region. Instead, the lifetime region is a HugeRegion that is not backed by
actual memory but otherwise executes the same logic. This means that for any
object that would have been placed in the lifetime region had it been enabled,
the real backing object is allocated in the existing hugepage-aware allocator.
In this case, the tracker will store a `counterfactual_ptr` that points towards
the address that the object would have had if it were actually allocated in the
lifetime region. Otherwise, the object is tracked just like any other object in
the hugepage-aware allocator.
![Lifetime Tracking (counterfactual)](images/lifetimes-counterfactual.png "Lifetime tracking in counterfactual mode")
## Lifetime-based Allocation
The lifetime-based allocator uses the existing HugeRegion implementation for all
objects that are predicted short-lived. Whenever a large allocation is
encountered, the current stack trace is collected, and the lifetime is looked up
in the lifetime database. In regular enabled mode, the object is placed in the
lifetime region or the regular allocator, depending on this prediction, and a
tracker is installed. In counterfactual mode, the object is always allocated in
the regular allocator and if the prediction called for the allocation to be
placed in the lifetime region, an *additional* allocation call is placed to the
lifetime region (which, in counterfactual mode, is not backed by actual memory).
In this case, the tracker's `counterfactual_ptr` is set to the address that the
object would have been allocated at, so that on deallocation, a corresponding
call can be made to the lifetime region to deallocate the object.

View File

@ -0,0 +1,99 @@
# TCMalloc Overview
TCMalloc is Google's customized implementation of C's `malloc()` and C++'s
`operator new` used for memory allocation within our C and C++ code. This custom
memory allocation framework is an alternative to the one provided by the C
standard library (on Linux usually through `glibc`) and C++ standard library.
TCMalloc is designed to be more efficient at scale than other implementations.
Specifically, TCMalloc provides the following benefits:
* Performance scales with highly parallel applications.
* Optimizations brought about with recent C++14 and C++17 standard
enhancements, and by diverging slightly from the standard where performance
benefits warrant. (These are noted within the
[TCMalloc Reference](reference.md).)
* Extensions to allow performance improvements under certain architectures,
and additional behavior such as metric gathering.
## TCMalloc Cache Operation Mode
TCMalloc may operate in one of two fashions:
* (default) per-CPU caching, where TCMalloc maintains memory caches local to
individual logical cores. Per-CPU caching is enabled when running TCMalloc
on any Linux kernel that utilizes restartable sequences (RSEQ). Support for
RSEQ was merged in Linux 4.18.
* per-thread caching, where TCMalloc maintains memory caches local to each
application thread. If RSEQ is unavailable, TCMalloc reverts to using this
legacy behavior.
NOTE: the "TC" in TCMalloc refers to Thread Caching, which was originally a
distinguishing feature of TCMalloc; the name remains as a legacy.
In both cases, these cache implementations allow TCMalloc to avoid requiring
locks for most memory allocations and deallocations.
## TCMalloc Features
TCMalloc provides APIs for dynamic memory allocation: `malloc()` using the C
API, and `::operator new` using the C++ API. TCMalloc, like most allocation
frameworks, manages this memory better than raw memory requests (such as through
`mmap()`) by providing several optimizations:
* Performing allocations from the operating system by managing
specifically-sized chunks of memory (called "pages"). Having all of these
chunks of memory the same size allows TCMalloc to simplify bookkeeping.
* Devoting separate pages (or runs of pages called "Spans" in TCMalloc) to
specific object sizes. For example, all 16-byte objects are placed within a
"Span" specifically allocated for objects of that size. Operations to get or
release memory in such cases are much simpler.
* Holding memory in *caches* to speed up access of commonly-used objects.
Holding such caches even after deallocation also helps avoid costly system
calls if such memory is later re-allocated.
The cache size can also affect performance. The larger the cache, the less any
given cache will overflow or get exhausted, and therefore require a lock to get
more memory. TCMalloc extensions allow you to modify this cache size, though the
default behavior should be preferred in most cases. For more information,
consult the [TCMalloc Tuning Guide](tuning.md).
Additionally, TCMalloc exposes telemetry about the state of the application's
heap via `MallocExtension`. This can be used for gathering profiles of the live
heap, as well as a snapshot taken near the heap's highwater mark size (a peak
heap profile).
## The TCMalloc API
TCMalloc implements the C and C++ dynamic memory API endpoints from the C11,
C++11, C++14, and C++17 standards.
From C++, this includes
* The basic `::operator new`, `::operator delete`, and array variant
functions.
* C++14's sized `::operator delete`
* C++17's overaligned `::operator new` and `::operator delete` functions.
Unlike in the standard implementations, TCMalloc does not throw an exception
when allocations fail, but instead crashes directly. Such behavior can be used
as a performance optimization for move constructors not currently marked
`noexcept`; such move operations can be allowed to fail directly due to
allocation failures. In [Abseil](https://abseil.io/docs/cpp/guides/base), these
are enabled with `-DABSL_ALLOCATOR_NOTHROW`.
From C, this includes `malloc`, `calloc`, `realloc`, and `free`.
The TCMalloc API obeys the behavior of C90 DR075 and
[DR445](http://www.open-std.org/jtc1/sc22/wg14/www/docs/summary.htm#dr_445)
which states:
> The alignment requirement still applies even if the size is too small for any
> object requiring the given alignment.
In other words, `malloc(1)` returns an `alignof(std::max_align_t)`-aligned pointer.
Based on the progress of
[N2293](http://www.open-std.org/jtc1/sc22/wg14/www/docs/n2293.htm), we may relax
this alignment in the future.
For more complete information, consult the [TCMalloc Reference](reference.md).

View File

@ -0,0 +1,75 @@
# TCMalloc Platforms
The TCMalloc code is supported on the following platforms. By "platforms", we
mean the union of operating system, architecture (e.g. little-endian vs.
big-endian), compiler, and standard library.
## Language Requirements
TCMalloc requires a code base that supports C++17 and our code is
C++17-compliant. C code is required to be compliant to C11.
We guarantee that our code will compile under the following compilation flags:
Linux:
* gcc 9.2+, clang 9.0+: `-std=c++17`
(TL;DR; All code at this time must be built under C++17. We will update this
list if circumstances change.)
## Supported Platforms
The document below lists each platform, broken down by Operating System,
Architecture, Specific Compiler, and Standard Library implementation.
### Linux
**Supported**
<table width="80%">
<col width="360">
<col width="120">
<tbody>
<tr>
<th>Operating System</th>
<th>Endianness/Word Size</th>
<th>Processor Architectures</th>
<th>Compilers*</th>
<th>Standard Libraries</th>
</tr>
<tr>
<td>Linux</td>
<td>little-endian, 64-bit</td>
<td>x86, AArch64</td>
<td>gcc 9.2+<br/>clang 9.0+</td>
<td>libstdc++<br/>libc++</td>
</tr>
</tbody>
</table>
\* We test on gcc 9.2, though gcc versions (which support C++17) prior to that
release should also work.
**Best Effort**
<table width="80%">
<col width="360">
<col width="120">
<tbody>
<tr>
<th>Operating System</th>
<th>Endianness/Word Size</th>
<th>Processor Architectures</th>
<th>Compilers*</th>
<th>Standard Libraries</th>
</tr>
<tr>
<td>Linux</td>
<td>little-endian, 64-bit</td>
<td>PPC</td>
<td>gcc 9.2+<br/>clang 9.0+</td>
<td>libstdc++<br/>libc++</td>
</tr>
</tbody>
</table>

View File

@ -0,0 +1,267 @@
# TCMalloc Quickstart
Note: this Quickstart uses Bazel as the official build system for TCMalloc,
which is supported on Linux, and compatible with most major compilers. The
TCMalloc source code assumes you are using Bazel and contains `BUILD.bazel`
files for that purpose.
This document is designed to allow you to get TCMalloc set up as your custom
allocator within a C++ development environment. We recommend that each person
starting development using TCMalloc at least run through this quick tutorial.
## Prerequisites
Running the code within this tutorial requires:
* A compatible platform (E.g. Linux). Consult the
[Platforms Guide](platforms.md) for more information.
* A compatible C++ compiler *supporting at least C++17*. Most major compilers
are supported.
* [Git](https://git-scm.com/) for interacting with the Abseil source code
repository, which is contained on [GitHub](http://github.com). To install
Git, consult the [Set Up Git](https://help.github.com/articles/set-up-git/)
guide on GitHub.
Although you are free to use your own build system, most of the documentation
within this guide will assume you are using [Bazel](https://bazel.build/),
version 4.0 or newer.
To download and install Bazel (and any of its dependencies), consult the
[Bazel Installation Guide](https://docs.bazel.build/versions/master/install.html).
## Getting the TCMalloc Code
You can obtain the TCMalloc code from its repository on GitHub:
```
# Change to the directory where you want to create the code repository
$ cd ~
$ mkdir Source; cd Source
$ git clone https://github.com/google/tcmalloc.git
Cloning into 'tcmalloc'...
remote: Total 1935 (delta 1083), reused 1935 (delta 1083)
Receiving objects: 100% (1935/1935), 1.06 MiB | 0 bytes/s, done.
Resolving deltas: 100% (1083/1083), done.
$
```
Git will create the repository within a directory named `tcmalloc`. Navigate
into this directory and run all tests:
```
$ cd tcmalloc
$ bazel test //tcmalloc/...
INFO: Analyzed 112 targets (12 packages loaded, 606 targets configured).
...
INFO: Build completed successfully, 827 total actions
$
```
Congratulations! You've installed TCMalloc.
## Running the TCMalloc Hello World
Once you've verified you have TCMalloc installed correctly, you can compile and
run the
[tcmalloc-hello](https://github.com/google/tcmalloc/blob/master/tcmalloc/testing/hello_main.cc)
sample binary to see how TCMalloc is linked into a sample binary. This tiny
project features proper configuration and a simple `hello_main` to demonstrate
how TCMalloc works.
First, build the `tcmalloc/testing:hello_main` target:
```
tcmalloc$ bazel build tcmalloc/testing:hello_main
Extracting Bazel installation...
Starting local Bazel server and connecting to it...
INFO: Analyzed target //tcmalloc/testing:hello_main (31 packages loaded ...
...
INFO: Build completed successfully, 102 total actions
PASSED in 0.1s
tcmalloc$
```
Now, run the compiled program:
```
tcmalloc$ bazel run tcmalloc/testing:hello_main
...
INFO: Found 1 target...
...
INFO: Build completed successfully, 1 total action
Current heap size = 73728 bytes
hello world!
new'd 1073741824 bytes at 0x14ea40000000
Current heap size = 1073816576 bytes
malloc'd 1073741824 bytes at 0x14eac0000000
Current heap size = 2147558400 bytes
$
```
You can inspect this code within
[`tcmalloc/testing/hello_main.cc`](https://github.com/google/tcmalloc/blob/master/tcmalloc/testing/hello_main.cc)
Happy Coding!
## Creating and Running TCMalloc
Now that you've obtained the TCMalloc code and verified that you can build,
test, and run it, you're ready to use it within your own project.
### Linking Your Code to the TCMalloc Repository
First, create (or select) a source code directory for your work. This directory
should generally not be the `tcmalloc` directory itself; instead, you will link
into that repository from your own source directory.
```
# Change to your main development directory and create a new development
# directory. (If you already have a development directory you'd wish to use,
# you can use that.)
$ cd ~/Source
$ mkdir TestProject; cd TestProject
```
Bazel allows you to link other Bazel projects using `WORKSPACE` files in the
root of your development directories. To add a link to your local TCMalloc
repository within your new project, add the following into a `WORKSPACE` file:
```
local_repository(
# Name of the TCMalloc repository. This name is defined within your
# WORKSPACE file, in its `workspace()` metadata
name = "com_google_tcmalloc",
# NOTE: Bazel paths must be absolute paths. E.g., you can't use ~/Source
path = "/PATH_TO_SOURCE/Source/tcmalloc",
)
```
The "name" in the `WORKSPACE` file identifies the name you will use in Bazel
`BUILD` files to refer to the linked repository (in this case
"com_google_tcmalloc").
Note that your path to the TCMalloc source code must be an absolute path.
### Adding Abseil
TCMalloc requires [Abseil](https://abseil.io) which you will also need to
provide as a `local_repository`, or link to a specific commit (we always
recommend the latest commit) using an `http_archive` declaration in the
`WORKSPACE` file:
<pre>
# Abseil HTTP Archive to specific commit
#
# Consult https://github.com/abseil/abseil-cpp/commits/master for the latest
# commit. But DO NOT use master.zip for that purpose. (Sha256 values are not
# stable across master versions.) Click on that specific commit.
#
# Click "Browse Files" on the commit and click on "Clone or Download Code."
#
# Right click on "Download ZIP" to copy the HTTP Archive URL, which you will
# use within the http_archive "urls" field.
#
# Note that you will need to generate a sha256 value for Bazel's http_archive
# to ensure this code is secure. On Linux you can do so with a downloaded .zip
# file using the sha256sum command line:
#
# $ sha256sum github_zip_file.zip
http_archive(
name = "com_google_absl",
urls = ["https://github.com/abseil/abseil-cpp/archive/<i>commit_value</i>.zip"],
strip_prefix = "abseil-cpp-<i>commit_value</i>",
sha256 = "<i>sha256_of_commit_value</i>",
)
</pre>
### Creating Your Test Code
Within your `TestProject` create an `examples` directory:
```
$ cd TestProject; mkdir examples; cd examples
```
Now, create a `hello_world.cc` C++ file within your `examples` directory:
```
#include <iostream>
#include <cstddef>
int main() {
std::cout << "Standard Alignment: " << alignof(std::max_align_t) << '\n';
double *ptr = (double*) malloc(sizeof(double));
std::cout << "Double Alignment: " << alignof(*ptr) << '\n';
char *ptr2 = (char*) malloc(1);
std::cout << "Char Alignment: " << alignof(*ptr2) << '\n';
void *ptr3;
std::cout << "Sizeof void*: " << sizeof(ptr3) << '\n';
return 0;
}
```
### Creating Your BUILD File
Now, create a `BUILD` file within your `examples` directory like the following:
```
cc_binary(
name = "hello_world",
srcs = ["hello_world.cc"],
malloc = "@com_google_tcmalloc//tcmalloc",
)
```
NOTE: For more information on how to create Bazel BUILD files, consult the
[Bazel Tutorial](https://docs.bazel.build/versions/master/tutorial/cpp.html).
We declare TCMalloc as our own custom allocation framework using the `malloc`
keyword and set this to the library name (`//tcmalloc`) within our `WORKSPACE`
file (`@com_google_tcmalloc`).
Build our target ("hello_world") and run it:
```
# It's often good practice to build files from the workspace root
$ cd ~/Source/TestProject
Source/TestProject$ bazel build //examples:hello_world --cxxopt='-std=c++17'
INFO: Analysed target //examples:hello_world (12 packages loaded).
INFO: Found 1 target...
Target //examples:hello_world up-to-date:
bazel-bin/examples/hello_world
INFO: Elapsed time: 0.180s, Critical Path: 0.00s
INFO: Build completed successfully, 1 total action
Source/TestProject$ bazel run //examples:hello_world
INFO: Running command line: bazel-bin/examples/hello_world
Standard Alignment: 16
Double Alignment: 8
Char Alignment: 1
Sizeof void*: 8
Source/TestProject$
```
Note that we passed `--cxxopt='-std=c++17'` to build using C++17. Instead of
passing this flag you can add this line to your root `.bazelrc` file:
```
build --cxxopt='-std=c++17'
```
Congratulations! You've created your first binary using TCMalloc.
## What's Next
* Read our [overview](overview.md), if you haven't already. The overview
covers memory allocation concepts and best practices for using TCMalloc.
* Read through the TCMalloc [reference](reference.md) for information on the
behavior of `malloc()`, `::operator new`, and other allocation/deallocation
routines in TCMalloc.
* Consult the TCMalloc C++ `malloc_extension.h` header file, which contains
information on TCMalloc's supported extensions.
* Read our [contribution guidelines](../CONTRIBUTING.md), if you intend to
submit code to our repository.

View File

@ -0,0 +1,244 @@
# TCMalloc Basic Reference
TCMalloc provides implementations for C and C++ library memory management
routines (`malloc()`, etc.) provided within the C and C++ standard libraries.
Currently, TCMalloc requires code that conforms to the C11 C standard library
and the C++11, C++14, or C++17 C++ standard library.
NOTE: although the C API in this document is specific to the C language, the
entire TCMalloc API itself is designed to be callable directly within C++ code
(and we expect most usage to be from C++). The documentation in this section
assumes C constructs (e.g. `size_t`) though invocations using equivalent C++
constructs of aliased types (e.g. `std::size_t`) are intrinsically supported.
## C++ API
We implement the variants of `operator new` and `operator delete` from the
C++11, C++14, C++17 standards exposed within the `<new>` header file. This
includes:
* The basic `::operator new()`, `::operator delete()`, and array variant
functions.
* C++14's sized `::operator delete()`
* C++17's overaligned `::operator new()` and `::operator delete()` functions.
As required by the C++ standard, memory allocated using an aligned `operator
new` function must be deallocated with an aligned `operator delete`.
### `::operator new` / `::operator new[]`
```
void* operator new(std::size_t count);
void* operator new(std::size_t count, const std::nothrow_t& tag) noexcept;
void* operator new(std::size_t count, std::align_val_t al); // C++17
void* operator new(std::size_t count,
std::align_val_t al, const std::nothrow_t&) noexcept; // C++17
void* operator new[](std::size_t count);
void* operator new[](std::size_t count, const std::nothrow_t& tag) noexcept;
void* operator new[](std::size_t count, std::align_val_t al); // C++17
void* operator new[](std::size_t count,
std::align_val_t al, const std::nothrow_t&) noexcept; // C++17
```
`operator new`/`operator new[]` allocates `count` bytes. They may be invoked
directly but are more commonly invoked as part of a *new*-expression.
When `__STDCPP_DEFAULT_NEW_ALIGNMENT__` is not specified (or is larger than 8
bytes), we use standard 16 byte alignments for `::operator new` without a
`std::align_val_t` argument. However, for allocations under 16 bytes, we may
return an object with a lower alignment, as no object with a larger alignment
requirement can be allocated in the space. When compiled with
`__STDCPP_DEFAULT_NEW_ALIGNMENT__ <= 8`, we use a set of sizes aligned to 8
bytes for raw storage allocated with `::operator new`.
NOTE: On many platforms, the value of `__STDCPP_DEFAULT_NEW_ALIGNMENT__` can be
configured by the `-fnew-alignment=...` flag.
The `std::align_val_t` variants provide storage suitably aligned to the
requested alignment.
If the allocation is unsuccessful, a failure terminates the program.
NOTE: unlike in the C++ standard, we do not throw an exception in case of
allocation failure, or invoke `std::get_new_handler()` repeatedly in an attempt
to successfully allocate, but instead crash directly. Such behavior can be used
as a performance optimization for move constructors not currently marked
`noexcept`; such move operations can be allowed to fail directly due to
allocation failures. Within Abseil code, these direct allocation failures are
enabled with the Abseil build-time configuration macro
[`ABSL_ALLOCATOR_NOTHROW`](https://abseil.io/docs/cpp/guides/base#abseil-exception-policy).
If the `std::nothrow_t` variant is utilized, upon failure, `::operator new`
will return `nullptr` instead.
### `::operator delete` / `::operator delete[]`
```
void operator delete(void* ptr) noexcept;
void operator delete(void* ptr, std::size_t sz) noexcept;
void operator delete(void* ptr, std::align_val_t al) noexcept;
void operator delete(void* ptr, std::size_t sz,
std::align_val_t al) noexcept;
void operator delete[](void* ptr) noexcept;
void operator delete[](void* ptr, std::size_t sz) noexcept; // C++14
void operator delete[](void* ptr, std::align_val_t al) noexcept; // C++17
void operator delete[](void* ptr, std::size_t sz,
std::align_val_t al) noexcept; // C++17
```
`::operator delete`/`::operator delete[]` deallocate memory previously allocated
by a corresponding `::operator new`/`::operator new[]` call respectively. It is
commonly invoked as part of a *delete*-expression.
Sized delete is used as a critical performance optimization, eliminating the
need to perform a costly pointer-to-size lookup.
### Extensions
We also expose a prototype of
[P0901](http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2019/p0901r5.html) in
https://github.com/google/tcmalloc/blob/master/tcmalloc/malloc_extension.h with
`tcmalloc_size_returning_operator_new()`. This returns both memory and the size
of the allocation in bytes. It can be freed with `::operator delete`.
## C API
The C standard library specifies the API for dynamic memory management within
the `<stdlib.h>` header file. Implementations require C11 or greater.
TCMalloc provides implementation for the following C API functions:
* `malloc()`
* `calloc()`
* `realloc()`
* `free()`
* `aligned_alloc()`
For `malloc`, `calloc`, and `realloc`, we obey the behavior of C90 DR075 and
[DR445](http://www.open-std.org/jtc1/sc22/wg14/www/docs/summary.htm#dr_445)
which states:
> The alignment requirement still applies even if the size is too small for any
> object requiring the given alignment.
In other words, `malloc(1)` returns an `alignof(std::max_align_t)`-aligned pointer.
Based on the progress of
[N2293](http://www.open-std.org/jtc1/sc22/wg14/www/docs/n2293.htm), we may relax
this alignment in the future.
Additionally, TCMalloc provides an implementation for the following POSIX
standard library function, available within glibc:
* `posix_memalign()`
TCMalloc also provides implementations for the following obsolete functions
typically provided within libc implementations:
* `cfree()`
* `memalign()`
* `valloc()`
* `pvalloc()`
Documentation is not provided for these obsolete functions. The implementations
are provided only for compatibility purposes.
### `malloc()`
```
void* malloc(size_t size);
```
`malloc` allocates `size` bytes of memory and returns a `void *` pointer to the
start of that memory.
`malloc(0)` returns a non-NULL zero-sized pointer. (Attempting to access memory
at this location is undefined.) If `malloc()` fails for some reason, it returns
NULL.
### `calloc()`
```
void* calloc(size_t num, size_t size);
```
`calloc()` allocates memory for an array of objects, zero-initializes all bytes
in allocated storage, and if allocation succeeds, returns a pointer to the first
byte in the allocated memory block.
`calloc(num, 0)` or `calloc(0, size)` returns a non-NULL zero-sized pointer.
(Attempting to access memory at this location is undefined.) If `calloc()` fails
for some reason, it returns NULL.
### `realloc()`
```
void* realloc(void *ptr, size_t new_size);
```
`realloc()` re-allocates memory for an existing region of memory by either
expanding or contracting the memory based on the passed `new_size` in bytes,
returning a `void*` pointer to the start of that memory (which may not change);
it does not perform any initialization of new areas of memory.
`realloc(OBJ*, 0)` returns a NULL pointer. If `realloc()` fails for some reason,
it also returns NULL.
### `aligned_alloc()`
```
void* aligned_alloc(size_t alignment, size_t size);
```
`aligned_alloc()` allocates `size` bytes of memory with alignment of size
`alignment` and returns a `void *` pointer to the start of that memory; it does
not perform any initialization.
The `size` parameter must be an integral multiple of `alignment` and `alignment`
must be a power of two. If either of these cases is not satisfied,
`aligned_alloc()` will fail and return a NULL pointer.
`aligned_alloc` with `size=0` returns a non-NULL zero-sized pointer. (Attempting
to access memory at this location is undefined.)
### `posix_memalign()`
```
int posix_memalign(void **memptr, size_t alignment, size_t size);
```
`posix_memalign()`, like `aligned_alloc()`, allocates `size` bytes of memory with
alignment of size `alignment` and stores a pointer to the start of that memory in `*memptr`;
it does not perform any initialization. This pointer can be cast to the desired
type of data pointer in order to be dereferenceable. If the alignment allocation
succeeds, `posix_memalign()` returns `0`; otherwise it returns an error value.
`posix_memalign` is similar to `aligned_alloc()` but `alignment` must be a power of
two multiple of `sizeof(void *)`. If the constraints are not satisfied,
`posix_memalign()` will fail.
`posix_memalign` with `size=0` returns a non-NULL zero-sized pointer.
(Attempting to access memory at this location is undefined.)
### `free()`
```
void free(void* ptr);
```
`free()` deallocates memory previously allocated by `malloc()`, `calloc()`,
`aligned_alloc()`, `posix_memalign()`, or `realloc()`. If `free()` is passed a
null pointer, the function does nothing.
### Extensions
These are contained in
https://github.com/google/tcmalloc/blob/master/tcmalloc/malloc_extension.h.
* `nallocx(size_t size, int flags)` - Returns the number of bytes that would
be allocated by `malloc(size)`, subject to the alignment specified in
`flags`.
* `sdallocx(void* ptr, size_t size, int flags)` - Deallocates memory allocated
by `malloc` or `memalign`. It takes a size parameter to pass the original
allocation size, improving deallocation performance.

View File

@ -0,0 +1,154 @@
# Regions Are Not Optional!
Andrew Hunter
Discussion on the design of [Temeraire](temeraire.md) posited that `HugeRegion`
is a weird/complex feature that possibly is a premature optimization.
`HugeRegion` is neither optional, nor really all that complex. We claim this is
actually a fairly simple approach that fixes what would otherwise be a very
serious flaw.
This expands on the description of `HugeRegion` in the main design doc.
## Our Trilemma
`HugeRegion` exists because of three key framing requirements for a
Temeraire-enabled TCMalloc:
1. We must support allocations of any (reasonable) size, and in particular a
heap composed of any set of reasonable sizes in any ratio; "sorry, tcmalloc
detonates if you mostly use requests of size X" is not acceptable.
1. We must be able to back (most, ideally all) of our heap with hugepages.
1. We would like to tightly bound global space overhead[^1] on our heap.
Consider requests R<sub>i</sub> that are larger than a hugepage, but small
enough that the rounding error from extending to a hugepage boundary is
significant by (3). (Note that rounding up to a hugepage boundary would
introduce a significant amount of overhead for allocations between 1 and 10
hugepages, and the overhead could still be considered significant for
allocations larger than that.)
* We *cannot* unback the unused tail of the last hugepage (requirement (2)
would be violated).
* We *cannot* assume these requests are necessarily rare and we will have many
smaller ones to fill the unused tail (requirement (1) would be violated).
Moreover this is **empirically false** for widely used
binaries.
In summary, we must be able to use the unused tail of a hugepage from one
R<sub>i</sub> as space for another large R<sub>j</sub>. If we do not enable such
usage in our allocator, we will either potentially have space overhead of up to
100%, or dramatically reduce our hugepage usage. The conclusion we came to is
that we **must support**, in some form, allocating multiple such R<sub>i</sub>
contiguously; that is, using the unused tail from R<sub>1</sub> as the beginning
of R<sub>2</sub> and so on.
**This is all `HugeRegion{,Set}` does.**
## The "Simple" Truth
The above argument is why we have `HugeRegion`: we need a way to allocate
multiple large (>1 hugepage) allocations on overlapping hugepages. So how can we
do that? Clearly, we need some range of hugepages, large enough for several such
R<sub>i</sub>, from which we allocate. What should we do in that space? A
best-fit algorithm that tracks the free lengths seems appropriate.
As allocations become free, it seems reasonable (by requirement (3) above) that
we unback empty hugepages.
Finally, what happens if the range we allocated is full? We could do two
things
1. extend it
1. obtain a new one and do allocations from there as needed.
(1) is an interesting choice, but not actually possible with the `SysAllocator`
interface. We might get lucky with `sbrk` (or even `mmap`, though it is less
likely) placement choice, but we also might not; we cannot rely on it. So we
must be able to fall back to (2) anyway, and given that there are very few
disadvantages to having multiple such ranges (we won't need very many in any
case), why not just only do that?
It should not be surprising that we have just described the algorithm
`HugeRegion{,Set}` uses: inside some fixed-size range, do best-fit allocation
for large allocations, backing and unbacking hugepages on demand. When one
region fills, obtain another; fill from the most fragmented to bound total
overhead (a policy derived from `HugePageFiller`).
That is *really it*. We do not see this as particularly complicated. The only
thing left is the implementation of that policy: We used `RangeTracker` because
it was convenient, supported exactly the API we needed, and fast enough (even
though we're tracking fairly large bitsets).
## But what about...
There are some reasonable objections to particular details, which we are happy
to address.
### Why are regions so big?
Because it worked. Virtual address space is virtually free. :) We can easily
justify why they aren't 32 MiB (our original choice, as it happens):
[Temeraire](temeraire.md) contains a simple argument, it is trivial to waste a
full hugepage per region, and this scales down nicely with increasing region
size. Why did we go to a gigabyte? Because it worked. :) It had an added
advantage: even large binaries would only use a handful of regions, and thus
walking the list was cheap and we could print a lot of info about each in
mallocz.
We've run more tests; 128 MiB and 512 MiB both perform reasonably, but this
isn't a compelling reason to change the size. We don't really support VSS limits
(and in practice we don't have them, outside badly behaved sandbox programs and
some daemons that use `SMALL_BUT_SLOW` anyway, which we're not currently
changing).
### How did we pick the current policy for what goes to regions?
Because it worked. The arguments above make it clear that anything larger than
one hugepage and smaller than &lt;some value we can agree is many&gt; hugepages
must go there. It seemed reasonable to allow slightly smaller ones to slip into
the region if we had space and it was needed; we saw no reason not to allow
many-hugepage allocations there if they fit. In practice, this seems to work
well. There really isn't more thought than that.
### Can't we fix binaries with problematic allocation patterns?
Yes, we can. We probably should. It'd be good to do anyway. However: doing so
doesn't stop us from needing Regions:
* Changing workloads takes a long time.
* We cannot successfully change all the programs that make any significant
use of allocations &gt;2 MiB and less than (say) 50 MiB. We cannot tell
users "Eh, no, tcmalloc does terribly if you allocate a couple megabytes at
a time?" Requirement (1) above is our expression of how we don't think
that's reasonable at all: we should be able to handle 3 MiB allocations without
embarrassing ourselves.
Recall that the trilemma leading to regions applies for **anything more than 2
MiB which we can't just ignore the tail on**. It's easiest to show the potential
huge problems with the canonical "2.1 MiB" allocation, but 5 MiB or 6.1 MiB or
even 10.1 MiB allocations, if they're a significant component of heap usage,
will lead to unacceptable overhead without `HugeRegion`, and we don't think we
can say "don't do that."
## Conclusion
`HugeRegion` is the simplest possible solution we've found to a pressing problem
in a hugepage-oriented allocator. When you read the [design doc](temeraire.md),
please don't assume that HugeRegion is a speculative fix for a potential
problem, that we might not need, nor that it's a roughed out attempt. This is a
key part of the algorithm, and one we've thought a lot about the best fix for.
We don't claim it is perfect and must surely have hit on the best fix, but
"nothing" is not an acceptable solution. This gets reasonable space performance
with badly sized allocations.
**In short, `HugeRegion` is neither optional nor particularly complex. Having it
produces dramatic savings in a number of realistic scenarios, and costs us very
little.**
## Notes
[^1]: What our designed bound of overhead is...a very interesting question.
Different places accept different forms of overhead. While we could target
the current overhead, we can and must do better than this. One goal of
Temeraire is to dramatically cut this (in the pageheap).

View File

@ -0,0 +1,424 @@
# Restartable Sequence Mechanism for TCMalloc
<!--*
# Document freshness: For more information, see go/fresh-source.
freshness: { owner: 'ckennelly' reviewed: '2022-12-14' }
*-->
## per-CPU Caches
TCMalloc implements its per-CPU caches using restartable sequences (`man
rseq(2)`) on Linux. This kernel feature was developed by
[Paul Turner and Andrew Hunter at Google](http://www.linuxplumbersconf.net/2013/ocw//system/presentations/1695/original/LPC%20-%20PerCpu%20Atomics.pdf)
and Mathieu Desnoyers at EfficiOS. Restartable sequences let us execute a region
to completion (atomically with respect to other threads on the same CPU) or to
be aborted if interrupted by the kernel by preemption, interrupts, or signal
handling.
Choosing to restart on migration across cores or preemption allows us to
optimize the common case - we stay on the same core - by avoiding atomics, over
the more rare case - we are actually preempted. As a consequence of this
tradeoff, we need to make our code paths actually support being restarted. The
entire sequence, except for its final store to memory which *commits* the
change, must be capable of starting over.
This carries a few implementation challenges:
* We need fine-grained control over the generated assembly, to ensure stores
are not reordered in unsuitable ways.
* The restart sequence is triggered if the kernel detects a context switch
occurred with the PC in the restartable sequence code. If this happens,
instead of restarting at this PC, it restarts the thread at an abort
sequence; the abort sequence determines the interrupted restartable
sequence, and then returns control to the entry point of this sequence.
We must preserve adequate state to successfully restart the code sequence.
In particular, we must preserve the function parameters so that we can
restart the sequence with the same conditions; next we must reload any
parameters like the CPU ID, and recompute any necessary values.
## Structure of the `TcmallocSlab`
In per-CPU mode, we allocate an array of `N` `TcmallocSlab::Slabs`. For all
operations, we index into the array with the logical CPU ID.
Each slab has a header region of control data (one 8-byte header per-size
class). These index into the remainder of the slab, which contains pointers to
free listed objects.
![Memory layout of per-cpu data structures](images/per-cpu-cache-internals.png "Memory layout of per-cpu data structures")
In
[C++ code](https://github.com/google/tcmalloc/blob/master/tcmalloc/internal/percpu_tcmalloc.h),
these are represented as:
```
struct Slabs {
std::atomic<int64_t> header[NumClasses];
void* mem[((1ul << Shift) - sizeof(header)) / sizeof(void*)];
};
// Slab header (packed, atomically updated 64-bit).
// All {begin, current, end} values are pointer offsets from per-CPU region
// start. The slot array is in [begin, end), and the occupied slots are in
// [begin, current).
struct Header {
// The end offset of the currently occupied slots.
uint16_t current;
// Copy of end. Updated by Shrink/Grow, but is not overwritten by Drain.
uint16_t end_copy;
// Lock updates only begin and end with a 32-bit write.
// The begin offset of the slot array for this size class.
uint16_t begin;
// The end offset of the slot array for this size class.
uint16_t end;
// Lock is used by Drain to stop concurrent mutations of the Header.
// Lock sets begin to 0xffff and end to 0, which makes Push and Pop fail
// regardless of current value.
bool IsLocked() const;
void Lock();
};
```
The atomic `header` allows us to read the state (esp. for telemetry purposes) of
a core without undefined behavior.
The fields in `Header` are indexed in `sizeof(void*)` strides into the slab. For
the default value of `Shift=18`, this allows us to cache nearly 32K objects per
CPU. Ongoing work encodes `Slabs*` and `Shift` into a single pointer, allowing
it to be dynamically updated at runtime.
We have allocated capacity for `end-begin` objects for a given size-class.
`begin` is chosen via static partitioning at initialization time. `end` is
chosen dynamically at a higher-level (in `tcmalloc::CPUCache`), as to:
* Avoid running into the next size-classes' `begin`
* Balance cached object capacity across size-classes, according to the
specified byte limit.
## Usage: Allocation
As the first operation, we can look at allocation, which needs to read the
pointer at index `current-1`, return that object, and decrement `current`.
Decrementing `current` is the *commit* operation.
In pseudo-C++, this looks like:
```
void* TcmallocSlab_Pop(
void *slabs,
size_t size_class,
UnderflowHandler underflow_handler) {
// Expanded START_RSEQ macro...
restart:
__rseq_abi.rseq_cs = &__rseq_cs_TcmallocSlab_Pop;
start:
// Actual sequence
uint64_t cpu_id = __rseq_abi.cpu_id;
Header* hdr = &slabs[cpu_id].header[size_class];
uint64_t current = hdr->current;
uint64_t begin = hdr->begin;
if (ABSL_PREDICT_FALSE(current <= begin)) {
goto underflow;
}
void* next = *(&slabs[cpu_id] + current * sizeof(void*) - 2 * sizeof(void*));
prefetcht0(next);
void* ret = *(&slabs[cpu_id] + current * sizeof(void*) - sizeof(void*));
--current;
hdr->current = current;
commit:
return ret;
underflow:
return underflow_handler(cpu_id, size_class);
}
// This is implemented in assembly, but for exposition.
ABSL_CONST_INIT kernel_rseq_cs __rseq_cs_TcmallocSlab_Pop = {
.version = 0,
.flags = 0,
.start_ip = &&start,
.post_commit_offset = &&commit - &&start,
.abort_ip = &&abort,
};
```
`__rseq_cs_TcmallocSlab_Pop` is a read-only data structure, which contains
metadata about this particular restartable sequence. When the kernel preempts
the current thread, it examines this data structure. If the current instruction
pointer is between `[start, commit)`, it returns control to a specified,
per-sequence restart handler at `abort`.
Since the *next* object is frequently allocated soon after the current object,
the allocation path prefetches the pointed-to object. To avoid prefetching a
wild address, we populate `slabs[cpu][begin]` for each CPU/size-class with a
pointer-to-self.
This sequence terminates with the *single* committing store to `hdr->current`.
If we are migrated or otherwise interrupted, we restart the preparatory steps,
as the values of `cpu_id`, `current`, `begin` may have changed.
As these operations work on a single core's data and are executed on that core,
from a memory ordering perspective, loads and stores only need to appear on that
core in program order.
### Restart Handling
The `abort` label is distinct from `restart`. The `rseq` API provided by the
kernel (see below) requires a "signature" (typically an intentionally invalid
opcode) in the 4 bytes prior to the restart handler. We form a small
trampoline - properly signed - to jump back to `restart`.
In x86 assembly, this looks like:
```
// Encode nop with RSEQ_SIGNATURE in its padding.
.byte 0x0f, 0x1f, 0x05
.long RSEQ_SIGNATURE
.local TcmallocSlab_Push_trampoline
.type TcmallocSlab_Push_trampoline,@function
TcmallocSlab_Push_trampoline:
abort:
jmp restart
```
This ensures that the 4 bytes prior to `abort` match up with the signature that
was configured with the `rseq` syscall.
On x86, we can represent this with a nop which would allow for interleaving in
the main implementation. On other platforms - with fixed width instructions -
the signature is often chosen to be an illegal/trap instruction, so it has to be
disjoint from the function's body.
## Usage: Deallocation
Deallocation uses two stores, one to store the deallocated object and another to
update `current`. This is still compatible with the restartable sequence
technique, as there is a *single* commit step, updating `current`. Any preempted
sequences will overwrite the value of the deallocated object until a successful
sequence commits it by updating `current`.
```
int TcmallocSlab_Push(
void *slabs,
size_t size_class,
void* item,
OverflowHandler overflow_handler) {
// Expanded START_RSEQ macro...
restart:
__rseq_abi.rseq_cs = &__rseq_cs_TcmallocSlab_Push;
start:
// Actual sequence
uint64_t cpu_id = __rseq_abi.cpu_id;
Header* hdr = &slabs[cpu_id].header[size_class];
uint64_t current = hdr->current;
uint64_t end = hdr->end;
if (ABSL_PREDICT_FALSE(current >= end)) {
goto overflow;
}
*(&slabs[cpu_id] + current * sizeof(void*)) = item;
current++;
hdr->current = current;
commit:
return;
overflow:
return overflow_handler(cpu_id, size_class, item);
}
```
## Initialization of the Slab
To reduce metadata demands, we lazily initialize the slabs, relying on the
kernel to provide zeroed pages from the `mmap` call to obtain memory for the
slab metadata.
At startup, this leaves each `Header` initialized to `current = begin =
end = 0`. The initial push or pop will trigger the overflow or underflow paths
(respectively), so that we can populate these values.
## More Complex Operations: Batches
When the cache underflows or overflows, we populate or remove a full batch of objects
obtained from inner caches. This amortizes some of the lock acquisition/logic
for those caches. Using a similar approach to push and pop, we read/write a
batch of `N` items and we update `current` to commit the operation.
## Kernel API and implementation
This section contains notes on the rseq API provided by the kernel, which is not
well documented, and code pointers for how it is implemented.
The `rseq` syscall is implemented by
[`sys_rseq`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L304-L366).
It starts by
[handling](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L312-L328)
the case where the thread wants to unregister, implementing that by clearing the
[rseq information](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/include/linux/sched.h#L1188-L1189)
out of the `task_struct` for the thread running
[on the current CPU](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/arch/x86/include/asm/current.h#L11-L18).
It then moves on to
[return an error](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L333-L345)
if the thread is already registered for rseq. Then it
[validates](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L347-L355)
and
[saves](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L356-L357)
the input from the user, and
[sets](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L358-L363)
the
[`TIF_NOTIFY_RESUME` flag](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/include/linux/sched.h#L2044-L2048)
for the thread.
### Restarts
Among other things, the user's input to the `rseq` syscall is used by
`rseq_ip_fixup` to
[decide](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L232-L238)
whether we're in a critical section and if so
[restart](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L247)
at the abort point. That function is
[called](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L271)
by `__rseq_handle_notify_resume`, which is
[documented](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L251-L261)
as needing to be called after preemption or signal delivery before returning to
the user. That in turn is called by
[`rseq_handle_notify_resume`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/include/linux/sched.h#L2052-L2057),
a simple wrapper that bails if rseq is not enabled for the thread.
Here is one path that causes us to wind up here on x86:
* [`rseq_signal_deliver`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/include/linux/sched.h#L2065)
* [`setup_rt_frame`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/arch/x86/kernel/signal.c#L690-L691)
* [`handle_signal`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/arch/x86/kernel/signal.c#L746)
* [`arch_do_signal_or_restart`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/arch/x86/kernel/signal.c#L812-L813)
* [`handle_signal_work`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/entry/common.c#L147)
* [`exit_to_user_mode_loop`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/entry/common.c#L171)
* [`exit_to_user_mode_prepare`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/entry/common.c#L208)
So the choke point is the code that returns to user space. Here are some notes
on how the restart logic varies based on user input:
* `rseq_ip_fixup`
[calls](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L228)
`rseq_get_rseq_cs` every time. That means it
[reads](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L123-L124)
the
[pointer](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/include/uapi/linux/rseq.h#L91-L124)
to `struct rseq_cs` and then
[indirects](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L131-L133)
through it fresh from user memory each time. It
[checks](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L135-L145)
for invalid cases (which
[cause](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L278-L280)
a segfault for the user process) and then does
[validation](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L147-L157)
of the abort IP signature discussed below.
* Signature validation: from
[the code](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L147-L157)
linked above we can see that the requirement is that the abort handler
specified by `rseq_cs::abort_ip` be preceded by a 32-bit magic integer that
[matches](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L152)
the one originally provided to and
[saved by](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L357)
the `rseq` syscall.
The intent is to avoid turning buffer overflows into arbitrary code
execution: if an attacker can write into memory then they can control
`rseq_cs::abort_ip`, which is kind of like writing a jump instruction into
memory, which can be seen as breaking
[W^X](https://en.wikipedia.org/wiki/W%5EX) protections. Instead the kernel
has the caller pre-register a magic value from the executable memory that
they want to run, under the assumption that an attacker is unlikely to be
able to find other usable "gadgets" in executable memory that happen to be
preceded by that value.
It's also worth noting that signals and preemption always
[result in](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L238-L242)
[clearing](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L197-L210)
`rseq::rseq_cs::ptr64` from user space memory on the way out, except in error
cases that cause a segfault.
### CPU IDs
The other thing `rseq.c` takes care of is writing CPU IDs to user space memory.
There are two fields in user space that get this information:
[`rseq::cpu_id_start`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/include/uapi/linux/rseq.h#L63-L75)
and
[`rseq::cpu_id`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/include/uapi/linux/rseq.h#L76-L90).
The difference between the two is that `cpu_id_start` is always in range,
whereas `cpu_id` may contain error values. The kernel provides both in order to
support computation of values derived from the CPU ID that happens before
entering the critical section. We could do this with one CPU ID, but it would
require an extra branch to distinguish "not initialized" from "CPU ID changed
after fetching it". On the other hand if (like tcmalloc) you only fetch the CPU
ID within a critical section, then you need only one field because you have only
one branch: am I initialized. There is no such thing as a CPU mismatch because
instead you are just restarted when the CPU ID changes.
The two CPU ID fields are maintained as follows:
* [`rseq_update_cpu_id`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L84-L94)
writes a CPU ID into each. This is
[called](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L274-L275)
by `__rseq_handle_notify_resume`, which is discussed above.
* [`rseq_reset_rseq_cpu_id`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L96-L113)
sets the `cpu_id_start` field to zero and the `cpu_id` field to
[`RSEQ_CPU_ID_UNINITIALIZED`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/include/uapi/linux/rseq.h#L17)
(an out of range value). It is
[called](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L322)
in the unregister path discussed above.
## Cross-CPU Operations
With restartable sequences, we've optimized the fast path for same-CPU
operations at the expense of costlier cross-CPU operations. Cross-CPU operations
are rare&mdash;typically done only to facilitate periodic drains of idle
caches&mdash;so this is a desirable tradeoff.
Cross-CPU operations rely on operating system assistance (wrapped in
`tcmalloc::tcmalloc_internal::subtle::percpu::FenceCpu`) to interrupt any
running restartable sequences on the remote core. When control is returned to
the thread running on that core, we have guaranteed that either the restartable
sequence that was running has completed *or* that the restartable sequence was
preempted.
We use preemption and "locks" (`TcmallocSlab::Header::Lock`) to ensure that
during a particular period, all accesses to the fast path will fail&mdash;the
cache is both simultaneously "full" and "empty" so all inserts and removes will
go to the slow path. Unlike using `sched_setaffinity` to run on a remote core, this
approach allows us to perform longer operations, such as taking elements from
the cache and inserting them into the `TransferCache` as part of `Drain`, while
still maintaining correctness.
Since we are using relaxed loads and stores, potentially with word-level
granularity, our operations need to potentially store part of the needed data to
`Header`, fence, and then write additional fields. For example, at the end of
`Drain`, we:
* Store `hdr.current`. `hdr.begin = 0xFFFF` and `hdr.end = 0x0`, ensuring
insert and remove operations continue to fail.
* `FenceCpu`
* Store `hdr.begin` and `hdr.end` to their proper values.
This sequence ensures that a thread running on the remote core can only see one
of:
* `hdr.current = X`; `hdr.begin = 0xFFFF`; `hdr.end = 0x0`
* `hdr.current = Y`; `hdr.begin = 0xFFFF`; `hdr.end = 0x0`
* `hdr.current = Y`; `hdr.begin = Y`; `hdr.end = Y`
`FenceCpu` ensures that after it completes, no thread can see `current=X` any
longer.
If we did a single store or omitted the intervening fence operation, a thread on
the remote core could potentially see `hdr.begin = Y < hdr.current = X` and
attempt to remove an element from the cache. (This failure would lead to data
corruption as the element had already been "deallocated" to the `TransferCache`,
essentially triggering a double-free.)

View File

@ -0,0 +1,64 @@
# How sampling in TCMalloc works.
## Introduction
TCMalloc uses sampling to get representative data on memory usage and
allocation. How this works is not well documented. This doc attempts to at least
partially fix this.
## Sampling
We chose to sample an allocation every N bytes where N is a random value using
[Sampler::PickNextSamplingPoint()](https://github.com/google/tcmalloc/blob/master/tcmalloc/sampler.cc)
with a mean set by the profile sample rate using
[MallocExtension::SetProfileSamplingRate()](https://github.com/google/tcmalloc/blob/master/tcmalloc/malloc_extension.h).
By default this is every 2MiB.
## How We Sample Allocations
When we pick an allocation such as
[Sampler::RecordAllocationSlow()](https://github.com/google/tcmalloc/blob/master/tcmalloc/sampler.cc)
to sample we do some additional processing around that allocation using
[SampleifyAllocation()](https://github.com/google/tcmalloc/blob/master/tcmalloc/allocation_sampling.h) -
recording stack, alignment, request size, and allocation size. Then we go
through all the active samplers using
[ReportMalloc()](https://github.com/google/tcmalloc/blob/master/tcmalloc/allocation_sample.h)
and tell them about the allocation. We also tell the span that we're sampling
it - we can do this because we do sampling at tcmalloc page sizes, so each
sample corresponds to a particular page in the pagemap.
## How We Free Sampled Objects
Each sampled allocation is tagged. So we can quickly test whether a particular
allocation might be a sample.
When we are done with the sampled span we release it using
[tcmalloc::Span::Unsample()](https://github.com/google/tcmalloc/blob/master/tcmalloc/span.cc).
## How Do We Handle Heap and Fragmentation Profiling
To handle heap and fragmentation profiling we just need to traverse the list of
sampled objects and compute either their degree of fragmentation, or the amount
of heap they consume.
## How Do We Handle Allocation Profiling
Allocation profiling reports a list of sampled allocations during a length of
time. We start an allocation profile using
[MallocExtension::StartAllocationProfiling()](https://github.com/google/tcmalloc/blob/master/tcmalloc/malloc_extension.h),
then wait until time has elapsed, then call `Stop` on the token and report the
profile.
While the allocation sampler is active it is added to the list of samplers for
allocations and removed from the list when it is claimed.
## How Do We Handle Lifetime Profiling
Lifetime profiling reports a list of object lifetimes as pairs of allocation and
deallocation records. Profiling is initiated by calling
[MallocExtension::StartLifetimeProfiling()](https://github.com/google/tcmalloc/blob/master/tcmalloc/malloc_extension.h).
Profiling continues until `Stop` is invoked on the token. Lifetimes are only
reported for objects where allocation *and* deallocation are observed while
profiling is active. A description of the sampling based lifetime profiler can
be found in Section 4 of
["Learning-based Memory Allocation for C++ Server Workloads, ASPLOS 2020"](https://research.google/pubs/pub49008/).

View File

@ -0,0 +1,961 @@
# Understanding Malloc Stats
## Getting Malloc Stats
Human-readable statistics can be obtained by calling
`tcmalloc::MallocExtension::GetStats()`.
## Understanding Malloc Stats Output
### It's A Lot Of Information
The output contains a lot of information. Much of it can be considered debug
info that's interesting to folks who are passingly familiar with the internals
of TCMalloc, but potentially not that useful for most people.
### Summary Section
The most generally useful section is the first few lines:
```
See https://github.com/google/tcmalloc/tree/master/docs/stats.md for an explanation of this page
------------------------------------------------
MALLOC: 10858234672 (10355.2 MiB) Bytes in use by application
MALLOC: + 827129856 ( 788.8 MiB) Bytes in page heap freelist
MALLOC: + 386098400 ( 368.2 MiB) Bytes in central cache freelist
MALLOC: + 105330688 ( 100.5 MiB) Bytes in per-CPU cache freelist
MALLOC: + 9095680 ( 8.7 MiB) Bytes in transfer cache freelist
MALLOC: + 660976 ( 0.6 MiB) Bytes in thread cache freelists
MALLOC: + 49333930 ( 47.0 MiB) Bytes in malloc metadata
MALLOC: + 629440 ( 0.6 MiB) Bytes in malloc metadata Arena unallocated
MALLOC: + 1599704 ( 1.5 MiB) Bytes in malloc metadata Arena unavailable
MALLOC: ------------
MALLOC: = 12238113346 (11671.2 MiB) Actual memory used (physical + swap)
MALLOC: + 704643072 ( 672.0 MiB) Bytes released to OS (aka unmapped)
MALLOC: ------------
MALLOC: = 12942756418 (12343.2 MiB) Virtual address space used
```
* **Bytes in use by application:** Number of bytes that the application is
actively using to hold data. This is computed by the bytes requested from
the OS minus any bytes that are held in caches and other internal data
structures.
* **Bytes in page heap freelist:** The pageheap is a structure that holds
memory ready for TCMalloc to use. This memory is not actively being used,
and could be returned to the OS. [See TCMalloc tuning](tuning.md)
* **Bytes in central cache freelist:** This is the amount of memory currently
held in the central freelist. This is a structure that holds partially used
"[spans](#more-detail-on-metadata)" of memory. The spans are partially used
because some memory has been allocated from them, but not entirely used -
since they have some free memory on them.
* **Bytes in per-CPU cache freelist:** In per-cpu mode (which is the default)
each CPU holds some memory ready to quickly hand to the application. The
maximum size of this per-cpu cache is tunable.
[See TCMalloc tuning](tuning.md)
* **Bytes in transfer cache freelist:** The transfer cache can be considered
another part of the central freelist. It holds memory that is ready to be
provided to the application for use.
* **Bytes in thread cache freelists:** The TC in TCMalloc stands for thread
cache. Originally each thread held its own cache of memory to provide to the
application. Since the change of default to per-cpu caches, the thread
caches are used by very few applications. However, TCMalloc starts in
per-thread mode, so there may be some memory left in per-thread caches from
before it switches into per-cpu mode.
* **Bytes in malloc metadata:** the size of the data structures used for
tracking memory allocation. This will grow as the amount of memory used
grows.
* **Bytes in malloc metadata Arena unallocated:** Metadata is allocated in an
internal Arena. Memory requests to the OS are made in blocks which amortize
several Arena allocations and this captures memory that is not yet allocated
but could be by future Arena allocations.
* **Bytes in malloc metadata Arena unavailable:** The Arena allocator may fail
to allocate a block fully when a subsequent Arena allocation request is made
that is larger than the block's remaining space. This memory is currently
unavailable for allocation.
There are a couple of summary lines:
* **Actual memory used:** This is the total amount of memory that TCMalloc
thinks it is using in the various categories. This is computed from the size
of the various areas; the actual contribution to RSS may be larger or
smaller than this value. The true RSS may be less if memory is not mapped
in. In some cases RSS can be larger if small regions end up being mapped
with huge pages. This does not count memory that TCMalloc is not aware of
(e.g., memory-mapped files, text segments, etc.).
* **Bytes released to OS:** TCMalloc can release memory back to the OS (see
[tcmalloc tuning](tuning.md)), and this is the upper bound on the amount of
released memory. However, it is up to the OS as to whether the act of
releasing the memory actually reduces the RSS of the application. The code
uses `MADV_DONTNEED`/`MADV_REMOVE` which tells the OS that the memory is no
longer needed.
* **Virtual address space used:** This is the amount of virtual address space
that TCMalloc believes it is using. This should match the later section on
requested memory. There are other ways that an application can increase its
virtual address space, and this statistic does not capture them.
### More Detail On Metadata
The next section gives some insight into the amount of metadata that TCMalloc is
using. This is really debug information, and not very actionable.
```
MALLOC: 236176 Spans in use
MALLOC: 238709 ( 10.9 MiB) Spans created
MALLOC: 8 Thread heaps in use
MALLOC: 46 ( 0.0 MiB) Thread heaps created
MALLOC: 13517 Stack traces in use
MALLOC: 13742 ( 7.2 MiB) Stack traces created
MALLOC: 0 Table buckets in use
MALLOC: 2808 ( 0.0 MiB) Table buckets created
MALLOC: 11665416 ( 11.1 MiB) Pagemap bytes used
MALLOC: 4067336 ( 3.9 MiB) Pagemap root resident bytes
```
* **Spans:** structures that hold multiple [pages](#page-sizes) of allocatable
objects.
* **Thread heaps:** These are the per-thread structures used in per-thread
mode.
* **Stack traces:** These hold metadata for each sampled object.
* **Table buckets:** These hold data for stack traces for sampled events.
* **Pagemap:** This data structure supports the mapping of object addresses to
information about the objects held on the page. The pagemap root is a
potentially large array, and it is useful to know how much of it is actually
memory resident.
### Realized Fragmentation
```
MALLOC: 12238113346 (11671.2 MiB) Actual memory used at peak
MALLOC: 11626207678 (11087.6 MiB) Estimated in-use at peak
MALLOC: 5.2632 Realized fragmentation (%)
```
Memory overhead at peak demand is more important than off-peak, since we need to
provision a process with sufficient memory to run during its peak requirements
without OOM'ing. After a peak in demand, memory may be deallocated and held in
caches in anticipation of future reuse. Overhead as a fraction of the remaining
live allocations rises, but no additional memory is required.
This metric is called "realized fragmentation" and described in ["Adaptive
Hugepage Subrelease for Non-moving Memory Allocators in Warehouse-Scale
Computers"](https://research.google/pubs/pub50436/) (ISMM 2021). The realized
fragmentation metric computed here is a snapshot over the life of the entire
process.
These realized fragmentation stats in the summary table indicate a snapshot of
conditions when TCMalloc's physical memory usage was at its peak. As of April 2022,
the in-use at peak number is estimated from TCMalloc's periodic allocation
sampling.
### Page Sizes
There are three relevant "page" sizes for systems and TCMalloc. It's important
to be able to disambiguate them.
* **System default page size:** this is not reported by TCMalloc. This is 4KiB
on x86. It's not referred to in TCMalloc, and it's not important, but it's
important to know that it is different from the sizes of pages used in
TCMalloc.
* **TCMalloc page size:** This is the basic unit of memory management for
TCMalloc. Objects on the same page are the same number of bytes in size.
Internally TCMalloc manages memory in chunks of this size. TCMalloc supports
4 sizes: 4KiB (small but slow), 8KiB (the default), 32 KiB (large), 256 KiB
(256 KiB pages). There are trade-offs around the page sizes:
* Smaller page sizes are more memory efficient because we have less
fragmentation (ie left over space) when trying to provide the requested
amount of memory using 4KiB chunks. It's also more likely that all the
objects on a 4KiB page will be freed allowing the page to be returned
and used for a different size of data.
* Larger pages result in fewer fetches from the page heap to provide a
given amount of memory. They also keep allocated objects of the same
size in closer proximity.
* **TCMalloc hugepage size:** This is the size of a hugepage on the system,
for x86 this is 2MiB. This size is used as a unit of management by
Temeraire, but not used by the pre-Temeraire pageheap.
```
MALLOC: 32768 Tcmalloc page size
MALLOC: 2097152 Tcmalloc hugepage size
```
### Experiments
There is an experiment framework embedded into TCMalloc.
The enabled experiments are reported as part of the statistics.
```
MALLOC EXPERIMENTS: TCMALLOC_TEMERAIRE=0 TCMALLOC_TEMERAIRE_WITH_SUBRELEASE_V3=0
```
### Actual Memory Footprint
The output also reports the memory size information recorded by the OS:
* Bytes resident is the amount of physical memory in use by the application
(RSS). This includes things like program text which is excluded from the
information that TCMalloc presents.
* Bytes mapped is the size of the virtual address space in use by the
application (VSS). This can be substantially larger than the virtual memory
reported by TCMalloc as applications can increase VSS in other ways. It's
also not that useful as a metric since the VSS is a limit to the RSS, but
not directly related to the amount of physical memory that the application
uses.
```
Total process stats (inclusive of non-malloc sources):
TOTAL: 86880677888 (82855.9 MiB) Bytes resident (physical memory used)
TOTAL: 89124790272 (84996.0 MiB) Bytes mapped (virtual memory used)
```
### Per Size-Class Information
Requests for memory are rounded to convenient sizes. For example a request for
15 bytes could be rounded to 16 bytes. These sizes are referred to as
size-classes. There are various caches in TCMalloc where memory gets held, and the per
size-class section reports how much memory is being used by cached objects of
each size. The columns reported for each size-class are:
* The size of each object in that size-class.
* The number of objects of that size currently held in the per-cpu,
per-thread, transfer, and central caches.
* The total size of those objects in MiB - ie size of each object multiplied
by the number of objects.
* The cumulative size of that size-class plus all smaller size-classes.
* The number of live pages dedicated to this size-class.
* The number of returned and requested spans of this size-class.
```
Total size of freelists for per-thread and per-CPU caches,
transfer cache, and central cache, as well as number of
live pages, returned/requested spans by size-class
------------------------------------------------
class 1 [ 8 bytes ] : 45645 objs; 0.3 MiB; 0.3 cum MiB; 73 live pages; spans: 19 ret / 92 req = 0.2065;
class 2 [ 16 bytes ] : 39942 objs; 0.6 MiB; 1.0 cum MiB; 120 live pages; spans: 3 ret / 123 req = 0.0244;
class 3 [ 24 bytes ] : 84130 objs; 1.9 MiB; 2.9 cum MiB; 807 live pages; spans: 1330 ret / 2137 req = 0.6224;
class 4 [ 32 bytes ] : 107271 objs; 3.3 MiB; 6.2 cum MiB; 1048 live pages; spans: 420 ret / 1468 req = 0.2861;
class 5 [ 40 bytes ] : 82230 objs; 3.1 MiB; 9.3 cum MiB; 790 live pages; spans: 962 ret / 1752 req = 0.5491;
...
```
### Central Cache Free List Span Utilization
Central cache free list manages memory in spans, where each span is a collection
of one or more TCMalloc pages. We track a histogram of span utilization, where
each column refers to the number of spans with allocated objects less than N.
```
------------------------------------------------
Central cache freelist: Span utilization histogram
Non-cumulative number of spans with allocated objects < N
------------------------------------------------
class 1 [ 8 bytes ] : 0 < 1, 0 < 2, 0 < 4, 0 < 8, 0 < 16, 1 < 32, 0 < 64, 1 < 128, 1 < 256, 1 < 512, 0 < 1024, 0 < 2048, 4 < 4096, 16 < 8192, 0 < 16384, 0 < 32768, 0 < 65536
class 2 [ 16 bytes ] : 0 < 1, 0 < 2, 0 < 4, 0 < 8, 0 < 16, 0 < 32, 0 < 64, 0 < 128, 0 < 256, 0 < 512, 1 < 1024, 0 < 2048, 47 < 4096, 0 < 8192, 0 < 16384, 0 < 32768, 0 < 65536
class 3 [ 24 bytes ] : 0 < 1, 0 < 2, 0 < 4, 0 < 8, 0 < 16, 0 < 32, 0 < 64, 2 < 128, 1 < 256, 3 < 512, 5 < 1024, 127 < 2048, 0 < 4096, 0 < 8192, 0 < 16384, 0 < 32768, 0 < 65536
class 4 [ 32 bytes ] : 0 < 1, 0 < 2, 0 < 4, 0 < 8, 0 < 16, 0 < 32, 0 < 64, 0 < 128, 0 < 256, 1 < 512, 0 < 1024, 129 < 2048, 0 < 4096, 0 < 8192, 0 < 16384, 0 < 32768, 0 < 65536
class 5 [ 40 bytes ] : 0 < 1, 1 < 2, 1 < 4, 0 < 8, 0 < 16, 0 < 32, 1 < 64, 1 < 128, 4 < 256, 5 < 512, 80 < 1024, 0 < 2048, 0 < 4096, 0 < 8192, 0 < 16384, 0 < 32768, 0 < 65536
...
```
### Transfer Cache Information
Transfer cache is used by TCMalloc, before going to central free list. For each
size-class, we track and report the following statistics:
* The size of each object in that size-class.
* The number of objects of that size currently held in the transfer cache.
* The total size of those objects in MiB - i.e. size of each object multiplied
by the number of objects in the freelist.
* The cumulative size of that size-class plus all smaller size-classes.
* The current capacity of the freelist.
* The maximum capacity to which the freelist is allowed to grow.
* The number of hits observed during inserts to the transfer cache.
* The total number of batched and non-batched misses observed during insert
operations.
* The number of partial (i.e. non-batch-sized) misses observed during insert
operations.
* The number of hits observed during removes from the transfer cache.
* The total number of batched and non-batched misses observed during remove
operations.
* The number of partial (i.e. non-batch-sized) misses observed during remove
operations.
```
------------------------------------------------
Used bytes, current capacity, and maximum allowed capacity
of the transfer cache freelists.
It also reports insert/remove hits/misses by size class.
------------------------------------------------
class 1 [ 8 bytes ] : 1472 objs; 0.0 MiB; 0.0 cum MiB; 2048 capacity; 2048 max_capacity; 935 insert hits; 8543 insert misses ( 4507 partial); 889 remove hits; 6612 remove misses ( 86 partial);
class 2 [ 16 bytes ] : 608 objs; 0.0 MiB; 0.0 cum MiB; 2048 capacity; 2048 max_capacity; 575 insert hits; 3739 insert misses ( 3602 partial); 556 remove hits; 3368 remove misses ( 70 partial);
class 3 [ 24 bytes ] : 864 objs; 0.0 MiB; 0.0 cum MiB; 2048 capacity; 2048 max_capacity; 1533 insert hits; 15594 insert misses ( 9417 partial); 1506 remove hits; 11939 remove misses ( 74 partial);
class 4 [ 32 bytes ] : 96 objs; 0.0 MiB; 0.0 cum MiB; 2048 capacity; 2048 max_capacity; 1065 insert hits; 21772 insert misses ( 19918 partial); 1061 remove hits; 6403 remove misses ( 119 partial);
class 5 [ 40 bytes ] : 1408 objs; 0.1 MiB; 0.1 cum MiB; 2048 capacity; 2048 max_capacity; 1475 insert hits; 16018 insert misses ( 14943 partial); 1431 remove hits; 3293 remove misses ( 60 partial);
class 6 [ 48 bytes ] : 1664 objs; 0.1 MiB; 0.2 cum MiB; 2048 capacity; 2048 max_capacity; 1213 insert hits; 39140 insert misses ( 37096 partial); 1160 remove hits; 5909 remove misses ( 80 partial);
class 7 [ 56 bytes ] : 1792 objs; 0.1 MiB; 0.3 cum MiB; 2048 capacity; 2048 max_capacity; 466 insert hits; 650 insert misses ( 375 partial); 410 remove hits; 1264 remove misses ( 55 partial);
class 8 [ 64 bytes ] : 1408 objs; 0.1 MiB; 0.4 cum MiB; 2048 capacity; 2048 max_capacity; 2181 insert hits; 8816 insert misses ( 8069 partial); 2137 remove hits; 2024 remove misses ( 74 partial);
class 9 [ 72 bytes ] : 960 objs; 0.1 MiB; 0.4 cum MiB; 1600 capacity; 2048 max_capacity; 104 insert hits; 463 insert misses ( 463 partial); 74 remove hits; 287 remove misses ( 62 partial);
class 10 [ 80 bytes ] : 1056 objs; 0.1 MiB; 0.5 cum MiB; 2048 capacity; 2048 max_capacity; 372 insert hits; 3334 insert misses ( 3287 partial); 339 remove hits; 562 remove misses ( 80 partial);
...
```
As of July 2021, the `TransferCache` misses when inserting or removing a
non-batch size number of objects from the cache. These are reflected in the
"partial" column. The insert and remove miss column is *inclusive* of misses for
both batch size and non-batch size numbers of objects.
### Per-CPU Information
If the per-cpu cache is enabled then we get a report of the memory currently
being cached on each CPU.
The first number reported is the maximum size of the per-cpu cache on each CPU.
This corresponds to the parameter `MallocExtension::GetMaxPerCpuCacheSize()`,
which defaults to 1.5MiB. [See tuning](tuning.md)
The following columns are reported for each CPU:
* The cpu ID
* The total size of the objects held in the CPU's cache in bytes.
* The total size of the objects held in the CPU's cache in MiB.
* The total number of unallocated bytes.
The concept of unallocated bytes needs to be explained because the definition is
not obvious.
The per-cpu cache is an array of pointers to available memory. Each size-class
has a number of entries that it can use in the array. These entries can be used
to hold memory, or be empty.
To control the maximum memory that the per-cpu cache can use we sum up the
number of slots that can be used by a size-class multiplied by the size of
objects in that size-class. This gives us the total memory that could be held in
the cache. This is not what is reported by unallocated memory.
Unallocated memory is the amount of memory left over from the per cpu limit
after we have subtracted the total memory that could be held in the cache.
The in use memory is calculated from the sum of the number of populated entries
in the per-cpu array multiplied by the size of the objects held in those
entries.
To summarise, the per-cpu limit (which is reported before the per-cpu data) is
equal to the number of bytes in use (which is reported in the second column)
plus the number of bytes that could be used (which is not reported) plus the
unallocated "spare" bytes (which is reported as the last column).
```
Bytes in per-CPU caches (per cpu limit: 3145728 bytes)
------------------------------------------------
cpu 0: 2168200 bytes ( 2.1 MiB) with 52536 bytes unallocated active
cpu 1: 1734880 bytes ( 1.7 MiB) with 258944 bytes unallocated active
cpu 2: 1779352 bytes ( 1.7 MiB) with 8384 bytes unallocated active
cpu 3: 1414224 bytes ( 1.3 MiB) with 112432 bytes unallocated active
cpu 4: 1260016 bytes ( 1.2 MiB) with 179800 bytes unallocated
...
```
Some CPU caches may be marked `active`, indicating that the process is currently
runnable on that CPU.
### Size Class Capacity Information in Per-CPU Caches
In per-CPU caches, TCMalloc caches objects of discrete sizes. These are referred
to as size classes. Memory requests for a particular object size are rounded off
to a convenient size class. TCMalloc populates objects in each size class based
on their demand, but also imposes an upper limit on the number of objects that
may be cached per size class. The statistics below measure the capacity of each
size class freelist, where capacity represents the total number of objects
currently cached by the freelist. The columns below report number of objects
cached by TCMalloc per size class:
* Size class.
* The size of each object in that size class.
* Minimum capacity of the size class freelist summarized over all per-CPU
caches.
* Average capacity of the size class freelist summarized over all per-CPU
caches.
* Maximum capacity of the size class freelist summarized over all per-CPU
caches.
* The upper limit imposed by TCMalloc on the number of objects that can be
cached in a per-CPU cache for that size class.
```
------------------------------------------------
Size class capacity statistics in per-cpu caches
------------------------------------------------
class 0 [ 0 bytes ] : 0 (minimum), 0.0 (average), 0 (maximum), 0 maximum allowed capacity
class 1 [ 8 bytes ] : 0 (minimum), 133.1 (average), 636 (maximum), 2048 maximum allowed capacity
class 2 [ 16 bytes ] : 0 (minimum), 51.8 (average), 378 (maximum), 2048 maximum allowed capacity
class 3 [ 24 bytes ] : 0 (minimum), 119.3 (average), 510 (maximum), 2048 maximum allowed capacity
class 4 [ 32 bytes ] : 0 (minimum), 100.0 (average), 542 (maximum), 2048 maximum allowed capacity
class 5 [ 40 bytes ] : 0 (minimum), 80.6 (average), 467 (maximum), 2048 maximum allowed capacity
```
### Number of per-CPU cache underflows, overflows, and reclaims
We also keep track of cache miss counts. Underflows are when the user allocates
and the cache does not have any pointers to return. Overflows are when the user
deallocates and the cache is full. The ratio of overflows to underflows gives a
rough indication of whether the cache is large enough. If the cache had infinite
capacity, then we would expect to have 0 overflows whereas if the cache had 0
capacity, we would expect to see roughly equal numbers of overflows and
underflows. Therefore, if the ratio is close to 1.0, then the cache may not be
large enough. Reclaims are when we empty out a cache for a specific CPU because
it has been idle for a period of time. In this section, we report the total
numbers of each of these metrics across all CPUs as well as the numbers for each
individual CPU.
```
------------------------------------------------
Number of per-CPU cache underflows, overflows, and reclaims
------------------------------------------------
Total : 242 underflows, 12 overflows, overflows / underflows: 0.05, 168 reclaims
cpu 0: 69 underflows, 5 overflows, overflows / underflows: 0.07, 46 reclaims
cpu 1: 58 underflows, 0 overflows, overflows / underflows: 0.00, 42 reclaims
cpu 2: 62 underflows, 7 overflows, overflows / underflows: 0.11, 42 reclaims
cpu 3: 40 underflows, 0 overflows, overflows / underflows: 0.00, 27 reclaims
cpu 4: 13 underflows, 0 overflows, overflows / underflows: 0.00, 11 reclaims
cpu 5: 0 underflows, 0 overflows, overflows / underflows: 0.00, 0 reclaims
```
### Pageheap Information
The pageheap holds pages of memory that are not currently being used either by
the application or by TCMalloc's internal caches. These pages are grouped into
spans - which are ranges of contiguous pages, and these spans can be either
mapped (backed by physical memory) or unmapped (not necessarily backed by
physical memory).
Memory from the pageheap is used either to replenish the per-thread or per-cpu
caches, or to directly satisfy requests that are larger than the sizes supported
by the per-thread or per-cpu caches.
**Note:** TCMalloc cannot tell whether a span of memory is actually backed by
physical memory, but it uses *unmapped* to indicate that it has told the OS that
the span is not used and does not need the associated physical memory. For this
reason the physical memory of an application may be larger than the amount that
TCMalloc reports.
The pageheap section contains the following information:
* The first line reports the number of sizes of spans, the total memory that
these spans cover, and the total amount of that memory that is unmapped.
* The size of the span in number of pages.
* The number of spans of that size.
* The total memory consumed by those spans in MiB.
* The cumulative total memory held in spans of that size and fewer pages.
* The amount of that memory that has been unmapped.
* The cumulative amount of unmapped memory for spans of that size and smaller.
```
PageHeap: 30 sizes; 480.1 MiB free; 318.4 MiB unmapped
------------------------------------------------
1 pages * 341 spans ~ 10.7 MiB; 10.7 MiB cum; unmapped: 1.9 MiB; 1.9 MiB cum
2 pages * 469 spans ~ 29.3 MiB; 40.0 MiB cum; unmapped: 0.0 MiB; 1.9 MiB cum
3 pages * 462 spans ~ 43.3 MiB; 83.3 MiB cum; unmapped: 3.3 MiB; 5.2 MiB cum
4 pages * 119 spans ~ 14.9 MiB; 98.2 MiB cum; unmapped: 0.1 MiB; 5.3 MiB cum
...
```
### Pageheap Cache Age
The next section gives some indication of the age of the various spans in the
pageheap. Live (i.e., backed by physical memory) and unmapped spans are reported
separately.
The columns indicate roughly how long the span has been in the pageheap, ranging
from less than a second to more than 8 hours.
```
------------------------------------------------
PageHeap cache entry age (count of pages in spans of a given size that have been idle for up to the given period of time)
------------------------------------------------
mean <1s 1s 30s 1m 30m 1h 8+h
Live span TOTAL PAGES: 9.1 533 13322 26 1483 0 0 0
Live span, 1 pages: 7.4 0 256 0 24 0 0 0
Live span, 2 pages: 1.6 38 900 0 0 0 0 0
Unmapped span TOTAL PAGES: 153.9 153 2245 1801 5991 0 0 0
Unmapped span, 1 pages: 34.6 0 35 15 11 0 0 0
Unmapped span, 3 pages: 28.4 0 60 42 3 0 0 0
...
```
### Pageheap Allocation Summary
This reports some stats on the number of pages allocated.
* The number of live (i.e., not on page heap) pages that were "small"
allocations. Small allocations are ones that are tracked in the pageheap by
size (e.g., a region of two pages in size). Larger allocations are just kept
in an array that has to be scanned linearly.
* The pages of slack result from situations where allocation is rounded up to
hugepages, and this leaves some spare pages.
* The largest seen allocation is self explanatory.
```
PageHeap: stats on allocation sizes
PageHeap: 344420 pages live small allocation
PageHeap: 12982 pages of slack on large allocations
PageHeap: largest seen allocation 29184 pages
```
### Pageheap Per Number Of Pages In Range
This starts off reporting the activity for small ranges of pages, but at the end
of the list starts aggregating information for groups of page ranges.
* The first column contains the number of pages (or the range of pages if the
bucket is wider than a single page).
* The second and third columns are the number of allocated and freed pages we
have seen of this size.
* The fourth column is the number of live allocations of this size.
* The fifth column is the size of those live allocations in MiB.
* The sixth column is the allocation rate in pages per second since the start
of the application.
* The seventh column is the allocation rate in MiB per second since the start
of the application.
```
PageHeap: per-size information:
PageHeap: 1 page info: 23978897 / 23762891 a/f, 216006 (6750.2 MiB) live, 2.43e+03 allocs/s ( 76.1 MiB/s)
PageHeap: 2 page info: 21442844 / 21436331 a/f, 6513 ( 407.1 MiB) live, 2.18e+03 allocs/s (136.0 MiB/s)
PageHeap: 3 page info: 2333686 / 2329225 a/f, 4461 ( 418.2 MiB) live, 237 allocs/s ( 22.2 MiB/s)
PageHeap: 4 page info: 21509168 / 21508751 a/f, 417 ( 52.1 MiB) live, 2.18e+03 allocs/s (272.9 MiB/s)
PageHeap: 5 page info: 3356076 / 3354188 a/f, 1888 ( 295.0 MiB) live, 341 allocs/s ( 53.2 MiB/s)
PageHeap: 6 page info: 1718534 / 1718486 a/f, 48 ( 9.0 MiB) live, 174 allocs/s ( 32.7 MiB/s)
...
```
### GWP-ASan Status
The GWP-ASan section displays information about allocations guarded by
[GWP-ASan](gwp-asan.md).
* The number of successful and failed GWP-ASan allocations. If there are 0
successful and 0 failed allocations, GWP-ASan is probably disabled on your
binary. If there are a large number of failed allocations, it probably means
your sampling rate is too high, causing the guarded slots to be exhausted.
See
[GWP-ASan sampling rate](gwp-asan.md#what-should-i-set-the-sampling-rate-to).
* The number of "slots" currently allocated and quarantined. An allocated slot
contains an allocation that is still active (i.e., not freed) while a
quarantined slot has either not been used yet or contains an allocation that
was freed.
* The maximum number of slots that have been allocated at the same time. This
number is printed along with the allocated slot limit. If the maximum slots
allocated matches the limit, you may want to reduce your sampling rate to
avoid failed GWP-ASan allocations.
```
------------------------------------------------
GWP-ASan Status
------------------------------------------------
Successful Allocations: 1823
Failed Allocations: 0
Slots Currently Allocated: 33
Slots Currently Quarantined: 95
Maximum Slots Allocated: 51 / 64
```
### Memory Requested From The OS
The stats also report the amount of memory requested from the OS by mmap.
Memory is also requested, but may not actually be backed by physical memory, so
these stats should resemble the VSS of the application, not the RSS.
```
Low-level allocator stats:
MmapSysAllocator: 18083741696 bytes (17246.0 MiB) allocated
```
## Temeraire
### Introduction
Temeraire (or Huge Page Aware Allocator) is a new page heap for TCMalloc that is
hugepage aware. It is designed to better handle memory backed by hugepages -
avoiding breaking them up. Since it is more elaborate code, it reports
additional information.
See the [Temeraire design doc](temeraire.md) for more complete information.
### Summary Statistics
The initial set of statistics from the Huge Page Aware Allocator are similar to
the old page heap, and show a summary of the number of instances of each range
of contiguous pages.
```
------------------------------------------------
HugePageAware: 75 sizes; 938.8 MiB free; 1154.0 MiB unmapped
------------------------------------------------
1 pages * 86655 spans ~ 677.0 MiB; 677.0 MiB cum; unmapped: 0.0 MiB; 0.0 MiB cum
2 pages * 3632 spans ~ 56.8 MiB; 733.7 MiB cum; unmapped: 0.0 MiB; 0.0 MiB cum
3 pages * 288 spans ~ 6.8 MiB; 740.5 MiB cum; unmapped: 0.0 MiB; 0.0 MiB cum
4 pages * 250 spans ~ 7.8 MiB; 748.3 MiB cum; unmapped: 0.0 MiB; 0.0 MiB cum
...
```
The first line indicates the number of different sizes of ranges, the total MiB
available, and the total MiB of unmapped ranges. The next lines are per number
of contiguous pages:
* The number of contiguous pages
* The number of spans of that number of pages
* The total number of MiB of that span size that are mapped.
* The cumulative total of the mapped pages.
* The total number of MiB of that span size that are unmapped.
* The cumulative total of the unmapped pages.
### Per Component Information
The Huge Page Aware Allocator has multiple places where pages of memory are
held. More details of its workings can be found in
[the Temeraire design doc](temeraire.md). There are four caches where pages of
memory can be located:
* The filler, used for allocating ranges of a few TCMalloc pages in size.
* The region cache, used for allocating ranges of multiple pages.
* The huge cache which contains huge pages that are backed with memory.
* The huge page allocator which contains huge pages that are not backed by
memory.
We get some summary information for the various caches, before we report
detailed information for each of the caches.
```
Huge page aware allocator components:
------------------------------------------------
HugePageAware: breakdown of free / unmapped / used space:
HugePageAware: filler 38825.2 MiB used, 938.8 MiB free, 0.0 MiB unmapped
HugePageAware: region 0.0 MiB used, 0.0 MiB free, 0.0 MiB unmapped
HugePageAware: cache 908.0 MiB used, 0.0 MiB free, 0.0 MiB unmapped
HugePageAware: alloc 0.0 MiB used, 0.0 MiB free, 1154.0 MiB unmapped
```
The summary information tells us:
* The first column shows how much memory has been allocated from each of the
caches
* The second column indicates how much backed memory is available in each
cache.
* The third column indicates how much unmapped memory is available in each
cache.
### Filler Cache
The filler cache contains TCMalloc sized pages from within a single hugepage. So
if we want a single TCMalloc page we will look for it in the filler.
There are three sections of stats around the filler cache. The first section
gives an indication of the number and state of the hugepages in the filler
cache.
```
HugePageFiller: densely pack small requests into hugepages
HugePageFiller: 19882 total, 8083 full, 11799 partial, 0 released (0 partially), 0 quarantined
HugePageFiller: 120168 pages free in 19882 hugepages, 0.0236 free
HugePageFiller: among non-fulls, 0.0398 free
HugePageFiller: 499 used pages in subreleased hugepages (0 of them in partially released)
HugePageFiller: 0 hugepages partially released, 0.0000 released
HugePageFiller: 1.0000 of used pages hugepageable
HugePageFiller: Since startup, 26159 pages subreleased, 345 hugepages broken
```
The summary stats are as follows:
* "total" refers to the total number of hugepages in the filler cache.
* "full" is the number of those hugepages that have multiple in-use
allocations.
* "partial" is the remaining number of hugepages that have a single in-use
allocation.
* "released" is the number of hugepages that are released - i.e., partially
unmapped. If partially released hugepages are enabled, the number in
parentheses shows the number of hugepages in this category.
* "quarantined" is a feature that has been disabled, so the result is currently
zero.
The second section gives an indication of the number of pages in various states
in the filler cache:
* "pages free" refers to the number of free TCMalloc pages in the filler, as
well as the ratio to the total number of hugepages.
* "among non-fulls" states this ratio to the number of non-full hugepages.
* "used pages" refers to the number of occupied pages in the different types
of partially unmapped hugepages.
```
HugePageFiller: fullness histograms
HugePageFiller: # of regular hps with a<= # of free pages <b
HugePageFiller: < 0<= 8083 < 1<= 6 < 2<= 1 < 3<= 1 < 4<= 0 < 16<= 103
HugePageFiller: < 32<= 1 < 48<= 0 < 64<= 3 < 80<= 1 < 96<= 0 <112<= 0
HugePageFiller: <128<= 28 <144<= 0 <160<= 0 <176<= 1 <192<= 0 <208<= 0
HugePageFiller: <224<= 2 <240<= 0 <252<= 0 <253<= 0 <254<= 0 <255<= 0
HugePageFiller: # of donated hps with a<= # of free pages <b
HugePageFiller: < 0<= 0 < 1<= 0 < 2<= 0 < 3<= 0 < 4<= 0 < 16<= 0
HugePageFiller: < 32<= 0 < 48<= 0 < 64<= 0 < 80<= 0 < 96<= 0 <112<= 0
HugePageFiller: <128<= 1 <144<= 0 <160<= 0 <176<= 0 <192<= 0 <208<= 0
HugePageFiller: <224<= 0 <240<= 0 <252<= 0 <253<= 0 <254<= 0 <255<= 0
HugePageFiller: # of released hps with a<= # of free pages <b
...
HugePageFiller: # of regular hps with a<= longest free range <b
HugePageFiller: < 0<= 8083 < 1<= 6 < 2<= 1 < 3<= 1 < 4<= 0 < 16<= 103
HugePageFiller: < 32<= 1 < 48<= 0 < 64<= 4 < 80<= 0 < 96<= 0 <112<= 0
HugePageFiller: <128<= 29 <144<= 0 <160<= 0 <176<= 0 <192<= 0 <208<= 1
HugePageFiller: <224<= 1 <240<= 0 <252<= 0 <253<= 0 <254<= 0 <255<= 0
HugePageFiller: # of released hps with a<= longest free range <b
...
HugePageFiller: # of regular hps with a<= # of allocations <b
HugePageFiller: < 1<= 8 < 2<= 7 < 3<= 10 < 4<= 10 < 5<= 12 < 17<= 15
HugePageFiller: < 33<= 12 < 49<= 2 < 65<= 0 < 81<= 2 < 97<= 17 <113<= 166
HugePageFiller: <129<= 42 <145<= 6 <161<= 20 <177<= 48 <193<= 398 <209<= 1968
HugePageFiller: <225<= 5062 <241<= 425 <253<= 0 <254<= 0 <255<= 0 <256<= 0
HugePageFiller: # of released hps with a<= # of allocations <b
...
```
Some sections have been elided here for space.
There are three sections, split by three tracker types. They use the same
reporting format and indicate:
* The available TCMalloc pages in the hugepages of the given type.
* The longest contiguous range of available TCMalloc pages in the hugepages of
the given type.
* The number of current allocations from each of the hugepages of the given
type. The ranges are offset by one here, because a hugepage can't have zero
allocations.
The reporting format is the number of hugepages that are between a particular
range for the characteristic of interest. For example:
* There are 3 regular hugepages with TCMalloc free pages >= 64 and < 80.
* There are 6 regular hugepages with a longest contiguous length of exactly 1
page.
* There are 2 regular hugepages with between 81 and 96 allocations.
The three tracker types are "regular," "donated," and "released." "Regular" is
by far the most common, and indicates regular memory in the filler.
"Donated" is hugepages that have been donated to the filler from the tail of
large (multi-hugepage) allocations, so that the leftover space can be packed
with smaller allocations. But we prefer to use up all useable regular hugepages
before touching the donated ones, which devolve to "regular" type once they are
used. Because of this last property, donated hugepages always have only one
allocation and their longest range equals their free space, so those histograms
aren't shown.
"Released" is partially released hugepages. Normally the entirety of a hugepage
is backed by real RAM, but in partially released hugepages most of it has been
returned to the OS. Because this defeats the primary goal of the hugepage-aware
allocator, this is done rarely, and we only reuse partially-released hugepages
for new allocations as a last resort.
The final section shows a summary of the filler's state over the past 5 minute
time period:
```
HugePageFiller: time series over 5 min interval
HugePageFiller: realized fragmentation: 0.0 MiB
HugePageFiller: minimum free pages: 0 (0 backed)
HugePageFiller: at peak demand: 1774 pages (and 261 free, 13 unmapped)
HugePageFiller: at peak demand: 8 hps (5 regular, 1 donated, 0 partial, 2 released)
HugePageFiller: at peak hps: 1774 pages (and 261 free, 13 unmapped)
HugePageFiller: at peak hps: 8 hps (5 regular, 1 donated, 0 partial, 2 released)
```
The first line shows the minimum number of free pages over the time interval,
which is an indication of how much memory could have been "usefully" reclaimed
(i.e., free for long enough that the OS would likely be able to use the memory
for another process). The line shows both the total number of free pages in the
filler (whether or not released to the OS) as well as only those that were
backed by physical memory for the full 5-min interval. The realized
fragmentation figure reported alongside it is computed over the same 5-minute
window.
The next two sections show the state of the filler at peak demand (i.e., when
the maximum number of pages was in use) and at peak hps (i.e., when the maximum
number of hugepages was in use). For each, we show the number of free (backed)
pages as well as unmapped pages, and the number of the four different types of
hugepages active at that time. If there are multiple peaks, we return the state
at the latest one of them.
If applicable, an additional section tracks the behavior that skips subreleasing
hugepages if behind the recent demand requirement, which is either the peak
within `--tcmalloc_skip_subrelease_interval`, or the sum of short-term
fluctuation peak within `--tcmalloc_skip_subrelease_short_interval` and
long-term trend within `--tcmalloc_skip_subrelease_long_interval`.
**Note:** Conducting skip-subrelease using both short-term and long-term
intervals is an experimental feature, and should not be enabled without
understanding its performance tradeoffs.
```
HugePageFiller: Since the start of the execution, 0 subreleases (0 pages) were skipped due to either recent (0s) peaks, or the sum of short-term (0s) fluctuations and long-term (0s) trends..
HugePageFiller: 100.0000% of decisions confirmed correct, 0 pending (100.0000% of pages, 0 pending), as per anticipated 300s realized fragmentation.
```
This shows how many times a page that was meant to be subreleased was not (note
that this can refer to the same page multiple times if subrelease of this page
would have been triggered multiple times). The percentage shows what fraction of
times this decision would have been correct (i.e., if we decided not to
subrelease a page because of the calculated demand requirement, did memory
consumption increase again within the *next* five minutes?). "Pending" refers to
subrelease decisions that were less than five minutes in the past and we
therefore do not know yet whether or not they were correct. The correctness
evaluation uses a five-minute interval because that is the interval used
for realized fragmentation.
The skip-subrelease feature prioritizes using the recent peak if
`--tcmalloc_skip_subrelease_interval` is configured, otherwise it uses the
combination of the recent short-term fluctuation peak and long-term trend. The
feature is disabled if all three intervals are zero.
### Region Cache
The region cache holds a chunk of memory from which spans of multiple TCMalloc
pages can be allocated. The region cache may not be populated, and it can
contain multiple regions.
```
HugeRegionSet: 1 MiB+ allocations best-fit into 1024 MiB slabs
HugeRegionSet: 0 total regions
HugeRegionSet: 0 hugepages backed out of 0 total
HugeRegionSet: 0 pages free in backed region, 0.0000 free
```
The lines of output indicate:
* The size of each region in MiB - this is currently 1GiB.
* The total number of regions in the region cache, in the example above there
are no regions in the cache.
* The number of backed hugepages in the cache out of the total number of
hugepages in the region cache.
* The number of free TCMalloc pages in the regions, and as a ratio of the
number of backed pages.
### Huge Cache
The huge cache contains backed hugepages. It grows and shrinks in size depending
on runtime conditions, attempting to hold onto backed memory that is ready to be
provided to the application.
```
HugeCache: contains unused, backed hugepage(s)
HugeCache: 0 / 10 hugepages cached / cache limit (0.053 hit rate, 0.436 overflow rate)
HugeCache: 88880 MiB fast unbacked, 6814 MiB periodic
HugeCache: 1234 MiB*s cached since startup
HugeCache: recent usage range: 40672 min - 40672 curr - 40672 max MiB
HugeCache: recent offpeak range: 0 min - 0 curr - 0 max MiB
HugeCache: recent cache range: 0 min - 0 curr - 0 max MiB
```
The output shows the following information:
* The number of hugepages out of the maximum number of hugepages we will hold
in the huge cache. The hit rate is how often we get pages from the huge
cache vs getting them from the huge allocator. The overflow rate is the
number of times we added something to the huge cache causing it to exceed
its size limit.
* The fast unbacked is the cumulative amount of memory unbacked due to size
limitations, the periodic count is the cumulative amount of memory unbacked
by periodic calls to release unused memory.
* The amount of cumulative memory stored in HugeCache since the startup of the
process. In other words, the area under the cached-memory-vs-time curve.
* The usage range is the range minimum, current, maximum in MiB of memory
obtained from the huge cache.
* The off-peak range is the minimum, current, maximum cache size in MiB
compared to the peak cache size.
* The recent range is the minimum, current, maximum size of memory in MiB in
the huge cache.
### Huge Allocator
The huge allocator holds unmapped memory ranges. We allocate from here if we are
unable to allocate from any of the caches.
```
HugeAllocator: contiguous, unbacked hugepage(s)
HugeAddressMap: treap 5 / 10 nodes used / created
HugeAddressMap: 256 contiguous hugepages available
HugeAllocator: 20913 requested - 20336 in use = 577 hugepages free
```
The information reported here is:
* The number of nodes used and created to handle regions of memory.
* The size of the longest contiguous region of available hugepages.
* The number of hugepages requested from the system, the number of hugepages
in use, and the number of hugepages available in the cache.
### Pageheap Summary Information
The new pageheap reports some summary information:
```
HugePageAware: stats on allocation sizes
HugePageAware: 4969003 pages live small allocation
HugePageAware: 659 pages of slack on large allocations
HugePageAware: largest seen allocation 45839 pages
```
These are:
* The number of live "small" TCMalloc pages allocated (those less than 2MiB in
size). [Note: the 2MiB size distinction is separate from the size of
hugepages]
* The number of TCMalloc pages which are left over from "large" allocations.
These allocations are larger than 2MiB in size, and are rounded to a
hugepage - the slack being the amount left over after rounding.
* The largest seen allocation request in TCMalloc pages.
### Per Size Range Info:
The per size range info is the same format as the old pageheap:
* The first column contains the number of pages (or the range of pages if the
bucket is wider than a single page).
* The second and third columns are the number of allocated and freed pages we
have seen of this size.
* The fourth column is the number of live allocations of this size.
* The fifth column is the size of those live allocations in MiB.
* The sixth column is the allocation rate in pages per second since the start
of the application.
* The seventh column is the allocation rate in MiB per second since the start
of the application.
```
HugePageAware: per-size information:
HugePageAware: 1 page info: 5817510 / 3863506 a/f, 1954004 (15265.7 MiB) live, 16 allocs/s ( 0.1 MiB/s)
HugePageAware: 2 page info: 1828473 / 1254096 a/f, 574377 ( 8974.6 MiB) live, 5.03 allocs/s ( 0.1 MiB/s)
HugePageAware: 3 page info: 1464568 / 1227253 a/f, 237315 ( 5562.1 MiB) live, 4.03 allocs/s ( 0.1 MiB/s)
...
```
### Pageheap Age Information:
The new pageheap allocator also reports information on the age of the various
page ranges. In this example you can see that there was a large number of
unmapped pages in the last minute.
```
------------------------------------------------
HugePageAware cache entry age (count of pages in spans of a given size that have been idle for up to the given period of time)
------------------------------------------------
mean <1s 1s 30s 1m 30m 1h 8+h
Live span TOTAL PAGES: 29317.6 145 549 1775 13059 13561 58622 32457
Live span, 1 pages: 35933.7 0 55 685 6354 8111 43853 27597
...
Unmapped span TOTAL PAGES: 51.3 0 0 131072 16640 0 0 0
Unmapped span, >=64 pages: 51.3 0 0 131072 16640 0 0 0
...
```

View File

@ -0,0 +1,267 @@
# Temeraire: Hugepage-Aware Allocator
Andrew Hunter, [Chris Kennelly](mailto:ckennelly@google.com)
*Notes on the name*[^cutie]*: the French word for "reckless" or "rash" :), and
also the name of several large and powerful English warships. So: giant and
powerful, but maybe a little dangerous. :)*
This is a description of the design of the Hugepage-Aware Allocator. We have
also published ["Beyond malloc efficiency to fleet efficiency: a hugepage-aware
memory allocator" at OSDI 2021](https://research.google/pubs/pub50370/), which
provides further details on the design, implementation, and rollout of
Temeraire.
## GOALS
What do we want out of this redesign?
* Dramatic reduction in pageheap size. The pageheap in TCMalloc holds
substantial amounts of memory *after* its attempts to `MADV_DONTNEED` memory
back to the OS, due to internal fragmentation. We can recover a useful
fraction of this. In optimal cases, we see savings of over 90%. We do not
expect to achieve this generally, but a variety of synthetic loads suggest
50% of pageheap is a reasonable target savings.
* Dramatic increase in hugepage usage. The `madvise()` in
`ReleaseMemoryToSystem` is made without any thought to transparent
hugepages, and in practice prevents most fleet RAM from remaining as intact
hugepages. Services have seen substantial performance gains from
**disabling release** (and going to various other lengths to maximize hugepage
usage).
* *reasonable* allocation speed. This is really stating a non-goal: speed
parity with `PageHeap::New`. PageHeap is a relatively light consumer of
cycles. We are willing to accept a speed hit in actual page allocation in
exchange for better hugepage usage and space overhead. This is not free but
we think is well justified. Our goal is more to avoid catastrophic
regressions in speed. We intentionally accept two particular time hits:
* much more aggressive releasing (of entire hugepages), leading to
increased costs for *backing* memory.
* much more detailed (and expensive) choices of where to fulfill a
particular request.
## DESIGN
The algorithm -- as usual here, really, the data structures, which neatly
determine our algorithm -- are nicely divided into components. Essentially, the
path of an allocation goes like this:
1. If it is sufficiently small and we have the space we take an existing,
backed, partially empty hugepage and fit our allocation within it.
1. If it is too large to fit in a single hugepage, but too small to simply
round up to an integral number of hugepages, we best-fit it into one of
several larger slabs (whose allocations can cross hugepage boundaries). We
will back hugepages as needed for the allocation.
1. Sufficiently large allocations are rounded up to the nearest hugepage; the
extra space may be used for smaller allocations.
Deallocation simply determines which of 1), 2), or 3) happened, and marks the
corresponding object we allocated from as free.
We will sketch the purpose and approach of each important part. Note that we
have fairly detailed unit tests for each of these; one consequence on the
implementations is that most components are templated on the
`tcmalloc::SystemRelease` functions[^templated] as we make a strong attempt to
be zero initializable where possible (sadly not everywhere).
### `RangeTracker`
`RangeTracker` and `Bitmap`, its underlying implementation, are helper classes
used throughout the components below. They are both quite simple: `Bitmap` is a
fixed-size (templated) bitmap with fast operations to set and clear bits and
ranges of bits, with extensive support for searching and iterating. (Search and
iteration support is why `std::bitset` is not usable here.)
`RangeTracker` is essentially a `Bitmap` augmented with statistics on usage, in
particular the longest range of contiguous free (false) bits. It provides
methods to do best-fit allocation from free ranges (keeping the statistics
correct).
Both of these need to be quite fast as they're on nearly every
allocation/deallocation path in `HugePageAwareAllocator` (in multiple ways)!
They are reasonably optimized but probably still have more headroom.
### HugeAllocator/HugeCache (the backing...)
This is a set of classes that fulfills requests for backed (or unbacked) aligned
hugepage ranges. We use this for sufficiently large (or nicely sized) requests,
and to provide memory for the other components to break up into smaller chunks.
#### `HugeAllocator`
`HugeAllocator` is (nearly) trivial: it requests arbitrarily large
hugepage-sized chunks from `SysAllocator`, keeps them unbacked, and tracks the
available (unbacked) regions. Note that we do not need to be perfectly space
efficient here: we only pay virtual memory and metadata, since *none* of the
contents are backed. (We do make our best efforts to be relatively frugal,
however, since there's no need to inflate VSS by large factors.) Nor do we have
to be particularly fast; this is well off any hot path, and we're going to incur
non-trivial backing costs as soon as we're done assigning a range.
The one tricky bit here is that we have to write some fiddly data structures by
hand. We would have liked to implement this by grabbing large (gigabyte+) ranges
from SysAllocator and using bitmaps or the like within them; however, too many
tests have brittle reliance on details of `SysAllocator` that break if TCMalloc
consistently requests (any considerable amount) more than the minimum needed to
back current usage. So instead we need to track relatively small ranges. We've
implemented a balanced tree that merges adjacent ranges; it is, as we said,
fiddly, but reasonably efficient and not stunningly complicated.
#### `HugeCache`
This is a very simple wrapper on top of HugeAllocator. Its only purpose is to
store some number of backed *single* hugepage ranges as a hot cache (in case we
rapidly allocate and deallocate a 2 MiB chunk).
It is not clear whether the cache is necessary, but we have it and it's not
costing us much in complexity, and will help significantly in some potential
antagonistic scenarios, so we favor keeping it.
It currently attempts to estimate the optimal cache size based on past behavior.
This may not really be needed, but it's a very minor feature to keep *or* drop.
### `HugePageFiller` (the core…)
`HugePageFiller` takes small requests (less than a hugepage) and attempts to
pack them efficiently into hugepages. The vast majority of binaries use almost
entirely small allocations[^conditional], so this is the dominant consumer of
space and the most important component.
Our goal here is to make our live allocations fit within the smallest set of
hugepages possible, so that we can afford to keep all used hugepages fully
backed (and aggressively free empty ones).
The key challenge is avoiding fragmentation of free space within a hugepage:
requests for 1 page are (usually) the most common, but 4, 8, or even 50+ page
requests aren't unheard of. Many 1-page free regions won't be useful here, and
we'll have to request enormous numbers of new hugepages for anything large.
Our solution is to build a heap-ordered data structure on *fragmentation*, not
total amount free, in each hugepage. We use the **longest free range** (the
biggest allocation a hugepage can fulfill!) as a measurement of fragmentation.
In other words: if a hugepage has a free range of length 8, we *never* allocate
from it for a smaller request (unless all hugepages available have equally long
ranges). This carefully husbands long ranges for the requests that need them,
and allows them to grow (as neighboring allocations are freed).
Inside each equal-longest-free-range group, we order our heap by the **number of
allocations** (chunked logarithmically). This helps favor allocating from fuller
hugepages (of equal fragmented status). Number of allocations handily
outperforms the total number of allocated pages here; our hypothesis is that
since allocations of any size are equally likely[^radioactive] to become free at
any given time, and we need all allocations on a hugepage to become free to make
the hugepage empty, we're better off hoping for 1 10-page allocation to become
free (with some probability P) than 5 1-page allocations (with probability P^5).
The `HugePageFiller` contains support for releasing parts of mostly-empty
hugepages as a last resort.
The actual implementation uses a fixed set of lists and a bitmap for
acceleration.
### `HugeRegion` (big but not enormous...)
`HugeAllocator` covers very large requests and `HugePageFiller` tiny ones; what
about the middle? In particular, requests that cannot fit into a hugepage, but
should not be rounded to multiples? (For instance, 2.1 MiB.) These are woefully
common.
In any case, we certainly have to do something with "2.1 MiB"-type allocations,
and rounding them up to 4 MiB will produce unacceptable slack (see below for what we
can do with the filler here; it is wildly insufficient in current binaries which
have the majority of their allocation in these large chunks.)
The solution is a much larger "region" that best-fits these chunks into a large
range of hugepages (i.e. allows them to cross a hugepage boundary). We keep a
set of these regions, and allocate from the most fragmented one (much as with
Filler above)! The main difference is that these regions are kept **un-backed**
by default (whereas the Filler deals almost entirely with backed hugepages). We
back hugepages on demand when they are used by a request hitting the region (and
aggressively _unback_ them when they become empty again).
A few important details:
* These regions are currently 1 GiB, which is very large!
The reason is this: suppose our entire binary allocates a huge number `N` of
requests of size `S` that are too big for the filler, but that don't evenly
divide the region size `M` (say, 2.1 MiB :)) How much space will we waste?
Answer: we will allocate about `R = N / (M / S)` regions, with each region
storing `floor(M/S)` allocations. The tail will be unused. We can unback any
totally untouched hugepages, but suppose that `M/S` allocations just barely
touches the last hugepage in the region: we will then waste ~a full hugepage
per region, and thus waste `R` hugepages. Conclusion: the larger a region we
use, the less waste (in this case). Originally regions were 32 MiB, and this
effect was very noticeable. This also allows us to use very few regions in a
given binary, which means we can be less careful about how we organize the
set of regions.
* We don't make *any* attempt, when allocating from a given region, to find an
already-backed but unused range. Nor do we prefer regions that have such
ranges.
This is basically a question of effort. We'd like to do this, but we don't
see any way to do it without making the data structure more complicated and
cumbersome. So far in tests it hasn't proved a major problem. (Note that
`RangeTracker` has a low-address bias, which will help somewhat here by
compacting allocations towards the low end of any region).
Additional details on the design goals/tradeoffs are in the
[Regions Are Not Optional](regions-are-not-optional.md) design doc.
### `HugePageAwareAllocator` (putting it all together...)
This class houses the above components and routes between them, in addition to
interfacing with the rest of TCMalloc (the above classes don't need or use
Spans, for instance). This is mostly straightforward; two points are worth
discussing.
* How do we choose which sub-allocator for a given request?
We use a size-based policy.
1. Small allocations are handed directly to the filler; we add hugepages to
the filler as needed.
1. For slightly larger allocations (still under a full hugepage), we *try*
the filler, but don't grow it if there's not currently space. Instead,
we look in the regions for free space. If neither the regions nor the
filler has space, we prefer growing the filler (since it comes in
smaller chunks!) The reasoning here is that if our binary only has
allocations of (say) ¾ a hugepage, we don't want the filler to be giant
but ¼ empty; but in a more reasonable binary where we can easily pack
such allocations near smaller ones, we'd prefer to do so over using the
region.
1. Allocations that won't fit in a hugepage are just given to the regions
(or, for truly enormous ones, to `HugeAllocator` directly).
The changeover point between 1) and 2) is just a tuning decision (any choice
would produce a usable binary). Half a hugepage was picked arbitrarily; this
seems to work well.
* How do we handle backing?
Allocations from `HugeAllocator` or `HugeRegion` (some of the time) need to be
backed; so do hugepages that grow the `HugePageFiller`. This isn't free. Page
heap allocation isn't hugely expensive in practice, but it is under a lock and
contention matters. We currently rely on access by the application to back
memory, and assume returned memory has been backed.
For accounting purposes, we do a bit of tracking whether a given allocation is
being fulfilled from previously-unbacked memory.
We do wire that information to the point we drop the pageheap lock; we then back
it without producing lock contention. This made a noticeable performance
difference when explicitly backing memory before returning it to the
application.
## Notes
[^cutie]: Also the name of
[this cutie](https://lh3.googleusercontent.com/VXENOSfqH1L84VMwLVAUA7JIqQh7TYH-IZHLBalvVVuMUeD3w5rOVHPsIp97nYEgmKpQoxsHO-lieGouheNmifA2X6tOPTBleTbQc_WCZIrI_roU2K37iiHg9go6omp2ys0Y7cxYc9c6EWNaCYtKG1dEPyyYLULUarCex4oqwt8KgRl95rd3yKXC6YQeW-TWkDpK786ZaAA3vKJXqT5E-ArPxQccyPH13EAmHrltKatqihC7L4Ym5IfP42u58IJwC5bRnKMczm2WwUfipGDEOvymf63mPNKmGMka50AQV4VGrE7hW_Ateb2roCTGISgZIooBSRwK0PMjqV9hBLP5DmUG4ITSV4FlOI5iWOyMSNZV6Gz5T2FgNez08Wdn98tsEsN4_lPcjdZXyJuHeVRKxAawDwjkbWP3aieXDckHY-bJMt0QfyDhPWzSOpTxTALcZiwoC069K9SrBDVKEKowJ2Zag7OlbpROhqbagM5Wuo_nn6O27yWXpihc8Lptt-Vo_e8kQZ4N2RReby3bxNPdRyv2L8BrDCIWBO-iFk7GcYRd9ox7HSD-7Y0yH1FtMP0FZKD5a2raVmabMQrolhsjc-AfYHgD3xBkNo-uTJ8YnFpqjpTdZz_1=w2170-h1446-no),
the real reason for the choice.
[^templated]: It will be possible, given recent improvements in constexpr usage,
to eliminate this in followups.
[^conditional]: Here we mean "requests to the pageheap as filtered through
sampling, the central cache, etc"
[^radioactive]: Well, no, this is false in our empirical data, but to first
order.

View File

@ -0,0 +1,214 @@
# Performance Tuning TCMalloc
## User-Accessible Controls
There are three user accessible controls that we can use to performance tune
TCMalloc:
* The logical page size for TCMalloc (4KiB, 8KiB, 32KiB, 256KiB)
* The per-thread or per-cpu cache sizes
* The rate at which memory is released to the OS
None of these tuning parameters are clear wins, otherwise they would be the
default. We'll discuss the advantages and disadvantages of changing them.
### The Logical Page Size for TCMalloc:
This is determined at compile time by linking in the appropriate version of
TCMalloc. The page size indicates the unit in which TCMalloc manages memory. The
default is in 8KiB chunks, there are larger options of 32KiB and 256KiB. There
is also the 4KiB page size used by the small-but-slow allocator.
A smaller page size allows TCMalloc to provide memory to an application with
less waste. Waste comes about through two issues:
* Left-over memory when rounding larger requests to the page size (eg a
request for 62 KiB might get rounded to 64 KiB).
* Pages of memory that are stuck because they have a single in use allocation
on the page, and therefore cannot be repurposed to hold a different size of
allocation.
The second of these points is worth elucidating. For small allocations TCMalloc
will fit multiple objects onto a single page.
So if you request 512 bytes, then an entire page will be devoted to 512 byte
objects. If the size of that page is 4KiB we get 8 objects, if the size of that
page is 256KiB we get 512 objects. That page can only be used for 512 byte
objects until all the objects on the page have been freed.
If you have 8 objects on a page, there's a reasonable chance that all 8 will
become free at the same time, and we can repurpose the page for objects of a
different size. If there's 512 objects on that page, then it is very unlikely
that all the objects will become freed at the same time, so that page will
probably never become entirely free and will probably hang around, potentially
containing only a few in-use objects.
The consequence of this is that large pages tend to lead to a larger memory
footprint. There's also the issue that if you want one object of a size, you
need to allocate a whole page.
The advantages of managing objects using larger page sizes are:
* Objects of the same size are better clustered in memory. If you need 512 KiB
of 8 byte objects, then that's two 256 KiB pages, or 128 x 4 KiB pages. If
memory is largely backed by hugepages, then with large pages in the worst
case we can map the entire demand with two large pages, whereas small pages
could take up to 128 entries in the TLB.
* There's a structure called the `PageMap` which enables TCMalloc to look up
information about any allocated memory. If we use large pages the pagemap
needs fewer entries and can be much smaller. This makes it more likely that
it is cache resident. However, sized delete substantially reduced the number
of times that we need to consult the pagemap, so the benefit from larger
pages is reduced.
**Suggestion:** The default of 8KiB page sizes is probably good enough for most
applications. However, if an application has a heap measured in GiB it may be
worth looking at using large page sizes.
**Suggestion:** Small-but-slow is *extremely* slow and should be used only where
it is absolutely vital to minimize memory footprint over performance at all
costs. Small-but-slow works by turning off and shrinking several of TCMalloc's
caches, but this comes at a significant performance penalty.
**Note:** Size-classes are determined on a per-page-size basis. So changing the
page size will implicitly change the size-classes used. Size-classes are
selected to be memory-efficient for the applications using that page size. If an
application changes page size, there may be a performance or memory impact from
the different selection of size-classes.
### Per-thread/per-cpu Cache Sizes
The default is for TCMalloc to run in per-cpu mode as this is faster; however,
there are a few applications which have not yet transitioned. The plan is to move
these across at some point soon.
Increasing the size of the cache is an obvious way to improve performance. The
larger the cache the less frequently memory needs to be fetched from the central
caches. Returning memory from the cache is substantially faster than fetching
from the central cache.
The size of the per-cpu caches is controlled by
`tcmalloc::MallocExtension::SetMaxPerCpuCacheSize`. This controls the limit for
each CPU, so the total amount of memory for the application could be much larger
than this. Memory on CPUs where the application is no longer able to run can be
freed by calling `tcmalloc::MallocExtension::ReleaseCpuMemory`.
The heterogeneous per-cpu cache optimization in TCMalloc dynamically sizes
per-cpu caches so as to balance the miss rate across all the active and
populated caches. It shuffles and reassigns the capacity from lightly used
caches to the heavily used caches, using miss rate as the proxy for their usage.
When enabled, the heavily used per-cpu caches may steal capacity from lightly
used caches and grow beyond the limit set by `tcmalloc_max_per_cpu_cache_size`
flag. This optimization is enabled by default in TCMalloc.
Releasing memory held by unusable CPU caches is handled by
`tcmalloc::MallocExtension::ProcessBackgroundActions`.
In contrast `tcmalloc::MallocExtension::SetMaxTotalThreadCacheBytes` controls
the *total* size of all thread caches in the application.
**Suggestion:** The default cache size is typically sufficient, but cache size
can be increased (or decreased) depending on the amount of time spent in
TCMalloc code, and depending on the overall size of the application (a larger
application can afford to cache more memory without noticeably increasing its
overall size).
### Memory Releasing
`tcmalloc::MallocExtension::ReleaseMemoryToSystem` asks TCMalloc to release
`n` bytes of memory to the OS. This can keep the memory footprint of the
application down to a minimal amount, however it should be considered that this
just reduces the application down from its peak memory footprint over time, and
does not make that peak memory footprint smaller.
Using a background thread running
`tcmalloc::MallocExtension::ProcessBackgroundActions()`, memory will be released
from the page heap at the specified rate.
There are two disadvantages of releasing memory aggressively:
* Memory that is unmapped may be immediately needed, and there is a cost to
faulting unmapped memory back into the application.
* Memory that is unmapped at small granularity will break up hugepages, and
this will cause some performance loss due to increased TLB misses.
**Note:** Release rate is not a panacea for memory usage. Jobs should be
provisioned for peak memory usage to avoid OOM errors. Setting a release rate
may enable an application to exceed the memory limit for short periods of time
without triggering an OOM. A release rate is also a good citizen behavior as it
will enable the system to use spare capacity memory for applications which are
under provisioned. However, it is not a substitute for setting appropriate
memory requirements for the job.
**Note:** Memory is released from the `PageHeap` and stranded per-cpu caches. It
is not possible to release memory from other internal structures, like the
`CentralFreeList`.
**Suggestion:** The default release rate is probably appropriate for most
applications. In situations where it is tempting to set a faster rate it is
worth considering why there are memory spikes, since those spikes are likely to
cause an OOM at some point.
## System-Level Optimizations
* TCMalloc heavily relies on Transparent Huge Pages (THP). As of February
2020, we build and test with
```
/sys/kernel/mm/transparent_hugepage/enabled:
[always] madvise never
/sys/kernel/mm/transparent_hugepage/defrag:
always defer [defer+madvise] madvise never
/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none:
0
```
* TCMalloc makes assumptions about the availability of virtual address space,
so that we can lay out allocations in certain ways. We build and test with
```
/proc/sys/vm/overcommit_memory:
1
```
## Build-Time Optimizations
TCMalloc is built and tested in certain ways. These build-time options can
improve performance:
* Statically-linking TCMalloc reduces function call overhead, by obviating the
need to call procedure linkage stubs in the procedure linkage table (PLT).
* Enabling
[sized deallocation from C++14](http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2013/n3778.html)
reduces deallocation costs when the size can be determined. Sized
deallocation is enabled with the `-fsized-deallocation` flag. This behavior
is enabled by default in GCC, but as of early 2020, is not enabled by
default on Clang even when compiling for C++14/C++17.
Some standard C++ libraries (such as
[libc++](https://reviews.llvm.org/rCXX345214)) will take advantage of sized
deallocation for their allocators as well, improving deallocation
performance in C++ containers.
* Aligning raw storage allocated with `::operator new` to 8 bytes by compiling
with `__STDCPP_DEFAULT_NEW_ALIGNMENT__ <= 8`. This smaller alignment
minimizes wasted memory for many common allocation sizes (24, 40, etc.)
which are otherwise rounded up to a multiple of 16 bytes. On many compilers,
this behavior is controlled by the `-fnew-alignment=...` flag.
When `__STDCPP_DEFAULT_NEW_ALIGNMENT__` is not specified (or is larger than
8 bytes), we use standard 16 byte alignments for `::operator new`. However,
for allocations under 16 bytes, we may return an object with a lower
alignment, as no object with a larger alignment requirement can be allocated
in the space.
* Optimizing failures of `operator new` by directly failing instead of
throwing exceptions. Because TCMalloc does not throw exceptions when
`operator new` fails, this can be used as a performance optimization for
many move constructors.
Within Abseil code, these direct allocation failures are enabled with the
Abseil build-time configuration macro
[`ABSL_ALLOCATOR_NOTHROW`](https://abseil.io/docs/cpp/guides/base#abseil-exception-policy).

View File

@ -0,0 +1,6 @@
---
Language: Cpp
BasedOnStyle: Google
DerivePointerAlignment: false
PointerAlignment: Left
...

View File

@ -0,0 +1,5 @@
# Default owners
* @ckennelly
# Documentation
docs/* @manshreck

View File

@ -0,0 +1,63 @@
# Copyright 2022 The TCMalloc Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Continuous-integration workflow: builds and tests the repository with Bazel,
# once per compiler listed in the matrix below.
name: ci
# Triggered by pushes to master and by pull requests.
on:
push:
branches:
- master
pull_request:
jobs:
Linux:
runs-on: ubuntu-latest
strategy:
# Do not cancel the remaining matrix jobs when one of them fails.
fail-fast: false
matrix:
# One job per C++ compiler; the value is exported as CXX in the Tests step.
compiler:
- g++
- clang++
name: "Build/Test ${{matrix.compiler}}"
steps:
- name: Cancel previous
uses: styfle/cancel-workflow-action@0.8.0
with:
access_token: ${{ github.token }}
# Install both compilers used by the matrix.
- name: Prepare
run: |
sudo apt-get update -qq
sudo apt install -y g++ clang
- uses: actions/checkout@v2
with:
fetch-depth: 0
# The timestamp makes every run's cache key unique (see the key below).
- name: Create Cache Timestamp
id: cache_timestamp
uses: nanzm/get-time-action@v1.1
with:
format: 'YYYY-MM-DD-HH-mm-ss'
# The key is per-compiler and per-timestamp; restore-keys falls back to the
# per-compiler prefix so an earlier cache can be reused.
- name: Mount bazel cache
uses: actions/cache@v2
with:
path: "/home/runner/.cache/bazel"
key: bazelcache_${{matrix.compiler}}_${{ steps.cache_timestamp.outputs.time }}
restore-keys: bazelcache_${{matrix.compiler}}_
# Run every Bazel test target, printing output only for failing tests.
- name: Tests
run: CXX=${{matrix.compiler}} bazel test --test_output=errors //...

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,52 @@
// Copyright 2022 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "tcmalloc/allocation_sample.h"
#include <memory>
#include "absl/time/clock.h"
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc::tcmalloc_internal {
// Creates the stack-trace table for an allocations profile and registers this
// sample with `list`.
AllocationSample::AllocationSample(AllocationSampleList* list, absl::Time start)
    : list_(list),
      mallocs_(std::make_unique<StackTraceTable>(ProfileType::kAllocations)),
      start_(start) {
  list_->Add(this);
}
AllocationSample::~AllocationSample() {
  // Still holding the table: the sample was destroyed before the profile was
  // ended, so unregister it from the list on the owner's behalf.
  if (mallocs_ != nullptr) {
    list_->Remove(this);
  }
}
// Ends the profile: unregisters the sample, records the elapsed duration, and
// hands the collected table over to the returned Profile.
Profile AllocationSample::Stop() && {
  // Unregister from list_ before mutating the table. Until the sample has been
  // removed, a concurrent AllocationSampleList::ReportMalloc call may still
  // access mallocs_.
  if (StackTraceTable* table = mallocs_.get(); table != nullptr) {
    list_->Remove(this);
    table->SetDuration(absl::Now() - start_);
  }
  return ProfileAccessor::MakeProfile(std::move(mallocs_));
}
} // namespace tcmalloc::tcmalloc_internal
GOOGLE_MALLOC_SECTION_END

View File

@ -0,0 +1,87 @@
// Copyright 2022 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef TCMALLOC_ALLOCATION_SAMPLE_H_
#define TCMALLOC_ALLOCATION_SAMPLE_H_
#include "absl/base/dynamic_annotations.h"
#include "absl/base/internal/spinlock.h"
#include "absl/time/time.h"
#include "tcmalloc/internal/logging.h"
#include "tcmalloc/stack_trace_table.h"
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc::tcmalloc_internal {
class AllocationSampleList;
// Token for one in-progress allocation profile. The constructor registers the
// sample with an AllocationSampleList; Stop() or the destructor unregisters it.
class AllocationSample final : public AllocationProfilingTokenBase {
public:
// Registers the new sample with `list`. `start` is the time the profile began.
AllocationSample(AllocationSampleList* list, absl::Time start);
~AllocationSample() override;
// Ends the profile and returns it. Rvalue-qualified, so it is invoked as
// std::move(sample).Stop().
Profile Stop() && override;
private:
// List this sample registers with and unregisters from.
AllocationSampleList* list_;
// Stack traces added by AllocationSampleList::ReportMalloc.
std::unique_ptr<StackTraceTable> mallocs_;
// Start time; Stop() subtracts it from the current time to get the duration.
absl::Time start_;
// Intrusive link to the next registered sample; written by
// AllocationSampleList.
AllocationSample* next_ = nullptr;
friend class AllocationSampleList;
};
class AllocationSampleList {
public:
constexpr AllocationSampleList() = default;
void Add(AllocationSample* as) {
absl::base_internal::SpinLockHolder h(&lock_);
as->next_ = first_;
first_ = as;
}
// This list is very short and we're nowhere near a hot path, just walk
void Remove(AllocationSample* as) {
absl::base_internal::SpinLockHolder h(&lock_);
AllocationSample** link = &first_;
AllocationSample* cur = first_;
while (cur != as) {
CHECK_CONDITION(cur != nullptr);
link = &cur->next_;
cur = cur->next_;
}
*link = as->next_;
}
void ReportMalloc(const struct StackTrace& sample) {
absl::base_internal::SpinLockHolder h(&lock_);
AllocationSample* cur = first_;
while (cur != nullptr) {
cur->mallocs_->AddTrace(1.0, sample);
cur = cur->next_;
}
}
private:
// Guard against any concurrent modifications on the list of allocation
// samples. Invoking `new` while holding this lock can lead to deadlock.
absl::base_internal::SpinLock lock_{
absl::kConstInit, absl::base_internal::SCHEDULE_KERNEL_ONLY};
AllocationSample* first_ ABSL_GUARDED_BY(lock_) = nullptr;
};
} // namespace tcmalloc::tcmalloc_internal
GOOGLE_MALLOC_SECTION_END
#endif // TCMALLOC_ALLOCATION_SAMPLE_H_

View File

@ -0,0 +1,132 @@
// Copyright 2022 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "tcmalloc/allocation_sample.h"
#include <stddef.h>
#include <memory>
#include <utility>
#include <vector>
#include "gtest/gtest.h"
#include "absl/base/thread_annotations.h"
#include "absl/random/bit_gen_ref.h"
#include "absl/random/random.h"
#include "absl/synchronization/mutex.h"
#include "absl/time/clock.h"
#include "absl/time/time.h"
#include "tcmalloc/internal/logging.h"
#include "tcmalloc/static_vars.h"
#include "tcmalloc/testing/thread_manager.h"
namespace tcmalloc::tcmalloc_internal {
namespace {
// Stress test: several threads concurrently register samplers, destroy them,
// Stop() them, and report allocations against one shared AllocationSampleList.
TEST(AllocationSample, Threaded) {
  // StackTraceTable uses a global allocator.  It must be initialized.
  tc_globals.InitIfNecessary();

  // This test exercises b/143623146 by ensuring that the state of the sample is
  // not modified before it is removed from the linked list.
  AllocationSampleList list;

  const int kThreads = 5;
  const int kMaxSamplers = 3;
  const int kMaxAllocations = 100;
  ThreadManager m;
  std::vector<absl::BitGen> thread_states(kThreads);

  // Samplers currently owned by the test, shared between all worker threads.
  struct GlobalState {
    absl::Mutex mu;
    std::vector<std::unique_ptr<AllocationSample>> samplers ABSL_GUARDED_BY(mu);
  } global;

  // Removes and returns a randomly chosen sampler, or nullptr when none are
  // stored.
  auto PopSample = [&](absl::BitGenRef rng) {
    std::unique_ptr<AllocationSample> ret;

    // Do our test bookkeeping separately, so we don't synchronize list
    // externally.
    absl::MutexLock l(&global.mu);
    if (global.samplers.empty()) {
      return ret;
    }
    // absl::Uniform draws from the half-open interval [lo, hi) by default, so
    // the upper bound must be size() rather than size() - 1; with the latter
    // the last sampler could never be chosen while others were present.
    size_t index = absl::Uniform<size_t>(rng, 0, global.samplers.size());
    // Swap the chosen sampler to the back so it can be popped in O(1).
    std::swap(global.samplers[index], global.samplers.back());
    ret = std::move(global.samplers.back());
    global.samplers.pop_back();

    CHECK_CONDITION(ret != nullptr);
    return ret;
  };

  m.Start(kThreads, [&](int thread) {
    auto& state = thread_states[thread];
    const double coin = absl::Uniform(state, 0., 1.0);

    if (coin < 0.1) {
      // Add a sampler.  This occurs implicitly in the AllocationSample
      // constructor.
      auto sampler = std::make_unique<AllocationSample>(&list, absl::Now());

      // Do our test bookkeeping separately, so we don't synchronize list
      // externally.
      {
        absl::MutexLock l(&global.mu);
        if (global.samplers.size() < kMaxSamplers) {
          // Add to the list.
          global.samplers.push_back(std::move(sampler));
        }
      }

      // If we didn't push it, we will unregister in ~AllocationSample.
    } else if (coin < 0.2) {
      std::unique_ptr<AllocationSample> sampler = PopSample(state);

      // Remove a sample and allow its destructor to handle unregistering.
      sampler.reset();
    } else if (coin < 0.25) {
      // Call Stop occasionally.
      std::unique_ptr<AllocationSample> sampler = PopSample(state);

      if (sampler) {
        std::move(*sampler).Stop();
      }
    } else {
      int allocations;
      {
        // StackTraceTable uses a global allocator, rather than one that is
        // injected.  Consult the global state to see how many allocations are
        // active.
        absl::base_internal::SpinLockHolder h(&pageheap_lock);
        allocations = tc_globals.linked_sample_allocator().stats().in_use;
      }
      // Stop reporting once enough allocations are active.
      if (allocations >= kMaxAllocations) {
        return;
      }

      StackTrace s{};
      s.requested_size = 16;
      s.allocated_size = 32;
      list.ReportMalloc(s);
    }
  });

  absl::SleepFor(absl::Milliseconds(1));

  m.Stop();
}
} // namespace
} // namespace tcmalloc::tcmalloc_internal

View File

@ -0,0 +1,383 @@
// Copyright 2022 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef TCMALLOC_ALLOCATION_SAMPLING_H_
#define TCMALLOC_ALLOCATION_SAMPLING_H_
#include <memory>
#include <utility>
#include "tcmalloc/cpu_cache.h"
#include "tcmalloc/guarded_page_allocator.h"
#include "tcmalloc/internal/logging.h"
#include "tcmalloc/malloc_extension.h"
#include "tcmalloc/pagemap.h"
#include "tcmalloc/sampler.h"
#include "tcmalloc/span.h"
#include "tcmalloc/stack_trace_table.h"
#include "tcmalloc/tcmalloc_policy.h"
#include "tcmalloc/thread_cache.h"
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc::tcmalloc_internal {
// Builds a fragmentation profile: for every live sampled allocation, the
// number of bytes of central-cache memory pinned by an allocation made at that
// stack trace.
//
// When a span hosts more than one small object (the sample has a non-null
// proxy), Span::Fragmentation() is consulted, which reads `span->allocated_`.
// That is safe because the per-sample lock is held while iterating over
// sampled allocations: the sampled allocation owning the proxy cannot finish
// deallocating, so the proxy cannot go back to the span, and therefore the
// central free list cannot return the span to the page heap.
template <typename State>
static std::unique_ptr<const ProfileBase> DumpFragmentationProfile(
    State& state) {
  auto table = std::make_unique<StackTraceTable>(ProfileType::kFragmentation);

  // Charges the fragmentation attributable to one sampled allocation.
  auto charge_sample = [&state, &table](const SampledAllocation& sample) {
    const StackTrace& trace = sample.sampled_stack;

    // No proxy: there is just one object per span, and neighboring spans can
    // be released back to the system, so nothing is charged to this sample.
    if (trace.proxy == nullptr) {
      return;
    }

    // Look up the span the proxy lives on so its co-residents can be
    // examined.
    Span* span = state.pagemap().GetDescriptor(PageIdContaining(trace.proxy));
    if (span == nullptr) {
      // Avoid crashes in production mode code, but report in tests.
      ASSERT(span != nullptr);
      return;
    }

    const double fragmentation = span->Fragmentation(trace.allocated_size);
    if (fragmentation > 0) {
      // Associate the memory warmth with the actual object, not the proxy.
      // The residency information (trace.span_start_address) is likely not
      // very useful, but we might as well pass it along.
      table->AddTrace(fragmentation, trace);
    }
  };

  state.sampled_allocation_recorder().Iterate(charge_sample);
  return table;
}
// Builds a heap profile containing every currently recorded sampled
// allocation, each added with a count of 1.0.
template <typename State>
static std::unique_ptr<const ProfileBase> DumpHeapProfile(State& state) {
  auto table = std::make_unique<StackTraceTable>(ProfileType::kHeap);

  auto record_sample = [&](const SampledAllocation& sample) {
    table->AddTrace(1.0, sample.sampled_stack);
  };
  state.sampled_allocation_recorder().Iterate(record_sample);

  return table;
}
// The Sampler used by the sampling helpers in this header.  It is declared
// thread_local, so each thread has its own instance, and `static`, so it has
// internal linkage.
ABSL_CONST_INIT static thread_local Sampler thread_sampler_
ABSL_ATTRIBUTE_INITIAL_EXEC;
// Returns a pointer to the calling thread's Sampler (never null).
inline Sampler* GetThreadSampler() { return &thread_sampler_; }
// Maps a guarded-sampling status to a go/no-go decision.
//
// Returns true only for Requested, Required and Guarded.  Every other status
// (including any value not listed in the switch) yields false.
inline bool ShouldGuardingBeAttempted(
    Profile::Sample::GuardedStatus guarded_status) {
  using Status = Profile::Sample::GuardedStatus;

  switch (guarded_status) {
    // Statuses for which a guarded allocation should be attempted.
    case Status::Requested:
    case Status::Required:
    case Status::Guarded:
      return true;

    // Statuses that carry a reason not to guard.
    case Status::LargerThanOnePage:
    case Status::Disabled:
    case Status::RateLimited:
    case Status::TooSmall:
    case Status::NoAvailableSlots:
    case Status::MProtectFailed:
    case Status::Filtered:
    case Status::Unknown:
    case Status::NotAttempted:
      return false;
  }
  // Reached only for a value outside the enumerators above.
  return false;
}
// Attempts a guarded allocation for a sampled object.
//
// Returns the result of GuardedPageAllocator::Allocate() when the allocation
// spans exactly one page and the thread's sampler says guarding should be
// attempted.  Otherwise returns a null allocation paired with the status that
// explains why no guarding took place.
template <typename State>
static GuardedPageAllocator::AllocWithStatus TrySampleGuardedAllocation(
    State& state, size_t size, size_t alignment, Length num_pages) {
  using Status = Profile::Sample::GuardedStatus;

  // Only single-page allocations are eligible.
  if (num_pages != Length(1)) {
    return {nullptr, Status::LargerThanOnePage};
  }

  const Status sampler_verdict =
      GetThreadSampler()->ShouldSampleGuardedAllocation();
  if (ShouldGuardingBeAttempted(sampler_verdict)) {
    // The num_pages == 1 constraint ensures that size <= kPageSize.  And
    // since alignments above kPageSize cause size_class == 0, we're also
    // guaranteed alignment <= kPageSize
    //
    // In all cases kPageSize <= GPA::page_size_, so Allocate's preconditions
    // are met.
    return state.guardedpage_allocator().Allocate(size, alignment);
  }

  // There is a reason not to guard; hand it back to the caller.
  return {nullptr, sampler_verdict};
}
// ShouldSampleAllocation() is called when an allocation of the given requested
// size is in progress. It returns the sampling weight of the allocation if it
// should be "sampled," and 0 otherwise. See SampleifyAllocation().
//
// Sampling is done based on requested sizes and later unskewed during profile
// generation.
inline size_t ShouldSampleAllocation(size_t size) {
return GetThreadSampler()->RecordAllocation(size);
}
// Returns the object `ptr` of class `size_class` to the first available cache
// tier: the per-CPU cache when it is in use, else the calling thread's
// ThreadCache when one exists, else the transfer cache directly.
template <typename State>
ABSL_ATTRIBUTE_NOINLINE static inline void FreeProxyObject(State& state,
                                                           void* ptr,
                                                           size_t size_class) {
  if (ABSL_PREDICT_TRUE(UsePerCpuCache(state))) {
    state.cpu_cache().Deallocate(ptr, size_class);
    return;
  }

  ThreadCache* const cache = ThreadCache::GetCacheIfPresent();
  if (ABSL_PREDICT_TRUE(cache != nullptr)) {
    cache->Deallocate(ptr, size_class);
    return;
  }

  // This thread doesn't have thread-cache yet or already.  Delete directly
  // into transfer cache.
  state.transfer_cache().InsertRange(size_class, absl::Span<void*>(&ptr, 1));
}
// Performs sampling for already occurred allocation of object.
//
// Exactly one of two call shapes is accepted (enforced by CHECK_CONDITION):
//   * small allocation: size_class != 0, obj != nullptr, span == nullptr
//   * large allocation: size_class == 0, obj == nullptr, span != nullptr
//
// For very small object sizes, object is used as 'proxy' and full
// page with sampled marked is allocated instead.
//
// For medium-sized objects that have single instance per span,
// they're simply freed and fresh page span is allocated to represent
// sampling.
//
// For large objects (i.e. allocated with do_malloc_pages) they are
// also fully reused and their span is marked as sampled.
//
// Note that do_free_with_size assumes sampled objects have
// page-aligned addresses. Please change both functions if need to
// invalidate the assumption.
//
// Note that size_class might not match requested_size in case of
// memalign. I.e. when larger than requested allocation is done to
// satisfy alignment constraint.
//
// In case of out-of-memory condition when allocating span or
// stacktrace struct, this function simply cheats and returns original
// object. As if no sampling was requested.
//
// If `capacity` is non-null, the size recorded as allocated is written through
// it.  Returns the address the caller should hand out: the guarded allocation
// when one was made, `obj` on the out-of-memory fallback, and otherwise the
// start address of the sampled span.
template <typename State, typename Policy>
static void* SampleifyAllocation(State& state, Policy policy,
size_t requested_size, size_t weight,
size_t size_class, void* obj, Span* span,
size_t* capacity) {
CHECK_CONDITION((size_class != 0 && obj != nullptr && span == nullptr) ||
(size_class == 0 && obj == nullptr && span != nullptr));
StackTrace stack_trace;
stack_trace.proxy = nullptr;
stack_trace.requested_size = requested_size;
// Grab the stack trace outside the heap lock.
stack_trace.depth = absl::GetStackTrace(stack_trace.stack, kMaxStackDepth, 0);
// requested_alignment = 1 means 'small size table alignment was used'
// Historically this is reported as requested_alignment = 0
stack_trace.requested_alignment = policy.align();
if (stack_trace.requested_alignment == 1) {
stack_trace.requested_alignment = 0;
}
stack_trace.requested_size_returning = capacity != nullptr;
stack_trace.access_hint = static_cast<uint8_t>(policy.access());
stack_trace.weight = weight;
// Stays {nullptr, NotAttempted} on the large-allocation path, where no
// guarded allocation is tried.
GuardedPageAllocator::AllocWithStatus alloc_with_status{
nullptr, Profile::Sample::GuardedStatus::NotAttempted};
if (size_class != 0) {
ASSERT(size_class == state.pagemap().sizeclass(PageIdContaining(obj)));
stack_trace.allocated_size = state.sizemap().class_to_size(size_class);
stack_trace.cold_allocated = IsExpandedSizeClass(size_class);
// If the caller didn't provide a span, allocate one:
Length num_pages = BytesToLengthCeil(stack_trace.allocated_size);
alloc_with_status = TrySampleGuardedAllocation(
state, requested_size, stack_trace.requested_alignment, num_pages);
if (alloc_with_status.status == Profile::Sample::GuardedStatus::Guarded) {
ASSERT(IsSampledMemory(alloc_with_status.alloc));
const PageId p = PageIdContaining(alloc_with_status.alloc);
// Creating the Span and registering it in the pagemap is done under
// pageheap_lock.
absl::base_internal::SpinLockHolder h(&pageheap_lock);
span = Span::New(p, num_pages);
state.pagemap().Set(p, span);
// If we report capacity back from a size returning allocation, we can not
// report the allocated_size, as we guard the size to 'requested_size',
// and we maintain the invariant that GetAllocatedSize() must match the
// returned size from size returning allocations. So in that case, we
// report the requested size for both capacity and GetAllocatedSize().
if (capacity) stack_trace.allocated_size = requested_size;
} else if ((span = state.page_allocator().New(
num_pages, 1, MemoryTag::kSampled)) == nullptr) {
// Out of memory for the sampled span: give up on sampling and return the
// original object unchanged.
if (capacity) *capacity = stack_trace.allocated_size;
return obj;
}
size_t span_size =
Length(state.sizemap().class_to_pages(size_class)).in_bytes();
size_t objects_per_span = span_size / stack_trace.allocated_size;
if (objects_per_span != 1) {
ASSERT(objects_per_span > 1);
// Several objects share the original span, so `obj` is kept alive as the
// proxy recorded in the trace.  Clearing `obj` keeps it from being freed
// at the end of this function.
stack_trace.proxy = obj;
obj = nullptr;
}
} else {
// Set allocated_size to the exact size for a page allocation.
// NOTE: if we introduce gwp-asan sampling / guarded allocations
// for page allocations, then we need to revisit do_malloc_pages as
// the current assumption is that only class sized allocs are sampled
// for gwp-asan.
stack_trace.allocated_size = span->bytes_in_span();
stack_trace.cold_allocated = IsColdMemory(span->start_address());
}
if (capacity) *capacity = stack_trace.allocated_size;
ASSERT(span != nullptr);
// The handle is the generator's value after this increment (fetch_add returns
// the previous value, hence the + 1).
stack_trace.sampled_alloc_handle =
state.sampled_alloc_handle_generator.fetch_add(
1, std::memory_order_relaxed) +
1;
stack_trace.span_start_address = span->start_address();
stack_trace.allocation_time = absl::Now();
stack_trace.guarded_status = static_cast<int>(alloc_with_status.status);
// How many allocations does this sample represent, given the sampling
// frequency (weight) and its size.
const double allocation_estimate =
static_cast<double>(weight) / (requested_size + 1);
// Adjust our estimate of internal fragmentation.
ASSERT(requested_size <= stack_trace.allocated_size);
if (requested_size < stack_trace.allocated_size) {
state.sampled_internal_fragmentation_.Add(
allocation_estimate * (stack_trace.allocated_size - requested_size));
}
state.allocation_samples.ReportMalloc(stack_trace);
state.deallocation_samples.ReportMalloc(stack_trace);
// The SampledAllocation object is visible to readers after this. Readers only
// care about its various metadata (e.g. stack trace, weight) to generate the
// heap profile, and won't need any information from Span::Sample() next.
SampledAllocation* sampled_allocation =
state.sampled_allocation_recorder().Register(std::move(stack_trace));
// No pageheap_lock required. The span is freshly allocated and no one else
// can access it. It is visible after we return from this allocation path.
span->Sample(sampled_allocation);
state.peak_heap_tracker().MaybeSaveSample();
// `obj` is still set here only when it was not kept as the proxy above; the
// caller receives the sampled span's memory instead, so the original object
// is released.
if (obj != nullptr) {
// We are not maintaining precise statistics on malloc hit/miss rates at our
// cache tiers. We can deallocate into our ordinary cache.
ASSERT(size_class != 0);
FreeProxyObject(state, obj, size_class);
}
// Prefer the guarded allocation's address when one was made.
return (alloc_with_status.alloc != nullptr) ? alloc_with_status.alloc
: span->start_address();
}
template <typename State>
inline void MaybeUnsampleAllocation(State& state, void* ptr, Span* span) {
// No pageheap_lock required. The sampled span should be unmarked and have its
// state cleared only once. External synchronization when freeing is required;
// otherwise, concurrent writes here would likely report a double-free.
if (SampledAllocation* sampled_allocation = span->Unsample()) {
void* const proxy = sampled_allocation->sampled_stack.proxy;
const size_t weight = sampled_allocation->sampled_stack.weight;
const size_t requested_size =
sampled_allocation->sampled_stack.requested_size;
const size_t allocated_size =
sampled_allocation->sampled_stack.allocated_size;
const size_t alignment =
sampled_allocation->sampled_stack.requested_alignment;
// How many allocations does this sample represent, given the sampling
// frequency (weight) and its size.
const double allocation_estimate =
static_cast<double>(weight) / (requested_size + 1);
AllocHandle sampled_alloc_handle =
sampled_allocation->sampled_stack.sampled_alloc_handle;
state.sampled_allocation_recorder().Unregister(sampled_allocation);
// Adjust our estimate of internal fragmentation.
ASSERT(requested_size <= allocated_size);
if (requested_size < allocated_size) {
const size_t sampled_fragmentation =
allocation_estimate * (allocated_size - requested_size);
// Check against wraparound
ASSERT(state.sampled_internal_fragmentation_.value() >=
sampled_fragmentation);
state.sampled_internal_fragmentation_.Add(-sampled_fragmentation);
}
state.deallocation_samples.ReportFree(sampled_alloc_handle);
if (proxy) {
const auto policy = CppPolicy().InSameNumaPartitionAs(proxy);
size_t size_class;
if (AccessFromPointer(proxy) == AllocationAccess::kCold) {
size_class = state.sizemap().SizeClass(
policy.AccessAsCold().AlignAs(alignment), allocated_size);
} else {
size_class = state.sizemap().SizeClass(
policy.AccessAsHot().AlignAs(alignment), allocated_size);
}
ASSERT(size_class == state.pagemap().sizeclass(PageIdContaining(proxy)));
FreeProxyObject(state, proxy, size_class);
}
}
}
// Samples a page-level allocation that already owns `span`.  Forwards to
// SampleifyAllocation() using its large-allocation call shape: size class 0
// and no object pointer.
template <typename State, typename Policy, typename CapacityPtr>
static void* SampleLargeAllocation(State& state, Policy policy,
                                   size_t requested_size, size_t weight,
                                   Span* span, CapacityPtr capacity) {
  constexpr size_t kNoSizeClass = 0;
  void* const no_object = nullptr;
  return SampleifyAllocation(state, policy, requested_size, weight,
                             kNoSizeClass, no_object, span, capacity);
}
template <typename State, typename Policy, typename CapacityPtr>
static void* SampleSmallAllocation(State& state, Policy policy,
size_t requested_size, size_t weight,
size_t size_class, void* obj,
CapacityPtr capacity) {
return SampleifyAllocation(state, policy, requested_size, weight, size_class,
obj, nullptr, capacity);
}
} // namespace tcmalloc::tcmalloc_internal
GOOGLE_MALLOC_SECTION_END
#endif // TCMALLOC_ALLOCATION_SAMPLING_H_

View File

@ -0,0 +1,85 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "tcmalloc/arena.h"
#include <new>
#include "tcmalloc/internal/logging.h"
#include "tcmalloc/static_vars.h"
#include "tcmalloc/system-alloc.h"
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc {
namespace tcmalloc_internal {
// Carves `bytes` bytes, aligned to `alignment`, out of the current arena
// block.  When the current block cannot satisfy the request a new block of at
// least kAllocIncrement bytes is obtained from the system; whatever was left
// in the old block is then accounted as unavailable.  Crashes the process if
// the system allocation fails, so the return value is never null.
//
// Alignment padding is counted in bytes_allocated_.
void* Arena::Alloc(size_t bytes, std::align_val_t alignment) {
  const size_t align = static_cast<size_t>(alignment);
  ASSERT(align > 0);
  {  // First we need to move up to the correct alignment.
    // Computed in size_t so the result of the modulo is never narrowed.
    const size_t misalignment =
        reinterpret_cast<uintptr_t>(free_area_) % align;
    const size_t alignment_bytes =
        misalignment != 0 ? align - misalignment : 0;
    // Never consume more than what is left in the block: subtracting a larger
    // padding would wrap free_avail_ around and let the allocation below be
    // carved from beyond the end of the block.  When the padding is truncated
    // free_avail_ drops to 0, so any non-zero request takes the refill path.
    const size_t padding =
        alignment_bytes < free_avail_ ? alignment_bytes : free_avail_;
    free_area_ += padding;
    free_avail_ -= padding;
    bytes_allocated_ += padding;
  }
  char* result;
  if (free_avail_ < bytes) {
    const size_t ask = bytes > kAllocIncrement ? bytes : kAllocIncrement;
    // TODO(b/171081864): Arena allocations should be made relatively
    // infrequently.  Consider tagging this memory with sampled objects which
    // are also infrequently allocated.
    //
    // In the meantime it is important that we use the current NUMA partition
    // rather than always using a particular one because it's possible that any
    // single partition we choose might only contain nodes that the process is
    // unable to allocate from due to cgroup restrictions.
    MemoryTag tag;
    const auto& numa_topology = tc_globals.numa_topology();
    if (numa_topology.numa_aware()) {
      tag = NumaNormalTag(numa_topology.GetCurrentPartition());
    } else {
      tag = MemoryTag::kNormal;
    }

    auto [ptr, actual_size] = SystemAlloc(ask, kPageSize, tag);
    free_area_ = reinterpret_cast<char*>(ptr);
    if (ABSL_PREDICT_FALSE(free_area_ == nullptr)) {
      // Report the size that was actually requested from the system (`ask`),
      // which exceeds kAllocIncrement for large requests.
      Crash(kCrash, __FILE__, __LINE__,
            "FATAL ERROR: Out of memory trying to allocate internal tcmalloc "
            "data (bytes, object-size); is something preventing mmap from "
            "succeeding (sandbox, VSS limitations)?",
            ask, bytes);
    }
    SystemBack(free_area_, actual_size);

    // We've discarded the previous free_area_, so any bytes that were
    // unallocated are effectively inaccessible to future allocations.
    bytes_unavailable_ += free_avail_;
    blocks_++;
    free_avail_ = actual_size;
  }

  // NOTE(review): a fresh block is requested with kPageSize alignment only, so
  // this presumably relies on `align` dividing the block's start address --
  // confirm for alignments that are not powers of two or exceed kPageSize.
  ASSERT(reinterpret_cast<uintptr_t>(free_area_) % align == 0);
  result = free_area_;
  free_area_ += bytes;
  free_avail_ -= bytes;
  bytes_allocated_ += bytes;
  return reinterpret_cast<void*>(result);
}
} // namespace tcmalloc_internal
} // namespace tcmalloc
GOOGLE_MALLOC_SECTION_END

View File

@ -0,0 +1,107 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef TCMALLOC_ARENA_H_
#define TCMALLOC_ARENA_H_
#include <stddef.h>
#include <stdint.h>
#include <new>
#include "absl/base/attributes.h"
#include "absl/base/thread_annotations.h"
#include "tcmalloc/common.h"
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc {
namespace tcmalloc_internal {
// Snapshot of an Arena's accounting counters, as returned by Arena::stats().
// All values are byte counts except `blocks`.
struct ArenaStats {
// The number of bytes allocated and in-use by calls to Alloc().
size_t bytes_allocated;
// The number of bytes currently reserved for future calls to Alloc().
size_t bytes_unallocated;
// The number of bytes lost and unavailable to calls to Alloc() due to
// inefficiencies in Arena.
size_t bytes_unavailable;
// The number of allocated bytes that have subsequently become non-resident,
// e.g. due to the slab being resized. Note that these bytes are disjoint from
// the ones counted in `bytes_allocated`.
size_t bytes_nonresident;
// The number of blocks allocated by the Arena.
size_t blocks;
};
// Arena allocation; designed for use by tcmalloc internal data structures like
// spans, profiles, etc. Always expands.
//
// Every member function that touches state requires pageheap_lock to be held
// (see the thread-safety annotations).  Arenas cannot be copied or assigned.
class Arena {
public:
// constexpr constructor; every data member is set by its default member
// initializer below.
constexpr Arena() {}
// Returns a properly aligned byte array of length "bytes". Crashes if
// allocation fails. Requires pageheap_lock is held.
// `alignment` defaults to kAlignment.
ABSL_ATTRIBUTE_RETURNS_NONNULL void* Alloc(
size_t bytes, std::align_val_t alignment = kAlignment)
ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
// Updates the stats for allocated and non-resident bytes.
// Both arguments are signed deltas added to the respective counter; the
// ASSERTs check that neither counter would become negative.
void UpdateAllocatedAndNonresident(int64_t allocated, int64_t nonresident)
ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
ASSERT(static_cast<int64_t>(bytes_allocated_) + allocated >= 0);
bytes_allocated_ += allocated;
ASSERT(static_cast<int64_t>(bytes_nonresident_) + nonresident >= 0);
bytes_nonresident_ += nonresident;
}
// Returns statistics about memory allocated and managed by this Arena.
// The result is a copy of the current counters; `bytes_unallocated` is the
// space left in the current block (free_avail_).
ArenaStats stats() const ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
ArenaStats s;
s.bytes_allocated = bytes_allocated_;
s.bytes_unallocated = free_avail_;
s.bytes_unavailable = bytes_unavailable_;
s.bytes_nonresident = bytes_nonresident_;
s.blocks = blocks_;
return s;
}
private:
// How much to allocate from system at a time (128 KiB).
static constexpr int kAllocIncrement = 128 << 10;
// Free area from which to carve new objects
char* free_area_ ABSL_GUARDED_BY(pageheap_lock) = nullptr;
// Number of bytes still available at free_area_.
size_t free_avail_ ABSL_GUARDED_BY(pageheap_lock) = 0;
// Total number of bytes allocated from this arena
size_t bytes_allocated_ ABSL_GUARDED_BY(pageheap_lock) = 0;
// The number of bytes that are unused and unavailable for future allocations
// because they are at the end of a discarded arena block.
size_t bytes_unavailable_ ABSL_GUARDED_BY(pageheap_lock) = 0;
// The number of bytes on the arena that have been MADV_DONTNEEDed away. Note
// that these bytes are disjoint from the ones counted in `bytes_allocated`.
size_t bytes_nonresident_ ABSL_GUARDED_BY(pageheap_lock) = 0;
// Total number of blocks/free areas managed by this Arena.
size_t blocks_ ABSL_GUARDED_BY(pageheap_lock) = 0;
// Not copyable, not assignable.
Arena(const Arena&) = delete;
Arena& operator=(const Arena&) = delete;
};
} // namespace tcmalloc_internal
} // namespace tcmalloc
GOOGLE_MALLOC_SECTION_END
#endif // TCMALLOC_ARENA_H_

View File

@ -0,0 +1,158 @@
// Copyright 2021 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "tcmalloc/arena.h"
#include <stdint.h>
#include <new>
#include "gtest/gtest.h"
namespace tcmalloc {
namespace tcmalloc_internal {
namespace {
// Test helper: converts a plain integer into the std::align_val_t that
// Arena::Alloc takes as its alignment argument.
std::align_val_t Align(int align) {
  const auto as_alignment = static_cast<std::align_val_t>(align);
  return as_alignment;
}
// Verifies that every pointer returned by Arena::Alloc is a multiple of the
// requested alignment.
TEST(Arena, AlignedAlloc) {
Arena arena;
// Arena::Alloc requires pageheap_lock; hold it for the whole test.
absl::base_internal::SpinLockHolder h(&pageheap_lock);
EXPECT_EQ(reinterpret_cast<uintptr_t>(arena.Alloc(64, Align(64))) % 64, 0);
// Default alignment (kAlignment); the check expects at least 8-byte alignment.
EXPECT_EQ(reinterpret_cast<uintptr_t>(arena.Alloc(7)) % 8, 0);
EXPECT_EQ(reinterpret_cast<uintptr_t>(arena.Alloc(128, Align(64))) % 64, 0);
// Sweep every alignment from 1 to 99, including values that are not powers
// of two.
for (int alignment = 1; alignment < 100; ++alignment) {
EXPECT_EQ(reinterpret_cast<uintptr_t>(arena.Alloc(7, Align(alignment))) %
alignment,
0);
}
}
// Verifies the counters reported by Arena::stats() for a fresh arena, after a
// first allocation, and after an allocation that forces a second block.
TEST(Arena, Stats) {
  Arena arena;

  // A freshly constructed arena reports all-zero statistics.
  ArenaStats stats;
  {
    absl::base_internal::SpinLockHolder h(&pageheap_lock);
    stats = arena.stats();
  }
  EXPECT_EQ(stats.bytes_allocated, 0);
  EXPECT_EQ(stats.bytes_unallocated, 0);
  EXPECT_EQ(stats.bytes_unavailable, 0);
  EXPECT_EQ(stats.bytes_nonresident, 0);
  EXPECT_EQ(stats.blocks, 0);

  // Trigger an allocation and grab new stats.
  ArenaStats stats_after_alloc;
  void* ptr;
  {
    absl::base_internal::SpinLockHolder h(&pageheap_lock);
    ptr = arena.Alloc(1, Align(1));
    stats_after_alloc = arena.stats();
  }
  EXPECT_NE(ptr, nullptr);
  EXPECT_EQ(stats_after_alloc.bytes_allocated, 1);
  EXPECT_GE(stats_after_alloc.bytes_unallocated, 0);
  EXPECT_EQ(stats_after_alloc.bytes_unavailable, 0);
  EXPECT_EQ(stats_after_alloc.bytes_nonresident, 0);
  EXPECT_EQ(stats_after_alloc.blocks, 1);

  // Trigger an allocation that is larger than the remaining free bytes.
  //
  // TODO(b/201694482): Optimize this.
  ArenaStats stats_after_alloc2;
  {
    absl::base_internal::SpinLockHolder h(&pageheap_lock);
    ptr = arena.Alloc(stats_after_alloc.bytes_unallocated + 1, Align(1));
    stats_after_alloc2 = arena.stats();
  }
  EXPECT_NE(ptr, nullptr);
  // 1 byte from the first allocation plus (unallocated + 1) from the second.
  EXPECT_EQ(stats_after_alloc2.bytes_allocated,
            stats_after_alloc.bytes_unallocated + 2);
  EXPECT_GE(stats_after_alloc2.bytes_unallocated, 0);
  // The remainder of the first block was abandoned when the second block was
  // obtained.
  EXPECT_EQ(stats_after_alloc2.bytes_unavailable,
            stats_after_alloc.bytes_unallocated);
  // Check the second snapshot here; the first one was already verified above.
  EXPECT_EQ(stats_after_alloc2.bytes_nonresident, 0);
  EXPECT_EQ(stats_after_alloc2.blocks, 2);
}
// Verifies that UpdateAllocatedAndNonresident() moves bytes between the
// allocated and non-resident counters in both directions.
TEST(Arena, ReportUnmapped) {
  Arena arena;
  ArenaStats stats_after_alloc;
  void* ptr = nullptr;

  // Applies the given deltas and returns the resulting stats, all under a
  // single acquisition of pageheap_lock.
  const auto update_and_snapshot = [&arena](int64_t allocated,
                                            int64_t nonresident) {
    absl::base_internal::SpinLockHolder h(&pageheap_lock);
    arena.UpdateAllocatedAndNonresident(allocated, nonresident);
    return arena.stats();
  };

  // Start from a 10-byte allocation.
  {
    absl::base_internal::SpinLockHolder h(&pageheap_lock);
    ptr = arena.Alloc(10, Align(1));
    stats_after_alloc = arena.stats();
  }
  EXPECT_NE(ptr, nullptr);
  EXPECT_EQ(stats_after_alloc.bytes_allocated, 10);
  EXPECT_EQ(stats_after_alloc.bytes_nonresident, 0);

  // Move 5 bytes from allocated to non-resident.
  stats_after_alloc = update_and_snapshot(-5, 5);
  EXPECT_EQ(stats_after_alloc.bytes_allocated, 5);
  EXPECT_EQ(stats_after_alloc.bytes_nonresident, 5);

  // Move 3 bytes back.
  stats_after_alloc = update_and_snapshot(3, -3);
  EXPECT_EQ(stats_after_alloc.bytes_allocated, 8);
  EXPECT_EQ(stats_after_alloc.bytes_nonresident, 2);
}
// Verifies that bytes added to the allocated counter ahead of time via
// UpdateAllocatedAndNonresident() can be taken back and replaced by a real
// allocation of the same size, leaving the counter unchanged.
TEST(Arena, BytesImpending) {
  Arena arena;

  // Reads the arena's stats under pageheap_lock.
  const auto locked_stats = [&arena] {
    absl::base_internal::SpinLockHolder h(&pageheap_lock);
    return arena.stats();
  };

  ArenaStats stats = locked_stats();
  EXPECT_EQ(stats.bytes_allocated, 0);

  // Account for 100 bytes without allocating them.
  {
    absl::base_internal::SpinLockHolder h(&pageheap_lock);
    arena.UpdateAllocatedAndNonresident(100, 0);
    stats = arena.stats();
  }
  EXPECT_EQ(stats.bytes_allocated, 100);

  // Undo the accounting and perform the real allocation in one critical
  // section.
  void* ptr = nullptr;
  {
    absl::base_internal::SpinLockHolder h(&pageheap_lock);
    arena.UpdateAllocatedAndNonresident(-100, 0);
    ptr = arena.Alloc(100, Align(1));
    stats = arena.stats();
  }
  EXPECT_NE(ptr, nullptr);
  EXPECT_EQ(stats.bytes_allocated, 100);
}
} // namespace
} // namespace tcmalloc_internal
} // namespace tcmalloc

View File

@ -0,0 +1,131 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <errno.h>
#include "absl/base/internal/sysinfo.h"
#include "absl/time/clock.h"
#include "absl/time/time.h"
#include "tcmalloc/cpu_cache.h"
#include "tcmalloc/internal/logging.h"
#include "tcmalloc/internal/percpu.h"
#include "tcmalloc/internal_malloc_extension.h"
#include "tcmalloc/malloc_extension.h"
#include "tcmalloc/parameters.h"
#include "tcmalloc/static_vars.h"
// Background maintenance loop: release memory to the system at a constant
// rate and periodically reclaim, shuffle, resize and plunder the caches.
//
// This function never returns: it loops forever, sleeping kSleepTime between
// iterations.
void MallocExtension_Internal_ProcessBackgroundActions() {
using ::tcmalloc::tcmalloc_internal::Parameters;
using ::tcmalloc::tcmalloc_internal::tc_globals;
tcmalloc::MallocExtension::MarkThreadIdle();
// Time of the previous iteration; used to scale the amount of memory released.
absl::Time prev_time = absl::Now();
constexpr absl::Duration kSleepTime = absl::Seconds(1);
// Reclaim inactive per-cpu caches once per kCpuCacheReclaimPeriod.
//
// We use a longer 30 sec reclaim period to make sure that caches are indeed
// idle. Reclaim drains entire cache, as opposed to cache shuffle for instance
// that only shrinks a cache by a few objects at a time. So, we might have
// larger performance degradation if we use a shorter reclaim interval and
// drain caches that weren't supposed to.
constexpr absl::Duration kCpuCacheReclaimPeriod = absl::Seconds(30);
absl::Time last_reclaim = absl::Now();
// Shuffle per-cpu caches once per kCpuCacheShufflePeriod.
constexpr absl::Duration kCpuCacheShufflePeriod = absl::Seconds(5);
absl::Time last_shuffle = absl::Now();
// See if we should resize the slab once per kCpuCacheSlabResizePeriod. This
// period is coprime to kCpuCacheShufflePeriod and kCpuCacheReclaimPeriod.
constexpr absl::Duration kCpuCacheSlabResizePeriod = absl::Seconds(29);
absl::Time last_slab_resize_check = absl::Now();
#ifndef TCMALLOC_SMALL_BUT_SLOW
// We reclaim unused objects from the transfer caches once per
// kTransferCachePlunderPeriod.
constexpr absl::Duration kTransferCachePlunderPeriod = absl::Seconds(5);
absl::Time last_transfer_cache_plunder_check = absl::Now();
// Resize transfer caches once per kTransferCacheResizePeriod.
constexpr absl::Duration kTransferCacheResizePeriod = absl::Seconds(2);
absl::Time last_transfer_cache_resize_check = absl::Now();
#endif
while (true) {
absl::Time now = absl::Now();
// We follow the cache hierarchy in TCMalloc from outermost (per-CPU) to
// innermost (the page heap). Freeing up objects at one layer can help aid
// memory coalescing for inner caches.
if (tcmalloc::MallocExtension::PerCpuCachesActive()) {
// Accelerate fences as part of this operation by registering this thread
// with rseq. While this is not strictly required to succeed, we do not
// expect an inconsistent state for rseq (some threads registered and some
// threads unable to).
CHECK_CONDITION(tcmalloc::tcmalloc_internal::subtle::percpu::IsFast());
// Try to reclaim per-cpu caches once every kCpuCacheReclaimPeriod
// when enabled.
if (now - last_reclaim >= kCpuCacheReclaimPeriod) {
tc_globals.cpu_cache().TryReclaimingCaches();
last_reclaim = now;
}
// Shuffle per-cpu caches once every kCpuCacheShufflePeriod when the
// shuffle_per_cpu_caches parameter is set.
if (Parameters::shuffle_per_cpu_caches() &&
now - last_shuffle >= kCpuCacheShufflePeriod) {
tc_globals.cpu_cache().ShuffleCpuCaches();
last_shuffle = now;
}
// See if we need to resize the slab once every kCpuCacheSlabResizePeriod
// when enabled.
if (Parameters::per_cpu_caches_dynamic_slab_enabled() &&
now - last_slab_resize_check >= kCpuCacheSlabResizePeriod) {
tc_globals.cpu_cache().ResizeSlabIfNeeded();
last_slab_resize_check = now;
}
}
// The sharded transfer cache is plundered on every iteration, with no
// period check.
tc_globals.sharded_transfer_cache().Plunder();
#ifndef TCMALLOC_SMALL_BUT_SLOW
// Try to plunder and reclaim unused objects from transfer caches.
if (now - last_transfer_cache_plunder_check >=
kTransferCachePlunderPeriod &&
Parameters::partial_transfer_cache()) {
tc_globals.transfer_cache().TryPlunder();
last_transfer_cache_plunder_check = now;
}
if (now - last_transfer_cache_resize_check >= kTransferCacheResizePeriod) {
tc_globals.transfer_cache().TryResizingCaches();
last_transfer_cache_resize_check = now;
}
#endif
// Bytes to release this iteration: the configured release rate multiplied by
// the seconds elapsed since the previous iteration.
const ssize_t bytes_to_release =
static_cast<size_t>(Parameters::background_release_rate()) *
absl::ToDoubleSeconds(now - prev_time);
if (bytes_to_release > 0) { // may be negative if time goes backwards
tcmalloc::MallocExtension::ReleaseMemoryToSystem(bytes_to_release);
}
prev_time = now;
absl::SleepFor(kSleepTime);
}
}

View File

@ -0,0 +1,116 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "tcmalloc/central_freelist.h"
#include <stdint.h>
#include "tcmalloc/internal/linked_list.h"
#include "tcmalloc/internal/logging.h"
#include "tcmalloc/internal/optimization.h"
#include "tcmalloc/internal/prefetch.h"
#include "tcmalloc/page_heap.h"
#include "tcmalloc/pagemap.h"
#include "tcmalloc/pages.h"
#include "tcmalloc/static_vars.h"
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc {
namespace tcmalloc_internal {
namespace central_freelist_internal {
// Picks the MemoryTag used for spans that back `size_class`.
//
// Expanded size classes are tagged kCold. All other classes are tagged kNormal
// unless the NUMA topology reports that it is NUMA-aware, in which case the
// tag is NumaNormalTag(size_class / kNumBaseClasses).
static MemoryTag MemoryTagFromSizeClass(size_t size_class) {
  if (IsExpandedSizeClass(size_class)) return MemoryTag::kCold;

  const bool numa_aware = tc_globals.numa_topology().numa_aware();
  return numa_aware ? NumaNormalTag(size_class / kNumBaseClasses)
                    : MemoryTag::kNormal;
}
// Looks up the object size for `size_class` in the global size map.
size_t StaticForwarder::class_to_size(int size_class) {
  const size_t object_size = tc_globals.sizemap().class_to_size(size_class);
  return object_size;
}
// Looks up the per-span page count for `size_class` in the global size map and
// wraps it in a Length.
Length StaticForwarder::class_to_pages(int size_class) {
  const auto pages = tc_globals.sizemap().class_to_pages(size_class);
  return Length(pages);
}
// Returns the existing page map descriptor (Span) for the page that contains
// `object`.
Span* StaticForwarder::MapObjectToSpan(const void* object) {
  return tc_globals.pagemap().GetExistingDescriptor(PageIdContaining(object));
}
// Requests a span of `pages_per_span` pages from the page allocator, tagged
// according to `size_class`, and registers the size class for it in the page
// map.
//
// Returns nullptr when the page allocator does not hand back a span.
Span* StaticForwarder::AllocateSpan(int size_class, size_t objects_per_span,
                                    Length pages_per_span) {
  const MemoryTag tag = MemoryTagFromSizeClass(size_class);
  Span* const span =
      tc_globals.page_allocator().New(pages_per_span, objects_per_span, tag);
  if (ABSL_PREDICT_TRUE(span != nullptr)) {
    // The new span must carry the requested tag and size.
    ASSERT(tag == GetMemoryTag(span->start_address()));
    ASSERT(span->num_pages() == pages_per_span);
    tc_globals.pagemap().RegisterSizeClass(span, size_class);
    return span;
  }
  return nullptr;
}
// Hands every span in `free_spans` back to the page allocator. pageheap_lock is
// acquired once and held for the whole batch.
static void ReturnSpansToPageHeap(MemoryTag tag, absl::Span<Span*> free_spans,
                                  size_t objects_per_span)
    ABSL_LOCKS_EXCLUDED(pageheap_lock) {
  absl::base_internal::SpinLockHolder pageheap_guard(&pageheap_lock);
  for (size_t i = 0; i < free_spans.size(); ++i) {
    Span* const span = free_spans[i];
    ASSERT(tag == GetMemoryTag(span->start_address()));
    tc_globals.page_allocator().Delete(span, objects_per_span, tag);
  }
}
void StaticForwarder::DeallocateSpans(int size_class, size_t objects_per_span,
absl::Span<Span*> free_spans) {
// Unregister size class doesn't require holding any locks.
for (Span* const free_span : free_spans) {
ASSERT(IsNormalMemory(free_span->start_address()) ||
IsColdMemory(free_span->start_address()));
tc_globals.pagemap().UnregisterSizeClass(free_span);
// Before taking pageheap_lock, prefetch the PageTrackers these spans are
// on.
//
// Small-but-slow does not use the HugePageAwareAllocator (by default), so
// do not prefetch on this config.
#ifndef TCMALLOC_SMALL_BUT_SLOW
const PageId p = free_span->first_page();
// In huge_page_filler.h, we static_assert that PageTracker's key elements
// for deallocation are within the first two cachelines.
void* pt = tc_globals.pagemap().GetHugepage(p);
// Prefetch for writing, as we will issue stores to the PageTracker
// instance.
PrefetchW(pt);
PrefetchW(reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(pt) +
ABSL_CACHELINE_SIZE));
#endif // TCMALLOC_SMALL_BUT_SLOW
}
const MemoryTag tag = MemoryTagFromSizeClass(size_class);
ReturnSpansToPageHeap(tag, free_spans, objects_per_span);
}
} // namespace central_freelist_internal
} // namespace tcmalloc_internal
} // namespace tcmalloc
GOOGLE_MALLOC_SECTION_END

View File

@ -0,0 +1,581 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef TCMALLOC_CENTRAL_FREELIST_H_
#define TCMALLOC_CENTRAL_FREELIST_H_
#include <stddef.h>
#include <algorithm>
#include <cstddef>
#include "absl/base/attributes.h"
#include "absl/base/const_init.h"
#include "absl/base/internal/spinlock.h"
#include "absl/base/macros.h"
#include "absl/base/thread_annotations.h"
#include "tcmalloc/common.h"
#include "tcmalloc/hinted_tracker_lists.h"
#include "tcmalloc/internal/atomic_stats_counter.h"
#include "tcmalloc/internal/optimization.h"
#include "tcmalloc/pages.h"
#include "tcmalloc/parameters.h"
#include "tcmalloc/span.h"
#include "tcmalloc/span_stats.h"
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc {
namespace tcmalloc_internal {
namespace central_freelist_internal {
// StaticForwarder provides access to the PageMap and page heap.
//
// This is a class, rather than namespaced globals, so that it can be mocked for
// testing.
class StaticForwarder {
public:
// Returns the object size for `size_class` as reported by the global size map.
static size_t class_to_size(int size_class);
// Returns the number of pages per span for `size_class` as reported by the
// global size map.
static Length class_to_pages(int size_class);
// Returns the existing page map descriptor (Span) for the page containing
// `object`.
static Span* MapObjectToSpan(const void* object);
// Allocates a span of `pages_per_span` pages from the page allocator and
// registers `size_class` for it in the page map. Returns nullptr if the page
// allocator does not return a span.
static Span* AllocateSpan(int size_class, size_t objects_per_span,
Length pages_per_span)
ABSL_LOCKS_EXCLUDED(pageheap_lock);
// Unregisters the size class of each span in `free_spans`, then returns the
// spans to the page allocator while holding pageheap_lock.
static void DeallocateSpans(int size_class, size_t objects_per_span,
absl::Span<Span*> free_spans)
ABSL_LOCKS_EXCLUDED(pageheap_lock);
};
// Number of nonempty_ lists a CentralFreeList uses to track spans that still
// have free objects. Builds with TCMALLOC_SMALL_BUT_SLOW keep a single list
// instead.
static constexpr size_t kNumLists = 8;
// Data kept per size-class in central cache.
//
// ForwarderT supplies access to the size map, page map and page heap (see
// StaticForwarder for the production implementation).
template <typename ForwarderT>
class CentralFreeList {
public:
using Forwarder = ForwarderT;
// Constant-initializes an empty freelist. Init() must be called to associate
// it with a size class.
constexpr CentralFreeList()
: lock_(absl::kConstInit, absl::base_internal::SCHEDULE_KERNEL_ONLY),
size_class_(0),
object_size_(0),
objects_per_span_(0),
first_nonempty_index_(0),
pages_per_span_(0),
nonempty_() {}
CentralFreeList(const CentralFreeList&) = delete;
CentralFreeList& operator=(const CentralFreeList&) = delete;
// Binds this freelist to `size_class` and derives the per-class constants
// (object size, pages per span, objects per span) from the Forwarder.
void Init(size_t size_class) ABSL_LOCKS_EXCLUDED(lock_);
// These methods all do internal locking.
// Insert batch into the central freelist.
// REQUIRES: batch.size() > 0 && batch.size() <= kMaxObjectsToMove.
void InsertRange(absl::Span<void*> batch) ABSL_LOCKS_EXCLUDED(lock_);
// Fill a prefix of batch[0..N-1] with up to N elements removed from central
// freelist. Return the number of elements removed.
ABSL_MUST_USE_RESULT int RemoveRange(void** batch, int N)
ABSL_LOCKS_EXCLUDED(lock_);
// Returns the number of free objects in cache.
size_t length() const { return static_cast<size_t>(counter_.value()); }
// Returns the memory overhead (internal fragmentation) attributable
// to the freelist. This is memory lost when the size of elements
// in a freelist doesn't exactly divide the page-size (an 8192-byte
// page full of 5-byte objects would have 2 bytes memory overhead).
size_t OverheadBytes() const;
// Returns number of live spans currently in the nonempty_[n] list.
// REQUIRES: n >= 0 && n < kNumLists.
size_t NumSpansInList(int n) ABSL_LOCKS_EXCLUDED(lock_);
// Returns the requested/returned span counts and the resulting object
// capacity for this size class.
SpanStats GetSpanStats() const;
// Reports span utilization histogram stats.
void PrintSpanUtilStats(Printer* out) const;
void PrintSpanUtilStatsInPbtxt(PbtxtRegion* region) const;
// Get number of spans in the histogram bucket. We record spans in the
// histogram indexed by absl::bit_width(allocated). So, instead of using the
// absolute number of allocated objects, it uses absl::bit_width(allocated),
// passed as <bitwidth>, to index and return the number of spans in the
// histogram.
size_t NumSpansWith(uint16_t bitwidth) const;
// Returns the Forwarder instance this freelist delegates to.
Forwarder& forwarder() { return forwarder_; }
private:
// Release an object to spans.
// Returns object's span if it becomes completely free.
Span* ReleaseToSpans(void* object, Span* span, size_t object_size)
ABSL_EXCLUSIVE_LOCKS_REQUIRED(lock_);
// Populate cache by fetching from the page heap.
// May temporarily release lock_.
// Fill a prefix of batch[0..N-1] with up to N elements removed from central
// freelist. Returns the number of elements removed.
int Populate(void** batch, int N) ABSL_EXCLUSIVE_LOCKS_REQUIRED(lock_);
// Parses nonempty_ lists and returns span from the list with the lowest
// possible index.
// Returns the span if one exists in the nonempty_ lists. Else, returns
// nullptr.
Span* FirstNonEmptySpan() ABSL_EXCLUSIVE_LOCKS_REQUIRED(lock_);
// Returns first index to the nonempty_ lists that may record spans.
uint8_t GetFirstNonEmptyIndex() const;
// Returns index into nonempty_ based on the number of allocated objects for
// the span. Instead of using the absolute number of allocated objects, it
// uses absl::bit_width(allocated), passed as bitwidth, to calculate the list
// index.
static uint8_t IndexFor(uint8_t bitwidth);
// Records span utilization in objects_to_spans_ map. Instead of using the
// absolute number of allocated objects, it uses
// absl::bit_width(allocated), passed as <bitwidth>, to index this map.
//
// If increase is set to true, includes the span by incrementing the count
// in the map. Otherwise, removes the span by decrementing the count in
// the map.
void RecordSpanUtil(uint8_t bitwidth, bool increase)
ABSL_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
ASSUME(bitwidth > 0);
// Updates to objects_to_spans_ are guarded by lock_, so writes may be
// performed using LossyAdd.
objects_to_spans_[bitwidth - 1].LossyAdd(increase ? 1 : -1);
}
// This lock protects all the mutable data members.
absl::base_internal::SpinLock lock_;
size_t size_class_; // My size class (immutable after Init())
// Size of one object of this size class; set by Init().
size_t object_size_;
// Number of objects that fit in one span of this size class; set by Init().
size_t objects_per_span_;
// Hint used for parsing through the nonempty_ lists. This prevents us from
// parsing the lists with an index starting zero, if the lowest possible index
// is higher than that.
size_t first_nonempty_index_;
// Number of pages in one span of this size class; set by Init().
Length pages_per_span_;
// Returns the number of spans requested minus the number returned, or 0 if
// the returned count is observed to exceed the requested count.
size_t num_spans() const {
size_t requested = num_spans_requested_.value();
size_t returned = num_spans_returned_.value();
if (requested < returned) return 0;
return (requested - returned);
}
// Accounts for one newly allocated span: adds objects_per_span_ to the
// free-object counter and increments the requested-span count.
void RecordSpanAllocated() ABSL_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
counter_.LossyAdd(objects_per_span_);
num_spans_requested_.LossyAdd(1);
}
// Accounts for `num_spans_returned` spans being deallocated: subtracts their
// objects from the free-object counter and adds to the returned-span count.
// NOTE(review): the negation below is applied to a size_t and relies on the
// conversion performed by LossyAdd to yield the intended negative delta --
// confirm against StatsCounter.
void RecordMultiSpansDeallocated(size_t num_spans_returned)
ABSL_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
counter_.LossyAdd(-num_spans_returned * objects_per_span_);
num_spans_returned_.LossyAdd(num_spans_returned);
}
// Adjusts the free-object counter by `num` (which may be negative).
void UpdateObjectCounts(int num) ABSL_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
counter_.LossyAdd(num);
}
// The following are kept as StatsCounters so that they can be read without
// acquiring a lock. Updates to these variables are guarded by lock_
// so writes are performed using LossyAdd for speed, the lock still
// guarantees accuracy.
// Num free objects in cache entry
StatsCounter counter_;
StatsCounter num_spans_requested_;
StatsCounter num_spans_returned_;
// Records histogram of span utilization.
//
// Each bucket in the histogram records number of live spans with
// corresponding number of allocated objects. Instead of using the absolute
// value of number of allocated objects, we use absl::bit_width(allocated) to
// index this map. A bucket in the histogram corresponds to power-of-two
// number of objects. That is, bucket N tracks number of spans with allocated
// objects < 2^(N+1). For instance, objects_to_spans_ map tracks number of
// spans with allocated objects in the range [a,b), indexed as: [1,2) in
// objects_to_spans_[0], [2,4) in objects_to_spans_[1], [4, 8) in
// objects_to_spans_[2] and so on. We can query the objects_to_spans_ map
// using NumSpansWith(bitwidth) to obtain the number of spans associated
// with the corresponding bucket in the histogram.
//
// As the actual value of objects_per_span_ is not known at compile time, we
// use maximum value that it can be to initialize this hashmap, and
// kSpanUtilBucketCapacity determines this value. We also check during Init
// that absl::bit_width(objects_per_span_) is indeed less than or equal to
// kSpanUtilBucketCapacity.
//
// We disable collection of histogram stats for TCMalloc small-but-slow due to
// performance issues. See b/227362263.
static constexpr size_t kSpanUtilBucketCapacity = 16;
StatsCounter objects_to_spans_[kSpanUtilBucketCapacity];
// Non-empty lists that distinguish spans based on the number of objects
// allocated from them. As we prioritize spans, spans may be added to any of
// the kNumLists nonempty_ lists based on their allocated objects. If span
// prioritization is disabled, we add spans to the nonempty_[kNumLists-1]
// list, leaving other lists unused.
//
// We do not enable multiple nonempty lists for small-but-slow yet due to
// performance issues. See b/227362263.
#ifdef TCMALLOC_SMALL_BUT_SLOW
SpanList nonempty_ ABSL_GUARDED_BY(lock_);
#else
HintedTrackerLists<Span, kNumLists> nonempty_ ABSL_GUARDED_BY(lock_);
#endif
// Forwarder instance returned by forwarder().
TCMALLOC_NO_UNIQUE_ADDRESS Forwarder forwarder_;
};
// Behaves like a constructor, which is why thread safety analysis is turned
// off: binds the freelist to `size_class` and derives the per-class constants.
template <class Forwarder>
inline void CentralFreeList<Forwarder>::Init(size_t size_class)
    ABSL_NO_THREAD_SAFETY_ANALYSIS {
  size_class_ = size_class;
  object_size_ = Forwarder::class_to_size(size_class);
  pages_per_span_ = Forwarder::class_to_pages(size_class);

  // A zero object size is treated as 1 so the division below is well defined.
  const size_t divisor = (object_size_ != 0) ? object_size_ : 1;
  objects_per_span_ = pages_per_span_.in_bytes() / divisor;

  // Remember which nonempty_ list a span with objects_per_span_ allocated
  // objects maps to; see the comment in IndexFor(...) for the mapping.
  const size_t span_bitwidth = absl::bit_width(objects_per_span_);
  first_nonempty_index_ =
      kNumLists - std::min<size_t>(span_bitwidth, kNumLists);
  ASSERT(span_bitwidth <= kSpanUtilBucketCapacity);
}
// Pushes `object` back onto the freelist of `span` and keeps the nonempty_
// lists (and, outside small-but-slow, the utilization histogram) in sync.
//
// Returns `span` when FreelistPush reports failure, in which case the span has
// been removed from nonempty_ so the caller can return it to the page heap;
// returns nullptr otherwise. Must be called with lock_ held.
template <class Forwarder>
inline Span* CentralFreeList<Forwarder>::ReleaseToSpans(void* object,
Span* span,
size_t object_size) {
// A span whose freelist is currently empty is (re)inserted into nonempty_
// before the push below, since it is about to gain a free object.
if (ABSL_PREDICT_FALSE(span->FreelistEmpty(object_size))) {
#ifdef TCMALLOC_SMALL_BUT_SLOW
nonempty_.prepend(span);
#else
const uint8_t index = GetFirstNonEmptyIndex();
nonempty_.Add(span, index);
span->set_nonempty_index(index);
#endif
}
#ifdef TCMALLOC_SMALL_BUT_SLOW
// We maintain a single nonempty list for small-but-slow. Also, we do not
// collect histogram stats due to performance issues.
if (ABSL_PREDICT_TRUE(span->FreelistPush(object, object_size))) {
return nullptr;
}
nonempty_.remove(span);
return span;
#else
// Capture the list index and utilization bucket before the push changes the
// span's allocated count.
const uint8_t prev_index = span->nonempty_index();
const uint8_t prev_bitwidth = absl::bit_width(span->Allocated());
if (ABSL_PREDICT_FALSE(!span->FreelistPush(object, object_size))) {
// Update the histogram as the span has become completely free and will be
// removed from the nonempty_ list.
RecordSpanUtil(prev_bitwidth, /*increase=*/false);
nonempty_.Remove(span, prev_index);
return span;
}
// As the objects are being added to the span, its utilization might change.
// We remove the stale utilization from the histogram and add the new
// utilization to the histogram after we release objects to the span.
const uint8_t cur_bitwidth = absl::bit_width(span->Allocated());
if (cur_bitwidth != prev_bitwidth) {
RecordSpanUtil(prev_bitwidth, /*increase=*/false);
RecordSpanUtil(cur_bitwidth, /*increase=*/true);
// If span allocation changes so that it moved to a different nonempty_
// list, we remove it from the previous list and add it to the desired
// list indexed by cur_index.
const uint8_t cur_index = IndexFor(cur_bitwidth);
if (cur_index != prev_index) {
nonempty_.Remove(span, prev_index);
nonempty_.Add(span, cur_index);
span->set_nonempty_index(cur_index);
}
}
return nullptr;
#endif
}
// Returns a span that still has free objects, or nullptr when none is tracked.
//
// With multiple lists, nonempty_ is searched starting at the first index that
// may hold spans (GetFirstNonEmptyIndex()); small-but-slow builds just look at
// the head of their single list.
template <class Forwarder>
inline Span* CentralFreeList<Forwarder>::FirstNonEmptySpan() {
#ifdef TCMALLOC_SMALL_BUT_SLOW
  return ABSL_PREDICT_FALSE(nonempty_.empty()) ? nullptr : nonempty_.first();
#else
  const uint8_t start_index = GetFirstNonEmptyIndex();
  return nonempty_.PeekLeast(start_index);
#endif
}
// Returns the lowest nonempty_ index that may record spans, as computed by
// Init(). The stored value is narrowed to uint8_t.
template <class Forwarder>
inline uint8_t CentralFreeList<Forwarder>::GetFirstNonEmptyIndex() const {
  return static_cast<uint8_t>(first_nonempty_index_);
}
// Maps absl::bit_width(allocated) to an index into the nonempty_ lists.
//
// Spans with fewer allocated objects (i.e. those more likely to become free)
// map to higher indices. Because there are only kNumLists lists, every
// bitwidth of kNumLists or more is clamped into index 0; telling apart spans
// with 128 vs 256 allocated objects matters less than telling apart spans
// with 16 vs 32.
//
// Example with objects_per_span = 1024 and kNumLists = 8, written as
// [a, b) -> idx for spans with allocated objects in [a, b):
//   [1, 2) -> 7, [2, 4) -> 6, [4, 8) -> 5, [8, 16) -> 4, [16, 32) -> 3,
//   [32, 64) -> 2, [64, 128) -> 1, [128, 1024) -> 0.
template <class Forwarder>
inline uint8_t CentralFreeList<Forwarder>::IndexFor(uint8_t bitwidth) {
  ASSUME(bitwidth > 0);
  const size_t clamped = (bitwidth < kNumLists) ? bitwidth : kNumLists;
  const uint8_t index = static_cast<uint8_t>(kNumLists - clamped);
  ASSUME(index < kNumLists);
  return index;
}
// Returns how many spans are currently on nonempty_[n]. Small-but-slow builds
// have a single list, so they report its length regardless of `n`.
// REQUIRES: n >= 0 && n < kNumLists.
template <class Forwarder>
inline size_t CentralFreeList<Forwarder>::NumSpansInList(int n) {
  ASSUME(n >= 0);
  ASSUME(n < kNumLists);
  absl::base_internal::SpinLockHolder guard(&lock_);
  size_t span_count;
#ifdef TCMALLOC_SMALL_BUT_SLOW
  span_count = nonempty_.length();
#else
  span_count = nonempty_.SizeOfList(n);
#endif
  return span_count;
}
// Returns the objects in `batch` to their owning spans. Spans that become
// completely free as a result are handed back through the forwarder.
// REQUIRES: batch.size() > 0 && batch.size() <= kMaxObjectsToMove.
template <class Forwarder>
inline void CentralFreeList<Forwarder>::InsertRange(absl::Span<void*> batch) {
  CHECK_CONDITION(!batch.empty() && batch.size() <= kMaxObjectsToMove);
  const size_t num_objects = batch.size();

  // Resolve the owning span of each object up front and prefetch it to cut
  // down on cache misses once lock_ is held.
  Span* spans[kMaxObjectsToMove];
  for (size_t i = 0; i < num_objects; ++i) {
    Span* const owner = forwarder_.MapObjectToSpan(batch[i]);
    ASSERT(owner != nullptr);
    owner->Prefetch();
    spans[i] = owner;
  }

  // Freed spans are compacted into the front of spans[]. This is safe because
  // slot k is only overwritten after spans[k] has already been consumed.
  Span** const free_spans = spans;
  int free_count = 0;

  // Phase 1: under lock_, release each object into its span and collect the
  // spans that became completely free.
  {
    // Local copy so the member is not reloaded inside the loop.
    const size_t object_size = object_size_;
    absl::base_internal::SpinLockHolder guard(&lock_);
    for (size_t i = 0; i < num_objects; ++i) {
      Span* const freed = ReleaseToSpans(batch[i], spans[i], object_size);
      if (ABSL_PREDICT_FALSE(freed)) {
        free_spans[free_count++] = freed;
      }
    }
    RecordMultiSpansDeallocated(free_count);
    UpdateObjectCounts(num_objects);
  }

  // Phase 2: with lock_ released, give the free spans back to the page heap
  // (which takes its own lock).
  if (ABSL_PREDICT_FALSE(free_count)) {
    forwarder_.DeallocateSpans(size_class_, objects_per_span_,
                               absl::MakeSpan(free_spans, free_count));
  }
}
// Fills a prefix of batch[0..N-1] with up to N objects taken from this
// freelist and returns how many were written.
//
// Objects are popped from the spans on nonempty_ until N objects have been
// collected. When no span with free objects is left, Populate() is called once
// to fetch a new span and the loop ends, so fewer than N objects may be
// returned.
// REQUIRES: N > 0.
template <class Forwarder>
inline int CentralFreeList<Forwarder>::RemoveRange(void** batch, int N) {
ASSUME(N > 0);
// Use local copy of variable to ensure that it is not reloaded.
size_t object_size = object_size_;
int result = 0;
absl::base_internal::SpinLockHolder h(&lock_);
do {
Span* span = FirstNonEmptySpan();
if (ABSL_PREDICT_FALSE(!span)) {
// Nothing cached: take whatever a newly populated span can provide and
// stop, even if that is fewer than the N requested.
result += Populate(batch + result, N - result);
break;
}
#ifdef TCMALLOC_SMALL_BUT_SLOW
// We do not collect histogram stats for small-but-slow.
int here = span->FreelistPopBatch(batch + result, N - result, object_size);
ASSERT(here > 0);
if (span->FreelistEmpty(object_size)) {
nonempty_.remove(span);
}
#else
// Capture the utilization bucket and list index before popping changes the
// span's allocated count.
const uint8_t prev_bitwidth = absl::bit_width(span->Allocated());
const uint8_t prev_index = span->nonempty_index();
int here = span->FreelistPopBatch(batch + result, N - result, object_size);
ASSERT(here > 0);
// As the objects are being popped from the span, its utilization might
// change. So, we remove the stale utilization from the histogram here and
// add it again once we pop the objects.
const uint8_t cur_bitwidth = absl::bit_width(span->Allocated());
if (cur_bitwidth != prev_bitwidth) {
RecordSpanUtil(prev_bitwidth, /*increase=*/false);
RecordSpanUtil(cur_bitwidth, /*increase=*/true);
}
if (span->FreelistEmpty(object_size)) {
// The span has no free objects left, so it no longer belongs on any
// nonempty_ list.
nonempty_.Remove(span, prev_index);
} else if (cur_bitwidth != prev_bitwidth) {
// If span allocation changes so that it must be moved to a different
// nonempty_ list, we remove it from the previous list and add it to the
// desired list indexed by cur_index.
const uint8_t cur_index = IndexFor(cur_bitwidth);
if (cur_index != prev_index) {
nonempty_.Remove(span, prev_index);
nonempty_.Add(span, cur_index);
span->set_nonempty_index(cur_index);
}
}
#endif
result += here;
} while (result < N);
// The objects handed out are no longer free objects of this cache.
UpdateObjectCounts(-result);
return result;
}
// Fetches a new span through the forwarder, builds its freelist and hands up
// to N of its objects straight to `batch`. Returns the number of objects
// written, or 0 if no span could be allocated.
//
// Called with lock_ held; the lock is dropped while the span is allocated and
// its freelist is built, and is held again on return.
template <class Forwarder>
inline int CentralFreeList<Forwarder>::Populate(void** batch, int N)
    ABSL_NO_THREAD_SAFETY_ANALYSIS {
  // Drop the central list lock while operating on the pageheap. As a
  // consequence, several concurrent Populate() calls may each allocate a new
  // span and push those partially full spans onto nonempty_.
  lock_.Unlock();

  Span* const fresh_span =
      forwarder_.AllocateSpan(size_class_, objects_per_span_, pages_per_span_);
  if (ABSL_PREDICT_FALSE(fresh_span == nullptr)) {
    Log(kLog, __FILE__, __LINE__, "tcmalloc: allocation failed",
        pages_per_span_.in_bytes());
    lock_.Lock();
    return 0;
  }

  const int taken =
      fresh_span->BuildFreelist(object_size_, objects_per_span_, batch, N);
  ASSERT(taken > 0);
  // Cheaper than asking the span via FreelistEmpty(): if every object went to
  // the caller, nothing is left on the span's freelist.
  const bool fully_consumed = (taken == objects_per_span_);

  lock_.Lock();
#ifdef TCMALLOC_SMALL_BUT_SLOW
  // Small-but-slow keeps a single nonempty list and no histogram stats; the
  // span is prepended to that list if it still has free objects.
  if (!fully_consumed) {
    nonempty_.prepend(fresh_span);
  }
#else
  // Record the new span in the utilization histogram.
  const uint8_t bitwidth = absl::bit_width(fresh_span->Allocated());
  RecordSpanUtil(bitwidth, /*increase=*/true);
  if (!fully_consumed) {
    const uint8_t list_index = IndexFor(bitwidth);
    nonempty_.Add(fresh_span, list_index);
    fresh_span->set_nonempty_index(list_index);
  }
#endif
  RecordSpanAllocated();
  return taken;
}
// Returns the bytes lost to internal fragmentation across all live spans: the
// remainder of the span size divided by the object size, times the number of
// spans. Returns 0 when the object size is 0.
template <class Forwarder>
inline size_t CentralFreeList<Forwarder>::OverheadBytes() const {
  if (ABSL_PREDICT_TRUE(object_size_ != 0)) {
    const size_t wasted_per_span = pages_per_span_.in_bytes() % object_size_;
    return num_spans() * wasted_per_span;
  }
  return 0;
}
// Returns the requested/returned span counts and the object capacity of the
// live spans. A default-constructed SpanStats is returned when
// objects_per_span_ is 0.
template <class Forwarder>
inline SpanStats CentralFreeList<Forwarder>::GetSpanStats() const {
  SpanStats stats;
  if (ABSL_PREDICT_TRUE(objects_per_span_ != 0)) {
    stats.num_spans_requested =
        static_cast<size_t>(num_spans_requested_.value());
    stats.num_spans_returned =
        static_cast<size_t>(num_spans_returned_.value());
    stats.obj_capacity = stats.num_live_spans() * objects_per_span_;
  }
  return stats;
}
// Returns the number of spans recorded in the utilization histogram bucket
// for `bitwidth`; bucket indices are bitwidth - 1.
// REQUIRES: bitwidth > 0.
template <class Forwarder>
inline size_t CentralFreeList<Forwarder>::NumSpansWith(
    uint16_t bitwidth) const {
  ASSERT(bitwidth > 0);
  return objects_to_spans_[bitwidth - 1].value();
}
// Prints one line with the span utilization histogram of this size class: the
// class and object size, followed by "<count> < <2^bitwidth>" entries for
// every bucket, separated by commas.
//
// NOTE(review): "%3d" is paired with a size_t and the second "%zu" with an
// int (1 << bitwidth). This is only well defined if Printer::printf is
// type-aware rather than a C varargs printf -- confirm against Printer.
template <class Forwarder>
inline void CentralFreeList<Forwarder>::PrintSpanUtilStats(Printer* out) const {
  out->printf("class %3d [ %8zu bytes ] : ", size_class_, object_size_);
  size_t bitwidth = 1;
  while (bitwidth <= kSpanUtilBucketCapacity) {
    out->printf("%6zu < %zu", NumSpansWith(bitwidth), 1 << bitwidth);
    // Every entry but the last is followed by a separator.
    if (bitwidth != kSpanUtilBucketCapacity) {
      out->printf(",");
    }
    ++bitwidth;
  }
  out->printf("\n");
}
// Emits one "span_util_histogram" sub-region per histogram bucket, carrying
// the bucket's lower bound (2^(bitwidth-1)), upper bound (2^bitwidth) and the
// number of spans recorded in it.
template <class Forwarder>
inline void CentralFreeList<Forwarder>::PrintSpanUtilStatsInPbtxt(
    PbtxtRegion* region) const {
  for (size_t bitwidth = 1; bitwidth <= kSpanUtilBucketCapacity; ++bitwidth) {
    PbtxtRegion bucket = region->CreateSubRegion("span_util_histogram");
    bucket.PrintI64("lower_bound", 1 << (bitwidth - 1));
    bucket.PrintI64("upper_bound", 1 << bitwidth);
    bucket.PrintI64("value", NumSpansWith(bitwidth));
  }
}
} // namespace central_freelist_internal
using CentralFreeList = central_freelist_internal::CentralFreeList<
central_freelist_internal::StaticForwarder>;
} // namespace tcmalloc_internal
} // namespace tcmalloc
GOOGLE_MALLOC_SECTION_END
#endif // TCMALLOC_CENTRAL_FREELIST_H_

View File

@ -0,0 +1,201 @@
// Copyright 2021 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>
#include "absl/algorithm/container.h"
#include "absl/random/random.h"
#include "benchmark/benchmark.h"
#include "tcmalloc/central_freelist.h"
#include "tcmalloc/common.h"
#include "tcmalloc/static_vars.h"
#include "tcmalloc/tcmalloc_policy.h"
namespace tcmalloc {
namespace tcmalloc_internal {
namespace {
// This benchmark measures how long it takes to populate multiple
// spans. The spans are freed in the same order as they were populated
// to minimize the time it takes to free them.
//
// state.range(0) is the object size in bytes; each iteration batch moves
// 64 MiB worth of objects of that size through a private CentralFreeList.
void BM_Populate(benchmark::State& state) {
  size_t object_size = state.range(0);
  size_t size_class = tc_globals.sizemap().SizeClass(CppPolicy(), object_size);
  int batch_size = tc_globals.sizemap().num_objects_to_move(size_class);
  int num_objects = 64 * 1024 * 1024 / object_size;
  const int num_batches = num_objects / batch_size;
  CentralFreeList cfl;
  // Initialize the span to contain the appropriate size of object.
  cfl.Init(size_class);

  // Allocate an array large enough to hold 64 MiB of objects.
  std::vector<void*> buffer(num_objects);
  int64_t items_processed = 0;

  while (state.KeepRunningBatch(num_batches)) {
    int index = 0;
    // The cost of fetching objects will include the cost of fetching and
    // populating the span.
    while (index < num_objects) {
      int count = std::min(batch_size, num_objects - index);
      int got = cfl.RemoveRange(&buffer[index], count);
      index += got;
    }

    // Don't include the cost of returning the objects to the span, and the
    // span to the pageheap.
    state.PauseTiming();
    index = 0;
    while (index < num_objects) {
      // Unsigned so that the braced span construction below does not narrow.
      uint64_t count = std::min(batch_size, num_objects - index);
      cfl.InsertRange({&buffer[index], count});
      index += count;
    }
    items_processed += index;
    state.ResumeTiming();
  }
  state.SetItemsProcessed(items_processed);
}
BENCHMARK(BM_Populate)
    ->DenseRange(8, 64, 16)
    ->DenseRange(64, 1024, 64)
    ->DenseRange(4096, 28 * 1024, 4096)
    ->DenseRange(32 * 1024, 256 * 1024, 32 * 1024);
// Fills a large array with objects, shuffles it, and then returns the objects.
// This is meant to resemble runtime behaviour: objects are usually fetched
// from the CFL in batches, but are returned spread over many active spans.
//
// state.range(0) is the object size in bytes. The shuffle itself is excluded
// from the measured time; both fetching and returning are included.
void BM_MixAndReturn(benchmark::State& state) {
  const size_t object_size = state.range(0);
  const size_t size_class =
      tc_globals.sizemap().SizeClass(CppPolicy(), object_size);
  const int batch_size = tc_globals.sizemap().num_objects_to_move(size_class);
  const int num_objects = 64 * 1024 * 1024 / object_size;
  const int num_batches = num_objects / batch_size;

  CentralFreeList cfl;
  // Set up the freelist for objects of the chosen size class.
  cfl.Init(size_class);

  // Room for 64 MiB worth of objects.
  std::vector<void*> buffer(num_objects);
  int64_t items_processed = 0;
  absl::BitGen rnd;

  while (state.KeepRunningBatch(num_batches)) {
    int fetched = 0;
    while (fetched < num_objects) {
      const int want = std::min(batch_size, num_objects - fetched);
      fetched += cfl.RemoveRange(&buffer[fetched], want);
    }

    state.PauseTiming();
    // Randomize the order so objects are not returned in allocation order.
    absl::c_shuffle(buffer, rnd);
    state.ResumeTiming();

    int returned = 0;
    while (returned < num_objects) {
      // Unsigned so that the braced span construction does not narrow.
      const unsigned int chunk = std::min(batch_size, num_objects - returned);
      cfl.InsertRange({&buffer[returned], chunk});
      returned += chunk;
    }
    items_processed += returned;
  }
  state.SetItemsProcessed(items_processed);
}
BENCHMARK(BM_MixAndReturn)
    ->DenseRange(8, 64, 16)
    ->DenseRange(64, 1024, 64)
    ->DenseRange(4096, 28 * 1024, 4096)
    ->DenseRange(32 * 1024, 256 * 1024, 32 * 1024);
// Keeps half of the allocated objects alive for the whole run so that (apart
// from single-object spans) no span is allocated or freed while measuring.
// The result reflects the span handling code alone, without pageheap time.
//
// state.range(0) is the object size in bytes.
void BM_SpanReuse(benchmark::State& state) {
  const size_t object_size = state.range(0);
  const size_t size_class =
      tc_globals.sizemap().SizeClass(CppPolicy(), object_size);
  const int batch_size = tc_globals.sizemap().num_objects_to_move(size_class);
  const int num_objects = 64 * 1024 * 1024 / object_size;
  const int num_batches = num_objects / batch_size;
  const int num_held = 2 * num_objects;

  CentralFreeList cfl;
  // Set up the freelist for objects of the chosen size class.
  cfl.Init(size_class);

  // Storage for the objects used to pin spans; twice what the timed loop uses.
  std::vector<void*> held_objects(num_held);
  int acquired = 0;
  while (acquired < num_held) {
    const int want = std::min(batch_size, num_held - acquired);
    acquired += cfl.RemoveRange(&held_objects[acquired], want);
  }

  // Give back every even-indexed object. The odd-indexed ones stay allocated,
  // which keeps the spans from being returned to the pageheap, so the timed
  // operations below do not touch the pageheap.
  for (int even = 0; even < num_held; even += 2) {
    cfl.InsertRange({&held_objects[even], 1});
  }

  // Room for 64 MiB worth of objects.
  std::vector<void*> buffer(num_objects);
  int64_t items_processed = 0;
  absl::BitGen rnd;

  while (state.KeepRunningBatch(num_batches)) {
    int fetched = 0;
    while (fetched < num_objects) {
      const int want = std::min(batch_size, num_objects - fetched);
      fetched += cfl.RemoveRange(&buffer[fetched], want);
    }

    state.PauseTiming();
    // Randomize the order so objects are not returned in allocation order.
    absl::c_shuffle(buffer, rnd);
    state.ResumeTiming();

    int returned = 0;
    while (returned < num_objects) {
      // Unsigned so that the braced span construction does not narrow.
      const uint64_t chunk = std::min(batch_size, num_objects - returned);
      cfl.InsertRange({&buffer[returned], chunk});
      returned += chunk;
    }
    items_processed += returned;
  }
  state.SetItemsProcessed(items_processed);

  // Release the odd-indexed objects that were pinning the spans.
  for (int odd = 1; odd < num_held; odd += 2) {
    cfl.InsertRange({&held_objects[odd], 1});
  }
}
// Want to avoid benchmarking spans where there is a single object per span.
BENCHMARK(BM_SpanReuse)
    ->DenseRange(8, 64, 16)
    ->DenseRange(64, 1024, 64)
    ->DenseRange(1024, 4096, 512);
} // namespace
} // namespace tcmalloc_internal
} // namespace tcmalloc

View File

@ -0,0 +1,792 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "tcmalloc/central_freelist.h"
#include <math.h>
#include <stddef.h>
#include <stdint.h>
#include <algorithm>
#include <memory>
#include <utility>
#include <vector>
#include "benchmark/benchmark.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "absl/algorithm/container.h"
#include "absl/base/thread_annotations.h"
#include "absl/container/fixed_array.h"
#include "absl/memory/memory.h"
#include "absl/numeric/bits.h"
#include "absl/random/random.h"
#include "absl/synchronization/mutex.h"
#include "absl/time/clock.h"
#include "absl/time/time.h"
#include "absl/types/span.h"
#include "tcmalloc/common.h"
#include "tcmalloc/internal/logging.h"
#include "tcmalloc/mock_static_forwarder.h"
#include "tcmalloc/pagemap.h"
#include "tcmalloc/static_vars.h"
#include "tcmalloc/testing/thread_manager.h"
namespace tcmalloc {
namespace tcmalloc_internal {
namespace central_freelist_internal {
// Value-parameterized fixture. The test parameter is a size class; SetUp()
// looks up the size-map properties of that class and skips the test when the
// class cannot be exercised in the current configuration.
class StaticForwarderTest : public testing::TestWithParam<size_t> {
 protected:
  // Size class under test (the test parameter).
  size_t size_class_;
  // Size of a single object of that class, as reported by the size map.
  size_t object_size_;
  // Number of pages that make up one span of that class.
  Length pages_per_span_;
  // Number of objects moved per batch for that class.
  size_t batch_size_;
  // Number of whole objects that fit in one span.
  size_t objects_per_span_;

 private:
  void SetUp() override {
    size_class_ = GetParam();

    const bool is_expanded = IsExpandedSizeClass(size_class_);
    if (is_expanded) {
#if ABSL_HAVE_THREAD_SANITIZER
      GTEST_SKIP() << "Skipping test under sanitizers that conflict with "
                      "address placement";
#endif
      // Without the cold feature the normal page heap is used, which keeps us
      // from seeing memory get the expected tags.
      const bool cold_active = ColdFeatureActive();
      if (!cold_active) {
        GTEST_SKIP()
            << "Skipping expanded size classes without cold experiment";
      }
    }

    object_size_ = tc_globals.sizemap().class_to_size(size_class_);
    if (object_size_ == 0) {
      GTEST_SKIP() << "Skipping empty size class.";
    }

    batch_size_ = tc_globals.sizemap().num_objects_to_move(size_class_);
    pages_per_span_ = Length(tc_globals.sizemap().class_to_pages(size_class_));
    objects_per_span_ = pages_per_span_.in_bytes() / object_size_;
  }
};
// Allocates a single span through StaticForwarder, carves all of its objects
// out, verifies the page-map bookkeeping for the span and its objects, then
// pushes the objects back and releases the span.
TEST_P(StaticForwarderTest, Simple) {
  Span* span = StaticForwarder::AllocateSpan(size_class_, objects_per_span_,
                                             pages_per_span_);
  ASSERT_NE(span, nullptr);

  absl::FixedArray<void*> objects(objects_per_span_);
  const size_t built = span->BuildFreelist(object_size_, objects_per_span_,
                                           &objects[0], objects_per_span_);
  ASSERT_EQ(built, objects_per_span_);

  // Both ends of the span must be tagged with our size class.
  EXPECT_EQ(size_class_, tc_globals.pagemap().sizeclass(span->first_page()));
  EXPECT_EQ(size_class_, tc_globals.pagemap().sizeclass(span->last_page()));

  // span_test.cc covers Span itself; here we need several objects to confirm
  // that the PageMap maps each of them back to the Span pointer.
  for (size_t i = 0; i < objects.size(); ++i) {
    EXPECT_EQ(span, StaticForwarder::MapObjectToSpan(objects[i]));
  }

  // Hand every object back to the span before releasing it.
  for (size_t i = 0; i < objects.size(); ++i) {
    span->FreelistPush(objects[i], object_size_);
  }

  StaticForwarder::DeallocateSpans(size_class_, objects_per_span_,
                                   absl::MakeSpan(&span, 1));
}
// Harness used by the Fuzz test. It owns a mutex-protected list of live spans
// obtained from StaticForwarder and lets concurrent callers randomly allocate,
// release, and shuffle them, validating the page-map bookkeeping as it goes.
// Any spans still live when the environment is destroyed are drained.
class StaticForwarderEnvironment {
  // A live span together with the objects carved from it by Grow().
  struct SpanData {
    Span* span;
    void* batch[kMaxObjectsToMove];
  };

 public:
  StaticForwarderEnvironment(int size_class, size_t object_size,
                             size_t objects_per_span, Length pages_per_span,
                             int batch_size)
      : size_class_(size_class),
        object_size_(object_size),
        objects_per_span_(objects_per_span),
        pages_per_span_(pages_per_span),
        batch_size_(batch_size) {}

  ~StaticForwarderEnvironment() { Drain(); }

  // Performs one random operation: Grow() with probability 0.5, Shrink() with
  // probability 0.4, Shuffle() otherwise.
  void RandomlyPoke() {
    absl::BitGen rng;
    double coin = absl::Uniform(rng, 0.0, 1.0);
    if (coin < 0.5) {
      Grow();
    } else if (coin < 0.9) {
      // Deallocate Spans.  We may deallocate more than 1 span, so we bias
      // towards allocating Spans more often than we deallocate.
      Shrink();
    } else {
      Shuffle(rng);
    }
  }

  // Validates and releases every span currently tracked. No-op when nothing
  // is tracked.
  void Drain() {
    std::vector<std::unique_ptr<SpanData>> spans;
    {
      absl::MutexLock l(&mu_);
      if (data_.empty()) {
        return;
      }
      spans = std::move(data_);
      // Leave the moved-from vector in a known (empty) state.
      data_.clear();
    }
    // Validation and deallocation happen outside the lock.
    ValidateAndDeallocate(spans);
  }

  // Allocates one span, builds a freelist batch from it, validates its
  // page-map entries, and starts tracking it.
  void Grow() {
    // Allocate a Span
    Span* span = StaticForwarder::AllocateSpan(size_class_, objects_per_span_,
                                               pages_per_span_);
    ASSERT_NE(span, nullptr);

    auto d = absl::make_unique<SpanData>();
    d->span = span;
    size_t allocated = span->BuildFreelist(object_size_, objects_per_span_,
                                           d->batch, batch_size_);
    EXPECT_LE(allocated, objects_per_span_);

    EXPECT_EQ(size_class_, tc_globals.pagemap().sizeclass(span->first_page()));
    EXPECT_EQ(size_class_, tc_globals.pagemap().sizeclass(span->last_page()));
    // Confirm we can map at least one object back.
    EXPECT_EQ(span, StaticForwarder::MapObjectToSpan(d->batch[0]));

    absl::MutexLock l(&mu_);
    spans_allocated_++;
    data_.push_back(std::move(d));
  }

  // Removes a log-uniformly chosen number (at least one) of spans from the
  // back of the tracked list, then validates and releases them. No-op when
  // nothing is tracked.
  void Shrink() {
    absl::BitGen rng;
    std::vector<std::unique_ptr<SpanData>> spans;
    {
      absl::MutexLock l(&mu_);
      if (data_.empty()) {
        return;
      }
      const size_t count = absl::LogUniform<size_t>(rng, 1, data_.size());
      spans.reserve(count);
      for (size_t i = 0; i < count; ++i) {
        spans.push_back(std::move(data_.back()));
        data_.pop_back();
      }
    }
    // Validation and deallocation happen outside the lock.
    ValidateAndDeallocate(spans);
  }

  // Randomly permutes the tracked spans so that Shrink() releases them in an
  // order unrelated to allocation order.
  void Shuffle(absl::BitGen& rng) {
    // Shuffle the shared vector.
    absl::MutexLock l(&mu_);
    absl::c_shuffle(data_, rng);
  }

  // Total bytes of all spans ever allocated by Grow() (not just live ones).
  int64_t BytesAllocated() {
    absl::MutexLock l(&mu_);
    return pages_per_span_.in_bytes() * spans_allocated_;
  }

 private:
  // Checks the page-map entries of every span in `spans`, confirms that the
  // first object of each maps back to its span, and releases all of them with
  // a single DeallocateSpans call. Shared by Drain() and Shrink().
  void ValidateAndDeallocate(
      const std::vector<std::unique_ptr<SpanData>>& spans) {
    std::vector<Span*> free_spans;
    free_spans.reserve(spans.size());
    for (const auto& data : spans) {
      EXPECT_EQ(size_class_,
                tc_globals.pagemap().sizeclass(data->span->first_page()));
      EXPECT_EQ(size_class_,
                tc_globals.pagemap().sizeclass(data->span->last_page()));
      // Confirm we can map at least one object back.
      EXPECT_EQ(data->span, StaticForwarder::MapObjectToSpan(data->batch[0]));
      free_spans.push_back(data->span);
    }
    StaticForwarder::DeallocateSpans(size_class_, objects_per_span_,
                                     absl::MakeSpan(free_spans));
  }

  int size_class_;
  size_t object_size_;
  size_t objects_per_span_;
  Length pages_per_span_;
  int batch_size_;

  absl::Mutex mu_;
  int64_t spans_allocated_ ABSL_GUARDED_BY(mu_) = 0;
  std::vector<std::unique_ptr<SpanData>> data_ ABSL_GUARDED_BY(mu_);
};
// Returns a snapshot of the page allocator's stats, read while holding
// pageheap_lock.
static BackingStats PageHeapStats() {
  absl::base_internal::SpinLockHolder lock_holder(&pageheap_lock);
  BackingStats snapshot = tc_globals.page_allocator().stats();
  return snapshot;
}
// Runs ten threads that randomly grow, shrink, and shuffle a shared set of
// spans for 0.2 seconds, then checks that the page heap did not grow in step
// with the number of spans allocated (i.e. spans were not leaked).
TEST_P(StaticForwarderTest, Fuzz) {
#if ABSL_HAVE_THREAD_SANITIZER
  // TODO(b/193887621): Enable this test under TSan after addressing benign
  // true positives.
  GTEST_SKIP() << "Skipping test under Thread Sanitizer.";
#endif  // ABSL_HAVE_THREAD_SANITIZER

  const BackingStats stats_before = PageHeapStats();

  StaticForwarderEnvironment env(size_class_, object_size_, objects_per_span_,
                                 pages_per_span_, batch_size_);
  ThreadManager workers;
  workers.Start(10, [&](int) { env.RandomlyPoke(); });
  absl::SleepFor(absl::Seconds(0.2));
  workers.Stop();

  const BackingStats stats_after = PageHeapStats();

  // Confirm we did not leak Spans by ensuring the page heap did not grow
  // nearly 1:1 by the total number of Spans we ever allocated.
  //
  // Since we expect to allocate a significant number of spans, we apply a
  // factor of 1/2 (which is unlikely to be flaky) to avoid false negatives
  // if/when a background thread triggers a deallocation.
  const int64_t bytes_allocated = env.BytesAllocated();
  EXPECT_GT(bytes_allocated, 0);

  const int64_t heap_growth = static_cast<int64_t>(stats_after.system_bytes) -
                              static_cast<int64_t>(stats_before.system_bytes);
  EXPECT_LE(heap_growth, bytes_allocated / 2);
}
// Instantiate the StaticForwarderTest suite once for every size class in the
// half-open range [1, kNumClasses).
INSTANTIATE_TEST_SUITE_P(All, StaticForwarderTest,
testing::Range(size_t(1), kNumClasses));
} // namespace central_freelist_internal
namespace {
// kNumLists is used by the tests below to index the central freelist's
// nonempty_ lists (see MultiNonEmptyLists and SpanPriority).
using central_freelist_internal::kNumLists;
// Type-parameterized fixture. It carries no state of its own: each test
// constructs its environment from TypeParam.
template <typename Env>
using CentralFreeListTest = ::testing::Test;
TYPED_TEST_SUITE_P(CentralFreeListTest);
// Fetches one batch from a fresh central freelist and returns it, checking the
// forwarder calls, the span statistics, and the span-utilization histogram
// before and after.
TYPED_TEST_P(CentralFreeListTest, IsolatedSmoke) {
  TypeParam env;
  EXPECT_CALL(env.forwarder(), AllocateSpan).Times(1);
  auto& freelist = env.central_freelist();

  absl::FixedArray<void*> objects(TypeParam::kBatchSize);
  const int fetched = freelist.RemoveRange(&objects[0], TypeParam::kBatchSize);
  ASSERT_GT(fetched, 0);
  EXPECT_LE(fetched, TypeParam::kBatchSize);

  // The span's utilization must be captured in exactly one histogram bucket;
  // every other bucket must hold zero spans.
  const int num_buckets = absl::bit_width(TypeParam::kObjectsPerSpan);
  const int fetched_bitwidth =
      absl::bit_width(static_cast<unsigned>(fetched));
  for (int width = 1; width <= num_buckets; ++width) {
    EXPECT_EQ(freelist.NumSpansWith(width), width == fetched_bitwidth ? 1 : 0);
  }

  EXPECT_CALL(env.forwarder(), MapObjectToSpan).Times(fetched);
  EXPECT_CALL(env.forwarder(), DeallocateSpans).Times(1);

  SpanStats span_stats = freelist.GetSpanStats();
  EXPECT_EQ(span_stats.num_spans_requested, 1);
  EXPECT_EQ(span_stats.num_spans_returned, 0);
  EXPECT_EQ(span_stats.obj_capacity, 1024);

  freelist.InsertRange(absl::MakeSpan(&objects[0], fetched));

  span_stats = freelist.GetSpanStats();
  EXPECT_EQ(span_stats.num_spans_requested, 1);
  EXPECT_EQ(span_stats.num_spans_returned, 1);
  EXPECT_EQ(span_stats.obj_capacity, 0);

  // The span recorded with the earlier utilization must be gone from the
  // histogram.
  for (int width = 1; width <= num_buckets; ++width) {
    EXPECT_EQ(freelist.NumSpansWith(width), 0);
  }
}
// Fetches every object of kNumSpans spans, then returns them in random order a
// batch at a time. After each step the span-utilization histogram reported by
// NumSpansWith() is compared with one computed from the per-span allocation
// counts tracked here.
//
// Histogram buckets are addressed by bit width, from 1 up to and including
// bit_width(kObjectsPerSpan); all of them are verified, including the top
// bucket that holds fully (or nearly fully) allocated spans.
TYPED_TEST_P(CentralFreeListTest, SpanUtilizationHistogram) {
  TypeParam e;

  constexpr size_t kNumSpans = 10;
  // Highest bucket (bit width) a span of this size class can land in.
  const int num_buckets = absl::bit_width(TypeParam::kObjectsPerSpan);

  // Request kNumSpans spans.
  void* batch[kMaxObjectsToMove];
  const int num_objects_to_fetch = kNumSpans * TypeParam::kObjectsPerSpan;
  int total_fetched = 0;
  // Tracks object and corresponding span idx from which it was allocated.
  std::vector<std::pair<void*, int>> objects_to_span_idx;
  // Tracks number of objects allocated per span.
  std::vector<size_t> allocated_per_span(kNumSpans, 0);
  int span_idx = 0;

  while (total_fetched < num_objects_to_fetch) {
    size_t n = num_objects_to_fetch - total_fetched;
    int got = e.central_freelist().RemoveRange(
        batch, std::min(n, TypeParam::kBatchSize));
    total_fetched += got;

    // Increment span_idx if current objects have been fetched from the new
    // span.
    if (total_fetched > (span_idx + 1) * TypeParam::kObjectsPerSpan) {
      ++span_idx;
    }
    // Record fetched object and associated span index.
    for (int i = 0; i < got; ++i) {
      objects_to_span_idx.push_back(std::make_pair(batch[i], span_idx));
    }
    ASSERT(span_idx < kNumSpans);
    allocated_per_span[span_idx] += got;
  }

  // Make sure that we have fetched exactly from kNumSpans spans.
  EXPECT_EQ(span_idx + 1, kNumSpans);

  // We should have kNumSpans spans in the histogram with number of allocated
  // objects equal to TypeParam::kObjectsPerSpan (i.e. in the last bucket).
  // Rest of the buckets should be empty.
  EXPECT_EQ(e.central_freelist().NumSpansWith(num_buckets), kNumSpans);
  for (int width = 1; width < num_buckets; ++width) {
    EXPECT_EQ(e.central_freelist().NumSpansWith(width), 0);
  }

  // Shuffle.
  absl::BitGen rng;
  std::shuffle(objects_to_span_idx.begin(), objects_to_span_idx.end(), rng);

  // Return objects, a fraction at a time, each time checking that histogram is
  // correct.
  int total_returned = 0;
  while (total_returned < num_objects_to_fetch) {
    const size_t size_to_pop = std::min(
        objects_to_span_idx.size() - total_returned, TypeParam::kBatchSize);
    for (size_t i = 0; i < size_to_pop; ++i) {
      // `owner_idx` is the span this object was attributed to when fetched.
      const auto [ptr, owner_idx] = objects_to_span_idx[i + total_returned];
      batch[i] = ptr;
      ASSERT(owner_idx < kNumSpans);
      --allocated_per_span[owner_idx];
    }
    total_returned += size_to_pop;
    e.central_freelist().InsertRange({batch, size_to_pop});

    // Calculate expected histogram. expected[w - 1] counts the spans whose
    // allocated-object count has bit width w.
    std::vector<size_t> expected(num_buckets, 0);
    for (size_t i = 0; i < kNumSpans; ++i) {
      // If span has non-zero allocated objects, include it in the histogram.
      if (allocated_per_span[i]) {
        const size_t bucket = absl::bit_width(allocated_per_span[i]) - 1;
        ASSERT(bucket < expected.size());
        ++expected[bucket];
      }
    }

    // Fetch number of spans logged in the histogram and compare it with the
    // expected histogram that we calculated using the tracked allocated
    // objects per span. The bound is inclusive so the top bucket is checked
    // as well.
    for (int width = 1; width <= num_buckets; ++width) {
      EXPECT_EQ(e.central_freelist().NumSpansWith(width), expected[width - 1]);
    }
  }

  // Since no span is live here, histogram must be empty.
  for (int width = 1; width <= num_buckets; ++width) {
    EXPECT_EQ(e.central_freelist().NumSpansWith(width), 0);
  }
}
// Confirms that a call to RemoveRange returns at most kObjectsPerSpan objects
// in cases when there are no non-empty spans in the central freelist. This
// makes sure that we populate, and subsequently allocate from a single span.
// This avoids memory regression due to multiple Populate calls observed in
// b/225880278.
// Requests ten spans' worth of objects in one RemoveRange call on a fresh
// central freelist and checks that no more than one span's worth is handed
// out, then returns everything in batch-sized chunks.
TYPED_TEST_P(CentralFreeListTest, SinglePopulate) {
  // Make sure that we allocate up to kObjectsPerSpan objects in both the span
  // prioritization states.
  TypeParam env;

  // Ask for far more objects than a single span can hold.
  const int num_objects_to_fetch = 10 * TypeParam::kObjectsPerSpan;
  void* objects[num_objects_to_fetch];
  const size_t got =
      env.central_freelist().RemoveRange(objects, num_objects_to_fetch);

  // Something was allocated, but at most kObjectsPerSpan objects.
  EXPECT_GT(got, 0);
  EXPECT_LE(got, TypeParam::kObjectsPerSpan);

  // Give the objects back, at most kBatchSize at a time.
  for (size_t returned = 0; returned < got;) {
    const size_t chunk = std::min(got - returned, TypeParam::kBatchSize);
    env.central_freelist().InsertRange({&objects[returned], chunk});
    returned += chunk;
  }
}
// Checks if we are indexing a span in the nonempty_ lists as expected.
// Allocates a span's objects one at a time and then returns them one at a
// time, checking after every step that the span sits in the nonempty_ list the
// test expects. The expected list index changes whenever the bit width of the
// number of allocated objects changes.
TYPED_TEST_P(CentralFreeListTest, MultiNonEmptyLists) {
  TypeParam e;

  ASSERT(kNumLists > 0);
  const size_t num_objects_to_fetch = TypeParam::kObjectsPerSpan;
  std::vector<void*> objects(num_objects_to_fetch);
  size_t fetched = 0;
  int expected_idx = kNumLists - 1;
  int prev_bitwidth = 1;

  // Fetch one object at a time from a span and confirm that the span is moved
  // through the nonempty_ lists as we allocate more objects from it.
  while (fetched < num_objects_to_fetch) {
    // Try to fetch one object from the span.
    const int got = e.central_freelist().RemoveRange(&objects[fetched], 1);
    fetched += got;
    ASSERT(fetched);
    const int cur_bitwidth = absl::bit_width(fetched);
    // We index nonempty_ lists based on log2(allocated) and so, we update the
    // index when the bit_width changes.
    if (cur_bitwidth != prev_bitwidth) {
      // The index is clamped at nonempty_[0] once the number of objects
      // allocated from the span rises above 2^(kNumLists-1).
      expected_idx = expected_idx > 0 ? expected_idx - 1 : 0;
      prev_bitwidth = cur_bitwidth;
    }
    ASSERT(expected_idx >= 0);
    ASSERT(expected_idx < kNumLists);
    if (fetched % num_objects_to_fetch == 0) {
      // Span should have been removed from nonempty_ lists because we have
      // allocated all the objects from it.
      EXPECT_EQ(e.central_freelist().NumSpansInList(expected_idx), 0);
    } else {
      // Check that the span exists in the corresponding nonempty_ list.
      EXPECT_EQ(e.central_freelist().NumSpansInList(expected_idx), 1);
    }
  }

  // Similar to our previous test, we now make sure that the span is moved
  // through the nonempty_ lists when we deallocate objects back to it.
  size_t remaining = fetched;
  // Spans with more than 2^(kNumLists-1) allocated objects are all indexed in
  // nonempty_[0]. Computed with an integer shift so that no floating-point
  // round trip is involved.
  const size_t threshold = size_t{1} << (kNumLists - 1);
  while (--remaining > 0) {
    // Return objects back to the span one at a time.
    e.central_freelist().InsertRange({&objects[remaining], 1});
    ASSERT(remaining);
    const int cur_bitwidth = absl::bit_width(remaining);
    // If we cross pow2 boundaries, update the expected index into nonempty_
    // lists.
    if (cur_bitwidth != prev_bitwidth) {
      // When allocated objects are more than the threshold, the span is indexed
      // to nonempty_ list 0.
      expected_idx = remaining < threshold ? expected_idx + 1 : 0;
      prev_bitwidth = cur_bitwidth;
    }
    EXPECT_LT(expected_idx, kNumLists);
    EXPECT_EQ(e.central_freelist().NumSpansInList(expected_idx), 1);
  }

  // When the last object is returned, we release the span to the page heap. So,
  // nonempty_[0] should also be empty.
  e.central_freelist().InsertRange({&objects[remaining], 1});
  EXPECT_EQ(e.central_freelist().NumSpansInList(0), 0);
}
// Checks if we are indexing a span in the nonempty_ lists as expected. We also
// check if the spans are correctly being prioritized. That is, we create a
// scenario where we have two live spans, and one span has more allocated
// objects than the other span. On subsequent allocations, we confirm that the
// objects are allocated from the span with a higher number of allocated objects
// as enforced by our prioritization scheme.
// Builds two live spans, leaves one with more allocated objects than the
// other, and checks via the nonempty_ list occupancy that the next allocation
// is served from the span with more allocated objects.
TYPED_TEST_P(CentralFreeListTest, SpanPriority) {
  TypeParam env;

  // With fewer than 3 objects per span only a single nonempty_ list is in use,
  // so spans cannot be prioritized by how many objects were allocated from
  // them.
  const int objects_per_span = TypeParam::kObjectsPerSpan;
  if (objects_per_span < 3 || kNumLists < 2) return;

  constexpr int kNumSpans = 2;

  // Objects still held, grouped by the span they were allocated from.
  absl::FixedArray<std::vector<void*>> objects(kNumSpans);
  void* batch[kMaxObjectsToMove];

  // Allocate every object of each of the kNumSpans spans.
  const size_t to_fetch = objects_per_span;
  for (std::vector<void*>& span_objects : objects) {
    size_t fetched = 0;
    while (fetched < to_fetch) {
      const size_t want = std::min(to_fetch - fetched, TypeParam::kBatchSize);
      const int got = env.central_freelist().RemoveRange(batch, want);
      span_objects.insert(span_objects.end(), batch, batch + got);
      fetched += got;
    }
  }

  // Returns the first `count` objects held for a span to the freelist, at most
  // kBatchSize at a time, and stops tracking them.
  const auto release_front = [&](std::vector<void*>& span_objects,
                                 size_t count) {
    size_t released = 0;
    while (released < count) {
      const size_t n = std::min(count - released, TypeParam::kBatchSize);
      std::copy_n(span_objects.begin() + released, n, batch);
      released += n;
      env.central_freelist().InsertRange({batch, n});
    }
    span_objects.erase(span_objects.begin(), span_objects.begin() + released);
  };

  // Perform deallocations so that each span contains only two objects.
  for (int span = 0; span < kNumSpans; ++span) {
    release_front(objects[span], to_fetch - 2);
  }

  // Make sure we have kNumSpans in the expected second-last nonempty_ list.
  EXPECT_EQ(env.central_freelist().NumSpansInList(kNumLists - 2), kNumSpans);

  // Release an additional object from all but one spans so that they are
  // deprioritized for subsequent allocations.
  for (int span = 1; span < kNumSpans; ++span) {
    release_front(objects[span], 1);
  }

  // Make sure we have kNumSpans-1 spans in the last nonempty_ list and just one
  // span in the second-last list.
  EXPECT_EQ(env.central_freelist().NumSpansInList(kNumLists - 1),
            kNumSpans - 1);
  EXPECT_EQ(env.central_freelist().NumSpansInList(kNumLists - 2), 1);

  // Allocate one object to ensure that it is being allocated from the span with
  // the highest number of allocated objects.
  const int got = env.central_freelist().RemoveRange(batch, 1);
  EXPECT_EQ(got, 1);
  // Number of spans in the last nonempty_ list should be unchanged (i.e.
  // kNumSpans-1).
  EXPECT_EQ(env.central_freelist().NumSpansInList(kNumLists - 1),
            kNumSpans - 1);
  // We should have only one span in the second-last nonempty_ list; this is the
  // span from which we should have allocated the last object.
  EXPECT_EQ(env.central_freelist().NumSpansInList(kNumLists - 2), 1);

  // Return previously allocated object.
  env.central_freelist().InsertRange({batch, 1});

  // Return rest of the objects, one at a time.
  for (std::vector<void*>& span_objects : objects) {
    for (void*& object : span_objects) {
      env.central_freelist().InsertRange({&object, 1});
    }
  }
}
// Fetches every object of kNumSpans spans, shuffles them, and returns them a
// batch at a time. Span statistics and the utilization histogram are checked
// after the fetch, once at the halfway point of the return, and at the end.
TYPED_TEST_P(CentralFreeListTest, MultipleSpans) {
  TypeParam env;
  std::vector<void*> all_objects;

  constexpr size_t kNumSpans = 10;

  // Request kNumSpans spans.
  void* batch[kMaxObjectsToMove];
  const int num_objects_to_fetch = kNumSpans * TypeParam::kObjectsPerSpan;
  for (int total_fetched = 0; total_fetched < num_objects_to_fetch;) {
    const size_t still_needed = num_objects_to_fetch - total_fetched;
    const int got = env.central_freelist().RemoveRange(
        batch, std::min(still_needed, TypeParam::kBatchSize));
    all_objects.insert(all_objects.end(), batch, batch + got);
    total_fetched += got;
  }

  // We should have kNumSpans spans in the histogram with number of
  // allocated objects equal to TypeParam::kObjectsPerSpan (i.e. in the last
  // bucket). Rest of the buckets should be empty.
  const int num_buckets = absl::bit_width(TypeParam::kObjectsPerSpan);
  EXPECT_EQ(env.central_freelist().NumSpansWith(num_buckets), kNumSpans);
  for (int width = 1; width < num_buckets; ++width) {
    EXPECT_EQ(env.central_freelist().NumSpansWith(width), 0);
  }

  SpanStats stats = env.central_freelist().GetSpanStats();
  EXPECT_EQ(stats.num_spans_requested, kNumSpans);
  EXPECT_EQ(stats.num_spans_returned, 0);

  EXPECT_EQ(all_objects.size(), num_objects_to_fetch);

  // Shuffle
  absl::BitGen rng;
  std::shuffle(all_objects.begin(), all_objects.end(), rng);

  // Return all
  int total_returned = 0;
  bool checked_half = false;
  while (total_returned < num_objects_to_fetch) {
    const size_t count =
        std::min(all_objects.size() - total_returned, TypeParam::kBatchSize);
    std::copy_n(all_objects.begin() + total_returned, count, batch);
    total_returned += count;
    env.central_freelist().InsertRange({batch, count});

    // The sanity check below runs once, the first time at least half of the
    // objects have been returned.
    if (checked_half || total_returned < (num_objects_to_fetch / 2)) {
      continue;
    }
    stats = env.central_freelist().GetSpanStats();
    EXPECT_GT(stats.num_spans_requested, stats.num_spans_returned);
    EXPECT_NE(stats.obj_capacity, 0);
    // Total spans recorded in the histogram must be equal to the number of
    // live spans.
    size_t spans_in_histogram = 0;
    for (int width = 1; width <= num_buckets; ++width) {
      spans_in_histogram += env.central_freelist().NumSpansWith(width);
    }
    EXPECT_EQ(spans_in_histogram, stats.num_live_spans());
    checked_half = true;
  }

  stats = env.central_freelist().GetSpanStats();
  EXPECT_EQ(stats.num_spans_requested, stats.num_spans_returned);
  // Since no span is live, histogram must be empty.
  for (int width = 1; width <= num_buckets; ++width) {
    EXPECT_EQ(env.central_freelist().NumSpansWith(width), 0);
  }
  EXPECT_EQ(stats.obj_capacity, 0);
}
// Checks that the central freelist passes the span's object count
// (kObjectsPerSpan) to the forwarder both when it allocates a span and when it
// deallocates it. The scenario is run twice: once fetching a single object and
// once fetching as many objects as a span holds (capped at one batch).
TYPED_TEST_P(CentralFreeListTest, PassSpanObjectCountToPageheap) {
  ASSERT_GT(TypeParam::kObjectsPerSpan, 1);
  // `num_objects` is the number of objects to request from the freelist; the
  // request is capped at kBatchSize.
  auto test_function = [&](size_t num_objects) {
    TypeParam e;
    std::vector<void*> objects(TypeParam::kObjectsPerSpan);
    EXPECT_CALL(
        e.forwarder(),
        AllocateSpan(testing::_, TypeParam::kObjectsPerSpan, testing::_))
        .Times(1);
    // Honour the requested object count so that the two invocations below
    // exercise different scenarios.
    const size_t to_fetch = std::min(num_objects, TypeParam::kBatchSize);
    const size_t fetched =
        e.central_freelist().RemoveRange(&objects[0], to_fetch);
    size_t returned = 0;
    // `fetched` never exceeds kBatchSize, so this loop body runs at most once
    // and exactly one DeallocateSpans expectation is registered.
    while (returned < fetched) {
      EXPECT_CALL(
          e.forwarder(),
          DeallocateSpans(testing::_, TypeParam::kObjectsPerSpan, testing::_))
          .Times(1);
      const size_t to_return =
          std::min(fetched - returned, TypeParam::kBatchSize);
      e.central_freelist().InsertRange({&objects[returned], to_return});
      returned += to_return;
    }
  };
  test_function(1);
  test_function(TypeParam::kObjectsPerSpan);
}
// Runs Span::Fragmentation() on one thread while another thread allocates from
// and returns to the same central freelist, for 0.1 seconds.
TYPED_TEST_P(CentralFreeListTest, SpanFragmentation) {
  // This test is primarily exercising Span itself to model how tcmalloc.cc uses
  // it, but this gives us a self-contained (and sanitizable) implementation of
  // the CentralFreeList.
  TypeParam env;

  // Take one object from the CFL so that a span gets allocated.
  void* first_object;
  const int fetched = env.central_freelist().RemoveRange(&first_object, 1);
  ASSERT_EQ(fetched, 1);

  auto& forwarder = env.central_freelist().forwarder();
  Span* const span = forwarder.MapObjectToSpan(first_object);
  const size_t object_size = forwarder.class_to_size(TypeParam::kSizeClass);

  // Reader: repeatedly computes the span's fragmentation.
  ThreadManager fragmentation;
  fragmentation.Start(1, [&](int) {
    benchmark::DoNotOptimize(span->Fragmentation(object_size));
  });

  // Mutator: repeatedly takes an object from the freelist and gives back
  // whatever it received.
  ThreadManager cfl;
  cfl.Start(1, [&](int) {
    void* next;
    const int got = env.central_freelist().RemoveRange(&next, 1);
    env.central_freelist().InsertRange(absl::MakeSpan(&next, got));
  });

  absl::SleepFor(absl::Seconds(0.1));

  fragmentation.Stop();
  cfl.Stop();

  env.central_freelist().InsertRange(absl::MakeSpan(&first_object, 1));
}
// Register every TYPED_TEST_P case of CentralFreeListTest defined above. A
// new case must be added to this list for it to be instantiated.
REGISTER_TYPED_TEST_SUITE_P(CentralFreeListTest, IsolatedSmoke,
MultiNonEmptyLists, SpanPriority,
SpanUtilizationHistogram, MultipleSpans,
SinglePopulate, PassSpanObjectCountToPageheap,
SpanFragmentation);
namespace unit_tests {
// Environment the typed tests run against: a CentralFreeList instantiated
// with MockStaticForwarder, wrapped in FakeCentralFreeListEnvironment.
using Env = FakeCentralFreeListEnvironment<
central_freelist_internal::CentralFreeList<MockStaticForwarder>>;
// Instantiate all registered CentralFreeListTest cases for Env.
INSTANTIATE_TYPED_TEST_SUITE_P(CentralFreeList, CentralFreeListTest,
::testing::Types<Env>);
} // namespace unit_tests
} // namespace
} // namespace tcmalloc_internal
} // namespace tcmalloc

View File

@ -0,0 +1,53 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "tcmalloc/common.h"
#include <algorithm>
#include "tcmalloc/experiment.h"
#include "tcmalloc/internal/environment.h"
#include "tcmalloc/internal/optimization.h"
#include "tcmalloc/pages.h"
#include "tcmalloc/sampler.h"
#include "tcmalloc/span.h"
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc {
namespace tcmalloc_internal {
// Returns the fixed upper-case label for `tag`. Only the four tags handled
// below are expected; any other value is declared impossible via
// ASSUME(false).
absl::string_view MemoryTagToLabel(MemoryTag tag) {
  if (tag == MemoryTag::kNormal) {
    return "NORMAL";
  }
  if (tag == MemoryTag::kNormalP1) {
    return "NORMAL_P1";
  }
  if (tag == MemoryTag::kSampled) {
    return "SAMPLED";
  }
  if (tag == MemoryTag::kCold) {
    return "COLD";
  }
  // No other tag value is expected to reach this point.
  ASSUME(false);
}
// This only provides correct answer for TCMalloc-allocated memory,
// and may give a false positive for non-allocated block.
extern "C" bool TCMalloc_Internal_PossiblyCold(const void* ptr) {
  // C-linkage entry point; the actual check is delegated to IsColdMemory().
  const bool possibly_cold = IsColdMemory(ptr);
  return possibly_cold;
}
} // namespace tcmalloc_internal
} // namespace tcmalloc
GOOGLE_MALLOC_SECTION_END

View File

@ -0,0 +1,380 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Common definitions for tcmalloc code.
#ifndef TCMALLOC_COMMON_H_
#define TCMALLOC_COMMON_H_
#include <stddef.h>
#include <stdint.h>
#include <algorithm>
#include <limits>
#include <new>
#include <type_traits>
#include "absl/base/attributes.h"
#include "absl/base/dynamic_annotations.h"
#include "absl/base/internal/spinlock.h"
#include "absl/base/macros.h"
#include "absl/base/optimization.h"
#include "absl/numeric/bits.h"
#include "absl/strings/string_view.h"
#include "absl/types/span.h"
#include "tcmalloc/experiment.h"
#include "tcmalloc/internal/config.h"
#include "tcmalloc/internal/logging.h"
#include "tcmalloc/internal/optimization.h"
#include "tcmalloc/malloc_extension.h"
#include "tcmalloc/size_class_info.h"
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc {
namespace tcmalloc_internal {
// Reject builds in which pointers are not 8 bytes wide.
static_assert(sizeof(void*) == 8);
//-------------------------------------------------------------------
// Configuration
//-------------------------------------------------------------------
// There are four different models for tcmalloc which are created by defining a
// set of constant variables differently:
//
// DEFAULT:
// The default configuration strives for good performance while trying to
// minimize fragmentation. It uses a smaller page size to reduce
// fragmentation, but allocates per-thread and per-cpu capacities similar to
// TCMALLOC_LARGE_PAGES / TCMALLOC_256K_PAGES.
//
// TCMALLOC_LARGE_PAGES:
// Larger page sizes increase the bookkeeping granularity used by TCMalloc for
// its allocations. This can reduce PageMap size and traffic to the
// innermost cache (the page heap), but can increase memory footprints. As
// TCMalloc will not reuse a page for a different allocation size until the
// entire page is deallocated, this can be a source of increased memory
// fragmentation.
//
// Historically, larger page sizes improved lookup performance for the
// pointer-to-size lookup in the PageMap that was part of the critical path.
// With most deallocations leveraging C++14's sized delete feature
// (https://isocpp.org/files/papers/n3778.html), this optimization is less
// significant.
//
// TCMALLOC_256K_PAGES
// This configuration uses an even larger page size (256KB) as the unit of
// accounting granularity.
//
// TCMALLOC_SMALL_BUT_SLOW:
// Used for situations where minimizing the memory footprint is the most
// desirable attribute, even at the cost of performance.
//
// The constants that vary between models are:
//
// kPageShift - Shift amount used to compute the page size.
// kNumBaseClasses - Number of size classes serviced by bucket allocators
// kMaxSize - Maximum size serviced by bucket allocators (thread/cpu/central)
// kMinThreadCacheSize - The minimum size in bytes of each ThreadCache.
// kMaxThreadCacheSize - The maximum size in bytes of each ThreadCache.
// kDefaultOverallThreadCacheSize - The maximum combined size in bytes of all
// ThreadCaches for an executable.
// kStealAmount - The number of bytes one ThreadCache will steal from another
// when the first ThreadCache is forced to Scavenge(), delaying the next
// call to Scavenge for this thread.
// Older configurations had their own customized macros. Convert them into
// a page-shift parameter that is checked below.
// Derive TCMALLOC_PAGE_SHIFT from the legacy model macros. The page size is
// 2^TCMALLOC_PAGE_SHIFT bytes. Defining TCMALLOC_PAGE_SHIFT from outside this
// header is rejected with an #error.
#ifndef TCMALLOC_PAGE_SHIFT
#ifdef TCMALLOC_SMALL_BUT_SLOW
// 4 KiB pages.
#define TCMALLOC_PAGE_SHIFT 12
// NOTE(review): presumably selects a 3-level PageMap; the consumer of this
// macro is not visible here -- confirm.
#define TCMALLOC_USE_PAGEMAP3
#elif defined(TCMALLOC_256K_PAGES)
// 256 KiB pages.
#define TCMALLOC_PAGE_SHIFT 18
#elif defined(TCMALLOC_LARGE_PAGES)
// 32 KiB pages.
#define TCMALLOC_PAGE_SHIFT 15
#else
// Default model: 8 KiB pages.
#define TCMALLOC_PAGE_SHIFT 13
#endif
#else
#error "TCMALLOC_PAGE_SHIFT is an internal macro!"
#endif
// Per-model constants, selected by TCMALLOC_PAGE_SHIFT. Every branch defines
// the same set of names; any other shift value is a compile-time error.
#if TCMALLOC_PAGE_SHIFT == 12
// Small-but-slow model (4 KiB pages): the smallest caches and limits, and no
// expanded size classes.
inline constexpr size_t kPageShift = 12;
inline constexpr size_t kNumBaseClasses = 46;
inline constexpr bool kHasExpandedClasses = false;
inline constexpr size_t kMaxSize = 8 << 10;
inline constexpr size_t kMinThreadCacheSize = 4 * 1024;
inline constexpr size_t kMaxThreadCacheSize = 64 * 1024;
inline constexpr size_t kMaxCpuCacheSize = 10 * 1024;
inline constexpr size_t kDefaultOverallThreadCacheSize = kMaxThreadCacheSize;
inline constexpr size_t kStealAmount = kMinThreadCacheSize;
inline constexpr size_t kDefaultProfileSamplingRate = 1 << 19;
inline constexpr size_t kMinPages = 2;
#elif TCMALLOC_PAGE_SHIFT == 15
// Large-pages model (32 KiB pages). Apart from kPageShift and
// kNumBaseClasses, the values match the 256 KiB and default models.
inline constexpr size_t kPageShift = 15;
inline constexpr size_t kNumBaseClasses = 78;
inline constexpr bool kHasExpandedClasses = true;
inline constexpr size_t kMaxSize = 256 * 1024;
inline constexpr size_t kMinThreadCacheSize = kMaxSize * 2;
inline constexpr size_t kMaxThreadCacheSize = 4 << 20;
// 1.5 MiB; the floating-point product is a whole number (1572864).
inline constexpr size_t kMaxCpuCacheSize = 1.5 * 1024 * 1024;
inline constexpr size_t kDefaultOverallThreadCacheSize =
8u * kMaxThreadCacheSize;
inline constexpr size_t kStealAmount = 1 << 16;
inline constexpr size_t kDefaultProfileSamplingRate = 1 << 21;
inline constexpr size_t kMinPages = 8;
#elif TCMALLOC_PAGE_SHIFT == 18
// 256 KiB-pages model.
inline constexpr size_t kPageShift = 18;
inline constexpr size_t kNumBaseClasses = 89;
inline constexpr bool kHasExpandedClasses = true;
inline constexpr size_t kMaxSize = 256 * 1024;
inline constexpr size_t kMinThreadCacheSize = kMaxSize * 2;
inline constexpr size_t kMaxThreadCacheSize = 4 << 20;
// 1.5 MiB; the floating-point product is a whole number (1572864).
inline constexpr size_t kMaxCpuCacheSize = 1.5 * 1024 * 1024;
inline constexpr size_t kDefaultOverallThreadCacheSize =
8u * kMaxThreadCacheSize;
inline constexpr size_t kStealAmount = 1 << 16;
inline constexpr size_t kDefaultProfileSamplingRate = 1 << 21;
inline constexpr size_t kMinPages = 8;
#elif TCMALLOC_PAGE_SHIFT == 13
// Default model (8 KiB pages).
inline constexpr size_t kPageShift = 13;
inline constexpr size_t kNumBaseClasses = 86;
inline constexpr bool kHasExpandedClasses = true;
inline constexpr size_t kMaxSize = 256 * 1024;
inline constexpr size_t kMinThreadCacheSize = kMaxSize * 2;
inline constexpr size_t kMaxThreadCacheSize = 4 << 20;
// 1.5 MiB; the floating-point product is a whole number (1572864).
inline constexpr size_t kMaxCpuCacheSize = 1.5 * 1024 * 1024;
inline constexpr size_t kDefaultOverallThreadCacheSize =
8u * kMaxThreadCacheSize;
inline constexpr size_t kStealAmount = 1 << 16;
inline constexpr size_t kDefaultProfileSamplingRate = 1 << 21;
inline constexpr size_t kMinPages = 8;
#else
#error "Unsupported TCMALLOC_PAGE_SHIFT value!"
#endif
// Sanitizers constrain the memory layout which causes problems with the
// enlarged tags required to represent NUMA partitions. Disable NUMA awareness
// to avoid failing to mmap memory.
//
// kNumaPartitions is 2 only when TCMALLOC_NUMA_AWARE is defined and neither
// MEMORY_SANITIZER nor THREAD_SANITIZER is; otherwise it is 1.
#if defined(TCMALLOC_NUMA_AWARE) && !defined(MEMORY_SANITIZER) && \
!defined(THREAD_SANITIZER)
inline constexpr size_t kNumaPartitions = 2;
#else
inline constexpr size_t kNumaPartitions = 1;
#endif
// We have copies of kNumBaseClasses size classes for each NUMA node, followed
// by any expanded classes.
//
// kExpandedClassesStart is the index of the first expanded class. kNumClasses
// is kNumBaseClasses * kNumaPartitions, plus a further kNumBaseClasses when
// kHasExpandedClasses is true.
inline constexpr size_t kExpandedClassesStart =
kNumBaseClasses * kNumaPartitions;
inline constexpr size_t kNumClasses =
kExpandedClassesStart + (kHasExpandedClasses ? kNumBaseClasses : 0);
// Most code stores a size class in a uint32_t, but a few places (PageMap, for
// example) want the most compact representation available. CompactSizeClass is
// the narrowest unsigned integer type able to hold every value in
// [0, kNumClasses): uint8_t when the largest class index fits in 8 bits and
// uint16_t otherwise.
constexpr size_t kMaxClass = kNumClasses - 1;
using CompactSizeClass =
    std::conditional_t<(kMaxClass > std::numeric_limits<uint8_t>::max()),
                       uint16_t, uint8_t>;
// ~64K classes ought to be enough for anybody, but let's be sure.
static_assert(std::numeric_limits<CompactSizeClass>::max() >= kMaxClass);
// Minimum/maximum number of batches in TransferCache per size class.
// Actual numbers depend on a number of factors, see TransferCache::Init
// for details.
inline constexpr size_t kMinObjectsToMove = 2;
inline constexpr size_t kMaxObjectsToMove = 128;
// Size of one TCMalloc page in bytes (2^kPageShift).
inline constexpr size_t kPageSize = 1 << kPageShift;
// Verify that the page size used is at least 8x smaller than the maximum
// element size in the thread cache. This guarantees at most 12.5% internal
// fragmentation (1/8). When page size is 256k (kPageShift == 18), the benefit
// of increasing kMaxSize to be multiple of kPageSize is unclear. Object size
// profile data indicates that the number of simultaneously live objects (of
// size >= 256k) tends to be very small. Keeping those objects as 'large'
// objects won't cause too much memory waste, while heap memory reuse can be
// improved. Increasing kMaxSize to be too large has another bad side effect --
// the thread cache pressure is increased, which will in turn increase traffic
// between central cache and thread cache, leading to performance degradation.
//
// NOTE(review): the ratio actually enforced is kMinPages, which is 2 (not 8)
// in the kPageShift == 12 configuration.
static_assert((kMaxSize / kPageSize) >= kMinPages || kPageShift >= 18,
"Ratio of kMaxSize / kPageSize is too small");
// Base alignment constant: 8 bytes.
inline constexpr std::align_val_t kAlignment{8};
// log2 (kAlignment): bit_width(8 - 1) == 3.
inline constexpr size_t kAlignmentShift =
absl::bit_width(static_cast<size_t>(kAlignment) - 1u);
// The number of times that a deallocation can cause a freelist to
// go over its max_length() before shrinking max_length().
inline constexpr int kMaxOverages = 3;
// Maximum length we allow a per-thread free-list to have before we
// move objects from it into the corresponding central free-list. We
// want this big to avoid locking the central free-list too often. It
// should not hurt to make this list somewhat big because the
// scavenging code will shrink it down when its contents are not in use.
inline constexpr int kMaxDynamicFreeListLength = 8192;
// Tag values carried in the pointer bits selected by kTagMask / kTagShift
// (GetMemoryTag extracts them). The numeric values depend on whether more than
// one NUMA partition is configured.
enum class MemoryTag : uint8_t {
// Sampled, infrequently allocated
kSampled = 0x0,
// Not sampled, NUMA partition 0
kNormalP0 = 0x1,
// Not sampled, NUMA partition 1 (0xff when only one partition is configured)
kNormalP1 = (kNumaPartitions > 1) ? 0x2 : 0xff,
// Not sampled
kNormal = kNormalP0,
// Cold (0x4 with multiple NUMA partitions, 0x2 otherwise)
kCold = (kNumaPartitions > 1) ? 0x4 : 0x2,
};
// We make kNormal and kCold disjoint so that IsCold implies IsSampled. This
// allows us to avoid modifying the fast delete path in any way when cold-tagged
// memory allocations are absent. We can overload the IsSampled check and then
// do a second check for whether the possibly-sampled allocation is actually
// IsCold.
static_assert((static_cast<uint8_t>(MemoryTag::kNormal) &
static_cast<uint8_t>(MemoryTag::kCold)) == 0,
"kNormal and kCold should have disjoint bit patterns");
// kTagShift is the bit position of the memory tag within a pointer, capped at
// 42. kTagMask selects the tag bits at that position: three bits (0x7) when
// more than one NUMA partition is configured, two bits (0x3) otherwise.
inline constexpr uintptr_t kTagShift = std::min(kAddressBits - 4, 42);
inline constexpr uintptr_t kTagMask = uintptr_t{kNumaPartitions > 1 ? 0x7 : 0x3}
<< kTagShift;
// Returns true when the tag bits of `ptr` do not contain any of the "normal"
// bits, i.e. when the low tag bits equal MemoryTag::kSampled. The static
// asserts ensure that both normal tags overlap the mask that is tested.
inline bool IsSampledMemory(const void* ptr) {
  constexpr uintptr_t kSampledNormalMask = kNumaPartitions > 1 ? 0x3 : 0x1;
  static_assert(static_cast<uintptr_t>(MemoryTag::kNormalP0) &
                kSampledNormalMask);
  static_assert(static_cast<uintptr_t>(MemoryTag::kNormalP1) &
                kSampledNormalMask);

  const auto address = reinterpret_cast<uintptr_t>(ptr);
  const uintptr_t tag_bits = (address & kTagMask) >> kTagShift;
  const uintptr_t normal_bits = tag_bits & kSampledNormalMask;
  return normal_bits == static_cast<uintptr_t>(MemoryTag::kSampled);
}
// Logical complement of IsSampledMemory().
inline bool IsNormalMemory(const void* ptr) {
  const bool sampled = IsSampledMemory(ptr);
  return !sampled;
}
// Returns true when the tag bits of `ptr` are exactly MemoryTag::kCold.
inline bool IsColdMemory(const void* ptr) {
  const uintptr_t tag_bits = reinterpret_cast<uintptr_t>(ptr) & kTagMask;
  const uintptr_t cold_bits = static_cast<uintptr_t>(MemoryTag::kCold)
                              << kTagShift;
  const bool is_cold = (tag_bits == cold_bits);
  // A cold pointer must also look sampled. That invariant is what lets the
  // delete fast path stay free of extra branches when cold tags are unused.
  ASSERT(!is_cold || IsSampledMemory(ptr));
  return is_cold;
}
// The cold feature is reported active exactly when this configuration has
// expanded size classes.
inline constexpr bool ColdFeatureActive() {
  return kHasExpandedClasses;
}
// Extracts the tag bits from `ptr` and returns them as a MemoryTag.
inline MemoryTag GetMemoryTag(const void* ptr) {
  const auto address = reinterpret_cast<uintptr_t>(ptr);
  const uintptr_t tag_bits = (address & kTagMask) >> kTagShift;
  return static_cast<MemoryTag>(tag_bits);
}
// Declaration only; the definition is not part of this header section.
// NOTE(review): presumably maps `tag` to a human-readable label -- confirm
// against the definition.
absl::string_view MemoryTagToLabel(MemoryTag tag);
// Returns true when `size_class` lies in the expanded range, i.e. at or beyond
// kExpandedClassesStart. Always false when the configuration has no expanded
// classes.
inline constexpr bool IsExpandedSizeClass(unsigned size_class) {
  return kHasExpandedClasses ? (size_class >= kExpandedClassesStart) : false;
}
// Granularity constants for memory requested from the system. kMinMmapAlloc
// must be a multiple of kMinSystemAlloc (checked below).
#if !defined(TCMALLOC_SMALL_BUT_SLOW) && __SIZEOF_POINTER__ != 4
// Always allocate at least a huge page
inline constexpr size_t kMinSystemAlloc = kHugePageSize;
inline constexpr size_t kMinMmapAlloc = 1 << 30; // mmap() in 1GiB ranges.
#else
// Used when TCMALLOC_SMALL_BUT_SLOW is defined or pointers are 4 bytes wide.
// Allocate in units of 2MiB. This is the size of a huge page for x86, but
// not for Power.
inline constexpr size_t kMinSystemAlloc = 2 << 20;
// mmap() in units of 32MiB. This is a multiple of huge page size for
// both x86 (2MiB) and Power (16MiB)
inline constexpr size_t kMinMmapAlloc = 32 << 20;
#endif
static_assert(kMinMmapAlloc % kMinSystemAlloc == 0,
"Minimum mmap allocation size is not a multiple of"
" minimum system allocation size");
// Hot/cold classification of an allocation. AccessFromPointer() derives it
// from a pointer's memory tag.
enum class AllocationAccess {
kHot,
kCold,
};
// A hint counts as cold when its underlying 8-bit value is below 128.
inline bool IsColdHint(hot_cold_t hint) {
  constexpr uint8_t kColdThreshold{128};
  const auto raw_hint = static_cast<uint8_t>(hint);
  return raw_hint < kColdThreshold;
}
// Classifies `ptr` as hot or cold from its memory tag. Configurations without
// expanded size classes never produce cold memory, so they always report kHot
// (and assert that the pointer is not cold-tagged).
inline AllocationAccess AccessFromPointer(void* ptr) {
  if (kHasExpandedClasses) {
    // Cold pointers are expected to be the rare case.
    if (ABSL_PREDICT_FALSE(IsColdMemory(ptr))) {
      return AllocationAccess::kCold;
    }
    return AllocationAccess::kHot;
  }
  ASSERT(!IsColdMemory(ptr));
  return AllocationAccess::kHot;
}
// Maps a NUMA partition index (0 or 1) to the matching "normal" memory tag.
// Any other index is treated as unreachable.
inline MemoryTag NumaNormalTag(size_t numa_partition) {
  if (numa_partition == 0) {
    return MemoryTag::kNormalP0;
  }
  if (numa_partition == 1) {
    return MemoryTag::kNormalP1;
  }
  ASSUME(false);
  __builtin_unreachable();
}
// Returns the NUMA partition encoded in the tag of `ptr`: 1 for kNormalP1 and
// 0 for every other tag. With a single partition the answer is always 0.
inline size_t NumaPartitionFromPointer(void* ptr) {
  if constexpr (kNumaPartitions == 1) {
    return 0;
  }
  const bool in_partition_one = (GetMemoryTag(ptr) == MemoryTag::kNormalP1);
  return in_partition_one ? 1 : 0;
}
// Linker initialized, so this lock can be accessed at any time.
// Note: `CpuCache::ResizeInfo::lock` must be taken before the `pageheap_lock`
// if both are going to be held simultaneously.
// Declared `extern` here; the definition is in another translation unit.
extern absl::base_internal::SpinLock pageheap_lock;
// Returns a / b, or 0 when the divisor compares equal to zero.
inline double safe_div(double a, double b) {
  return (b == 0) ? 0. : a / b;
}
} // namespace tcmalloc_internal
} // namespace tcmalloc
GOOGLE_MALLOC_SECTION_END
#endif // TCMALLOC_COMMON_H_

View File

@ -0,0 +1,47 @@
# Copyright 2019 The TCMalloc Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This package provides default compiler warning flags for the OSS release"""
# Compiler options used when the "//tcmalloc:llvm" condition matches (see
# TCMALLOC_DEFAULT_COPTS): warnings are errors, with the listed warning
# categories switched off.
TCMALLOC_LLVM_FLAGS = [
# Ensure TCMalloc itself builds without errors, even if its dependencies
# aren't necessarily -Werror clean.
"-Werror",
"-Wno-deprecated-declarations",
"-Wno-deprecated-volatile",
"-Wno-implicit-int-float-conversion",
"-Wno-sign-compare",
"-Wno-uninitialized",
"-Wno-unused-function",
"-Wno-unused-variable",
]
# Compiler options used for the default (non-LLVM) condition of
# TCMALLOC_DEFAULT_COPTS: warnings are errors, with the listed warning
# categories switched off.
TCMALLOC_GCC_FLAGS = [
# Ensure TCMalloc itself builds without errors, even if its dependencies
# aren't necessarily -Werror clean.
"-Werror",
"-Wno-attribute-alias",
"-Wno-sign-compare",
"-Wno-stringop-overflow",
"-Wno-uninitialized",
"-Wno-unused-function",
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66425
"-Wno-unused-result",
"-Wno-unused-variable",
]
# Default copts: the LLVM flag list when "//tcmalloc:llvm" matches, otherwise
# the GCC flag list.
TCMALLOC_DEFAULT_COPTS = select({
"//tcmalloc:llvm": TCMALLOC_LLVM_FLAGS,
"//conditions:default": TCMALLOC_GCC_FLAGS,
})

View File

@ -0,0 +1,82 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "tcmalloc/cpu_cache.h"
#include <stdlib.h>
#include <string.h>
#include <algorithm>
#include <atomic>
#include "absl/base/dynamic_annotations.h"
#include "absl/base/macros.h"
#include "absl/base/thread_annotations.h"
#include "absl/container/fixed_array.h"
#include "tcmalloc/arena.h"
#include "tcmalloc/common.h"
#include "tcmalloc/internal/logging.h"
#include "tcmalloc/internal_malloc_extension.h"
#include "tcmalloc/parameters.h"
#include "tcmalloc/static_vars.h"
#include "tcmalloc/transfer_cache.h"
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc {
namespace tcmalloc_internal {
// Switches the allocator over to per-CPU caches. Does nothing when they are
// already active, when the per_cpu_caches parameter is off, or when
// subtle::percpu::IsFast() reports false.
static void ActivatePerCpuCaches() {
  if (tc_globals.CpuCacheActive()) {
    // Already active.
    return;
  }

  const bool can_activate =
      Parameters::per_cpu_caches() && subtle::percpu::IsFast();
  if (!can_activate) {
    return;
  }

  tc_globals.InitIfNecessary();
  tc_globals.cpu_cache().Activate();
  tc_globals.ActivateCpuCache();
  // no need for this thread cache anymore, I guess.
  ThreadCache::BecomeIdle();
  // If there's a problem with this code, let's notice it right away:
  ::operator delete(::operator new(1));
}
// Helper whose constructor calls ActivatePerCpuCaches(). The static instance
// below runs that constructor when this translation unit's static objects are
// initialized.
class PerCPUInitializer {
public:
PerCPUInitializer() {
ActivatePerCpuCaches();
}
};
// Exists only for the side effect of its constructor.
static PerCPUInitializer module_enter_exit;
} // namespace tcmalloc_internal
} // namespace tcmalloc
GOOGLE_MALLOC_SECTION_END
extern "C" void TCMalloc_Internal_ForceCpuCacheActivation() {
tcmalloc::tcmalloc_internal::ActivatePerCpuCaches();
}
extern "C" bool MallocExtension_Internal_GetPerCpuCachesActive() {
return tcmalloc::tcmalloc_internal::tc_globals.CpuCacheActive();
}
extern "C" int32_t MallocExtension_Internal_GetMaxPerCpuCacheSize() {
return tcmalloc::tcmalloc_internal::Parameters::max_per_cpu_cache_size();
}
extern "C" void MallocExtension_Internal_SetMaxPerCpuCacheSize(int32_t value) {
tcmalloc::tcmalloc_internal::Parameters::set_max_per_cpu_cache_size(value);
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,86 @@
// Copyright 2021 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdint.h>
#include <string>
#include <thread> // NOLINT(build/c++11)
#include "benchmark/benchmark.h"
#include "gtest/gtest.h"
#include "absl/base/internal/sysinfo.h"
#include "absl/random/random.h"
#include "absl/synchronization/notification.h"
#include "absl/time/clock.h"
#include "absl/time/time.h"
#include "tcmalloc/cpu_cache.h"
#include "tcmalloc/internal/percpu.h"
#include "tcmalloc/internal_malloc_extension.h"
#include "tcmalloc/static_vars.h"
namespace tcmalloc {
namespace tcmalloc_internal {
namespace {
// This test mutates global state, including triggering the activation of the
// per-CPU caches. It should not be run alongside other tests in the same
// process that may rely on an isolated global instance.
//
// A background thread polls the CpuCache accessors while the main thread
// forces activation, so the accessors are exercised before, during and after
// the transition.
TEST(CpuCacheActivateTest, GlobalInstance) {
  if (!subtle::percpu::IsFast()) {
    return;
  }

  // This is a fatal assertion: on failure it returns from the test body. It
  // therefore has to run before the worker thread exists, because returning
  // while a std::thread is still joinable makes its destructor call
  // std::terminate.
  ASSERT_NE(&TCMalloc_Internal_ForceCpuCacheActivation, nullptr);

  CpuCache& cache = tc_globals.cpu_cache();

  absl::Notification done;

  std::thread t([&]() {
    const int num_cpus = absl::base_internal::NumCPUs();
    absl::BitGen rng;

    while (!done.HasBeenNotified()) {
      const double coin = absl::Uniform(rng, 0., 1.);
      const bool ready = tc_globals.CpuCacheActive();

      // The per-CPU queries are only issued once the cache is active;
      // CacheLimit() is queried regardless of activation state.
      if (ready && coin < 0.25) {
        const int cpu = absl::Uniform(rng, 0, num_cpus);
        benchmark::DoNotOptimize(cache.UsedBytes(cpu));
      } else if (ready && coin < 0.5) {
        const int cpu = absl::Uniform(rng, 0, num_cpus);
        benchmark::DoNotOptimize(cache.Capacity(cpu));
      } else if (ready && coin < 0.75) {
        benchmark::DoNotOptimize(cache.TotalUsedBytes());
      } else {
        benchmark::DoNotOptimize(cache.CacheLimit());
      }
    }
  });

  // Trigger initialization of the CpuCache, confirming it was not initialized
  // at the start of the test and is afterwards.
  EXPECT_FALSE(tc_globals.CpuCacheActive());
  Parameters::set_per_cpu_caches(true);
  TCMalloc_Internal_ForceCpuCacheActivation();
  EXPECT_TRUE(tc_globals.CpuCacheActive());

  // Give the worker some time to hit the accessors on the now-active cache.
  absl::SleepFor(absl::Seconds(0.2));

  done.Notify();
  t.join();
}
} // namespace
} // namespace tcmalloc_internal
} // namespace tcmalloc

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,606 @@
// Copyright 2022 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "tcmalloc/deallocation_profiler.h"
#include <algorithm>
#include <cmath> // for std::lround
#include <cstdint> // for uintptr_t
#include <functional>
#include <limits>
#include <memory>
#include <string> // for memset
#include <type_traits>
#include <utility>
#include "absl/base/attributes.h"
#include "absl/base/internal/low_level_alloc.h"
#include "absl/base/internal/spinlock.h"
#include "absl/base/internal/sysinfo.h"
#include "absl/container/flat_hash_map.h"
#include "absl/debugging/stacktrace.h" // for GetStackTrace
#include "absl/hash/hash.h"
#include "absl/time/clock.h"
#include "absl/time/time.h"
#include "tcmalloc/internal/logging.h"
#include "tcmalloc/internal/percpu.h"
#include "tcmalloc/internal_malloc_extension.h"
#include "tcmalloc/malloc_extension.h"
#include "tcmalloc/sampled_allocation.h"
#include "tcmalloc/static_vars.h"
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc {
namespace deallocationz {
namespace {
using ::absl::base_internal::SpinLock;
using ::absl::base_internal::SpinLockHolder;
// Stateless STL-style allocator that forwards to a static arena allocator.
// `Alloc` must provide:
//   static void* Alloc::Allocate(size_t size);
//   static void Alloc::Free(void* ptr, size_t size);
template <typename T, class Alloc>
class AllocAdaptor final {
 public:
  using value_type = T;

  template <class U>
  using rebind = AllocAdaptor<U, Alloc>;

  AllocAdaptor() {}
  AllocAdaptor(const AllocAdaptor&) {}
  template <class U>
  explicit AllocAdaptor(const AllocAdaptor<U, Alloc>&) {}

  // Allocates storage for `n` objects of type T.
  T* allocate(size_t n) {
    const size_t bytes = n * sizeof(T);
    // Check if n is too big to allocate (the multiplication wrapped around).
    ASSERT(bytes / sizeof(T) == n);
    return static_cast<T*>(Alloc::Allocate(bytes));
  }

  // Releases storage previously obtained for `n` objects of type T.
  void deallocate(T* p, size_t n) { Alloc::Free(p, n * sizeof(T)); }

  // There's no state, so these allocators are always equal
  bool operator==(const AllocAdaptor&) const { return true; }
  bool operator!=(const AllocAdaptor&) const { return false; }
};
// Capacity, in frames, of the `stack` array in DeallocationSampleRecord; also
// the limit passed to absl::GetStackTrace when a deallocation is recorded.
const int64_t kMaxStackDepth = 64;
// Stack trace plus metadata describing one allocation or deallocation seen by
// the profiler.
//
// Hashing and equality look only at the stack (its first `depth` entries),
// `depth`, and the three size/alignment fields. `weight`, `creation_time`,
// `cpu_id` and `thread_id` do not take part in either.
struct DeallocationSampleRecord {
  double weight = 0.0;
  size_t requested_size = 0;
  size_t requested_alignment = 0;
  size_t allocated_size = 0;  // size after sizeclass/page rounding
  int depth = 0;              // Number of PC values stored in array below
  void* stack[kMaxStackDepth];

  // creation_time is used to capture the life_time of sampled allocations
  absl::Time creation_time;
  int cpu_id = -1;
  pid_t thread_id = 0;

  template <typename H>
  friend H AbslHashValue(H h, const DeallocationSampleRecord& record) {
    H with_stack =
        H::combine_contiguous(std::move(h), record.stack, record.depth);
    return H::combine(std::move(with_stack), record.depth,
                      record.requested_size, record.requested_alignment,
                      record.allocated_size);
  }

  bool operator==(const DeallocationSampleRecord& other) const {
    const bool same_metadata =
        depth == other.depth && requested_size == other.requested_size &&
        requested_alignment == other.requested_alignment &&
        allocated_size == other.allocated_size;
    // The stacks are only compared once the depths are known to agree.
    return same_metadata && std::equal(stack, stack + depth, other.stack);
  }
};
// Records whether an object was allocated and deallocated on the same CPU
// and/or by the same thread. `value` packs the two flags into an int: bit 1 is
// the CPU match and bit 0 is the thread match, giving a value in [0, 3].
struct CpuThreadMatchingStatus {
  constexpr CpuThreadMatchingStatus(bool cpu_matched, bool thread_matched)
      : cpu_matched(cpu_matched),
        thread_matched(thread_matched),
        value((cpu_matched ? 2 : 0) | (thread_matched ? 1 : 0)) {}

  bool cpu_matched;
  bool thread_matched;
  int value;
};
// Three-way comparison of the RPC ids seen at allocation and deallocation.
// `value` is 2 when either id is zero, 1 when both are non-zero and equal,
// and 0 when both are non-zero and different.
struct RpcMatchingStatus {
  static constexpr int ComputeValue(uint64_t alloc, uint64_t dealloc) {
    return (alloc == 0 || dealloc == 0) ? 2 : (alloc == dealloc ? 1 : 0);
  }

  constexpr RpcMatchingStatus(uint64_t alloc, uint64_t dealloc)
      : value(ComputeValue(alloc, dealloc)) {}

  int value;
};
// Flattens a (CPU/thread status, RPC status) pair into a single array index.
// The RPC status takes one of three values, so it is the fastest-varying
// component.
int ComputeIndex(CpuThreadMatchingStatus status, RpcMatchingStatus rpc_status) {
  constexpr int kRpcStatusCount = 3;
  const int row = status.value * kRpcStatusCount;
  return row + rpc_status.value;
}
// Every combination of CPU/thread matching status and RPC matching status:
// four CPU/thread combinations for each of the three RPC outcomes. The RPC
// argument pairs are representatives only: (0, 0) yields value 2, (1, 2)
// yields value 0 and (1, 1) yields value 1.
constexpr std::pair<CpuThreadMatchingStatus, RpcMatchingStatus> kAllCases[] = {
// RPC value 2: at least one id is zero.
{CpuThreadMatchingStatus(false, false), RpcMatchingStatus(0, 0)},
{CpuThreadMatchingStatus(false, true), RpcMatchingStatus(0, 0)},
{CpuThreadMatchingStatus(true, false), RpcMatchingStatus(0, 0)},
{CpuThreadMatchingStatus(true, true), RpcMatchingStatus(0, 0)},
// RPC value 0: both ids non-zero and different.
{CpuThreadMatchingStatus(false, false), RpcMatchingStatus(1, 2)},
{CpuThreadMatchingStatus(false, true), RpcMatchingStatus(1, 2)},
{CpuThreadMatchingStatus(true, false), RpcMatchingStatus(1, 2)},
{CpuThreadMatchingStatus(true, true), RpcMatchingStatus(1, 2)},
// RPC value 1: both ids non-zero and equal.
{CpuThreadMatchingStatus(false, false), RpcMatchingStatus(1, 1)},
{CpuThreadMatchingStatus(false, true), RpcMatchingStatus(1, 1)},
{CpuThreadMatchingStatus(true, false), RpcMatchingStatus(1, 1)},
{CpuThreadMatchingStatus(true, true), RpcMatchingStatus(1, 1)},
};
} // namespace
class DeallocationProfiler {
private:
// Arena and allocator used to back STL objects used by DeallocationProfiler
// Shared between all instances of DeallocationProfiler
// TODO(b/248332543): Use TCMalloc's own arena allocator instead of defining a
// new one here. The need for refcount management could be the reason for
// using a custom allocator in the first place.
class MyAllocator {
public:
static void* Allocate(size_t n) {
return absl::base_internal::LowLevelAlloc::AllocWithArena(n, arena_);
}
static void Free(const void* p, size_t /* n */) {
absl::base_internal::LowLevelAlloc::Free(const_cast<void*>(p));
}
// The lifetime of the arena is managed using a reference count and
// determined by how long at least one emitted Profile remains alive.
struct LowLevelArenaReference {
LowLevelArenaReference() {
SpinLockHolder h(&arena_lock_);
if ((refcount_++) == 0) {
CHECK_CONDITION(arena_ == nullptr);
arena_ = absl::base_internal::LowLevelAlloc::NewArena(0);
}
}
~LowLevelArenaReference() {
SpinLockHolder h(&arena_lock_);
if ((--refcount_) == 0) {
CHECK_CONDITION(
absl::base_internal::LowLevelAlloc::DeleteArena(arena_));
arena_ = nullptr;
}
}
};
private:
// We need to protect the arena with a mutex and ensure that every thread
// acquires that mutex before it uses the arena for the first time. Once
// it has acquired the mutex, it is guaranteed that arena won't change
// between that point in time and when the thread stops accessing it (as
// enforced by LowLevelArenaReference below).
ABSL_CONST_INIT static SpinLock arena_lock_;
static absl::base_internal::LowLevelAlloc::Arena* arena_;
// We assume that launching a new deallocation profiler takes too long
// to cause this to overflow within the sampling period. The reason this
// is not using std::shared_ptr is that we do not only need to protect the
// value of the reference count but also the pointer itself (and therefore
// need a separate mutex either way).
static uint32_t refcount_;
};
// This must be the first member of the class to be initialized. The
// underlying arena must stay alive as long as the profiler.
MyAllocator::LowLevelArenaReference arena_ref_;
// All active profilers are stored in a list.
DeallocationProfiler* next_;
DeallocationProfilerList* list_ = nullptr;
friend class DeallocationProfilerList;
using AllocsTable = absl::flat_hash_map<
tcmalloc_internal::AllocHandle, DeallocationSampleRecord,
absl::Hash<tcmalloc_internal::AllocHandle>,
std::equal_to<tcmalloc_internal::AllocHandle>,
AllocAdaptor<std::pair<const tcmalloc_internal::AllocHandle,
DeallocationSampleRecord>,
MyAllocator>>;
class DeallocationStackTraceTable final
: public tcmalloc_internal::ProfileBase {
public:
// We define the dtor to ensure it is placed in the desired text section.
~DeallocationStackTraceTable() override = default;
void AddTrace(const DeallocationSampleRecord& alloc_trace,
const DeallocationSampleRecord& dealloc_trace);
void Iterate(
absl::FunctionRef<void(const Profile::Sample&)> func) const override;
ProfileType Type() const override {
return tcmalloc::ProfileType::kLifetimes;
}
absl::Duration Duration() const override {
return stop_time_ - start_time_;
}
void StopAndRecord(const AllocsTable& allocs);
private:
// This must be the first member of the class to be initialized. The
// underlying arena must stay alive as long as the profile.
MyAllocator::LowLevelArenaReference arena_ref_;
static constexpr int kNumCases =
12; // CPUthreadMatchingStatus({T,F},{T,F}) x RPCMatchingStatus
struct Key {
DeallocationSampleRecord alloc;
DeallocationSampleRecord dealloc;
Key(const DeallocationSampleRecord& alloc,
const DeallocationSampleRecord& dealloc)
: alloc(alloc), dealloc(dealloc) {}
template <typename H>
friend H AbslHashValue(H h, const Key& c) {
return H::combine(std::move(h), c.alloc, c.dealloc);
}
bool operator==(const Key& other) const {
return (alloc == other.alloc) && (dealloc == other.dealloc);
}
};
struct Value {
// for each possible cases, we collect repetition count and avg lifetime
// we also collect the minimum and maximum lifetimes, as well as the sum
// of squares (to calculate the standard deviation).
double counts[kNumCases] = {0.0};
double mean_life_times_ns[kNumCases] = {0.0};
double variance_life_times_ns[kNumCases] = {0.0};
double min_life_times_ns[kNumCases] = {0.0};
double max_life_times_ns[kNumCases] = {0.0};
Value() {
std::fill_n(min_life_times_ns, kNumCases,
std::numeric_limits<double>::max());
}
};
absl::flat_hash_map<Key, Value, absl::Hash<Key>, std::equal_to<Key>,
AllocAdaptor<std::pair<const Key, Value>, MyAllocator>>
table_;
absl::Time start_time_ = absl::Now();
absl::Time stop_time_;
};
// Keep track of allocations that are in flight
AllocsTable allocs_;
// Table to store lifetime information collected by this profiler
std::unique_ptr<DeallocationStackTraceTable> reports_ = nullptr;
public:
explicit DeallocationProfiler(DeallocationProfilerList* list) : list_(list) {
reports_ = std::make_unique<DeallocationStackTraceTable>();
list_->Add(this);
}
~DeallocationProfiler() {
if (reports_ != nullptr) {
Stop();
}
}
const tcmalloc::Profile Stop() {
if (reports_ != nullptr) {
// We first remove the profiler from the list to avoid racing with
// potential allocations which may modify the allocs_ table.
list_->Remove(this);
reports_->StopAndRecord(allocs_);
return tcmalloc_internal::ProfileAccessor::MakeProfile(
std::move(reports_));
}
return tcmalloc::Profile();
}
void ReportMalloc(const tcmalloc_internal::StackTrace& stack_trace) {
// store sampled alloc in the hashmap
DeallocationSampleRecord& allocation =
allocs_[stack_trace.sampled_alloc_handle];
allocation.allocated_size = stack_trace.allocated_size;
allocation.requested_size = stack_trace.requested_size;
allocation.requested_alignment = stack_trace.requested_alignment;
allocation.depth = stack_trace.depth;
memcpy(allocation.stack, stack_trace.stack,
sizeof(void*) * std::min(static_cast<int64_t>(stack_trace.depth),
kMaxStackDepth));
// TODO(mmaas): Do we need to worry about b/65384231 anymore?
allocation.creation_time = stack_trace.allocation_time;
allocation.cpu_id = tcmalloc_internal::subtle::percpu::GetCurrentCpu();
allocation.thread_id = absl::base_internal::GetTID();
// We divide by the requested size to obtain the number of allocations.
// TODO(b/248332543): Consider using AllocatedBytes from sampler.h.
allocation.weight = static_cast<double>(stack_trace.weight) /
(stack_trace.requested_size + 1);
}
void ReportFree(tcmalloc_internal::AllocHandle handle) {
auto it = allocs_.find(handle);
// Handle the case that we observed the deallocation but not the allocation
if (it == allocs_.end()) {
return;
}
DeallocationSampleRecord sample = it->second;
allocs_.erase(it);
DeallocationSampleRecord deallocation;
deallocation.allocated_size = sample.allocated_size;
deallocation.requested_alignment = sample.requested_alignment;
deallocation.requested_size = sample.requested_size;
deallocation.creation_time = absl::Now();
deallocation.cpu_id = tcmalloc_internal::subtle::percpu::GetCurrentCpu();
deallocation.thread_id = absl::base_internal::GetTID();
deallocation.depth =
absl::GetStackTrace(deallocation.stack, kMaxStackDepth, 1);
reports_->AddTrace(sample, deallocation);
}
};
// Pushes `profiler` onto the front of the list and, while still holding the
// list lock, seeds it with every sampled allocation that is currently live.
void DeallocationProfilerList::Add(DeallocationProfiler* profiler) {
  SpinLockHolder h(&profilers_lock_);

  profiler->next_ = first_;
  first_ = profiler;

  // Whenever a new profiler is created, we seed it with live allocations.
  auto seed_profiler =
      [profiler](const tcmalloc_internal::SampledAllocation& live) {
        profiler->ReportMalloc(live.sampled_stack);
      };
  tcmalloc_internal::tc_globals.sampled_allocation_recorder().Iterate(
      seed_profiler);
}
// Unlinks `profiler` from the list. The list is very short and this is nowhere
// near a hot path, so a linear walk is fine. `profiler` must be in the list:
// reaching the end without finding it trips CHECK_CONDITION.
void DeallocationProfilerList::Remove(DeallocationProfiler* profiler) {
  SpinLockHolder h(&profilers_lock_);

  // `link` always addresses the pointer that refers to the node under
  // inspection, so unlinking is a single store once the node is found.
  DeallocationProfiler** link = &first_;
  for (; *link != profiler; link = &(*link)->next_) {
    CHECK_CONDITION(*link != nullptr);
  }
  *link = profiler->next_;
}
// Forwards a sampled allocation to every registered profiler, under the list
// lock.
void DeallocationProfilerList::ReportMalloc(
    const tcmalloc_internal::StackTrace& stack_trace) {
  SpinLockHolder h(&profilers_lock_);
  for (DeallocationProfiler* node = first_; node != nullptr;
       node = node->next_) {
    node->ReportMalloc(stack_trace);
  }
}
// Forwards a deallocation to every registered profiler, under the list lock.
void DeallocationProfilerList::ReportFree(
    tcmalloc_internal::AllocHandle handle) {
  SpinLockHolder h(&profilers_lock_);
  for (DeallocationProfiler* node = first_; node != nullptr;
       node = node->next_) {
    node->ReportFree(handle);
  }
}
// Initialize static variables
// Out-of-class definitions of MyAllocator's static members: no arena and a
// zero reference count to start with, and a constant-initialized spin lock.
absl::base_internal::LowLevelAlloc::Arena*
DeallocationProfiler::MyAllocator::arena_ = nullptr;
uint32_t DeallocationProfiler::MyAllocator::refcount_ = 0;
ABSL_CONST_INIT SpinLock DeallocationProfiler::MyAllocator::arena_lock_(
absl::kConstInit, absl::base_internal::SCHEDULE_KERNEL_ONLY);
void DeallocationProfiler::DeallocationStackTraceTable::StopAndRecord(
const AllocsTable& allocs) {
stop_time_ = absl::Now();
// Insert a dummy DeallocationSampleRecord since the table stores pairs. This
// allows us to make minimal changes to the rest of the sample processing
// steps reducing special casing for censored samples. This also allows us to
// aggregate censored samples just like regular deallocation samples.
const DeallocationSampleRecord censored{
.creation_time = stop_time_,
};
for (const auto& [unused, alloc] : allocs) {
AddTrace(alloc, censored);
}
}
void DeallocationProfiler::DeallocationStackTraceTable::AddTrace(
const DeallocationSampleRecord& alloc_trace,
const DeallocationSampleRecord& dealloc_trace) {
CpuThreadMatchingStatus status =
CpuThreadMatchingStatus(alloc_trace.cpu_id == dealloc_trace.cpu_id,
alloc_trace.thread_id == dealloc_trace.thread_id);
// Initialize a default rpc matched status.
RpcMatchingStatus rpc_status(/*alloc=*/0, /*dealloc=*/0);
const int index = ComputeIndex(status, rpc_status);
DeallocationStackTraceTable::Value& v =
table_[DeallocationStackTraceTable::Key(alloc_trace, dealloc_trace)];
const absl::Duration life_time =
dealloc_trace.creation_time - alloc_trace.creation_time;
double life_time_ns = absl::ToDoubleNanoseconds(life_time);
// Update mean and variance using Welfords online algorithm.
double old_mean_ns = v.mean_life_times_ns[index];
v.mean_life_times_ns[index] +=
(life_time_ns - old_mean_ns) / static_cast<double>(v.counts[index] + 1);
v.variance_life_times_ns[index] +=
(life_time_ns - v.mean_life_times_ns[index]) *
(v.mean_life_times_ns[index] - old_mean_ns);
v.min_life_times_ns[index] =
std::min(v.min_life_times_ns[index], life_time_ns);
v.max_life_times_ns[index] =
std::max(v.max_life_times_ns[index], life_time_ns);
v.counts[index]++;
}
// Emits the collected statistics as Profile::Sample objects. Each populated
// (key, matching case) slot yields one sample carrying the allocation stack
// and a positive count; unless the entry is censored, a second sample follows
// with the deallocation stack and the negated count. Both samples of a pair
// share one profile_id.
void DeallocationProfiler::DeallocationStackTraceTable::Iterate(
    absl::FunctionRef<void(const Profile::Sample&)> func) const {
  uint64_t pair_id = 1;

  for (const auto& [key, stats] : table_) {
    // Report total bytes that are a multiple of the object size.
    const size_t allocated_size = key.alloc.allocated_size;
    // A deallocation record without a stack is the placeholder used for
    // allocations that were still live when profiling stopped.
    const bool censored = (key.dealloc.depth == 0);

    for (const auto& [cpu_thread_status, rpc_status] : kAllCases) {
      const int index = ComputeIndex(cpu_thread_status, rpc_status);
      const double occurrences = stats.counts[index];
      if (occurrences == 0) {
        continue;
      }

      uintptr_t bytes =
          std::lround(occurrences * key.alloc.weight * allocated_size);
      // Round up to a whole number of objects.
      int64_t count = (bytes + allocated_size - 1) / allocated_size;
      int64_t sum = count * allocated_size;

      // The variance should be >= 0, but it's not impossible that it drops
      // below 0 for numerical reasons. We don't want to crash in this case,
      // so we ensure to return 0 if this happens.
      double stddev_life_time_ns =
          sqrt(std::max(0.0, stats.variance_life_times_ns[index] /
                                 static_cast<double>(occurrences)));

      const auto bucketize = internal::LifetimeNsToBucketedDuration;
      Profile::Sample sample{
          .sum = sum,
          .requested_size = key.alloc.requested_size,
          .requested_alignment = key.alloc.requested_alignment,
          .allocated_size = allocated_size,
          .profile_id = pair_id++,
          // Set the is_censored flag so that when we create a proto
          // sample later we can treat the *_lifetime accordingly.
          .is_censored = censored,
          .avg_lifetime = bucketize(stats.mean_life_times_ns[index]),
          .stddev_lifetime = bucketize(stddev_life_time_ns),
          .min_lifetime = bucketize(stats.min_life_times_ns[index]),
          .max_lifetime = bucketize(stats.max_life_times_ns[index])};

      // Only set the cpu and thread matched flags if the sample is not
      // censored.
      if (!sample.is_censored) {
        sample.allocator_deallocator_cpu_matched =
            cpu_thread_status.cpu_matched;
        sample.allocator_deallocator_thread_matched =
            cpu_thread_status.thread_matched;
      }

      // First sample of the pair: the allocation.
      sample.count = count;
      sample.depth = key.alloc.depth;
      std::copy(key.alloc.stack, key.alloc.stack + key.alloc.depth,
                sample.stack);
      func(sample);

      // If this is a right-censored allocation (i.e. we did not observe the
      // deallocation) then do not emit a deallocation sample pair.
      if (sample.is_censored) {
        continue;
      }

      // Second sample of the pair: the deallocation.
      static_assert(
          std::is_signed<decltype(tcmalloc::Profile::Sample::count)>::value,
          "Deallocation samples are tagged with negative count values.");
      sample.count = -count;
      sample.depth = key.dealloc.depth;
      std::copy(key.dealloc.stack, key.dealloc.stack + key.dealloc.depth,
                sample.stack);
      func(sample);
    }
  }
}
// Creates the token together with the DeallocationProfiler it owns; the
// profiler is constructed with `list`.
DeallocationSample::DeallocationSample(DeallocationProfilerList* list)
    : profiler_(std::make_unique<DeallocationProfiler>(list)) {}
// Stops the owned profiler, releases it and returns the collected profile.
// If the profiler has already been released, an empty profile is returned.
tcmalloc::Profile DeallocationSample::Stop() && {
  if (profiler_ == nullptr) {
    return tcmalloc::Profile();
  }
  tcmalloc::Profile collected = profiler_->Stop();
  profiler_.reset();
  return collected;
}
namespace internal {
// Buckets a lifetime given in nanoseconds:
//   * lifetimes of at most 1ns (including negative ones) are clamped to 1ns;
//   * lifetimes between 1ns and 1ms are rounded down to a power of 10;
//   * lifetimes of 1ms and above are rounded down to a whole millisecond.
absl::Duration LifetimeNsToBucketedDuration(double lifetime_ns) {
  if (!(lifetime_ns < 1000000.0)) {
    // Round down to nearest millisecond.
    return absl::Nanoseconds(static_cast<uint64_t>(lifetime_ns / 1000000.0) *
                             1000000L);
  }
  if (lifetime_ns <= 1) {
    // Avoid negatives.  We can't allocate in a negative amount of time or
    // even as quickly as a nanosecond (microbenchmarks of
    // allocation/deallocation in a tight loop are several nanoseconds), so
    // results this small indicate probable clock skew or other confounding
    // factors in the data.
    return absl::Nanoseconds(1);
  }
  // Here 1ns < lifetime_ns < 1ms: pick the largest power of 10 that does not
  // exceed the lifetime.
  uint64_t bucket_ns = 1;
  while (!(lifetime_ns < bucket_ns * 10)) {
    bucket_ns *= 10;
  }
  return absl::Nanoseconds(bucket_ns);
}
} // namespace internal
} // namespace deallocationz
} // namespace tcmalloc
GOOGLE_MALLOC_SECTION_END

View File

@ -0,0 +1,67 @@
// Copyright 2022 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef TCMALLOC_DEALLOCATION_PROFILER_H_
#define TCMALLOC_DEALLOCATION_PROFILER_H_
#include <memory>
#include "absl/base/const_init.h"
#include "absl/base/internal/spinlock.h"
#include "tcmalloc/internal/config.h"
#include "tcmalloc/internal/logging.h"
#include "tcmalloc/malloc_extension.h"
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc {
namespace deallocationz {
class DeallocationProfiler;
// Entry point through which allocation and deallocation events are reported
// and through which DeallocationProfiler instances are added and removed.
// NOTE(review): first_ looks like the head of a linked list of profilers
// guarded by profilers_lock_ -- confirm against the implementation file.
class DeallocationProfilerList {
public:
// constexpr so that an instance can be constant-initialized.
constexpr DeallocationProfilerList() = default;
// Reports an allocation described by stack_trace.
// NOTE(review): presumably forwarded to the registered profilers -- confirm.
void ReportMalloc(const tcmalloc_internal::StackTrace& stack_trace);
// Reports the deallocation of the allocation identified by handle.
void ReportFree(tcmalloc_internal::AllocHandle handle);
// Adds / removes a profiler. The list stores raw pointers only.
void Add(DeallocationProfiler* profiler);
void Remove(DeallocationProfiler* profiler);
private:
DeallocationProfiler* first_ = nullptr;
// Constructed with absl::kConstInit, matching the constexpr constructor above.
absl::base_internal::SpinLock profilers_lock_{
absl::kConstInit, absl::base_internal::SCHEDULE_KERNEL_ONLY};
};
// Profiling token for deallocation profiling. It owns a DeallocationProfiler
// that is created by the constructor and released by Stop().
class DeallocationSample final
: public tcmalloc_internal::AllocationProfilingTokenBase {
public:
// `list` is handed to the DeallocationProfiler created by this token.
explicit DeallocationSample(DeallocationProfilerList* list);
// We define the dtor to ensure it is placed in the desired text section.
~DeallocationSample() override = default;
// Stops profiling and returns the collected profile. Rvalue-qualified: it
// must be invoked on an expiring token, e.g. std::move(token).Stop().
tcmalloc::Profile Stop() && override;
private:
// Null once Stop() has released the profiler.
std::unique_ptr<DeallocationProfiler> profiler_;
};
namespace internal {
// Converts a lifetime in nanoseconds into a bucketed duration: values of at
// most 1ns become 1ns, values below 1ms are rounded down to a power of 10
// nanoseconds, and larger values are rounded down to a whole millisecond.
absl::Duration LifetimeNsToBucketedDuration(double lifetime_ns);
} // namespace internal
} // namespace deallocationz
} // namespace tcmalloc
GOOGLE_MALLOC_SECTION_END
#endif // TCMALLOC_DEALLOCATION_PROFILER_H_

View File

@ -0,0 +1,140 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "tcmalloc/experiment.h"
#include <string.h>
#include <algorithm>
#include <string>
#include "absl/base/macros.h"
#include "absl/strings/match.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
#include "tcmalloc/internal/environment.h"
#include "tcmalloc/internal/logging.h"
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc {
namespace tcmalloc_internal {
namespace {
// Separator between experiment labels inside the environment variables below.
const char kDelimiter = ',';
// Environment variable listing the experiments to enable.
const char kExperiments[] = "BORG_EXPERIMENTS";
// Environment variable listing the experiments to disable.
const char kDisableExperiments[] = "BORG_DISABLE_EXPERIMENTS";
// Special value of the enable list that turns on every known experiment.
constexpr absl::string_view kEnableAll = "enable-all-known-experiments";
// Special value of the disable list that turns off every experiment.
constexpr absl::string_view kDisableAll = "all";
// Looks up the experiment whose name equals `label`. On success stores its id
// in `*exp` and returns true; otherwise returns false and leaves `*exp`
// untouched.
bool LookupExperimentID(absl::string_view label, Experiment* exp) {
  for (const ExperimentConfig& candidate : experiments) {
    if (candidate.name != label) {
      continue;
    }
    *exp = candidate.id;
    return true;
  }
  return false;
}
const bool* GetSelectedExperiments() {
static bool by_id[kNumExperiments];
static const bool* status = [&]() {
const char* active_experiments = thread_safe_getenv(kExperiments);
const char* disabled_experiments = thread_safe_getenv(kDisableExperiments);
return SelectExperiments(by_id,
active_experiments ? active_experiments : "",
disabled_experiments ? disabled_experiments : "");
}();
return status;
}
// Splits `labels` at every kDelimiter and invokes `f` on each piece, in order.
// `f` is always called at least once: an empty input yields a single empty
// token, and a trailing delimiter yields a final empty token.
template <typename F>
void ParseExperiments(absl::string_view labels, F f) {
  absl::string_view::size_type start = 0;
  while (true) {
    const auto delimiter_pos = labels.find(kDelimiter, start);
    if (delimiter_pos == absl::string_view::npos) {
      // Last token: everything that remains.
      f(labels.substr(start));
      return;
    }
    f(labels.substr(start, delimiter_pos - start));
    start = delimiter_pos + 1;
  }
}
} // namespace
const bool* SelectExperiments(bool* buffer, absl::string_view active,
absl::string_view disabled) {
memset(buffer, 0, sizeof(*buffer) * kNumExperiments);
if (active == kEnableAll) {
std::fill(buffer, buffer + kNumExperiments, true);
}
ParseExperiments(active, [buffer](absl::string_view token) {
Experiment id;
if (LookupExperimentID(token, &id)) {
buffer[static_cast<int>(id)] = true;
}
});
if (disabled == kDisableAll) {
memset(buffer, 0, sizeof(*buffer) * kNumExperiments);
}
ParseExperiments(disabled, [buffer](absl::string_view token) {
Experiment id;
if (LookupExperimentID(token, &id)) {
buffer[static_cast<int>(id)] = false;
}
});
return buffer;
}
} // namespace tcmalloc_internal
// Returns whether `exp` is active according to the selection computed from the
// environment. `exp` must be a real experiment id (below kMaxExperimentID).
bool IsExperimentActive(Experiment exp) {
  const int index = static_cast<int>(exp);
  ASSERT(index >= 0);
  ASSERT(exp < Experiment::kMaxExperimentID);
  const bool* selected = tcmalloc_internal::GetSelectedExperiments();
  return selected[index];
}
// Returns the id of the experiment called `name`, or absl::nullopt when no
// known experiment has that name.
absl::optional<Experiment> FindExperimentByName(absl::string_view name) {
  const auto* const last = std::end(experiments);
  const auto* const match = std::find_if(
      std::begin(experiments), last,
      [name](const ExperimentConfig& config) { return name == config.name; });
  if (match == last) {
    return absl::nullopt;
  }
  return match->id;
}
// Invokes `callback` once per known experiment, in table order, passing the
// experiment's name and whether it is currently active.
void WalkExperiments(
    absl::FunctionRef<void(absl::string_view name, bool active)> callback) {
  for (const ExperimentConfig& entry : experiments) {
    const bool active = IsExperimentActive(entry.id);
    callback(entry.name, active);
  }
}
} // namespace tcmalloc
GOOGLE_MALLOC_SECTION_END

View File

@ -0,0 +1,68 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef TCMALLOC_EXPERIMENT_H_
#define TCMALLOC_EXPERIMENT_H_
#include <stddef.h>
#include <string>
#include "absl/functional/function_ref.h"
#include "absl/strings/string_view.h"
#include "absl/types/optional.h"
#include "tcmalloc/experiment_config.h"
#include "tcmalloc/internal/config.h"
// TCMalloc Experiment Controller
//
// This consumes environment variables to decide whether to activate experiments
// to control TCMalloc behavior. It avoids memory allocations when making
// experiment decisions to allow experiments to be used in critical TCMalloc
// initialization paths.
//
// If an experiment is causing difficulty, all experiments can be disabled by
// setting the environment variable:
// BORG_DISABLE_EXPERIMENTS=all *or*
// BORG_DISABLE_EXPERIMENTS=BAD_EXPERIMENT_LABEL
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc {
namespace tcmalloc_internal {
// Number of known experiments; equal to the numeric value of the
// Experiment::kMaxExperimentID sentinel.
constexpr size_t kNumExperiments =
static_cast<size_t>(Experiment::kMaxExperimentID);
// SelectExperiments parses the experiments enumerated by active and disabled
// and updates buffer[experiment_id] accordingly.
//
// buffer must be sized for kMaxExperimentID entries.
//
// Both lists are comma-separated experiment names. The returned pointer is
// buffer itself.
//
// This is exposed for testing purposes only.
const bool* SelectExperiments(bool* buffer, absl::string_view active,
absl::string_view disabled);
} // namespace tcmalloc_internal
// Returns true if experiment `exp` is active. `exp` must be a real experiment,
// i.e. less than Experiment::kMaxExperimentID.
bool IsExperimentActive(Experiment exp);
// Returns the experiment whose name is exactly `name`, or absl::nullopt if
// there is none.
absl::optional<Experiment> FindExperimentByName(absl::string_view name);
// Calls `callback` once for every known experiment with its name and whether
// it is currently active.
void WalkExperiments(
absl::FunctionRef<void(absl::string_view name, bool active)> callback);
} // namespace tcmalloc
GOOGLE_MALLOC_SECTION_END
#endif // TCMALLOC_EXPERIMENT_H_

View File

@ -0,0 +1,55 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef TCMALLOC_EXPERIMENT_CONFIG_H_
#define TCMALLOC_EXPERIMENT_CONFIG_H_
#include "absl/strings/string_view.h"
// Autogenerated by experiments_proto_test --experiments_generate_config=true
namespace tcmalloc {
// Identifiers of the known experiments. Enumerators take consecutive values
// starting at 0, so kMaxExperimentID (kept last) equals the number of real
// experiments and is not itself an experiment.
enum class Experiment : int {
TEST_ONLY_TCMALLOC_POW2_SIZECLASS,
TEST_ONLY_TCMALLOC_SHARDED_TRANSFER_CACHE,
TEST_ONLY_TCMALLOC_GENERIC_SHARDED_TRANSFER_CACHE,
TEST_ONLY_TCMALLOC_USE_HUGE_REGIONS_MORE_OFTEN,
TEST_ONLY_TCMALLOC_SEPARATE_ALLOCS_FOR_FEW_AND_MANY_OBJECTS_SPANS,
TCMALLOC_GENERIC_SHARDED_TRANSFER_CACHE,
TCMALLOC_USE_HUGE_REGIONS_MORE_OFTEN,
TCMALLOC_SEPARATE_ALLOCS_FOR_FEW_AND_MANY_OBJECTS_SPANS2,
// Sentinel; must remain the last enumerator.
kMaxExperimentID,
};
// Associates an experiment id with the textual name used to refer to it.
struct ExperimentConfig {
// Identifier of the experiment.
Experiment id;
// Name under which the experiment is looked up.
absl::string_view name;
};
// clang-format off
// Table of all known experiments: one entry per Experiment enumerator (except
// the kMaxExperimentID sentinel), each named after its enumerator.
inline constexpr ExperimentConfig experiments[] = {
{Experiment::TEST_ONLY_TCMALLOC_POW2_SIZECLASS, "TEST_ONLY_TCMALLOC_POW2_SIZECLASS"},
{Experiment::TEST_ONLY_TCMALLOC_SHARDED_TRANSFER_CACHE, "TEST_ONLY_TCMALLOC_SHARDED_TRANSFER_CACHE"},
{Experiment::TEST_ONLY_TCMALLOC_GENERIC_SHARDED_TRANSFER_CACHE, "TEST_ONLY_TCMALLOC_GENERIC_SHARDED_TRANSFER_CACHE"},
{Experiment::TEST_ONLY_TCMALLOC_USE_HUGE_REGIONS_MORE_OFTEN, "TEST_ONLY_TCMALLOC_USE_HUGE_REGIONS_MORE_OFTEN"},
{Experiment::TEST_ONLY_TCMALLOC_SEPARATE_ALLOCS_FOR_FEW_AND_MANY_OBJECTS_SPANS, "TEST_ONLY_TCMALLOC_SEPARATE_ALLOCS_FOR_FEW_AND_MANY_OBJECTS_SPANS"},
{Experiment::TCMALLOC_GENERIC_SHARDED_TRANSFER_CACHE, "TCMALLOC_GENERIC_SHARDED_TRANSFER_CACHE"},
{Experiment::TCMALLOC_USE_HUGE_REGIONS_MORE_OFTEN, "TCMALLOC_USE_HUGE_REGIONS_MORE_OFTEN"},
{Experiment::TCMALLOC_SEPARATE_ALLOCS_FOR_FEW_AND_MANY_OBJECTS_SPANS2, "TCMALLOC_SEPARATE_ALLOCS_FOR_FEW_AND_MANY_OBJECTS_SPANS2"},
};
// clang-format on
} // namespace tcmalloc
#endif // TCMALLOC_EXPERIMENT_CONFIG_H_

View File

@ -0,0 +1,31 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "tcmalloc/experiment_config.h"
#include "gtest/gtest.h"
namespace tcmalloc {
namespace {
// Verify IDs are non-negative and strictly less than kMaxExperimentID.
// Every entry of the experiments table must carry an id in the half-open range
// [0, kMaxExperimentID).
TEST(ExperimentConfigTest, ValidateIDs) {
  for (const ExperimentConfig& config : experiments) {
    const int numeric_id = static_cast<int>(config.id);
    ASSERT_LE(0, numeric_id);
    ASSERT_LT(config.id, Experiment::kMaxExperimentID);
  }
}
} // namespace
} // namespace tcmalloc

View File

@ -0,0 +1,38 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include "absl/strings/string_view.h"
#include "tcmalloc/experiment.h"
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* d, size_t size) {
const char* data = reinterpret_cast<const char*>(d);
bool buffer[tcmalloc::tcmalloc_internal::kNumExperiments];
absl::string_view active, disabled;
const char* split = static_cast<const char*>(memchr(data, ';', size));
if (split == nullptr) {
active = absl::string_view(data, size);
} else {
active = absl::string_view(data, split - data);
disabled = absl::string_view(split + 1, size - (split - data + 1));
}
tcmalloc::tcmalloc_internal::SelectExperiments(buffer, active, disabled);
return 0;
}

View File

@ -0,0 +1,240 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "tcmalloc/common.h"
#include "tcmalloc/sizemap.h"
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc {
namespace tcmalloc_internal {
// <fixed> is fixed per-size-class overhead due to end-of-span fragmentation
// and other factors. For instance, if we have a 96 byte size class, and use a
// single 8KiB page, then we will hold 85 objects per span, and have 32 bytes
// left over. There is also a fixed component of 48 bytes of TCMalloc metadata
// per span. Together, the fixed overhead would be wasted/allocated =
// (32 + 48) / (8192 - 32) ~= 0.98%.
// There is also a dynamic component to overhead based on mismatches between the
// number of bytes requested and the number of bytes provided by the size class.
// Together they sum to the total overhead; for instance if you asked for a
// 50-byte allocation that rounds up to a 64-byte size class, the dynamic
// overhead would be 28%, and if <fixed> were 22% it would mean (on average)
// 25 bytes of overhead for allocations of that size.
// clang-format off
#if defined(__cpp_aligned_new) && __STDCPP_DEFAULT_NEW_ALIGNMENT__ <= 8
#if TCMALLOC_PAGE_SHIFT == 13
// Power-of-two size classes for TCMALLOC_PAGE_SHIFT == 13, used when
// __cpp_aligned_new is defined and __STDCPP_DEFAULT_NEW_ALIGNMENT__ <= 8.
// Entry 0 is all zeros; the remaining sizes double from 8 bytes up to
// kMaxSize. The trailing percentage on each row is the <fixed> overhead
// described at the top of this file.
static_assert(kMaxSize == 262144, "kMaxSize mismatch");
static const int kCount = 17;
static_assert(kCount <= kNumClasses);
static constexpr SizeClassInfo kExperimentalPow2SizeClassesList[kCount] = {
// <bytes>, <pages>, <batch size> <fixed>
{ 0, 0, 0}, // +Inf%
{ 8, 1, 32}, // 0.59%
{ 16, 1, 32}, // 0.59%
{ 32, 1, 32}, // 0.59%
{ 64, 1, 32}, // 0.59%
{ 128, 1, 32}, // 0.59%
{ 256, 1, 32}, // 0.59%
{ 512, 1, 32}, // 0.59%
{ 1024, 1, 32}, // 0.59%
{ 2048, 2, 32}, // 0.29%
{ 4096, 1, 16}, // 0.59%
{ 8192, 1, 8}, // 0.59%
{ 16384, 2, 4}, // 0.29%
{ 32768, 4, 2}, // 0.15%
{ 65536, 8, 2}, // 0.07%
{ 131072, 16, 2}, // 0.04%
{ 262144, 32, 2}, // 0.02%
};
// Non-owning view over the table above.
constexpr absl::Span<const SizeClassInfo> kExperimentalPow2SizeClasses(kExperimentalPow2SizeClassesList);
#elif TCMALLOC_PAGE_SHIFT == 15
// Power-of-two size classes for TCMALLOC_PAGE_SHIFT == 15, used when
// __cpp_aligned_new is defined and __STDCPP_DEFAULT_NEW_ALIGNMENT__ <= 8.
// Entry 0 is all zeros; the remaining sizes double from 8 bytes up to
// kMaxSize. The trailing percentage on each row is the <fixed> overhead
// described at the top of this file.
static_assert(kMaxSize == 262144, "kMaxSize mismatch");
static const int kCount = 17;
static_assert(kCount <= kNumClasses);
static constexpr SizeClassInfo kExperimentalPow2SizeClassesList[kCount] = {
// <bytes>, <pages>, <batch size> <fixed>
{ 0, 0, 0}, // +Inf%
{ 8, 1, 32}, // 0.15%
{ 16, 1, 32}, // 0.15%
{ 32, 1, 32}, // 0.15%
{ 64, 1, 32}, // 0.15%
{ 128, 1, 32}, // 0.15%
{ 256, 1, 32}, // 0.15%
{ 512, 1, 32}, // 0.15%
{ 1024, 1, 32}, // 0.15%
{ 2048, 1, 32}, // 0.15%
{ 4096, 1, 16}, // 0.15%
{ 8192, 1, 8}, // 0.15%
{ 16384, 1, 4}, // 0.15%
{ 32768, 1, 2}, // 0.15%
{ 65536, 2, 2}, // 0.07%
{ 131072, 4, 2}, // 0.04%
{ 262144, 8, 2}, // 0.02%
};
// Non-owning view over the table above.
constexpr absl::Span<const SizeClassInfo> kExperimentalPow2SizeClasses(kExperimentalPow2SizeClassesList);
#elif TCMALLOC_PAGE_SHIFT == 18
// Power-of-two size classes for TCMALLOC_PAGE_SHIFT == 18, used when
// __cpp_aligned_new is defined and __STDCPP_DEFAULT_NEW_ALIGNMENT__ <= 8.
// Entry 0 is all zeros; the remaining sizes double from 8 bytes up to
// kMaxSize, and every class uses a single page. The trailing percentage on
// each row is the <fixed> overhead described at the top of this file.
static_assert(kMaxSize == 262144, "kMaxSize mismatch");
static const int kCount = 17;
static_assert(kCount <= kNumClasses);
static constexpr SizeClassInfo kExperimentalPow2SizeClassesList[kCount] = {
// <bytes>, <pages>, <batch size> <fixed>
{ 0, 0, 0}, // +Inf%
{ 8, 1, 32}, // 0.02%
{ 16, 1, 32}, // 0.02%
{ 32, 1, 32}, // 0.02%
{ 64, 1, 32}, // 0.02%
{ 128, 1, 32}, // 0.02%
{ 256, 1, 32}, // 0.02%
{ 512, 1, 32}, // 0.02%
{ 1024, 1, 32}, // 0.02%
{ 2048, 1, 32}, // 0.02%
{ 4096, 1, 16}, // 0.02%
{ 8192, 1, 8}, // 0.02%
{ 16384, 1, 4}, // 0.02%
{ 32768, 1, 2}, // 0.02%
{ 65536, 1, 2}, // 0.02%
{ 131072, 1, 2}, // 0.02%
{ 262144, 1, 2}, // 0.02%
};
// Non-owning view over the table above.
constexpr absl::Span<const SizeClassInfo> kExperimentalPow2SizeClasses(kExperimentalPow2SizeClassesList);
#elif TCMALLOC_PAGE_SHIFT == 12
// Power-of-two size classes for TCMALLOC_PAGE_SHIFT == 12, used when
// __cpp_aligned_new is defined and __STDCPP_DEFAULT_NEW_ALIGNMENT__ <= 8.
// In this configuration kMaxSize is 8192, so the table is shorter: entry 0 is
// all zeros and the remaining sizes double from 8 bytes up to 8192. The
// trailing percentage on each row is the <fixed> overhead described at the
// top of this file.
static_assert(kMaxSize == 8192, "kMaxSize mismatch");
static const int kCount = 12;
static_assert(kCount <= kNumClasses);
static constexpr SizeClassInfo kExperimentalPow2SizeClassesList[kCount] = {
// <bytes>, <pages>, <batch size> <fixed>
{ 0, 0, 0}, // +Inf%
{ 8, 1, 32}, // 1.17%
{ 16, 1, 32}, // 1.17%
{ 32, 1, 32}, // 1.17%
{ 64, 1, 32}, // 1.17%
{ 128, 1, 32}, // 1.17%
{ 256, 1, 32}, // 1.17%
{ 512, 1, 32}, // 1.17%
{ 1024, 2, 32}, // 0.59%
{ 2048, 4, 32}, // 0.29%
{ 4096, 4, 16}, // 0.29%
{ 8192, 4, 8}, // 0.29%
};
// Non-owning view over the table above.
constexpr absl::Span<const SizeClassInfo> kExperimentalPow2SizeClasses(kExperimentalPow2SizeClassesList);
#else
#error "Unsupported TCMALLOC_PAGE_SHIFT value!"
#endif
#else
#if TCMALLOC_PAGE_SHIFT == 13
// Power-of-two size classes for TCMALLOC_PAGE_SHIFT == 13, used when the
// condition `defined(__cpp_aligned_new) && __STDCPP_DEFAULT_NEW_ALIGNMENT__ <=
// 8` does not hold. Entry 0 is all zeros; the remaining sizes double from 8
// bytes up to kMaxSize. The trailing percentage on each row is the <fixed>
// overhead described at the top of this file.
static_assert(kMaxSize == 262144, "kMaxSize mismatch");
static const int kCount = 17;
static_assert(kCount <= kNumClasses);
static constexpr SizeClassInfo kExperimentalPow2SizeClassesList[kCount] = {
// <bytes>, <pages>, <batch size> <fixed>
{ 0, 0, 0}, // +Inf%
{ 8, 1, 32}, // 0.59%
{ 16, 1, 32}, // 0.59%
{ 32, 1, 32}, // 0.59%
{ 64, 1, 32}, // 0.59%
{ 128, 1, 32}, // 0.59%
{ 256, 1, 32}, // 0.59%
{ 512, 1, 32}, // 0.59%
{ 1024, 1, 32}, // 0.59%
{ 2048, 2, 32}, // 0.29%
{ 4096, 1, 16}, // 0.59%
{ 8192, 1, 8}, // 0.59%
{ 16384, 2, 4}, // 0.29%
{ 32768, 4, 2}, // 0.15%
{ 65536, 8, 2}, // 0.07%
{ 131072, 16, 2}, // 0.04%
{ 262144, 32, 2}, // 0.02%
};
// Non-owning view over the table above.
constexpr absl::Span<const SizeClassInfo> kExperimentalPow2SizeClasses(kExperimentalPow2SizeClassesList);
#elif TCMALLOC_PAGE_SHIFT == 15
// Power-of-two size classes for TCMALLOC_PAGE_SHIFT == 15, used when the
// condition `defined(__cpp_aligned_new) && __STDCPP_DEFAULT_NEW_ALIGNMENT__ <=
// 8` does not hold. Entry 0 is all zeros; the remaining sizes double from 8
// bytes up to kMaxSize. The trailing percentage on each row is the <fixed>
// overhead described at the top of this file.
static_assert(kMaxSize == 262144, "kMaxSize mismatch");
static const int kCount = 17;
static_assert(kCount <= kNumClasses);
static constexpr SizeClassInfo kExperimentalPow2SizeClassesList[kCount] = {
// <bytes>, <pages>, <batch size> <fixed>
{ 0, 0, 0}, // +Inf%
{ 8, 1, 32}, // 0.15%
{ 16, 1, 32}, // 0.15%
{ 32, 1, 32}, // 0.15%
{ 64, 1, 32}, // 0.15%
{ 128, 1, 32}, // 0.15%
{ 256, 1, 32}, // 0.15%
{ 512, 1, 32}, // 0.15%
{ 1024, 1, 32}, // 0.15%
{ 2048, 1, 32}, // 0.15%
{ 4096, 1, 16}, // 0.15%
{ 8192, 1, 8}, // 0.15%
{ 16384, 1, 4}, // 0.15%
{ 32768, 1, 2}, // 0.15%
{ 65536, 2, 2}, // 0.07%
{ 131072, 4, 2}, // 0.04%
{ 262144, 8, 2}, // 0.02%
};
// Non-owning view over the table above.
constexpr absl::Span<const SizeClassInfo> kExperimentalPow2SizeClasses(kExperimentalPow2SizeClassesList);
#elif TCMALLOC_PAGE_SHIFT == 18
// Power-of-two size classes for TCMALLOC_PAGE_SHIFT == 18, used when the
// condition `defined(__cpp_aligned_new) && __STDCPP_DEFAULT_NEW_ALIGNMENT__ <=
// 8` does not hold. Entry 0 is all zeros; the remaining sizes double from 8
// bytes up to kMaxSize, and every class uses a single page. The trailing
// percentage on each row is the <fixed> overhead described at the top of this
// file.
static_assert(kMaxSize == 262144, "kMaxSize mismatch");
static const int kCount = 17;
static_assert(kCount <= kNumClasses);
static constexpr SizeClassInfo kExperimentalPow2SizeClassesList[kCount] = {
// <bytes>, <pages>, <batch size> <fixed>
{ 0, 0, 0}, // +Inf%
{ 8, 1, 32}, // 0.02%
{ 16, 1, 32}, // 0.02%
{ 32, 1, 32}, // 0.02%
{ 64, 1, 32}, // 0.02%
{ 128, 1, 32}, // 0.02%
{ 256, 1, 32}, // 0.02%
{ 512, 1, 32}, // 0.02%
{ 1024, 1, 32}, // 0.02%
{ 2048, 1, 32}, // 0.02%
{ 4096, 1, 16}, // 0.02%
{ 8192, 1, 8}, // 0.02%
{ 16384, 1, 4}, // 0.02%
{ 32768, 1, 2}, // 0.02%
{ 65536, 1, 2}, // 0.02%
{ 131072, 1, 2}, // 0.02%
{ 262144, 1, 2}, // 0.02%
};
// Non-owning view over the table above.
constexpr absl::Span<const SizeClassInfo> kExperimentalPow2SizeClasses(kExperimentalPow2SizeClassesList);
#elif TCMALLOC_PAGE_SHIFT == 12
// Power-of-two size classes for TCMALLOC_PAGE_SHIFT == 12, used when the
// condition `defined(__cpp_aligned_new) && __STDCPP_DEFAULT_NEW_ALIGNMENT__ <=
// 8` does not hold. In this configuration kMaxSize is 8192, so the table is
// shorter: entry 0 is all zeros and the remaining sizes double from 8 bytes up
// to 8192. The trailing percentage on each row is the <fixed> overhead
// described at the top of this file.
static_assert(kMaxSize == 8192, "kMaxSize mismatch");
static const int kCount = 12;
static_assert(kCount <= kNumClasses);
static constexpr SizeClassInfo kExperimentalPow2SizeClassesList[kCount] = {
// <bytes>, <pages>, <batch size> <fixed>
{ 0, 0, 0}, // +Inf%
{ 8, 1, 32}, // 1.17%
{ 16, 1, 32}, // 1.17%
{ 32, 1, 32}, // 1.17%
{ 64, 1, 32}, // 1.17%
{ 128, 1, 32}, // 1.17%
{ 256, 1, 32}, // 1.17%
{ 512, 1, 32}, // 1.17%
{ 1024, 2, 32}, // 0.59%
{ 2048, 4, 32}, // 0.29%
{ 4096, 4, 16}, // 0.29%
{ 8192, 4, 8}, // 0.29%
};
// Non-owning view over the table above.
constexpr absl::Span<const SizeClassInfo> kExperimentalPow2SizeClasses(kExperimentalPow2SizeClassesList);
#else
#error "Unsupported TCMALLOC_PAGE_SHIFT value!"
#endif
#endif
// clang-format on
} // namespace tcmalloc_internal
} // namespace tcmalloc
GOOGLE_MALLOC_SECTION_END

View File

@ -0,0 +1,62 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef TCMALLOC_EXPLICITLY_CONSTRUCTED_H_
#define TCMALLOC_EXPLICITLY_CONSTRUCTED_H_
#include <stdint.h>
#include <utility>
#include "tcmalloc/internal/config.h"
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc {
namespace tcmalloc_internal {
// Suitably sized and aligned raw storage for a T that is constructed only when
// Construct() is called; this wrapper never runs T's destructor. It is
// particularly useful for global variables: nothing is constructed at program
// start-up and nothing is destroyed at exit, which sidesteps initialization
// order problems between globals.
//
// Pay special attention to the initialization state of the object.
// 1. The object is "uninitialized" to begin with.
// 2. Call Construct() only if the object is uninitialized. After the call, the
//    object becomes "initialized".
// 3. Call get_mutable() only if the object is initialized.
template <typename T>
class ExplicitlyConstructed {
 public:
  // Constructs the T in place, forwarding `args` to its constructor.
  template <typename... Args>
  void Construct(Args&&... args) {
    void* const storage = &union_;
    new (storage) T(std::forward<Args>(args)...);
  }

  // Returns the constructed object. Only valid after Construct().
  T& get_mutable() { return *reinterpret_cast<T*>(&union_); }

 private:
  // Raw storage; the extra members only raise the alignment of the union to at
  // least that of int64_t and of a pointer.
  union AlignedUnion {
    constexpr AlignedUnion() = default;
    alignas(T) char space[sizeof(T)];
    int64_t align_to_int64;
    void* align_to_ptr;
  } union_;
};
} // namespace tcmalloc_internal
} // namespace tcmalloc
GOOGLE_MALLOC_SECTION_END
#endif // TCMALLOC_EXPLICITLY_CONSTRUCTED_H_

View File

@ -0,0 +1,800 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "tcmalloc/global_stats.h"
#include "absl/strings/match.h"
#include "absl/strings/strip.h"
#include "tcmalloc/central_freelist.h"
#include "tcmalloc/common.h"
#include "tcmalloc/cpu_cache.h"
#include "tcmalloc/experiment.h"
#include "tcmalloc/guarded_page_allocator.h"
#include "tcmalloc/internal/logging.h"
#include "tcmalloc/internal/memory_stats.h"
#include "tcmalloc/page_allocator.h"
#include "tcmalloc/page_heap.h"
#include "tcmalloc/page_heap_allocator.h"
#include "tcmalloc/pagemap.h"
#include "tcmalloc/pages.h"
#include "tcmalloc/parameters.h"
#include "tcmalloc/sampled_allocation.h"
#include "tcmalloc/sampler.h"
#include "tcmalloc/span.h"
#include "tcmalloc/static_vars.h"
#include "tcmalloc/stats.h"
#include "tcmalloc/system-alloc.h"
#include "tcmalloc/thread_cache.h"
#include "tcmalloc/transfer_cache.h"
#include "tcmalloc/transfer_cache_stats.h"
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc {
namespace tcmalloc_internal {
// Get stats into "r". Also, if class_count != nullptr, class_count[k]
// will be set to the total number of objects of size class k in the
// central cache, transfer cache, and per-thread and per-CPU caches.
// If span_stats is non-null, span_stats[k] is set to the span statistics of
// the central freelist for size class k.
// If small_spans is non-null, it is filled. Same for large_spans.
// The boolean report_residence determines whether residence information
// should be captured or not. Residence info requires a potentially
// costly OS call, and is not necessary in all situations.
void ExtractStats(TCMallocStats* r, uint64_t* class_count,
SpanStats* span_stats, SmallSpanStats* small_spans,
LargeSpanStats* large_spans, bool report_residence) {
// Accumulate the bytes held by the central freelists and transfer caches
// over all size classes.
r->central_bytes = 0;
r->transfer_bytes = 0;
for (int size_class = 0; size_class < kNumClasses; ++size_class) {
const size_t length = tc_globals.central_freelist(size_class).length();
const size_t tc_length = tc_globals.transfer_cache().tc_length(size_class);
const size_t cache_overhead =
tc_globals.central_freelist(size_class).OverheadBytes();
const size_t size = tc_globals.sizemap().class_to_size(size_class);
r->central_bytes += (size * length) + cache_overhead;
r->transfer_bytes += (size * tc_length);
if (class_count) {
// Sum the lengths of all per-class freelists, except the per-thread
// freelists, which get counted when we call GetThreadStats(), below.
class_count[size_class] = length + tc_length;
if (UsePerCpuCache(tc_globals)) {
class_count[size_class] +=
tc_globals.cpu_cache().TotalObjectsOfClass(size_class);
}
}
if (span_stats) {
span_stats[size_class] =
tc_globals.central_freelist(size_class).GetSpanStats();
}
}
// Add stats from per-thread heaps
r->thread_bytes = 0;
{ // scope
// Everything in this scope is read while holding pageheap_lock.
absl::base_internal::SpinLockHolder h(&pageheap_lock);
ThreadCache::GetThreadStats(&r->thread_bytes, class_count);
r->tc_stats = ThreadCache::HeapStats();
r->span_stats = tc_globals.span_allocator().stats();
r->stack_stats = tc_globals.sampledallocation_allocator().stats();
r->linked_sample_stats = tc_globals.linked_sample_allocator().stats();
r->metadata_bytes = tc_globals.metadata_bytes();
r->pagemap_bytes = tc_globals.pagemap().bytes();
r->pageheap = tc_globals.page_allocator().stats();
r->peak_stats = tc_globals.page_allocator().peak_stats();
if (small_spans != nullptr) {
tc_globals.page_allocator().GetSmallSpanStats(small_spans);
}
if (large_spans != nullptr) {
tc_globals.page_allocator().GetLargeSpanStats(large_spans);
}
r->arena = tc_globals.arena().stats();
// Without residence information, the arena's non-resident bytes are counted
// as metadata.
if (!report_residence) {
r->metadata_bytes += r->arena.bytes_nonresident;
}
}
// We can access the pagemap without holding the pageheap_lock since it
// is static data, and we are only taking address and size which are
// constants.
if (report_residence) {
auto resident_bytes = tc_globals.pagemap_residence();
r->pagemap_root_bytes_res = resident_bytes;
ASSERT(r->metadata_bytes >= r->pagemap_bytes);
// Replace the pagemap's contribution to metadata_bytes by its resident size.
r->metadata_bytes = r->metadata_bytes - r->pagemap_bytes + resident_bytes;
} else {
r->pagemap_root_bytes_res = 0;
}
// Per-CPU figures stay at zero unless the per-CPU cache is in use.
r->per_cpu_bytes = 0;
r->sharded_transfer_bytes = 0;
r->percpu_metadata_bytes_res = 0;
r->percpu_metadata_bytes = 0;
if (UsePerCpuCache(tc_globals)) {
r->per_cpu_bytes = tc_globals.cpu_cache().TotalUsedBytes();
r->sharded_transfer_bytes =
tc_globals.sharded_transfer_cache().TotalBytes();
if (report_residence) {
auto percpu_metadata = tc_globals.cpu_cache().MetadataMemoryUsage();
r->percpu_metadata_bytes_res = percpu_metadata.resident_size;
r->percpu_metadata_bytes = percpu_metadata.virtual_size;
ASSERT(r->metadata_bytes >= r->percpu_metadata_bytes);
// Replace the virtual size of the per-CPU metadata by its resident size.
r->metadata_bytes = r->metadata_bytes - r->percpu_metadata_bytes +
r->percpu_metadata_bytes_res;
}
}
}
// Fills `r` via ExtractStats() without requesting per-class counts or any of
// the span statistics.
void ExtractTCMallocStats(TCMallocStats* r, bool report_residence) {
  ExtractStats(r, /*class_count=*/nullptr, /*span_stats=*/nullptr,
               /*small_spans=*/nullptr, /*large_spans=*/nullptr,
               report_residence);
}
// Saturating subtraction: returns a - b, or 0 when b exceeds a.
//
// Different fields of the stats are computed from state protected by different
// locks, so they may be mutually inconsistent; clamping at zero keeps an
// underflow from showing up as a gigantic value.
static uint64_t StatSub(uint64_t a, uint64_t b) {
  if (a < b) {
    return 0;
  }
  return a - b;
}
// Return approximate number of bytes in use by app: the bytes obtained from
// the system minus everything held in caches, free or unmapped (clamped at 0).
uint64_t InUseByApp(const TCMallocStats& stats) {
  const uint64_t not_in_use =
      stats.thread_bytes + stats.central_bytes + stats.transfer_bytes +
      stats.per_cpu_bytes + stats.sharded_transfer_bytes +
      stats.pageheap.free_bytes + stats.pageheap.unmapped_bytes;
  return StatSub(stats.pageheap.system_bytes, not_in_use);
}
// Returns the sum of the page heap's system bytes, the metadata bytes and the
// arena's unallocated, unavailable and non-resident bytes.
uint64_t VirtualMemoryUsed(const TCMallocStats& stats) {
  uint64_t total = stats.pageheap.system_bytes;
  total += stats.metadata_bytes;
  total += stats.arena.bytes_unallocated;
  total += stats.arena.bytes_unavailable;
  total += stats.arena.bytes_nonresident;
  return total;
}
// Returns the page heap's unmapped bytes plus the arena's non-resident bytes.
uint64_t UnmappedBytes(const TCMallocStats& stats) {
  const uint64_t pageheap_unmapped = stats.pageheap.unmapped_bytes;
  const uint64_t arena_nonresident = stats.arena.bytes_nonresident;
  return pageheap_unmapped + arena_nonresident;
}
// Returns the virtual memory used minus the unmapped bytes, clamped at zero.
uint64_t PhysicalMemoryUsed(const TCMallocStats& stats) {
  const uint64_t virtual_bytes = VirtualMemoryUsed(stats);
  const uint64_t unmapped = UnmappedBytes(stats);
  return StatSub(virtual_bytes, unmapped);
}
// Bytes that are either in use by the application or fragmented such that
// they cannot be (arbitrarily) reused: physical memory used minus the page
// heap's free bytes, saturating at zero.
uint64_t RequiredBytes(const TCMallocStats& stats) {
  const uint64_t physical = PhysicalMemoryUsed(stats);
  return StatSub(physical, stats.pageheap.free_bytes);
}
// Sum of the bytes held outside the application: page-heap free bytes, the
// central, per-CPU, sharded-transfer, transfer and thread caches, metadata,
// and the metadata arena's unavailable and unallocated bytes.
size_t ExternalBytes(const TCMallocStats& stats) {
  uint64_t total = stats.pageheap.free_bytes;
  total += stats.central_bytes;
  total += stats.per_cpu_bytes;
  total += stats.sharded_transfer_bytes;
  total += stats.transfer_bytes;
  total += stats.thread_bytes;
  total += stats.metadata_bytes;
  total += stats.arena.bytes_unavailable;
  total += stats.arena.bytes_unallocated;
  return total;
}
// System bytes of the backing store minus its unmapped bytes, saturating at
// zero.
size_t HeapSizeBytes(const BackingStats& stats) {
  const uint64_t from_system = stats.system_bytes;
  const uint64_t unmapped = stats.unmapped_bytes;
  return StatSub(from_system, unmapped);
}
// Bytes held in the thread, per-CPU and sharded transfer caches.
size_t LocalBytes(const TCMallocStats& stats) {
  uint64_t total = stats.thread_bytes;
  total += stats.per_cpu_bytes;
  total += stats.sharded_transfer_bytes;
  return total;
}
// Free plus unmapped bytes of the backing store.
size_t SlackBytes(const BackingStats& stats) {
  const auto free_part = stats.free_bytes;
  const auto unmapped_part = stats.unmapped_bytes;
  return free_part + unmapped_part;
}
// Returns the number of CPUs in the calling thread's affinity mask, or 0 if
// the mask cannot be queried.
static int CountAllowedCpus() {
  cpu_set_t mask;
  const int rc = sched_getaffinity(/*pid=*/0, sizeof(mask), &mask);
  return rc == 0 ? CPU_COUNT(&mask) : 0;
}
// Writes a human-readable statistics report to `out`.
//
// The summary (memory breakdown, object counts, experiments, sampled profile
// totals and process-wide RSS/VSS when available) is always printed. With
// `level >= 2` the report additionally contains per-size-class freelist
// totals, span utilization histograms, the state of every cache and page
// allocator, and the current values of the tunable parameters.
void DumpStats(Printer* out, int level) {
  TCMallocStats stats;
  // Only filled in (and only read) when level >= 2.
  uint64_t class_count[kNumClasses];
  SpanStats span_stats[kNumClasses];
  if (level >= 2) {
    ExtractStats(&stats, class_count, span_stats, nullptr, nullptr, true);
  } else {
    ExtractTCMallocStats(&stats, true);
  }
  static const double MiB = 1048576.0;
  out->printf(
      "See https://github.com/google/tcmalloc/tree/master/docs/stats.md for an explanation of "
      "this page\n");
  const uint64_t virtual_memory_used = VirtualMemoryUsed(stats);
  const uint64_t physical_memory_used = PhysicalMemoryUsed(stats);
  const uint64_t unmapped_bytes = UnmappedBytes(stats);
  const uint64_t bytes_in_use_by_app = InUseByApp(stats);
  // Guard this subtraction like the other derived stats: if the estimated
  // in-use bytes exceed the backed bytes, an unsigned wrap-around would
  // otherwise be reported as an absurd fragmentation percentage.
  const uint64_t peak_overhead_bytes =
      StatSub(stats.peak_stats.backed_bytes,
              stats.peak_stats.sampled_application_bytes);
#ifdef TCMALLOC_SMALL_BUT_SLOW
  out->printf("NOTE:  SMALL MEMORY MODEL IS IN USE, PERFORMANCE MAY SUFFER.\n");
#endif
  // The arguments below are matched positionally against the conversions in
  // the format string; keep both lists in the same order when editing.
  // clang-format off
  // Avoid clang-format complaining about the way that this text is laid out.
  out->printf(
      "------------------------------------------------\n"
      "MALLOC:   %12u (%7.1f MiB) Bytes in use by application\n"
      "MALLOC: + %12u (%7.1f MiB) Bytes in page heap freelist\n"
      "MALLOC: + %12u (%7.1f MiB) Bytes in central cache freelist\n"
      "MALLOC: + %12u (%7.1f MiB) Bytes in per-CPU cache freelist\n"
      "MALLOC: + %12u (%7.1f MiB) Bytes in Sharded cache freelist\n"
      "MALLOC: + %12u (%7.1f MiB) Bytes in transfer cache freelist\n"
      "MALLOC: + %12u (%7.1f MiB) Bytes in thread cache freelists\n"
      "MALLOC: + %12u (%7.1f MiB) Bytes in malloc metadata\n"
      "MALLOC: + %12u (%7.1f MiB) Bytes in malloc metadata Arena unallocated\n"
      "MALLOC: + %12u (%7.1f MiB) Bytes in malloc metadata Arena unavailable\n"
      "MALLOC:   ------------\n"
      "MALLOC: = %12u (%7.1f MiB) Actual memory used (physical + swap)\n"
      "MALLOC: + %12u (%7.1f MiB) Bytes released to OS (aka unmapped)\n"
      "MALLOC:   ------------\n"
      "MALLOC: = %12u (%7.1f MiB) Virtual address space used\n"
      "MALLOC:\n"
      "MALLOC:   %12u               Spans in use\n"
      "MALLOC:   %12u (%7.1f MiB) Spans created\n"
      "MALLOC:   %12u               Thread heaps in use\n"
      "MALLOC:   %12u (%7.1f MiB) Thread heaps created\n"
      "MALLOC:   %12u               Stack traces in use\n"
      "MALLOC:   %12u (%7.1f MiB) Stack traces created\n"
      "MALLOC:   %12u               Table buckets in use\n"
      "MALLOC:   %12u (%7.1f MiB) Table buckets created\n"
      "MALLOC:   %12u (%7.1f MiB) Pagemap bytes used\n"
      "MALLOC:   %12u (%7.1f MiB) Pagemap root resident bytes\n"
      "MALLOC:   %12u (%7.1f MiB) per-CPU slab bytes used\n"
      "MALLOC:   %12u (%7.1f MiB) per-CPU slab resident bytes\n"
      "MALLOC:   %12u (%7.1f MiB) malloc metadata Arena non-resident bytes\n"
      "MALLOC:   %12u (%7.1f MiB) Actual memory used at peak\n"
      "MALLOC:   %12u (%7.1f MiB) Estimated in-use at peak\n"
      "MALLOC:   %12.4f               Realized fragmentation (%%)\n"
      "MALLOC:   %12u               Tcmalloc page size\n"
      "MALLOC:   %12u               Tcmalloc hugepage size\n"
      "MALLOC:   %12u               CPUs Allowed in Mask\n"
      "MALLOC:   %12u               Arena blocks\n",
      bytes_in_use_by_app, bytes_in_use_by_app / MiB,
      stats.pageheap.free_bytes, stats.pageheap.free_bytes / MiB,
      stats.central_bytes, stats.central_bytes / MiB,
      stats.per_cpu_bytes, stats.per_cpu_bytes / MiB,
      stats.sharded_transfer_bytes, stats.sharded_transfer_bytes / MiB,
      stats.transfer_bytes, stats.transfer_bytes / MiB,
      stats.thread_bytes, stats.thread_bytes / MiB,
      stats.metadata_bytes, stats.metadata_bytes / MiB,
      stats.arena.bytes_unallocated, stats.arena.bytes_unallocated / MiB,
      stats.arena.bytes_unavailable, stats.arena.bytes_unavailable / MiB,
      physical_memory_used, physical_memory_used / MiB,
      unmapped_bytes, unmapped_bytes / MiB,
      virtual_memory_used, virtual_memory_used / MiB,
      uint64_t(stats.span_stats.in_use),
      uint64_t(stats.span_stats.total),
      (stats.span_stats.total * sizeof(Span)) / MiB,
      uint64_t(stats.tc_stats.in_use),
      uint64_t(stats.tc_stats.total),
      (stats.tc_stats.total * sizeof(ThreadCache)) / MiB,
      uint64_t(stats.stack_stats.in_use),
      uint64_t(stats.stack_stats.total),
      (stats.stack_stats.total * sizeof(StackTrace)) / MiB,
      uint64_t(stats.linked_sample_stats.in_use),
      uint64_t(stats.linked_sample_stats.total),
      (stats.linked_sample_stats.total * sizeof(StackTraceTable::LinkedSample)) / MiB,
      uint64_t(stats.pagemap_bytes),
      stats.pagemap_bytes / MiB,
      stats.pagemap_root_bytes_res, stats.pagemap_root_bytes_res / MiB,
      uint64_t(stats.percpu_metadata_bytes),
      stats.percpu_metadata_bytes / MiB,
      stats.percpu_metadata_bytes_res, stats.percpu_metadata_bytes_res / MiB,
      stats.arena.bytes_nonresident, stats.arena.bytes_nonresident / MiB,
      uint64_t(stats.peak_stats.backed_bytes),
      stats.peak_stats.backed_bytes / MiB,
      uint64_t(stats.peak_stats.sampled_application_bytes),
      stats.peak_stats.sampled_application_bytes / MiB,
      100. * safe_div(peak_overhead_bytes, stats.peak_stats.sampled_application_bytes),
      uint64_t(kPageSize),
      uint64_t(kHugePageSize),
      CountAllowedCpus(),
      stats.arena.blocks
  );
  // clang-format on
  out->printf("MALLOC EXPERIMENTS:");
  WalkExperiments([&](absl::string_view name, bool active) {
    const char* value = active ? "1" : "0";
    out->printf(" %s=%s", name, value);
  });
  out->printf("\n");
  out->printf(
      "MALLOC SAMPLED PROFILES: %zu bytes (current), %zu bytes (internal "
      "fragmentation), %zu bytes (peak), %zu count (total)\n",
      static_cast<size_t>(tc_globals.sampled_objects_size_.value()),
      tc_globals.sampled_internal_fragmentation_.value(),
      tc_globals.peak_heap_tracker().CurrentPeakSize(),
      tc_globals.total_sampled_count_.value());
  // Process-wide numbers are printed only when they can be obtained.
  MemoryStats memstats;
  if (GetMemoryStats(&memstats)) {
    uint64_t rss = memstats.rss;
    uint64_t vss = memstats.vss;
    // clang-format off
    out->printf(
        "\n"
        "Total process stats (inclusive of non-malloc sources):\n"
        "TOTAL: %12u (%7.1f MiB) Bytes resident (physical memory used)\n"
        "TOTAL: %12u (%7.1f MiB) Bytes mapped (virtual memory used)\n",
        rss, rss / MiB, vss, vss / MiB);
    // clang-format on
  }
  out->printf(
      "------------------------------------------------\n"
      "Call ReleaseMemoryToSystem() to release freelist memory to the OS"
      " (via madvise()).\n"
      "Bytes released to the OS take up virtual address space"
      " but no physical memory.\n");
  if (level >= 2) {
    out->printf("------------------------------------------------\n");
    out->printf("Total size of freelists for per-thread and per-CPU caches,\n");
    out->printf("transfer cache, and central cache, as well as number of\n");
    out->printf("live pages, returned/requested spans by size class\n");
    out->printf("------------------------------------------------\n");
    uint64_t cumulative = 0;
    // The loop starts at size class 1; class 0 is not reported.
    for (int size_class = 1; size_class < kNumClasses; ++size_class) {
      uint64_t class_bytes = class_count[size_class] *
                             tc_globals.sizemap().class_to_size(size_class);
      cumulative += class_bytes;
      out->printf(
          // clang-format off
          "class %3d [ %8zu bytes ] : %8u objs; %5.1f MiB; %6.1f cum MiB; "
          "%8u live pages; spans: %10zu ret / %10zu req = %5.4f;\n",
          // clang-format on
          size_class, tc_globals.sizemap().class_to_size(size_class),
          class_count[size_class], class_bytes / MiB, cumulative / MiB,
          span_stats[size_class].num_live_spans() *
              tc_globals.sizemap().class_to_pages(size_class),
          span_stats[size_class].num_spans_returned,
          span_stats[size_class].num_spans_requested,
          span_stats[size_class].prob_returned());
    }
#ifndef TCMALLOC_SMALL_BUT_SLOW
    out->printf("------------------------------------------------\n");
    out->printf("Central cache freelist: Span utilization histogram\n");
    out->printf("Non-cumulative number of spans with allocated objects < N\n");
    out->printf("------------------------------------------------\n");
    for (int size_class = 1; size_class < kNumClasses; ++size_class) {
      tc_globals.central_freelist(size_class).PrintSpanUtilStats(out);
    }
#endif
    tc_globals.transfer_cache().Print(out);
    tc_globals.sharded_transfer_cache().Print(out);
    if (UsePerCpuCache(tc_globals)) {
      tc_globals.cpu_cache().Print(out);
    }
    tc_globals.page_allocator().Print(out, MemoryTag::kNormal);
    // The second normal partition is reported only when more than one NUMA
    // partition is active.
    if (tc_globals.numa_topology().active_partitions() > 1) {
      tc_globals.page_allocator().Print(out, MemoryTag::kNormalP1);
    }
    tc_globals.page_allocator().Print(out, MemoryTag::kSampled);
    tc_globals.page_allocator().Print(out, MemoryTag::kCold);
    tc_globals.guardedpage_allocator().Print(out);
    uint64_t limit_bytes;
    bool is_hard;
    std::tie(limit_bytes, is_hard) = tc_globals.page_allocator().limit();
    out->printf("PARAMETER desired_usage_limit_bytes %u %s\n", limit_bytes,
                is_hard ? "(hard)" : "");
    out->printf("Number of times limit was hit: %lld\n",
                tc_globals.page_allocator().limit_hits());
    out->printf("PARAMETER tcmalloc_per_cpu_caches %d\n",
                Parameters::per_cpu_caches() ? 1 : 0);
    out->printf("PARAMETER tcmalloc_max_per_cpu_cache_size %d\n",
                Parameters::max_per_cpu_cache_size());
    out->printf("PARAMETER tcmalloc_max_total_thread_cache_bytes %lld\n",
                Parameters::max_total_thread_cache_bytes());
    out->printf("PARAMETER malloc_release_bytes_per_sec %llu\n",
                Parameters::background_release_rate());
    out->printf(
        "PARAMETER tcmalloc_skip_subrelease_interval %s\n",
        absl::FormatDuration(Parameters::filler_skip_subrelease_interval()));
    out->printf("PARAMETER tcmalloc_skip_subrelease_short_interval %s\n",
                absl::FormatDuration(
                    Parameters::filler_skip_subrelease_short_interval()));
    out->printf("PARAMETER tcmalloc_skip_subrelease_long_interval %s\n",
                absl::FormatDuration(
                    Parameters::filler_skip_subrelease_long_interval()));
    out->printf("PARAMETER flat vcpus %d\n",
                subtle::percpu::UsingFlatVirtualCpus() ? 1 : 0);
    out->printf("PARAMETER tcmalloc_shuffle_per_cpu_caches %d\n",
                Parameters::shuffle_per_cpu_caches() ? 1 : 0);
    out->printf("PARAMETER tcmalloc_partial_transfer_cache %d\n",
                Parameters::partial_transfer_cache() ? 1 : 0);
    out->printf(
        "PARAMETER tcmalloc_separate_allocs_for_few_and_many_objects_spans "
        "%d\n",
        Parameters::separate_allocs_for_few_and_many_objects_spans());
  }
}
// Writes allocator statistics to `out` through a PbtxtRegion rooted at kTop.
//
// The summary fields are always emitted; the per-size-class "freelist"
// entries and the transfer/CPU cache sections are emitted only when
// `level >= 2`. The page allocator sections, limits, GWP-ASan section and
// parameter values are emitted at every level.
void DumpStatsInPbtxt(Printer* out, int level) {
TCMallocStats stats;
// Only filled in (and only read) when level >= 2.
uint64_t class_count[kNumClasses];
SpanStats span_stats[kNumClasses];
if (level >= 2) {
ExtractStats(&stats, class_count, span_stats, nullptr, nullptr, true);
} else {
ExtractTCMallocStats(&stats, true);
}
const uint64_t bytes_in_use_by_app = InUseByApp(stats);
const uint64_t virtual_memory_used = VirtualMemoryUsed(stats);
const uint64_t physical_memory_used = PhysicalMemoryUsed(stats);
const uint64_t unmapped_bytes = UnmappedBytes(stats);
PbtxtRegion region(out, kTop);
region.PrintI64("in_use_by_app", bytes_in_use_by_app);
region.PrintI64("page_heap_freelist", stats.pageheap.free_bytes);
region.PrintI64("central_cache_freelist", stats.central_bytes);
region.PrintI64("per_cpu_cache_freelist", stats.per_cpu_bytes);
region.PrintI64("sharded_transfer_cache_freelist",
stats.sharded_transfer_bytes);
region.PrintI64("transfer_cache_freelist", stats.transfer_bytes);
region.PrintI64("thread_cache_freelists", stats.thread_bytes);
region.PrintI64("malloc_metadata", stats.metadata_bytes);
region.PrintI64("malloc_metadata_arena_unavailable",
stats.arena.bytes_unavailable);
region.PrintI64("malloc_metadata_arena_unallocated",
stats.arena.bytes_unallocated);
region.PrintI64("actual_mem_used", physical_memory_used);
region.PrintI64("unmapped", unmapped_bytes);
region.PrintI64("virtual_address_space_used", virtual_memory_used);
region.PrintI64("num_spans", uint64_t(stats.span_stats.in_use));
region.PrintI64("num_spans_created", uint64_t(stats.span_stats.total));
region.PrintI64("num_thread_heaps", uint64_t(stats.tc_stats.in_use));
region.PrintI64("num_thread_heaps_created", uint64_t(stats.tc_stats.total));
region.PrintI64("num_stack_traces", uint64_t(stats.stack_stats.in_use));
region.PrintI64("num_stack_traces_created",
uint64_t(stats.stack_stats.total));
region.PrintI64("num_table_buckets",
uint64_t(stats.linked_sample_stats.in_use));
region.PrintI64("num_table_buckets_created",
uint64_t(stats.linked_sample_stats.total));
region.PrintI64("pagemap_size", uint64_t(stats.pagemap_bytes));
region.PrintI64("pagemap_root_residence", stats.pagemap_root_bytes_res);
region.PrintI64("percpu_slab_size", stats.percpu_metadata_bytes);
region.PrintI64("percpu_slab_residence", stats.percpu_metadata_bytes_res);
region.PrintI64("peak_backed", stats.peak_stats.backed_bytes);
region.PrintI64("peak_application_demand",
stats.peak_stats.sampled_application_bytes);
region.PrintI64("tcmalloc_page_size", uint64_t(kPageSize));
region.PrintI64("tcmalloc_huge_page_size", uint64_t(kHugePageSize));
region.PrintI64("cpus_allowed", CountAllowedCpus());
region.PrintI64("arena_blocks", stats.arena.blocks);
// The braces bound the lifetime of the "sampled_profiles" sub-region object.
{
auto sampled_profiles = region.CreateSubRegion("sampled_profiles");
sampled_profiles.PrintI64("current_bytes",
tc_globals.sampled_objects_size_.value());
sampled_profiles.PrintI64(
"current_fragmentation_bytes",
tc_globals.sampled_internal_fragmentation_.value());
sampled_profiles.PrintI64("peak_bytes",
tc_globals.peak_heap_tracker().CurrentPeakSize());
}
// Print total process stats (inclusive of non-malloc sources).
MemoryStats memstats;
if (GetMemoryStats(&memstats)) {
region.PrintI64("total_resident", uint64_t(memstats.rss));
region.PrintI64("total_mapped", uint64_t(memstats.vss));
}
region.PrintI64("total_sampled_count",
tc_globals.total_sampled_count_.value());
if (level >= 2) {
{
// Per-size-class entries are compiled out entirely in the
// TCMALLOC_SMALL_BUT_SLOW configuration. The loop starts at class 1.
#ifndef TCMALLOC_SMALL_BUT_SLOW
for (int size_class = 1; size_class < kNumClasses; ++size_class) {
uint64_t class_bytes = class_count[size_class] *
tc_globals.sizemap().class_to_size(size_class);
PbtxtRegion entry = region.CreateSubRegion("freelist");
entry.PrintI64("sizeclass",
tc_globals.sizemap().class_to_size(size_class));
entry.PrintI64("bytes", class_bytes);
entry.PrintI64("num_spans_requested",
span_stats[size_class].num_spans_requested);
entry.PrintI64("num_spans_returned",
span_stats[size_class].num_spans_returned);
entry.PrintI64("obj_capacity", span_stats[size_class].obj_capacity);
tc_globals.central_freelist(size_class)
.PrintSpanUtilStatsInPbtxt(&entry);
}
#endif
}
tc_globals.transfer_cache().PrintInPbtxt(&region);
tc_globals.sharded_transfer_cache().PrintInPbtxt(&region);
region.PrintRaw("transfer_cache_implementation",
TransferCacheImplementationToLabel(
tc_globals.transfer_cache().implementation()));
if (UsePerCpuCache(tc_globals)) {
tc_globals.cpu_cache().PrintInPbtxt(&region);
}
}
tc_globals.page_allocator().PrintInPbtxt(&region, MemoryTag::kNormal);
// The second normal partition is reported only when more than one NUMA
// partition is active.
if (tc_globals.numa_topology().active_partitions() > 1) {
tc_globals.page_allocator().PrintInPbtxt(&region, MemoryTag::kNormalP1);
}
tc_globals.page_allocator().PrintInPbtxt(&region, MemoryTag::kSampled);
tc_globals.page_allocator().PrintInPbtxt(&region, MemoryTag::kCold);
// We do not collect tracking information in pbtxt.
size_t limit_bytes;
bool is_hard;
std::tie(limit_bytes, is_hard) = tc_globals.page_allocator().limit();
region.PrintI64("desired_usage_limit_bytes", limit_bytes);
region.PrintBool("hard_limit", is_hard);
region.PrintI64("limit_hits", tc_globals.page_allocator().limit_hits());
// The braces bound the lifetime of the "gwp_asan" sub-region object.
{
auto gwp_asan = region.CreateSubRegion("gwp_asan");
tc_globals.guardedpage_allocator().PrintInPbtxt(&gwp_asan);
}
region.PrintI64("memory_release_failures", SystemReleaseErrors());
region.PrintBool("tcmalloc_per_cpu_caches", Parameters::per_cpu_caches());
region.PrintI64("tcmalloc_max_per_cpu_cache_size",
Parameters::max_per_cpu_cache_size());
region.PrintI64("tcmalloc_max_total_thread_cache_bytes",
Parameters::max_total_thread_cache_bytes());
region.PrintI64("malloc_release_bytes_per_sec",
static_cast<int64_t>(Parameters::background_release_rate()));
// Durations are reported as integer nanoseconds.
region.PrintI64(
"tcmalloc_skip_subrelease_interval_ns",
absl::ToInt64Nanoseconds(Parameters::filler_skip_subrelease_interval()));
region.PrintI64("tcmalloc_skip_subrelease_short_interval_ns",
absl::ToInt64Nanoseconds(
Parameters::filler_skip_subrelease_short_interval()));
region.PrintI64("tcmalloc_skip_subrelease_long_interval_ns",
absl::ToInt64Nanoseconds(
Parameters::filler_skip_subrelease_long_interval()));
region.PrintBool("tcmalloc_shuffle_per_cpu_caches",
Parameters::shuffle_per_cpu_caches());
region.PrintI64("profile_sampling_rate", Parameters::profile_sampling_rate());
region.PrintRaw("percpu_vcpu_type",
subtle::percpu::UsingFlatVirtualCpus() ? "FLAT" : "NONE");
region.PrintBool("tcmalloc_partial_transfer_cache",
Parameters::partial_transfer_cache());
region.PrintI64("separate_allocs_for_few_and_many_objects_spans",
Parameters::separate_allocs_for_few_and_many_objects_spans());
}
bool GetNumericProperty(const char* name_data, size_t name_size,
size_t* value) {
// LINT.IfChange
ASSERT(name_data != nullptr);
ASSERT(value != nullptr);
const absl::string_view name(name_data, name_size);
// This is near the top since ReleasePerCpuMemoryToOS() calls it frequently.
if (name == "tcmalloc.per_cpu_caches_active") {
*value = tc_globals.CpuCacheActive();
return true;
}
if (name == "generic.virtual_memory_used") {
TCMallocStats stats;
ExtractTCMallocStats(&stats, false);
*value = VirtualMemoryUsed(stats);
return true;
}
if (name == "generic.physical_memory_used") {
TCMallocStats stats;
ExtractTCMallocStats(&stats, false);
*value = PhysicalMemoryUsed(stats);
return true;
}
if (name == "generic.current_allocated_bytes" ||
name == "generic.bytes_in_use_by_app") {
TCMallocStats stats;
ExtractTCMallocStats(&stats, false);
*value = InUseByApp(stats);
return true;
}
if (name == "generic.peak_memory_usage") {
TCMallocStats stats;
ExtractTCMallocStats(&stats, false);
*value = static_cast<uint64_t>(stats.peak_stats.sampled_application_bytes);
return true;
}
if (name == "generic.realized_fragmentation") {
TCMallocStats stats;
ExtractTCMallocStats(&stats, false);
*value = static_cast<uint64_t>(
100. * safe_div(stats.peak_stats.backed_bytes -
stats.peak_stats.sampled_application_bytes,
stats.peak_stats.sampled_application_bytes));
return true;
}
if (name == "generic.heap_size") {
absl::base_internal::SpinLockHolder l(&pageheap_lock);
BackingStats stats = tc_globals.page_allocator().stats();
*value = HeapSizeBytes(stats);
return true;
}
if (name == "tcmalloc.central_cache_free") {
TCMallocStats stats;
ExtractTCMallocStats(&stats, false);
*value = stats.central_bytes;
return true;
}
if (name == "tcmalloc.cpu_free") {
TCMallocStats stats;
ExtractTCMallocStats(&stats, false);
*value = stats.per_cpu_bytes;
return true;
}
if (name == "tcmalloc.sharded_transfer_cache_free") {
TCMallocStats stats;
ExtractTCMallocStats(&stats, false);
*value = stats.sharded_transfer_bytes;
return true;
}
if (name == "tcmalloc.slack_bytes") {
// Kept for backwards compatibility. Now defined externally as:
// pageheap_free_bytes + pageheap_unmapped_bytes.
absl::base_internal::SpinLockHolder l(&pageheap_lock);
BackingStats stats = tc_globals.page_allocator().stats();
*value = SlackBytes(stats);
return true;
}
if (name == "tcmalloc.pageheap_free_bytes" ||
name == "tcmalloc.page_heap_free") {
absl::base_internal::SpinLockHolder l(&pageheap_lock);
*value = tc_globals.page_allocator().stats().free_bytes;
return true;
}
if (name == "tcmalloc.pageheap_unmapped_bytes" ||
name == "tcmalloc.page_heap_unmapped") {
absl::base_internal::SpinLockHolder l(&pageheap_lock);
// Arena non-resident bytes aren't on the page heap, but they are unmapped.
*value = tc_globals.page_allocator().stats().unmapped_bytes +
tc_globals.arena().stats().bytes_nonresident;
return true;
}
if (name == "tcmalloc.sampled_internal_fragmentation") {
*value = tc_globals.sampled_internal_fragmentation_.value();
return true;
}
if (name == "tcmalloc.page_algorithm") {
absl::base_internal::SpinLockHolder l(&pageheap_lock);
*value = tc_globals.page_allocator().algorithm();
return true;
}
if (name == "tcmalloc.max_total_thread_cache_bytes") {
absl::base_internal::SpinLockHolder l(&pageheap_lock);
*value = ThreadCache::overall_thread_cache_size();
return true;
}
if (name == "tcmalloc.current_total_thread_cache_bytes" ||
name == "tcmalloc.thread_cache_free") {
TCMallocStats stats;
ExtractTCMallocStats(&stats, false);
*value = stats.thread_bytes;
return true;
}
if (name == "tcmalloc.thread_cache_count") {
TCMallocStats stats;
ExtractTCMallocStats(&stats, false);
*value = stats.tc_stats.in_use;
return true;
}
if (name == "tcmalloc.local_bytes") {
TCMallocStats stats;
ExtractTCMallocStats(&stats, false);
*value = LocalBytes(stats);
return true;
}
if (name == "tcmalloc.external_fragmentation_bytes") {
TCMallocStats stats;
ExtractTCMallocStats(&stats, false);
*value = ExternalBytes(stats);
return true;
}
if (name == "tcmalloc.metadata_bytes") {
TCMallocStats stats;
ExtractTCMallocStats(&stats, true);
*value = stats.metadata_bytes;
return true;
}
if (name == "tcmalloc.transfer_cache_free") {
TCMallocStats stats;
ExtractTCMallocStats(&stats, false);
*value = stats.transfer_bytes;
return true;
}
bool want_hard_limit = (name == "tcmalloc.hard_usage_limit_bytes");
if (want_hard_limit || name == "tcmalloc.desired_usage_limit_bytes") {
size_t amount;
bool is_hard;
std::tie(amount, is_hard) = tc_globals.page_allocator().limit();
if (want_hard_limit != is_hard) {
amount = std::numeric_limits<size_t>::max();
}
*value = amount;
return true;
}
if (name == "tcmalloc.required_bytes") {
TCMallocStats stats;
ExtractTCMallocStats(&stats, false);
*value = RequiredBytes(stats);
return true;
}
const absl::string_view kExperimentPrefix = "tcmalloc.experiment.";
if (absl::StartsWith(name, kExperimentPrefix)) {
absl::optional<Experiment> exp =
FindExperimentByName(absl::StripPrefix(name, kExperimentPrefix));
if (exp.has_value()) {
*value = IsExperimentActive(*exp) ? 1 : 0;
return true;
}
}
// LINT.ThenChange(//depot/google3/tcmalloc/malloc_extension_test.cc)
return false;
}
} // namespace tcmalloc_internal
} // namespace tcmalloc
GOOGLE_MALLOC_SECTION_END

View File

@ -0,0 +1,82 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef TCMALLOC_GLOBAL_STATS_H_
#define TCMALLOC_GLOBAL_STATS_H_
#include <cstdint>
#include "tcmalloc/internal/logging.h"
#include "tcmalloc/page_allocator.h"
#include "tcmalloc/span_stats.h"
#include "tcmalloc/stats.h"
#include "tcmalloc/transfer_cache_stats.h"
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc {
namespace tcmalloc_internal {
// Snapshot of allocator-wide statistics, filled in by ExtractStats() /
// ExtractTCMallocStats() and consumed by the reporting helpers declared
// below. The defaulted constructor does not initialize the scalar fields.
struct TCMallocStats {
uint64_t thread_bytes; // Bytes in thread caches
uint64_t central_bytes; // Bytes in central cache
uint64_t transfer_bytes; // Bytes in central transfer cache
uint64_t metadata_bytes; // Bytes allocated for metadata
uint64_t sharded_transfer_bytes; // Bytes in per-CCX cache
uint64_t per_cpu_bytes; // Bytes in per-CPU cache
uint64_t pagemap_root_bytes_res; // Resident bytes of pagemap root node
uint64_t percpu_metadata_bytes_res; // Resident bytes of the per-CPU metadata
AllocatorStats tc_stats; // ThreadCache objects
AllocatorStats span_stats; // Span objects
AllocatorStats stack_stats; // StackTrace objects
AllocatorStats linked_sample_stats; // StackTraceTable::LinkedSample objects
size_t pagemap_bytes; // included in metadata bytes
size_t percpu_metadata_bytes; // included in metadata bytes
BackingStats pageheap; // Stats from page heap
PageAllocator::PeakStats peak_stats; // Backed and sampled application bytes at peak
ArenaStats arena; // Stats from the metadata Arena
// Explicitly declare the ctor to put it in the google_malloc section.
TCMallocStats() = default;
};
// Fills `r` with allocator-wide statistics; the remaining pointer arguments
// are optional outputs and may be nullptr.
// NOTE(review): the calls visible in global_stats.cc pass six arguments (no
// `tc_stats`), which does not match this seven-parameter declaration.
// Confirm against the definition whether `tc_stats` is stale.
void ExtractStats(TCMallocStats* r, uint64_t* class_count,
SpanStats* span_stats, SmallSpanStats* small_spans,
LargeSpanStats* large_spans, TransferCacheStats* tc_stats,
bool report_residence);
// Same as ExtractStats() with every optional output omitted.
void ExtractTCMallocStats(TCMallocStats* r, bool report_residence);
// Derived quantities computed from a TCMallocStats snapshot. Subtractions
// saturate at zero because the underlying fields may be inconsistent.
uint64_t InUseByApp(const TCMallocStats& stats);
uint64_t VirtualMemoryUsed(const TCMallocStats& stats);
uint64_t UnmappedBytes(const TCMallocStats& stats);
uint64_t PhysicalMemoryUsed(const TCMallocStats& stats);
uint64_t RequiredBytes(const TCMallocStats& stats);
size_t ExternalBytes(const TCMallocStats& stats);
// system_bytes - unmapped_bytes of the backing store (saturating).
size_t HeapSizeBytes(const BackingStats& stats);
// Bytes in the thread, per-CPU and sharded transfer caches.
size_t LocalBytes(const TCMallocStats& stats);
// free_bytes + unmapped_bytes of the backing store.
size_t SlackBytes(const BackingStats& stats);
// Write stats to "out": as human-readable text (DumpStats) or in pbtxt form
// (DumpStatsInPbtxt). `level >= 2` adds per-size-class and per-cache detail.
void DumpStats(Printer* out, int level);
void DumpStatsInPbtxt(Printer* out, int level);
// Looks up the property named by `name_data[0, name_size)`; on success stores
// it in `*value` and returns true, otherwise returns false.
bool GetNumericProperty(const char* name_data, size_t name_size, size_t* value);
} // namespace tcmalloc_internal
} // namespace tcmalloc
GOOGLE_MALLOC_SECTION_END
#endif // TCMALLOC_GLOBAL_STATS_H_

View File

@ -0,0 +1,569 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "tcmalloc/guarded_page_allocator.h"
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>
#include <algorithm>
#include <array>
#include <cmath>
#include <csignal>
#include <tuple>
#include <utility>
#include "absl/base/call_once.h"
#include "absl/base/internal/spinlock.h"
#include "absl/base/internal/sysinfo.h"
#include "absl/debugging/stacktrace.h"
#include "absl/numeric/bits.h"
#include "absl/strings/string_view.h"
#include "tcmalloc/common.h"
#include "tcmalloc/internal/environment.h"
#include "tcmalloc/internal/logging.h"
#include "tcmalloc/internal/page_size.h"
#include "tcmalloc/internal/util.h"
#include "tcmalloc/pagemap.h"
#include "tcmalloc/sampler.h"
#include "tcmalloc/static_vars.h"
#include "tcmalloc/system-alloc.h"
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc {
namespace tcmalloc_internal {
// Out-of-class definition of the static constant; its value is presumably
// given at the in-class declaration (not visible in this file).
const size_t GuardedPageAllocator::kMagicSize; // NOLINT
// Configures the allocator with `total_pages` slots, of which at most
// `max_alloced_pages` may be allocated at once, then maps the backing region
// via MapPages(). The argument relationships are enforced with
// CHECK_CONDITION.
void GuardedPageAllocator::Init(size_t max_alloced_pages, size_t total_pages) {
  CHECK_CONDITION(max_alloced_pages > 0);
  CHECK_CONDITION(max_alloced_pages <= total_pages);
  CHECK_CONDITION(total_pages <= kGpaMaxPages);
  total_pages_ = total_pages;
  max_alloced_pages_ = max_alloced_pages;

  // mprotect operates on full system pages only, so when the system page
  // size is larger than kPageSize (this happens on PPC) the allocator has to
  // use the system page size instead.
  const size_t system_page_size = static_cast<size_t>(GetPageSize());
  page_size_ = std::max(kPageSize, system_page_size);
  ASSERT(page_size_ % kPageSize == 0);

  // Seed the RNG with this object's address.
  rand_ = reinterpret_cast<uint64_t>(this);
  MapPages();
}
// Unmaps the guarded region and marks the allocator uninitialized. Does
// nothing if the allocator is not currently initialized.
void GuardedPageAllocator::Destroy() {
  absl::base_internal::SpinLockHolder h(&guarded_page_lock_);
  if (!initialized_) {
    return;
  }
  const size_t region_bytes = pages_end_addr_ - pages_base_addr_;
  int err = munmap(reinterpret_cast<void*>(pages_base_addr_), region_bytes);
  ASSERT(err != -1);
  (void)err;  // Only read by ASSERT.
  initialized_ = false;
}
// Attempts a guarded allocation of `size` bytes with the given `alignment`
// (0, or a power of two no larger than the slot page size).
//
// Returns the pointer together with a GuardedStatus:
//   - TooSmall          if `size` is 0,
//   - NoAvailableSlots  if no slot could be reserved,
//   - MProtectFailed    if the slot page could not be made accessible,
//   - Guarded           on success.
// The pointer is nullptr in every case except Guarded.
GuardedPageAllocator::AllocWithStatus GuardedPageAllocator::Allocate(
size_t size, size_t alignment) {
if (size == 0) {
return {nullptr, Profile::Sample::GuardedStatus::TooSmall};
}
ssize_t free_slot = ReserveFreeSlot();
// All slots are reserved.
if (free_slot == -1) {
return {nullptr, Profile::Sample::GuardedStatus::NoAvailableSlots};
}
ASSERT(size <= page_size_);
ASSERT(alignment <= page_size_);
ASSERT(alignment == 0 || absl::has_single_bit(alignment));
void* result = reinterpret_cast<void*>(SlotToAddr(free_slot));
// Make the slot's page readable and writable before handing it out.
if (mprotect(result, page_size_, PROT_READ | PROT_WRITE) == -1) {
ASSERT(false && "mprotect failed");
// Failure path: count the failure and give the reserved slot back, both
// under the allocator lock.
absl::base_internal::SpinLockHolder h(&guarded_page_lock_);
num_failed_allocations_++;
FreeSlot(free_slot);
return {nullptr, Profile::Sample::GuardedStatus::MProtectFailed};
}
// Place some allocations at end of page for better overflow detection.
MaybeRightAlign(free_slot, size, alignment, &result);
// Record stack trace.
SlotMetadata& d = data_[free_slot];
// A zero depth marks the slot as having no deallocation trace yet.
d.dealloc_trace.depth = 0;
// NOTE(review): skip_count presumably drops the allocator-internal frames,
// so it depends on the exact call depth here -- confirm before refactoring.
d.alloc_trace.depth = absl::GetStackTrace(d.alloc_trace.stack, kMaxStackDepth,
/*skip_count=*/3);
d.alloc_trace.tid = absl::base_internal::GetTID();
d.requested_size = size;
d.allocation_start = reinterpret_cast<uintptr_t>(result);
ASSERT(!alignment || d.allocation_start % alignment == 0);
return {result, Profile::Sample::GuardedStatus::Guarded};
}
// Releases the guarded allocation at `ptr`, which must belong to this
// allocator. The slot's page is made inaccessible again; if a double free or
// a write overflow is detected, the function deliberately faults instead of
// returning.
void GuardedPageAllocator::Deallocate(void* ptr) {
ASSERT(PointerIsMine(ptr));
const uintptr_t page_addr = GetPageAddr(reinterpret_cast<uintptr_t>(ptr));
size_t slot = AddrToSlot(page_addr);
absl::base_internal::SpinLockHolder h(&guarded_page_lock_);
// Classify the error, if any, before the page is protected below.
if (IsFreed(slot)) {
double_free_detected_ = true;
} else if (WriteOverflowOccurred(slot)) {
write_overflow_detected_ = true;
}
CHECK_CONDITION(mprotect(reinterpret_cast<void*>(page_addr), page_size_,
PROT_NONE) != -1);
// The page is PROT_NONE at this point, so the write below faults on purpose.
if (write_overflow_detected_ || double_free_detected_) {
*reinterpret_cast<char*>(ptr) = 'X'; // Trigger SEGV handler.
CHECK_CONDITION(false); // Unreachable.
}
// Record stack trace.
GpaStackTrace& trace = data_[slot].dealloc_trace;
// NOTE(review): skip_count presumably drops the allocator-internal frames,
// so it depends on the exact call depth here -- confirm before refactoring.
trace.depth = absl::GetStackTrace(trace.stack, kMaxStackDepth,
/*skip_count=*/2);
trace.tid = absl::base_internal::GetTID();
FreeSlot(slot);
}
// Returns the size that was requested for the allocation whose page contains
// `ptr`. `ptr` must belong to this allocator.
size_t GuardedPageAllocator::GetRequestedSize(const void* ptr) const {
  ASSERT(PointerIsMine(ptr));
  const uintptr_t page_addr = GetPageAddr(reinterpret_cast<uintptr_t>(ptr));
  const size_t slot = AddrToSlot(page_addr);
  return data_[slot].requested_size;
}
// For the slot nearest to `ptr`, returns the offset of `ptr` from the start
// of that slot's allocation together with the allocation's requested size.
// `ptr` must belong to this allocator.
std::pair<off_t, size_t> GuardedPageAllocator::GetAllocationOffsetAndSize(
    const void* ptr) const {
  ASSERT(PointerIsMine(ptr));
  const uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
  const auto& nearest = data_[GetNearestSlot(addr)];
  return {addr - nearest.allocation_start, nearest.requested_size};
}
// Copies the allocation and deallocation stack traces of the slot nearest to
// `ptr` into `*alloc_trace` and `*dealloc_trace`, and returns the error type
// that GetErrorType() derives for that address and slot. `ptr` must belong to
// this allocator.
GuardedPageAllocator::ErrorType GuardedPageAllocator::GetStackTraces(
    const void* ptr, GpaStackTrace* alloc_trace,
    GpaStackTrace* dealloc_trace) const {
  ASSERT(PointerIsMine(ptr));
  const uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
  const auto& nearest = data_[GetNearestSlot(addr)];
  *alloc_trace = nearest.alloc_trace;
  *dealloc_trace = nearest.dealloc_trace;
  return GetErrorType(addr, nearest);
}
// Guarded samples are taken during periodic profiling samples. Returns the
// mean number of profiled samples made for every guarded sample: the guarded
// sampling rate divided by the profile sampling rate, rounded up. When the
// guarded rate is negative or the profile rate is not positive, the guarded
// rate is returned unchanged.
static int GetChainedRate() {
  const auto guarded_rate = Parameters::guarded_sampling_rate();
  const auto sample_rate = Parameters::profile_sampling_rate();
  const bool can_divide = !(guarded_rate < 0 || sample_rate <= 0);
  if (can_divide) {
    const double ratio =
        static_cast<double>(guarded_rate) / static_cast<double>(sample_rate);
    return std::ceil(ratio);
  }
  return guarded_rate;
}
// Prints the GWP-ASan status block (allocation counters, slot usage and the
// chained sampling rate) to `out` while holding the allocator lock.
void GuardedPageAllocator::Print(Printer* out) {
  absl::base_internal::SpinLockHolder h(&guarded_page_lock_);
  const auto successful = num_allocation_requests_ - num_failed_allocations_;
  const auto not_allocated = total_pages_ - num_alloced_pages_;
  const int chained_rate = GetChainedRate();
  out->printf(
      "\n"
      "------------------------------------------------\n"
      "GWP-ASan Status\n"
      "------------------------------------------------\n"
      "Successful Allocations: %zu\n"
      "Failed Allocations: %zu\n"
      "Slots Currently Allocated: %zu\n"
      "Slots Currently Quarantined: %zu\n"
      "Maximum Slots Allocated: %zu / %zu\n"
      "PARAMETER tcmalloc_guarded_sample_parameter %d\n",
      successful, num_failed_allocations_, num_alloced_pages_, not_allocated,
      num_alloced_pages_max_, max_alloced_pages_, chained_rate);
}
// Emits the same counters as Print(), as integer fields of the given pbtxt
// region.  All counters are read while holding guarded_page_lock_.
void GuardedPageAllocator::PrintInPbtxt(PbtxtRegion* gwp_asan) {
  absl::base_internal::SpinLockHolder h(&guarded_page_lock_);
  const size_t successful_allocations =
      num_allocation_requests_ - num_failed_allocations_;
  const size_t slots_not_allocated = total_pages_ - num_alloced_pages_;

  gwp_asan->PrintI64("successful_allocations", successful_allocations);
  gwp_asan->PrintI64("failed_allocations", num_failed_allocations_);
  gwp_asan->PrintI64("current_slots_allocated", num_alloced_pages_);
  gwp_asan->PrintI64("current_slots_quarantined", slots_not_allocated);
  gwp_asan->PrintI64("max_slots_allocated", num_alloced_pages_max_);
  gwp_asan->PrintI64("allocated_slot_limit", max_alloced_pages_);
  gwp_asan->PrintI64("tcmalloc_guarded_sample_parameter", GetChainedRate());
}
// Maps 2 * total_pages_ + 1 pages so that there are total_pages_ unique pages
// we can return from Allocate with guard pages before and after them.
void GuardedPageAllocator::MapPages() {
absl::base_internal::SpinLockHolder h(&guarded_page_lock_);
ASSERT(!first_page_addr_);
ASSERT(page_size_ % GetPageSize() == 0);
size_t len = (2 * total_pages_ + 1) * page_size_;
auto base_addr = reinterpret_cast<uintptr_t>(
MmapAligned(len, page_size_, MemoryTag::kSampled));
ASSERT(base_addr);
if (!base_addr) return;
// Tell TCMalloc's PageMap about the memory we own.
const PageId page = PageIdContaining(reinterpret_cast<void*>(base_addr));
const Length page_len = BytesToLengthFloor(len);
if (!tc_globals.pagemap().Ensure(page, page_len)) {
ASSERT(false && "Failed to notify page map of page-guarded memory.");
return;
}
// Allocate memory for slot metadata.
data_ = reinterpret_cast<SlotMetadata*>(
tc_globals.arena().Alloc(sizeof(*data_) * total_pages_));
for (size_t i = 0; i < total_pages_; ++i) {
new (&data_[i]) SlotMetadata;
}
pages_base_addr_ = base_addr;
pages_end_addr_ = pages_base_addr_ + len;
// Align first page to page_size_.
first_page_addr_ = GetPageAddr(pages_base_addr_ + page_size_);
std::fill_n(free_pages_, total_pages_, true);
initialized_ = true;
}
// Selects a random slot in O(total_pages_) time.
ssize_t GuardedPageAllocator::ReserveFreeSlot() {
absl::base_internal::SpinLockHolder h(&guarded_page_lock_);
if (!initialized_ || !allow_allocations_) return -1;
num_allocation_requests_++;
if (num_alloced_pages_ == max_alloced_pages_) {
num_failed_allocations_++;
return -1;
}
rand_ = Sampler::NextRandom(rand_);
size_t num_free_pages = total_pages_ - num_alloced_pages_;
size_t slot = GetIthFreeSlot(rand_ % num_free_pages);
ASSERT(free_pages_[slot]);
free_pages_[slot] = false;
num_alloced_pages_++;
num_alloced_pages_max_ = std::max(num_alloced_pages_, num_alloced_pages_max_);
return slot;
}
// Scans free_pages_ from the start and returns the index of the free slot that
// has exactly ith_free_slot free slots before it.  The caller must pass an
// index smaller than the number of free slots; the scan has no other bound.
size_t GuardedPageAllocator::GetIthFreeSlot(size_t ith_free_slot) {
  ASSERT(ith_free_slot < total_pages_ - num_alloced_pages_);
  size_t still_to_skip = ith_free_slot;
  size_t index = 0;
  while (true) {
    if (free_pages_[index]) {
      if (still_to_skip == 0) return index;
      --still_to_skip;
    }
    ++index;
  }
}
// Returns a reserved slot to the free pool and decrements the allocated-slot
// count.  The slot must currently be reserved.
void GuardedPageAllocator::FreeSlot(size_t slot) {
  ASSERT(slot < total_pages_);
  ASSERT(!free_pages_[slot]);
  --num_alloced_pages_;
  free_pages_[slot] = true;
}
// Rounds addr down by clearing the bits selected by (page_size_ - 1).
uintptr_t GuardedPageAllocator::GetPageAddr(uintptr_t addr) const {
  const uintptr_t in_page_bits = page_size_ - 1ULL;
  return addr & ~in_page_bits;
}
// Maps addr to an address on the closest usable (non-guard) page.  Addresses
// before the first usable page or past the start of the last usable page are
// clamped to those pages' start addresses.  An address on a guard page is
// shifted by half a page towards whichever neighbour is closer.
uintptr_t GuardedPageAllocator::GetNearestValidPage(uintptr_t addr) const {
  if (addr < first_page_addr_) return first_page_addr_;

  const uintptr_t last_page_addr =
      first_page_addr_ + 2 * (total_pages_ - 1) * page_size_;
  if (addr > last_page_addr) return last_page_addr;

  // Usable and guard pages alternate, starting with a usable page at offset 0.
  const uintptr_t offset = addr - first_page_addr_;
  const bool on_guard_page = (offset / page_size_) % 2 != 0;
  if (!on_guard_page) return addr;

  // Lower half of the guard page -> previous usable page; upper half -> next.
  const size_t half_page = page_size_ / 2;
  const bool in_lower_half = (offset / half_page) % 2 == 0;
  return in_lower_half ? addr - half_page : addr + half_page;
}
// Resolves addr to the index of the slot whose usable page is closest to it.
size_t GuardedPageAllocator::GetNearestSlot(uintptr_t addr) const {
  const uintptr_t on_valid_page = GetNearestValidPage(addr);
  const uintptr_t page_start = GetPageAddr(on_valid_page);
  return AddrToSlot(page_start);
}
// Reports whether the given slot is currently marked free in free_pages_.
bool GuardedPageAllocator::IsFreed(size_t slot) const {
  const bool slot_is_free = free_pages_[slot];
  return slot_is_free;
}
bool GuardedPageAllocator::WriteOverflowOccurred(size_t slot) const {
if (!ShouldRightAlign(slot)) return false;
uint8_t magic = GetWriteOverflowMagic(slot);
uintptr_t alloc_end =
data_[slot].allocation_start + data_[slot].requested_size;
uintptr_t page_end = SlotToAddr(slot) + page_size_;
uintptr_t magic_end = std::min(page_end, alloc_end + kMagicSize);
for (uintptr_t p = alloc_end; p < magic_end; ++p) {
if (*reinterpret_cast<uint8_t*>(p) != magic) return true;
}
return false;
}
// Classifies an access at addr against the metadata of its nearest slot.  The
// checks are ordered by priority: never-allocated slot, recorded double free,
// recorded write overflow, freed slot, then address below / at-or-past the
// allocation.  An address inside a live allocation yields kUnknown.
GuardedPageAllocator::ErrorType GuardedPageAllocator::GetErrorType(
    uintptr_t addr, const SlotMetadata& d) const {
  const uintptr_t alloc_begin = d.allocation_start;
  if (alloc_begin == 0) return ErrorType::kUnknown;
  if (double_free_detected_) return ErrorType::kDoubleFree;
  if (write_overflow_detected_) return ErrorType::kBufferOverflowOnDealloc;
  if (d.dealloc_trace.depth != 0) return ErrorType::kUseAfterFree;
  if (addr < alloc_begin) return ErrorType::kBufferUnderflow;

  const bool past_end = addr >= alloc_begin + d.requested_size;
  return past_end ? ErrorType::kBufferOverflow : ErrorType::kUnknown;
}
// Returns the start address of the usable page backing the given slot.
// Consecutive slots are two pages apart because a guard page separates them.
uintptr_t GuardedPageAllocator::SlotToAddr(size_t slot) const {
  ASSERT(slot < total_pages_);
  const size_t slot_stride = 2 * page_size_;
  return first_page_addr_ + slot * slot_stride;
}
// Converts the start address of a usable page into its slot index.
//
// addr must be page_size_-aligned relative to first_page_addr_ and must name a
// usable page (an even page index), not a guard page.
//
// The index is kept in size_t throughout: the previous `int` temporary
// narrowed the quotient and was then compared against the unsigned
// total_pages_.  An address below first_page_addr_ wraps to a huge offset and
// is rejected by the range assertion.
size_t GuardedPageAllocator::AddrToSlot(uintptr_t addr) const {
  const uintptr_t offset = addr - first_page_addr_;
  ASSERT(offset % page_size_ == 0);
  ASSERT((offset / page_size_) % 2 == 0);
  const size_t slot = offset / page_size_ / 2;
  ASSERT(slot < total_pages_);
  return slot;
}
void GuardedPageAllocator::MaybeRightAlign(size_t slot, size_t size,
size_t alignment, void** ptr) {
if (!ShouldRightAlign(slot)) return;
uintptr_t adjusted_ptr =
reinterpret_cast<uintptr_t>(*ptr) + page_size_ - size;
// If alignment == 0, the necessary alignment is never larger than the size
// rounded up to the next power of 2. We use this fact to minimize alignment
// padding between the end of small allocations and their guard pages.
//
// For allocations larger than the greater of kAlignment and
// __STDCPP_DEFAULT_NEW_ALIGNMENT__, we're safe aligning to that value.
size_t default_alignment =
std::min(absl::bit_ceil(size),
std::max(static_cast<size_t>(kAlignment),
static_cast<size_t>(__STDCPP_DEFAULT_NEW_ALIGNMENT__)));
// Ensure valid alignment.
alignment = std::max(alignment, default_alignment);
uintptr_t alignment_padding = adjusted_ptr & (alignment - 1);
adjusted_ptr -= alignment_padding;
// Write magic bytes in alignment padding to detect small overflow writes.
size_t magic_size = std::min(alignment_padding, kMagicSize);
memset(reinterpret_cast<void*>(adjusted_ptr + size),
GetWriteOverflowMagic(slot), magic_size);
*ptr = reinterpret_cast<void*>(adjusted_ptr);
}
// Appends a one-line warning naming the error to the file given by the
// TEST_WARNINGS_OUTPUT_FILE environment variable.  Does nothing when the
// variable is unset (i.e. not running under "bazel test") or the file cannot
// be opened.  Write failures are ignored.
static void RecordBazelWarning(absl::string_view error) {
  const char* const path = thread_safe_getenv("TEST_WARNINGS_OUTPUT_FILE");
  if (path == nullptr) return;  // Not a bazel test.

  const int fd = open(path, O_CREAT | O_WRONLY | O_APPEND, 0644);
  if (fd == -1) return;

  constexpr char kPrefix[] = "GWP-ASan error detected: ";
  (void)write(fd, kPrefix, sizeof(kPrefix) - 1);
  (void)write(fd, error.data(), error.size());
  (void)write(fd, "\n", 1);
  close(fd);
}
// Writes a minimal XML test report describing the error to the file given by
// the XML_OUTPUT_FILE environment variable, replacing any previous contents.
// ::testing::Test::RecordProperty() is not usable here because it does not
// emit the XML file when the test crashes, which is about to happen, so the
// file is written directly.  Does nothing when the variable is unset (not a
// gUnit test) or the file cannot be opened.  Write failures are ignored.
static void RecordTestFailure(absl::string_view error) {
  const char* const path = thread_safe_getenv("XML_OUTPUT_FILE");
  if (path == nullptr) return;  // Not a gUnit test.

  // The error string is placed between these two fragments as the value of
  // the "gwp-asan-report" property.
  constexpr char kXmlPrefix[] =
      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
      "<testsuites><testsuite><testcase>"
      " <properties>"
      " <property name=\"gwp-asan-report\" value=\"";
  constexpr char kXmlSuffix[] =
      "\"/>"
      " </properties>"
      " <failure message=\"MemoryError\">"
      " GWP-ASan detected a memory error. See the test log for full report."
      " </failure>"
      "</testcase></testsuite></testsuites>";

  const int fd = open(path, O_CREAT | O_WRONLY | O_TRUNC, 0644);
  if (fd == -1) return;
  (void)write(fd, kXmlPrefix, sizeof(kXmlPrefix) - 1);
  (void)write(fd, error.data(), error.size());
  (void)write(fd, kXmlSuffix, sizeof(kXmlSuffix) - 1);
  close(fd);
}
// Records a summary of the detected error for test infrastructure: first as a
// Bazel warning, then as an XML test failure.  Each step is a no-op when its
// environment variable is not set.
//
// error names the type of error to record.
static void RecordCrash(absl::string_view error) {
  RecordBazelWarning(error);
  RecordTestFailure(error);
}
// Logs the first `depth` entries of stack_frames, one frame per log line.
static void PrintStackTrace(void** stack_frames, size_t depth) {
  void** const frames_end = stack_frames + depth;
  for (void** frame = stack_frames; frame != frames_end; ++frame) {
    Log(kLog, __FILE__, __LINE__, " @ ", *frame);
  }
}
static void PrintStackTraceFromSignalHandler(void* context) {
void* stack_frames[kMaxStackDepth];
size_t depth = absl::GetStackTraceWithContext(stack_frames, kMaxStackDepth, 1,
context, nullptr);
PrintStackTrace(stack_frames, depth);
}
// A SEGV handler that prints stack traces for the allocation and deallocation
// of relevant memory as well as the location of the memory error.
static void SegvHandler(int signo, siginfo_t* info, void* context) {
if (signo != SIGSEGV) return;
void* fault = info->si_addr;
if (!tc_globals.guardedpage_allocator().PointerIsMine(fault)) return;
GuardedPageAllocator::GpaStackTrace alloc_trace, dealloc_trace;
GuardedPageAllocator::ErrorType error =
tc_globals.guardedpage_allocator().GetStackTraces(fault, &alloc_trace,
&dealloc_trace);
if (error == GuardedPageAllocator::ErrorType::kUnknown) return;
pid_t current_thread = absl::base_internal::GetTID();
off_t offset;
size_t size;
std::tie(offset, size) =
tc_globals.guardedpage_allocator().GetAllocationOffsetAndSize(fault);
Log(kLog, __FILE__, __LINE__,
"*** GWP-ASan "
"(https://google.github.io/tcmalloc/gwp-asan.html) "
"has detected a memory error ***");
Log(kLog, __FILE__, __LINE__, ">>> Access at offset", offset,
"into buffer of length", size);
Log(kLog, __FILE__, __LINE__,
"Error originates from memory allocated in thread", alloc_trace.tid,
"at:");
PrintStackTrace(alloc_trace.stack, alloc_trace.depth);
switch (error) {
case GuardedPageAllocator::ErrorType::kUseAfterFree:
Log(kLog, __FILE__, __LINE__, "The memory was freed in thread",
dealloc_trace.tid, "at:");
PrintStackTrace(dealloc_trace.stack, dealloc_trace.depth);
Log(kLog, __FILE__, __LINE__, "Use-after-free occurs in thread",
current_thread, "at:");
RecordCrash("use-after-free");
break;
case GuardedPageAllocator::ErrorType::kBufferUnderflow:
Log(kLog, __FILE__, __LINE__, "Buffer underflow occurs in thread",
current_thread, "at:");
RecordCrash("buffer-underflow");
break;
case GuardedPageAllocator::ErrorType::kBufferOverflow:
Log(kLog, __FILE__, __LINE__, "Buffer overflow occurs in thread",
current_thread, "at:");
RecordCrash("buffer-overflow");
break;
case GuardedPageAllocator::ErrorType::kDoubleFree:
Log(kLog, __FILE__, __LINE__, "The memory was freed in thread",
dealloc_trace.tid, "at:");
PrintStackTrace(dealloc_trace.stack, dealloc_trace.depth);
Log(kLog, __FILE__, __LINE__, "Double free occurs in thread",
current_thread, "at:");
RecordCrash("double-free");
break;
case GuardedPageAllocator::ErrorType::kBufferOverflowOnDealloc:
Log(kLog, __FILE__, __LINE__,
"Buffer overflow (write) detected in thread", current_thread,
"at free:");
RecordCrash("buffer-overflow-detected-at-free");
break;
case GuardedPageAllocator::ErrorType::kUnknown:
Crash(kCrash, __FILE__, __LINE__, "Unexpected ErrorType::kUnknown");
}
PrintStackTraceFromSignalHandler(context);
if (error == GuardedPageAllocator::ErrorType::kBufferOverflowOnDealloc) {
Log(kLog, __FILE__, __LINE__,
"*** Try rerunning with --config=asan to get stack trace of overflow "
"***");
}
}
// Disposition that was installed for SIGSEGV before ours; filled in by
// sigaction() when guarded sampling is activated.
static struct sigaction old_sa;

// Hands the signal to whatever disposition was in place before our handler:
// the previous siginfo-style or plain handler is called, an ignored signal
// stays ignored, and for the default disposition the old action is restored
// and the signal re-raised.
static void ForwardSignal(int signo, siginfo_t* info, void* context) {
  if (old_sa.sa_flags & SA_SIGINFO) {
    old_sa.sa_sigaction(signo, info, context);
    return;
  }
  if (old_sa.sa_handler == SIG_IGN) {
    return;  // Previous sigaction ignored signal, so do the same.
  }
  if (old_sa.sa_handler == SIG_DFL) {
    // No previous handler registered. Re-raise signal for core dump.
    const int restore_result = sigaction(signo, &old_sa, nullptr);
    if (restore_result == -1) {
      Log(kLog, __FILE__, __LINE__, "Couldn't restore previous sigaction!");
    }
    raise(signo);
    return;
  }
  old_sa.sa_handler(signo);
}
// Signal entry point installed for SIGSEGV: runs the GWP-ASan report first,
// then passes the signal on to the previously installed disposition.
static void HandleSegvAndForward(int signo, siginfo_t* info, void* context) {
  SegvHandler(signo, info, context);
  ForwardSignal(signo, info, context);
}
// Installs the SIGSEGV handler (saving the previous disposition in old_sa) and
// lets the guarded page allocator start handing out allocations.  The body
// runs at most once no matter how often this function is called.
extern "C" void MallocExtension_Internal_ActivateGuardedSampling() {
  static absl::once_flag flag;
  absl::call_once(flag, []() {
    struct sigaction segv_action = {};
    sigemptyset(&segv_action.sa_mask);
    segv_action.sa_flags = SA_SIGINFO;
    segv_action.sa_sigaction = HandleSegvAndForward;
    sigaction(SIGSEGV, &segv_action, &old_sa);
    tc_globals.guardedpage_allocator().AllowAllocations();
  });
}
} // namespace tcmalloc_internal
} // namespace tcmalloc
GOOGLE_MALLOC_SECTION_END

View File

@ -0,0 +1,315 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef TCMALLOC_GUARDED_PAGE_ALLOCATOR_H_
#define TCMALLOC_GUARDED_PAGE_ALLOCATOR_H_
#include <stddef.h>
#include <stdint.h>
#include <sys/types.h>
#include <utility>
#include "absl/base/attributes.h"
#include "absl/base/internal/spinlock.h"
#include "absl/base/thread_annotations.h"
#include "tcmalloc/common.h"
#include "tcmalloc/internal/logging.h"
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc {
namespace tcmalloc_internal {
// An allocator that gives each allocation a new region, with guard pages on
// either side of the allocated region. If a buffer is overflowed to the next
// guard page or underflowed to the previous guard page, a segfault occurs.
// After an allocation is freed, the underlying page is marked as inaccessible,
// and any future accesses to it will also cause segfaults until the page is
// reallocated.
//
// Is safe to use with static storage duration and is thread safe with the
// exception of calls to Init() and Destroy() (see corresponding function
// comments).
//
// Example:
// ABSL_CONST_INIT GuardedPageAllocator gpa;
//
// void foo() {
// char *buf = reinterpret_cast<char *>(gpa.Allocate(8000, 1));
// buf[0] = 'A'; // OK. No segfault occurs.
// memset(buf, 'A', 8000); // OK. No segfault occurs.
// buf[-300] = 'A'; // Segfault!
// buf[9000] = 'A'; // Segfault!
// gpa.Deallocate(buf);
// buf[0] = 'B'; // Segfault!
// }
//
// int main() {
// // Call Init() only once.
// gpa.Init(64, GuardedPageAllocator::kGpaMaxPages);
// gpa.AllowAllocations();
// for (int i = 0; i < 1000; i++) foo();
// return 0;
// }
class GuardedPageAllocator {
public:
// Stack trace recorded for a single allocation or deallocation event.
struct GpaStackTrace {
// Recorded frames; only the first `depth` entries are meaningful.
void* stack[kMaxStackDepth];
// Number of valid entries in `stack`. 0 means no trace has been recorded.
size_t depth = 0;
// Id of the thread that recorded the trace.
pid_t tid = 0;
};
// Maximum number of pages this class can allocate.
static constexpr size_t kGpaMaxPages = 512;
// Classification of a memory error, as returned by GetStackTraces().
// kUnknown is reported when the nearest slot was never allocated or when no
// other category matches the access.
enum class ErrorType {
kUseAfterFree,
kBufferUnderflow,
kBufferOverflow,
kDoubleFree,
kBufferOverflowOnDealloc,
kUnknown,
};
// Sets every member to zero/false/null so that instances can be constant
// initialized. The allocator is unusable until Init() has been called.
constexpr GuardedPageAllocator()
: guarded_page_lock_(absl::kConstInit,
absl::base_internal::SCHEDULE_KERNEL_ONLY),
free_pages_{},
num_alloced_pages_(0),
num_alloced_pages_max_(0),
num_allocation_requests_(0),
num_failed_allocations_(0),
data_(nullptr),
pages_base_addr_(0),
pages_end_addr_(0),
first_page_addr_(0),
max_alloced_pages_(0),
total_pages_(0),
page_size_(0),
rand_(0),
initialized_(false),
allow_allocations_(false),
double_free_detected_(false),
write_overflow_detected_(false) {}
// Not copyable.
GuardedPageAllocator(const GuardedPageAllocator&) = delete;
GuardedPageAllocator& operator=(const GuardedPageAllocator&) = delete;
// Trivial on purpose; see Destroy() for the actual teardown.
~GuardedPageAllocator() = default;
// Configures this allocator to allocate up to max_alloced_pages pages at a
// time from a pool of total_pages pages, where:
// 1 <= max_alloced_pages <= total_pages <= kGpaMaxPages
//
// This method should be called non-concurrently and only once to complete
// initialization. Dynamic initialization is deliberately done here and not
// in the constructor, thereby allowing the constructor to be constexpr and
// avoiding static initialization order issues.
void Init(size_t max_alloced_pages, size_t total_pages)
ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
// Unmaps memory allocated by this class.
//
// This method should be called non-concurrently and only once to complete
// destruction. Destruction is deliberately done here and not in the
// destructor, thereby allowing the destructor to be trivial (i.e. a no-op)
// and avoiding use-after-destruction issues for static/global instances.
void Destroy();
// Result of Allocate(): the allocation (or nullptr) together with a status
// describing why guarding did or did not happen.
struct AllocWithStatus {
void* alloc = nullptr;
Profile::Sample::GuardedStatus status =
Profile::Sample::GuardedStatus::Unknown;
};
// On success, returns an instance of AllocWithStatus which includes a pointer
// to size bytes of page-guarded memory, aligned to alignment. The member
// 'alloc' is a pointer that is guaranteed to be tagged.
// The 'status' member is set to GuardedStatus::Guarded.
// On failure, returns an instance of AllocWithStatus (the 'alloc' member is
// set to 'nullptr'). Failure can occur if memory could not be mapped or
// protected, if all guarded pages are already allocated, or if size is 0.
// These conditions are reflected in the 'status' member of the
// AllocWithStatus return value.
//
// Precondition: size and alignment <= page_size_
// Precondition: alignment is 0 or a power of 2
AllocWithStatus Allocate(size_t size, size_t alignment)
ABSL_LOCKS_EXCLUDED(guarded_page_lock_);
// Deallocates memory pointed to by ptr. ptr must have been previously
// returned by a call to Allocate.
void Deallocate(void* ptr) ABSL_LOCKS_EXCLUDED(guarded_page_lock_);
// Returns the size requested when ptr was allocated. ptr must have been
// previously returned by a call to Allocate.
size_t GetRequestedSize(const void* ptr) const;
// Returns ptr's offset from the beginning of its allocation along with the
// allocation's size.
std::pair<off_t, size_t> GetAllocationOffsetAndSize(const void* ptr) const;
// Records stack traces in alloc_trace and dealloc_trace for the page nearest
// to ptr. alloc_trace is the trace at the time the page was allocated. If
// the page is still allocated, dealloc_trace->depth will be 0. If the page
// has been deallocated, dealloc_trace is the trace at the time the page was
// deallocated.
//
// Returns the likely error type for an access at ptr.
//
// Requires that ptr points to memory mapped by this class.
ErrorType GetStackTraces(const void* ptr, GpaStackTrace* alloc_trace,
GpaStackTrace* dealloc_trace) const;
// Writes a human-readable summary of GuardedPageAllocator's internal state to
// *out.
void Print(Printer* out) ABSL_LOCKS_EXCLUDED(guarded_page_lock_);
// Writes the same counters as Print() as fields of *gwp_asan.
void PrintInPbtxt(PbtxtRegion* gwp_asan)
ABSL_LOCKS_EXCLUDED(guarded_page_lock_);
// Returns true if ptr points to memory managed by this class.
inline bool ABSL_ATTRIBUTE_ALWAYS_INLINE
PointerIsMine(const void* ptr) const {
uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
// Half-open range check: [pages_base_addr_, pages_end_addr_).
return pages_base_addr_ <= addr && addr < pages_end_addr_;
}
// Allows Allocate() to start returning allocations.
void AllowAllocations() ABSL_LOCKS_EXCLUDED(guarded_page_lock_) {
absl::base_internal::SpinLockHolder h(&guarded_page_lock_);
allow_allocations_ = true;
}
// Returns the number of pages available for allocation, based on how many are
// currently in use. (Should only be used in testing.)
size_t GetNumAvailablePages() ABSL_LOCKS_EXCLUDED(guarded_page_lock_) {
absl::base_internal::SpinLockHolder h(&guarded_page_lock_);
return max_alloced_pages_ - num_alloced_pages_;
}
private:
// Structure for storing data about a slot.
struct SlotMetadata {
// Trace recorded when the slot was last allocated.
GpaStackTrace alloc_trace;
// Trace recorded when the slot was last deallocated; depth is 0 while the
// slot is still allocated.
GpaStackTrace dealloc_trace;
// Size passed to Allocate() for the slot's current/last allocation.
size_t requested_size = 0;
// Address returned for the slot's current/last allocation; 0 if the slot
// has never been allocated.
uintptr_t allocation_start = 0;
};
// Max number of magic bytes we use to detect write-overflows at deallocation.
static constexpr size_t kMagicSize = 32;
// Maps pages into memory.
void MapPages() ABSL_LOCKS_EXCLUDED(guarded_page_lock_)
ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
// Reserves and returns a slot randomly selected from the free slots in
// free_pages_. Returns -1 if no slots available, or if AllowAllocations()
// hasn't been called yet.
ssize_t ReserveFreeSlot() ABSL_LOCKS_EXCLUDED(guarded_page_lock_);
// Returns the i-th free slot of free_pages_. i must be in the range [0,
// total_pages_ - num_alloced_pages_).
size_t GetIthFreeSlot(size_t i)
ABSL_EXCLUSIVE_LOCKS_REQUIRED(guarded_page_lock_);
// Marks the specified slot as unreserved.
void FreeSlot(size_t slot) ABSL_EXCLUSIVE_LOCKS_REQUIRED(guarded_page_lock_);
// Returns the address of the page that addr resides on.
uintptr_t GetPageAddr(uintptr_t addr) const;
// Returns an address somewhere on the valid page nearest to addr.
uintptr_t GetNearestValidPage(uintptr_t addr) const;
// Returns the slot number for the page nearest to addr.
size_t GetNearestSlot(uintptr_t addr) const;
// Returns true if the specified slot has already been freed.
bool IsFreed(size_t slot) const
ABSL_EXCLUSIVE_LOCKS_REQUIRED(guarded_page_lock_);
// Returns true if magic bytes for slot were overwritten.
bool WriteOverflowOccurred(size_t slot) const;
// Returns the likely error type for the given access address and metadata
// associated with the nearest slot.
ErrorType GetErrorType(uintptr_t addr, const SlotMetadata& d) const;
// Magic constant used for detecting write-overflows at deallocation time.
static uint8_t GetWriteOverflowMagic(size_t slot) {
// Only even slots get magic bytes, so use slot / 2 for more unique magics.
return uint8_t{0xcd} * static_cast<uint8_t>(slot / 2);
}
// Returns true if slot should be right aligned.
static bool ShouldRightAlign(size_t slot) { return slot % 2 == 0; }
// If slot is marked for right alignment, moves the allocation in *ptr to the
// right end of the slot, maintaining the specified size and alignment. Magic
// bytes are written in any alignment padding.
void MaybeRightAlign(size_t slot, size_t size, size_t alignment, void** ptr);
// Returns the start address of the usable page backing slot.
uintptr_t SlotToAddr(size_t slot) const;
// Inverse of SlotToAddr(): returns the slot whose usable page starts at addr.
size_t AddrToSlot(uintptr_t addr) const;
// Protects the slot bookkeeping and counters annotated below.
absl::base_internal::SpinLock guarded_page_lock_;
// Maps each bool to one page.
// true: Free. false: Reserved.
bool free_pages_[kGpaMaxPages] ABSL_GUARDED_BY(guarded_page_lock_);
// Number of currently-allocated pages.
size_t num_alloced_pages_ ABSL_GUARDED_BY(guarded_page_lock_);
// The high-water mark for num_alloced_pages_.
size_t num_alloced_pages_max_ ABSL_GUARDED_BY(guarded_page_lock_);
// Number of calls to Allocate.
size_t num_allocation_requests_ ABSL_GUARDED_BY(guarded_page_lock_);
// Number of times Allocate has failed.
size_t num_failed_allocations_ ABSL_GUARDED_BY(guarded_page_lock_);
// A dynamically-allocated array of stack trace data captured when each page
// is allocated/deallocated. Printed by the SEGV handler when a memory error
// is detected.
SlotMetadata* data_;
uintptr_t pages_base_addr_; // Points to start of mapped region.
uintptr_t pages_end_addr_; // Points to the end of mapped region.
uintptr_t first_page_addr_; // Points to first page returnable by Allocate.
size_t max_alloced_pages_; // Max number of pages to allocate at once.
size_t total_pages_; // Size of the page pool to allocate from.
size_t page_size_; // Size of pages we allocate.
uint64_t rand_; // RNG seed.
// True if this object has been fully initialized.
bool initialized_ ABSL_GUARDED_BY(guarded_page_lock_);
// Flag to control whether we can return allocations or not.
bool allow_allocations_ ABSL_GUARDED_BY(guarded_page_lock_);
// Set to true if a double free has occurred.
bool double_free_detected_;
// Set to true if a write overflow was detected on deallocation.
bool write_overflow_detected_;
};
} // namespace tcmalloc_internal
} // namespace tcmalloc
GOOGLE_MALLOC_SECTION_END
#endif // TCMALLOC_GUARDED_PAGE_ALLOCATOR_H_

View File

@ -0,0 +1,63 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <unistd.h>
#include <algorithm>
#include "absl/base/internal/spinlock.h"
#include "benchmark/benchmark.h"
#include "tcmalloc/guarded_page_allocator.h"
#include "tcmalloc/internal/logging.h"
#include "tcmalloc/internal/page_size.h"
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc {
namespace tcmalloc_internal {
namespace {
// Pool size used by the benchmark: the allocator's maximum.
static constexpr size_t kMaxGpaPages = GuardedPageAllocator::kGpaMaxPages;

// Page size used by GuardedPageAllocator: the larger of TCMalloc's kPageSize
// and the page size reported by GetPageSize().  Computed once and cached.
static size_t PageSize() {
  static const size_t cached_page_size = [] {
    const size_t reported_page_size = static_cast<size_t>(GetPageSize());
    return kPageSize < reported_page_size ? reported_page_size : kPageSize;
  }();
  return cached_page_size;
}
// Measures one Allocate/Deallocate round trip of state.range(0) bytes,
// touching the first and last byte of every allocation.  All benchmark threads
// share a single allocator that is created and initialized on first use and
// never destroyed.
void BM_AllocDealloc(benchmark::State& state) {
  static GuardedPageAllocator* gpa = []() {
    auto* allocator = new GuardedPageAllocator;
    // Init() must be called with pageheap_lock held.
    absl::base_internal::SpinLockHolder h(&pageheap_lock);
    allocator->Init(kMaxGpaPages, kMaxGpaPages);
    allocator->AllowAllocations();
    return allocator;
  }();

  const size_t alloc_size = state.range(0);
  for (auto _ : state) {
    void* const raw = gpa->Allocate(alloc_size, 0).alloc;
    char* const bytes = reinterpret_cast<char*>(raw);
    CHECK_CONDITION(bytes != nullptr);
    bytes[0] = 'X';               // Page fault first page.
    bytes[alloc_size - 1] = 'X';  // Page fault last page.
    gpa->Deallocate(bytes);
  }
}
BENCHMARK(BM_AllocDealloc)->Range(1, PageSize());
BENCHMARK(BM_AllocDealloc)->Arg(1)->ThreadRange(1, kMaxGpaPages);
} // namespace
} // namespace tcmalloc_internal
} // namespace tcmalloc
GOOGLE_MALLOC_SECTION_END

View File

@ -0,0 +1,266 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stddef.h>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "benchmark/benchmark.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "absl/container/flat_hash_set.h"
#include "absl/functional/function_ref.h"
#include "tcmalloc/malloc_extension.h"
#include "tcmalloc/static_vars.h"
#include "tcmalloc/testing/testutil.h"
namespace tcmalloc {
namespace tcmalloc_internal {
namespace {
// Fixture for tests that check the GuardedStatus recorded on allocation
// profile samples.  Guarded sampling is activated before every test.
class GuardedPageAllocatorProfileTest : public testing::Test {
 public:
  // Decision returned by the callback passed to AllocateUntil().
  struct NextSteps {
    bool stop = true;  // stop allocating
    bool free = true;  // free allocation
  };

  void SetUp() override { MallocExtension::ActivateGuardedSampling(); }

  // Repeatedly allocates `size` bytes with ::operator new and passes each
  // allocation to evaluate_alloc until the callback asks to stop.
  //
  // Returns the number of allocations made.
  int AllocateUntil(size_t size,
                    absl::FunctionRef<NextSteps(void*)> evaluate_alloc) {
    int alloc_count = 0;
    while (true) {
      void* alloc = ::operator new(size);
      ++alloc_count;
      benchmark::DoNotOptimize(alloc);
      auto result = evaluate_alloc(alloc);
      // evaluate_alloc takes responsibility for delete/free if result.free is
      // set to false.
      if (result.free) {
        ::operator delete(alloc);
      }
      if (result.stop) {
        break;
      }
    }
    return alloc_count;
  }

  // Allocate until sample is guarded
  // Called to reduce the internal counter to -1, which will trigger resetting
  // the counter to the configured rate.
  void AllocateUntilGuarded() {
    AllocateUntil(968, [&](void* alloc) -> NextSteps {
      return {IsSampledMemory(alloc) &&
                  Static::guardedpage_allocator().PointerIsMine(alloc),
              true};
    });
  }

  // Walks every sample in `profile`, calling `verify` on each, and then
  // expects that
  //   * at least one sample carries `sought_status`, and
  //   * every other status seen is listed in `allowable_statuses`.
  //
  // `verify` defaults to a no-op.
  void ExamineSamples(
      Profile& profile, Profile::Sample::GuardedStatus sought_status,
      absl::flat_hash_set<Profile::Sample::GuardedStatus> allowable_statuses,
      absl::FunctionRef<void(const Profile::Sample& s)> verify =
          [](const Profile::Sample& s) { /* do nothing */ }) {
    absl::flat_hash_set<Profile::Sample::GuardedStatus> found_statuses;
    profile.Iterate([&](const Profile::Sample& s) {
      found_statuses.insert(s.guarded_status);
      verify(s);
    });
    EXPECT_THAT(found_statuses, ::testing::Contains(sought_status));
    found_statuses.erase(sought_status);
    EXPECT_THAT(found_statuses, ::testing::IsSubsetOf(allowable_statuses));
  }
};
// With every allocation sampled and the guarded counter just reset, a single
// profiled allocation must be reported as Guarded and nothing else.
TEST_F(GuardedPageAllocatorProfileTest, Guarded) {
  using GuardedStatus = Profile::Sample::GuardedStatus;

  ScopedAlwaysSample sas;
  AllocateUntilGuarded();

  auto token = MallocExtension::StartAllocationProfiling();
  const auto stop_after_first = [](void*) -> NextSteps { return {true, true}; };
  AllocateUntil(1051, stop_after_first);
  auto profile = std::move(token).Stop();

  ExamineSamples(profile, GuardedStatus::Guarded, {});
}
// With a profile sampling rate of 4096, a single 2 MiB allocation must show up
// as NotAttempted.  Guarded samples are tolerated, but only for other sizes.
TEST_F(GuardedPageAllocatorProfileTest, NotAttempted) {
  using GuardedStatus = Profile::Sample::GuardedStatus;

  ScopedProfileSamplingRate spsr(4096);
  auto token = MallocExtension::StartAllocationProfiling();

  constexpr size_t alloc_size = 2 * 1024 * 1024;
  const auto stop_after_first = [](void*) -> NextSteps { return {true, true}; };
  AllocateUntil(alloc_size, stop_after_first);
  auto profile = std::move(token).Stop();

  ExamineSamples(profile, GuardedStatus::NotAttempted, {GuardedStatus::Guarded},
                 [&](const Profile::Sample& s) {
                   if (s.guarded_status == GuardedStatus::Guarded) {
                     EXPECT_NE(alloc_size, s.requested_size);
                   }
                 });
}
// An allocation one byte larger than kPageSize must be reported as
// LargerThanOnePage.  Guarded samples are tolerated, but only for other sizes.
TEST_F(GuardedPageAllocatorProfileTest, LargerThanOnePage) {
  using GuardedStatus = Profile::Sample::GuardedStatus;

  ScopedAlwaysSample sas;
  AllocateUntilGuarded();

  auto token = MallocExtension::StartAllocationProfiling();
  constexpr size_t alloc_size = kPageSize + 1;
  const auto stop_after_first = [](void*) -> NextSteps { return {true, true}; };
  AllocateUntil(alloc_size, stop_after_first);
  auto profile = std::move(token).Stop();

  ExamineSamples(profile, GuardedStatus::LargerThanOnePage,
                 {GuardedStatus::Guarded}, [&](const Profile::Sample& s) {
                   if (s.guarded_status == GuardedStatus::Guarded) {
                     EXPECT_NE(alloc_size, s.requested_size);
                   }
                 });
}
// With the guarded sampling rate set to -1 and every allocation profiled, the
// only status that may appear is Disabled.
TEST_F(GuardedPageAllocatorProfileTest, Disabled) {
  using GuardedStatus = Profile::Sample::GuardedStatus;

  ScopedGuardedSamplingRate sgsr(-1);
  ScopedProfileSamplingRate spsr(1);

  auto token = MallocExtension::StartAllocationProfiling();
  const auto stop_after_first = [](void*) -> NextSteps { return {true, true}; };
  AllocateUntil(1024, stop_after_first);
  auto profile = std::move(token).Stop();

  ExamineSamples(profile, GuardedStatus::Disabled, {});
}
// With both sampling rates set to 1, keeps allocating until one sampled
// allocation was guarded and one was not.  The profile must then contain, for
// the test's allocation size, both a Guarded and a RateLimited sample.
TEST_F(GuardedPageAllocatorProfileTest, RateLimited) {
  using GuardedStatus = Profile::Sample::GuardedStatus;

  ScopedGuardedSamplingRate sgsr(1);
  ScopedProfileSamplingRate spsr(1);
  auto token = MallocExtension::StartAllocationProfiling();

  // Keep allocating until both a guarded and an unguarded sample were seen.
  constexpr size_t alloc_size = 1033;
  bool guarded_found = false;
  bool unguarded_found = false;
  AllocateUntil(alloc_size, [&](void* alloc) -> NextSteps {
    if (!IsSampledMemory(alloc)) {
      return {false, true};
    }
    const bool is_guarded =
        Static::guardedpage_allocator().PointerIsMine(alloc);
    (is_guarded ? guarded_found : unguarded_found) = true;
    return {guarded_found && unguarded_found, true};
  });
  auto profile = std::move(token).Stop();

  // Ensure Guarded and RateLimited both occur for the alloc_size.
  bool success_found = false;
  bool ratelimited_found = false;
  ExamineSamples(profile, GuardedStatus::RateLimited, {GuardedStatus::Guarded},
                 [&](const Profile::Sample& s) {
                   if (s.requested_size != alloc_size) {
                     return;
                   }
                   if (s.guarded_status == GuardedStatus::Guarded) {
                     success_found = true;
                   } else if (s.guarded_status == GuardedStatus::RateLimited) {
                     ratelimited_found = true;
                   }
                 });
  EXPECT_TRUE(success_found);
  EXPECT_TRUE(ratelimited_found);
}
// Makes a zero-byte request while every allocation is sampled and examines
// the profile for the TooSmall status.
TEST_F(GuardedPageAllocatorProfileTest, TooSmall) {
  ScopedAlwaysSample sas;
  AllocateUntilGuarded();
  auto token = MallocExtension::StartAllocationProfiling();

  // Next sampled allocation should be too small
  constexpr size_t alloc_size = 0;
  AllocateUntil(alloc_size,
                [&](void* /*alloc*/) -> NextSteps { return {true, true}; });

  auto profile = std::move(token).Stop();
  ExamineSamples(
      profile, Profile::Sample::GuardedStatus::TooSmall,
      {Profile::Sample::GuardedStatus::RateLimited,
       Profile::Sample::GuardedStatus::Guarded},
      [&](const Profile::Sample& s) {
        // Guarded samples must come from some other request; TooSmall samples
        // must be exactly our zero-byte request.
        if (s.guarded_status == Profile::Sample::GuardedStatus::Guarded) {
          EXPECT_NE(alloc_size, s.requested_size);
        } else if (s.guarded_status ==
                   Profile::Sample::GuardedStatus::TooSmall) {
          EXPECT_EQ(alloc_size, s.requested_size);
        }
      });
}
// Holds on to guarded allocations until the guarded page allocator reports
// zero available pages, then profiles one more allocation and examines the
// profile for the NoAvailableSlots status.
TEST_F(GuardedPageAllocatorProfileTest, NoAvailableSlots) {
  ScopedAlwaysSample sas;
  AllocateUntilGuarded();

  std::vector<std::unique_ptr<char>> allocs;
  // Guard until there are no slots available.
  AllocateUntil(1039, [&](void* alloc) -> NextSteps {
    if (!Static::guardedpage_allocator().PointerIsMine(alloc)) {
      return {false, true};
    }
    // Keep the guarded allocation alive (owned by `allocs`) so that its slot
    // stays occupied.
    allocs.emplace_back(static_cast<char*>(alloc));
    return {Static::guardedpage_allocator().GetNumAvailablePages() == 0,
            false};
  });

  auto token = MallocExtension::StartAllocationProfiling();
  // This should fail for lack of slots
  constexpr size_t alloc_size = 1055;
  AllocateUntil(alloc_size,
                [&](void* /*alloc*/) -> NextSteps { return {true, true}; });
  auto profile = std::move(token).Stop();

  ExamineSamples(profile, Profile::Sample::GuardedStatus::NoAvailableSlots, {});
}
} // namespace
} // namespace tcmalloc_internal
} // namespace tcmalloc

View File

@ -0,0 +1,275 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "tcmalloc/guarded_page_allocator.h"
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <algorithm>
#include <array>
#include <memory>
#include <string>
#include <thread> // NOLINT(build/c++11)
#include <vector>
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "absl/base/attributes.h"
#include "absl/base/casts.h"
#include "absl/base/internal/spinlock.h"
#include "absl/base/internal/sysinfo.h"
#include "absl/container/flat_hash_set.h"
#include "absl/memory/memory.h"
#include "absl/numeric/bits.h"
#include "absl/time/clock.h"
#include "absl/time/time.h"
#include "tcmalloc/common.h"
#include "tcmalloc/internal/logging.h"
#include "tcmalloc/internal/page_size.h"
#include "tcmalloc/malloc_extension.h"
#include "tcmalloc/static_vars.h"
namespace tcmalloc {
namespace tcmalloc_internal {
namespace {
static constexpr size_t kMaxGpaPages = GuardedPageAllocator::kGpaMaxPages;

// Size of pages used by GuardedPageAllocator: the larger of kPageSize and the
// value returned by GetPageSize(). Computed once and cached.
static size_t PageSize() {
  static const size_t page_size = [] {
    const size_t system_page_size = static_cast<size_t>(GetPageSize());
    return kPageSize < system_page_size ? system_page_size : kPageSize;
  }();
  return page_size;
}
// Fixture owning a private GuardedPageAllocator instance. The allocator is
// initialised and enabled while holding pageheap_lock and destroyed again when
// the fixture goes away.
class GuardedPageAllocatorTest : public testing::Test {
 protected:
  // Default fixture: make all kMaxGpaPages pages usable.
  GuardedPageAllocatorTest() : GuardedPageAllocatorTest(kMaxGpaPages) {}

  // Fixture passing `num_pages` as the first Init() argument and kMaxGpaPages
  // as the second.
  explicit GuardedPageAllocatorTest(size_t num_pages) {
    absl::base_internal::SpinLockHolder h(&pageheap_lock);
    gpa_.Init(num_pages, kMaxGpaPages);
    gpa_.AllowAllocations();
  }

  ~GuardedPageAllocatorTest() override { gpa_.Destroy(); }

  GuardedPageAllocator gpa_;
};
// Parameterised flavour of the fixture: the size_t test parameter is forwarded
// to the base fixture as its page count.
class GuardedPageAllocatorParamTest : public GuardedPageAllocatorTest,
                                      public testing::WithParamInterface<size_t> {
 protected:
  GuardedPageAllocatorParamTest() : GuardedPageAllocatorTest{GetParam()} {}
};
// One page-sized allocation: the whole page must be writable, the bytes just
// outside it must fault, and after deallocation every access to the page must
// fault as well.
TEST_F(GuardedPageAllocatorTest, SingleAllocDealloc) {
  auto alloc_with_status = gpa_.Allocate(PageSize(), 0);
  EXPECT_EQ(alloc_with_status.status, Profile::Sample::GuardedStatus::Guarded);
  auto* buf = static_cast<char*>(alloc_with_status.alloc);
  EXPECT_NE(buf, nullptr);
  EXPECT_TRUE(gpa_.PointerIsMine(buf));

  // Touch every byte of the allocation.
  std::fill_n(buf, PageSize(), 'A');

  // One byte before and one byte past the allocation must fault.
  EXPECT_DEATH(buf[-1] = 'A', "");
  EXPECT_DEATH(buf[PageSize()] = 'A', "");

  gpa_.Deallocate(buf);

  // Use-after-free anywhere in the page is fatal.
  EXPECT_DEATH(buf[0] = 'B', "");
  EXPECT_DEATH(buf[PageSize() / 2] = 'B', "");
  EXPECT_DEATH(buf[PageSize() - 1] = 'B', "");
}
// Allocations made without an explicit alignment (alignment argument 0) must
// still be aligned to at least min(size, kLargeObjectAlignment), where
// kLargeObjectAlignment is the larger of kAlignment and the default operator
// new alignment.
TEST_F(GuardedPageAllocatorTest, NoAlignmentProvided) {
  constexpr size_t kLargeObjectAlignment =
      std::max(static_cast<size_t>(kAlignment),
               static_cast<size_t>(__STDCPP_DEFAULT_NEW_ALIGNMENT__));
  for (size_t base_size = 1; base_size <= 64; base_size <<= 1) {
    for (size_t size : {base_size, base_size + 1}) {
      SCOPED_TRACE(size);
      constexpr int kElements = 10;
      std::array<void*, kElements> ptrs;
      // Make several allocation attempts to encounter left/right-alignment in
      // the guarded region.
      for (int i = 0; i < kElements; i++) {
        auto alloc_with_status = gpa_.Allocate(size, 0);
        EXPECT_EQ(alloc_with_status.status,
                  Profile::Sample::GuardedStatus::Guarded);
        ptrs[i] = alloc_with_status.alloc;
        EXPECT_NE(ptrs[i], nullptr);
        EXPECT_TRUE(gpa_.PointerIsMine(ptrs[i]));
        // Shift a size_t rather than an int: the pointer may have 31 or more
        // trailing zero bits, and `1 << n` on an int is undefined for such n.
        size_t observed_alignment =
            size_t{1} << absl::countr_zero(absl::bit_cast<uintptr_t>(ptrs[i]));
        EXPECT_GE(observed_alignment, std::min(size, kLargeObjectAlignment));
      }
      for (void* ptr : ptrs) {
        gpa_.Deallocate(ptr);
      }
    }
  }
}
// Requests a one-byte allocation for every power-of-two alignment from 1 up to
// PageSize() and checks that the returned address honours that alignment.
TEST_F(GuardedPageAllocatorTest, AllocDeallocAligned) {
  constexpr size_t alloc_size = 1;
  const size_t max_align = PageSize();
  for (size_t align = 1; align <= max_align; align <<= 1) {
    auto alloc_with_status = gpa_.Allocate(alloc_size, align);
    EXPECT_EQ(alloc_with_status.status,
              Profile::Sample::GuardedStatus::Guarded);
    EXPECT_NE(alloc_with_status.alloc, nullptr);
    EXPECT_TRUE(gpa_.PointerIsMine(alloc_with_status.alloc));
    EXPECT_EQ(reinterpret_cast<uintptr_t>(alloc_with_status.alloc) % align, 0);
  }
}
// Fills every page the allocator was configured with, verifies that one more
// request is refused with NoAvailableSlots, and that freeing a single page
// makes exactly that capacity available again.
TEST_P(GuardedPageAllocatorParamTest, AllocDeallocAllPages) {
  const size_t num_pages = GetParam();
  char* bufs[kMaxGpaPages];

  // Exhaust the allocator.
  for (size_t i = 0; i < num_pages; ++i) {
    auto alloc_with_status = gpa_.Allocate(1, 0);
    EXPECT_EQ(alloc_with_status.status,
              Profile::Sample::GuardedStatus::Guarded);
    bufs[i] = static_cast<char*>(alloc_with_status.alloc);
    EXPECT_NE(bufs[i], nullptr);
    EXPECT_TRUE(gpa_.PointerIsMine(bufs[i]));
  }

  // Nothing is left for one more request.
  auto alloc_with_status = gpa_.Allocate(1, 0);
  EXPECT_EQ(alloc_with_status.status,
            Profile::Sample::GuardedStatus::NoAvailableSlots);
  EXPECT_EQ(alloc_with_status.alloc, nullptr);

  // Releasing one page lets the next request succeed.
  gpa_.Deallocate(bufs[0]);
  alloc_with_status = gpa_.Allocate(1, 0);
  EXPECT_EQ(alloc_with_status.status, Profile::Sample::GuardedStatus::Guarded);
  bufs[0] = static_cast<char*>(alloc_with_status.alloc);
  EXPECT_NE(bufs[0], nullptr);
  EXPECT_TRUE(gpa_.PointerIsMine(bufs[0]));

  // Every live allocation must be writable before it is returned.
  for (size_t i = 0; i < num_pages; ++i) {
    bufs[i][0] = 'A';
    gpa_.Deallocate(bufs[i]);
  }
}
// Run the parameterised tests with a single page, half of the maximum page
// count, and the maximum page count.
INSTANTIATE_TEST_SUITE_P(VaryNumPages, GuardedPageAllocatorParamTest,
testing::Values(1, kMaxGpaPages / 2, kMaxGpaPages));
// PointerIsMine() must accept a pointer handed out by the allocator and reject
// both a stack address and an ordinary heap pointer.
TEST_F(GuardedPageAllocatorTest, PointerIsMine) {
  auto alloc_with_status = gpa_.Allocate(1, 0);
  EXPECT_EQ(alloc_with_status.status, Profile::Sample::GuardedStatus::Guarded);
  void* const buf = alloc_with_status.alloc;

  // Two addresses that the allocator under test did not hand out.
  int stack_var;
  const auto malloc_ptr = absl::make_unique<char>();

  EXPECT_TRUE(gpa_.PointerIsMine(buf));
  EXPECT_FALSE(gpa_.PointerIsMine(&stack_var));
  EXPECT_FALSE(gpa_.PointerIsMine(malloc_ptr.get()));
}
// The allocator's printed report must contain the "GWP-ASan Status" heading.
TEST_F(GuardedPageAllocatorTest, Print) {
  constexpr size_t kBufferSize = 1024;
  char buf[kBufferSize] = {};
  Printer out(buf, kBufferSize);
  gpa_.Print(&out);
  EXPECT_THAT(buf, testing::ContainsRegex("GWP-ASan Status"));
}
// With several threads racing on Allocate(), each page must be handed out
// exactly once: no duplicates, no page left unused, and no pages beyond the
// configured maximum.
TEST_F(GuardedPageAllocatorTest, ThreadedAllocCount) {
  constexpr size_t kNumThreads = 2;
  void* allocations[kNumThreads][kMaxGpaPages];
  {
    std::vector<std::thread> threads;
    threads.reserve(kNumThreads);
    for (size_t i = 0; i < kNumThreads; ++i) {
      // Every thread attempts kMaxGpaPages allocations and records each
      // result (possibly nullptr) in its own row.
      threads.emplace_back([this, &allocations, i]() {
        for (size_t j = 0; j < kMaxGpaPages; ++j) {
          allocations[i][j] = gpa_.Allocate(1, 0).alloc;
        }
      });
    }
    for (std::thread& worker : threads) {
      worker.join();
    }
  }

  // Collapse all results into a set; failed attempts (nullptr) do not count.
  absl::flat_hash_set<void*> allocations_set;
  for (const auto& row : allocations) {
    for (void* ptr : row) {
      allocations_set.insert(ptr);
    }
  }
  allocations_set.erase(nullptr);
  EXPECT_EQ(allocations_set.size(), kMaxGpaPages);
}
// Hammers the allocator from many threads at once and checks that it stays
// consistent: a page is never shared by two threads, and every page is
// available again once all threads are done.
TEST_F(GuardedPageAllocatorTest, ThreadedHighContention) {
  const size_t kNumThreads = 4 * absl::base_internal::NumCPUs();
  {
    std::vector<std::thread> threads;
    threads.reserve(kNumThreads);
    for (size_t i = 0; i < kNumThreads; ++i) {
      threads.emplace_back([this]() {
        // Retry, with a short pause, until a guarded page is obtained.
        char* buf = nullptr;
        for (;;) {
          auto alloc_with_status = gpa_.Allocate(1, 0);
          if (alloc_with_status.status ==
              Profile::Sample::GuardedStatus::Guarded) {
            buf = reinterpret_cast<char*>(alloc_with_status.alloc);
            EXPECT_NE(buf, nullptr);
            break;
          }
          absl::SleepFor(absl::Nanoseconds(5000));
        }

        // Verify that no other thread has access to this page.
        EXPECT_EQ(buf[0], 0);

        // Mark this page and allow some time for another thread to potentially
        // gain access to this page.
        buf[0] = 'A';
        absl::SleepFor(absl::Nanoseconds(5000));

        // Unmark this page and deallocate.
        buf[0] = 0;
        gpa_.Deallocate(buf);
      });
    }
    for (std::thread& worker : threads) {
      worker.join();
    }
  }

  // Verify all pages have been deallocated now that all threads are done.
  for (size_t i = 0; i < kMaxGpaPages; ++i) {
    auto alloc_with_status = gpa_.Allocate(1, 0);
    EXPECT_EQ(alloc_with_status.status,
              Profile::Sample::GuardedStatus::Guarded);
    EXPECT_NE(alloc_with_status.alloc, nullptr);
  }
}
// Never used at runtime. Declaring it with ABSL_CONST_INIT makes this a
// compile-time check that GuardedPageAllocator can be constant-initialized.
ABSL_CONST_INIT ABSL_ATTRIBUTE_UNUSED GuardedPageAllocator
gpa_is_constant_initializable;
} // namespace
} // namespace tcmalloc_internal
} // namespace tcmalloc

View File

@ -0,0 +1,239 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <functional>
#include <memory>
#include <optional>
#include <string>
#include "tcmalloc/internal/profile.pb.h"
#include "gtest/gtest.h"
#include "absl/base/attributes.h"
#include "absl/base/const_init.h"
#include "absl/base/internal/low_level_alloc.h"
#include "absl/base/internal/spinlock.h"
#include "absl/base/thread_annotations.h"
#include "absl/container/flat_hash_set.h"
#include "absl/status/statusor.h"
#include "absl/strings/str_format.h"
#include "absl/time/clock.h"
#include "absl/time/time.h"
#include "tcmalloc/internal/logging.h"
#include "tcmalloc/internal/profile_builder.h"
#include "tcmalloc/malloc_extension.h"
#include "tcmalloc/sampled_allocation.h"
#include "tcmalloc/static_vars.h"
#include "tcmalloc/testing/test_allocator_harness.h"
#include "tcmalloc/testing/thread_manager.h"
namespace tcmalloc {
namespace {
// Parameterised fixture; the int64_t parameter is used by the TEST_P below as
// the profile sampling rate.
class HeapProfilingTest : public ::testing::TestWithParam<int64_t> {};
// Verify that heap profiling sessions concurrent with allocations/deallocations
// do not crash, as they all use `tc_globals.sampled_allocation_recorder_`. Also
// check that the data in the sample make sense. Here the
// allocations/deallocations can happen on the same thread or the object is
// allocated in one thread, transferred to another thread and deleted there.
TEST_P(HeapProfilingTest, GetHeapProfileWhileAllocAndDealloc) {
  ScopedProfileSamplingRate s(GetParam());
  const int kThreads = 10;
  ThreadManager manager;
  AllocatorHarness harness(kThreads);

  // Some threads are busy with allocating and deallocating.
  manager.Start(kThreads, [&](int thread_id) { harness.Run(thread_id); });

  absl::Time start = absl::Now();
  // Another few threads busy with iterating different kinds of heap profiles.
  for (auto t : {
           ProfileType::kHeap,
           ProfileType::kFragmentation,
           ProfileType::kPeakHeap,
       }) {
    // `t` is captured by value: the workers keep running until
    // manager.Stop(), long after this loop iteration (and its `t`) is gone,
    // so a by-reference capture would dangle. `start` outlives the workers
    // and may stay captured by reference.
    manager.Start(2, [&, t](int) {
      MallocExtension::SnapshotCurrent(t).Iterate(
          [&](const Profile::Sample& s) {
            // Inspect a few fields in the sample.
            EXPECT_GE(s.sum, 0);
            EXPECT_GT(s.depth, 0);
            EXPECT_GT(s.requested_size, 0);
            EXPECT_GT(s.allocated_size, 0);
            EXPECT_GT(s.allocation_time, start - absl::Seconds(10));
            EXPECT_LT(s.allocation_time, start + absl::Seconds(10));
          });
    });
  }

  // Let the workers run for a while before shutting them down.
  absl::SleepFor(absl::Seconds(1));
  manager.Stop();
}
// Test at different sampling rates, from always sampling to lower sampling
// probabilities. This is stress testing and attempts to expose potential
// failure modes when we only have sampled allocations and when we have a mix of
// sampled/unsampled allocations.
// The four parameter values are 1, 2^7, 2^14 and 2^21.
INSTANTIATE_TEST_SUITE_P(SamplingRates, HeapProfilingTest,
testing::Values(1, 1 << 7, 1 << 14, 1 << 21),
testing::PrintToStringParamName());
// Allocates a batch of large objects of one size, then (after freeing them) a
// batch of another size, and checks that each heap snapshot contains samples
// of the currently live size only.
TEST(HeapProfilingTest, AllocateDifferentSizes) {
  const int num_allocations = 1000;
  const size_t requested_size1 = (1 << 19) + 1;
  const size_t requested_size2 = (1 << 20) + 1;
  int requested_size1_count = 0;
  int requested_size2_count = 0;

  // Takes a heap snapshot and adds, for each of the two sizes, the number of
  // samples with that requested size to the matching counter.
  const auto tally_snapshot = [&]() {
    MallocExtension::SnapshotCurrent(ProfileType::kHeap)
        .Iterate([&](const Profile::Sample& s) {
          if (s.requested_size == requested_size1) requested_size1_count++;
          if (s.requested_size == requested_size2) requested_size2_count++;
        });
  };

  // First allocate some large objects at a specific size, verify through heap
  // profile, and deallocate them.
  void* allocations1[num_allocations];
  for (void*& slot : allocations1) {
    slot = ::operator new(requested_size1);
  }
  tally_snapshot();
  EXPECT_GT(requested_size1_count, 0);
  EXPECT_EQ(requested_size2_count, 0);
  requested_size1_count = 0;
  for (void* ptr : allocations1) {
    ::operator delete(ptr);
  }

  // Next allocate some large objects at a different size, verify through heap
  // profile, and deallocate them.
  void* allocations2[num_allocations];
  for (void*& slot : allocations2) {
    slot = ::operator new(requested_size2);
  }
  tally_snapshot();
  EXPECT_EQ(requested_size1_count, 0);
  EXPECT_GT(requested_size2_count, 0);
  for (void* ptr : allocations2) {
    ::operator delete(ptr);
  }
}
// Allocates large objects, makes their pages resident (mlock, or writing every
// byte if mlock fails), and checks that the "sampled_resident_bytes" labels in
// the converted heap profile add up to between 1x and 2x of the requested
// bytes.
TEST(HeapProfilingTest, CheckResidency) {
  ScopedProfileSamplingRate s(1);
  const int num_allocations = 1000;
  const size_t requested_size = (1 << 19) + 1;

  void* allocations[num_allocations];
  for (void*& slot : allocations) {
    slot = ::operator new(requested_size);
  }

  bool mlock_failure = false;
  for (void* ptr : allocations) {
    if (::mlock(ptr, requested_size) == 0) continue;
    mlock_failure = true;
    // mlock failed for this object: write every byte of it instead.
    auto* bytes = static_cast<volatile char*>(ptr);
    for (size_t j = 0; j < requested_size; ++j) {
      bytes[j] = 0x20;
    }
  }
  if (mlock_failure) {
    absl::FPrintF(
        stderr,
        "one or more mlocks failed, which could cause test flakiness\n");
  }

  // Collect the heap profile and look for residency info.
  auto converted_or = tcmalloc_internal::MakeProfileProto(
      MallocExtension::SnapshotCurrent(ProfileType::kHeap));
  ASSERT_TRUE(converted_or.ok());
  const auto& converted = **converted_or;

  // Look for "sampled_resident_bytes" string in string table.
  std::optional<int> sampled_resident_bytes_id;
  const int string_count = converted.string_table().size();
  for (int i = 0; i < string_count; ++i) {
    if (converted.string_table(i) == "sampled_resident_bytes") {
      sampled_resident_bytes_id = i;
    }
  }
  ASSERT_TRUE(sampled_resident_bytes_id.has_value());

  // Sum the value of every label carrying that key.
  const int resident_key = *sampled_resident_bytes_id;
  size_t resident_size = 0;
  for (const auto& sample : converted.sample()) {
    for (const auto& label : sample.label()) {
      if (label.key() == resident_key) {
        resident_size += label.num();
      }
    }
  }
  EXPECT_GE(resident_size, num_allocations * requested_size);
  EXPECT_LE(resident_size, num_allocations * requested_size * 2);

  for (void* ptr : allocations) {
    // throw away the error
    ::munlock(ptr, requested_size);
  }
  for (void* ptr : allocations) {
    ::operator delete(ptr);
  }
}
// Users must be able to allocate while iterating over heap samples. Today
// `MallocExtension::SnapshotCurrent()` copies the sampled allocations from
// `tc_globals.sampled_allocation_recorder()` into a `StackTraceTable` and
// iterates over that copy. Iterating the recorder directly would avoid the
// copy, but would deadlock in the scenario below: `Iterate()` on the recorder
// holds the per-sample lock; inserting into a hashtable of (always sampled)
// allocations may trigger `resize()`, which deallocates the very sampled
// allocation being visited and then needs the same per-sample lock. Iterating
// over copies avoids that, so this test passes.
TEST(HeapProfilingTest, AllocateWhileIterating) {
  ScopedProfileSamplingRate s(1);
  absl::flat_hash_set<void*> set;
  // This fills up the slots in hashtable and so there is a good chance it would
  // call `resize()` when inserting new entries later. This makes it easier for
  // the deadlock to happen (>95% of the cases when directly iterating over
  // `tc_globals.sampled_allocation_recorder()`).
  set.reserve(1);
  set.insert(::operator new(100));

  constexpr int kRounds = 3;
  for (int round = 0; round < kRounds; ++round) {
    // Allocate one more object (and grow the set) for every visited sample.
    MallocExtension::SnapshotCurrent(ProfileType::kHeap)
        .Iterate([&](const Profile::Sample& /*s*/) {
          set.insert(::operator new(100));
        });
  }

  for (void* obj : set) {
    ::operator delete(obj);
  }
}
} // namespace
} // namespace tcmalloc

View File

@ -0,0 +1,126 @@
// Copyright 2022 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef TCMALLOC_HINTED_TRACKER_LIST_H_
#define TCMALLOC_HINTED_TRACKER_LIST_H_
#include "tcmalloc/internal/linked_list.h"
#include "tcmalloc/internal/range_tracker.h"
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc {
namespace tcmalloc_internal {
// Bundles an array of N TrackerLists with a Bitmap that has bit i set for
// every list i that currently holds at least one element, so the first
// non-empty list at or above a given index can be found through the bitmap.
template <class TrackerType, size_t N>
class HintedTrackerLists {
 public:
  using TrackerList = TList<TrackerType>;

  constexpr HintedTrackerLists() : size_{} {}

  // Detaches and returns the first element of the first non-empty list whose
  // index is at least n. Returns nullptr if there is no such list.
  // REQUIRES: n < N.
  TrackerType* GetLeast(const size_t n) {
    ASSERT(n < N);
    const size_t i = nonempty_.FindSet(n);
    if (i == N) {
      return nullptr;
    }
    ASSERT(!lists_[i].empty());
    TrackerType* const pt = lists_[i].first();
    Unlink(pt, i);
    return pt;
  }

  // Same lookup as GetLeast(), but the element found stays in its list.
  // Returns nullptr if there is no non-empty list with index at least n.
  // REQUIRES: n < N.
  TrackerType* PeekLeast(const size_t n) {
    ASSERT(n < N);
    const size_t i = nonempty_.FindSet(n);
    if (i == N) {
      return nullptr;
    }
    ASSERT(!lists_[i].empty());
    return lists_[i].first();
  }

  // Prepends <pt> to list i and marks that list as non-empty.
  // REQUIRES: i < N && pt != nullptr.
  void Add(TrackerType* pt, const size_t i) {
    ASSERT(i < N);
    ASSERT(pt != nullptr);
    lists_[i].prepend(pt);
    nonempty_.SetBit(i);
    ++size_;
  }

  // Takes <pt> out of list i.
  // REQUIRES: i < N && pt != nullptr.
  void Remove(TrackerType* pt, const size_t i) {
    ASSERT(i < N);
    ASSERT(pt != nullptr);
    Unlink(pt, i);
  }

  // Read-only access to list n.
  // REQUIRES: n < N.
  const TrackerList& operator[](const size_t n) const {
    ASSERT(n < N);
    return lists_[n];
  }

  // Total number of elements across all N lists.
  size_t size() const { return size_; }
  bool empty() const { return size_ == 0; }

  // Returns length of the list at an index <n>.
  // REQUIRES: n < N.
  size_t SizeOfList(const size_t n) const {
    ASSERT(n < N);
    return lists_[n].length();
  }

  // Invokes func on every pointer stored in the lists with index >= start,
  // visiting non-empty lists in increasing index order.
  // This method is const but the Functor gets passed a non-const pointer.
  // This quirk is inherited from TrackerList.
  template <typename Functor>
  void Iter(const Functor& func, size_t start) const {
    for (size_t i = nonempty_.FindSet(start); i < N;) {
      auto& list = lists_[i];
      ASSERT(!list.empty());
      for (TrackerType* pt : list) {
        func(pt);
      }
      // Jump to the next non-empty list, if any index is left to search.
      ++i;
      if (i < N) i = nonempty_.FindSet(i);
    }
  }

 private:
  // Shared tail of GetLeast() and Remove(): takes pt out of list i, clears
  // the list's bit when remove() returns true (presumably meaning the list
  // became empty), and updates the element count.
  void Unlink(TrackerType* pt, const size_t i) {
    const bool clear_bit = lists_[i].remove(pt);
    if (clear_bit) {
      nonempty_.ClearBit(i);
    }
    --size_;
  }

  TrackerList lists_[N];
  size_t size_;
  Bitmap<N> nonempty_;
};
} // namespace tcmalloc_internal
} // namespace tcmalloc
GOOGLE_MALLOC_SECTION_END
#endif // TCMALLOC_HINTED_TRACKER_LIST_H_

View File

@ -0,0 +1,374 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "tcmalloc/huge_address_map.h"
#include <stdlib.h>
#include <algorithm>
#include <new>
#include "absl/base/internal/cycleclock.h"
#include "tcmalloc/internal/logging.h"
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc {
namespace tcmalloc_internal {
// Returns the in-order successor of this node, or nullptr if this node is the
// last one in the tree.
const HugeAddressMap::Node* HugeAddressMap::Node::next() const {
  // With a right subtree, the successor is that subtree's leftmost node.
  if (right_ != nullptr) {
    const Node* leftmost = right_;
    while (leftmost->left_ != nullptr) leftmost = leftmost->left_;
    return leftmost;
  }
  // Otherwise climb until we arrive at an ancestor from its left child; that
  // ancestor is the successor.
  const Node* child = this;
  for (const Node* up = parent_; up != nullptr; up = up->parent_) {
    if (up->left_ == child) return up;
    child = up;
  }
  return nullptr;
}
// Mutable overload: forwards to the const implementation and strips the const
// from the result.
HugeAddressMap::Node* HugeAddressMap::Node::next() {
  const Node* const self = this;
  return const_cast<Node*>(self->next());
}
// Recursively validates the subtree rooted at this node. Adds the number of
// nodes visited to *num_nodes and their range lengths to *size, and aborts
// (CHECK_CONDITION) if the ordering, disjointness, parent-link, priority or
// cached longest-range invariants do not hold.
void HugeAddressMap::Node::Check(size_t* num_nodes, HugeLength* size) const {
  ++*num_nodes;
  *size += range_.len();

  // Recompute the longest range of the subtree while descending.
  HugeLength longest = range_.len();
  if (left_ != nullptr) {
    // tree
    CHECK_CONDITION(left_->range_.start() < range_.start());
    // disjoint
    CHECK_CONDITION(left_->range_.end_addr() < range_.start_addr());
    // well-formed
    CHECK_CONDITION(left_->parent_ == this);
    // heap
    CHECK_CONDITION(left_->prio_ <= prio_);
    left_->Check(num_nodes, size);
    longest = std::max(longest, left_->longest_);
  }
  if (right_ != nullptr) {
    // tree
    CHECK_CONDITION(right_->range_.start() > range_.start());
    // disjoint
    CHECK_CONDITION(right_->range_.start_addr() > range_.end_addr());
    // well-formed
    CHECK_CONDITION(right_->parent_ == this);
    // heap
    CHECK_CONDITION(right_->prio_ <= prio_);
    right_->Check(num_nodes, size);
    longest = std::max(longest, right_->longest_);
  }
  CHECK_CONDITION(longest_ == longest);
}
// Returns the leftmost node of the tree, or nullptr when the tree is empty.
const HugeAddressMap::Node* HugeAddressMap::first() const {
  const Node* node = root();
  if (node == nullptr) return nullptr;
  while (node->left_ != nullptr) {
    node = node->left_;
  }
  return node;
}
// Mutable overload: forwards to the const implementation and strips the const
// from the result.
HugeAddressMap::Node* HugeAddressMap::first() {
  const HugeAddressMap* const self = this;
  return const_cast<Node*>(self->first());
}
// Validates the whole map: the root must have no parent, the tree must pass
// Node::Check(), and the node count / total length gathered from the tree must
// agree with the counters kept by the map. Aborts via CHECK_CONDITION on any
// mismatch.
void HugeAddressMap::Check() {
size_t nodes = 0;
HugeLength size = NHugePages(0);
if (root_) {
CHECK_CONDITION(root_->parent_ == nullptr);
root_->Check(&nodes, &size);
}
CHECK_CONDITION(nodes == nranges());
CHECK_CONDITION(size == total_mapped());
// Every node ever created is either in the tree or on the freelist.
CHECK_CONDITION(total_nodes_ == used_nodes_ + freelist_size_);
}
// Number of nodes currently in use (used_nodes_).
size_t HugeAddressMap::nranges() const { return used_nodes_; }
// Combined length of all ranges currently held by the map (total_size_).
HugeLength HugeAddressMap::total_mapped() const { return total_size_; }
// Writes a two-line human-readable summary to `out`: node usage, and the
// longest range recorded at the root (0 for an empty map), in hugepages.
void HugeAddressMap::Print(Printer* out) const {
  out->printf("HugeAddressMap: treap %zu / %zu nodes used / created\n",
              used_nodes_, total_nodes_);
  size_t longest = 0;
  if (root_ != nullptr) {
    longest = root_->longest_.raw_num();
  }
  out->printf("HugeAddressMap: %zu contiguous hugepages available\n", longest);
}
// Emits the same statistics as Print() as pbtxt fields; the longest range
// recorded at the root is reported in bytes (0 for an empty map).
void HugeAddressMap::PrintInPbtxt(PbtxtRegion* hpaa) const {
  hpaa->PrintI64("num_huge_address_map_treap_nodes_used", used_nodes_);
  hpaa->PrintI64("num_huge_address_map_treap_nodes_created", total_nodes_);
  size_t longest = 0;
  if (root_ != nullptr) {
    longest = root_->longest_.in_bytes();
  }
  hpaa->PrintI64("contiguous_free_bytes", longest);
}
// Returns the node whose range contains p if there is one; otherwise the last
// node that the search passed on its way right (i.e. with p not before its
// start), or nullptr if there was none.
HugeAddressMap::Node* HugeAddressMap::Predecessor(HugePage p) {
  Node* best = nullptr;
  for (Node* cur = root(); cur != nullptr;) {
    const HugeRange here = cur->range_;
    if (here.contains(p)) return cur;
    if (!(p < here.start())) {
      // p comes after here:
      // here is a valid candidate, and the right subtree might have better.
      best = cur;
      cur = cur->right_;
    } else {
      // p comes before here:
      // our predecessor isn't here, nor in the right subtree.
      cur = cur->left_;
    }
  }
  return best;
}
// Folds the new range r into existing node(s) instead of creating a node for
// it. b is the node whose range comes before r (may be nullptr), a is the node
// whose range comes after r (may be nullptr); Insert() passes at least one of
// them as non-null. The surviving node's when_ becomes the length-weighted
// average of the timestamps of the joined pieces, with r stamped "now".
void HugeAddressMap::Merge(Node* b, HugeRange r, Node* a) {
// Length-weighted average of two timestamps.
auto merge_when = [](HugeRange x, int64_t x_when, HugeRange y,
int64_t y_when) {
// avoid overflow with floating-point
const size_t x_len = x.len().raw_num();
const size_t y_len = y.len().raw_num();
const double x_weight = static_cast<double>(x_len) * x_when;
const double y_weight = static_cast<double>(y_len) * y_when;
return static_cast<int64_t>((x_weight + y_weight) / (x_len + y_len));
};
int64_t when = absl::base_internal::CycleClock::Now();
// Two way merges are easy.
if (a == nullptr) {
// Extend b to also cover r.
b->when_ = merge_when(b->range_, b->when(), r, when);
b->range_ = Join(b->range_, r);
FixLongest(b);
return;
} else if (b == nullptr) {
// Extend a to also cover r.
a->when_ = merge_when(r, when, a->range_, a->when());
a->range_ = Join(r, a->range_);
FixLongest(a);
return;
}
// Three way merge: slightly harder. We must remove one node
// (arbitrarily picking next).
HugeRange partial = Join(r, a->range_);
int64_t partial_when = merge_when(r, when, a->range_, a->when());
HugeRange full = Join(b->range_, partial);
int64_t full_when = merge_when(b->range_, b->when(), partial, partial_when);
// Removing a will reduce total_size_ by that length, but since we're merging
// we actually don't change lengths at all; undo that.
total_size_ += a->range_.len();
Remove(a);
// b now stands for the union of b, r and a.
b->range_ = full;
b->when_ = full_when;
FixLongest(b);
}
// Adds range r to the map. r must not intersect any range already present
// (enforced with CHECK_CONDITION). If r can be joined with the node before
// and/or after it, the ranges are merged; otherwise a new node with a random
// priority is inserted into the treap.
void HugeAddressMap::Insert(HugeRange r) {
total_size_ += r.len();
// First, try to merge if necessary. Note there are three possibilities:
// we might need to merge before with r, r with after, or all three together.
Node* before = Predecessor(r.start());
CHECK_CONDITION(!before || !before->range_.intersects(r));
// With no predecessor, the candidate successor is the first node in the map.
Node* after = before ? before->next() : first();
CHECK_CONDITION(!after || !after->range_.intersects(r));
if (before && before->range_.precedes(r)) {
if (after && r.precedes(after->range_)) {
Merge(before, r, after);
} else {
Merge(before, r, nullptr);
}
return;
} else if (after && r.precedes(after->range_)) {
Merge(nullptr, r, after);
return;
}
CHECK_CONDITION(!before || !before->range_.precedes(r));
CHECK_CONDITION(!after || !r.precedes(after->range_));
// No merging possible; just add a new node.
Node* n = Get(r);
Node* curr = root();
Node* parent = nullptr;
// link always points at the child slot (or root_) that led to curr.
Node** link = &root_;
// Walk down the tree to our correct location
// (stop at the first node whose priority is lower than n's).
while (curr != nullptr && curr->prio_ >= n->prio_) {
// n will end up below curr, so curr's subtree now contains r.
curr->longest_ = std::max(curr->longest_, r.len());
parent = curr;
if (curr->range_.start() < r.start()) {
link = &curr->right_;
curr = curr->right_;
} else {
link = &curr->left_;
curr = curr->left_;
}
}
// Hook n into the slot we stopped at.
*link = n;
n->parent_ = parent;
n->left_ = n->right_ = nullptr;
n->longest_ = r.len();
if (curr) {
HugePage p = r.start();
// We need to split the treap at curr into n's children.
// This will be two treaps: one less than p, one greater, and has
// a nice recursive structure.
// less/more point at the slot where the next smaller/larger subtree goes;
// lp/mp are the nodes owning those slots.
Node** less = &n->left_;
Node* lp = n;
Node** more = &n->right_;
Node* mp = n;
while (curr) {
if (curr->range_.start() < p) {
*less = curr;
curr->parent_ = lp;
less = &curr->right_;
lp = curr;
curr = curr->right_;
} else {
*more = curr;
curr->parent_ = mp;
more = &curr->left_;
mp = curr;
curr = curr->left_;
}
}
// Terminate both chains.
*more = *less = nullptr;
// We ripped apart the tree along these two paths--fix longest pointers.
FixLongest(lp);
FixLongest(mp);
}
}
void HugeAddressMap::Node::FixLongest() {
const HugeLength l = left_ ? left_->longest_ : NHugePages(0);
const HugeLength r = right_ ? right_->longest_ : NHugePages(0);
const HugeLength c = range_.len();
const HugeLength new_longest = std::max({l, r, c});
longest_ = new_longest;
}
// Recomputes longest_ for n and then for each of its ancestors, bottom-up, up
// to the root. Does nothing when n is nullptr.
void HugeAddressMap::FixLongest(HugeAddressMap::Node* n) {
  for (Node* cur = n; cur != nullptr; cur = cur->parent_) {
    cur->FixLongest();
  }
}
// Unlinks node n from the treap, subtracts its range from total_size_, merges
// its two subtrees into one treap that takes n's place, and finally returns n
// to the freelist via Put().
void HugeAddressMap::Remove(HugeAddressMap::Node* n) {
total_size_ -= n->range_.len();
// We need to merge the left and right children of n into one
// treap, then glue it into place wherever n was.
Node** link;
Node* parent = n->parent_;
Node* top = n->left_;
Node* bottom = n->right_;
// Longest range that remains below n's position once n itself is gone.
const HugeLength child_longest =
std::max(top ? top->longest_ : NHugePages(0),
bottom ? bottom->longest_ : NHugePages(0));
if (!parent) {
link = &root_;
} else {
// Account for the removed child--might change longests.
// Easiest way: update this subtree to ignore the removed node,
// then fix the chain of parents.
n->longest_ = child_longest;
FixLongest(parent);
// Work out which child slot of parent currently holds n.
if (parent->range_.start() > n->range_.start()) {
link = &parent->left_;
} else {
link = &parent->right_;
}
}
// A routine op we'll need a lot: given two (possibly null)
// children, put the root-ier one into top.
auto reorder_maybe = [](Node** top, Node** bottom) {
Node *b = *bottom, *t = *top;
if (b && (!t || t->prio_ < b->prio_)) {
*bottom = t;
*top = b;
}
};
reorder_maybe(&top, &bottom);
// if we have two treaps to merge (top is always non-null if bottom is)
// Invariant: top, bottom are two valid (longest included)
// treaps. parent (and all above/elsewhere) have the correct longest
// values, though parent does not have the correct children (will be the
// merged value of top and bottom.)
while (bottom) {
*link = top;
top->parent_ = parent;
// We're merging bottom into top, so top might contain a longer
// chunk than it thinks.
top->longest_ = std::max(top->longest_, bottom->longest_);
parent = top;
// Descend into the side of top where bottom's ranges belong.
if (bottom->range_.start() < top->range_.start()) {
link = &top->left_;
top = top->left_;
} else {
link = &top->right_;
top = top->right_;
}
reorder_maybe(&top, &bottom);
}
// Attach whatever single treap (possibly none) is left.
*link = top;
if (top) top->parent_ = parent;
Put(n);
}
// Returns node n to the freelist. The freelist is a singly linked stack
// threaded through the nodes' left_ pointers.
void HugeAddressMap::Put(Node* n) {
  // Push n on top of the freelist.
  n->left_ = freelist_;
  freelist_ = n;
  // One more free node, one less node in use.
  ++freelist_size_;
  --used_nodes_;
}
// Produces a node for range r with a priority drawn from rand_r(&seed_). The
// storage comes from the freelist when it is non-empty, and from the metadata
// allocator meta_ otherwise; either way the node is constructed in place.
HugeAddressMap::Node* HugeAddressMap::Get(HugeRange r) {
  CHECK_CONDITION((freelist_ == nullptr) == (freelist_size_ == 0));
  used_nodes_++;
  int prio = rand_r(&seed_);

  Node* storage;
  if (freelist_size_ == 0) {
    // Nothing to recycle: obtain fresh storage for a new node.
    total_nodes_++;
    storage = reinterpret_cast<Node*>(meta_(sizeof(Node)));
  } else {
    // Pop the head of the freelist (linked through left_).
    freelist_size_--;
    storage = freelist_;
    freelist_ = storage->left_;
  }
  return new (storage) Node(r, prio);
}
// Builds a node for range r with treap priority prio, stamping it with the
// current CycleClock value.  Only range_, prio_ and when_ are initialized
// here; the tree links and longest_ are left for the caller to set.
HugeAddressMap::Node::Node(HugeRange r, int prio)
: range_(r), prio_(prio), when_(absl::base_internal::CycleClock::Now()) {}
} // namespace tcmalloc_internal
} // namespace tcmalloc
GOOGLE_MALLOC_SECTION_END

View File

@ -0,0 +1,147 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef TCMALLOC_HUGE_ADDRESS_MAP_H_
#define TCMALLOC_HUGE_ADDRESS_MAP_H_
#include <stddef.h>
#include <stdint.h>
#include "tcmalloc/huge_pages.h"
#include "tcmalloc/internal/logging.h"
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc {
namespace tcmalloc_internal {
// Maintains a set of disjoint HugeRanges, merging adjacent ranges into one.
// Exposes a binary tree of free ranges ordered on address (a treap: every
// node carries a randomly chosen priority), augmented with the largest range
// in each subtree (this allows fairly simple allocation algorithms from the
// contained ranges).
//
// This class scales well and is *reasonably* performant, but it is not intended
// for use on extremely hot paths.
class HugeAddressMap {
public:
// Signature of the callback used to obtain raw storage for tree nodes.
typedef void* (*MetadataAllocFunction)(size_t bytes);
explicit constexpr HugeAddressMap(MetadataAllocFunction meta);
// IMPORTANT: DESTROYING A HUGE ADDRESS MAP DOES NOT MAKE ANY ATTEMPT
// AT FREEING ALLOCATED METADATA.
~HugeAddressMap() = default;
// One entry of the map: a single HugeRange plus the tree bookkeeping.
// Nodes can only be constructed by HugeAddressMap (private constructor).
class Node {
public:
// the range stored at this point
HugeRange range() const;
// Tree structure
Node* left();
Node* right();
// Iterate to the next node in address order
const Node* next() const;
Node* next();
// when were this node's content added (in
// absl::base_internal::CycleClock::Now units)?
int64_t when() const;
// What is the length of the longest range in the subtree rooted here?
HugeLength longest() const;
private:
Node(HugeRange r, int prio);
friend class HugeAddressMap;
HugeRange range_;
int prio_; // chosen randomly
Node *left_, *right_;
Node* parent_;
// Length of the longest range in the subtree rooted at this node.
HugeLength longest_;
// Creation timestamp, in absl::base_internal::CycleClock::Now units.
int64_t when_;
// Expensive, recursive consistency check.
// Accumulates node count and range sizes into passed arguments.
void Check(size_t* num_nodes, HugeLength* size) const;
// We've broken longest invariants somehow; fix them here.
void FixLongest();
};
// Get root of the tree.
Node* root();
const Node* root() const;
// Get lowest-addressed node
const Node* first() const;
Node* first();
// Returns the highest-addressed range that does not lie completely
// after p (if any).
Node* Predecessor(HugePage p);
// Expensive consistency check.
void Check();
// Statistics
size_t nranges() const;
HugeLength total_mapped() const;
void Print(Printer* out) const;
void PrintInPbtxt(PbtxtRegion* hpaa) const;
// Add <r> to the map, merging with adjacent ranges as needed.
void Insert(HugeRange r);
// Delete n from the map.
void Remove(Node* n);
private:
// our tree
Node* root_{nullptr};
// Number of nodes currently handed out by Get() and not yet Put() back.
size_t used_nodes_{0};
// Sum of the lengths of all ranges currently stored in the tree.
HugeLength total_size_{NHugePages(0)};
// cache of unused nodes
// (a singly linked list threaded through Node::left_)
Node* freelist_{nullptr};
size_t freelist_size_{0};
// How we get more
MetadataAllocFunction meta_;
// Obtain a node for r, reusing a cached node when one is available.
Node* Get(HugeRange r);
// Return n to the cache of unused nodes.
void Put(Node* n);
// Number of nodes ever obtained from meta_.
size_t total_nodes_{0};
void Merge(Node* b, HugeRange r, Node* a);
void FixLongest(Node* n);
// Note that we always use the same seed, currently; this isn't very random.
// In practice we're not worried about adversarial input and this works well
// enough.
unsigned int seed_{0};
};
// Inline definitions of the constructor and trivial accessors.

// Stores the metadata allocation callback; every other member keeps its
// in-class default initializer.
inline constexpr HugeAddressMap::HugeAddressMap(MetadataAllocFunction meta)
: meta_(meta) {}
// Plain field accessors for Node.
inline HugeRange HugeAddressMap::Node::range() const { return range_; }
inline HugeAddressMap::Node* HugeAddressMap::Node::left() { return left_; }
inline HugeAddressMap::Node* HugeAddressMap::Node::right() { return right_; }
inline int64_t HugeAddressMap::Node::when() const { return when_; }
inline HugeLength HugeAddressMap::Node::longest() const { return longest_; }
// Root of the tree; nullptr while the map is empty (root_ starts as nullptr).
inline HugeAddressMap::Node* HugeAddressMap::root() { return root_; }
inline const HugeAddressMap::Node* HugeAddressMap::root() const {
return root_;
}
} // namespace tcmalloc_internal
} // namespace tcmalloc
GOOGLE_MALLOC_SECTION_END
#endif // TCMALLOC_HUGE_ADDRESS_MAP_H_

View File

@ -0,0 +1,86 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "tcmalloc/huge_address_map.h"
#include <stddef.h>
#include <stdlib.h>
#include <vector>
#include "gmock/gmock.h"
#include "gtest/gtest.h"
namespace tcmalloc {
namespace tcmalloc_internal {
namespace {
// Fixture owning a HugeAddressMap whose node metadata comes from malloc().
// Every metadata allocation is recorded in a static vector so the fixture
// destructor can free it (the map itself never frees metadata).
class HugeAddressMapTest : public ::testing::Test {
protected:
// Start each test with an empty allocation record.
HugeAddressMapTest() : map_(MallocMetadata) { metadata_allocs_.clear(); }
~HugeAddressMapTest() override {
for (void* p : metadata_allocs_) {
free(p);
}
}
// Returns every range held by map_, in the order produced by first()/next().
std::vector<HugeRange> Contents() {
std::vector<HugeRange> ret;
auto node = map_.first();
while (node) {
ret.push_back(node->range());
node = node->next();
}
return ret;
}
// Shorthand constructors for hugepage indices and lengths.
HugePage hp(size_t i) { return {i}; }
HugeLength hl(size_t i) { return NHugePages(i); }
HugeAddressMap map_;
private:
// Metadata callback handed to map_: malloc plus bookkeeping for cleanup.
static void* MallocMetadata(size_t size) {
void* ptr = malloc(size);
metadata_allocs_.push_back(ptr);
return ptr;
}
// Static because MallocMetadata must be usable as a plain function pointer.
static std::vector<void*> metadata_allocs_;
};
// Out-of-class definition of the fixture's static allocation record.
std::vector<void*> HugeAddressMapTest::metadata_allocs_;
// This test verifies that HugeAddressMap merges properly.
// Three adjacent one-hugepage ranges are inserted with the middle one last:
// the two outer ranges must stay separate until the middle one arrives, at
// which point all three must be reported as a single joined range.
TEST_F(HugeAddressMapTest, Merging) {
const HugeRange r1 = HugeRange::Make(hp(0), hl(1));
const HugeRange r2 = HugeRange::Make(hp(1), hl(1));
const HugeRange r3 = HugeRange::Make(hp(2), hl(1));
const HugeRange all = Join(r1, Join(r2, r3));
map_.Insert(r1);
map_.Check();
EXPECT_THAT(Contents(), testing::ElementsAre(r1));
// r3 is not adjacent to r1, so no merge yet.
map_.Insert(r3);
map_.Check();
EXPECT_THAT(Contents(), testing::ElementsAre(r1, r3));
// r2 bridges the gap and must fuse with both neighbours.
map_.Insert(r2);
map_.Check();
EXPECT_THAT(Contents(), testing::ElementsAre(all));
}
} // namespace
} // namespace tcmalloc_internal
} // namespace tcmalloc

View File

@ -0,0 +1,174 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "tcmalloc/huge_allocator.h"
#include <string.h>
#include "tcmalloc/huge_address_map.h"
#include "tcmalloc/internal/logging.h"
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc {
namespace tcmalloc_internal {
// Writes a human-readable report to out: a heading, the contents of the free
// map, and a one-line summary of requested / in-use / free hugepage counts.
void HugeAllocator::Print(Printer* out) {
  out->printf("HugeAllocator: contiguous, unbacked hugepage(s)\n");
  free_.Print(out);
  // Free hugepages are whatever the system gave us that is not in use.
  const HugeLength unused = from_system_ - in_use_;
  out->printf(
      "HugeAllocator: %zu requested - %zu in use = %zu hugepages free\n",
      from_system_.raw_num(), in_use_.raw_num(), unused.raw_num());
}
// Emits the same information as Print() into the pbtxt region hpaa: the free
// map's own output followed by the total-requested and in-use hugepage
// counts.
void HugeAllocator::PrintInPbtxt(PbtxtRegion* hpaa) const {
free_.PrintInPbtxt(hpaa);
hpaa->PrintI64("num_total_requested_huge_pages", from_system_.raw_num());
hpaa->PrintI64("num_in_use_huge_pages", in_use_.raw_num());
}
// Searches the free map for a node whose range holds at least n hugepages.
// Follows a single root-to-leaf path, guided by each subtree's longest()
// value, and returns the smallest fitting node seen on that path, or nullptr
// if no node on the path fits.
HugeAddressMap::Node* HugeAllocator::Find(HugeLength n) {
HugeAddressMap::Node* curr = free_.root();
// Loop invariant (enforced by the loop condition):
// curr != nullptr && curr->longest >= n
// we favor smaller gaps and lower nodes and lower addresses, in that
// order. The net effect is that we are neither a best-fit nor a
// lowest-address allocator but vaguely close to both.
HugeAddressMap::Node* best = nullptr;
while (curr && curr->longest() >= n) {
// curr itself fits: keep it if it is strictly smaller than the best so far.
if (curr->range().len() >= n) {
if (!best || best->range().len() > curr->range().len()) {
best = curr;
}
}
// Either subtree could contain a better fit and we don't want to
// search the whole tree. Pick a reasonable child to look at.
auto left = curr->left();
auto right = curr->right();
// If one side cannot satisfy the request, the other side is the only
// candidate (and the loop condition rejects it if it cannot either).
if (!left || left->longest() < n) {
curr = right;
continue;
}
if (!right || right->longest() < n) {
curr = left;
continue;
}
// Here, we have a nontrivial choice.
if (left->range().len() == right->range().len()) {
// The child roots are equally long: prefer the subtree whose longest
// range is smaller, breaking ties towards the left (lower addresses).
if (left->longest() <= right->longest()) {
curr = left;
} else {
curr = right;
}
} else if (left->range().len() < right->range().len()) {
// The child roots differ in length...look
// in the subtree with the smaller root, as that's slightly
// more likely to be our best.
curr = left;
} else {
curr = right;
}
}
return best;
}
void HugeAllocator::CheckFreelist() {
free_.Check();
size_t num_nodes = free_.nranges();
HugeLength n = free_.total_mapped();
free_.Check();
CHECK_CONDITION(n == from_system_ - in_use_);
LargeSpanStats large;
AddSpanStats(nullptr, &large, nullptr);
CHECK_CONDITION(num_nodes == large.spans);
CHECK_CONDITION(n.in_pages() == large.returned_pages);
}
// Requests at least n hugepages from the system allocator callback.
// Returns HugeRange::Nil() when n overflows or the callback reports failure;
// otherwise returns the range actually granted (which may be longer than n)
// and adds its length to from_system_.
HugeRange HugeAllocator::AllocateRange(HugeLength n) {
  if (n.overflows()) {
    return HugeRange::Nil();
  }
  auto [base, granted_bytes] = allocate_(n.in_bytes(), kHugePageSize);
  if (base == nullptr) {
    // OOM...
    return HugeRange::Nil();
  }
  CHECK_CONDITION(base != nullptr);
  // It's possible for a request to return extra hugepages, but only ever a
  // whole number of them.
  CHECK_CONDITION(granted_bytes % kHugePageSize == 0);
  const HugeLength granted = HLFromBytes(granted_bytes);
  from_system_ += granted;
  return HugeRange::Make(HugePageContaining(base), granted);
}
// Returns a range of exactly n hugepages (n must be non-zero), taken from the
// free map when possible and from the system otherwise.  Returns an invalid
// range if the system allocation fails.
HugeRange HugeAllocator::Get(HugeLength n) {
CHECK_CONDITION(n > NHugePages(0));
auto* node = Find(n);
if (!node) {
// Get more memory, then "delete" it
HugeRange r = AllocateRange(n);
if (!r.valid()) return r;
// Release() subtracts r.len() from in_use_; pre-add it so the net effect
// of inserting the fresh range into the free map is zero.
in_use_ += r.len();
Release(r);
node = Find(n);
CHECK_CONDITION(node != nullptr);
}
in_use_ += n;
HugeRange r = node->range();
free_.Remove(node);
if (r.len() > n) {
// The chosen range is larger than needed: keep the first n hugepages and
// give the tail back to the free map.
HugeLength before = r.len();
HugeRange extra = HugeRange::Make(r.start() + n, before - n);
r = HugeRange::Make(r.start(), n);
ASSERT(r.precedes(extra));
ASSERT(r.len() + extra.len() == before);
// Same accounting trick as above: cancel out Release()'s decrement.
in_use_ += extra.len();
Release(extra);
} else {
// Release does this for us
DebugCheckFreelist();
}
return r;
}
// Returns range r to the free map: r's length is subtracted from in_use_ and
// r is inserted into free_ (which merges it with adjacent ranges).  In debug
// builds the freelist is then consistency-checked.
void HugeAllocator::Release(HugeRange r) {
in_use_ -= r.len();
free_.Insert(r);
DebugCheckFreelist();
}
// Accumulates statistics about every free range into the supplied outputs.
// Each range counts as one large span whose pages are all "returned"; its
// age is recorded into ages.  large and ages may each be null to skip that
// output; small is never written.
void HugeAllocator::AddSpanStats(SmallSpanStats* small, LargeSpanStats* large,
                                 PageAgeHistograms* ages) const {
  const HugeAddressMap::Node* node = free_.first();
  while (node != nullptr) {
    HugeLength len = node->range().len();
    if (large != nullptr) {
      large->spans++;
      large->returned_pages += len.in_pages();
    }
    if (ages != nullptr) {
      ages->RecordRange(len.in_pages(), true, node->when());
    }
    node = node->next();
  }
}
} // namespace tcmalloc_internal
} // namespace tcmalloc
GOOGLE_MALLOC_SECTION_END

View File

@ -0,0 +1,108 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Tracking information for the available range of hugepages,
// and a basic allocator for unmapped hugepages.
#ifndef TCMALLOC_HUGE_ALLOCATOR_H_
#define TCMALLOC_HUGE_ALLOCATOR_H_
#include <stddef.h>
#include "tcmalloc/common.h"
#include "tcmalloc/huge_address_map.h"
#include "tcmalloc/huge_pages.h"
#include "tcmalloc/stats.h"
#include "tcmalloc/system-alloc.h"
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc {
namespace tcmalloc_internal {
// these typedefs allow replacement of tcmalloc::System* for tests.
using MemoryAllocFunction = AddressRange (*)(size_t bytes, size_t align);
using MetadataAllocFunction = void* (*)(size_t bytes);
// This tracks available ranges of hugepages and fulfills requests for
// usable memory, allocating more from the system as needed. All
// hugepages are treated as (and assumed to be) unbacked.
class HugeAllocator {
public:
// allocate obtains hugepage-aligned address space from the system;
// meta_allocate supplies storage for the free map's nodes.
constexpr HugeAllocator(MemoryAllocFunction allocate,
MetadataAllocFunction meta_allocate)
: free_(meta_allocate), allocate_(allocate) {}
// Obtain a range of n unbacked hugepages, distinct from all other
// calls to Get (other than those that have been Released.)
HugeRange Get(HugeLength n);
// Returns a range of hugepages for reuse by subsequent Gets().
// REQUIRES: <r> is the return value (or a subrange thereof) of a previous
// call to Get(); neither <r> nor any overlapping range has been released
// since that Get().
void Release(HugeRange r);
// Total memory requested from the system, whether in use or not.
HugeLength system() const { return from_system_; }
// Unused memory in the allocator.
HugeLength size() const { return from_system_ - in_use_; }
// Adds one large span per free range to the given stats (see definition).
void AddSpanStats(SmallSpanStats* small, LargeSpanStats* large,
PageAgeHistograms* ages) const;
// Summarizes the allocator as BackingStats: everything unused is reported
// as unmapped, nothing as free.
BackingStats stats() const {
BackingStats s;
s.system_bytes = system().in_bytes();
s.free_bytes = 0;
s.unmapped_bytes = size().in_bytes();
return s;
}
void Print(Printer* out);
void PrintInPbtxt(PbtxtRegion* hpaa) const;
private:
// We're constrained in several ways by existing code. Hard requirements:
// * no radix tree or similar O(address space) external space tracking
// * support sub releasing
// * low metadata overhead
// * no pre-allocation.
// * reasonable space overhead
//
// We use a treap ordered on addresses to track. This isn't the most
// efficient thing ever but we're about to hit 100usec+/hugepage
// backing costs if we've gotten this far; the last few bits of performance
// don't matter, and most of the simple ideas can't hit all of the above
// requirements.
HugeAddressMap free_;
// Locate a free node able to hold n hugepages, or nullptr.
HugeAddressMap::Node* Find(HugeLength n);
// Expensive consistency check of free_ against the counters below.
void CheckFreelist();
// Runs CheckFreelist() only when NDEBUG is not defined.
void DebugCheckFreelist() {
#ifndef NDEBUG
CheckFreelist();
#endif
}
// Hugepages obtained from allocate_ so far.
HugeLength from_system_{NHugePages(0)};
// Hugepages currently handed out to callers of Get().
HugeLength in_use_{NHugePages(0)};
MemoryAllocFunction allocate_;
// Ask allocate_ for at least n hugepages; Nil() on failure.
HugeRange AllocateRange(HugeLength n);
};
} // namespace tcmalloc_internal
} // namespace tcmalloc
GOOGLE_MALLOC_SECTION_END
#endif // TCMALLOC_HUGE_ALLOCATOR_H_

View File

@ -0,0 +1,448 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "tcmalloc/huge_allocator.h"
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <algorithm>
#include <string>
#include <utility>
#include <vector>
#include "gtest/gtest.h"
#include "absl/base/internal/cycleclock.h"
#include "absl/random/random.h"
#include "absl/time/clock.h"
#include "absl/time/time.h"
#include "tcmalloc/huge_pages.h"
#include "tcmalloc/internal/config.h"
#include "tcmalloc/internal/logging.h"
namespace tcmalloc {
namespace tcmalloc_internal {
namespace {
// Fixture that drives a HugeAllocator against a fake system allocator.
// The bool test parameter selects whether the fake allocator returns one
// hugepage more than requested.  State shared with the static callbacks is
// kept in static members and reset by the constructor/destructor.
class HugeAllocatorTest : public testing::TestWithParam<bool> {
private:
// Use a tiny fraction of actual size so we can test aggressively.
static AddressRange AllocateFake(size_t bytes, size_t align);
// Upper bound on the number of entries AllocateFake lets backing_ grow to.
static constexpr size_t kMaxBacking = 1024 * 1024;
// This isn't super good form but we'll never have more than one HAT
// extant at once.
static std::vector<size_t> backing_;
// We use actual malloc for metadata allocations, but we track them so they
// can be deleted.
static void* MallocMetadata(size_t size);
static std::vector<void*> metadata_allocs_;
static size_t metadata_bytes_;
// When true, AllocateFake hands out one extra hugepage per request.
static bool should_overallocate_;
// Running totals of what was asked of, and granted by, AllocateFake.
static HugeLength huge_pages_requested_;
static HugeLength huge_pages_received_;
protected:
HugeLength HugePagesRequested() { return huge_pages_requested_; }
HugeLength HugePagesReceived() { return huge_pages_received_; }
HugeAllocatorTest() {
should_overallocate_ = GetParam();
huge_pages_requested_ = NHugePages(0);
huge_pages_received_ = NHugePages(0);
// We don't use the first few bytes, because things might get weird
// given zero pointers.
backing_.resize(1024);
metadata_bytes_ = 0;
}
~HugeAllocatorTest() override {
for (void* p : metadata_allocs_) {
free(p);
}
metadata_allocs_.clear();
backing_.clear();
}
// Maps a fake hugepage to the single word of backing_ that stands in for it.
size_t* GetActual(HugePage p) { return &backing_[p.index()]; }
// We're dealing with a lot of memory, so we don't want to do full memset
// and then check every byte for corruption. Instead each hugepage is
// represented by one word of backing_ (see GetActual): MarkPages writes a
// label into it and CheckPages verifies the label is still there.
void CheckPages(HugeRange r, size_t c) {
for (HugePage p = r.first; p < r.first + r.n; ++p) {
EXPECT_EQ(c, *GetActual(p));
}
}
void MarkPages(HugeRange r, size_t c) {
for (HugePage p = r.first; p < r.first + r.n; ++p) {
*GetActual(p) = c;
}
}
// Verifies that the allocator's view of system memory matches what
// AllocateFake granted and that exactly expected_use hugepages are in use.
void CheckStats(HugeLength expected_use) {
const HugeLength received = HugePagesReceived();
EXPECT_EQ(received, allocator_.system());
HugeLength used = received - allocator_.size();
EXPECT_EQ(used, expected_use);
}
HugeAllocator allocator_{AllocateFake, MallocMetadata};
};
// Fake system allocator: use a tiny fraction of actual size so we can test
// aggressively.
//
// bytes and align must both be multiples of kHugePageSize.  Instead of real
// memory, each "allocated" hugepage is represented by one word appended to
// backing_; the returned pointer is simply the backing_ index scaled by
// kHugePageSize.  Updates the requested/received counters and, when
// should_overallocate_ is set, grants one hugepage more than requested.
// Returns {nullptr, 0} once backing_ would exceed kMaxBacking entries.
AddressRange HugeAllocatorTest::AllocateFake(size_t bytes, size_t align) {
  CHECK_CONDITION(bytes % kHugePageSize == 0);
  CHECK_CONDITION(align % kHugePageSize == 0);
  HugeLength req = HLFromBytes(bytes);
  huge_pages_requested_ += req;
  // Test the case where our sys allocator provides too much.
  if (should_overallocate_) ++req;
  huge_pages_received_ += req;
  // we'll actually provide hidden backing, one word per hugepage.
  // From here on, bytes and align are counted in hugepages (backing_ slots).
  bytes = req / NHugePages(1);
  align /= kHugePageSize;
  size_t index = backing_.size();
  // Round index up to the next multiple of align.  The remainder must be
  // taken with '%'; a bitwise '&' does not compute the misalignment.
  const size_t misalignment = index % align;
  if (misalignment != 0) {
    index += align - misalignment;
  }
  if (index + bytes > kMaxBacking) return {nullptr, 0};
  backing_.resize(index + bytes);
  void* ptr = reinterpret_cast<void*>(index * kHugePageSize);
  return {ptr, req.in_bytes()};
}
// We use actual malloc for metadata allocations, but we track them so they
// can be deleted.
// Also adds size to the running metadata_bytes_ total.  The pointer is
// recorded as returned by malloc, without a null check.
void* HugeAllocatorTest::MallocMetadata(size_t size) {
metadata_bytes_ += size;
void* ptr = malloc(size);
metadata_allocs_.push_back(ptr);
return ptr;
}
// Out-of-class definitions for the fixture's static state.  The fixture
// constructor resets the counters, the overallocation flag and backing_
// before every test.
std::vector<size_t> HugeAllocatorTest::backing_;
std::vector<void*> HugeAllocatorTest::metadata_allocs_;
size_t HugeAllocatorTest::metadata_bytes_;
bool HugeAllocatorTest::should_overallocate_;
HugeLength HugeAllocatorTest::huge_pages_requested_;
HugeLength HugeAllocatorTest::huge_pages_received_;
// Randomized smoke test: fill kSize slots with random-length allocations,
// then repeatedly release a random slot and replace it with a new
// allocation, checking labels and usage statistics at every step.
TEST_P(HugeAllocatorTest, Basic) {
// Each entry pairs a live range with the label written into its pages.
std::vector<std::pair<HugeRange, size_t>> allocs;
absl::BitGen rng;
size_t label = 0;
HugeLength total = NHugePages(0);
static const size_t kSize = 1000;
HugeLength peak = total;
// Phase 1: populate all kSize slots.
for (int i = 0; i < kSize; ++i) {
HugeLength len =
NHugePages(absl::LogUniform<int32_t>(rng, 0, (1 << 12) - 1) + 1);
auto r = allocator_.Get(len);
ASSERT_TRUE(r.valid());
total += len;
peak = std::max(peak, total);
CheckStats(total);
MarkPages(r, label);
allocs.push_back({r, label});
label++;
}
// Phase 2: churn. Swap a random victim into the last slot, release it, and
// refill the last slot with a fresh allocation.
for (int i = 0; i < 1000 * 25; ++i) {
size_t index = absl::Uniform<int32_t>(rng, 0, kSize);
std::swap(allocs[index], allocs[kSize - 1]);
auto p = allocs[kSize - 1];
// The victim's pages must still carry its label.
CheckPages(p.first, p.second);
total -= p.first.len();
allocator_.Release(p.first);
CheckStats(total);
HugeLength len =
NHugePages(absl::LogUniform<int32_t>(rng, 0, (1 << 12) - 1) + 1);
auto r = allocator_.Get(len);
ASSERT_TRUE(r.valid());
ASSERT_EQ(r.len(), len);
total += len;
peak = std::max(peak, total);
CheckStats(total);
MarkPages(r, label);
allocs[kSize - 1] = {r, label};
label++;
}
// Phase 3: verify and release everything that is still live.
for (auto p : allocs) {
CheckPages(p.first, p.second);
allocator_.Release(p.first);
}
}
// Check that releasing small chunks of allocations works OK.
// Each round takes kLen-sized ranges, immediately releases a prefix of
// 1..kLen-1 hugepages from each, and later releases the remaining suffixes.
// The amount requested from the system must equal kTotal after every round,
// i.e. rounds after the first must be served entirely from released space.
TEST_P(HugeAllocatorTest, Subrelease) {
size_t label = 1;
const HugeLength kLen = NHugePages(8);
// One kLen-sized Get per prefix length: (kLen - 1) Gets of kLen hugepages.
const HugeLength kTotal = kLen * (kLen / NHugePages(1) - 1);
for (int i = 0; i < 100; ++i) {
std::vector<std::pair<HugeRange, size_t>> allocs;
// get allocs of kLen and release different sized sub-chunks of them -
// make sure that doesn't break anything else.
for (HugeLength j = NHugePages(1); j < kLen; ++j) {
auto r = allocator_.Get(kLen);
ASSERT_TRUE(r.valid());
MarkPages(r, label);
// Give back the first j hugepages, keep (and remember) the rest.
allocator_.Release({r.start(), j});
allocs.push_back({{r.start() + j, kLen - j}, label});
label++;
}
EXPECT_EQ(kTotal, HugePagesRequested());
for (auto p : allocs) {
CheckPages(p.first, p.second);
allocator_.Release(p.first);
}
}
}
// Does subreleasing work OK for absurdly large allocations?
// Allocates ranges of doubling size up to kLimit, releasing a random leading
// chunk of each, then keeps making small allocations until one of them
// forces a new request to the system, and finally verifies and releases
// everything.
TEST_P(HugeAllocatorTest, SubreleaseLarge) {
absl::BitGen rng;
std::vector<std::pair<HugeRange, size_t>> allocs;
size_t label = 1;
const HugeLength kLimit = HLFromBytes(1024ul * 1024 * 1024 * 1024);
for (HugeLength n = NHugePages(2); n < kLimit; n *= 2) {
auto r = allocator_.Get(n);
ASSERT_TRUE(r.valid());
MarkPages(r, label);
// chunk of less than half
HugeLength chunk =
NHugePages(absl::Uniform<int32_t>(rng, 0, n / NHugePages(2)) + 1);
// Release the leading chunk and keep tracking the remainder.
allocator_.Release({r.start(), chunk});
allocs.push_back({{r.start() + chunk, n - chunk}, label});
label++;
}
// reuse the released space
// The loop ends as soon as an allocation makes the requested total grow.
const HugeLength total = HugePagesRequested();
while (total == HugePagesRequested()) {
HugeLength n =
NHugePages(absl::LogUniform<int32_t>(rng, 0, (1 << 8) - 1) + 1);
auto r = allocator_.Get(n);
ASSERT_TRUE(r.valid());
MarkPages(r, label);
allocs.push_back({r, label});
label++;
}
for (auto p : allocs) {
CheckPages(p.first, p.second);
allocator_.Release(p.first);
}
}
// We don't care *that* much about vaddress space, but let's not be crazy.
// Don't fill tiny requests from big spaces.
// After priming the allocator with a mix of free single hugepages and one
// free region of `big` hugepages, each repetition takes `slots` single
// hugepages and then checks that a `big` allocation can still be served
// without the allocator requesting more memory from the system.
TEST_P(HugeAllocatorTest, Fragmentation) {
// Prime the pump with some random allocations.
absl::BitGen rng;
// Ranges that will be released at the end of the current step.
std::vector<HugeRange> free;
constexpr int kSlots = 50;
// Plan to insert a large allocation at the big_slot'th index, then free it
// during the initial priming step (so we have at least a contiguous region of
// at least big hugepages).
HugeLength big = NHugePages(8);
const int big_slot = absl::Uniform(rng, 0, kSlots);
for (int i = 0; i < kSlots; ++i) {
if (i == big_slot) {
auto r = allocator_.Get(big);
ASSERT_TRUE(r.valid());
free.push_back(r);
}
auto r = allocator_.Get(NHugePages(1));
ASSERT_TRUE(r.valid());
// Roughly half of the single hugepages are queued for release; the others
// stay allocated for the rest of the test.
if (absl::Bernoulli(rng, 1.0 / 2)) {
free.push_back(r);
}
}
// Number of single-hugepage ranges queued for release (free also holds the
// one big range, hence the - 1).
size_t slots = free.size() - 1;
for (auto r : free) {
allocator_.Release(r);
}
free.clear();
static const size_t kReps = 5;
for (int i = 0; i < kReps; ++i) {
SCOPED_TRACE(i);
// Ensure we have a range of this size.
HugeRange r = allocator_.Get(big);
ASSERT_TRUE(r.valid());
if (NHugePages(slots) > allocator_.size()) {
// We should also have slots pages left over after allocating big
for (int i = 0; i < slots; ++i) {
HugeRange f = allocator_.Get(NHugePages(1));
ASSERT_TRUE(f.valid());
free.push_back(f);
}
for (auto f : free) {
allocator_.Release(f);
}
free.clear();
}
allocator_.Release(r);
// We should definitely have at least this many small spaces...
for (int i = 0; i < slots; ++i) {
r = allocator_.Get(NHugePages(1));
ASSERT_TRUE(r.valid());
free.push_back(r);
}
// that don't interfere with the available big space.
// system() must not change across the big Get.
auto before = allocator_.system();
r = allocator_.Get(big);
ASSERT_TRUE(r.valid());
EXPECT_EQ(before, allocator_.system());
allocator_.Release(r);
for (auto r : free) {
allocator_.Release(r);
}
free.clear();
// Next repetition: the old big region counts as small slots and the big
// request doubles.
slots += big.raw_num();
big += big;
}
}
// Check that we only request as much as we actually need from the system.
// Allocates 1, 2, ..., kSize - 1 hugepages without ever releasing and
// expects the amount requested from the system to equal the running sum.
TEST_P(HugeAllocatorTest, Frugal) {
HugeLength total = NHugePages(0);
static const size_t kSize = 1000;
for (int i = 1; i < kSize; ++i) {
HugeLength len = NHugePages(i);
// toss the range, we ain't using it
ASSERT_TRUE(allocator_.Get(len).valid());
total += len;
CheckStats(total);
EXPECT_EQ(total, HugePagesRequested());
}
}
// Verifies AddSpanStats(): span count, returned page count and average age
// of the free ranges as pieces of an 8-hugepage allocation are released one
// after another with sleeps in between.
TEST_P(HugeAllocatorTest, Stats) {
struct Helper {
// Collects the allocator's span stats and extracts the three values the
// test cares about.  Also asserts that nothing is reported as a small
// span or as normal (non-returned) pages.
static void Stats(const HugeAllocator* huge, size_t* num_spans,
Length* pages, absl::Duration* avg_age) {
SmallSpanStats small;
LargeSpanStats large;
PageAgeHistograms ages(absl::base_internal::CycleClock::Now());
huge->AddSpanStats(&small, &large, &ages);
for (auto i = Length(0); i < kMaxPages; ++i) {
EXPECT_EQ(0, small.normal_length[i.raw_num()]);
EXPECT_EQ(0, small.returned_length[i.raw_num()]);
}
*num_spans = large.spans;
EXPECT_EQ(Length(0), large.normal_pages);
*pages = large.returned_pages;
const PageAgeHistograms::Histogram* hist = ages.GetTotalHistogram(true);
*avg_age = absl::Seconds(hist->avg_age());
}
};
if (GetParam()) {
// Ensure overallocation doesn't skew our measurements below.
allocator_.Release(allocator_.Get(NHugePages(7)));
}
const HugeRange r = allocator_.Get(NHugePages(8));
ASSERT_TRUE(r.valid());
const HugePage p = r.start();
// Break it into 3 ranges, separated by one-page regions,
// so we can easily track the internal state in stats.
// r1/r2/r3 are 1, 2 and 3 hugepages long; b1 and b2 are the separators.
const HugeRange r1 = {p, NHugePages(1)};
const HugeRange b1 = {p + NHugePages(1), NHugePages(1)};
const HugeRange r2 = {p + NHugePages(2), NHugePages(2)};
const HugeRange b2 = {p + NHugePages(4), NHugePages(1)};
const HugeRange r3 = {p + NHugePages(5), NHugePages(3)};
size_t num_spans;
Length pages;
absl::Duration avg_age;
// Nothing has been released yet: no spans, no pages, zero age.
Helper::Stats(&allocator_, &num_spans, &pages, &avg_age);
EXPECT_EQ(0, num_spans);
EXPECT_EQ(Length(0), pages);
EXPECT_EQ(absl::ZeroDuration(), avg_age);
allocator_.Release(r1);
constexpr absl::Duration kDelay = absl::Milliseconds(500);
absl::SleepFor(kDelay);
Helper::Stats(&allocator_, &num_spans, &pages, &avg_age);
EXPECT_EQ(1, num_spans);
EXPECT_EQ(NHugePages(1).in_pages(), pages);
// We can only do >= testing, because we might be arbitrarily delayed.
// Since avg_age is computed in floating point, we may have round-off from
// TCMalloc's internal use of absl::base_internal::CycleClock down through
// computing the average age of the spans. kEpsilon allows for a tiny amount
// of slop.
constexpr absl::Duration kEpsilon = absl::Microseconds(500);
EXPECT_LE(kDelay - kEpsilon, avg_age);
allocator_.Release(r2);
absl::SleepFor(absl::Milliseconds(250));
Helper::Stats(&allocator_, &num_spans, &pages, &avg_age);
EXPECT_EQ(2, num_spans);
EXPECT_EQ(NHugePages(3).in_pages(), pages);
// Expected lower bound: ages weighted by range length in hugepages.
EXPECT_LE(
(absl::Seconds(0.75) * 1 + absl::Seconds(0.25) * 2) / (1 + 2) - kEpsilon,
avg_age);
allocator_.Release(r3);
absl::SleepFor(absl::Milliseconds(125));
Helper::Stats(&allocator_, &num_spans, &pages, &avg_age);
EXPECT_EQ(3, num_spans);
EXPECT_EQ(NHugePages(6).in_pages(), pages);
EXPECT_LE((absl::Seconds(0.875) * 1 + absl::Seconds(0.375) * 2 +
absl::Seconds(0.125) * 3) /
(1 + 2 + 3) -
kEpsilon,
avg_age);
// Releasing the separators lets everything coalesce into a single span.
allocator_.Release(b1);
allocator_.Release(b2);
absl::SleepFor(absl::Milliseconds(100));
Helper::Stats(&allocator_, &num_spans, &pages, &avg_age);
EXPECT_EQ(1, num_spans);
EXPECT_EQ(NHugePages(8).in_pages(), pages);
EXPECT_LE((absl::Seconds(0.975) * 1 + absl::Seconds(0.475) * 2 +
absl::Seconds(0.225) * 3 + absl::Seconds(0.1) * 2) /
(1 + 2 + 3 + 2) -
kEpsilon,
avg_age);
}
// Make sure we're well-behaved in the presence of OOM (and that we do
// OOM at some point...)
// Keeps doubling the request size (never releasing) until Get() returns an
// invalid range; the test passes as long as that happens without a crash.
TEST_P(HugeAllocatorTest, OOM) {
HugeLength n = NHugePages(1);
while (allocator_.Get(n).valid()) {
n *= 2;
}
}
// Run every HugeAllocatorTest twice: once with an exact fake system
// allocator ("normal") and once with one that returns an extra hugepage per
// request ("overallocates").  The lambda supplies the test-name suffix.
INSTANTIATE_TEST_SUITE_P(
NormalOverAlloc, HugeAllocatorTest, testing::Values(false, true),
+[](const testing::TestParamInfo<bool>& info) {
return info.param ? "overallocates" : "normal";
});
} // namespace
} // namespace tcmalloc_internal
} // namespace tcmalloc

View File

@ -0,0 +1,497 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "tcmalloc/huge_cache.h"
#include <algorithm>
#include <tuple>
#include "absl/time/time.h"
#include "tcmalloc/common.h"
#include "tcmalloc/huge_address_map.h"
#include "tcmalloc/huge_pages.h"
#include "tcmalloc/internal/logging.h"
#include "tcmalloc/stats.h"
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc {
namespace tcmalloc_internal {
// Records a new observation by forwarding val to the underlying timeseries.
template <size_t kEpochs>
void MinMaxTracker<kEpochs>::Report(HugeLength val) {
timeseries_.Report(val);
}
// Returns the largest per-epoch maximum among the most recent epochs that
// cover duration t (t / kEpochLength, rounded up).  Yields NHugePages(0)
// when no visited epoch holds a larger value.
template <size_t kEpochs>
HugeLength MinMaxTracker<kEpochs>::MaxOverTime(absl::Duration t) const {
  size_t num_epochs = ceil(absl::FDivDuration(t, kEpochLength));
  HugeLength largest = NHugePages(0);
  // Fold each visited epoch's maximum into the running result.
  auto visit = [&largest](size_t offset, int64_t ts, const Extrema& e) {
    largest = std::max(largest, e.max);
  };
  timeseries_.IterBackwards(visit, num_epochs);
  return largest;
}
// Returns the smallest per-epoch minimum among the most recent epochs that
// cover duration t (t / kEpochLength, rounded up).  Yields kMaxVal when no
// visited epoch holds a smaller value.
template <size_t kEpochs>
HugeLength MinMaxTracker<kEpochs>::MinOverTime(absl::Duration t) const {
  size_t num_epochs = ceil(absl::FDivDuration(t, kEpochLength));
  HugeLength smallest = kMaxVal;
  // Fold each visited epoch's minimum into the running result.
  auto visit = [&smallest](size_t offset, int64_t ts, const Extrema& e) {
    smallest = std::min(smallest, e.min);
  };
  timeseries_.IterBackwards(visit, num_epochs);
  return smallest;
}
// Writes the tracker's history to out in text form: a header line giving the
// epoch length (in milliseconds) and epoch count, followed by one
// "offset:min:max," triple per non-empty epoch, with a fresh
// "HugeCache: Usage timeseries" line started every 100 entries.
template <size_t kEpochs>
void MinMaxTracker<kEpochs>::Print(Printer* out) const {
  // Prints timestamp:min_pages:max_pages for each window with records.
  // Timestamp == kEpochs - 1 is the most recent measurement.
  // %lld requires a long long argument; int64_t is not guaranteed to be that
  // type, so convert explicitly.
  const long long millis =
      static_cast<long long>(absl::ToInt64Milliseconds(kEpochLength));
  out->printf("\nHugeCache: window %lldms * %zu", millis, kEpochs);
  int written = 0;
  timeseries_.Iter(
      [&](size_t offset, int64_t ts, const Extrema& e) {
        if ((written++) % 100 == 0)
          out->printf("\nHugeCache: Usage timeseries ");
        // Both extrema are unsigned hugepage counts, so both use %zu.
        out->printf("%zu:%zu:%zu,", offset, e.min.raw_num(), e.max.raw_num());
      },
      timeseries_.kSkipEmptyEntries);
  out->printf("\n");
}
// Emits the tracker's history into a "huge_cache_history" sub-region of
// hpaa: the epoch length in milliseconds, the epoch count, and one
// "measurements" sub-region (epoch offset, min and max in bytes) per
// non-empty epoch.
template <size_t kEpochs>
void MinMaxTracker<kEpochs>::PrintInPbtxt(PbtxtRegion* hpaa) const {
// Prints content of each non-empty epoch, from oldest to most recent data
auto huge_cache_history = hpaa->CreateSubRegion("huge_cache_history");
huge_cache_history.PrintI64("window_ms",
absl::ToInt64Milliseconds(kEpochLength));
huge_cache_history.PrintI64("epochs", kEpochs);
timeseries_.Iter(
[&](size_t offset, int64_t ts, const Extrema& e) {
auto m = huge_cache_history.CreateSubRegion("measurements");
m.PrintI64("epoch", offset);
m.PrintI64("min_bytes", e.min.in_bytes());
m.PrintI64("max_bytes", e.max.in_bytes());
},
timeseries_.kSkipEmptyEntries);
}
// Two Extrema are equal when both their maxima and their minima match.
template <size_t kEpochs>
bool MinMaxTracker<kEpochs>::Extrema::operator==(const Extrema& other) const {
  if (!(other.max == max)) return false;
  return other.min == min;
}
// Explicit instantiations of template
// The member functions above are defined in this source file, so the two
// instantiations listed here (the default epoch count and 600 epochs) are
// the ones this file emits code for.
template class MinMaxTracker<>;
template class MinMaxTracker<600>;
// The logic for actually allocating from the cache or backing, and keeping
// the hit rates specified.
//
// Returns a range of n hugepages.  On a cache hit the range is carved out of
// a cached range and *from_released is set to false.  On a miss the request
// is forwarded to allocator_ and *from_released is set to true only if that
// allocation succeeded (it is left untouched on failure).
HugeRange HugeCache::DoGet(HugeLength n, bool* from_released) {
auto* node = Find(n);
if (!node) {
// Miss: nothing cached is big enough.
misses_++;
weighted_misses_ += n.raw_num();
HugeRange res = allocator_->Get(n);
if (res.valid()) {
*from_released = true;
}
return res;
}
// Hit: account for it and shrink the recorded cache size by n.
hits_++;
weighted_hits_ += n.raw_num();
*from_released = false;
size_ -= n;
UpdateSize(size());
HugeRange result, leftover;
// Put back whatever we have left (or nothing, if it's exact.)
std::tie(result, leftover) = Split(node->range(), n);
cache_.Remove(node);
if (leftover.valid()) {
cache_.Insert(leftover);
}
return result;
}
// Called after a cache miss. Raises limit_ (never lowers it) when recent
// history suggests a bigger cache would have covered a short-lived drop in
// usage. <missed> is not consulted by the current implementation.
void HugeCache::MaybeGrowCacheLimit(HugeLength missed) {
  // Target: cache size == the largest "brief dip".
  //
  // A "dip" is usage shrinking and then climbing back towards its previous
  // level (at least partially). "Brief" means it recovers in < kCacheTime;
  // i.e. we are willing to hold memory for about kCacheTime before expecting
  // it to be reused, and we are loose about the exact timing.
  //
  // Downward slope: how far below the recent peak we have been. (In theory
  // this could be as much as 2 * kCacheTime old, which is fine.)
  const HugeLength recent_drop = off_peak_tracker_.MaxOverTime(kCacheTime);
  // Upward slope: how far we have climbed back from the recent minimum.
  const HugeLength recent_rise =
      usage_ - usage_tracker_.MinOverTime(kCacheTime);

  // Ideally these two describe a drop followed by a rise, but the stats
  // cannot guarantee the ordering. Consider usage going (in essentially zero
  // time) through 0, 10000, 5000, 5500: the proven dip is 500, yet we compute
  // a drop of 5000 and a rise of 5500. We would prefer to measure the rise
  // from a minimum taken *after* the drop; hopefully this case is rare.
  // TODO(b/134690209): figure out if we can solve that problem.
  const HugeLength dip = std::min(recent_drop, recent_rise);

  // Fragmentation means we may need a little more than the usage jump itself;
  // 10% extra seems reasonable: little waste, good performance on tests.
  const HugeLength candidate = dip + dip / 10;
  if (candidate <= limit()) return;

  last_limit_change_ = clock_.now();
  limit_ = candidate;
}
// Adds <n> to the outstanding usage and reports the new total to the usage
// and detailed trackers; the off-peak tracker is given zero.
void HugeCache::IncUsage(HugeLength n) {
  usage_ += n;
  const HugeLength in_use = usage_;
  usage_tracker_.Report(in_use);
  detailed_tracker_.Report(in_use);
  const HugeLength no_gap = NHugePages(0);
  off_peak_tracker_.Report(no_gap);
}
// Subtracts <n> from the outstanding usage, reports the new total, and tells
// the off-peak tracker how far usage now sits below its maximum over the
// last kCacheTime.
void HugeCache::DecUsage(HugeLength n) {
  usage_ -= n;
  usage_tracker_.Report(usage_);
  detailed_tracker_.Report(usage_);

  const HugeLength recent_peak = usage_tracker_.MaxOverTime(kCacheTime);
  ASSERT(recent_peak >= usage_);
  off_peak_tracker_.Report(recent_peak - usage_);
}
// Reports <size> to the size tracker and, if the clock has moved forward
// since the last update, adds size * elapsed ticks to regret_.
void HugeCache::UpdateSize(HugeLength size) {
  size_tracker_.Report(size);
  // TODO(b/134691947): moving this inside the MinMaxTracker would save one call
  // to clock_.now() but all MinMaxTrackers would track regret instead.
  const int64_t ticks_now = clock_.now();
  if (ticks_now <= last_regret_update_) return;

  const int64_t elapsed = ticks_now - last_regret_update_;
  regret_ += size.raw_num() * elapsed;
  last_regret_update_ = ticks_now;
}
// Hands out <n> contiguous hugepages via DoGet(), adds the returned length to
// the usage accounting, and on a successful miss gives the cache limit a
// chance to grow.
HugeRange HugeCache::Get(HugeLength n, bool* from_released) {
  HugeRange range = DoGet(n, from_released);
  // Failing to get a range should "never" happen (VSS limits or wildly
  // incorrect allocation sizes only...), so the cache size accounting does
  // not special-case it.
  IncUsage(range.len());
  if (range.valid() && *from_released) {
    MaybeGrowCacheLimit(n);
  }
  return range;
}
// Returns <r> to the cache, counts the release as a fill or an overflow
// depending on whether the cache is still within its limit, and then trims
// the cache back down to the (possibly just lowered) limit.
void HugeCache::Release(HugeRange r) {
  DecUsage(r.len());
  cache_.Insert(r);
  size_ += r.len();

  size_t& outcome = (size_ <= limit()) ? fills_ : overflows_;
  ++outcome;

  // Shrink the limit, if we're going to do it, before we shrink to
  // the max size. (This could reduce the number of regions we break
  // in half to avoid overshrinking.)
  const int64_t since_limit_change = clock_.now() - last_limit_change_;
  if (since_limit_change > (cache_time_ticks_ * 2)) {
    total_fast_unbacked_ += MaybeShrinkCacheLimit();
  }
  total_fast_unbacked_ += ShrinkCache(limit());
  UpdateSize(size());
}
// Like Release(), but bypasses the cache: usage is decremented and the range
// goes straight back to allocator_, since there is no point in caching it.
void HugeCache::ReleaseUnbacked(HugeRange r) {
  const HugeLength returned = r.len();
  DecUsage(returned);
  allocator_->Release(r);
}
// Lowers limit_ if the cache has stayed well-filled over the last
// 2 * kCacheTime, then evicts down to the new limit. Returns the number of
// hugepages evicted (zero when the limit is left alone). Always stamps
// last_limit_change_ with the current clock reading.
HugeLength HugeCache::MaybeShrinkCacheLimit() {
  last_limit_change_ = clock_.now();

  const HugeLength recent_floor = size_tracker_.MinOverTime(kCacheTime * 2);
  // If cache size has gotten down to at most 20% of max, we assume
  // we're close enough to the optimal size--we don't want to fiddle
  // too much/too often unless we have large gaps in usage.
  if (recent_floor < limit() / 5) {
    return NHugePages(0);
  }

  // Take away half of the unused portion (at least one hugepage), without
  // underflowing and without going below MinCacheLimit().
  const HugeLength drop = std::max(recent_floor / 2, NHugePages(1));
  HugeLength lowered = NHugePages(0);
  if (limit() > drop) {
    lowered = limit() - drop;
  }
  limit_ = std::max(lowered, MinCacheLimit());
  return ShrinkCache(limit());
}
// Evicts ranges from the cache until size_ is at most <target>. Each evicted
// range is unbacked through unback_ and then handed to allocator_. Returns the
// total length actually removed; this is less than requested if unback_
// reports failure, in which case the loop stops and the range stays cached.
HugeLength HugeCache::ShrinkCache(HugeLength target) {
HugeLength removed = NHugePages(0);
while (size_ > target) {
// Remove smallest-ish nodes, to avoid fragmentation where possible.
auto* node = Find(NHugePages(1));
// size_ > target >= 0 here, so a node is expected to exist.
CHECK_CONDITION(node);
HugeRange r = node->range();
cache_.Remove(node);
// Suppose we're 10 MiB over target but the smallest available node
// is 100 MiB. Don't go overboard--split up the range.
// In particular - this prevents disastrous results if we've decided
// the cache should be 99 MiB but the actual hot usage is 100 MiB
// (and it is unfragmented).
const HugeLength delta = size() - target;
if (r.len() > delta) {
HugeRange to_remove, leftover;
std::tie(to_remove, leftover) = Split(r, delta);
ASSERT(leftover.valid());
// Keep the part we do not need to evict; only `delta` is released.
cache_.Insert(leftover);
r = to_remove;
}
// Account for the eviction up front; rolled back below on failure.
size_ -= r.len();
// Note, actual unback implementation is temporarily dropping and
// re-acquiring the page heap lock here.
if (ABSL_PREDICT_FALSE(!unback_(r.start_addr(), r.byte_len()))) {
// We failed to release r. Retain it in the cache instead of returning it
// to the HugeAllocator.
size_ += r.len();
cache_.Insert(r);
break;
}
allocator_->Release(r);
removed += r.len();
}
return removed;
}
// Releases up to <n> hugepages of cached memory: first whatever a limit
// shrink evicts, then, if that was not enough, an explicit shrink for the
// remainder. Returns the number of hugepages released and adds it to the
// periodic-unback total.
HugeLength HugeCache::ReleaseCachedPages(HugeLength n) {
  // This is a good time to check: is our cache going persistently unused?
  HugeLength total = MaybeShrinkCacheLimit();
  if (total < n) {
    const HugeLength still_wanted = n - total;
    HugeLength target = NHugePages(0);
    if (!(still_wanted > size())) {
      target = size() - still_wanted;
    }
    total += ShrinkCache(target);
  }
  UpdateSize(size());
  total_periodic_unbacked_ += total;
  return total;
}
// Walks every cached range and, for each one, adds a span and its page count
// to <large> (if non-null) and records its size and timestamp in <ages> (if
// non-null). <small> is not touched.
void HugeCache::AddSpanStats(SmallSpanStats* small, LargeSpanStats* large,
                             PageAgeHistograms* ages) const {
  static_assert(kPagesPerHugePage >= kMaxPages);
  const HugeAddressMap::Node* node = cache_.first();
  while (node != nullptr) {
    const HugeLength len = node->range().len();
    if (large != nullptr) {
      large->spans++;
      large->normal_pages += len.in_pages();
    }
    if (ages != nullptr) {
      ages->RecordRange(len.in_pages(), false, node->when());
    }
    node = node->next();
  }
}
// Walks the cache's tree from the root looking for a node whose range holds
// at least <n> hugepages, and returns the shortest such node seen on the path
// walked (nullptr if none). Only one child is followed per level, so the
// result is a good fit rather than a guaranteed best fit.
HugeAddressMap::Node* HugeCache::Find(HugeLength n) {
HugeAddressMap::Node* curr = cache_.root();
// invariant: curr != nullptr && curr->longest >= n
// we favor smaller gaps and lower nodes and lower addresses, in that
// order. The net effect is that we are neither a best-fit nor a
// lowest-address allocator but vaguely close to both.
HugeAddressMap::Node* best = nullptr;
while (curr && curr->longest() >= n) {
// Candidate: this node's own range is big enough; keep it if it is
// strictly shorter than the best seen so far.
if (curr->range().len() >= n) {
if (!best || best->range().len() > curr->range().len()) {
best = curr;
}
}
// Either subtree could contain a better fit and we don't want to
// search the whole tree. Pick a reasonable child to look at.
auto left = curr->left();
auto right = curr->right();
// If one side cannot satisfy the request, the other side is the only
// option (it may itself be null or too small, which ends the loop).
if (!left || left->longest() < n) {
curr = right;
continue;
}
if (!right || right->longest() < n) {
curr = left;
continue;
}
// Here, we have a nontrivial choice.
if (left->range().len() == right->range().len()) {
// Child roots are equally long: prefer the subtree with the smaller
// longest() value, taking the left one on a tie.
if (left->longest() <= right->longest()) {
curr = left;
} else {
curr = right;
}
} else if (left->range().len() < right->range().len()) {
// The child roots differ in length: look in the subtree with the
// smaller root, as that's slightly more likely to be our best.
curr = left;
} else {
curr = right;
}
}
return best;
}
// Writes a human-readable summary of the cache to <out>: current size and
// limit, lifetime hit and overflow rates, unbacked totals, accumulated
// regret, the recent min/current/max of usage, off-peak and cache size, and
// finally the detailed usage time series.
//
// Not const: it refreshes the regret via UpdateSize() and reports the current
// usage and off-peak values to their trackers before reading them back.
void HugeCache::Print(Printer* out) {
  // %lld takes a long long; int64_t may be a different type (e.g. long), so
  // convert explicitly instead of relying on the two having the same width.
  const long long millis = absl::ToInt64Milliseconds(kCacheTime);
  out->printf(
      "HugeCache: contains unused, backed hugepage(s) "
      "(kCacheTime = %lldms)\n",
      millis);

  // a / (a + b), avoiding division by zero
  auto safe_ratio = [](double a, double b) {
    const double total = a + b;
    if (total == 0) return 0.0;
    return a / total;
  };
  const double hit_rate = safe_ratio(hits_, misses_);
  const double overflow_rate = safe_ratio(overflows_, fills_);
  out->printf(
      "HugeCache: %zu / %zu hugepages cached / cache limit "
      "(%.3f hit rate, %.3f overflow rate)\n",
      size_.raw_num(), limit().raw_num(), hit_rate, overflow_rate);
  out->printf("HugeCache: %zu MiB fast unbacked, %zu MiB periodic\n",
              total_fast_unbacked_.in_bytes() / 1024 / 1024,
              total_periodic_unbacked_.in_bytes() / 1024 / 1024);

  // Bring regret_ up to date before printing it.
  UpdateSize(size());
  out->printf(
      "HugeCache: %zu MiB*s cached since startup\n",
      NHugePages(regret_).in_mib() / static_cast<size_t>(clock_.freq()));

  // Report the current usage first so the min/max below include it.
  usage_tracker_.Report(usage_);
  const HugeLength usage_min = usage_tracker_.MinOverTime(kCacheTime);
  const HugeLength usage_max = usage_tracker_.MaxOverTime(kCacheTime);
  out->printf(
      "HugeCache: recent usage range: %zu min - %zu curr - %zu max MiB\n",
      usage_min.in_mib(), usage_.in_mib(), usage_max.in_mib());

  const HugeLength off_peak = usage_max - usage_;
  off_peak_tracker_.Report(off_peak);
  const HugeLength off_peak_min = off_peak_tracker_.MinOverTime(kCacheTime);
  const HugeLength off_peak_max = off_peak_tracker_.MaxOverTime(kCacheTime);
  out->printf(
      "HugeCache: recent offpeak range: %zu min - %zu curr - %zu max MiB\n",
      off_peak_min.in_mib(), off_peak.in_mib(), off_peak_max.in_mib());

  const HugeLength cache_min = size_tracker_.MinOverTime(kCacheTime);
  const HugeLength cache_max = size_tracker_.MaxOverTime(kCacheTime);
  out->printf(
      "HugeCache: recent cache range: %zu min - %zu curr - %zu max MiB\n",
      cache_min.in_mib(), size_.in_mib(), cache_max.in_mib());

  detailed_tracker_.Print(out);
}
// Emits the same statistics as Print(), as pbtxt fields and sub-regions of
// <hpaa>. Not const: it refreshes the regret via UpdateSize() and reports the
// current usage and off-peak values to their trackers before reading them.
void HugeCache::PrintInPbtxt(PbtxtRegion* hpaa) {
  hpaa->PrintI64("huge_cache_time_const",
                 absl::ToInt64Milliseconds(kCacheTime));

  // a / (a + b), avoiding division by zero
  auto fraction_of_total = [](double a, double b) {
    const double total = a + b;
    return total == 0 ? 0.0 : a / total;
  };

  // Writes one sub-region holding a min/current/max triple, in bytes.
  auto emit_range = [hpaa](const char* region, HugeLength lo, HugeLength cur,
                           HugeLength hi) {
    auto stats = hpaa->CreateSubRegion(region);
    stats.PrintI64("min_bytes", lo.in_bytes());
    stats.PrintI64("current_bytes", cur.in_bytes());
    stats.PrintI64("max_bytes", hi.in_bytes());
  };

  // number of bytes in HugeCache
  hpaa->PrintI64("cached_huge_page_bytes", size_.in_bytes());
  // max allowed bytes in HugeCache
  hpaa->PrintI64("max_cached_huge_page_bytes", limit().in_bytes());
  // lifetime cache hit rate
  hpaa->PrintDouble("huge_cache_hit_rate", fraction_of_total(hits_, misses_));
  // lifetime cache overflow rate
  hpaa->PrintDouble("huge_cache_overflow_rate",
                    fraction_of_total(overflows_, fills_));
  // bytes eagerly unbacked by HugeCache
  hpaa->PrintI64("fast_unbacked_bytes", total_fast_unbacked_.in_bytes());
  // bytes unbacked by periodic releaser thread
  hpaa->PrintI64("periodic_unbacked_bytes",
                 total_periodic_unbacked_.in_bytes());

  UpdateSize(size());
  // memory cached since startup (in MiB*s)
  const size_t ticks_per_second = static_cast<size_t>(clock_.freq());
  hpaa->PrintI64("huge_cache_regret",
                 NHugePages(regret_).in_mib() / ticks_per_second);

  usage_tracker_.Report(usage_);
  const HugeLength usage_lo = usage_tracker_.MinOverTime(kCacheTime);
  const HugeLength usage_hi = usage_tracker_.MaxOverTime(kCacheTime);
  emit_range("huge_cache_usage_stats", usage_lo, usage_, usage_hi);

  const HugeLength off_peak = usage_hi - usage_;
  off_peak_tracker_.Report(off_peak);
  const HugeLength off_peak_lo = off_peak_tracker_.MinOverTime(kCacheTime);
  const HugeLength off_peak_hi = off_peak_tracker_.MaxOverTime(kCacheTime);
  emit_range("huge_cache_offpeak_stats", off_peak_lo, off_peak, off_peak_hi);

  const HugeLength cache_lo = size_tracker_.MinOverTime(kCacheTime);
  const HugeLength cache_hi = size_tracker_.MaxOverTime(kCacheTime);
  emit_range("huge_cache_cache_stats", cache_lo, size_, cache_hi);

  detailed_tracker_.PrintInPbtxt(hpaa);
}
} // namespace tcmalloc_internal
} // namespace tcmalloc
GOOGLE_MALLOC_SECTION_END

View File

@ -0,0 +1,263 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Wrapping interface for HugeAllocator that handles backing and
// unbacking, including a hot cache of backed single hugepages.
#ifndef TCMALLOC_HUGE_CACHE_H_
#define TCMALLOC_HUGE_CACHE_H_
#include <stddef.h>
#include <stdint.h>
#include <algorithm>
#include <limits>
#include "absl/time/time.h"
#include "tcmalloc/common.h"
#include "tcmalloc/experiment.h"
#include "tcmalloc/experiment_config.h"
#include "tcmalloc/huge_allocator.h"
#include "tcmalloc/huge_pages.h"
#include "tcmalloc/internal/config.h"
#include "tcmalloc/internal/logging.h"
#include "tcmalloc/internal/timeseries_tracker.h"
#include "tcmalloc/stats.h"
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc {
namespace tcmalloc_internal {
// Callable wrapper around a plain function pointer of the form
// bool(void* start, size_t len). Calling the wrapper forwards to the stored
// function and returns its result, which must not be ignored.
class MemoryModifyFunction {
  using ReleaseFunction = bool (*)(void*, size_t);

 public:
  explicit MemoryModifyFunction(ReleaseFunction func) : func_{func} {}

  // Invokes the wrapped function on [start, start + len).
  ABSL_MUST_USE_RESULT bool operator()(void* start, size_t len) {
    const bool ok = func_(start, len);
    return ok;
  }

 private:
  ReleaseFunction func_;
};
// Tracks the extreme (minimum and maximum) values of a HugeLength over a
// sliding window of duration <w>, which is divided into kEpochs epochs of
// w / kEpochs each (time ranges approximate.)
template <size_t kEpochs = 16>
class MinMaxTracker {
 public:
  explicit constexpr MinMaxTracker(Clock clock, absl::Duration w)
      : kEpochLength(w / kEpochs), timeseries_(clock, w) {}

  // Records <val> as an observation.
  void Report(HugeLength val);
  // Dump the recorded history as text / as pbtxt.
  void Print(Printer* out) const;
  void PrintInPbtxt(PbtxtRegion* hpaa) const;

  // If t < kEpochLength, these functions return statistics for last epoch. The
  // granularity is kEpochLength (rounded up).
  HugeLength MaxOverTime(absl::Duration t) const;
  HugeLength MinOverTime(absl::Duration t) const;

 private:
  const absl::Duration kEpochLength;

  // Largest representable HugeLength; the starting point for a running min.
  static constexpr HugeLength kMaxVal =
      NHugePages(std::numeric_limits<size_t>::max());

  // Running minimum and maximum of the values reported in one epoch.
  struct Extrema {
    HugeLength min, max;

    // The "nothing reported yet" value: min at kMaxVal, max at zero.
    static Extrema Nil() {
      Extrema nil;
      nil.min = kMaxVal;
      nil.max = NHugePages(0);
      return nil;
    }

    // Widens [min, max] to include <n>.
    void Report(HugeLength n) {
      min = std::min(min, n);
      max = std::max(max, n);
    }

    bool empty() const { return *this == Nil(); }

    bool operator==(const Extrema& other) const;
  };

  TimeSeriesTracker<Extrema, HugeLength, kEpochs> timeseries_;
};
// Explicit instantiations are defined in huge_cache.cc.
// These are the two specializations HugeCache uses for its tracker members.
extern template class MinMaxTracker<>;
extern template class MinMaxTracker<600>;
// Out-of-class definition for the static constexpr data member kMaxVal
// declared inside MinMaxTracker.
template <size_t kEpochs>
constexpr HugeLength MinMaxTracker<kEpochs>::kMaxVal;
// Cache of backed hugepage ranges sitting in front of a HugeAllocator.
// Get() prefers cached (already backed) ranges; Release() puts ranges back
// into the cache and trims it to an adaptive limit, unbacking whatever is
// evicted through the MemoryModifyFunction supplied at construction.
class HugeCache {
public:
// For use in production
// Delegates to the clock-taking constructor with absl's CycleClock.
HugeCache(HugeAllocator* allocator, MetadataAllocFunction meta_allocate,
MemoryModifyFunction unback)
: HugeCache(allocator, meta_allocate, unback,
Clock{.now = absl::base_internal::CycleClock::Now,
.freq = absl::base_internal::CycleClock::Frequency}) {}
// For testing with mock clock.
//
// 2s (kCacheTime * 2) looks like an arbitrary window; it mostly is.
//
// Suffice to say that the below code (see MaybeGrowCacheLimit)
// tries to make sure the cache is sized to protect a working set
// that ebbs for 1 second, as a reasonable heuristic. This means it
// needs 1s of historical data to examine.
//
// Why 2s duration, then? Two reasons:
//
// - (minor) granularity of epoch boundaries make me want to err towards
// keeping a bit too much data over a bit too little.
//
// - (major) hysteresis: in ReleaseCachedPages we try to detect
// mistaken cache expansion and reverse it. I hope that using a
// longer timescale than our expansion will increase stability
// here: I will take some caches staying a bit too big over caches
// oscillating back and forth between two size estimates, so we
// require stronger evidence (longer time) to reverse an expansion
// than to make it.
//
// We also tried other algorithms, but this one is simple and suffices to
// capture the empirical dynamics we've seen. See "Beyond Malloc
// Efficiency..." (https://research.google/pubs/pub50370/) for more
// information.
//
// Note: the initializers of cache_time_ticks_ and nanoseconds_per_tick_ read
// kCacheTime and clock_, so those members must stay declared before them.
HugeCache(HugeAllocator* allocator, MetadataAllocFunction meta_allocate,
MemoryModifyFunction unback, Clock clock)
: allocator_(allocator),
cache_(meta_allocate),
clock_(clock),
cache_time_ticks_(clock_.freq() * absl::ToDoubleSeconds(kCacheTime)),
nanoseconds_per_tick_(absl::ToInt64Nanoseconds(absl::Seconds(1)) /
clock_.freq()),
last_limit_change_(clock.now()),
last_regret_update_(clock.now()),
detailed_tracker_(clock, absl::Minutes(10)),
usage_tracker_(clock, kCacheTime * 2),
off_peak_tracker_(clock, kCacheTime * 2),
size_tracker_(clock, kCacheTime * 2),
unback_(unback) {}
// Allocate a usable set of <n> contiguous hugepages. Try to give out
// memory that's currently backed from the kernel if we have it available.
// *from_released is set to false if the return range is already backed;
// otherwise, it is set to true (and the caller should back it.)
HugeRange Get(HugeLength n, bool* from_released);
// Deallocate <r> (assumed to be backed by the kernel.)
void Release(HugeRange r);
// As Release, but the range is assumed to _not_ be backed.
void ReleaseUnbacked(HugeRange r);
// Release to the system up to <n> hugepages of cache contents; returns
// the number of hugepages released.
HugeLength ReleaseCachedPages(HugeLength n);
// Backed memory available.
HugeLength size() const { return size_; }
// Total memory cached (in HugeLength * nanoseconds)
// regret_ is accumulated in hugepages * clock ticks; this converts ticks to
// nanoseconds.
uint64_t regret() const { return regret_ * nanoseconds_per_tick_; }
// Current limit for how much backed memory we'll cache.
HugeLength limit() const { return limit_; }
// Sum total of unreleased requests.
HugeLength usage() const { return usage_; }
// Adds one large span per cached range to <large> and records its age in
// <ages>; either may be null. <small> is not used.
void AddSpanStats(SmallSpanStats* small, LargeSpanStats* large,
PageAgeHistograms* ages) const;
// system_bytes counts both in-use and cached hugepages; free_bytes is the
// cached portion; unmapped_bytes is always reported as 0.
BackingStats stats() const {
BackingStats s;
s.system_bytes = (usage() + size()).in_bytes();
s.free_bytes = size().in_bytes();
s.unmapped_bytes = 0;
return s;
}
// Both printers are non-const: they update the regret and report current
// values to the trackers before reading them.
void Print(Printer* out);
void PrintInPbtxt(PbtxtRegion* hpaa);
private:
HugeAllocator* allocator_;
// We just cache-missed a request for <missed> pages;
// should we grow?
void MaybeGrowCacheLimit(HugeLength missed);
// Check if the cache seems consistently too big. Returns the
// number of pages *evicted* (not the change in limit).
HugeLength MaybeShrinkCacheLimit();
// Ensure the cache contains at most <target> hugepages,
// returning the number removed.
HugeLength ShrinkCache(HugeLength target);
// Core of Get(): serves from the cache if possible, else from allocator_.
HugeRange DoGet(HugeLength n, bool* from_released);
// Returns a cached node with at least <n> hugepages, or nullptr.
HugeAddressMap::Node* Find(HugeLength n);
HugeAddressMap cache_;
HugeLength size_{NHugePages(0)};
HugeLength limit_{NHugePages(10)};
// Despite the constant-style name this is a per-instance const member.
const absl::Duration kCacheTime = absl::Seconds(1);
size_t hits_{0};
size_t misses_{0};
size_t fills_{0};
size_t overflows_{0};
uint64_t weighted_hits_{0};
uint64_t weighted_misses_{0};
// Sum(size of Gets) - Sum(size of Releases), i.e. amount of backed
// hugepages our user currently wants to have.
void IncUsage(HugeLength n);
void DecUsage(HugeLength n);
HugeLength usage_{NHugePages(0)};
// This is CycleClock, except overridable for tests.
Clock clock_;
// kCacheTime expressed in clock_ ticks.
const int64_t cache_time_ticks_;
const double nanoseconds_per_tick_;
// clock_ reading when limit_ was last changed or last considered for
// shrinking.
int64_t last_limit_change_;
// 10 hugepages is a good baseline for our cache--easily wiped away
// by periodic release, and not that much memory on any real server.
// MaybeShrinkCacheLimit() never lowers limit_ below this value.
HugeLength MinCacheLimit() const { return NHugePages(10); }
uint64_t regret_{0}; // overflows if we cache 585 hugepages for 1 year
int64_t last_regret_update_;
// Reports <size> to size_tracker_ and accumulates regret_.
void UpdateSize(HugeLength size);
// detailed_tracker_ covers 10 minutes in 600 epochs; the other trackers
// cover kCacheTime * 2 with the default epoch count.
MinMaxTracker<600> detailed_tracker_;
MinMaxTracker<> usage_tracker_;
MinMaxTracker<> off_peak_tracker_;
MinMaxTracker<> size_tracker_;
// Hugepages unbacked from Release() vs. from ReleaseCachedPages().
HugeLength total_fast_unbacked_{NHugePages(0)};
HugeLength total_periodic_unbacked_{NHugePages(0)};
MemoryModifyFunction unback_;
};
} // namespace tcmalloc_internal
} // namespace tcmalloc
GOOGLE_MALLOC_SECTION_END
#endif // TCMALLOC_HUGE_CACHE_H_

View File

@ -0,0 +1,622 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "tcmalloc/huge_cache.h"
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <algorithm>
#include <memory>
#include <random>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "absl/base/internal/cycleclock.h"
#include "absl/memory/memory.h"
#include "absl/random/random.h"
#include "absl/strings/str_cat.h"
#include "absl/time/clock.h"
#include "absl/time/time.h"
#include "tcmalloc/huge_pages.h"
#include "tcmalloc/internal/clock.h"
#include "tcmalloc/internal/config.h"
#include "tcmalloc/internal/logging.h"
#include "tcmalloc/stats.h"
namespace tcmalloc {
namespace tcmalloc_internal {
namespace {
using testing::Return;
// Test fixture that wires a HugeCache to a fake address-space allocator, a
// calloc-backed metadata allocator, a mockable unback function, and a clock
// whose reading tests can shift forward with Advance().
class HugeCacheTest : public testing::Test {
 private:
  // Allow tests to modify the clock used by the cache.
  // Offset in nanoseconds, added (converted to ticks) to the real cycle clock.
  static int64_t clock_offset_;
  static double GetClockFrequency() {
    return absl::base_internal::CycleClock::Frequency();
  }
  static int64_t GetClock() {
    return absl::base_internal::CycleClock::Now() +
           clock_offset_ * GetClockFrequency() /
               absl::ToDoubleNanoseconds(absl::Seconds(1));
  }

  // Use a tiny fraction of actual size so we can test aggressively.
  // Hands out fake, hugepage-aligned addresses; <bytes> and <align> must both
  // be multiples of kHugePageSize.
  static AddressRange AllocateFake(size_t bytes, size_t align) {
    if (bytes % kHugePageSize != 0) {
      Crash(kCrash, __FILE__, __LINE__, "not aligned", bytes, kHugePageSize);
    }
    if (align % kHugePageSize != 0) {
      Crash(kCrash, __FILE__, __LINE__, "not aligned", align, kHugePageSize);
    }
    // we'll actually provide hidden backing, one word per hugepage.
    bytes /= kHugePageSize;
    align /= kHugePageSize;
    size_t index = backing.size();
    if (index % align != 0) {
      // Round up to the next multiple of <align>. (This used `index & align`,
      // which is not the remainder and could leave the index unaligned.)
      index += (align - (index % align));
    }
    backing.resize(index + bytes);
    void* ptr = reinterpret_cast<void*>(index * kHugePageSize);
    return {ptr, bytes * kHugePageSize};
  }
  // This isn't super good form but we'll never have more than one HAT
  // extant at once.
  static std::vector<size_t> backing;

  // We use actual malloc for metadata allocations, but we track them so they
  // can be deleted. (TODO make this an arena if we care, which I doubt)
  static void* MallocMetadata(size_t size) {
    metadata_bytes += size;
    void* ptr = calloc(size, 1);
    metadata_allocs.push_back(ptr);
    return ptr;
  }
  static std::vector<void*> metadata_allocs;
  static size_t metadata_bytes;

  // This is wordy, but necessary for mocking:
  class BackingInterface {
   public:
    virtual bool Unback(void* p, size_t len) = 0;
    virtual ~BackingInterface() {}
  };
  class MockBackingInterface : public BackingInterface {
   public:
    MOCK_METHOD(bool, Unback, (void* p, size_t len), (override));
  };
  // Plain-function trampoline so the mock can be used as the cache's
  // MemoryModifyFunction.
  static bool MockUnback(void* p, size_t len) { return mock_->Unback(p, len); }

 protected:
  static std::unique_ptr<testing::NiceMock<MockBackingInterface>> mock_;

  HugeCacheTest() {
    // We don't use the first few bytes, because things might get weird
    // given zero pointers.
    backing.resize(1024);
    metadata_bytes = 0;
    mock_ = absl::make_unique<testing::NiceMock<MockBackingInterface>>();
  }

  // Frees the tracked metadata and resets all static state for the next test.
  ~HugeCacheTest() override {
    for (void* p : metadata_allocs) {
      free(p);
    }
    metadata_allocs.clear();
    backing.clear();
    mock_.reset(nullptr);
    clock_offset_ = 0;
  }

  // Moves the cache's clock forward by <d>.
  void Advance(absl::Duration d) {
    clock_offset_ += absl::ToInt64Nanoseconds(d);
  }

  // alloc_ must be declared before cache_, which stores a pointer to it.
  HugeAllocator alloc_{AllocateFake, MallocMetadata};
  HugeCache cache_{&alloc_, MallocMetadata, MemoryModifyFunction(MockUnback),
                   Clock{.now = GetClock, .freq = GetClockFrequency}};
};
// Definitions of HugeCacheTest's static data members. They are shared by all
// tests; the fixture's constructor and destructor reset them around each test.
std::vector<size_t> HugeCacheTest::backing;
std::vector<void*> HugeCacheTest::metadata_allocs;
size_t HugeCacheTest::metadata_bytes;
std::unique_ptr<testing::NiceMock<HugeCacheTest::MockBackingInterface>>
HugeCacheTest::mock_;
int64_t HugeCacheTest::clock_offset_ = 0;
// Smoke test: fetch a single hugepage and hand it straight back, many times.
TEST_F(HugeCacheTest, Basic) {
  constexpr int kRounds = 100 * 1000;
  bool from_released;
  for (int round = 0; round != kRounds; ++round) {
    HugeRange r = cache_.Get(NHugePages(1), &from_released);
    cache_.Release(r);
  }
}
// A released 4-hugepage range should be served back from the cache, both in
// pieces (3 + 1) and, once the pieces are returned, as a whole again.
TEST_F(HugeCacheTest, Backing) {
  bool from_released;
  HugeRange whole = cache_.Get(NHugePages(4), &from_released);
  cache_.Release(whole);
  EXPECT_TRUE(from_released);

  // We should be able to split up a large range...
  HugeRange three = cache_.Get(NHugePages(3), &from_released);
  EXPECT_FALSE(from_released);
  HugeRange single = cache_.Get(NHugePages(1), &from_released);
  EXPECT_FALSE(from_released);

  // and then merge it back.
  cache_.Release(three);
  cache_.Release(single);
  whole = cache_.Get(NHugePages(4), &from_released);
  EXPECT_FALSE(from_released);
  cache_.Release(whole);
}
// Exercises ReleaseCachedPages() when unbacking succeeds: the expected ranges
// are unbacked and the reported release counts match.
TEST_F(HugeCacheTest, Release) {
  bool from;
  const HugeLength one = NHugePages(1);
  cache_.Release(cache_.Get(NHugePages(5), &from));

  // Take five single hugepages, give them all back, then take five again;
  // the second round must be served from the cache.
  HugeRange r[5];
  for (HugeRange& range : r) {
    range = cache_.Get(one, &from);
  }
  for (const HugeRange& range : r) {
    cache_.Release(range);
  }
  for (HugeRange& range : r) {
    range = cache_.Get(one, &from);
    ASSERT_EQ(false, from);
  }

  // Return the first, second and fifth page.
  cache_.Release(r[0]);
  cache_.Release(r[1]);
  cache_.Release(r[4]);
  ASSERT_EQ(NHugePages(3), cache_.size());

  EXPECT_CALL(*mock_, Unback(r[4].start_addr(), kHugePageSize * 1))
      .WillOnce(Return(true));
  EXPECT_EQ(NHugePages(1), cache_.ReleaseCachedPages(NHugePages(1)));

  // Return the remaining two pages and ask for far more than is cached.
  cache_.Release(r[2]);
  cache_.Release(r[3]);
  EXPECT_CALL(*mock_, Unback(r[0].start_addr(), 4 * kHugePageSize))
      .WillOnce(Return(true));
  EXPECT_EQ(NHugePages(4), cache_.ReleaseCachedPages(NHugePages(200)));
}
// Same sequence as the Release test, but unbacking fails each time: nothing
// is reported as released and the failed range is still there for the second
// (5-hugepage) unback attempt.
TEST_F(HugeCacheTest, ReleaseFailure) {
  bool from;
  const HugeLength one = NHugePages(1);
  cache_.Release(cache_.Get(NHugePages(5), &from));

  // Take five single hugepages, give them all back, then take five again;
  // the second round must be served from the cache.
  HugeRange r[5];
  for (HugeRange& range : r) {
    range = cache_.Get(one, &from);
  }
  for (const HugeRange& range : r) {
    cache_.Release(range);
  }
  for (HugeRange& range : r) {
    range = cache_.Get(one, &from);
    ASSERT_EQ(false, from);
  }

  // Return the first, second and fifth page.
  cache_.Release(r[0]);
  cache_.Release(r[1]);
  cache_.Release(r[4]);
  ASSERT_EQ(NHugePages(3), cache_.size());

  EXPECT_CALL(*mock_, Unback(r[4].start_addr(), 1 * kHugePageSize))
      .WillOnce(Return(false));
  EXPECT_EQ(NHugePages(0), cache_.ReleaseCachedPages(NHugePages(1)));

  // Return the remaining two pages and ask for far more than is cached.
  cache_.Release(r[2]);
  cache_.Release(r[3]);
  EXPECT_CALL(*mock_, Unback(r[0].start_addr(), 5 * kHugePageSize))
      .WillOnce(Return(false));
  EXPECT_EQ(NHugePages(0), cache_.ReleaseCachedPages(NHugePages(200)));
}
// After leaving memory in the cache for 20 (mock) seconds, regret() should be
// about cached size * elapsed nanoseconds, and never less than that.
TEST_F(HugeCacheTest, Regret) {
  bool from;
  cache_.Release(cache_.Get(NHugePages(20), &from));
  const HugeLength cached = cache_.size();

  const absl::Duration elapsed = absl::Seconds(20);
  Advance(elapsed);

  char buf[512];
  Printer out(buf, sizeof(buf));
  cache_.Print(&out);  // To update the regret

  const uint64_t expected_regret =
      absl::ToInt64Nanoseconds(elapsed) * cached.raw_num();
  // Not exactly accurate since the mock clock advances with real time, and
  // when we measure regret will be updated.
  EXPECT_NEAR(cache_.regret(), expected_regret, expected_regret / 100);
  EXPECT_GE(cache_.regret(), expected_regret);
}
// Checks AddSpanStats(): span count, backed page count and a lower bound on
// the average age as three separated ranges are released one after another.
TEST_F(HugeCacheTest, Stats) {
  bool from;
  HugeRange r = cache_.Get(NHugePages(1 + 1 + 2 + 1 + 3), &from);

  // Carve the range into r1 (1), spacer (1), r2 (2), spacer (1), r3 (3).
  HugeRange r1, r2, r3, spacer1, spacer2;
  std::tie(r1, spacer1) = Split(r, NHugePages(1));
  std::tie(spacer1, r2) = Split(spacer1, NHugePages(1));
  std::tie(r2, spacer2) = Split(r2, NHugePages(2));
  std::tie(spacer2, r3) = Split(spacer2, NHugePages(1));
  cache_.Release(r1);
  cache_.Release(r2);
  cache_.Release(r3);
  ASSERT_EQ(NHugePages(6), cache_.size());

  r1 = cache_.Get(NHugePages(1), &from);
  ASSERT_EQ(false, from);
  r2 = cache_.Get(NHugePages(2), &from);
  ASSERT_EQ(false, from);
  r3 = cache_.Get(NHugePages(3), &from);
  ASSERT_EQ(false, from);

  // Snapshots the cache's large-span stats and average page age.
  auto collect = [](const HugeCache& cache, size_t* spans,
                    Length* pages_backed, Length* pages_unbacked,
                    double* avg_age) {
    PageAgeHistograms ages(absl::base_internal::CycleClock::Now());
    LargeSpanStats large;
    cache.AddSpanStats(nullptr, &large, &ages);
    const PageAgeHistograms::Histogram* hist = ages.GetTotalHistogram(false);
    *spans = large.spans;
    *pages_backed = large.normal_pages;
    *pages_unbacked = large.returned_pages;
    *avg_age = hist->avg_age();
  };

  double avg_age;
  size_t spans;
  Length pages_backed;
  Length pages_unbacked;

  cache_.Release(r1);
  absl::SleepFor(absl::Microseconds(5000));
  collect(cache_, &spans, &pages_backed, &pages_unbacked, &avg_age);
  EXPECT_EQ(Length(0), pages_unbacked);
  EXPECT_EQ(1, spans);
  EXPECT_EQ(NHugePages(1).in_pages(), pages_backed);
  EXPECT_LE(0.005, avg_age);

  cache_.Release(r2);
  absl::SleepFor(absl::Microseconds(2500));
  collect(cache_, &spans, &pages_backed, &pages_unbacked, &avg_age);
  EXPECT_EQ(Length(0), pages_unbacked);
  EXPECT_EQ(2, spans);
  EXPECT_EQ(NHugePages(3).in_pages(), pages_backed);
  EXPECT_LE((0.0075 * 1 + 0.0025 * 2) / (1 + 2), avg_age);

  cache_.Release(r3);
  absl::SleepFor(absl::Microseconds(1250));
  collect(cache_, &spans, &pages_backed, &pages_unbacked, &avg_age);
  EXPECT_EQ(Length(0), pages_unbacked);
  EXPECT_EQ(3, spans);
  EXPECT_EQ(NHugePages(6).in_pages(), pages_backed);
  EXPECT_LE((0.00875 * 1 + 0.00375 * 2 + 0.00125 * 3) / (1 + 2 + 3), avg_age);
}
// Returns num / denom as a double, computed from the raw hugepage counts.
static double Frac(HugeLength num, HugeLength denom) {
  const double numerator = static_cast<double>(num.raw_num());
  return numerator / denom.raw_num();
}
// After a randomized fragmentation phase, checks for several working-set
// sizes that (a) the cache does not grow far beyond the working set and
// (b) after warmup, at most 30% of the requested hugepages need new backing.
TEST_F(HugeCacheTest, Growth) {
EXPECT_CALL(*mock_, Unback(testing::_, testing::_))
.WillRepeatedly(Return(true));
bool released;
absl::BitGen rng;
// fragmentation is a bit of a challenge
std::uniform_int_distribution<size_t> sizes(1, 5);
// fragment the cache badly.
std::vector<HugeRange> keep;
std::vector<HugeRange> drop;
for (int i = 0; i < 1000; ++i) {
// Coin flip decides whether this allocation is kept or released below.
auto& l = std::bernoulli_distribution()(rng) ? keep : drop;
l.push_back(cache_.Get(NHugePages(sizes(rng)), &released));
}
for (auto r : drop) {
cache_.Release(r);
}
// See the TODO in HugeCache::MaybeGrowCacheLimit; without this delay,
// the above fragmentation plays merry havoc with our instrumentation.
Advance(absl::Seconds(30));
// Test that our cache can grow to fit a working set.
HugeLength hot_set_sizes[] = {NHugePages(5), NHugePages(10), NHugePages(100),
NHugePages(10000)};
for (const HugeLength hot : hot_set_sizes) {
SCOPED_TRACE(absl::StrCat("cache size = ", hot.in_bytes() / 1024.0 / 1024.0,
" MiB"));
// Exercise the cache allocating about <hot> worth of data. After
// a brief warmup phase, we should do this without needing to back much.
// Returns {hugepages that needed backing, hugepages requested}.
auto alloc = [&]() -> std::pair<HugeLength, HugeLength> {
HugeLength got = NHugePages(0);
HugeLength needed_backing = NHugePages(0);
std::vector<HugeRange> items;
while (got < hot) {
HugeLength rest = hot - got;
HugeLength l = std::min(rest, NHugePages(sizes(rng)));
got += l;
items.push_back(cache_.Get(l, &released));
if (released) needed_backing += l;
}
for (auto r : items) {
cache_.Release(r);
}
return {needed_backing, got};
};
// warmup - we're allowed to incur misses and be too big.
for (int i = 0; i < 2; ++i) {
alloc();
}
HugeLength needed_backing = NHugePages(0);
HugeLength total = NHugePages(0);
for (int i = 0; i < 16; ++i) {
auto r = alloc();
needed_backing += r.first;
total += r.second;
// Cache shouldn't have just grown arbitrarily
const HugeLength cached = cache_.size();
// Allow us 10% slop, but don't get out of bed for tiny caches anyway.
const double ratio = Frac(cached, hot);
SCOPED_TRACE(
absl::StrCat(cached.raw_num(), "hps ", Frac(r.first, r.second)));
if (ratio > 1 && cached > NHugePages(16)) {
EXPECT_LE(ratio, 1.1);
}
}
// approximately, given the randomized sizing...
const double ratio = Frac(needed_backing, total);
EXPECT_LE(ratio, 0.3);
}
}
// If we repeatedly grow and shrink, but do so very slowly, we should *not*
// cache the large variation.
TEST_F(HugeCacheTest, SlowGrowthUncached) {
  EXPECT_CALL(*mock_, Unback(testing::_, testing::_))
      .WillRepeatedly(Return(true));

  absl::BitGen rng;
  std::uniform_int_distribution<size_t> sizes(1, 10);

  for (int cycle = 0; cycle < 20; ++cycle) {
    // Grow: one randomly sized allocation per 600ms step.
    std::vector<HugeRange> held;
    for (int step = 0; step < 20; ++step) {
      Advance(absl::Milliseconds(600));
      bool released;
      held.push_back(cache_.Get(NHugePages(sizes(rng)), &released));
    }

    // Shrink at the same pace, remembering the largest cache size observed.
    HugeLength max_cached = NHugePages(0);
    for (const HugeRange& range : held) {
      Advance(absl::Milliseconds(600));
      cache_.Release(range);
      const HugeLength now_cached = cache_.size();
      if (max_cached < now_cached) {
        max_cached = now_cached;
      }
    }
    EXPECT_GE(NHugePages(10), max_cached);
  }
}
// If very rarely we have a huge increase in usage, it shouldn't be cached.
TEST_F(HugeCacheTest, SpikesUncached) {
  EXPECT_CALL(*mock_, Unback(testing::_, testing::_))
      .WillRepeatedly(Return(true));

  absl::BitGen rng;
  std::uniform_int_distribution<size_t> sizes(1, 10);

  for (int spike = 0; spike < 20; ++spike) {
    // The spike: 2000 back-to-back allocations with no time passing.
    std::vector<HugeRange> held;
    for (int k = 0; k < 2000; ++k) {
      bool released;
      held.push_back(cache_.Get(NHugePages(sizes(rng)), &released));
    }

    // Give everything back, remembering the largest cache size observed.
    HugeLength max_cached = NHugePages(0);
    for (const HugeRange& range : held) {
      cache_.Release(range);
      const HugeLength now_cached = cache_.size();
      if (max_cached < now_cached) {
        max_cached = now_cached;
      }
    }
    EXPECT_GE(NHugePages(10), max_cached);

    // Long quiet period before the next spike.
    Advance(absl::Seconds(30));
  }
}
// If very rarely we have a huge *decrease* in usage, it *should* be cached.
TEST_F(HugeCacheTest, DipsCached) {
  absl::BitGen rng;
  std::uniform_int_distribution<size_t> sizes(1, 10);

  for (int i = 0; i < 20; ++i) {
    std::vector<HugeRange> held;
    HugeLength got = NHugePages(0);       // everything requested this round
    HugeLength uncached = NHugePages(0);  // portion reported as `released`
    for (int k = 0; k < 2000; ++k) {
      bool released;
      const HugeLength want = NHugePages(sizes(rng));
      held.push_back(cache_.Get(want, &released));
      got += want;
      if (released) uncached += want;
    }

    // Most of our time is at high usage...
    Advance(absl::Seconds(30));

    // Now immediately release and reallocate.
    for (const HugeRange& range : held) {
      cache_.Release(range);
    }

    // warmup
    if (i >= 2) {
      EXPECT_GE(0.07, Frac(uncached, got));
    }
  }
}
// Suppose in a previous era of behavior we needed a giant cache,
// but now we don't. Do we figure this out promptly?
TEST_F(HugeCacheTest, Shrink) {
  absl::BitGen rng;
  std::uniform_int_distribution<size_t> sizes(1, 10);

  // Previous era: 20 rounds of 2000 allocations, each round fully released.
  for (int era = 0; era < 20; ++era) {
    std::vector<HugeRange> held;
    for (int k = 0; k < 2000; ++k) {
      const HugeLength want = NHugePages(sizes(rng));
      bool released;
      held.push_back(cache_.Get(want, &released));
    }
    for (const HugeRange& range : held) {
      cache_.Release(range);
    }
  }

  ASSERT_LE(NHugePages(10000), cache_.size());

  for (int second = 0; second < 30; ++second) {
    // New working set <= 20 pages.
    Advance(absl::Seconds(1));

    // And do some work.
    for (int k = 0; k < 100; ++k) {
      bool released;
      HugeRange first = cache_.Get(NHugePages(sizes(rng)), &released);
      HugeRange second_range = cache_.Get(NHugePages(sizes(rng)), &released);
      cache_.Release(first);
      cache_.Release(second_range);
    }
  }

  ASSERT_GE(NHugePages(25), cache_.limit());
}
// usage() follows outstanding allocations: it rises on Get() and drops on
// both Release() and ReleaseUnbacked().
TEST_F(HugeCacheTest, Usage) {
  bool released;

  auto ten = cache_.Get(NHugePages(10), &released);
  EXPECT_EQ(NHugePages(10), cache_.usage());

  auto hundred = cache_.Get(NHugePages(100), &released);
  EXPECT_EQ(NHugePages(110), cache_.usage());

  cache_.Release(ten);
  EXPECT_EQ(NHugePages(100), cache_.usage());

  // Pretend we unbacked this.
  cache_.ReleaseUnbacked(hundred);
  EXPECT_EQ(NHugePages(0), cache_.usage());
}
class MinMaxTrackerTest : public testing::Test {
protected:
void Advance(absl::Duration d) {
clock_ += absl::ToDoubleSeconds(d) * GetFakeClockFrequency();
}
static int64_t FakeClock() { return clock_; }
static double GetFakeClockFrequency() {
return absl::ToDoubleNanoseconds(absl::Seconds(2));
}
private:
static int64_t clock_;
};
int64_t MinMaxTrackerTest::clock_{0};
// Drives a MinMaxTracker with a fake clock and pins the values returned by
// MaxOverTime / MinOverTime for several query windows, first with no time
// passing between reports and then as the fake clock advances.
TEST_F(MinMaxTrackerTest, Works) {
const absl::Duration kDuration = absl::Seconds(2);
MinMaxTracker<> tracker{
Clock{.now = FakeClock, .freq = GetFakeClockFrequency}, kDuration};
// Phase 1: all reports land at the same clock reading, so the full-window
// max follows the largest report while the min stays at the first report (0).
tracker.Report(NHugePages(0));
EXPECT_EQ(NHugePages(0), tracker.MaxOverTime(kDuration));
EXPECT_EQ(NHugePages(0), tracker.MinOverTime(kDuration));
tracker.Report(NHugePages(10));
EXPECT_EQ(NHugePages(10), tracker.MaxOverTime(kDuration));
EXPECT_EQ(NHugePages(0), tracker.MinOverTime(kDuration));
tracker.Report(NHugePages(5));
EXPECT_EQ(NHugePages(10), tracker.MaxOverTime(kDuration));
EXPECT_EQ(NHugePages(0), tracker.MinOverTime(kDuration));
tracker.Report(NHugePages(100));
EXPECT_EQ(NHugePages(100), tracker.MaxOverTime(kDuration));
EXPECT_EQ(NHugePages(0), tracker.MinOverTime(kDuration));
// Some tests for advancing time
// After a third of the window, a 1ns query sees only the newest report while
// longer windows still include the phase-1 extremes.
Advance(kDuration / 3);
tracker.Report(NHugePages(2));
EXPECT_EQ(NHugePages(2), tracker.MaxOverTime(absl::Nanoseconds(1)));
EXPECT_EQ(NHugePages(100), tracker.MaxOverTime(kDuration / 2));
EXPECT_EQ(NHugePages(100), tracker.MaxOverTime(kDuration));
EXPECT_EQ(NHugePages(2), tracker.MinOverTime(absl::Nanoseconds(1)));
EXPECT_EQ(NHugePages(0), tracker.MinOverTime(kDuration / 2));
EXPECT_EQ(NHugePages(0), tracker.MinOverTime(kDuration));
// Another third later, the half-window queries no longer reach the phase-1
// reports, but the full window still does.
Advance(kDuration / 3);
tracker.Report(NHugePages(5));
EXPECT_EQ(NHugePages(5), tracker.MaxOverTime(absl::Nanoseconds(1)));
EXPECT_EQ(NHugePages(5), tracker.MaxOverTime(kDuration / 2));
EXPECT_EQ(NHugePages(100), tracker.MaxOverTime(kDuration));
EXPECT_EQ(NHugePages(5), tracker.MinOverTime(absl::Nanoseconds(1)));
EXPECT_EQ(NHugePages(2), tracker.MinOverTime(kDuration / 2));
EXPECT_EQ(NHugePages(0), tracker.MinOverTime(kDuration));
// This should annihilate everything.
// (Twice the tracked window has elapsed, so only the new report remains.)
Advance(kDuration * 2);
tracker.Report(NHugePages(1));
EXPECT_EQ(NHugePages(1), tracker.MaxOverTime(absl::Nanoseconds(1)));
EXPECT_EQ(NHugePages(1), tracker.MinOverTime(absl::Nanoseconds(1)));
EXPECT_EQ(NHugePages(1), tracker.MaxOverTime(kDuration));
EXPECT_EQ(NHugePages(1), tracker.MinOverTime(kDuration));
}
} // namespace
} // namespace tcmalloc_internal
} // namespace tcmalloc

View File

@ -0,0 +1,847 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "tcmalloc/huge_page_aware_allocator.h"
#include <stdint.h>
#include <string.h>
#include <new>
#include "absl/base/internal/cycleclock.h"
#include "absl/base/internal/spinlock.h"
#include "absl/base/thread_annotations.h"
#include "absl/time/time.h"
#include "tcmalloc/common.h"
#include "tcmalloc/experiment.h"
#include "tcmalloc/experiment_config.h"
#include "tcmalloc/huge_allocator.h"
#include "tcmalloc/huge_page_filler.h"
#include "tcmalloc/huge_pages.h"
#include "tcmalloc/internal/environment.h"
#include "tcmalloc/internal/lifetime_predictions.h"
#include "tcmalloc/internal/logging.h"
#include "tcmalloc/internal/optimization.h"
#include "tcmalloc/internal/prefetch.h"
#include "tcmalloc/lifetime_based_allocator.h"
#include "tcmalloc/pagemap.h"
#include "tcmalloc/parameters.h"
#include "tcmalloc/span.h"
#include "tcmalloc/static_vars.h"
#include "tcmalloc/stats.h"
#include "tcmalloc/system-alloc.h"
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc {
namespace tcmalloc_internal {
bool decide_want_hpaa();
ABSL_ATTRIBUTE_WEAK int default_want_hpaa();
ABSL_ATTRIBUTE_WEAK int default_subrelease();
// Decides whether subrelease should be enabled.
//
// Order of precedence:
//   1. If decide_want_hpaa() is false, subrelease is off.
//   2. TCMALLOC_HPAA_CONTROL, keyed on its first character:
//        '1' -> off, '2' -> on,
//        '0' -> off if the weak default_want_hpaa() hook is linked in and
//               returns a negative value; otherwise a message is logged and
//               the decision falls through to step 3,
//        anything else -> Crash.
//   3. The weak default_subrelease() hook, if linked in and non-zero
//      (positive -> on, negative -> off).
//   4. Otherwise on.
bool decide_subrelease() {
  // Subrelease is off if HPAA is off.
  if (!decide_want_hpaa()) return false;

  const char* control = thread_safe_getenv("TCMALLOC_HPAA_CONTROL");
  if (control != nullptr) {
    const char mode = control[0];
    if (mode == '1') return false;
    if (mode == '2') return true;
    if (mode != '0') {
      Crash(kCrash, __FILE__, __LINE__, "bad env var", control);
      return false;
    }

    // mode == '0'
    if (default_want_hpaa != nullptr) {
      const int hpaa_default = default_want_hpaa();
      if (hpaa_default < 0) return false;
    }
    Log(kLog, __FILE__, __LINE__,
        "Runtime opt-out from HPAA requires building with "
        "//tcmalloc:want_no_hpaa."
    );
  }

  if (default_subrelease != nullptr) {
    const int hook = default_subrelease();
    if (hook > 0) return true;
    if (hook < 0) return false;
  }

  return true;
}
// Selects the filler's partial-rerelease policy from the first character of
// TCMALLOC_PARTIAL_RELEASE_CONTROL: '0' -> Return, '1' -> Retain, any other
// value -> Crash. With the variable unset the policy is Retain.
FillerPartialRerelease decide_partial_rerelease() {
  const char* control = thread_safe_getenv("TCMALLOC_PARTIAL_RELEASE_CONTROL");
  if (control != nullptr) {
    switch (control[0]) {
      case '0':
        return FillerPartialRerelease::Return;
      case '1':
        return FillerPartialRerelease::Retain;
      default:
        Crash(kCrash, __FILE__, __LINE__, "bad env var", control);
        break;
    }
  }
  return FillerPartialRerelease::Retain;
}
// Builds the lifetime-prediction options from TCMALLOC_LIFETIMES_CONTROL when
// it is set, and from LifetimePredictionOptions::Default() otherwise.
LifetimePredictionOptions decide_lifetime_predictions() {
  // See LifetimePredictionOptions::FromFlag for a description of the format.
  const char* flag = thread_safe_getenv("TCMALLOC_LIFETIMES_CONTROL");
  if (flag == nullptr) {
    return LifetimePredictionOptions::Default();
  }
  return LifetimePredictionOptions::FromFlag(flag);
}
// Returns kAbandonedCount when either "use huge regions more often"
// experiment is active, and kSlack otherwise.
HugeRegionCountOption use_huge_region_for_often() {
  const bool more_often =
      IsExperimentActive(
          Experiment::TEST_ONLY_TCMALLOC_USE_HUGE_REGIONS_MORE_OFTEN) ||
      IsExperimentActive(Experiment::TCMALLOC_USE_HUGE_REGIONS_MORE_OFTEN);
  if (more_often) {
    return HugeRegionCountOption::kAbandonedCount;
  }
  return HugeRegionCountOption::kSlack;
}
// Some notes: locking discipline here is a bit funny, because
// we want to *not* hold the pageheap lock while backing memory.
// We have here a collection of slightly different allocators each
// optimized for slightly different purposes. This file has two main purposes:
// - pick the right one for a given allocation
// - provide enough data to figure out what we picked last time!
// Convenience constructor: delegates to the three-argument constructor with
// the huge-region policy from use_huge_region_for_often() and the lifetime
// options from decide_lifetime_predictions().
HugePageAwareAllocator::HugePageAwareAllocator(MemoryTag tag)
: HugePageAwareAllocator(tag, use_huge_region_for_often(),
decide_lifetime_predictions()) {}
// Convenience constructor taking an explicit huge-region policy: delegates to
// the three-argument constructor with lifetime options from
// decide_lifetime_predictions().
HugePageAwareAllocator::HugePageAwareAllocator(
MemoryTag tag, HugeRegionCountOption use_huge_region_more_often)
: HugePageAwareAllocator(tag, use_huge_region_more_often,
decide_lifetime_predictions()) {}
// Primary constructor; the other constructors delegate here.
//
// Wires up the components used by this allocator:
//  - filler_: configured from decide_partial_rerelease() and the
//    separate_allocs_for_few_and_many_objects_spans parameter; it releases
//    memory through SystemRelease.
//  - alloc_: obtains memory via AllocAndReport<tag> and metadata via
//    MetaDataAlloc.
//  - cache_: sits on top of alloc_ and unbacks through UnbackWithoutLock.
//  - lifetime_allocator_: configured from `lifetime_options`.
// The tracker and region metadata allocators draw from tc_globals.arena().
HugePageAwareAllocator::HugePageAwareAllocator(
MemoryTag tag, HugeRegionCountOption use_huge_region_more_often,
LifetimePredictionOptions lifetime_options)
: PageAllocatorInterface("HugePageAware", tag),
filler_(decide_partial_rerelease(),
Parameters::separate_allocs_for_few_and_many_objects_spans(),
MemoryModifyFunction(SystemRelease)),
alloc_(
// Map the runtime `tag` onto the matching AllocAndReport<tag>
// instantiation; the lambda is invoked immediately with `tag`.
[](MemoryTag tag) {
// TODO(ckennelly): Remove the template parameter.
switch (tag) {
case MemoryTag::kNormal:
return AllocAndReport<MemoryTag::kNormal>;
case MemoryTag::kNormalP1:
return AllocAndReport<MemoryTag::kNormalP1>;
case MemoryTag::kSampled:
return AllocAndReport<MemoryTag::kSampled>;
case MemoryTag::kCold:
return AllocAndReport<MemoryTag::kCold>;
default:
// No other tag is handled here.
ASSUME(false);
__builtin_unreachable();
}
}(tag),
MetaDataAlloc),
cache_(HugeCache{&alloc_, MetaDataAlloc,
MemoryModifyFunction(UnbackWithoutLock)}),
lifetime_allocator_region_alloc_(this),
lifetime_allocator_(lifetime_options, &lifetime_allocator_region_alloc_),
use_huge_region_more_often_(use_huge_region_more_often) {
tracker_allocator_.Init(&tc_globals.arena());
region_allocator_.Init(&tc_globals.arena());
}
// Looks up the filler tracker registered in the pagemap for hugepage `p`.
// Returns nullptr when no tracker is registered.
HugePageAwareAllocator::FillerType::Tracker* HugePageAwareAllocator::GetTracker(
    HugePage p) {
  auto* tracker = reinterpret_cast<FillerType::Tracker*>(
      tc_globals.pagemap().GetHugepage(p.first_page()));
  // A registered tracker must describe exactly this hugepage.
  ASSERT(tracker == nullptr || tracker->location() == p);
  return tracker;
}
// Registers `pt` in the pagemap as the tracker for hugepage `p`; passing
// nullptr clears the entry.
void HugePageAwareAllocator::SetTracker(
    HugePage p, HugePageAwareAllocator::FillerType::Tracker* pt) {
  const PageId key = p.first_page();
  tc_globals.pagemap().SetHugepage(key, pt);
}
// Creates a tracker for hugepage `p`, carves the first `n` pages out of it,
// registers the tracker in the pagemap and hands it to the filler.
// Returns the first page of the carved range, which is p's first page.
PageId HugePageAwareAllocator::AllocAndContribute(HugePage p, Length n,
                                                  size_t num_objects,
                                                  bool donated) {
  CHECK_CONDITION(p.start_addr() != nullptr);

  // Construct the tracker in storage obtained from the tracker allocator.
  FillerType::Tracker* tracker = tracker_allocator_.New();
  new (tracker)
      FillerType::Tracker(p, absl::base_internal::CycleClock::Now(), donated);
  ASSERT(tracker->longest_free_range() >= n);
  ASSERT(tracker->was_donated() == donated);

  // if the page was donated, we track its size so that we can potentially
  // measure it in abandoned_count_ once this large allocation gets deallocated.
  if (tracker->was_donated()) {
    tracker->set_abandoned_count(n);
  }

  const PageId first = tracker->Get(n).page;
  ASSERT(first == p.first_page());

  SetTracker(p, tracker);
  filler_.Contribute(tracker, donated, num_objects);
  ASSERT(tracker->was_donated() == donated);
  return first;
}
// Fetches one hugepage from the cache, contributes it to the filler (not as a
// donation) and allocates the first `n` pages from it. Returns PageId{0} when
// the cache cannot supply a hugepage.
PageId HugePageAwareAllocator::RefillFiller(Length n, size_t num_objects,
                                            bool* from_released) {
  HugeRange fresh = cache_.Get(NHugePages(1), from_released);
  if (!fresh.valid()) {
    return PageId{0};
  }

  // This is duplicate to Finalize, but if we need to break up
  // hugepages to get to our usage limit it would be very bad to break
  // up what's left of the new hugepage after we allocate from there--while it
  // is mostly empty, clearly what's left in the filler is too fragmented
  // to be very useful, and we would rather release those
  // pages. Otherwise, we're nearly guaranteed to release the new hugepage (if
  // n isn't very large), and the next allocation will just repeat this
  // process.
  tc_globals.page_allocator().ShrinkToUsageLimit(n);

  const HugePage where = fresh.start();
  return AllocAndContribute(where, n, num_objects, /*donated=*/false);
}
// Common tail of every successful allocation: creates the Span for
// [page, page + n), records it in the pagemap and the usage statistics, then
// lets the page allocator shrink toward its usage limit.
Span* HugePageAwareAllocator::Finalize(Length n, size_t num_objects,
                                       PageId page)
    ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
  ASSERT(page != PageId{0});

  Span* const span = Span::New(page, n);
  tc_globals.pagemap().Set(page, span);
  ASSERT(!span->sampled());

  info_.RecordAlloc(page, n, num_objects);
  tc_globals.page_allocator().ShrinkToUsageLimit(n);
  return span;
}
// For anything <= half a huge page, we will unconditionally use the filler
// to pack it into a single page. If we need another page, that's fine.
//
// Returns nullptr only when the filler has no room and a fresh hugepage cannot
// be obtained either.
Span* HugePageAwareAllocator::AllocSmall(Length n, size_t objects_per_span,
                                         bool* from_released) {
  auto [tracker, page] = filler_.TryGet(n, objects_per_span);
  if (ABSL_PREDICT_FALSE(tracker == nullptr)) {
    // Filler miss: pull in a new hugepage and allocate from that.
    page = RefillFiller(n, objects_per_span, from_released);
    if (ABSL_PREDICT_FALSE(page == PageId{0})) {
      return nullptr;
    }
  } else {
    *from_released = false;
  }
  return Finalize(n, objects_per_span, page);
}
// Allocates `n` pages for requests that LockAndAlloc routes here (more than
// half a hugepage, at most one HugeRegion). Sources are tried in this order:
//   1. exact hugepage multiples go straight to AllocRawHugepages;
//   2. the filler, when the request fits in a single hugepage;
//   3. the lifetime-based allocator;
//   4. existing regions;
//   5. raw hugepages, unless the slack heuristics below favor adding a new
//      region first.
// `lifetime_context` is forwarded to the lifetime-based allocator.
Span* HugePageAwareAllocator::AllocLarge(Length n, size_t objects_per_span,
bool* from_released,
LifetimeStats* lifetime_context) {
// If it's an exact page multiple, just pull it from pages directly.
HugeLength hl = HLFromPages(n);
if (hl.in_pages() == n) {
return AllocRawHugepages(n, objects_per_span, from_released);
}
PageId page;
// If we fit in a single hugepage, try the Filler first.
if (n < kPagesPerHugePage) {
// NOTE: this structured binding declares a new `page` that shadows the
// outer one for the duration of this scope.
auto [pt, page] = filler_.TryGet(n, objects_per_span);
if (ABSL_PREDICT_TRUE(pt != nullptr)) {
*from_released = false;
return Finalize(n, objects_per_span, page);
}
}
// Try to perform a lifetime-based allocation.
LifetimeBasedAllocator::AllocationResult lifetime =
lifetime_allocator_.MaybeGet(n, from_released, lifetime_context);
// TODO(mmaas): Implement tracking if this is subsequently put into a
// conventional region (currently ignored).
// Was an object allocated in the lifetime region? If so, we return it.
if (lifetime.TryGetAllocation(&page)) {
return Finalize(n, objects_per_span, page);
}
// If we're using regions in this binary (see below comment), is
// there currently available space there?
if (regions_.MaybeGet(n, &page, from_released)) {
return Finalize(n, objects_per_span, page);
}
// We have two choices here: allocate a new region or go to
// hugepages directly (hoping that slack will be filled by small
// allocation.) The second strategy is preferable, as it's
// typically faster and usually more space efficient, but it's sometimes
// catastrophic.
//
// See https://github.com/google/tcmalloc/tree/master/docs/regions-are-not-optional.md
//
// So test directly if we're in the bad case--almost no binaries are.
// If not, just fall back to direct allocation (and hope we do hit that case!)
const Length slack = info_.slack();
// With the "more often" policy, abandoned pages count toward the threshold
// in addition to slack.
const Length donated =
UseHugeRegionMoreOften() ? abandoned_pages_ + slack : slack;
// Don't bother at all until the binary is reasonably sized.
if (donated < HLFromBytes(64 * 1024 * 1024).in_pages()) {
return AllocRawHugepagesAndMaybeTrackLifetime(n, objects_per_span, lifetime,
from_released);
}
// In the vast majority of binaries, we have many small allocations which
// will nicely fill slack. (Fleetwide, the average ratio is 15:1; only
// a handful of binaries fall below 1:1.)
//
// If we enable an experiment that tries to use huge regions more frequently,
// we skip the check.
const Length small = info_.small();
if (slack < small && !UseHugeRegionMoreOften()) {
return AllocRawHugepagesAndMaybeTrackLifetime(n, objects_per_span, lifetime,
from_released);
}
// We couldn't allocate a new region. They're oversized, so maybe we'd get
// lucky with a smaller request?
if (!AddRegion()) {
return AllocRawHugepagesAndMaybeTrackLifetime(n, objects_per_span, lifetime,
from_released);
}
// A region was just added, so the request must now be satisfiable from it.
CHECK_CONDITION(regions_.MaybeGet(n, &page, from_released));
return Finalize(n, objects_per_span, page);
}
// Allocation path for requests larger than a HugeRegion (see LockAndAlloc):
// forwards directly to AllocRawHugepages.
Span* HugePageAwareAllocator::AllocEnormous(Length n, size_t objects_per_span,
bool* from_released) {
return AllocRawHugepages(n, objects_per_span, from_released);
}
// Allocates `n` pages as a run of whole hugepages taken from the cache.
// When `n` is not a hugepage multiple, the unused tail of the last hugepage is
// donated to the filler and the returned span is marked as donated.
// Returns nullptr if the cache cannot supply the range.
Span* HugePageAwareAllocator::AllocRawHugepages(Length n, size_t num_objects,
                                                bool* from_released) {
  HugeLength hugepages = HLFromPages(n);
  HugeRange range = cache_.Get(hugepages, from_released);
  if (!range.valid()) {
    return nullptr;
  }

  // We now have a huge page range that covers our request. There
  // might be some slack in it if n isn't a multiple of
  // kPagesPerHugePage. Add the hugepage with slack to the filler,
  // pretending the non-slack portion is a smaller allocation.
  const Length total = hugepages.in_pages();
  const Length slack = total - n;

  HugePage first = range.start();
  SetTracker(first, nullptr);
  HugePage last = first + range.len() - NHugePages(1);

  if (slack == Length(0)) {
    // Exact fit: nothing to donate.
    SetTracker(last, nullptr);
    return Finalize(total, num_objects, range.start().first_page());
  }

  ++donated_huge_pages_;

  // Portion of the last hugepage that belongs to this allocation.
  const Length here = kPagesPerHugePage - slack;
  ASSERT(here > Length(0));
  AllocAndContribute(last, here, num_objects, /*donated=*/true);

  Span* donating = Finalize(n, num_objects, range.start().first_page());
  donating->set_donated(/*value=*/true);
  return donating;
}
// Same as AllocRawHugepages, but on success additionally offers the tracker of
// the span's last hugepage to the lifetime allocator while that hugepage is
// still donated.
Span* HugePageAwareAllocator::AllocRawHugepagesAndMaybeTrackLifetime(
    Length n, size_t num_objects,
    const LifetimeBasedAllocator::AllocationResult& lifetime_alloc,
    bool* from_released) ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
  Span* span = AllocRawHugepages(n, num_objects, from_released);
  if (span == nullptr) {
    return span;
  }

  // If this is an object with a lifetime prediction and led to a donation,
  // add it to the tracker so that we can track its lifetime.
  FillerType::Tracker* tracker =
      GetTracker(HugePageContaining(span->last_page()));
  ASSERT(tracker != nullptr);

  // The allocator may shrink the heap in response to allocations, which may
  // cause the page to be subreleased and not donated anymore once we get
  // here. If it still is, we attach a lifetime tracker (if enabled).
  if (ABSL_PREDICT_TRUE(tracker->donated())) {
    lifetime_allocator_.MaybeAddTracker(lifetime_alloc,
                                        tracker->lifetime_tracker());
  }
  return span;
}
// Asks the system to back the whole address range covered by `span`.
static void BackSpan(Span* span) {
  const auto start = span->start_address();
  const auto length = span->bytes_in_span();
  SystemBack(start, length);
}
// public
//
// Allocates a span of `n` pages (n must be positive). Returns nullptr on
// failure. Memory reported as coming from released pages is re-backed before
// the span is returned.
Span* HugePageAwareAllocator::New(Length n, size_t objects_per_span) {
  CHECK_CONDITION(n > Length(0));

  bool from_released;
  Span* span = LockAndAlloc(n, objects_per_span, &from_released);
  if (span == nullptr) {
    return nullptr;
  }

  // Prefetch for writing, as we anticipate using the memory soon.
  PrefetchW(span->start_address());
  // TODO(b/256233439): Improve accuracy of from_released value. The filler
  // may have subreleased pages and is returning them now.
  if (from_released) {
    BackSpan(span);
  }

  ASSERT(GetMemoryTag(span->start_address()) == tag_);
  return span;
}
// Takes pageheap_lock and dispatches the request by size:
//   n <= half a hugepage  -> AllocSmall
//   n <= one HugeRegion   -> AllocLarge
//   otherwise             -> AllocEnormous
Span* HugePageAwareAllocator::LockAndAlloc(Length n, size_t objects_per_span,
                                           bool* from_released) {
  // Check whether we may perform lifetime-based allocation, and if so, collect
  // the allocation context without holding the lock.
  LifetimeStats* lifetime_ctx = lifetime_allocator_.CollectLifetimeContext(n);

  absl::base_internal::SpinLockHolder h(&pageheap_lock);

  // Our policy depends on size. For small things, we will pack them
  // into single hugepages.
  const bool fits_filler = n <= kPagesPerHugePage / 2;
  if (fits_filler) {
    return AllocSmall(n, objects_per_span, from_released);
  }

  // In the worst case, we just fall back to directly allocating a run
  // of hugepages.
  const bool fits_region = n <= HugeRegion::size().in_pages();
  if (!fits_region) {
    return AllocEnormous(n, objects_per_span, from_released);
  }

  // For anything too big for the filler, we use either a direct hugepage
  // allocation, or possibly the regions if we are worried about slack.
  return AllocLarge(n, objects_per_span, from_released, lifetime_ctx);
}
// public
//
// Allocates `n` pages aligned to `align` pages. Alignments of at most one page
// use the regular New() path; larger alignments (up to kPagesPerHugePage) are
// served from raw hugepages.
Span* HugePageAwareAllocator::NewAligned(Length n, Length align,
                                         size_t objects_per_span) {
  if (align <= Length(1)) {
    return New(n, objects_per_span);
  }

  // we can do better than this, but...
  // TODO(b/134690769): support higher align.
  CHECK_CONDITION(align <= kPagesPerHugePage);

  bool from_released;
  Span* span = nullptr;
  {
    // Hold the lock only for the allocation itself; backing happens below.
    absl::base_internal::SpinLockHolder h(&pageheap_lock);
    span = AllocRawHugepages(n, objects_per_span, &from_released);
  }

  if (span != nullptr && from_released) {
    BackSpan(span);
  }

  ASSERT(span == nullptr || GetMemoryTag(span->start_address()) == tag_);
  return span;
}
// Returns pages [p, p + n) to the filler-managed hugepage tracked by `pt`.
// If the hugepage remains in use, only the abandonment accounting is updated
// (when `might_abandon`). Otherwise the donation counters are unwound and the
// hugepage is released.
void HugePageAwareAllocator::DeleteFromHugepage(FillerType::Tracker* pt,
                                                PageId p, Length n,
                                                size_t num_objects,
                                                bool might_abandon) {
  // Put() returning nullptr means the filler keeps the hugepage.
  const bool hugepage_kept = filler_.Put(pt, p, n, num_objects) == nullptr;
  if (ABSL_PREDICT_TRUE(hugepage_kept)) {
    // If this allocation had resulted in a donation to the filler, we record
    // these pages as abandoned.
    if (ABSL_PREDICT_FALSE(might_abandon)) {
      ASSERT(pt->was_donated());
      abandoned_pages_ += pt->abandoned_count();
      pt->set_abandoned(true);
    }
    return;
  }

  // The filler handed the hugepage back: undo donation bookkeeping.
  if (!pt->was_donated()) {
    ASSERT(pt->abandoned_count() == Length(0));
  } else {
    --donated_huge_pages_;
    if (pt->abandoned()) {
      abandoned_pages_ -= pt->abandoned_count();
      pt->set_abandoned(false);
    }
  }

  lifetime_allocator_.MaybePutTracker(pt->lifetime_tracker(), n);
  ReleaseHugepage(pt);
}
bool HugePageAwareAllocator::AddRegion() {
HugeRange r = alloc_.Get(HugeRegion::size());
if (!r.valid()) return false;
HugeRegion* region = region_allocator_.New();
new (region) HugeRegion(r, MemoryModifyFunction(SystemRelease));
regions_.Contribute(region);
return true;
}
// Returns `span` to the component it was allocated from and destroys the Span
// object. The origin is determined in this order:
//   a) a filler tracker is registered for the span's first hugepage;
//   b) a region (regions_, then the lifetime allocator) claims the pages;
//   c) otherwise the span came straight from the HugeCache, possibly with its
//      tail slack donated to the filler.
//
// `objects_per_span` is forwarded to the usage statistics and to the filler.
void HugePageAwareAllocator::Delete(Span* span, size_t objects_per_span) {
  ASSERT(!span || GetMemoryTag(span->start_address()) == tag_);
  PageId p = span->first_page();
  HugePage hp = HugePageContaining(p);
  Length n = span->num_pages();
  info_.RecordFree(p, n, objects_per_span);
  // Read the donation flag before the Span object is destroyed.
  bool might_abandon = span->donated();
  Span::Delete(span);
  // Clear the descriptor of the page so a second pass through the same page
  // could trigger the check on `span != nullptr` in do_free_pages.
  tc_globals.pagemap().Set(p, nullptr);

  // The tricky part, as with so many allocators: where did we come from?
  // There are several possibilities.
  FillerType::Tracker* pt = GetTracker(hp);
  // a) We got packed by the filler onto a single hugepage - return our
  //    allocation to that hugepage in the filler.
  if (ABSL_PREDICT_TRUE(pt != nullptr)) {
    ASSERT(hp == HugePageContaining(p + n - Length(1)));
    DeleteFromHugepage(pt, p, n, objects_per_span, might_abandon);
    return;
  }

  // b) We got put into a region, possibly crossing hugepages -
  //    return our allocation to the region.
  if (regions_.MaybePut(p, n)) return;
  if (lifetime_allocator_.MaybePut(p, n)) return;

  // c) we came straight from the HugeCache - return straight there.  (We
  //    might have had slack put into the filler - if so, return that virtual
  //    allocation to the filler too!)
  ASSERT(n >= kPagesPerHugePage);
  HugeLength hl = HLFromPages(n);
  HugePage last = hp + hl - NHugePages(1);
  Length slack = hl.in_pages() - n;
  if (slack == Length(0)) {
    ASSERT(GetTracker(last) == nullptr);
  } else {
    pt = GetTracker(last);
    // Validate the tracker BEFORE its first dereference, so a missing tracker
    // fails the check instead of dereferencing a null pointer.
    CHECK_CONDITION(pt != nullptr);
    lifetime_allocator_.MaybePutTracker(pt->lifetime_tracker(), n);
    ASSERT(pt->was_donated());
    // We put the slack into the filler (see AllocRawHugepages.)
    // Handle this page separately as a virtual allocation
    // onto the last hugepage.
    PageId virt = last.first_page();
    Length virt_len = kPagesPerHugePage - slack;
    // We may have used the slack, which would prevent us from returning
    // the entire range now.  If filler returned a Tracker, we are fully empty.
    if (filler_.Put(pt, virt, virt_len, objects_per_span) == nullptr) {
      // Last page isn't empty -- pretend the range was shorter.
      --hl;

      // Note that we abandoned virt_len pages with pt.  These can be reused for
      // other allocations, but this can contribute to excessive slack in the
      // filler.
      abandoned_pages_ += pt->abandoned_count();
      pt->set_abandoned(true);
    } else {
      // Last page was empty - but if we sub-released it, we still
      // have to split it off and release it independently.
      //
      // We were able to reclaim the donated slack.
      --donated_huge_pages_;
      ASSERT(!pt->abandoned());

      if (pt->released()) {
        --hl;
        ReleaseHugepage(pt);
      } else {
        // Get rid of the tracker *object*, but not the *hugepage* (which is
        // still part of our range.)
        SetTracker(pt->location(), nullptr);
        ASSERT(!pt->lifetime_tracker()->is_tracked());
        tracker_allocator_.Delete(pt);
      }
    }
  }
  // Whatever is left of the range (possibly shortened above) goes back to the
  // cache.
  cache_.Release({hp, hl});
}
// Hands the (now empty) hugepage tracked by `pt` back to the cache, as
// unbacked if the tracker reports it released, and frees the tracker object.
void HugePageAwareAllocator::ReleaseHugepage(FillerType::Tracker* pt) {
  ASSERT(pt->used_pages() == Length(0));

  HugeRange whole = {pt->location(), NHugePages(1)};
  // Drop the pagemap entry before the hugepage leaves our hands.
  SetTracker(pt->location(), nullptr);

  const bool unbacked = pt->released();
  if (unbacked) {
    cache_.ReleaseUnbacked(whole);
  } else {
    cache_.Release(whole);
  }

  ASSERT(!pt->lifetime_tracker()->is_tracked());
  tracker_allocator_.Delete(pt);
}
// public
//
// Aggregates the backing stats of all components (alloc, cache, filler,
// regions and, when present, the lifetime region).
BackingStats HugePageAwareAllocator::stats() const {
  BackingStats total = alloc_.stats();
  // Remember the true system byte count before the components are summed in.
  const auto system_from_alloc = total.system_bytes;

  total += cache_.stats();
  total += filler_.stats();
  total += regions_.stats();
  total += lifetime_allocator_.GetRegionStats().value_or(BackingStats());

  // the "system" (total managed) byte count is wildly double counted,
  // since it all comes from HugeAllocator but is then managed by
  // cache/regions/filler. Adjust for that.
  total.system_bytes = system_from_alloc;
  return total;
}
// public
// Fills `result` with small-span statistics by calling GetSpanStats with only
// the small-span output set.
void HugePageAwareAllocator::GetSmallSpanStats(SmallSpanStats* result) {
GetSpanStats(result, nullptr, nullptr);
}
// public
// Fills `result` with large-span statistics by calling GetSpanStats with only
// the large-span output set.
void HugePageAwareAllocator::GetLargeSpanStats(LargeSpanStats* result) {
GetSpanStats(nullptr, result, nullptr);
}
// Resets whichever of `small` / `large` is non-null, then lets the alloc,
// filler, region and cache components add their span statistics. All three
// outputs are passed through to the components as given.
void HugePageAwareAllocator::GetSpanStats(SmallSpanStats* small,
                                          LargeSpanStats* large,
                                          PageAgeHistograms* ages) {
  if (small) *small = SmallSpanStats();
  if (large) *large = LargeSpanStats();

  alloc_.AddSpanStats(small, large, ages);
  filler_.AddSpanStats(small, large, ages);
  regions_.AddSpanStats(small, large, ages);
  cache_.AddSpanStats(small, large, ages);
}
// public
//
// Releases cached hugepages covering `num_pages`; if that falls short and the
// hpaa_subrelease parameter is on, asks the filler to release the remainder.
// Returns the number of pages actually released.
Length HugePageAwareAllocator::ReleaseAtLeastNPages(Length num_pages) {
  Length released;
  released += cache_.ReleaseCachedPages(HLFromPages(num_pages)).in_pages();

  // This is our long term plan but in current state will lead to insufficient
  // THP coverage. It is however very useful to have the ability to turn this on
  // for testing.
  // TODO(b/134690769): make this work, remove the flag guard.
  const bool ask_filler =
      Parameters::hpaa_subrelease() && released < num_pages;
  if (ask_filler) {
    const Length shortfall = num_pages - released;
    released += filler_.ReleasePages(
        shortfall,
        SkipSubreleaseIntervals{
            .peak_interval = Parameters::filler_skip_subrelease_interval(),
            .short_interval =
                Parameters::filler_skip_subrelease_short_interval(),
            .long_interval =
                Parameters::filler_skip_subrelease_long_interval()},
        /*hit_limit*/ false);
  }

  // TODO(b/134690769):
  // - perhaps release region?
  // - refuse to release if we're too close to zero?
  info_.RecordRelease(num_pages, released);
  return released;
}
// Converts a byte count to mebibytes (1 MiB = 1024 * 1024 bytes).
static double BytesToMiB(size_t bytes) {
  constexpr double kBytesPerMiB = 1024.0 * 1024.0;
  return static_cast<double>(bytes) / kBytesPerMiB;
}
// Prints one "used / free / unmapped" line, in MiB, for a component's backing
// stats, prefixed with `label`. "Used" is system minus free minus unmapped.
static void BreakdownStats(Printer* out, const BackingStats& s,
                           const char* label) {
  const auto used_bytes = s.system_bytes - s.free_bytes - s.unmapped_bytes;
  out->printf("%s %6.1f MiB used, %6.1f MiB free, %6.1f MiB unmapped\n", label,
              BytesToMiB(used_bytes), BytesToMiB(s.free_bytes),
              BytesToMiB(s.unmapped_bytes));
}
// Emits a sub-region named `key` under `hpaa` holding the used / free /
// unmapped byte counts of a component's backing stats.
static void BreakdownStatsInPbtxt(PbtxtRegion* hpaa, const BackingStats& s,
                                  const char* key) {
  auto section = hpaa->CreateSubRegion(key);
  const auto used_bytes = s.system_bytes - s.free_bytes - s.unmapped_bytes;
  section.PrintI64("used", used_bytes);
  section.PrintI64("free", s.free_bytes);
  section.PrintI64("unmapped", s.unmapped_bytes);
}
// public
// Prints the full report: forwards to Print(out, everything) with
// everything = true.
void HugePageAwareAllocator::Print(Printer* out) { Print(out, true); }
// Writes a human-readable report to `out` while holding pageheap_lock:
// overall stats, a per-component used/free/unmapped breakdown, donation
// counters and the filler's own report. When `everything` is true the region,
// cache, lifetime-allocator, alloc, usage-statistics and page-age sections are
// printed as well. The hpaa_subrelease parameter is always printed last.
void HugePageAwareAllocator::Print(Printer* out, bool everything) {
SmallSpanStats small;
LargeSpanStats large;
BackingStats bstats;
// The age histograms are constructed (and the clock read) before the lock is
// taken.
PageAgeHistograms ages(absl::base_internal::CycleClock::Now());
absl::base_internal::SpinLockHolder h(&pageheap_lock);
bstats = stats();
GetSpanStats(&small, &large, &ages);
PrintStats("HugePageAware", out, bstats, small, large, everything);
out->printf(
"\nHuge page aware allocator components:\n"
"------------------------------------------------\n");
out->printf("HugePageAware: breakdown of used / free / unmapped space:\n");
auto fstats = filler_.stats();
BreakdownStats(out, fstats, "HugePageAware: filler ");
auto rstats = regions_.stats();
BreakdownStats(out, rstats, "HugePageAware: region ");
// Report short-lived region allocations when enabled.
auto lstats = lifetime_allocator_.GetRegionStats();
if (lstats.has_value()) {
BreakdownStats(out, lstats.value(), "HugePageAware: lifetime");
}
auto cstats = cache_.stats();
// Everything in the filler came from the cache -
// adjust the totals so we see the amount used by the mutator.
cstats.system_bytes -= fstats.system_bytes;
BreakdownStats(out, cstats, "HugePageAware: cache ");
auto astats = alloc_.stats();
// Everything in *all* components came from here -
// so again adjust the totals.
astats.system_bytes -=
(fstats + rstats + lstats.value_or(BackingStats()) + cstats).system_bytes;
BreakdownStats(out, astats, "HugePageAware: alloc ");
out->printf("\n");
out->printf(
"HugePageAware: filler donations %zu (%zu pages from abandoned "
"donations)\n",
donated_huge_pages_.raw_num(), abandoned_pages_.raw_num());
// Component debug output
// Filler is by far the most important; print (some) of it
// unconditionally.
filler_.Print(out, everything);
out->printf("\n");
if (everything) {
regions_.Print(out);
out->printf("\n");
cache_.Print(out);
lifetime_allocator_.Print(out);
out->printf("\n");
alloc_.Print(out);
out->printf("\n");
// Use statistics
info_.Print(out);
// and age tracking.
ages.Print("HugePageAware", out);
}
out->printf("PARAMETER hpaa_subrelease %d\n",
Parameters::hpaa_subrelease() ? 1 : 0);
}
// Writes the allocator's statistics into `region` in pbtxt form while holding
// pageheap_lock: the span stats, followed by a "huge_page_allocator"
// sub-region with per-component usage breakdowns, each component's own output,
// usage statistics and the donation counters.
void HugePageAwareAllocator::PrintInPbtxt(PbtxtRegion* region) {
SmallSpanStats small;
LargeSpanStats large;
PageAgeHistograms ages(absl::base_internal::CycleClock::Now());
absl::base_internal::SpinLockHolder h(&pageheap_lock);
GetSpanStats(&small, &large, &ages);
PrintStatsInPbtxt(region, small, large, ages);
{
auto hpaa = region->CreateSubRegion("huge_page_allocator");
hpaa.PrintBool("using_hpaa", true);
hpaa.PrintBool("using_hpaa_subrelease", Parameters::hpaa_subrelease());
// Fill HPAA Usage
auto fstats = filler_.stats();
BreakdownStatsInPbtxt(&hpaa, fstats, "filler_usage");
auto rstats = regions_.stats();
BreakdownStatsInPbtxt(&hpaa, rstats, "region_usage");
auto cstats = cache_.stats();
// Everything in the filler came from the cache -
// adjust the totals so we see the amount used by the mutator.
cstats.system_bytes -= fstats.system_bytes;
BreakdownStatsInPbtxt(&hpaa, cstats, "cache_usage");
auto astats = alloc_.stats();
// Everything in *all* components came from here -
// so again adjust the totals.
astats.system_bytes -= (fstats + rstats + cstats).system_bytes;
// The lifetime region, when present, is subtracted from the alloc total too
// and reported as its own usage entry.
auto lstats = lifetime_allocator_.GetRegionStats();
if (lstats.has_value()) {
astats.system_bytes -= lstats.value().system_bytes;
BreakdownStatsInPbtxt(&hpaa, lstats.value(), "lifetime_region_usage");
}
BreakdownStatsInPbtxt(&hpaa, astats, "alloc_usage");
filler_.PrintInPbtxt(&hpaa);
regions_.PrintInPbtxt(&hpaa);
cache_.PrintInPbtxt(&hpaa);
alloc_.PrintInPbtxt(&hpaa);
lifetime_allocator_.PrintInPbtxt(&hpaa);
// Use statistics
info_.PrintInPbtxt(&hpaa, "hpaa_stat");
hpaa.PrintI64("filler_donated_huge_pages", donated_huge_pages_.raw_num());
hpaa.PrintI64("filler_abandoned_pages", abandoned_pages_.raw_num());
}
}
// Requests `bytes` of memory with alignment `align` and tag `tag` from
// SystemAlloc. On success, calls pagemap().Ensure() for the pages of the
// returned range; a failed allocation (null ptr) is returned to the caller
// untouched.
template <MemoryTag tag>
AddressRange HugePageAwareAllocator::AllocAndReport(size_t bytes,
                                                    size_t align) {
  auto range = SystemAlloc(bytes, align, tag);
  if (range.ptr != nullptr) {
    const PageId first_page = PageIdContaining(range.ptr);
    const Length num_pages = BytesToLengthFloor(range.bytes);
    tc_globals.pagemap().Ensure(first_page, num_pages);
  }
  return range;
}
// Allocates `bytes` of metadata storage from the global arena
// (tc_globals.arena()). Requires pageheap_lock to be held.
void* HugePageAwareAllocator::MetaDataAlloc(size_t bytes)
    ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
  void* const block = tc_globals.arena().Alloc(bytes);
  return block;
}
// Last-resort release path: memory is needed badly enough that hugepage usage
// may be compromised, so the pages are released from the filler. Returns
// whatever filler_.ReleasePages() reports.
Length HugePageAwareAllocator::ReleaseAtLeastNPagesBreakingHugepages(Length n) {
  // Passed as the filler's `hit_limit` argument.
  constexpr bool kHitLimit = true;
  return filler_.ReleasePages(n, SkipSubreleaseIntervals{}, kHitLimit);
}
// Runs SystemRelease(start, length) with pageheap_lock dropped for the
// duration of the call. The lock is re-acquired before returning, and the
// result of SystemRelease is passed through unchanged.
bool HugePageAwareAllocator::UnbackWithoutLock(void* start, size_t length) {
  pageheap_lock.Unlock();
  const bool system_release_result = SystemRelease(start, length);
  pageheap_lock.Lock();
  return system_release_result;
}
} // namespace tcmalloc_internal
} // namespace tcmalloc
GOOGLE_MALLOC_SECTION_END

View File

@ -0,0 +1,266 @@
// Copyright 2019 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef TCMALLOC_HUGE_PAGE_AWARE_ALLOCATOR_H_
#define TCMALLOC_HUGE_PAGE_AWARE_ALLOCATOR_H_
#include <stddef.h>
#include "absl/base/thread_annotations.h"
#include "tcmalloc/arena.h"
#include "tcmalloc/common.h"
#include "tcmalloc/huge_allocator.h"
#include "tcmalloc/huge_cache.h"
#include "tcmalloc/huge_pages.h"
#include "tcmalloc/huge_region.h"
#include "tcmalloc/internal/config.h"
#include "tcmalloc/internal/logging.h"
#include "tcmalloc/lifetime_based_allocator.h"
#include "tcmalloc/page_allocator_interface.h"
#include "tcmalloc/page_heap_allocator.h"
#include "tcmalloc/span.h"
#include "tcmalloc/stats.h"
#include "tcmalloc/system-alloc.h"
GOOGLE_MALLOC_SECTION_BEGIN
namespace tcmalloc {
namespace tcmalloc_internal {
// Declared here; the definition is not part of this header.
// NOTE(review): presumably reports whether hugepage subrelease should be
// enabled -- confirm against the definition.
bool decide_subrelease();
// Selects the signal used to decide when large allocations are served from
// HugeRegion. The underlying type is bool, so kSlack is false and
// kAbandonedCount is true.
enum class HugeRegionCountOption : bool {
// Default behavior: the decision is based on slack. When slack is greater
// than 64MB (to ignore small binaries), and greater than the number of small
// allocations, we allocate large allocations from HugeRegion.
kSlack,
// Used when the experiment TEST_ONLY_TCMALLOC_USE_HUGE_REGIONS_MORE_OFTEN is
// enabled: the number of abandoned pages is considered in addition to slack.
// If the size of abandoned pages plus slack exceeds 64MB (to ignore small
// binaries), we use HugeRegion for large allocations.
kAbandonedCount
};
// An implementation of the PageAllocator interface that is hugepage-efficient.
// Attempts to pack allocations into full hugepages wherever possible,
// and aggressively returns empty ones to the system.
class HugePageAwareAllocator final : public PageAllocatorInterface {
 public:
  explicit HugePageAwareAllocator(MemoryTag tag);
  // For use in testing.
  HugePageAwareAllocator(MemoryTag tag,
                         HugeRegionCountOption use_huge_region_more_often);
  HugePageAwareAllocator(MemoryTag tag,
                         HugeRegionCountOption use_huge_region_more_often,
                         LifetimePredictionOptions lifetime_options);
  ~HugePageAwareAllocator() override = default;

  // Allocate a run of "n" pages.  Returns zero if out of memory.
  // Caller should not pass "n == 0" -- instead, n should have
  // been rounded up already.
  Span* New(Length n, size_t objects_per_span)
      ABSL_LOCKS_EXCLUDED(pageheap_lock) override;

  // As New, but the returned span is aligned to a <align>-page boundary.
  // <align> must be a power of two.
  Span* NewAligned(Length n, Length align, size_t objects_per_span)
      ABSL_LOCKS_EXCLUDED(pageheap_lock) override;

  // Delete the span "[p, p+n-1]".
  // REQUIRES: span was returned by earlier call to New() and
  //           has not yet been deleted.
  void Delete(Span* span, size_t objects_per_span)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) override;

  BackingStats stats() const
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) override;

  void GetSmallSpanStats(SmallSpanStats* result)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) override;

  void GetLargeSpanStats(LargeSpanStats* result)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) override;

  // Try to release at least num_pages for reuse by the OS.  Returns
  // the actual number of pages released, which may be less than
  // num_pages if there weren't enough pages to release. The result
  // may also be larger than num_pages since page_heap might decide to
  // release one large range instead of fragmenting it into two
  // smaller released and unreleased ranges.
  Length ReleaseAtLeastNPages(Length num_pages)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) override;

  // Releases pages straight from the filler, accepting that this may break up
  // hugepages. Intended for when memory must be released urgently.
  Length ReleaseAtLeastNPagesBreakingHugepages(Length n)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);

  // Prints stats about the page heap to *out.
  void Print(Printer* out) ABSL_LOCKS_EXCLUDED(pageheap_lock) override;

  // Print stats to *out, excluding long/likely uninteresting things
  // unless <everything> is true.
  void Print(Printer* out, bool everything) ABSL_LOCKS_EXCLUDED(pageheap_lock);

  // Emits the same statistics into <region> in pbtxt form.
  void PrintInPbtxt(PbtxtRegion* region)
      ABSL_LOCKS_EXCLUDED(pageheap_lock) override;

  // Number of huge pages currently counted as donated to the filler (see
  // donated_huge_pages_ below).
  HugeLength DonatedHugePages() const
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
    return donated_huge_pages_;
  }

  // Number of pages that have been retained on huge pages by donations that did
  // not reassemble by the time the larger allocation was deallocated.
  Length AbandonedPages() const ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
    return abandoned_pages_;
  }

  // Accessors for the underlying components.
  const HugeCache* cache() const { return &cache_; }

  LifetimeBasedAllocator& lifetime_based_allocator() {
    return lifetime_allocator_;
  }

  const HugeRegionSet<HugeRegion>& region() const
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
    return regions_;
  }

 private:
  using FillerType = HugePageFiller<PageTracker>;
  FillerType filler_ ABSL_GUARDED_BY(pageheap_lock);

  // Region factory handed to the lifetime-based allocator: it obtains backing
  // huge pages from alloc_ (unless the caller already supplied a valid range)
  // and constructs a HugeRegion in storage taken from region_allocator_.
  class RegionAllocImpl final : public LifetimeBasedAllocator::RegionAlloc {
   public:
    explicit RegionAllocImpl(HugePageAwareAllocator* p) : p_(p) {}

    // We need to explicitly define the destructor here so that it gets
    // placed within GOOGLE_MALLOC_SECTION.
    ~RegionAllocImpl() override {}

    // Returns nullptr when no valid range could be obtained.
    HugeRegion* AllocRegion(HugeLength n, HugeRange* range) override
        ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
      if (!range->valid()) {
        *range = p_->alloc_.Get(n);
      }
      if (!range->valid()) return nullptr;
      HugeRegion* region = p_->region_allocator_.New();
      new (region) HugeRegion(*range, MemoryModifyFunction(SystemRelease));
      return region;
    }

   private:
    HugePageAwareAllocator* p_;
  };

  // Calls SystemRelease, but with dropping of pageheap_lock around the call.
  static ABSL_MUST_USE_RESULT bool UnbackWithoutLock(void* start, size_t length)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);

  HugeRegionSet<HugeRegion> regions_ ABSL_GUARDED_BY(pageheap_lock);

  PageHeapAllocator<FillerType::Tracker> tracker_allocator_
      ABSL_GUARDED_BY(pageheap_lock);
  PageHeapAllocator<HugeRegion> region_allocator_
      ABSL_GUARDED_BY(pageheap_lock);

  FillerType::Tracker* GetTracker(HugePage p);

  void SetTracker(HugePage p, FillerType::Tracker* pt);

  template <MemoryTag tag>
  static AddressRange AllocAndReport(size_t bytes, size_t align)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);

  // NOTE(review): the out-of-line definition is annotated with
  // ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock); this declaration is not.
  static void* MetaDataAlloc(size_t bytes);

  HugeAllocator alloc_ ABSL_GUARDED_BY(pageheap_lock);
  HugeCache cache_ ABSL_GUARDED_BY(pageheap_lock);

  // donated_huge_pages_ measures the number of huge pages contributed to the
  // filler from left overs of large huge page allocations.  When the large
  // allocation is deallocated, we decrement this count *if* we were able to
  // fully reassemble the address range (that is, the partial hugepage did not
  // get stuck in the filler).
  HugeLength donated_huge_pages_ ABSL_GUARDED_BY(pageheap_lock);

  // abandoned_pages_ tracks the number of pages contributed to the filler after
  // a donating allocation is deallocated but the entire huge page has not been
  // reassembled.
  Length abandoned_pages_ ABSL_GUARDED_BY(pageheap_lock);

  // Performs lifetime predictions for large objects and places short-lived
  // objects into a separate region to reduce filler contention.
  RegionAllocImpl lifetime_allocator_region_alloc_;
  LifetimeBasedAllocator lifetime_allocator_;

  // Determines if the experiment is enabled. If enabled, we use
  // abandoned_pages_ in addition to slack in determining when to use
  // HugeRegion.
  const HugeRegionCountOption use_huge_region_more_often_;
  bool UseHugeRegionMoreOften() const {
    return use_huge_region_more_often_ ==
           HugeRegionCountOption::kAbandonedCount;
  }

  void GetSpanStats(SmallSpanStats* small, LargeSpanStats* large,
                    PageAgeHistograms* ages)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);

  PageId RefillFiller(Length n, size_t num_objects, bool* from_released)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);

  // Allocate the first <n> from p, and contribute the rest to the filler.  If
  // "donated" is true, the contribution will be marked as coming from the
  // tail of a multi-hugepage alloc.  Returns the allocated section.
  PageId AllocAndContribute(HugePage p, Length n, size_t num_objects,
                            bool donated)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);

  // Helpers for New().

  Span* LockAndAlloc(Length n, size_t objects_per_span, bool* from_released);

  Span* AllocSmall(Length n, size_t objects_per_span, bool* from_released)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
  Span* AllocLarge(Length n, size_t objects_per_span, bool* from_released,
                   LifetimeStats* lifetime_context)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
  Span* AllocEnormous(Length n, size_t objects_per_span, bool* from_released)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);

  Span* AllocRawHugepages(Length n, size_t num_objects, bool* from_released)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);

  // Allocates a span and adds a tracker. This span has to be associated with a
  // filler donation and have an associated page tracker. A tracker will only be
  // added if there is an associated lifetime prediction.
  Span* AllocRawHugepagesAndMaybeTrackLifetime(
      Length n, size_t num_objects,
      const LifetimeBasedAllocator::AllocationResult& lifetime_alloc,
      bool* from_released) ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);

  bool AddRegion() ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);

  void ReleaseHugepage(FillerType::Tracker* pt)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);

  // Return an allocation from a single hugepage.
  void DeleteFromHugepage(FillerType::Tracker* pt, PageId p, Length n,
                          size_t num_objects, bool might_abandon)
      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);

  // Finish an allocation request - give it a span and mark it in the pagemap.
  Span* Finalize(Length n, size_t num_objects, PageId page);
};
} // namespace tcmalloc_internal
} // namespace tcmalloc
GOOGLE_MALLOC_SECTION_END
#endif // TCMALLOC_HUGE_PAGE_AWARE_ALLOCATOR_H_

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

Some files were not shown because too many files have changed in this diff Show More