SERVER-85737 add latest TCMalloc from google as allocator option (#18870)
GitOrigin-RevId: 23c89085da2424a0fb91913c42c5d356b6a860df
@ -3,6 +3,7 @@ src/third_party/grpc
|
||||
src/third_party/abseil-cpp
|
||||
src/third_party/protobuf
|
||||
src/third_party/re2
|
||||
src/third_party/tcmalloc
|
||||
|
||||
# Ignore node_modules due to the following error
|
||||
# ERROR: in verify_node_modules_ignored:
|
||||
|
||||
84
SConstruct
@ -105,7 +105,7 @@ def use_system_version_of_library(name):
|
||||
# add a new C++ library dependency that may be shimmed out to the system, add it to the below
|
||||
# list.
|
||||
def using_system_version_of_cxx_libraries():
|
||||
cxx_library_names = ["tcmalloc", "boost"]
|
||||
cxx_library_names = ["tcmalloc-google", "boost", "tcmalloc-gperf"]
|
||||
return True in [use_system_version_of_library(x) for x in cxx_library_names]
|
||||
|
||||
|
||||
@ -415,7 +415,7 @@ add_option(
|
||||
|
||||
add_option(
|
||||
'allocator',
|
||||
choices=["auto", "system", "tcmalloc", "tcmalloc-experimental"],
|
||||
choices=["auto", "system", "tcmalloc-google", "tcmalloc-gperf"],
|
||||
default=build_profile.allocator,
|
||||
help='allocator to use (use "auto" for best choice for current platform)',
|
||||
type='choice',
|
||||
@ -485,7 +485,8 @@ for pack in [
|
||||
('protobuf', "Protocol Buffers"),
|
||||
('snappy', ),
|
||||
('stemmer', ),
|
||||
('tcmalloc', ),
|
||||
('tcmalloc-google', ),
|
||||
('tcmalloc-gperf', ),
|
||||
('libunwind', ),
|
||||
('valgrind', ),
|
||||
('wiredtiger', ),
|
||||
@ -2124,17 +2125,48 @@ env['TARGET_OS_FAMILY'] = 'posix' if env.TargetOSIs('posix') else env.GetTargetO
|
||||
# would be nicer to use SetOption here, but you can't reset user
|
||||
# options for some strange reason in SCons. Instead, we store this
|
||||
# option as a new variable in the environment.
|
||||
# Split platform.release() on "." and read the first two components as the
# integer kernel major and minor versions.
try:
|
||||
kernel_version = platform.release().split(".")
|
||||
kernel_major = int(kernel_version[0])
|
||||
kernel_minor = int(kernel_version[1])
|
||||
# ValueError: a component is not a plain integer; IndexError: fewer than two components.
except (ValueError, IndexError):
|
||||
print(
|
||||
f"Failed to extract kernel major and minor versions, tcmalloc-google will not be available for use: {kernel_version}"
|
||||
)
|
||||
# Fall back to 0.0, which compares below the 4.3 threshold used by the allocator checks.
kernel_major = 0
|
||||
kernel_minor = 0
|
||||
|
||||
if get_option('allocator') == "auto":
|
||||
# using an allocator besides system on android would require either fixing or disabling
|
||||
# gperftools on android
|
||||
if env.TargetOSIs('windows') or \
|
||||
env.TargetOSIs('linux') and not env.TargetOSIs('android'):
|
||||
env['MONGO_ALLOCATOR'] = "tcmalloc"
|
||||
if env.TargetOSIs('linux') and env['TARGET_ARCH'] in ('x86_64', 'aarch64'):
|
||||
|
||||
# TODO SERVER-86472 make bazel support both tcmalloc implementations
|
||||
if env.get("BAZEL_BUILD_ENABLED"):
|
||||
env['MONGO_ALLOCATOR'] = "tcmalloc-gperf"
|
||||
else:
|
||||
env['MONGO_ALLOCATOR'] = "tcmalloc-google"
|
||||
|
||||
# Google's tcmalloc uses the membarrier() system call, which was added in Linux 4.3,
|
||||
# so fall back to gperf implementation for older kernels
|
||||
if kernel_major < 4 or (kernel_major == 4 and kernel_minor < 3):
|
||||
env['MONGO_ALLOCATOR'] = "tcmalloc-gperf"
|
||||
|
||||
elif env.TargetOSIs('windows') or (env.TargetOSIs('linux')
|
||||
and env['TARGET_ARCH'] in ('ppc64le', 's390x')):
|
||||
env['MONGO_ALLOCATOR'] = "tcmalloc-gperf"
|
||||
else:
|
||||
env['MONGO_ALLOCATOR'] = "system"
|
||||
else:
|
||||
env['MONGO_ALLOCATOR'] = get_option('allocator')
|
||||
|
||||
# Validation and extra defines that apply only when the tcmalloc-google allocator was chosen
# (either by the "auto" selection or explicitly through --allocator).
if env['MONGO_ALLOCATOR'] == "tcmalloc-google":
    # Report a configure error when the detected kernel is older than 4.3, the minimum
    # this allocator is supported on.
    if (kernel_major, kernel_minor) < (4, 3):
        env.ConfError(
            f"tcmalloc-google allocator only supported on linux kernel 4.3 or greater: kernel version={platform.release()}"
        )

    env.Append(CPPDEFINES=["ABSL_ALLOCATOR_NOTHROW"])
|
||||
|
||||
if has_option("cache"):
|
||||
if has_option("gcov"):
|
||||
env.FatalError("Mixing --cache and --gcov doesn't work correctly yet. See SERVER-11084")
|
||||
@ -2445,6 +2477,13 @@ if not env.TargetOSIs('windows'):
|
||||
env["LINKCOM"] = env["LINKCOM"].replace("$LINKFLAGS", "$PROGLINKFLAGS")
|
||||
env["PROGLINKFLAGS"] = ['$LINKFLAGS']
|
||||
|
||||
# CPPFLAGS is used for assembler commands, this condition below assumes assembler files
|
||||
# will be only directly assembled in libraries and not programs
|
||||
if link_model.startswith("dynamic"):
|
||||
env.Append(CPPFLAGS=["-fPIC"])
|
||||
else:
|
||||
env.Append(CPPFLAGS=["-fPIE"])
|
||||
|
||||
# When it is necessary to supply additional SHLINKFLAGS without modifying the toolset default,
|
||||
# following appends contents of SHLINKFLAGS_EXTRA variable to the linker command
|
||||
env.AppendUnique(SHLINKFLAGS=['$SHLINKFLAGS_EXTRA'])
|
||||
@ -3070,7 +3109,9 @@ if env.TargetOSIs('posix'):
|
||||
# If runtime hardening is requested, then build anything
|
||||
# destined for an executable with the necessary flags for PIE.
|
||||
env.AppendUnique(
|
||||
PROGCFLAGS=['-fPIE'],
|
||||
PROGCCFLAGS=['-fPIE'],
|
||||
PROGCXXFLAGS=['-fPIE'],
|
||||
PROGLINKFLAGS=['-pie'],
|
||||
)
|
||||
|
||||
@ -3102,7 +3143,8 @@ if env.TargetOSIs('posix'):
|
||||
|
||||
# For debug builds with tcmalloc, we need the frame pointer so it can
|
||||
# record the stack of allocations.
|
||||
can_nofp &= not (debugBuild and (env['MONGO_ALLOCATOR'] == 'tcmalloc'))
|
||||
can_nofp &= not (debugBuild and
|
||||
(env['MONGO_ALLOCATOR'] in ['tcmalloc-google', 'tcmalloc-gperf']))
|
||||
|
||||
# Only disable frame pointers if requested
|
||||
can_nofp &= ("nofp" in selected_experimental_optimizations)
|
||||
@ -4116,6 +4158,10 @@ def doConfigure(myenv):
|
||||
if not myenv.ToolchainIs('clang', 'gcc'):
|
||||
env.FatalError('sanitize is only supported with clang or gcc')
|
||||
|
||||
# sanitizer libs may inject undefined refs (for hooks) at link time, but
|
||||
# the symbols will be available at runtime via the compiler runtime lib.
|
||||
env.Append(LINKFLAGS='-Wl,--allow-shlib-undefined')
|
||||
|
||||
if myenv.ToolchainIs('gcc'):
|
||||
# GCC's implementation of ASAN depends on libdl.
|
||||
env.Append(LIBS=['dl'])
|
||||
@ -4157,11 +4203,14 @@ def doConfigure(myenv):
|
||||
get_san_lib_path(sanitizer) for sanitizer in sanitizer_list
|
||||
]
|
||||
|
||||
if 'thread' not in sanitizer_list:
|
||||
env.Append(LINKFLAGS=['-rtlib=compiler-rt', '-unwindlib=libgcc'])
|
||||
|
||||
if using_lsan:
|
||||
env.FatalError("Please use --sanitize=address instead of --sanitize=leak")
|
||||
|
||||
if (using_asan
|
||||
or using_msan) and env['MONGO_ALLOCATOR'] in ['tcmalloc', 'tcmalloc-experimental']:
|
||||
or using_msan) and env['MONGO_ALLOCATOR'] in ['tcmalloc-google', 'tcmalloc-gperf']:
|
||||
# There are multiply defined symbols between the sanitizer and
|
||||
# our vendorized tcmalloc.
|
||||
env.FatalError("Cannot use --sanitize=address or --sanitize=memory with tcmalloc")
|
||||
@ -4236,7 +4285,7 @@ def doConfigure(myenv):
|
||||
else:
|
||||
myenv.ConfError('Failed to enable sanitizers with flag: {0}', sanitizer_option)
|
||||
|
||||
if get_option('shared-libsan') == 'on':
|
||||
if get_option("shared-libsan") == "on":
|
||||
shared_libsan_option = '-shared-libsan'
|
||||
if myenv.AddToCCFLAGSIfSupported(shared_libsan_option):
|
||||
myenv.Append(LINKFLAGS=[shared_libsan_option])
|
||||
@ -5279,13 +5328,16 @@ def doConfigure(myenv):
|
||||
|
||||
# 'tcmalloc' needs to be the last library linked. Please, add new libraries before this
|
||||
# point.
|
||||
if myenv['MONGO_ALLOCATOR'] == 'tcmalloc':
|
||||
if use_system_version_of_library('tcmalloc'):
|
||||
conf.FindSysLibDep("tcmalloc", ["tcmalloc"])
|
||||
elif myenv['MONGO_ALLOCATOR'] in ['system', 'tcmalloc-experimental']:
|
||||
if myenv['MONGO_ALLOCATOR'] == 'tcmalloc-google':
|
||||
if use_system_version_of_library('tcmalloc-google'):
|
||||
conf.FindSysLibDep("tcmalloc-google", ["tcmalloc"])
|
||||
elif myenv['MONGO_ALLOCATOR'] == 'tcmalloc-gperf':
|
||||
if use_system_version_of_library('tcmalloc-gperf'):
|
||||
conf.FindSysLibDep("tcmalloc-gperf", ["tcmalloc"])
|
||||
elif myenv['MONGO_ALLOCATOR'] in ['system']:
|
||||
pass
|
||||
else:
|
||||
myenv.FatalError("Invalid --allocator parameter: $MONGO_ALLOCATOR")
|
||||
myenv.FatalError(f"Invalid --allocator parameter: {env['MONGO_ALLOCATOR']}")
|
||||
|
||||
def CheckStdAtomic(context, base_type, extra_message):
|
||||
test_body = """
|
||||
|
||||
@ -348,6 +348,7 @@ buildvariants:
|
||||
archive-mongocryptd-debug
|
||||
lang_environment: LANG=C
|
||||
san_options: *ubsan_options
|
||||
# TODO SERVER-86610 set --allocator=tcmalloc-google
|
||||
compile_flags: >-
|
||||
--variables-files=etc/scons/mongodbtoolchain_stable_clang.vars
|
||||
--dbg=on
|
||||
@ -355,6 +356,7 @@ buildvariants:
|
||||
--sanitize=undefined
|
||||
--ssl
|
||||
--ocsp-stapling=off
|
||||
--allocator=tcmalloc-gperf
|
||||
-j$(grep -c ^processor /proc/cpuinfo)
|
||||
--link-model=dynamic
|
||||
--use-diagnostic-latches=on
|
||||
|
||||
@ -469,12 +469,14 @@ buildvariants:
|
||||
archive-mongocryptd-debug
|
||||
lang_environment: LANG=C
|
||||
san_options: *ubsan_options
|
||||
# TODO SERVER-86610 add tcmalloc-google as the allocator for ubsan
|
||||
compile_flags: >-
|
||||
--variables-files=etc/scons/mongodbtoolchain_${toolchain_version}_clang.vars
|
||||
--dbg=on
|
||||
--opt=on
|
||||
--sanitize=undefined
|
||||
--ssl
|
||||
--allocator=tcmalloc-gperf
|
||||
--ocsp-stapling=off
|
||||
-j$(grep -c ^processor /proc/cpuinfo)
|
||||
--use-diagnostic-latches=on
|
||||
|
||||
@ -522,9 +522,13 @@ def generate(env: SCons.Environment.Environment) -> None:
|
||||
else:
|
||||
build_mode = f"opt_{mongo_generators.get_opt_options(env)}" # one of "on", "size", "off"
|
||||
|
||||
# Deprecate tcmalloc-experimental
|
||||
allocator = "tcmalloc" if env.GetOption(
|
||||
"allocator") == "tcmalloc-experimental" else env.GetOption("allocator")
|
||||
# TODO SERVER-86472 make bazel support both tcmalloc implementations
|
||||
if env.GetOption("allocator") == "tcmalloc-google":
|
||||
env.ConfError("Bazel build currently does not support tcmalloc-google allocator.")
|
||||
if env.GetOption("allocator") == "tcmalloc-gperf":
|
||||
allocator = "tcmalloc"
|
||||
else:
|
||||
allocator = env.GetOption("allocator")
|
||||
|
||||
bazel_internal_flags = [
|
||||
f'--//bazel/config:compiler_type={env.ToolchainName()}',
|
||||
|
||||
@ -38,6 +38,7 @@ env['LINK'] = [f'{base_toolchain_bin}/g++']
|
||||
env['SHLINK'] = [f'{base_toolchain_bin}/g++']
|
||||
env['CPPPATH'] = [str(tidy_include)]
|
||||
env['LIBPATH'] = []
|
||||
env['CPPFLAGS'] = []
|
||||
env['CCFLAGS'] = [
|
||||
'-DGTEST_HAS_RTTI=0',
|
||||
'-D_GNU_SOURCE',
|
||||
|
||||
@ -37,7 +37,7 @@
|
||||
|
||||
#include <src/core/lib/security/credentials/ssl/ssl_credentials.h>
|
||||
#include <src/core/lib/security/security_connector/ssl_utils.h>
|
||||
#include <src/core/tsi/ssl_transport_security.cc>
|
||||
#include <src/core/tsi/ssl_transport_security.h>
|
||||
#include <src/cpp/server/secure_server_credentials.h>
|
||||
|
||||
#include "mongo/base/error_codes.h"
|
||||
|
||||
@ -361,20 +361,38 @@ env.CppUnitTest(
|
||||
],
|
||||
)
|
||||
|
||||
if env['MONGO_ALLOCATOR'] in ['tcmalloc', 'tcmalloc-experimental']:
|
||||
tcmspEnv = env.Clone()
|
||||
|
||||
if not use_system_version_of_library('tcmalloc'):
|
||||
|
||||
# Add in the include path for our vendored tcmalloc.
|
||||
tcmspEnv.InjectThirdParty('gperftools')
|
||||
|
||||
tcmallocAttrs = None
|
||||
for impl in [
|
||||
{
|
||||
# Modern standalone tcmalloc (not gperftools)
|
||||
'options': ['tcmalloc-google'],
|
||||
'sys_name': 'tcmalloc-google',
|
||||
'inject': 'tcmalloc',
|
||||
'cppdefs': ['MONGO_HAVE_GOOGLE_TCMALLOC'],
|
||||
},
|
||||
{
|
||||
# Old gperftools tcmalloc
|
||||
'options': ['tcmalloc-gperf'],
|
||||
'sys_name':
|
||||
'tcmalloc-gperf',
|
||||
'inject':
|
||||
'gperftools',
|
||||
# If our changes to tcmalloc are ever upstreamed, this should become set based on a top
|
||||
# level configure check, though its effects should still be scoped just to these files.
|
||||
tcmspEnv.Append(CPPDEFINES=[
|
||||
'cppdefs': [
|
||||
'MONGO_HAVE_GPERF_TCMALLOC',
|
||||
'MONGO_HAVE_GPERFTOOLS_GET_THREAD_CACHE_SIZE',
|
||||
'MONGO_HAVE_GPERFTOOLS_SIZE_CLASS_STATS',
|
||||
])
|
||||
],
|
||||
},
|
||||
]:
|
||||
if env['MONGO_ALLOCATOR'] in impl['options']:
|
||||
tcmallocAttrs = impl
|
||||
if tcmallocAttrs:
|
||||
tcmspEnv = env.Clone()
|
||||
if not use_system_version_of_library(tcmallocAttrs['sys_name']):
|
||||
tcmspEnv.InjectThirdParty(tcmallocAttrs['inject'])
|
||||
tcmspEnv.Append(CPPDEFINES=tcmallocAttrs['cppdefs'])
|
||||
|
||||
if not use_system_version_of_library('valgrind'):
|
||||
# Include valgrind since tcmalloc disables itself while running under valgrind
|
||||
|
||||
@ -27,8 +27,6 @@
|
||||
* it in the license file.
|
||||
*/
|
||||
|
||||
#include <gperftools/malloc_hook.h>
|
||||
|
||||
#include <absl/hash/hash.h>
|
||||
|
||||
// IWYU pragma: no_include "cxxabi.h"
|
||||
@ -60,6 +58,7 @@
|
||||
#include "mongo/logv2/log.h"
|
||||
#include "mongo/logv2/log_attr.h"
|
||||
#include "mongo/logv2/log_component.h"
|
||||
#include "mongo/stdx/unordered_map.h"
|
||||
#include "mongo/util/stacktrace.h"
|
||||
#include "mongo/util/tcmalloc_parameters_gen.h"
|
||||
|
||||
@ -67,11 +66,54 @@
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#include <MurmurHash3.h>
|
||||
|
||||
#ifdef MONGO_HAVE_GPERF_TCMALLOC
|
||||
#include <gperftools/malloc_hook.h>
|
||||
#endif
|
||||
|
||||
#ifdef MONGO_HAVE_GOOGLE_TCMALLOC
|
||||
#include <absl/debugging/symbolize.h>
|
||||
#include <tcmalloc/malloc_extension.h>
|
||||
#endif
|
||||
|
||||
#if defined(_POSIX_VERSION) && defined(MONGO_CONFIG_HAVE_EXECINFO_BACKTRACE)
|
||||
#include <dlfcn.h>
|
||||
#include <execinfo.h>
|
||||
#endif
|
||||
|
||||
|
||||
#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kDefault
|
||||
|
||||
// for dlfcn.h and backtrace
|
||||
#if defined(_POSIX_VERSION) && defined(MONGO_CONFIG_HAVE_EXECINFO_BACKTRACE)
|
||||
#if defined(_POSIX_VERSION) && defined(MONGO_CONFIG_HAVE_EXECINFO_BACKTRACE) && \
|
||||
(defined(MONGO_HAVE_GPERF_TCMALLOC) || defined(MONGO_HAVE_GOOGLE_TCMALLOC))
|
||||
|
||||
|
||||
namespace mongo {
|
||||
namespace {
|
||||
|
||||
/** Simple wrapper for the demangler, particularly its buffer space. */
|
||||
class Demangler {
public:
    Demangler() = default;

    // The buffer in _buf is released with free() in the destructor, so a second object
    // holding the same pointer would free it twice. Forbid both forms of copying.
    Demangler(const Demangler&) = delete;
    Demangler& operator=(const Demangler&) = delete;

    ~Demangler() {
        free(_buf);
    }

    /**
     * Demangles `sym` via abi::__cxa_demangle, handing it the buffer kept from earlier calls.
     * Returns the demangled name, or nullptr when demangling fails. The returned pointer is
     * the buffer owned by this object; callers must not free it.
     */
    char* operator()(const char* sym) {
        char* dm = abi::__cxa_demangle(sym, _buf, &_bufSize, &_status);
        if (dm)
            _buf = dm;  // adopt the buffer handed back by the demangler
        return dm;
    }

private:
    size_t _bufSize = 0;   // size of _buf as maintained by __cxa_demangle
    char* _buf = nullptr;  // malloc-family buffer reused across calls; freed in the destructor
    int _status = 0;       // status code written by the most recent __cxa_demangle call
};
|
||||
|
||||
|
||||
//
|
||||
// Sampling heap profiler
|
||||
@ -149,33 +191,6 @@
|
||||
// and acceptable size overhead for the hash tables.
|
||||
//
|
||||
|
||||
namespace mongo {
|
||||
namespace {
|
||||
|
||||
// Simple wrapper for the demangler, particularly its buffer space.
|
||||
class Demangler {
public:
    Demangler() = default;

    // _buf is released with free() in the destructor; copying the object in either form
    // would leave two owners of the same buffer, so both are disabled.
    Demangler(const Demangler&) = delete;
    Demangler& operator=(const Demangler&) = delete;

    ~Demangler() {
        free(_buf);
    }

    // Demangles `sym` with abi::__cxa_demangle, reusing the buffer kept from earlier calls.
    // Returns the demangled name, or nullptr on failure. The result points at the buffer
    // owned by this object, so callers must not free it.
    char* operator()(const char* sym) {
        char* dm = abi::__cxa_demangle(sym, _buf, &_bufSize, &_status);
        if (dm)
            _buf = dm;  // adopt the buffer handed back by the demangler
        return dm;
    }

private:
    size_t _bufSize = 0;   // size of _buf as maintained by __cxa_demangle
    char* _buf = nullptr;  // malloc-family buffer reused across calls
    int _status = 0;       // status code written by the most recent __cxa_demangle call
};
|
||||
|
||||
// TODO SERVER-44010: Consider replacing this custom implementation with a generic one.
|
||||
//
|
||||
// Simple hash table maps Key->Value.
|
||||
@ -201,29 +216,6 @@ using Hash = size_t;
|
||||
|
||||
template <class Key, class Value>
|
||||
class HashTable {
|
||||
HashTable(const HashTable&) = delete;
|
||||
HashTable& operator=(const HashTable&) = delete;
|
||||
|
||||
private:
|
||||
struct Entry {
|
||||
Key key{};
|
||||
Value value{};
|
||||
std::atomic<Entry*> next{nullptr}; // NOLINT
|
||||
std::atomic<bool> valid{false}; // NOLINT
|
||||
Entry() {}
|
||||
};
|
||||
|
||||
const size_t maxEntries; // we allocate storage for this many entries on creation
|
||||
std::atomic_size_t numEntries; // number of entries currently in use NOLINT
|
||||
size_t numBuckets; // number of buckets, computed as numEntries * loadFactor
|
||||
|
||||
// pre-allocate buckets and entries
|
||||
std::unique_ptr<std::atomic<Entry*>[]> buckets; // NOLINT
|
||||
std::unique_ptr<Entry[]> entries;
|
||||
|
||||
std::atomic_size_t nextEntry; // first entry that's never been used NOLINT
|
||||
Entry* freeEntry; // linked list of entries returned to us by removeEntry
|
||||
|
||||
public:
|
||||
HashTable(size_t maxEntries, int loadFactor)
|
||||
: maxEntries(maxEntries),
|
||||
@ -246,17 +238,15 @@ public:
|
||||
} else if (nextEntry < maxEntries) {
|
||||
entry = &entries[nextEntry++];
|
||||
}
|
||||
if (entry) {
|
||||
entry->next = buckets[hash].load();
|
||||
buckets[hash] = entry;
|
||||
entry->key = key;
|
||||
entry->value = value;
|
||||
entry->valid = true; // signal that the entry is well-formed and may be traversed
|
||||
numEntries++;
|
||||
return &entry->value;
|
||||
} else {
|
||||
if (!entry)
|
||||
return nullptr;
|
||||
}
|
||||
entry->next = buckets[hash].load();
|
||||
buckets[hash] = entry;
|
||||
entry->key = key;
|
||||
entry->value = value;
|
||||
entry->valid = true; // signal that the entry is well-formed and may be traversed
|
||||
++numEntries;
|
||||
return &entry->value;
|
||||
}
|
||||
|
||||
// Find the entry containing Key in the specified hash bucket.
|
||||
@ -279,7 +269,7 @@ public:
|
||||
entry->valid = false; // first signal entry is invalid as it may get reused
|
||||
entry->next = freeEntry;
|
||||
freeEntry = entry;
|
||||
numEntries--;
|
||||
--numEntries;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -292,76 +282,105 @@ public:
|
||||
// Note however it is not guaranteed to provide snapshot semantics wrt the set of entries,
|
||||
// and caller must ensure safety wrt concurrent updates to the Value of an entry
|
||||
template <typename F>
|
||||
void forEach(F f) {
|
||||
void forEach(const F& f) {
|
||||
for (size_t i = 0; i < nextEntry; i++) {
|
||||
Entry& entry = entries[i];
|
||||
if (entry.valid) // only traverse well-formed entries
|
||||
f(entry.key, entry.value);
|
||||
Entry& e = entries[i];
|
||||
if (e.valid) // only traverse well-formed entries
|
||||
f(e.key, e.value);
|
||||
}
|
||||
}
|
||||
|
||||
// Determines whether the specified hash bucket is empty. May be called concurrently with
|
||||
// insert() and remove(). Concurrent visibility on other threads is guaranteed because
|
||||
// buckets[hash] is atomic.
|
||||
bool isEmptyBucket(Hash hash) {
|
||||
hash %= numBuckets;
|
||||
return buckets[hash] == nullptr;
|
||||
bool isEmptyBucket(Hash hash) const {
|
||||
return !buckets[hash % numBuckets];
|
||||
}
|
||||
|
||||
// Number of entries.
|
||||
size_t size() {
|
||||
size_t size() const {
|
||||
return numEntries;
|
||||
}
|
||||
|
||||
// Highwater mark of number of entries used, for reporting stats.
|
||||
size_t maxSizeSeen() {
|
||||
size_t maxSizeSeen() const {
|
||||
return nextEntry;
|
||||
}
|
||||
|
||||
// Returns total allocated size of the hash table, for reporting stats.
|
||||
size_t memorySizeBytes() {
|
||||
size_t memorySizeBytes() const {
|
||||
return numBuckets * sizeof(buckets[0]) + maxEntries * sizeof(entries[0]);
|
||||
}
|
||||
|
||||
private:
|
||||
struct Entry {
|
||||
Key key{};
|
||||
Value value{};
|
||||
std::atomic<Entry*> next{nullptr}; // NOLINT
|
||||
std::atomic<bool> valid{false}; // NOLINT
|
||||
};
|
||||
|
||||
const size_t maxEntries; // we allocate storage for this many entries on creation
|
||||
std::atomic_size_t numEntries; // number of entries currently in use NOLINT
|
||||
size_t numBuckets; // number of buckets, computed as numEntries * loadFactor
|
||||
|
||||
// pre-allocate buckets and entries
|
||||
std::unique_ptr<std::atomic<Entry*>[]> buckets; // NOLINT
|
||||
std::unique_ptr<Entry[]> entries;
|
||||
|
||||
std::atomic_size_t nextEntry; // first entry that's never been used NOLINT
|
||||
Entry* freeEntry; // linked list of entries returned to us by removeEntry
|
||||
};
|
||||
|
||||
|
||||
namespace heap_profiler_detail_gperf_tcmalloc {
|
||||
class HeapProfiler {
|
||||
public:
|
||||
static inline HeapProfiler* heapProfiler;
|
||||
|
||||
HeapProfiler() {
|
||||
// Set sample interval from the parameter.
|
||||
sampleIntervalBytes = HeapProfilingSampleIntervalBytes;
|
||||
|
||||
// This is our only allocator dependency - ifdef and change as
|
||||
// appropriate for other allocators, using hooks or shims.
|
||||
// For tcmalloc we skip two frames that are internal to the allocator
|
||||
// so that the top frame is the public tc_* function.
|
||||
skipStartFrames = 2;
|
||||
skipEndFrames = 0;
|
||||
#ifdef MONGO_HAVE_GPERF_TCMALLOC
|
||||
MallocHook::AddNewHook(+[](const void* p, size_t sz) { heapProfiler->_alloc(p, sz); });
|
||||
MallocHook::AddDeleteHook(+[](const void* p) { heapProfiler->_free(p); });
|
||||
#endif
|
||||
}
|
||||
|
||||
static void generateServerStatusSection(BSONObjBuilder& builder) {
|
||||
if (heapProfiler)
|
||||
heapProfiler->_generateServerStatusSection(builder);
|
||||
}
|
||||
|
||||
static void start() {
|
||||
heapProfiler = new HeapProfiler();
|
||||
}
|
||||
|
||||
private:
|
||||
// 0: sampling internally disabled
|
||||
// 1: sample every allocation - byte accurate but slow and big
|
||||
// >1: sample every sampleIntervalBytes bytes allocated - less accurate but fast and small
|
||||
std::atomic_size_t sampleIntervalBytes; // NOLINT
|
||||
|
||||
// guards updates to both object and stack hash tables
|
||||
stdx::mutex hashtable_mutex; // NOLINT
|
||||
// guards against races updating the StackInfo bson representation
|
||||
stdx::mutex stackinfo_mutex; // NOLINT
|
||||
|
||||
// cumulative bytes allocated - determines when samples are taken
|
||||
std::atomic_size_t bytesAllocated{0}; // NOLINT
|
||||
|
||||
// estimated currently active bytes - sum of activeBytes for all stacks
|
||||
size_t totalActiveBytes = 0;
|
||||
|
||||
//
|
||||
// Hash table of stacks
|
||||
//
|
||||
|
||||
using FrameInfo = void*; // per-frame information is just the IP
|
||||
|
||||
static const int kMaxStackInfos = 20000; // max number of unique call sites we handle
|
||||
static const int kStackHashTableLoadFactor = 2; // keep loading <50%
|
||||
static const size_t kMaxFramesPerStack = 100; // max depth of stack
|
||||
|
||||
static const int kMaxObjInfos = 1024 * 1024; // maximum tracked allocations
|
||||
static const int kObjHashTableLoadFactor = 4; // keep hash table loading <25%
|
||||
|
||||
static const int kMaxImportantSamples = 4 * 3600; // reset every 4 hours at 1Hz
|
||||
|
||||
// stack HashTable Key
|
||||
struct Stack {
|
||||
size_t numFrames = 0;
|
||||
std::array<FrameInfo, kMaxFramesPerStack> frames;
|
||||
Stack() {}
|
||||
Stack() = default;
|
||||
|
||||
bool operator==(const Stack& that) {
|
||||
return this->numFrames == that.numFrames &&
|
||||
std::equal(frames.begin(), frames.begin() + numFrames, that.frames.begin());
|
||||
friend bool operator==(const Stack& a, const Stack& b) {
|
||||
return a.numFrames == b.numFrames &&
|
||||
std::equal(a.frames.begin(), a.frames.begin() + a.numFrames, b.frames.begin());
|
||||
}
|
||||
|
||||
Hash hash() {
|
||||
@ -371,61 +390,51 @@ private:
|
||||
numFrames * sizeof(FrameInfo)};
|
||||
return absl::HashOf(dataRange);
|
||||
}
|
||||
|
||||
size_t numFrames = 0;
|
||||
std::array<FrameInfo, kMaxFramesPerStack> frames;
|
||||
};
|
||||
|
||||
// Stack HashTable Value.
|
||||
struct StackInfo {
|
||||
StackInfo() = default;
|
||||
explicit StackInfo(int stackNum) : stackNum(stackNum) {}
|
||||
|
||||
int stackNum = 0; // used for stack short name
|
||||
size_t activeBytes = 0; // number of live allocated bytes charged to this stack
|
||||
bool logged = false; // true when stack has been logged once.
|
||||
|
||||
explicit StackInfo(int stackNum) : stackNum(stackNum) {}
|
||||
StackInfo() {}
|
||||
};
|
||||
|
||||
// The stack HashTable itself.
|
||||
HashTable<Stack, StackInfo> stackHashTable{kMaxStackInfos, kStackHashTableLoadFactor};
|
||||
struct ByPointeeStackNum {
|
||||
bool operator()(const StackInfo* a, const StackInfo* b) const {
|
||||
return a->stackNum < b->stackNum;
|
||||
}
|
||||
};
|
||||
|
||||
// frames to skip at top and bottom of backtrace when reporting stacks
|
||||
size_t skipStartFrames = 0;
|
||||
size_t skipEndFrames = 0;
|
||||
|
||||
|
||||
//
|
||||
// Hash table of allocated objects.
|
||||
//
|
||||
|
||||
static const int kMaxObjInfos = 1024 * 1024; // maximum tracked allocations
|
||||
static const int kObjHashTableLoadFactor = 4; // keep hash table loading <25%
|
||||
|
||||
// Obj HashTable Key.
|
||||
struct Obj {
|
||||
const void* objPtr = nullptr;
|
||||
Obj() = default;
|
||||
explicit Obj(const void* objPtr) : objPtr(objPtr) {}
|
||||
Obj() {}
|
||||
|
||||
bool operator==(const Obj& that) {
|
||||
return this->objPtr == that.objPtr;
|
||||
friend bool operator==(const Obj& a, const Obj& b) {
|
||||
return a.objPtr == b.objPtr;
|
||||
}
|
||||
|
||||
Hash hash() {
|
||||
return absl::HashOf(objPtr);
|
||||
}
|
||||
|
||||
const void* objPtr = nullptr;
|
||||
};
|
||||
|
||||
// Obj HashTable Value.
|
||||
struct ObjInfo {
|
||||
size_t accountedLen = 0;
|
||||
StackInfo* stackInfo = nullptr;
|
||||
ObjInfo() = default;
|
||||
ObjInfo(size_t accountedLen, StackInfo* stackInfo)
|
||||
: accountedLen(accountedLen), stackInfo(stackInfo) {}
|
||||
ObjInfo() {}
|
||||
|
||||
size_t accountedLen = 0;
|
||||
StackInfo* stackInfo = nullptr;
|
||||
};
|
||||
|
||||
// The obj HashTable itself.
|
||||
HashTable<Obj, ObjInfo> objHashTable{kMaxObjInfos, kObjHashTableLoadFactor};
|
||||
|
||||
|
||||
// If we encounter an error that doesn't allow us to proceed, for
|
||||
// example out of space for new hash table entries, we internally
|
||||
// disable profiling and then log an error message.
|
||||
@ -562,26 +571,6 @@ private:
|
||||
"stackObj"_attr = builder.done());
|
||||
}
|
||||
|
||||
//
|
||||
// Generate serverStatus section.
|
||||
//
|
||||
|
||||
bool logGeneralStats = true; // first time only
|
||||
|
||||
// In order to reduce load on ftdc we track the stacks we deem important enough to emit
|
||||
// once a stack is deemed "important" it remains important from that point on.
|
||||
// "Important" is a sticky quality to improve the stability of the set of stacks we emit,
|
||||
// and we always emit them in stackNum order, greatly improving ftdc compression efficiency.
|
||||
struct ImportantStacksOrder {
|
||||
bool operator()(const StackInfo* a, const StackInfo* b) const {
|
||||
return a->stackNum < b->stackNum;
|
||||
}
|
||||
};
|
||||
std::set<const StackInfo*, ImportantStacksOrder> importantStacks;
|
||||
|
||||
int numImportantSamples = 0; // samples currently included in importantStacks
|
||||
const int kMaxImportantSamples = 4 * 3600; // reset every 4 hours at default 1 sample / sec
|
||||
|
||||
void _generateServerStatusSection(BSONObjBuilder& builder) {
|
||||
// compute and log some informational stats first time through
|
||||
if (logGeneralStats) {
|
||||
@ -681,44 +670,218 @@ private:
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Static hooks to give to the allocator.
|
||||
//
|
||||
// 0: sampling internally disabled
|
||||
// 1: sample every allocation - byte accurate but slow and big
|
||||
// >1: sample every sampleIntervalBytes bytes allocated - less accurate but fast and small
|
||||
std::atomic_size_t sampleIntervalBytes; // NOLINT
|
||||
|
||||
static void alloc(const void* obj, size_t objLen) {
|
||||
heapProfiler->_alloc(obj, objLen);
|
||||
}
|
||||
// guards updates to both object and stack hash tables
|
||||
stdx::mutex hashtable_mutex; // NOLINT
|
||||
// guards against races updating the StackInfo bson representation
|
||||
stdx::mutex stackinfo_mutex; // NOLINT
|
||||
|
||||
static void free(const void* obj) {
|
||||
heapProfiler->_free(obj);
|
||||
}
|
||||
// cumulative bytes allocated - determines when samples are taken
|
||||
std::atomic_size_t bytesAllocated{0}; // NOLINT
|
||||
|
||||
// estimated currently active bytes - sum of activeBytes for all stacks
|
||||
size_t totalActiveBytes = 0;
|
||||
|
||||
// The stack HashTable itself.
|
||||
HashTable<Stack, StackInfo> stackHashTable{kMaxStackInfos, kStackHashTableLoadFactor};
|
||||
|
||||
// frames to skip at top and bottom of backtrace when reporting stacks
|
||||
size_t skipStartFrames = 0;
|
||||
size_t skipEndFrames = 0;
|
||||
|
||||
// The obj HashTable itself.
|
||||
HashTable<Obj, ObjInfo> objHashTable{kMaxObjInfos, kObjHashTableLoadFactor};
|
||||
|
||||
bool logGeneralStats = true; // first time only
|
||||
|
||||
// In order to reduce load on ftdc we track the stacks we deem important enough to emit
|
||||
// once a stack is deemed "important" it remains important from that point on.
|
||||
// "Important" is a sticky quality to improve the stability of the set of stacks we emit,
|
||||
// and we always emit them in stackNum order, greatly improving ftdc compression efficiency.
|
||||
std::set<const StackInfo*, ByPointeeStackNum> importantStacks;
|
||||
|
||||
int numImportantSamples = 0; // samples currently included in importantStacks
|
||||
};
|
||||
} // namespace heap_profiler_detail_gperf_tcmalloc
|
||||
|
||||
namespace heap_profiler_detail_tcmalloc {
|
||||
#ifdef MONGO_HAVE_GOOGLE_TCMALLOC
|
||||
class HeapProfiler {
|
||||
public:
|
||||
static HeapProfiler* heapProfiler;
|
||||
static const int kMaxImportantSamples = 4 * 3600; // reset every 4 hours at 1Hz
|
||||
static inline HeapProfiler* heapProfiler;
|
||||
|
||||
HeapProfiler() {
|
||||
// Set sample interval from the parameter.
|
||||
sampleIntervalBytes = HeapProfilingSampleIntervalBytes;
|
||||
|
||||
// This is our only allocator dependency - ifdef and change as
|
||||
// appropriate for other allocators, using hooks or shims.
|
||||
// For tcmalloc we skip two frames that are internal to the allocator
|
||||
// so that the top frame is the public tc_* function.
|
||||
skipStartFrames = 2;
|
||||
skipEndFrames = 0;
|
||||
MallocHook::AddNewHook(alloc);
|
||||
MallocHook::AddDeleteHook(free);
|
||||
tcmalloc::MallocExtension::SetProfileSamplingRate(sampleIntervalBytes);
|
||||
auto profileToken = tcmalloc::MallocExtension::StartAllocationProfiling();
|
||||
profileTokens.push_back(std::move(profileToken));
|
||||
}
|
||||
|
||||
static void generateServerStatusSection(BSONObjBuilder& builder) {
|
||||
if (heapProfiler)
|
||||
heapProfiler->_generateServerStatusSection(builder);
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
// serverStatus section
|
||||
//
|
||||
static void start() {
|
||||
heapProfiler = new HeapProfiler();
|
||||
}
|
||||
|
||||
private:
|
||||
struct StackInfo {
|
||||
StackInfo(const tcmalloc::Profile::Sample& stackSample, int id) {
|
||||
stackNum = id;
|
||||
numFrames = stackSample.depth;
|
||||
// Generate a bson representation of our new stack.
|
||||
BSONArrayBuilder builder;
|
||||
std::string frameString(256, '\0');
|
||||
for (int i = 0; i < stackSample.depth; ++i) {
|
||||
char buf[256];
|
||||
if (!absl::Symbolize(stackSample.stack[i], buf, sizeof(buf))) {
|
||||
frameString = fmt::format("{}", stackSample.stack[i]);
|
||||
} else {
|
||||
frameString.assign(buf);
|
||||
}
|
||||
builder.append(frameString);
|
||||
}
|
||||
LOGV2(8592501,
|
||||
"heapProfile stack",
|
||||
"stackNum"_attr = stackNum,
|
||||
"stackObj"_attr = builder.obj());
|
||||
}
|
||||
|
||||
int stackNum = 0; // used for stack short name
|
||||
BSONObj stackObj; // symbolized representation
|
||||
int numFrames = 0;
|
||||
uint64_t activeBytes = 0;
|
||||
};
|
||||
|
||||
struct ByStackNum {
|
||||
bool operator()(StackInfo* a, StackInfo* b) const {
|
||||
return a->stackNum < b->stackNum;
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * Returns the 32-bit MurmurHash3 (seed 0) of the sample's raw frame-pointer array,
 * covering `depth` pointers.
 */
uint32_t StackHash(const tcmalloc::Profile::Sample& stackSample) {
    const auto stackBytes = stackSample.depth * sizeof(void*);
    uint32_t digest;
    MurmurHash3_x86_32(stackSample.stack, stackBytes, 0, &digest);
    return digest;
}
|
||||
|
||||
/**
 * Builds the heap profiler serverStatus section into `builder`.
 *
 * Emits a "stats" subobject (totalActiveBytes, bytesAllocated, numStacks) and a "stacks"
 * subobject holding the active byte count of every stack currently deemed important.
 * Each call also stops the running allocation profile, adds its sampled bytes to
 * sampleBytesAllocated, and starts a new allocation profile for the next call.
 */
void _generateServerStatusSection(BSONObjBuilder& builder) {
    // Compute and log some informational stats first time through
    if (logGeneralStats) {
        LOGV2(8592504,
              "Generating heap profiler serverStatus",
              "heapProfilingSampleIntervalBytes"_attr = HeapProfilingSampleIntervalBytes);
        LOGV2(8592503, "Following stack trace is for heap profiler informational purposes");
        printStackTrace();
        logGeneralStats = false;
    }

    // importantStacks is sticky across calls, so it can hold stacks that do not appear in
    // the snapshot taken below. Zero their byte counts first; stacks that are still live
    // get a fresh value from the snapshot. Without this reset such stacks would keep
    // reporting the activeBytes of the last snapshot they appeared in.
    for (auto&& stackInfo : importantStacks) {
        stackInfo->activeBytes = 0;
    }

    // Get a live snapshot profile of the current heap usage
    int64_t totalActiveBytes = 0;
    std::vector<StackInfo*> stackInfos;
    std::set<StackInfo*, ByStackNum> activeStacks;
    tcmalloc::MallocExtension::SnapshotCurrent(tcmalloc::ProfileType::kHeap)
        .Iterate([&](const auto& sample) {
            totalActiveBytes += sample.sum;
            // Compute backtrace hash of sample stack
            uint32_t stackHash = StackHash(sample);
            StackInfo* stackInfo = stackInfoMap[stackHash];
            // If this is a new stack, store in our stack map
            if (!stackInfo) {
                stackInfo = new StackInfo(sample, stackInfoMap.size());
                stackInfoMap[stackHash] = stackInfo;
            }
            auto activeStackSearch = activeStacks.find(stackInfo);
            if (activeStackSearch != activeStacks.end()) {
                // Stack already seen in this snapshot: accumulate.
                stackInfo->activeBytes += sample.sum;
            } else {
                // First sample for this stack in this snapshot: start from this sample.
                activeStacks.insert(stackInfo);
                stackInfos.push_back(stackInfo);
                stackInfo->activeBytes = sample.sum;
            }
        });

    // Get the series of allocation samples to this point
    auto currentToken = std::move(profileTokens.back());
    profileTokens.pop_back();
    auto allocProfile = std::move(currentToken).Stop();
    // Start a new allocation profile session for the next invocation
    auto newToken = tcmalloc::MallocExtension::StartAllocationProfiling();
    profileTokens.push_back(std::move(newToken));

    // Sum all the allocations performed (of what we sampled)
    int64_t allocatedBytes = 0;
    allocProfile.Iterate(
        [&](const tcmalloc::Profile::Sample& sample) { allocatedBytes += sample.sum; });
    sampleBytesAllocated += allocatedBytes;

    BSONObjBuilder(builder.subobjStart("stats"))
        .appendNumber("totalActiveBytes", static_cast<long long>(totalActiveBytes))
        .appendNumber("bytesAllocated", static_cast<long long>(sampleBytesAllocated))
        .appendNumber("numStacks", static_cast<long long>(stackInfoMap.size()));

    // Sort the stacks by descending active bytes and take enough of them to account for
    // more than 99% of the active bytes. Any stack that has ever met this criterion is
    // deemed "important".
    std::stable_sort(stackInfos.begin(), stackInfos.end(), [](StackInfo* a, StackInfo* b) {
        return a->activeBytes > b->activeBytes;
    });
    size_t threshold = totalActiveBytes * 0.99;
    size_t cumulative = 0;
    for (auto&& stackInfo : stackInfos) {
        importantStacks.insert(stackInfo);
        cumulative += stackInfo->activeBytes;
        if (cumulative > threshold)
            break;
    }

    // Build the stacks subsection by emitting every important stack, in stackNum order,
    // with its active byte count from this snapshot.
    {
        BSONObjBuilder stacks(builder.subobjStart("stacks"));
        for (auto&& stackInfo : importantStacks)
            BSONObjBuilder{stacks.subobjStart(fmt::format("stack{}", stackInfo->stackNum))}
                .appendNumber("activeBytes", static_cast<long long>(stackInfo->activeBytes));
    }

    // importantStacks grows monotonically, so it can accumulate unneeded stacks,
    // so we clear it periodically.
    if (++numImportantSamples >= kMaxImportantSamples) {
        LOGV2(8592502, "Clearing importantStacks");
        importantStacks.clear();
        numImportantSamples = 0;
    }
}
|
||||
|
||||
std::vector<tcmalloc::MallocExtension::AllocationProfilingToken> profileTokens;
|
||||
std::atomic_size_t sampleIntervalBytes;
|
||||
std::atomic_size_t sampleBytesAllocated{0};
|
||||
|
||||
bool logGeneralStats = true; // first time only
|
||||
stdx::unordered_map<uint32_t, StackInfo*> stackInfoMap;
|
||||
|
||||
// In order to reduce load on ftdc we track the stacks we deem important enough to emit
|
||||
// once a stack is deemed "important" it remains important from that point on.
|
||||
// "Important" is a sticky quality to improve the stability of the set of stacks we emit,
|
||||
// and we always emit them in stackNum order, greatly improving ftdc compression efficiency.
|
||||
std::set<StackInfo*, ByStackNum> importantStacks;
|
||||
|
||||
int numImportantSamples = 0; // samples currently included in importantStacks
|
||||
};
|
||||
#endif // MONGO_HAVE_GOOGLE_TCMALLOC
|
||||
} // namespace heap_profiler_detail_tcmalloc
|
||||
|
||||
#if defined(MONGO_HAVE_GOOGLE_TCMALLOC)
|
||||
using heap_profiler_detail_tcmalloc::HeapProfiler;
|
||||
#elif defined(MONGO_HAVE_GPERF_TCMALLOC)
|
||||
using heap_profiler_detail_gperf_tcmalloc::HeapProfiler;
|
||||
#endif
|
||||
|
||||
class HeapProfilerServerStatusSection final : public ServerStatusSection {
|
||||
public:
|
||||
@ -728,27 +891,26 @@ public:
|
||||
return HeapProfilingEnabled;
|
||||
}
|
||||
|
||||
BSONObj generateSection(OperationContext* opCtx,
|
||||
const BSONElement& configElement) const override {
|
||||
BSONObj generateSection(OperationContext*, const BSONElement&) const override {
|
||||
BSONObjBuilder builder;
|
||||
HeapProfiler::generateServerStatusSection(builder);
|
||||
return builder.obj();
|
||||
}
|
||||
} heapProfilerServerStatusSection;
|
||||
};
|
||||
|
||||
//
|
||||
// startup
|
||||
//
|
||||
#ifdef MONGO_HAVE_HEAP_PROFILER
|
||||
|
||||
HeapProfiler* HeapProfiler::heapProfiler;
|
||||
HeapProfilerServerStatusSection heapProfilerServerStatusSection;
|
||||
|
||||
MONGO_INITIALIZER_GENERAL(StartHeapProfiling, ("EndStartupOptionHandling"), ("default"))
|
||||
(InitializerContext* context) {
|
||||
(InitializerContext*) {
|
||||
if (HeapProfilingEnabled)
|
||||
HeapProfiler::heapProfiler = new HeapProfiler();
|
||||
HeapProfiler::start();
|
||||
}
|
||||
|
||||
#endif // MONGO_HAVE_HEAP_PROFILER
|
||||
|
||||
} // namespace
|
||||
} // namespace mongo
|
||||
|
||||
#endif // MONGO_HAVE_HEAP_PROFILER
|
||||
#endif //_POSIX_VERSION
|
||||
|
||||
@ -28,15 +28,17 @@
|
||||
*/
|
||||
|
||||
|
||||
#include "mongo/base/string_data_comparator.h"
|
||||
#ifdef _WIN32
|
||||
#define NVALGRIND
|
||||
#endif
|
||||
|
||||
#include <cstddef>
|
||||
#include <gperftools/malloc_extension.h>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
#include <valgrind/valgrind.h>
|
||||
|
||||
#include <boost/optional/optional.hpp>
|
||||
|
||||
#include "mongo/base/error_codes.h"
|
||||
@ -51,6 +53,14 @@
|
||||
#include "mongo/db/tenant_id.h"
|
||||
#include "mongo/util/tcmalloc_parameters_gen.h"
|
||||
|
||||
#ifdef MONGO_HAVE_GOOGLE_TCMALLOC
|
||||
#include <tcmalloc/malloc_extension.h>
|
||||
auto static tcmallocProperties = tcmalloc::MallocExtension::GetProperties();
|
||||
#elif defined(MONGO_HAVE_GPERF_TCMALLOC)
|
||||
#include <gperftools/malloc_extension.h>
|
||||
auto static mallocExtensionAPI = MallocExtension::instance();
|
||||
#endif
|
||||
|
||||
#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kDefault
|
||||
|
||||
|
||||
@ -93,74 +103,89 @@ public:
|
||||
|
||||
BSONObjBuilder builder;
|
||||
|
||||
// Looks up a numeric allocator property by name. Returns boost::none when the allocator
// does not report the property (or when no tcmalloc variant is compiled in).
auto getValueIfExists = [&](StringData property) -> boost::optional<size_t> {
#ifdef MONGO_HAVE_GOOGLE_TCMALLOC
    // Ask the allocator on every call. The file-level tcmallocProperties map is filled
    // once during static initialization, so reading from it would report startup values
    // for the life of the process.
    if (auto value = tcmalloc::MallocExtension::GetNumericProperty(
            absl::string_view{property.rawData(), property.size()});
        value) {
        return {*value};
    }
#elif defined(MONGO_HAVE_GPERF_TCMALLOC)
    size_t value;
    if (mallocExtensionAPI->GetNumericProperty(property.rawData(), &value)) {
        return {value};
    }
#endif
    return boost::none;
};
|
||||
|
||||
auto tryAppend = [&](BSONObjBuilder& builder, StringData bsonName, StringData property) {
|
||||
if (auto value = getValueIfExists(property); !!value) {
|
||||
builder.appendNumber(bsonName, static_cast<long long>(*value));
|
||||
}
|
||||
};
|
||||
|
||||
// Appends the property named "<topic>.<base>" to `builder` under the field name `base`.
auto tryStat = [&](BSONObjBuilder& builder, StringData topic, StringData base) {
    const std::string qualifiedName = fmt::format("{}.{}", topic, base);
    tryAppend(builder, base, qualifiedName);
};
|
||||
|
||||
// For a list of properties see the "Generic Tcmalloc Status" section of
|
||||
// http://google-perftools.googlecode.com/svn/trunk/doc/tcmalloc.html and
|
||||
// http://code.google.com/p/gperftools/source/browse/src/gperftools/malloc_extension.h
|
||||
{
|
||||
BSONObjBuilder sub(builder.subobjStart("generic"));
|
||||
appendNumericPropertyIfAvailable(
|
||||
sub, "current_allocated_bytes", "generic.current_allocated_bytes");
|
||||
appendNumericPropertyIfAvailable(sub, "heap_size", "generic.heap_size");
|
||||
tryStat(sub, "generic", "current_allocated_bytes");
|
||||
tryStat(sub, "generic", "heap_size");
|
||||
}
|
||||
{
|
||||
BSONObjBuilder sub(builder.subobjStart("tcmalloc"));
|
||||
auto tryTc = [&](StringData key) {
|
||||
tryStat(sub, "tcmalloc", key);
|
||||
};
|
||||
|
||||
appendNumericPropertyIfAvailable(
|
||||
sub, "pageheap_free_bytes", "tcmalloc.pageheap_free_bytes");
|
||||
appendNumericPropertyIfAvailable(
|
||||
sub, "pageheap_unmapped_bytes", "tcmalloc.pageheap_unmapped_bytes");
|
||||
appendNumericPropertyIfAvailable(
|
||||
sub, "max_total_thread_cache_bytes", "tcmalloc.max_total_thread_cache_bytes");
|
||||
appendNumericPropertyIfAvailable(sub,
|
||||
"current_total_thread_cache_bytes",
|
||||
"tcmalloc.current_total_thread_cache_bytes");
|
||||
// Not including tcmalloc.slack_bytes since it is deprecated.
|
||||
tryTc("pageheap_free_bytes");
|
||||
tryTc("pageheap_unmapped_bytes");
|
||||
tryTc("max_total_thread_cache_bytes");
|
||||
tryTc("current_total_thread_cache_bytes");
|
||||
|
||||
// Calculate total free bytes, *excluding the page heap*
|
||||
size_t central;
|
||||
size_t transfer;
|
||||
size_t thread;
|
||||
if (MallocExtension::instance()->GetNumericProperty("tcmalloc.central_cache_free_bytes",
|
||||
¢ral) &&
|
||||
MallocExtension::instance()->GetNumericProperty(
|
||||
"tcmalloc.transfer_cache_free_bytes", &transfer) &&
|
||||
MallocExtension::instance()->GetNumericProperty("tcmalloc.thread_cache_free_bytes",
|
||||
&thread)) {
|
||||
sub.appendNumber("total_free_bytes",
|
||||
static_cast<long long>(central) +
|
||||
static_cast<long long>(transfer) +
|
||||
static_cast<long long>(thread));
|
||||
{
|
||||
long long total = 0;
|
||||
if (auto central = getValueIfExists("tcmalloc.central_cache_free"); !!central) {
|
||||
sub.appendNumber("central_cache_free_bytes", static_cast<long long>(*central));
|
||||
total += *central;
|
||||
}
|
||||
if (auto transfer = getValueIfExists("tcmalloc.transfer_cache_free"); !!transfer) {
|
||||
sub.appendNumber("transfer_cache_free_bytes",
|
||||
static_cast<long long>(*transfer));
|
||||
total += *transfer;
|
||||
}
|
||||
if (auto thread = getValueIfExists("tcmalloc.thread_cache_free"); !!thread) {
|
||||
sub.appendNumber("thread_cache_free_bytes", static_cast<long long>(*thread));
|
||||
total += *thread;
|
||||
}
|
||||
if (auto cpu = getValueIfExists("tcmalloc.cpu_free"); !!cpu) {
|
||||
sub.appendNumber("cpu_cache_free_bytes", static_cast<long long>(*cpu));
|
||||
total += *cpu;
|
||||
}
|
||||
sub.appendNumber("total_free_bytes", total);
|
||||
}
|
||||
appendNumericPropertyIfAvailable(
|
||||
sub, "central_cache_free_bytes", "tcmalloc.central_cache_free_bytes");
|
||||
appendNumericPropertyIfAvailable(
|
||||
sub, "transfer_cache_free_bytes", "tcmalloc.transfer_cache_free_bytes");
|
||||
appendNumericPropertyIfAvailable(
|
||||
sub, "thread_cache_free_bytes", "tcmalloc.thread_cache_free_bytes");
|
||||
appendNumericPropertyIfAvailable(
|
||||
sub, "aggressive_memory_decommit", "tcmalloc.aggressive_memory_decommit");
|
||||
|
||||
appendNumericPropertyIfAvailable(
|
||||
sub, "pageheap_committed_bytes", "tcmalloc.pageheap_committed_bytes");
|
||||
appendNumericPropertyIfAvailable(
|
||||
sub, "pageheap_scavenge_count", "tcmalloc.pageheap_scavenge_count");
|
||||
appendNumericPropertyIfAvailable(
|
||||
sub, "pageheap_commit_count", "tcmalloc.pageheap_commit_count");
|
||||
appendNumericPropertyIfAvailable(
|
||||
sub, "pageheap_total_commit_bytes", "tcmalloc.pageheap_total_commit_bytes");
|
||||
appendNumericPropertyIfAvailable(
|
||||
sub, "pageheap_decommit_count", "tcmalloc.pageheap_decommit_count");
|
||||
appendNumericPropertyIfAvailable(
|
||||
sub, "pageheap_total_decommit_bytes", "tcmalloc.pageheap_total_decommit_bytes");
|
||||
appendNumericPropertyIfAvailable(
|
||||
sub, "pageheap_reserve_count", "tcmalloc.pageheap_reserve_count");
|
||||
appendNumericPropertyIfAvailable(
|
||||
sub, "pageheap_total_reserve_bytes", "tcmalloc.pageheap_total_reserve_bytes");
|
||||
appendNumericPropertyIfAvailable(
|
||||
sub, "spinlock_total_delay_ns", "tcmalloc.spinlock_total_delay_ns");
|
||||
tryTc("aggressive_memory_decommit");
|
||||
|
||||
auto tcmallocReleaseRate = MallocExtension::instance()->GetMemoryReleaseRate();
|
||||
sub.appendNumber("release_rate", tcmallocReleaseRate);
|
||||
tryTc("pageheap_committed_bytes");
|
||||
tryTc("pageheap_scavenge_count");
|
||||
tryTc("pageheap_commit_count");
|
||||
tryTc("pageheap_total_commit_bytes");
|
||||
tryTc("pageheap_decommit_count");
|
||||
tryTc("pageheap_total_decommit_bytes");
|
||||
tryTc("pageheap_reserve_count");
|
||||
tryTc("pageheap_total_reserve_bytes");
|
||||
tryTc("spinlock_total_delay_ns");
|
||||
|
||||
#ifdef MONGO_HAVE_GOOGLE_TCMALLOC
|
||||
sub.appendNumber(
|
||||
"release_rate",
|
||||
static_cast<long long>(tcmalloc::MallocExtension::GetBackgroundReleaseRate()));
|
||||
#endif
|
||||
|
||||
#if MONGO_HAVE_GPERFTOOLS_SIZE_CLASS_STATS
|
||||
if (verbosity >= 2) {
|
||||
@ -170,31 +195,25 @@ public:
|
||||
|
||||
// Size classes and page heap info is dumped in 1 call so that the performance
|
||||
// sensitive tcmalloc page heap lock is only taken once
|
||||
MallocExtension::instance()->SizeClasses(
|
||||
&builders, appendSizeClassInfo, appendPageHeapInfo);
|
||||
mallocExtensionAPI->SizeClasses(&builders, appendSizeClassInfo, appendPageHeapInfo);
|
||||
|
||||
builders.first.done();
|
||||
builder.append("page_heap", builders.second.arr());
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef MONGO_HAVE_GOOGLE_TCMALLOC
|
||||
builder.append("formattedString", tcmalloc::MallocExtension::GetStats());
|
||||
#elif defined(MONGO_HAVE_GPERF_TCMALLOC)
|
||||
char buffer[4096];
|
||||
MallocExtension::instance()->GetStats(buffer, sizeof buffer);
|
||||
mallocExtensionAPI->GetStats(buffer, sizeof buffer);
|
||||
builder.append("formattedString", buffer);
|
||||
#endif
|
||||
}
|
||||
|
||||
return builder.obj();
|
||||
}
|
||||
|
||||
private:
|
||||
static void appendNumericPropertyIfAvailable(BSONObjBuilder& builder,
|
||||
StringData bsonName,
|
||||
const char* property) {
|
||||
size_t value;
|
||||
if (MallocExtension::instance()->GetNumericProperty(property, &value))
|
||||
builder.appendNumber(bsonName, static_cast<long long>(value));
|
||||
}
|
||||
|
||||
#if MONGO_HAVE_GPERFTOOLS_SIZE_CLASS_STATS
|
||||
static void appendSizeClassInfo(void* bsonarr_builder, const base::MallocSizeClass* stats) {
|
||||
BSONArrayBuilder& builder =
|
||||
|
||||
@ -33,7 +33,6 @@
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdlib>
|
||||
#include <gperftools/malloc_extension.h>
|
||||
#include <limits>
|
||||
#include <string>
|
||||
#include <valgrind/valgrind.h>
|
||||
@ -58,11 +57,23 @@
|
||||
#include "mongo/util/str.h"
|
||||
#include "mongo/util/tcmalloc_parameters_gen.h"
|
||||
|
||||
#ifdef MONGO_HAVE_GOOGLE_TCMALLOC
|
||||
#include <tcmalloc/malloc_extension.h>
|
||||
#elif defined(MONGO_HAVE_GPERF_TCMALLOC)
|
||||
#include <gperftools/malloc_extension.h>
|
||||
#endif
|
||||
|
||||
namespace mongo {
|
||||
namespace {
|
||||
|
||||
// Returns an absl::string_view over the same characters as `s` (no copy is made).
constexpr absl::string_view toStringView(StringData s) {
    return absl::string_view(s.rawData(), s.size());
}
|
||||
|
||||
constexpr auto kMaxTotalThreadCacheBytesPropertyName = "tcmalloc.max_total_thread_cache_bytes"_sd;
|
||||
constexpr auto kAggressiveMemoryDecommitPropertyName = "tcmalloc.aggressive_memory_decommit"_sd;
|
||||
|
||||
#if defined(MONGO_HAVE_GPERF_TCMALLOC)
|
||||
StatusWith<size_t> getProperty(StringData propname) {
|
||||
size_t value;
|
||||
if (!MallocExtension::instance()->GetNumericProperty(propname.toString().c_str(), &value)) {
|
||||
@ -81,6 +92,66 @@ Status setProperty(StringData propname, size_t value) {
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
#endif
|
||||
|
||||
// Sets the allocator's maximum total thread cache size to `cacheSize` bytes.
// Google tcmalloc builds call the MallocExtension setter directly; gperftools builds set
// the "tcmalloc.max_total_thread_cache_bytes" property and uassert on failure. When
// neither macro is defined the function body is empty.
void setMaxTotalThreadCacheBytes(size_t cacheSize) {
|
||||
#if defined(MONGO_HAVE_GOOGLE_TCMALLOC)
|
||||
tcmalloc::MallocExtension::SetMaxTotalThreadCacheBytes(cacheSize);
|
||||
#elif defined(MONGO_HAVE_GPERF_TCMALLOC)
|
||||
uassertStatusOK(setProperty(kMaxTotalThreadCacheBytesPropertyName, cacheSize));
|
||||
#endif // MONGO_HAVE_GPERF_TCMALLOC
|
||||
}
|
||||
|
||||
|
||||
#ifdef MONGO_HAVE_GOOGLE_TCMALLOC
|
||||
// Implement abstraction for the differences between gperftools and new tcmalloc.
|
||||
// Reads the numeric tcmalloc property `key` into `*val`.
// Returns false, leaving `*val` untouched, when the allocator does not report the property.
bool getNumericProperty(absl::string_view key, size_t* val) {
    if (const auto found = tcmalloc::MallocExtension::GetNumericProperty(key)) {
        *val = *found;
        return true;
    }
    return false;
}
|
||||
|
||||
// Returns the value of the tcmalloc property `propname`, or an InternalError status when
// the property cannot be read.
StatusWith<size_t> getProperty(StringData propname) {
    size_t propValue;
    if (getNumericProperty(propname.toString().c_str(), &propValue)) {
        return propValue;
    }
    return {ErrorCodes::InternalError,
            str::stream() << "Failed to retreive tcmalloc prop: " << propname};
}
|
||||
|
||||
// Sets the tcmalloc property `key` to `val`. Only the max-total-thread-cache-bytes
// property is handled here; any other key is rejected with false.
bool setNumericProperty(absl::string_view key, size_t val) {
    const bool isThreadCacheLimit = key == toStringView(kMaxTotalThreadCacheBytesPropertyName);
    if (isThreadCacheLimit) {
        setMaxTotalThreadCacheBytes(val);
    }
    return isThreadCacheLimit;
}
|
||||
|
||||
// Sets the tcmalloc property `propname` to `value`.
// Under valgrind the property is not touched and OK is returned. Otherwise returns an
// InternalError status when the property is rejected.
Status setProperty(StringData propname, size_t value) {
    if (RUNNING_ON_VALGRIND) {  // NOLINT
        return Status::OK();
    }
    if (setNumericProperty(propname.toString().c_str(), value)) {
        return Status::OK();
    }
    return {ErrorCodes::InternalError,
            str::stream() << "Failed to set internal tcmalloc property " << propname};
}
|
||||
|
||||
long long getMemoryReleaseRate() {
|
||||
return static_cast<size_t>(tcmalloc::MallocExtension::GetBackgroundReleaseRate());
|
||||
}
|
||||
|
||||
bool setMemoryReleaseRate(size_t val) {
|
||||
tcmalloc::MallocExtension::SetBackgroundReleaseRate(
|
||||
tcmalloc::MallocExtension::BytesPerSecond{val});
|
||||
return true;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
StatusWith<size_t> validateTCMallocValue(StringData name, const BSONElement& newValueElement) {
|
||||
if (!newValueElement.isNumber()) {
|
||||
@ -152,7 +223,20 @@ MONGO_INITIALIZER_GENERAL(TcmallocConfigurationDefaults, (), ("BeginStartupOptio
|
||||
(systemMemorySizeMB / 8) * 1024 * 1024; // 1/8 of system memory in bytes
|
||||
size_t cacheSize = std::min(defaultTcMallocCacheSize, derivedTcMallocCacheSize);
|
||||
|
||||
uassertStatusOK(setProperty(kMaxTotalThreadCacheBytesPropertyName, cacheSize));
|
||||
setMaxTotalThreadCacheBytes(cacheSize);
|
||||
|
||||
#if defined(MONGO_HAVE_GOOGLE_TCMALLOC)
|
||||
size_t numCores = pi.getNumAvailableCores();
|
||||
// 1024MB in bytes spread across cores.
|
||||
size_t defaultTcMallocPerCPUCacheSize = (1024 * 1024 * 1024) / numCores;
|
||||
size_t derivedTcMallocPerCPUCacheSize =
|
||||
((systemMemorySizeMB / 8) * 2 * 1024 * 1024) / numCores; // 1/4 of system memory in bytes
|
||||
|
||||
size_t perCPUCacheSize =
|
||||
std::min(defaultTcMallocPerCPUCacheSize, derivedTcMallocPerCPUCacheSize);
|
||||
|
||||
tcmalloc::MallocExtension::SetMaxPerCpuCacheSize(perCPUCacheSize);
|
||||
#endif // MONGO_HAVE_GOOGLE_TCMALLOC
|
||||
}
|
||||
|
||||
} // namespace
|
||||
@ -162,7 +246,11 @@ void TCMallocReleaseRateServerParameter::append(OperationContext*,
|
||||
BSONObjBuilder* builder,
|
||||
StringData fieldName,
|
||||
const boost::optional<TenantId>&) {
|
||||
#if defined(MONGO_HAVE_GOOGLE_TCMALLOC)
|
||||
auto value = getMemoryReleaseRate();
|
||||
#elif defined(MONGO_HAVE_GPERF_TCMALLOC)
|
||||
auto value = MallocExtension::instance()->GetMemoryReleaseRate();
|
||||
#endif
|
||||
builder->append(fieldName, value);
|
||||
}
|
||||
|
||||
@ -178,8 +266,11 @@ Status TCMallocReleaseRateServerParameter::setFromString(StringData tcmalloc_rel
|
||||
str::stream() << "tcmallocReleaseRate cannot be negative: "
|
||||
<< tcmalloc_release_rate};
|
||||
}
|
||||
|
||||
#if defined(MONGO_HAVE_GOOGLE_TCMALLOC)
|
||||
setMemoryReleaseRate(value);
|
||||
#elif defined(MONGO_HAVE_GPERF_TCMALLOC)
|
||||
MallocExtension::instance()->SetMemoryReleaseRate(value);
|
||||
#endif
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
|
||||
52
src/third_party/SConscript
vendored
@ -83,10 +83,15 @@ def injectMozJS(thisEnv):
|
||||
|
||||
env.AddMethod(injectMozJS, 'InjectMozJS')
|
||||
|
||||
if not use_system_version_of_library('tcmalloc'):
|
||||
if not use_system_version_of_library('tcmalloc-gperf'):
|
||||
# GPerftools does this slightly differently than the others.
|
||||
thirdPartyEnvironmentModifications['gperftools'] = {}
|
||||
|
||||
if not use_system_version_of_library('tcmalloc-google'):
|
||||
thirdPartyEnvironmentModifications['tcmalloc'] = {
|
||||
'CPPPATH': ['#/src/third_party/tcmalloc/dist'],
|
||||
}
|
||||
|
||||
if not use_system_version_of_library('pcre2'):
|
||||
thirdPartyEnvironmentModifications['pcre2'] = {
|
||||
'CPPPATH': ['#/src/third_party/pcre2/src'],
|
||||
@ -422,6 +427,12 @@ boostEnv.ShimLibrary(name="boost")
|
||||
|
||||
abseilDirectory = 'abseil-cpp'
|
||||
abseilEnv = env.Clone()
|
||||
|
||||
# We can't depend on the allocator if we are using tcmalloc as it depends
|
||||
# on us (abseil-cpp)
|
||||
if abseilEnv['MONGO_ALLOCATOR'] in ['tcmalloc-google']:
|
||||
abseilEnv = abseilEnv.Clone(LIBDEPS_NO_INHERIT=['$BUILD_DIR/third_party/shim_allocator'])
|
||||
|
||||
abseilEnv.InjectThirdParty(libraries=['abseil-cpp'])
|
||||
abseilEnv.SConscript(abseilDirectory + '/SConscript', exports={'env': abseilEnv})
|
||||
abseilEnv = abseilEnv.Clone(LIBDEPS_INTERFACE=[
|
||||
@ -510,17 +521,34 @@ if "tom" in env["MONGO_CRYPTO"]:
|
||||
|
||||
tomcryptEnv.ShimLibrary(name="tomcrypt", )
|
||||
|
||||
gperftoolsEnv = env.Clone(LIBDEPS_NO_INHERIT=[
|
||||
# tcmallocEnv implements this shim, so it rejects the implicit dependency.
|
||||
tcmallocEnv = env.Clone(LIBDEPS_NO_INHERIT=[
|
||||
'$BUILD_DIR/third_party/shim_allocator',
|
||||
], )
|
||||
if gperftoolsEnv['MONGO_ALLOCATOR'] in ["tcmalloc", "tcmalloc-experimental"]:
|
||||
if use_system_version_of_library("tcmalloc"):
|
||||
gperftoolsEnv = gperftoolsEnv.Clone(SYSLIBDEPS=[
|
||||
if tcmallocEnv['MONGO_ALLOCATOR'] in ["tcmalloc-google"]:
|
||||
if use_system_version_of_library("tcmalloc-google"):
|
||||
tcmallocEnv = tcmallocEnv.Clone(SYSLIBDEPS=[
|
||||
env['LIBDEPS_TCMALLOC_SYSLIBDEP'],
|
||||
])
|
||||
else:
|
||||
gperftoolsEnv = gperftoolsEnv.Clone()
|
||||
gperftoolsEnv.InjectThirdParty(libraries=['gperftools'])
|
||||
tcmallocDirectory = 'tcmalloc'
|
||||
tcmallocEnv = tcmallocEnv.Clone()
|
||||
tcmallocEnv.InjectThirdParty(libraries=['tcmalloc'])
|
||||
tcmallocEnv.SConscript(
|
||||
tcmallocDirectory + '/SConscript',
|
||||
exports={'env': tcmallocEnv},
|
||||
)
|
||||
tcmallocEnv = tcmallocEnv.Clone(LIBDEPS_INTERFACE=[
|
||||
'tcmalloc/tcmalloc',
|
||||
])
|
||||
elif tcmallocEnv['MONGO_ALLOCATOR'] in ["tcmalloc-gperf"]:
|
||||
if use_system_version_of_library("tcmalloc-gperf"):
|
||||
tcmallocEnv = tcmallocEnv.Clone(SYSLIBDEPS=[
|
||||
env['LIBDEPS_TCMALLOC_SYSLIBDEP'],
|
||||
])
|
||||
else:
|
||||
tcmallocEnv = tcmallocEnv.Clone()
|
||||
tcmallocEnv.InjectThirdParty(libraries=['gperftools'])
|
||||
|
||||
# Allow gperftools to determine its own consumer-side include/ dirs.
|
||||
# Needed because those are in a platform-specific subdirectory.
|
||||
@ -528,16 +556,16 @@ if gperftoolsEnv['MONGO_ALLOCATOR'] in ["tcmalloc", "tcmalloc-experimental"]:
|
||||
for k, v in kwargs.items():
|
||||
thirdPartyEnvironmentModifications['gperftools'][k] = v
|
||||
|
||||
gperftoolsEnv.AddMethod(registerConsumerModifications, 'RegisterConsumerModifications')
|
||||
gperftoolsEnv.SConscript(
|
||||
tcmallocEnv.AddMethod(registerConsumerModifications, 'RegisterConsumerModifications')
|
||||
tcmallocEnv.SConscript(
|
||||
'gperftools' + '/SConscript',
|
||||
exports={'env': gperftoolsEnv},
|
||||
exports={'env': tcmallocEnv},
|
||||
)
|
||||
gperftoolsEnv = gperftoolsEnv.Clone(LIBDEPS_INTERFACE=[
|
||||
tcmallocEnv = tcmallocEnv.Clone(LIBDEPS_INTERFACE=[
|
||||
'gperftools/tcmalloc_minimal',
|
||||
])
|
||||
|
||||
gperftoolsEnv.ShimLibrary(
|
||||
tcmallocEnv.ShimLibrary(
|
||||
name="allocator",
|
||||
LIBDEPS_TAGS=[
|
||||
# TODO: Remove when SERVER-48291 is merged into stable build tools.
|
||||
|
||||
21
src/third_party/abseil-cpp/SConscript
vendored
@ -10,27 +10,6 @@ if env.ToolchainIs('msvc'):
|
||||
CCFLAGS=[],
|
||||
)
|
||||
|
||||
if env.GetOption('sanitize') and 'undefined' in env.GetOption('sanitize').split(','):
|
||||
# UBSAN causes the __muloti4 reference to be in the library. This is not defined in libgcc, so
|
||||
# we will just opt out of this check in this third party library. Related issues below:
|
||||
#
|
||||
# abseil issue showing the commit it was introduced
|
||||
# https://github.com/abseil/abseil-cpp/issues/841
|
||||
#
|
||||
# GCC bug saying the symbol is missing
|
||||
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103034
|
||||
#
|
||||
# LLVM bug saying the symbol requires extra linkage
|
||||
# https://bugs.llvm.org/show_bug.cgi?id=16404
|
||||
env.Append(
|
||||
CCFLAGS=[
|
||||
'-fno-sanitize=signed-integer-overflow',
|
||||
],
|
||||
LINKFLAGS=[
|
||||
'-fno-sanitize=signed-integer-overflow',
|
||||
],
|
||||
)
|
||||
|
||||
if env.ToolchainIs('gcc'):
|
||||
env.Append(CCFLAGS=[
|
||||
'-Wno-error=ignored-attributes',
|
||||
|
||||
@ -335,11 +335,7 @@ static_assert(ABSL_INTERNAL_INLINE_NAMESPACE_STR[0] != 'h' ||
|
||||
#if (defined(__clang__) && !defined(_WIN32)) || \
|
||||
(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ >= 9) || \
|
||||
(defined(__GNUC__) && !defined(__clang__) && !defined(__CUDACC__))
|
||||
#if !ABSL_HAVE_FEATURE(address_sanitizer) && !ABSL_HAVE_FEATURE(memory_sanitizer) && \
|
||||
!ABSL_HAVE_FEATURE(thread_sanitizer) && !ABSL_HAVE_FEATURE(undefined_behavior_sanitizer)
|
||||
#define ABSL_HAVE_INTRINSIC_INT128 1
|
||||
#endif // !ABSL_HAVE_FEATURE(address_sanitizer) && !ABSL_HAVE_FEATURE(memory_sanitizer) &&
|
||||
// !ABSL_HAVE_FEATURE(thread_sanitizer) && !ABSL_HAVE_FEATURE(undefined_behavior_sanitizer)
|
||||
#elif defined(__CUDACC__)
|
||||
// __CUDACC_VER__ is a full version number before CUDA 9, and is defined to a
|
||||
// string explaining that it has been removed starting with CUDA 9. We use
|
||||
|
||||
2
src/third_party/abseil-cpp/scripts/import.sh
vendored
@ -8,7 +8,7 @@ IFS=$'\n\t'
|
||||
set -vx
|
||||
|
||||
NAME=abseil-cpp
|
||||
REVISION="20230802.1-mongo-20240205"
|
||||
REVISION="20230802.1-SERVER-85737"
|
||||
VERSION="20230802.1"
|
||||
|
||||
DEST_DIR=$(git rev-parse --show-toplevel)/src/third_party/abseil-cpp
|
||||
|
||||
@ -96,27 +96,6 @@ if env.ToolchainIs('msvc'):
|
||||
CCFLAGS=[],
|
||||
)
|
||||
|
||||
if env.GetOption('sanitize') and 'undefined' in env.GetOption('sanitize').split(','):
|
||||
# UBSAN causes the __muloti4 reference to be in the library. This is not defined in libgcc, so
|
||||
# we will just opt out of this check in this third party library. Related issues below:
|
||||
#
|
||||
# abseil issue showing the commit it was introduced
|
||||
# https://github.com/abseil/abseil-cpp/issues/841
|
||||
#
|
||||
# GCC bug saying the symbol is missing
|
||||
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103034
|
||||
#
|
||||
# LLVM bug saying the symbol requires extra linkage
|
||||
# https://bugs.llvm.org/show_bug.cgi?id=16404
|
||||
env.Append(
|
||||
CCFLAGS=[
|
||||
'-fno-sanitize=signed-integer-overflow',
|
||||
],
|
||||
LINKFLAGS=[
|
||||
'-fno-sanitize=signed-integer-overflow',
|
||||
],
|
||||
)
|
||||
|
||||
if env.ToolchainIs('gcc'):
|
||||
env.Append(
|
||||
CCFLAGS=[
|
||||
|
||||
10
src/third_party/grpc/SConscript
vendored
@ -152,13 +152,6 @@ upb_env.Library(
|
||||
],
|
||||
)
|
||||
|
||||
upb_generated_protobuf_descriptor_obj = upb_env.LibraryObject(
|
||||
target="upb_generated_protobuf_descriptor",
|
||||
source=[
|
||||
"dist/src/core/ext/upb-generated/google/protobuf/descriptor.upb.c",
|
||||
],
|
||||
)[0]
|
||||
|
||||
upb_env.Library(
|
||||
target="upb_wire",
|
||||
source=[
|
||||
@ -230,7 +223,7 @@ upb_env.Library(
|
||||
"dist/third_party/upb/upb/reflection/method_def.c",
|
||||
"dist/third_party/upb/upb/reflection/oneof_def.c",
|
||||
"dist/third_party/upb/upb/reflection/service_def.c",
|
||||
upb_generated_protobuf_descriptor_obj,
|
||||
"dist/src/core/ext/upb-generated/google/protobuf/descriptor.upb.c",
|
||||
],
|
||||
LIBDEPS=[
|
||||
"upb_collections",
|
||||
@ -1231,7 +1224,6 @@ grpc_env.Library(
|
||||
"dist/src/core/tsi/ssl_transport_security_utils.cc",
|
||||
"dist/src/core/tsi/transport_security.cc",
|
||||
"dist/src/core/tsi/transport_security_grpc.cc",
|
||||
upb_generated_protobuf_descriptor_obj,
|
||||
],
|
||||
OBJPREFIX=env.get('OBJPREFIX', '') + 'grpc_',
|
||||
LIBDEPS=[
|
||||
|
||||
173
src/third_party/tcmalloc/SConscript
vendored
Normal file
@ -0,0 +1,173 @@
|
||||
# Project: com_google_tcmalloc
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
|
||||
import SCons
|
||||
|
||||
Import("env")
|
||||
Import("has_option")
|
||||
|
||||
env = env.Clone(
|
||||
# Building with hidden visibility interferes with intercepting the
|
||||
# libc allocation functions.
|
||||
DISALLOW_VISHIDDEN=True,
|
||||
NINJA_GENSOURCE_INDEPENDENT=True,
|
||||
)
|
||||
|
||||
# Progress messages are printed only for verbose builds; otherwise the printer is a no-op.
if not env.Verbose():

    def tcmalloc_scons_print(msg, *args, **kwargs):
        """Discard the message (non-verbose build)."""

else:

    def tcmalloc_scons_print(msg, *args, **kwargs):
        """Print `msg` with the tcmalloc-to-scons tag, forwarding extra arguments to print()."""
        print("[TCMALLOC_TO_SCONS]: " + msg, *args, **kwargs)
|
||||
|
||||
|
||||
# manually switch this for all the debugging
|
||||
tcmalloc_extra_debug = False
|
||||
|
||||
# Debug messages are printed only when tcmalloc_extra_debug is switched on by hand.
if not tcmalloc_extra_debug:

    def tcmalloc_scons_debug(msg, *args, **kwargs):
        """Discard the message (extra debugging is off)."""

else:

    def tcmalloc_scons_debug(msg, *args, **kwargs):
        """Print `msg` with the debug tag, forwarding extra arguments to print()."""
        print("[TCMALLOC_TO_SCONS][DEBUG]: " + msg, *args, **kwargs)
|
||||
|
||||
|
||||
_bazelToSconsMap = dict(
|
||||
(f'@com_google_absl//absl/{k}', [f'$BUILD_DIR/third_party/abseil-cpp/absl_{ve}' for ve in v])
|
||||
for k, v in {
|
||||
'algorithm:container': [],
|
||||
'base:config': [],
|
||||
'base:core_headers': [],
|
||||
'base:dynamic_annotations': [],
|
||||
'container:btree': [],
|
||||
'container:fixed_array': [],
|
||||
'container:flat_hash_map': ['raw_hash_set'],
|
||||
'debugging:leak_check': [],
|
||||
'debugging:stacktrace': ['stacktrace'],
|
||||
'debugging:symbolize': [],
|
||||
'functional:function_ref': [],
|
||||
'base:malloc_internal': ['malloc_internal'],
|
||||
'memory': [],
|
||||
'numeric:bits': [],
|
||||
'numeric:int128': [],
|
||||
'strings:str_format': [],
|
||||
'types:optional': [],
|
||||
'types:span': [],
|
||||
}.items())
|
||||
|
||||
sys.path.append(env.Dir('scripts/site-scons').srcnode().abspath)
|
||||
from bazel_to_scons import BazelEnv, Label
|
||||
|
||||
|
||||
def dumpBazelLibs(baz, target):
    """Write the parsed Bazel library table to `target` as JSON, for debugging.

    Does nothing unless `tcmalloc_extra_debug` is switched on.

    Args:
        baz: library table to dump; it is handed to the json module as-is.
        target: file node whose `abspath` names the file to (over)write.
    """
    if not tcmalloc_extra_debug:
        return

    tcmalloc_scons_debug(f"Dumping tcmalloc deps to: '{target}'", file=sys.stderr)
    with open(target.abspath, 'w') as dump:
        # Serialize straight into the file. Routing the text through
        # tcmalloc_scons_debug() would prepend its log tag and leave the
        # dump unparseable as JSON.
        json.dump({'libraries': baz}, dump, sort_keys=True, indent=4)
        dump.write('\n')
|
||||
|
||||
|
||||
def _remapAbseilDep(label: Label) -> 'list[str]':
    """Translate an abseil Bazel label into the SCons library names that provide it.

    Labels listed in `_bazelToSconsMap` use that explicit mapping (which may be
    an empty list). Any other label is turned into a single name built from its
    package path and target.

    Args:
        label: parsed Bazel label of the abseil dependency.

    Returns:
        List of `$BUILD_DIR/third_party/abseil-cpp/...` library names.
    """
    tcmalloc_scons_print(f'Remap abseilDep {label}', file=sys.stderr)
    key = str(label)
    if key in _bazelToSconsMap:
        out = _bazelToSconsMap[key]
        tcmalloc_scons_print(f'Remap {label} to {out}', file=sys.stderr)
        return out

    package = label.package()
    tgt = label.target()
    # bazel expands //foo/bar => //foo/bar:bar implicitly. Use short form.
    # The comparison has to look at the package path *before* its '/' are
    # rewritten to '_'; afterwards a '/'-based suffix test can never match.
    if tgt and tgt != package.rsplit('/', 1)[-1]:
        suffix = '_' + tgt.replace('/', '_')
    else:
        suffix = ''
    pkg = package.replace('/', '_')
    return [f'$BUILD_DIR/third_party/abseil-cpp/{pkg}{suffix}']
|
||||
|
||||
|
||||
def findAbslLibs():
    """Collect the library targets declared in the abseil-cpp SConscript.

    Every line of that SConscript that starts (after stripping) with a
    `target='...',` assignment contributes one entry.

    Returns:
        Sorted list of `$BUILD_DIR/third_party/abseil-cpp/<target>` names.
    """
    sconscriptPath = env.File('$BUILD_DIR/third_party/abseil-cpp/SConscript').srcnode().abspath
    tcmalloc_scons_debug(f'abslSconscript={sconscriptPath}', file=sys.stderr)

    targetPattern = re.compile(r"\s*target=['\"](.*)['\"],")
    found = []
    with open(sconscriptPath) as sconscript:
        for rawLine in sconscript:
            line = rawLine.strip()
            hit = targetPattern.match(line)
            if hit is None:
                continue
            fq = f'$BUILD_DIR/third_party/abseil-cpp/{hit.group(1)}'
            tcmalloc_scons_debug(f"found {fq} in {line}", file=sys.stderr)
            found.append(fq)

    found.sort()
    return found
|
||||
|
||||
|
||||
def _mapDepToScons(lab: str, base: str = '') -> str:
|
||||
if re.match(r'^@com_google_absl//', lab):
|
||||
return _remapAbseilDep(Label(lab))
|
||||
lab = re.sub(r'^:', f'//{Label(base).package()}:', lab)
|
||||
lab = re.sub(r'^//', '', lab)
|
||||
lab = re.sub(r'(.*):(.*)', r'\1_\2', lab)
|
||||
lab = lab.replace("/", "_")
|
||||
return [lab]
|
||||
|
||||
|
||||
def slurpBlaze(target, source, exports, env):
    """Read TCMalloc's Bazel build description under dist/ and declare matching SCons libraries.

    Args:
        target: file node passed to dumpBazelLibs for the optional debug dump.
        source: not used by this function.
        exports: Bazel labels handed to BazelEnv.resolveDeps().
        env: SCons environment used to locate dist/ and to declare the libraries.

    Returns:
        Always 0.
    """
    distDir = env.Dir("dist").srcnode().abspath
    bazel = BazelEnv(env, distDir, debug=tcmalloc_scons_debug)
    bazel.run()
    bazel.pruneTestOnlyLibraries()
    bazel.eliminateHeadersFromSources()
    bazel.eliminateSourcelessDeps()
    libTable = bazel.libraries()
    dumpBazelLibs(libTable, target)
    resolved = bazel.resolveDeps(exports)

    # Collect the abseil labels among the entries marked 'unknown'; the result
    # is only reported through the debug logger.
    abslImports = {}
    for unknownName in sorted(name for name in resolved if 'unknown' in resolved[name]):
        unknownLabel = Label(unknownName)
        if unknownLabel.remote() == 'com_google_absl':
            abslImports[str(unknownLabel)] = _remapAbseilDep(unknownLabel)
    tcmalloc_scons_debug(f"{json.dumps({'abslImports': abslImports}, indent=4)}", file=sys.stderr)

    tcmalloc_scons_print('Final render into env.Library calls', file=sys.stderr)
    for libName in sorted(resolved.keys()):
        # Skip labels from other repositories, labels covered by the explicit
        # abseil table, and labels with no entry in the library table.
        isRemote = Label(libName).remote()
        if isRemote or libName in _bazelToSconsMap or libName not in libTable:
            continue

        libDef = libTable[libName]
        # SCons target name derived from the Bazel label.
        sconsTarget = _mapDepToScons(libName)[0]
        tcmalloc_scons_debug(f'libName: {libName:60s} => {sconsTarget}', file=sys.stderr)
        tcmalloc_scons_debug(f' {json.dumps(list(libDef), indent=4)}', file=sys.stderr)

        libArgs = {'target': sconsTarget}
        for srcName in libDef.get('srcs', []):
            srcPath = f'dist/{Label(libName).package()}/{srcName}'
            tcmalloc_scons_debug(f'srcs for lib={libName} -> src={srcPath}', file=sys.stderr)
            libArgs.setdefault('source', []).append(srcPath)

        for dep in libDef.get('deps', set()):
            sconsDeps = _mapDepToScons(dep, base=libName)
            tcmalloc_scons_debug(f'lib={libName}: dep={dep} => {sconsDeps}', file=sys.stderr)
            libArgs.setdefault('LIBDEPS', []).extend(sconsDeps)
        if 'LIBDEPS' in libArgs:
            # De-duplicate and order the dependency list.
            libArgs['LIBDEPS'] = sorted(set(libArgs['LIBDEPS']))

        for flag in libDef.get('copts', []):
            # Start from a copy of the environment's flags the first time a
            # per-library option shows up, then append to that copy.
            if 'CCFLAGS' not in libArgs:
                libArgs['CCFLAGS'] = list(env.get('CCFLAGS', []))
            libArgs['CCFLAGS'].append(flag)

        tcmalloc_scons_print(f'env.Library(**{json.dumps(libArgs, indent=4)})', file=sys.stderr)
        env.Library(**libArgs)

    return 0
|
||||
|
||||
|
||||
# Take a fresh copy of the environment, add the abseil-cpp third-party
# settings to it, and generate the TCMalloc library declarations from it.
env = env.Clone()
env.InjectThirdParty(libraries=['abseil-cpp'])

_depsDumpFile = env.File('tcmalloc_deps.json').srcnode()
_exportedLabels = ['//tcmalloc', '//tcmalloc:tcmalloc_extension']
slurpBlaze(target=_depsDumpFile, source=[], exports=_exportedLabels, env=env)
|
||||
74
src/third_party/tcmalloc/dist/CONTRIBUTING.md
vendored
Normal file
@ -0,0 +1,74 @@
|
||||
# How to Contribute to TCMalloc
|
||||
|
||||
We'd love to accept your patches and contributions to this project. There are
|
||||
just a few small guidelines you need to follow.
|
||||
|
||||
NOTE: If you are new to GitHub, please start by reading [Pull Request
|
||||
howto](https://help.github.com/articles/about-pull-requests/)
|
||||
|
||||
## Contributor License Agreement
|
||||
|
||||
Contributions to this project must be accompanied by a Contributor License
|
||||
Agreement. You (or your employer) retain the copyright to your contribution;
|
||||
this simply gives us permission to use and redistribute your contributions as
|
||||
part of the project. Head over to <https://cla.developers.google.com/> to see
|
||||
your current agreements on file or to sign a new one.
|
||||
|
||||
You generally only need to submit a CLA once, so if you've already submitted one
|
||||
(even if it was for a different project), you probably don't need to do it
|
||||
again.
|
||||
|
||||
## Guidelines for Pull Requests
|
||||
|
||||
* All submissions, including submissions by project members, require review.
|
||||
We use GitHub pull requests for this purpose. Consult
|
||||
[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
|
||||
information on using pull requests.
|
||||
|
||||
* If you are a Googler, it is preferable to first create an internal CL and
|
||||
have it reviewed and submitted. The code propagation process will deliver
|
||||
the change to GitHub.
|
||||
|
||||
* Create **small PRs** that are narrowly focused on **addressing a single concern**.
|
||||
When PRs try to fix several things at a time, if only one fix is considered
|
||||
acceptable, nothing gets merged and both the author's and the reviewer's time is wasted.
|
||||
Create more PRs to address different concerns and everyone will be happy.
|
||||
|
||||
* Provide a good **PR description** as a record of **what** change is being
|
||||
made and **why** it was made. Link to a GitHub issue if it exists.
|
||||
|
||||
* Don't fix code style and formatting unless you are already changing that line
|
||||
to address an issue. Formatting of modified lines may be done using
|
||||
`git clang-format`. PRs with irrelevant changes won't be merged. If you do
|
||||
want to fix formatting or style, do that in a separate PR.
|
||||
|
||||
* Unless your PR is trivial, you should expect there will be reviewer comments
|
||||
that you'll need to address before merging. We expect you to be reasonably
|
||||
responsive to those comments, otherwise the PR will be closed after 2-3 weeks
|
||||
of inactivity.
|
||||
|
||||
* Maintain **clean commit history** and use **meaningful commit messages**.
|
||||
PRs with messy commit history are difficult to review and won't be merged.
|
||||
Use `rebase -i upstream/master` to curate your commit history and/or to
|
||||
bring in latest changes from master (but avoid rebasing in the middle of a
|
||||
code review).
|
||||
|
||||
* Keep your PR up to date with upstream/master (if there are merge conflicts,
|
||||
we can't really merge your change).
|
||||
|
||||
* **All tests need to be passing** before your change can be merged. We
|
||||
recommend you **run tests locally** (see below)
|
||||
|
||||
* Exceptions to the rules can be made if there's a compelling reason for doing
|
||||
so. That is - the rules are here to serve us, not the other way around, and
|
||||
the rules need to be serving their intended purpose to be valuable.
|
||||
|
||||
## TCMalloc Committers
|
||||
|
||||
The current members of the TCMalloc engineering team are the only committers at
|
||||
present.
|
||||
|
||||
## Community Guidelines
|
||||
|
||||
This project follows
|
||||
[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
|
||||
202
src/third_party/tcmalloc/dist/LICENSE
vendored
Normal file
@ -0,0 +1,202 @@
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
https://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
https://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
45
src/third_party/tcmalloc/dist/README.md
vendored
Normal file
@ -0,0 +1,45 @@
|
||||
# TCMalloc
|
||||
|
||||
This repository contains the TCMalloc C++ code.
|
||||
|
||||
TCMalloc is Google's customized implementation of C's `malloc()` and C++'s
|
||||
`operator new` used for memory allocation within our C and C++ code. TCMalloc is
|
||||
a fast, multi-threaded malloc implementation.
|
||||
|
||||
## Building TCMalloc
|
||||
|
||||
[Bazel](https://bazel.build) is the official build system for TCMalloc.
|
||||
|
||||
The [TCMalloc Platforms Guide](docs/platforms.md) contains information on
|
||||
platform support for TCMalloc.
|
||||
|
||||
## Documentation
|
||||
|
||||
All users of TCMalloc should consult the following documentation resources:
|
||||
|
||||
* The [TCMalloc Quickstart](docs/quickstart.md) covers downloading,
|
||||
installing, building, and testing TCMalloc, including incorporating within
|
||||
your codebase.
|
||||
* The [TCMalloc Overview](docs/overview.md) covers the basic architecture of
|
||||
TCMalloc, and how that may affect configuration choices.
|
||||
* The [TCMalloc Reference](docs/reference.md) covers the C and C++ TCMalloc
|
||||
API endpoints.
|
||||
|
||||
More advanced users of TCMalloc may find the following documentation useful:
|
||||
|
||||
* The [TCMalloc Tuning Guide](docs/tuning.md) covers the configuration
|
||||
choices in more depth, and also illustrates other ways to customize
|
||||
TCMalloc. This also covers important operating system-level properties for
|
||||
improving TCMalloc performance.
|
||||
* The [TCMalloc Design Doc](docs/design.md) covers how TCMalloc works
|
||||
underneath the hood, and why certain design choices were made. Most
|
||||
developers will not need this level of implementation detail.
|
||||
* The [TCMalloc Compatibility Guide](docs/compatibility.md) documents
|
||||
our expectations for how our APIs are used.
|
||||
|
||||
## License
|
||||
|
||||
The TCMalloc library is licensed under the terms of the Apache license. See
|
||||
LICENSE for more information.
|
||||
|
||||
Disclaimer: This is not an officially supported Google product.
|
||||
111
src/third_party/tcmalloc/dist/WORKSPACE
vendored
Normal file
@ -0,0 +1,111 @@
|
||||
# Copyright 2019 The TCMalloc Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# https://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
workspace(name = "com_google_tcmalloc")
|
||||
|
||||
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
|
||||
|
||||
# Load a recent version of skylib in case our dependencies have obsolete
|
||||
# versions. This is needed for bazel 6 compatibility.
|
||||
http_archive(
|
||||
name = "bazel_skylib", # 2022-09-01
|
||||
urls = ["https://github.com/bazelbuild/bazel-skylib/archive/refs/tags/1.3.0.zip"],
|
||||
strip_prefix = "bazel-skylib-1.3.0",
|
||||
sha256 = "4756ab3ec46d94d99e5ed685d2d24aece484015e45af303eb3a11cab3cdc2e71",
|
||||
)
|
||||
|
||||
# Abseil
|
||||
http_archive(
|
||||
name = "com_google_absl",
|
||||
urls = ["https://github.com/abseil/abseil-cpp/archive/b3162b1da62711c663d0025e2eabeb83fd1f2728.zip"],
|
||||
strip_prefix = "abseil-cpp-b3162b1da62711c663d0025e2eabeb83fd1f2728",
|
||||
sha256 = "d5c91248c33269fcc7ab35897315a45cfa2c37abb4c6d4ed36cb5c82f366367a",
|
||||
)
|
||||
|
||||
# GoogleTest/GoogleMock framework. Used by most unit-tests.
|
||||
http_archive(
|
||||
name = "com_google_googletest", # 2021-05-19T20:10:13Z
|
||||
urls = ["https://github.com/google/googletest/archive/aa9b44a18678dfdf57089a5ac22c1edb69f35da5.zip"],
|
||||
strip_prefix = "googletest-aa9b44a18678dfdf57089a5ac22c1edb69f35da5",
|
||||
sha256 = "8cf4eaab3a13b27a95b7e74c58fb4c0788ad94d1f7ec65b20665c4caf1d245e8",
|
||||
)
|
||||
|
||||
# Google benchmark.
|
||||
http_archive(
|
||||
name = "com_github_google_benchmark",
|
||||
urls = ["https://github.com/google/benchmark/archive/0baacde3618ca617da95375e0af13ce1baadea47.zip"],
|
||||
strip_prefix = "benchmark-0baacde3618ca617da95375e0af13ce1baadea47",
|
||||
sha256 = "62e2f2e6d8a744d67e4bbc212fcfd06647080de4253c97ad5c6749e09faf2cb0",
|
||||
)
|
||||
|
||||
# C++ rules for Bazel.
|
||||
http_archive(
|
||||
name = "rules_cc", # 2021-05-14T14:51:14Z
|
||||
urls = ["https://github.com/bazelbuild/rules_cc/archive/68cb652a71e7e7e2858c50593e5a9e3b94e5b9a9.zip"],
|
||||
strip_prefix = "rules_cc-68cb652a71e7e7e2858c50593e5a9e3b94e5b9a9",
|
||||
sha256 = "1e19e9a3bc3d4ee91d7fcad00653485ee6c798efbbf9588d40b34cbfbded143d",
|
||||
)
|
||||
|
||||
# Python rules
|
||||
#
|
||||
# This is explicitly added to work around
|
||||
# https://github.com/bazelbuild/rules_fuzzing/issues/207
|
||||
# and https://github.com/google/tcmalloc/issues/127
|
||||
http_archive(
|
||||
name = "rules_python",
|
||||
urls = ["https://github.com/bazelbuild/rules_python/archive/refs/tags/0.11.0.tar.gz"],
|
||||
sha256 = "c03246c11efd49266e8e41e12931090b613e12a59e6f55ba2efd29a7cb8b4258",
|
||||
strip_prefix = "rules_python-0.11.0",
|
||||
)
|
||||
|
||||
# Proto rules for Bazel and Protobuf
|
||||
http_archive(
|
||||
name = "com_google_protobuf",
|
||||
urls = ["https://github.com/protocolbuffers/protobuf/archive/13d559beb6967033a467a7517c35d8ad970f8afb.zip"],
|
||||
strip_prefix = "protobuf-13d559beb6967033a467a7517c35d8ad970f8afb",
|
||||
sha256 = "9ca59193fcfe52c54e4c2b4584770acd1a6528fc35efad363f8513c224490c50",
|
||||
)
|
||||
load("@com_google_protobuf//:protobuf_deps.bzl", "protobuf_deps")
|
||||
protobuf_deps()
|
||||
|
||||
http_archive(
|
||||
name = "rules_proto",
|
||||
sha256 = "66bfdf8782796239d3875d37e7de19b1d94301e8972b3cbd2446b332429b4df1",
|
||||
strip_prefix = "rules_proto-4.0.0",
|
||||
urls = [
|
||||
"https://mirror.bazel.build/github.com/bazelbuild/rules_proto/archive/refs/tags/4.0.0.tar.gz",
|
||||
"https://github.com/bazelbuild/rules_proto/archive/refs/tags/4.0.0.tar.gz",
|
||||
],
|
||||
)
|
||||
|
||||
load("@rules_proto//proto:repositories.bzl", "rules_proto_dependencies", "rules_proto_toolchains")
|
||||
rules_proto_dependencies()
|
||||
rules_proto_toolchains()
|
||||
|
||||
# Fuzzing
|
||||
http_archive(
|
||||
name = "rules_fuzzing",
|
||||
sha256 = "a5734cb42b1b69395c57e0bbd32ade394d5c3d6afbfe782b24816a96da24660d",
|
||||
strip_prefix = "rules_fuzzing-0.1.1",
|
||||
urls = ["https://github.com/bazelbuild/rules_fuzzing/archive/v0.1.1.zip"],
|
||||
)
|
||||
|
||||
# Fuzzing rule dependencies and initialization
|
||||
load("@rules_fuzzing//fuzzing:repositories.bzl", "rules_fuzzing_dependencies")
|
||||
|
||||
rules_fuzzing_dependencies()
|
||||
|
||||
load("@rules_fuzzing//fuzzing:init.bzl", "rules_fuzzing_init")
|
||||
|
||||
rules_fuzzing_init()
|
||||
58
src/third_party/tcmalloc/dist/docs/README.md
vendored
Normal file
@ -0,0 +1,58 @@
|
||||
# TCMalloc
|
||||
|
||||
This repository contains the TCMalloc C++ code.
|
||||
|
||||
TCMalloc is Google's customized implementation of C's `malloc()` and C++'s
|
||||
`operator new` used for memory allocation within our C and C++ code. TCMalloc is
|
||||
a fast, multi-threaded malloc implementation.
|
||||
|
||||
## Building TCMalloc
|
||||
|
||||
[Bazel](https://bazel.build) is the official build system for TCMalloc.
|
||||
|
||||
The [TCMalloc Platforms Guide](platforms.md) contains information on platform
|
||||
support for TCMalloc.
|
||||
|
||||
## Documentation
|
||||
|
||||
All users of TCMalloc should consult the following documentation resources:
|
||||
|
||||
* The [TCMalloc Quickstart](quickstart.md) covers downloading, installing,
|
||||
building, and testing TCMalloc, including incorporating within your
|
||||
codebase.
|
||||
* The [TCMalloc Overview](overview.md) covers the basic architecture of
|
||||
TCMalloc, and how that may affect configuration choices.
|
||||
* The [TCMalloc Reference](reference.md) covers the C and C++ TCMalloc API
|
||||
endpoints.
|
||||
|
||||
More advanced users of TCMalloc may find the following documentation useful:
|
||||
|
||||
* The [TCMalloc Tuning Guide](tuning.md) covers the configuration choices in
|
||||
more depth, and also illustrates other ways to customize TCMalloc.
|
||||
* The [TCMalloc Design Doc](design.md) covers how TCMalloc works underneath
|
||||
the hood, and why certain design choices were made. Most developers will not
|
||||
need this level of implementation detail.
|
||||
* The [TCMalloc Compatibility Guide](compatibility.md) documents our
|
||||
expectations for how our APIs are used.
|
||||
* The [history and differences](gperftools.md) between this repository and
|
||||
gperftools.
|
||||
|
||||
## Publications
|
||||
|
||||
We've published several papers relating to TCMalloc optimizations:
|
||||
|
||||
* ["Beyond malloc efficiency to fleet efficiency: a hugepage-aware memory
|
||||
allocator" (OSDI 2021)](https://research.google/pubs/pub50370/) relating to
|
||||
the development and rollout of [Temeraire](temeraire.md), TCMalloc's
|
||||
hugepage-aware page heap implementation.
|
||||
* ["Adaptive Hugepage Subrelease for Non-moving Memory Allocators in
|
||||
Warehouse-Scale Computers" (ISMM
|
||||
2021)](https://research.google/pubs/pub50436/) relating to optimizations for
|
||||
releasing partial hugepages to the operating system.
|
||||
|
||||
## License
|
||||
|
||||
The TCMalloc library is licensed under the terms of the Apache license. See
|
||||
LICENSE for more information.
|
||||
|
||||
Disclaimer: This is not an officially supported Google product.
|
||||
44
src/third_party/tcmalloc/dist/docs/compatibility.md
vendored
Normal file
@ -0,0 +1,44 @@
|
||||
# TCMalloc Compatibility Guidelines
|
||||
|
||||
This document details what we expect from well-behaved users. Any usage of
|
||||
TCMalloc libraries outside of these technical boundaries may result in breakage
|
||||
when upgrading to newer versions of TCMalloc.
|
||||
|
||||
Put another way: don't do things that make TCMalloc API maintenance tasks
|
||||
harder. If you misuse TCMalloc APIs, you're on your own.
|
||||
|
||||
Additionally, because TCMalloc depends on Abseil, Abseil's
|
||||
[compatibility guidelines](https://abseil.io/about/compatibility) also apply.
|
||||
|
||||
## What Users Must (And Must Not) Do
|
||||
|
||||
* **Do not depend on a compiled representation of TCMalloc.** We do not
|
||||
promise any ABI compatibility — we intend for TCMalloc to be built
|
||||
from source, hopefully from head. The internal layout of our types may
|
||||
change at any point, without notice. Building TCMalloc in the presence of
|
||||
different C++ standard library types may change Abseil types, especially for
|
||||
pre-adopted types (`string_view`, `variant`, etc) — these will become
|
||||
typedefs and their ABI will change accordingly.
|
||||
* **Do not rely on dynamic loading/unloading.** TCMalloc does not support
|
||||
dynamic loading and unloading.
|
||||
* **You may not open namespace `tcmalloc`.** You are not allowed to define
|
||||
additional names in namespace `tcmalloc`, nor are you allowed to specialize
|
||||
anything we provide.
|
||||
* **You may not depend on the signatures of TCMalloc APIs.** You cannot take
|
||||
the address of APIs in TCMalloc (that would prevent us from adding overloads
|
||||
without breaking you). You cannot use metaprogramming tricks to depend on
|
||||
those signatures either. (This is also similar to the restrictions in the
|
||||
C++ standard.)
|
||||
* **You may not forward declare TCMalloc APIs.** This is actually a sub-point
|
||||
of "do not depend on the signatures of TCMalloc APIs" as well as "do not
|
||||
open namespace `tcmalloc`", but can be surprising. Any refactoring that
|
||||
changes template parameters, default parameters, or namespaces will be a
|
||||
breaking change in the face of forward-declarations.
|
||||
* **Do not depend upon internal details.** This should go without saying: if
|
||||
something is in a namespace or filename/path that includes the word
|
||||
"internal", you are not allowed to depend upon it. It's an implementation
|
||||
detail. You cannot friend it, you cannot include it, you cannot mention it
|
||||
or refer to it in any way.
|
||||
* **Include What You Use.** We may make changes to the internal `#include`
|
||||
graph for TCMalloc headers - if you use an API, please include the relevant
|
||||
header file directly.
|
||||
470
src/third_party/tcmalloc/dist/docs/design.md
vendored
Normal file
@ -0,0 +1,470 @@
|
||||
# TCMalloc : Thread-Caching Malloc
|
||||
|
||||
## Motivation
|
||||
|
||||
TCMalloc is a memory allocator designed as an alternative to the system default
|
||||
allocator that has the following characteristics:
|
||||
|
||||
* Fast, uncontended allocation and deallocation for most objects. Objects are
|
||||
cached, depending on mode, either per-thread, or per-logical-CPU. Most
|
||||
allocations do not need to take locks, so there is low contention and good
|
||||
scaling for multi-threaded applications.
|
||||
* Flexible use of memory, so freed memory can be reused for different object
|
||||
sizes, or returned to the OS.
|
||||
* Low per object memory overhead by allocating "pages" of objects of the same
|
||||
size, leading to a space-efficient representation of small objects.
|
||||
* Low overhead sampling, enabling detailed insight into application memory
|
||||
usage.
|
||||
|
||||
## Usage
|
||||
|
||||
You use TCMalloc by specifying it as the `malloc` attribute on your binary rules in Bazel.
|
||||
|
||||
## Overview
|
||||
|
||||
The following block diagram shows the rough internal structure of TCMalloc:
|
||||
|
||||

|
||||
|
||||
We can break TCMalloc into three components. The front-end, middle-end, and
|
||||
back-end. We will discuss these in more detail in the following sections. A
|
||||
rough breakdown of responsibilities is:
|
||||
|
||||
* The front-end is a cache that provides fast allocation and deallocation of
|
||||
memory to the application.
|
||||
* The middle-end is responsible for refilling the front-end cache.
|
||||
* The back-end handles fetching memory from the OS.
|
||||
|
||||
Note that the front-end can be run in either per-CPU or legacy per-thread mode,
|
||||
and the back-end can support either the hugepage aware pageheap or the legacy
|
||||
pageheap.
|
||||
|
||||
## The TCMalloc Front-end
|
||||
|
||||
The front-end handles a request for memory of a particular size. The front-end
|
||||
has a cache of memory that it can use for allocation or to hold free memory.
|
||||
This cache is only accessible by a single thread at a time, so it does not
|
||||
require any locks, hence most allocations and deallocations are fast.
|
||||
|
||||
The front-end will satisfy any request if it has cached memory of the
|
||||
appropriate size. If the cache for that particular size is empty, the front-end
|
||||
will request a batch of memory from the middle-end to refill the cache. The
|
||||
middle-end comprises the CentralFreeList and the TransferCache.
|
||||
|
||||
If the middle-end is exhausted, or if the requested size is greater than the
|
||||
maximum size that the front-end caches handle, a request will go to the back-end
|
||||
to either satisfy the large allocation, or to refill the caches in the
|
||||
middle-end. The back-end is also referred to as the PageHeap.
|
||||
|
||||
There are two implementations of the TCMalloc front-end:
|
||||
|
||||
* Originally it supported per-thread caches of objects (hence the name Thread
|
||||
Caching Malloc). However, this resulted in memory footprints that scaled
|
||||
with the number of threads. Modern applications can have large thread
|
||||
counts, which result in either large amounts of aggregate per-thread memory,
|
||||
or many threads having minuscule per-thread caches.
|
||||
* More recently TCMalloc has supported per-CPU mode. In this mode each logical
|
||||
CPU in the system has its own cache from which to allocate memory. Note: On
|
||||
x86 a logical CPU is equivalent to a hyperthread.
|
||||
|
||||
The differences between per-thread and per-CPU modes are entirely confined to
|
||||
the implementations of malloc/new and free/delete.
|
||||
|
||||
## Small and Large Object Allocation
|
||||
|
||||
Allocations of "small" objects are mapped onto one of
|
||||
[60-80 allocatable size-classes](https://github.com/google/tcmalloc/blob/master/tcmalloc/size_classes.cc).
|
||||
For example, an allocation of 12 bytes will get rounded up to the 16 byte
|
||||
size-class. The size-classes are designed to minimize the amount of memory that
|
||||
is wasted when rounding to the next largest size-class.
|
||||
|
||||
When compiled with `__STDCPP_DEFAULT_NEW_ALIGNMENT__ <= 8`, we use a set of
|
||||
sizes aligned to 8 bytes for raw storage allocated with `::operator new`. This
|
||||
smaller alignment minimizes wasted memory for many common allocation sizes (24,
|
||||
40, etc.) which are otherwise rounded up to a multiple of 16 bytes. On many
|
||||
compilers, this behavior is controlled by the `-fnew-alignment=...` flag.
|
||||
When
|
||||
`__STDCPP_DEFAULT_NEW_ALIGNMENT__` is not specified (or is larger than 8 bytes),
|
||||
we use standard 16 byte alignments for `::operator new`. However, for
|
||||
allocations under 16 bytes, we may return an object with a lower alignment, as
|
||||
no object with a larger alignment requirement can be allocated in the space.
|
||||
|
||||
When an object of a given size is requested, that request is mapped to a request
|
||||
of a particular size-class using the
|
||||
[`SizeMap::GetSizeClass()` function](https://github.com/google/tcmalloc/blob/master/tcmalloc/common.h),
|
||||
and the returned memory is from that size-class. This means that the returned
|
||||
memory is at least as large as the requested size. Allocations from size-classes
|
||||
are handled by the front-end.
|
||||
|
||||
Objects of size greater than the limit defined by
|
||||
[`kMaxSize`](https://github.com/google/tcmalloc/blob/master/tcmalloc/common.h)
|
||||
are allocated directly from the [backend](#tcmalloc-backend). As such they are
|
||||
not cached in either the front or middle ends. Allocation requests for large
|
||||
object sizes are rounded up to the [TCMalloc page size](#tcmalloc-page-sizes).
|
||||
|
||||
## Deallocation
|
||||
|
||||
When an object is deallocated, the compiler will provide the size of the object
|
||||
if it is known at compile time. If the size is not known, it will be looked up
|
||||
in the [pagemap](#pagemap-and-spans). If the object is small it will be put back into the
|
||||
front-end cache. If the object is larger than kMaxSize it is returned directly
|
||||
to the pageheap.
|
||||
|
||||
### Per-CPU Mode
|
||||
|
||||
In per-CPU mode a single large block of memory is allocated. The following
|
||||
diagram shows how this slab of memory is divided between CPUs and how each CPU
|
||||
uses a part of the slab to hold metadata as well as pointers to available
|
||||
objects.
|
||||
|
||||

|
||||
|
||||
Each logical CPU is assigned a section of this memory to hold metadata and
|
||||
pointers to available objects of particular size-classes. The metadata comprises
|
||||
one *header* block per size-class. The header has a pointer to the start of the
|
||||
per-size-class array of pointers to objects, as well as a pointer to the
|
||||
current, dynamic, maximum capacity and the current position within that array
|
||||
segment. The static maximum capacity of each per-size-class array of pointers is
|
||||
[determined at start time](https://github.com/google/tcmalloc/blob/master/tcmalloc/internal/percpu_tcmalloc.h)
|
||||
by the difference between the start of the array for this size-class and the
|
||||
start of the array for the next size-class.
|
||||
|
||||
At runtime the maximum number of items of a particular size-class that can be
|
||||
stored in the per-cpu block will vary, but it can never exceed the statically
|
||||
determined maximum capacity assigned at start up.
|
||||
|
||||
When an object of a particular size-class is requested it is removed from this
|
||||
array; when the object is freed it is added to the array. If the array is
|
||||
[exhausted](https://github.com/google/tcmalloc/blob/master/tcmalloc/cpu_cache.h)
|
||||
the array is refilled using a batch of objects from the middle-end. If the array
|
||||
would
|
||||
[overflow](https://github.com/google/tcmalloc/blob/master/tcmalloc/cpu_cache.h),
|
||||
a batch of objects are removed from the array and returned to the middle-end.
|
||||
|
||||
The amount of memory that can be cached is limited per-cpu by the parameter
|
||||
`MallocExtension::SetMaxPerCpuCacheSize`. This means that the total amount of
|
||||
cached memory depends on the number of active per-cpu caches. Consequently
|
||||
machines with higher CPU counts can cache more memory.
|
||||
|
||||
To avoid holding memory on CPUs where the application no longer runs,
|
||||
`MallocExtension::ReleaseCpuMemory` frees objects held in a specified CPU's
|
||||
caches.
|
||||
|
||||
Within a CPU, the distribution of memory is managed across all the size-classes
|
||||
so as to keep the maximum amount of cached memory below the limit. Notice that
|
||||
it is managing the maximum amount that can be cached, and not the amount that is
|
||||
currently cached. On average the amount actually cached should be about half the
|
||||
limit.
|
||||
|
||||
The maximum capacity is increased when a size-class
|
||||
[runs out of objects](https://github.com/google/tcmalloc/blob/master/tcmalloc/cpu_cache.h),
|
||||
and when fetching more objects, it also considers
|
||||
[increasing the capacity](https://github.com/google/tcmalloc/blob/master/tcmalloc/cpu_cache.h)
|
||||
of the size-class. It can increase the capacity of the size-class up until the
|
||||
total memory (for all size-classes) that the cache could hold reaches the
|
||||
per-cpu limit or until the capacity of that size-class reaches the hard-coded
|
||||
size limit for that size-class. If the size-class has not reached the hard-coded
|
||||
limit, then in order to increase the capacity it can
|
||||
[steal](https://github.com/google/tcmalloc/blob/master/tcmalloc/cpu_cache.h)
|
||||
capacity from another size-class on the same CPU.
|
||||
|
||||
### Restartable Sequences and Per-CPU TCMalloc
|
||||
|
||||
To work correctly, per-CPU mode relies on restartable sequences (man rseq(2)). A
|
||||
restartable sequence is just a block of (assembly language) instructions,
|
||||
largely like a typical function. A restriction of restartable sequences is that
|
||||
they cannot write partial state to memory, the final instruction must be a
|
||||
single write of the updated state. The idea of restartable sequences is that if
|
||||
a thread is removed from a CPU (e.g. context switched) while it is executing a
|
||||
restartable sequence, the sequence will be restarted from the top. Hence the
|
||||
sequence will either complete without interruption, or be repeatedly restarted
|
||||
until it completes without interruption. This is achieved without using any
|
||||
locking or atomic instructions, thereby avoiding any contention in the sequence
|
||||
itself.
|
||||
|
||||
The practical implication of this for TCMalloc is that the code can use a
|
||||
restartable sequence like
|
||||
[TcmallocSlab_Internal_Push](https://github.com/google/tcmalloc/blob/master/tcmalloc/internal/percpu_tcmalloc.h)
|
||||
to fetch from or return an element to a per-CPU array without needing locking.
|
||||
The restartable sequence ensures that either the array is updated without the
|
||||
thread being interrupted, or the sequence is restarted if the thread was
|
||||
interrupted (for example, by a context switch that enables a different thread to
|
||||
run on that CPU).
|
||||
|
||||
Additional information about the design choices and implementation is discussed
|
||||
in a specific [design doc](rseq.md) for it.
|
||||
|
||||
### Legacy Per-Thread mode
|
||||
|
||||
In per-thread mode, TCMalloc assigns each thread a thread-local cache. Small
|
||||
allocations are satisfied from this thread-local cache. Objects are moved
|
||||
between the middle-end and the thread-local cache as needed.
|
||||
|
||||
A thread cache contains one singly linked list of free objects per size-class
|
||||
(so if there are N size-classes, there will be N corresponding linked lists), as
|
||||
shown in the following diagram.
|
||||
|
||||

|
||||
|
||||
On allocation an object is removed from the appropriate size-class of the
|
||||
per-thread caches. On deallocation, the object is prepended to the appropriate
|
||||
size-class. Underflow and overflow are handled by accessing the middle-end to
|
||||
either fetch more objects, or to return some objects.
|
||||
|
||||
The maximum capacity of the per-thread caches is set by the parameter
|
||||
`MallocExtension::SetMaxTotalThreadCacheBytes`.
|
||||
However it is possible for the
|
||||
total size to exceed that limit as each per-thread cache has a minimum size
|
||||
[`kMinThreadCacheSize`](https://github.com/google/tcmalloc/blob/master/tcmalloc/common.h)
|
||||
which is usually 512KiB. In the event that a thread wishes to increase its
|
||||
capacity, it needs to
|
||||
[scavenge](https://github.com/google/tcmalloc/blob/master/tcmalloc/thread_cache.cc)
|
||||
capacity from other threads.
|
||||
|
||||
When threads exit their cached memory is
|
||||
[returned](https://github.com/google/tcmalloc/blob/master/tcmalloc/thread_cache.cc)
|
||||
to the middle-end.
|
||||
|
||||
### Runtime Sizing of Front-end Caches
|
||||
|
||||
It is important for the size of the front-end cache free lists to adjust
|
||||
optimally. If the free list is too small, we'll need to go to the central free
|
||||
list too often. If the free list is too big, we'll waste memory as objects sit
|
||||
idle in there.
|
||||
|
||||
Note that the caches are just as important for deallocation as they are for
|
||||
allocation. Without a cache, each deallocation would require moving the memory
|
||||
to the central free list.
|
||||
|
||||
Per-CPU and per-thread modes have different implementations of a dynamic cache
|
||||
sizing algorithm.
|
||||
|
||||
* In per-thread mode the maximum number of objects that can be stored is
|
||||
[increased](https://github.com/google/tcmalloc/blob/master/tcmalloc/thread_cache.cc)
|
||||
up to a limit whenever more objects need to be fetched from the middle-end.
|
||||
Similarly the capacity is
|
||||
[decreased](https://github.com/google/tcmalloc/blob/master/tcmalloc/thread_cache.cc)
|
||||
when we find that we have cached too many objects. The size of the cache is
|
||||
also
|
||||
[reduced](https://github.com/google/tcmalloc/blob/master/tcmalloc/thread_cache.cc)
|
||||
should the total size of the cached objects exceed the per-thread limit.
|
||||
* In per-CPU mode the
|
||||
[capacity](https://github.com/google/tcmalloc/blob/master/tcmalloc/cpu_cache.h)
|
||||
of the free list is increased depending on whether we are alternating
|
||||
between underflows and overflows (indicating that a larger cache might stop
|
||||
this alternation). The capacity is
|
||||
[reduced](https://github.com/google/tcmalloc/blob/master/tcmalloc/cpu_cache.h)
|
||||
when it has not been grown for a time and may therefore be over capacity.
|
||||
|
||||
## TCMalloc Middle-end
|
||||
|
||||
The middle-end is responsible for providing memory to the front-end and
|
||||
returning memory to the back-end. The middle-end comprises the Transfer cache
|
||||
and the Central free list. Although these are often referred to as singular,
|
||||
there is one transfer cache and one central free list per size-class. These
|
||||
caches are each protected by a mutex lock - so there is a serialization cost to
|
||||
accessing them.
|
||||
|
||||
### Transfer Cache
|
||||
|
||||
When the front-end requests memory, or returns memory, it will reach out to the
|
||||
transfer cache.
|
||||
|
||||
The transfer cache holds an array of pointers to free memory, and it is quick to
|
||||
move objects into this array, or fetch objects from this array on behalf of the
|
||||
front-end.
|
||||
|
||||
The transfer cache gets its name from situations where one CPU (or thread) is
|
||||
allocating memory that is deallocated by another CPU (or thread). The transfer
|
||||
cache allows memory to rapidly flow between two different CPUs (or threads).
|
||||
|
||||
If the transfer cache is unable to satisfy the memory request, or has
|
||||
insufficient space to hold the returned objects, it will access the central free
|
||||
list.
|
||||
|
||||
### Central Free List
|
||||
|
||||
The central free list manages memory in "[spans](#pagemap-and-spans)"; a span is a
|
||||
collection of one or more "[TCMalloc pages](#tcmalloc-page-sizes)" of memory.
|
||||
These terms will be explained in the next couple of sections.
|
||||
|
||||
A request for one or more objects is satisfied by the central free list by
|
||||
[extracting](https://github.com/google/tcmalloc/blob/master/tcmalloc/central_freelist.cc)
|
||||
objects from spans until the request is satisfied. If there are insufficient
|
||||
available objects in the spans, more spans are requested from the back-end.
|
||||
|
||||
When objects are
|
||||
[returned to the central free list](https://github.com/google/tcmalloc/blob/master/tcmalloc/central_freelist.cc),
|
||||
each object is mapped to the span to which it belongs (using the
|
||||
[pagemap](#pagemap-and-spans)) and then released into that span. If all the
|
||||
objects that reside in a particular span are returned to it, the entire span
|
||||
gets returned to the back-end.
|
||||
|
||||
### Pagemap and Spans
|
||||
|
||||
The heap managed by TCMalloc is divided into [pages](#tcmalloc-page-sizes) of a
|
||||
compile-time determined size. A run of contiguous pages is represented by a
|
||||
`Span` object. A span can be used to manage a large object that has been handed
|
||||
off to the application, or a run of pages that have been split up into a
|
||||
sequence of small objects. If the span manages small objects, the size-class of
|
||||
the objects is recorded in the span.
|
||||
|
||||
The pagemap is used to look up the span to which an object belongs, or to
|
||||
identify the size-class for a given object.
|
||||
|
||||
TCMalloc uses a 2-level or 3-level
|
||||
[radix tree](https://github.com/google/tcmalloc/blob/master/tcmalloc/pagemap.h)
|
||||
in order to map all possible memory locations onto spans.
|
||||
|
||||
The following diagram shows how a radix-2 pagemap is used to map the address of
|
||||
objects onto the spans that control the pages where the objects reside. In the
|
||||
diagram **span A** covers two pages, and **span B** covers 3 pages.
|
||||
|
||||

|
||||
|
||||
Spans are used in the middle-end to determine where to place returned objects,
|
||||
and in the back-end to manage the handling of page ranges.
|
||||
|
||||
### Storing Small Objects in Spans
|
||||
|
||||
A span contains a pointer to the base of the TCMalloc pages that the span
|
||||
controls. For small objects those pages are divided into at most 2<sup>16</sup>
|
||||
objects. This value is selected so that within the span we can refer to objects
|
||||
by a two-byte index.
|
||||
|
||||
This means that we can use an
|
||||
[unrolled linked list](https://en.wikipedia.org/wiki/Unrolled_linked_list) to
|
||||
hold the objects. For example, if we have eight byte objects we can store the
|
||||
indexes of three ready-to-use objects, and use the fourth slot to store the index
|
||||
of the next object in the chain. This data structure reduces cache misses over a
|
||||
fully linked list.
|
||||
|
||||
The other advantage of using two byte indexes is that we're able to use spare
|
||||
capacity in the span itself to
|
||||
[cache four objects](https://github.com/google/tcmalloc/blob/master/tcmalloc/span.h).
|
||||
|
||||
When we have
|
||||
[no available objects](https://github.com/google/tcmalloc/blob/master/tcmalloc/central_freelist.cc)
|
||||
for a size-class, we need to fetch a new span from the pageheap and
|
||||
[populate](https://github.com/google/tcmalloc/blob/master/tcmalloc/central_freelist.cc)
|
||||
it.
|
||||
|
||||
## TCMalloc Page Sizes
|
||||
|
||||
TCMalloc can be built with various
|
||||
["page sizes"](https://github.com/google/tcmalloc/blob/master/tcmalloc/common.h)
|
||||
. Note that these do not correspond to the page size used in the TLB of the
|
||||
underlying hardware. These TCMalloc page sizes are currently 4KiB, 8KiB, 32KiB,
|
||||
and 256KiB.
|
||||
|
||||
A TCMalloc page either holds multiple objects of a particular size, or is used
|
||||
as part of a group to hold an object of size greater than a single page. If an
|
||||
entire page becomes free it will be returned to the back-end (the pageheap) and
|
||||
can later be repurposed to hold objects of a different size (or returned to the
|
||||
OS).
|
||||
|
||||
Small pages are better able to handle the memory requirements of the application
|
||||
with less overhead. For example, a half-used 4KiB page will have 2KiB left over
|
||||
versus a 32KiB page which would have 16KiB. Small pages are also more likely to
|
||||
become free. For example, a 4KiB page can hold eight 512-byte objects versus 64
|
||||
objects on a 32KiB page; and there is much less chance of 64 objects being free
|
||||
at the same time than there is of eight becoming free.
|
||||
|
||||
Large pages result in less need to fetch and return memory from the back-end. A
|
||||
single 32KiB page can hold eight times the objects of a 4KiB page, and this can
|
||||
result in the costs of managing the larger pages being smaller. It also takes
|
||||
fewer large pages to map the entire virtual address space. TCMalloc has a
|
||||
[pagemap](https://github.com/google/tcmalloc/blob/master/tcmalloc/pagemap.h)
|
||||
which maps a virtual address onto the structures that manage the objects in that
|
||||
address range. Larger pages mean that the pagemap needs fewer entries and is
|
||||
therefore smaller.
|
||||
|
||||
Consequently, it makes sense for applications with small memory footprints, or
|
||||
that are sensitive to memory footprint size to use smaller TCMalloc page sizes.
|
||||
Applications with large memory footprints are likely to benefit from larger
|
||||
TCMalloc page sizes.
|
||||
|
||||
## TCMalloc Backend
|
||||
|
||||
The back-end of TCMalloc has three jobs:
|
||||
|
||||
* It manages large chunks of unused memory.
|
||||
* It is responsible for fetching memory from the OS when there is no suitably
|
||||
sized memory available to fulfill an allocation request.
|
||||
* It is responsible for returning unneeded memory back to the OS.
|
||||
|
||||
There are two backends for TCMalloc:
|
||||
|
||||
* The Legacy pageheap which manages memory in TCMalloc page sized chunks.
|
||||
* The hugepage aware pageheap which manages memory in chunks of hugepage
|
||||
sizes. Managing memory in hugepage chunks enables the allocator to improve
|
||||
application performance by reducing TLB misses.
|
||||
|
||||
### Legacy Pageheap
|
||||
|
||||
The legacy pageheap is an array of free lists for particular lengths of
|
||||
contiguous pages of available memory. For `k < 256`, the `k`th entry is a free
|
||||
list of runs that consist of `k` TCMalloc pages. The `256`th entry is a free
|
||||
list of runs that have length `>= 256` pages:
|
||||
|
||||

|
||||
|
||||
An allocation for `k` pages is satisfied by looking in the `k`th free list. If
|
||||
that free list is empty, we look in the next free list, and so forth.
|
||||
Eventually, we look in the last free list if necessary. If that fails, we fetch
|
||||
memory from the system using `mmap`.
|
||||
|
||||
If an allocation for `k` pages is satisfied by a run of pages of length `> k`,
|
||||
the remainder of the run is re-inserted back into the appropriate free list in
|
||||
the pageheap.
|
||||
|
||||
When a range of pages is returned to the pageheap, the adjacent pages are
|
||||
checked to determine if they now form a contiguous region; if that is the case
|
||||
then the pages are concatenated and placed into the appropriate free list.
|
||||
|
||||
### Hugepage Aware Allocator
|
||||
|
||||
The objective of the hugepage aware allocator is to hold memory in hugepage size
|
||||
chunks. On x86 a hugepage is 2MiB in size. To do this the back-end has three
|
||||
different caches:
|
||||
|
||||
* The filler cache holds hugepages which have had some memory allocated from
|
||||
them. This can be considered to be similar to the legacy pageheap in that it
|
||||
holds linked lists of memory of a particular number of TCMalloc pages.
|
||||
Allocation requests for sizes of less than a hugepage in size are
|
||||
(typically) returned from the filler cache. If the filler cache does not
|
||||
have sufficient available memory it will request additional hugepages from
|
||||
which to allocate.
|
||||
* The region cache handles allocations of greater than a hugepage. This
|
||||
cache allows allocations to straddle multiple hugepages, and packs multiple
|
||||
such allocations into a contiguous region. This is particularly useful for
|
||||
allocations that slightly exceed the size of a hugepage (for example, 2.1
|
||||
MiB).
|
||||
* The hugepage cache handles large allocations of at least a hugepage. There
|
||||
is overlap in usage with the region cache, but the region cache is only
|
||||
enabled when it is determined (at runtime) that the allocation pattern would
|
||||
benefit from it.
|
||||
|
||||
Additional information about the design choices made in HPAA is discussed in a
|
||||
specific [design doc](temeraire.md) for it.
|
||||
|
||||
## Caveats
|
||||
|
||||
TCMalloc will reserve some memory for metadata at start up. The amount of
|
||||
metadata will grow as the heap grows. In particular the pagemap will grow with
|
||||
the virtual address range that TCMalloc uses, and the spans will grow as the
|
||||
number of active pages of memory grows. In per-CPU mode, TCMalloc will reserve a
|
||||
slab of memory per-CPU (typically 256 KiB), which, on systems with large numbers
|
||||
of logical CPUs, can lead to a multi-mebibyte footprint.
|
||||
|
||||
It is worth noting that TCMalloc requests memory from the OS in large chunks
|
||||
(typically 1 GiB regions). The address space is reserved, but not backed by
|
||||
physical memory until it is used. Because of this approach the VSS of the
|
||||
application can be substantially larger than the RSS. A side effect of this is
|
||||
that trying to limit an application's memory use by restricting VSS will fail
|
||||
long before the application has used that much physical memory.
|
||||
|
||||
Don't try to load TCMalloc into a running binary (e.g., using JNI in Java
|
||||
programs). The binary will have allocated some objects using the system malloc,
|
||||
and may try to pass them to TCMalloc for deallocation. TCMalloc will not be able
|
||||
to handle such objects.
|
||||
70
src/third_party/tcmalloc/dist/docs/gperftools.md
vendored
Normal file
@ -0,0 +1,70 @@
|
||||
# TCMalloc and gperftools
|
||||
|
||||
There are two projects on GitHub that are based on Google’s internal TCMalloc:
|
||||
This repository and [gperftools](https://github.com/gperftools/gperftools). Both
|
||||
are fast C/C++ memory allocators designed around a fast path that avoids
|
||||
synchronizing with other threads for most allocations.
|
||||
|
||||
This repository is Google's current implementation of TCMalloc, used by ~all of
|
||||
our C++ programs in production. The code is limited to the memory allocator
|
||||
implementation itself.
|
||||
|
||||
## History
|
||||
|
||||
Google open-sourced its memory allocator as part of "Google Performance Tools"
|
||||
in 2005. At the time, it became easy to externalize code, but more difficult to
|
||||
keep it in-sync with our internal usage, as discussed by Titus Winters in
|
||||
[his 2017 CppCon Talk](https://www.youtube.com/watch?v=tISy7EJQPzI) and the
|
||||
"Software Engineering at Google" book. Subsequently, our internal implementation
|
||||
diverged from the code externally. This project eventually was adopted by the
|
||||
community as "gperftools."
|
||||
|
||||
## Differences
|
||||
|
||||
Since
|
||||
[“Profiling a Warehouse-Scale Computer” (Kanev 2015)](https://research.google/pubs/pub44271/),
|
||||
we have invested in improving application productivity via optimizations to the
|
||||
implementation (per-CPU caches, sized delete, fast/slow path improvements,
|
||||
[hugepage-aware backend](temeraire.md)).
|
||||
|
||||
Because this repository reflects our day-to-day usage, we've focused on the
|
||||
platforms we regularly use and can see extensive testing and optimization.
|
||||
|
||||
This implementation is based on [Abseil](https://github.com/abseil/abseil-cpp).
|
||||
Like Abseil, we do not attempt to provide ABI stability. Providing a stable ABI
|
||||
could require compromising performance or adding otherwise unneeded complexity
|
||||
to maintain stability. These caveats are noted in our
|
||||
[Compatibility Guidelines](compatibility.md).
|
||||
|
||||
In addition to a memory allocator, the gperftools project contains a number of
|
||||
other tools:
|
||||
|
||||
* An All-Allocation Memory Profiler: We have found this prohibitively costly
|
||||
to use regularly, and instead focus on using low-overhead, always-on
|
||||
sampling profilers. This sampling based profiler is exposed in our
|
||||
`malloc_extension.h`.
|
||||
* A SIGPROF-based CPU Profiler: The Linux `perf` tool is decreasing our
|
||||
internal need for signal-based profiling. Additionally, with restartable
|
||||
sequences, signals interrupt the fastpath, leading to skew between the
|
||||
observed instruction pointer and where we actually spend CPU time.
|
||||
* A Heap Checker/Debug Allocator: The LeakSanitizer, AddressSanitizer, and
|
||||
MemorySanitizer suite provide higher accuracy and better performance.
|
||||
* A perl-based `pprof` tool: This project is now developed in Go and is
|
||||
[available on GitHub](https://github.com/google/pprof).
|
||||
|
||||
## Differences From Google's Implementation of TCMalloc
|
||||
|
||||
The configuration on GitHub mirrors our production defaults, with two notable
|
||||
exceptions:
|
||||
|
||||
* Many of our production servers start a background thread (via
|
||||
`tcmalloc::MallocExtension::ProcessBackgroundActions`) to regularly call
|
||||
`tcmalloc::MallocExtension::ReleaseMemoryToSystem`, while others never
|
||||
release memory in favor of better CPU performance. These tradeoffs are
|
||||
discussed in our [tuning page](tuning.md).
|
||||
* We do not activate [GWP-ASan](gwp-asan.md) by default, but it can be activated
|
||||
via `MallocExtension`.
|
||||
|
||||
Over time, we have found that configurability carries a maintenance burden.
|
||||
While a knob can provide immediate flexibility, the increased complexity can
|
||||
cause subtle problems for more rarely used combinations.
|
||||
87
src/third_party/tcmalloc/dist/docs/gwp-asan.md
vendored
Normal file
@ -0,0 +1,87 @@
|
||||
# GWP-ASan
|
||||
|
||||
GWP-ASan is a low-overhead sampling-based utility for finding
|
||||
heap-use-after-frees and heap-buffer-overflows in production.
|
||||
GWP-ASan is a recursive acronym: "**G**WP-ASan **W**ill **P**rovide
|
||||
**A**llocation **San**ity".
|
||||
|
||||
## Why not just use ASan?
|
||||
|
||||
For many cases you **should** use [ASan](https://clang.llvm.org/docs/AddressSanitizer.html)
|
||||
(e.g., on your tests). However, ASan comes with average execution slowdown of 2x
|
||||
(compared to `-O2`), binary size increase of 2x, and significant memory
|
||||
overhead. For these reasons, ASan is generally impractical for use in production
|
||||
(other than in dedicated canaries). GWP-ASan is a minimal-overhead alternative
|
||||
designed for widespread use in production.
|
||||
|
||||
## How to use GWP-ASan
|
||||
|
||||
You can enable GWP-ASan by calling `tcmalloc::MallocExtension::ActivateGuardedSampling()`.
|
||||
To adjust GWP-ASan's sampling rate, see
|
||||
[below](#what-should-i-set-the-sampling-rate-to).
|
||||
|
||||
When GWP-ASan detects a heap memory error, it prints stack traces for the point
|
||||
of the memory error, as well as the points where the memory was allocated and
|
||||
(if applicable) freed. These stack traces can then be
|
||||
symbolized offline to get file names and line
|
||||
numbers.
|
||||
|
||||
GWP-ASan will crash after printing stack traces.
|
||||
|
||||
## CPU and RAM Overhead
|
||||
|
||||
For guarded sampling rates above 100M (the default), CPU overhead is negligible. For sampling rates as low as 8M, CPU overhead is under 0.5%.
|
||||
|
||||
RAM overhead is up to 512 KB on x86\_64, or 4 MB on PowerPC.
|
||||
|
||||
## What should I set the sampling rate to?
|
||||
|
||||
`tcmalloc::MallocExtension::SetGuardedSamplingRate` sets the sampling rate for
|
||||
GWP-ASan. GWP-ASan will guard allocations approximately every
|
||||
`GuardedSamplingRate` bytes allocated. Thus, lower values will generally
|
||||
increase the chance of finding bugs but will also have higher CPU overhead.
|
||||
|
||||
For applications that cannot tolerate any CPU overhead, we recommend
|
||||
using TCMalloc's default sampling rate. If your application can tolerate some
|
||||
CPU overhead, we recommend a sampling rate of 8MB.
|
||||
|
||||
## Limitations
|
||||
|
||||
- The current version of GWP-ASan will only find bugs in allocations of 8 KB
|
||||
or less. This restriction was made to limit the CPU/RAM overhead required by
|
||||
GWP-ASan.
|
||||
|
||||
- GWP-ASan has limited diagnostic information for buffer overflows within
|
||||
alignment padding, since overflows of this type will not touch a guard
|
||||
page. For write-overflows,
|
||||
GWP-ASan will still be able to detect the overflow during deallocation by
|
||||
checking whether magic bytes have been overwritten, but the stack trace of
|
||||
the overflow itself will not be available.
|
||||
|
||||
## FAQs
|
||||
|
||||
### Does GWP-ASan report false positives?
|
||||
|
||||
No. GWP-ASan crashes because your program accessed unmapped memory, which is
|
||||
always a true bug, or a sign of hardware failure (see below).
|
||||
|
||||
### How do I know a GWP-ASan report isn't caused by hardware failure?
|
||||
|
||||
The vast majority of GWP-ASan reports we see are true bugs, but occasionally
|
||||
faulty hardware will be the actual cause of the crash. In general, if you see
|
||||
the same GWP-ASan crash on multiple machines, it is very likely there's a true
|
||||
software bug.
|
||||
|
||||
### Can GWP-ASan cause queries of death (QoD) in my production?
|
||||
|
||||
Since GWP-ASan finds bugs with very low probability, QoD is generally not a
|
||||
concern. Even if there is a reliable way to trigger a bug, GWP-ASan will only
|
||||
detect it and crash on a tiny fraction of actual occurrences, allowing the other
|
||||
99.9% to continue without crashing.
|
||||
|
||||
## Other versions of GWP-ASan
|
||||
|
||||
Separate implementations of GWP-ASan exist for Chromium and Android. For
|
||||
GWP-ASan for Chromium see
|
||||
[here](https://chromium.googlesource.com/chromium/src/+/lkgr/docs/gwp_asan.md).
|
||||
For Android, see [here](https://developer.android.com/ndk/guides/gwp-asan).
|
||||
BIN
src/third_party/tcmalloc/dist/docs/images/legacy_pageheap.png
vendored
Normal file
|
After Width: | Height: | Size: 16 KiB |
BIN
src/third_party/tcmalloc/dist/docs/images/lifetimes-counterfactual.png
vendored
Normal file
|
After Width: | Height: | Size: 57 KiB |
BIN
src/third_party/tcmalloc/dist/docs/images/lifetimes-enabled.png
vendored
Normal file
|
After Width: | Height: | Size: 47 KiB |
BIN
src/third_party/tcmalloc/dist/docs/images/pagemap.png
vendored
Normal file
|
After Width: | Height: | Size: 20 KiB |
BIN
src/third_party/tcmalloc/dist/docs/images/per-cpu-cache-internals.png
vendored
Normal file
|
After Width: | Height: | Size: 50 KiB |
BIN
src/third_party/tcmalloc/dist/docs/images/per-thread-structure.png
vendored
Normal file
|
After Width: | Height: | Size: 18 KiB |
BIN
src/third_party/tcmalloc/dist/docs/images/spanmap.gif
vendored
Normal file
|
After Width: | Height: | Size: 8.3 KiB |
BIN
src/third_party/tcmalloc/dist/docs/images/tcmalloc_internals.png
vendored
Normal file
|
After Width: | Height: | Size: 44 KiB |
102
src/third_party/tcmalloc/dist/docs/lifetime-based-allocator.md
vendored
Normal file
@ -0,0 +1,102 @@
|
||||
# Lifetime-based Memory Allocation
|
||||
|
||||
TCMalloc contains an experimental feature that leverages object lifetime
|
||||
information for managing memory allocations. [Temeraire](temeraire.md)'s default
|
||||
allocation policy binpacks medium-sized allocations into the last hugepage
|
||||
associated with a large allocation. If the large allocation is short-lived, this
|
||||
can cause persistent fragmentation from long-lived medium-sized allocations that
|
||||
get binpacked into this region.
|
||||
|
||||
The lifetime-based allocator attempts to side-step this problem by predicting
|
||||
the lifetime of large allocations and allocating short-lived large objects from
|
||||
a special [HugeRegion](regions-are-not-optional.md) instead. Lifetimes are
|
||||
treated as binary (short, long) and are predicted based on the stack trace at
|
||||
the time of allocation. While the application is running, we are recording
|
||||
statistics about all large allocations that we encounter and once we have enough
|
||||
samples, we use these statistics to make a prediction when we encounter that
|
||||
same stack trace again. If a large allocation is predicted to be short-lived, it
|
||||
is placed into a special short-lived HugeRegion, otherwise it is handled as
|
||||
usual. We call this region the "lifetime region".
|
||||
|
||||
The allocator can run in two different modes:
|
||||
|
||||
* **Enabled**: The allocator will execute the allocation policy described
|
||||
above.
|
||||
* **Counterfactual**: The allocator will execute the lifetime-based policy on
|
||||
the side but not affect the actual allocation behavior. Instead, it will
|
||||
collect statistics about the correctness of its decisions as well as the
|
||||
size of the lifetime region had the lifetime-based allocator been enabled.
|
||||
|
||||
The lifetime-based allocator has one configuration parameter (T), which is the
|
||||
cutoff below which an object is considered short-lived (T = 0.5s by default).
|
||||
Note that setting T = infinity causes all large allocations to be placed into
|
||||
the separate region.
|
||||
|
||||
## Lifetime Profiling
|
||||
|
||||
Lifetime profiling is implemented through two components:
|
||||
|
||||
* `LifetimeDatabase`: This component stores a dictionary of lifetime
|
||||
statistics, indexed by allocation stack trace. The size of the dictionary is
|
||||
limited to avoid memory blow-up. Entries are managed through a combination
|
||||
of LRU and reference counting. Each entry stores the number of long-lived
|
||||
(lifetime > T) and short-lived objects with this allocation stack trace that
|
||||
were encountered. The lifetime database does not track lifetimes itself but
|
||||
gets called from other components to 1) record lifetimes, and 2) look up
|
||||
lifetime predictions for a given stack trace. The latter works by looking up
|
||||
the statistics associated with that stack trace and predicting the object as
|
||||
long-lived if the number of long-lived allocations emanating from this stack
|
||||
trace exceeds the number of short-lived allocations by a significant margin.
|
||||
|
||||
* `LifetimeTracker`: A lifetime tracker is a small amount of meta-data that
|
||||
can be associated with an allocation and is used to track its lifetime. The
|
||||
tracker (among other information) stores a pointer to the lifetime
|
||||
statistics associated with this allocation, a timestamp, and a (possibly
|
||||
unused) counterfactual pointer whose purpose will be explained later in this
|
||||
document. Active trackers are strung together in a linked list sorted by
|
||||
allocation timestamp. The timestamp associated with the tracker at the front
|
||||
of this list is checked on every operation and if the lifetime of this
|
||||
object exceeds T, it is classified as long-lived. In this case, all trackers
|
||||
whose lifetime exceeds T are removed from the list (i.e., their trackers
|
||||
become inactive) and their associated lifetime statistics are updated to
|
||||
reflect that a long-lived allocation was encountered. If an object is
|
||||
deallocated before its tracker becomes inactive, its tracker is removed from
|
||||
the list and a short-lived allocation is recorded.
|
||||
|
||||
The use of trackers differs between enabled and counterfactual mode. In enabled
|
||||
mode, a tracker is associated with every large allocation that is placed in the
|
||||
regular hugepage-aware allocator and results in a filler donation. This tracker
|
||||
is allocated with the remaining meta-data that is already associated with any
|
||||
such donation. If an object is allocated in the lifetime region, its tracker is
|
||||
allocated in a special meta-data region associated with the lifetime region.
|
||||
This ensures that lifetimes continue to be tracked even if the allocator has
|
||||
decided to treat a particular allocation site as short-lived.
|
||||
|
||||

|
||||
|
||||
In counterfactual mode, no actual objects are allocated in the short-lived
|
||||
region. Instead, the lifetime region is a HugeRegion that is not backed by
|
||||
actual memory but otherwise executes the same logic. This means that for any
|
||||
object that would have been placed in the lifetime region had it been enabled,
|
||||
the real backing object is allocated in the existing hugepage-aware allocator.
|
||||
In this case, the tracker will store a `counterfactual_ptr` that points towards
|
||||
the address that the object would have had if it were actually allocated in the
|
||||
lifetime region. Otherwise, the object is tracked just like any other object in
|
||||
the hugepage-aware allocator.
|
||||
|
||||

|
||||
|
||||
## Lifetime-based Allocation
|
||||
|
||||
The lifetime-based allocator uses the existing HugeRegion implementation for all
|
||||
objects that are predicted short-lived. Whenever a large allocation is
|
||||
encountered, the current stack trace is collected, and the lifetime is looked up
|
||||
in the lifetime database. In regular enabled mode, the object is placed in the
|
||||
lifetime region or the regular allocator, depending on this prediction, and a
|
||||
tracker is installed. In counterfactual mode, the object is always allocated in
|
||||
the regular allocator and if the prediction called for the allocation to be
|
||||
placed in the lifetime region, an *additional* allocation call is placed to the
|
||||
lifetime region (which, in counterfactual mode, is not backed by actual memory).
|
||||
In this case, the tracker's `counterfactual_ptr` is set to the address that the
|
||||
object would have been allocated at, so that on deallocation, a corresponding
|
||||
call can be made to the lifetime region to deallocate the object.
|
||||
99
src/third_party/tcmalloc/dist/docs/overview.md
vendored
Normal file
@ -0,0 +1,99 @@
|
||||
# TCMalloc Overview
|
||||
|
||||
TCMalloc is Google's customized implementation of C's `malloc()` and C++'s
|
||||
`operator new` used for memory allocation within our C and C++ code. This custom
|
||||
memory allocation framework is an alternative to the one provided by the C
|
||||
standard library (on Linux usually through `glibc`) and C++ standard library.
|
||||
TCMalloc is designed to be more efficient at scale than other implementations.
|
||||
|
||||
Specifically, TCMalloc provides the following benefits:
|
||||
|
||||
* Performance scales with highly parallel applications.
|
||||
* Optimizations brought about with recent C++14 and C++17 standard
|
||||
enhancements, and by diverging slightly from the standard where performance
|
||||
benefits warrant. (These are noted within the
|
||||
[TCMalloc Reference](reference.md).)
|
||||
* Extensions to allow performance improvements under certain architectures,
|
||||
and additional behavior such as metric gathering.
|
||||
|
||||
## TCMalloc Cache Operation Mode
|
||||
|
||||
TCMalloc may operate in one of two fashions:
|
||||
|
||||
* (default) per-CPU caching, where TCMalloc maintains memory caches local to
|
||||
individual logical cores. Per-CPU caching is enabled when running TCMalloc
|
||||
on any Linux kernel that utilizes restartable sequences (RSEQ). Support for
|
||||
RSEQ was merged in Linux 4.18.
|
||||
* per-thread caching, where TCMalloc maintains memory caches local to each
|
||||
application thread. If RSEQ is unavailable, TCMalloc reverts to using this
|
||||
legacy behavior.
|
||||
|
||||
NOTE: the "TC" in TCMalloc refers to Thread Caching, which was originally a
|
||||
distinguishing feature of TCMalloc; the name remains as a legacy.
|
||||
|
||||
In both cases, these cache implementations allow TCMalloc to avoid requiring
|
||||
locks for most memory allocations and deallocations.
|
||||
|
||||
## TCMalloc Features
|
||||
|
||||
TCMalloc provides APIs for dynamic memory allocation: `malloc()` using the C
|
||||
API, and `::operator new` using the C++ API. TCMalloc, like most allocation
|
||||
frameworks, manages this memory better than raw memory requests (such as through
|
||||
`mmap()`) by providing several optimizations:
|
||||
|
||||
* Performs allocations from the operating system by managing
|
||||
specifically-sized chunks of memory (called "pages"). Having all of these
|
||||
chunks of memory the same size allows TCMalloc to simplify bookkeeping.
|
||||
* Devoting separate pages (or runs of pages called "Spans" in TCMalloc) to
|
||||
specific object sizes. For example, all 16-byte objects are placed within a
|
||||
"Span" specifically allocated for objects of that size. Operations to get or
|
||||
release memory in such cases are much simpler.
|
||||
* Holding memory in *caches* to speed up access of commonly-used objects.
|
||||
Holding such caches even after deallocation also helps avoid costly system
|
||||
calls if such memory is later re-allocated.
|
||||
|
||||
The cache size can also affect performance. The larger the cache, the less any
|
||||
given cache will overflow or get exhausted, and therefore require a lock to get
|
||||
more memory. TCMalloc extensions allow you to modify this cache size, though the
|
||||
default behavior should be preferred in most cases. For more information,
|
||||
consult the [TCMalloc Tuning Guide](tuning.md).
|
||||
|
||||
Additionally, TCMalloc exposes telemetry about the state of the application's
|
||||
heap via `MallocExtension`. This can be used for gathering profiles of the live
|
||||
heap, as well as a snapshot taken near the heap's highwater mark size (a peak
|
||||
heap profile).
|
||||
|
||||
## The TCMalloc API
|
||||
|
||||
TCMalloc implements the C and C++ dynamic memory API endpoints from the C11,
|
||||
C++11, C++14, and C++17 standards.
|
||||
|
||||
From C++, this includes
|
||||
|
||||
* The basic `::operator new`, `::operator delete`, and array variant
|
||||
functions.
|
||||
* C++14's sized `::operator delete`
|
||||
* C++17's overaligned `::operator new` and `::operator delete` functions.
|
||||
|
||||
Unlike in the standard implementations, TCMalloc does not throw an exception
|
||||
when allocations fail, but instead crashes directly. Such behavior can be used
|
||||
as a performance optimization for move constructors not currently marked
|
||||
`noexcept`; such move operations can be allowed to fail directly due to
|
||||
allocation failures. In [Abseil](https://abseil.io/docs/cpp/guides/base), these
|
||||
are enabled with `-DABSL_ALLOCATOR_NOTHROW`.
|
||||
|
||||
From C, this includes `malloc`, `calloc`, `realloc`, and `free`.
|
||||
|
||||
The TCMalloc API obeys the behavior of C90 DR075 and
|
||||
[DR445](http://www.open-std.org/jtc1/sc22/wg14/www/docs/summary.htm#dr_445)
|
||||
which states:
|
||||
|
||||
> The alignment requirement still applies even if the size is too small for any
|
||||
> object requiring the given alignment.
|
||||
|
||||
In other words, `malloc(1)` returns an `alignof(std::max_align_t)`-aligned pointer.
|
||||
Based on the progress of
|
||||
[N2293](http://www.open-std.org/jtc1/sc22/wg14/www/docs/n2293.htm), we may relax
|
||||
this alignment in the future.
|
||||
|
||||
For more complete information, consult the [TCMalloc Reference](reference.md).
|
||||
75
src/third_party/tcmalloc/dist/docs/platforms.md
vendored
Normal file
@ -0,0 +1,75 @@
|
||||
# TCMalloc Platforms
|
||||
|
||||
The TCMalloc code is supported on the following platforms. By "platforms", we
|
||||
mean the union of operating system, architecture (e.g. little-endian vs.
|
||||
big-endian), compiler, and standard library.
|
||||
|
||||
## Language Requirements
|
||||
|
||||
TCMalloc requires a code base that supports C++17 and our code is
|
||||
C++17-compliant. C code is required to be compliant to C11.
|
||||
|
||||
We guarantee that our code will compile under the following compilation flags:
|
||||
|
||||
Linux:
|
||||
|
||||
* gcc 9.2+, clang 9.0+: `-std=c++17`
|
||||
|
||||
(TL;DR; All code at this time must be built under C++17. We will update this
|
||||
list if circumstances change.)
|
||||
|
||||
## Supported Platforms
|
||||
|
||||
The document below lists each platform, broken down by Operating System,
|
||||
Architecture, Specific Compiler, and Standard Library implementation.
|
||||
|
||||
### Linux
|
||||
|
||||
**Supported**
|
||||
|
||||
<table width="80%">
|
||||
<col width="360">
|
||||
<col width="120">
|
||||
<tbody>
|
||||
<tr>
|
||||
<th>Operating System</th>
|
||||
<th>Endianness/Word Size</th>
|
||||
<th>Processor Architectures</th>
|
||||
<th>Compilers*</th>
|
||||
<th>Standard Libraries</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Linux</td>
|
||||
<td>little-endian, 64-bit</td>
|
||||
<td>x86, AArch64</td>
|
||||
<td>gcc 9.2+<br/>clang 9.0+</td>
|
||||
<td>libstdc++<br/>libc++</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
\* We test on gcc 9.2, though gcc versions (which support C++17) prior to that
|
||||
release should also work.
|
||||
|
||||
**Best Effort**
|
||||
|
||||
<table width="80%">
|
||||
<col width="360">
|
||||
<col width="120">
|
||||
<tbody>
|
||||
<tr>
|
||||
<th>Operating System</th>
|
||||
<th>Endianness/Word Size</th>
|
||||
<th>Processor Architectures</th>
|
||||
<th>Compilers*</th>
|
||||
<th>Standard Libraries</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Linux</td>
|
||||
<td>little-endian, 64-bit</td>
|
||||
<td>PPC</td>
|
||||
<td>gcc 9.2+<br/>clang 9.0+</td>
|
||||
<td>libstdc++<br/>libc++</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
267
src/third_party/tcmalloc/dist/docs/quickstart.md
vendored
Normal file
@ -0,0 +1,267 @@
|
||||
# TCMalloc Quickstart
|
||||
|
||||
Note: this Quickstart uses Bazel as the official build system for TCMalloc,
|
||||
which is supported on Linux, and compatible with most major compilers. The
|
||||
TCMalloc source code assumes you are using Bazel and contains `BUILD.bazel`
|
||||
files for that purpose.
|
||||
|
||||
This document is designed to allow you to get TCMalloc set up as your custom
|
||||
allocator within a C++ development environment. We recommend that each person
|
||||
starting development using TCMalloc at least run through this quick tutorial.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Running the code within this tutorial requires:
|
||||
|
||||
* A compatible platform (E.g. Linux). Consult the
|
||||
[Platforms Guide](platforms.md) for more information.
|
||||
* A compatible C++ compiler *supporting at least C++17*. Most major compilers
|
||||
are supported.
|
||||
* [Git](https://git-scm.com/) for interacting with the TCMalloc source code
|
||||
repository, which is contained on [GitHub](http://github.com). To install
|
||||
Git, consult the [Set Up Git](https://help.github.com/articles/set-up-git/)
|
||||
guide on GitHub.
|
||||
|
||||
Although you are free to use your own build system, most of the documentation
|
||||
within this guide will assume you are using [Bazel](https://bazel.build/),
|
||||
version 4.0 or newer.
|
||||
|
||||
To download and install Bazel (and any of its dependencies), consult the
|
||||
[Bazel Installation Guide](https://docs.bazel.build/versions/master/install.html).
|
||||
|
||||
## Getting the TCMalloc Code
|
||||
|
||||
You can obtain the TCMalloc code from its repository on GitHub:
|
||||
|
||||
```
|
||||
# Change to the directory where you want to create the code repository
|
||||
$ cd ~
|
||||
$ mkdir Source; cd Source
|
||||
$ git clone https://github.com/google/tcmalloc.git
|
||||
Cloning into 'tcmalloc'...
|
||||
remote: Total 1935 (delta 1083), reused 1935 (delta 1083)
|
||||
Receiving objects: 100% (1935/1935), 1.06 MiB | 0 bytes/s, done.
|
||||
Resolving deltas: 100% (1083/1083), done.
|
||||
$
|
||||
```
|
||||
|
||||
Git will create the repository within a directory named `tcmalloc`. Navigate
|
||||
into this directory and run all tests:
|
||||
|
||||
```
|
||||
$ cd tcmalloc
|
||||
$ bazel test //tcmalloc/...
|
||||
INFO: Analyzed 112 targets (12 packages loaded, 606 targets configured).
|
||||
...
|
||||
INFO: Build completed successfully, 827 total actions
|
||||
$
|
||||
```
|
||||
|
||||
Congratulations! You've installed TCMalloc.
|
||||
|
||||
## Running the TCMalloc Hello World
|
||||
|
||||
Once you've verified you have TCMalloc installed correctly, you can compile and
|
||||
run the
|
||||
[tcmalloc-hello](https://github.com/google/tcmalloc/blob/master/tcmalloc/testing/hello_main.cc)
|
||||
sample binary to see how TCMalloc is linked into a sample binary. This tiny
|
||||
project features proper configuration and a simple `hello_main` to demonstrate
|
||||
how TCMalloc works.
|
||||
|
||||
First, build the `tcmalloc/testing:hello_main` target:
|
||||
|
||||
```
|
||||
tcmalloc$ bazel build tcmalloc/testing:hello_main
|
||||
Extracting Bazel installation...
|
||||
Starting local Bazel server and connecting to it...
|
||||
INFO: Analyzed target //tcmalloc/testing:hello_main (31 packages loaded ...
|
||||
...
|
||||
INFO: Build completed successfully, 102 total actions
|
||||
PASSED in 0.1s
|
||||
tcmalloc$
|
||||
```
|
||||
|
||||
Now, run the compiled program:
|
||||
|
||||
```
|
||||
tcmalloc$ bazel run tcmalloc/testing:hello_main
|
||||
...
|
||||
INFO: Found 1 target...
|
||||
...
|
||||
INFO: Build completed successfully, 1 total action
|
||||
Current heap size = 73728 bytes
|
||||
hello world!
|
||||
new'd 1073741824 bytes at 0x14ea40000000
|
||||
Current heap size = 1073816576 bytes
|
||||
malloc'd 1073741824 bytes at 0x14eac0000000
|
||||
Current heap size = 2147558400 bytes
|
||||
$
|
||||
```
|
||||
|
||||
You can inspect this code within
|
||||
[`tcmalloc/testing/hello_main.cc`](https://github.com/google/tcmalloc/blob/master/tcmalloc/testing/hello_main.cc)
|
||||
|
||||
Happy Coding!
|
||||
|
||||
## Creating and Running TCMalloc
|
||||
|
||||
Now that you've obtained the TCMalloc code and verified that you can build,
|
||||
test, and run it, you're ready to use it within your own project.
|
||||
|
||||
### Linking Your Code to the TCMalloc Repository
|
||||
|
||||
First, create (or select) a source code directory for your work. This directory
|
||||
should generally not be the `tcmalloc` directory itself; instead, you will link
|
||||
into that repository from your own source directory.
|
||||
|
||||
```
|
||||
# Change to your main development directory and create a new development
|
||||
# directory. (If you already have a development directory you'd wish to use,
|
||||
# you can use that.)
|
||||
$ cd ~/Source
|
||||
$ mkdir TestProject; cd TestProject
|
||||
```
|
||||
|
||||
Bazel allows you to link other Bazel projects using `WORKSPACE` files in the
|
||||
root of your development directories. To add a link to your local TCMalloc
|
||||
repository within your new project, add the following into a `WORKSPACE` file:
|
||||
|
||||
```
|
||||
local_repository(
|
||||
# Name of the TCMalloc repository. This name is defined within your
|
||||
# WORKSPACE file, in its `workspace()` metadata
|
||||
name = "com_google_tcmalloc",
|
||||
|
||||
# NOTE: Bazel paths must be absolute paths. E.g., you can't use ~/Source
|
||||
path = "/PATH_TO_SOURCE/Source/tcmalloc",
|
||||
)
|
||||
```
|
||||
|
||||
The "name" in the `WORKSPACE` file identifies the name you will use in Bazel
|
||||
`BUILD` files to refer to the linked repository (in this case
|
||||
"com_google_tcmalloc").
|
||||
|
||||
Note that your path to the TCMalloc source code must be an absolute path.
|
||||
|
||||
### Adding Abseil
|
||||
|
||||
TCMalloc requires [Abseil](https://abseil.io) which you will also need to
|
||||
provide as a `local_repository`, or link to a specific commit (we always
|
||||
recommend the latest commit) using an `http_archive` declaration in the
|
||||
`WORKSPACE` file:
|
||||
|
||||
<pre>
|
||||
# Abseil HTTP Archive to specific commit
|
||||
#
|
||||
# Consult https://github.com/abseil/abseil-cpp/commits/master for the latest
|
||||
# commit. But DO NOT use master.zip for that purpose. (Sha256 values are not
|
||||
# stable across master versions.) Click on that specific commit.
|
||||
#
|
||||
# Click "Browse Files" on the commit and click on "Clone or Download Code."
|
||||
#
|
||||
# Right click on "Download ZIP" to copy the HTTP Archive URL, which you will
|
||||
# use within the http_archive "urls" field.
|
||||
#
|
||||
# Note that you will need to generate a sha256 value for Bazel's http_archive
|
||||
# to ensure this code is secure. On Linux you can do so with a downloaded .zip
|
||||
# file using the sha256sum command line:
|
||||
#
|
||||
# $ sha256sum github_zip_file.zip
|
||||
http_archive(
|
||||
name = "com_google_absl",
|
||||
urls = ["https://github.com/abseil/abseil-cpp/archive/<i>commit_value</i>.zip"],
|
||||
strip_prefix = "abseil-cpp-<i>commit_value</i>",
|
||||
sha256 = "<i>sha256_of_commit_value</i>",
|
||||
)
|
||||
</pre>
|
||||
|
||||
### Creating Your Test Code
|
||||
|
||||
Within your `TestProject` create an `examples` directory:
|
||||
|
||||
```
|
||||
$ cd TestProject; mkdir examples; cd examples
|
||||
```
|
||||
|
||||
Now, create a `hello_world.cc` C++ file within your `examples` directory:
|
||||
|
||||
```
|
||||
#include <iostream>
|
||||
#include <cstddef>
|
||||
|
||||
int main() {
|
||||
std::cout << "Standard Alignment: " << alignof(std::max_align_t) << '\n';
|
||||
|
||||
double *ptr = (double*) malloc(sizeof(double));
|
||||
std::cout << "Double Alignment: " << alignof(*ptr) << '\n';
|
||||
|
||||
char *ptr2 = (char*) malloc(1);
|
||||
std::cout << "Char Alignment: " << alignof(*ptr2) << '\n';
|
||||
|
||||
void *ptr3;
|
||||
std::cout << "Sizeof void*: " << sizeof(ptr3) << '\n';
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
### Creating Your BUILD File
|
||||
|
||||
Now, create a `BUILD` file within your `examples` directory like the following:
|
||||
|
||||
```
|
||||
cc_binary(
|
||||
name = "hello_world",
|
||||
srcs = ["hello_world.cc"],
|
||||
malloc = "@com_google_tcmalloc//tcmalloc",
|
||||
)
|
||||
```
|
||||
|
||||
NOTE: For more information on how to create Bazel BUILD files, consult the
|
||||
[Bazel Tutorial](https://docs.bazel.build/versions/master/tutorial/cpp.html).
|
||||
|
||||
We declare TCMalloc as our own custom allocation framework using the `malloc`
|
||||
keyword and set this to the library name (`//tcmalloc`) within our `WORKSPACE`
|
||||
file (`@com_google_tcmalloc`).
|
||||
|
||||
Build our target ("hello_world") and run it:
|
||||
|
||||
```
|
||||
# It's often good practice to build files from the workspace root
|
||||
$ cd ~/Source/TestProject
|
||||
Source/TestProject$ bazel build //examples:hello_world --cxxopt='-std=c++17'
|
||||
INFO: Analysed target //examples:hello_world (12 packages loaded).
|
||||
INFO: Found 1 target...
|
||||
Target //examples:hello_world up-to-date:
|
||||
bazel-bin/examples/hello_world
|
||||
INFO: Elapsed time: 0.180s, Critical Path: 0.00s
|
||||
INFO: Build completed successfully, 1 total action
|
||||
|
||||
Source/TestProject$ bazel run //examples:hello_world
|
||||
INFO: Running command line: bazel-bin/examples/hello_world
|
||||
Standard Alignment: 16
|
||||
Double Alignment: 8
|
||||
Char Alignment: 1
|
||||
Sizeof void*: 8
|
||||
Source/TestProject$
|
||||
```
|
||||
|
||||
Note that we passed `--cxxopt='-std=c++17'` to build using C++17. Instead of
|
||||
passing this flag you can add this line to your root `.bazelrc` file:
|
||||
|
||||
```
|
||||
build --cxxopt='-std=c++17'
|
||||
```
|
||||
|
||||
Congratulations! You've created your first binary using TCMalloc.
|
||||
|
||||
## What's Next
|
||||
|
||||
* Read our [overview](overview.md), if you haven't already. The overview
|
||||
covers memory allocation concepts and best practices for using TCMalloc.
|
||||
* Read through the TCMalloc [reference](reference.md) for information on the
|
||||
behavior of `malloc()`, `::operator new`, and other allocation/deallocation
|
||||
routines in TCMalloc.
|
||||
* Consult the TCMalloc C++ `malloc_extension.h` header file, which contains
|
||||
information on TCMalloc's supported extensions.
|
||||
* Read our [contribution guidelines](../CONTRIBUTING.md), if you intend to
|
||||
submit code to our repository.
|
||||
244
src/third_party/tcmalloc/dist/docs/reference.md
vendored
Normal file
@ -0,0 +1,244 @@
|
||||
# TCMalloc Basic Reference
|
||||
|
||||
TCMalloc provides implementations for C and C++ library memory management
|
||||
routines (`malloc()`, etc.) provided within the C and C++ standard libraries.
|
||||
|
||||
Currently, TCMalloc requires code that conforms to the C11 C standard library
|
||||
and the C++11, C++14, or C++17 C++ standard library.
|
||||
|
||||
NOTE: although the C API in this document is specific to the C language, the
|
||||
entire TCMalloc API itself is designed to be callable directly within C++ code
|
||||
(and we expect most usage to be from C++). The documentation in this section
|
||||
assumes C constructs (e.g. `size_t`) though invocations using equivalent C++
|
||||
constructs of aliased types (e.g. `std::size_t`) are intrinsically supported.
|
||||
|
||||
## C++ API
|
||||
|
||||
We implement the variants of `operator new` and `operator delete` from the
|
||||
C++11, C++14, C++17 standards exposed within the `<new>` header file. This
|
||||
includes:
|
||||
|
||||
* The basic `::operator new()`, `::operator delete()`, and array variant
|
||||
functions.
|
||||
* C++14's sized `::operator delete()`
|
||||
* C++17's overaligned `::operator new()` and `::operator delete()` functions.
|
||||
As required by the C++ standard, memory allocated using an aligned `operator
|
||||
new` function must be deallocated with an aligned `operator delete`.
|
||||
|
||||
### `::operator new` / `::operator new[]`
|
||||
|
||||
```
|
||||
void* operator new(std::size_t count);
|
||||
void* operator new(std::size_t count, const std::nothrow_t& tag) noexcept;
|
||||
void* operator new(std::size_t count, std::align_val_t al); // C++17
|
||||
void* operator new(std::size_t count,
|
||||
std::align_val_t al, const std::nothrow_t&) noexcept; // C++17
|
||||
|
||||
void* operator new[](std::size_t count);
|
||||
void* operator new[](std::size_t count, const std::nothrow_t& tag) noexcept;
|
||||
void* operator new[](std::size_t count, std::align_val_t al); // C++17
|
||||
void* operator new[](std::size_t count,
|
||||
std::align_val_t al, const std::nothrow_t&) noexcept; // C++17
|
||||
```
|
||||
|
||||
`operator new`/`operator new[]` allocate `count` bytes. They may be invoked
|
||||
directly but are more commonly invoked as part of a *new*-expression.
|
||||
|
||||
When `__STDCPP_DEFAULT_NEW_ALIGNMENT__` is not specified (or is larger than 8
|
||||
bytes), we use standard 16 byte alignments for `::operator new` without a
|
||||
`std::align_val_t` argument. However, for allocations under 16 bytes, we may
|
||||
return an object with a lower alignment, as no object with a larger alignment
|
||||
requirement can be allocated in the space. When compiled with
|
||||
`__STDCPP_DEFAULT_NEW_ALIGNMENT__ <= 8`, we use a set of sizes aligned to 8
|
||||
bytes for raw storage allocated with `::operator new`.
|
||||
|
||||
NOTE: On many platforms, the value of `__STDCPP_DEFAULT_NEW_ALIGNMENT__` can be
|
||||
configured by the `-fnew-alignment=...` flag.
|
||||
|
||||
The `std::align_val_t` variants provide storage suitably aligned to the
|
||||
requested alignment.
|
||||
|
||||
If the allocation is unsuccessful, a failure terminates the program.
|
||||
|
||||
NOTE: unlike in the C++ standard, we do not throw an exception in case of
|
||||
allocation failure, or invoke `std::get_new_handler()` repeatedly in an attempt
|
||||
to successfully allocate, but instead crash directly. Such behavior can be used
|
||||
as a performance optimization for move constructors not currently marked
|
||||
`noexcept`; such move operations can be allowed to fail directly due to
|
||||
allocation failures. Within Abseil code, these direct allocation failures are
|
||||
enabled with the Abseil build-time configuration macro
|
||||
[`ABSL_ALLOCATOR_NOTHROW`](https://abseil.io/docs/cpp/guides/base#abseil-exception-policy).
|
||||
|
||||
If the `std::nothrow_t` variant is utilized, upon failure, `::operator new`
|
||||
will return `nullptr` instead.
|
||||
|
||||
### `::operator delete` / `::operator delete[]`
|
||||
|
||||
```
|
||||
void operator delete(void* ptr) noexcept;
|
||||
void operator delete(void* ptr, std::size_t sz) noexcept;
|
||||
void operator delete(void* ptr, std::align_val_t al) noexcept;
|
||||
void operator delete(void* ptr, std::size_t sz,
|
||||
std::align_val_t al) noexcept;
|
||||
|
||||
void operator delete[](void* ptr) noexcept;
|
||||
void operator delete[](void* ptr, std::size_t sz) noexcept; // C++14
|
||||
void operator delete[](void* ptr, std::align_val_t al) noexcept; // C++17
|
||||
void operator delete[](void* ptr, std::size_t sz,
|
||||
std::align_val_t al) noexcept; // C++17
|
||||
```
|
||||
|
||||
`::operator delete`/`::operator delete[]` deallocate memory previously allocated
|
||||
by a corresponding `::operator new`/`::operator new[]` call respectively. It is
|
||||
commonly invoked as part of a *delete*-expression.
|
||||
|
||||
Sized delete is used as a critical performance optimization, eliminating the
|
||||
need to perform a costly pointer-to-size lookup.
|
||||
|
||||
### Extensions
|
||||
|
||||
We also expose a prototype of
|
||||
[P0901](http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2019/p0901r5.html) in
|
||||
https://github.com/google/tcmalloc/blob/master/tcmalloc/malloc_extension.h with
|
||||
`tcmalloc_size_returning_operator_new()`. This returns both memory and the size
|
||||
of the allocation in bytes. It can be freed with `::operator delete`.
|
||||
|
||||
## C API
|
||||
|
||||
The C standard library specifies the API for dynamic memory management within
|
||||
the `<stdlib.h>` header file. Implementations require C11 or greater.
|
||||
|
||||
TCMalloc provides implementation for the following C API functions:
|
||||
|
||||
* `malloc()`
|
||||
* `calloc()`
|
||||
* `realloc()`
|
||||
* `free()`
|
||||
* `aligned_alloc()`
|
||||
|
||||
For `malloc`, `calloc`, and `realloc`, we obey the behavior of C90 DR075 and
|
||||
[DR445](http://www.open-std.org/jtc1/sc22/wg14/www/docs/summary.htm#dr_445)
|
||||
which states:
|
||||
|
||||
> The alignment requirement still applies even if the size is too small for any
|
||||
> object requiring the given alignment.
|
||||
|
||||
In other words, `malloc(1)` returns an `alignof(std::max_align_t)`-aligned pointer.
|
||||
Based on the progress of
|
||||
[N2293](http://www.open-std.org/jtc1/sc22/wg14/www/docs/n2293.htm), we may relax
|
||||
this alignment in the future.
|
||||
|
||||
Additionally, TCMalloc provides an implementation for the following POSIX
|
||||
standard library function, available within glibc:
|
||||
|
||||
* `posix_memalign()`
|
||||
|
||||
TCMalloc also provides implementations for the following obsolete functions
|
||||
typically provided within libc implementations:
|
||||
|
||||
* `cfree()`
|
||||
* `memalign()`
|
||||
* `valloc()`
|
||||
* `pvalloc()`
|
||||
|
||||
Documentation is not provided for these obsolete functions. The implementations
|
||||
are provided only for compatibility purposes.
|
||||
|
||||
### `malloc()`
|
||||
|
||||
```
|
||||
void* malloc(size_t size);
|
||||
```
|
||||
|
||||
`malloc` allocates `size` bytes of memory and returns a `void *` pointer to the
|
||||
start of that memory.
|
||||
|
||||
`malloc(0)` returns a non-NULL zero-sized pointer. (Attempting to access memory
|
||||
at this location is undefined.) If `malloc()` fails for some reason, it returns
|
||||
NULL.
|
||||
|
||||
### `calloc()`
|
||||
|
||||
```
|
||||
void* calloc(size_t num, size_t size);
|
||||
```
|
||||
|
||||
`calloc()` allocates memory for an array of objects, zero-initializes all bytes
|
||||
in allocated storage, and if allocation succeeds, returns a pointer to the first
|
||||
byte in the allocated memory block.
|
||||
|
||||
`calloc(num, 0)` or `calloc(0, size)` returns a non-NULL zero-sized pointer.
|
||||
(Attempting to access memory at this location is undefined.) If `calloc()` fails
|
||||
for some reason, it returns NULL.
|
||||
|
||||
### `realloc()`
|
||||
|
||||
```
|
||||
void* realloc(void *ptr, size_t new_size);
|
||||
```
|
||||
|
||||
`realloc()` re-allocates memory for an existing region of memory by either
|
||||
expanding or contracting the memory based on the passed `new_size` in bytes,
|
||||
returning a `void*` pointer to the start of that memory (which may not change);
|
||||
it does not perform any initialization of new areas of memory.
|
||||
|
||||
`realloc(OBJ*, 0)` returns a NULL pointer. If `realloc()` fails for some reason,
|
||||
it also returns NULL.
|
||||
|
||||
### `aligned_alloc()`
|
||||
|
||||
```
|
||||
void* aligned_alloc(size_t alignment, size_t size);
|
||||
```
|
||||
|
||||
`aligned_alloc()` allocates `size` bytes of memory with alignment of size
|
||||
`alignment` and returns a `void *` pointer to the start of that memory; it does
|
||||
not perform any initialization.
|
||||
|
||||
The `size` parameter must be an integral multiple of `alignment` and `alignment`
|
||||
must be a power of two. If either of these cases is not satisfied,
|
||||
`aligned_alloc()` will fail and return a NULL pointer.
|
||||
|
||||
`aligned_alloc` with `size=0` returns a non-NULL zero-sized pointer. (Attempting
|
||||
to access memory at this location is undefined.)
|
||||
|
||||
### `posix_memalign()`
|
||||
|
||||
```
|
||||
int posix_memalign(void **memptr, size_t alignment, size_t size);
|
||||
```
|
||||
|
||||
`posix_memalign()`, like `aligned_alloc()` allocates `size` bytes of memory with
|
||||
alignment of size `alignment` to the start of memory pointed to by `**memptr`;
|
||||
it does not perform any initialization. This pointer can be cast to the desired
|
||||
type of data pointer in order to be dereferenceable. If the alignment allocation
|
||||
succeeds, `posix_memalign()` returns `0`; otherwise it returns an error value.
|
||||
|
||||
`posix_memalign` is similar to `aligned_alloc()` but `alignment` must be a power of
|
||||
two multiple of `sizeof(void *)`. If the constraints are not satisfied,
|
||||
`posix_memalign()` will fail.
|
||||
|
||||
`posix_memalign` with `size=0` returns a non-NULL zero-sized pointer.
|
||||
(Attempting to access memory at this location is undefined.)
|
||||
|
||||
### `free()`
|
||||
|
||||
```
|
||||
void free(void* ptr);
|
||||
```
|
||||
|
||||
`free()` deallocates memory previously allocated by `malloc()`, `calloc()`,
|
||||
`aligned_alloc()`, `posix_memalign()`, or `realloc()`. If `free()` is passed a
|
||||
null pointer, the function does nothing.
|
||||
|
||||
### Extensions
|
||||
|
||||
These are contained in
|
||||
https://github.com/google/tcmalloc/blob/master/tcmalloc/malloc_extension.h.
|
||||
|
||||
* `nallocx(size_t size, int flags)` - Returns the number of bytes that would
|
||||
be allocated by `malloc(size)`, subject to the alignment specified in
|
||||
`flags`.
|
||||
* `sdallocx(void* ptr, size_t size, int flags)` - Deallocates memory allocated
|
||||
by `malloc` or `memalign`. It takes a size parameter to pass the original
|
||||
allocation size, improving deallocation performance.
|
||||
154
src/third_party/tcmalloc/dist/docs/regions-are-not-optional.md
vendored
Normal file
@ -0,0 +1,154 @@
|
||||
# Regions Are Not Optional!
|
||||
|
||||
Andrew Hunter
|
||||
|
||||
Discussion on the design of [Temeraire](temeraire.md) posited that `HugeRegion`
|
||||
is a weird/complex feature that possibly is a premature optimization.
|
||||
`HugeRegion` is neither optional, nor really all that complex. We claim this is
|
||||
actually a fairly simple approach that fixes what would otherwise be a very
|
||||
serious flaw.
|
||||
|
||||
This expands on the description of `HugeRegion` in the main design doc.
|
||||
|
||||
## Our Trilemma
|
||||
|
||||
`HugeRegion` exists because of three key framing requirements for a
|
||||
Temeraire-enabled TCMalloc:
|
||||
|
||||
1. We must support allocations of any (reasonable) size, and in particular a
|
||||
heap composed of any set of reasonable sizes in any ratio; "sorry, tcmalloc
|
||||
detonates if you mostly use requests of size X" is not acceptable.
|
||||
1. We must be able to back (most, ideally all) of our heap with hugepages.
|
||||
1. We would like to tightly bound global space overhead[^1] on our heap.
|
||||
|
||||
Consider requests R<sub>i</sub> that are larger than a hugepage, but small
|
||||
enough that the rounding error from extending to a hugepage boundary is
|
||||
significant by (3). (Note that rounding up to a hugepage boundary would
|
||||
introduce a significant amount of overhead for allocations between 1 and 10
|
||||
hugepages, and the overhead could still be considered significant for
|
||||
allocations larger than that.)
|
||||
|
||||
* We *cannot* unback the unused tail of the last hugepage (requirement (2)
|
||||
would be violated).
|
||||
* We *cannot* assume these requests are necessarily rare and we will have many
|
||||
smaller ones to fill the unused tail (requirement (1) would be violated).
|
||||
Moreover this is **empirically false** for widely used
|
||||
binaries.
|
||||
|
||||
In summary, we must be able to use the unused tail of a hugepage from one
|
||||
R<sub>i</sub> as space for another large R<sub>j</sub>. If we do not enable such
|
||||
usage in our allocator, we will either potentially have space overhead of up to
|
||||
100%, or dramatically reduce our hugepage usage. The conclusion we came to is
|
||||
that we **must support**, in some form, allocating multiple such R<sub>i</sub>
|
||||
contiguously; that is, using the unused tail from R<sub>1</sub> as the beginning
|
||||
of R<sub>2</sub> and so on.
|
||||
|
||||
**This is all `HugeRegion{,Set}` does.**
|
||||
|
||||
## The "Simple" Truth
|
||||
|
||||
The above argument is why we have `HugeRegion`: we need a way to allocate
|
||||
multiple large (>1 hugepage) allocations on overlapping hugepages. So how can we
|
||||
do that? Clearly, we need some range of hugepages, large enough for several such
|
||||
R<sub>i</sub>, from which we allocate. What should we do in that space? A
|
||||
best-fit algorithm that tracks the free lengths seems appropriate.
|
||||
|
||||
As allocations become free, it seems reasonable (by requirement (3) above) that
|
||||
we unback empty hugepages.
|
||||
|
||||
Finally, what happens if the range we allocated is full? We could do two
|
||||
things
|
||||
|
||||
1. extend it
|
||||
1. obtain a new one and do allocations from there as needed.
|
||||
|
||||
(1) is an interesting choice, but not actually possible with the `SysAllocator`
|
||||
interface. We might get lucky with `sbrk` (or even `mmap`, though it is less
|
||||
likely) placement choice, but we also might not; we cannot rely on it. So we
|
||||
must be able to fall back to (2) anyway, and given that there are very few
|
||||
disadvantages to having multiple such ranges (we won’t need very many in any
|
||||
case), why not just only do that?
|
||||
|
||||
It should not be surprising that we have just described the algorithm
|
||||
`HugeRegion{,Set}` uses: inside some fixed-size range, do best-fit allocation
|
||||
for large allocations, backing and unbacking hugepages on demand. When one
|
||||
region fills, obtain another; fill from the most fragmented to bound total
|
||||
overhead (a policy derived from `HugePageFiller`).
|
||||
|
||||
That is *really it*. We do not see this as particularly complicated. The only
|
||||
thing left is the implementation of that policy: We used `RangeTracker` because
|
||||
it was convenient, supported exactly the API we needed, and fast enough (even
|
||||
though we're tracking fairly large bitsets).
|
||||
|
||||
## But what about...
|
||||
|
||||
There are some reasonable objections to particular details, which we are happy
|
||||
to address.
|
||||
|
||||
### Why are regions so big?
|
||||
|
||||
Because it worked. Virtual address space is virtually free. :) We can easily
|
||||
justify why they aren’t 32 MiB (our original choice, as it happens):
|
||||
[Temeraire](temeraire.md) contains a simple argument: it is trivial to waste a
|
||||
full hugepage per region, and this scales down nicely with increasing region
|
||||
size. Why did we go to a gigabyte? Because it worked. :) It had an added
|
||||
advantage: even large binaries would only use a handful of regions, and thus
|
||||
walking the list was cheap and we could print a lot of info about each in
|
||||
mallocz.
|
||||
|
||||
We've run more tests; 128 MiB and 512 MiB both perform reasonably, but this
|
||||
isn't a compelling reason to change the size. We don't really support VSS limits
|
||||
(and in practice we don't have them, outside badly behaved sandbox programs and
|
||||
some daemons that use `SMALL_BUT_SLOW` anyway, which we're not currently
|
||||
changing).
|
||||
|
||||
### How did we pick the current policy for what goes to regions?
|
||||
|
||||
Because it worked. The arguments above make it clear that anything larger than
|
||||
one hugepage and smaller than <some value we can agree is many> hugepages
|
||||
must go there. It seemed reasonable to allow slightly smaller ones to slip into
|
||||
the region if we had space and it was needed; we saw no reason not to allow
|
||||
many-hugepage allocations there if they fit. In practice, this seems to work
|
||||
well. There really isn’t more thought than that.
|
||||
|
||||
### Can’t we fix binaries with problematic allocation patterns?
|
||||
|
||||
Yes, we can. We probably should. It'd be good to do anyway. However: doing so
|
||||
doesn’t stop us from needing Regions:
|
||||
|
||||
* Changing workloads takes a long time.
|
||||
* We cannot successfully change all the programs that make any significant
|
||||
use of allocations >2 MiB and less than (say) 50 MiB. We cannot tell
|
||||
users "Eh, no, tcmalloc does terribly if you allocate a couple megabytes at
|
||||
a time?" Requirement (1) above is our expression of how we don't think
|
||||
that's reasonable at all: we should be able to handle 3 MiB allocations without
|
||||
embarrassing ourselves.
|
||||
|
||||
Recall that the trilemma leading to regions applies for **anything more than 2
|
||||
MiB which we can't just ignore the tail on**. It's easiest to show the potential
|
||||
huge problems with the canonical "2.1 MiB" allocation, but 5 MiB or 6.1 MiB or
|
||||
even 10.1 MiB allocations, if they're a significant component of heap usage,
|
||||
will lead to unacceptable overhead without `HugeRegion`, and we don't think we
|
||||
can say "don't do that."
|
||||
|
||||
## Conclusion
|
||||
|
||||
`HugeRegion` is the simplest possible solution we've found to a pressing problem
|
||||
in a hugepage-oriented allocator. When you read the [design doc](temeraire.md),
|
||||
please don't assume that HugeRegion is a speculative fix for a potential
|
||||
problem, that we might not need, nor that it's a roughed out attempt. This is a
|
||||
key part of the algorithm, and one we've thought a lot about the best fix for.
|
||||
We don't claim it is perfect or that we have surely hit on the best fix, but
|
||||
"nothing" is not an acceptable solution. This gets reasonable space performance
|
||||
with badly sized allocations.
|
||||
|
||||
**In short, `HugeRegion` is neither optional nor particularly complex. Having it
|
||||
produces dramatic savings in a number of realistic scenarios, and costs us very
|
||||
little.**
|
||||
|
||||
## Notes
|
||||
|
||||
[^1]: What our designed bound of overhead is...a very interesting question.
|
||||
Different places accept different forms of overhead. While we could target
|
||||
the current overhead, we can and must do better than this. One goal of
|
||||
Temeraire is to dramatically cut this (in the pageheap).
|
||||
424
src/third_party/tcmalloc/dist/docs/rseq.md
vendored
Normal file
@ -0,0 +1,424 @@
|
||||
# Restartable Sequence Mechanism for TCMalloc
|
||||
|
||||
<!--*
|
||||
# Document freshness: For more information, see go/fresh-source.
|
||||
freshness: { owner: 'ckennelly' reviewed: '2022-12-14' }
|
||||
*-->
|
||||
|
||||
## per-CPU Caches
|
||||
|
||||
TCMalloc implements its per-CPU caches using restartable sequences (`man
|
||||
rseq(2)`) on Linux. This kernel feature was developed by
|
||||
[Paul Turner and Andrew Hunter at Google](http://www.linuxplumbersconf.net/2013/ocw//system/presentations/1695/original/LPC%20-%20PerCpu%20Atomics.pdf)
|
||||
and Mathieu Desnoyers at EfficiOS. Restartable sequences let us execute a region
|
||||
to completion (atomically with respect to other threads on the same CPU) or to
|
||||
be aborted if interrupted by the kernel by preemption, interrupts, or signal
|
||||
handling.
|
||||
|
||||
Choosing to restart on migration across cores or preemption allows us to
|
||||
optimize the common case - we stay on the same core - by avoiding atomics, over
|
||||
the more rare case - we are actually preempted. As a consequence of this
|
||||
tradeoff, we need to make our code paths actually support being restarted. The
|
||||
entire sequence, except for its final store to memory which *commits* the
|
||||
change, must be capable of starting over.
|
||||
|
||||
This carries a few implementation challenges:
|
||||
|
||||
* We need fine-grained control over the generated assembly, to ensure stores
|
||||
are not reordered in unsuitable ways.
|
||||
* The restart sequence is triggered if the kernel detects a context switch
|
||||
occurred with the PC in the restartable sequence code. If this happens,
|
||||
instead of restarting at this PC, it restarts the thread at an abort
|
||||
sequence, the abort sequence determines the interrupted restartable
|
||||
sequence, and then returns control to the entry point of this sequence.
|
||||
|
||||
We must preserve adequate state to successfully restart the code sequence.
|
||||
In particular, we must preserve the function parameters so that we can
|
||||
restart the sequence with the same conditions; next we must reload any
|
||||
parameters like the CPU ID, and recompute any necessary values.
|
||||
|
||||
## Structure of the `TcmallocSlab`
|
||||
|
||||
In per-CPU mode, we allocate an array of `N` `TcmallocSlab::Slabs`. For all
|
||||
operations, we index into the array with the logical CPU ID.
|
||||
|
||||
Each slab has a header region of control data (one 8-byte header per-size
|
||||
class). These index into the remainder of the slab, which contains pointers to
|
||||
free listed objects.
|
||||
|
||||

|
||||
|
||||
In
|
||||
[C++ code](https://github.com/google/tcmalloc/blob/master/tcmalloc/internal/percpu_tcmalloc.h),
|
||||
these are represented as:
|
||||
|
||||
```
|
||||
struct Slabs {
|
||||
std::atomic<int64_t> header[NumClasses];
|
||||
void* mem[((1ul << Shift) - sizeof(header)) / sizeof(void*)];
|
||||
};
|
||||
|
||||
// Slab header (packed, atomically updated 64-bit).
|
||||
// All {begin, current, end} values are pointer offsets from per-CPU region
|
||||
// start. The slot array is in [begin, end), and the occupied slots are in
|
||||
// [begin, current).
|
||||
struct Header {
|
||||
// The end offset of the currently occupied slots.
|
||||
uint16_t current;
|
||||
// Copy of end. Updated by Shrink/Grow, but is not overwritten by Drain.
|
||||
uint16_t end_copy;
|
||||
// Lock updates only begin and end with a 32-bit write.
|
||||
|
||||
// The begin offset of the slot array for this size class.
|
||||
uint16_t begin;
|
||||
// The end offset of the slot array for this size class.
|
||||
uint16_t end;
|
||||
|
||||
// Lock is used by Drain to stop concurrent mutations of the Header.
|
||||
// Lock sets begin to 0xffff and end to 0, which makes Push and Pop fail
|
||||
// regardless of current value.
|
||||
bool IsLocked() const;
|
||||
void Lock();
|
||||
};
|
||||
|
||||
```
|
||||
|
||||
The atomic `header` allows us to read the state (esp. for telemetry purposes) of
|
||||
a core without undefined behavior.
|
||||
|
||||
The fields in `Header` are indexed in `sizeof(void*)` strides into the slab. For
|
||||
the default value of `Shift=18`, this allows us to cache nearly 32K objects per
|
||||
CPU. Ongoing work encodes `Slabs*` and `Shift` into a single pointer, allowing
|
||||
it to be dynamically updated at runtime.
|
||||
|
||||
We have allocated capacity for `end-begin` objects for a given size-class.
|
||||
`begin` is chosen via static partitioning at initialization time. `end` is
|
||||
chosen dynamically at a higher-level (in `tcmalloc::CPUCache`), as to:
|
||||
|
||||
* Avoid running into the next size-classes' `begin`
|
||||
* Balance cached object capacity across size-classes, according to the
|
||||
specified byte limit.
|
||||
|
||||
## Usage: Allocation
|
||||
|
||||
As the first operation, we can look at allocation, which needs to read the
|
||||
pointer at index `current-1`, return that object, and decrement `current`.
|
||||
Decrementing `current` is the *commit* operation.
|
||||
|
||||
In pseudo-C++, this looks like:
|
||||
|
||||
```
|
||||
void* TcmallocSlab_Pop(
|
||||
void *slabs,
|
||||
size_t size_class,
|
||||
UnderflowHandler underflow_handler) {
|
||||
// Expanded START_RSEQ macro...
|
||||
restart:
|
||||
__rseq_abi.rseq_cs = &__rseq_cs_TcmallocSlab_Pop;
|
||||
start:
|
||||
// Actual sequence
|
||||
uint64_t cpu_id = __rseq_abi.cpu_id;
|
||||
Header* hdr = &slabs[cpu_id].header[size_class];
|
||||
uint64_t current = hdr->current;
|
||||
uint64_t begin = hdr->begin;
|
||||
if (ABSL_PREDICT_FALSE(current <= begin)) {
|
||||
goto underflow;
|
||||
}
|
||||
|
||||
void* next = *(&slabs[cpu_id] + current * sizeof(void*) - 2 * sizeof(void*));
|
||||
prefetcht0(next);
|
||||
|
||||
void* ret = *(&slabs[cpu_id] + current * sizeof(void*) - sizeof(void*));
|
||||
--current;
|
||||
hdr->current = current;
|
||||
commit:
|
||||
return ret;
|
||||
underflow:
|
||||
return underflow_handler(cpu_id, size_class);
|
||||
}
|
||||
|
||||
// This is implemented in assembly, but for exposition.
|
||||
ABSL_CONST_INIT kernel_rseq_cs __rseq_cs_TcmallocSlab_Pop = {
|
||||
.version = 0,
|
||||
.flags = 0,
|
||||
.start_ip = &&start,
|
||||
.post_commit_offset = &&commit - &&start,
|
||||
.abort_ip = &&abort,
|
||||
};
|
||||
```
|
||||
|
||||
`__rseq_cs_TcmallocSlab_Pop` is a read-only data structure, which contains
|
||||
metadata about this particular restartable sequence. When the kernel preempts
|
||||
the current thread, it examines this data structure. If the current instruction
|
||||
pointer is between `[start, commit)`, it returns control to a specified,
|
||||
per-sequence restart header at `abort`.
|
||||
|
||||
Since the *next* object is frequently allocated soon after the current object,
|
||||
the allocation path prefetches the pointed-to object. To avoid prefetching a
|
||||
wild address, we populate `slabs[cpu][begin]` for each CPU/size-class with a
|
||||
pointer-to-self.
|
||||
|
||||
This sequence terminates with the *single* committing store to `hdr->current`.
|
||||
If we are migrated or otherwise interrupted, we restart the preparatory steps,
|
||||
as the values of `cpu_id`, `current`, `begin` may have changed.
|
||||
|
||||
These operations work on a single core's data and are executed on that core.
|
||||
From a memory ordering perspective, loads and stores need to appear on that core
|
||||
in program order.
|
||||
|
||||
### Restart Handling
|
||||
|
||||
The `abort` label is distinct from `restart`. The `rseq` API provided by the
|
||||
kernel (see below) requires a "signature" (typically an intentionally invalid
|
||||
opcode) in the 4 bytes prior to the restart handler. We form a small
|
||||
trampoline - properly signed - to jump back to `restart`.
|
||||
|
||||
In x86 assembly, this looks like:
|
||||
|
||||
```
|
||||
// Encode nop with RSEQ_SIGNATURE in its padding.
|
||||
.byte 0x0f, 0x1f, 0x05
|
||||
.long RSEQ_SIGNATURE
|
||||
.local TcmallocSlab_Push_trampoline
|
||||
.type TcmallocSlab_Push_trampoline,@function
|
||||
TcmallocSlab_Push_trampoline:
|
||||
abort:
|
||||
jmp restart
|
||||
```
|
||||
|
||||
This ensures that the 4 bytes prior to `abort` match up with the signature that
|
||||
was configured with the `rseq` syscall.
|
||||
|
||||
On x86, we can represent this with a nop which would allow for interleaving in
|
||||
the main implementation. On other platforms - with fixed width instructions -
|
||||
the signature is often chosen to be an illegal/trap instruction, so it has to be
|
||||
disjoint from the function's body.
|
||||
|
||||
## Usage: Deallocation
|
||||
|
||||
Deallocation uses two stores, one to store the deallocated object and another to
|
||||
update `current`. This is still compatible with the restartable sequence
|
||||
technique, as there is a *single* commit step, updating `current`. Any preempted
|
||||
sequences will overwrite the value of the deallocated object until a successful
|
||||
sequence commits it by updating `current`.
|
||||
|
||||
```
|
||||
int TcmallocSlab_Push(
|
||||
void *slabs,
|
||||
size_t size_class,
|
||||
void* item,
|
||||
OverflowHandler overflow_handler) {
|
||||
// Expanded START_RSEQ macro...
|
||||
restart:
|
||||
__rseq_abi.rseq_cs = &__rseq_cs_TcmallocSlab_Push;
|
||||
start:
|
||||
// Actual sequence
|
||||
uint64_t cpu_id = __rseq_abi.cpu_id;
|
||||
Header* hdr = &slabs[cpu_id].header[size_class];
|
||||
uint64_t current = hdr->current;
|
||||
uint64_t end = hdr->end;
|
||||
if (ABSL_PREDICT_FALSE(current >= end)) {
|
||||
goto overflow;
|
||||
}
|
||||
|
||||
*(&slabs[cpu_id] + current * sizeof(void*) - sizeof(void*)) = item;
|
||||
current++;
|
||||
hdr->current = current;
|
||||
commit:
|
||||
return;
|
||||
overflow:
|
||||
return overflow_handler(cpu_id, size_class, item);
|
||||
}
|
||||
```
|
||||
|
||||
## Initialization of the Slab
|
||||
|
||||
To reduce metadata demands, we lazily initialize the slabs, relying on the
|
||||
kernel to provide zeroed pages from the `mmap` call to obtain memory for the
|
||||
slab metadata.
|
||||
|
||||
At startup, this leaves every `Header` initialized to `current = begin =
|
||||
end = 0`. The initial push or pop will trigger the overflow or underflow paths
|
||||
(respectively), so that we can populate these values.
|
||||
|
||||
## More Complex Operations: Batches
|
||||
|
||||
When the cache underflows or overflows, we populate or remove a full batch of objects
|
||||
obtained from inner caches. This amortizes some of the lock acquisition/logic
|
||||
for those caches. Using a similar approach to push and pop, we read/write a
|
||||
batch of `N` items and we update `current` to commit the operation.
|
||||
|
||||
## Kernel API and implementation
|
||||
|
||||
This section contains notes on the rseq API provided by the kernel, which is not
|
||||
well documented, and code pointers for how it is implemented.
|
||||
|
||||
The `rseq` syscall is implemented by
|
||||
[`sys_rseq`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L304-L366).
|
||||
It starts by
|
||||
[handling](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L312-L328)
|
||||
the case where the thread wants to unregister, implementing that by clearing the
|
||||
[rseq information](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/include/linux/sched.h#L1188-L1189)
|
||||
out of the `task_struct` for the thread running
|
||||
[on the current CPU](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/arch/x86/include/asm/current.h#L11-L18).
|
||||
It then moves on to
|
||||
[return an error](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L333-L345)
|
||||
if the thread is already registered for rseq. Then it
|
||||
[validates](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L347-L355)
|
||||
and
|
||||
[saves](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L356-L357)
|
||||
the input from the user, and
|
||||
[sets](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L358-L363)
|
||||
the
|
||||
[`TIF_NOTIFY_RESUME` flag](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/include/linux/sched.h#L2044-L2048)
|
||||
for the thread.
|
||||
|
||||
### Restarts
|
||||
|
||||
Among other things, the user's input to the `rseq` syscall is used by
|
||||
`rseq_ip_fixup` to
|
||||
[decide](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L232-L238)
|
||||
whether we're in a critical section and if so
|
||||
[restart](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L247)
|
||||
at the abort point. That function is
|
||||
[called](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L271)
|
||||
by `__rseq_handle_notify_resume`, which is
|
||||
[documented](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L251-L261)
|
||||
as needing to be called after preemption or signal delivery before returning to
|
||||
the user. That in turn is called by
|
||||
[`rseq_handle_notify_resume`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/include/linux/sched.h#L2052-L2057),
|
||||
a simple wrapper that bails if rseq is not enabled for the thread.
|
||||
|
||||
Here is one path that causes us to wind up here on x86:
|
||||
|
||||
* [`rseq_signal_deliver`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/include/linux/sched.h#L2065)
|
||||
* [`setup_rt_frame`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/arch/x86/kernel/signal.c#L690-L691)
|
||||
* [`handle_signal`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/arch/x86/kernel/signal.c#L746)
|
||||
* [`arch_do_signal_or_restart`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/arch/x86/kernel/signal.c#L812-L813)
|
||||
* [`handle_signal_work`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/entry/common.c#L147)
|
||||
* [`exit_to_user_mode_loop`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/entry/common.c#L171)
|
||||
* [`exit_to_user_mode_prepare`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/entry/common.c#L208)
|
||||
|
||||
So the choke point is the code that returns to user space. Here are some notes
|
||||
on how the restart logic varies based on user input:
|
||||
|
||||
* `rseq_ip_fixup`
|
||||
[calls](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L228)
|
||||
`rseq_get_rseq_cs` every time. That means it
|
||||
[reads](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L123-L124)
|
||||
the
|
||||
[pointer](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/include/uapi/linux/rseq.h#L91-L124)
|
||||
to `struct rseq_cs` and then
|
||||
[indirects](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L131-L133)
|
||||
through it fresh from user memory each time. It
|
||||
[checks](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L135-L145)
|
||||
for invalid cases (which
|
||||
[cause](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L278-L280)
|
||||
a segfault for the user process) and then does
|
||||
[validation](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L147-L157)
|
||||
of the abort IP signature discussed below.
|
||||
|
||||
* Signature validation: from
|
||||
[the code](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L147-L157)
|
||||
linked above we can see that the requirement is that the abort handler
|
||||
specified by `rseq_cs::abort_ip` be preceded by a 32-bit magic integer that
|
||||
[matches](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L152)
|
||||
the one originally provided to and
|
||||
[saved by](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L357)
|
||||
the `rseq` syscall.
|
||||
|
||||
The intent is to avoid turning buffer overflows into arbitrary code
|
||||
execution: if an attacker can write into memory then they can control
|
||||
`rseq_cs::abort_ip`, which is kind of like writing a jump instruction into
|
||||
memory, which can be seen as breaking
|
||||
[W^X](https://en.wikipedia.org/wiki/W%5EX) protections. Instead the kernel
|
||||
has the caller pre-register a magic value from the executable memory that
|
||||
they want to run, under the assumption that an attacker is unlikely to be
|
||||
able to find other usable "gadgets" in executable memory that happen to be
|
||||
preceded by that value.
|
||||
|
||||
It's also worth noting that signals and preemption always
|
||||
[result in](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L238-L242)
|
||||
[clearing](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L197-L210)
|
||||
`rseq::rseq_cs::ptr64` from user space memory on the way out, except in error
|
||||
cases that cause a segfault.
|
||||
|
||||
### CPU IDs
|
||||
|
||||
The other thing `rseq.c` takes care of is writing CPU IDs to user space memory.
|
||||
|
||||
There are two fields in user space that get this information:
|
||||
[`rseq::cpu_id_start`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/include/uapi/linux/rseq.h#L63-L75)
|
||||
and
|
||||
[`rseq::cpu_id`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/include/uapi/linux/rseq.h#L76-L90).
|
||||
The difference between the two is that `cpu_id_start` is always in range,
|
||||
whereas `cpu_id` may contain error values. The kernel provides both in order to
|
||||
support computation of values derived from the CPU ID that happens before
|
||||
entering the critical section. We could do this with one CPU ID, but it would
|
||||
require an extra branch to distinguish "not initialized" from "CPU ID changed
|
||||
after fetching it". On the other hand if (like tcmalloc) you only fetch the CPU
|
||||
ID within a critical section, then you need only one field because you have only
|
||||
one branch: am I initialized. There is no such thing as a CPU mismatch because
|
||||
instead you are just restarted when the CPU ID changes.
|
||||
|
||||
The two CPU ID fields are maintained as follows:
|
||||
|
||||
* [`rseq_update_cpu_id`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L84-L94)
|
||||
writes a CPU ID into each. This is
|
||||
[called](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L274-L275)
|
||||
by `__rseq_handle_notify_resume`, which is discussed above.
|
||||
|
||||
* [`rseq_reset_rseq_cpu_id`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L96-L113)
|
||||
sets the `cpu_id_start` field to zero and the `cpu_id` field to
|
||||
[`RSEQ_CPU_ID_UNINITIALIZED`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/include/uapi/linux/rseq.h#L17)
|
||||
(an out of range value). It is
|
||||
[called](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L322)
|
||||
in the unregister path discussed above.
|
||||
|
||||
## Cross-CPU Operations
|
||||
|
||||
With restartable sequences, we've optimized the fast path for same-CPU
|
||||
operations at the expense of costlier cross-CPU operations. Cross-CPU operations
|
||||
are rare—typically done only to facilitate periodic drains of idle
|
||||
caches—so this is a desirable tradeoff.
|
||||
|
||||
Cross-CPU operations rely on operating system assistance (wrapped in
|
||||
`tcmalloc::tcmalloc_internal::subtle::percpu::FenceCpu`) to interrupt any
|
||||
running restartable sequences on the remote core. When control is returned to
|
||||
the thread running on that core, we have guaranteed that either the restartable
|
||||
sequence that was running has completed *or* that the restartable sequence was
|
||||
preempted.
|
||||
|
||||
We use preemption and "locks" (`TcmallocSlab::Header::Lock`) to ensure that
|
||||
during a particular period, all accesses to the fast path will fail—the
|
||||
cache is both simultaneously "full" and "empty" so all inserts and removes will
|
||||
go to the slow path. Unlike using `sched_setaffinity` to run on a remote core, this
|
||||
approach allows us to perform longer operations, such as taking elements from
|
||||
the cache and inserting them into the `TransferCache` as part of `Drain`, while
|
||||
still maintaining correctness.
|
||||
|
||||
Since we are using relaxed loads and stores, potentially with word-level
|
||||
granularity, our operations need to potentially store part of the needed data to
|
||||
`Header`, fence, and then write additional fields. For example, at the end of
|
||||
`Drain`, we:
|
||||
|
||||
* Store `hdr.current`, leaving `hdr.begin = 0xFFFF` and `hdr.end = 0x0`, ensuring
|
||||
insert and remove operations continue to fail.
|
||||
* `FenceCpu`
|
||||
* Store `hdr.begin` and `hdr.end` to their proper values.
|
||||
|
||||
This sequence ensures that a thread running on the remote core can only see one
|
||||
of:
|
||||
|
||||
* `hdr.current = X`; `hdr.begin = 0xFFFF`; `hdr.end = 0x0`
|
||||
* `hdr.current = Y`; `hdr.begin = 0xFFFF`; `hdr.end = 0x0`
|
||||
* `hdr.current = Y`; `hdr.begin = Y`; `hdr.end = Y`
|
||||
|
||||
`FenceCpu` ensures that after it completes, no thread can see `current=X` any
|
||||
longer.
|
||||
|
||||
If we did a single store or omitted the intervening fence operation, a thread on
|
||||
the remote core could potentially see `hdr.begin = Y < hdr.current = X` and
|
||||
attempt to remove an element from the cache. (This failure would lead to data
|
||||
corruption as the element had already been "deallocated" to the `TransferCache`,
|
||||
essentially triggering a double-free.)
|
||||
64
src/third_party/tcmalloc/dist/docs/sampling.md
vendored
Normal file
@ -0,0 +1,64 @@
|
||||
# How sampling in TCMalloc works.
|
||||
|
||||
## Introduction
|
||||
|
||||
TCMalloc uses sampling to get representative data on memory usage and
|
||||
allocation. How this works is not well documented. This doc attempts to at least
|
||||
partially fix this.
|
||||
|
||||
## Sampling
|
||||
|
||||
We chose to sample an allocation every N bytes where N is a random value using
|
||||
[Sampler::PickNextSamplingPoint()](https://github.com/google/tcmalloc/blob/master/tcmalloc/sampler.cc)
|
||||
with a mean set by the profile sample rate using
|
||||
[MallocExtension::SetProfileSamplingRate()](https://github.com/google/tcmalloc/blob/master/tcmalloc/malloc_extension.h).
|
||||
By default this is every 2MiB.
|
||||
|
||||
## How We Sample Allocations
|
||||
|
||||
When we pick an allocation such as
|
||||
[Sampler::RecordAllocationSlow()](https://github.com/google/tcmalloc/blob/master/tcmalloc/sampler.cc)
|
||||
to sample we do some additional processing around that allocation using
|
||||
[SampleifyAllocation()](https://github.com/google/tcmalloc/blob/master/tcmalloc/allocation_sampling.h) -
|
||||
recording stack, alignment, request size, and allocation size. Then we go
|
||||
through all the active samplers using
|
||||
[ReportMalloc()](https://github.com/google/tcmalloc/blob/master/tcmalloc/allocation_sample.h)
|
||||
and tell them about the allocation. We also tell the span that we're sampling
|
||||
it - we can do this because we do sampling at tcmalloc page sizes, so each
|
||||
sample corresponds to a particular page in the pagemap.
|
||||
|
||||
## How We Free Sampled Objects
|
||||
|
||||
Each sampled allocation is tagged, so we can quickly test whether a particular
|
||||
allocation might be a sample.
|
||||
|
||||
When we are done with the sampled span we release it using
|
||||
[tcmalloc::Span::Unsample()](https://github.com/google/tcmalloc/blob/master/tcmalloc/span.cc).
|
||||
|
||||
## How Do We Handle Heap and Fragmentation Profiling
|
||||
|
||||
To handle heap and fragmentation profiling we just need to traverse the list of
|
||||
sampled objects and compute either their degree of fragmentation, or the amount
|
||||
of heap they consume.
|
||||
|
||||
## How Do We Handle Allocation Profiling
|
||||
|
||||
Allocation profiling reports a list of sampled allocations during a length of
|
||||
time. We start an allocation profile using
|
||||
[MallocExtension::StartAllocationProfiling()](https://github.com/google/tcmalloc/blob/master/tcmalloc/malloc_extension.h),
|
||||
then wait until time has elapsed, then call `Stop` on the token, and report the
|
||||
profile.
|
||||
|
||||
While the allocation sampler is active it is added to the list of samplers for
|
||||
allocations and removed from the list when it is claimed.
|
||||
|
||||
## How Do We Handle Lifetime Profiling
|
||||
|
||||
Lifetime profiling reports a list of object lifetimes as pairs of allocation and
|
||||
deallocation records. Profiling is initiated by calling
|
||||
[MallocExtension::StartLifetimeProfiling()](https://github.com/google/tcmalloc/blob/master/tcmalloc/malloc_extension.h).
|
||||
Profiling continues until `Stop` is invoked on the token. Lifetimes are only
|
||||
reported for objects where allocation *and* deallocation are observed while
|
||||
profiling is active. A description of the sampling based lifetime profiler can
|
||||
be found in Section 4 of
|
||||
["Learning-based Memory Allocation for C++ Server Workloads, ASPLOS 2020"](https://research.google/pubs/pub49008/).
|
||||
961
src/third_party/tcmalloc/dist/docs/stats.md
vendored
Normal file
@ -0,0 +1,961 @@
|
||||
# Understanding Malloc Stats
|
||||
|
||||
## Getting Malloc Stats
|
||||
|
||||
Human-readable statistics can be obtained by calling
|
||||
`tcmalloc::MallocExtension::GetStats()`.
|
||||
|
||||
## Understanding Malloc Stats Output
|
||||
|
||||
### It's A Lot Of Information
|
||||
|
||||
The output contains a lot of information. Much of it can be considered debug
|
||||
info that's interesting to folks who are passingly familiar with the internals
|
||||
of TCMalloc, but potentially not that useful for most people.
|
||||
|
||||
### Summary Section
|
||||
|
||||
The most generally useful section is the first few lines:
|
||||
|
||||
```
|
||||
See https://github.com/google/tcmalloc/tree/master/docs/stats.md for an explanation of this page
|
||||
------------------------------------------------
|
||||
MALLOC: 10858234672 (10355.2 MiB) Bytes in use by application
|
||||
MALLOC: + 827129856 ( 788.8 MiB) Bytes in page heap freelist
|
||||
MALLOC: + 386098400 ( 368.2 MiB) Bytes in central cache freelist
|
||||
MALLOC: + 105330688 ( 100.5 MiB) Bytes in per-CPU cache freelist
|
||||
MALLOC: + 9095680 ( 8.7 MiB) Bytes in transfer cache freelist
|
||||
MALLOC: + 660976 ( 0.6 MiB) Bytes in thread cache freelists
|
||||
MALLOC: + 49333930 ( 47.0 MiB) Bytes in malloc metadata
|
||||
MALLOC: + 629440 ( 0.6 MiB) Bytes in malloc metadata Arena unallocated
|
||||
MALLOC: + 1599704 ( 1.5 MiB) Bytes in malloc metadata Arena unavailable
|
||||
MALLOC: ------------
|
||||
MALLOC: = 12238113346 (11671.2 MiB) Actual memory used (physical + swap)
|
||||
MALLOC: + 704643072 ( 672.0 MiB) Bytes released to OS (aka unmapped)
|
||||
MALLOC: ------------
|
||||
MALLOC: = 12942756418 (12343.2 MiB) Virtual address space used
|
||||
```
|
||||
|
||||
* **Bytes in use by application:** Number of bytes that the application is
|
||||
actively using to hold data. This is computed by the bytes requested from
|
||||
the OS minus any bytes that are held in caches and other internal data
|
||||
structures.
|
||||
* **Bytes in page heap freelist:** The pageheap is a structure that holds
|
||||
memory ready for TCMalloc to use. This memory is not actively being used,
|
||||
and could be returned to the OS. [See TCMalloc tuning](tuning.md)
|
||||
* **Bytes in central cache freelist:** This is the amount of memory currently
|
||||
held in the central freelist. This is a structure that holds partially used
|
||||
"[spans](#more-detail-on-metadata)" of memory. The spans are partially used
|
||||
because some memory has been allocated from them, but not entirely used -
|
||||
since they have some free memory on them.
|
||||
* **Bytes in per-CPU cache freelist:** In per-cpu mode (which is the default)
|
||||
each CPU holds some memory ready to quickly hand to the application. The
|
||||
maximum size of this per-cpu cache is tunable.
|
||||
[See TCMalloc tuning](tuning.md)
|
||||
* **Bytes in transfer cache freelist:** The transfer cache can be considered
|
||||
another part of the central freelist. It holds memory that is ready to be
|
||||
provided to the application for use.
|
||||
* **Bytes in thread cache freelists:** The TC in TCMalloc stands for thread
|
||||
cache. Originally each thread held its own cache of memory to provide to the
|
||||
application. Since the change of default to per-cpu caches, the thread
|
||||
caches are used by very few applications. However, TCMalloc starts in
|
||||
per-thread mode, so there may be some memory left in per-thread caches from
|
||||
before it switches into per-cpu mode.
|
||||
* **Bytes in malloc metadata:** the size of the data structures used for
|
||||
tracking memory allocation. This will grow as the amount of memory used
|
||||
grows.
|
||||
* **Bytes in malloc metadata Arena unallocated:** Metadata is allocated in an
|
||||
internal Arena. Memory requests to the OS are made in blocks which amortize
|
||||
several Arena allocations and this captures memory that is not yet allocated
|
||||
but could be by future Arena allocations.
|
||||
* **Bytes in malloc metadata Arena unavailable:** The Arena allocator may fail
|
||||
to allocate a block fully when a subsequent Arena allocation request is made
|
||||
that is larger than the block's remaining space. This memory is currently
|
||||
unavailable for allocation.
|
||||
|
||||
There are a couple of summary lines:
|
||||
|
||||
* **Actual memory used:** This is the total amount of memory that TCMalloc
|
||||
thinks it is using in the various categories. This is computed from the size
|
||||
of the various areas; the actual contribution to RSS may be larger or
|
||||
smaller than this value. The true RSS may be less if memory is not mapped
|
||||
in. In some cases RSS can be larger if small regions end up being mapped
|
||||
with huge pages. This does not count memory that TCMalloc is not aware of
|
||||
(e.g. memory-mapped files, text segments, etc.)
|
||||
* **Bytes released to OS:** TCMalloc can release memory back to the OS (see
|
||||
[tcmalloc tuning](tuning.md)), and this is the upper bound on the amount of
|
||||
released memory. However, it is up to the OS as to whether the act of
|
||||
releasing the memory actually reduces the RSS of the application. The code
|
||||
uses `MADV_DONTNEED`/`MADV_REMOVE` which tells the OS that the memory is no
|
||||
longer needed.
|
||||
* **Virtual address space used:** This is the amount of virtual address space
|
||||
that TCMalloc believes it is using. This should match the later section on
|
||||
requested memory. There are other ways that an application can increase its
|
||||
virtual address space, and this statistic does not capture them.
|
||||
|
||||
### More Detail On Metadata
|
||||
|
||||
The next section gives some insight into the amount of metadata that TCMalloc is
|
||||
using. This is really debug information, and not very actionable.
|
||||
|
||||
```
|
||||
MALLOC: 236176 Spans in use
|
||||
MALLOC: 238709 ( 10.9 MiB) Spans created
|
||||
MALLOC: 8 Thread heaps in use
|
||||
MALLOC: 46 ( 0.0 MiB) Thread heaps created
|
||||
MALLOC: 13517 Stack traces in use
|
||||
MALLOC: 13742 ( 7.2 MiB) Stack traces created
|
||||
MALLOC: 0 Table buckets in use
|
||||
MALLOC: 2808 ( 0.0 MiB) Table buckets created
|
||||
MALLOC: 11665416 ( 11.1 MiB) Pagemap bytes used
|
||||
MALLOC: 4067336 ( 3.9 MiB) Pagemap root resident bytes
|
||||
```
|
||||
|
||||
* **Spans:** structures that hold multiple [pages](#page-sizes) of allocatable
|
||||
objects.
|
||||
* **Thread heaps:** These are the per-thread structures used in per-thread
|
||||
mode.
|
||||
* **Stack traces:** These hold metadata for each sampled object.
|
||||
* **Table buckets:** These hold data for stack traces for sampled events.
|
||||
* **Pagemap:** This data structure supports the mapping of object addresses to
|
||||
information about the objects held on the page. The pagemap root is a
|
||||
potentially large array, and it is useful to know how much of it is actually
|
||||
memory resident.
|
||||
|
||||
### Realized Fragmentation
|
||||
|
||||
```
|
||||
MALLOC: 12238113346 (11671.2 MiB) Actual memory used at peak
|
||||
MALLOC: 11626207678 (11087.6 MiB) Estimated in-use at peak
|
||||
MALLOC: 5.2632 Realized fragmentation (%)
|
||||
```
|
||||
|
||||
Memory overhead at peak demand is more important than off-peak, since we need to
|
||||
provision a process with sufficient memory to run during its peak requirements
|
||||
without OOM'ing. After a peak in demand, memory may be deallocated and held in
|
||||
caches in anticipation of future reuse. Overhead as a fraction of the remaining
|
||||
live allocations rises, but no additional memory is required.
|
||||
|
||||
This metric is called "realized fragmentation" and described in ["Adaptive
|
||||
Hugepage Subrelease for Non-moving Memory Allocators in Warehouse-Scale
|
||||
Computers"](https://research.google/pubs/pub50436/) (ISMM 2021). The realized
|
||||
fragmentation metric computed here is a snapshot over the life of the entire
|
||||
process.
|
||||
|
||||
These realized fragmentation stats in the summary table indicate a snapshot of
|
||||
conditions when TCMalloc's physical memory usage was at its peak. As of April 2022,
|
||||
the in-use at peak number is estimated from TCMalloc's periodic allocation
|
||||
sampling.
|
||||
|
||||
### Page Sizes
|
||||
|
||||
There are three relevant "page" sizes for systems and TCMalloc. It's important
|
||||
to be able to disambiguate them.
|
||||
|
||||
* **System default page size:** this is not reported by TCMalloc. This is 4KiB
|
||||
on x86. It's not referred to in TCMalloc, and it's not important, but it's
|
||||
important to know that it is different from the sizes of pages used in
|
||||
TCMalloc.
|
||||
* **TCMalloc page size:** This is the basic unit of memory management for
|
||||
TCMalloc. Objects on the same page are the same number of bytes in size.
|
||||
Internally TCMalloc manages memory in chunks of this size. TCMalloc supports
|
||||
4 sizes: 4KiB (small but slow), 8KiB (the default), 32 KiB (large), 256 KiB
|
||||
(256 KiB pages). There are trade-offs around the page sizes:
|
||||
* Smaller page sizes are more memory efficient because we have less
|
||||
fragmentation (i.e. leftover space) when trying to provide the requested
|
||||
amount of memory using 4KiB chunks. It's also more likely that all the
|
||||
objects on a 4KiB page will be freed allowing the page to be returned
|
||||
and used for a different size of data.
|
||||
* Larger pages result in fewer fetches from the page heap to provide a
|
||||
given amount of memory. They also keep allocated objects of the same
|
||||
size in closer proximity.
|
||||
* **TCMalloc hugepage size:** This is the size of a hugepage on the system,
|
||||
for x86 this is 2MiB. This size is used as a unit of management by
|
||||
temeraire, but not used by the pre-temeraire pageheap.
|
||||
|
||||
```
|
||||
MALLOC: 32768 Tcmalloc page size
|
||||
MALLOC: 2097152 Tcmalloc hugepage size
|
||||
```
|
||||
|
||||
### Experiments
|
||||
|
||||
There is an experiment framework embedded into TCMalloc.
|
||||
The enabled experiments are reported as part of the statistics.
|
||||
|
||||
```
|
||||
MALLOC EXPERIMENTS: TCMALLOC_TEMERAIRE=0 TCMALLOC_TEMERAIRE_WITH_SUBRELEASE_V3=0
|
||||
```
|
||||
|
||||
### Actual Memory Footprint
|
||||
|
||||
The output also reports the memory size information recorded by the OS:
|
||||
|
||||
* Bytes resident is the amount of physical memory in use by the application
|
||||
(RSS). This includes things like program text which is excluded from the
|
||||
information that TCMalloc presents.
|
||||
* Bytes mapped is the size of the virtual address space in use by the
|
||||
application (VSS). This can be substantially larger than the virtual memory
|
||||
reported by TCMalloc as applications can increase VSS in other ways. It's
|
||||
also not that useful as a metric since the VSS is a limit to the RSS, but
|
||||
not directly related to the amount of physical memory that the application
|
||||
uses.
|
||||
|
||||
```
|
||||
Total process stats (inclusive of non-malloc sources):
|
||||
TOTAL: 86880677888 (82855.9 MiB) Bytes resident (physical memory used)
|
||||
TOTAL: 89124790272 (84996.0 MiB) Bytes mapped (virtual memory used)
|
||||
```
|
||||
|
||||
### Per Size-Class Information
|
||||
|
||||
Requests for memory are rounded to convenient sizes. For example a request for
|
||||
15 bytes could be rounded to 16 bytes. These sizes are referred to as size
|
||||
classes. There are various caches in TCMalloc where memory gets held, and the per
|
||||
size-class section reports how much memory is being used by cached objects of
|
||||
each size. The columns reported for each size-class are:
|
||||
|
||||
* The size of each object in that size-class.
|
||||
* The number of objects of that size currently held in the per-cpu,
|
||||
per-thread, transfer, and central caches.
|
||||
* The total size of those objects in MiB - i.e. size of each object multiplied
|
||||
by the number of objects.
|
||||
* The cumulative size of that size-class plus all smaller size-classes.
|
||||
* The number of live pages dedicated to this size-class.
|
||||
* The number of returned and requested spans of this size-class.
|
||||
|
||||
```
|
||||
Total size of freelists for per-thread and per-CPU caches,
|
||||
transfer cache, and central cache, as well as number of
|
||||
live pages, returned/requested spans by size-class
|
||||
------------------------------------------------
|
||||
class 1 [ 8 bytes ] : 45645 objs; 0.3 MiB; 0.3 cum MiB; 73 live pages; spans: 19 ret / 92 req = 0.2065;
|
||||
class 2 [ 16 bytes ] : 39942 objs; 0.6 MiB; 1.0 cum MiB; 120 live pages; spans: 3 ret / 123 req = 0.0244;
|
||||
class 3 [ 24 bytes ] : 84130 objs; 1.9 MiB; 2.9 cum MiB; 807 live pages; spans: 1330 ret / 2137 req = 0.6224;
|
||||
class 4 [ 32 bytes ] : 107271 objs; 3.3 MiB; 6.2 cum MiB; 1048 live pages; spans: 420 ret / 1468 req = 0.2861;
|
||||
class 5 [ 40 bytes ] : 82230 objs; 3.1 MiB; 9.3 cum MiB; 790 live pages; spans: 962 ret / 1752 req = 0.5491;
|
||||
...
|
||||
```
|
||||
|
||||
### Central Cache Free List Span Utilization
|
||||
|
||||
The central cache free list manages memory in spans, where each span is a collection
|
||||
of one or more TCMalloc pages. We track a histogram of span utilization, where
|
||||
each column refers to the number of spans with allocated objects less than N.
|
||||
|
||||
```
|
||||
------------------------------------------------
|
||||
Central cache freelist: Span utilization histogram
|
||||
Non-cumulative number of spans with allocated objects < N
|
||||
------------------------------------------------
|
||||
class 1 [ 8 bytes ] : 0 < 1, 0 < 2, 0 < 4, 0 < 8, 0 < 16, 1 < 32, 0 < 64, 1 < 128, 1 < 256, 1 < 512, 0 < 1024, 0 < 2048, 4 < 4096, 16 < 8192, 0 < 16384, 0 < 32768, 0 < 65536
|
||||
class 2 [ 16 bytes ] : 0 < 1, 0 < 2, 0 < 4, 0 < 8, 0 < 16, 0 < 32, 0 < 64, 0 < 128, 0 < 256, 0 < 512, 1 < 1024, 0 < 2048, 47 < 4096, 0 < 8192, 0 < 16384, 0 < 32768, 0 < 65536
|
||||
class 3 [ 24 bytes ] : 0 < 1, 0 < 2, 0 < 4, 0 < 8, 0 < 16, 0 < 32, 0 < 64, 2 < 128, 1 < 256, 3 < 512, 5 < 1024, 127 < 2048, 0 < 4096, 0 < 8192, 0 < 16384, 0 < 32768, 0 < 65536
|
||||
class 4 [ 32 bytes ] : 0 < 1, 0 < 2, 0 < 4, 0 < 8, 0 < 16, 0 < 32, 0 < 64, 0 < 128, 0 < 256, 1 < 512, 0 < 1024, 129 < 2048, 0 < 4096, 0 < 8192, 0 < 16384, 0 < 32768, 0 < 65536
|
||||
class 5 [ 40 bytes ] : 0 < 1, 1 < 2, 1 < 4, 0 < 8, 0 < 16, 0 < 32, 1 < 64, 1 < 128, 4 < 256, 5 < 512, 80 < 1024, 0 < 2048, 0 < 4096, 0 < 8192, 0 < 16384, 0 < 32768, 0 < 65536
|
||||
...
|
||||
```
|
||||
|
||||
### Transfer Cache Information
|
||||
|
||||
The transfer cache is used by TCMalloc before going to the central free list. For each
|
||||
size-class, we track and report the following statistics:
|
||||
|
||||
* The size of each object in that size-class.
|
||||
* The number of objects of that size currently held in the transfer cache.
|
||||
* The total size of those objects in MiB - i.e. size of each object multiplied
|
||||
by the number of objects in the freelist.
|
||||
* The cumulative size of that size-class plus all smaller size-classes.
|
||||
* The current capacity of the freelist.
|
||||
* The maximum capacity to which the freelist is allowed to grow.
|
||||
* The number of hits observed during inserts to the transfer cache.
|
||||
* The total number of batched and non-batched misses observed during insert
|
||||
operations.
|
||||
* The number of partial (i.e. non-batch-sized) misses observed during insert
|
||||
operations.
|
||||
* The number of hits observed during removes from the transfer cache.
|
||||
* The total number of batched and non-batched misses observed during remove
|
||||
operations.
|
||||
* The number of partial (i.e. non-batch-sized) misses observed during remove
|
||||
operations.
|
||||
|
||||
```
|
||||
------------------------------------------------
|
||||
Used bytes, current capacity, and maximum allowed capacity
|
||||
of the transfer cache freelists.
|
||||
It also reports insert/remove hits/misses by size class.
|
||||
------------------------------------------------
|
||||
class 1 [ 8 bytes ] : 1472 objs; 0.0 MiB; 0.0 cum MiB; 2048 capacity; 2048 max_capacity; 935 insert hits; 8543 insert misses ( 4507 partial); 889 remove hits; 6612 remove misses ( 86 partial);
|
||||
class 2 [ 16 bytes ] : 608 objs; 0.0 MiB; 0.0 cum MiB; 2048 capacity; 2048 max_capacity; 575 insert hits; 3739 insert misses ( 3602 partial); 556 remove hits; 3368 remove misses ( 70 partial);
|
||||
class 3 [ 24 bytes ] : 864 objs; 0.0 MiB; 0.0 cum MiB; 2048 capacity; 2048 max_capacity; 1533 insert hits; 15594 insert misses ( 9417 partial); 1506 remove hits; 11939 remove misses ( 74 partial);
|
||||
class 4 [ 32 bytes ] : 96 objs; 0.0 MiB; 0.0 cum MiB; 2048 capacity; 2048 max_capacity; 1065 insert hits; 21772 insert misses ( 19918 partial); 1061 remove hits; 6403 remove misses ( 119 partial);
|
||||
class 5 [ 40 bytes ] : 1408 objs; 0.1 MiB; 0.1 cum MiB; 2048 capacity; 2048 max_capacity; 1475 insert hits; 16018 insert misses ( 14943 partial); 1431 remove hits; 3293 remove misses ( 60 partial);
|
||||
class 6 [ 48 bytes ] : 1664 objs; 0.1 MiB; 0.2 cum MiB; 2048 capacity; 2048 max_capacity; 1213 insert hits; 39140 insert misses ( 37096 partial); 1160 remove hits; 5909 remove misses ( 80 partial);
|
||||
class 7 [ 56 bytes ] : 1792 objs; 0.1 MiB; 0.3 cum MiB; 2048 capacity; 2048 max_capacity; 466 insert hits; 650 insert misses ( 375 partial); 410 remove hits; 1264 remove misses ( 55 partial);
|
||||
class 8 [ 64 bytes ] : 1408 objs; 0.1 MiB; 0.4 cum MiB; 2048 capacity; 2048 max_capacity; 2181 insert hits; 8816 insert misses ( 8069 partial); 2137 remove hits; 2024 remove misses ( 74 partial);
|
||||
class 9 [ 72 bytes ] : 960 objs; 0.1 MiB; 0.4 cum MiB; 1600 capacity; 2048 max_capacity; 104 insert hits; 463 insert misses ( 463 partial); 74 remove hits; 287 remove misses ( 62 partial);
|
||||
class 10 [ 80 bytes ] : 1056 objs; 0.1 MiB; 0.5 cum MiB; 2048 capacity; 2048 max_capacity; 372 insert hits; 3334 insert misses ( 3287 partial); 339 remove hits; 562 remove misses ( 80 partial);
|
||||
...
|
||||
```
|
||||
|
||||
As of July 2021, the `TransferCache` misses when inserting or removing a
|
||||
non-batch size number of objects from the cache. These are reflected in the
|
||||
"partial" column. The insert and remove miss column is *inclusive* of misses for
|
||||
both batch size and non-batch size numbers of objects.
|
||||
|
||||
### Per-CPU Information
|
||||
|
||||
If the per-cpu cache is enabled then we get a report of the memory currently
|
||||
being cached on each CPU.
|
||||
|
||||
The first number reported is the maximum size of the per-cpu cache on each CPU.
|
||||
This corresponds to the parameter `MallocExtension::GetMaxPerCpuCacheSize()`,
|
||||
which defaults to 1.5MiB. [See tuning](tuning.md)
|
||||
|
||||
The following columns are reported for each CPU:
|
||||
|
||||
* The cpu ID
|
||||
* The total size of the objects held in the CPU's cache in bytes.
|
||||
* The total size of the objects held in the CPU's cache in MiB.
|
||||
* The total number of unallocated bytes.
|
||||
|
||||
The concept of unallocated bytes needs to be explained because the definition is
|
||||
not obvious.
|
||||
|
||||
The per-cpu cache is an array of pointers to available memory. Each size-class
|
||||
has a number of entries that it can use in the array. These entries can be used
|
||||
to hold memory, or be empty.
|
||||
|
||||
To control the maximum memory that the per-cpu cache can use we sum up the
|
||||
number of slots that can be used by a size-class multiplied by the size of
|
||||
objects in that size-class. This gives us the total memory that could be held in
|
||||
the cache. This is not what is reported by unallocated memory.
|
||||
|
||||
Unallocated memory is the amount of memory left over from the per cpu limit
|
||||
after we have subtracted the total memory that could be held in the cache.
|
||||
|
||||
The in use memory is calculated from the sum of the number of populated entries
|
||||
in the per-cpu array multiplied by the size of the objects held in those
|
||||
entries.
|
||||
|
||||
To summarise, the per-cpu limit (which is reported before the per-cpu data) is
|
||||
equal to the number of bytes in use (which is reported in the second column)
|
||||
plus the number of bytes that could be used (which is not reported) plus the
|
||||
unallocated "spare" bytes (which is reported as the last column).
|
||||
|
||||
```
|
||||
Bytes in per-CPU caches (per cpu limit: 3145728 bytes)
|
||||
------------------------------------------------
|
||||
cpu 0: 2168200 bytes ( 2.1 MiB) with 52536 bytes unallocated active
|
||||
cpu 1: 1734880 bytes ( 1.7 MiB) with 258944 bytes unallocated active
|
||||
cpu 2: 1779352 bytes ( 1.7 MiB) with 8384 bytes unallocated active
|
||||
cpu 3: 1414224 bytes ( 1.3 MiB) with 112432 bytes unallocated active
|
||||
cpu 4: 1260016 bytes ( 1.2 MiB) with 179800 bytes unallocated
|
||||
...
|
||||
```
|
||||
|
||||
Some CPU caches may be marked `active`, indicating that the process is currently
|
||||
runnable on that CPU.
|
||||
|
||||
### Size Class Capacity Information in Per-CPU Caches
|
||||
|
||||
In per-CPU caches, TCMalloc caches objects of discrete sizes. These are referred
|
||||
to as size classes. Memory requests for a particular object size are rounded off
|
||||
to a convenient size class. TCMalloc populates objects in each size class based
|
||||
on their demand, but also imposes an upper limit on the number of objects that
|
||||
may be cached per size class. The statistics below measure the capacity of each
|
||||
size class freelist, where capacity represents the total number of objects
|
||||
currently cached by the freelist. The columns below report number of objects
|
||||
cached by TCMalloc per size class:
|
||||
|
||||
* Size class.
|
||||
* The size of each object in that size class.
|
||||
* Minimum capacity of the size class freelist summarized over all per-CPU
|
||||
caches.
|
||||
* Average capacity of the size class freelist summarized over all per-CPU
|
||||
caches.
|
||||
* Maximum capacity of the size class freelist summarized over all per-CPU
|
||||
caches.
|
||||
* The upper limit imposed by TCMalloc on the number of objects that can be
|
||||
cached in a per-CPU cache for that size class.
|
||||
|
||||
```
|
||||
------------------------------------------------
|
||||
Size class capacity statistics in per-cpu caches
|
||||
------------------------------------------------
|
||||
class 0 [ 0 bytes ] : 0 (minimum), 0.0 (average), 0 (maximum), 0 maximum allowed capacity
|
||||
class 1 [ 8 bytes ] : 0 (minimum), 133.1 (average), 636 (maximum), 2048 maximum allowed capacity
|
||||
class 2 [ 16 bytes ] : 0 (minimum), 51.8 (average), 378 (maximum), 2048 maximum allowed capacity
|
||||
class 3 [ 24 bytes ] : 0 (minimum), 119.3 (average), 510 (maximum), 2048 maximum allowed capacity
|
||||
class 4 [ 32 bytes ] : 0 (minimum), 100.0 (average), 542 (maximum), 2048 maximum allowed capacity
|
||||
class 5 [ 40 bytes ] : 0 (minimum), 80.6 (average), 467 (maximum), 2048 maximum allowed capacity
|
||||
```
|
||||
|
||||
### Number of per-CPU cache underflows, overflows, and reclaims
|
||||
|
||||
We also keep track of cache miss counts. Underflows are when the user allocates
|
||||
and the cache does not have any pointers to return. Overflows are when the user
|
||||
deallocates and the cache is full. The ratio of overflows to underflows gives a
|
||||
rough indication of whether the cache is large enough. If the cache had infinite
|
||||
capacity, then we would expect to have 0 overflows whereas if the cache had 0
|
||||
capacity, we would expect to see roughly equal numbers of overflows and
|
||||
underflows. Therefore, if the ratio is close to 1.0, then the cache may not be
|
||||
large enough. Reclaims are when we empty out a cache for a specific CPU because
|
||||
it has been idle for a period of time. In this section, we report the total
|
||||
numbers of each of these metrics across all CPUs as well as the numbers for each
|
||||
individual CPU.
|
||||
|
||||
```
|
||||
------------------------------------------------
|
||||
Number of per-CPU cache underflows, overflows, and reclaims
|
||||
------------------------------------------------
|
||||
Total : 242 underflows, 12 overflows, overflows / underflows: 0.05, 168 reclaims
|
||||
cpu 0: 69 underflows, 5 overflows, overflows / underflows: 0.07, 46 reclaims
|
||||
cpu 1: 58 underflows, 0 overflows, overflows / underflows: 0.00, 42 reclaims
|
||||
cpu 2: 62 underflows, 7 overflows, overflows / underflows: 0.11, 42 reclaims
|
||||
cpu 3: 40 underflows, 0 overflows, overflows / underflows: 0.00, 27 reclaims
|
||||
cpu 4: 13 underflows, 0 overflows, overflows / underflows: 0.00, 11 reclaims
|
||||
cpu 5: 0 underflows, 0 overflows, overflows / underflows: 0.00, 0 reclaims
|
||||
```
|
||||
|
||||
### Pageheap Information
|
||||
|
||||
The pageheap holds pages of memory that are not currently being used either by
|
||||
the application or by TCMalloc's internal caches. These pages are grouped into
|
||||
spans - which are ranges of contiguous pages, and these spans can be either
|
||||
mapped (backed by physical memory) or unmapped (not necessarily backed by
|
||||
physical memory).
|
||||
|
||||
Memory from the pageheap is used either to replenish the per-thread or per-cpu
|
||||
caches, or to directly satisfy requests that are larger than the sizes supported
|
||||
by the per-thread or per-cpu caches.
|
||||
|
||||
**Note:** TCMalloc cannot tell whether a span of memory is actually backed by
|
||||
physical memory, but it uses *unmapped* to indicate that it has told the OS that
|
||||
the span is not used and does not need the associated physical memory. For this
|
||||
reason the physical memory of an application may be larger than the amount that
|
||||
TCMalloc reports.
|
||||
|
||||
The pageheap section contains the following information:
|
||||
|
||||
* The first line reports the number of sizes of spans, the total memory that
|
||||
these spans cover, and the total amount of that memory that is unmapped.
|
||||
* The size of the span in number of pages.
|
||||
* The number of spans of that size.
|
||||
* The total memory consumed by those spans in MiB.
|
||||
* The cumulative total memory held in spans of that size and fewer pages.
|
||||
* The amount of that memory that has been unmapped.
|
||||
* The cumulative amount of unmapped memory for spans of that size and smaller.
|
||||
|
||||
```
|
||||
PageHeap: 30 sizes; 480.1 MiB free; 318.4 MiB unmapped
|
||||
------------------------------------------------
|
||||
1 pages * 341 spans ~ 10.7 MiB; 10.7 MiB cum; unmapped: 1.9 MiB; 1.9 MiB cum
|
||||
2 pages * 469 spans ~ 29.3 MiB; 40.0 MiB cum; unmapped: 0.0 MiB; 1.9 MiB cum
|
||||
3 pages * 462 spans ~ 43.3 MiB; 83.3 MiB cum; unmapped: 3.3 MiB; 5.2 MiB cum
|
||||
4 pages * 119 spans ~ 14.9 MiB; 98.2 MiB cum; unmapped: 0.1 MiB; 5.3 MiB cum
|
||||
...
|
||||
```
|
||||
|
||||
### Pageheap Cache Age
|
||||
|
||||
The next section gives some indication of the age of the various spans in the
|
||||
pageheap. Live (i.e., backed by physical memory) and unmapped spans are reported
|
||||
separately.
|
||||
|
||||
The columns indicate roughly how long the span has been in the pageheap, ranging
|
||||
from less than a second to more than 8 hours.
|
||||
|
||||
```
|
||||
------------------------------------------------
|
||||
PageHeap cache entry age (count of pages in spans of a given size that have been idle for up to the given period of time)
|
||||
------------------------------------------------
|
||||
mean <1s 1s 30s 1m 30m 1h 8+h
|
||||
Live span TOTAL PAGES: 9.1 533 13322 26 1483 0 0 0
|
||||
Live span, 1 pages: 7.4 0 256 0 24 0 0 0
|
||||
Live span, 2 pages: 1.6 38 900 0 0 0 0 0
|
||||
...
|
||||
Unmapped span TOTAL PAGES: 153.9 153 2245 1801 5991 0 0 0
|
||||
Unmapped span, 1 pages: 34.6 0 35 15 11 0 0 0
|
||||
Unmapped span, 3 pages: 28.4 0 60 42 3 0 0 0
|
||||
...
|
||||
```
|
||||
|
||||
### Pageheap Allocation Summary
|
||||
|
||||
This reports some stats on the number of pages allocated.
|
||||
|
||||
* The number of live (i.e., not on page heap) pages that were "small"
|
||||
allocations. Small allocations are ones that are tracked in the pageheap by
|
||||
size (e.g., a region of two pages in size). Larger allocations are just kept
|
||||
in an array that has to be scanned linearly.
|
||||
* The pages of slack result from situations where allocation is rounded up to
|
||||
hugepages, and this leaves some spare pages.
|
||||
* The largest seen allocation is self explanatory.
|
||||
|
||||
```
|
||||
PageHeap: stats on allocation sizes
|
||||
PageHeap: 344420 pages live small allocation
|
||||
PageHeap: 12982 pages of slack on large allocations
|
||||
PageHeap: largest seen allocation 29184 pages
|
||||
```
|
||||
|
||||
### Pageheap Per Number Of Pages In Range
|
||||
|
||||
This starts off reporting the activity for small ranges of pages, but at the end
|
||||
of the list starts aggregating information for groups of page ranges.
|
||||
|
||||
* The first column contains the number of pages (or the range of pages if the
|
||||
bucket is wider than a single page).
|
||||
* The second and third columns are the number of allocated and freed pages we
|
||||
have seen of this size.
|
||||
* The fourth column is the number of live allocations of this size.
|
||||
* The fifth column is the size of those live allocations in MiB.
|
||||
* The sixth column is the allocation rate in pages per second since the start
|
||||
of the application.
|
||||
* The seventh column is the allocation rate in MiB per second since the start
|
||||
of the application.
|
||||
|
||||
```
|
||||
PageHeap: per-size information:
|
||||
PageHeap: 1 page info: 23978897 / 23762891 a/f, 216006 (6750.2 MiB) live, 2.43e+03 allocs/s ( 76.1 MiB/s)
|
||||
PageHeap: 2 page info: 21442844 / 21436331 a/f, 6513 ( 407.1 MiB) live, 2.18e+03 allocs/s (136.0 MiB/s)
|
||||
PageHeap: 3 page info: 2333686 / 2329225 a/f, 4461 ( 418.2 MiB) live, 237 allocs/s ( 22.2 MiB/s)
|
||||
PageHeap: 4 page info: 21509168 / 21508751 a/f, 417 ( 52.1 MiB) live, 2.18e+03 allocs/s (272.9 MiB/s)
|
||||
PageHeap: 5 page info: 3356076 / 3354188 a/f, 1888 ( 295.0 MiB) live, 341 allocs/s ( 53.2 MiB/s)
|
||||
PageHeap: 6 page info: 1718534 / 1718486 a/f, 48 ( 9.0 MiB) live, 174 allocs/s ( 32.7 MiB/s)
|
||||
...
|
||||
```
|
||||
|
||||
### GWP-ASan Status
|
||||
|
||||
The GWP-ASan section displays information about allocations guarded by
|
||||
[GWP-ASan](gwp-asan.md).
|
||||
|
||||
* The number of successful and failed GWP-ASan allocations. If there are 0
|
||||
successful and 0 failed allocations, GWP-ASan is probably disabled on your
|
||||
binary. If there are a large number of failed allocations, it probably means
|
||||
your sampling rate is too high, causing the guarded slots to be exhausted.
|
||||
See
|
||||
[GWP-ASan sampling rate](gwp-asan.md#what-should-i-set-the-sampling-rate-to).
|
||||
* The number of "slots" currently allocated and quarantined. An allocated slot
|
||||
contains an allocation that is still active (i.e., not freed) while a
|
||||
quarantined slot has either not been used yet or contains an allocation that
|
||||
was freed.
|
||||
* The maximum number of slots that have been allocated at the same time. This
|
||||
number is printed along with the allocated slot limit. If the maximum slots
|
||||
allocated matches the limit, you may want to reduce your sampling rate to
|
||||
avoid failed GWP-ASan allocations.
|
||||
|
||||
```
|
||||
------------------------------------------------
|
||||
GWP-ASan Status
|
||||
------------------------------------------------
|
||||
Successful Allocations: 1823
|
||||
Failed Allocations: 0
|
||||
Slots Currently Allocated: 33
|
||||
Slots Currently Quarantined: 95
|
||||
Maximum Slots Allocated: 51 / 64
|
||||
```
|
||||
|
||||
### Memory Requested From The OS
|
||||
|
||||
The stats also report the amount of memory requested from the OS by mmap.
|
||||
|
||||
Memory is also requested, but may not actually be backed by physical memory, so
|
||||
these stats should resemble the VSS of the application, not the RSS.
|
||||
|
||||
```
|
||||
Low-level allocator stats:
|
||||
MmapSysAllocator: 18083741696 bytes (17246.0 MiB) allocated
|
||||
```
|
||||
|
||||
## Temeraire
|
||||
|
||||
### Introduction
|
||||
|
||||
Temeraire (or Huge Page Aware Allocator) is a new page heap for TCMalloc that is
|
||||
hugepage aware. It is designed to better handle memory backed by hugepages -
|
||||
avoiding breaking them up. Since it is more elaborate code, it reports
|
||||
additional information.
|
||||
|
||||
See the [Temeraire design doc](temeraire.md) for more complete information.
|
||||
|
||||
### Summary Statistics
|
||||
|
||||
The initial set of statistics from the Huge Page Aware Allocator are similar to
|
||||
the old page heap, and show a summary of the number of instances of each range
|
||||
of contiguous pages.
|
||||
|
||||
```
|
||||
------------------------------------------------
|
||||
HugePageAware: 75 sizes; 938.8 MiB free; 1154.0 MiB unmapped
|
||||
------------------------------------------------
|
||||
1 pages * 86655 spans ~ 677.0 MiB; 677.0 MiB cum; unmapped: 0.0 MiB; 0.0 MiB cum
|
||||
2 pages * 3632 spans ~ 56.8 MiB; 733.7 MiB cum; unmapped: 0.0 MiB; 0.0 MiB cum
|
||||
3 pages * 288 spans ~ 6.8 MiB; 740.5 MiB cum; unmapped: 0.0 MiB; 0.0 MiB cum
|
||||
4 pages * 250 spans ~ 7.8 MiB; 748.3 MiB cum; unmapped: 0.0 MiB; 0.0 MiB cum
|
||||
...
|
||||
```
|
||||
|
||||
The first line indicates the number of different sizes of ranges, the total MiB
|
||||
available, and the total MiB of unmapped ranges. The next lines are per number
|
||||
of contiguous pages:
|
||||
|
||||
* The number of contiguous pages
|
||||
* The number of spans of that number of pages
|
||||
* The total number of MiB of that span size that are mapped.
|
||||
* The cumulative total of the mapped pages.
|
||||
* The total number of MiB of that span size that are unmapped.
|
||||
* The cumulative total of the unmapped pages.
|
||||
|
||||
### Per Component Information
|
||||
|
||||
The Huge Page Aware Allocator has multiple places where pages of memory are
|
||||
held. More details of its workings can be found in
|
||||
[the Temeraire design doc](temeraire.md). There are four caches where pages of
|
||||
memory can be located:
|
||||
|
||||
* The filler, used for allocating ranges of a few TCMalloc pages in size.
|
||||
* The region cache, used for allocating ranges of multiple pages.
|
||||
* The huge cache which contains huge pages that are backed with memory.
|
||||
* The huge page allocator which contains huge pages that are not backed by
|
||||
memory.
|
||||
|
||||
We get some summary information for the various caches, before we report
|
||||
detailed information for each of the caches.
|
||||
|
||||
```
|
||||
Huge page aware allocator components:
|
||||
------------------------------------------------
|
||||
HugePageAware: breakdown of free / unmapped / used space:
|
||||
HugePageAware: filler 38825.2 MiB used, 938.8 MiB free, 0.0 MiB unmapped
|
||||
HugePageAware: region 0.0 MiB used, 0.0 MiB free, 0.0 MiB unmapped
|
||||
HugePageAware: cache 908.0 MiB used, 0.0 MiB free, 0.0 MiB unmapped
|
||||
HugePageAware: alloc 0.0 MiB used, 0.0 MiB free, 1154.0 MiB unmapped
|
||||
```
|
||||
|
||||
The summary information tells us:
|
||||
|
||||
* The first column shows how much memory has been allocated from each of the
|
||||
caches
|
||||
* The second column indicates how much backed memory is available in each
|
||||
cache.
|
||||
* The third column indicates how much unmapped memory is available in each
|
||||
cache.
|
||||
|
||||
### Filler Cache
|
||||
|
||||
The filler cache contains TCMalloc sized pages from within a single hugepage. So
|
||||
if we want a single TCMalloc page we will look for it in the filler.
|
||||
|
||||
There are three sections of stats around the filler cache. The first section
|
||||
gives an indication of the number and state of the hugepages in the filler
|
||||
cache.
|
||||
|
||||
```
|
||||
HugePageFiller: densely pack small requests into hugepages
|
||||
HugePageFiller: 19882 total, 8083 full, 11799 partial, 0 released (0 partially), 0 quarantined
|
||||
HugePageFiller: 120168 pages free in 19882 hugepages, 0.0236 free
|
||||
HugePageFiller: among non-fulls, 0.0398 free
|
||||
HugePageFiller: 499 used pages in subreleased hugepages (0 of them in partially released)
|
||||
HugePageFiller: 0 hugepages partially released, 0.0000 released
|
||||
HugePageFiller: 1.0000 of used pages hugepageable
|
||||
HugePageFiller: Since startup, 26159 pages subreleased, 345 hugepages broken
|
||||
```
|
||||
|
||||
The summary stats are as follows:
|
||||
|
||||
* "total" refers to the total number of hugepages in the filler cache.
|
||||
* "full" is the number of those hugepages that have no free pages (every
|
||||
TCMalloc page in them is in use).
|
||||
* "partial" is the remaining number of hugepages that are not released and
|
||||
still have some free pages.
|
||||
* "released" is the number of hugepages that are released - i.e., partially
|
||||
unmapped. If partially released hugepages are enabled, the number in
|
||||
parentheses shows the number of hugepages in this category.
|
||||
* "quarantined" is a feature that has been disabled, so the result is currently
|
||||
zero.
|
||||
|
||||
The second section gives an indication of the number of pages in various states
|
||||
in the filler cache:
|
||||
|
||||
* "pages free" refers to the number of free TCMalloc pages in the filler, as
|
||||
well as the ratio to the total number of hugepages.
|
||||
* "among non-fulls" states this ratio to the number of non-full hugepages.
|
||||
* "used pages" refers to the number of occupied pages in the different types
|
||||
of partially unmapped hugepages.
|
||||
|
||||
```
|
||||
HugePageFiller: fullness histograms
|
||||
|
||||
HugePageFiller: # of regular hps with a<= # of free pages <b
|
||||
HugePageFiller: < 0<= 8083 < 1<= 6 < 2<= 1 < 3<= 1 < 4<= 0 < 16<= 103
|
||||
HugePageFiller: < 32<= 1 < 48<= 0 < 64<= 3 < 80<= 1 < 96<= 0 <112<= 0
|
||||
HugePageFiller: <128<= 28 <144<= 0 <160<= 0 <176<= 1 <192<= 0 <208<= 0
|
||||
HugePageFiller: <224<= 2 <240<= 0 <252<= 0 <253<= 0 <254<= 0 <255<= 0
|
||||
|
||||
HugePageFiller: # of donated hps with a<= # of free pages <b
|
||||
HugePageFiller: < 0<= 0 < 1<= 0 < 2<= 0 < 3<= 0 < 4<= 0 < 16<= 0
|
||||
HugePageFiller: < 32<= 0 < 48<= 0 < 64<= 0 < 80<= 0 < 96<= 0 <112<= 0
|
||||
HugePageFiller: <128<= 1 <144<= 0 <160<= 0 <176<= 0 <192<= 0 <208<= 0
|
||||
HugePageFiller: <224<= 0 <240<= 0 <252<= 0 <253<= 0 <254<= 0 <255<= 0
|
||||
|
||||
HugePageFiller: # of released hps with a<= # of free pages <b
|
||||
...
|
||||
|
||||
HugePageFiller: # of regular hps with a<= longest free range <b
|
||||
HugePageFiller: < 0<= 8083 < 1<= 6 < 2<= 1 < 3<= 1 < 4<= 0 < 16<= 103
|
||||
HugePageFiller: < 32<= 1 < 48<= 0 < 64<= 4 < 80<= 0 < 96<= 0 <112<= 0
|
||||
HugePageFiller: <128<= 29 <144<= 0 <160<= 0 <176<= 0 <192<= 0 <208<= 1
|
||||
HugePageFiller: <224<= 1 <240<= 0 <252<= 0 <253<= 0 <254<= 0 <255<= 0
|
||||
|
||||
HugePageFiller: # of released hps with a<= longest free range <b
|
||||
...
|
||||
|
||||
HugePageFiller: # of regular hps with a<= # of allocations <b
|
||||
HugePageFiller: < 1<= 8 < 2<= 7 < 3<= 10 < 4<= 10 < 5<= 12 < 17<= 15
|
||||
HugePageFiller: < 33<= 12 < 49<= 2 < 65<= 0 < 81<= 2 < 97<= 17 <113<= 166
|
||||
HugePageFiller: <129<= 42 <145<= 6 <161<= 20 <177<= 48 <193<= 398 <209<= 1968
|
||||
HugePageFiller: <225<= 5062 <241<= 425 <253<= 0 <254<= 0 <255<= 0 <256<= 0
|
||||
|
||||
HugePageFiller: # of released hps with a<= # of allocations <b
|
||||
...
|
||||
```
|
||||
|
||||
Some sections have been elided here for space.
|
||||
|
||||
There are three sections, split by three tracker types. They use the same
|
||||
reporting format and indicate:
|
||||
|
||||
* The available TCMalloc pages in the hugepages of the given type.
|
||||
* The longest contiguous range of available TCMalloc pages in the hugepages of
|
||||
the given type.
|
||||
* The number of current allocations from each of the hugepages of the given
|
||||
type. The ranges are offset by one here, because a hugepage can't have zero
|
||||
allocations.
|
||||
|
||||
The reporting format is the number of hugepages that are between a particular
|
||||
range for the characteristic of interest. For example:
|
||||
|
||||
* There are 3 regular hugepages with TCMalloc free pages >= 64 and < 80.
|
||||
* There are 6 regular hugepages with a longest contiguous length of exactly 1
|
||||
page.
|
||||
* There are 2 regular hugepages with between 81 and 96 allocations.
|
||||
|
||||
The three tracker types are "regular," "donated," and "released." "Regular" is
|
||||
by far the most common, and indicates regular memory in the filler.
|
||||
|
||||
"Donated" is hugepages that have been donated to the filler from the tail of
|
||||
large (multi-hugepage) allocations, so that the leftover space can be packed
|
||||
with smaller allocations. But we prefer to use up all useable regular hugepages
|
||||
before touching the donated ones, which devolve to "regular" type once they are
|
||||
used. Because of this last property, donated hugepages always have only one
|
||||
allocation and their longest range equals their free space, so those histograms
|
||||
aren't shown.
|
||||
|
||||
"Released" is partially released hugepages. Normally the entirety of a hugepage
|
||||
is backed by real RAM, but in partially released hugepages most of it has been
|
||||
returned to the OS. Because this defeats the primary goal of the hugepage-aware
|
||||
allocator, this is done rarely, and we only reuse partially-released hugepages
|
||||
for new allocations as a last resort.
|
||||
|
||||
The final section shows a summary of the filler's state over the past 5 minute
|
||||
time period:
|
||||
|
||||
```
|
||||
HugePageFiller: time series over 5 min interval
|
||||
|
||||
HugePageFiller: realized fragmentation: 0.0 MiB
|
||||
HugePageFiller: minimum free pages: 0 (0 backed)
|
||||
HugePageFiller: at peak demand: 1774 pages (and 261 free, 13 unmapped)
|
||||
HugePageFiller: at peak demand: 8 hps (5 regular, 1 donated, 0 partial, 2 released)
|
||||
HugePageFiller: at peak hps: 1774 pages (and 261 free, 13 unmapped)
|
||||
HugePageFiller: at peak hps: 8 hps (5 regular, 1 donated, 0 partial, 2 released)
|
||||
```
|
||||
|
||||
The first line shows the minimum number of free pages over the time interval,
|
||||
which is an indication of how much memory could have been "usefully" reclaimed
|
||||
(i.e., free for long enough that the OS would likely be able to use the memory
|
||||
for another process). The line shows both the total number of free pages in the
|
||||
filler (whether or not released to the OS) as well as only those that were
|
||||
backed by physical memory for the full 5-min interval. The realized
|
||||
fragmentation metric computed here uses a bounded window.
|
||||
|
||||
The next two sections show the state of the filler at peak demand (i.e., when
|
||||
the maximum number of pages was in use) and at peak hps (i.e., when the maximum
|
||||
number of hugepages was in use). For each, we show the number of free (backed)
|
||||
pages as well as unmapped pages, and the number of the four different types of
|
||||
hugepages active at that time. If there are multiple peaks, we return the state
|
||||
at the latest one of them.
|
||||
|
||||
If applicable, an additional section tracks the behavior that skips subreleasing
|
||||
hugepages if behind the recent demand requirement, which is either the peak
|
||||
within `--tcmalloc_skip_subrelease_interval`, or the sum of short-term
|
||||
fluctuation peak within `--tcmalloc_skip_subrelease_short_interval` and
|
||||
long-term trend within `--tcmalloc_skip_subrelease_long_interval`.
|
||||
|
||||
**Note:** Conducting skip-subrelease using both short-term and long-term
|
||||
intervals is an experimental feature, and should not be enabled without
|
||||
understanding its performance tradeoffs.
|
||||
|
||||
```
|
||||
HugePageFiller: Since the start of the execution, 0 subreleases (0 pages) were skipped due to either recent (0s) peaks, or the sum of short-term (0s) fluctuations and long-term (0s) trends..
|
||||
HugePageFiller: 100.0000% of decisions confirmed correct, 0 pending (100.0000% of pages, 0 pending), as per anticipated 300s realized fragmentation.
|
||||
```
|
||||
|
||||
This shows how many times a page that was meant to be subreleased was not (note
|
||||
that this can refer to the same page multiple times if subrelease of this page
|
||||
would have been triggered multiple times). The percentage shows what fraction of
|
||||
times this decision would have been correct (i.e., if we decided not to
|
||||
subrelease a page because of the calculated demand requirement, did memory
|
||||
consumption increase again within the *next* five minutes?). "Pending" refers to
|
||||
subrelease decisions that were less than five minutes in the past and we
|
||||
therefore do not know yet whether or not they were correct. The correctness
|
||||
evaluation chooses to use the five-minute interval as it is the interval used
|
||||
for realized fragmentation.
|
||||
|
||||
The skip-subrelease feature prioritizes using the recent peak if
|
||||
`--tcmalloc_skip_subrelease_interval` is configured, otherwise it uses the
|
||||
combination of the recent short-term fluctuation peak and long-term trend. The
|
||||
feature is disabled if all three intervals are zero.
|
||||
|
||||
### Region Cache
|
||||
|
||||
The region cache holds a chunk of memory from which spans of multiple TCMalloc
|
||||
pages can be allocated. The region cache may not be populated, and it can
|
||||
contain multiple regions.
|
||||
|
||||
```
|
||||
HugeRegionSet: 1 MiB+ allocations best-fit into 1024 MiB slabs
|
||||
HugeRegionSet: 0 total regions
|
||||
HugeRegionSet: 0 hugepages backed out of 0 total
|
||||
HugeRegionSet: 0 pages free in backed region, 0.0000 free
|
||||
```
|
||||
|
||||
The lines of output indicate:
|
||||
|
||||
* The size of each region in MiB - this is currently 1GiB.
|
||||
* The total number of regions in the region cache, in the example above there
|
||||
are no regions in the cache.
|
||||
* The number of backed hugepages in the cache out of the total number of
|
||||
hugepages in the region cache.
|
||||
* The number of free TCMalloc pages in the regions, and as a ratio of the
|
||||
number of backed pages.
|
||||
|
||||
### Huge Cache
|
||||
|
||||
The huge cache contains backed hugepages; it grows and shrinks in size depending
|
||||
on runtime conditions, attempting to hold onto backed memory ready to be
|
||||
provided for the application.
|
||||
|
||||
```
|
||||
HugeCache: contains unused, backed hugepage(s)
|
||||
HugeCache: 0 / 10 hugepages cached / cache limit (0.053 hit rate, 0.436 overflow rate)
|
||||
HugeCache: 88880 MiB fast unbacked, 6814 MiB periodic
|
||||
HugeCache: 1234 MiB*s cached since startup
|
||||
HugeCache: recent usage range: 40672 min - 40672 curr - 40672 max MiB
|
||||
HugeCache: recent offpeak range: 0 min - 0 curr - 0 max MiB
|
||||
HugeCache: recent cache range: 0 min - 0 curr - 0 max MiB
|
||||
```
|
||||
|
||||
The output shows the following information:
|
||||
|
||||
* The number of hugepages out of the maximum number of hugepages we will hold
|
||||
in the huge cache. The hit rate is how often we get pages from the huge
|
||||
cache vs getting them from the huge allocator. The overflow rate is the
|
||||
number of times we added something to the huge cache causing it to exceed
|
||||
its size limit.
|
||||
* The fast unbacked is the cumulative amount of memory unbacked due to size
|
||||
limitations, the periodic count is the cumulative amount of memory unbacked
|
||||
by periodic calls to release unused memory.
|
||||
* The amount of cumulative memory stored in HugeCache since the startup of the
|
||||
process. In other words, the area under the cached-memory-vs-time curve.
|
||||
* The usage range is the range minimum, current, maximum in MiB of memory
|
||||
obtained from the huge cache.
|
||||
* The off-peak range is the minimum, current, maximum cache size in MiB
|
||||
compared to the peak cache size.
|
||||
* The recent range is the minimum, current, maximum size of memory in MiB in
|
||||
the huge cache.
|
||||
|
||||
### Huge Allocator
|
||||
|
||||
The huge allocator holds unmapped memory ranges. We allocate from here if we are
|
||||
unable to allocate from any of the caches.
|
||||
|
||||
```
|
||||
HugeAllocator: contiguous, unbacked hugepage(s)
|
||||
HugeAddressMap: treap 5 / 10 nodes used / created
|
||||
HugeAddressMap: 256 contiguous hugepages available
|
||||
HugeAllocator: 20913 requested - 20336 in use = 577 hugepages free
|
||||
```
|
||||
|
||||
The information reported here is:
|
||||
|
||||
* The number of nodes used and created to handle regions of memory.
|
||||
* The size of the longest contiguous region of available hugepages.
|
||||
* The number of hugepages requested from the system, the number of hugepages
|
||||
in use, and the number of hugepages available in the cache.
|
||||
|
||||
### Pageheap Summary Information
|
||||
|
||||
The new pageheap reports some summary information:
|
||||
|
||||
```
|
||||
HugePageAware: stats on allocation sizes
|
||||
HugePageAware: 4969003 pages live small allocation
|
||||
HugePageAware: 659 pages of slack on large allocations
|
||||
HugePageAware: largest seen allocation 45839 pages
|
||||
```
|
||||
|
||||
These are:
|
||||
|
||||
* The number of live "small" TCMalloc pages allocated (those less than 2MiB in
|
||||
size). [Note: the 2MiB size distinction is separate from the size of
|
||||
hugepages]
|
||||
* The number of TCMalloc pages which are left over from "large" allocations.
|
||||
These allocations are larger than 2MiB in size, and are rounded to a
|
||||
hugepage - the slack being the amount left over after rounding.
|
||||
* The largest seen allocation request in TCMalloc pages.
|
||||
|
||||
### Per Size Range Info:
|
||||
|
||||
The per size range info is the same format as the old pageheap:
|
||||
|
||||
* The first column contains the number of pages (or the range of pages if the
|
||||
bucket is wider than a single page).
|
||||
* The second and third columns are the number of allocated and freed pages we
|
||||
have seen of this size.
|
||||
* The fourth column is the number of live allocations of this size.
|
||||
* The fifth column is the size of those live allocations in MiB.
|
||||
* The sixth column is the allocation rate in pages per second since the start
|
||||
of the application.
|
||||
* The seventh column is the allocation rate in MiB per second since the start
|
||||
of the application.
|
||||
|
||||
```
|
||||
HugePageAware: per-size information:
|
||||
HugePageAware: 1 page info: 5817510 / 3863506 a/f, 1954004 (15265.7 MiB) live, 16 allocs/s ( 0.1 MiB/s)
|
||||
HugePageAware: 2 page info: 1828473 / 1254096 a/f, 574377 ( 8974.6 MiB) live, 5.03 allocs/s ( 0.1 MiB/s)
|
||||
HugePageAware: 3 page info: 1464568 / 1227253 a/f, 237315 ( 5562.1 MiB) live, 4.03 allocs/s ( 0.1 MiB/s)
|
||||
...
|
||||
```
|
||||
|
||||
### Pageheap Age Information:
|
||||
|
||||
The new pageheap allocator also reports information on the age of the various
|
||||
page ranges. In this example you can see that there was a large number of
|
||||
unmapped pages in the last minute.
|
||||
|
||||
```
|
||||
------------------------------------------------
|
||||
HugePageAware cache entry age (count of pages in spans of a given size that have been idle for up to the given period of time)
|
||||
------------------------------------------------
|
||||
mean <1s 1s 30s 1m 30m 1h 8+h
|
||||
Live span TOTAL PAGES: 29317.6 145 549 1775 13059 13561 58622 32457
|
||||
Live span, 1 pages: 35933.7 0 55 685 6354 8111 43853 27597
|
||||
...
|
||||
Unmapped span TOTAL PAGES: 51.3 0 0 131072 16640 0 0 0
|
||||
Unmapped span, >=64 pages: 51.3 0 0 131072 16640 0 0 0
|
||||
...
|
||||
```
|
||||
267
src/third_party/tcmalloc/dist/docs/temeraire.md
vendored
Normal file
@ -0,0 +1,267 @@
|
||||
# Temeraire: Hugepage-Aware Allocator
|
||||
|
||||
Andrew Hunter, [Chris Kennelly](mailto:ckennelly@google.com)
|
||||
|
||||
*Notes on the name*[^cutie]*: the French word for "reckless" or "rash" :), and
|
||||
also the name of several large and powerful English warships. So: giant and
|
||||
powerful, but maybe a little dangerous. :)*
|
||||
|
||||
This is a description of the design of the Hugepage-Aware Allocator. We have
|
||||
also published ["Beyond malloc efficiency to fleet efficiency: a hugepage-aware
|
||||
memory allocator" at OSDI 2021](https://research.google/pubs/pub50370/), which
|
||||
provides further details on the design, implementation, and rollout of
|
||||
Temeraire.
|
||||
|
||||
## GOALS
|
||||
|
||||
What do we want out of this redesign?
|
||||
|
||||
* Dramatic reduction in pageheap size. The pageheap in TCMalloc holds
|
||||
substantial amounts of memory *after* its attempts to `MADV_DONTNEED` memory
|
||||
back to the OS, due to internal fragmentation. We can recover a useful
|
||||
fraction of this. In optimal cases, we see savings of over 90%. We do not
|
||||
expect to achieve this generally, but a variety of synthetic loads suggest
|
||||
50% of pageheap is a reasonable target savings.
|
||||
* Dramatic increase in hugepage usage. The `madvise()` in
|
||||
`ReleaseMemoryToSystem` is made without any thought to transparent
|
||||
hugepages, and in practice prevents most fleet RAM from remaining as intact
|
||||
hugepages. Services have seen substantial performance gains **from
|
||||
disabling release** (and going to various other lengths to maximize hugepage
|
||||
usage).
|
||||
* *reasonable* allocation speed. This is really stating a non-goal: speed
|
||||
parity with `PageHeap::New`. PageHeap is a relatively light consumer of
|
||||
cycles. We are willing to accept a speed hit in actual page allocation in
|
||||
exchange for better hugepage usage and space overhead. This is not free but
|
||||
we think is well justified. Our goal is more to avoid catastrophic
|
||||
regressions in speed. We intentionally accept two particular time hits:
|
||||
|
||||
* much more aggressive releasing (of entire hugepages), leading to
|
||||
increased costs for *backing* memory.
|
||||
* much more detailed (and expensive) choices of where to fulfill a
|
||||
particular request.
|
||||
|
||||
## DESIGN
|
||||
|
||||
The algorithm -- as usual here, really, the data structures, which neatly
|
||||
determine our algorithm -- are nicely divided into components. Essentially, the
|
||||
path of an allocation goes like this:
|
||||
|
||||
1. If it is sufficiently small and we have the space we take an existing,
|
||||
backed, partially empty hugepage and fit our allocation within it.
|
||||
1. If it is too large to fit in a single hugepage, but too small to simply
|
||||
round up to an integral number of hugepages, we best-fit it into one of
|
||||
several larger slabs (whose allocations can cross hugepage boundaries). We
|
||||
will back hugepages as needed for the allocation.
|
||||
1. Sufficiently large allocations are rounded up to the nearest hugepage; the
|
||||
extra space may be used for smaller allocations.
|
||||
|
||||
Deallocation simply determines which of 1), 2), or 3) happened, and marks the
|
||||
corresponding object we allocated from as free.
|
||||
|
||||
We will sketch the purpose and approach of each important part. Note that we
|
||||
have fairly detailed unit tests for each of these; one consequence on the
|
||||
implementations is that most components are templated on the
|
||||
`tcmalloc::SystemRelease` functions[^templated] as we make a strong attempt to
|
||||
be zero initializable where possible (sadly not everywhere).
|
||||
|
||||
### `RangeTracker`
|
||||
|
||||
`RangeTracker` and `Bitmap`, its underlying implementation, are helper classes
|
||||
used throughout the components below. They are both quite simple: `Bitmap` is a
|
||||
fixed-size (templated) bitmap with fast operations to set and clear bits and
|
||||
ranges of bits, with extensive support for searching and iterating. (Search and
|
||||
iteration support is why `std::bitset` is not usable here.)
|
||||
|
||||
`RangeTracker` is essentially a `Bitmap` augmented with statistics on usage, in
|
||||
particular the longest range of contiguous free (false) bits. It provides
|
||||
methods to do best-fit allocation from free ranges (keeping the statistics
|
||||
correct).
|
||||
|
||||
Both of these need to be quite fast as they're on nearly every
|
||||
allocation/deallocation path in `HugePageAwareAllocator` (in multiple ways)!
|
||||
They are reasonably optimized but probably still have more headroom.
|
||||
|
||||
### HugeAllocator/HugeCache (the backing...)
|
||||
|
||||
This is a set of classes that fulfills requests for backed (or unbacked) aligned
|
||||
hugepage ranges. We use this for sufficiently large (or nicely sized) requests,
|
||||
and to provide memory for the other components to break up into smaller chunks.
|
||||
|
||||
#### `HugeAllocator`
|
||||
|
||||
`HugeAllocator` is (nearly) trivial: it requests arbitrarily large
|
||||
hugepage-sized chunks from `SysAllocator`, keeps them unbacked, and tracks the
|
||||
available (unbacked) regions. Note that we do not need to be perfectly space
|
||||
efficient here: we only pay virtual memory and metadata, since *none* of the
|
||||
contents are backed. (We do make our best efforts to be relatively frugal,
|
||||
however, since there’s no need to inflate VSS by large factors.) Nor do we have
|
||||
to be particularly fast; this is well off any hot path, and we’re going to incur
|
||||
non-trivial backing costs as soon as we’re done assigning a range.
|
||||
|
||||
The one tricky bit here is that we have to write some fiddly data structures by
|
||||
hand. We would have liked to implement this by grabbing large (gigabyte+) ranges
|
||||
from SysAllocator and using bitmaps or the like within them; however, too many
|
||||
tests have brittle reliance on details of `SysAllocator` that break if TCMalloc
|
||||
consistently requests (any considerable amount) more than the minimum needed to
|
||||
back current usage. So instead we need to track relatively small ranges. We've
|
||||
implemented a balanced tree that merges adjacent ranges; it is, as we said,
|
||||
fiddly, but reasonably efficient and not stunningly complicated.
|
||||
|
||||
#### `HugeCache`
|
||||
|
||||
This is a very simple wrapper on top of HugeAllocator. Its only purpose is to
|
||||
store some number of backed *single* hugepage ranges as a hot cache (in case we
|
||||
rapidly allocate and deallocate a 2 MiB chunk).
|
||||
|
||||
It is not clear whether the cache is necessary, but we have it and it's not
|
||||
costing us much in complexity, and will help significantly in some potential
|
||||
antagonistic scenarios, so we favor keeping it.
|
||||
|
||||
It currently attempts to estimate the optimal cache size based on past behavior.
|
||||
This may not really be needed, but it's a very minor feature to keep *or* drop.
|
||||
|
||||
### `HugePageFiller` (the core…)
|
||||
|
||||
`HugePageFiller` takes small requests (less than a hugepage) and attempts to
|
||||
pack them efficiently into hugepages. The vast majority of binaries use almost
|
||||
entirely small allocations[^conditional], so this is the dominant consumer of
|
||||
space and the most important component.
|
||||
|
||||
Our goal here is to make our live allocations fit within the smallest set of
|
||||
hugepages possible, so that we can afford to keep all used hugepages fully
|
||||
backed (and aggressively free empty ones).
|
||||
|
||||
The key challenge is avoiding fragmentation of free space within a hugepage:
|
||||
requests for 1 page are (usually) the most common, but 4, 8, or even 50+ page
|
||||
requests aren't unheard of. Many 1-page free regions won’t be useful here, and
|
||||
we'll have to request enormous numbers of new hugepages for anything large.
|
||||
|
||||
Our solution is to build a heap-ordered data structure on *fragmentation*, not
|
||||
total amount free, in each hugepage. We use the **longest free range** (the
|
||||
biggest allocation a hugepage can fulfill!) as a measurement of fragmentation.
|
||||
In other words: if a hugepage has a free range of length 8, we *never* allocate
|
||||
from it for a smaller request (unless all hugepages available have equally long
|
||||
ranges). This carefully husbands long ranges for the requests that need them,
|
||||
and allows them to grow (as neighboring allocations are freed).
|
||||
|
||||
Inside each equal-longest-free-range group, we order our heap by the **number of
|
||||
allocations** (chunked logarithmically). This helps favor allocating from fuller
|
||||
hugepages (of equal fragmented status). Number of allocations handily
|
||||
outperforms the total number of allocated pages here; our hypothesis is that
|
||||
since allocations of any size are equally likely[^radioactive] to become free at
|
||||
any given time, and we need all allocations on a hugepage to become free to make
|
||||
the hugepage empty, we’re better off hoping for 1 10-page allocation to become
|
||||
free (with some probability P) than 5 1-page allocations (with probability P^5).
|
||||
|
||||
The `HugePageFiller` contains support for releasing parts of mostly-empty
|
||||
hugepages as a last resort.
|
||||
|
||||
The actual implementation uses a fixed set of lists and a bitmap for
|
||||
acceleration.
|
||||
|
||||
### `HugeRegion` (big but not enormous...)
|
||||
|
||||
`HugeAllocator` covers very large requests and `HugePageFiller` tiny ones; what
|
||||
about the middle? In particular, requests that cannot fit into a hugepage, but
|
||||
should not be rounded to multiples? (For instance, 2.1 MiB.) These are woefully
|
||||
common.
|
||||
|
||||
In any case, we certainly have to do something with "2.1 MiB"-type allocations,
|
||||
and rounding them to 4 will produce unacceptable slack (see below for what we
|
||||
can do with the filler here; it is wildly insufficient in current binaries which
|
||||
have the majority of their allocation in these large chunks.)
|
||||
|
||||
The solution is a much larger "region" that best-fits these chunks into a large
|
||||
range of hugepages (i.e. allows them to cross a hugepage boundary). We keep a
|
||||
set of these regions, and allocate from the most fragmented one (much as with
|
||||
Filler above)! The main difference is that these regions are kept **un-backed**
|
||||
by default (whereas the Filler deals almost entirely with backed hugepages). We
|
||||
back hugepages on demand when they are used by a request hitting the region (and
|
||||
aggressively _unback_ them when they become empty again).
|
||||
|
||||
A few important details:
|
||||
|
||||
* These regions are currently 1 GiB, which is very large!
|
||||
|
||||
The reason is this: suppose our entire binary allocates a huge number `N` of
|
||||
requests of size `S` that are too big for the filler, but that don’t evenly
|
||||
divide the region size `M` (say, 2.1 MiB :)) How much space will we waste?
|
||||
Answer: we will allocate about `R = N / (M / S)` regions, with each region
|
||||
storing `floor(M/S)` allocations. The tail will be unused. We can unback any
|
||||
totally untouched hugepages, but suppose that `M/S` allocations just barely
|
||||
touches the last hugepage in the region: we will then waste ~a full hugepage
|
||||
per region, and thus waste `R` hugepages. Conclusion: the larger a region we
|
||||
use, the less waste (in this case). Originally regions were 32 MiB, and this
|
||||
effect was very noticeable. This also allows us to use very few regions in a
|
||||
given binary, which means we can be less careful about how we organize the
|
||||
set of regions.
|
||||
|
||||
* We don’t make *any* attempt, when allocating from a given region, to find an
|
||||
already-backed but unused range. Nor do we prefer regions that have such
|
||||
ranges.
|
||||
|
||||
This is basically a question of effort. We'd like to do this, but we don't
|
||||
see any way to do it without making the data structure more complicated and
|
||||
cumbersome. So far in tests it hasn't proved a major problem. (Note that
|
||||
`RangeTracker` has a low-address bias, which will help somewhat here by
|
||||
compacting allocations towards the low end of any region).
|
||||
|
||||
Additional details on the design goals/tradeoffs are in the
|
||||
[Regions Are Not Optional](regions-are-not-optional.md) design doc.
|
||||
|
||||
### `HugePageAwareAllocator` (putting it all together...)
|
||||
|
||||
This class houses the above components and routes between them, in addition to
|
||||
interfacing with the rest of TCMalloc (the above classes don’t need or use
|
||||
Spans, for instance). This is mostly straightforward; two points are worth
|
||||
discussing.
|
||||
|
||||
* How do we choose which sub-allocator for a given request?
|
||||
|
||||
We use a size-based policy.
|
||||
|
||||
1. Small allocations are handed directly to the filler; we add hugepages to
|
||||
the filler as needed.
|
||||
1. For slightly larger allocations (still under a full hugepage), we *try*
|
||||
the filler, but don’t grow it if there’s not currently space. Instead,
|
||||
we look in the regions for free space. If neither the regions nor the
|
||||
filler has space, we prefer growing the filler (since it comes in
|
||||
smaller chunks!) The reasoning here is that if our binary only has
|
||||
allocations of (say) ¾ a hugepage, we don’t want the filler to be giant
|
||||
but ¼ empty; but in a more reasonable binary where we can easily pack
|
||||
such allocations near smaller ones, we’d prefer to do so over using the
|
||||
region.
|
||||
1. Allocations that won’t fit in a hugepage are just given to the regions
|
||||
(or, for truly enormous ones, to `HugeAllocator` directly).
|
||||
|
||||
The changeover point between 1) and 2) is just a tuning decision (any choice
|
||||
would produce a usable binary). Half a hugepage was picked arbitrarily; this
|
||||
seems to work well.
|
||||
|
||||
* How do we handle backing?
|
||||
|
||||
Allocations from `HugeAllocator` or `HugeRegion` (some of the time) need to be
|
||||
backed; so do hugepages that grow the `HugePageFiller`. This isn’t free. Page
|
||||
heap allocation isn’t hugely expensive in practice, but it is under a lock and
|
||||
contention matters. We currently rely on access by the application to back
|
||||
memory, and assume returned memory has been backed.
|
||||
|
||||
For accounting purposes, we do a bit of tracking whether a given allocation is
|
||||
being fulfilled from previously-unbacked memory.
|
||||
|
||||
We do wire that information to the point we drop the pageheap lock; we then back
|
||||
it without producing lock contention. This made a noticeable performance
|
||||
difference when explicitly backing memory before returning it to the
|
||||
application.
|
||||
|
||||
## Notes
|
||||
|
||||
[^cutie]: Also the name of
|
||||
[this cutie](https://lh3.googleusercontent.com/VXENOSfqH1L84VMwLVAUA7JIqQh7TYH-IZHLBalvVVuMUeD3w5rOVHPsIp97nYEgmKpQoxsHO-lieGouheNmifA2X6tOPTBleTbQc_WCZIrI_roU2K37iiHg9go6omp2ys0Y7cxYc9c6EWNaCYtKG1dEPyyYLULUarCex4oqwt8KgRl95rd3yKXC6YQeW-TWkDpK786ZaAA3vKJXqT5E-ArPxQccyPH13EAmHrltKatqihC7L4Ym5IfP42u58IJwC5bRnKMczm2WwUfipGDEOvymf63mPNKmGMka50AQV4VGrE7hW_Ateb2roCTGISgZIooBSRwK0PMjqV9hBLP5DmUG4ITSV4FlOI5iWOyMSNZV6Gz5T2FgNez08Wdn98tsEsN4_lPcjdZXyJuHeVRKxAawDwjkbWP3aieXDckHY-bJMt0QfyDhPWzSOpTxTALcZiwoC069K9SrBDVKEKowJ2Zag7OlbpROhqbagM5Wuo_nn6O27yWXpihc8Lptt-Vo_e8kQZ4N2RReby3bxNPdRyv2L8BrDCIWBO-iFk7GcYRd9ox7HSD-7Y0yH1FtMP0FZKD5a2raVmabMQrolhsjc-AfYHgD3xBkNo-uTJ8YnFpqjpTdZz_1=w2170-h1446-no),
|
||||
the real reason for the choice.
|
||||
[^templated]: It will be possible, given recent improvements in constexpr usage,
|
||||
to eliminate this in followups.
|
||||
[^conditional]: Here we mean "requests to the pageheap as filtered through
|
||||
sampling, the central cache, etc"
|
||||
[^radioactive]: Well, no, this is false in our empirical data, but to first
|
||||
order.
|
||||
214
src/third_party/tcmalloc/dist/docs/tuning.md
vendored
Normal file
@ -0,0 +1,214 @@
|
||||
# Performance Tuning TCMalloc
|
||||
|
||||
## User-Accessible Controls
|
||||
|
||||
There are three user accessible controls that we can use to performance tune
|
||||
TCMalloc:
|
||||
|
||||
* The logical page size for TCMalloc (4KiB, 8KiB, 32KiB, 256KiB)
|
||||
* The per-thread or per-cpu cache sizes
|
||||
* The rate at which memory is released to the OS
|
||||
|
||||
None of these tuning parameters are clear wins, otherwise they would be the
|
||||
default. We'll discuss the advantages and disadvantages of changing them.
|
||||
|
||||
### The Logical Page Size for TCMalloc:
|
||||
|
||||
This is determined at compile time by linking in the appropriate version of
|
||||
TCMalloc. The page size indicates the unit in which TCMalloc manages memory. The
|
||||
default is in 8KiB chunks, there are larger options of 32KiB and 256KiB. There
|
||||
is also the 4KiB page size used by the small-but-slow allocator.
|
||||
|
||||
A smaller page size allows TCMalloc to provide memory to an application with
|
||||
less waste. Waste comes about through two issues:
|
||||
|
||||
* Left-over memory when rounding larger requests to the page size (eg a
|
||||
request for 62 KiB might get rounded to 64 KiB).
|
||||
* Pages of memory that are stuck because they have a single in use allocation
|
||||
on the page, and therefore cannot be repurposed to hold a different size of
|
||||
allocation.
|
||||
|
||||
The second of these points is worth elucidating. For small allocations TCMalloc
|
||||
will fit multiple objects onto a single page.
|
||||
|
||||
So if you request 512 bytes, then an entire page will be devoted to 512 byte
|
||||
objects. If the size of that page is 4KiB we get 8 objects, if the size of that
|
||||
page is 256KiB we get 512 objects. That page can only be used for 512 byte
|
||||
objects until all the objects on the page have been freed.
|
||||
|
||||
If you have 8 objects on a page, there's a reasonable chance that all 8 will
|
||||
become free at the same time, and we can repurpose the page for objects of a
|
||||
different size. If there's 512 objects on that page, then it is very unlikely
|
||||
that all the objects will become freed at the same time, so that page will
|
||||
probably never become entirely free and will probably hang around, potentially
|
||||
containing only a few in-use objects.
|
||||
|
||||
The consequence of this is that large pages tend to lead to a larger memory
|
||||
footprint. There's also the issue that if you want one object of a size, you
|
||||
need to allocate a whole page.
|
||||
|
||||
The advantages of managing objects using larger page sizes are:
|
||||
|
||||
* Objects of the same size are better clustered in memory. If you need 512 KiB
|
||||
of 8 byte objects, then that's two 256 KiB pages, or 128 x 4 KiB pages. If
|
||||
memory is largely backed by hugepages, then with large pages in the worst
|
||||
case we can map the entire demand with two large pages, whereas small pages
|
||||
could take up to 128 entries in the TLB.
|
||||
* There's a structure called the `PageMap` which enables TCMalloc to lookup
|
||||
information about any allocated memory. If we use large pages the pagemap
|
||||
needs fewer entries and can be much smaller. This makes it more likely that
|
||||
it is cache resident. However, sized delete substantially reduced the number
|
||||
of times that we need to consult the pagemap, so the benefit from larger
|
||||
pages is reduced.
|
||||
|
||||
**Suggestion:** The default of 8KiB page sizes is probably good enough for most
|
||||
applications. However, if an application has a heap measured in GiB it may be
|
||||
worth looking at using large page sizes.
|
||||
|
||||
**Suggestion:** Small-but-slow is *extremely* slow and should be used only where
|
||||
it is absolutely vital to minimize memory footprint over performance at all
|
||||
costs. Small-but-slow works by turning off and shrinking several of TCMalloc's
|
||||
caches, but this comes at a significant performance penalty.
|
||||
|
||||
**Note:** Size-classes are determined on a per-page-size basis. So changing the
|
||||
page size will implicitly change the size-classes used. Size-classes are
|
||||
selected to be memory-efficient for the applications using that page size. If an
|
||||
application changes page size, there may be a performance or memory impact from
|
||||
the different selection of size-classes.
|
||||
|
||||
### Per-thread/per-cpu Cache Sizes
|
||||
|
||||
The default is for TCMalloc to run in per-cpu mode as this is faster; however,
|
||||
there are a few applications which have not yet transitioned. The plan is to move
|
||||
these across at some point soon.
|
||||
|
||||
Increasing the size of the cache is an obvious way to improve performance. The
|
||||
larger the cache the less frequently memory needs to be fetched from the central
|
||||
caches. Returning memory from the cache is substantially faster than fetching
|
||||
from the central cache.
|
||||
|
||||
The size of the per-cpu caches is controlled by
|
||||
`tcmalloc::MallocExtension::SetMaxPerCpuCacheSize`. This controls the limit for
|
||||
each CPU, so the total amount of memory for application could be much larger
|
||||
than this. Memory on CPUs where the application is no longer able to run can be
|
||||
freed by calling `tcmalloc::MallocExtension::ReleaseCpuMemory`.
|
||||
|
||||
The heterogeneous per-cpu cache optimization in TCMalloc dynamically sizes
|
||||
per-cpu caches so as to balance the miss rate across all the active and
|
||||
populated caches. It shuffles and reassigns the capacity from lightly used
|
||||
caches to the heavily used caches, using miss rate as the proxy for their usage.
|
||||
When enabled, the heavily used per-cpu caches may steal capacity from lightly
|
||||
used caches and grow beyond the limit set by `tcmalloc_max_per_cpu_cache_size`
|
||||
flag. This optimization is enabled by default in TCMalloc.
|
||||
|
||||
Releasing memory held by unusable CPU caches is handled by
|
||||
`tcmalloc::MallocExtension::ProcessBackgroundActions`.
|
||||
|
||||
In contrast `tcmalloc::MallocExtension::SetMaxTotalThreadCacheBytes` controls
|
||||
the *total* size of all thread caches in the application.
|
||||
|
||||
**Suggestion:** The default cache size is typically sufficient, but cache size
|
||||
can be increased (or decreased) depending on the amount of time spent in
|
||||
TCMalloc code, and depending on the overall size of the application (a larger
|
||||
application can afford to cache more memory without noticeably increasing its
|
||||
overall size).
|
||||
|
||||
### Memory Releasing
|
||||
|
||||
`tcmalloc::MallocExtension::ReleaseMemoryToSystem` makes a request to release
|
||||
`n` bytes of memory to TCMalloc. This can keep the memory footprint of the
|
||||
application down to a minimal amount, however it should be considered that this
|
||||
just reduces the application down from its peak memory footprint over time, and
|
||||
does not make that peak memory footprint smaller.
|
||||
|
||||
Using a background thread running
|
||||
`tcmalloc::MallocExtension::ProcessBackgroundActions()`, memory will be released
|
||||
from the page heap at the specified rate.
|
||||
|
||||
There are two disadvantages of releasing memory aggressively:
|
||||
|
||||
* Memory that is unmapped may be immediately needed, and there is a cost to
|
||||
faulting unmapped memory back into the application.
|
||||
* Memory that is unmapped at small granularity will break up hugepages, and
|
||||
this will cause some performance loss due to increased TLB misses.
|
||||
|
||||
**Note:** Release rate is not a panacea for memory usage. Jobs should be
|
||||
provisioned for peak memory usage to avoid OOM errors. Setting a release rate
|
||||
may enable an application to exceed the memory limit for short periods of time
|
||||
without triggering an OOM. A release rate is also a good citizen behavior as it
|
||||
will enable the system to use spare capacity memory for applications which are
|
||||
under-provisioned. However, it is not a substitute for setting appropriate
|
||||
memory requirements for the job.
|
||||
|
||||
**Note:** Memory is released from the `PageHeap` and stranded per-cpu caches. It
|
||||
is not possible to release memory from other internal structures, like the
|
||||
`CentralFreeList`.
|
||||
|
||||
**Suggestion:** The default release rate is probably appropriate for most
|
||||
applications. In situations where it is tempting to set a faster rate it is
|
||||
worth considering why there are memory spikes, since those spikes are likely to
|
||||
cause an OOM at some point.
|
||||
|
||||
## System-Level Optimizations
|
||||
|
||||
* TCMalloc heavily relies on Transparent Huge Pages (THP). As of February
|
||||
2020, we build and test with
|
||||
|
||||
```
|
||||
/sys/kernel/mm/transparent_hugepage/enabled:
|
||||
[always] madvise never
|
||||
|
||||
/sys/kernel/mm/transparent_hugepage/defrag:
|
||||
always defer [defer+madvise] madvise never
|
||||
|
||||
/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none:
|
||||
0
|
||||
```
|
||||
|
||||
* TCMalloc makes assumptions about the availability of virtual address space,
|
||||
so that we can lay out allocations in certain ways. We build and test with
|
||||
|
||||
```
|
||||
/proc/sys/vm/overcommit_memory:
|
||||
1
|
||||
```
|
||||
|
||||
## Build-Time Optimizations
|
||||
|
||||
TCMalloc is built and tested in certain ways. These build-time options can
|
||||
improve performance:
|
||||
|
||||
* Statically-linking TCMalloc reduces function call overhead, by obviating the
|
||||
need to call procedure linkage stubs in the procedure linkage table (PLT).
|
||||
* Enabling
|
||||
[sized deallocation from C++14](http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2013/n3778.html)
|
||||
reduces deallocation costs when the size can be determined. Sized
|
||||
deallocation is enabled with the `-fsized-deallocation` flag. This behavior
|
||||
is enabled by default in GCC, but as of early 2020, is not enabled by
|
||||
default on Clang even when compiling for C++14/C++17.
|
||||
|
||||
Some standard C++ libraries (such as
|
||||
[libc++](https://reviews.llvm.org/rCXX345214)) will take advantage of sized
|
||||
deallocation for their allocators as well, improving deallocation
|
||||
performance in C++ containers.
|
||||
|
||||
* Aligning raw storage allocated with `::operator new` to 8 bytes by compiling
|
||||
with `__STDCPP_DEFAULT_NEW_ALIGNMENT__ <= 8`. This smaller alignment
|
||||
minimizes wasted memory for many common allocation sizes (24, 40, etc.)
|
||||
which are otherwise rounded up to a multiple of 16 bytes. On many compilers,
|
||||
this behavior is controlled by the `-fnew-alignment=...` flag.
|
||||
|
||||
When `__STDCPP_DEFAULT_NEW_ALIGNMENT__` is not specified (or is larger than
|
||||
8 bytes), we use standard 16 byte alignments for `::operator new`. However,
|
||||
for allocations under 16 bytes, we may return an object with a lower
|
||||
alignment, as no object with a larger alignment requirement can be allocated
|
||||
in the space.
|
||||
|
||||
* Optimizing failures of `operator new` by directly failing instead of
|
||||
throwing exceptions. Because TCMalloc does not throw exceptions when
|
||||
`operator new` fails, this can be used as a performance optimization for
|
||||
many move constructors.
|
||||
|
||||
Within Abseil code, these direct allocation failures are enabled with the
|
||||
Abseil build-time configuration macro
|
||||
[`ABSL_ALLOCATOR_NOTHROW`](https://abseil.io/docs/cpp/guides/base#abseil-exception-policy).
|
||||
6
src/third_party/tcmalloc/dist/tcmalloc/.clang-format
vendored
Normal file
@ -0,0 +1,6 @@
|
||||
---
|
||||
Language: Cpp
|
||||
BasedOnStyle: Google
|
||||
DerivePointerAlignment: false
|
||||
PointerAlignment: Left
|
||||
...
|
||||
5
src/third_party/tcmalloc/dist/tcmalloc/.github/CODEOWNERS
vendored
Normal file
@ -0,0 +1,5 @@
|
||||
# Default owners
|
||||
* @ckennelly
|
||||
|
||||
# Documentation
|
||||
docs/* @manshreck
|
||||
63
src/third_party/tcmalloc/dist/tcmalloc/.github/workflows/ci.yml
vendored
Normal file
@ -0,0 +1,63 @@
|
||||
# Copyright 2022 The TCMalloc Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# https://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
name: ci
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
|
||||
pull_request:
|
||||
|
||||
jobs:
|
||||
Linux:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
compiler:
|
||||
- g++
|
||||
- clang++
|
||||
|
||||
name: "Build/Test ${{matrix.compiler}}"
|
||||
steps:
|
||||
- name: Cancel previous
|
||||
uses: styfle/cancel-workflow-action@0.8.0
|
||||
with:
|
||||
access_token: ${{ github.token }}
|
||||
|
||||
- name: Prepare
|
||||
run: |
|
||||
sudo apt-get update -qq
|
||||
sudo apt install -y g++ clang
|
||||
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Create Cache Timestamp
|
||||
id: cache_timestamp
|
||||
uses: nanzm/get-time-action@v1.1
|
||||
with:
|
||||
format: 'YYYY-MM-DD-HH-mm-ss'
|
||||
|
||||
- name: Mount bazel cache
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: "/home/runner/.cache/bazel"
|
||||
key: bazelcache_${{matrix.compiler}}_${{ steps.cache_timestamp.outputs.time }}
|
||||
restore-keys: bazelcache_${{matrix.compiler}}_
|
||||
|
||||
- name: Tests
|
||||
run: CXX=${{matrix.compiler}} bazel test --test_output=errors //...
|
||||
1571
src/third_party/tcmalloc/dist/tcmalloc/BUILD
vendored
Normal file
52
src/third_party/tcmalloc/dist/tcmalloc/allocation_sample.cc
vendored
Normal file
@ -0,0 +1,52 @@
|
||||
// Copyright 2022 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "tcmalloc/allocation_sample.h"
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "absl/time/clock.h"
|
||||
|
||||
GOOGLE_MALLOC_SECTION_BEGIN
|
||||
namespace tcmalloc::tcmalloc_internal {
|
||||
|
||||
// Builds a sample that records allocation stack traces and registers it with
// `list`. `start` is kept so that Stop() can compute the profile duration.
AllocationSample::AllocationSample(AllocationSampleList* list, absl::Time start)
    : list_(list),
      mallocs_(std::make_unique<StackTraceTable>(ProfileType::kAllocations)),
      start_(start) {
  // Register only once the trace table exists: the list writes into
  // `mallocs_` of every registered sample when a malloc is reported.
  list_->Add(this);
}
|
||||
|
||||
// Unregisters the sample from its list unless Stop() has already done so.
AllocationSample::~AllocationSample() {
  // Stop() moves `mallocs_` out after unregistering, so a non-null table means
  // the token was destroyed before the profile was ended; unregister on the
  // owner's behalf.
  if (mallocs_ != nullptr) {
    list_->Remove(this);
  }
}
|
||||
|
||||
// Ends the profile and hands the collected traces back as a Profile.
Profile AllocationSample::Stop() && {
  if (mallocs_ != nullptr) {
    // Unregister before touching `mallocs_`: a concurrent
    // AllocationSampleList::ReportMalloc may keep writing to the table for as
    // long as this sample is still reachable through list_.
    list_->Remove(this);

    const auto elapsed = absl::Now() - start_;
    mallocs_->SetDuration(elapsed);
  }
  return ProfileAccessor::MakeProfile(std::move(mallocs_));
}
|
||||
|
||||
} // namespace tcmalloc::tcmalloc_internal
|
||||
GOOGLE_MALLOC_SECTION_END
|
||||
87
src/third_party/tcmalloc/dist/tcmalloc/allocation_sample.h
vendored
Normal file
@ -0,0 +1,87 @@
|
||||
// Copyright 2022 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef TCMALLOC_ALLOCATION_SAMPLE_H_
|
||||
#define TCMALLOC_ALLOCATION_SAMPLE_H_
|
||||
|
||||
#include "absl/base/dynamic_annotations.h"
|
||||
#include "absl/base/internal/spinlock.h"
|
||||
#include "absl/time/time.h"
|
||||
#include "tcmalloc/internal/logging.h"
|
||||
#include "tcmalloc/stack_trace_table.h"
|
||||
|
||||
GOOGLE_MALLOC_SECTION_BEGIN
|
||||
namespace tcmalloc::tcmalloc_internal {
|
||||
|
||||
class AllocationSampleList;
|
||||
|
||||
class AllocationSample final : public AllocationProfilingTokenBase {
|
||||
public:
|
||||
AllocationSample(AllocationSampleList* list, absl::Time start);
|
||||
~AllocationSample() override;
|
||||
|
||||
Profile Stop() && override;
|
||||
|
||||
private:
|
||||
AllocationSampleList* list_;
|
||||
std::unique_ptr<StackTraceTable> mallocs_;
|
||||
absl::Time start_;
|
||||
AllocationSample* next_ = nullptr;
|
||||
friend class AllocationSampleList;
|
||||
};
|
||||
|
||||
class AllocationSampleList {
|
||||
public:
|
||||
constexpr AllocationSampleList() = default;
|
||||
|
||||
void Add(AllocationSample* as) {
|
||||
absl::base_internal::SpinLockHolder h(&lock_);
|
||||
as->next_ = first_;
|
||||
first_ = as;
|
||||
}
|
||||
|
||||
// This list is very short and we're nowhere near a hot path, just walk
|
||||
void Remove(AllocationSample* as) {
|
||||
absl::base_internal::SpinLockHolder h(&lock_);
|
||||
AllocationSample** link = &first_;
|
||||
AllocationSample* cur = first_;
|
||||
while (cur != as) {
|
||||
CHECK_CONDITION(cur != nullptr);
|
||||
link = &cur->next_;
|
||||
cur = cur->next_;
|
||||
}
|
||||
*link = as->next_;
|
||||
}
|
||||
|
||||
void ReportMalloc(const struct StackTrace& sample) {
|
||||
absl::base_internal::SpinLockHolder h(&lock_);
|
||||
AllocationSample* cur = first_;
|
||||
while (cur != nullptr) {
|
||||
cur->mallocs_->AddTrace(1.0, sample);
|
||||
cur = cur->next_;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
// Guard against any concurrent modifications on the list of allocation
|
||||
// samples. Invoking `new` while holding this lock can lead to deadlock.
|
||||
absl::base_internal::SpinLock lock_{
|
||||
absl::kConstInit, absl::base_internal::SCHEDULE_KERNEL_ONLY};
|
||||
AllocationSample* first_ ABSL_GUARDED_BY(lock_) = nullptr;
|
||||
};
|
||||
|
||||
} // namespace tcmalloc::tcmalloc_internal
|
||||
GOOGLE_MALLOC_SECTION_END
|
||||
|
||||
#endif // TCMALLOC_ALLOCATION_SAMPLE_H_
|
||||
132
src/third_party/tcmalloc/dist/tcmalloc/allocation_sample_test.cc
vendored
Normal file
@ -0,0 +1,132 @@
|
||||
// Copyright 2022 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "tcmalloc/allocation_sample.h"
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "absl/base/thread_annotations.h"
|
||||
#include "absl/random/bit_gen_ref.h"
|
||||
#include "absl/random/random.h"
|
||||
#include "absl/synchronization/mutex.h"
|
||||
#include "absl/time/clock.h"
|
||||
#include "absl/time/time.h"
|
||||
#include "tcmalloc/internal/logging.h"
|
||||
#include "tcmalloc/static_vars.h"
|
||||
#include "tcmalloc/testing/thread_manager.h"
|
||||
|
||||
namespace tcmalloc::tcmalloc_internal {
|
||||
namespace {
|
||||
|
||||
// Stress test: several threads randomly create, destroy and Stop() samples
// while others report allocations to the shared list.
TEST(AllocationSample, Threaded) {
  // StackTraceTable relies on a global allocator, which must be initialized
  // before any sample is created.
  tc_globals.InitIfNecessary();

  // This test exercises b/143623146 by ensuring that the state of the sample is
  // not modified before it is removed from the linked list.
  AllocationSampleList list;

  const int kThreads = 5;
  const int kMaxSamplers = 3;
  const int kMaxAllocations = 100;
  ThreadManager m;
  std::vector<absl::BitGen> thread_states(kThreads);

  // Test-side bookkeeping of the live samplers. It has its own mutex so that
  // the test does not add external synchronization around `list`.
  struct GlobalState {
    absl::Mutex mu;
    std::vector<std::unique_ptr<AllocationSample>> samplers ABSL_GUARDED_BY(mu);
  } global;

  // Takes one sampler out of `global.samplers`, or returns a null pointer when
  // there is none.
  auto PopSample = [&](absl::BitGenRef rng) {
    std::unique_ptr<AllocationSample> ret;

    absl::MutexLock lock(&global.mu);
    if (!global.samplers.empty()) {
      const size_t index =
          absl::Uniform<size_t>(rng, 0, global.samplers.size() - 1u);
      // Move the chosen element to the back so it can be popped cheaply.
      std::swap(global.samplers[index], global.samplers.back());
      ret = std::move(global.samplers.back());
      global.samplers.pop_back();

      CHECK_CONDITION(ret != nullptr);
    }
    return ret;
  };

  m.Start(kThreads, [&](int thread) {
    auto& state = thread_states[thread];
    const double coin = absl::Uniform(state, 0., 1.0);

    if (coin < 0.1) {
      // Create a sampler; the AllocationSample constructor registers it with
      // `list` implicitly.
      auto sampler = std::make_unique<AllocationSample>(&list, absl::Now());

      {
        absl::MutexLock lock(&global.mu);
        if (global.samplers.size() < kMaxSamplers) {
          // Keep it alive in the bookkeeping vector.
          global.samplers.push_back(std::move(sampler));
        }
      }

      // A sampler that was not stored unregisters itself in
      // ~AllocationSample when `sampler` goes out of scope here.
    } else if (coin < 0.2) {
      // Drop a sampler and let its destructor handle unregistering.
      std::unique_ptr<AllocationSample> sampler = PopSample(state);
      sampler.reset();
    } else if (coin < 0.25) {
      // Occasionally end a profile explicitly through Stop().
      std::unique_ptr<AllocationSample> sampler = PopSample(state);
      if (sampler) {
        std::move(*sampler).Stop();
      }
    } else {
      int allocations;
      {
        // StackTraceTable uses a global allocator rather than an injected one,
        // so ask the global state how many allocations are currently active.
        absl::base_internal::SpinLockHolder h(&pageheap_lock);
        allocations = tc_globals.linked_sample_allocator().stats().in_use;
      }

      // Report another allocation only while below the cap.
      if (allocations < kMaxAllocations) {
        StackTrace s{};
        s.requested_size = 16;
        s.allocated_size = 32;
        list.ReportMalloc(s);
      }
    }
  });

  absl::SleepFor(absl::Milliseconds(1));

  m.Stop();
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tcmalloc::tcmalloc_internal
|
||||
383
src/third_party/tcmalloc/dist/tcmalloc/allocation_sampling.h
vendored
Normal file
@ -0,0 +1,383 @@
|
||||
// Copyright 2022 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef TCMALLOC_ALLOCATION_SAMPLING_H_
|
||||
#define TCMALLOC_ALLOCATION_SAMPLING_H_
|
||||
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
#include "tcmalloc/cpu_cache.h"
|
||||
#include "tcmalloc/guarded_page_allocator.h"
|
||||
#include "tcmalloc/internal/logging.h"
|
||||
#include "tcmalloc/malloc_extension.h"
|
||||
#include "tcmalloc/pagemap.h"
|
||||
#include "tcmalloc/sampler.h"
|
||||
#include "tcmalloc/span.h"
|
||||
#include "tcmalloc/stack_trace_table.h"
|
||||
#include "tcmalloc/tcmalloc_policy.h"
|
||||
#include "tcmalloc/thread_cache.h"
|
||||
|
||||
GOOGLE_MALLOC_SECTION_BEGIN
|
||||
namespace tcmalloc::tcmalloc_internal {
|
||||
|
||||
// This function computes a profile that maps a live stack trace to
|
||||
// the number of bytes of central-cache memory pinned by an allocation
|
||||
// at that stack trace.
|
||||
// In the case when span is hosting >= 1 number of small objects (t.proxy !=
|
||||
// nullptr), we call span::Fragmentation() and read `span->allocated_`. It is
|
||||
// safe to do so since we hold the per-sample lock while iterating over sampled
|
||||
// allocations. It prevents the sampled allocation that has the proxy object to
|
||||
// complete deallocation, thus `proxy` can not be returned to the span yet. It
|
||||
// thus prevents the central free list to return the span to the page heap.
|
||||
template <typename State>
|
||||
static std::unique_ptr<const ProfileBase> DumpFragmentationProfile(
|
||||
State& state) {
|
||||
auto profile = std::make_unique<StackTraceTable>(ProfileType::kFragmentation);
|
||||
state.sampled_allocation_recorder().Iterate(
|
||||
[&state, &profile](const SampledAllocation& sampled_allocation) {
|
||||
// Compute fragmentation to charge to this sample:
|
||||
const StackTrace& t = sampled_allocation.sampled_stack;
|
||||
if (t.proxy == nullptr) {
|
||||
// There is just one object per-span, and neighboring spans
|
||||
// can be released back to the system, so we charge no
|
||||
// fragmentation to this sampled object.
|
||||
return;
|
||||
}
|
||||
|
||||
// Fetch the span on which the proxy lives so we can examine its
|
||||
// co-residents.
|
||||
const PageId p = PageIdContaining(t.proxy);
|
||||
Span* span = state.pagemap().GetDescriptor(p);
|
||||
if (span == nullptr) {
|
||||
// Avoid crashes in production mode code, but report in tests.
|
||||
ASSERT(span != nullptr);
|
||||
return;
|
||||
}
|
||||
|
||||
const double frag = span->Fragmentation(t.allocated_size);
|
||||
if (frag > 0) {
|
||||
// Associate the memory warmth with the actual object, not the proxy.
|
||||
// The residency information (t.span_start_address) is likely not very
|
||||
// useful, but we might as well pass it along.
|
||||
profile->AddTrace(frag, t);
|
||||
}
|
||||
});
|
||||
return profile;
|
||||
}
|
||||
|
||||
template <typename State>
|
||||
static std::unique_ptr<const ProfileBase> DumpHeapProfile(State& state) {
|
||||
auto profile = std::make_unique<StackTraceTable>(ProfileType::kHeap);
|
||||
state.sampled_allocation_recorder().Iterate(
|
||||
[&](const SampledAllocation& sampled_allocation) {
|
||||
profile->AddTrace(1.0, sampled_allocation.sampled_stack);
|
||||
});
|
||||
return profile;
|
||||
}
|
||||
|
||||
ABSL_CONST_INIT static thread_local Sampler thread_sampler_
|
||||
ABSL_ATTRIBUTE_INITIAL_EXEC;
|
||||
|
||||
inline Sampler* GetThreadSampler() { return &thread_sampler_; }
|
||||
|
||||
// Tells whether `guarded_status` permits attempting a guarded allocation.
// Only Requested, Required and Guarded do; every other status is a reason not
// to guard.
inline bool ShouldGuardingBeAttempted(
    Profile::Sample::GuardedStatus guarded_status) {
  // The switch names every enumerator explicitly instead of using `default`.
  switch (guarded_status) {
    case Profile::Sample::GuardedStatus::Requested:
    case Profile::Sample::GuardedStatus::Required:
    case Profile::Sample::GuardedStatus::Guarded:
      return true;
    case Profile::Sample::GuardedStatus::LargerThanOnePage:
    case Profile::Sample::GuardedStatus::Disabled:
    case Profile::Sample::GuardedStatus::RateLimited:
    case Profile::Sample::GuardedStatus::TooSmall:
    case Profile::Sample::GuardedStatus::NoAvailableSlots:
    case Profile::Sample::GuardedStatus::MProtectFailed:
    case Profile::Sample::GuardedStatus::Filtered:
    case Profile::Sample::GuardedStatus::Unknown:
    case Profile::Sample::GuardedStatus::NotAttempted:
      return false;
  }
  // Reached only for a value that matches none of the enumerators above.
  return false;
}
|
||||
|
||||
// If this allocation can be guarded, and if it's time to do a guarded sample,
|
||||
// returns a guarded allocation Span. Otherwise returns nullptr.
|
||||
template <typename State>
|
||||
static GuardedPageAllocator::AllocWithStatus TrySampleGuardedAllocation(
|
||||
State& state, size_t size, size_t alignment, Length num_pages) {
|
||||
if (num_pages != Length(1)) {
|
||||
return {nullptr, Profile::Sample::GuardedStatus::LargerThanOnePage};
|
||||
}
|
||||
Profile::Sample::GuardedStatus guarded_status =
|
||||
GetThreadSampler()->ShouldSampleGuardedAllocation();
|
||||
// If there is a reason not to guard, then return.
|
||||
if (!ShouldGuardingBeAttempted(guarded_status)) {
|
||||
return {nullptr, guarded_status};
|
||||
}
|
||||
// The num_pages == 1 constraint ensures that size <= kPageSize. And
|
||||
// since alignments above kPageSize cause size_class == 0, we're also
|
||||
// guaranteed alignment <= kPageSize
|
||||
//
|
||||
// In all cases kPageSize <= GPA::page_size_, so Allocate's preconditions
|
||||
// are met.
|
||||
return state.guardedpage_allocator().Allocate(size, alignment);
|
||||
}
|
||||
|
||||
// ShouldSampleAllocation() is called when an allocation of the given requested
|
||||
// size is in progress. It returns the sampling weight of the allocation if it
|
||||
// should be "sampled," and 0 otherwise. See SampleifyAllocation().
|
||||
//
|
||||
// Sampling is done based on requested sizes and later unskewed during profile
|
||||
// generation.
|
||||
inline size_t ShouldSampleAllocation(size_t size) {
|
||||
return GetThreadSampler()->RecordAllocation(size);
|
||||
}
|
||||
|
||||
template <typename State>
|
||||
ABSL_ATTRIBUTE_NOINLINE static inline void FreeProxyObject(State& state,
|
||||
void* ptr,
|
||||
size_t size_class) {
|
||||
if (ABSL_PREDICT_TRUE(UsePerCpuCache(state))) {
|
||||
state.cpu_cache().Deallocate(ptr, size_class);
|
||||
} else if (ThreadCache* cache = ThreadCache::GetCacheIfPresent();
|
||||
ABSL_PREDICT_TRUE(cache)) {
|
||||
cache->Deallocate(ptr, size_class);
|
||||
} else {
|
||||
// This thread doesn't have thread-cache yet or already. Delete directly
|
||||
// into transfer cache.
|
||||
state.transfer_cache().InsertRange(size_class, absl::Span<void*>(&ptr, 1));
|
||||
}
|
||||
}
|
||||
|
||||
// Performs sampling for already occurred allocation of object.
|
||||
//
|
||||
// For very small object sizes, object is used as 'proxy' and full
|
||||
// page with sampled marked is allocated instead.
|
||||
//
|
||||
// For medium-sized objects that have single instance per span,
|
||||
// they're simply freed and fresh page span is allocated to represent
|
||||
// sampling.
|
||||
//
|
||||
// For large objects (i.e. allocated with do_malloc_pages) they are
|
||||
// also fully reused and their span is marked as sampled.
|
||||
//
|
||||
// Note that do_free_with_size assumes sampled objects have
|
||||
// page-aligned addresses. Please change both functions if need to
|
||||
// invalidate the assumption.
|
||||
//
|
||||
// Note that size_class might not match requested_size in case of
|
||||
// memalign. I.e. when larger than requested allocation is done to
|
||||
// satisfy alignment constraint.
|
||||
//
|
||||
// In case of out-of-memory condition when allocating span or
|
||||
// stacktrace struct, this function simply cheats and returns original
|
||||
// object. As if no sampling was requested.
|
||||
template <typename State, typename Policy>
|
||||
static void* SampleifyAllocation(State& state, Policy policy,
|
||||
size_t requested_size, size_t weight,
|
||||
size_t size_class, void* obj, Span* span,
|
||||
size_t* capacity) {
|
||||
CHECK_CONDITION((size_class != 0 && obj != nullptr && span == nullptr) ||
|
||||
(size_class == 0 && obj == nullptr && span != nullptr));
|
||||
|
||||
StackTrace stack_trace;
|
||||
stack_trace.proxy = nullptr;
|
||||
stack_trace.requested_size = requested_size;
|
||||
// Grab the stack trace outside the heap lock.
|
||||
stack_trace.depth = absl::GetStackTrace(stack_trace.stack, kMaxStackDepth, 0);
|
||||
|
||||
// requested_alignment = 1 means 'small size table alignment was used'
|
||||
// Historically this is reported as requested_alignment = 0
|
||||
stack_trace.requested_alignment = policy.align();
|
||||
if (stack_trace.requested_alignment == 1) {
|
||||
stack_trace.requested_alignment = 0;
|
||||
}
|
||||
|
||||
stack_trace.requested_size_returning = capacity != nullptr;
|
||||
stack_trace.access_hint = static_cast<uint8_t>(policy.access());
|
||||
stack_trace.weight = weight;
|
||||
|
||||
GuardedPageAllocator::AllocWithStatus alloc_with_status{
|
||||
nullptr, Profile::Sample::GuardedStatus::NotAttempted};
|
||||
|
||||
if (size_class != 0) {
|
||||
ASSERT(size_class == state.pagemap().sizeclass(PageIdContaining(obj)));
|
||||
|
||||
stack_trace.allocated_size = state.sizemap().class_to_size(size_class);
|
||||
stack_trace.cold_allocated = IsExpandedSizeClass(size_class);
|
||||
|
||||
// If the caller didn't provide a span, allocate one:
|
||||
Length num_pages = BytesToLengthCeil(stack_trace.allocated_size);
|
||||
alloc_with_status = TrySampleGuardedAllocation(
|
||||
state, requested_size, stack_trace.requested_alignment, num_pages);
|
||||
if (alloc_with_status.status == Profile::Sample::GuardedStatus::Guarded) {
|
||||
ASSERT(IsSampledMemory(alloc_with_status.alloc));
|
||||
const PageId p = PageIdContaining(alloc_with_status.alloc);
|
||||
absl::base_internal::SpinLockHolder h(&pageheap_lock);
|
||||
span = Span::New(p, num_pages);
|
||||
state.pagemap().Set(p, span);
|
||||
// If we report capacity back from a size returning allocation, we can not
|
||||
// report the allocated_size, as we guard the size to 'requested_size',
|
||||
// and we maintain the invariant that GetAllocatedSize() must match the
|
||||
// returned size from size returning allocations. So in that case, we
|
||||
// report the requested size for both capacity and GetAllocatedSize().
|
||||
if (capacity) stack_trace.allocated_size = requested_size;
|
||||
} else if ((span = state.page_allocator().New(
|
||||
num_pages, 1, MemoryTag::kSampled)) == nullptr) {
|
||||
if (capacity) *capacity = stack_trace.allocated_size;
|
||||
return obj;
|
||||
}
|
||||
|
||||
size_t span_size =
|
||||
Length(state.sizemap().class_to_pages(size_class)).in_bytes();
|
||||
size_t objects_per_span = span_size / stack_trace.allocated_size;
|
||||
|
||||
if (objects_per_span != 1) {
|
||||
ASSERT(objects_per_span > 1);
|
||||
stack_trace.proxy = obj;
|
||||
obj = nullptr;
|
||||
}
|
||||
} else {
|
||||
// Set allocated_size to the exact size for a page allocation.
|
||||
// NOTE: if we introduce gwp-asan sampling / guarded allocations
|
||||
// for page allocations, then we need to revisit do_malloc_pages as
|
||||
// the current assumption is that only class sized allocs are sampled
|
||||
// for gwp-asan.
|
||||
stack_trace.allocated_size = span->bytes_in_span();
|
||||
stack_trace.cold_allocated = IsColdMemory(span->start_address());
|
||||
}
|
||||
if (capacity) *capacity = stack_trace.allocated_size;
|
||||
|
||||
ASSERT(span != nullptr);
|
||||
|
||||
stack_trace.sampled_alloc_handle =
|
||||
state.sampled_alloc_handle_generator.fetch_add(
|
||||
1, std::memory_order_relaxed) +
|
||||
1;
|
||||
stack_trace.span_start_address = span->start_address();
|
||||
stack_trace.allocation_time = absl::Now();
|
||||
stack_trace.guarded_status = static_cast<int>(alloc_with_status.status);
|
||||
|
||||
// How many allocations does this sample represent, given the sampling
|
||||
// frequency (weight) and its size.
|
||||
const double allocation_estimate =
|
||||
static_cast<double>(weight) / (requested_size + 1);
|
||||
|
||||
// Adjust our estimate of internal fragmentation.
|
||||
ASSERT(requested_size <= stack_trace.allocated_size);
|
||||
if (requested_size < stack_trace.allocated_size) {
|
||||
state.sampled_internal_fragmentation_.Add(
|
||||
allocation_estimate * (stack_trace.allocated_size - requested_size));
|
||||
}
|
||||
|
||||
state.allocation_samples.ReportMalloc(stack_trace);
|
||||
|
||||
state.deallocation_samples.ReportMalloc(stack_trace);
|
||||
|
||||
// The SampledAllocation object is visible to readers after this. Readers only
|
||||
// care about its various metadata (e.g. stack trace, weight) to generate the
|
||||
// heap profile, and won't need any information from Span::Sample() next.
|
||||
SampledAllocation* sampled_allocation =
|
||||
state.sampled_allocation_recorder().Register(std::move(stack_trace));
|
||||
// No pageheap_lock required. The span is freshly allocated and no one else
|
||||
// can access it. It is visible after we return from this allocation path.
|
||||
span->Sample(sampled_allocation);
|
||||
|
||||
state.peak_heap_tracker().MaybeSaveSample();
|
||||
|
||||
if (obj != nullptr) {
|
||||
// We are not maintaining precise statistics on malloc hit/miss rates at our
|
||||
// cache tiers. We can deallocate into our ordinary cache.
|
||||
ASSERT(size_class != 0);
|
||||
FreeProxyObject(state, obj, size_class);
|
||||
}
|
||||
return (alloc_with_status.alloc != nullptr) ? alloc_with_status.alloc
|
||||
: span->start_address();
|
||||
}
|
||||
|
||||
template <typename State>
|
||||
inline void MaybeUnsampleAllocation(State& state, void* ptr, Span* span) {
|
||||
// No pageheap_lock required. The sampled span should be unmarked and have its
|
||||
// state cleared only once. External synchronization when freeing is required;
|
||||
// otherwise, concurrent writes here would likely report a double-free.
|
||||
if (SampledAllocation* sampled_allocation = span->Unsample()) {
|
||||
void* const proxy = sampled_allocation->sampled_stack.proxy;
|
||||
const size_t weight = sampled_allocation->sampled_stack.weight;
|
||||
const size_t requested_size =
|
||||
sampled_allocation->sampled_stack.requested_size;
|
||||
const size_t allocated_size =
|
||||
sampled_allocation->sampled_stack.allocated_size;
|
||||
const size_t alignment =
|
||||
sampled_allocation->sampled_stack.requested_alignment;
|
||||
// How many allocations does this sample represent, given the sampling
|
||||
// frequency (weight) and its size.
|
||||
const double allocation_estimate =
|
||||
static_cast<double>(weight) / (requested_size + 1);
|
||||
AllocHandle sampled_alloc_handle =
|
||||
sampled_allocation->sampled_stack.sampled_alloc_handle;
|
||||
state.sampled_allocation_recorder().Unregister(sampled_allocation);
|
||||
|
||||
// Adjust our estimate of internal fragmentation.
|
||||
ASSERT(requested_size <= allocated_size);
|
||||
if (requested_size < allocated_size) {
|
||||
const size_t sampled_fragmentation =
|
||||
allocation_estimate * (allocated_size - requested_size);
|
||||
|
||||
// Check against wraparound
|
||||
ASSERT(state.sampled_internal_fragmentation_.value() >=
|
||||
sampled_fragmentation);
|
||||
state.sampled_internal_fragmentation_.Add(-sampled_fragmentation);
|
||||
}
|
||||
|
||||
state.deallocation_samples.ReportFree(sampled_alloc_handle);
|
||||
|
||||
if (proxy) {
|
||||
const auto policy = CppPolicy().InSameNumaPartitionAs(proxy);
|
||||
size_t size_class;
|
||||
if (AccessFromPointer(proxy) == AllocationAccess::kCold) {
|
||||
size_class = state.sizemap().SizeClass(
|
||||
policy.AccessAsCold().AlignAs(alignment), allocated_size);
|
||||
} else {
|
||||
size_class = state.sizemap().SizeClass(
|
||||
policy.AccessAsHot().AlignAs(alignment), allocated_size);
|
||||
}
|
||||
ASSERT(size_class == state.pagemap().sizeclass(PageIdContaining(proxy)));
|
||||
FreeProxyObject(state, proxy, size_class);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename State, typename Policy, typename CapacityPtr>
|
||||
static void* SampleLargeAllocation(State& state, Policy policy,
|
||||
size_t requested_size, size_t weight,
|
||||
Span* span, CapacityPtr capacity) {
|
||||
return SampleifyAllocation(state, policy, requested_size, weight, 0, nullptr,
|
||||
span, capacity);
|
||||
}
|
||||
|
||||
// Samples a size-class allocation `obj`. Forwards to SampleifyAllocation
// without a span, which is the argument shape that function expects when a
// non-zero size class and an object are supplied.
template <typename State, typename Policy, typename CapacityPtr>
static void* SampleSmallAllocation(State& state, Policy policy,
                                   size_t requested_size, size_t weight,
                                   size_t size_class, void* obj,
                                   CapacityPtr capacity) {
  return SampleifyAllocation(state, policy, requested_size, weight, size_class,
                             obj, /*span=*/nullptr, capacity);
}
|
||||
|
||||
} // namespace tcmalloc::tcmalloc_internal
|
||||
GOOGLE_MALLOC_SECTION_END
|
||||
|
||||
#endif // TCMALLOC_ALLOCATION_SAMPLING_H_
|
||||
85
src/third_party/tcmalloc/dist/tcmalloc/arena.cc
vendored
Normal file
@ -0,0 +1,85 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "tcmalloc/arena.h"
|
||||
|
||||
#include <new>
|
||||
|
||||
#include "tcmalloc/internal/logging.h"
|
||||
#include "tcmalloc/static_vars.h"
|
||||
#include "tcmalloc/system-alloc.h"
|
||||
|
||||
GOOGLE_MALLOC_SECTION_BEGIN
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
|
||||
void* Arena::Alloc(size_t bytes, std::align_val_t alignment) {
|
||||
size_t align = static_cast<size_t>(alignment);
|
||||
ASSERT(align > 0);
|
||||
{ // First we need to move up to the correct alignment.
|
||||
const int misalignment = reinterpret_cast<uintptr_t>(free_area_) % align;
|
||||
const int alignment_bytes = misalignment != 0 ? align - misalignment : 0;
|
||||
free_area_ += alignment_bytes;
|
||||
free_avail_ -= alignment_bytes;
|
||||
bytes_allocated_ += alignment_bytes;
|
||||
}
|
||||
char* result;
|
||||
if (free_avail_ < bytes) {
|
||||
size_t ask = bytes > kAllocIncrement ? bytes : kAllocIncrement;
|
||||
// TODO(b/171081864): Arena allocations should be made relatively
|
||||
// infrequently. Consider tagging this memory with sampled objects which
|
||||
// are also infrequently allocated.
|
||||
//
|
||||
// In the meantime it is important that we use the current NUMA partition
|
||||
// rather than always using a particular one because it's possible that any
|
||||
// single partition we choose might only contain nodes that the process is
|
||||
// unable to allocate from due to cgroup restrictions.
|
||||
MemoryTag tag;
|
||||
const auto& numa_topology = tc_globals.numa_topology();
|
||||
if (numa_topology.numa_aware()) {
|
||||
tag = NumaNormalTag(numa_topology.GetCurrentPartition());
|
||||
} else {
|
||||
tag = MemoryTag::kNormal;
|
||||
}
|
||||
|
||||
auto [ptr, actual_size] = SystemAlloc(ask, kPageSize, tag);
|
||||
free_area_ = reinterpret_cast<char*>(ptr);
|
||||
if (ABSL_PREDICT_FALSE(free_area_ == nullptr)) {
|
||||
Crash(kCrash, __FILE__, __LINE__,
|
||||
"FATAL ERROR: Out of memory trying to allocate internal tcmalloc "
|
||||
"data (bytes, object-size); is something preventing mmap from "
|
||||
"succeeding (sandbox, VSS limitations)?",
|
||||
kAllocIncrement, bytes);
|
||||
}
|
||||
SystemBack(free_area_, actual_size);
|
||||
|
||||
// We've discarded the previous free_area_, so any bytes that were
|
||||
// unallocated are effectively inaccessible to future allocations.
|
||||
bytes_unavailable_ += free_avail_;
|
||||
blocks_++;
|
||||
|
||||
free_avail_ = actual_size;
|
||||
}
|
||||
|
||||
ASSERT(reinterpret_cast<uintptr_t>(free_area_) % align == 0);
|
||||
result = free_area_;
|
||||
free_area_ += bytes;
|
||||
free_avail_ -= bytes;
|
||||
bytes_allocated_ += bytes;
|
||||
return reinterpret_cast<void*>(result);
|
||||
}
|
||||
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
GOOGLE_MALLOC_SECTION_END
|
||||
107
src/third_party/tcmalloc/dist/tcmalloc/arena.h
vendored
Normal file
@ -0,0 +1,107 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef TCMALLOC_ARENA_H_
|
||||
#define TCMALLOC_ARENA_H_
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <new>
|
||||
|
||||
#include "absl/base/attributes.h"
|
||||
#include "absl/base/thread_annotations.h"
|
||||
#include "tcmalloc/common.h"
|
||||
|
||||
GOOGLE_MALLOC_SECTION_BEGIN
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
|
||||
// Snapshot of an Arena's accounting counters, as returned by Arena::stats().
// Every field defaults to zero so that a default-constructed ArenaStats never
// exposes indeterminate values.
struct ArenaStats {
  // The number of bytes allocated and in-use by calls to Alloc().
  size_t bytes_allocated = 0;
  // The number of bytes currently reserved for future calls to Alloc().
  size_t bytes_unallocated = 0;
  // The number of bytes lost and unavailable to calls to Alloc() due to
  // inefficiencies in Arena.
  size_t bytes_unavailable = 0;
  // The number of allocated bytes that have subsequently become non-resident,
  // e.g. due to the slab being resized. Note that these bytes are disjoint from
  // the ones counted in `bytes_allocated`.
  size_t bytes_nonresident = 0;

  // The number of blocks allocated by the Arena.
  size_t blocks = 0;
};
|
||||
|
||||
// Arena allocation; designed for use by tcmalloc internal data structures like
|
||||
// spans, profiles, etc. Always expands.
|
||||
class Arena {
|
||||
public:
|
||||
constexpr Arena() {}
|
||||
|
||||
// Returns a properly aligned byte array of length "bytes". Crashes if
|
||||
// allocation fails. Requires pageheap_lock is held.
|
||||
ABSL_ATTRIBUTE_RETURNS_NONNULL void* Alloc(
|
||||
size_t bytes, std::align_val_t alignment = kAlignment)
|
||||
ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
|
||||
|
||||
// Updates the stats for allocated and non-resident bytes.
|
||||
void UpdateAllocatedAndNonresident(int64_t allocated, int64_t nonresident)
|
||||
ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
|
||||
ASSERT(static_cast<int64_t>(bytes_allocated_) + allocated >= 0);
|
||||
bytes_allocated_ += allocated;
|
||||
ASSERT(static_cast<int64_t>(bytes_nonresident_) + nonresident >= 0);
|
||||
bytes_nonresident_ += nonresident;
|
||||
}
|
||||
|
||||
// Returns statistics about memory allocated and managed by this Arena.
|
||||
ArenaStats stats() const ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
|
||||
ArenaStats s;
|
||||
s.bytes_allocated = bytes_allocated_;
|
||||
s.bytes_unallocated = free_avail_;
|
||||
s.bytes_unavailable = bytes_unavailable_;
|
||||
s.bytes_nonresident = bytes_nonresident_;
|
||||
s.blocks = blocks_;
|
||||
return s;
|
||||
}
|
||||
|
||||
private:
|
||||
// How much to allocate from system at a time
|
||||
static constexpr int kAllocIncrement = 128 << 10;
|
||||
|
||||
// Free area from which to carve new objects
|
||||
char* free_area_ ABSL_GUARDED_BY(pageheap_lock) = nullptr;
|
||||
size_t free_avail_ ABSL_GUARDED_BY(pageheap_lock) = 0;
|
||||
|
||||
// Total number of bytes allocated from this arena
|
||||
size_t bytes_allocated_ ABSL_GUARDED_BY(pageheap_lock) = 0;
|
||||
// The number of bytes that are unused and unavailable for future allocations
|
||||
// because they are at the end of a discarded arena block.
|
||||
size_t bytes_unavailable_ ABSL_GUARDED_BY(pageheap_lock) = 0;
|
||||
// The number of bytes on the arena that have been MADV_DONTNEEDed away. Note
|
||||
// that these bytes are disjoint from the ones counted in `bytes_allocated`.
|
||||
size_t bytes_nonresident_ ABSL_GUARDED_BY(pageheap_lock) = 0;
|
||||
// Total number of blocks/free areas managed by this Arena.
|
||||
size_t blocks_ ABSL_GUARDED_BY(pageheap_lock) = 0;
|
||||
|
||||
Arena(const Arena&) = delete;
|
||||
Arena& operator=(const Arena&) = delete;
|
||||
};
|
||||
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
GOOGLE_MALLOC_SECTION_END
|
||||
|
||||
#endif // TCMALLOC_ARENA_H_
|
||||
158
src/third_party/tcmalloc/dist/tcmalloc/arena_test.cc
vendored
Normal file
@ -0,0 +1,158 @@
|
||||
// Copyright 2021 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "tcmalloc/arena.h"
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include <new>
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
namespace {
|
||||
|
||||
// Test helper: wraps a plain integer alignment in std::align_val_t so it can
// be passed as an alignment argument.
std::align_val_t Align(int align) {
  const auto wrapped = static_cast<std::align_val_t>(align);
  return wrapped;
}
|
||||
|
||||
// Verifies that addresses returned by Arena::Alloc honour the requested
// alignment, for explicit alignments, for the default alignment, and for every
// alignment in [1, 99].
TEST(Arena, AlignedAlloc) {
  Arena arena;
  absl::base_internal::SpinLockHolder lock(&pageheap_lock);

  // Explicit 64-byte alignment.
  void* const first = arena.Alloc(64, Align(64));
  EXPECT_EQ(reinterpret_cast<uintptr_t>(first) % 64, 0);

  // Default alignment must be at least 8 bytes.
  void* const second = arena.Alloc(7);
  EXPECT_EQ(reinterpret_cast<uintptr_t>(second) % 8, 0);

  // Re-align to 64 bytes after the odd-sized allocation above.
  void* const third = arena.Alloc(128, Align(64));
  EXPECT_EQ(reinterpret_cast<uintptr_t>(third) % 64, 0);

  int alignment = 1;
  while (alignment < 100) {
    void* const p = arena.Alloc(7, Align(alignment));
    EXPECT_EQ(reinterpret_cast<uintptr_t>(p) % alignment, 0);
    ++alignment;
  }
}
|
||||
|
||||
// Checks the ArenaStats counters of a fresh arena, after a first one-byte
// allocation, and after a second allocation that is too large for the
// remainder of the first block and therefore forces a new block.
TEST(Arena, Stats) {
  Arena arena;

  ArenaStats stats;
  {
    absl::base_internal::SpinLockHolder h(&pageheap_lock);
    stats = arena.stats();
  }
  // A fresh arena has handed out nothing and owns no blocks.
  EXPECT_EQ(stats.bytes_allocated, 0);
  EXPECT_EQ(stats.bytes_unallocated, 0);
  EXPECT_EQ(stats.bytes_unavailable, 0);
  EXPECT_EQ(stats.bytes_nonresident, 0);
  EXPECT_EQ(stats.blocks, 0);

  // Trigger an allocation and grab new stats.
  ArenaStats stats_after_alloc;
  void* ptr;
  {
    absl::base_internal::SpinLockHolder h(&pageheap_lock);
    ptr = arena.Alloc(1, Align(1));
    stats_after_alloc = arena.stats();
  }
  EXPECT_NE(ptr, nullptr);

  EXPECT_EQ(stats_after_alloc.bytes_allocated, 1);
  EXPECT_GE(stats_after_alloc.bytes_unallocated, 0);
  EXPECT_EQ(stats_after_alloc.bytes_unavailable, 0);
  EXPECT_EQ(stats_after_alloc.bytes_nonresident, 0);
  EXPECT_EQ(stats_after_alloc.blocks, 1);

  // Trigger an allocation that is larger than the remaining free bytes.
  //
  // TODO(b/201694482): Optimize this.
  ArenaStats stats_after_alloc2;
  {
    absl::base_internal::SpinLockHolder h(&pageheap_lock);
    ptr = arena.Alloc(stats_after_alloc.bytes_unallocated + 1, Align(1));
    stats_after_alloc2 = arena.stats();
  }
  EXPECT_NE(ptr, nullptr);

  // 1 byte from the first allocation plus (bytes_unallocated + 1) from the
  // second; the first block's unused tail becomes unavailable.
  EXPECT_EQ(stats_after_alloc2.bytes_allocated,
            stats_after_alloc.bytes_unallocated + 2);
  EXPECT_GE(stats_after_alloc2.bytes_unallocated, 0);
  EXPECT_EQ(stats_after_alloc2.bytes_unavailable,
            stats_after_alloc.bytes_unallocated);
  // Check the second snapshot here (the first one was already verified above).
  EXPECT_EQ(stats_after_alloc2.bytes_nonresident, 0);
  EXPECT_EQ(stats_after_alloc2.blocks, 2);
}
|
||||
|
||||
// Verifies that UpdateAllocatedAndNonresident moves bytes between the
// allocated and non-resident counters in both directions.
TEST(Arena, ReportUnmapped) {
  Arena arena;
  ArenaStats stats;
  void* allocation;

  // Start from a 10-byte allocation: everything is counted as allocated.
  {
    absl::base_internal::SpinLockHolder lock(&pageheap_lock);
    allocation = arena.Alloc(10, Align(1));
    stats = arena.stats();
  }
  EXPECT_NE(allocation, nullptr);
  EXPECT_EQ(stats.bytes_allocated, 10);
  EXPECT_EQ(stats.bytes_nonresident, 0);

  // Shift 5 bytes from allocated to non-resident.
  {
    absl::base_internal::SpinLockHolder lock(&pageheap_lock);
    arena.UpdateAllocatedAndNonresident(-5, 5);
    stats = arena.stats();
  }
  EXPECT_EQ(stats.bytes_allocated, 5);
  EXPECT_EQ(stats.bytes_nonresident, 5);

  // Shift 3 bytes back from non-resident to allocated.
  {
    absl::base_internal::SpinLockHolder lock(&pageheap_lock);
    arena.UpdateAllocatedAndNonresident(3, -3);
    stats = arena.stats();
  }
  EXPECT_EQ(stats.bytes_allocated, 8);
  EXPECT_EQ(stats.bytes_nonresident, 2);
}
|
||||
|
||||
// Verifies that bytes pre-accounted through UpdateAllocatedAndNonresident can
// be un-accounted again and replaced by a real allocation of the same size,
// leaving bytes_allocated unchanged.
TEST(Arena, BytesImpending) {
  Arena arena;
  ArenaStats snapshot;

  {
    absl::base_internal::SpinLockHolder lock(&pageheap_lock);
    snapshot = arena.stats();
  }
  EXPECT_EQ(snapshot.bytes_allocated, 0);

  // Account 100 bytes before any memory has actually been handed out.
  {
    absl::base_internal::SpinLockHolder lock(&pageheap_lock);
    arena.UpdateAllocatedAndNonresident(100, 0);
    snapshot = arena.stats();
  }
  EXPECT_EQ(snapshot.bytes_allocated, 100);

  // Undo the pre-accounting, then perform the real 100-byte allocation.
  void* allocation;
  {
    absl::base_internal::SpinLockHolder lock(&pageheap_lock);
    arena.UpdateAllocatedAndNonresident(-100, 0);
    allocation = arena.Alloc(100, Align(1));
    snapshot = arena.stats();
  }
  EXPECT_NE(allocation, nullptr);
  EXPECT_EQ(snapshot.bytes_allocated, 100);
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
131
src/third_party/tcmalloc/dist/tcmalloc/background.cc
vendored
Normal file
@ -0,0 +1,131 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <errno.h>
|
||||
|
||||
#include "absl/base/internal/sysinfo.h"
|
||||
#include "absl/time/clock.h"
|
||||
#include "absl/time/time.h"
|
||||
#include "tcmalloc/cpu_cache.h"
|
||||
#include "tcmalloc/internal/logging.h"
|
||||
#include "tcmalloc/internal/percpu.h"
|
||||
#include "tcmalloc/internal_malloc_extension.h"
|
||||
#include "tcmalloc/malloc_extension.h"
|
||||
#include "tcmalloc/parameters.h"
|
||||
#include "tcmalloc/static_vars.h"
|
||||
|
||||
// Release memory to the system at a constant rate.
|
||||
void MallocExtension_Internal_ProcessBackgroundActions() {
|
||||
using ::tcmalloc::tcmalloc_internal::Parameters;
|
||||
using ::tcmalloc::tcmalloc_internal::tc_globals;
|
||||
|
||||
tcmalloc::MallocExtension::MarkThreadIdle();
|
||||
|
||||
absl::Time prev_time = absl::Now();
|
||||
constexpr absl::Duration kSleepTime = absl::Seconds(1);
|
||||
|
||||
// Reclaim inactive per-cpu caches once per kCpuCacheReclaimPeriod.
|
||||
//
|
||||
// We use a longer 30 sec reclaim period to make sure that caches are indeed
|
||||
// idle. Reclaim drains entire cache, as opposed to cache shuffle for instance
|
||||
// that only shrinks a cache by a few objects at a time. So, we might have
|
||||
// larger performance degradation if we use a shorter reclaim interval and
|
||||
// drain caches that weren't supposed to.
|
||||
constexpr absl::Duration kCpuCacheReclaimPeriod = absl::Seconds(30);
|
||||
absl::Time last_reclaim = absl::Now();
|
||||
|
||||
// Shuffle per-cpu caches once per kCpuCacheShufflePeriod.
|
||||
constexpr absl::Duration kCpuCacheShufflePeriod = absl::Seconds(5);
|
||||
absl::Time last_shuffle = absl::Now();
|
||||
|
||||
// See if we should resize the slab once per kCpuCacheSlabResizePeriod. This
|
||||
// period is coprime to kCpuCacheShufflePeriod and kCpuCacheReclaimPeriod.
|
||||
constexpr absl::Duration kCpuCacheSlabResizePeriod = absl::Seconds(29);
|
||||
absl::Time last_slab_resize_check = absl::Now();
|
||||
|
||||
#ifndef TCMALLOC_SMALL_BUT_SLOW
|
||||
// We reclaim unused objects from the transfer caches once per
|
||||
// kTransferCacheResizePeriod.
|
||||
constexpr absl::Duration kTransferCachePlunderPeriod = absl::Seconds(5);
|
||||
absl::Time last_transfer_cache_plunder_check = absl::Now();
|
||||
|
||||
// Resize transfer caches once per kTransferCacheResizePeriod.
|
||||
constexpr absl::Duration kTransferCacheResizePeriod = absl::Seconds(2);
|
||||
absl::Time last_transfer_cache_resize_check = absl::Now();
|
||||
#endif
|
||||
|
||||
while (true) {
|
||||
absl::Time now = absl::Now();
|
||||
|
||||
// We follow the cache hierarchy in TCMalloc from outermost (per-CPU) to
|
||||
// innermost (the page heap). Freeing up objects at one layer can help aid
|
||||
// memory coalescing for inner caches.
|
||||
|
||||
if (tcmalloc::MallocExtension::PerCpuCachesActive()) {
|
||||
// Accelerate fences as part of this operation by registering this thread
|
||||
// with rseq. While this is not strictly required to succeed, we do not
|
||||
// expect an inconsistent state for rseq (some threads registered and some
|
||||
// threads unable to).
|
||||
CHECK_CONDITION(tcmalloc::tcmalloc_internal::subtle::percpu::IsFast());
|
||||
|
||||
// Try to reclaim per-cpu caches once every kCpuCacheReclaimPeriod
|
||||
// when enabled.
|
||||
if (now - last_reclaim >= kCpuCacheReclaimPeriod) {
|
||||
tc_globals.cpu_cache().TryReclaimingCaches();
|
||||
last_reclaim = now;
|
||||
}
|
||||
|
||||
if (Parameters::shuffle_per_cpu_caches() &&
|
||||
now - last_shuffle >= kCpuCacheShufflePeriod) {
|
||||
tc_globals.cpu_cache().ShuffleCpuCaches();
|
||||
last_shuffle = now;
|
||||
}
|
||||
|
||||
// See if we need to grow the slab once every kCpuCacheSlabResizePeriod
|
||||
// when enabled.
|
||||
if (Parameters::per_cpu_caches_dynamic_slab_enabled() &&
|
||||
now - last_slab_resize_check >= kCpuCacheSlabResizePeriod) {
|
||||
tc_globals.cpu_cache().ResizeSlabIfNeeded();
|
||||
last_slab_resize_check = now;
|
||||
}
|
||||
}
|
||||
|
||||
tc_globals.sharded_transfer_cache().Plunder();
|
||||
|
||||
#ifndef TCMALLOC_SMALL_BUT_SLOW
|
||||
// Try to plunder and reclaim unused objects from transfer caches.
|
||||
if (now - last_transfer_cache_plunder_check >=
|
||||
kTransferCachePlunderPeriod &&
|
||||
Parameters::partial_transfer_cache()) {
|
||||
tc_globals.transfer_cache().TryPlunder();
|
||||
last_transfer_cache_plunder_check = now;
|
||||
}
|
||||
|
||||
if (now - last_transfer_cache_resize_check >= kTransferCacheResizePeriod) {
|
||||
tc_globals.transfer_cache().TryResizingCaches();
|
||||
last_transfer_cache_resize_check = now;
|
||||
}
|
||||
#endif
|
||||
|
||||
const ssize_t bytes_to_release =
|
||||
static_cast<size_t>(Parameters::background_release_rate()) *
|
||||
absl::ToDoubleSeconds(now - prev_time);
|
||||
if (bytes_to_release > 0) { // may be negative if time goes backwards
|
||||
tcmalloc::MallocExtension::ReleaseMemoryToSystem(bytes_to_release);
|
||||
}
|
||||
|
||||
prev_time = now;
|
||||
absl::SleepFor(kSleepTime);
|
||||
}
|
||||
}
|
||||
116
src/third_party/tcmalloc/dist/tcmalloc/central_freelist.cc
vendored
Normal file
@ -0,0 +1,116 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "tcmalloc/central_freelist.h"
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "tcmalloc/internal/linked_list.h"
|
||||
#include "tcmalloc/internal/logging.h"
|
||||
#include "tcmalloc/internal/optimization.h"
|
||||
#include "tcmalloc/internal/prefetch.h"
|
||||
#include "tcmalloc/page_heap.h"
|
||||
#include "tcmalloc/pagemap.h"
|
||||
#include "tcmalloc/pages.h"
|
||||
#include "tcmalloc/static_vars.h"
|
||||
|
||||
GOOGLE_MALLOC_SECTION_BEGIN
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
namespace central_freelist_internal {
|
||||
|
||||
// Picks the memory tag for spans of `size_class`: expanded size classes map to
// kCold; all others map to kNormal when the NUMA topology is not NUMA-aware,
// or otherwise to the NUMA tag selected by size_class / kNumBaseClasses.
static MemoryTag MemoryTagFromSizeClass(size_t size_class) {
  if (IsExpandedSizeClass(size_class)) {
    return MemoryTag::kCold;
  }
  const bool numa_aware = tc_globals.numa_topology().numa_aware();
  return numa_aware ? NumaNormalTag(size_class / kNumBaseClasses)
                    : MemoryTag::kNormal;
}
|
||||
|
||||
size_t StaticForwarder::class_to_size(int size_class) {
|
||||
return tc_globals.sizemap().class_to_size(size_class);
|
||||
}
|
||||
|
||||
// Forwards to the global size map's class_to_pages() for `size_class` and
// wraps the result in a Length.
Length StaticForwarder::class_to_pages(int size_class) {
  auto&& size_map = tc_globals.sizemap();
  return Length(size_map.class_to_pages(size_class));
}
|
||||
|
||||
// Looks up, in the global pagemap, the existing span descriptor for the page
// that contains `object`.
Span* StaticForwarder::MapObjectToSpan(const void* object) {
  return tc_globals.pagemap().GetExistingDescriptor(PageIdContaining(object));
}
|
||||
|
||||
// Requests a span of `pages_per_span` pages from the global page allocator,
// tagged according to `size_class`, and registers `size_class` for it in the
// pagemap. Returns nullptr when the page allocator cannot provide a span.
Span* StaticForwarder::AllocateSpan(int size_class, size_t objects_per_span,
                                    Length pages_per_span) {
  const MemoryTag tag = MemoryTagFromSizeClass(size_class);
  auto&& page_allocator = tc_globals.page_allocator();
  Span* span = page_allocator.New(pages_per_span, objects_per_span, tag);
  if (ABSL_PREDICT_FALSE(span == nullptr)) {
    // Allocation failed: nothing to register.
    return nullptr;
  }

  // The new span must carry the requested tag and size.
  ASSERT(tag == GetMemoryTag(span->start_address()));
  ASSERT(span->num_pages() == pages_per_span);

  tc_globals.pagemap().RegisterSizeClass(span, size_class);
  return span;
}
|
||||
|
||||
// Hands every span in `free_spans` back to the global page allocator while
// holding pageheap_lock. Each span is expected to carry the memory tag `tag`.
static void ReturnSpansToPageHeap(MemoryTag tag, absl::Span<Span*> free_spans,
                                  size_t objects_per_span)
    ABSL_LOCKS_EXCLUDED(pageheap_lock) {
  absl::base_internal::SpinLockHolder lock(&pageheap_lock);
  for (auto it = free_spans.begin(); it != free_spans.end(); ++it) {
    Span* const free_span = *it;
    ASSERT(tag == GetMemoryTag(free_span->start_address()));
    tc_globals.page_allocator().Delete(free_span, objects_per_span, tag);
  }
}
|
||||
|
||||
void StaticForwarder::DeallocateSpans(int size_class, size_t objects_per_span,
|
||||
absl::Span<Span*> free_spans) {
|
||||
// Unregister size class doesn't require holding any locks.
|
||||
for (Span* const free_span : free_spans) {
|
||||
ASSERT(IsNormalMemory(free_span->start_address()) ||
|
||||
IsColdMemory(free_span->start_address()));
|
||||
tc_globals.pagemap().UnregisterSizeClass(free_span);
|
||||
|
||||
// Before taking pageheap_lock, prefetch the PageTrackers these spans are
|
||||
// on.
|
||||
//
|
||||
// Small-but-slow does not use the HugePageAwareAllocator (by default), so
|
||||
// do not prefetch on this config.
|
||||
#ifndef TCMALLOC_SMALL_BUT_SLOW
|
||||
const PageId p = free_span->first_page();
|
||||
|
||||
// In huge_page_filler.h, we static_assert that PageTracker's key elements
|
||||
// for deallocation are within the first two cachelines.
|
||||
void* pt = tc_globals.pagemap().GetHugepage(p);
|
||||
// Prefetch for writing, as we will issue stores to the PageTracker
|
||||
// instance.
|
||||
PrefetchW(pt);
|
||||
PrefetchW(reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(pt) +
|
||||
ABSL_CACHELINE_SIZE));
|
||||
#endif // TCMALLOC_SMALL_BUT_SLOW
|
||||
}
|
||||
|
||||
const MemoryTag tag = MemoryTagFromSizeClass(size_class);
|
||||
ReturnSpansToPageHeap(tag, free_spans, objects_per_span);
|
||||
}
|
||||
|
||||
} // namespace central_freelist_internal
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
GOOGLE_MALLOC_SECTION_END
|
||||
581
src/third_party/tcmalloc/dist/tcmalloc/central_freelist.h
vendored
Normal file
@ -0,0 +1,581 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef TCMALLOC_CENTRAL_FREELIST_H_
|
||||
#define TCMALLOC_CENTRAL_FREELIST_H_
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
|
||||
#include "absl/base/attributes.h"
|
||||
#include "absl/base/const_init.h"
|
||||
#include "absl/base/internal/spinlock.h"
|
||||
#include "absl/base/macros.h"
|
||||
#include "absl/base/thread_annotations.h"
|
||||
#include "tcmalloc/common.h"
|
||||
#include "tcmalloc/hinted_tracker_lists.h"
|
||||
#include "tcmalloc/internal/atomic_stats_counter.h"
|
||||
#include "tcmalloc/internal/optimization.h"
|
||||
#include "tcmalloc/pages.h"
|
||||
#include "tcmalloc/parameters.h"
|
||||
#include "tcmalloc/span.h"
|
||||
#include "tcmalloc/span_stats.h"
|
||||
|
||||
GOOGLE_MALLOC_SECTION_BEGIN
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
|
||||
namespace central_freelist_internal {
|
||||
|
||||
// StaticForwarder provides access to the PageMap and page heap.
|
||||
//
|
||||
// This is a class, rather than namespaced globals, so that it can be mocked for
|
||||
// testing.
|
||||
class StaticForwarder {
|
||||
public:
|
||||
static size_t class_to_size(int size_class);
|
||||
static Length class_to_pages(int size_class);
|
||||
static Span* MapObjectToSpan(const void* object);
|
||||
static Span* AllocateSpan(int size_class, size_t objects_per_span,
|
||||
Length pages_per_span)
|
||||
ABSL_LOCKS_EXCLUDED(pageheap_lock);
|
||||
static void DeallocateSpans(int size_class, size_t objects_per_span,
|
||||
absl::Span<Span*> free_spans)
|
||||
ABSL_LOCKS_EXCLUDED(pageheap_lock);
|
||||
};
|
||||
|
||||
// Number of nonempty_ lists a CentralFreeList keeps for tracking non-empty
// spans. It is the list-count template argument of the HintedTrackerLists
// member that CentralFreeList declares when TCMALLOC_SMALL_BUT_SLOW is not
// defined.
|
||||
static constexpr size_t kNumLists = 8;
|
||||
|
||||
// Data kept per size-class in central cache.
|
||||
template <typename ForwarderT>
|
||||
class CentralFreeList {
|
||||
public:
|
||||
using Forwarder = ForwarderT;
|
||||
|
||||
constexpr CentralFreeList()
|
||||
: lock_(absl::kConstInit, absl::base_internal::SCHEDULE_KERNEL_ONLY),
|
||||
size_class_(0),
|
||||
object_size_(0),
|
||||
objects_per_span_(0),
|
||||
first_nonempty_index_(0),
|
||||
pages_per_span_(0),
|
||||
nonempty_() {}
|
||||
|
||||
CentralFreeList(const CentralFreeList&) = delete;
|
||||
CentralFreeList& operator=(const CentralFreeList&) = delete;
|
||||
|
||||
void Init(size_t size_class) ABSL_LOCKS_EXCLUDED(lock_);
|
||||
|
||||
// These methods all do internal locking.
|
||||
|
||||
// Insert batch into the central freelist.
|
||||
// REQUIRES: batch.size() > 0 && batch.size() <= kMaxObjectsToMove.
|
||||
void InsertRange(absl::Span<void*> batch) ABSL_LOCKS_EXCLUDED(lock_);
|
||||
|
||||
// Fill a prefix of batch[0..N-1] with up to N elements removed from central
|
||||
// freelist. Return the number of elements removed.
|
||||
ABSL_MUST_USE_RESULT int RemoveRange(void** batch, int N)
|
||||
ABSL_LOCKS_EXCLUDED(lock_);
|
||||
|
||||
// Returns the number of free objects in cache.
|
||||
size_t length() const { return static_cast<size_t>(counter_.value()); }
|
||||
|
||||
// Returns the memory overhead (internal fragmentation) attributable
|
||||
// to the freelist. This is memory lost when the size of elements
|
||||
// in a freelist doesn't exactly divide the page-size (an 8192-byte
|
||||
// page full of 5-byte objects would have 2 bytes memory overhead).
|
||||
size_t OverheadBytes() const;
|
||||
|
||||
// Returns number of live spans currently in the nonempty_[n] list.
|
||||
// REQUIRES: n >= 0 && n < kNumLists.
|
||||
size_t NumSpansInList(int n) ABSL_LOCKS_EXCLUDED(lock_);
|
||||
SpanStats GetSpanStats() const;
|
||||
|
||||
// Reports span utilization histogram stats.
|
||||
void PrintSpanUtilStats(Printer* out) const;
|
||||
void PrintSpanUtilStatsInPbtxt(PbtxtRegion* region) const;
|
||||
|
||||
// Get number of spans in the histogram bucket. We record spans in the
|
||||
// histogram indexed by absl::bit_width(allocated). So, instead of using the
|
||||
// absolute number of allocated objects, it uses absl::bit_width(allocated),
|
||||
// passed as <bitwidth>, to index and return the number of spans in the
|
||||
// histogram.
|
||||
size_t NumSpansWith(uint16_t bitwidth) const;
|
||||
|
||||
Forwarder& forwarder() { return forwarder_; }
|
||||
|
||||
private:
|
||||
// Release an object to spans.
|
||||
// Returns object's span if it become completely free.
|
||||
Span* ReleaseToSpans(void* object, Span* span, size_t object_size)
|
||||
ABSL_EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
// Populate cache by fetching from the page heap.
|
||||
// May temporarily release lock_.
|
||||
// Fill a prefix of batch[0..N-1] with up to N elements removed from central
|
||||
// freelist. Returns the number of elements removed.
|
||||
int Populate(void** batch, int N) ABSL_EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
// Parses nonempty_ lists and returns span from the list with the lowest
|
||||
// possible index.
|
||||
// Returns the span if one exists in the nonempty_ lists. Else, returns
|
||||
// nullptr.
|
||||
Span* FirstNonEmptySpan() ABSL_EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
// Returns first index to the nonempty_ lists that may record spans.
|
||||
uint8_t GetFirstNonEmptyIndex() const;
|
||||
|
||||
// Returns index into nonempty_ based on the number of allocated objects for
|
||||
// the span. Instead of using the absolute number of allocated objects, it
|
||||
// uses absl::bit_width(allocated), passed as bitwidth, to calculate the list
|
||||
// index.
|
||||
static uint8_t IndexFor(uint8_t bitwidth);
|
||||
|
||||
// Records span utilization in objects_to_span_ map. Instead of using the
|
||||
// absolute number of allocated objects, it uses
|
||||
// absl::bit_width(allocated), passed as <bitwidth>, to index this map.
|
||||
//
|
||||
// If increase is set to true, includes the span by incrementing the count
|
||||
// in the map. Otherwise, removes the span by decrementing the count in
|
||||
// the map.
|
||||
void RecordSpanUtil(uint8_t bitwidth, bool increase)
|
||||
ABSL_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
|
||||
ASSUME(bitwidth > 0);
|
||||
// Updates to objects_to_span_ are guarded by lock_, so writes may be
|
||||
// performed using LossyAdd.
|
||||
objects_to_spans_[bitwidth - 1].LossyAdd(increase ? 1 : -1);
|
||||
}
|
||||
|
||||
// This lock protects all the mutable data members.
|
||||
absl::base_internal::SpinLock lock_;
|
||||
|
||||
size_t size_class_; // My size class (immutable after Init())
|
||||
size_t object_size_;
|
||||
size_t objects_per_span_;
|
||||
// Hint used for parsing through the nonempty_ lists. This prevents us from
|
||||
// parsing the lists with an index starting zero, if the lowest possible index
|
||||
// is higher than that.
|
||||
size_t first_nonempty_index_;
|
||||
Length pages_per_span_;
|
||||
|
||||
size_t num_spans() const {
|
||||
size_t requested = num_spans_requested_.value();
|
||||
size_t returned = num_spans_returned_.value();
|
||||
if (requested < returned) return 0;
|
||||
return (requested - returned);
|
||||
}
|
||||
|
||||
void RecordSpanAllocated() ABSL_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
|
||||
counter_.LossyAdd(objects_per_span_);
|
||||
num_spans_requested_.LossyAdd(1);
|
||||
}
|
||||
|
||||
void RecordMultiSpansDeallocated(size_t num_spans_returned)
|
||||
ABSL_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
|
||||
counter_.LossyAdd(-num_spans_returned * objects_per_span_);
|
||||
num_spans_returned_.LossyAdd(num_spans_returned);
|
||||
}
|
||||
|
||||
void UpdateObjectCounts(int num) ABSL_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
|
||||
counter_.LossyAdd(num);
|
||||
}
|
||||
|
||||
// The followings are kept as a StatsCounter so that they can read without
|
||||
// acquiring a lock. Updates to these variables are guarded by lock_
|
||||
// so writes are performed using LossyAdd for speed, the lock still
|
||||
// guarantees accuracy.
|
||||
|
||||
// Num free objects in cache entry
|
||||
StatsCounter counter_;
|
||||
|
||||
StatsCounter num_spans_requested_;
|
||||
StatsCounter num_spans_returned_;
|
||||
|
||||
// Records histogram of span utilization.
|
||||
//
|
||||
// Each bucket in the histogram records number of live spans with
|
||||
// corresponding number of allocated objects. Instead of using the absolute
|
||||
// value of number of allocated objects, we use absl::bit_width(allocated) to
|
||||
// index this map. A bucket in the histogram corresponds to power-of-two
|
||||
// number of objects. That is, bucket N tracks number of spans with allocated
|
||||
// objects < 2^(N+1). For instance, objects_to_spans_ map tracks number of
|
||||
// spans with allocated objects in the range [a,b), indexed as: [1,2) in
|
||||
// objects_to_spans_[0], [2,4) in objects_to_spans_[1], [4, 8) in
|
||||
// objects_to_spans_[2] and so on. We can query the objects_to_spans_ map
|
||||
// using NumSpansWith(bitwidth) to obtain the number of spans associated
|
||||
// with the corresponding bucket in the histogram.
|
||||
//
|
||||
// As the actual value of objects_per_span_ is not known at compile time, we
|
||||
// use maximum value that it can be to initialize this hashmap, and
|
||||
// kSpanUtilBucketCapacity determines this value. We also check during Init
|
||||
// that absl::bit_width(objects_per_span_) is indeed less than or equal to
|
||||
// kSpanUtilBucketCapacity.
|
||||
//
|
||||
// We disable collection of histogram stats for TCMalloc small-but-slow due to
|
||||
// performance issues. See b/227362263.
|
||||
static constexpr size_t kSpanUtilBucketCapacity = 16;
|
||||
StatsCounter objects_to_spans_[kSpanUtilBucketCapacity];
|
||||
|
||||
// Non-empty lists that distinguish spans based on the number of objects
|
||||
// allocated from them. As we prioritize spans, spans may be added to any of
|
||||
// the kNumLists nonempty_ lists based on their allocated objects. If span
|
||||
// prioritization is disabled, we add spans to the nonempty_[kNumLists-1]
|
||||
// list, leaving other lists unused.
|
||||
//
|
||||
// We do not enable multiple nonempty lists for small-but-slow yet due to
|
||||
// performance issues. See b/227362263.
|
||||
#ifdef TCMALLOC_SMALL_BUT_SLOW
|
||||
SpanList nonempty_ ABSL_GUARDED_BY(lock_);
|
||||
#else
|
||||
HintedTrackerLists<Span, kNumLists> nonempty_ ABSL_GUARDED_BY(lock_);
|
||||
#endif
|
||||
|
||||
TCMALLOC_NO_UNIQUE_ADDRESS Forwarder forwarder_;
|
||||
};
|
||||
|
||||
// Like a constructor and hence we disable thread safety analysis.
|
||||
template <class Forwarder>
|
||||
inline void CentralFreeList<Forwarder>::Init(size_t size_class)
|
||||
ABSL_NO_THREAD_SAFETY_ANALYSIS {
|
||||
size_class_ = size_class;
|
||||
object_size_ = Forwarder::class_to_size(size_class);
|
||||
pages_per_span_ = Forwarder::class_to_pages(size_class);
|
||||
objects_per_span_ =
|
||||
pages_per_span_.in_bytes() / (object_size_ ? object_size_ : 1);
|
||||
|
||||
// Records nonempty_ list index associated with the span with
|
||||
// objects_per_span_ number of allocated objects. Refer to the comment in
|
||||
// IndexFor(...) below for a detailed description.
|
||||
first_nonempty_index_ =
|
||||
kNumLists -
|
||||
std::min<size_t>(absl::bit_width(objects_per_span_), kNumLists);
|
||||
|
||||
ASSERT(absl::bit_width(objects_per_span_) <= kSpanUtilBucketCapacity);
|
||||
}
|
||||
|
||||
template <class Forwarder>
|
||||
inline Span* CentralFreeList<Forwarder>::ReleaseToSpans(void* object,
|
||||
Span* span,
|
||||
size_t object_size) {
|
||||
if (ABSL_PREDICT_FALSE(span->FreelistEmpty(object_size))) {
|
||||
#ifdef TCMALLOC_SMALL_BUT_SLOW
|
||||
nonempty_.prepend(span);
|
||||
#else
|
||||
const uint8_t index = GetFirstNonEmptyIndex();
|
||||
nonempty_.Add(span, index);
|
||||
span->set_nonempty_index(index);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef TCMALLOC_SMALL_BUT_SLOW
|
||||
// We maintain a single nonempty list for small-but-slow. Also, we do not
|
||||
// collect histogram stats due to performance issues.
|
||||
if (ABSL_PREDICT_TRUE(span->FreelistPush(object, object_size))) {
|
||||
return nullptr;
|
||||
}
|
||||
nonempty_.remove(span);
|
||||
return span;
|
||||
#else
|
||||
const uint8_t prev_index = span->nonempty_index();
|
||||
const uint8_t prev_bitwidth = absl::bit_width(span->Allocated());
|
||||
if (ABSL_PREDICT_FALSE(!span->FreelistPush(object, object_size))) {
|
||||
// Update the histogram as the span is full and will be removed from the
|
||||
// nonempty_ list.
|
||||
RecordSpanUtil(prev_bitwidth, /*increase=*/false);
|
||||
nonempty_.Remove(span, prev_index);
|
||||
return span;
|
||||
}
|
||||
// As the objects are being added to the span, its utilization might change.
|
||||
// We remove the stale utilization from the histogram and add the new
|
||||
// utilization to the histogram after we release objects to the span.
|
||||
const uint8_t cur_bitwidth = absl::bit_width(span->Allocated());
|
||||
if (cur_bitwidth != prev_bitwidth) {
|
||||
RecordSpanUtil(prev_bitwidth, /*increase=*/false);
|
||||
RecordSpanUtil(cur_bitwidth, /*increase=*/true);
|
||||
// If span allocation changes so that it moved to a different nonempty_
|
||||
// list, we remove it from the previous list and add it to the desired
|
||||
// list indexed by cur_index.
|
||||
const uint8_t cur_index = IndexFor(cur_bitwidth);
|
||||
if (cur_index != prev_index) {
|
||||
nonempty_.Remove(span, prev_index);
|
||||
nonempty_.Add(span, cur_index);
|
||||
span->set_nonempty_index(cur_index);
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class Forwarder>
|
||||
inline Span* CentralFreeList<Forwarder>::FirstNonEmptySpan() {
|
||||
// Scan nonempty_ lists in the range [first_nonempty_index_, kNumLists) and
|
||||
// return the span from a non-empty list if one exists. If all the lists are
|
||||
// empty, return nullptr.
|
||||
#ifdef TCMALLOC_SMALL_BUT_SLOW
|
||||
if (ABSL_PREDICT_FALSE(nonempty_.empty())) {
|
||||
return nullptr;
|
||||
}
|
||||
return nonempty_.first();
|
||||
#else
|
||||
return nonempty_.PeekLeast(GetFirstNonEmptyIndex());
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class Forwarder>
|
||||
inline uint8_t CentralFreeList<Forwarder>::GetFirstNonEmptyIndex() const {
|
||||
return first_nonempty_index_;
|
||||
}
|
||||
|
||||
template <class Forwarder>
|
||||
inline uint8_t CentralFreeList<Forwarder>::IndexFor(uint8_t bitwidth) {
|
||||
// We would like to index into the nonempty_ list based on the number of
|
||||
// allocated objects from the span. Given a span with fewer allocated objects
|
||||
// (i.e. when it is more likely to be freed), we would like to map it to a
|
||||
// higher index in the nonempty_ list. Depending on the number of kNumLists
|
||||
// and the number of objects per span, we may have to clamp multiple buckets
|
||||
// in index 0. It should be ok to do that because it is less beneficial to
|
||||
// differentiate between spans that have 128 vs 256 allocated objects,
|
||||
// compared to those that have 16 vs 32 allocated objects.
|
||||
//
|
||||
// Consider objects_per_span = 1024 and kNumLists = 8. The following examples
|
||||
// show spans with allocated objects in the range [a, b) indexed to the
|
||||
// nonempty_[idx] list using a notation [a, b) -> idx.
|
||||
// [1, 2) -> 7, [2, 4) -> 6, [4, 8) -> 5, [8, 16) -> 4, [16, 32) -> 3, [32,
|
||||
// 64) -> 2, [64, 128) -> 1, [128, 1024) -> 0.
|
||||
|
||||
ASSUME(bitwidth > 0);
|
||||
const uint8_t offset = std::min<size_t>(bitwidth, kNumLists);
|
||||
const uint8_t index = kNumLists - offset;
|
||||
ASSUME(index < kNumLists);
|
||||
return index;
|
||||
}
|
||||
|
||||
template <class Forwarder>
|
||||
inline size_t CentralFreeList<Forwarder>::NumSpansInList(int n) {
|
||||
ASSUME(n >= 0);
|
||||
ASSUME(n < kNumLists);
|
||||
absl::base_internal::SpinLockHolder h(&lock_);
|
||||
#ifdef TCMALLOC_SMALL_BUT_SLOW
|
||||
return nonempty_.length();
|
||||
#else
|
||||
return nonempty_.SizeOfList(n);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class Forwarder>
|
||||
inline void CentralFreeList<Forwarder>::InsertRange(absl::Span<void*> batch) {
|
||||
CHECK_CONDITION(!batch.empty() && batch.size() <= kMaxObjectsToMove);
|
||||
Span* spans[kMaxObjectsToMove];
|
||||
// Safe to store free spans into freed up space in span array.
|
||||
Span** free_spans = spans;
|
||||
int free_count = 0;
|
||||
|
||||
// Prefetch Span objects to reduce cache misses.
|
||||
for (int i = 0; i < batch.size(); ++i) {
|
||||
Span* span = forwarder_.MapObjectToSpan(batch[i]);
|
||||
ASSERT(span != nullptr);
|
||||
span->Prefetch();
|
||||
spans[i] = span;
|
||||
}
|
||||
|
||||
// First, release all individual objects into spans under our mutex
|
||||
// and collect spans that become completely free.
|
||||
{
|
||||
// Use local copy of variable to ensure that it is not reloaded.
|
||||
size_t object_size = object_size_;
|
||||
absl::base_internal::SpinLockHolder h(&lock_);
|
||||
for (int i = 0; i < batch.size(); ++i) {
|
||||
Span* span = ReleaseToSpans(batch[i], spans[i], object_size);
|
||||
if (ABSL_PREDICT_FALSE(span)) {
|
||||
free_spans[free_count] = span;
|
||||
free_count++;
|
||||
}
|
||||
}
|
||||
|
||||
RecordMultiSpansDeallocated(free_count);
|
||||
UpdateObjectCounts(batch.size());
|
||||
}
|
||||
|
||||
// Then, release all free spans into page heap under its mutex.
|
||||
if (ABSL_PREDICT_FALSE(free_count)) {
|
||||
forwarder_.DeallocateSpans(size_class_, objects_per_span_,
|
||||
absl::MakeSpan(free_spans, free_count));
|
||||
}
|
||||
}
|
||||
|
||||
template <class Forwarder>
|
||||
inline int CentralFreeList<Forwarder>::RemoveRange(void** batch, int N) {
|
||||
ASSUME(N > 0);
|
||||
// Use local copy of variable to ensure that it is not reloaded.
|
||||
size_t object_size = object_size_;
|
||||
int result = 0;
|
||||
absl::base_internal::SpinLockHolder h(&lock_);
|
||||
|
||||
do {
|
||||
Span* span = FirstNonEmptySpan();
|
||||
if (ABSL_PREDICT_FALSE(!span)) {
|
||||
result += Populate(batch + result, N - result);
|
||||
break;
|
||||
}
|
||||
|
||||
#ifdef TCMALLOC_SMALL_BUT_SLOW
|
||||
// We do not collect histogram stats for small-but-slow.
|
||||
int here = span->FreelistPopBatch(batch + result, N - result, object_size);
|
||||
ASSERT(here > 0);
|
||||
if (span->FreelistEmpty(object_size)) {
|
||||
nonempty_.remove(span);
|
||||
}
|
||||
#else
|
||||
const uint8_t prev_bitwidth = absl::bit_width(span->Allocated());
|
||||
const uint8_t prev_index = span->nonempty_index();
|
||||
int here = span->FreelistPopBatch(batch + result, N - result, object_size);
|
||||
ASSERT(here > 0);
|
||||
// As the objects are being popped from the span, its utilization might
|
||||
// change. So, we remove the stale utilization from the histogram here and
|
||||
// add it again once we pop the objects.
|
||||
const uint8_t cur_bitwidth = absl::bit_width(span->Allocated());
|
||||
if (cur_bitwidth != prev_bitwidth) {
|
||||
RecordSpanUtil(prev_bitwidth, /*increase=*/false);
|
||||
RecordSpanUtil(cur_bitwidth, /*increase=*/true);
|
||||
}
|
||||
if (span->FreelistEmpty(object_size)) {
|
||||
nonempty_.Remove(span, prev_index);
|
||||
} else if (cur_bitwidth != prev_bitwidth) {
|
||||
// If span allocation changes so that it must be moved to a different
|
||||
// nonempty_ list, we remove it from the previous list and add it to the
|
||||
// desired list indexed by cur_index.
|
||||
const uint8_t cur_index = IndexFor(cur_bitwidth);
|
||||
if (cur_index != prev_index) {
|
||||
nonempty_.Remove(span, prev_index);
|
||||
nonempty_.Add(span, cur_index);
|
||||
span->set_nonempty_index(cur_index);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
result += here;
|
||||
} while (result < N);
|
||||
UpdateObjectCounts(-result);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Fetch memory from the system and add to the central cache freelist.
|
||||
template <class Forwarder>
|
||||
inline int CentralFreeList<Forwarder>::Populate(void** batch, int N)
|
||||
ABSL_NO_THREAD_SAFETY_ANALYSIS {
|
||||
// Release central list lock while operating on pageheap
|
||||
// Note, this could result in multiple calls to populate each allocating
|
||||
// a new span and the pushing those partially full spans onto nonempty.
|
||||
lock_.Unlock();
|
||||
Span* span =
|
||||
forwarder_.AllocateSpan(size_class_, objects_per_span_, pages_per_span_);
|
||||
if (ABSL_PREDICT_FALSE(span == nullptr)) {
|
||||
Log(kLog, __FILE__, __LINE__, "tcmalloc: allocation failed",
|
||||
pages_per_span_.in_bytes());
|
||||
|
||||
lock_.Lock();
|
||||
return 0;
|
||||
}
|
||||
|
||||
int result = span->BuildFreelist(object_size_, objects_per_span_, batch, N);
|
||||
ASSERT(result > 0);
|
||||
// This is a cheaper check than using FreelistEmpty().
|
||||
bool span_empty = result == objects_per_span_;
|
||||
|
||||
lock_.Lock();
|
||||
|
||||
#ifdef TCMALLOC_SMALL_BUT_SLOW
|
||||
// We do not collect histogram stats for small-but-slow. Moreover, we maintain
|
||||
// a single nonempty list to which we prepend the span.
|
||||
if (!span_empty) {
|
||||
nonempty_.prepend(span);
|
||||
}
|
||||
#else
|
||||
// Update the histogram once we populate the span.
|
||||
const uint8_t bitwidth = absl::bit_width(span->Allocated());
|
||||
RecordSpanUtil(bitwidth, /*increase=*/true);
|
||||
if (!span_empty) {
|
||||
const uint8_t index = IndexFor(bitwidth);
|
||||
nonempty_.Add(span, index);
|
||||
span->set_nonempty_index(index);
|
||||
}
|
||||
#endif
|
||||
RecordSpanAllocated();
|
||||
return result;
|
||||
}
|
||||
|
||||
template <class Forwarder>
|
||||
inline size_t CentralFreeList<Forwarder>::OverheadBytes() const {
|
||||
if (ABSL_PREDICT_FALSE(object_size_ == 0)) {
|
||||
return 0;
|
||||
}
|
||||
const size_t overhead_per_span = pages_per_span_.in_bytes() % object_size_;
|
||||
return num_spans() * overhead_per_span;
|
||||
}
|
||||
|
||||
template <class Forwarder>
|
||||
inline SpanStats CentralFreeList<Forwarder>::GetSpanStats() const {
|
||||
SpanStats stats;
|
||||
if (ABSL_PREDICT_FALSE(objects_per_span_ == 0)) {
|
||||
return stats;
|
||||
}
|
||||
stats.num_spans_requested = static_cast<size_t>(num_spans_requested_.value());
|
||||
stats.num_spans_returned = static_cast<size_t>(num_spans_returned_.value());
|
||||
stats.obj_capacity = stats.num_live_spans() * objects_per_span_;
|
||||
return stats;
|
||||
}
|
||||
|
||||
template <class Forwarder>
|
||||
inline size_t CentralFreeList<Forwarder>::NumSpansWith(
|
||||
uint16_t bitwidth) const {
|
||||
ASSERT(bitwidth > 0);
|
||||
const int bucket = bitwidth - 1;
|
||||
return objects_to_spans_[bucket].value();
|
||||
}
|
||||
|
||||
template <class Forwarder>
|
||||
inline void CentralFreeList<Forwarder>::PrintSpanUtilStats(Printer* out) const {
|
||||
out->printf("class %3d [ %8zu bytes ] : ", size_class_, object_size_);
|
||||
for (size_t i = 1; i <= kSpanUtilBucketCapacity; ++i) {
|
||||
out->printf("%6zu < %zu", NumSpansWith(i), 1 << i);
|
||||
if (i < kSpanUtilBucketCapacity) {
|
||||
out->printf(",");
|
||||
}
|
||||
}
|
||||
out->printf("\n");
|
||||
}
|
||||
|
||||
template <class Forwarder>
|
||||
inline void CentralFreeList<Forwarder>::PrintSpanUtilStatsInPbtxt(
|
||||
PbtxtRegion* region) const {
|
||||
for (size_t i = 1; i <= kSpanUtilBucketCapacity; ++i) {
|
||||
PbtxtRegion histogram = region->CreateSubRegion("span_util_histogram");
|
||||
histogram.PrintI64("lower_bound", 1 << (i - 1));
|
||||
histogram.PrintI64("upper_bound", 1 << i);
|
||||
histogram.PrintI64("value", NumSpansWith(i));
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace central_freelist_internal
|
||||
|
||||
using CentralFreeList = central_freelist_internal::CentralFreeList<
|
||||
central_freelist_internal::StaticForwarder>;
|
||||
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
GOOGLE_MALLOC_SECTION_END
|
||||
|
||||
#endif // TCMALLOC_CENTRAL_FREELIST_H_
|
||||
201
src/third_party/tcmalloc/dist/tcmalloc/central_freelist_benchmark.cc
vendored
Normal file
@ -0,0 +1,201 @@
|
||||
// Copyright 2021 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
|
||||
#include "absl/algorithm/container.h"
|
||||
#include "absl/random/random.h"
|
||||
#include "benchmark/benchmark.h"
|
||||
#include "tcmalloc/central_freelist.h"
|
||||
#include "tcmalloc/common.h"
|
||||
#include "tcmalloc/static_vars.h"
|
||||
#include "tcmalloc/tcmalloc_policy.h"
|
||||
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
namespace {
|
||||
|
||||
// This benchmark measures how long it takes to populate multiple
|
||||
// spans. The spans are freed in the same order as they were populated
|
||||
// to minimize the time it takes to free them.
|
||||
void BM_Populate(benchmark::State& state) {
|
||||
size_t object_size = state.range(0);
|
||||
size_t size_class = tc_globals.sizemap().SizeClass(CppPolicy(), object_size);
|
||||
int batch_size = tc_globals.sizemap().num_objects_to_move(size_class);
|
||||
int num_objects = 64 * 1024 * 1024 / object_size;
|
||||
const int num_batches = num_objects / batch_size;
|
||||
CentralFreeList cfl;
|
||||
// Initialize the span to contain the appropriate size of object.
|
||||
cfl.Init(size_class);
|
||||
|
||||
// Allocate an array large enough to hold 64 MiB of objects.
|
||||
std::vector<void*> buffer(num_objects);
|
||||
int64_t items_processed = 0;
|
||||
absl::BitGen rnd;
|
||||
|
||||
while (state.KeepRunningBatch(num_batches)) {
|
||||
int index = 0;
|
||||
// The cost of fetching objects will include the cost of fetching and
|
||||
// populating the span.
|
||||
while (index < num_objects) {
|
||||
int count = std::min(batch_size, num_objects - index);
|
||||
int got = cfl.RemoveRange(&buffer[index], count);
|
||||
index += got;
|
||||
}
|
||||
|
||||
// Don't include the cost of returning the objects to the span, and the
|
||||
// span to the pageheap.
|
||||
state.PauseTiming();
|
||||
index = 0;
|
||||
while (index < num_objects) {
|
||||
uint64_t count = std::min(batch_size, num_objects - index);
|
||||
cfl.InsertRange({&buffer[index], count});
|
||||
index += count;
|
||||
}
|
||||
items_processed += index;
|
||||
state.ResumeTiming();
|
||||
}
|
||||
state.SetItemsProcessed(items_processed);
|
||||
}
|
||||
BENCHMARK(BM_Populate)
|
||||
->DenseRange(8, 64, 16)
|
||||
->DenseRange(64, 1024, 64)
|
||||
->DenseRange(4096, 28 * 1024, 4096)
|
||||
->DenseRange(32 * 1024, 256 * 1024, 32 * 1024);
|
||||
|
||||
// This benchmark fills a large array with objects, shuffles the objects
|
||||
// and then returns them.
|
||||
// This should be relatively representative of what happens at runtime.
|
||||
// Fetching objects from the CFL is usually done in batches, but returning
|
||||
// them is usually done spread over many active spans.
|
||||
void BM_MixAndReturn(benchmark::State& state) {
|
||||
size_t object_size = state.range(0);
|
||||
size_t size_class = tc_globals.sizemap().SizeClass(CppPolicy(), object_size);
|
||||
int batch_size = tc_globals.sizemap().num_objects_to_move(size_class);
|
||||
int num_objects = 64 * 1024 * 1024 / object_size;
|
||||
const int num_batches = num_objects / batch_size;
|
||||
CentralFreeList cfl;
|
||||
// Initialize the span to contain the appropriate size of object.
|
||||
cfl.Init(size_class);
|
||||
|
||||
// Allocate an array large enough to hold 64 MiB of objects.
|
||||
std::vector<void*> buffer(num_objects);
|
||||
int64_t items_processed = 0;
|
||||
absl::BitGen rnd;
|
||||
|
||||
while (state.KeepRunningBatch(num_batches)) {
|
||||
int index = 0;
|
||||
while (index < num_objects) {
|
||||
int count = std::min(batch_size, num_objects - index);
|
||||
int got = cfl.RemoveRange(&buffer[index], count);
|
||||
index += got;
|
||||
}
|
||||
|
||||
state.PauseTiming();
|
||||
// Shuffle the vector so that we don't return the objects in the same
|
||||
// order as they were allocated.
|
||||
absl::c_shuffle(buffer, rnd);
|
||||
state.ResumeTiming();
|
||||
|
||||
index = 0;
|
||||
while (index < num_objects) {
|
||||
unsigned int count = std::min(batch_size, num_objects - index);
|
||||
cfl.InsertRange({&buffer[index], count});
|
||||
index += count;
|
||||
}
|
||||
items_processed += index;
|
||||
}
|
||||
state.SetItemsProcessed(items_processed);
|
||||
}
|
||||
BENCHMARK(BM_MixAndReturn)
|
||||
->DenseRange(8, 64, 16)
|
||||
->DenseRange(64, 1024, 64)
|
||||
->DenseRange(4096, 28 * 1024, 4096)
|
||||
->DenseRange(32 * 1024, 256 * 1024, 32 * 1024);
|
||||
|
||||
// This benchmark holds onto half the allocated objects so that (except for
|
||||
// single object spans) spans are never allocated or freed during the
|
||||
// benchmark run. This evaluates the performance of just the span handling
|
||||
// code, and avoids timing the pageheap code.
|
||||
void BM_SpanReuse(benchmark::State& state) {
|
||||
size_t object_size = state.range(0);
|
||||
size_t size_class = tc_globals.sizemap().SizeClass(CppPolicy(), object_size);
|
||||
int batch_size = tc_globals.sizemap().num_objects_to_move(size_class);
|
||||
int num_objects = 64 * 1024 * 1024 / object_size;
|
||||
const int num_batches = num_objects / batch_size;
|
||||
CentralFreeList cfl;
|
||||
// Initialize the span to contain the appropriate size of object.
|
||||
cfl.Init(size_class);
|
||||
|
||||
// Array used to hold onto half of the objects
|
||||
std::vector<void*> held_objects(2 * num_objects);
|
||||
// Request twice the objects we need
|
||||
for (int index = 0; index < 2 * num_objects;) {
|
||||
int count = std::min(batch_size, 2 * num_objects - index);
|
||||
int got = cfl.RemoveRange(&held_objects[index], count);
|
||||
index += got;
|
||||
}
|
||||
|
||||
// Return half of the objects. This will stop the spans from being
|
||||
// returned to the pageheap. So future operations will not touch the
|
||||
// pageheap.
|
||||
for (int index = 0; index < 2 * num_objects; index += 2) {
|
||||
cfl.InsertRange({&held_objects[index], 1});
|
||||
}
|
||||
// Allocate an array large enough to hold 64 MiB of objects.
|
||||
std::vector<void*> buffer(num_objects);
|
||||
int64_t items_processed = 0;
|
||||
absl::BitGen rnd;
|
||||
|
||||
while (state.KeepRunningBatch(num_batches)) {
|
||||
int index = 0;
|
||||
while (index < num_objects) {
|
||||
int count = std::min(batch_size, num_objects - index);
|
||||
int got = cfl.RemoveRange(&buffer[index], count);
|
||||
index += got;
|
||||
}
|
||||
|
||||
state.PauseTiming();
|
||||
// Shuffle the vector so that we don't return the objects in the same
|
||||
// order as they were allocated.
|
||||
absl::c_shuffle(buffer, rnd);
|
||||
state.ResumeTiming();
|
||||
|
||||
index = 0;
|
||||
while (index < num_objects) {
|
||||
uint64_t count = std::min(batch_size, num_objects - index);
|
||||
cfl.InsertRange({&buffer[index], count});
|
||||
index += count;
|
||||
}
|
||||
items_processed += index;
|
||||
}
|
||||
state.SetItemsProcessed(items_processed);
|
||||
|
||||
// Return the other half of the objects.
|
||||
for (int index = 1; index < 2 * num_objects; index += 2) {
|
||||
cfl.InsertRange({&held_objects[index], 1});
|
||||
}
|
||||
}
|
||||
// Want to avoid benchmarking spans where there is a single object per span.
|
||||
BENCHMARK(BM_SpanReuse)
|
||||
->DenseRange(8, 64, 16)
|
||||
->DenseRange(64, 1024, 64)
|
||||
->DenseRange(1024, 4096, 512);
|
||||
|
||||
} // namespace
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
792
src/third_party/tcmalloc/dist/tcmalloc/central_freelist_test.cc
vendored
Normal file
@ -0,0 +1,792 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "tcmalloc/central_freelist.h"
|
||||
|
||||
#include <math.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "benchmark/benchmark.h"
|
||||
#include "gmock/gmock.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "absl/algorithm/container.h"
|
||||
#include "absl/base/thread_annotations.h"
|
||||
#include "absl/container/fixed_array.h"
|
||||
#include "absl/memory/memory.h"
|
||||
#include "absl/numeric/bits.h"
|
||||
#include "absl/random/random.h"
|
||||
#include "absl/synchronization/mutex.h"
|
||||
#include "absl/time/clock.h"
|
||||
#include "absl/time/time.h"
|
||||
#include "absl/types/span.h"
|
||||
#include "tcmalloc/common.h"
|
||||
#include "tcmalloc/internal/logging.h"
|
||||
#include "tcmalloc/mock_static_forwarder.h"
|
||||
#include "tcmalloc/pagemap.h"
|
||||
#include "tcmalloc/static_vars.h"
|
||||
#include "tcmalloc/testing/thread_manager.h"
|
||||
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
|
||||
namespace central_freelist_internal {
|
||||
|
||||
// Fixture parameterized on a size class. SetUp() derives the per-class
// geometry used by the tests and skips size classes that cannot be exercised
// in the current configuration.
class StaticForwarderTest : public testing::TestWithParam<size_t> {
 protected:
  size_t size_class_;
  size_t object_size_;
  Length pages_per_span_;
  size_t batch_size_;
  size_t objects_per_span_;

 private:
  void SetUp() override {
    size_class_ = GetParam();

    const bool expanded = IsExpandedSizeClass(size_class_);
    if (expanded) {
#if ABSL_HAVE_THREAD_SANITIZER
      GTEST_SKIP() << "Skipping test under sanitizers that conflict with "
                      "address placement";
#endif

      // If !ColdFeatureActive(), we will use the normal page heap, which will
      // keep us from seeing memory get the expected tags.
      if (!ColdFeatureActive()) {
        GTEST_SKIP()
            << "Skipping expanded size classes without cold experiment";
      }
    }

    auto&& size_map = tc_globals.sizemap();
    object_size_ = size_map.class_to_size(size_class_);
    if (object_size_ == 0) {
      GTEST_SKIP() << "Skipping empty size class.";
    }

    pages_per_span_ = Length(size_map.class_to_pages(size_class_));
    batch_size_ = size_map.num_objects_to_move(size_class_);
    objects_per_span_ = pages_per_span_.in_bytes() / object_size_;
  }
};
|
||||
|
||||
// Allocates one span, carves it entirely into objects, checks that the
// page map reports the right size class and maps every object back to the
// span, then returns the objects and deallocates the span.
TEST_P(StaticForwarderTest, Simple) {
  Span* span = StaticForwarder::AllocateSpan(size_class_, objects_per_span_,
                                             pages_per_span_);
  ASSERT_NE(span, nullptr);

  absl::FixedArray<void*> objects(objects_per_span_);
  const size_t allocated = span->BuildFreelist(
      object_size_, objects_per_span_, objects.data(), objects_per_span_);
  ASSERT_EQ(allocated, objects_per_span_);

  auto&& page_map = tc_globals.pagemap();
  EXPECT_EQ(size_class_, page_map.sizeclass(span->first_page()));
  EXPECT_EQ(size_class_, page_map.sizeclass(span->last_page()));

  // span_test.cc provides test coverage for Span, but we need to obtain
  // several objects to confirm we can map back to the Span pointer from the
  // PageMap.
  for (void* object : objects) {
    EXPECT_EQ(span, StaticForwarder::MapObjectToSpan(object));
  }

  // Hand every object back before the span is deallocated.
  for (void* object : objects) {
    span->FreelistPush(object, object_size_);
  }

  StaticForwarder::DeallocateSpans(size_class_, objects_per_span_,
                                   absl::MakeSpan(&span, 1));
}
|
||||
|
||||
class StaticForwarderEnvironment {
|
||||
struct SpanData {
|
||||
Span* span;
|
||||
void* batch[kMaxObjectsToMove];
|
||||
};
|
||||
|
||||
public:
|
||||
StaticForwarderEnvironment(int size_class, size_t object_size,
|
||||
size_t objects_per_span, Length pages_per_span,
|
||||
int batch_size)
|
||||
: size_class_(size_class),
|
||||
object_size_(object_size),
|
||||
objects_per_span_(objects_per_span),
|
||||
pages_per_span_(pages_per_span),
|
||||
batch_size_(batch_size) {}
|
||||
|
||||
~StaticForwarderEnvironment() { Drain(); }
|
||||
|
||||
void RandomlyPoke() {
|
||||
absl::BitGen rng;
|
||||
double coin = absl::Uniform(rng, 0.0, 1.0);
|
||||
|
||||
if (coin < 0.5) {
|
||||
Grow();
|
||||
} else if (coin < 0.9) {
|
||||
// Deallocate Spans. We may deallocate more than 1 span, so we bias
|
||||
// towards allocating Spans more often than we deallocate.
|
||||
Shrink();
|
||||
} else {
|
||||
Shuffle(rng);
|
||||
}
|
||||
}
|
||||
|
||||
void Drain() {
|
||||
std::vector<std::unique_ptr<SpanData>> spans;
|
||||
|
||||
{
|
||||
absl::MutexLock l(&mu_);
|
||||
if (data_.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
spans = std::move(data_);
|
||||
data_.clear();
|
||||
}
|
||||
|
||||
// Check mappings.
|
||||
std::vector<Span*> free_spans;
|
||||
for (const auto& data : spans) {
|
||||
EXPECT_EQ(size_class_,
|
||||
tc_globals.pagemap().sizeclass(data->span->first_page()));
|
||||
EXPECT_EQ(size_class_,
|
||||
tc_globals.pagemap().sizeclass(data->span->last_page()));
|
||||
// Confirm we can map at least one object back.
|
||||
EXPECT_EQ(data->span, StaticForwarder::MapObjectToSpan(data->batch[0]));
|
||||
|
||||
free_spans.push_back(data->span);
|
||||
}
|
||||
|
||||
StaticForwarder::DeallocateSpans(size_class_, objects_per_span_,
|
||||
absl::MakeSpan(free_spans));
|
||||
}
|
||||
|
||||
void Grow() {
|
||||
// Allocate a Span
|
||||
Span* span = StaticForwarder::AllocateSpan(size_class_, objects_per_span_,
|
||||
pages_per_span_);
|
||||
ASSERT_NE(span, nullptr);
|
||||
|
||||
auto d = absl::make_unique<SpanData>();
|
||||
d->span = span;
|
||||
|
||||
size_t allocated = span->BuildFreelist(object_size_, objects_per_span_,
|
||||
d->batch, batch_size_);
|
||||
EXPECT_LE(allocated, objects_per_span_);
|
||||
|
||||
EXPECT_EQ(size_class_, tc_globals.pagemap().sizeclass(span->first_page()));
|
||||
EXPECT_EQ(size_class_, tc_globals.pagemap().sizeclass(span->last_page()));
|
||||
// Confirm we can map at least one object back.
|
||||
EXPECT_EQ(span, StaticForwarder::MapObjectToSpan(d->batch[0]));
|
||||
|
||||
absl::MutexLock l(&mu_);
|
||||
spans_allocated_++;
|
||||
data_.push_back(std::move(d));
|
||||
}
|
||||
|
||||
void Shrink() {
|
||||
absl::BitGen rng;
|
||||
std::vector<std::unique_ptr<SpanData>> spans;
|
||||
|
||||
{
|
||||
absl::MutexLock l(&mu_);
|
||||
if (data_.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
size_t count = absl::LogUniform<size_t>(rng, 1, data_.size());
|
||||
spans.reserve(count);
|
||||
|
||||
for (int i = 0; i < count; i++) {
|
||||
spans.push_back(std::move(data_.back()));
|
||||
data_.pop_back();
|
||||
}
|
||||
}
|
||||
|
||||
// Check mappings.
|
||||
std::vector<Span*> free_spans;
|
||||
for (auto& data : spans) {
|
||||
EXPECT_EQ(size_class_,
|
||||
tc_globals.pagemap().sizeclass(data->span->first_page()));
|
||||
EXPECT_EQ(size_class_,
|
||||
tc_globals.pagemap().sizeclass(data->span->last_page()));
|
||||
// Confirm we can map at least one object back.
|
||||
EXPECT_EQ(data->span, StaticForwarder::MapObjectToSpan(data->batch[0]));
|
||||
|
||||
free_spans.push_back(data->span);
|
||||
}
|
||||
|
||||
StaticForwarder::DeallocateSpans(size_class_, objects_per_span_,
|
||||
absl::MakeSpan(free_spans));
|
||||
}
|
||||
|
||||
void Shuffle(absl::BitGen& rng) {
|
||||
// Shuffle the shared vector.
|
||||
absl::MutexLock l(&mu_);
|
||||
absl::c_shuffle(data_, rng);
|
||||
}
|
||||
|
||||
int64_t BytesAllocated() {
|
||||
absl::MutexLock l(&mu_);
|
||||
return pages_per_span_.in_bytes() * spans_allocated_;
|
||||
}
|
||||
|
||||
private:
|
||||
int size_class_;
|
||||
size_t object_size_;
|
||||
size_t objects_per_span_;
|
||||
Length pages_per_span_;
|
||||
int batch_size_;
|
||||
|
||||
absl::Mutex mu_;
|
||||
int64_t spans_allocated_ ABSL_GUARDED_BY(mu_) = 0;
|
||||
std::vector<std::unique_ptr<SpanData>> data_ ABSL_GUARDED_BY(mu_);
|
||||
};
|
||||
|
||||
// Reads the page allocator's backing stats while holding pageheap_lock.
static BackingStats PageHeapStats() {
  absl::base_internal::SpinLockHolder holder(&pageheap_lock);
  BackingStats snapshot = tc_globals.page_allocator().stats();
  return snapshot;
}
|
||||
|
||||
// Runs several threads that randomly poke a shared StaticForwarderEnvironment
// for a short interval, then checks that the page heap did not grow in step
// with the number of spans that were allocated.
TEST_P(StaticForwarderTest, Fuzz) {
#if ABSL_HAVE_THREAD_SANITIZER
  // TODO(b/193887621): Enable this test under TSan after addressing benign
  // true positives.
  GTEST_SKIP() << "Skipping test under Thread Sanitizer.";
#endif  // ABSL_HAVE_THREAD_SANITIZER

  constexpr int kNumThreads = 10;

  const BackingStats stats_before = PageHeapStats();

  StaticForwarderEnvironment env(size_class_, object_size_, objects_per_span_,
                                 pages_per_span_, batch_size_);
  ThreadManager workers;
  workers.Start(kNumThreads, [&](int) { env.RandomlyPoke(); });

  absl::SleepFor(absl::Seconds(0.2));

  workers.Stop();

  const BackingStats stats_after = PageHeapStats();

  // Confirm we did not leak Spans by ensuring the page heap did not grow nearly
  // 1:1 by the total number of Spans we ever allocated.
  //
  // Since we expect to allocate a significant number of spans, we apply a
  // factor of 1/2 (which is unlikely to be flaky) to avoid false negatives
  // if/when a background thread triggers a deallocation.
  const int64_t total_allocated = env.BytesAllocated();
  EXPECT_GT(total_allocated, 0);

  const int64_t heap_growth = static_cast<int64_t>(stats_after.system_bytes) -
                              static_cast<int64_t>(stats_before.system_bytes);
  EXPECT_LE(heap_growth, total_allocated / 2);
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(All, StaticForwarderTest,
|
||||
testing::Range(size_t(1), kNumClasses));
|
||||
|
||||
} // namespace central_freelist_internal
|
||||
|
||||
namespace {
|
||||
|
||||
using central_freelist_internal::kNumLists;
|
||||
template <typename Env>
|
||||
using CentralFreeListTest = ::testing::Test;
|
||||
TYPED_TEST_SUITE_P(CentralFreeListTest);
|
||||
|
||||
// Fetches a single batch from a fresh freelist and returns it, checking the
// forwarder calls, the span stats, and the utilization histogram at each step.
TYPED_TEST_P(CentralFreeListTest, IsolatedSmoke) {
  TypeParam env;

  EXPECT_CALL(env.forwarder(), AllocateSpan).Times(1);

  absl::FixedArray<void*> objs(TypeParam::kBatchSize);
  const int num_got =
      env.central_freelist().RemoveRange(objs.data(), TypeParam::kBatchSize);
  ASSERT_GT(num_got, 0);
  EXPECT_LE(num_got, TypeParam::kBatchSize);

  // We should observe span's utilization captured in the histogram. The number
  // of spans in rest of the buckets should be zero.
  const int got_bitwidth = absl::bit_width(static_cast<unsigned>(num_got));
  const int num_buckets = absl::bit_width(TypeParam::kObjectsPerSpan);
  for (int width = 1; width <= num_buckets; ++width) {
    const int want = (width == got_bitwidth) ? 1 : 0;
    EXPECT_EQ(env.central_freelist().NumSpansWith(width), want);
  }

  EXPECT_CALL(env.forwarder(), MapObjectToSpan).Times(num_got);
  EXPECT_CALL(env.forwarder(), DeallocateSpans).Times(1);

  // One span requested, none returned yet.
  SpanStats span_stats = env.central_freelist().GetSpanStats();
  EXPECT_EQ(span_stats.num_spans_requested, 1);
  EXPECT_EQ(span_stats.num_spans_returned, 0);
  EXPECT_EQ(span_stats.obj_capacity, 1024);

  env.central_freelist().InsertRange(absl::MakeSpan(objs.data(), num_got));

  // After returning every object the span has been handed back.
  span_stats = env.central_freelist().GetSpanStats();
  EXPECT_EQ(span_stats.num_spans_requested, 1);
  EXPECT_EQ(span_stats.num_spans_returned, 1);
  EXPECT_EQ(span_stats.obj_capacity, 0);

  // Span captured in the histogram with the earlier utilization should have
  // been removed.
  for (int width = 1; width <= num_buckets; ++width) {
    EXPECT_EQ(env.central_freelist().NumSpansWith(width), 0);
  }
}
|
||||
|
||||
// Fetches every object from kNumSpans spans, then returns them in random
// order one batch at a time. After each batch the histogram reported by
// NumSpansWith() is compared against one recomputed from the per-span
// allocation counts tracked by the test.
TYPED_TEST_P(CentralFreeListTest, SpanUtilizationHistogram) {
  TypeParam e;

  constexpr size_t kNumSpans = 10;

  // Request kNumSpans spans.
  void* batch[kMaxObjectsToMove];
  const int num_objects_to_fetch = kNumSpans * TypeParam::kObjectsPerSpan;
  int total_fetched = 0;
  // Tracks object and corresponding span idx from which it was allocated.
  std::vector<std::pair<void*, int>> objects_to_span_idx;
  // Tracks number of objects allocated per span.
  std::vector<size_t> allocated_per_span(kNumSpans, 0);
  int span_idx = 0;

  while (total_fetched < num_objects_to_fetch) {
    size_t n = num_objects_to_fetch - total_fetched;
    int got = e.central_freelist().RemoveRange(
        batch, std::min(n, TypeParam::kBatchSize));
    total_fetched += got;

    // Increment span_idx if current objects have been fetched from the new
    // span.
    if (total_fetched > (span_idx + 1) * TypeParam::kObjectsPerSpan) {
      ++span_idx;
    }
    // Record fetched object and associated span index.
    for (int i = 0; i < got; ++i) {
      objects_to_span_idx.push_back(std::make_pair(batch[i], span_idx));
    }
    ASSERT(span_idx < kNumSpans);
    allocated_per_span[span_idx] += got;
  }

  // Make sure that we have fetched exactly from kNumSpans spans.
  EXPECT_EQ(span_idx + 1, kNumSpans);

  // Number of histogram buckets. NumSpansWith() is queried with bit widths
  // 1..num_buckets, and bit width `w` corresponds to expected[w - 1] below.
  const int num_buckets = absl::bit_width(TypeParam::kObjectsPerSpan);

  // We should have kNumSpans spans in the histogram with number of allocated
  // objects equal to TypeParam::kObjectsPerSpan (i.e. in the last bucket).
  // Rest of the buckets should be empty.
  EXPECT_EQ(e.central_freelist().NumSpansWith(num_buckets), kNumSpans);
  for (int i = 1; i < num_buckets; ++i) {
    EXPECT_EQ(e.central_freelist().NumSpansWith(i), 0);
  }

  // Shuffle.
  absl::BitGen rng;
  std::shuffle(objects_to_span_idx.begin(), objects_to_span_idx.end(), rng);

  // Return objects, a fraction at a time, each time checking that histogram is
  // correct.
  int total_returned = 0;
  const int last_bucket = num_buckets - 1;
  while (total_returned < num_objects_to_fetch) {
    uint64_t size_to_pop = std::min(objects_to_span_idx.size() - total_returned,
                                    TypeParam::kBatchSize);

    for (int i = 0; i < size_to_pop; ++i) {
      const auto [ptr, owner_idx] = objects_to_span_idx[i + total_returned];
      batch[i] = ptr;
      ASSERT(owner_idx < kNumSpans);
      --allocated_per_span[owner_idx];
    }
    total_returned += size_to_pop;
    e.central_freelist().InsertRange({batch, size_to_pop});

    // Calculate expected histogram.
    size_t expected[absl::bit_width(TypeParam::kObjectsPerSpan)] = {0};
    for (int i = 0; i < kNumSpans; ++i) {
      // If span has non-zero allocated objects, include it in the histogram.
      if (allocated_per_span[i]) {
        const size_t bucket = absl::bit_width(allocated_per_span[i]) - 1;
        ASSERT(bucket <= last_bucket);
        ++expected[bucket];
      }
    }

    // Fetch number of spans logged in the histogram and compare it with the
    // expected histogram that we calculated using the tracked allocated
    // objects per span. The loop runs through num_buckets inclusive so that
    // the top bucket (expected[last_bucket]) is verified as well.
    for (int i = 1; i <= num_buckets; ++i) {
      EXPECT_EQ(e.central_freelist().NumSpansWith(i), expected[i - 1]);
    }
  }

  // Since no span is live here, histogram must be empty in every bucket,
  // including the top one.
  for (int i = 1; i <= num_buckets; ++i) {
    EXPECT_EQ(e.central_freelist().NumSpansWith(i), 0);
  }
}
|
||||
|
||||
// Confirms that a call to RemoveRange returns at most kObjectsPerSpan objects
|
||||
// in cases when there are no non-empty spans in the central freelist. This
|
||||
// makes sure that we populate, and subsequently allocate from a single span.
|
||||
// This avoids memory regression due to multiple Populate calls observed in
|
||||
// b/225880278.
|
||||
// Asks an empty freelist for far more objects than one span holds and checks
// that no more than a single span's worth comes back; everything fetched is
// then returned in batch-sized chunks.
TYPED_TEST_P(CentralFreeListTest, SinglePopulate) {
  // Make sure that we allocate up to kObjectsPerSpan objects in both the span
  // prioritization states.
  TypeParam env;
  // Try to fetch sufficiently large number of objects at startup.
  const int num_objects_to_fetch = 10 * TypeParam::kObjectsPerSpan;
  void* objects[num_objects_to_fetch];
  const size_t num_got =
      env.central_freelist().RemoveRange(objects, num_objects_to_fetch);
  // Confirm we allocated at most kObjectsPerSpan number of objects.
  EXPECT_GT(num_got, 0);
  EXPECT_LE(num_got, TypeParam::kObjectsPerSpan);

  for (size_t done = 0; done < num_got;) {
    const size_t chunk = std::min(num_got - done, TypeParam::kBatchSize);
    env.central_freelist().InsertRange({objects + done, chunk});
    done += chunk;
  }
}
|
||||
|
||||
// Checks if we are indexing a span in the nonempty_ lists as expected.
|
||||
// Allocates a whole span one object at a time, then frees it one object at a
// time, checking after every step which nonempty_ list the span is in.
TYPED_TEST_P(CentralFreeListTest, MultiNonEmptyLists) {
  TypeParam env;

  ASSERT(kNumLists > 0);
  const int span_capacity = TypeParam::kObjectsPerSpan;
  std::vector<void*> objs(span_capacity);
  size_t fetched = 0;
  int expected_idx = kNumLists - 1;
  int prev_bitwidth = 1;

  // Fetch one object at a time from a span and confirm that the span is moved
  // through the nonempty_ lists as we allocate more objects from it.
  while (fetched < span_capacity) {
    // Try to fetch one object from the span.
    const int got = env.central_freelist().RemoveRange(&objs[fetched], 1);
    fetched += got;
    ASSERT(fetched);
    const size_t cur_bitwidth = absl::bit_width(fetched);
    // We index nonempty_ lists based on log2(allocated) and so, we update the
    // index when the bit_width changes.
    if (cur_bitwidth != prev_bitwidth) {
      // We ceil spans to nonempty_[0] when allocated objects from the span
      // increases above 2^(kNumLists-1).
      expected_idx = std::max(expected_idx - 1, 0);
      prev_bitwidth = cur_bitwidth;
    }
    ASSERT(expected_idx >= 0);
    ASSERT(expected_idx < kNumLists);
    // Once every object has been allocated the span should have left the
    // nonempty_ lists; until then it should sit in exactly one of them.
    const bool span_exhausted = (fetched % span_capacity == 0);
    EXPECT_EQ(env.central_freelist().NumSpansInList(expected_idx),
              span_exhausted ? 0 : 1);
  }

  // Similar to our previous test, we now make sure that the span is moved
  // through the nonempty_ lists when we deallocate objects back to it.
  size_t remaining = fetched;

  // We ceil spans to nonempty_[0] when allocated objects from the span
  // increases above 2^(kNumLists-1).
  const size_t threshold = size_t{1} << (kNumLists - 1);
  for (--remaining; remaining > 0; --remaining) {
    // Return objects back to the span one at a time.
    env.central_freelist().InsertRange({&objs[remaining], 1});
    ASSERT(remaining);
    const size_t cur_bitwidth = absl::bit_width(remaining);
    // If we cross pow2 boundaries, update the expected index into nonempty_
    // lists.
    if (cur_bitwidth != prev_bitwidth) {
      // When allocated objects are more than the threshold, the span is indexed
      // to nonempty_ list 0.
      if (remaining < threshold) {
        ++expected_idx;
      } else {
        expected_idx = 0;
      }
      prev_bitwidth = cur_bitwidth;
    }
    EXPECT_LT(expected_idx, kNumLists);
    EXPECT_EQ(env.central_freelist().NumSpansInList(expected_idx), 1);
  }

  // When the last object is returned, we release the span to the page heap. So,
  // nonempty_[0] should also be empty.
  env.central_freelist().InsertRange({&objs[remaining], 1});
  EXPECT_EQ(env.central_freelist().NumSpansInList(0), 0);
}
|
||||
|
||||
// Checks if we are indexing a span in the nonempty_ lists as expected. We also
|
||||
// check if the spans are correctly being prioritized. That is, we create a
|
||||
// scenario where we have two live spans, and one span has more allocated
|
||||
// objects than the other span. On subsequent allocations, we confirm that the
|
||||
// objects are allocated from the span with a higher number of allocated objects
|
||||
// as enforced by our prioritization scheme.
|
||||
// Builds two live spans with different allocation counts and checks, via
// NumSpansInList(), which nonempty_ list each one sits in before and after a
// further single-object allocation.
TYPED_TEST_P(CentralFreeListTest, SpanPriority) {
  TypeParam env;

  // If the number of objects per span is less than 3, we do not use more than
  // one nonempty_ lists. So, we can not prioritize the spans based on how many
  // objects were allocated from them.
  const int objects_per_span = TypeParam::kObjectsPerSpan;
  if (objects_per_span < 3 || kNumLists < 2) return;

  constexpr int kNumSpans = 2;

  // Track objects allocated per span.
  absl::FixedArray<std::vector<void*>> per_span(kNumSpans);
  void* scratch[kMaxObjectsToMove];

  // Returns the first `count` tracked objects of `span` to the freelist in
  // chunks of at most kBatchSize, then drops them from the bookkeeping.
  const auto release_front = [&](int span, size_t count) {
    size_t released = 0;
    while (released < count) {
      const uint64_t chunk = std::min(count - released, TypeParam::kBatchSize);
      std::copy_n(per_span[span].begin() + released, chunk, scratch);
      released += chunk;
      env.central_freelist().InsertRange({scratch, chunk});
    }
    per_span[span].erase(per_span[span].begin(),
                         per_span[span].begin() + released);
  };

  const size_t to_fetch = objects_per_span;
  // Allocate all objects from kNumSpans.
  for (int span = 0; span < kNumSpans; ++span) {
    for (size_t fetched = 0; fetched < to_fetch;) {
      const size_t want = to_fetch - fetched;
      const int got = env.central_freelist().RemoveRange(
          scratch, std::min(want, TypeParam::kBatchSize));
      for (int i = 0; i < got; ++i) {
        per_span[span].push_back(scratch[i]);
      }
      fetched += got;
    }
  }

  // Perform deallocations so that each span contains only two objects.
  for (int span = 0; span < kNumSpans; ++span) {
    release_front(span, to_fetch - 2);
  }

  // Make sure we have kNumSpans in the expected second-last nonempty_ list.
  EXPECT_EQ(env.central_freelist().NumSpansInList(kNumLists - 2), kNumSpans);

  // Release an additional object from all but one spans so that they are
  // deprioritized for subsequent allocations.
  for (int span = 1; span < kNumSpans; ++span) {
    release_front(span, 1);
  }

  // Make sure we have kNumSpans-1 spans in the last nonempty_ list and just one
  // span in the second-last list.
  EXPECT_EQ(env.central_freelist().NumSpansInList(kNumLists - 1),
            kNumSpans - 1);
  EXPECT_EQ(env.central_freelist().NumSpansInList(kNumLists - 2), 1);

  // Allocate one object to ensure that it is being allocated from the span with
  // the highest number of allocated objects.
  const int got = env.central_freelist().RemoveRange(scratch, 1);
  EXPECT_EQ(got, 1);
  // Number of spans in the last nonempty_ list should be unchanged (i.e.
  // kNumSpans-1).
  EXPECT_EQ(env.central_freelist().NumSpansInList(kNumLists - 1),
            kNumSpans - 1);
  // We should have only one span in the second-last nonempty_ list; this is the
  // span from which we should have allocated the last object.
  EXPECT_EQ(env.central_freelist().NumSpansInList(kNumLists - 2), 1);
  // Return previously allocated object.
  env.central_freelist().InsertRange({scratch, 1});

  // Return rest of the objects.
  for (int span = 0; span < kNumSpans; ++span) {
    for (void*& object : per_span[span]) {
      env.central_freelist().InsertRange({&object, 1});
    }
  }
}
|
||||
|
||||
// Allocates every object from kNumSpans spans and returns them in random
// order, checking span stats and the utilization histogram when full, once
// at least half the objects are back, and when empty.
TYPED_TEST_P(CentralFreeListTest, MultipleSpans) {
  TypeParam env;
  std::vector<void*> fetched_objects;

  constexpr size_t kNumSpans = 10;

  // Request kNumSpans spans.
  void* scratch[kMaxObjectsToMove];
  const int num_objects_to_fetch = kNumSpans * TypeParam::kObjectsPerSpan;
  for (int total_fetched = 0; total_fetched < num_objects_to_fetch;) {
    const size_t want = num_objects_to_fetch - total_fetched;
    const int got = env.central_freelist().RemoveRange(
        scratch, std::min(want, TypeParam::kBatchSize));
    for (int i = 0; i < got; ++i) {
      fetched_objects.push_back(scratch[i]);
    }
    total_fetched += got;
  }

  // We should have kNumSpans spans in the histogram with number of
  // allocated objects equal to TypeParam::kObjectsPerSpan (i.e. in the last
  // bucket). Rest of the buckets should be empty.
  const int num_buckets = absl::bit_width(TypeParam::kObjectsPerSpan);
  EXPECT_EQ(env.central_freelist().NumSpansWith(num_buckets), kNumSpans);
  for (int width = 1; width < num_buckets; ++width) {
    EXPECT_EQ(env.central_freelist().NumSpansWith(width), 0);
  }

  SpanStats span_stats = env.central_freelist().GetSpanStats();
  EXPECT_EQ(span_stats.num_spans_requested, kNumSpans);
  EXPECT_EQ(span_stats.num_spans_returned, 0);

  EXPECT_EQ(fetched_objects.size(), num_objects_to_fetch);

  // Shuffle
  absl::BitGen bitgen;
  std::shuffle(fetched_objects.begin(), fetched_objects.end(), bitgen);

  // Return all
  int total_returned = 0;
  bool checked_half = false;
  while (total_returned < num_objects_to_fetch) {
    const uint64_t chunk = std::min(fetched_objects.size() - total_returned,
                                    TypeParam::kBatchSize);
    std::copy_n(fetched_objects.begin() + total_returned, chunk, scratch);
    total_returned += chunk;
    env.central_freelist().InsertRange({scratch, chunk});

    // sanity check, performed once when at least half has been returned
    if (checked_half || total_returned < (num_objects_to_fetch / 2)) {
      continue;
    }
    span_stats = env.central_freelist().GetSpanStats();
    EXPECT_GT(span_stats.num_spans_requested, span_stats.num_spans_returned);
    EXPECT_NE(span_stats.obj_capacity, 0);
    // Total spans recorded in the histogram must be equal to the number of
    // live spans.
    size_t spans_in_histogram = 0;
    for (int width = 1; width <= num_buckets; ++width) {
      spans_in_histogram += env.central_freelist().NumSpansWith(width);
    }
    EXPECT_EQ(spans_in_histogram, span_stats.num_live_spans());
    checked_half = true;
  }

  span_stats = env.central_freelist().GetSpanStats();
  EXPECT_EQ(span_stats.num_spans_requested, span_stats.num_spans_returned);
  // Since no span is live, histogram must be empty.
  for (int width = 1; width <= num_buckets; ++width) {
    EXPECT_EQ(env.central_freelist().NumSpansWith(width), 0);
  }
  EXPECT_EQ(span_stats.obj_capacity, 0);
}
|
||||
|
||||
// Checks that the forwarder's AllocateSpan and DeallocateSpans calls are made
// with TypeParam::kObjectsPerSpan as their second argument.
TYPED_TEST_P(CentralFreeListTest, PassSpanObjectCountToPageheap) {
  ASSERT_GT(TypeParam::kObjectsPerSpan, 1);
  // NOTE(review): `num_objects` is accepted but never read by the body, so
  // both invocations at the bottom exercise the same scenario.
  auto run_scenario = [&](size_t num_objects) {
    TypeParam env;
    std::vector<void*> objs(TypeParam::kObjectsPerSpan);
    EXPECT_CALL(
        env.forwarder(),
        AllocateSpan(testing::_, TypeParam::kObjectsPerSpan, testing::_))
        .Times(1);
    const size_t want =
        std::min(TypeParam::kObjectsPerSpan, TypeParam::kBatchSize);
    const size_t got = env.central_freelist().RemoveRange(objs.data(), want);

    for (size_t done = 0; done < got;) {
      EXPECT_CALL(
          env.forwarder(),
          DeallocateSpans(testing::_, TypeParam::kObjectsPerSpan, testing::_))
          .Times(1);
      const size_t chunk = std::min(got - done, TypeParam::kBatchSize);
      env.central_freelist().InsertRange({&objs[done], chunk});
      done += chunk;
    }
  };
  run_scenario(1);
  run_scenario(TypeParam::kObjectsPerSpan);
}
|
||||
|
||||
// Runs Span::Fragmentation() on one thread while another thread allocates
// from and frees to the same freelist.
TYPED_TEST_P(CentralFreeListTest, SpanFragmentation) {
  // This test is primarily exercising Span itself to model how tcmalloc.cc uses
  // it, but this gives us a self-contained (and sanitizable) implementation of
  // the CentralFreeList.
  TypeParam env;

  // Allocate one object from the CFL to allocate a span.
  void* first_object;
  const int num_got = env.central_freelist().RemoveRange(&first_object, 1);
  ASSERT_EQ(num_got, 1);

  Span* const span =
      env.central_freelist().forwarder().MapObjectToSpan(first_object);
  const size_t object_size =
      env.central_freelist().forwarder().class_to_size(TypeParam::kSizeClass);

  // Reader thread: queries the span's fragmentation.
  ThreadManager reader;
  reader.Start(1, [&](int) {
    benchmark::DoNotOptimize(span->Fragmentation(object_size));
  });

  // Mutator thread: takes one object and gives it straight back.
  ThreadManager mutator;
  mutator.Start(1, [&](int) {
    void* object;
    const int taken = env.central_freelist().RemoveRange(&object, 1);
    env.central_freelist().InsertRange(absl::MakeSpan(&object, taken));
  });

  absl::SleepFor(absl::Seconds(0.1));

  reader.Stop();
  mutator.Stop();

  env.central_freelist().InsertRange(absl::MakeSpan(&first_object, 1));
}
|
||||
|
||||
REGISTER_TYPED_TEST_SUITE_P(CentralFreeListTest, IsolatedSmoke,
|
||||
MultiNonEmptyLists, SpanPriority,
|
||||
SpanUtilizationHistogram, MultipleSpans,
|
||||
SinglePopulate, PassSpanObjectCountToPageheap,
|
||||
SpanFragmentation);
|
||||
|
||||
namespace unit_tests {
|
||||
|
||||
using Env = FakeCentralFreeListEnvironment<
|
||||
central_freelist_internal::CentralFreeList<MockStaticForwarder>>;
|
||||
|
||||
INSTANTIATE_TYPED_TEST_SUITE_P(CentralFreeList, CentralFreeListTest,
|
||||
::testing::Types<Env>);
|
||||
|
||||
} // namespace unit_tests
|
||||
|
||||
} // namespace
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
53
src/third_party/tcmalloc/dist/tcmalloc/common.cc
vendored
Normal file
@ -0,0 +1,53 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "tcmalloc/common.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include "tcmalloc/experiment.h"
|
||||
#include "tcmalloc/internal/environment.h"
|
||||
#include "tcmalloc/internal/optimization.h"
|
||||
#include "tcmalloc/pages.h"
|
||||
#include "tcmalloc/sampler.h"
|
||||
#include "tcmalloc/span.h"
|
||||
|
||||
GOOGLE_MALLOC_SECTION_BEGIN
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
|
||||
// Maps a MemoryTag to its upper-case label. Tags not listed in the switch
// reach ASSUME(false).
absl::string_view MemoryTagToLabel(MemoryTag tag) {
  absl::string_view label;
  switch (tag) {
    case MemoryTag::kNormal:
      label = "NORMAL";
      break;
    case MemoryTag::kNormalP1:
      label = "NORMAL_P1";
      break;
    case MemoryTag::kSampled:
      label = "SAMPLED";
      break;
    case MemoryTag::kCold:
      label = "COLD";
      break;
    default:
      ASSUME(false);
  }
  return label;
}
|
||||
|
||||
// This only provides correct answer for TCMalloc-allocated memory,
|
||||
// and may give a false positive for non-allocated block.
|
||||
extern "C" bool TCMalloc_Internal_PossiblyCold(const void* ptr) {
|
||||
return IsColdMemory(ptr);
|
||||
}
|
||||
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
GOOGLE_MALLOC_SECTION_END
|
||||
380
src/third_party/tcmalloc/dist/tcmalloc/common.h
vendored
Normal file
@ -0,0 +1,380 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
// Common definitions for tcmalloc code.
|
||||
|
||||
#ifndef TCMALLOC_COMMON_H_
|
||||
#define TCMALLOC_COMMON_H_
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
#include <new>
|
||||
#include <type_traits>
|
||||
|
||||
#include "absl/base/attributes.h"
|
||||
#include "absl/base/dynamic_annotations.h"
|
||||
#include "absl/base/internal/spinlock.h"
|
||||
#include "absl/base/macros.h"
|
||||
#include "absl/base/optimization.h"
|
||||
#include "absl/numeric/bits.h"
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "absl/types/span.h"
|
||||
#include "tcmalloc/experiment.h"
|
||||
#include "tcmalloc/internal/config.h"
|
||||
#include "tcmalloc/internal/logging.h"
|
||||
#include "tcmalloc/internal/optimization.h"
|
||||
#include "tcmalloc/malloc_extension.h"
|
||||
#include "tcmalloc/size_class_info.h"
|
||||
|
||||
GOOGLE_MALLOC_SECTION_BEGIN
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
|
||||
static_assert(sizeof(void*) == 8);
|
||||
|
||||
//-------------------------------------------------------------------
|
||||
// Configuration
|
||||
//-------------------------------------------------------------------
|
||||
|
||||
// There are four different models for tcmalloc which are created by defining a
|
||||
// set of constant variables differently:
|
||||
//
|
||||
// DEFAULT:
|
||||
// The default configuration strives for good performance while trying to
|
||||
// minimize fragmentation. It uses a smaller page size to reduce
|
||||
// fragmentation, but allocates per-thread and per-cpu capacities similar to
|
||||
// TCMALLOC_LARGE_PAGES / TCMALLOC_256K_PAGES.
|
||||
//
|
||||
// TCMALLOC_LARGE_PAGES:
|
||||
// Larger page sizes increase the bookkeeping granularity used by TCMalloc for
|
||||
// its allocations. This can reduce PageMap size and traffic to the
|
||||
// innermost cache (the page heap), but can increase memory footprints. As
|
||||
// TCMalloc will not reuse a page for a different allocation size until the
|
||||
// entire page is deallocated, this can be a source of increased memory
|
||||
// fragmentation.
|
||||
//
|
||||
// Historically, larger page sizes improved lookup performance for the
|
||||
// pointer-to-size lookup in the PageMap that was part of the critical path.
|
||||
// With most deallocations leveraging C++14's sized delete feature
|
||||
// (https://isocpp.org/files/papers/n3778.html), this optimization is less
|
||||
// significant.
|
||||
//
|
||||
// TCMALLOC_256K_PAGES
|
||||
// This configuration uses an even larger page size (256KB) as the unit of
|
||||
// accounting granularity.
|
||||
//
|
||||
// TCMALLOC_SMALL_BUT_SLOW:
|
||||
// Used for situations where minimizing the memory footprint is the most
|
||||
// desirable attribute, even at the cost of performance.
|
||||
//
|
||||
// The constants that vary between models are:
|
||||
//
|
||||
// kPageShift - Shift amount used to compute the page size.
|
||||
// kNumBaseClasses - Number of size classes serviced by bucket allocators
|
||||
// kMaxSize - Maximum size serviced by bucket allocators (thread/cpu/central)
|
||||
// kMinThreadCacheSize - The minimum size in bytes of each ThreadCache.
|
||||
// kMaxThreadCacheSize - The maximum size in bytes of each ThreadCache.
|
||||
// kDefaultOverallThreadCacheSize - The maximum combined size in bytes of all
|
||||
// ThreadCaches for an executable.
|
||||
// kStealAmount - The number of bytes one ThreadCache will steal from another
|
||||
// when the first ThreadCache is forced to Scavenge(), delaying the next
|
||||
// call to Scavenge for this thread.
|
||||
|
||||
// Older configurations had their own customized macros. Convert them into
|
||||
// a page-shift parameter that is checked below.
|
||||
|
||||
// TCMALLOC_PAGE_SHIFT is derived here from the per-model macros; a definition
// supplied from outside is rejected.
#ifdef TCMALLOC_PAGE_SHIFT
#error "TCMALLOC_PAGE_SHIFT is an internal macro!"
#endif

#if defined(TCMALLOC_SMALL_BUT_SLOW)
#define TCMALLOC_PAGE_SHIFT 12
#define TCMALLOC_USE_PAGEMAP3
#elif defined(TCMALLOC_256K_PAGES)
#define TCMALLOC_PAGE_SHIFT 18
#elif defined(TCMALLOC_LARGE_PAGES)
#define TCMALLOC_PAGE_SHIFT 15
#else
#define TCMALLOC_PAGE_SHIFT 13
#endif

// Per-model constants, one branch per supported page shift (ascending).
#if TCMALLOC_PAGE_SHIFT == 12
inline constexpr size_t kPageShift = 12;
inline constexpr size_t kNumBaseClasses = 46;
inline constexpr bool kHasExpandedClasses = false;
inline constexpr size_t kMaxSize = 8 * 1024;
inline constexpr size_t kMinThreadCacheSize = 4 * 1024;
inline constexpr size_t kMaxThreadCacheSize = 64 * 1024;
inline constexpr size_t kMaxCpuCacheSize = 10 * 1024;
inline constexpr size_t kDefaultOverallThreadCacheSize = kMaxThreadCacheSize;
inline constexpr size_t kStealAmount = kMinThreadCacheSize;
inline constexpr size_t kDefaultProfileSamplingRate = 1 << 19;
inline constexpr size_t kMinPages = 2;
#elif TCMALLOC_PAGE_SHIFT == 13
inline constexpr size_t kPageShift = 13;
inline constexpr size_t kNumBaseClasses = 86;
inline constexpr bool kHasExpandedClasses = true;
inline constexpr size_t kMaxSize = 256 * 1024;
inline constexpr size_t kMinThreadCacheSize = 2 * kMaxSize;
inline constexpr size_t kMaxThreadCacheSize = 4 << 20;
inline constexpr size_t kMaxCpuCacheSize = 1536 * 1024;  // 1.5 MiB
inline constexpr size_t kDefaultOverallThreadCacheSize =
    8u * kMaxThreadCacheSize;
inline constexpr size_t kStealAmount = 1 << 16;
inline constexpr size_t kDefaultProfileSamplingRate = 1 << 21;
inline constexpr size_t kMinPages = 8;
#elif TCMALLOC_PAGE_SHIFT == 15
inline constexpr size_t kPageShift = 15;
inline constexpr size_t kNumBaseClasses = 78;
inline constexpr bool kHasExpandedClasses = true;
inline constexpr size_t kMaxSize = 256 * 1024;
inline constexpr size_t kMinThreadCacheSize = 2 * kMaxSize;
inline constexpr size_t kMaxThreadCacheSize = 4 << 20;
inline constexpr size_t kMaxCpuCacheSize = 1536 * 1024;  // 1.5 MiB
inline constexpr size_t kDefaultOverallThreadCacheSize =
    8u * kMaxThreadCacheSize;
inline constexpr size_t kStealAmount = 1 << 16;
inline constexpr size_t kDefaultProfileSamplingRate = 1 << 21;
inline constexpr size_t kMinPages = 8;
#elif TCMALLOC_PAGE_SHIFT == 18
inline constexpr size_t kPageShift = 18;
inline constexpr size_t kNumBaseClasses = 89;
inline constexpr bool kHasExpandedClasses = true;
inline constexpr size_t kMaxSize = 256 * 1024;
inline constexpr size_t kMinThreadCacheSize = 2 * kMaxSize;
inline constexpr size_t kMaxThreadCacheSize = 4 << 20;
inline constexpr size_t kMaxCpuCacheSize = 1536 * 1024;  // 1.5 MiB
inline constexpr size_t kDefaultOverallThreadCacheSize =
    8u * kMaxThreadCacheSize;
inline constexpr size_t kStealAmount = 1 << 16;
inline constexpr size_t kDefaultProfileSamplingRate = 1 << 21;
inline constexpr size_t kMinPages = 8;
#else
#error "Unsupported TCMALLOC_PAGE_SHIFT value!"
#endif
|
||||
|
||||
// NUMA awareness requires enlarged pointer tags to represent the partitions.
// Sanitizers constrain the memory layout, which makes those tags a problem
// (mmap can fail), so NUMA awareness is compiled out whenever MSan or TSan is
// active, or when it was not requested in the first place.
#if !defined(TCMALLOC_NUMA_AWARE) || defined(MEMORY_SANITIZER) || \
    defined(THREAD_SANITIZER)
inline constexpr size_t kNumaPartitions = 1;
#else
inline constexpr size_t kNumaPartitions = 2;
#endif
|
||||
|
||||
// We have copies of kNumBaseClasses size classes for each NUMA node, followed
|
||||
// by any expanded classes.
|
||||
inline constexpr size_t kExpandedClassesStart =
|
||||
kNumBaseClasses * kNumaPartitions;
|
||||
inline constexpr size_t kNumClasses =
|
||||
kExpandedClassesStart + (kHasExpandedClasses ? kNumBaseClasses : 0);
|
||||
|
||||
// Size classes are often stored as uint32_t values, but there are some
|
||||
// situations where we need to store a size class with as compact a
|
||||
// representation as possible (e.g. in PageMap). Here we determine the integer
|
||||
// type to use in these situations - i.e. the smallest integer type large
|
||||
// enough to store values in the range [0,kNumClasses).
|
||||
constexpr size_t kMaxClass = kNumClasses - 1;
|
||||
using CompactSizeClass =
|
||||
std::conditional_t<kMaxClass <= std::numeric_limits<uint8_t>::max(),
|
||||
uint8_t, uint16_t>;
|
||||
|
||||
// ~64K classes ought to be enough for anybody, but let's be sure.
|
||||
static_assert(kMaxClass <= std::numeric_limits<CompactSizeClass>::max());
|
||||
|
||||
// Minimum/maximum number of batches in TransferCache per size class.
|
||||
// Actual numbers depend on a number of factors, see TransferCache::Init
|
||||
// for details.
|
||||
inline constexpr size_t kMinObjectsToMove = 2;
|
||||
inline constexpr size_t kMaxObjectsToMove = 128;
|
||||
|
||||
inline constexpr size_t kPageSize = 1 << kPageShift;
|
||||
// Verify that the page size used is at least 8x smaller than the maximum
|
||||
// element size in the thread cache. This guarantees at most 12.5% internal
|
||||
// fragmentation (1/8). When page size is 256k (kPageShift == 18), the benefit
|
||||
// of increasing kMaxSize to be multiple of kPageSize is unclear. Object size
|
||||
// profile data indicates that the number of simultaneously live objects (of
|
||||
// size >= 256k) tends to be very small. Keeping those objects as 'large'
|
||||
// objects won't cause too much memory waste, while heap memory reuse can be
|
||||
// improved. Increasing kMaxSize to be too large has another bad side effect --
|
||||
// the thread cache pressure is increased, which will in turn increase traffic
|
||||
// between central cache and thread cache, leading to performance degradation.
|
||||
static_assert((kMaxSize / kPageSize) >= kMinPages || kPageShift >= 18,
|
||||
"Ratio of kMaxSize / kPageSize is too small");
|
||||
|
||||
inline constexpr std::align_val_t kAlignment{8};
|
||||
// log2 (kAlignment)
|
||||
inline constexpr size_t kAlignmentShift =
|
||||
absl::bit_width(static_cast<size_t>(kAlignment) - 1u);
|
||||
|
||||
// The number of times that a deallocation can cause a freelist to
|
||||
// go over its max_length() before shrinking max_length().
|
||||
inline constexpr int kMaxOverages = 3;
|
||||
|
||||
// Maximum length we allow a per-thread free-list to have before we
|
||||
// move objects from it into the corresponding central free-list. We
|
||||
// want this big to avoid locking the central free-list too often. It
|
||||
// should not hurt to make this list somewhat big because the
|
||||
// scavenging code will shrink it down when its contents are not in use.
|
||||
inline constexpr int kMaxDynamicFreeListLength = 8192;
|
||||
|
||||
enum class MemoryTag : uint8_t {
|
||||
// Sampled, infrequently allocated
|
||||
kSampled = 0x0,
|
||||
// Not sampled, NUMA partition 0
|
||||
kNormalP0 = 0x1,
|
||||
// Not sampled, NUMA partition 1
|
||||
kNormalP1 = (kNumaPartitions > 1) ? 0x2 : 0xff,
|
||||
// Not sampled
|
||||
kNormal = kNormalP0,
|
||||
// Cold
|
||||
kCold = (kNumaPartitions > 1) ? 0x4 : 0x2,
|
||||
};
|
||||
|
||||
// We make kNormal and kCold disjoint so that IsCold implies IsSampled. This
|
||||
// allows us to avoid modifying the fast delete path in any way when cold-tagged
|
||||
// memory allocations are absent. We can overload the IsSampled check and then
|
||||
// do a second check for whether the possibly-sampled allocation is actually
|
||||
// IsCold.
|
||||
static_assert((static_cast<uint8_t>(MemoryTag::kNormal) &
|
||||
static_cast<uint8_t>(MemoryTag::kCold)) == 0,
|
||||
"kNormal and kCold should have disjoint bit patterns");
|
||||
|
||||
inline constexpr uintptr_t kTagShift = std::min(kAddressBits - 4, 42);
|
||||
inline constexpr uintptr_t kTagMask = uintptr_t{kNumaPartitions > 1 ? 0x7 : 0x3}
|
||||
<< kTagShift;
|
||||
|
||||
inline bool IsSampledMemory(const void* ptr) {
|
||||
constexpr uintptr_t kSampledNormalMask = kNumaPartitions > 1 ? 0x3 : 0x1;
|
||||
|
||||
static_assert(static_cast<uintptr_t>(MemoryTag::kNormalP0) &
|
||||
kSampledNormalMask);
|
||||
static_assert(static_cast<uintptr_t>(MemoryTag::kNormalP1) &
|
||||
kSampledNormalMask);
|
||||
|
||||
const uintptr_t tag =
|
||||
(reinterpret_cast<uintptr_t>(ptr) & kTagMask) >> kTagShift;
|
||||
return (tag & kSampledNormalMask) ==
|
||||
static_cast<uintptr_t>(MemoryTag::kSampled);
|
||||
}
|
||||
|
||||
inline bool IsNormalMemory(const void* ptr) { return !IsSampledMemory(ptr); }
|
||||
|
||||
inline bool IsColdMemory(const void* ptr) {
|
||||
bool r = (reinterpret_cast<uintptr_t>(ptr) & kTagMask) ==
|
||||
(static_cast<uintptr_t>(MemoryTag::kCold) << kTagShift);
|
||||
// IsColdMemory(ptr) implies IsSampledMemory(ptr). This allows us to avoid
|
||||
// introducing new branches on the delete fast path when cold memory tags are
|
||||
// not in use.
|
||||
ASSERT(!r || IsSampledMemory(ptr));
|
||||
return r;
|
||||
}
|
||||
|
||||
inline constexpr bool ColdFeatureActive() { return kHasExpandedClasses; }
|
||||
|
||||
// Extracts the MemoryTag encoded in the tag bits of `ptr`.
inline MemoryTag GetMemoryTag(const void* ptr) {
  const uintptr_t address = reinterpret_cast<uintptr_t>(ptr);
  const uintptr_t tag_bits = (address & kTagMask) >> kTagShift;
  return static_cast<MemoryTag>(tag_bits);
}
|
||||
|
||||
absl::string_view MemoryTagToLabel(MemoryTag tag);
|
||||
|
||||
inline constexpr bool IsExpandedSizeClass(unsigned size_class) {
|
||||
return kHasExpandedClasses && (size_class >= kExpandedClassesStart);
|
||||
}
|
||||
|
||||
#if !defined(TCMALLOC_SMALL_BUT_SLOW) && __SIZEOF_POINTER__ != 4
|
||||
// Always allocate at least a huge page
|
||||
inline constexpr size_t kMinSystemAlloc = kHugePageSize;
|
||||
inline constexpr size_t kMinMmapAlloc = 1 << 30; // mmap() in 1GiB ranges.
|
||||
#else
|
||||
// Allocate in units of 2MiB. This is the size of a huge page for x86, but
|
||||
// not for Power.
|
||||
inline constexpr size_t kMinSystemAlloc = 2 << 20;
|
||||
// mmap() in units of 32MiB. This is a multiple of huge page size for
|
||||
// both x86 (2MiB) and Power (16MiB)
|
||||
inline constexpr size_t kMinMmapAlloc = 32 << 20;
|
||||
#endif
|
||||
|
||||
static_assert(kMinMmapAlloc % kMinSystemAlloc == 0,
|
||||
"Minimum mmap allocation size is not a multiple of"
|
||||
" minimum system allocation size");
|
||||
|
||||
enum class AllocationAccess {
|
||||
kHot,
|
||||
kCold,
|
||||
};
|
||||
|
||||
// A hint is treated as cold when its value, viewed as a uint8_t, is below 128.
inline bool IsColdHint(hot_cold_t hint) {
  constexpr uint8_t kFirstHotValue = 128;
  const uint8_t raw_hint = static_cast<uint8_t>(hint);
  return raw_hint < kFirstHotValue;
}
|
||||
|
||||
// Classifies `ptr` as hot or cold based on its memory tag. Without expanded
// classes every pointer is reported hot (and is asserted not to be cold).
inline AllocationAccess AccessFromPointer(void* ptr) {
  if (kHasExpandedClasses) {
    // Cold memory is the uncommon case.
    if (ABSL_PREDICT_FALSE(IsColdMemory(ptr))) {
      return AllocationAccess::kCold;
    }
    return AllocationAccess::kHot;
  }

  ASSERT(!IsColdMemory(ptr));
  return AllocationAccess::kHot;
}
|
||||
|
||||
// Maps a NUMA partition index (0 or 1) to its "normal" memory tag. Any other
// index is declared unreachable.
inline MemoryTag NumaNormalTag(size_t numa_partition) {
  if (numa_partition == 0) {
    return MemoryTag::kNormalP0;
  }
  if (numa_partition == 1) {
    return MemoryTag::kNormalP1;
  }
  ASSUME(false);
  __builtin_unreachable();
}
|
||||
|
||||
inline size_t NumaPartitionFromPointer(void* ptr) {
|
||||
if constexpr (kNumaPartitions == 1) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
switch (GetMemoryTag(ptr)) {
|
||||
case MemoryTag::kNormalP1:
|
||||
return 1;
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Linker initialized, so this lock can be accessed at any time.
|
||||
// Note: `CpuCache::ResizeInfo::lock` must be taken before the `pageheap_lock`
|
||||
// if both are going to be held simultaneously.
|
||||
extern absl::base_internal::SpinLock pageheap_lock;
|
||||
|
||||
// Returns a / b, except that a zero divisor yields 0 instead of dividing.
inline double safe_div(double a, double b) {
  return (b == 0) ? 0. : a / b;
}
|
||||
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
GOOGLE_MALLOC_SECTION_END
|
||||
|
||||
#endif // TCMALLOC_COMMON_H_
|
||||
47
src/third_party/tcmalloc/dist/tcmalloc/copts.bzl
vendored
Normal file
@ -0,0 +1,47 @@
|
||||
# Copyright 2019 The TCMalloc Authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# https://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""This package provides default compiler warning flags for the OSS release"""
|
||||
|
||||
TCMALLOC_LLVM_FLAGS = [
|
||||
# Ensure TCMalloc itself builds without errors, even if its dependencies
|
||||
# aren't necessarily -Werror clean.
|
||||
"-Werror",
|
||||
"-Wno-deprecated-declarations",
|
||||
"-Wno-deprecated-volatile",
|
||||
"-Wno-implicit-int-float-conversion",
|
||||
"-Wno-sign-compare",
|
||||
"-Wno-uninitialized",
|
||||
"-Wno-unused-function",
|
||||
"-Wno-unused-variable",
|
||||
]
|
||||
|
||||
TCMALLOC_GCC_FLAGS = [
|
||||
# Ensure TCMalloc itself builds without errors, even if its dependencies
|
||||
# aren't necessarily -Werror clean.
|
||||
"-Werror",
|
||||
"-Wno-attribute-alias",
|
||||
"-Wno-sign-compare",
|
||||
"-Wno-stringop-overflow",
|
||||
"-Wno-uninitialized",
|
||||
"-Wno-unused-function",
|
||||
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66425
|
||||
"-Wno-unused-result",
|
||||
"-Wno-unused-variable",
|
||||
]
|
||||
|
||||
TCMALLOC_DEFAULT_COPTS = select({
|
||||
"//tcmalloc:llvm": TCMALLOC_LLVM_FLAGS,
|
||||
"//conditions:default": TCMALLOC_GCC_FLAGS,
|
||||
})
|
||||
82
src/third_party/tcmalloc/dist/tcmalloc/cpu_cache.cc
vendored
Normal file
@ -0,0 +1,82 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "tcmalloc/cpu_cache.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <atomic>
|
||||
|
||||
#include "absl/base/dynamic_annotations.h"
|
||||
#include "absl/base/macros.h"
|
||||
#include "absl/base/thread_annotations.h"
|
||||
#include "absl/container/fixed_array.h"
|
||||
#include "tcmalloc/arena.h"
|
||||
#include "tcmalloc/common.h"
|
||||
#include "tcmalloc/internal/logging.h"
|
||||
#include "tcmalloc/internal_malloc_extension.h"
|
||||
#include "tcmalloc/parameters.h"
|
||||
#include "tcmalloc/static_vars.h"
|
||||
#include "tcmalloc/transfer_cache.h"
|
||||
|
||||
GOOGLE_MALLOC_SECTION_BEGIN
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
|
||||
static void ActivatePerCpuCaches() {
|
||||
if (tcmalloc::tcmalloc_internal::tc_globals.CpuCacheActive()) {
|
||||
// Already active.
|
||||
return;
|
||||
}
|
||||
|
||||
if (Parameters::per_cpu_caches() && subtle::percpu::IsFast()) {
|
||||
tc_globals.InitIfNecessary();
|
||||
tc_globals.cpu_cache().Activate();
|
||||
tc_globals.ActivateCpuCache();
|
||||
// no need for this thread cache anymore, I guess.
|
||||
ThreadCache::BecomeIdle();
|
||||
// If there's a problem with this code, let's notice it right away:
|
||||
::operator delete(::operator new(1));
|
||||
}
|
||||
}
|
||||
|
||||
class PerCPUInitializer {
|
||||
public:
|
||||
PerCPUInitializer() {
|
||||
ActivatePerCpuCaches();
|
||||
}
|
||||
};
|
||||
static PerCPUInitializer module_enter_exit;
|
||||
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
GOOGLE_MALLOC_SECTION_END
|
||||
|
||||
extern "C" void TCMalloc_Internal_ForceCpuCacheActivation() {
|
||||
tcmalloc::tcmalloc_internal::ActivatePerCpuCaches();
|
||||
}
|
||||
|
||||
extern "C" bool MallocExtension_Internal_GetPerCpuCachesActive() {
|
||||
return tcmalloc::tcmalloc_internal::tc_globals.CpuCacheActive();
|
||||
}
|
||||
|
||||
extern "C" int32_t MallocExtension_Internal_GetMaxPerCpuCacheSize() {
|
||||
return tcmalloc::tcmalloc_internal::Parameters::max_per_cpu_cache_size();
|
||||
}
|
||||
|
||||
extern "C" void MallocExtension_Internal_SetMaxPerCpuCacheSize(int32_t value) {
|
||||
tcmalloc::tcmalloc_internal::Parameters::set_max_per_cpu_cache_size(value);
|
||||
}
|
||||
1982
src/third_party/tcmalloc/dist/tcmalloc/cpu_cache.h
vendored
Normal file
86
src/third_party/tcmalloc/dist/tcmalloc/cpu_cache_activate_test.cc
vendored
Normal file
@ -0,0 +1,86 @@
|
||||
// Copyright 2021 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include <string>
|
||||
#include <thread> // NOLINT(build/c++11)
|
||||
|
||||
#include "benchmark/benchmark.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "absl/base/internal/sysinfo.h"
|
||||
#include "absl/random/random.h"
|
||||
#include "absl/synchronization/notification.h"
|
||||
#include "absl/time/clock.h"
|
||||
#include "absl/time/time.h"
|
||||
#include "tcmalloc/cpu_cache.h"
|
||||
#include "tcmalloc/internal/percpu.h"
|
||||
#include "tcmalloc/internal_malloc_extension.h"
|
||||
#include "tcmalloc/static_vars.h"
|
||||
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
namespace {
|
||||
|
||||
// This test mutates global state, including triggering the activation of the
|
||||
// per-CPU caches.  It should not be run alongside other tests in the same
|
||||
// process that may rely on an isolated global instance.
|
||||
TEST(CpuCacheActivateTest, GlobalInstance) {
  // Nothing to exercise when the fast per-CPU path is unavailable.
  if (!subtle::percpu::IsFast()) {
    return;
  }

  CpuCache& cache = tc_globals.cpu_cache();
  absl::Notification done;

  // Background reader: keeps querying the cache with a randomly chosen
  // accessor while the main thread activates it. Per-CPU and total-usage
  // accessors are only used once the cache reports itself active.
  std::thread t([&]() {
    const int num_cpus = absl::base_internal::NumCPUs();
    absl::BitGen rng;

    while (!done.HasBeenNotified()) {
      const double coin = absl::Uniform(rng, 0., 1.);
      const bool ready = tc_globals.CpuCacheActive();

      if (!ready || coin >= 0.75) {
        benchmark::DoNotOptimize(cache.CacheLimit());
      } else if (coin < 0.25) {
        const int cpu = absl::Uniform(rng, 0, num_cpus);
        benchmark::DoNotOptimize(cache.UsedBytes(cpu));
      } else if (coin < 0.5) {
        const int cpu = absl::Uniform(rng, 0, num_cpus);
        benchmark::DoNotOptimize(cache.Capacity(cpu));
      } else {
        benchmark::DoNotOptimize(cache.TotalUsedBytes());
      }
    }
  });

  // The cache must start out inactive and be active after the forced
  // activation.
  EXPECT_FALSE(tc_globals.CpuCacheActive());
  ASSERT_NE(&TCMalloc_Internal_ForceCpuCacheActivation, nullptr);
  Parameters::set_per_cpu_caches(true);
  TCMalloc_Internal_ForceCpuCacheActivation();
  EXPECT_TRUE(tc_globals.CpuCacheActive());

  // Give the reader some time to hit the now-active cache before stopping it.
  absl::SleepFor(absl::Seconds(0.2));

  done.Notify();
  t.join();
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
1242
src/third_party/tcmalloc/dist/tcmalloc/cpu_cache_test.cc
vendored
Normal file
606
src/third_party/tcmalloc/dist/tcmalloc/deallocation_profiler.cc
vendored
Normal file
@ -0,0 +1,606 @@
|
||||
// Copyright 2022 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "tcmalloc/deallocation_profiler.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath> // for std::lround
|
||||
#include <cstdint> // for uintptr_t
|
||||
#include <functional>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <string> // for memset
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
|
||||
#include "absl/base/attributes.h"
|
||||
#include "absl/base/internal/low_level_alloc.h"
|
||||
#include "absl/base/internal/spinlock.h"
|
||||
#include "absl/base/internal/sysinfo.h"
|
||||
#include "absl/container/flat_hash_map.h"
|
||||
#include "absl/debugging/stacktrace.h" // for GetStackTrace
|
||||
#include "absl/hash/hash.h"
|
||||
#include "absl/time/clock.h"
|
||||
#include "absl/time/time.h"
|
||||
#include "tcmalloc/internal/logging.h"
|
||||
#include "tcmalloc/internal/percpu.h"
|
||||
#include "tcmalloc/internal_malloc_extension.h"
|
||||
#include "tcmalloc/malloc_extension.h"
|
||||
#include "tcmalloc/sampled_allocation.h"
|
||||
#include "tcmalloc/static_vars.h"
|
||||
|
||||
GOOGLE_MALLOC_SECTION_BEGIN
|
||||
namespace tcmalloc {
|
||||
namespace deallocationz {
|
||||
namespace {
|
||||
using ::absl::base_internal::SpinLock;
|
||||
using ::absl::base_internal::SpinLockHolder;
|
||||
|
||||
// STL adaptor for an arena based allocator which provides the following:
|
||||
// static void* Alloc::Allocate(size_t size);
|
||||
// static void Alloc::Free(void* ptr, size_t size);
|
||||
template <typename T, class Alloc>
|
||||
class AllocAdaptor final {
|
||||
public:
|
||||
using value_type = T;
|
||||
|
||||
AllocAdaptor() {}
|
||||
AllocAdaptor(const AllocAdaptor&) {}
|
||||
|
||||
template <class T1>
|
||||
using rebind = AllocAdaptor<T1, Alloc>;
|
||||
|
||||
template <class T1>
|
||||
explicit AllocAdaptor(const AllocAdaptor<T1, Alloc>&) {}
|
||||
|
||||
T* allocate(size_t n) {
|
||||
// Check if n is too big to allocate.
|
||||
ASSERT((n * sizeof(T)) / sizeof(T) == n);
|
||||
return static_cast<T*>(Alloc::Allocate(n * sizeof(T)));
|
||||
}
|
||||
void deallocate(T* p, size_t n) { Alloc::Free(p, n * sizeof(T)); }
|
||||
|
||||
// There's no state, so these allocators are always equal
|
||||
bool operator==(const AllocAdaptor&) const { return true; }
|
||||
bool operator!=(const AllocAdaptor&) const { return false; }
|
||||
};
|
||||
|
||||
const int64_t kMaxStackDepth = 64;
|
||||
|
||||
// Stores stack traces and metadata for any allocation or deallocation
|
||||
// encountered by the profiler.
|
||||
struct DeallocationSampleRecord {
|
||||
double weight = 0.0;
|
||||
size_t requested_size = 0;
|
||||
size_t requested_alignment = 0;
|
||||
size_t allocated_size = 0; // size after sizeclass/page rounding
|
||||
|
||||
int depth = 0; // Number of PC values stored in array below
|
||||
void* stack[kMaxStackDepth];
|
||||
|
||||
// creation_time is used to capture the life_time of sampled allocations
|
||||
absl::Time creation_time;
|
||||
int cpu_id = -1;
|
||||
pid_t thread_id = 0;
|
||||
|
||||
template <typename H>
|
||||
friend H AbslHashValue(H h, const DeallocationSampleRecord& c) {
|
||||
return H::combine(H::combine_contiguous(std::move(h), c.stack, c.depth),
|
||||
c.depth, c.requested_size, c.requested_alignment,
|
||||
c.allocated_size);
|
||||
}
|
||||
|
||||
bool operator==(const DeallocationSampleRecord& other) const {
|
||||
if (depth != other.depth || requested_size != other.requested_size ||
|
||||
requested_alignment != other.requested_alignment ||
|
||||
allocated_size != other.allocated_size) {
|
||||
return false;
|
||||
}
|
||||
return std::equal(stack, stack + depth, other.stack);
|
||||
}
|
||||
};
|
||||
|
||||
// Records whether an object was allocated and deallocated on the same CPU
// and/or by the same thread. `value` packs both flags into one integer: bit 1
// is the CPU match, bit 0 is the thread match.
struct CpuThreadMatchingStatus {
  constexpr CpuThreadMatchingStatus(bool cpu_matched, bool thread_matched)
      : cpu_matched(cpu_matched),
        thread_matched(thread_matched),
        value((cpu_matched ? 2 : 0) | (thread_matched ? 1 : 0)) {}

  bool cpu_matched;
  bool thread_matched;
  int value;
};
|
||||
|
||||
// Compares the ids recorded at allocation and at deallocation. `value` is
//   2 when either id is zero,
//   1 when both are non-zero and equal,
//   0 when both are non-zero and different.
struct RpcMatchingStatus {
  static constexpr int ComputeValue(uint64_t alloc, uint64_t dealloc) {
    if (alloc == 0 || dealloc == 0) {
      return 2;
    }
    return (alloc == dealloc) ? 1 : 0;
  }

  constexpr RpcMatchingStatus(uint64_t alloc, uint64_t dealloc)
      : value(ComputeValue(alloc, dealloc)) {}

  int value;
};
|
||||
|
||||
// Flattens a (CPU/thread status, RPC status) pair into a single index, with
// the RPC status varying fastest.
int ComputeIndex(CpuThreadMatchingStatus status, RpcMatchingStatus rpc_status) {
  // RpcMatchingStatus::value takes one of three values (0, 1 or 2).
  const int kRpcOutcomes = 3;
  return rpc_status.value + kRpcOutcomes * status.value;
}
|
||||
|
||||
constexpr std::pair<CpuThreadMatchingStatus, RpcMatchingStatus> kAllCases[] = {
|
||||
{CpuThreadMatchingStatus(false, false), RpcMatchingStatus(0, 0)},
|
||||
{CpuThreadMatchingStatus(false, true), RpcMatchingStatus(0, 0)},
|
||||
{CpuThreadMatchingStatus(true, false), RpcMatchingStatus(0, 0)},
|
||||
{CpuThreadMatchingStatus(true, true), RpcMatchingStatus(0, 0)},
|
||||
|
||||
{CpuThreadMatchingStatus(false, false), RpcMatchingStatus(1, 2)},
|
||||
{CpuThreadMatchingStatus(false, true), RpcMatchingStatus(1, 2)},
|
||||
{CpuThreadMatchingStatus(true, false), RpcMatchingStatus(1, 2)},
|
||||
{CpuThreadMatchingStatus(true, true), RpcMatchingStatus(1, 2)},
|
||||
|
||||
{CpuThreadMatchingStatus(false, false), RpcMatchingStatus(1, 1)},
|
||||
{CpuThreadMatchingStatus(false, true), RpcMatchingStatus(1, 1)},
|
||||
{CpuThreadMatchingStatus(true, false), RpcMatchingStatus(1, 1)},
|
||||
{CpuThreadMatchingStatus(true, true), RpcMatchingStatus(1, 1)},
|
||||
};
|
||||
} // namespace
|
||||
|
||||
class DeallocationProfiler {
|
||||
private:
|
||||
// Arena and allocator used to back STL objects used by DeallocationProfiler
|
||||
// Shared between all instances of DeallocationProfiler
|
||||
// TODO(b/248332543): Use TCMalloc's own arena allocator instead of defining a
|
||||
// new one here. The need for refcount management could be the reason for
|
||||
// using a custom allocator in the first place.
|
||||
class MyAllocator {
|
||||
public:
|
||||
static void* Allocate(size_t n) {
|
||||
return absl::base_internal::LowLevelAlloc::AllocWithArena(n, arena_);
|
||||
}
|
||||
static void Free(const void* p, size_t /* n */) {
|
||||
absl::base_internal::LowLevelAlloc::Free(const_cast<void*>(p));
|
||||
}
|
||||
|
||||
// The lifetime of the arena is managed using a reference count and
|
||||
// determined by how long at least one emitted Profile remains alive.
|
||||
struct LowLevelArenaReference {
|
||||
LowLevelArenaReference() {
|
||||
SpinLockHolder h(&arena_lock_);
|
||||
if ((refcount_++) == 0) {
|
||||
CHECK_CONDITION(arena_ == nullptr);
|
||||
arena_ = absl::base_internal::LowLevelAlloc::NewArena(0);
|
||||
}
|
||||
}
|
||||
|
||||
~LowLevelArenaReference() {
|
||||
SpinLockHolder h(&arena_lock_);
|
||||
if ((--refcount_) == 0) {
|
||||
CHECK_CONDITION(
|
||||
absl::base_internal::LowLevelAlloc::DeleteArena(arena_));
|
||||
arena_ = nullptr;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
private:
|
||||
// We need to protect the arena with a mutex and ensure that every thread
|
||||
// acquires that mutex before it uses the arena for the first time. Once
|
||||
// it has acquired the mutex, it is guaranteed that arena won't change
|
||||
// between that point in time and when the thread stops accessing it (as
|
||||
// enforced by LowLevelArenaReference below).
|
||||
ABSL_CONST_INIT static SpinLock arena_lock_;
|
||||
static absl::base_internal::LowLevelAlloc::Arena* arena_;
|
||||
|
||||
// We assume that launching a new deallocation profiler takes too long
|
||||
// to cause this to overflow within the sampling period. The reason this
|
||||
// is not using std::shared_ptr is that we do not only need to protect the
|
||||
// value of the reference count but also the pointer itself (and therefore
|
||||
// need a separate mutex either way).
|
||||
static uint32_t refcount_;
|
||||
};
|
||||
|
||||
// This must be the first member of the class to be initialized. The
|
||||
// underlying arena must stay alive as long as the profiler.
|
||||
MyAllocator::LowLevelArenaReference arena_ref_;
|
||||
|
||||
// All active profilers are stored in a list.
|
||||
DeallocationProfiler* next_;
|
||||
DeallocationProfilerList* list_ = nullptr;
|
||||
friend class DeallocationProfilerList;
|
||||
|
||||
using AllocsTable = absl::flat_hash_map<
|
||||
tcmalloc_internal::AllocHandle, DeallocationSampleRecord,
|
||||
absl::Hash<tcmalloc_internal::AllocHandle>,
|
||||
std::equal_to<tcmalloc_internal::AllocHandle>,
|
||||
AllocAdaptor<std::pair<const tcmalloc_internal::AllocHandle,
|
||||
DeallocationSampleRecord>,
|
||||
MyAllocator>>;
|
||||
|
||||
class DeallocationStackTraceTable final
|
||||
: public tcmalloc_internal::ProfileBase {
|
||||
public:
|
||||
// We define the dtor to ensure it is placed in the desired text section.
|
||||
~DeallocationStackTraceTable() override = default;
|
||||
void AddTrace(const DeallocationSampleRecord& alloc_trace,
|
||||
const DeallocationSampleRecord& dealloc_trace);
|
||||
|
||||
void Iterate(
|
||||
absl::FunctionRef<void(const Profile::Sample&)> func) const override;
|
||||
|
||||
ProfileType Type() const override {
|
||||
return tcmalloc::ProfileType::kLifetimes;
|
||||
}
|
||||
|
||||
absl::Duration Duration() const override {
|
||||
return stop_time_ - start_time_;
|
||||
}
|
||||
|
||||
void StopAndRecord(const AllocsTable& allocs);
|
||||
|
||||
private:
|
||||
// This must be the first member of the class to be initialized. The
|
||||
// underlying arena must stay alive as long as the profile.
|
||||
MyAllocator::LowLevelArenaReference arena_ref_;
|
||||
|
||||
static constexpr int kNumCases =
|
||||
12; // CPUthreadMatchingStatus({T,F},{T,F}) x RPCMatchingStatus
|
||||
|
||||
struct Key {
|
||||
DeallocationSampleRecord alloc;
|
||||
DeallocationSampleRecord dealloc;
|
||||
|
||||
Key(const DeallocationSampleRecord& alloc,
|
||||
const DeallocationSampleRecord& dealloc)
|
||||
: alloc(alloc), dealloc(dealloc) {}
|
||||
|
||||
template <typename H>
|
||||
friend H AbslHashValue(H h, const Key& c) {
|
||||
return H::combine(std::move(h), c.alloc, c.dealloc);
|
||||
}
|
||||
|
||||
bool operator==(const Key& other) const {
|
||||
return (alloc == other.alloc) && (dealloc == other.dealloc);
|
||||
}
|
||||
};
|
||||
|
||||
struct Value {
|
||||
// for each possible cases, we collect repetition count and avg lifetime
|
||||
// we also collect the minimum and maximum lifetimes, as well as the sum
|
||||
// of squares (to calculate the standard deviation).
|
||||
double counts[kNumCases] = {0.0};
|
||||
double mean_life_times_ns[kNumCases] = {0.0};
|
||||
double variance_life_times_ns[kNumCases] = {0.0};
|
||||
double min_life_times_ns[kNumCases] = {0.0};
|
||||
double max_life_times_ns[kNumCases] = {0.0};
|
||||
|
||||
Value() {
|
||||
std::fill_n(min_life_times_ns, kNumCases,
|
||||
std::numeric_limits<double>::max());
|
||||
}
|
||||
};
|
||||
|
||||
absl::flat_hash_map<Key, Value, absl::Hash<Key>, std::equal_to<Key>,
|
||||
AllocAdaptor<std::pair<const Key, Value>, MyAllocator>>
|
||||
table_;
|
||||
|
||||
absl::Time start_time_ = absl::Now();
|
||||
absl::Time stop_time_;
|
||||
};
|
||||
|
||||
// Keep track of allocations that are in flight
|
||||
AllocsTable allocs_;
|
||||
|
||||
// Table to store lifetime information collected by this profiler
|
||||
std::unique_ptr<DeallocationStackTraceTable> reports_ = nullptr;
|
||||
|
||||
public:
|
||||
explicit DeallocationProfiler(DeallocationProfilerList* list) : list_(list) {
|
||||
reports_ = std::make_unique<DeallocationStackTraceTable>();
|
||||
list_->Add(this);
|
||||
}
|
||||
|
||||
~DeallocationProfiler() {
|
||||
if (reports_ != nullptr) {
|
||||
Stop();
|
||||
}
|
||||
}
|
||||
|
||||
const tcmalloc::Profile Stop() {
|
||||
if (reports_ != nullptr) {
|
||||
// We first remove the profiler from the list to avoid racing with
|
||||
// potential allocations which may modify the allocs_ table.
|
||||
list_->Remove(this);
|
||||
reports_->StopAndRecord(allocs_);
|
||||
return tcmalloc_internal::ProfileAccessor::MakeProfile(
|
||||
std::move(reports_));
|
||||
}
|
||||
return tcmalloc::Profile();
|
||||
}
|
||||
|
||||
void ReportMalloc(const tcmalloc_internal::StackTrace& stack_trace) {
|
||||
// store sampled alloc in the hashmap
|
||||
DeallocationSampleRecord& allocation =
|
||||
allocs_[stack_trace.sampled_alloc_handle];
|
||||
|
||||
allocation.allocated_size = stack_trace.allocated_size;
|
||||
allocation.requested_size = stack_trace.requested_size;
|
||||
allocation.requested_alignment = stack_trace.requested_alignment;
|
||||
allocation.depth = stack_trace.depth;
|
||||
memcpy(allocation.stack, stack_trace.stack,
|
||||
sizeof(void*) * std::min(static_cast<int64_t>(stack_trace.depth),
|
||||
kMaxStackDepth));
|
||||
// TODO(mmaas): Do we need to worry about b/65384231 anymore?
|
||||
allocation.creation_time = stack_trace.allocation_time;
|
||||
allocation.cpu_id = tcmalloc_internal::subtle::percpu::GetCurrentCpu();
|
||||
allocation.thread_id = absl::base_internal::GetTID();
|
||||
// We divide by the requested size to obtain the number of allocations.
|
||||
// TODO(b/248332543): Consider using AllocatedBytes from sampler.h.
|
||||
allocation.weight = static_cast<double>(stack_trace.weight) /
|
||||
(stack_trace.requested_size + 1);
|
||||
}
|
||||
|
||||
// Pairs a deallocation with the allocation previously recorded under
// `handle` and adds the pair to the report table. Deallocations whose
// allocation was never recorded are ignored.
void ReportFree(tcmalloc_internal::AllocHandle handle) {
  const auto found = allocs_.find(handle);
  if (found == allocs_.end()) {
    // Handle the case that we observed the deallocation but not the
    // allocation.
    return;
  }

  // Copy the record out before erasing its map entry.
  const DeallocationSampleRecord alloc_record = found->second;
  allocs_.erase(found);

  // The size fields are carried over from the allocation; time, CPU, thread
  // and stack describe the current (freeing) context.
  DeallocationSampleRecord free_record;
  free_record.allocated_size = alloc_record.allocated_size;
  free_record.requested_alignment = alloc_record.requested_alignment;
  free_record.requested_size = alloc_record.requested_size;
  free_record.creation_time = absl::Now();
  free_record.cpu_id = tcmalloc_internal::subtle::percpu::GetCurrentCpu();
  free_record.thread_id = absl::base_internal::GetTID();
  // The stack is captured directly in this function with a skip count of 1.
  free_record.depth =
      absl::GetStackTrace(free_record.stack, kMaxStackDepth, 1);

  reports_->AddTrace(alloc_record, free_record);
}
|
||||
};
|
||||
|
||||
// Pushes `profiler` onto the front of the list and, while still holding the
// list lock, replays every currently recorded sampled allocation into it.
void DeallocationProfilerList::Add(DeallocationProfiler* profiler) {
  SpinLockHolder lock(&profilers_lock_);

  // Link in at the head.
  profiler->next_ = first_;
  first_ = profiler;

  // Whenever a new profiler is created, we seed it with live allocations.
  auto& recorder = tcmalloc_internal::tc_globals.sampled_allocation_recorder();
  recorder.Iterate(
      [profiler](const tcmalloc_internal::SampledAllocation& live) {
        profiler->ReportMalloc(live.sampled_stack);
      });
}
|
||||
|
||||
// This list is very short and we're nowhere near a hot path, just walk
|
||||
void DeallocationProfilerList::Remove(DeallocationProfiler* profiler) {
|
||||
SpinLockHolder h(&profilers_lock_);
|
||||
DeallocationProfiler** link = &first_;
|
||||
DeallocationProfiler* cur = first_;
|
||||
while (cur != profiler) {
|
||||
CHECK_CONDITION(cur != nullptr);
|
||||
link = &cur->next_;
|
||||
cur = cur->next_;
|
||||
}
|
||||
*link = profiler->next_;
|
||||
}
|
||||
|
||||
void DeallocationProfilerList::ReportMalloc(
|
||||
const tcmalloc_internal::StackTrace& stack_trace) {
|
||||
SpinLockHolder h(&profilers_lock_);
|
||||
DeallocationProfiler* cur = first_;
|
||||
while (cur != nullptr) {
|
||||
cur->ReportMalloc(stack_trace);
|
||||
cur = cur->next_;
|
||||
}
|
||||
}
|
||||
|
||||
// Forwards a deallocation of the sampled allocation identified by `handle`
// to every registered profiler while holding the list lock.
void DeallocationProfilerList::ReportFree(
    tcmalloc_internal::AllocHandle handle) {
  SpinLockHolder lock(&profilers_lock_);
  for (DeallocationProfiler* p = first_; p != nullptr; p = p->next_) {
    p->ReportFree(handle);
  }
}
|
||||
|
||||
// Initialize static variables
|
||||
absl::base_internal::LowLevelAlloc::Arena*
|
||||
DeallocationProfiler::MyAllocator::arena_ = nullptr;
|
||||
uint32_t DeallocationProfiler::MyAllocator::refcount_ = 0;
|
||||
ABSL_CONST_INIT SpinLock DeallocationProfiler::MyAllocator::arena_lock_(
|
||||
absl::kConstInit, absl::base_internal::SCHEDULE_KERNEL_ONLY);
|
||||
|
||||
void DeallocationProfiler::DeallocationStackTraceTable::StopAndRecord(
|
||||
const AllocsTable& allocs) {
|
||||
stop_time_ = absl::Now();
|
||||
|
||||
// Insert a dummy DeallocationSampleRecord since the table stores pairs. This
|
||||
// allows us to make minimal changes to the rest of the sample processing
|
||||
// steps reducing special casing for censored samples. This also allows us to
|
||||
// aggregate censored samples just like regular deallocation samples.
|
||||
const DeallocationSampleRecord censored{
|
||||
.creation_time = stop_time_,
|
||||
};
|
||||
for (const auto& [unused, alloc] : allocs) {
|
||||
AddTrace(alloc, censored);
|
||||
}
|
||||
}
|
||||
|
||||
void DeallocationProfiler::DeallocationStackTraceTable::AddTrace(
|
||||
const DeallocationSampleRecord& alloc_trace,
|
||||
const DeallocationSampleRecord& dealloc_trace) {
|
||||
CpuThreadMatchingStatus status =
|
||||
CpuThreadMatchingStatus(alloc_trace.cpu_id == dealloc_trace.cpu_id,
|
||||
alloc_trace.thread_id == dealloc_trace.thread_id);
|
||||
|
||||
// Initialize a default rpc matched status.
|
||||
RpcMatchingStatus rpc_status(/*alloc=*/0, /*dealloc=*/0);
|
||||
|
||||
const int index = ComputeIndex(status, rpc_status);
|
||||
|
||||
DeallocationStackTraceTable::Value& v =
|
||||
table_[DeallocationStackTraceTable::Key(alloc_trace, dealloc_trace)];
|
||||
|
||||
const absl::Duration life_time =
|
||||
dealloc_trace.creation_time - alloc_trace.creation_time;
|
||||
double life_time_ns = absl::ToDoubleNanoseconds(life_time);
|
||||
|
||||
// Update mean and variance using Welford’s online algorithm.
|
||||
double old_mean_ns = v.mean_life_times_ns[index];
|
||||
v.mean_life_times_ns[index] +=
|
||||
(life_time_ns - old_mean_ns) / static_cast<double>(v.counts[index] + 1);
|
||||
v.variance_life_times_ns[index] +=
|
||||
(life_time_ns - v.mean_life_times_ns[index]) *
|
||||
(v.mean_life_times_ns[index] - old_mean_ns);
|
||||
|
||||
v.min_life_times_ns[index] =
|
||||
std::min(v.min_life_times_ns[index], life_time_ns);
|
||||
v.max_life_times_ns[index] =
|
||||
std::max(v.max_life_times_ns[index], life_time_ns);
|
||||
v.counts[index]++;
|
||||
}
|
||||
|
||||
// Emits the aggregated table as Profile::Sample values through `func`.
// For every table entry and every matching-status bucket with a non-zero
// count, an allocation sample is emitted; unless the entry is censored, a
// deallocation sample with the same profile_id and a negated count follows.
void DeallocationProfiler::DeallocationStackTraceTable::Iterate(
    absl::FunctionRef<void(const Profile::Sample&)> func) const {
  // Deallocation samples reuse the allocation's count with the sign flipped,
  // which only works if the count field is signed.
  static_assert(
      std::is_signed<decltype(tcmalloc::Profile::Sample::count)>::value,
      "Deallocation samples are tagged with negative count values.");

  // Shared by the allocation and deallocation sample of one pair.
  uint64_t pair_id = 1;

  for (const auto& entry : table_) {
    const Key& key = entry.first;
    const Value& stats = entry.second;

    // Report total bytes that are a multiple of the object size.
    const size_t allocated_size = key.alloc.allocated_size;

    for (const auto& matching_case : kAllCases) {
      const int index = ComputeIndex(matching_case.first, matching_case.second);
      if (stats.counts[index] == 0) {
        continue;
      }

      // Scale the observed count by the sampling weight, then round the byte
      // total up to a whole number of objects.
      const uintptr_t bytes = std::lround(stats.counts[index] *
                                          key.alloc.weight * allocated_size);
      const int64_t count = (bytes + allocated_size - 1) / allocated_size;
      const int64_t sum = count * allocated_size;

      // The variance should be >= 0, but it's not impossible that it drops
      // below 0 for numerical reasons. We don't want to crash in this case,
      // so we ensure to return 0 if this happens.
      const double stddev_life_time_ns =
          sqrt(std::max(0.0, stats.variance_life_times_ns[index] /
                                 static_cast<double>((stats.counts[index]))));

      Profile::Sample sample{
          .sum = sum,
          .requested_size = key.alloc.requested_size,
          .requested_alignment = key.alloc.requested_alignment,
          .allocated_size = allocated_size,
          .profile_id = pair_id++,
          // Set the is_censored flag so that when we create a proto
          // sample later we can treat the *_lifetime accordingly.
          .is_censored = (key.dealloc.depth == 0),
          .avg_lifetime = internal::LifetimeNsToBucketedDuration(
              stats.mean_life_times_ns[index]),
          .stddev_lifetime =
              internal::LifetimeNsToBucketedDuration(stddev_life_time_ns),
          .min_lifetime = internal::LifetimeNsToBucketedDuration(
              stats.min_life_times_ns[index]),
          .max_lifetime = internal::LifetimeNsToBucketedDuration(
              stats.max_life_times_ns[index])};

      // Only set the cpu and thread matched flags if the sample is not
      // censored.
      if (!sample.is_censored) {
        sample.allocator_deallocator_cpu_matched =
            matching_case.first.cpu_matched;
        sample.allocator_deallocator_thread_matched =
            matching_case.first.thread_matched;
      }

      // First sample of the pair: the allocation.
      sample.count = count;
      sample.depth = key.alloc.depth;
      std::copy(key.alloc.stack, key.alloc.stack + key.alloc.depth,
                sample.stack);
      func(sample);

      // If this is a right-censored allocation (i.e. we did not observe the
      // deallocation) then do not emit a deallocation sample pair.
      if (sample.is_censored) {
        continue;
      }

      // Second sample of the pair: the deallocation, tagged by a negative
      // count.
      sample.count = -1 * count;
      sample.depth = key.dealloc.depth;
      std::copy(key.dealloc.stack, key.dealloc.stack + key.dealloc.depth,
                sample.stack);
      func(sample);
    }
  }
}
|
||||
|
||||
// Starts a deallocation profile by creating a profiler attached to `list`.
DeallocationSample::DeallocationSample(DeallocationProfilerList* list)
    : profiler_(std::make_unique<DeallocationProfiler>(list)) {}
|
||||
|
||||
// Stops the underlying profiler, destroys it, and returns its profile.
// Returns an empty Profile if the profiler has already been released.
tcmalloc::Profile DeallocationSample::Stop() && {
  if (profiler_ == nullptr) {
    return tcmalloc::Profile();
  }

  tcmalloc::Profile collected = profiler_->Stop();
  // Release the profiler only after its profile has been extracted.
  profiler_.reset();
  return collected;
}
|
||||
|
||||
namespace internal {
|
||||
|
||||
// Lifetimes below 1ns are truncated to 1ns. Lifetimes between 1ns and 1ms
|
||||
// are rounded to the next smaller power of 10. Lifetimes above 1ms are rounded
|
||||
// down to the nearest millisecond.
|
||||
absl::Duration LifetimeNsToBucketedDuration(double lifetime_ns) {
|
||||
if (lifetime_ns < 1000000.0) {
|
||||
if (lifetime_ns <= 1) {
|
||||
// Avoid negatives. We can't allocate in a negative amount of time or
|
||||
// even as quickly as a nanosecond (microbenchmarks of
|
||||
// allocation/deallocation in a tight loop are several nanoseconds), so
|
||||
// results this small indicate probable clock skew or other confounding
|
||||
// factors in the data.
|
||||
return absl::Nanoseconds(1);
|
||||
}
|
||||
|
||||
for (uint64_t cutoff_ns = 10; cutoff_ns <= 1000000; cutoff_ns *= 10) {
|
||||
if (lifetime_ns < cutoff_ns) {
|
||||
return absl::Nanoseconds(cutoff_ns / 10);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Round down to nearest millisecond.
|
||||
return absl::Nanoseconds(static_cast<uint64_t>(lifetime_ns / 1000000.0) *
|
||||
1000000L);
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace deallocationz
|
||||
} // namespace tcmalloc
|
||||
GOOGLE_MALLOC_SECTION_END
|
||||
67
src/third_party/tcmalloc/dist/tcmalloc/deallocation_profiler.h
vendored
Normal file
@ -0,0 +1,67 @@
|
||||
// Copyright 2022 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef TCMALLOC_DEALLOCATION_PROFILER_H_
|
||||
#define TCMALLOC_DEALLOCATION_PROFILER_H_
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "absl/base/const_init.h"
|
||||
#include "absl/base/internal/spinlock.h"
|
||||
#include "tcmalloc/internal/config.h"
|
||||
#include "tcmalloc/internal/logging.h"
|
||||
#include "tcmalloc/malloc_extension.h"
|
||||
|
||||
GOOGLE_MALLOC_SECTION_BEGIN
|
||||
namespace tcmalloc {
|
||||
namespace deallocationz {
|
||||
|
||||
class DeallocationProfiler;
|
||||
|
||||
class DeallocationProfilerList {
|
||||
public:
|
||||
constexpr DeallocationProfilerList() = default;
|
||||
|
||||
void ReportMalloc(const tcmalloc_internal::StackTrace& stack_trace);
|
||||
void ReportFree(tcmalloc_internal::AllocHandle handle);
|
||||
void Add(DeallocationProfiler* profiler);
|
||||
void Remove(DeallocationProfiler* profiler);
|
||||
|
||||
private:
|
||||
DeallocationProfiler* first_ = nullptr;
|
||||
absl::base_internal::SpinLock profilers_lock_{
|
||||
absl::kConstInit, absl::base_internal::SCHEDULE_KERNEL_ONLY};
|
||||
};
|
||||
|
||||
class DeallocationSample final
|
||||
: public tcmalloc_internal::AllocationProfilingTokenBase {
|
||||
public:
|
||||
explicit DeallocationSample(DeallocationProfilerList* list);
|
||||
// We define the dtor to ensure it is placed in the desired text section.
|
||||
~DeallocationSample() override = default;
|
||||
|
||||
tcmalloc::Profile Stop() && override;
|
||||
|
||||
private:
|
||||
std::unique_ptr<DeallocationProfiler> profiler_;
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
absl::Duration LifetimeNsToBucketedDuration(double lifetime_ns);
|
||||
} // namespace internal
|
||||
} // namespace deallocationz
|
||||
} // namespace tcmalloc
|
||||
GOOGLE_MALLOC_SECTION_END
|
||||
|
||||
#endif // TCMALLOC_DEALLOCATION_PROFILER_H_
|
||||
140
src/third_party/tcmalloc/dist/tcmalloc/experiment.cc
vendored
Normal file
@ -0,0 +1,140 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "tcmalloc/experiment.h"
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
|
||||
#include "absl/base/macros.h"
|
||||
#include "absl/strings/match.h"
|
||||
#include "absl/strings/str_cat.h"
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "tcmalloc/internal/environment.h"
|
||||
#include "tcmalloc/internal/logging.h"
|
||||
|
||||
GOOGLE_MALLOC_SECTION_BEGIN
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
namespace {
|
||||
|
||||
const char kDelimiter = ',';
|
||||
const char kExperiments[] = "BORG_EXPERIMENTS";
|
||||
const char kDisableExperiments[] = "BORG_DISABLE_EXPERIMENTS";
|
||||
constexpr absl::string_view kEnableAll = "enable-all-known-experiments";
|
||||
constexpr absl::string_view kDisableAll = "all";
|
||||
|
||||
// Looks up the experiment named `label`. On success stores its id in `*exp`
// and returns true; otherwise returns false and leaves `*exp` untouched.
bool LookupExperimentID(absl::string_view label, Experiment* exp) {
  for (const auto& candidate : experiments) {
    if (candidate.name != label) {
      continue;
    }
    *exp = candidate.id;
    return true;
  }

  return false;
}
|
||||
|
||||
const bool* GetSelectedExperiments() {
|
||||
static bool by_id[kNumExperiments];
|
||||
|
||||
static const bool* status = [&]() {
|
||||
const char* active_experiments = thread_safe_getenv(kExperiments);
|
||||
const char* disabled_experiments = thread_safe_getenv(kDisableExperiments);
|
||||
return SelectExperiments(by_id,
|
||||
active_experiments ? active_experiments : "",
|
||||
disabled_experiments ? disabled_experiments : "");
|
||||
}();
|
||||
return status;
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
void ParseExperiments(absl::string_view labels, F f) {
|
||||
absl::string_view::size_type pos = 0;
|
||||
do {
|
||||
absl::string_view token;
|
||||
auto end = labels.find(kDelimiter, pos);
|
||||
if (end == absl::string_view::npos) {
|
||||
token = labels.substr(pos);
|
||||
pos = end;
|
||||
} else {
|
||||
token = labels.substr(pos, end - pos);
|
||||
pos = end + 1;
|
||||
}
|
||||
|
||||
f(token);
|
||||
} while (pos != absl::string_view::npos);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
const bool* SelectExperiments(bool* buffer, absl::string_view active,
|
||||
absl::string_view disabled) {
|
||||
memset(buffer, 0, sizeof(*buffer) * kNumExperiments);
|
||||
|
||||
if (active == kEnableAll) {
|
||||
std::fill(buffer, buffer + kNumExperiments, true);
|
||||
}
|
||||
|
||||
ParseExperiments(active, [buffer](absl::string_view token) {
|
||||
Experiment id;
|
||||
if (LookupExperimentID(token, &id)) {
|
||||
buffer[static_cast<int>(id)] = true;
|
||||
}
|
||||
});
|
||||
|
||||
if (disabled == kDisableAll) {
|
||||
memset(buffer, 0, sizeof(*buffer) * kNumExperiments);
|
||||
}
|
||||
|
||||
ParseExperiments(disabled, [buffer](absl::string_view token) {
|
||||
Experiment id;
|
||||
if (LookupExperimentID(token, &id)) {
|
||||
buffer[static_cast<int>(id)] = false;
|
||||
}
|
||||
});
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
||||
} // namespace tcmalloc_internal
|
||||
|
||||
// Returns whether `exp` is active according to the cached selection table.
// `exp` must be a real experiment id below kMaxExperimentID; this is checked
// with ASSERT.
bool IsExperimentActive(Experiment exp) {
  const int index = static_cast<int>(exp);
  ASSERT(index >= 0);
  ASSERT(exp < Experiment::kMaxExperimentID);

  const bool* const selected = tcmalloc_internal::GetSelectedExperiments();
  return selected[index];
}
|
||||
|
||||
// Returns the id of the experiment called `name`, or absl::nullopt if no
// experiment has that name.
absl::optional<Experiment> FindExperimentByName(absl::string_view name) {
  const auto* const last = std::end(experiments);
  const auto* const match = std::find_if(
      std::begin(experiments), last,
      [name](const ExperimentConfig& config) { return name == config.name; });
  if (match == last) {
    return absl::nullopt;
  }
  return match->id;
}
|
||||
|
||||
// Invokes `callback` once per known experiment, in table order, with the
// experiment's name and whether it is currently active.
void WalkExperiments(
    absl::FunctionRef<void(absl::string_view name, bool active)> callback) {
  for (const auto& config : experiments) {
    const bool active = IsExperimentActive(config.id);
    callback(config.name, active);
  }
}
|
||||
|
||||
} // namespace tcmalloc
|
||||
GOOGLE_MALLOC_SECTION_END
|
||||
68
src/third_party/tcmalloc/dist/tcmalloc/experiment.h
vendored
Normal file
@ -0,0 +1,68 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef TCMALLOC_EXPERIMENT_H_
|
||||
#define TCMALLOC_EXPERIMENT_H_
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "absl/functional/function_ref.h"
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "absl/types/optional.h"
|
||||
#include "tcmalloc/experiment_config.h"
|
||||
#include "tcmalloc/internal/config.h"
|
||||
|
||||
// TCMalloc Experiment Controller
|
||||
//
|
||||
// This consumes environment variables to decide whether to activate experiments
|
||||
// to control TCMalloc behavior. It avoids memory allocations when making
|
||||
// experiment decisions to allow experiments to be used in critical TCMalloc
|
||||
// initialization paths.
|
||||
//
|
||||
// If an experiment is causing difficulty, all experiments can be disabled by
|
||||
// setting the environment variable:
|
||||
// BORG_DISABLE_EXPERIMENTS=all *or*
|
||||
// BORG_DISABLE_EXPERIMENTS=BAD_EXPERIMENT_LABEL
|
||||
|
||||
GOOGLE_MALLOC_SECTION_BEGIN
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
|
||||
constexpr size_t kNumExperiments =
|
||||
static_cast<size_t>(Experiment::kMaxExperimentID);
|
||||
|
||||
// SelectExperiments parses the experiments enumerated by active and disabled
|
||||
// and updates buffer[experiment_id] accordingly.
|
||||
//
|
||||
// buffer must be sized for kMaxExperimentID entries.
|
||||
//
|
||||
// This is exposed for testing purposes only.
|
||||
const bool* SelectExperiments(bool* buffer, absl::string_view active,
|
||||
absl::string_view disabled);
|
||||
|
||||
} // namespace tcmalloc_internal
|
||||
|
||||
bool IsExperimentActive(Experiment exp);
|
||||
|
||||
absl::optional<Experiment> FindExperimentByName(absl::string_view name);
|
||||
|
||||
void WalkExperiments(
|
||||
absl::FunctionRef<void(absl::string_view name, bool active)> callback);
|
||||
|
||||
} // namespace tcmalloc
|
||||
GOOGLE_MALLOC_SECTION_END
|
||||
|
||||
#endif // TCMALLOC_EXPERIMENT_H_
|
||||
55
src/third_party/tcmalloc/dist/tcmalloc/experiment_config.h
vendored
Normal file
@ -0,0 +1,55 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef TCMALLOC_EXPERIMENT_CONFIG_H_
|
||||
#define TCMALLOC_EXPERIMENT_CONFIG_H_
|
||||
|
||||
#include "absl/strings/string_view.h"
|
||||
|
||||
// Autogenerated by experiments_proto_test --experiments_generate_config=true
|
||||
namespace tcmalloc {
|
||||
|
||||
// Identifiers of the known experiments. The values are implicit and
// consecutive from 0, so they can be used as indices into a table of
// kMaxExperimentID entries.
enum class Experiment : int {
  // Test-only experiments.
  TEST_ONLY_TCMALLOC_POW2_SIZECLASS,
  TEST_ONLY_TCMALLOC_SHARDED_TRANSFER_CACHE,
  TEST_ONLY_TCMALLOC_GENERIC_SHARDED_TRANSFER_CACHE,
  TEST_ONLY_TCMALLOC_USE_HUGE_REGIONS_MORE_OFTEN,
  TEST_ONLY_TCMALLOC_SEPARATE_ALLOCS_FOR_FEW_AND_MANY_OBJECTS_SPANS,
  // Experiments without the TEST_ONLY_ prefix.
  TCMALLOC_GENERIC_SHARDED_TRANSFER_CACHE,
  TCMALLOC_USE_HUGE_REGIONS_MORE_OFTEN,
  TCMALLOC_SEPARATE_ALLOCS_FOR_FEW_AND_MANY_OBJECTS_SPANS2,
  // Not an experiment: one past the last valid id. Must stay last.
  kMaxExperimentID,
};
|
||||
|
||||
struct ExperimentConfig {
|
||||
Experiment id;
|
||||
absl::string_view name;
|
||||
};
|
||||
|
||||
// clang-format off
|
||||
inline constexpr ExperimentConfig experiments[] = {
|
||||
{Experiment::TEST_ONLY_TCMALLOC_POW2_SIZECLASS, "TEST_ONLY_TCMALLOC_POW2_SIZECLASS"},
|
||||
{Experiment::TEST_ONLY_TCMALLOC_SHARDED_TRANSFER_CACHE, "TEST_ONLY_TCMALLOC_SHARDED_TRANSFER_CACHE"},
|
||||
{Experiment::TEST_ONLY_TCMALLOC_GENERIC_SHARDED_TRANSFER_CACHE, "TEST_ONLY_TCMALLOC_GENERIC_SHARDED_TRANSFER_CACHE"},
|
||||
{Experiment::TEST_ONLY_TCMALLOC_USE_HUGE_REGIONS_MORE_OFTEN, "TEST_ONLY_TCMALLOC_USE_HUGE_REGIONS_MORE_OFTEN"},
|
||||
{Experiment::TEST_ONLY_TCMALLOC_SEPARATE_ALLOCS_FOR_FEW_AND_MANY_OBJECTS_SPANS, "TEST_ONLY_TCMALLOC_SEPARATE_ALLOCS_FOR_FEW_AND_MANY_OBJECTS_SPANS"},
|
||||
{Experiment::TCMALLOC_GENERIC_SHARDED_TRANSFER_CACHE, "TCMALLOC_GENERIC_SHARDED_TRANSFER_CACHE"},
|
||||
{Experiment::TCMALLOC_USE_HUGE_REGIONS_MORE_OFTEN, "TCMALLOC_USE_HUGE_REGIONS_MORE_OFTEN"},
|
||||
{Experiment::TCMALLOC_SEPARATE_ALLOCS_FOR_FEW_AND_MANY_OBJECTS_SPANS2, "TCMALLOC_SEPARATE_ALLOCS_FOR_FEW_AND_MANY_OBJECTS_SPANS2"},
|
||||
};
|
||||
// clang-format on
|
||||
|
||||
} // namespace tcmalloc
|
||||
|
||||
#endif // TCMALLOC_EXPERIMENT_CONFIG_H_
|
||||
31
src/third_party/tcmalloc/dist/tcmalloc/experiment_config_test.cc
vendored
Normal file
@ -0,0 +1,31 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "tcmalloc/experiment_config.h"
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
namespace tcmalloc {
|
||||
namespace {
|
||||
|
||||
// Verify IDs are non-negative and strictly less than kMaxExperimentID.
|
||||
TEST(ExperimentConfigTest, ValidateIDs) {
|
||||
for (const auto& exp : experiments) {
|
||||
ASSERT_LE(0, static_cast<int>(exp.id));
|
||||
ASSERT_LT(exp.id, Experiment::kMaxExperimentID);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tcmalloc
|
||||
38
src/third_party/tcmalloc/dist/tcmalloc/experiment_fuzz.cc
vendored
Normal file
@ -0,0 +1,38 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "tcmalloc/experiment.h"
|
||||
|
||||
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* d, size_t size) {
|
||||
const char* data = reinterpret_cast<const char*>(d);
|
||||
|
||||
bool buffer[tcmalloc::tcmalloc_internal::kNumExperiments];
|
||||
absl::string_view active, disabled;
|
||||
|
||||
const char* split = static_cast<const char*>(memchr(data, ';', size));
|
||||
if (split == nullptr) {
|
||||
active = absl::string_view(data, size);
|
||||
} else {
|
||||
active = absl::string_view(data, split - data);
|
||||
disabled = absl::string_view(split + 1, size - (split - data + 1));
|
||||
}
|
||||
|
||||
tcmalloc::tcmalloc_internal::SelectExperiments(buffer, active, disabled);
|
||||
return 0;
|
||||
}
|
||||
240
src/third_party/tcmalloc/dist/tcmalloc/experimental_pow2_size_class.cc
vendored
Normal file
@ -0,0 +1,240 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "tcmalloc/common.h"
|
||||
#include "tcmalloc/sizemap.h"
|
||||
|
||||
GOOGLE_MALLOC_SECTION_BEGIN
|
||||
namespace tcmalloc {
|
||||
|
||||
namespace tcmalloc_internal {
|
||||
|
||||
// <fixed> is fixed per-size-class overhead due to end-of-span fragmentation
|
||||
// and other factors. For instance, if we have a 96 byte size class, and use a
|
||||
// single 8KiB page, then we will hold 85 objects per span, and have 32 bytes
|
||||
// left over. There is also a fixed component of 48 bytes of TCMalloc metadata
|
||||
// per span. Together, the fixed overhead would be wasted/allocated =
|
||||
// (32 + 48) / (8192 - 32) ~= 0.98%.
|
||||
// There is also a dynamic component to overhead based on mismatches between the
|
||||
// number of bytes requested and the number of bytes provided by the size class.
|
||||
// Together they sum to the total overhead; for instance if you asked for a
|
||||
// 50-byte allocation that rounds up to a 64-byte size class, the dynamic
|
||||
// overhead would be 28%, and if <fixed> were 22% it would mean (on average)
|
||||
// 25 bytes of overhead for allocations of that size.
|
||||
|
||||
// clang-format off
|
||||
#if defined(__cpp_aligned_new) && __STDCPP_DEFAULT_NEW_ALIGNMENT__ <= 8
|
||||
#if TCMALLOC_PAGE_SHIFT == 13
|
||||
static_assert(kMaxSize == 262144, "kMaxSize mismatch");
|
||||
static const int kCount = 17;
|
||||
static_assert(kCount <= kNumClasses);
|
||||
static constexpr SizeClassInfo kExperimentalPow2SizeClassesList[kCount] = {
|
||||
// <bytes>, <pages>, <batch size> <fixed>
|
||||
{ 0, 0, 0}, // +Inf%
|
||||
{ 8, 1, 32}, // 0.59%
|
||||
{ 16, 1, 32}, // 0.59%
|
||||
{ 32, 1, 32}, // 0.59%
|
||||
{ 64, 1, 32}, // 0.59%
|
||||
{ 128, 1, 32}, // 0.59%
|
||||
{ 256, 1, 32}, // 0.59%
|
||||
{ 512, 1, 32}, // 0.59%
|
||||
{ 1024, 1, 32}, // 0.59%
|
||||
{ 2048, 2, 32}, // 0.29%
|
||||
{ 4096, 1, 16}, // 0.59%
|
||||
{ 8192, 1, 8}, // 0.59%
|
||||
{ 16384, 2, 4}, // 0.29%
|
||||
{ 32768, 4, 2}, // 0.15%
|
||||
{ 65536, 8, 2}, // 0.07%
|
||||
{ 131072, 16, 2}, // 0.04%
|
||||
{ 262144, 32, 2}, // 0.02%
|
||||
};
|
||||
constexpr absl::Span<const SizeClassInfo> kExperimentalPow2SizeClasses(kExperimentalPow2SizeClassesList);
|
||||
#elif TCMALLOC_PAGE_SHIFT == 15
|
||||
static_assert(kMaxSize == 262144, "kMaxSize mismatch");
|
||||
static const int kCount = 17;
|
||||
static_assert(kCount <= kNumClasses);
|
||||
static constexpr SizeClassInfo kExperimentalPow2SizeClassesList[kCount] = {
|
||||
// <bytes>, <pages>, <batch size> <fixed>
|
||||
{ 0, 0, 0}, // +Inf%
|
||||
{ 8, 1, 32}, // 0.15%
|
||||
{ 16, 1, 32}, // 0.15%
|
||||
{ 32, 1, 32}, // 0.15%
|
||||
{ 64, 1, 32}, // 0.15%
|
||||
{ 128, 1, 32}, // 0.15%
|
||||
{ 256, 1, 32}, // 0.15%
|
||||
{ 512, 1, 32}, // 0.15%
|
||||
{ 1024, 1, 32}, // 0.15%
|
||||
{ 2048, 1, 32}, // 0.15%
|
||||
{ 4096, 1, 16}, // 0.15%
|
||||
{ 8192, 1, 8}, // 0.15%
|
||||
{ 16384, 1, 4}, // 0.15%
|
||||
{ 32768, 1, 2}, // 0.15%
|
||||
{ 65536, 2, 2}, // 0.07%
|
||||
{ 131072, 4, 2}, // 0.04%
|
||||
{ 262144, 8, 2}, // 0.02%
|
||||
};
|
||||
constexpr absl::Span<const SizeClassInfo> kExperimentalPow2SizeClasses(kExperimentalPow2SizeClassesList);
|
||||
#elif TCMALLOC_PAGE_SHIFT == 18
|
||||
static_assert(kMaxSize == 262144, "kMaxSize mismatch");
|
||||
static const int kCount = 17;
|
||||
static_assert(kCount <= kNumClasses);
|
||||
static constexpr SizeClassInfo kExperimentalPow2SizeClassesList[kCount] = {
|
||||
// <bytes>, <pages>, <batch size> <fixed>
|
||||
{ 0, 0, 0}, // +Inf%
|
||||
{ 8, 1, 32}, // 0.02%
|
||||
{ 16, 1, 32}, // 0.02%
|
||||
{ 32, 1, 32}, // 0.02%
|
||||
{ 64, 1, 32}, // 0.02%
|
||||
{ 128, 1, 32}, // 0.02%
|
||||
{ 256, 1, 32}, // 0.02%
|
||||
{ 512, 1, 32}, // 0.02%
|
||||
{ 1024, 1, 32}, // 0.02%
|
||||
{ 2048, 1, 32}, // 0.02%
|
||||
{ 4096, 1, 16}, // 0.02%
|
||||
{ 8192, 1, 8}, // 0.02%
|
||||
{ 16384, 1, 4}, // 0.02%
|
||||
{ 32768, 1, 2}, // 0.02%
|
||||
{ 65536, 1, 2}, // 0.02%
|
||||
{ 131072, 1, 2}, // 0.02%
|
||||
{ 262144, 1, 2}, // 0.02%
|
||||
};
|
||||
constexpr absl::Span<const SizeClassInfo> kExperimentalPow2SizeClasses(kExperimentalPow2SizeClassesList);
|
||||
#elif TCMALLOC_PAGE_SHIFT == 12
|
||||
static_assert(kMaxSize == 8192, "kMaxSize mismatch");
|
||||
static const int kCount = 12;
|
||||
static_assert(kCount <= kNumClasses);
|
||||
static constexpr SizeClassInfo kExperimentalPow2SizeClassesList[kCount] = {
|
||||
// <bytes>, <pages>, <batch size> <fixed>
|
||||
{ 0, 0, 0}, // +Inf%
|
||||
{ 8, 1, 32}, // 1.17%
|
||||
{ 16, 1, 32}, // 1.17%
|
||||
{ 32, 1, 32}, // 1.17%
|
||||
{ 64, 1, 32}, // 1.17%
|
||||
{ 128, 1, 32}, // 1.17%
|
||||
{ 256, 1, 32}, // 1.17%
|
||||
{ 512, 1, 32}, // 1.17%
|
||||
{ 1024, 2, 32}, // 0.59%
|
||||
{ 2048, 4, 32}, // 0.29%
|
||||
{ 4096, 4, 16}, // 0.29%
|
||||
{ 8192, 4, 8}, // 0.29%
|
||||
};
|
||||
constexpr absl::Span<const SizeClassInfo> kExperimentalPow2SizeClasses(kExperimentalPow2SizeClassesList);
|
||||
#else
|
||||
#error "Unsupported TCMALLOC_PAGE_SHIFT value!"
|
||||
#endif
|
||||
#else
|
||||
#if TCMALLOC_PAGE_SHIFT == 13
|
||||
static_assert(kMaxSize == 262144, "kMaxSize mismatch");
|
||||
static const int kCount = 17;
|
||||
static_assert(kCount <= kNumClasses);
|
||||
static constexpr SizeClassInfo kExperimentalPow2SizeClassesList[kCount] = {
|
||||
// <bytes>, <pages>, <batch size> <fixed>
|
||||
{ 0, 0, 0}, // +Inf%
|
||||
{ 8, 1, 32}, // 0.59%
|
||||
{ 16, 1, 32}, // 0.59%
|
||||
{ 32, 1, 32}, // 0.59%
|
||||
{ 64, 1, 32}, // 0.59%
|
||||
{ 128, 1, 32}, // 0.59%
|
||||
{ 256, 1, 32}, // 0.59%
|
||||
{ 512, 1, 32}, // 0.59%
|
||||
{ 1024, 1, 32}, // 0.59%
|
||||
{ 2048, 2, 32}, // 0.29%
|
||||
{ 4096, 1, 16}, // 0.59%
|
||||
{ 8192, 1, 8}, // 0.59%
|
||||
{ 16384, 2, 4}, // 0.29%
|
||||
{ 32768, 4, 2}, // 0.15%
|
||||
{ 65536, 8, 2}, // 0.07%
|
||||
{ 131072, 16, 2}, // 0.04%
|
||||
{ 262144, 32, 2}, // 0.02%
|
||||
};
|
||||
constexpr absl::Span<const SizeClassInfo> kExperimentalPow2SizeClasses(kExperimentalPow2SizeClassesList);
|
||||
#elif TCMALLOC_PAGE_SHIFT == 15
|
||||
static_assert(kMaxSize == 262144, "kMaxSize mismatch");
|
||||
static const int kCount = 17;
|
||||
static_assert(kCount <= kNumClasses);
|
||||
static constexpr SizeClassInfo kExperimentalPow2SizeClassesList[kCount] = {
|
||||
// <bytes>, <pages>, <batch size> <fixed>
|
||||
{ 0, 0, 0}, // +Inf%
|
||||
{ 8, 1, 32}, // 0.15%
|
||||
{ 16, 1, 32}, // 0.15%
|
||||
{ 32, 1, 32}, // 0.15%
|
||||
{ 64, 1, 32}, // 0.15%
|
||||
{ 128, 1, 32}, // 0.15%
|
||||
{ 256, 1, 32}, // 0.15%
|
||||
{ 512, 1, 32}, // 0.15%
|
||||
{ 1024, 1, 32}, // 0.15%
|
||||
{ 2048, 1, 32}, // 0.15%
|
||||
{ 4096, 1, 16}, // 0.15%
|
||||
{ 8192, 1, 8}, // 0.15%
|
||||
{ 16384, 1, 4}, // 0.15%
|
||||
{ 32768, 1, 2}, // 0.15%
|
||||
{ 65536, 2, 2}, // 0.07%
|
||||
{ 131072, 4, 2}, // 0.04%
|
||||
{ 262144, 8, 2}, // 0.02%
|
||||
};
|
||||
constexpr absl::Span<const SizeClassInfo> kExperimentalPow2SizeClasses(kExperimentalPow2SizeClassesList);
|
||||
#elif TCMALLOC_PAGE_SHIFT == 18
|
||||
static_assert(kMaxSize == 262144, "kMaxSize mismatch");
|
||||
static const int kCount = 17;
|
||||
static_assert(kCount <= kNumClasses);
|
||||
static constexpr SizeClassInfo kExperimentalPow2SizeClassesList[kCount] = {
|
||||
// <bytes>, <pages>, <batch size> <fixed>
|
||||
{ 0, 0, 0}, // +Inf%
|
||||
{ 8, 1, 32}, // 0.02%
|
||||
{ 16, 1, 32}, // 0.02%
|
||||
{ 32, 1, 32}, // 0.02%
|
||||
{ 64, 1, 32}, // 0.02%
|
||||
{ 128, 1, 32}, // 0.02%
|
||||
{ 256, 1, 32}, // 0.02%
|
||||
{ 512, 1, 32}, // 0.02%
|
||||
{ 1024, 1, 32}, // 0.02%
|
||||
{ 2048, 1, 32}, // 0.02%
|
||||
{ 4096, 1, 16}, // 0.02%
|
||||
{ 8192, 1, 8}, // 0.02%
|
||||
{ 16384, 1, 4}, // 0.02%
|
||||
{ 32768, 1, 2}, // 0.02%
|
||||
{ 65536, 1, 2}, // 0.02%
|
||||
{ 131072, 1, 2}, // 0.02%
|
||||
{ 262144, 1, 2}, // 0.02%
|
||||
};
|
||||
constexpr absl::Span<const SizeClassInfo> kExperimentalPow2SizeClasses(kExperimentalPow2SizeClassesList);
|
||||
#elif TCMALLOC_PAGE_SHIFT == 12
|
||||
static_assert(kMaxSize == 8192, "kMaxSize mismatch");
|
||||
static const int kCount = 12;
|
||||
static_assert(kCount <= kNumClasses);
|
||||
static constexpr SizeClassInfo kExperimentalPow2SizeClassesList[kCount] = {
|
||||
// <bytes>, <pages>, <batch size> <fixed>
|
||||
{ 0, 0, 0}, // +Inf%
|
||||
{ 8, 1, 32}, // 1.17%
|
||||
{ 16, 1, 32}, // 1.17%
|
||||
{ 32, 1, 32}, // 1.17%
|
||||
{ 64, 1, 32}, // 1.17%
|
||||
{ 128, 1, 32}, // 1.17%
|
||||
{ 256, 1, 32}, // 1.17%
|
||||
{ 512, 1, 32}, // 1.17%
|
||||
{ 1024, 2, 32}, // 0.59%
|
||||
{ 2048, 4, 32}, // 0.29%
|
||||
{ 4096, 4, 16}, // 0.29%
|
||||
{ 8192, 4, 8}, // 0.29%
|
||||
};
|
||||
constexpr absl::Span<const SizeClassInfo> kExperimentalPow2SizeClasses(kExperimentalPow2SizeClassesList);
|
||||
#else
|
||||
#error "Unsupported TCMALLOC_PAGE_SHIFT value!"
|
||||
#endif
|
||||
#endif
|
||||
// clang-format on
|
||||
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
GOOGLE_MALLOC_SECTION_END
|
||||
62
src/third_party/tcmalloc/dist/tcmalloc/explicitly_constructed.h
vendored
Normal file
@ -0,0 +1,62 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef TCMALLOC_EXPLICITLY_CONSTRUCTED_H_
|
||||
#define TCMALLOC_EXPLICITLY_CONSTRUCTED_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include <utility>
|
||||
|
||||
#include "tcmalloc/internal/config.h"
|
||||
|
||||
GOOGLE_MALLOC_SECTION_BEGIN
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
|
||||
// Raw, suitably aligned storage for a T whose constructor runs only when the
// caller asks for it.  The wrapper itself never constructs or destroys the T,
// which makes it a fit for global variables that must not depend on static
// initialization (or destruction) order.
//
// The caller is responsible for tracking the initialization state:
//   1. A fresh wrapper holds no T ("uninitialized").
//   2. Construct() may be called only while uninitialized; afterwards the
//      wrapper is "initialized".
//   3. get_mutable() may be called only once initialized.
template <typename T>
class ExplicitlyConstructed {
 public:
  // Builds the T in place inside the internal buffer, forwarding `args` to
  // T's constructor.
  template <typename... Args>
  void Construct(Args&&... args) {
    new (&storage_) T(std::forward<Args>(args)...);
  }

  // Returns the internal buffer viewed as a T.
  T& get_mutable() { return *reinterpret_cast<T*>(&storage_); }

 private:
  // Byte buffer sized and aligned for T; the extra members additionally force
  // at least int64_t and pointer alignment.
  union Storage {
    constexpr Storage() = default;
    alignas(T) char space[sizeof(T)];
    int64_t align_to_int64;
    void* align_to_ptr;
  } storage_;
};
|
||||
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
GOOGLE_MALLOC_SECTION_END
|
||||
|
||||
#endif // TCMALLOC_EXPLICITLY_CONSTRUCTED_H_
|
||||
800
src/third_party/tcmalloc/dist/tcmalloc/global_stats.cc
vendored
Normal file
@ -0,0 +1,800 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "tcmalloc/global_stats.h"
|
||||
|
||||
#include "absl/strings/match.h"
|
||||
#include "absl/strings/strip.h"
|
||||
#include "tcmalloc/central_freelist.h"
|
||||
#include "tcmalloc/common.h"
|
||||
#include "tcmalloc/cpu_cache.h"
|
||||
#include "tcmalloc/experiment.h"
|
||||
#include "tcmalloc/guarded_page_allocator.h"
|
||||
#include "tcmalloc/internal/logging.h"
|
||||
#include "tcmalloc/internal/memory_stats.h"
|
||||
#include "tcmalloc/page_allocator.h"
|
||||
#include "tcmalloc/page_heap.h"
|
||||
#include "tcmalloc/page_heap_allocator.h"
|
||||
#include "tcmalloc/pagemap.h"
|
||||
#include "tcmalloc/pages.h"
|
||||
#include "tcmalloc/parameters.h"
|
||||
#include "tcmalloc/sampled_allocation.h"
|
||||
#include "tcmalloc/sampler.h"
|
||||
#include "tcmalloc/span.h"
|
||||
#include "tcmalloc/static_vars.h"
|
||||
#include "tcmalloc/stats.h"
|
||||
#include "tcmalloc/system-alloc.h"
|
||||
#include "tcmalloc/thread_cache.h"
|
||||
#include "tcmalloc/transfer_cache.h"
|
||||
#include "tcmalloc/transfer_cache_stats.h"
|
||||
|
||||
GOOGLE_MALLOC_SECTION_BEGIN
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
|
||||
// Get stats into "r". Also, if class_count != NULL, class_count[k]
|
||||
// will be set to the total number of objects of size class k in the
|
||||
// central cache, transfer cache, and per-thread and per-CPU caches.
|
||||
// If small_spans is non-NULL, it is filled. Same for large_spans.
|
||||
// The boolean report_residence determines whether residence information
|
||||
// should be captured or not. Residence info requires a potentially
|
||||
// costly OS call, and is not necessary in all situations.
|
||||
void ExtractStats(TCMallocStats* r, uint64_t* class_count,
|
||||
SpanStats* span_stats, SmallSpanStats* small_spans,
|
||||
LargeSpanStats* large_spans, bool report_residence) {
|
||||
r->central_bytes = 0;
|
||||
r->transfer_bytes = 0;
|
||||
for (int size_class = 0; size_class < kNumClasses; ++size_class) {
|
||||
const size_t length = tc_globals.central_freelist(size_class).length();
|
||||
const size_t tc_length = tc_globals.transfer_cache().tc_length(size_class);
|
||||
const size_t cache_overhead =
|
||||
tc_globals.central_freelist(size_class).OverheadBytes();
|
||||
const size_t size = tc_globals.sizemap().class_to_size(size_class);
|
||||
r->central_bytes += (size * length) + cache_overhead;
|
||||
r->transfer_bytes += (size * tc_length);
|
||||
if (class_count) {
|
||||
// Sum the lengths of all per-class freelists, except the per-thread
|
||||
// freelists, which get counted when we call GetThreadStats(), below.
|
||||
class_count[size_class] = length + tc_length;
|
||||
if (UsePerCpuCache(tc_globals)) {
|
||||
class_count[size_class] +=
|
||||
tc_globals.cpu_cache().TotalObjectsOfClass(size_class);
|
||||
}
|
||||
}
|
||||
if (span_stats) {
|
||||
span_stats[size_class] =
|
||||
tc_globals.central_freelist(size_class).GetSpanStats();
|
||||
}
|
||||
}
|
||||
|
||||
// Add stats from per-thread heaps
|
||||
r->thread_bytes = 0;
|
||||
{ // scope
|
||||
absl::base_internal::SpinLockHolder h(&pageheap_lock);
|
||||
ThreadCache::GetThreadStats(&r->thread_bytes, class_count);
|
||||
r->tc_stats = ThreadCache::HeapStats();
|
||||
r->span_stats = tc_globals.span_allocator().stats();
|
||||
r->stack_stats = tc_globals.sampledallocation_allocator().stats();
|
||||
r->linked_sample_stats = tc_globals.linked_sample_allocator().stats();
|
||||
r->metadata_bytes = tc_globals.metadata_bytes();
|
||||
r->pagemap_bytes = tc_globals.pagemap().bytes();
|
||||
r->pageheap = tc_globals.page_allocator().stats();
|
||||
r->peak_stats = tc_globals.page_allocator().peak_stats();
|
||||
if (small_spans != nullptr) {
|
||||
tc_globals.page_allocator().GetSmallSpanStats(small_spans);
|
||||
}
|
||||
if (large_spans != nullptr) {
|
||||
tc_globals.page_allocator().GetLargeSpanStats(large_spans);
|
||||
}
|
||||
|
||||
r->arena = tc_globals.arena().stats();
|
||||
if (!report_residence) {
|
||||
r->metadata_bytes += r->arena.bytes_nonresident;
|
||||
}
|
||||
}
|
||||
// We can access the pagemap without holding the pageheap_lock since it
|
||||
// is static data, and we are only taking address and size which are
|
||||
// constants.
|
||||
if (report_residence) {
|
||||
auto resident_bytes = tc_globals.pagemap_residence();
|
||||
r->pagemap_root_bytes_res = resident_bytes;
|
||||
ASSERT(r->metadata_bytes >= r->pagemap_bytes);
|
||||
r->metadata_bytes = r->metadata_bytes - r->pagemap_bytes + resident_bytes;
|
||||
} else {
|
||||
r->pagemap_root_bytes_res = 0;
|
||||
}
|
||||
|
||||
r->per_cpu_bytes = 0;
|
||||
r->sharded_transfer_bytes = 0;
|
||||
r->percpu_metadata_bytes_res = 0;
|
||||
r->percpu_metadata_bytes = 0;
|
||||
if (UsePerCpuCache(tc_globals)) {
|
||||
r->per_cpu_bytes = tc_globals.cpu_cache().TotalUsedBytes();
|
||||
r->sharded_transfer_bytes =
|
||||
tc_globals.sharded_transfer_cache().TotalBytes();
|
||||
|
||||
if (report_residence) {
|
||||
auto percpu_metadata = tc_globals.cpu_cache().MetadataMemoryUsage();
|
||||
r->percpu_metadata_bytes_res = percpu_metadata.resident_size;
|
||||
r->percpu_metadata_bytes = percpu_metadata.virtual_size;
|
||||
|
||||
ASSERT(r->metadata_bytes >= r->percpu_metadata_bytes);
|
||||
r->metadata_bytes = r->metadata_bytes - r->percpu_metadata_bytes +
|
||||
r->percpu_metadata_bytes_res;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Shorthand for ExtractStats() with every optional per-class and per-span
// output disabled; only *r is filled in.
void ExtractTCMallocStats(TCMallocStats* r, bool report_residence) {
  ExtractStats(r, /*class_count=*/nullptr, /*span_stats=*/nullptr,
               /*small_spans=*/nullptr, /*large_spans=*/nullptr,
               report_residence);
}
|
||||
|
||||
// Saturating subtraction: returns a - b, or 0 when b is larger than a.
//
// Individual stats fields are computed from state guarded by different locks
// and may therefore be mutually inconsistent; clamping at zero keeps an
// unsigned underflow from producing a gigantic result.
static uint64_t StatSub(uint64_t a, uint64_t b) {
  if (a < b) {
    return 0;
  }
  return a - b;
}
|
||||
|
||||
// Return approximate number of bytes in use by app.
|
||||
uint64_t InUseByApp(const TCMallocStats& stats) {
|
||||
return StatSub(stats.pageheap.system_bytes,
|
||||
stats.thread_bytes + stats.central_bytes +
|
||||
stats.transfer_bytes + stats.per_cpu_bytes +
|
||||
stats.sharded_transfer_bytes + stats.pageheap.free_bytes +
|
||||
stats.pageheap.unmapped_bytes);
|
||||
}
|
||||
|
||||
uint64_t VirtualMemoryUsed(const TCMallocStats& stats) {
|
||||
return stats.pageheap.system_bytes + stats.metadata_bytes +
|
||||
stats.arena.bytes_unallocated + stats.arena.bytes_unavailable +
|
||||
stats.arena.bytes_nonresident;
|
||||
}
|
||||
|
||||
uint64_t UnmappedBytes(const TCMallocStats& stats) {
|
||||
return stats.pageheap.unmapped_bytes + stats.arena.bytes_nonresident;
|
||||
}
|
||||
|
||||
uint64_t PhysicalMemoryUsed(const TCMallocStats& stats) {
|
||||
return StatSub(VirtualMemoryUsed(stats), UnmappedBytes(stats));
|
||||
}
|
||||
|
||||
// The number of bytes either in use by the app or fragmented so that
|
||||
// it cannot be (arbitrarily) reused.
|
||||
uint64_t RequiredBytes(const TCMallocStats& stats) {
|
||||
return StatSub(PhysicalMemoryUsed(stats), stats.pageheap.free_bytes);
|
||||
}
|
||||
|
||||
size_t ExternalBytes(const TCMallocStats& stats) {
|
||||
return stats.pageheap.free_bytes + stats.central_bytes + stats.per_cpu_bytes +
|
||||
stats.sharded_transfer_bytes + stats.transfer_bytes +
|
||||
stats.thread_bytes + stats.metadata_bytes +
|
||||
stats.arena.bytes_unavailable + stats.arena.bytes_unallocated;
|
||||
}
|
||||
|
||||
size_t HeapSizeBytes(const BackingStats& stats) {
|
||||
return StatSub(stats.system_bytes, stats.unmapped_bytes);
|
||||
}
|
||||
|
||||
size_t LocalBytes(const TCMallocStats& stats) {
|
||||
return stats.thread_bytes + stats.per_cpu_bytes +
|
||||
stats.sharded_transfer_bytes;
|
||||
}
|
||||
|
||||
size_t SlackBytes(const BackingStats& stats) {
|
||||
return stats.free_bytes + stats.unmapped_bytes;
|
||||
}
|
||||
|
||||
// Number of CPUs set in the affinity mask that sched_getaffinity() reports
// for pid 0, or 0 if that call fails.
static int CountAllowedCpus() {
  cpu_set_t mask;
  const int rc = sched_getaffinity(0, sizeof(mask), &mask);
  return rc == 0 ? CPU_COUNT(&mask) : 0;
}
|
||||
|
||||
// Writes a human-readable statistics report to `out`.
//
// The summary block (MALLOC lines, experiments, sampled-profile totals and,
// when available, process-wide RSS/VSS) is always printed.  With level >= 2
// the report additionally contains per-size-class freelist totals, span
// utilization histograms, the individual cache and page-allocator reports
// and the current PARAMETER values.
void DumpStats(Printer* out, int level) {
  TCMallocStats stats;
  // The per-class arrays are filled in, and read back below, only when
  // level >= 2.
  uint64_t class_count[kNumClasses];
  SpanStats span_stats[kNumClasses];
  if (level >= 2) {
    ExtractStats(&stats, class_count, span_stats, nullptr, nullptr, true);
  } else {
    ExtractTCMallocStats(&stats, true);
  }

  static const double MiB = 1048576.0;

  out->printf(
      "See https://github.com/google/tcmalloc/tree/master/docs/stats.md for an explanation of "
      "this page\n");

  const uint64_t virtual_memory_used = VirtualMemoryUsed(stats);
  const uint64_t physical_memory_used = PhysicalMemoryUsed(stats);
  const uint64_t unmapped_bytes = UnmappedBytes(stats);
  const uint64_t bytes_in_use_by_app = InUseByApp(stats);

  // Numerator of the "Realized fragmentation" line.  Both peak figures are
  // unsigned, so a plain subtraction would wrap around to a huge value if the
  // sampled application estimate ever exceeded the backed bytes; clamp the
  // difference at zero instead.
  const auto peak_backed = stats.peak_stats.backed_bytes;
  const auto peak_app = stats.peak_stats.sampled_application_bytes;
  const auto peak_overhead =
      peak_backed - (peak_backed < peak_app ? peak_backed : peak_app);

#ifdef TCMALLOC_SMALL_BUT_SLOW
  out->printf("NOTE: SMALL MEMORY MODEL IS IN USE, PERFORMANCE MAY SUFFER.\n");
#endif
  // clang-format off
  // Avoid clang-format complaining about the way that this text is laid out.
  out->printf(
      "------------------------------------------------\n"
      "MALLOC: %12u (%7.1f MiB) Bytes in use by application\n"
      "MALLOC: + %12u (%7.1f MiB) Bytes in page heap freelist\n"
      "MALLOC: + %12u (%7.1f MiB) Bytes in central cache freelist\n"
      "MALLOC: + %12u (%7.1f MiB) Bytes in per-CPU cache freelist\n"
      "MALLOC: + %12u (%7.1f MiB) Bytes in Sharded cache freelist\n"
      "MALLOC: + %12u (%7.1f MiB) Bytes in transfer cache freelist\n"
      "MALLOC: + %12u (%7.1f MiB) Bytes in thread cache freelists\n"
      "MALLOC: + %12u (%7.1f MiB) Bytes in malloc metadata\n"
      "MALLOC: + %12u (%7.1f MiB) Bytes in malloc metadata Arena unallocated\n"
      "MALLOC: + %12u (%7.1f MiB) Bytes in malloc metadata Arena unavailable\n"

      "MALLOC: ------------\n"
      "MALLOC: = %12u (%7.1f MiB) Actual memory used (physical + swap)\n"
      "MALLOC: + %12u (%7.1f MiB) Bytes released to OS (aka unmapped)\n"
      "MALLOC: ------------\n"
      "MALLOC: = %12u (%7.1f MiB) Virtual address space used\n"
      "MALLOC:\n"
      "MALLOC: %12u Spans in use\n"
      "MALLOC: %12u (%7.1f MiB) Spans created\n"
      "MALLOC: %12u Thread heaps in use\n"
      "MALLOC: %12u (%7.1f MiB) Thread heaps created\n"
      "MALLOC: %12u Stack traces in use\n"
      "MALLOC: %12u (%7.1f MiB) Stack traces created\n"
      "MALLOC: %12u Table buckets in use\n"
      "MALLOC: %12u (%7.1f MiB) Table buckets created\n"
      "MALLOC: %12u (%7.1f MiB) Pagemap bytes used\n"
      "MALLOC: %12u (%7.1f MiB) Pagemap root resident bytes\n"
      "MALLOC: %12u (%7.1f MiB) per-CPU slab bytes used\n"
      "MALLOC: %12u (%7.1f MiB) per-CPU slab resident bytes\n"
      "MALLOC: %12u (%7.1f MiB) malloc metadata Arena non-resident bytes\n"
      "MALLOC: %12u (%7.1f MiB) Actual memory used at peak\n"
      "MALLOC: %12u (%7.1f MiB) Estimated in-use at peak\n"
      "MALLOC: %12.4f Realized fragmentation (%%)\n"
      "MALLOC: %12u Tcmalloc page size\n"
      "MALLOC: %12u Tcmalloc hugepage size\n"
      "MALLOC: %12u CPUs Allowed in Mask\n"
      "MALLOC: %12u Arena blocks\n",
      bytes_in_use_by_app, bytes_in_use_by_app / MiB,
      stats.pageheap.free_bytes, stats.pageheap.free_bytes / MiB,
      stats.central_bytes, stats.central_bytes / MiB,
      stats.per_cpu_bytes, stats.per_cpu_bytes / MiB,
      stats.sharded_transfer_bytes, stats.sharded_transfer_bytes / MiB,
      stats.transfer_bytes, stats.transfer_bytes / MiB,
      stats.thread_bytes, stats.thread_bytes / MiB,
      stats.metadata_bytes, stats.metadata_bytes / MiB,
      stats.arena.bytes_unallocated, stats.arena.bytes_unallocated / MiB,
      stats.arena.bytes_unavailable, stats.arena.bytes_unavailable / MiB,
      physical_memory_used, physical_memory_used / MiB,
      unmapped_bytes, unmapped_bytes / MiB,
      virtual_memory_used, virtual_memory_used / MiB,
      uint64_t(stats.span_stats.in_use),
      uint64_t(stats.span_stats.total),
      (stats.span_stats.total * sizeof(Span)) / MiB,
      uint64_t(stats.tc_stats.in_use),
      uint64_t(stats.tc_stats.total),
      (stats.tc_stats.total * sizeof(ThreadCache)) / MiB,
      uint64_t(stats.stack_stats.in_use),
      uint64_t(stats.stack_stats.total),
      (stats.stack_stats.total * sizeof(StackTrace)) / MiB,
      uint64_t(stats.linked_sample_stats.in_use),
      uint64_t(stats.linked_sample_stats.total),
      (stats.linked_sample_stats.total * sizeof(StackTraceTable::LinkedSample)) / MiB,
      uint64_t(stats.pagemap_bytes),
      stats.pagemap_bytes / MiB,
      stats.pagemap_root_bytes_res, stats.pagemap_root_bytes_res / MiB,
      uint64_t(stats.percpu_metadata_bytes),
      stats.percpu_metadata_bytes / MiB,
      stats.percpu_metadata_bytes_res, stats.percpu_metadata_bytes_res / MiB,
      stats.arena.bytes_nonresident, stats.arena.bytes_nonresident / MiB,
      uint64_t(stats.peak_stats.backed_bytes),
      stats.peak_stats.backed_bytes / MiB,
      uint64_t(stats.peak_stats.sampled_application_bytes),
      stats.peak_stats.sampled_application_bytes / MiB,
      100. * safe_div(peak_overhead, peak_app),
      uint64_t(kPageSize),
      uint64_t(kHugePageSize),
      CountAllowedCpus(),
      stats.arena.blocks
  );
  // clang-format on

  // One " name=0|1" entry per experiment, all on a single line.
  out->printf("MALLOC EXPERIMENTS:");
  WalkExperiments([&](absl::string_view name, bool active) {
    const char* value = active ? "1" : "0";
    out->printf(" %s=%s", name, value);
  });
  out->printf("\n");

  out->printf(
      "MALLOC SAMPLED PROFILES: %zu bytes (current), %zu bytes (internal "
      "fragmentation), %zu bytes (peak), %zu count (total)\n",
      static_cast<size_t>(tc_globals.sampled_objects_size_.value()),
      tc_globals.sampled_internal_fragmentation_.value(),
      tc_globals.peak_heap_tracker().CurrentPeakSize(),
      tc_globals.total_sampled_count_.value());

  // Process-wide totals are printed only when GetMemoryStats() succeeds.
  MemoryStats memstats;
  if (GetMemoryStats(&memstats)) {
    uint64_t rss = memstats.rss;
    uint64_t vss = memstats.vss;
    // clang-format off
    out->printf(
        "\n"
        "Total process stats (inclusive of non-malloc sources):\n"
        "TOTAL: %12u (%7.1f MiB) Bytes resident (physical memory used)\n"
        "TOTAL: %12u (%7.1f MiB) Bytes mapped (virtual memory used)\n",
        rss, rss / MiB, vss, vss / MiB);
    // clang-format on
  }

  out->printf(
      "------------------------------------------------\n"
      "Call ReleaseMemoryToSystem() to release freelist memory to the OS"
      " (via madvise()).\n"
      "Bytes released to the OS take up virtual address space"
      " but no physical memory.\n");
  if (level >= 2) {
    out->printf("------------------------------------------------\n");
    out->printf("Total size of freelists for per-thread and per-CPU caches,\n");
    out->printf("transfer cache, and central cache, as well as number of\n");
    out->printf("live pages, returned/requested spans by size class\n");
    out->printf("------------------------------------------------\n");

    // Size class 0 is skipped; `cumulative` is the running byte total over
    // the classes printed so far.
    uint64_t cumulative = 0;
    for (int size_class = 1; size_class < kNumClasses; ++size_class) {
      uint64_t class_bytes = class_count[size_class] *
                             tc_globals.sizemap().class_to_size(size_class);

      cumulative += class_bytes;
      out->printf(
          // clang-format off
          "class %3d [ %8zu bytes ] : %8u objs; %5.1f MiB; %6.1f cum MiB; "
          "%8u live pages; spans: %10zu ret / %10zu req = %5.4f;\n",
          // clang-format on
          size_class, tc_globals.sizemap().class_to_size(size_class),
          class_count[size_class], class_bytes / MiB, cumulative / MiB,
          span_stats[size_class].num_live_spans() *
              tc_globals.sizemap().class_to_pages(size_class),
          span_stats[size_class].num_spans_returned,
          span_stats[size_class].num_spans_requested,
          span_stats[size_class].prob_returned());
    }

#ifndef TCMALLOC_SMALL_BUT_SLOW
    out->printf("------------------------------------------------\n");
    out->printf("Central cache freelist: Span utilization histogram\n");
    out->printf("Non-cumulative number of spans with allocated objects < N\n");
    out->printf("------------------------------------------------\n");
    for (int size_class = 1; size_class < kNumClasses; ++size_class) {
      tc_globals.central_freelist(size_class).PrintSpanUtilStats(out);
    }
#endif

    tc_globals.transfer_cache().Print(out);
    tc_globals.sharded_transfer_cache().Print(out);

    if (UsePerCpuCache(tc_globals)) {
      tc_globals.cpu_cache().Print(out);
    }

    // The second normal partition is reported only when more than one NUMA
    // partition is active.
    tc_globals.page_allocator().Print(out, MemoryTag::kNormal);
    if (tc_globals.numa_topology().active_partitions() > 1) {
      tc_globals.page_allocator().Print(out, MemoryTag::kNormalP1);
    }
    tc_globals.page_allocator().Print(out, MemoryTag::kSampled);
    tc_globals.page_allocator().Print(out, MemoryTag::kCold);
    tc_globals.guardedpage_allocator().Print(out);

    uint64_t limit_bytes;
    bool is_hard;
    std::tie(limit_bytes, is_hard) = tc_globals.page_allocator().limit();
    out->printf("PARAMETER desired_usage_limit_bytes %u %s\n", limit_bytes,
                is_hard ? "(hard)" : "");
    out->printf("Number of times limit was hit: %lld\n",
                tc_globals.page_allocator().limit_hits());

    out->printf("PARAMETER tcmalloc_per_cpu_caches %d\n",
                Parameters::per_cpu_caches() ? 1 : 0);
    out->printf("PARAMETER tcmalloc_max_per_cpu_cache_size %d\n",
                Parameters::max_per_cpu_cache_size());
    out->printf("PARAMETER tcmalloc_max_total_thread_cache_bytes %lld\n",
                Parameters::max_total_thread_cache_bytes());
    out->printf("PARAMETER malloc_release_bytes_per_sec %llu\n",
                Parameters::background_release_rate());
    out->printf(
        "PARAMETER tcmalloc_skip_subrelease_interval %s\n",
        absl::FormatDuration(Parameters::filler_skip_subrelease_interval()));
    out->printf("PARAMETER tcmalloc_skip_subrelease_short_interval %s\n",
                absl::FormatDuration(
                    Parameters::filler_skip_subrelease_short_interval()));
    out->printf("PARAMETER tcmalloc_skip_subrelease_long_interval %s\n",
                absl::FormatDuration(
                    Parameters::filler_skip_subrelease_long_interval()));
    out->printf("PARAMETER flat vcpus %d\n",
                subtle::percpu::UsingFlatVirtualCpus() ? 1 : 0);
    out->printf("PARAMETER tcmalloc_shuffle_per_cpu_caches %d\n",
                Parameters::shuffle_per_cpu_caches() ? 1 : 0);
    out->printf("PARAMETER tcmalloc_partial_transfer_cache %d\n",
                Parameters::partial_transfer_cache() ? 1 : 0);
    out->printf(
        "PARAMETER tcmalloc_separate_allocs_for_few_and_many_objects_spans "
        "%d\n",
        Parameters::separate_allocs_for_few_and_many_objects_spans());
  }
}
|
||||
|
||||
// Writes the allocator statistics to `out` in pbtxt form.
//
// Top-level counters, the sampled-profile sub-region, process totals (when
// GetMemoryStats() succeeds), page-allocator regions, limits, the gwp_asan
// sub-region and the parameter values are always emitted.  With level >= 2
// the per-size-class "freelist" entries and the transfer / sharded transfer /
// per-CPU cache regions are emitted as well.
void DumpStatsInPbtxt(Printer* out, int level) {
  TCMallocStats stats;
  // The per-class arrays are filled in, and read back below, only when
  // level >= 2.
  uint64_t class_count[kNumClasses];
  SpanStats span_stats[kNumClasses];
  if (level >= 2) {
    ExtractStats(&stats, class_count, span_stats, nullptr, nullptr, true);
  } else {
    ExtractTCMallocStats(&stats, true);
  }

  const uint64_t bytes_in_use_by_app = InUseByApp(stats);
  const uint64_t virtual_memory_used = VirtualMemoryUsed(stats);
  const uint64_t physical_memory_used = PhysicalMemoryUsed(stats);
  const uint64_t unmapped_bytes = UnmappedBytes(stats);

  PbtxtRegion region(out, kTop);
  region.PrintI64("in_use_by_app", bytes_in_use_by_app);
  region.PrintI64("page_heap_freelist", stats.pageheap.free_bytes);
  region.PrintI64("central_cache_freelist", stats.central_bytes);
  region.PrintI64("per_cpu_cache_freelist", stats.per_cpu_bytes);
  region.PrintI64("sharded_transfer_cache_freelist",
                  stats.sharded_transfer_bytes);
  region.PrintI64("transfer_cache_freelist", stats.transfer_bytes);
  region.PrintI64("thread_cache_freelists", stats.thread_bytes);
  region.PrintI64("malloc_metadata", stats.metadata_bytes);
  region.PrintI64("malloc_metadata_arena_unavailable",
                  stats.arena.bytes_unavailable);
  region.PrintI64("malloc_metadata_arena_unallocated",
                  stats.arena.bytes_unallocated);
  region.PrintI64("actual_mem_used", physical_memory_used);
  region.PrintI64("unmapped", unmapped_bytes);
  region.PrintI64("virtual_address_space_used", virtual_memory_used);
  region.PrintI64("num_spans", uint64_t(stats.span_stats.in_use));
  region.PrintI64("num_spans_created", uint64_t(stats.span_stats.total));
  region.PrintI64("num_thread_heaps", uint64_t(stats.tc_stats.in_use));
  region.PrintI64("num_thread_heaps_created", uint64_t(stats.tc_stats.total));
  region.PrintI64("num_stack_traces", uint64_t(stats.stack_stats.in_use));
  region.PrintI64("num_stack_traces_created",
                  uint64_t(stats.stack_stats.total));
  region.PrintI64("num_table_buckets",
                  uint64_t(stats.linked_sample_stats.in_use));
  region.PrintI64("num_table_buckets_created",
                  uint64_t(stats.linked_sample_stats.total));
  region.PrintI64("pagemap_size", uint64_t(stats.pagemap_bytes));
  region.PrintI64("pagemap_root_residence", stats.pagemap_root_bytes_res);
  region.PrintI64("percpu_slab_size", stats.percpu_metadata_bytes);
  region.PrintI64("percpu_slab_residence", stats.percpu_metadata_bytes_res);
  region.PrintI64("peak_backed", stats.peak_stats.backed_bytes);
  region.PrintI64("peak_application_demand",
                  stats.peak_stats.sampled_application_bytes);
  region.PrintI64("tcmalloc_page_size", uint64_t(kPageSize));
  region.PrintI64("tcmalloc_huge_page_size", uint64_t(kHugePageSize));
  region.PrintI64("cpus_allowed", CountAllowedCpus());
  region.PrintI64("arena_blocks", stats.arena.blocks);

  {
    auto sampled_profiles = region.CreateSubRegion("sampled_profiles");
    sampled_profiles.PrintI64("current_bytes",
                              tc_globals.sampled_objects_size_.value());
    sampled_profiles.PrintI64(
        "current_fragmentation_bytes",
        tc_globals.sampled_internal_fragmentation_.value());
    sampled_profiles.PrintI64("peak_bytes",
                              tc_globals.peak_heap_tracker().CurrentPeakSize());
  }

  // Print total process stats (inclusive of non-malloc sources).
  MemoryStats memstats;
  if (GetMemoryStats(&memstats)) {
    region.PrintI64("total_resident", uint64_t(memstats.rss));
    region.PrintI64("total_mapped", uint64_t(memstats.vss));
  }

  region.PrintI64("total_sampled_count",
                  tc_globals.total_sampled_count_.value());

  if (level >= 2) {
    {
#ifndef TCMALLOC_SMALL_BUT_SLOW
      // One "freelist" sub-region per size class; class 0 is skipped.
      for (int size_class = 1; size_class < kNumClasses; ++size_class) {
        uint64_t class_bytes = class_count[size_class] *
                               tc_globals.sizemap().class_to_size(size_class);
        PbtxtRegion entry = region.CreateSubRegion("freelist");
        entry.PrintI64("sizeclass",
                       tc_globals.sizemap().class_to_size(size_class));
        entry.PrintI64("bytes", class_bytes);
        entry.PrintI64("num_spans_requested",
                       span_stats[size_class].num_spans_requested);
        entry.PrintI64("num_spans_returned",
                       span_stats[size_class].num_spans_returned);
        entry.PrintI64("obj_capacity", span_stats[size_class].obj_capacity);
        tc_globals.central_freelist(size_class)
            .PrintSpanUtilStatsInPbtxt(&entry);
      }
#endif
    }

    tc_globals.transfer_cache().PrintInPbtxt(&region);
    tc_globals.sharded_transfer_cache().PrintInPbtxt(&region);

    region.PrintRaw("transfer_cache_implementation",
                    TransferCacheImplementationToLabel(
                        tc_globals.transfer_cache().implementation()));

    if (UsePerCpuCache(tc_globals)) {
      tc_globals.cpu_cache().PrintInPbtxt(&region);
    }
  }
  // The second normal partition is reported only when more than one NUMA
  // partition is active.
  tc_globals.page_allocator().PrintInPbtxt(&region, MemoryTag::kNormal);
  if (tc_globals.numa_topology().active_partitions() > 1) {
    tc_globals.page_allocator().PrintInPbtxt(&region, MemoryTag::kNormalP1);
  }
  tc_globals.page_allocator().PrintInPbtxt(&region, MemoryTag::kSampled);
  tc_globals.page_allocator().PrintInPbtxt(&region, MemoryTag::kCold);
  // We do not collect tracking information in pbtxt.

  size_t limit_bytes;
  bool is_hard;
  std::tie(limit_bytes, is_hard) = tc_globals.page_allocator().limit();
  region.PrintI64("desired_usage_limit_bytes", limit_bytes);
  region.PrintBool("hard_limit", is_hard);
  region.PrintI64("limit_hits", tc_globals.page_allocator().limit_hits());

  {
    auto gwp_asan = region.CreateSubRegion("gwp_asan");
    tc_globals.guardedpage_allocator().PrintInPbtxt(&gwp_asan);
  }

  region.PrintI64("memory_release_failures", SystemReleaseErrors());

  region.PrintBool("tcmalloc_per_cpu_caches", Parameters::per_cpu_caches());
  region.PrintI64("tcmalloc_max_per_cpu_cache_size",
                  Parameters::max_per_cpu_cache_size());
  region.PrintI64("tcmalloc_max_total_thread_cache_bytes",
                  Parameters::max_total_thread_cache_bytes());
  region.PrintI64("malloc_release_bytes_per_sec",
                  static_cast<int64_t>(Parameters::background_release_rate()));
  region.PrintI64(
      "tcmalloc_skip_subrelease_interval_ns",
      absl::ToInt64Nanoseconds(Parameters::filler_skip_subrelease_interval()));
  region.PrintI64("tcmalloc_skip_subrelease_short_interval_ns",
                  absl::ToInt64Nanoseconds(
                      Parameters::filler_skip_subrelease_short_interval()));
  region.PrintI64("tcmalloc_skip_subrelease_long_interval_ns",
                  absl::ToInt64Nanoseconds(
                      Parameters::filler_skip_subrelease_long_interval()));
  region.PrintBool("tcmalloc_shuffle_per_cpu_caches",
                   Parameters::shuffle_per_cpu_caches());
  region.PrintI64("profile_sampling_rate", Parameters::profile_sampling_rate());
  region.PrintRaw("percpu_vcpu_type",
                  subtle::percpu::UsingFlatVirtualCpus() ? "FLAT" : "NONE");
  region.PrintBool("tcmalloc_partial_transfer_cache",
                   Parameters::partial_transfer_cache());
  region.PrintI64("separate_allocs_for_few_and_many_objects_spans",
                  Parameters::separate_allocs_for_few_and_many_objects_spans());
}
|
||||
|
||||
bool GetNumericProperty(const char* name_data, size_t name_size,
|
||||
size_t* value) {
|
||||
// LINT.IfChange
|
||||
ASSERT(name_data != nullptr);
|
||||
ASSERT(value != nullptr);
|
||||
const absl::string_view name(name_data, name_size);
|
||||
|
||||
// This is near the top since ReleasePerCpuMemoryToOS() calls it frequently.
|
||||
if (name == "tcmalloc.per_cpu_caches_active") {
|
||||
*value = tc_globals.CpuCacheActive();
|
||||
return true;
|
||||
}
|
||||
|
||||
if (name == "generic.virtual_memory_used") {
|
||||
TCMallocStats stats;
|
||||
ExtractTCMallocStats(&stats, false);
|
||||
*value = VirtualMemoryUsed(stats);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (name == "generic.physical_memory_used") {
|
||||
TCMallocStats stats;
|
||||
ExtractTCMallocStats(&stats, false);
|
||||
*value = PhysicalMemoryUsed(stats);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (name == "generic.current_allocated_bytes" ||
|
||||
name == "generic.bytes_in_use_by_app") {
|
||||
TCMallocStats stats;
|
||||
ExtractTCMallocStats(&stats, false);
|
||||
*value = InUseByApp(stats);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (name == "generic.peak_memory_usage") {
|
||||
TCMallocStats stats;
|
||||
ExtractTCMallocStats(&stats, false);
|
||||
*value = static_cast<uint64_t>(stats.peak_stats.sampled_application_bytes);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (name == "generic.realized_fragmentation") {
|
||||
TCMallocStats stats;
|
||||
ExtractTCMallocStats(&stats, false);
|
||||
*value = static_cast<uint64_t>(
|
||||
100. * safe_div(stats.peak_stats.backed_bytes -
|
||||
stats.peak_stats.sampled_application_bytes,
|
||||
stats.peak_stats.sampled_application_bytes));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
if (name == "generic.heap_size") {
|
||||
absl::base_internal::SpinLockHolder l(&pageheap_lock);
|
||||
BackingStats stats = tc_globals.page_allocator().stats();
|
||||
*value = HeapSizeBytes(stats);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (name == "tcmalloc.central_cache_free") {
|
||||
TCMallocStats stats;
|
||||
ExtractTCMallocStats(&stats, false);
|
||||
*value = stats.central_bytes;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (name == "tcmalloc.cpu_free") {
|
||||
TCMallocStats stats;
|
||||
ExtractTCMallocStats(&stats, false);
|
||||
*value = stats.per_cpu_bytes;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (name == "tcmalloc.sharded_transfer_cache_free") {
|
||||
TCMallocStats stats;
|
||||
ExtractTCMallocStats(&stats, false);
|
||||
*value = stats.sharded_transfer_bytes;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (name == "tcmalloc.slack_bytes") {
|
||||
// Kept for backwards compatibility. Now defined externally as:
|
||||
// pageheap_free_bytes + pageheap_unmapped_bytes.
|
||||
absl::base_internal::SpinLockHolder l(&pageheap_lock);
|
||||
BackingStats stats = tc_globals.page_allocator().stats();
|
||||
*value = SlackBytes(stats);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (name == "tcmalloc.pageheap_free_bytes" ||
|
||||
name == "tcmalloc.page_heap_free") {
|
||||
absl::base_internal::SpinLockHolder l(&pageheap_lock);
|
||||
*value = tc_globals.page_allocator().stats().free_bytes;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (name == "tcmalloc.pageheap_unmapped_bytes" ||
|
||||
name == "tcmalloc.page_heap_unmapped") {
|
||||
absl::base_internal::SpinLockHolder l(&pageheap_lock);
|
||||
// Arena non-resident bytes aren't on the page heap, but they are unmapped.
|
||||
*value = tc_globals.page_allocator().stats().unmapped_bytes +
|
||||
tc_globals.arena().stats().bytes_nonresident;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (name == "tcmalloc.sampled_internal_fragmentation") {
|
||||
*value = tc_globals.sampled_internal_fragmentation_.value();
|
||||
return true;
|
||||
}
|
||||
|
||||
if (name == "tcmalloc.page_algorithm") {
|
||||
absl::base_internal::SpinLockHolder l(&pageheap_lock);
|
||||
*value = tc_globals.page_allocator().algorithm();
|
||||
return true;
|
||||
}
|
||||
|
||||
if (name == "tcmalloc.max_total_thread_cache_bytes") {
|
||||
absl::base_internal::SpinLockHolder l(&pageheap_lock);
|
||||
*value = ThreadCache::overall_thread_cache_size();
|
||||
return true;
|
||||
}
|
||||
|
||||
if (name == "tcmalloc.current_total_thread_cache_bytes" ||
|
||||
name == "tcmalloc.thread_cache_free") {
|
||||
TCMallocStats stats;
|
||||
ExtractTCMallocStats(&stats, false);
|
||||
*value = stats.thread_bytes;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (name == "tcmalloc.thread_cache_count") {
|
||||
TCMallocStats stats;
|
||||
ExtractTCMallocStats(&stats, false);
|
||||
*value = stats.tc_stats.in_use;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (name == "tcmalloc.local_bytes") {
|
||||
TCMallocStats stats;
|
||||
ExtractTCMallocStats(&stats, false);
|
||||
*value = LocalBytes(stats);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (name == "tcmalloc.external_fragmentation_bytes") {
|
||||
TCMallocStats stats;
|
||||
ExtractTCMallocStats(&stats, false);
|
||||
*value = ExternalBytes(stats);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (name == "tcmalloc.metadata_bytes") {
|
||||
TCMallocStats stats;
|
||||
ExtractTCMallocStats(&stats, true);
|
||||
*value = stats.metadata_bytes;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (name == "tcmalloc.transfer_cache_free") {
|
||||
TCMallocStats stats;
|
||||
ExtractTCMallocStats(&stats, false);
|
||||
*value = stats.transfer_bytes;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool want_hard_limit = (name == "tcmalloc.hard_usage_limit_bytes");
|
||||
if (want_hard_limit || name == "tcmalloc.desired_usage_limit_bytes") {
|
||||
size_t amount;
|
||||
bool is_hard;
|
||||
std::tie(amount, is_hard) = tc_globals.page_allocator().limit();
|
||||
if (want_hard_limit != is_hard) {
|
||||
amount = std::numeric_limits<size_t>::max();
|
||||
}
|
||||
*value = amount;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (name == "tcmalloc.required_bytes") {
|
||||
TCMallocStats stats;
|
||||
ExtractTCMallocStats(&stats, false);
|
||||
*value = RequiredBytes(stats);
|
||||
return true;
|
||||
}
|
||||
|
||||
const absl::string_view kExperimentPrefix = "tcmalloc.experiment.";
|
||||
if (absl::StartsWith(name, kExperimentPrefix)) {
|
||||
absl::optional<Experiment> exp =
|
||||
FindExperimentByName(absl::StripPrefix(name, kExperimentPrefix));
|
||||
if (exp.has_value()) {
|
||||
*value = IsExperimentActive(*exp) ? 1 : 0;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// LINT.ThenChange(//depot/google3/tcmalloc/malloc_extension_test.cc)
|
||||
return false;
|
||||
}
|
||||
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
GOOGLE_MALLOC_SECTION_END
|
||||
82
src/third_party/tcmalloc/dist/tcmalloc/global_stats.h
vendored
Normal file
@ -0,0 +1,82 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef TCMALLOC_GLOBAL_STATS_H_
|
||||
#define TCMALLOC_GLOBAL_STATS_H_
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#include "tcmalloc/internal/logging.h"
|
||||
#include "tcmalloc/page_allocator.h"
|
||||
#include "tcmalloc/span_stats.h"
|
||||
#include "tcmalloc/stats.h"
|
||||
#include "tcmalloc/transfer_cache_stats.h"
|
||||
|
||||
GOOGLE_MALLOC_SECTION_BEGIN
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
|
||||
// Extract interesting stats
|
||||
struct TCMallocStats {
|
||||
uint64_t thread_bytes; // Bytes in thread caches
|
||||
uint64_t central_bytes; // Bytes in central cache
|
||||
uint64_t transfer_bytes; // Bytes in central transfer cache
|
||||
uint64_t metadata_bytes; // Bytes alloced for metadata
|
||||
uint64_t sharded_transfer_bytes; // Bytes in per-CCX cache
|
||||
uint64_t per_cpu_bytes; // Bytes in per-CPU cache
|
||||
uint64_t pagemap_root_bytes_res; // Resident bytes of pagemap root node
|
||||
uint64_t percpu_metadata_bytes_res; // Resident bytes of the per-CPU metadata
|
||||
AllocatorStats tc_stats; // ThreadCache objects
|
||||
AllocatorStats span_stats; // Span objects
|
||||
AllocatorStats stack_stats; // StackTrace objects
|
||||
AllocatorStats linked_sample_stats; // StackTraceTable::LinkedSample objects
|
||||
size_t pagemap_bytes; // included in metadata bytes
|
||||
size_t percpu_metadata_bytes; // included in metadata bytes
|
||||
BackingStats pageheap; // Stats from page heap
|
||||
PageAllocator::PeakStats peak_stats;
|
||||
|
||||
ArenaStats arena; // Stats from the metadata Arena
|
||||
|
||||
// Explicitly declare the ctor to put it in the google_malloc section.
|
||||
TCMallocStats() = default;
|
||||
};
|
||||
|
||||
void ExtractStats(TCMallocStats* r, uint64_t* class_count,
|
||||
SpanStats* span_stats, SmallSpanStats* small_spans,
|
||||
LargeSpanStats* large_spans, TransferCacheStats* tc_stats,
|
||||
bool report_residence);
|
||||
|
||||
void ExtractTCMallocStats(TCMallocStats* r, bool report_residence);
|
||||
|
||||
uint64_t InUseByApp(const TCMallocStats& stats);
|
||||
uint64_t VirtualMemoryUsed(const TCMallocStats& stats);
|
||||
uint64_t UnmappedBytes(const TCMallocStats& stats);
|
||||
uint64_t PhysicalMemoryUsed(const TCMallocStats& stats);
|
||||
uint64_t RequiredBytes(const TCMallocStats& stats);
|
||||
size_t ExternalBytes(const TCMallocStats& stats);
|
||||
size_t HeapSizeBytes(const BackingStats& stats);
|
||||
size_t LocalBytes(const TCMallocStats& stats);
|
||||
size_t SlackBytes(const BackingStats& stats);
|
||||
|
||||
// WRITE stats to "out"
|
||||
void DumpStats(Printer* out, int level);
|
||||
void DumpStatsInPbtxt(Printer* out, int level);
|
||||
|
||||
bool GetNumericProperty(const char* name_data, size_t name_size, size_t* value);
|
||||
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
GOOGLE_MALLOC_SECTION_END
|
||||
|
||||
#endif // TCMALLOC_GLOBAL_STATS_H_
|
||||
569
src/third_party/tcmalloc/dist/tcmalloc/guarded_page_allocator.cc
vendored
Normal file
@ -0,0 +1,569 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "tcmalloc/guarded_page_allocator.h"
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <string.h>
|
||||
#include <sys/mman.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <cmath>
|
||||
#include <csignal>
|
||||
#include <tuple>
|
||||
#include <utility>
|
||||
|
||||
#include "absl/base/call_once.h"
|
||||
#include "absl/base/internal/spinlock.h"
|
||||
#include "absl/base/internal/sysinfo.h"
|
||||
#include "absl/debugging/stacktrace.h"
|
||||
#include "absl/numeric/bits.h"
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "tcmalloc/common.h"
|
||||
#include "tcmalloc/internal/environment.h"
|
||||
#include "tcmalloc/internal/logging.h"
|
||||
#include "tcmalloc/internal/page_size.h"
|
||||
#include "tcmalloc/internal/util.h"
|
||||
#include "tcmalloc/pagemap.h"
|
||||
#include "tcmalloc/sampler.h"
|
||||
#include "tcmalloc/static_vars.h"
|
||||
#include "tcmalloc/system-alloc.h"
|
||||
|
||||
GOOGLE_MALLOC_SECTION_BEGIN
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
|
||||
// Out-of-class definition of the static member.  There is no initializer
// here, so the value is presumably given at the in-class declaration (header
// not visible from this file).  MaybeRightAlign() passes kMagicSize to
// std::min(), which takes its arguments by reference.
const size_t GuardedPageAllocator::kMagicSize; // NOLINT
|
||||
|
||||
// Records the slot limits, picks the guard page size, seeds the RNG and maps
// the guarded region.
//
// `max_alloced_pages` is the cap on simultaneously allocated slots and must be
// in (0, total_pages]; `total_pages` must not exceed kGpaMaxPages.  Violations
// are fatal (CHECK_CONDITION).
void GuardedPageAllocator::Init(size_t max_alloced_pages, size_t total_pages) {
  CHECK_CONDITION(max_alloced_pages > 0);
  CHECK_CONDITION(max_alloced_pages <= total_pages);
  CHECK_CONDITION(total_pages <= kGpaMaxPages);
  max_alloced_pages_ = max_alloced_pages;
  total_pages_ = total_pages;

  // If the system page size is larger than kPageSize, we need to use the
  // system page size for this allocator since mprotect operates on full pages
  // only.  This case happens on PPC.
  const size_t system_page_size = static_cast<size_t>(GetPageSize());
  page_size_ = std::max(kPageSize, system_page_size);
  ASSERT(page_size_ % kPageSize == 0);

  // The object's own address serves as the initial RNG seed.
  rand_ = reinterpret_cast<uint64_t>(this);
  MapPages();
}
|
||||
|
||||
void GuardedPageAllocator::Destroy() {
|
||||
absl::base_internal::SpinLockHolder h(&guarded_page_lock_);
|
||||
if (initialized_) {
|
||||
size_t len = pages_end_addr_ - pages_base_addr_;
|
||||
int err = munmap(reinterpret_cast<void*>(pages_base_addr_), len);
|
||||
ASSERT(err != -1);
|
||||
(void)err;
|
||||
initialized_ = false;
|
||||
}
|
||||
}
|
||||
|
||||
// Tries to serve an allocation of `size` bytes from a guarded slot.
//
// `alignment` may be 0; otherwise it must be a power of two.  Both `size` and
// `alignment` are expected (ASSERT) to be at most one guard page.
//
// Returns the pointer plus a status.  The pointer is null when `size` is zero,
// when no slot could be reserved, or when the slot's page could not be made
// accessible.
GuardedPageAllocator::AllocWithStatus GuardedPageAllocator::Allocate(
    size_t size, size_t alignment) {
  if (size == 0) {
    return {nullptr, Profile::Sample::GuardedStatus::TooSmall};
  }

  const ssize_t reserved_slot = ReserveFreeSlot();
  if (reserved_slot == -1) {
    // No slot could be reserved.
    return {nullptr, Profile::Sample::GuardedStatus::NoAvailableSlots};
  }

  ASSERT(size <= page_size_);
  ASSERT(alignment <= page_size_);
  ASSERT(alignment == 0 || absl::has_single_bit(alignment));

  void* result = reinterpret_cast<void*>(SlotToAddr(reserved_slot));
  const bool accessible =
      mprotect(result, page_size_, PROT_READ | PROT_WRITE) != -1;
  if (!accessible) {
    ASSERT(false && "mprotect failed");
    // Count the failure and give the slot back.
    absl::base_internal::SpinLockHolder lock(&guarded_page_lock_);
    num_failed_allocations_++;
    FreeSlot(reserved_slot);
    return {nullptr, Profile::Sample::GuardedStatus::MProtectFailed};
  }

  // Place some allocations at end of page for better overflow detection.
  MaybeRightAlign(reserved_slot, size, alignment, &result);

  // Refresh the slot's bookkeeping: clear any previous deallocation trace and
  // record where, and on which thread, this allocation happened.
  SlotMetadata& meta = data_[reserved_slot];
  meta.dealloc_trace.depth = 0;
  meta.alloc_trace.depth = absl::GetStackTrace(
      meta.alloc_trace.stack, kMaxStackDepth, /*skip_count=*/3);
  meta.alloc_trace.tid = absl::base_internal::GetTID();
  meta.requested_size = size;
  meta.allocation_start = reinterpret_cast<uintptr_t>(result);

  ASSERT(!alignment || meta.allocation_start % alignment == 0);
  return {result, Profile::Sample::GuardedStatus::Guarded};
}
|
||||
|
||||
void GuardedPageAllocator::Deallocate(void* ptr) {
|
||||
ASSERT(PointerIsMine(ptr));
|
||||
const uintptr_t page_addr = GetPageAddr(reinterpret_cast<uintptr_t>(ptr));
|
||||
size_t slot = AddrToSlot(page_addr);
|
||||
|
||||
absl::base_internal::SpinLockHolder h(&guarded_page_lock_);
|
||||
if (IsFreed(slot)) {
|
||||
double_free_detected_ = true;
|
||||
} else if (WriteOverflowOccurred(slot)) {
|
||||
write_overflow_detected_ = true;
|
||||
}
|
||||
|
||||
CHECK_CONDITION(mprotect(reinterpret_cast<void*>(page_addr), page_size_,
|
||||
PROT_NONE) != -1);
|
||||
|
||||
if (write_overflow_detected_ || double_free_detected_) {
|
||||
*reinterpret_cast<char*>(ptr) = 'X'; // Trigger SEGV handler.
|
||||
CHECK_CONDITION(false); // Unreachable.
|
||||
}
|
||||
|
||||
// Record stack trace.
|
||||
GpaStackTrace& trace = data_[slot].dealloc_trace;
|
||||
trace.depth = absl::GetStackTrace(trace.stack, kMaxStackDepth,
|
||||
/*skip_count=*/2);
|
||||
trace.tid = absl::base_internal::GetTID();
|
||||
|
||||
FreeSlot(slot);
|
||||
}
|
||||
|
||||
size_t GuardedPageAllocator::GetRequestedSize(const void* ptr) const {
|
||||
ASSERT(PointerIsMine(ptr));
|
||||
size_t slot = AddrToSlot(GetPageAddr(reinterpret_cast<uintptr_t>(ptr)));
|
||||
return data_[slot].requested_size;
|
||||
}
|
||||
|
||||
std::pair<off_t, size_t> GuardedPageAllocator::GetAllocationOffsetAndSize(
|
||||
const void* ptr) const {
|
||||
ASSERT(PointerIsMine(ptr));
|
||||
const uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
|
||||
const size_t slot = GetNearestSlot(addr);
|
||||
return {addr - data_[slot].allocation_start, data_[slot].requested_size};
|
||||
}
|
||||
|
||||
// Copies the allocation and deallocation stack traces of the slot nearest to
// `ptr` into the two out-parameters and returns the error classification for
// an access at `ptr`.
GuardedPageAllocator::ErrorType GuardedPageAllocator::GetStackTraces(
    const void* ptr, GpaStackTrace* alloc_trace,
    GpaStackTrace* dealloc_trace) const {
  ASSERT(PointerIsMine(ptr));
  const uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
  const SlotMetadata& meta = data_[GetNearestSlot(addr)];
  *alloc_trace = meta.alloc_trace;
  *dealloc_trace = meta.dealloc_trace;
  return GetErrorType(addr, meta);
}
|
||||
|
||||
// We take guarded samples during periodic profiling samples. Computes the
|
||||
// mean number of profiled samples made for every guarded sample.
|
||||
static int GetChainedRate() {
|
||||
auto guarded_rate = Parameters::guarded_sampling_rate();
|
||||
auto sample_rate = Parameters::profile_sampling_rate();
|
||||
if (guarded_rate < 0 || sample_rate <= 0) {
|
||||
return guarded_rate;
|
||||
} else {
|
||||
return std::ceil(static_cast<double>(guarded_rate) /
|
||||
static_cast<double>(sample_rate));
|
||||
}
|
||||
}
|
||||
|
||||
// Writes a human-readable GWP-ASan status section to `out`.  The counters are
// read while holding guarded_page_lock_.
void GuardedPageAllocator::Print(Printer* out) {
  absl::base_internal::SpinLockHolder lock(&guarded_page_lock_);
  const auto successful = num_allocation_requests_ - num_failed_allocations_;
  // Every slot that is not currently allocated is reported as quarantined.
  const auto quarantined = total_pages_ - num_alloced_pages_;
  out->printf(
      "\n"
      "------------------------------------------------\n"
      "GWP-ASan Status\n"
      "------------------------------------------------\n"
      "Successful Allocations: %zu\n"
      "Failed Allocations: %zu\n"
      "Slots Currently Allocated: %zu\n"
      "Slots Currently Quarantined: %zu\n"
      "Maximum Slots Allocated: %zu / %zu\n"
      "PARAMETER tcmalloc_guarded_sample_parameter %d\n",
      successful, num_failed_allocations_, num_alloced_pages_, quarantined,
      num_alloced_pages_max_, max_alloced_pages_, GetChainedRate());
}
|
||||
|
||||
// Emits the same GWP-ASan counters as Print(), as fields of the `gwp_asan`
// pbtxt region.  The counters are read while holding guarded_page_lock_.
void GuardedPageAllocator::PrintInPbtxt(PbtxtRegion* gwp_asan) {
  absl::base_internal::SpinLockHolder lock(&guarded_page_lock_);
  const auto successful = num_allocation_requests_ - num_failed_allocations_;
  // Every slot that is not currently allocated is reported as quarantined.
  const auto quarantined = total_pages_ - num_alloced_pages_;

  gwp_asan->PrintI64("successful_allocations", successful);
  gwp_asan->PrintI64("failed_allocations", num_failed_allocations_);
  gwp_asan->PrintI64("current_slots_allocated", num_alloced_pages_);
  gwp_asan->PrintI64("current_slots_quarantined", quarantined);
  gwp_asan->PrintI64("max_slots_allocated", num_alloced_pages_max_);
  gwp_asan->PrintI64("allocated_slot_limit", max_alloced_pages_);
  gwp_asan->PrintI64("tcmalloc_guarded_sample_parameter", GetChainedRate());
}
|
||||
|
||||
// Maps 2 * total_pages_ + 1 pages so that there are total_pages_ unique pages
|
||||
// we can return from Allocate with guard pages before and after them.
|
||||
void GuardedPageAllocator::MapPages() {
|
||||
absl::base_internal::SpinLockHolder h(&guarded_page_lock_);
|
||||
ASSERT(!first_page_addr_);
|
||||
ASSERT(page_size_ % GetPageSize() == 0);
|
||||
size_t len = (2 * total_pages_ + 1) * page_size_;
|
||||
auto base_addr = reinterpret_cast<uintptr_t>(
|
||||
MmapAligned(len, page_size_, MemoryTag::kSampled));
|
||||
ASSERT(base_addr);
|
||||
if (!base_addr) return;
|
||||
|
||||
// Tell TCMalloc's PageMap about the memory we own.
|
||||
const PageId page = PageIdContaining(reinterpret_cast<void*>(base_addr));
|
||||
const Length page_len = BytesToLengthFloor(len);
|
||||
if (!tc_globals.pagemap().Ensure(page, page_len)) {
|
||||
ASSERT(false && "Failed to notify page map of page-guarded memory.");
|
||||
return;
|
||||
}
|
||||
|
||||
// Allocate memory for slot metadata.
|
||||
data_ = reinterpret_cast<SlotMetadata*>(
|
||||
tc_globals.arena().Alloc(sizeof(*data_) * total_pages_));
|
||||
for (size_t i = 0; i < total_pages_; ++i) {
|
||||
new (&data_[i]) SlotMetadata;
|
||||
}
|
||||
|
||||
pages_base_addr_ = base_addr;
|
||||
pages_end_addr_ = pages_base_addr_ + len;
|
||||
|
||||
// Align first page to page_size_.
|
||||
first_page_addr_ = GetPageAddr(pages_base_addr_ + page_size_);
|
||||
|
||||
std::fill_n(free_pages_, total_pages_, true);
|
||||
initialized_ = true;
|
||||
}
|
||||
|
||||
// Selects a random slot in O(total_pages_) time.
|
||||
ssize_t GuardedPageAllocator::ReserveFreeSlot() {
|
||||
absl::base_internal::SpinLockHolder h(&guarded_page_lock_);
|
||||
if (!initialized_ || !allow_allocations_) return -1;
|
||||
num_allocation_requests_++;
|
||||
if (num_alloced_pages_ == max_alloced_pages_) {
|
||||
num_failed_allocations_++;
|
||||
return -1;
|
||||
}
|
||||
|
||||
rand_ = Sampler::NextRandom(rand_);
|
||||
size_t num_free_pages = total_pages_ - num_alloced_pages_;
|
||||
size_t slot = GetIthFreeSlot(rand_ % num_free_pages);
|
||||
ASSERT(free_pages_[slot]);
|
||||
free_pages_[slot] = false;
|
||||
num_alloced_pages_++;
|
||||
num_alloced_pages_max_ = std::max(num_alloced_pages_, num_alloced_pages_max_);
|
||||
return slot;
|
||||
}
|
||||
|
||||
size_t GuardedPageAllocator::GetIthFreeSlot(size_t ith_free_slot) {
|
||||
ASSERT(ith_free_slot < total_pages_ - num_alloced_pages_);
|
||||
for (size_t free_slot_count = 0, j = 0;; j++) {
|
||||
if (free_pages_[j]) {
|
||||
if (free_slot_count == ith_free_slot) return j;
|
||||
free_slot_count++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Marks `slot` as free again and decrements the allocated-slot count.  The
// slot must be in range and currently allocated.
void GuardedPageAllocator::FreeSlot(size_t slot) {
  ASSERT(slot < total_pages_);
  ASSERT(!free_pages_[slot]);
  num_alloced_pages_--;
  free_pages_[slot] = true;
}
|
||||
|
||||
uintptr_t GuardedPageAllocator::GetPageAddr(uintptr_t addr) const {
|
||||
const uintptr_t addr_mask = ~(page_size_ - 1ULL);
|
||||
return addr & addr_mask;
|
||||
}
|
||||
|
||||
uintptr_t GuardedPageAllocator::GetNearestValidPage(uintptr_t addr) const {
|
||||
if (addr < first_page_addr_) return first_page_addr_;
|
||||
const uintptr_t last_page_addr =
|
||||
first_page_addr_ + 2 * (total_pages_ - 1) * page_size_;
|
||||
if (addr > last_page_addr) return last_page_addr;
|
||||
uintptr_t offset = addr - first_page_addr_;
|
||||
|
||||
// If addr is already on a valid page, just return addr.
|
||||
if ((offset / page_size_) % 2 == 0) return addr;
|
||||
|
||||
// ptr points to a guard page, so get nearest valid page.
|
||||
const size_t kHalfPageSize = page_size_ / 2;
|
||||
if ((offset / kHalfPageSize) % 2 == 0) {
|
||||
return addr - kHalfPageSize; // Round down.
|
||||
}
|
||||
return addr + kHalfPageSize; // Round up.
|
||||
}
|
||||
|
||||
size_t GuardedPageAllocator::GetNearestSlot(uintptr_t addr) const {
|
||||
return AddrToSlot(GetPageAddr(GetNearestValidPage(addr)));
|
||||
}
|
||||
|
||||
bool GuardedPageAllocator::IsFreed(size_t slot) const {
|
||||
return free_pages_[slot];
|
||||
}
|
||||
|
||||
bool GuardedPageAllocator::WriteOverflowOccurred(size_t slot) const {
|
||||
if (!ShouldRightAlign(slot)) return false;
|
||||
uint8_t magic = GetWriteOverflowMagic(slot);
|
||||
uintptr_t alloc_end =
|
||||
data_[slot].allocation_start + data_[slot].requested_size;
|
||||
uintptr_t page_end = SlotToAddr(slot) + page_size_;
|
||||
uintptr_t magic_end = std::min(page_end, alloc_end + kMagicSize);
|
||||
for (uintptr_t p = alloc_end; p < magic_end; ++p) {
|
||||
if (*reinterpret_cast<uint8_t*>(p) != magic) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Classifies a faulting access at `addr` against the slot metadata `d`.
//
// The checks are ordered by priority: a slot that never held an allocation
// yields kUnknown; errors already flagged by Deallocate() (double free, then
// overflow found on deallocation) come next; a recorded deallocation trace
// means use-after-free; otherwise the address is compared with the
// allocation's bounds.
GuardedPageAllocator::ErrorType GuardedPageAllocator::GetErrorType(
    uintptr_t addr, const SlotMetadata& d) const {
  if (!d.allocation_start) return ErrorType::kUnknown;
  if (double_free_detected_) return ErrorType::kDoubleFree;
  if (write_overflow_detected_) return ErrorType::kBufferOverflowOnDealloc;
  if (d.dealloc_trace.depth) return ErrorType::kUseAfterFree;
  if (addr < d.allocation_start) return ErrorType::kBufferUnderflow;

  const bool past_the_end = addr >= d.allocation_start + d.requested_size;
  return past_the_end ? ErrorType::kBufferOverflow : ErrorType::kUnknown;
}
|
||||
|
||||
uintptr_t GuardedPageAllocator::SlotToAddr(size_t slot) const {
|
||||
ASSERT(slot < total_pages_);
|
||||
return first_page_addr_ + 2 * slot * page_size_;
|
||||
}
|
||||
|
||||
size_t GuardedPageAllocator::AddrToSlot(uintptr_t addr) const {
|
||||
uintptr_t offset = addr - first_page_addr_;
|
||||
ASSERT(offset % page_size_ == 0);
|
||||
ASSERT((offset / page_size_) % 2 == 0);
|
||||
int slot = offset / page_size_ / 2;
|
||||
ASSERT(slot >= 0 && slot < total_pages_);
|
||||
return slot;
|
||||
}
|
||||
|
||||
void GuardedPageAllocator::MaybeRightAlign(size_t slot, size_t size,
|
||||
size_t alignment, void** ptr) {
|
||||
if (!ShouldRightAlign(slot)) return;
|
||||
uintptr_t adjusted_ptr =
|
||||
reinterpret_cast<uintptr_t>(*ptr) + page_size_ - size;
|
||||
|
||||
// If alignment == 0, the necessary alignment is never larger than the size
|
||||
// rounded up to the next power of 2. We use this fact to minimize alignment
|
||||
// padding between the end of small allocations and their guard pages.
|
||||
//
|
||||
// For allocations larger than the greater of kAlignment and
|
||||
// __STDCPP_DEFAULT_NEW_ALIGNMENT__, we're safe aligning to that value.
|
||||
size_t default_alignment =
|
||||
std::min(absl::bit_ceil(size),
|
||||
std::max(static_cast<size_t>(kAlignment),
|
||||
static_cast<size_t>(__STDCPP_DEFAULT_NEW_ALIGNMENT__)));
|
||||
|
||||
// Ensure valid alignment.
|
||||
alignment = std::max(alignment, default_alignment);
|
||||
uintptr_t alignment_padding = adjusted_ptr & (alignment - 1);
|
||||
adjusted_ptr -= alignment_padding;
|
||||
|
||||
// Write magic bytes in alignment padding to detect small overflow writes.
|
||||
size_t magic_size = std::min(alignment_padding, kMagicSize);
|
||||
memset(reinterpret_cast<void*>(adjusted_ptr + size),
|
||||
GetWriteOverflowMagic(slot), magic_size);
|
||||
*ptr = reinterpret_cast<void*>(adjusted_ptr);
|
||||
}
|
||||
|
||||
// If this failure occurs during "bazel test", writes a warning for Bazel to
|
||||
// display.
|
||||
static void RecordBazelWarning(absl::string_view error) {
|
||||
const char* warning_file = thread_safe_getenv("TEST_WARNINGS_OUTPUT_FILE");
|
||||
if (!warning_file) return; // Not a bazel test.
|
||||
|
||||
constexpr char warning[] = "GWP-ASan error detected: ";
|
||||
int fd = open(warning_file, O_CREAT | O_WRONLY | O_APPEND, 0644);
|
||||
if (fd == -1) return;
|
||||
(void)write(fd, warning, sizeof(warning) - 1);
|
||||
(void)write(fd, error.data(), error.size());
|
||||
(void)write(fd, "\n", 1);
|
||||
close(fd);
|
||||
}
|
||||
|
||||
// If this failure occurs during a gUnit test, writes an XML file describing the
|
||||
// error type. Note that we cannot use ::testing::Test::RecordProperty()
|
||||
// because it doesn't write the XML file if a test crashes (which we're about to
|
||||
// do here). So we write directly to the XML file instead.
|
||||
//
|
||||
static void RecordTestFailure(absl::string_view error) {
|
||||
const char* xml_file = thread_safe_getenv("XML_OUTPUT_FILE");
|
||||
if (!xml_file) return; // Not a gUnit test.
|
||||
|
||||
// Record test failure for Sponge.
|
||||
constexpr char xml_text_header[] =
|
||||
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
|
||||
"<testsuites><testsuite><testcase>"
|
||||
" <properties>"
|
||||
" <property name=\"gwp-asan-report\" value=\"";
|
||||
constexpr char xml_text_footer[] =
|
||||
"\"/>"
|
||||
" </properties>"
|
||||
" <failure message=\"MemoryError\">"
|
||||
" GWP-ASan detected a memory error. See the test log for full report."
|
||||
" </failure>"
|
||||
"</testcase></testsuite></testsuites>";
|
||||
|
||||
int fd = open(xml_file, O_CREAT | O_WRONLY | O_TRUNC, 0644);
|
||||
if (fd == -1) return;
|
||||
(void)write(fd, xml_text_header, sizeof(xml_text_header) - 1);
|
||||
(void)write(fd, error.data(), error.size());
|
||||
(void)write(fd, xml_text_footer, sizeof(xml_text_footer) - 1);
|
||||
close(fd);
|
||||
}
|
||||
//
|
||||
// If this crash occurs in a test, records test failure summaries.
|
||||
//
|
||||
// error contains the type of error to record.
|
||||
static void RecordCrash(absl::string_view error) {
|
||||
|
||||
RecordBazelWarning(error);
|
||||
RecordTestFailure(error);
|
||||
}
|
||||
|
||||
static void PrintStackTrace(void** stack_frames, size_t depth) {
|
||||
for (size_t i = 0; i < depth; ++i) {
|
||||
Log(kLog, __FILE__, __LINE__, " @ ", stack_frames[i]);
|
||||
}
|
||||
}
|
||||
|
||||
static void PrintStackTraceFromSignalHandler(void* context) {
|
||||
void* stack_frames[kMaxStackDepth];
|
||||
size_t depth = absl::GetStackTraceWithContext(stack_frames, kMaxStackDepth, 1,
|
||||
context, nullptr);
|
||||
PrintStackTrace(stack_frames, depth);
|
||||
}
|
||||
|
||||
// A SEGV handler that prints stack traces for the allocation and deallocation
|
||||
// of relevant memory as well as the location of the memory error.
|
||||
static void SegvHandler(int signo, siginfo_t* info, void* context) {
|
||||
if (signo != SIGSEGV) return;
|
||||
void* fault = info->si_addr;
|
||||
if (!tc_globals.guardedpage_allocator().PointerIsMine(fault)) return;
|
||||
GuardedPageAllocator::GpaStackTrace alloc_trace, dealloc_trace;
|
||||
GuardedPageAllocator::ErrorType error =
|
||||
tc_globals.guardedpage_allocator().GetStackTraces(fault, &alloc_trace,
|
||||
&dealloc_trace);
|
||||
if (error == GuardedPageAllocator::ErrorType::kUnknown) return;
|
||||
pid_t current_thread = absl::base_internal::GetTID();
|
||||
off_t offset;
|
||||
size_t size;
|
||||
std::tie(offset, size) =
|
||||
tc_globals.guardedpage_allocator().GetAllocationOffsetAndSize(fault);
|
||||
|
||||
Log(kLog, __FILE__, __LINE__,
|
||||
"*** GWP-ASan "
|
||||
"(https://google.github.io/tcmalloc/gwp-asan.html) "
|
||||
"has detected a memory error ***");
|
||||
Log(kLog, __FILE__, __LINE__, ">>> Access at offset", offset,
|
||||
"into buffer of length", size);
|
||||
Log(kLog, __FILE__, __LINE__,
|
||||
"Error originates from memory allocated in thread", alloc_trace.tid,
|
||||
"at:");
|
||||
PrintStackTrace(alloc_trace.stack, alloc_trace.depth);
|
||||
|
||||
switch (error) {
|
||||
case GuardedPageAllocator::ErrorType::kUseAfterFree:
|
||||
Log(kLog, __FILE__, __LINE__, "The memory was freed in thread",
|
||||
dealloc_trace.tid, "at:");
|
||||
PrintStackTrace(dealloc_trace.stack, dealloc_trace.depth);
|
||||
Log(kLog, __FILE__, __LINE__, "Use-after-free occurs in thread",
|
||||
current_thread, "at:");
|
||||
RecordCrash("use-after-free");
|
||||
break;
|
||||
case GuardedPageAllocator::ErrorType::kBufferUnderflow:
|
||||
Log(kLog, __FILE__, __LINE__, "Buffer underflow occurs in thread",
|
||||
current_thread, "at:");
|
||||
RecordCrash("buffer-underflow");
|
||||
break;
|
||||
case GuardedPageAllocator::ErrorType::kBufferOverflow:
|
||||
Log(kLog, __FILE__, __LINE__, "Buffer overflow occurs in thread",
|
||||
current_thread, "at:");
|
||||
RecordCrash("buffer-overflow");
|
||||
break;
|
||||
case GuardedPageAllocator::ErrorType::kDoubleFree:
|
||||
Log(kLog, __FILE__, __LINE__, "The memory was freed in thread",
|
||||
dealloc_trace.tid, "at:");
|
||||
PrintStackTrace(dealloc_trace.stack, dealloc_trace.depth);
|
||||
Log(kLog, __FILE__, __LINE__, "Double free occurs in thread",
|
||||
current_thread, "at:");
|
||||
RecordCrash("double-free");
|
||||
break;
|
||||
case GuardedPageAllocator::ErrorType::kBufferOverflowOnDealloc:
|
||||
Log(kLog, __FILE__, __LINE__,
|
||||
"Buffer overflow (write) detected in thread", current_thread,
|
||||
"at free:");
|
||||
RecordCrash("buffer-overflow-detected-at-free");
|
||||
break;
|
||||
case GuardedPageAllocator::ErrorType::kUnknown:
|
||||
Crash(kCrash, __FILE__, __LINE__, "Unexpected ErrorType::kUnknown");
|
||||
}
|
||||
PrintStackTraceFromSignalHandler(context);
|
||||
if (error == GuardedPageAllocator::ErrorType::kBufferOverflowOnDealloc) {
|
||||
Log(kLog, __FILE__, __LINE__,
|
||||
"*** Try rerunning with --config=asan to get stack trace of overflow "
|
||||
"***");
|
||||
}
|
||||
}
|
||||
|
||||
static struct sigaction old_sa;
|
||||
|
||||
static void ForwardSignal(int signo, siginfo_t* info, void* context) {
|
||||
if (old_sa.sa_flags & SA_SIGINFO) {
|
||||
old_sa.sa_sigaction(signo, info, context);
|
||||
} else if (old_sa.sa_handler == SIG_DFL) {
|
||||
// No previous handler registered. Re-raise signal for core dump.
|
||||
int err = sigaction(signo, &old_sa, nullptr);
|
||||
if (err == -1) {
|
||||
Log(kLog, __FILE__, __LINE__, "Couldn't restore previous sigaction!");
|
||||
}
|
||||
raise(signo);
|
||||
} else if (old_sa.sa_handler == SIG_IGN) {
|
||||
return; // Previous sigaction ignored signal, so do the same.
|
||||
} else {
|
||||
old_sa.sa_handler(signo);
|
||||
}
|
||||
}
|
||||
|
||||
// Signal entry point: runs SegvHandler first, then always forwards the signal
// to the previously installed disposition.
static void HandleSegvAndForward(int signo, siginfo_t* info, void* context) {
  SegvHandler(signo, info, context);
  ForwardSignal(signo, info, context);
}
|
||||
|
||||
extern "C" void MallocExtension_Internal_ActivateGuardedSampling() {
|
||||
static absl::once_flag flag;
|
||||
absl::call_once(flag, []() {
|
||||
struct sigaction action = {};
|
||||
action.sa_sigaction = HandleSegvAndForward;
|
||||
sigemptyset(&action.sa_mask);
|
||||
action.sa_flags = SA_SIGINFO;
|
||||
sigaction(SIGSEGV, &action, &old_sa);
|
||||
tc_globals.guardedpage_allocator().AllowAllocations();
|
||||
});
|
||||
}
|
||||
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
GOOGLE_MALLOC_SECTION_END
|
||||
315
src/third_party/tcmalloc/dist/tcmalloc/guarded_page_allocator.h
vendored
Normal file
@ -0,0 +1,315 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef TCMALLOC_GUARDED_PAGE_ALLOCATOR_H_
|
||||
#define TCMALLOC_GUARDED_PAGE_ALLOCATOR_H_
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
#include <utility>
|
||||
|
||||
#include "absl/base/attributes.h"
|
||||
#include "absl/base/internal/spinlock.h"
|
||||
#include "absl/base/thread_annotations.h"
|
||||
#include "tcmalloc/common.h"
|
||||
#include "tcmalloc/internal/logging.h"
|
||||
|
||||
GOOGLE_MALLOC_SECTION_BEGIN
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
|
||||
// An allocator that gives each allocation a new region, with guard pages on
|
||||
// either side of the allocated region. If a buffer is overflowed to the next
|
||||
// guard page or underflowed to the previous guard page, a segfault occurs.
|
||||
// After an allocation is freed, the underlying page is marked as inaccessible,
|
||||
// and any future accesses to it will also cause segfaults until the page is
|
||||
// reallocated.
|
||||
//
|
||||
// Is safe to use with static storage duration and is thread safe with the
|
||||
// exception of calls to Init() and Destroy() (see corresponding function
|
||||
// comments).
|
||||
//
|
||||
// Example:
|
||||
// ABSL_CONST_INIT GuardedPageAllocator gpa;
|
||||
//
|
||||
// void foo() {
|
||||
// char *buf = reinterpret_cast<char *>(gpa.Allocate(8000, 1).alloc);
|
||||
// buf[0] = 'A'; // OK. No segfault occurs.
|
||||
// memset(buf, 'A', 8000); // OK. No segfault occurs.
|
||||
// buf[-300] = 'A'; // Segfault!
|
||||
// buf[9000] = 'A'; // Segfault!
|
||||
// gpa.Deallocate(buf);
|
||||
// buf[0] = 'B'; // Segfault!
|
||||
// }
|
||||
//
|
||||
// int main() {
|
||||
// // Call Init() only once.
|
||||
// gpa.Init(64, GuardedPageAllocator::kGpaMaxPages);
|
||||
// gpa.AllowAllocations();
|
||||
// for (int i = 0; i < 1000; i++) foo();
|
||||
// return 0;
|
||||
// }
|
||||
class GuardedPageAllocator {
|
||||
public:
|
||||
struct GpaStackTrace {
|
||||
void* stack[kMaxStackDepth];
|
||||
size_t depth = 0;
|
||||
pid_t tid = 0;
|
||||
};
|
||||
|
||||
// Maximum number of pages this class can allocate.
|
||||
static constexpr size_t kGpaMaxPages = 512;
|
||||
|
||||
enum class ErrorType {
|
||||
kUseAfterFree,
|
||||
kBufferUnderflow,
|
||||
kBufferOverflow,
|
||||
kDoubleFree,
|
||||
kBufferOverflowOnDealloc,
|
||||
kUnknown,
|
||||
};
|
||||
|
||||
constexpr GuardedPageAllocator()
|
||||
: guarded_page_lock_(absl::kConstInit,
|
||||
absl::base_internal::SCHEDULE_KERNEL_ONLY),
|
||||
free_pages_{},
|
||||
num_alloced_pages_(0),
|
||||
num_alloced_pages_max_(0),
|
||||
num_allocation_requests_(0),
|
||||
num_failed_allocations_(0),
|
||||
data_(nullptr),
|
||||
pages_base_addr_(0),
|
||||
pages_end_addr_(0),
|
||||
first_page_addr_(0),
|
||||
max_alloced_pages_(0),
|
||||
total_pages_(0),
|
||||
page_size_(0),
|
||||
rand_(0),
|
||||
initialized_(false),
|
||||
allow_allocations_(false),
|
||||
double_free_detected_(false),
|
||||
write_overflow_detected_(false) {}
|
||||
|
||||
GuardedPageAllocator(const GuardedPageAllocator&) = delete;
|
||||
GuardedPageAllocator& operator=(const GuardedPageAllocator&) = delete;
|
||||
|
||||
~GuardedPageAllocator() = default;
|
||||
|
||||
// Configures this allocator to allocate up to max_alloced_pages pages at a
|
||||
// time from a pool of total_pages pages, where:
|
||||
// 1 <= max_alloced_pages <= total_pages <= kGpaMaxPages
|
||||
//
|
||||
// This method should be called non-concurrently and only once to complete
|
||||
// initialization. Dynamic initialization is deliberately done here and not
|
||||
// in the constructor, thereby allowing the constructor to be constexpr and
|
||||
// avoiding static initialization order issues.
|
||||
void Init(size_t max_alloced_pages, size_t total_pages)
|
||||
ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
|
||||
|
||||
// Unmaps memory allocated by this class.
|
||||
//
|
||||
// This method should be called non-concurrently and only once to complete
|
||||
// destruction. Destruction is deliberately done here and not in the
|
||||
// destructor, thereby allowing the destructor to be trivial (i.e. a no-op)
|
||||
// and avoiding use-after-destruction issues for static/global instances.
|
||||
void Destroy();
|
||||
|
||||
struct AllocWithStatus {
|
||||
void* alloc = nullptr;
|
||||
Profile::Sample::GuardedStatus status =
|
||||
Profile::Sample::GuardedStatus::Unknown;
|
||||
};
|
||||
|
||||
// On success, returns an instance of AllocWithStatus which includes a pointer
|
||||
// to size bytes of page-guarded memory, aligned to alignment. The member
|
||||
// 'alloc' is a pointer that is guaranteed to be tagged.
|
||||
// The 'status' member is set to GuardedStatus::Guarded.
|
||||
// On failure, returns an instance of AllocWithStatus (the 'alloc' member is
|
||||
// set to 'nullptr'). Failure can occur if memory could not be mapped or
|
||||
// protected, if all guarded pages are already allocated, or if size is 0.
|
||||
// These conditions are reflected in the 'status' member of the
|
||||
// AllocWithStatus return value.
|
||||
//
|
||||
// Precondition: size and alignment <= page_size_
|
||||
// Precondition: alignment is 0 or a power of 2
|
||||
AllocWithStatus Allocate(size_t size, size_t alignment)
|
||||
ABSL_LOCKS_EXCLUDED(guarded_page_lock_);
|
||||
|
||||
// Deallocates memory pointed to by ptr. ptr must have been previously
|
||||
// returned by a call to Allocate.
|
||||
void Deallocate(void* ptr) ABSL_LOCKS_EXCLUDED(guarded_page_lock_);
|
||||
|
||||
// Returns the size requested when ptr was allocated. ptr must have been
|
||||
// previously returned by a call to Allocate.
|
||||
size_t GetRequestedSize(const void* ptr) const;
|
||||
|
||||
// Returns ptr's offset from the beginning of its allocation along with the
|
||||
// allocation's size.
|
||||
std::pair<off_t, size_t> GetAllocationOffsetAndSize(const void* ptr) const;
|
||||
|
||||
// Records stack traces in alloc_trace and dealloc_trace for the page nearest
|
||||
// to ptr. alloc_trace is the trace at the time the page was allocated. If
|
||||
// the page is still allocated, dealloc_trace->depth will be 0. If the page
|
||||
// has been deallocated, dealloc_trace is the trace at the time the page was
|
||||
// deallocated.
|
||||
//
|
||||
// Returns the likely error type for an access at ptr.
|
||||
//
|
||||
// Requires that ptr points to memory mapped by this class.
|
||||
ErrorType GetStackTraces(const void* ptr, GpaStackTrace* alloc_trace,
|
||||
GpaStackTrace* dealloc_trace) const;
|
||||
|
||||
// Writes a human-readable summary of GuardedPageAllocator's internal state to
|
||||
// *out.
|
||||
void Print(Printer* out) ABSL_LOCKS_EXCLUDED(guarded_page_lock_);
|
||||
void PrintInPbtxt(PbtxtRegion* gwp_asan)
|
||||
ABSL_LOCKS_EXCLUDED(guarded_page_lock_);
|
||||
|
||||
// Returns true if ptr points to memory managed by this class.
|
||||
inline bool ABSL_ATTRIBUTE_ALWAYS_INLINE
|
||||
PointerIsMine(const void* ptr) const {
|
||||
uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
|
||||
return pages_base_addr_ <= addr && addr < pages_end_addr_;
|
||||
}
|
||||
|
||||
// Allows Allocate() to start returning allocations.
|
||||
void AllowAllocations() ABSL_LOCKS_EXCLUDED(guarded_page_lock_) {
|
||||
absl::base_internal::SpinLockHolder h(&guarded_page_lock_);
|
||||
allow_allocations_ = true;
|
||||
}
|
||||
|
||||
// Returns the number of pages available for allocation, based on how many are
|
||||
// currently in use. (Should only be used in testing.)
|
||||
size_t GetNumAvailablePages() ABSL_LOCKS_EXCLUDED(guarded_page_lock_) {
|
||||
absl::base_internal::SpinLockHolder h(&guarded_page_lock_);
|
||||
return max_alloced_pages_ - num_alloced_pages_;
|
||||
}
|
||||
|
||||
private:
|
||||
// Structure for storing data about a slot.
|
||||
struct SlotMetadata {
|
||||
GpaStackTrace alloc_trace;
|
||||
GpaStackTrace dealloc_trace;
|
||||
size_t requested_size = 0;
|
||||
uintptr_t allocation_start = 0;
|
||||
};
|
||||
|
||||
// Max number of magic bytes we use to detect write-overflows at deallocation.
|
||||
static constexpr size_t kMagicSize = 32;
|
||||
|
||||
// Maps pages into memory.
|
||||
void MapPages() ABSL_LOCKS_EXCLUDED(guarded_page_lock_)
|
||||
ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
|
||||
|
||||
// Reserves and returns a slot randomly selected from the free slots in
|
||||
// free_pages_. Returns -1 if no slots available, or if AllowAllocations()
|
||||
// hasn't been called yet.
|
||||
ssize_t ReserveFreeSlot() ABSL_LOCKS_EXCLUDED(guarded_page_lock_);
|
||||
|
||||
// Returns the i-th free slot of free_pages_. i must be in the range [0,
|
||||
// total_pages_ - num_alloced_pages_).
|
||||
size_t GetIthFreeSlot(size_t i)
|
||||
ABSL_EXCLUSIVE_LOCKS_REQUIRED(guarded_page_lock_);
|
||||
|
||||
// Marks the specified slot as unreserved.
|
||||
void FreeSlot(size_t slot) ABSL_EXCLUSIVE_LOCKS_REQUIRED(guarded_page_lock_);
|
||||
|
||||
// Returns the address of the page that addr resides on.
|
||||
uintptr_t GetPageAddr(uintptr_t addr) const;
|
||||
|
||||
// Returns an address somewhere on the valid page nearest to addr.
|
||||
uintptr_t GetNearestValidPage(uintptr_t addr) const;
|
||||
|
||||
// Returns the slot number for the page nearest to addr.
|
||||
size_t GetNearestSlot(uintptr_t addr) const;
|
||||
|
||||
// Returns true if the specified slot has already been freed.
|
||||
bool IsFreed(size_t slot) const
|
||||
ABSL_EXCLUSIVE_LOCKS_REQUIRED(guarded_page_lock_);
|
||||
|
||||
// Returns true if magic bytes for slot were overwritten.
|
||||
bool WriteOverflowOccurred(size_t slot) const;
|
||||
|
||||
// Returns the likely error type for the given access address and metadata
|
||||
// associated with the nearest slot.
|
||||
ErrorType GetErrorType(uintptr_t addr, const SlotMetadata& d) const;
|
||||
|
||||
// Magic constant used for detecting write-overflows at deallocation time.
|
||||
static uint8_t GetWriteOverflowMagic(size_t slot) {
|
||||
// Only even slots get magic bytes, so use slot / 2 for more unique magics.
|
||||
return uint8_t{0xcd} * static_cast<uint8_t>(slot / 2);
|
||||
}
|
||||
|
||||
// Returns true if slot should be right aligned.
|
||||
static bool ShouldRightAlign(size_t slot) { return slot % 2 == 0; }
|
||||
|
||||
// If slot is marked for right alignment, moves the allocation in *ptr to the
|
||||
// right end of the slot, maintaining the specified size and alignment. Magic
|
||||
// bytes are written in any alignment padding.
|
||||
void MaybeRightAlign(size_t slot, size_t size, size_t alignment, void** ptr);
|
||||
|
||||
uintptr_t SlotToAddr(size_t slot) const;
|
||||
size_t AddrToSlot(uintptr_t addr) const;
|
||||
|
||||
absl::base_internal::SpinLock guarded_page_lock_;
|
||||
|
||||
// Maps each bool to one page.
|
||||
// true: Free. false: Reserved.
|
||||
bool free_pages_[kGpaMaxPages] ABSL_GUARDED_BY(guarded_page_lock_);
|
||||
|
||||
// Number of currently-allocated pages.
|
||||
size_t num_alloced_pages_ ABSL_GUARDED_BY(guarded_page_lock_);
|
||||
|
||||
// The high-water mark for num_alloced_pages_.
|
||||
size_t num_alloced_pages_max_ ABSL_GUARDED_BY(guarded_page_lock_);
|
||||
|
||||
// Number of calls to Allocate.
|
||||
size_t num_allocation_requests_ ABSL_GUARDED_BY(guarded_page_lock_);
|
||||
|
||||
// Number of times Allocate has failed.
|
||||
size_t num_failed_allocations_ ABSL_GUARDED_BY(guarded_page_lock_);
|
||||
|
||||
// A dynamically-allocated array of stack trace data captured when each page
|
||||
// is allocated/deallocated. Printed by the SEGV handler when a memory error
|
||||
// is detected.
|
||||
SlotMetadata* data_;
|
||||
|
||||
uintptr_t pages_base_addr_; // Points to start of mapped region.
|
||||
uintptr_t pages_end_addr_; // Points to the end of mapped region.
|
||||
uintptr_t first_page_addr_; // Points to first page returnable by Allocate.
|
||||
size_t max_alloced_pages_; // Max number of pages to allocate at once.
|
||||
size_t total_pages_; // Size of the page pool to allocate from.
|
||||
size_t page_size_; // Size of pages we allocate.
|
||||
uint64_t rand_; // RNG seed.
|
||||
|
||||
// True if this object has been fully initialized.
|
||||
bool initialized_ ABSL_GUARDED_BY(guarded_page_lock_);
|
||||
|
||||
// Flag to control whether we can return allocations or not.
|
||||
bool allow_allocations_ ABSL_GUARDED_BY(guarded_page_lock_);
|
||||
|
||||
// Set to true if a double free has occurred.
|
||||
bool double_free_detected_;
|
||||
|
||||
// Set to true if a write overflow was detected on deallocation.
|
||||
bool write_overflow_detected_;
|
||||
};
|
||||
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
GOOGLE_MALLOC_SECTION_END
|
||||
|
||||
#endif // TCMALLOC_GUARDED_PAGE_ALLOCATOR_H_
|
||||
63
src/third_party/tcmalloc/dist/tcmalloc/guarded_page_allocator_benchmark.cc
vendored
Normal file
@ -0,0 +1,63 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <unistd.h>
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include "absl/base/internal/spinlock.h"
|
||||
#include "benchmark/benchmark.h"
|
||||
#include "tcmalloc/guarded_page_allocator.h"
|
||||
#include "tcmalloc/internal/logging.h"
|
||||
#include "tcmalloc/internal/page_size.h"
|
||||
|
||||
GOOGLE_MALLOC_SECTION_BEGIN
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
namespace {
|
||||
|
||||
static constexpr size_t kMaxGpaPages = GuardedPageAllocator::kGpaMaxPages;
|
||||
|
||||
// Size of pages used by GuardedPageAllocator.
|
||||
static size_t PageSize() {
|
||||
static const size_t page_size =
|
||||
std::max(kPageSize, static_cast<size_t>(GetPageSize()));
|
||||
return page_size;
|
||||
}
|
||||
|
||||
// Benchmarks one Allocate/Deallocate round trip of state.range(0) bytes on a
// GuardedPageAllocator that is created once and shared by all runs.
void BM_AllocDealloc(benchmark::State& state) {
  static GuardedPageAllocator* const allocator = [] {
    GuardedPageAllocator* fresh = new GuardedPageAllocator;
    // Init() is annotated as requiring pageheap_lock.
    absl::base_internal::SpinLockHolder h(&pageheap_lock);
    fresh->Init(kMaxGpaPages, kMaxGpaPages);
    fresh->AllowAllocations();
    return fresh;
  }();

  const size_t alloc_size = state.range(0);
  for (auto _ : state) {
    char* const ptr =
        static_cast<char*>(allocator->Allocate(alloc_size, 0).alloc);
    CHECK_CONDITION(ptr != nullptr);
    ptr[0] = 'X';               // Page fault first page.
    ptr[alloc_size - 1] = 'X';  // Page fault last page.
    allocator->Deallocate(ptr);
  }
}
|
||||
|
||||
BENCHMARK(BM_AllocDealloc)->Range(1, PageSize());
|
||||
BENCHMARK(BM_AllocDealloc)->Arg(1)->ThreadRange(1, kMaxGpaPages);
|
||||
|
||||
} // namespace
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
GOOGLE_MALLOC_SECTION_END
|
||||
266
src/third_party/tcmalloc/dist/tcmalloc/guarded_page_allocator_profile_test.cc
vendored
Normal file
@ -0,0 +1,266 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "benchmark/benchmark.h"
|
||||
#include "gmock/gmock.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "absl/container/flat_hash_set.h"
|
||||
#include "absl/functional/function_ref.h"
|
||||
#include "tcmalloc/malloc_extension.h"
|
||||
#include "tcmalloc/static_vars.h"
|
||||
#include "tcmalloc/testing/testutil.h"
|
||||
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
namespace {
|
||||
|
||||
class GuardedPageAllocatorProfileTest : public testing::Test {
|
||||
public:
|
||||
struct NextSteps {
|
||||
bool stop = true; // stop allocating
|
||||
bool free = true; // free allocation
|
||||
};
|
||||
|
||||
void SetUp() override { MallocExtension::ActivateGuardedSampling(); }
|
||||
|
||||
// Return the number of allocations
|
||||
int AllocateUntil(size_t size,
|
||||
absl::FunctionRef<NextSteps(void*)> evaluate_alloc) {
|
||||
int alloc_count = 0;
|
||||
while (true) {
|
||||
void* alloc = ::operator new(size);
|
||||
++alloc_count;
|
||||
benchmark::DoNotOptimize(alloc);
|
||||
auto result = evaluate_alloc(alloc);
|
||||
// evaluate_alloc takes responsibility for delete/free if result.free is
|
||||
// set to false.
|
||||
if (result.free) {
|
||||
::operator delete(alloc);
|
||||
}
|
||||
if (result.stop) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return alloc_count;
|
||||
}
|
||||
|
||||
// Allocate until sample is guarded
|
||||
// Called to reduce the internal counter to -1, which will trigger resetting
|
||||
// the counter to the configured rate.
|
||||
void AllocateUntilGuarded() {
|
||||
AllocateUntil(968, [&](void* alloc) -> NextSteps {
|
||||
return {IsSampledMemory(alloc) &&
|
||||
Static::guardedpage_allocator().PointerIsMine(alloc),
|
||||
true};
|
||||
});
|
||||
}
|
||||
|
||||
void ExamineSamples(
|
||||
Profile& profile, Profile::Sample::GuardedStatus sought_status,
|
||||
absl::flat_hash_set<Profile::Sample::GuardedStatus> allowable_statuses,
|
||||
absl::FunctionRef<void(const Profile::Sample& s)> verify =
|
||||
[](const Profile::Sample& s) { /* do nothing */ }) {
|
||||
absl::flat_hash_set<Profile::Sample::GuardedStatus> found_statuses;
|
||||
int samples = 0;
|
||||
profile.Iterate([&](const Profile::Sample& s) {
|
||||
++samples;
|
||||
found_statuses.insert(s.guarded_status);
|
||||
verify(s);
|
||||
});
|
||||
EXPECT_THAT(found_statuses, ::testing::Contains(sought_status));
|
||||
found_statuses.erase(sought_status);
|
||||
EXPECT_THAT(found_statuses, ::testing::IsSubsetOf(allowable_statuses));
|
||||
}
|
||||
};
|
||||
|
||||
// A single 1051-byte allocation made right after AllocateUntilGuarded() must
// yield a profile whose only guarded status is Guarded.
TEST_F(GuardedPageAllocatorProfileTest, Guarded) {
  using GuardedStatus = Profile::Sample::GuardedStatus;

  ScopedAlwaysSample sas;
  AllocateUntilGuarded();
  auto token = MallocExtension::StartAllocationProfiling();

  // One allocation: stop immediately and let AllocateUntil free it.
  AllocateUntil(1051, [&](void* alloc) -> NextSteps {
    return NextSteps{/*stop=*/true, /*free=*/true};
  });

  auto profile = std::move(token).Stop();
  ExamineSamples(profile, GuardedStatus::Guarded, {});
}
|
||||
|
||||
// A 2 MiB allocation must show up as NotAttempted; any Guarded sample that
// also appears must belong to a different request size.
TEST_F(GuardedPageAllocatorProfileTest, NotAttempted) {
  using GuardedStatus = Profile::Sample::GuardedStatus;

  ScopedProfileSamplingRate spsr(4096);
  auto token = MallocExtension::StartAllocationProfiling();

  constexpr size_t alloc_size = 2 * 1024 * 1024;
  AllocateUntil(alloc_size, [&](void* alloc) -> NextSteps {
    return NextSteps{/*stop=*/true, /*free=*/true};
  });

  auto profile = std::move(token).Stop();
  ExamineSamples(profile, GuardedStatus::NotAttempted,
                 {GuardedStatus::Guarded}, [&](const Profile::Sample& s) {
                   if (s.guarded_status == GuardedStatus::Guarded) {
                     EXPECT_NE(alloc_size, s.requested_size);
                   }
                 });
}
|
||||
|
||||
// An allocation of kPageSize + 1 bytes must show up as LargerThanOnePage;
// any Guarded sample that also appears must belong to a different request
// size.
TEST_F(GuardedPageAllocatorProfileTest, LargerThanOnePage) {
  using GuardedStatus = Profile::Sample::GuardedStatus;

  ScopedAlwaysSample sas;
  AllocateUntilGuarded();
  auto token = MallocExtension::StartAllocationProfiling();

  constexpr size_t alloc_size = kPageSize + 1;
  AllocateUntil(alloc_size, [&](void* alloc) -> NextSteps {
    return NextSteps{/*stop=*/true, /*free=*/true};
  });

  auto profile = std::move(token).Stop();
  ExamineSamples(profile, GuardedStatus::LargerThanOnePage,
                 {GuardedStatus::Guarded}, [&](const Profile::Sample& s) {
                   if (s.guarded_status == GuardedStatus::Guarded) {
                     EXPECT_NE(alloc_size, s.requested_size);
                   }
                 });
}
|
||||
|
||||
// With the guarded sampling rate set to -1 and the profile sampling rate set
// to 1, a 1024-byte allocation must be reported with status Disabled only.
TEST_F(GuardedPageAllocatorProfileTest, Disabled) {
  using GuardedStatus = Profile::Sample::GuardedStatus;

  ScopedGuardedSamplingRate sgsr(-1);
  ScopedProfileSamplingRate spsr(1);
  auto token = MallocExtension::StartAllocationProfiling();

  // One allocation: stop immediately and let AllocateUntil free it.
  AllocateUntil(1024, [&](void* alloc) -> NextSteps {
    return NextSteps{/*stop=*/true, /*free=*/true};
  });

  auto profile = std::move(token).Stop();
  ExamineSamples(profile, GuardedStatus::Disabled, {});
}
|
||||
|
||||
// With both sampling rates set to 1, allocating until one guarded and one
// unguarded sampled allocation have been seen must produce both Guarded and
// RateLimited samples for the requested size.
TEST_F(GuardedPageAllocatorProfileTest, RateLimited) {
  using GuardedStatus = Profile::Sample::GuardedStatus;

  ScopedGuardedSamplingRate sgsr(1);
  ScopedProfileSamplingRate spsr(1);
  auto token = MallocExtension::StartAllocationProfiling();

  // Keep allocating until something is sampled
  constexpr size_t alloc_size = 1033;
  bool guarded_found = false;
  bool unguarded_found = false;
  AllocateUntil(alloc_size, [&](void* alloc) -> NextSteps {
    // Unsampled allocations tell us nothing: free them and keep going.
    if (!IsSampledMemory(alloc)) {
      return {false, true};
    }
    if (Static::guardedpage_allocator().PointerIsMine(alloc)) {
      guarded_found = true;
    } else {
      unguarded_found = true;
    }
    // Stop once both flavours of sampled allocation have been observed.
    return {guarded_found && unguarded_found, true};
  });

  // Ensure Guarded and RateLimited both occur for the alloc_size
  bool success_found = false;
  bool ratelimited_found = false;
  auto profile = std::move(token).Stop();
  ExamineSamples(profile, GuardedStatus::RateLimited,
                 {GuardedStatus::Guarded}, [&](const Profile::Sample& s) {
                   // Only samples of the size requested above count.
                   if (s.requested_size != alloc_size) {
                     return;
                   }
                   if (s.guarded_status == GuardedStatus::Guarded) {
                     success_found = true;
                   } else if (s.guarded_status ==
                              GuardedStatus::RateLimited) {
                     ratelimited_found = true;
                   }
                 });
  EXPECT_TRUE(success_found);
  EXPECT_TRUE(ratelimited_found);
}
|
||||
|
||||
// A zero-byte allocation must be reported as TooSmall. RateLimited and
// Guarded samples are tolerated, but a Guarded sample must not carry the
// zero request size and a TooSmall sample must.
TEST_F(GuardedPageAllocatorProfileTest, TooSmall) {
  using GuardedStatus = Profile::Sample::GuardedStatus;

  ScopedAlwaysSample sas;
  AllocateUntilGuarded();
  auto token = MallocExtension::StartAllocationProfiling();

  // Next sampled allocation should be too small
  constexpr size_t alloc_size = 0;
  AllocateUntil(alloc_size, [&](void* alloc) -> NextSteps {
    return NextSteps{/*stop=*/true, /*free=*/true};
  });

  auto profile = std::move(token).Stop();
  ExamineSamples(profile, GuardedStatus::TooSmall,
                 {GuardedStatus::RateLimited, GuardedStatus::Guarded},
                 [&](const Profile::Sample& s) {
                   if (s.guarded_status == GuardedStatus::Guarded) {
                     EXPECT_NE(alloc_size, s.requested_size);
                   } else if (s.guarded_status == GuardedStatus::TooSmall) {
                     EXPECT_EQ(alloc_size, s.requested_size);
                   }
                 });
}
|
||||
|
||||
// Holds guarded allocations until GetNumAvailablePages() reports zero, then
// checks that one further allocation is reported as NoAvailableSlots.
TEST_F(GuardedPageAllocatorProfileTest, NoAvailableSlots) {
  ScopedAlwaysSample sas;
  AllocateUntilGuarded();

  // AllocateUntil obtains memory with ::operator new(size), so the retained
  // pointers must be released with ::operator delete. The default deleter of
  // std::unique_ptr<char> would instead apply a delete-expression to a char*,
  // which does not match how the memory was obtained.
  struct OperatorDelete {
    void operator()(void* p) const { ::operator delete(p); }
  };
  std::vector<std::unique_ptr<void, OperatorDelete>> allocs;

  // Guard until there are no slots available.
  AllocateUntil(1039, [&](void* alloc) -> NextSteps {
    if (Static::guardedpage_allocator().PointerIsMine(alloc)) {
      // Keep the guarded allocation alive; `allocs` now owns it, so tell
      // AllocateUntil not to free it.
      allocs.emplace_back(alloc);
      return {Static::guardedpage_allocator().GetNumAvailablePages() == 0,
              false};
    }
    return {false, true};
  });

  auto token = MallocExtension::StartAllocationProfiling();
  // This should fail for lack of slots
  constexpr size_t alloc_size = 1055;
  AllocateUntil(alloc_size, [&](void* alloc) -> NextSteps {
    return {true, true};
  });

  auto profile = std::move(token).Stop();
  ExamineSamples(profile, Profile::Sample::GuardedStatus::NoAvailableSlots, {});
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
275
src/third_party/tcmalloc/dist/tcmalloc/guarded_page_allocator_test.cc
vendored
Normal file
@ -0,0 +1,275 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "tcmalloc/guarded_page_allocator.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <thread> // NOLINT(build/c++11)
|
||||
#include <vector>
|
||||
|
||||
#include "gmock/gmock.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "absl/base/attributes.h"
|
||||
#include "absl/base/casts.h"
|
||||
#include "absl/base/internal/spinlock.h"
|
||||
#include "absl/base/internal/sysinfo.h"
|
||||
#include "absl/container/flat_hash_set.h"
|
||||
#include "absl/memory/memory.h"
|
||||
#include "absl/numeric/bits.h"
|
||||
#include "absl/time/clock.h"
|
||||
#include "absl/time/time.h"
|
||||
#include "tcmalloc/common.h"
|
||||
#include "tcmalloc/internal/logging.h"
|
||||
#include "tcmalloc/internal/page_size.h"
|
||||
#include "tcmalloc/malloc_extension.h"
|
||||
#include "tcmalloc/static_vars.h"
|
||||
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
namespace {
|
||||
|
||||
static constexpr size_t kMaxGpaPages = GuardedPageAllocator::kGpaMaxPages;
|
||||
|
||||
// Size of pages used by GuardedPageAllocator.
|
||||
static size_t PageSize() {
|
||||
static const size_t page_size =
|
||||
std::max(kPageSize, static_cast<size_t>(GetPageSize()));
|
||||
return page_size;
|
||||
}
|
||||
|
||||
class GuardedPageAllocatorTest : public testing::Test {
|
||||
protected:
|
||||
GuardedPageAllocatorTest() {
|
||||
absl::base_internal::SpinLockHolder h(&pageheap_lock);
|
||||
gpa_.Init(kMaxGpaPages, kMaxGpaPages);
|
||||
gpa_.AllowAllocations();
|
||||
}
|
||||
|
||||
explicit GuardedPageAllocatorTest(size_t num_pages) {
|
||||
absl::base_internal::SpinLockHolder h(&pageheap_lock);
|
||||
gpa_.Init(num_pages, kMaxGpaPages);
|
||||
gpa_.AllowAllocations();
|
||||
}
|
||||
|
||||
~GuardedPageAllocatorTest() override { gpa_.Destroy(); }
|
||||
|
||||
GuardedPageAllocator gpa_;
|
||||
};
|
||||
|
||||
class GuardedPageAllocatorParamTest
|
||||
: public GuardedPageAllocatorTest,
|
||||
public testing::WithParamInterface<size_t> {
|
||||
protected:
|
||||
GuardedPageAllocatorParamTest() : GuardedPageAllocatorTest(GetParam()) {}
|
||||
};
|
||||
|
||||
// Allocates a single PageSize()-byte guarded object, verifies that writes just
// outside of it die, then deallocates it and verifies that writes anywhere
// inside the former object die too.
TEST_F(GuardedPageAllocatorTest, SingleAllocDealloc) {
  const size_t page_bytes = PageSize();

  auto result = gpa_.Allocate(page_bytes, 0);
  EXPECT_EQ(result.status, Profile::Sample::GuardedStatus::Guarded);

  char* mem = static_cast<char*>(result.alloc);
  EXPECT_NE(mem, nullptr);
  EXPECT_TRUE(gpa_.PointerIsMine(mem));

  // The whole object must be writable while it is live...
  memset(mem, 'A', page_bytes);
  // ...but one byte before and one byte past it must not be.
  EXPECT_DEATH(mem[-1] = 'A', "");
  EXPECT_DEATH(mem[page_bytes] = 'A', "");

  gpa_.Deallocate(mem);

  // After deallocation, the first, middle and last byte are all off limits.
  EXPECT_DEATH(mem[0] = 'B', "");
  EXPECT_DEATH(mem[page_bytes / 2] = 'B', "");
  EXPECT_DEATH(mem[page_bytes - 1] = 'B', "");
}
|
||||
|
||||
// Verifies the alignment of guarded allocations made with an alignment
// argument of 0: every returned pointer must be aligned to at least
// min(size, kLargeObjectAlignment).
TEST_F(GuardedPageAllocatorTest, NoAlignmentProvided) {
  constexpr size_t kLargeObjectAlignment =
      std::max(static_cast<size_t>(kAlignment),
               static_cast<size_t>(__STDCPP_DEFAULT_NEW_ALIGNMENT__));

  // Exercise each power of two up to 64 and the size just above it.
  for (size_t base_size = 1; base_size <= 64; base_size <<= 1) {
    for (size_t size : {base_size, base_size + 1}) {
      SCOPED_TRACE(size);

      constexpr int kElements = 10;
      std::array<void*, kElements> ptrs;

      // Make several allocation attempts to encounter left/right-alignment in
      // the guarded region.
      for (int i = 0; i < kElements; i++) {
        auto alloc_with_status = gpa_.Allocate(size, 0);
        EXPECT_EQ(alloc_with_status.status,
                  Profile::Sample::GuardedStatus::Guarded);
        ptrs[i] = alloc_with_status.alloc;
        EXPECT_NE(ptrs[i], nullptr);
        EXPECT_TRUE(gpa_.PointerIsMine(ptrs[i]));

        // Shift in size_t rather than int: a pointer with 31 or more trailing
        // zero bits would otherwise overflow the int-typed `1 << n`.
        const size_t observed_alignment =
            size_t{1} << absl::countr_zero(absl::bit_cast<uintptr_t>(ptrs[i]));
        EXPECT_GE(observed_alignment, std::min(size, kLargeObjectAlignment));
      }

      for (void* ptr : ptrs) {
        gpa_.Deallocate(ptr);
      }
    }
  }
}
|
||||
|
||||
// For every power-of-two alignment up to PageSize(), allocates one byte with
// that alignment, checks that the returned pointer honors it, and returns the
// allocation to the allocator again.
TEST_F(GuardedPageAllocatorTest, AllocDeallocAligned) {
  for (size_t align = 1; align <= PageSize(); align <<= 1) {
    constexpr size_t alloc_size = 1;
    auto alloc_with_status = gpa_.Allocate(alloc_size, align);
    EXPECT_EQ(alloc_with_status.status,
              Profile::Sample::GuardedStatus::Guarded);
    EXPECT_NE(alloc_with_status.alloc, nullptr);
    EXPECT_TRUE(gpa_.PointerIsMine(alloc_with_status.alloc));
    EXPECT_EQ(reinterpret_cast<uintptr_t>(alloc_with_status.alloc) % align, 0);

    // Give the slot back so the test performs the "Dealloc" half its name
    // promises instead of holding every slot until the fixture is destroyed.
    // Skipped on a failed allocation, which is already reported above.
    if (alloc_with_status.alloc != nullptr) {
      gpa_.Deallocate(alloc_with_status.alloc);
    }
  }
}
|
||||
|
||||
// Fills every page the fixture was configured with, checks that one more
// allocation is refused with NoAvailableSlots, and then checks that freeing a
// single allocation makes exactly that capacity available again.
TEST_P(GuardedPageAllocatorParamTest, AllocDeallocAllPages) {
  const size_t num_pages = GetParam();
  char* bufs[kMaxGpaPages];

  // Allocates one byte and expects a guarded, non-null pointer owned by gpa_.
  auto allocate_guarded = [this]() -> char* {
    auto result = gpa_.Allocate(1, 0);
    EXPECT_EQ(result.status, Profile::Sample::GuardedStatus::Guarded);
    char* ptr = reinterpret_cast<char*>(result.alloc);
    EXPECT_NE(ptr, nullptr);
    EXPECT_TRUE(gpa_.PointerIsMine(ptr));
    return ptr;
  };

  for (size_t i = 0; i < num_pages; i++) {
    bufs[i] = allocate_guarded();
  }

  // Everything is in use now, so the next request must be turned down.
  auto exhausted = gpa_.Allocate(1, 0);
  EXPECT_EQ(exhausted.status,
            Profile::Sample::GuardedStatus::NoAvailableSlots);
  EXPECT_EQ(exhausted.alloc, nullptr);

  // Releasing one allocation must allow exactly one more to succeed.
  gpa_.Deallocate(bufs[0]);
  bufs[0] = allocate_guarded();

  // Every live allocation must be writable before it is released.
  for (size_t i = 0; i < num_pages; i++) {
    bufs[i][0] = 'A';
    gpa_.Deallocate(bufs[i]);
  }
}
|
||||
// Runs the parameterized tests with a single page, half of the maximum and the
// maximum number of pages.
INSTANTIATE_TEST_SUITE_P(VaryNumPages, GuardedPageAllocatorParamTest,
                         testing::Values(1,                 // smallest
                                         kMaxGpaPages / 2,  // half
                                         kMaxGpaPages));    // largest
|
||||
|
||||
// PointerIsMine() must accept a pointer handed out by gpa_ and reject both a
// stack address and a pointer obtained from absl::make_unique.
TEST_F(GuardedPageAllocatorTest, PointerIsMine) {
  auto result = gpa_.Allocate(1, 0);
  EXPECT_EQ(result.status, Profile::Sample::GuardedStatus::Guarded);
  void* guarded = result.alloc;

  int on_stack;  // Only its address is used.
  auto on_heap = absl::make_unique<char>();

  EXPECT_TRUE(gpa_.PointerIsMine(guarded));
  EXPECT_FALSE(gpa_.PointerIsMine(&on_stack));
  EXPECT_FALSE(gpa_.PointerIsMine(on_heap.get()));
}
|
||||
|
||||
// Print() must write the "GWP-ASan Status" banner into the supplied Printer.
TEST_F(GuardedPageAllocatorTest, Print) {
  char text[1024] = {};
  Printer printer(text, sizeof(text));
  gpa_.Print(&printer);
  EXPECT_THAT(text, testing::ContainsRegex("GWP-ASan Status"));
}
|
||||
|
||||
// Test that no pages are double-allocated or left unallocated, and that no
|
||||
// extra pages are allocated when there's concurrent calls to Allocate().
|
||||
TEST_F(GuardedPageAllocatorTest, ThreadedAllocCount) {
  constexpr size_t kNumThreads = 2;
  void* allocations[kNumThreads][kMaxGpaPages];

  {
    std::vector<std::thread> workers;
    workers.reserve(kNumThreads);
    for (size_t i = 0; i < kNumThreads; i++) {
      // Each worker requests kMaxGpaPages allocations and records every
      // result (including failures) in its own row.
      workers.emplace_back([this, &allocations, i]() {
        for (size_t j = 0; j < kMaxGpaPages; j++) {
          allocations[i][j] = gpa_.Allocate(1, 0).alloc;
        }
      });
    }
    for (auto& worker : workers) {
      worker.join();
    }
  }

  // Collect the distinct pointers returned across all workers.
  absl::flat_hash_set<void*> unique_allocations;
  for (auto& row : allocations) {
    for (void* ptr : row) {
      unique_allocations.insert(ptr);
    }
  }
  // Failed requests were stored as nullptr; they do not count.
  unique_allocations.erase(nullptr);
  EXPECT_EQ(unique_allocations.size(), kMaxGpaPages);
}
|
||||
|
||||
// Test that allocator remains in consistent state under high contention and
|
||||
// doesn't double-allocate pages or fail to deallocate pages.
|
||||
TEST_F(GuardedPageAllocatorTest, ThreadedHighContention) {
  const size_t kNumThreads = 4 * absl::base_internal::NumCPUs();
  {
    std::vector<std::thread> workers;
    workers.reserve(kNumThreads);
    for (size_t i = 0; i < kNumThreads; i++) {
      workers.emplace_back([this]() {
        // Retry, with a short pause in between, until a guarded allocation
        // is obtained.
        char* page = nullptr;
        for (;;) {
          auto attempt = gpa_.Allocate(1, 0);
          if (attempt.status != Profile::Sample::GuardedStatus::Guarded) {
            absl::SleepFor(absl::Nanoseconds(5000));
            continue;
          }
          page = reinterpret_cast<char*>(attempt.alloc);
          EXPECT_NE(page, nullptr);
          break;
        }

        // Verify that no other thread has access to this page.
        EXPECT_EQ(page[0], 0);

        // Mark this page and allow some time for another thread to potentially
        // gain access to this page.
        page[0] = 'A';
        absl::SleepFor(absl::Nanoseconds(5000));

        // Unmark this page and deallocate.
        page[0] = 0;
        gpa_.Deallocate(page);
      });
    }

    for (auto& worker : workers) {
      worker.join();
    }
  }

  // Verify all pages have been deallocated now that all threads are done.
  for (size_t i = 0; i < kMaxGpaPages; i++) {
    auto result = gpa_.Allocate(1, 0);
    EXPECT_EQ(result.status, Profile::Sample::GuardedStatus::Guarded);
    EXPECT_NE(result.alloc, nullptr);
  }
}
|
||||
|
||||
// Compile-time check only: declaring an otherwise unused instance with
// ABSL_CONST_INIT requires GuardedPageAllocator to be constant-initializable.
ABSL_CONST_INIT ABSL_ATTRIBUTE_UNUSED
    GuardedPageAllocator gpa_is_constant_initializable;
|
||||
|
||||
} // namespace
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
239
src/third_party/tcmalloc/dist/tcmalloc/heap_profiling_test.cc
vendored
Normal file
@ -0,0 +1,239 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <sys/mman.h>
|
||||
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
||||
#include "tcmalloc/internal/profile.pb.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "absl/base/attributes.h"
|
||||
#include "absl/base/const_init.h"
|
||||
#include "absl/base/internal/low_level_alloc.h"
|
||||
#include "absl/base/internal/spinlock.h"
|
||||
#include "absl/base/thread_annotations.h"
|
||||
#include "absl/container/flat_hash_set.h"
|
||||
#include "absl/status/statusor.h"
|
||||
#include "absl/strings/str_format.h"
|
||||
#include "absl/time/clock.h"
|
||||
#include "absl/time/time.h"
|
||||
#include "tcmalloc/internal/logging.h"
|
||||
#include "tcmalloc/internal/profile_builder.h"
|
||||
#include "tcmalloc/malloc_extension.h"
|
||||
#include "tcmalloc/sampled_allocation.h"
|
||||
#include "tcmalloc/static_vars.h"
|
||||
#include "tcmalloc/testing/test_allocator_harness.h"
|
||||
#include "tcmalloc/testing/thread_manager.h"
|
||||
|
||||
namespace tcmalloc {
|
||||
namespace {
|
||||
|
||||
// Fixture for the parameterized heap profiling tests. The int64_t parameter is
// handed to ScopedProfileSamplingRate by the tests that use it.
class HeapProfilingTest : public ::testing::TestWithParam<int64_t> {
};
|
||||
|
||||
// Verify that heap profiling sessions concurrent with allocations/deallocations
|
||||
// do not crash, as they all use `tc_globals.sampled_allocation_recorder_`. Also
|
||||
// check that the data in the sample make sense. Here the
|
||||
// allocations/deallocations can happen on the same thread or the object is
|
||||
// allocated in one thread, transferred to another thread and deleted there.
|
||||
TEST_P(HeapProfilingTest, GetHeapProfileWhileAllocAndDealloc) {
  ScopedProfileSamplingRate s(GetParam());
  const int kThreads = 10;
  ThreadManager manager;
  AllocatorHarness harness(kThreads);

  // Some threads are busy with allocating and deallocating.
  manager.Start(kThreads, [&](int thread_id) { harness.Run(thread_id); });

  absl::Time start = absl::Now();
  // Another few threads busy with iterating different kinds of heap profiles.
  for (auto t : {
           ProfileType::kHeap,
           ProfileType::kFragmentation,
           ProfileType::kPeakHeap,
       }) {
    // `t` is captured by value. The callback is handed to `manager`, which is
    // only stopped after this loop, so it is presumably still being invoked
    // once the current iteration (and with it the loop variable) has ended; a
    // by-reference capture would then dangle. `start` outlives Stop() and can
    // stay a reference.
    manager.Start(2, [&start, t](int) {
      MallocExtension::SnapshotCurrent(t).Iterate(
          [&](const Profile::Sample& sample) {
            // Inspect a few fields in the sample.
            EXPECT_GE(sample.sum, 0);
            EXPECT_GT(sample.depth, 0);
            EXPECT_GT(sample.requested_size, 0);
            EXPECT_GT(sample.allocated_size, 0);
            EXPECT_GT(sample.allocation_time, start - absl::Seconds(10));
            EXPECT_LT(sample.allocation_time, start + absl::Seconds(10));
          });
    });
  }

  // Let everything run concurrently for a while before shutting down.
  absl::SleepFor(absl::Seconds(1));
  manager.Stop();
}
|
||||
|
||||
// Test at different sampling rates, from always sampling to lower sampling
|
||||
// probabilities. This is stress testing and attempts to expose potential
|
||||
// failure modes when we only have sampled allocations and when we have a mix of
|
||||
// sampled/unsampled allocations.
|
||||
INSTANTIATE_TEST_SUITE_P(SamplingRates, HeapProfilingTest,
                         testing::Values(1,         // always sample
                                         1 << 7,    //
                                         1 << 14,   //
                                         1 << 21),  //
                         testing::PrintToStringParamName());
|
||||
|
||||
// Allocates batches of large objects of two different sizes, one batch at a
// time, and checks that a heap profile taken while a batch is live contains
// samples of that batch's requested size and none of the other size.
TEST(HeapProfilingTest, AllocateDifferentSizes) {
  const int num_allocations = 1000;
  const size_t requested_size1 = (1 << 19) + 1;
  const size_t requested_size2 = (1 << 20) + 1;
  int requested_size1_count = 0;
  int requested_size2_count = 0;

  // Adds the number of heap samples of each requested size to the counters.
  auto count_heap_samples = [&] {
    MallocExtension::SnapshotCurrent(ProfileType::kHeap)
        .Iterate([&](const Profile::Sample& sample) {
          if (sample.requested_size == requested_size1) {
            requested_size1_count++;
          }
          if (sample.requested_size == requested_size2) {
            requested_size2_count++;
          }
        });
  };

  // First batch: objects of the first size.
  void* allocations1[num_allocations];
  for (void*& slot : allocations1) {
    slot = ::operator new(requested_size1);
  }

  count_heap_samples();
  EXPECT_GT(requested_size1_count, 0);
  EXPECT_EQ(requested_size2_count, 0);
  // Only the first counter is reset before the second pass.
  requested_size1_count = 0;

  for (void* ptr : allocations1) {
    ::operator delete(ptr);
  }

  // Second batch: objects of the second size.
  void* allocations2[num_allocations];
  for (void*& slot : allocations2) {
    slot = ::operator new(requested_size2);
  }

  count_heap_samples();
  EXPECT_EQ(requested_size1_count, 0);
  EXPECT_GT(requested_size2_count, 0);

  for (void* ptr : allocations2) {
    ::operator delete(ptr);
  }
}
|
||||
|
||||
// With every allocation sampled, allocates a batch of large objects, makes
// them resident (mlock, or touching every byte when mlock fails) and checks
// that the "sampled_resident_bytes" labels of the converted heap profile add
// up to between one and two times the requested bytes.
TEST(HeapProfilingTest, CheckResidency) {
  ScopedProfileSamplingRate s(1);
  const int num_allocations = 1000;
  const size_t requested_size = (1 << 19) + 1;

  void* allocations[num_allocations];
  for (int i = 0; i < num_allocations; i++) {
    allocations[i] = ::operator new(requested_size);
  }

  bool mlock_failure = false;
  for (int i = 0; i < num_allocations; i++) {
    if (::mlock(allocations[i], requested_size) != 0) {
      mlock_failure = true;
      // Fall back to writing every byte. The index is a size_t so that it is
      // compared against `requested_size` without a signed/unsigned mix.
      for (size_t j = 0; j < requested_size; ++j) {
        static_cast<volatile char*>(allocations[i])[j] = 0x20;
      }
    }
  }
  if (mlock_failure) {
    absl::FPrintF(
        stderr,
        "one or more mlocks failed, which could cause test flakiness\n");
  }

  // Collect the heap profile and look for residency info.
  auto converted_or = tcmalloc_internal::MakeProfileProto(
      MallocExtension::SnapshotCurrent(ProfileType::kHeap));
  ASSERT_TRUE(converted_or.ok());
  const auto& converted = **converted_or;

  // Look for "sampled_resident_bytes" string in string table.
  std::optional<int> sampled_resident_bytes_id;
  for (int i = 0, n = converted.string_table().size(); i < n; ++i) {
    if (converted.string_table(i) == "sampled_resident_bytes") {
      sampled_resident_bytes_id = i;
    }
  }
  ASSERT_TRUE(sampled_resident_bytes_id.has_value());

  // Sum the value of every label keyed by that string id.
  size_t resident_size = 0;
  for (const auto& sample : converted.sample()) {
    for (const auto& label : sample.label()) {
      if (label.key() == sampled_resident_bytes_id) {
        resident_size += label.num();
      }
    }
  }

  EXPECT_GE(resident_size, num_allocations * requested_size);
  EXPECT_LE(resident_size, num_allocations * requested_size * 2);

  for (int i = 0; i < num_allocations; i++) {
    // throw away the error
    ::munlock(allocations[i], requested_size);
  }
  for (int i = 0; i < num_allocations; i++) {
    ::operator delete(allocations[i]);
  }
}
|
||||
|
||||
// Make sure users can allocate when iterating over the heap samples. For now
|
||||
// `MallocExtension::SnapshotCurrent()` uses `StackTraceTable` to make a copy of
|
||||
// the sampled allocations from `tc_globals.sampled_allocation_recorder()` and
|
||||
// then iterate from the `StackTraceTable`. Ideally, we would want to avoid the
|
||||
// extra copy and iterate over sampled allocations directly. However, this would
|
||||
// result in deadlocks for the test case below. If we `Iterate()` directly on
|
||||
// `tc_globals.sampled_allocation_recorder()`, we hold the per-sample lock. As
|
||||
// we add data to a hashtable that stores allocations (always sampled here), the
|
||||
// hashtable can decide to `resize()`, deallocates the same sampled allocation
|
||||
// it is iterating at, wants to get the per-sample lock and ends up with a
|
||||
// deadlock. At the current state, making copies over sampled allocations and
|
||||
// iterate over those copies would not deadlock and the test case below passes.
|
||||
TEST(HeapProfilingTest, AllocateWhileIterating) {
  ScopedProfileSamplingRate s(1);
  absl::flat_hash_set<void*> live_objects;
  // This fills up the slots in hashtable and so there is a good chance it would
  // call `resize()` when inserting new entries later. This makes it easier for
  // the deadlock to happen (>95% of the cases when directly iterating over
  // `tc_globals.sampled_allocation_recorder()`).
  live_objects.reserve(1);
  live_objects.insert(::operator new(100));

  // Allocate one more object per visited sample, three snapshots in a row.
  for (int round = 0; round < 3; round++) {
    auto snapshot = MallocExtension::SnapshotCurrent(ProfileType::kHeap);
    snapshot.Iterate([&](const Profile::Sample& /*sample*/) {
      live_objects.insert(::operator new(100));
    });
  }

  for (void* object : live_objects) {
    ::operator delete(object);
  }
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tcmalloc
|
||||
126
src/third_party/tcmalloc/dist/tcmalloc/hinted_tracker_lists.h
vendored
Normal file
@ -0,0 +1,126 @@
|
||||
// Copyright 2022 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef TCMALLOC_HINTED_TRACKER_LIST_H_
|
||||
#define TCMALLOC_HINTED_TRACKER_LIST_H_
|
||||
|
||||
#include "tcmalloc/internal/linked_list.h"
|
||||
#include "tcmalloc/internal/range_tracker.h"
|
||||
|
||||
GOOGLE_MALLOC_SECTION_BEGIN
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
|
||||
// This class wraps an array of N TrackerLists and a Bitmap storing which
|
||||
// elements are non-empty.
|
||||
template <class TrackerType, size_t N>
|
||||
class HintedTrackerLists {
|
||||
public:
|
||||
using TrackerList = TList<TrackerType>;
|
||||
|
||||
constexpr HintedTrackerLists() : size_{} {}
|
||||
|
||||
// Removes a TrackerType from the first non-empty freelist with index at
|
||||
// least n and returns it. Returns nullptr if there is none.
|
||||
TrackerType* GetLeast(const size_t n) {
|
||||
ASSERT(n < N);
|
||||
size_t i = nonempty_.FindSet(n);
|
||||
if (i == N) {
|
||||
return nullptr;
|
||||
}
|
||||
ASSERT(!lists_[i].empty());
|
||||
TrackerType* pt = lists_[i].first();
|
||||
if (lists_[i].remove(pt)) {
|
||||
nonempty_.ClearBit(i);
|
||||
}
|
||||
--size_;
|
||||
return pt;
|
||||
}
|
||||
|
||||
// Returns a pointer to the TrackerType from the first non-empty freelist with
|
||||
// index at least n and returns it. Returns nullptr if there is none.
|
||||
//
|
||||
// Unlike GetLeast, this does not remove the pointer from the list when it is
|
||||
// found.
|
||||
TrackerType* PeekLeast(const size_t n) {
|
||||
ASSERT(n < N);
|
||||
size_t i = nonempty_.FindSet(n);
|
||||
if (i == N) {
|
||||
return nullptr;
|
||||
}
|
||||
ASSERT(!lists_[i].empty());
|
||||
return lists_[i].first();
|
||||
}
|
||||
|
||||
// Adds pointer <pt> to the nonempty_[i] list.
|
||||
// REQUIRES: i < N && pt != nullptr.
|
||||
void Add(TrackerType* pt, const size_t i) {
|
||||
ASSERT(i < N);
|
||||
ASSERT(pt != nullptr);
|
||||
lists_[i].prepend(pt);
|
||||
++size_;
|
||||
nonempty_.SetBit(i);
|
||||
}
|
||||
|
||||
// Removes pointer <pt> from the nonempty_[i] list.
|
||||
// REQUIRES: i < N && pt != nullptr.
|
||||
void Remove(TrackerType* pt, const size_t i) {
|
||||
ASSERT(i < N);
|
||||
ASSERT(pt != nullptr);
|
||||
if (lists_[i].remove(pt)) {
|
||||
nonempty_.ClearBit(i);
|
||||
}
|
||||
--size_;
|
||||
}
|
||||
const TrackerList& operator[](const size_t n) const {
|
||||
ASSERT(n < N);
|
||||
return lists_[n];
|
||||
}
|
||||
size_t size() const { return size_; }
|
||||
bool empty() const { return size_ == 0; }
|
||||
|
||||
// Returns length of the list at an index <n>.
|
||||
// REQUIRES: n < N.
|
||||
size_t SizeOfList(const size_t n) const {
|
||||
ASSERT(n < N);
|
||||
return lists_[n].length();
|
||||
}
|
||||
// Runs a functor on all pointers in the TrackerLists.
|
||||
// This method is const but the Functor gets passed a non-const pointer.
|
||||
// This quirk is inherited from TrackerList.
|
||||
template <typename Functor>
|
||||
void Iter(const Functor& func, size_t start) const {
|
||||
size_t i = nonempty_.FindSet(start);
|
||||
while (i < N) {
|
||||
auto& list = lists_[i];
|
||||
ASSERT(!list.empty());
|
||||
for (TrackerType* pt : list) {
|
||||
func(pt);
|
||||
}
|
||||
i++;
|
||||
if (i < N) i = nonempty_.FindSet(i);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
TrackerList lists_[N];
|
||||
size_t size_;
|
||||
Bitmap<N> nonempty_;
|
||||
};
|
||||
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
GOOGLE_MALLOC_SECTION_END
|
||||
|
||||
#endif // TCMALLOC_HINTED_TRACKER_LIST_H_
|
||||
374
src/third_party/tcmalloc/dist/tcmalloc/huge_address_map.cc
vendored
Normal file
@ -0,0 +1,374 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "tcmalloc/huge_address_map.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <new>
|
||||
|
||||
#include "absl/base/internal/cycleclock.h"
|
||||
#include "tcmalloc/internal/logging.h"
|
||||
|
||||
GOOGLE_MALLOC_SECTION_BEGIN
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
|
||||
// Returns the in-order successor of this node, or nullptr if there is none.
const HugeAddressMap::Node* HugeAddressMap::Node::next() const {
  // With a right subtree, the successor is that subtree's leftmost node.
  if (right_ != nullptr) {
    const Node* leftmost = right_;
    while (leftmost->left_ != nullptr) leftmost = leftmost->left_;
    return leftmost;
  }

  // Otherwise climb until we step up out of some ancestor's left subtree;
  // that ancestor is the successor.
  const Node* child = this;
  for (const Node* up = parent_; up != nullptr; up = up->parent_) {
    if (up->left_ == child) return up;
    child = up;
  }

  return nullptr;
}
|
||||
|
||||
// Non-const overload: forwards to the const implementation and strips the
// constness from its result.
HugeAddressMap::Node* HugeAddressMap::Node::next() {
  const Node& self = *this;
  return const_cast<Node*>(self.next());
}
|
||||
|
||||
void HugeAddressMap::Node::Check(size_t* num_nodes, HugeLength* size) const {
|
||||
HugeLength longest = range_.len();
|
||||
*num_nodes += 1;
|
||||
*size += range_.len();
|
||||
|
||||
if (left_) {
|
||||
// tree
|
||||
CHECK_CONDITION(left_->range_.start() < range_.start());
|
||||
// disjoint
|
||||
CHECK_CONDITION(left_->range_.end_addr() < range_.start_addr());
|
||||
// well-formed
|
||||
CHECK_CONDITION(left_->parent_ == this);
|
||||
// heap
|
||||
CHECK_CONDITION(left_->prio_ <= prio_);
|
||||
left_->Check(num_nodes, size);
|
||||
if (left_->longest_ > longest) longest = left_->longest_;
|
||||
}
|
||||
|
||||
if (right_) {
|
||||
// tree
|
||||
CHECK_CONDITION(right_->range_.start() > range_.start());
|
||||
// disjoint
|
||||
CHECK_CONDITION(right_->range_.start_addr() > range_.end_addr());
|
||||
// well-formed
|
||||
CHECK_CONDITION(right_->parent_ == this);
|
||||
// heap
|
||||
CHECK_CONDITION(right_->prio_ <= prio_);
|
||||
right_->Check(num_nodes, size);
|
||||
if (right_->longest_ > longest) longest = right_->longest_;
|
||||
}
|
||||
|
||||
CHECK_CONDITION(longest_ == longest);
|
||||
}
|
||||
|
||||
// Returns the leftmost node of the tree, or nullptr when the tree is empty.
const HugeAddressMap::Node* HugeAddressMap::first() const {
  const Node* n = root();
  if (n == nullptr) return nullptr;

  // Keep descending through left children until there is none.
  while (n->left_ != nullptr) {
    n = n->left_;
  }
  return n;
}
|
||||
|
||||
// Non-const overload: forwards to the const implementation and strips the
// constness from its result.
HugeAddressMap::Node* HugeAddressMap::first() {
  const HugeAddressMap& self = *this;
  return const_cast<Node*>(self.first());
}
|
||||
|
||||
void HugeAddressMap::Check() {
|
||||
size_t nodes = 0;
|
||||
HugeLength size = NHugePages(0);
|
||||
if (root_) {
|
||||
CHECK_CONDITION(root_->parent_ == nullptr);
|
||||
root_->Check(&nodes, &size);
|
||||
}
|
||||
CHECK_CONDITION(nodes == nranges());
|
||||
CHECK_CONDITION(size == total_mapped());
|
||||
CHECK_CONDITION(total_nodes_ == used_nodes_ + freelist_size_);
|
||||
}
|
||||
|
||||
// Returns used_nodes_.
size_t HugeAddressMap::nranges() const {
  return used_nodes_;
}
|
||||
|
||||
// Returns total_size_, the running total maintained by Insert() and Remove().
HugeLength HugeAddressMap::total_mapped() const {
  return total_size_;
}
|
||||
|
||||
// Writes two summary lines to `out`: node usage of the treap and the longest_
// value stored at the root (0 for an empty map), in hugepages.
void HugeAddressMap::Print(Printer* out) const {
  out->printf("HugeAddressMap: treap %zu / %zu nodes used / created\n",
              used_nodes_, total_nodes_);

  size_t longest = 0;
  if (root_) {
    longest = root_->longest_.raw_num();
  }
  out->printf("HugeAddressMap: %zu contiguous hugepages available\n", longest);
}
|
||||
|
||||
// Emits the same statistics as Print() as integer fields of `hpaa`; the
// root's longest_ value is reported in bytes (0 for an empty map).
void HugeAddressMap::PrintInPbtxt(PbtxtRegion* hpaa) const {
  hpaa->PrintI64("num_huge_address_map_treap_nodes_used", used_nodes_);
  hpaa->PrintI64("num_huge_address_map_treap_nodes_created", total_nodes_);

  size_t longest = 0;
  if (root_) {
    longest = root_->longest_.in_bytes();
  }
  hpaa->PrintI64("contiguous_free_bytes", longest);
}
|
||||
|
||||
// Returns the node whose range contains `p` if there is one; otherwise the
// last node passed on the search path whose range starts at or before `p`,
// or nullptr when every visited range starts after `p`.
HugeAddressMap::Node* HugeAddressMap::Predecessor(HugePage p) {
  Node* best = nullptr;
  for (Node* n = root(); n != nullptr;) {
    HugeRange here = n->range_;
    if (here.contains(p)) return n;

    if (p < here.start()) {
      // p comes before here:
      // our predecessor isn't here, nor in the right subtree.
      n = n->left_;
      continue;
    }

    // p comes after here:
    // here is a valid candidate, and the right subtree might have better.
    best = n;
    n = n->right_;
  }

  return best;
}
|
||||
|
||||
// Joins range `r` with the node before it (`b`), the node after it (`a`), or
// both. Exactly one of `b` / `a` may be nullptr. The surviving node's
// timestamp becomes a length-weighted average of the merged timestamps, with
// `r` stamped with the current cycle clock value.
void HugeAddressMap::Merge(Node* b, HugeRange r, Node* a) {
  // Length-weighted average of two timestamps.
  auto merge_when = [](HugeRange x, int64_t x_when, HugeRange y,
                       int64_t y_when) {
    // avoid overflow with floating-point
    const size_t x_len = x.len().raw_num();
    const size_t y_len = y.len().raw_num();
    const double x_weight = static_cast<double>(x_len) * x_when;
    const double y_weight = static_cast<double>(y_len) * y_when;
    const double total_weight = x_weight + y_weight;
    return static_cast<int64_t>(total_weight / (x_len + y_len));
  };

  int64_t when = absl::base_internal::CycleClock::Now();

  // Two way merges are easy.
  if (a == nullptr) {
    // Extend `b` to the right. The timestamp is merged first because it
    // needs the range as it was before the join.
    b->when_ = merge_when(b->range_, b->when(), r, when);
    b->range_ = Join(b->range_, r);
    FixLongest(b);
    return;
  }
  if (b == nullptr) {
    // Extend `a` to the left; same ordering constraint as above.
    a->when_ = merge_when(r, when, a->range_, a->when());
    a->range_ = Join(r, a->range_);
    FixLongest(a);
    return;
  }

  // Three way merge: slightly harder. We must remove one node
  // (arbitrarily picking next).
  HugeRange partial = Join(r, a->range_);
  int64_t partial_when = merge_when(r, when, a->range_, a->when());
  HugeRange full = Join(b->range_, partial);
  int64_t full_when = merge_when(b->range_, b->when(), partial, partial_when);

  // Removing a will reduce total_size_ by that length, but since we're merging
  // we actually don't change lengths at all; undo that.
  total_size_ += a->range_.len();
  Remove(a);

  b->range_ = full;
  b->when_ = full_when;
  FixLongest(b);
}
|
||||
|
||||
// Adds range `r` to the map. `r` must not intersect its neighbors. When `r`
// directly follows and/or directly precedes an existing range, it is merged
// into the neighboring node(s); otherwise a new node is linked into the treap
// at the position dictated by its priority and start page.
void HugeAddressMap::Insert(HugeRange r) {
  total_size_ += r.len();

  // First, try to merge if necessary. Note there are three possibilities:
  // we might need to merge before with r, r with after, or all three together.
  Node* before = Predecessor(r.start());
  CHECK_CONDITION(!before || !before->range_.intersects(r));
  Node* after = before ? before->next() : first();
  CHECK_CONDITION(!after || !after->range_.intersects(r));

  if (before && before->range_.precedes(r)) {
    // Pull `after` into the merge as well when `r` runs right up to it.
    Node* following = (after && r.precedes(after->range_)) ? after : nullptr;
    Merge(before, r, following);
    return;
  }
  if (after && r.precedes(after->range_)) {
    Merge(nullptr, r, after);
    return;
  }
  CHECK_CONDITION(!before || !before->range_.precedes(r));
  CHECK_CONDITION(!after || !r.precedes(after->range_));

  // No merging possible; just add a new node.
  Node* n = Get(r);
  Node* curr = root();
  Node* parent = nullptr;
  Node** link = &root_;

  // Walk down the tree to our correct location: stop at the first node whose
  // priority is below the new node's.
  while (curr != nullptr && curr->prio_ >= n->prio_) {
    // Every node passed here gains `r` in its subtree.
    curr->longest_ = std::max(curr->longest_, r.len());
    parent = curr;
    link = (curr->range_.start() < r.start()) ? &curr->right_ : &curr->left_;
    curr = *link;
  }

  *link = n;
  n->parent_ = parent;
  n->left_ = n->right_ = nullptr;
  n->longest_ = r.len();

  // Nothing was displaced by the new node, so it is a leaf and we are done.
  if (curr == nullptr) return;

  HugePage p = r.start();
  // We need to split the treap at curr into n's children.
  // This will be two treaps: one less than p, one greater, and has
  // a nice recursive structure.
  Node** less = &n->left_;
  Node* lp = n;
  Node** more = &n->right_;
  Node* mp = n;
  while (curr != nullptr) {
    Node* descend;
    if (curr->range_.start() < p) {
      // curr belongs to the "less" side; continue in its right subtree.
      *less = curr;
      curr->parent_ = lp;
      less = &curr->right_;
      lp = curr;
      descend = curr->right_;
    } else {
      // curr belongs to the "more" side; continue in its left subtree.
      *more = curr;
      curr->parent_ = mp;
      more = &curr->left_;
      mp = curr;
      descend = curr->left_;
    }
    curr = descend;
  }
  *more = *less = nullptr;

  // We ripped apart the tree along these two paths--fix longest pointers.
  FixLongest(lp);
  FixLongest(mp);
}
|
||||
|
||||
void HugeAddressMap::Node::FixLongest() {
|
||||
const HugeLength l = left_ ? left_->longest_ : NHugePages(0);
|
||||
const HugeLength r = right_ ? right_->longest_ : NHugePages(0);
|
||||
const HugeLength c = range_.len();
|
||||
const HugeLength new_longest = std::max({l, r, c});
|
||||
longest_ = new_longest;
|
||||
}
|
||||
|
||||
// Repairs longest_ on n and then on every ancestor reached through parent_,
// bottom-up. A null n is a no-op.
void HugeAddressMap::FixLongest(HugeAddressMap::Node* n) {
  for (Node* cur = n; cur != nullptr; cur = cur->parent_) {
    cur->FixLongest();
  }
}
|
||||
|
||||
// Unlinks n from the treap: its two subtrees are merged into a single treap
// that takes n's place under n's former parent, and n is handed back to the
// freelist via Put().
void HugeAddressMap::Remove(HugeAddressMap::Node* n) {
  total_size_ -= n->range_.len();

  Node* parent = n->parent_;
  Node* top = n->left_;
  Node* bottom = n->right_;

  // Longest range that remains below n once n's own range is discounted.
  const HugeLength left_longest = top ? top->longest_ : NHugePages(0);
  const HugeLength right_longest = bottom ? bottom->longest_ : NHugePages(0);
  const HugeLength child_longest = std::max(left_longest, right_longest);

  // `link` is the pointer slot that currently refers to n.
  Node** link = &root_;
  if (parent != nullptr) {
    // Make n's subtree report only its children, then let the ancestor chain
    // pick up the change before we start relinking.
    n->longest_ = child_longest;
    FixLongest(parent);
    link = (parent->range_.start() > n->range_.start()) ? &parent->left_
                                                        : &parent->right_;
  }

  // Keeps the higher-priority of two (possibly null) subtrees in `hi`.
  // After the call, `lo` non-null implies `hi` non-null.
  const auto promote = [](Node*& hi, Node*& lo) {
    if (lo != nullptr && (hi == nullptr || hi->prio_ < lo->prio_)) {
      Node* const tmp = hi;
      hi = lo;
      lo = tmp;
    }
  };

  promote(top, bottom);
  // Loop invariant: top and bottom are each valid treaps (longest_ included),
  // and everything from `parent` upward has correct longest_ values; only the
  // child slot `*link` is still pending.
  while (bottom != nullptr) {
    *link = top;
    top->parent_ = parent;
    // bottom ends up somewhere beneath top, so top must cover its longest.
    top->longest_ = std::max(top->longest_, bottom->longest_);
    parent = top;

    // Descend on the side of top where bottom's addresses belong.
    const bool go_left = bottom->range_.start() < top->range_.start();
    link = go_left ? &top->left_ : &top->right_;
    top = *link;
    promote(top, bottom);
  }

  // At most one treap is left; hang it in the final slot.
  *link = top;
  if (top != nullptr) {
    top->parent_ = parent;
  }
  Put(n);
}
|
||||
|
||||
// Returns node n to the cache of unused nodes and updates the node counters.
void HugeAddressMap::Put(Node* n) {
  // The freelist is a singly linked stack threaded through left_.
  n->left_ = freelist_;
  freelist_ = n;
  ++freelist_size_;
  --used_nodes_;
}
|
||||
|
||||
// Produces a node holding range r with a priority drawn from rand_r(&seed_).
// Storage comes from the freelist when it is non-empty, otherwise from the
// metadata allocator callback.
HugeAddressMap::Node* HugeAddressMap::Get(HugeRange r) {
  CHECK_CONDITION((freelist_ == nullptr) == (freelist_size_ == 0));
  used_nodes_++;
  const int prio = rand_r(&seed_);

  void* storage;
  if (freelist_size_ == 0) {
    // Nothing cached: ask the metadata allocator for a new node's worth.
    total_nodes_++;
    storage = meta_(sizeof(Node));
  } else {
    // Pop the head of the freelist (linked through left_).
    freelist_size_--;
    Node* head = freelist_;
    freelist_ = head->left_;
    storage = head;
  }
  return new (storage) Node(r, prio);
}
|
||||
|
||||
// Builds a detached node for range r with treap priority prio, stamped with
// the current cycle-clock reading.
//
// The tree links start out null and longest_ starts at the node's own length
// (the correct value for a node with no children), so a node is never
// observable with indeterminate pointers even before the caller links it in.
HugeAddressMap::Node::Node(HugeRange r, int prio)
    : range_(r),
      prio_(prio),
      left_(nullptr),
      right_(nullptr),
      parent_(nullptr),
      longest_(r.len()),
      when_(absl::base_internal::CycleClock::Now()) {}
|
||||
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
GOOGLE_MALLOC_SECTION_END
|
||||
147
src/third_party/tcmalloc/dist/tcmalloc/huge_address_map.h
vendored
Normal file
@ -0,0 +1,147 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef TCMALLOC_HUGE_ADDRESS_MAP_H_
|
||||
#define TCMALLOC_HUGE_ADDRESS_MAP_H_
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "tcmalloc/huge_pages.h"
|
||||
#include "tcmalloc/internal/logging.h"
|
||||
|
||||
GOOGLE_MALLOC_SECTION_BEGIN
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
|
||||
// Maintains a set of disjoint HugeRanges, merging adjacent ranges into one.
|
||||
// Exposes a balanced (somehow) binary tree of free ranges on address,
|
||||
// augmented with the largest range in each subtree (this allows fairly simple
|
||||
// allocation algorithms from the contained ranges).
|
||||
//
|
||||
// This class scales well and is *reasonably* performant, but it is not intended
|
||||
// for use on extremely hot paths.
|
||||
class HugeAddressMap {
|
||||
public:
|
||||
typedef void* (*MetadataAllocFunction)(size_t bytes);
|
||||
explicit constexpr HugeAddressMap(MetadataAllocFunction meta);
|
||||
|
||||
// IMPORTANT: DESTROYING A HUGE ADDRESS MAP DOES NOT MAKE ANY ATTEMPT
|
||||
// AT FREEING ALLOCATED METADATA.
|
||||
~HugeAddressMap() = default;
|
||||
|
||||
class Node {
|
||||
public:
|
||||
// the range stored at this point
|
||||
HugeRange range() const;
|
||||
// Tree structure
|
||||
Node* left();
|
||||
Node* right();
|
||||
// Iterate to the next node in address order
|
||||
const Node* next() const;
|
||||
Node* next();
|
||||
// when were this node's content added (in
|
||||
// absl::base_internal::CycleClock::Now units)?
|
||||
int64_t when() const;
|
||||
|
||||
// What is the length of the longest range in the subtree rooted here?
|
||||
HugeLength longest() const;
|
||||
|
||||
private:
|
||||
Node(HugeRange r, int prio);
|
||||
friend class HugeAddressMap;
|
||||
HugeRange range_;
|
||||
int prio_; // chosen randomly
|
||||
Node *left_, *right_;
|
||||
Node* parent_;
|
||||
HugeLength longest_;
|
||||
int64_t when_;
|
||||
// Expensive, recursive consistency check.
|
||||
// Accumulates node count and range sizes into passed arguments.
|
||||
void Check(size_t* num_nodes, HugeLength* size) const;
|
||||
|
||||
// We've broken longest invariants somehow; fix them here.
|
||||
void FixLongest();
|
||||
};
|
||||
|
||||
// Get root of the tree.
|
||||
Node* root();
|
||||
const Node* root() const;
|
||||
|
||||
// Get lowest-addressed node
|
||||
const Node* first() const;
|
||||
Node* first();
|
||||
|
||||
// Returns the highest-addressed range that does not lie completely
|
||||
// after p (if any).
|
||||
Node* Predecessor(HugePage p);
|
||||
|
||||
// Expensive consistency check.
|
||||
void Check();
|
||||
|
||||
// Statistics
|
||||
size_t nranges() const;
|
||||
HugeLength total_mapped() const;
|
||||
void Print(Printer* out) const;
|
||||
void PrintInPbtxt(PbtxtRegion* hpaa) const;
|
||||
|
||||
// Add <r> to the map, merging with adjacent ranges as needed.
|
||||
void Insert(HugeRange r);
|
||||
|
||||
// Delete n from the map.
|
||||
void Remove(Node* n);
|
||||
|
||||
private:
|
||||
// our tree
|
||||
Node* root_{nullptr};
|
||||
size_t used_nodes_{0};
|
||||
HugeLength total_size_{NHugePages(0)};
|
||||
|
||||
// cache of unused nodes
|
||||
Node* freelist_{nullptr};
|
||||
size_t freelist_size_{0};
|
||||
// How we get more
|
||||
MetadataAllocFunction meta_;
|
||||
Node* Get(HugeRange r);
|
||||
void Put(Node* n);
|
||||
|
||||
size_t total_nodes_{0};
|
||||
|
||||
void Merge(Node* b, HugeRange r, Node* a);
|
||||
void FixLongest(Node* n);
|
||||
// Note that we always use the same seed, currently; this isn't very random.
|
||||
// In practice we're not worried about adversarial input and this works well
|
||||
// enough.
|
||||
unsigned int seed_{0};
|
||||
};
|
||||
|
||||
inline constexpr HugeAddressMap::HugeAddressMap(MetadataAllocFunction meta)
|
||||
: meta_(meta) {}
|
||||
|
||||
inline HugeRange HugeAddressMap::Node::range() const { return range_; }
|
||||
inline HugeAddressMap::Node* HugeAddressMap::Node::left() { return left_; }
|
||||
inline HugeAddressMap::Node* HugeAddressMap::Node::right() { return right_; }
|
||||
|
||||
inline int64_t HugeAddressMap::Node::when() const { return when_; }
|
||||
inline HugeLength HugeAddressMap::Node::longest() const { return longest_; }
|
||||
|
||||
inline HugeAddressMap::Node* HugeAddressMap::root() { return root_; }
|
||||
inline const HugeAddressMap::Node* HugeAddressMap::root() const {
|
||||
return root_;
|
||||
}
|
||||
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
GOOGLE_MALLOC_SECTION_END
|
||||
|
||||
#endif // TCMALLOC_HUGE_ADDRESS_MAP_H_
|
||||
86
src/third_party/tcmalloc/dist/tcmalloc/huge_address_map_test.cc
vendored
Normal file
@ -0,0 +1,86 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "tcmalloc/huge_address_map.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "gmock/gmock.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
namespace {
|
||||
|
||||
class HugeAddressMapTest : public ::testing::Test {
|
||||
protected:
|
||||
HugeAddressMapTest() : map_(MallocMetadata) { metadata_allocs_.clear(); }
|
||||
|
||||
~HugeAddressMapTest() override {
|
||||
for (void* p : metadata_allocs_) {
|
||||
free(p);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<HugeRange> Contents() {
|
||||
std::vector<HugeRange> ret;
|
||||
auto node = map_.first();
|
||||
while (node) {
|
||||
ret.push_back(node->range());
|
||||
node = node->next();
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
HugePage hp(size_t i) { return {i}; }
|
||||
HugeLength hl(size_t i) { return NHugePages(i); }
|
||||
|
||||
HugeAddressMap map_;
|
||||
|
||||
private:
|
||||
static void* MallocMetadata(size_t size) {
|
||||
void* ptr = malloc(size);
|
||||
metadata_allocs_.push_back(ptr);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
static std::vector<void*> metadata_allocs_;
|
||||
};
|
||||
|
||||
std::vector<void*> HugeAddressMapTest::metadata_allocs_;
|
||||
|
||||
// This test verifies that HugeAddressMap merges properly.
|
||||
TEST_F(HugeAddressMapTest, Merging) {
  // Three adjacent single-hugepage ranges covering pages 0, 1 and 2.
  const HugeRange low = HugeRange::Make(hp(0), hl(1));
  const HugeRange middle = HugeRange::Make(hp(1), hl(1));
  const HugeRange high = HugeRange::Make(hp(2), hl(1));
  const HugeRange merged = Join(low, Join(middle, high));

  // A lone range is stored as-is.
  map_.Insert(low);
  map_.Check();
  EXPECT_THAT(Contents(), testing::ElementsAre(low));

  // A non-adjacent range stays separate.
  map_.Insert(high);
  map_.Check();
  EXPECT_THAT(Contents(), testing::ElementsAre(low, high));

  // Filling the gap collapses everything into one range.
  map_.Insert(middle);
  map_.Check();
  EXPECT_THAT(Contents(), testing::ElementsAre(merged));
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
174
src/third_party/tcmalloc/dist/tcmalloc/huge_allocator.cc
vendored
Normal file
@ -0,0 +1,174 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "tcmalloc/huge_allocator.h"
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "tcmalloc/huge_address_map.h"
|
||||
#include "tcmalloc/internal/logging.h"
|
||||
|
||||
GOOGLE_MALLOC_SECTION_BEGIN
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
|
||||
// Writes a human-readable summary to out: a header line, the free map's own
// report, then the requested / in-use / free hugepage counts.
void HugeAllocator::Print(Printer* out) {
  out->printf("HugeAllocator: contiguous, unbacked hugepage(s)\n");
  free_.Print(out);

  const HugeLength unused = from_system_ - in_use_;
  out->printf(
      "HugeAllocator: %zu requested - %zu in use = %zu hugepages free\n",
      from_system_.raw_num(), in_use_.raw_num(), unused.raw_num());
}
|
||||
|
||||
// Emits the free map's report into hpaa, followed by two integer fields:
// hugepages requested from the system and hugepages currently in use.
void HugeAllocator::PrintInPbtxt(PbtxtRegion* hpaa) const {
  free_.PrintInPbtxt(hpaa);

  const auto requested = from_system_.raw_num();
  const auto used = in_use_.raw_num();
  hpaa->PrintI64("num_total_requested_huge_pages", requested);
  hpaa->PrintI64("num_in_use_huge_pages", used);
}
|
||||
|
||||
// Walks a single root-to-leaf path of the free tree looking for a range of at
// least n hugepages, and returns the smallest fitting range seen on that path
// (nullptr if none). Smaller gaps, then shallower nodes, then lower addresses
// are favored, so the result is neither strictly best-fit nor strictly
// lowest-address, but close to both.
HugeAddressMap::Node* HugeAllocator::Find(HugeLength n) {
  HugeAddressMap::Node* best = nullptr;
  HugeAddressMap::Node* curr = free_.root();

  // Only descend while the current subtree can still satisfy the request.
  while (curr && curr->longest() >= n) {
    HugeLength here = curr->range().len();
    if (here >= n && (!best || best->range().len() > here)) {
      best = curr;
    }

    // Either child might hold a better fit, but we only follow one of them
    // rather than searching the whole tree.
    HugeAddressMap::Node* lhs = curr->left();
    HugeAddressMap::Node* rhs = curr->right();
    const bool lhs_fits = lhs && lhs->longest() >= n;
    const bool rhs_fits = rhs && rhs->longest() >= n;

    if (!lhs_fits) {
      // May be null or too small; the loop condition handles both.
      curr = rhs;
      continue;
    }
    if (!rhs_fits) {
      curr = lhs;
      continue;
    }

    // Both subtrees can satisfy the request. Prefer the child whose own range
    // is smaller; on a tie, prefer the one with the smaller subtree maximum
    // (left wins if those tie as well).
    HugeLength lhs_len = lhs->range().len();
    HugeLength rhs_len = rhs->range().len();
    bool take_left;
    if (lhs_len == rhs_len) {
      take_left = lhs->longest() <= rhs->longest();
    } else {
      take_left = lhs_len < rhs_len;
    }
    curr = take_left ? lhs : rhs;
  }
  return best;
}
|
||||
|
||||
void HugeAllocator::CheckFreelist() {
|
||||
free_.Check();
|
||||
size_t num_nodes = free_.nranges();
|
||||
HugeLength n = free_.total_mapped();
|
||||
free_.Check();
|
||||
CHECK_CONDITION(n == from_system_ - in_use_);
|
||||
LargeSpanStats large;
|
||||
AddSpanStats(nullptr, &large, nullptr);
|
||||
CHECK_CONDITION(num_nodes == large.spans);
|
||||
CHECK_CONDITION(n.in_pages() == large.returned_pages);
|
||||
}
|
||||
|
||||
// Requests at least n hugepages from the system allocator callback, aligned
// to kHugePageSize. Returns HugeRange::Nil() if n overflows or the callback
// returns a null pointer; otherwise adds the amount actually granted to
// from_system_ and returns the range covering it.
HugeRange HugeAllocator::AllocateRange(HugeLength n) {
  if (n.overflows()) {
    return HugeRange::Nil();
  }

  auto [base, granted] = allocate_(n.in_bytes(), kHugePageSize);
  if (base == nullptr) {
    // Out of memory.
    return HugeRange::Nil();
  }
  CHECK_CONDITION(base != nullptr);

  // The callback may hand back more than was asked for, but only in whole
  // hugepages.
  CHECK_CONDITION(granted % kHugePageSize == 0);
  HugeLength got = HLFromBytes(granted);
  from_system_ += got;
  return HugeRange::Make(HugePageContaining(base), got);
}
|
||||
|
||||
// Hands out a range of exactly n hugepages. Reuses a free range when Find()
// locates one; otherwise pulls more memory from the system first. Any surplus
// beyond n in the chosen range goes straight back to the free map. Returns an
// invalid range if the system allocation fails.
HugeRange HugeAllocator::Get(HugeLength n) {
  CHECK_CONDITION(n > NHugePages(0));

  HugeAddressMap::Node* node = Find(n);
  if (node == nullptr) {
    // Nothing suitable is free: obtain new memory and route it through
    // Release() so it lands in the free map, then search again.
    HugeRange fresh = AllocateRange(n);
    if (!fresh.valid()) {
      return fresh;
    }
    in_use_ += fresh.len();
    Release(fresh);
    node = Find(n);
    CHECK_CONDITION(node != nullptr);
  }
  in_use_ += n;

  HugeRange found = node->range();
  free_.Remove(node);

  if (found.len() <= n) {
    // Exact fit. No Release() call happens on this path, so run the debug
    // check here ourselves.
    DebugCheckFreelist();
    return found;
  }

  // Keep the first n hugepages and give the tail back.
  HugeLength before = found.len();
  HugeRange extra = HugeRange::Make(found.start() + n, before - n);
  HugeRange result = HugeRange::Make(found.start(), n);
  ASSERT(result.precedes(extra));
  ASSERT(result.len() + extra.len() == before);
  // Release() subtracts extra's length from in_use_; pre-charge it so the
  // net effect on in_use_ is zero.
  in_use_ += extra.len();
  Release(extra);
  return result;
}
|
||||
|
||||
// Marks r as no longer in use and files it in the free map for reuse.
void HugeAllocator::Release(HugeRange r) {
  in_use_ -= r.len();
  free_.Insert(r);
  // Compiled to a no-op when NDEBUG is defined.
  DebugCheckFreelist();
}
|
||||
|
||||
// Folds every free range into the supplied accumulators. For each range,
// `large` (if non-null) gains one span and the range's pages as returned
// pages, and `ages` (if non-null) records the range with its timestamp.
// `small` is not touched.
void HugeAllocator::AddSpanStats(SmallSpanStats* small, LargeSpanStats* large,
                                 PageAgeHistograms* ages) const {
  const HugeAddressMap::Node* node = free_.first();
  while (node != nullptr) {
    HugeLength len = node->range().len();

    if (large != nullptr) {
      large->spans++;
      large->returned_pages += len.in_pages();
    }

    if (ages != nullptr) {
      ages->RecordRange(len.in_pages(), true, node->when());
    }

    node = node->next();
  }
}
|
||||
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
GOOGLE_MALLOC_SECTION_END
|
||||
108
src/third_party/tcmalloc/dist/tcmalloc/huge_allocator.h
vendored
Normal file
@ -0,0 +1,108 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
// Tracking information for the available range of hugepages,
|
||||
// and a basic allocator for unmapped hugepages.
|
||||
#ifndef TCMALLOC_HUGE_ALLOCATOR_H_
|
||||
#define TCMALLOC_HUGE_ALLOCATOR_H_
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#include "tcmalloc/common.h"
|
||||
#include "tcmalloc/huge_address_map.h"
|
||||
#include "tcmalloc/huge_pages.h"
|
||||
#include "tcmalloc/stats.h"
|
||||
#include "tcmalloc/system-alloc.h"
|
||||
|
||||
GOOGLE_MALLOC_SECTION_BEGIN
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
|
||||
// these typedefs allow replacement of tcmalloc::System* for tests.
|
||||
using MemoryAllocFunction = AddressRange (*)(size_t bytes, size_t align);
|
||||
using MetadataAllocFunction = void* (*)(size_t bytes);
|
||||
|
||||
// This tracks available ranges of hugepages and fulfills requests for
|
||||
// usable memory, allocating more from the system as needed. All
|
||||
// hugepages are treated as (and assumed to be) unbacked.
|
||||
class HugeAllocator {
|
||||
public:
|
||||
constexpr HugeAllocator(MemoryAllocFunction allocate,
|
||||
MetadataAllocFunction meta_allocate)
|
||||
: free_(meta_allocate), allocate_(allocate) {}
|
||||
|
||||
// Obtain a range of n unbacked hugepages, distinct from all other
|
||||
// calls to Get (other than those that have been Released.)
|
||||
HugeRange Get(HugeLength n);
|
||||
|
||||
// Returns a range of hugepages for reuse by subsequent Gets().
|
||||
// REQUIRES: <r> is the return value (or a subrange thereof) of a previous
|
||||
// call to Get(); neither <r> nor any overlapping range has been released
|
||||
// since that Get().
|
||||
void Release(HugeRange r);
|
||||
|
||||
// Total memory requested from the system, whether in use or not,
|
||||
HugeLength system() const { return from_system_; }
|
||||
// Unused memory in the allocator.
|
||||
HugeLength size() const { return from_system_ - in_use_; }
|
||||
|
||||
void AddSpanStats(SmallSpanStats* small, LargeSpanStats* large,
|
||||
PageAgeHistograms* ages) const;
|
||||
|
||||
BackingStats stats() const {
|
||||
BackingStats s;
|
||||
s.system_bytes = system().in_bytes();
|
||||
s.free_bytes = 0;
|
||||
s.unmapped_bytes = size().in_bytes();
|
||||
return s;
|
||||
}
|
||||
|
||||
void Print(Printer* out);
|
||||
void PrintInPbtxt(PbtxtRegion* hpaa) const;
|
||||
|
||||
private:
|
||||
// We're constrained in several ways by existing code. Hard requirements:
|
||||
// * no radix tree or similar O(address space) external space tracking
|
||||
// * support sub releasing
|
||||
// * low metadata overhead
|
||||
// * no pre-allocation.
|
||||
// * reasonable space overhead
|
||||
//
|
||||
// We use a treap ordered on addresses to track. This isn't the most
|
||||
// efficient thing ever but we're about to hit 100usec+/hugepage
|
||||
// backing costs if we've gotten this far; the last few bits of performance
|
||||
// don't matter, and most of the simple ideas can't hit all of the above
|
||||
// requirements.
|
||||
HugeAddressMap free_;
|
||||
HugeAddressMap::Node* Find(HugeLength n);
|
||||
|
||||
void CheckFreelist();
|
||||
void DebugCheckFreelist() {
|
||||
#ifndef NDEBUG
|
||||
CheckFreelist();
|
||||
#endif
|
||||
}
|
||||
|
||||
HugeLength from_system_{NHugePages(0)};
|
||||
HugeLength in_use_{NHugePages(0)};
|
||||
|
||||
MemoryAllocFunction allocate_;
|
||||
HugeRange AllocateRange(HugeLength n);
|
||||
};
|
||||
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
GOOGLE_MALLOC_SECTION_END
|
||||
|
||||
#endif // TCMALLOC_HUGE_ALLOCATOR_H_
|
||||
448
src/third_party/tcmalloc/dist/tcmalloc/huge_allocator_test.cc
vendored
Normal file
@ -0,0 +1,448 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "tcmalloc/huge_allocator.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "absl/base/internal/cycleclock.h"
|
||||
#include "absl/random/random.h"
|
||||
#include "absl/time/clock.h"
|
||||
#include "absl/time/time.h"
|
||||
#include "tcmalloc/huge_pages.h"
|
||||
#include "tcmalloc/internal/config.h"
|
||||
#include "tcmalloc/internal/logging.h"
|
||||
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
namespace {
|
||||
|
||||
// Fixture that drives a HugeAllocator against a fake system allocator. The
// bool test parameter selects whether the fake hands back one hugepage more
// than requested.
class HugeAllocatorTest : public testing::TestWithParam<bool> {
 private:
  // Fake system allocator; uses a tiny fraction of the real size so the
  // tests can be aggressive.
  static AddressRange AllocateFake(size_t bytes, size_t align);

  static constexpr size_t kMaxBacking = 1024 * 1024;
  // Shared static state is not great form, but there is never more than one
  // fixture instance alive at a time.
  static std::vector<size_t> backing_;

  // Metadata comes from real malloc; allocations are tracked so they can be
  // freed on teardown.
  static void* MallocMetadata(size_t size);
  static std::vector<void*> metadata_allocs_;
  static size_t metadata_bytes_;
  static bool should_overallocate_;
  static HugeLength huge_pages_requested_;
  static HugeLength huge_pages_received_;

 protected:
  HugeLength HugePagesRequested() { return huge_pages_requested_; }
  HugeLength HugePagesReceived() { return huge_pages_received_; }

  HugeAllocatorTest() {
    should_overallocate_ = GetParam();
    huge_pages_requested_ = NHugePages(0);
    huge_pages_received_ = NHugePages(0);
    // Reserve the first slots so fake addresses never start at zero; things
    // might get weird given zero pointers.
    backing_.resize(1024);
    metadata_bytes_ = 0;
  }

  ~HugeAllocatorTest() override {
    for (void* p : metadata_allocs_) {
      free(p);
    }
    metadata_allocs_.clear();
    backing_.clear();
  }

  // Backing word that stands in for hugepage p.
  size_t* GetActual(HugePage p) { return &backing_[p.index()]; }

  // The ranges involved are far too large to fill and verify byte by byte,
  // so each hugepage is represented by a single marker word instead.

  // Expects every hugepage in r to carry marker c.
  void CheckPages(HugeRange r, size_t c) {
    HugePage limit = r.first + r.n;
    for (HugePage p = r.first; p < limit; ++p) {
      EXPECT_EQ(c, *GetActual(p));
    }
  }

  // Stamps every hugepage in r with marker c.
  void MarkPages(HugeRange r, size_t c) {
    HugePage limit = r.first + r.n;
    for (HugePage p = r.first; p < limit; ++p) {
      *GetActual(p) = c;
    }
  }

  // Checks the allocator's system total against what the fake handed out,
  // and its in-use amount against expected_use.
  void CheckStats(HugeLength expected_use) {
    const HugeLength received = HugePagesReceived();
    EXPECT_EQ(received, allocator_.system());
    HugeLength used = received - allocator_.size();
    EXPECT_EQ(used, expected_use);
  }

  HugeAllocator allocator_{AllocateFake, MallocMetadata};
};
|
||||
|
||||
// Use a tiny fraction of actual size so we can test aggressively.
|
||||
// Fake system allocator. `bytes` and `align` must both be multiples of
// kHugePageSize. Each hugepage is represented by one word in backing_, and the
// returned pointer is the backing index scaled by kHugePageSize. Returns
// {nullptr, 0} once the backing store would exceed kMaxBacking words.
AddressRange HugeAllocatorTest::AllocateFake(size_t bytes, size_t align) {
  CHECK_CONDITION(bytes % kHugePageSize == 0);
  CHECK_CONDITION(align % kHugePageSize == 0);
  HugeLength req = HLFromBytes(bytes);
  huge_pages_requested_ += req;
  // Test the case where our sys allocator provides too much.
  if (should_overallocate_) ++req;
  huge_pages_received_ += req;

  // From here on `bytes` and `align` are measured in hugepages, i.e. in
  // words of hidden backing.
  bytes = req / NHugePages(1);
  align /= kHugePageSize;

  // Round the next free index up to a multiple of align. The remainder has
  // to come from `%`; a bitwise AND with align does not yield the
  // misalignment and would produce a wrong index for any align > 1.
  size_t index = backing_.size();
  const size_t misalignment = index % align;
  if (misalignment != 0) {
    index += align - misalignment;
  }

  if (index + bytes > kMaxBacking) return {nullptr, 0};
  backing_.resize(index + bytes);
  void* ptr = reinterpret_cast<void*>(index * kHugePageSize);
  return {ptr, req.in_bytes()};
}
|
||||
|
||||
// We use actual malloc for metadata allocations, but we track them so they
|
||||
// can be deleted.
|
||||
// Metadata callback for the allocator under test: allocates with malloc(),
// adds the size to the running byte total, and remembers the pointer so the
// fixture destructor can free it.
void* HugeAllocatorTest::MallocMetadata(size_t size) {
  metadata_bytes_ += size;
  void* const block = malloc(size);
  metadata_allocs_.push_back(block);
  return block;
}
|
||||
|
||||
std::vector<size_t> HugeAllocatorTest::backing_;
|
||||
std::vector<void*> HugeAllocatorTest::metadata_allocs_;
|
||||
size_t HugeAllocatorTest::metadata_bytes_;
|
||||
bool HugeAllocatorTest::should_overallocate_;
|
||||
HugeLength HugeAllocatorTest::huge_pages_requested_;
|
||||
HugeLength HugeAllocatorTest::huge_pages_received_;
|
||||
|
||||
// Fills the allocator with randomly sized allocations, then repeatedly swaps
// a random one out for a new one, checking page markers and stats throughout.
TEST_P(HugeAllocatorTest, Basic) {
  static const size_t kSize = 1000;

  std::vector<std::pair<HugeRange, size_t>> allocs;
  absl::BitGen rng;
  size_t label = 0;
  HugeLength total = NHugePages(0);
  HugeLength peak = total;

  // Phase 1: build up kSize live allocations, each stamped with its label.
  for (size_t i = 0; i < kSize; ++i) {
    HugeLength len =
        NHugePages(absl::LogUniform<int32_t>(rng, 0, (1 << 12) - 1) + 1);
    auto r = allocator_.Get(len);
    ASSERT_TRUE(r.valid());
    total += len;
    peak = std::max(peak, total);
    CheckStats(total);
    MarkPages(r, label);
    allocs.push_back({r, label});
    ++label;
  }

  // Phase 2: churn. Move a random victim to the back, release it, and put a
  // new allocation in its slot.
  for (int round = 0; round < 1000 * 25; ++round) {
    size_t index = absl::Uniform<int32_t>(rng, 0, kSize);
    std::swap(allocs[index], allocs[kSize - 1]);
    auto victim = allocs[kSize - 1];
    CheckPages(victim.first, victim.second);
    total -= victim.first.len();
    allocator_.Release(victim.first);
    CheckStats(total);

    HugeLength len =
        NHugePages(absl::LogUniform<int32_t>(rng, 0, (1 << 12) - 1) + 1);
    auto r = allocator_.Get(len);
    ASSERT_TRUE(r.valid());
    ASSERT_EQ(r.len(), len);
    total += len;
    peak = std::max(peak, total);
    CheckStats(total);
    MarkPages(r, label);
    allocs[kSize - 1] = {r, label};
    ++label;
  }

  // Phase 3: everything still live must be intact; release it all.
  for (auto entry : allocs) {
    CheckPages(entry.first, entry.second);
    allocator_.Release(entry.first);
  }
}
|
||||
|
||||
// Check that releasing small chunks of allocations works OK.
|
||||
TEST_P(HugeAllocatorTest, Subrelease) {
  const HugeLength kLen = NHugePages(8);
  // One kLen-sized request per prefix length in [1, kLen).
  const HugeLength kTotal = kLen * (kLen / NHugePages(1) - 1);

  size_t label = 1;
  for (int rep = 0; rep < 100; ++rep) {
    std::vector<std::pair<HugeRange, size_t>> kept;

    // Take kLen-sized allocations and immediately give back a prefix of
    // every possible smaller size, keeping the tail; none of that should
    // disturb anything else.
    for (HugeLength prefix = NHugePages(1); prefix < kLen; ++prefix) {
      auto r = allocator_.Get(kLen);
      ASSERT_TRUE(r.valid());
      MarkPages(r, label);
      allocator_.Release({r.start(), prefix});
      kept.push_back({{r.start() + prefix, kLen - prefix}, label});
      ++label;
    }

    EXPECT_EQ(kTotal, HugePagesRequested());

    for (auto entry : kept) {
      CheckPages(entry.first, entry.second);
      allocator_.Release(entry.first);
    }
  }
}
|
||||
|
||||
// Does subreleasing work OK for absurdly large allocations?
|
||||
TEST_P(HugeAllocatorTest, SubreleaseLarge) {
  const HugeLength kLimit = HLFromBytes(1024ul * 1024 * 1024 * 1024);

  absl::BitGen rng;
  std::vector<std::pair<HugeRange, size_t>> allocs;
  size_t label = 1;

  // Allocate doubling sizes; from each, hand back a random leading chunk and
  // keep the remainder.
  for (HugeLength n = NHugePages(2); n < kLimit; n *= 2) {
    auto r = allocator_.Get(n);
    ASSERT_TRUE(r.valid());
    MarkPages(r, label);
    // Random chunk length, drawn as Uniform(0, n / 2) + 1 hugepages.
    HugeLength chunk =
        NHugePages(absl::Uniform<int32_t>(rng, 0, n / NHugePages(2)) + 1);
    allocator_.Release({r.start(), chunk});
    allocs.push_back({{r.start() + chunk, n - chunk}, label});
    ++label;
  }

  // Keep making small allocations until one forces a new system request;
  // until then they are served from the space released above.
  const HugeLength total = HugePagesRequested();
  while (total == HugePagesRequested()) {
    HugeLength n =
        NHugePages(absl::LogUniform<int32_t>(rng, 0, (1 << 8) - 1) + 1);
    auto r = allocator_.Get(n);
    ASSERT_TRUE(r.valid());
    MarkPages(r, label);
    allocs.push_back({r, label});
    ++label;
  }

  for (auto entry : allocs) {
    CheckPages(entry.first, entry.second);
    allocator_.Release(entry.first);
  }
}
|
||||
|
||||
// We don't care *that* much about vaddress space, but let's not be crazy.
|
||||
// Don't fill tiny requests from big spaces.
|
||||
TEST_P(HugeAllocatorTest, Fragmentation) {
|
||||
// Prime the pump with some random allocations.
|
||||
absl::BitGen rng;
|
||||
|
||||
std::vector<HugeRange> free;
|
||||
constexpr int kSlots = 50;
|
||||
|
||||
// Plan to insert a large allocation at the big_slot'th index, then free it
|
||||
// during the initial priming step (so we have at least a contiguous region of
|
||||
// at least big hugepages).
|
||||
HugeLength big = NHugePages(8);
|
||||
const int big_slot = absl::Uniform(rng, 0, kSlots);
|
||||
|
||||
for (int i = 0; i < kSlots; ++i) {
|
||||
if (i == big_slot) {
|
||||
auto r = allocator_.Get(big);
|
||||
ASSERT_TRUE(r.valid());
|
||||
free.push_back(r);
|
||||
}
|
||||
|
||||
auto r = allocator_.Get(NHugePages(1));
|
||||
ASSERT_TRUE(r.valid());
|
||||
if (absl::Bernoulli(rng, 1.0 / 2)) {
|
||||
free.push_back(r);
|
||||
}
|
||||
}
|
||||
size_t slots = free.size() - 1;
|
||||
for (auto r : free) {
|
||||
allocator_.Release(r);
|
||||
}
|
||||
free.clear();
|
||||
static const size_t kReps = 5;
|
||||
for (int i = 0; i < kReps; ++i) {
|
||||
SCOPED_TRACE(i);
|
||||
|
||||
// Ensure we have a range of this size.
|
||||
HugeRange r = allocator_.Get(big);
|
||||
ASSERT_TRUE(r.valid());
|
||||
if (NHugePages(slots) > allocator_.size()) {
|
||||
// We should also have slots pages left over after allocating big
|
||||
for (int i = 0; i < slots; ++i) {
|
||||
HugeRange f = allocator_.Get(NHugePages(1));
|
||||
ASSERT_TRUE(f.valid());
|
||||
free.push_back(f);
|
||||
}
|
||||
for (auto f : free) {
|
||||
allocator_.Release(f);
|
||||
}
|
||||
free.clear();
|
||||
}
|
||||
allocator_.Release(r);
|
||||
// We should definitely have at least this many small spaces...
|
||||
for (int i = 0; i < slots; ++i) {
|
||||
r = allocator_.Get(NHugePages(1));
|
||||
ASSERT_TRUE(r.valid());
|
||||
free.push_back(r);
|
||||
}
|
||||
// that don't interfere with the available big space.
|
||||
auto before = allocator_.system();
|
||||
r = allocator_.Get(big);
|
||||
ASSERT_TRUE(r.valid());
|
||||
EXPECT_EQ(before, allocator_.system());
|
||||
allocator_.Release(r);
|
||||
for (auto r : free) {
|
||||
allocator_.Release(r);
|
||||
}
|
||||
free.clear();
|
||||
slots += big.raw_num();
|
||||
big += big;
|
||||
}
|
||||
}
|
||||
|
||||
// Check that we only request as much as we actually need from the system.
|
||||
TEST_P(HugeAllocatorTest, Frugal) {
|
||||
HugeLength total = NHugePages(0);
|
||||
static const size_t kSize = 1000;
|
||||
for (int i = 1; i < kSize; ++i) {
|
||||
HugeLength len = NHugePages(i);
|
||||
// toss the range, we ain't using it
|
||||
ASSERT_TRUE(allocator_.Get(len).valid());
|
||||
|
||||
total += len;
|
||||
CheckStats(total);
|
||||
EXPECT_EQ(total, HugePagesRequested());
|
||||
}
|
||||
}
|
||||
|
||||
// Checks the span statistics reported by AddSpanStats() as pieces of one
// 8-hugepage range are released one at a time: span count, returned page
// count, and a lower bound on the average age.
TEST_P(HugeAllocatorTest, Stats) {
  // The values this test reads back from AddSpanStats().
  struct Snapshot {
    size_t num_spans;
    Length pages;
    absl::Duration avg_age;
  };

  // Collects a Snapshot, asserting along the way that nothing is reported in
  // the small-span buckets or as "normal" large pages.
  const auto take_snapshot = [this]() {
    const HugeAllocator* huge = &allocator_;
    SmallSpanStats small;
    LargeSpanStats large;
    PageAgeHistograms ages(absl::base_internal::CycleClock::Now());
    huge->AddSpanStats(&small, &large, &ages);
    for (auto i = Length(0); i < kMaxPages; ++i) {
      EXPECT_EQ(0, small.normal_length[i.raw_num()]);
      EXPECT_EQ(0, small.returned_length[i.raw_num()]);
    }
    EXPECT_EQ(Length(0), large.normal_pages);

    Snapshot snap;
    snap.num_spans = large.spans;
    snap.pages = large.returned_pages;
    const PageAgeHistograms::Histogram* hist = ages.GetTotalHistogram(true);
    snap.avg_age = absl::Seconds(hist->avg_age());
    return snap;
  };

  if (GetParam()) {
    // Ensure overallocation doesn't skew our measurements below.
    allocator_.Release(allocator_.Get(NHugePages(7)));
  }
  const HugeRange whole = allocator_.Get(NHugePages(8));
  ASSERT_TRUE(whole.valid());
  const HugePage base = whole.start();
  // Carve the range into three pieces (r1, r2, r3) separated by one-page
  // barriers (b1, b2), so the internal state is easy to follow in the stats.
  const HugeRange r1 = {base, NHugePages(1)};
  const HugeRange b1 = {base + NHugePages(1), NHugePages(1)};
  const HugeRange r2 = {base + NHugePages(2), NHugePages(2)};
  const HugeRange b2 = {base + NHugePages(4), NHugePages(1)};
  const HugeRange r3 = {base + NHugePages(5), NHugePages(3)};

  // Nothing has been released yet.
  Snapshot snap = take_snapshot();
  EXPECT_EQ(0, snap.num_spans);
  EXPECT_EQ(Length(0), snap.pages);
  EXPECT_EQ(absl::ZeroDuration(), snap.avg_age);

  allocator_.Release(r1);
  constexpr absl::Duration kDelay = absl::Milliseconds(500);
  absl::SleepFor(kDelay);
  snap = take_snapshot();
  EXPECT_EQ(1, snap.num_spans);
  EXPECT_EQ(NHugePages(1).in_pages(), snap.pages);
  // Only a lower bound can be tested, since the sleep may run arbitrarily
  // long. avg_age is computed in floating point from CycleClock readings, so
  // kEpsilon allows a small amount of round-off.
  constexpr absl::Duration kEpsilon = absl::Microseconds(500);
  EXPECT_LE(kDelay - kEpsilon, snap.avg_age);

  allocator_.Release(r2);
  absl::SleepFor(absl::Milliseconds(250));
  snap = take_snapshot();
  EXPECT_EQ(2, snap.num_spans);
  EXPECT_EQ(NHugePages(3).in_pages(), snap.pages);
  // Expected age is weighted by hugepage count: r1 (1 page) and r2 (2 pages).
  EXPECT_LE(
      (absl::Seconds(0.75) * 1 + absl::Seconds(0.25) * 2) / (1 + 2) - kEpsilon,
      snap.avg_age);

  allocator_.Release(r3);
  absl::SleepFor(absl::Milliseconds(125));
  snap = take_snapshot();
  EXPECT_EQ(3, snap.num_spans);
  EXPECT_EQ(NHugePages(6).in_pages(), snap.pages);
  EXPECT_LE((absl::Seconds(0.875) * 1 + absl::Seconds(0.375) * 2 +
             absl::Seconds(0.125) * 3) /
                    (1 + 2 + 3) -
                kEpsilon,
            snap.avg_age);

  // Releasing the barriers leaves a single span covering all 8 hugepages.
  allocator_.Release(b1);
  allocator_.Release(b2);
  absl::SleepFor(absl::Milliseconds(100));
  snap = take_snapshot();
  EXPECT_EQ(1, snap.num_spans);
  EXPECT_EQ(NHugePages(8).in_pages(), snap.pages);
  EXPECT_LE((absl::Seconds(0.975) * 1 + absl::Seconds(0.475) * 2 +
             absl::Seconds(0.225) * 3 + absl::Seconds(0.1) * 2) /
                    (1 + 2 + 3 + 2) -
                kEpsilon,
            snap.avg_age);
}
|
||||
|
||||
// Make sure we're well-behaved in the presence of OOM (and that we do
|
||||
// OOM at some point...)
|
||||
TEST_P(HugeAllocatorTest, OOM) {
|
||||
HugeLength n = NHugePages(1);
|
||||
while (allocator_.Get(n).valid()) {
|
||||
n *= 2;
|
||||
}
|
||||
}
|
||||
|
||||
// Run every HugeAllocatorTest with the bool parameter set to both false and
// true; the parameter value is turned into a readable test-name suffix.
INSTANTIATE_TEST_SUITE_P(
    NormalOverAlloc, HugeAllocatorTest, testing::Values(false, true),
    +[](const testing::TestParamInfo<bool>& info) {
      if (info.param) {
        return "overallocates";
      }
      return "normal";
    });
|
||||
|
||||
} // namespace
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
497
src/third_party/tcmalloc/dist/tcmalloc/huge_cache.cc
vendored
Normal file
@ -0,0 +1,497 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "tcmalloc/huge_cache.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <tuple>
|
||||
|
||||
#include "absl/time/time.h"
|
||||
#include "tcmalloc/common.h"
|
||||
#include "tcmalloc/huge_address_map.h"
|
||||
#include "tcmalloc/huge_pages.h"
|
||||
#include "tcmalloc/internal/logging.h"
|
||||
#include "tcmalloc/stats.h"
|
||||
|
||||
GOOGLE_MALLOC_SECTION_BEGIN
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
|
||||
template <size_t kEpochs>
|
||||
void MinMaxTracker<kEpochs>::Report(HugeLength val) {
|
||||
timeseries_.Report(val);
|
||||
}
|
||||
|
||||
template <size_t kEpochs>
|
||||
HugeLength MinMaxTracker<kEpochs>::MaxOverTime(absl::Duration t) const {
|
||||
HugeLength m = NHugePages(0);
|
||||
size_t num_epochs = ceil(absl::FDivDuration(t, kEpochLength));
|
||||
timeseries_.IterBackwards([&](size_t offset, int64_t ts,
|
||||
const Extrema& e) { m = std::max(m, e.max); },
|
||||
num_epochs);
|
||||
return m;
|
||||
}
|
||||
|
||||
template <size_t kEpochs>
|
||||
HugeLength MinMaxTracker<kEpochs>::MinOverTime(absl::Duration t) const {
|
||||
HugeLength m = kMaxVal;
|
||||
size_t num_epochs = ceil(absl::FDivDuration(t, kEpochLength));
|
||||
timeseries_.IterBackwards([&](size_t offset, int64_t ts,
|
||||
const Extrema& e) { m = std::min(m, e.min); },
|
||||
num_epochs);
|
||||
return m;
|
||||
}
|
||||
|
||||
template <size_t kEpochs>
|
||||
void MinMaxTracker<kEpochs>::Print(Printer* out) const {
|
||||
// Prints timestamp:min_pages:max_pages for each window with records.
|
||||
// Timestamp == kEpochs - 1 is the most recent measurement.
|
||||
const int64_t millis = absl::ToInt64Milliseconds(kEpochLength);
|
||||
out->printf("\nHugeCache: window %lldms * %zu", millis, kEpochs);
|
||||
int written = 0;
|
||||
timeseries_.Iter(
|
||||
[&](size_t offset, int64_t ts, const Extrema& e) {
|
||||
if ((written++) % 100 == 0)
|
||||
out->printf("\nHugeCache: Usage timeseries ");
|
||||
out->printf("%zu:%zu:%zd,", offset, e.min.raw_num(), e.max.raw_num());
|
||||
},
|
||||
timeseries_.kSkipEmptyEntries);
|
||||
out->printf("\n");
|
||||
}
|
||||
|
||||
template <size_t kEpochs>
|
||||
void MinMaxTracker<kEpochs>::PrintInPbtxt(PbtxtRegion* hpaa) const {
|
||||
// Prints content of each non-empty epoch, from oldest to most recent data
|
||||
auto huge_cache_history = hpaa->CreateSubRegion("huge_cache_history");
|
||||
huge_cache_history.PrintI64("window_ms",
|
||||
absl::ToInt64Milliseconds(kEpochLength));
|
||||
huge_cache_history.PrintI64("epochs", kEpochs);
|
||||
|
||||
timeseries_.Iter(
|
||||
[&](size_t offset, int64_t ts, const Extrema& e) {
|
||||
auto m = huge_cache_history.CreateSubRegion("measurements");
|
||||
m.PrintI64("epoch", offset);
|
||||
m.PrintI64("min_bytes", e.min.in_bytes());
|
||||
m.PrintI64("max_bytes", e.max.in_bytes());
|
||||
},
|
||||
timeseries_.kSkipEmptyEntries);
|
||||
}
|
||||
|
||||
template <size_t kEpochs>
|
||||
bool MinMaxTracker<kEpochs>::Extrema::operator==(const Extrema& other) const {
|
||||
return (other.max == max) && (other.min == min);
|
||||
}
|
||||
|
||||
// Explicit instantiations of template
// The member functions above are defined in this .cc file, so the template
// is instantiated here for the default epoch count and for 600 epochs.
|
||||
template class MinMaxTracker<>;
|
||||
template class MinMaxTracker<600>;
|
||||
|
||||
// The logic for actually allocating from the cache or backing, and keeping
|
||||
// the hit rates specified.
|
||||
HugeRange HugeCache::DoGet(HugeLength n, bool* from_released) {
|
||||
auto* node = Find(n);
|
||||
if (!node) {
|
||||
misses_++;
|
||||
weighted_misses_ += n.raw_num();
|
||||
HugeRange res = allocator_->Get(n);
|
||||
if (res.valid()) {
|
||||
*from_released = true;
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
hits_++;
|
||||
weighted_hits_ += n.raw_num();
|
||||
*from_released = false;
|
||||
size_ -= n;
|
||||
UpdateSize(size());
|
||||
HugeRange result, leftover;
|
||||
// Put back whatever we have left (or nothing, if it's exact.)
|
||||
std::tie(result, leftover) = Split(node->range(), n);
|
||||
cache_.Remove(node);
|
||||
if (leftover.valid()) {
|
||||
cache_.Insert(leftover);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Raises the cache limit when recent usage shows a "brief dip" larger than
// the current limit. `missed` is not used by the computation.
//
// The goal is to make the cache size equal to the largest brief dip, where a
// "dip" is usage shrinking and then climbing back (at least partially) and
// "brief" means it returns to normal in under kCacheTime. In other words we
// are willing to cache memory for about kCacheTime before expecting it to be
// reused, and we are loose on the timing.
void HugeCache::MaybeGrowCacheLimit(HugeLength missed) {
  // Downward slope: how far usage fell below its recent peak. (In theory
  // this could be as much as 2 * kCacheTime old, which is fine.)
  const HugeLength fell_by = off_peak_tracker_.MaxOverTime(kCacheTime);

  // Upward slope: how far usage has climbed back from its recent minimum.
  const HugeLength rose_by = usage_ - usage_tracker_.MinOverTime(kCacheTime);

  // The stats cannot guarantee that the fall preceded the rise. If usage
  // goes 0, 10000, 5000, 5500 in essentially zero time, the proven dip is
  // 500, but this computes fell_by = 5000 and rose_by = 5500; ideally the
  // minimum would be measured *after* the fall. This is difficult to ensure,
  // and hopefully rare.
  // TODO(b/134690209): figure out if we can solve that problem.
  const HugeLength dip = std::min(fell_by, rose_by);

  // Fragmentation means we may need to cache a little more than the actual
  // usage jump; 10% extra wastes little space and performs well on tests.
  const HugeLength candidate = dip + dip / 10;

  if (candidate <= limit()) {
    return;
  }
  last_limit_change_ = clock_.now();
  limit_ = candidate;
}
|
||||
|
||||
// Adds `n` hugepages to the usage count and reports the new value to the
// usage trackers. The off-peak tracker is fed a zero sample.
void HugeCache::IncUsage(HugeLength n) {
  usage_ += n;
  usage_tracker_.Report(usage_);
  detailed_tracker_.Report(usage_);
  const HugeLength no_off_peak = NHugePages(0);
  off_peak_tracker_.Report(no_off_peak);
}
|
||||
|
||||
// Subtracts `n` hugepages from the usage count, reports the new value to the
// usage trackers, and reports how far usage now sits below its maximum over
// the last kCacheTime to the off-peak tracker.
void HugeCache::DecUsage(HugeLength n) {
  usage_ -= n;
  usage_tracker_.Report(usage_);
  detailed_tracker_.Report(usage_);
  const HugeLength recent_peak = usage_tracker_.MaxOverTime(kCacheTime);
  // usage_ was just reported above, so the recent peak cannot be below it.
  ASSERT(recent_peak >= usage_);
  off_peak_tracker_.Report(recent_peak - usage_);
}
|
||||
|
||||
// Reports the cache size to the size tracker and adds size * elapsed clock
// ticks since the previous update to regret_.
void HugeCache::UpdateSize(HugeLength size) {
  size_tracker_.Report(size);

  // TODO(b/134691947): moving this inside the MinMaxTracker would save one call
  // to clock_.now() but all MinMaxTrackers would track regret instead.
  int64_t now = clock_.now();
  if (now <= last_regret_update_) {
    // No ticks have elapsed since the last update; nothing to add.
    return;
  }
  const auto elapsed_ticks = now - last_regret_update_;
  regret_ += size.raw_num() * elapsed_ticks;
  last_regret_update_ = now;
}
|
||||
|
||||
// Allocates `n` hugepages via DoGet(), records the usage increase, and gives
// the cache limit a chance to grow when the request missed the cache.
HugeRange HugeCache::Get(HugeLength n, bool* from_released) {
  HugeRange range = DoGet(n, from_released);
  // Failing to get a range should essentially never happen (VSS limits or
  // wildly incorrect allocation sizes only), so that case is deliberately
  // not handled in the cache size accounting.
  IncUsage(range.len());

  // A valid range that did not come from the cache counts as a miss.
  if (range.valid() && *from_released) {
    MaybeGrowCacheLimit(n);
  }
  return range;
}
|
||||
|
||||
// Returns range `r` to the cache, counts the release as a fill or an
// overflow, and then trims the cache back down to its limit.
void HugeCache::Release(HugeRange r) {
  DecUsage(r.len());

  cache_.Insert(r);
  size_ += r.len();
  // A release that pushes the cache past its limit is an overflow.
  if (size_ > limit()) {
    overflows_++;
  } else {
    fills_++;
  }

  // Shrink the limit, if we're going to do it, before shrinking to the max
  // size. (This could reduce the number of regions we break in half to
  // avoid overshrinking.)
  const auto ticks_since_change = clock_.now() - last_limit_change_;
  const auto ticks_threshold = cache_time_ticks_ * 2;
  if (ticks_since_change > ticks_threshold) {
    total_fast_unbacked_ += MaybeShrinkCacheLimit();
  }
  total_fast_unbacked_ += ShrinkCache(limit());

  UpdateSize(size());
}
|
||||
|
||||
// Releases range `r` without caching it: usage is decremented and the range
// goes straight back to the backing allocator.
void HugeCache::ReleaseUnbacked(HugeRange r) {
  const HugeLength len = r.len();
  DecUsage(len);
  // There is no point in trying to cache it; just hand it back.
  allocator_->Release(r);
}
|
||||
|
||||
// Lowers the cache limit when the cache has stayed persistently full over
// the last 2 * kCacheTime, then shrinks the cache to the new limit. Returns
// the number of hugepages removed from the cache (zero if the limit was left
// alone).
HugeLength HugeCache::MaybeShrinkCacheLimit() {
  last_limit_change_ = clock_.now();

  const HugeLength recent_min = size_tracker_.MinOverTime(kCacheTime * 2);
  // If the cache size has gotten down to under 20% of the limit, assume we
  // are close enough to the optimal size; we don't want to fiddle too
  // much/too often unless we have large gaps in usage.
  if (recent_min < limit() / 5) {
    return NHugePages(0);
  }

  // Take away half of the unused portion, but at least one hugepage.
  const HugeLength drop = std::max(recent_min / 2, NHugePages(1));
  // Guard the subtraction so the limit cannot go below zero.
  HugeLength reduced = NHugePages(0);
  if (!(limit() <= drop)) {
    reduced = limit() - drop;
  }
  limit_ = std::max(reduced, MinCacheLimit());
  return ShrinkCache(limit());
}
|
||||
|
||||
// Removes ranges from the cache, unbacks them, and returns them to the
// backing allocator until the cache holds at most `target` hugepages.
// Stops early if unbacking a range fails. Returns the number of hugepages
// actually removed.
HugeLength HugeCache::ShrinkCache(HugeLength target) {
  HugeLength freed = NHugePages(0);
  while (size_ > target) {
    // Remove smallest-ish nodes, to avoid fragmentation where possible.
    auto* victim = Find(NHugePages(1));
    CHECK_CONDITION(victim);
    HugeRange range = victim->range();
    cache_.Remove(victim);

    // Suppose we're 10 MiB over target but the smallest available node is
    // 100 MiB. Don't go overboard: split the range and remove only what is
    // needed. In particular, this prevents disastrous results if we've
    // decided the cache should be 99 MiB but the actual hot usage is 100 MiB
    // (and it is unfragmented).
    const HugeLength excess = size() - target;
    if (range.len() > excess) {
      auto [head, tail] = Split(range, excess);
      ASSERT(tail.valid());
      cache_.Insert(tail);
      range = head;
    }

    size_ -= range.len();
    // Note, the actual unback implementation is temporarily dropping and
    // re-acquiring the page heap lock here.
    const bool unbacked = unback_(range.start_addr(), range.byte_len());
    if (ABSL_PREDICT_FALSE(!unbacked)) {
      // We failed to release the range. Retain it in the cache instead of
      // returning it to the HugeAllocator, and stop shrinking.
      size_ += range.len();
      cache_.Insert(range);
      break;
    }
    allocator_->Release(range);
    freed += range.len();
  }

  return freed;
}
|
||||
|
||||
// Releases cached hugepages, aiming for `n`: first through a possible
// cache-limit reduction, then by shrinking the cache directly if that freed
// fewer than `n`. Returns the number of hugepages released and adds it to
// the periodic-unbacked total.
HugeLength HugeCache::ReleaseCachedPages(HugeLength n) {
  // This is a good time to check: is our cache going persistently unused?
  HugeLength released = MaybeShrinkCacheLimit();

  if (released < n) {
    const HugeLength still_needed = n - released;
    // Never ask for a cache size below zero.
    HugeLength target = NHugePages(0);
    if (!(still_needed > size())) {
      target = size() - still_needed;
    }
    released += ShrinkCache(target);
  }

  UpdateSize(size());
  total_periodic_unbacked_ += released;
  return released;
}
|
||||
|
||||
// Adds every cached range to the span statistics. Each range counts as one
// large span of "normal" pages in `large` and is recorded in `ages` with the
// node's when() timestamp. `large` and `ages` may each be null, in which
// case that part is skipped. `small` is not touched.
void HugeCache::AddSpanStats(SmallSpanStats* small, LargeSpanStats* large,
                             PageAgeHistograms* ages) const {
  static_assert(kPagesPerHugePage >= kMaxPages);
  const HugeAddressMap::Node* node = cache_.first();
  while (node != nullptr) {
    const HugeLength len = node->range().len();
    if (large != nullptr) {
      large->spans++;
      large->normal_pages += len.in_pages();
    }

    if (ages != nullptr) {
      ages->RecordRange(len.in_pages(), false, node->when());
    }
    node = node->next();
  }
}
|
||||
|
||||
// Searches the cache for a node whose range holds at least `n` hugepages.
// Returns nullptr if no such node is found on the path explored.
//
// Only a single root-to-leaf path is walked rather than the whole tree.
// Smaller gaps, lower nodes, and lower addresses are favored, in that order,
// so the result is neither strictly best-fit nor strictly lowest-address,
// but vaguely close to both.
HugeAddressMap::Node* HugeCache::Find(HugeLength n) {
  // True when the subtree rooted at `node` contains a range of at least n.
  const auto can_fit = [n](const HugeAddressMap::Node* node) {
    return node && node->longest() >= n;
  };

  HugeAddressMap::Node* best = nullptr;
  HugeAddressMap::Node* curr = cache_.root();
  while (can_fit(curr)) {
    // Keep the smallest fitting range seen so far.
    if (curr->range().len() >= n) {
      if (!best || best->range().len() > curr->range().len()) {
        best = curr;
      }
    }

    // Either subtree could contain a better fit and we don't want to search
    // the whole tree. Pick a reasonable child to look at.
    auto left = curr->left();
    auto right = curr->right();
    if (!can_fit(left)) {
      curr = right;
    } else if (!can_fit(right)) {
      curr = left;
    } else {
      // Both children can satisfy the request: a nontrivial choice.
      bool go_left;
      if (left->range().len() == right->range().len()) {
        // Equal roots: prefer the subtree with the smaller longest range,
        // and the left one on a tie.
        go_left = left->longest() <= right->longest();
      } else {
        // Look in the subtree with the smaller root, as that's slightly
        // more likely to be our best.
        go_left = left->range().len() < right->range().len();
      }
      curr = go_left ? left : right;
    }
  }
  return best;
}
|
||||
|
||||
// Writes a human-readable summary of the cache to `out`: size and limit, hit
// and overflow rates, unbacked totals, regret, the recent min/current/max of
// usage, off-peak usage and cache size, and finally the detailed usage
// history.
//
// Not const: it refreshes the size/regret accounting and feeds the current
// usage and off-peak values to the trackers before reading them back.
void HugeCache::Print(Printer* out) {
  const int64_t millis = absl::ToInt64Milliseconds(kCacheTime);
  // int64_t is not necessarily `long long`; cast so the argument matches
  // the %lld conversion exactly.
  out->printf(
      "HugeCache: contains unused, backed hugepage(s) "
      "(kCacheTime = %lldms)\n",
      static_cast<long long>(millis));
  // a / (a + b), avoiding division by zero
  auto safe_ratio = [](double a, double b) {
    const double total = a + b;
    if (total == 0) return 0.0;
    return a / total;
  };

  const double hit_rate = safe_ratio(hits_, misses_);
  const double overflow_rate = safe_ratio(overflows_, fills_);

  out->printf(
      "HugeCache: %zu / %zu hugepages cached / cache limit "
      "(%.3f hit rate, %.3f overflow rate)\n",
      size_.raw_num(), limit().raw_num(), hit_rate, overflow_rate);
  out->printf("HugeCache: %zu MiB fast unbacked, %zu MiB periodic\n",
              total_fast_unbacked_.in_bytes() / 1024 / 1024,
              total_periodic_unbacked_.in_bytes() / 1024 / 1024);
  // Bring regret_ up to date before printing it.
  UpdateSize(size());
  // regret_ is in hugepages * clock ticks; dividing by the clock frequency
  // converts ticks to seconds.
  out->printf(
      "HugeCache: %zu MiB*s cached since startup\n",
      NHugePages(regret_).in_mib() / static_cast<size_t>(clock_.freq()));

  // Report the current usage first so the min/max below include it.
  usage_tracker_.Report(usage_);
  const HugeLength usage_min = usage_tracker_.MinOverTime(kCacheTime);
  const HugeLength usage_max = usage_tracker_.MaxOverTime(kCacheTime);
  out->printf(
      "HugeCache: recent usage range: %zu min - %zu curr - %zu max MiB\n",
      usage_min.in_mib(), usage_.in_mib(), usage_max.in_mib());

  // Off-peak: how far current usage sits below the recent maximum.
  const HugeLength off_peak = usage_max - usage_;
  off_peak_tracker_.Report(off_peak);
  const HugeLength off_peak_min = off_peak_tracker_.MinOverTime(kCacheTime);
  const HugeLength off_peak_max = off_peak_tracker_.MaxOverTime(kCacheTime);
  out->printf(
      "HugeCache: recent offpeak range: %zu min - %zu curr - %zu max MiB\n",
      off_peak_min.in_mib(), off_peak.in_mib(), off_peak_max.in_mib());

  const HugeLength cache_min = size_tracker_.MinOverTime(kCacheTime);
  const HugeLength cache_max = size_tracker_.MaxOverTime(kCacheTime);
  out->printf(
      "HugeCache: recent cache range: %zu min - %zu curr - %zu max MiB\n",
      cache_min.in_mib(), size_.in_mib(), cache_max.in_mib());

  detailed_tracker_.Print(out);
}
|
||||
|
||||
// Emits the cache statistics into `hpaa` as pbtxt fields and sub-regions,
// followed by the detailed usage history.
//
// Not const: it refreshes the size/regret accounting and feeds the current
// usage and off-peak values to the trackers before reading them back.
void HugeCache::PrintInPbtxt(PbtxtRegion* hpaa) {
  hpaa->PrintI64("huge_cache_time_const",
                 absl::ToInt64Milliseconds(kCacheTime));

  // Fraction a / (a + b), defined as zero when both are zero.
  auto safe_ratio = [](double a, double b) {
    const double denom = a + b;
    return denom == 0 ? 0.0 : a / denom;
  };

  const double hit_rate = safe_ratio(hits_, misses_);
  const double overflow_rate = safe_ratio(overflows_, fills_);

  // Bytes currently held in the cache, and the maximum allowed.
  hpaa->PrintI64("cached_huge_page_bytes", size_.in_bytes());
  hpaa->PrintI64("max_cached_huge_page_bytes", limit().in_bytes());
  // Lifetime hit and overflow rates.
  hpaa->PrintDouble("huge_cache_hit_rate", hit_rate);
  hpaa->PrintDouble("huge_cache_overflow_rate", overflow_rate);
  // Bytes unbacked eagerly by the cache, and by the periodic releaser.
  hpaa->PrintI64("fast_unbacked_bytes", total_fast_unbacked_.in_bytes());
  hpaa->PrintI64("periodic_unbacked_bytes",
                 total_periodic_unbacked_.in_bytes());
  // Bring regret_ up to date before printing it.
  UpdateSize(size());
  // Memory cached since startup (in MiB*s).
  const auto regret_mib_s =
      NHugePages(regret_).in_mib() / static_cast<size_t>(clock_.freq());
  hpaa->PrintI64("huge_cache_regret", regret_mib_s);

  // Report the current usage first so the min/max below include it.
  usage_tracker_.Report(usage_);
  const HugeLength lowest_usage = usage_tracker_.MinOverTime(kCacheTime);
  const HugeLength highest_usage = usage_tracker_.MaxOverTime(kCacheTime);
  {
    auto region = hpaa->CreateSubRegion("huge_cache_usage_stats");
    region.PrintI64("min_bytes", lowest_usage.in_bytes());
    region.PrintI64("current_bytes", usage_.in_bytes());
    region.PrintI64("max_bytes", highest_usage.in_bytes());
  }

  // Off-peak: how far current usage sits below the recent maximum.
  const HugeLength off_peak = highest_usage - usage_;
  off_peak_tracker_.Report(off_peak);
  const HugeLength lowest_off_peak = off_peak_tracker_.MinOverTime(kCacheTime);
  const HugeLength highest_off_peak =
      off_peak_tracker_.MaxOverTime(kCacheTime);
  {
    auto region = hpaa->CreateSubRegion("huge_cache_offpeak_stats");
    region.PrintI64("min_bytes", lowest_off_peak.in_bytes());
    region.PrintI64("current_bytes", off_peak.in_bytes());
    region.PrintI64("max_bytes", highest_off_peak.in_bytes());
  }

  const HugeLength smallest_cache = size_tracker_.MinOverTime(kCacheTime);
  const HugeLength largest_cache = size_tracker_.MaxOverTime(kCacheTime);
  {
    auto region = hpaa->CreateSubRegion("huge_cache_cache_stats");
    region.PrintI64("min_bytes", smallest_cache.in_bytes());
    region.PrintI64("current_bytes", size_.in_bytes());
    region.PrintI64("max_bytes", largest_cache.in_bytes());
  }

  detailed_tracker_.PrintInPbtxt(hpaa);
}
|
||||
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
GOOGLE_MALLOC_SECTION_END
|
||||
263
src/third_party/tcmalloc/dist/tcmalloc/huge_cache.h
vendored
Normal file
@ -0,0 +1,263 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
// Wrapping interface for HugeAllocator that handles backing and
|
||||
// unbacking, including a hot cache of backed single hugepages.
|
||||
#ifndef TCMALLOC_HUGE_CACHE_H_
|
||||
#define TCMALLOC_HUGE_CACHE_H_
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
|
||||
#include "absl/time/time.h"
|
||||
#include "tcmalloc/common.h"
|
||||
#include "tcmalloc/experiment.h"
|
||||
#include "tcmalloc/experiment_config.h"
|
||||
#include "tcmalloc/huge_allocator.h"
|
||||
#include "tcmalloc/huge_pages.h"
|
||||
#include "tcmalloc/internal/config.h"
|
||||
#include "tcmalloc/internal/logging.h"
|
||||
#include "tcmalloc/internal/timeseries_tracker.h"
|
||||
#include "tcmalloc/stats.h"
|
||||
|
||||
GOOGLE_MALLOC_SECTION_BEGIN
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
|
||||
class MemoryModifyFunction {
|
||||
using ReleaseFunction = bool (*)(void*, size_t);
|
||||
|
||||
public:
|
||||
explicit MemoryModifyFunction(ReleaseFunction func) : func_(func) {}
|
||||
|
||||
ABSL_MUST_USE_RESULT bool operator()(void* start, size_t len) {
|
||||
return func_(start, len);
|
||||
}
|
||||
|
||||
private:
|
||||
ReleaseFunction func_;
|
||||
};
|
||||
|
||||
// Track the extreme values of a HugeLength value over the past
|
||||
// kWindow (time ranges approximate.)
|
||||
template <size_t kEpochs = 16>
|
||||
class MinMaxTracker {
|
||||
public:
|
||||
explicit constexpr MinMaxTracker(Clock clock, absl::Duration w)
|
||||
: kEpochLength(w / kEpochs), timeseries_(clock, w) {}
|
||||
|
||||
void Report(HugeLength val);
|
||||
void Print(Printer* out) const;
|
||||
void PrintInPbtxt(PbtxtRegion* hpaa) const;
|
||||
|
||||
// If t < kEpochLength, these functions return statistics for last epoch. The
|
||||
// granularity is kEpochLength (rounded up).
|
||||
HugeLength MaxOverTime(absl::Duration t) const;
|
||||
HugeLength MinOverTime(absl::Duration t) const;
|
||||
|
||||
private:
|
||||
const absl::Duration kEpochLength;
|
||||
|
||||
static constexpr HugeLength kMaxVal =
|
||||
NHugePages(std::numeric_limits<size_t>::max());
|
||||
struct Extrema {
|
||||
HugeLength min, max;
|
||||
|
||||
static Extrema Nil() {
|
||||
Extrema e;
|
||||
e.max = NHugePages(0);
|
||||
e.min = kMaxVal;
|
||||
return e;
|
||||
}
|
||||
|
||||
void Report(HugeLength n) {
|
||||
max = std::max(max, n);
|
||||
min = std::min(min, n);
|
||||
}
|
||||
|
||||
bool empty() const { return (*this == Nil()); }
|
||||
|
||||
bool operator==(const Extrema& other) const;
|
||||
};
|
||||
|
||||
TimeSeriesTracker<Extrema, HugeLength, kEpochs> timeseries_;
|
||||
};
|
||||
|
||||
// Explicit instantiations are defined in huge_cache.cc.
// The `extern template` declarations below tell includers of this header not
// to instantiate these two specializations themselves.
|
||||
extern template class MinMaxTracker<>;
|
||||
extern template class MinMaxTracker<600>;
|
||||
|
||||
// Out-of-class definition of the static constexpr member kMaxVal.
template <size_t kEpochs>
|
||||
constexpr HugeLength MinMaxTracker<kEpochs>::kMaxVal;
|
||||
|
||||
class HugeCache {
|
||||
public:
|
||||
// For use in production
|
||||
HugeCache(HugeAllocator* allocator, MetadataAllocFunction meta_allocate,
|
||||
MemoryModifyFunction unback)
|
||||
: HugeCache(allocator, meta_allocate, unback,
|
||||
Clock{.now = absl::base_internal::CycleClock::Now,
|
||||
.freq = absl::base_internal::CycleClock::Frequency}) {}
|
||||
|
||||
// For testing with mock clock.
|
||||
//
|
||||
// 2s (kCacheTime * 2) looks like an arbitrary window; it mostly is.
|
||||
//
|
||||
// Suffice to say that the below code (see MaybeGrowCacheLimit)
|
||||
// tries to make sure the cache is sized to protect a working set
|
||||
// that ebbs for 1 second, as a reasonable heuristic. This means it
|
||||
// needs 1s of historical data to examine.
|
||||
//
|
||||
// Why 2s duration, then? Two reasons:
|
||||
//
|
||||
// - (minor) granularity of epoch boundaries make me want to err towards
|
||||
// keeping a bit too much data over a bit too little.
|
||||
//
|
||||
// - (major) hysteresis: in ReleaseCachedPages we try to detect
|
||||
// mistaken cache expansion and reverse it. I hope that using a
|
||||
// longer timescale than our expansion will increase stability
|
||||
// here: I will take some caches staying a bit too big over caches
|
||||
// oscillating back and forth between two size estimates, so we
|
||||
// require stronger evidence (longer time) to reverse an expansion
|
||||
// than to make it.
|
||||
//
|
||||
// We also tried other algorithms, but this one is simple and suffices to
|
||||
// capture the empirical dynamics we've seen. See "Beyond Malloc
|
||||
// Efficiency..." (https://research.google/pubs/pub50370/) for more
|
||||
// information.
|
||||
HugeCache(HugeAllocator* allocator, MetadataAllocFunction meta_allocate,
|
||||
MemoryModifyFunction unback, Clock clock)
|
||||
: allocator_(allocator),
|
||||
cache_(meta_allocate),
|
||||
clock_(clock),
|
||||
cache_time_ticks_(clock_.freq() * absl::ToDoubleSeconds(kCacheTime)),
|
||||
nanoseconds_per_tick_(absl::ToInt64Nanoseconds(absl::Seconds(1)) /
|
||||
clock_.freq()),
|
||||
last_limit_change_(clock.now()),
|
||||
last_regret_update_(clock.now()),
|
||||
detailed_tracker_(clock, absl::Minutes(10)),
|
||||
usage_tracker_(clock, kCacheTime * 2),
|
||||
off_peak_tracker_(clock, kCacheTime * 2),
|
||||
size_tracker_(clock, kCacheTime * 2),
|
||||
unback_(unback) {}
|
||||
// Allocate a usable set of <n> contiguous hugepages. Try to give out
|
||||
// memory that's currently backed from the kernel if we have it available.
|
||||
// *from_released is set to false if the return range is already backed;
|
||||
// otherwise, it is set to true (and the caller should back it.)
|
||||
HugeRange Get(HugeLength n, bool* from_released);
|
||||
|
||||
// Deallocate <r> (assumed to be backed by the kernel.)
|
||||
void Release(HugeRange r);
|
||||
// As Release, but the range is assumed to _not_ be backed.
|
||||
void ReleaseUnbacked(HugeRange r);
|
||||
|
||||
// Release to the system up to <n> hugepages of cache contents; returns
|
||||
// the number of hugepages released.
|
||||
HugeLength ReleaseCachedPages(HugeLength n);
|
||||
|
||||
// Backed memory available.
|
||||
HugeLength size() const { return size_; }
|
||||
// Total memory cached (in HugeLength * nanoseconds)
|
||||
uint64_t regret() const { return regret_ * nanoseconds_per_tick_; }
|
||||
// Current limit for how much backed memory we'll cache.
|
||||
HugeLength limit() const { return limit_; }
|
||||
// Sum total of unreleased requests.
|
||||
HugeLength usage() const { return usage_; }
|
||||
|
||||
void AddSpanStats(SmallSpanStats* small, LargeSpanStats* large,
|
||||
PageAgeHistograms* ages) const;
|
||||
|
||||
BackingStats stats() const {
|
||||
BackingStats s;
|
||||
s.system_bytes = (usage() + size()).in_bytes();
|
||||
s.free_bytes = size().in_bytes();
|
||||
s.unmapped_bytes = 0;
|
||||
return s;
|
||||
}
|
||||
|
||||
void Print(Printer* out);
|
||||
void PrintInPbtxt(PbtxtRegion* hpaa);
|
||||
|
||||
private:
|
||||
HugeAllocator* allocator_;
|
||||
|
||||
// We just cache-missed a request for <missed> pages;
|
||||
// should we grow?
|
||||
void MaybeGrowCacheLimit(HugeLength missed);
|
||||
// Check if the cache seems consistently too big. Returns the
|
||||
// number of pages *evicted* (not the change in limit).
|
||||
HugeLength MaybeShrinkCacheLimit();
|
||||
|
||||
// Ensure the cache contains at most <target> hugepages,
|
||||
// returning the number removed.
|
||||
HugeLength ShrinkCache(HugeLength target);
|
||||
|
||||
HugeRange DoGet(HugeLength n, bool* from_released);
|
||||
|
||||
HugeAddressMap::Node* Find(HugeLength n);
|
||||
|
||||
HugeAddressMap cache_;
|
||||
HugeLength size_{NHugePages(0)};
|
||||
|
||||
HugeLength limit_{NHugePages(10)};
|
||||
const absl::Duration kCacheTime = absl::Seconds(1);
|
||||
|
||||
size_t hits_{0};
|
||||
size_t misses_{0};
|
||||
size_t fills_{0};
|
||||
size_t overflows_{0};
|
||||
uint64_t weighted_hits_{0};
|
||||
uint64_t weighted_misses_{0};
|
||||
|
||||
// Sum(size of Gets) - Sum(size of Releases), i.e. amount of backed
|
||||
// hugepages our user currently wants to have.
|
||||
void IncUsage(HugeLength n);
|
||||
void DecUsage(HugeLength n);
|
||||
HugeLength usage_{NHugePages(0)};
|
||||
|
||||
// This is CycleClock, except overridable for tests.
|
||||
Clock clock_;
|
||||
const int64_t cache_time_ticks_;
|
||||
const double nanoseconds_per_tick_;
|
||||
|
||||
int64_t last_limit_change_;
|
||||
|
||||
// 10 hugepages is a good baseline for our cache--easily wiped away
|
||||
// by periodic release, and not that much memory on any real server.
|
||||
// However, we can go below it if we haven't used that much for 30 seconds.
|
||||
HugeLength MinCacheLimit() const { return NHugePages(10); }
|
||||
|
||||
uint64_t regret_{0}; // overflows if we cache 585 hugepages for 1 year
|
||||
int64_t last_regret_update_;
|
||||
void UpdateSize(HugeLength size);
|
||||
|
||||
MinMaxTracker<600> detailed_tracker_;
|
||||
|
||||
MinMaxTracker<> usage_tracker_;
|
||||
MinMaxTracker<> off_peak_tracker_;
|
||||
MinMaxTracker<> size_tracker_;
|
||||
|
||||
HugeLength total_fast_unbacked_{NHugePages(0)};
|
||||
HugeLength total_periodic_unbacked_{NHugePages(0)};
|
||||
|
||||
MemoryModifyFunction unback_;
|
||||
};
|
||||
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
GOOGLE_MALLOC_SECTION_END
|
||||
|
||||
#endif // TCMALLOC_HUGE_CACHE_H_
|
||||
622
src/third_party/tcmalloc/dist/tcmalloc/huge_cache_test.cc
vendored
Normal file
@ -0,0 +1,622 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "tcmalloc/huge_cache.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <tuple>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "gmock/gmock.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "absl/base/internal/cycleclock.h"
|
||||
#include "absl/memory/memory.h"
|
||||
#include "absl/random/random.h"
|
||||
#include "absl/strings/str_cat.h"
|
||||
#include "absl/time/clock.h"
|
||||
#include "absl/time/time.h"
|
||||
#include "tcmalloc/huge_pages.h"
|
||||
#include "tcmalloc/internal/clock.h"
|
||||
#include "tcmalloc/internal/config.h"
|
||||
#include "tcmalloc/internal/logging.h"
|
||||
#include "tcmalloc/stats.h"
|
||||
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
namespace {
|
||||
|
||||
using testing::Return;
|
||||
|
||||
class HugeCacheTest : public testing::Test {
|
||||
private:
|
||||
// Allow tests to modify the clock used by the cache.
|
||||
static int64_t clock_offset_;
|
||||
static double GetClockFrequency() {
|
||||
return absl::base_internal::CycleClock::Frequency();
|
||||
}
|
||||
static int64_t GetClock() {
|
||||
return absl::base_internal::CycleClock::Now() +
|
||||
clock_offset_ * GetClockFrequency() /
|
||||
absl::ToDoubleNanoseconds(absl::Seconds(1));
|
||||
}
|
||||
|
||||
// Use a tiny fraction of actual size so we can test aggressively.
|
||||
static AddressRange AllocateFake(size_t bytes, size_t align) {
|
||||
if (bytes % kHugePageSize != 0) {
|
||||
Crash(kCrash, __FILE__, __LINE__, "not aligned", bytes, kHugePageSize);
|
||||
}
|
||||
if (align % kHugePageSize != 0) {
|
||||
Crash(kCrash, __FILE__, __LINE__, "not aligned", align, kHugePageSize);
|
||||
}
|
||||
// we'll actually provide hidden backing, one word per hugepage.
|
||||
bytes /= kHugePageSize;
|
||||
align /= kHugePageSize;
|
||||
size_t index = backing.size();
|
||||
if (index % align != 0) {
|
||||
index += (align - (index & align));
|
||||
}
|
||||
backing.resize(index + bytes);
|
||||
void* ptr = reinterpret_cast<void*>(index * kHugePageSize);
|
||||
return {ptr, bytes * kHugePageSize};
|
||||
}
|
||||
// This isn't super good form but we'll never have more than one HAT
|
||||
// extant at once.
|
||||
static std::vector<size_t> backing;
|
||||
|
||||
// We use actual malloc for metadata allocations, but we track them so they
|
||||
// can be deleted. (TODO make this an arena if we care, which I doubt)
|
||||
static void* MallocMetadata(size_t size) {
|
||||
metadata_bytes += size;
|
||||
void* ptr = calloc(size, 1);
|
||||
metadata_allocs.push_back(ptr);
|
||||
return ptr;
|
||||
}
|
||||
static std::vector<void*> metadata_allocs;
|
||||
static size_t metadata_bytes;
|
||||
|
||||
// This is wordy, but necessary for mocking:
|
||||
class BackingInterface {
|
||||
public:
|
||||
virtual bool Unback(void* p, size_t len) = 0;
|
||||
virtual ~BackingInterface() {}
|
||||
};
|
||||
|
||||
class MockBackingInterface : public BackingInterface {
|
||||
public:
|
||||
MOCK_METHOD(bool, Unback, (void* p, size_t len), (override));
|
||||
};
|
||||
|
||||
static bool MockUnback(void* p, size_t len) { return mock_->Unback(p, len); }
|
||||
|
||||
protected:
|
||||
static std::unique_ptr<testing::NiceMock<MockBackingInterface>> mock_;
|
||||
|
||||
HugeCacheTest() {
|
||||
// We don't use the first few bytes, because things might get weird
|
||||
// given zero pointers.
|
||||
backing.resize(1024);
|
||||
metadata_bytes = 0;
|
||||
mock_ = absl::make_unique<testing::NiceMock<MockBackingInterface>>();
|
||||
}
|
||||
|
||||
~HugeCacheTest() override {
|
||||
for (void* p : metadata_allocs) {
|
||||
free(p);
|
||||
}
|
||||
metadata_allocs.clear();
|
||||
backing.clear();
|
||||
mock_.reset(nullptr);
|
||||
|
||||
clock_offset_ = 0;
|
||||
}
|
||||
|
||||
void Advance(absl::Duration d) {
|
||||
clock_offset_ += absl::ToInt64Nanoseconds(d);
|
||||
}
|
||||
|
||||
HugeAllocator alloc_{AllocateFake, MallocMetadata};
|
||||
HugeCache cache_{&alloc_, MallocMetadata, MemoryModifyFunction(MockUnback),
|
||||
Clock{.now = GetClock, .freq = GetClockFrequency}};
|
||||
};
|
||||
|
||||
std::vector<size_t> HugeCacheTest::backing;
|
||||
std::vector<void*> HugeCacheTest::metadata_allocs;
|
||||
size_t HugeCacheTest::metadata_bytes;
|
||||
std::unique_ptr<testing::NiceMock<HugeCacheTest::MockBackingInterface>>
|
||||
HugeCacheTest::mock_;
|
||||
|
||||
int64_t HugeCacheTest::clock_offset_ = 0;
|
||||
|
||||
// Smoke test: take one hugepage from the cache and hand it straight back,
// many times over.
TEST_F(HugeCacheTest, Basic) {
  constexpr int kRounds = 100 * 1000;
  bool from_released;
  for (int round = 0; round != kRounds; ++round) {
    const HugeRange range = cache_.Get(NHugePages(1), &from_released);
    cache_.Release(range);
  }
}
|
||||
|
||||
// A released range should be served back as already-backed memory, whether
// it is requested in pieces or as a whole again.
TEST_F(HugeCacheTest, Backing) {
  bool from_released;

  // Prime the cache with a 4-hugepage range; the first Get is a miss.
  const HugeRange initial = cache_.Get(NHugePages(4), &from_released);
  cache_.Release(initial);
  EXPECT_TRUE(from_released);

  // We should be able to split up a large range...
  const HugeRange three = cache_.Get(NHugePages(3), &from_released);
  EXPECT_FALSE(from_released);
  const HugeRange single = cache_.Get(NHugePages(1), &from_released);
  EXPECT_FALSE(from_released);

  // and then merge it back.
  cache_.Release(three);
  cache_.Release(single);
  const HugeRange merged = cache_.Get(NHugePages(4), &from_released);
  EXPECT_FALSE(from_released);
  cache_.Release(merged);
}
|
||||
|
||||
// ReleaseCachedPages should unback cached ranges through the unback callback
// and report the number of hugepages released when unbacking succeeds.
TEST_F(HugeCacheTest, Release) {
  bool from_released;
  const HugeLength one = NHugePages(1);
  cache_.Release(cache_.Get(NHugePages(5), &from_released));

  // Carve the cached range into five single hugepages and give them back.
  HugeRange ranges[5];
  for (HugeRange& r : ranges) {
    r = cache_.Get(one, &from_released);
  }
  for (const HugeRange& r : ranges) {
    cache_.Release(r);
  }

  // Fetch them again; every request must now be a cache hit.
  for (HugeRange& r : ranges) {
    r = cache_.Get(one, &from_released);
    ASSERT_EQ(false, from_released);
  }
  const HugeRange& r1 = ranges[0];
  const HugeRange& r2 = ranges[1];
  const HugeRange& r3 = ranges[2];
  const HugeRange& r4 = ranges[3];
  const HugeRange& r5 = ranges[4];

  cache_.Release(r1);
  cache_.Release(r2);
  cache_.Release(r5);

  ASSERT_EQ(NHugePages(3), cache_.size());
  EXPECT_CALL(*mock_, Unback(r5.start_addr(), kHugePageSize * 1))
      .WillOnce(Return(true));
  EXPECT_EQ(NHugePages(1), cache_.ReleaseCachedPages(NHugePages(1)));

  cache_.Release(r3);
  cache_.Release(r4);

  EXPECT_CALL(*mock_, Unback(r1.start_addr(), 4 * kHugePageSize))
      .WillOnce(Return(true));
  EXPECT_EQ(NHugePages(4), cache_.ReleaseCachedPages(NHugePages(200)));
}
|
||||
|
||||
// When the unback callback reports failure, ReleaseCachedPages must report
// that nothing was released.
TEST_F(HugeCacheTest, ReleaseFailure) {
  bool from_released;
  const HugeLength one = NHugePages(1);
  cache_.Release(cache_.Get(NHugePages(5), &from_released));

  // Carve the cached range into five single hugepages and give them back.
  HugeRange ranges[5];
  for (HugeRange& r : ranges) {
    r = cache_.Get(one, &from_released);
  }
  for (const HugeRange& r : ranges) {
    cache_.Release(r);
  }

  // Fetch them again; every request must now be a cache hit.
  for (HugeRange& r : ranges) {
    r = cache_.Get(one, &from_released);
    ASSERT_EQ(false, from_released);
  }
  const HugeRange& r1 = ranges[0];
  const HugeRange& r2 = ranges[1];
  const HugeRange& r3 = ranges[2];
  const HugeRange& r4 = ranges[3];
  const HugeRange& r5 = ranges[4];

  cache_.Release(r1);
  cache_.Release(r2);
  cache_.Release(r5);

  ASSERT_EQ(NHugePages(3), cache_.size());
  EXPECT_CALL(*mock_, Unback(r5.start_addr(), 1 * kHugePageSize))
      .WillOnce(Return(false));
  EXPECT_EQ(NHugePages(0), cache_.ReleaseCachedPages(NHugePages(1)));

  cache_.Release(r3);
  cache_.Release(r4);

  EXPECT_CALL(*mock_, Unback(r1.start_addr(), 5 * kHugePageSize))
      .WillOnce(Return(false));
  EXPECT_EQ(NHugePages(0), cache_.ReleaseCachedPages(NHugePages(200)));
}
|
||||
|
||||
// Regret should be approximately (cached hugepages) x (time spent cached),
// measured in nanoseconds.
TEST_F(HugeCacheTest, Regret) {
  bool from_released;
  cache_.Release(cache_.Get(NHugePages(20), &from_released));
  const HugeLength cached = cache_.size();

  const absl::Duration elapsed = absl::Seconds(20);
  Advance(elapsed);

  // Printing is used purely for its side effect of updating the regret.
  constexpr int kBufferSize = 512;
  char buffer[kBufferSize];
  Printer out(buffer, kBufferSize);
  cache_.Print(&out);

  const uint64_t expected_regret =
      absl::ToInt64Nanoseconds(elapsed) * cached.raw_num();
  // Not exactly accurate since the mock clock advances with real time, and
  // when we measure regret will be updated.
  EXPECT_NEAR(cache_.regret(), expected_regret, expected_regret / 100);
  EXPECT_GE(cache_.regret(), expected_regret);
}
|
||||
|
||||
// AddSpanStats should report one large span per cached range, count all
// cached pages as backed, and produce an average age at least as large as
// the time slept since each release.
TEST_F(HugeCacheTest, Stats) {
  bool from_released;
  HugeRange whole = cache_.Get(NHugePages(1 + 1 + 2 + 1 + 3), &from_released);

  // Split into ranges of 1, 2 and 3 hugepages separated by 1-hugepage
  // spacers; the spacers are never handed back to the cache.
  HugeRange r1, r2, r3, spacer1, spacer2;
  std::tie(r1, spacer1) = Split(whole, NHugePages(1));
  std::tie(spacer1, r2) = Split(spacer1, NHugePages(1));
  std::tie(r2, spacer2) = Split(r2, NHugePages(2));
  std::tie(spacer2, r3) = Split(spacer2, NHugePages(1));
  cache_.Release(r1);
  cache_.Release(r2);
  cache_.Release(r3);

  ASSERT_EQ(NHugePages(6), cache_.size());
  r1 = cache_.Get(NHugePages(1), &from_released);
  ASSERT_EQ(false, from_released);
  r2 = cache_.Get(NHugePages(2), &from_released);
  ASSERT_EQ(false, from_released);
  r3 = cache_.Get(NHugePages(3), &from_released);
  ASSERT_EQ(false, from_released);

  double avg_age;
  size_t spans;
  Length pages_backed;
  Length pages_unbacked;

  // Takes a fresh snapshot of the cache's span statistics into the locals
  // declared above.
  const auto snapshot = [&]() {
    PageAgeHistograms ages(absl::base_internal::CycleClock::Now());
    LargeSpanStats large;
    cache_.AddSpanStats(nullptr, &large, &ages);

    const PageAgeHistograms::Histogram* hist = ages.GetTotalHistogram(false);
    spans = large.spans;
    pages_backed = large.normal_pages;
    pages_unbacked = large.returned_pages;
    avg_age = hist->avg_age();
  };

  cache_.Release(r1);
  absl::SleepFor(absl::Microseconds(5000));
  snapshot();
  EXPECT_EQ(Length(0), pages_unbacked);
  EXPECT_EQ(1, spans);
  EXPECT_EQ(NHugePages(1).in_pages(), pages_backed);
  EXPECT_LE(0.005, avg_age);

  cache_.Release(r2);
  absl::SleepFor(absl::Microseconds(2500));
  snapshot();
  EXPECT_EQ(Length(0), pages_unbacked);
  EXPECT_EQ(2, spans);
  EXPECT_EQ(NHugePages(3).in_pages(), pages_backed);
  EXPECT_LE((0.0075 * 1 + 0.0025 * 2) / (1 + 2), avg_age);

  cache_.Release(r3);
  absl::SleepFor(absl::Microseconds(1250));
  snapshot();
  EXPECT_EQ(Length(0), pages_unbacked);
  EXPECT_EQ(3, spans);
  EXPECT_EQ(NHugePages(6).in_pages(), pages_backed);
  EXPECT_LE((0.00875 * 1 + 0.00375 * 2 + 0.00125 * 3) / (1 + 2 + 3), avg_age);
}
|
||||
|
||||
// Returns num / denom as a floating-point ratio of raw hugepage counts.
static double Frac(HugeLength num, HugeLength denom) {
  const double numerator = static_cast<double>(num.raw_num());
  return numerator / denom.raw_num();
}
|
||||
|
||||
// The cache limit should grow to cover a repeatedly exercised working set,
// without ballooning far beyond it.
TEST_F(HugeCacheTest, Growth) {
  EXPECT_CALL(*mock_, Unback(testing::_, testing::_))
      .WillRepeatedly(Return(true));

  bool released;
  absl::BitGen rng;
  // fragmentation is a bit of a challenge
  std::uniform_int_distribution<size_t> sizes(1, 5);

  // fragment the cache badly.
  std::vector<HugeRange> keep;
  std::vector<HugeRange> drop;
  for (int i = 0; i < 1000; ++i) {
    const bool retained = std::bernoulli_distribution()(rng);
    const HugeRange range = cache_.Get(NHugePages(sizes(rng)), &released);
    (retained ? keep : drop).push_back(range);
  }

  for (auto range : drop) {
    cache_.Release(range);
  }

  // See the TODO in HugeCache::MaybeGrowCache; without this delay,
  // the above fragmentation plays merry havoc with our instrumentation.
  Advance(absl::Seconds(30));

  // Test that our cache can grow to fit a working set.
  HugeLength hot_set_sizes[] = {NHugePages(5), NHugePages(10), NHugePages(100),
                                NHugePages(10000)};

  for (const HugeLength hot : hot_set_sizes) {
    SCOPED_TRACE(absl::StrCat("cache size = ", hot.in_bytes() / 1024.0 / 1024.0,
                              " MiB"));

    // Exercise the cache allocating about <hot> worth of data. After
    // a brief warmup phase, we should do this without needing to back much.
    // Yields {hugepages that needed backing, hugepages requested}.
    const auto run_round = [&]() -> std::pair<HugeLength, HugeLength> {
      HugeLength requested = NHugePages(0);
      HugeLength unbacked = NHugePages(0);
      std::vector<HugeRange> held;
      while (requested < hot) {
        const HugeLength remaining = hot - requested;
        const HugeLength len = std::min(remaining, NHugePages(sizes(rng)));
        requested += len;
        held.push_back(cache_.Get(len, &released));
        if (released) {
          unbacked += len;
        }
      }
      for (auto range : held) {
        cache_.Release(range);
      }
      return {unbacked, requested};
    };

    // warmup - we're allowed to incur misses and be too big.
    run_round();
    run_round();

    HugeLength needed_backing = NHugePages(0);
    HugeLength total = NHugePages(0);
    for (int round = 0; round < 16; ++round) {
      const auto result = run_round();
      needed_backing += result.first;
      total += result.second;

      // Cache shouldn't have just grown arbitrarily
      const HugeLength cached = cache_.size();
      // Allow us 10% slop, but don't get out of bed for tiny caches anyway.
      const double size_ratio = Frac(cached, hot);
      SCOPED_TRACE(absl::StrCat(cached.raw_num(), "hps ",
                                Frac(result.first, result.second)));
      if (size_ratio > 1 && cached > NHugePages(16)) {
        EXPECT_LE(size_ratio, 1.1);
      }
    }

    // approximately, given the randomized sizing...
    const double miss_ratio = Frac(needed_backing, total);
    EXPECT_LE(miss_ratio, 0.3);
  }
}
|
||||
|
||||
// If we repeatedly grow and shrink, but do so very slowly, we should *not*
|
||||
// cache the large variation.
|
||||
TEST_F(HugeCacheTest, SlowGrowthUncached) {
|
||||
EXPECT_CALL(*mock_, Unback(testing::_, testing::_))
|
||||
.WillRepeatedly(Return(true));
|
||||
|
||||
absl::BitGen rng;
|
||||
std::uniform_int_distribution<size_t> sizes(1, 10);
|
||||
for (int i = 0; i < 20; ++i) {
|
||||
std::vector<HugeRange> rs;
|
||||
for (int j = 0; j < 20; ++j) {
|
||||
Advance(absl::Milliseconds(600));
|
||||
bool released;
|
||||
rs.push_back(cache_.Get(NHugePages(sizes(rng)), &released));
|
||||
}
|
||||
HugeLength max_cached = NHugePages(0);
|
||||
for (auto r : rs) {
|
||||
Advance(absl::Milliseconds(600));
|
||||
cache_.Release(r);
|
||||
max_cached = std::max(max_cached, cache_.size());
|
||||
}
|
||||
EXPECT_GE(NHugePages(10), max_cached);
|
||||
}
|
||||
}
|
||||
|
||||
// If very rarely we have a huge increase in usage, it shouldn't be cached.
|
||||
TEST_F(HugeCacheTest, SpikesUncached) {
|
||||
EXPECT_CALL(*mock_, Unback(testing::_, testing::_))
|
||||
.WillRepeatedly(Return(true));
|
||||
|
||||
absl::BitGen rng;
|
||||
std::uniform_int_distribution<size_t> sizes(1, 10);
|
||||
for (int i = 0; i < 20; ++i) {
|
||||
std::vector<HugeRange> rs;
|
||||
for (int j = 0; j < 2000; ++j) {
|
||||
bool released;
|
||||
rs.push_back(cache_.Get(NHugePages(sizes(rng)), &released));
|
||||
}
|
||||
HugeLength max_cached = NHugePages(0);
|
||||
for (auto r : rs) {
|
||||
cache_.Release(r);
|
||||
max_cached = std::max(max_cached, cache_.size());
|
||||
}
|
||||
EXPECT_GE(NHugePages(10), max_cached);
|
||||
Advance(absl::Seconds(30));
|
||||
}
|
||||
}
|
||||
|
||||
// If very rarely we have a huge *decrease* in usage, it *should* be cached.
|
||||
TEST_F(HugeCacheTest, DipsCached) {
|
||||
absl::BitGen rng;
|
||||
std::uniform_int_distribution<size_t> sizes(1, 10);
|
||||
for (int i = 0; i < 20; ++i) {
|
||||
std::vector<HugeRange> rs;
|
||||
HugeLength got = NHugePages(0);
|
||||
HugeLength uncached = NHugePages(0);
|
||||
for (int j = 0; j < 2000; ++j) {
|
||||
bool released;
|
||||
HugeLength n = NHugePages(sizes(rng));
|
||||
rs.push_back(cache_.Get(n, &released));
|
||||
got += n;
|
||||
if (released) uncached += n;
|
||||
}
|
||||
// Most of our time is at high usage...
|
||||
Advance(absl::Seconds(30));
|
||||
// Now immediately release and reallocate.
|
||||
for (auto r : rs) {
|
||||
cache_.Release(r);
|
||||
}
|
||||
|
||||
// warmup
|
||||
if (i >= 2) {
|
||||
EXPECT_GE(0.07, Frac(uncached, got));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Suppose in a previous era of behavior we needed a giant cache,
|
||||
// but now we don't. Do we figure this out promptly?
|
||||
TEST_F(HugeCacheTest, Shrink) {
|
||||
absl::BitGen rng;
|
||||
std::uniform_int_distribution<size_t> sizes(1, 10);
|
||||
for (int i = 0; i < 20; ++i) {
|
||||
std::vector<HugeRange> rs;
|
||||
for (int j = 0; j < 2000; ++j) {
|
||||
HugeLength n = NHugePages(sizes(rng));
|
||||
bool released;
|
||||
rs.push_back(cache_.Get(n, &released));
|
||||
}
|
||||
for (auto r : rs) {
|
||||
cache_.Release(r);
|
||||
}
|
||||
}
|
||||
|
||||
ASSERT_LE(NHugePages(10000), cache_.size());
|
||||
|
||||
for (int i = 0; i < 30; ++i) {
|
||||
// New working set <= 20 pages.
|
||||
Advance(absl::Seconds(1));
|
||||
|
||||
// And do some work.
|
||||
for (int j = 0; j < 100; ++j) {
|
||||
bool released;
|
||||
HugeRange r1 = cache_.Get(NHugePages(sizes(rng)), &released);
|
||||
HugeRange r2 = cache_.Get(NHugePages(sizes(rng)), &released);
|
||||
cache_.Release(r1);
|
||||
cache_.Release(r2);
|
||||
}
|
||||
}
|
||||
|
||||
ASSERT_GE(NHugePages(25), cache_.limit());
|
||||
}
|
||||
|
||||
// usage() should track the total size of ranges handed out and not yet
// released, for both Release and ReleaseUnbacked.
TEST_F(HugeCacheTest, Usage) {
  bool released;

  const auto small = cache_.Get(NHugePages(10), &released);
  EXPECT_EQ(NHugePages(10), cache_.usage());

  const auto large = cache_.Get(NHugePages(100), &released);
  EXPECT_EQ(NHugePages(110), cache_.usage());

  cache_.Release(small);
  EXPECT_EQ(NHugePages(100), cache_.usage());

  // Pretend we unbacked this.
  cache_.ReleaseUnbacked(large);
  EXPECT_EQ(NHugePages(0), cache_.usage());
}
|
||||
|
||||
class MinMaxTrackerTest : public testing::Test {
|
||||
protected:
|
||||
void Advance(absl::Duration d) {
|
||||
clock_ += absl::ToDoubleSeconds(d) * GetFakeClockFrequency();
|
||||
}
|
||||
|
||||
static int64_t FakeClock() { return clock_; }
|
||||
|
||||
static double GetFakeClockFrequency() {
|
||||
return absl::ToDoubleNanoseconds(absl::Seconds(2));
|
||||
}
|
||||
|
||||
private:
|
||||
static int64_t clock_;
|
||||
};
|
||||
|
||||
int64_t MinMaxTrackerTest::clock_{0};
|
||||
|
||||
// Exercises MinMaxTracker: extrema within a single instant, extrema over
// partial windows as time advances, and expiry of old data.
TEST_F(MinMaxTrackerTest, Works) {
  const absl::Duration kDuration = absl::Seconds(2);
  const absl::Duration kHalf = kDuration / 2;
  const absl::Duration kThird = kDuration / 3;
  const absl::Duration kInstant = absl::Nanoseconds(1);

  MinMaxTracker<> tracker{
      Clock{.now = FakeClock, .freq = GetFakeClockFrequency}, kDuration};

  // Without any time passing, the max follows the largest report and the min
  // stays at the smallest.
  tracker.Report(NHugePages(0));
  EXPECT_EQ(NHugePages(0), tracker.MaxOverTime(kDuration));
  EXPECT_EQ(NHugePages(0), tracker.MinOverTime(kDuration));

  tracker.Report(NHugePages(10));
  EXPECT_EQ(NHugePages(10), tracker.MaxOverTime(kDuration));
  EXPECT_EQ(NHugePages(0), tracker.MinOverTime(kDuration));

  tracker.Report(NHugePages(5));
  EXPECT_EQ(NHugePages(10), tracker.MaxOverTime(kDuration));
  EXPECT_EQ(NHugePages(0), tracker.MinOverTime(kDuration));

  tracker.Report(NHugePages(100));
  EXPECT_EQ(NHugePages(100), tracker.MaxOverTime(kDuration));
  EXPECT_EQ(NHugePages(0), tracker.MinOverTime(kDuration));

  // Some tests for advancing time
  Advance(kThird);
  tracker.Report(NHugePages(2));
  EXPECT_EQ(NHugePages(2), tracker.MaxOverTime(kInstant));
  EXPECT_EQ(NHugePages(100), tracker.MaxOverTime(kHalf));
  EXPECT_EQ(NHugePages(100), tracker.MaxOverTime(kDuration));
  EXPECT_EQ(NHugePages(2), tracker.MinOverTime(kInstant));
  EXPECT_EQ(NHugePages(0), tracker.MinOverTime(kHalf));
  EXPECT_EQ(NHugePages(0), tracker.MinOverTime(kDuration));

  Advance(kThird);
  tracker.Report(NHugePages(5));
  EXPECT_EQ(NHugePages(5), tracker.MaxOverTime(kInstant));
  EXPECT_EQ(NHugePages(5), tracker.MaxOverTime(kHalf));
  EXPECT_EQ(NHugePages(100), tracker.MaxOverTime(kDuration));
  EXPECT_EQ(NHugePages(5), tracker.MinOverTime(kInstant));
  EXPECT_EQ(NHugePages(2), tracker.MinOverTime(kHalf));
  EXPECT_EQ(NHugePages(0), tracker.MinOverTime(kDuration));

  // This should annihilate everything.
  Advance(kDuration * 2);
  tracker.Report(NHugePages(1));
  EXPECT_EQ(NHugePages(1), tracker.MaxOverTime(kInstant));
  EXPECT_EQ(NHugePages(1), tracker.MinOverTime(kInstant));
  EXPECT_EQ(NHugePages(1), tracker.MaxOverTime(kDuration));
  EXPECT_EQ(NHugePages(1), tracker.MinOverTime(kDuration));
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
847
src/third_party/tcmalloc/dist/tcmalloc/huge_page_aware_allocator.cc
vendored
Normal file
@ -0,0 +1,847 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "tcmalloc/huge_page_aware_allocator.h"
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <new>
|
||||
|
||||
#include "absl/base/internal/cycleclock.h"
|
||||
#include "absl/base/internal/spinlock.h"
|
||||
#include "absl/base/thread_annotations.h"
|
||||
#include "absl/time/time.h"
|
||||
#include "tcmalloc/common.h"
|
||||
#include "tcmalloc/experiment.h"
|
||||
#include "tcmalloc/experiment_config.h"
|
||||
#include "tcmalloc/huge_allocator.h"
|
||||
#include "tcmalloc/huge_page_filler.h"
|
||||
#include "tcmalloc/huge_pages.h"
|
||||
#include "tcmalloc/internal/environment.h"
|
||||
#include "tcmalloc/internal/lifetime_predictions.h"
|
||||
#include "tcmalloc/internal/logging.h"
|
||||
#include "tcmalloc/internal/optimization.h"
|
||||
#include "tcmalloc/internal/prefetch.h"
|
||||
#include "tcmalloc/lifetime_based_allocator.h"
|
||||
#include "tcmalloc/pagemap.h"
|
||||
#include "tcmalloc/parameters.h"
|
||||
#include "tcmalloc/span.h"
|
||||
#include "tcmalloc/static_vars.h"
|
||||
#include "tcmalloc/stats.h"
|
||||
#include "tcmalloc/system-alloc.h"
|
||||
|
||||
GOOGLE_MALLOC_SECTION_BEGIN
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
|
||||
bool decide_want_hpaa();
|
||||
ABSL_ATTRIBUTE_WEAK int default_want_hpaa();
|
||||
ABSL_ATTRIBUTE_WEAK int default_subrelease();
|
||||
|
||||
bool decide_subrelease() {
|
||||
if (!decide_want_hpaa()) {
|
||||
// Subrelease is off if HPAA is off.
|
||||
return false;
|
||||
}
|
||||
|
||||
const char* e = thread_safe_getenv("TCMALLOC_HPAA_CONTROL");
|
||||
if (e) {
|
||||
switch (e[0]) {
|
||||
case '0':
|
||||
if (default_want_hpaa != nullptr) {
|
||||
int default_hpaa = default_want_hpaa();
|
||||
if (default_hpaa < 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
Log(kLog, __FILE__, __LINE__,
|
||||
"Runtime opt-out from HPAA requires building with "
|
||||
"//tcmalloc:want_no_hpaa."
|
||||
);
|
||||
break;
|
||||
case '1':
|
||||
return false;
|
||||
case '2':
|
||||
return true;
|
||||
default:
|
||||
Crash(kCrash, __FILE__, __LINE__, "bad env var", e);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (default_subrelease != nullptr) {
|
||||
const int decision = default_subrelease();
|
||||
if (decision != 0) {
|
||||
return decision > 0;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Chooses the filler's partial-rerelease policy from the first character of
// the TCMALLOC_PARTIAL_RELEASE_CONTROL environment variable: '0' selects
// Return and '1' selects Retain; any other value is reported through Crash.
// With the variable unset the policy is Retain.
FillerPartialRerelease decide_partial_rerelease() {
  const char* env = thread_safe_getenv("TCMALLOC_PARTIAL_RELEASE_CONTROL");
  if (env != nullptr) {
    switch (env[0]) {
      case '0':
        return FillerPartialRerelease::Return;
      case '1':
        return FillerPartialRerelease::Retain;
      default:
        Crash(kCrash, __FILE__, __LINE__, "bad env var", env);
        break;
    }
  }

  return FillerPartialRerelease::Retain;
}
|
||||
|
||||
// Builds the lifetime-prediction options: parsed from the
// TCMALLOC_LIFETIMES_CONTROL environment variable when it is set, and the
// defaults otherwise.
LifetimePredictionOptions decide_lifetime_predictions() {
  // See LifetimePredictionOptions::FromFlag for a description of the format.
  const char* flag = tcmalloc::tcmalloc_internal::thread_safe_getenv(
      "TCMALLOC_LIFETIMES_CONTROL");

  if (flag == nullptr) {
    return LifetimePredictionOptions::Default();
  }

  return LifetimePredictionOptions::FromFlag(flag);
}
|
||||
|
||||
// Selects the huge-region counting option: kAbandonedCount when either the
// test-only or the regular "use huge regions more often" experiment is
// active, and kSlack otherwise.
HugeRegionCountOption use_huge_region_for_often() {
  if (IsExperimentActive(
          Experiment::TEST_ONLY_TCMALLOC_USE_HUGE_REGIONS_MORE_OFTEN) ||
      IsExperimentActive(Experiment::TCMALLOC_USE_HUGE_REGIONS_MORE_OFTEN)) {
    return HugeRegionCountOption::kAbandonedCount;
  }
  return HugeRegionCountOption::kSlack;
}
|
||||
|
||||
// Some notes: locking discipline here is a bit funny, because
|
||||
// we want to *not* hold the pageheap lock while backing memory.
|
||||
|
||||
// We have here a collection of slightly different allocators each
|
||||
// optimized for slightly different purposes. This file has two main purposes:
|
||||
// - pick the right one for a given allocation
|
||||
// - provide enough data to figure out what we picked last time!
|
||||
|
||||
// Delegating constructor: derives the huge-region policy from the active
// experiments and the lifetime options from the environment.
HugePageAwareAllocator::HugePageAwareAllocator(MemoryTag tag)
    : HugePageAwareAllocator(
          tag, use_huge_region_for_often(), decide_lifetime_predictions()) {}
|
||||
|
||||
// Delegating constructor: the caller chooses the huge-region policy, while the
// lifetime options still come from decide_lifetime_predictions().
HugePageAwareAllocator::HugePageAwareAllocator(
    MemoryTag tag, HugeRegionCountOption use_huge_region_more_often)
    : HugePageAwareAllocator(
          tag, use_huge_region_more_often, decide_lifetime_predictions()) {}
|
||||
|
||||
HugePageAwareAllocator::HugePageAwareAllocator(
|
||||
MemoryTag tag, HugeRegionCountOption use_huge_region_more_often,
|
||||
LifetimePredictionOptions lifetime_options)
|
||||
: PageAllocatorInterface("HugePageAware", tag),
|
||||
filler_(decide_partial_rerelease(),
|
||||
Parameters::separate_allocs_for_few_and_many_objects_spans(),
|
||||
MemoryModifyFunction(SystemRelease)),
|
||||
alloc_(
|
||||
[](MemoryTag tag) {
|
||||
// TODO(ckennelly): Remove the template parameter.
|
||||
switch (tag) {
|
||||
case MemoryTag::kNormal:
|
||||
return AllocAndReport<MemoryTag::kNormal>;
|
||||
case MemoryTag::kNormalP1:
|
||||
return AllocAndReport<MemoryTag::kNormalP1>;
|
||||
case MemoryTag::kSampled:
|
||||
return AllocAndReport<MemoryTag::kSampled>;
|
||||
case MemoryTag::kCold:
|
||||
return AllocAndReport<MemoryTag::kCold>;
|
||||
default:
|
||||
ASSUME(false);
|
||||
__builtin_unreachable();
|
||||
}
|
||||
}(tag),
|
||||
MetaDataAlloc),
|
||||
cache_(HugeCache{&alloc_, MetaDataAlloc,
|
||||
MemoryModifyFunction(UnbackWithoutLock)}),
|
||||
lifetime_allocator_region_alloc_(this),
|
||||
lifetime_allocator_(lifetime_options, &lifetime_allocator_region_alloc_),
|
||||
use_huge_region_more_often_(use_huge_region_more_often) {
|
||||
tracker_allocator_.Init(&tc_globals.arena());
|
||||
region_allocator_.Init(&tc_globals.arena());
|
||||
}
|
||||
|
||||
// Looks up the tracker recorded in the pagemap for hugepage `p`.  The result
// may be nullptr; a non-null tracker is asserted to be located at `p`.
HugePageAwareAllocator::FillerType::Tracker* HugePageAwareAllocator::GetTracker(
    HugePage p) {
  void* raw = tc_globals.pagemap().GetHugepage(p.first_page());
  auto* tracker = reinterpret_cast<FillerType::Tracker*>(raw);
  ASSERT(tracker == nullptr || tracker->location() == p);
  return tracker;
}
|
||||
|
||||
// Records `pt` (which may be nullptr) in the pagemap as the tracker for
// hugepage `p`.
void HugePageAwareAllocator::SetTracker(
    HugePage p, HugePageAwareAllocator::FillerType::Tracker* pt) {
  const PageId first = p.first_page();
  tc_globals.pagemap().SetHugepage(first, pt);
}
|
||||
|
||||
// Creates a tracker for hugepage `p`, carves the first `n` pages out of it,
// registers the tracker in the pagemap and contributes it to the filler.
// Returns the first page of the carved range (asserted to be the first page
// of `p`).
PageId HugePageAwareAllocator::AllocAndContribute(HugePage p, Length n,
                                                  size_t num_objects,
                                                  bool donated) {
  CHECK_CONDITION(p.start_addr() != nullptr);

  // Placement-construct the tracker in storage from the tracker allocator.
  FillerType::Tracker* tracker = tracker_allocator_.New();
  new (tracker)
      FillerType::Tracker(p, absl::base_internal::CycleClock::Now(), donated);
  ASSERT(tracker->longest_free_range() >= n);
  ASSERT(tracker->was_donated() == donated);

  // For a donated page, remember the allocation size so it can be counted in
  // abandoned_count_ once the large allocation is deallocated.
  if (tracker->was_donated()) {
    tracker->set_abandoned_count(n);
  }

  const PageId first = tracker->Get(n).page;
  ASSERT(first == p.first_page());

  SetTracker(p, tracker);
  filler_.Contribute(tracker, donated, num_objects);
  ASSERT(tracker->was_donated() == donated);
  return first;
}
|
||||
|
||||
// Fetches one hugepage from the cache and contributes it to the filler,
// allocating `n` pages from it.  Returns PageId{0} if the cache cannot supply
// a hugepage.
PageId HugePageAwareAllocator::RefillFiller(Length n, size_t num_objects,
                                            bool* from_released) {
  HugeRange fresh = cache_.Get(NHugePages(1), from_released);
  if (!fresh.valid()) {
    return PageId{0};
  }

  // Finalize performs this shrink as well.  Doing it here first matters: if
  // hugepages must be broken up to reach the usage limit, breaking up the
  // remainder of `fresh` after allocating from it would be very bad.  `fresh`
  // is mostly empty, whereas whatever is left in the filler is evidently too
  // fragmented to be useful, so those are the pages we would rather release.
  // Otherwise `fresh` is nearly guaranteed to be released (unless n is very
  // large) and the next allocation would simply repeat the process.
  tc_globals.page_allocator().ShrinkToUsageLimit(n);

  return AllocAndContribute(fresh.start(), n, num_objects, /*donated=*/false);
}
|
||||
|
||||
// Wraps the range of `n` pages starting at `page` in a new Span, registers it
// in the pagemap, records the allocation in the usage statistics and asks the
// page allocator to shrink to its usage limit.  `page` must not be PageId{0}.
Span* HugePageAwareAllocator::Finalize(Length n, size_t num_objects,
                                       PageId page)
    ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
  ASSERT(page != PageId{0});

  Span* span = Span::New(page, n);
  tc_globals.pagemap().Set(page, span);
  ASSERT(!span->sampled());

  info_.RecordAlloc(page, n, num_objects);
  tc_globals.page_allocator().ShrinkToUsageLimit(n);
  return span;
}
|
||||
|
||||
// For anything <= half a huge page, we will unconditionally use the filler
|
||||
// to pack it into a single page. If we need another page, that's fine.
|
||||
Span* HugePageAwareAllocator::AllocSmall(Length n, size_t objects_per_span,
|
||||
bool* from_released) {
|
||||
auto [pt, page] = filler_.TryGet(n, objects_per_span);
|
||||
if (ABSL_PREDICT_TRUE(pt != nullptr)) {
|
||||
*from_released = false;
|
||||
return Finalize(n, objects_per_span, page);
|
||||
}
|
||||
|
||||
page = RefillFiller(n, objects_per_span, from_released);
|
||||
if (ABSL_PREDICT_FALSE(page == PageId{0})) {
|
||||
return nullptr;
|
||||
}
|
||||
return Finalize(n, objects_per_span, page);
|
||||
}
|
||||
|
||||
// Allocation path for requests larger than half a hugepage but no larger than
// a HugeRegion (see LockAndAlloc).  Strategies are tried in order: raw
// hugepages for exact multiples, the filler for sub-hugepage sizes, the
// lifetime-based allocator, existing regions, and finally either raw
// hugepages or a newly added region depending on the slack heuristics below.
Span* HugePageAwareAllocator::AllocLarge(Length n, size_t objects_per_span,
                                         bool* from_released,
                                         LifetimeStats* lifetime_context) {
  // An exact multiple of the hugepage size is pulled from pages directly.
  if (HLFromPages(n).in_pages() == n) {
    return AllocRawHugepages(n, objects_per_span, from_released);
  }

  // A request that fits in a single hugepage tries the filler first.
  if (n < kPagesPerHugePage) {
    auto [tracker, filler_page] = filler_.TryGet(n, objects_per_span);
    if (ABSL_PREDICT_TRUE(tracker != nullptr)) {
      *from_released = false;
      return Finalize(n, objects_per_span, filler_page);
    }
  }

  // Attempt a lifetime-based allocation.
  // TODO(mmaas): Implement tracking if this is subsequently put into a
  // conventional region (currently ignored).
  LifetimeBasedAllocator::AllocationResult lifetime_result =
      lifetime_allocator_.MaybeGet(n, from_released, lifetime_context);

  PageId page;
  // If the object was placed in the lifetime region, hand it out.
  if (lifetime_result.TryGetAllocation(&page)) {
    return Finalize(n, objects_per_span, page);
  }

  // If this binary uses regions (see the comment below), check whether one of
  // them has space available right now.
  if (regions_.MaybeGet(n, &page, from_released)) {
    return Finalize(n, objects_per_span, page);
  }

  // Two options remain: allocate a new region, or go to hugepages directly
  // and hope the slack gets filled by small allocations.  The latter is
  // preferable, since it is typically faster and usually more space
  // efficient, but it is sometimes catastrophic.
  //
  // See https://github.com/google/tcmalloc/tree/master/docs/regions-are-not-optional.md
  //
  // So test directly whether we are in the bad case (almost no binaries are).
  // If not, fall back to direct allocation (and hope we do hit that case!)
  const Length slack = info_.slack();
  const Length donated =
      UseHugeRegionMoreOften() ? abandoned_pages_ + slack : slack;

  // Do not bother at all until the binary is reasonably sized.
  if (donated < HLFromBytes(64 * 1024 * 1024).in_pages()) {
    return AllocRawHugepagesAndMaybeTrackLifetime(
        n, objects_per_span, lifetime_result, from_released);
  }

  // In the vast majority of binaries there are many small allocations that
  // fill slack nicely.  (Fleetwide, the average ratio is 15:1; only a handful
  // of binaries fall below 1:1.)
  //
  // When the experiment that uses huge regions more frequently is enabled,
  // this check is skipped.
  const Length small = info_.small();
  if (slack < small && !UseHugeRegionMoreOften()) {
    return AllocRawHugepagesAndMaybeTrackLifetime(
        n, objects_per_span, lifetime_result, from_released);
  }

  // If a new region cannot be allocated: regions are oversized, so a smaller
  // request may still succeed.
  if (!AddRegion()) {
    return AllocRawHugepagesAndMaybeTrackLifetime(
        n, objects_per_span, lifetime_result, from_released);
  }

  CHECK_CONDITION(regions_.MaybeGet(n, &page, from_released));
  return Finalize(n, objects_per_span, page);
}
|
||||
|
||||
// Fallback used by LockAndAlloc for requests larger than a HugeRegion: simply
// forwards to AllocRawHugepages.
Span* HugePageAwareAllocator::AllocEnormous(Length n, size_t objects_per_span,
                                            bool* from_released) {
  Span* span = AllocRawHugepages(n, objects_per_span, from_released);
  return span;
}
|
||||
|
||||
// Allocates a run of whole hugepages covering `n` pages straight from the
// cache.  If `n` is not a multiple of kPagesPerHugePage, the unused tail of
// the last hugepage is donated to the filler and the returned span is marked
// as donated.  Returns nullptr if the cache cannot supply the range.
Span* HugePageAwareAllocator::AllocRawHugepages(Length n, size_t num_objects,
                                                bool* from_released) {
  const HugeLength needed = HLFromPages(n);

  HugeRange range = cache_.Get(needed, from_released);
  if (!range.valid()) {
    return nullptr;
  }

  // The range covers the request, possibly with slack when n is not a
  // multiple of kPagesPerHugePage.  The hugepage holding the slack is added
  // to the filler, pretending its non-slack portion is a smaller allocation.
  const Length total = needed.in_pages();
  const Length slack = total - n;
  const HugePage first = range.start();
  SetTracker(first, nullptr);
  const HugePage last = first + range.len() - NHugePages(1);

  if (slack == Length(0)) {
    SetTracker(last, nullptr);
    return Finalize(total, num_objects, range.start().first_page());
  }

  ++donated_huge_pages_;

  // Portion of the last hugepage that is actually used by this allocation.
  const Length here = kPagesPerHugePage - slack;
  ASSERT(here > Length(0));
  AllocAndContribute(last, here, num_objects, /*donated=*/true);

  Span* span = Finalize(n, num_objects, range.start().first_page());
  span->set_donated(/*value=*/true);
  return span;
}
|
||||
|
||||
// Performs AllocRawHugepages and, on success, offers the tracker of the
// span's last hugepage to the lifetime allocator if that page is still
// donated.  Returns whatever AllocRawHugepages returned.
Span* HugePageAwareAllocator::AllocRawHugepagesAndMaybeTrackLifetime(
    Length n, size_t num_objects,
    const LifetimeBasedAllocator::AllocationResult& lifetime_alloc,
    bool* from_released) ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
  Span* span = AllocRawHugepages(n, num_objects, from_released);
  if (span == nullptr) {
    return span;
  }

  // If this object has a lifetime prediction and led to a donation, it is
  // added to the tracker so its lifetime can be followed.
  const HugePage last_hugepage = HugePageContaining(span->last_page());
  FillerType::Tracker* tracker = GetTracker(last_hugepage);
  ASSERT(tracker != nullptr);

  // The allocator may shrink the heap in response to allocations, which can
  // cause the page to be subreleased and no longer donated by the time we get
  // here.  Only attach a lifetime tracker (if enabled) when it still is.
  if (ABSL_PREDICT_TRUE(tracker->donated())) {
    lifetime_allocator_.MaybeAddTracker(lifetime_alloc,
                                        tracker->lifetime_tracker());
  }
  return span;
}
|
||||
|
||||
// Calls SystemBack on the full address range covered by `span`.
static void BackSpan(Span* span) {
  void* const start = span->start_address();
  SystemBack(start, span->bytes_in_span());
}
|
||||
|
||||
// public
|
||||
Span* HugePageAwareAllocator::New(Length n, size_t objects_per_span) {
|
||||
CHECK_CONDITION(n > Length(0));
|
||||
bool from_released;
|
||||
Span* s = LockAndAlloc(n, objects_per_span, &from_released);
|
||||
if (s) {
|
||||
// Prefetch for writing, as we anticipate using the memory soon.
|
||||
PrefetchW(s->start_address());
|
||||
// TODO(b/256233439): Improve accuracy of from_released value. The filler
|
||||
// may have subreleased pages and is returning them now.
|
||||
if (from_released) BackSpan(s);
|
||||
}
|
||||
ASSERT(!s || GetMemoryTag(s->start_address()) == tag_);
|
||||
return s;
|
||||
}
|
||||
|
||||
// Takes pageheap_lock and dispatches the request by size to AllocSmall,
// AllocLarge or AllocEnormous.
Span* HugePageAwareAllocator::LockAndAlloc(Length n, size_t objects_per_span,
                                           bool* from_released) {
  // Decide whether lifetime-based allocation may apply and, if so, gather the
  // allocation context before the lock is taken.
  LifetimeStats* lifetime_ctx = lifetime_allocator_.CollectLifetimeContext(n);

  absl::base_internal::SpinLockHolder h(&pageheap_lock);

  // Policy depends on size.  Small requests are packed into single hugepages.
  const bool is_small = n <= kPagesPerHugePage / 2;
  if (is_small) {
    return AllocSmall(n, objects_per_span, from_released);
  }

  // Requests too big for the filler use either a direct hugepage allocation
  // or, when slack is a concern, the regions.
  const bool fits_in_region = n <= HugeRegion::size().in_pages();
  if (fits_in_region) {
    return AllocLarge(n, objects_per_span, from_released, lifetime_ctx);
  }

  // Worst case: directly allocate a run of hugepages.
  return AllocEnormous(n, objects_per_span, from_released);
}
|
||||
|
||||
// public
|
||||
Span* HugePageAwareAllocator::NewAligned(Length n, Length align,
|
||||
size_t objects_per_span) {
|
||||
if (align <= Length(1)) {
|
||||
return New(n, objects_per_span);
|
||||
}
|
||||
|
||||
// we can do better than this, but...
|
||||
// TODO(b/134690769): support higher align.
|
||||
CHECK_CONDITION(align <= kPagesPerHugePage);
|
||||
bool from_released;
|
||||
Span* s;
|
||||
{
|
||||
absl::base_internal::SpinLockHolder h(&pageheap_lock);
|
||||
s = AllocRawHugepages(n, objects_per_span, &from_released);
|
||||
}
|
||||
if (s && from_released) BackSpan(s);
|
||||
ASSERT(!s || GetMemoryTag(s->start_address()) == tag_);
|
||||
return s;
|
||||
}
|
||||
|
||||
// Returns the range [p, p+n) to the filler tracker `pt`.  When the filler's
// Put returns nullptr, only the abandoned-page accounting is updated (if
// `might_abandon` is set).  Otherwise the donation/abandonment counters are
// unwound and the hugepage is released via ReleaseHugepage.
void HugePageAwareAllocator::DeleteFromHugepage(FillerType::Tracker* pt,
                                                PageId p, Length n,
                                                size_t num_objects,
                                                bool might_abandon) {
  if (ABSL_PREDICT_TRUE(filler_.Put(pt, p, n, num_objects) == nullptr)) {
    // This allocation had resulted in a donation to the filler, so its pages
    // are recorded as abandoned.
    if (ABSL_PREDICT_FALSE(might_abandon)) {
      ASSERT(pt->was_donated());
      abandoned_pages_ += pt->abandoned_count();
      pt->set_abandoned(true);
    }
    return;
  }

  if (!pt->was_donated()) {
    ASSERT(pt->abandoned_count() == Length(0));
  } else {
    --donated_huge_pages_;
    if (pt->abandoned()) {
      abandoned_pages_ -= pt->abandoned_count();
      pt->set_abandoned(false);
    }
  }

  lifetime_allocator_.MaybePutTracker(pt->lifetime_tracker(), n);
  ReleaseHugepage(pt);
}
|
||||
|
||||
bool HugePageAwareAllocator::AddRegion() {
|
||||
HugeRange r = alloc_.Get(HugeRegion::size());
|
||||
if (!r.valid()) return false;
|
||||
HugeRegion* region = region_allocator_.New();
|
||||
new (region) HugeRegion(r, MemoryModifyFunction(SystemRelease));
|
||||
regions_.Contribute(region);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns `span` to whichever component it was allocated from: the filler
// (single hugepage), a region, the lifetime allocator, or the hugepage cache
// (raw hugepages, possibly with donated slack on the last hugepage).
//
// `span` is dereferenced unconditionally, so callers must pass a live span
// previously returned by this allocator.
void HugePageAwareAllocator::Delete(Span* span, size_t objects_per_span) {
  ASSERT(!span || GetMemoryTag(span->start_address()) == tag_);
  PageId p = span->first_page();
  HugePage hp = HugePageContaining(p);
  Length n = span->num_pages();
  info_.RecordFree(p, n, objects_per_span);

  // Read everything needed from the span before it is destroyed.
  bool might_abandon = span->donated();
  Span::Delete(span);
  // Clear the descriptor of the page so a second pass through the same page
  // could trigger the check on `span != nullptr` in do_free_pages.
  tc_globals.pagemap().Set(p, nullptr);

  // The tricky part, as with so many allocators: where did we come from?
  // There are several possibilities.
  FillerType::Tracker* pt = GetTracker(hp);
  // a) We got packed by the filler onto a single hugepage - return our
  //    allocation to that hugepage in the filler.
  if (ABSL_PREDICT_TRUE(pt != nullptr)) {
    ASSERT(hp == HugePageContaining(p + n - Length(1)));
    DeleteFromHugepage(pt, p, n, objects_per_span, might_abandon);
    return;
  }

  // b) We got put into a region, possibly crossing hugepages -
  //    return our allocation to the region.
  if (regions_.MaybePut(p, n)) return;
  if (lifetime_allocator_.MaybePut(p, n)) return;

  // c) we came straight from the HugeCache - return straight there.  (We
  //    might have had slack put into the filler - if so, return that virtual
  //    allocation to the filler too!)
  ASSERT(n >= kPagesPerHugePage);
  HugeLength hl = HLFromPages(n);
  HugePage last = hp + hl - NHugePages(1);
  Length slack = hl.in_pages() - n;
  if (slack == Length(0)) {
    ASSERT(GetTracker(last) == nullptr);
  } else {
    pt = GetTracker(last);
    // Validate the tracker before its first use; the check must precede any
    // dereference of pt to be meaningful.
    CHECK_CONDITION(pt != nullptr);
    lifetime_allocator_.MaybePutTracker(pt->lifetime_tracker(), n);
    ASSERT(pt->was_donated());
    // We put the slack into the filler (see AllocRawHugepages.)
    // Handle this page separately as a virtual allocation
    // onto the last hugepage.
    PageId virt = last.first_page();
    Length virt_len = kPagesPerHugePage - slack;
    // We may have used the slack, which would prevent us from returning
    // the entire range now.  If filler returned a Tracker, we are fully empty.
    if (filler_.Put(pt, virt, virt_len, objects_per_span) == nullptr) {
      // Last page isn't empty -- pretend the range was shorter.
      --hl;

      // Note that we abandoned virt_len pages with pt.  These can be reused for
      // other allocations, but this can contribute to excessive slack in the
      // filler.
      abandoned_pages_ += pt->abandoned_count();
      pt->set_abandoned(true);
    } else {
      // Last page was empty - but if we sub-released it, we still
      // have to split it off and release it independently.)
      //
      // We were able to reclaim the donated slack.
      --donated_huge_pages_;
      ASSERT(!pt->abandoned());

      if (pt->released()) {
        --hl;
        ReleaseHugepage(pt);
      } else {
        // Get rid of the tracker *object*, but not the *hugepage* (which is
        // still part of our range.)
        SetTracker(pt->location(), nullptr);
        ASSERT(!pt->lifetime_tracker()->is_tracked());
        tracker_allocator_.Delete(pt);
      }
    }
  }
  cache_.Release({hp, hl});
}
|
||||
|
||||
// Hands the (empty) hugepage tracked by `pt` back to the cache, clears its
// pagemap entry and deletes the tracker object.  A tracker reporting
// released() goes back through ReleaseUnbacked, any other through Release.
void HugePageAwareAllocator::ReleaseHugepage(FillerType::Tracker* pt) {
  ASSERT(pt->used_pages() == Length(0));

  const HugePage where = pt->location();
  HugeRange range = {where, NHugePages(1)};
  SetTracker(pt->location(), nullptr);

  if (!pt->released()) {
    cache_.Release(range);
  } else {
    cache_.ReleaseUnbacked(range);
  }

  ASSERT(!pt->lifetime_tracker()->is_tracked());
  tracker_allocator_.Delete(pt);
}
|
||||
|
||||
// public
|
||||
BackingStats HugePageAwareAllocator::stats() const {
|
||||
BackingStats stats = alloc_.stats();
|
||||
const auto actual_system = stats.system_bytes;
|
||||
stats += cache_.stats();
|
||||
stats += filler_.stats();
|
||||
stats += regions_.stats();
|
||||
stats += lifetime_allocator_.GetRegionStats().value_or(BackingStats());
|
||||
// the "system" (total managed) byte count is wildly double counted,
|
||||
// since it all comes from HugeAllocator but is then managed by
|
||||
// cache/regions/filler. Adjust for that.
|
||||
stats.system_bytes = actual_system;
|
||||
return stats;
|
||||
}
|
||||
|
||||
// public
|
||||
void HugePageAwareAllocator::GetSmallSpanStats(SmallSpanStats* result) {
|
||||
GetSpanStats(result, nullptr, nullptr);
|
||||
}
|
||||
|
||||
// public
|
||||
void HugePageAwareAllocator::GetLargeSpanStats(LargeSpanStats* result) {
|
||||
GetSpanStats(nullptr, result, nullptr);
|
||||
}
|
||||
|
||||
// Resets whichever of `small` / `large` is non-null and then lets the
// hugepage allocator, filler, regions and cache add their span statistics.
// Each argument may be nullptr.
void HugePageAwareAllocator::GetSpanStats(SmallSpanStats* small,
                                          LargeSpanStats* large,
                                          PageAgeHistograms* ages) {
  if (small != nullptr) *small = SmallSpanStats();
  if (large != nullptr) *large = LargeSpanStats();

  alloc_.AddSpanStats(small, large, ages);
  filler_.AddSpanStats(small, large, ages);
  regions_.AddSpanStats(small, large, ages);
  cache_.AddSpanStats(small, large, ages);
}
|
||||
|
||||
// public
|
||||
Length HugePageAwareAllocator::ReleaseAtLeastNPages(Length num_pages) {
|
||||
Length released;
|
||||
released += cache_.ReleaseCachedPages(HLFromPages(num_pages)).in_pages();
|
||||
|
||||
// This is our long term plan but in current state will lead to insufficient
|
||||
// THP coverage. It is however very useful to have the ability to turn this on
|
||||
// for testing.
|
||||
// TODO(b/134690769): make this work, remove the flag guard.
|
||||
if (Parameters::hpaa_subrelease()) {
|
||||
if (released < num_pages) {
|
||||
released += filler_.ReleasePages(
|
||||
num_pages - released,
|
||||
SkipSubreleaseIntervals{
|
||||
.peak_interval = Parameters::filler_skip_subrelease_interval(),
|
||||
.short_interval =
|
||||
Parameters::filler_skip_subrelease_short_interval(),
|
||||
.long_interval =
|
||||
Parameters::filler_skip_subrelease_long_interval()},
|
||||
/*hit_limit*/ false);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(b/134690769):
|
||||
// - perhaps release region?
|
||||
// - refuse to release if we're too close to zero?
|
||||
info_.RecordRelease(num_pages, released);
|
||||
return released;
|
||||
}
|
||||
|
||||
// Converts a byte count into mebibytes (1 MiB = 1048576 bytes).
static double BytesToMiB(size_t bytes) {
  constexpr double kBytesPerMiB = 1048576.0;
  const double as_double = static_cast<double>(bytes);
  return as_double / kBytesPerMiB;
}
|
||||
|
||||
// Prints one line for `label` with the used / free / unmapped split of `s`,
// in MiB.  "Used" is system bytes minus free and unmapped bytes.
static void BreakdownStats(Printer* out, const BackingStats& s,
                           const char* label) {
  const double used_mib =
      BytesToMiB(s.system_bytes - s.free_bytes - s.unmapped_bytes);
  const double free_mib = BytesToMiB(s.free_bytes);
  const double unmapped_mib = BytesToMiB(s.unmapped_bytes);
  out->printf("%s %6.1f MiB used, %6.1f MiB free, %6.1f MiB unmapped\n", label,
              used_mib, free_mib, unmapped_mib);
}
|
||||
|
||||
// Emits a sub-region named `key` under `hpaa` holding the used / free /
// unmapped byte counts of `s`.  "Used" is system bytes minus free and
// unmapped bytes.
static void BreakdownStatsInPbtxt(PbtxtRegion* hpaa, const BackingStats& s,
                                  const char* key) {
  auto section = hpaa->CreateSubRegion(key);
  section.PrintI64("used", s.system_bytes - s.free_bytes - s.unmapped_bytes);
  section.PrintI64("free", s.free_bytes);
  section.PrintI64("unmapped", s.unmapped_bytes);
}
|
||||
|
||||
// public
|
||||
void HugePageAwareAllocator::Print(Printer* out) { Print(out, true); }
|
||||
|
||||
// Prints allocator statistics to *out while holding pageheap_lock.  The
// per-component breakdown and the filler output are always printed; the
// regions, cache, lifetime allocator, hugepage allocator, usage statistics and
// age histograms are printed only when `everything` is true.
void HugePageAwareAllocator::Print(Printer* out, bool everything) {
  SmallSpanStats small;
  LargeSpanStats large;
  BackingStats bstats;
  PageAgeHistograms ages(absl::base_internal::CycleClock::Now());

  absl::base_internal::SpinLockHolder h(&pageheap_lock);
  bstats = stats();
  GetSpanStats(&small, &large, &ages);
  PrintStats("HugePageAware", out, bstats, small, large, everything);

  out->printf(
      "\nHuge page aware allocator components:\n"
      "------------------------------------------------\n");
  out->printf("HugePageAware: breakdown of used / free / unmapped space:\n");

  auto filler_stats = filler_.stats();
  BreakdownStats(out, filler_stats, "HugePageAware: filler ");

  auto region_stats = regions_.stats();
  BreakdownStats(out, region_stats, "HugePageAware: region ");

  // Short-lived region allocations are reported only when enabled.
  auto lifetime_stats = lifetime_allocator_.GetRegionStats();
  if (lifetime_stats.has_value()) {
    BreakdownStats(out, lifetime_stats.value(), "HugePageAware: lifetime");
  }

  auto cache_stats = cache_.stats();
  // Everything in the filler came from the cache, so subtract it to show the
  // amount used by the mutator.
  cache_stats.system_bytes -= filler_stats.system_bytes;
  BreakdownStats(out, cache_stats, "HugePageAware: cache ");

  auto alloc_stats = alloc_.stats();
  // Everything in *all* components came from the allocator, so adjust its
  // total in the same way.
  alloc_stats.system_bytes -=
      (filler_stats + region_stats + lifetime_stats.value_or(BackingStats()) +
       cache_stats)
          .system_bytes;
  BreakdownStats(out, alloc_stats, "HugePageAware: alloc ");
  out->printf("\n");

  out->printf(
      "HugePageAware: filler donations %zu (%zu pages from abandoned "
      "donations)\n",
      donated_huge_pages_.raw_num(), abandoned_pages_.raw_num());

  // Component debug output.  The filler is by far the most important, so
  // (some of) it is printed unconditionally.
  filler_.Print(out, everything);
  out->printf("\n");
  if (everything) {
    regions_.Print(out);
    out->printf("\n");
    cache_.Print(out);
    lifetime_allocator_.Print(out);
    out->printf("\n");
    alloc_.Print(out);
    out->printf("\n");

    // Usage statistics...
    info_.Print(out);

    // ...and age tracking.
    ages.Print("HugePageAware", out);
  }

  out->printf("PARAMETER hpaa_subrelease %d\n",
              Parameters::hpaa_subrelease() ? 1 : 0);
}
|
||||
|
||||
// Emits the allocator statistics into `region` in pbtxt form while holding
// pageheap_lock.  Component data goes into a "huge_page_allocator"
// sub-region.
void HugePageAwareAllocator::PrintInPbtxt(PbtxtRegion* region) {
  SmallSpanStats small;
  LargeSpanStats large;
  PageAgeHistograms ages(absl::base_internal::CycleClock::Now());

  absl::base_internal::SpinLockHolder h(&pageheap_lock);
  GetSpanStats(&small, &large, &ages);
  PrintStatsInPbtxt(region, small, large, ages);

  {
    auto hpaa = region->CreateSubRegion("huge_page_allocator");
    hpaa.PrintBool("using_hpaa", true);
    hpaa.PrintBool("using_hpaa_subrelease", Parameters::hpaa_subrelease());

    // HPAA usage by component.
    auto filler_stats = filler_.stats();
    BreakdownStatsInPbtxt(&hpaa, filler_stats, "filler_usage");

    auto region_stats = regions_.stats();
    BreakdownStatsInPbtxt(&hpaa, region_stats, "region_usage");

    auto cache_stats = cache_.stats();
    // Everything in the filler came from the cache, so subtract it to show
    // the amount used by the mutator.
    cache_stats.system_bytes -= filler_stats.system_bytes;
    BreakdownStatsInPbtxt(&hpaa, cache_stats, "cache_usage");

    auto alloc_stats = alloc_.stats();
    // Everything in *all* components came from the allocator, so adjust its
    // total in the same way.
    alloc_stats.system_bytes -=
        (filler_stats + region_stats + cache_stats).system_bytes;

    auto lifetime_stats = lifetime_allocator_.GetRegionStats();
    if (lifetime_stats.has_value()) {
      alloc_stats.system_bytes -= lifetime_stats.value().system_bytes;
      BreakdownStatsInPbtxt(&hpaa, lifetime_stats.value(),
                            "lifetime_region_usage");
    }

    BreakdownStatsInPbtxt(&hpaa, alloc_stats, "alloc_usage");

    filler_.PrintInPbtxt(&hpaa);
    regions_.PrintInPbtxt(&hpaa);
    cache_.PrintInPbtxt(&hpaa);
    alloc_.PrintInPbtxt(&hpaa);
    lifetime_allocator_.PrintInPbtxt(&hpaa);

    // Usage statistics.
    info_.PrintInPbtxt(&hpaa, "hpaa_stat");

    hpaa.PrintI64("filler_donated_huge_pages", donated_huge_pages_.raw_num());
    hpaa.PrintI64("filler_abandoned_pages", abandoned_pages_.raw_num());
  }
}
|
||||
|
||||
template <MemoryTag tag>
|
||||
AddressRange HugePageAwareAllocator::AllocAndReport(size_t bytes,
|
||||
size_t align) {
|
||||
auto ret = SystemAlloc(bytes, align, tag);
|
||||
if (ret.ptr == nullptr) return ret;
|
||||
const PageId page = PageIdContaining(ret.ptr);
|
||||
const Length page_len = BytesToLengthFloor(ret.bytes);
|
||||
tc_globals.pagemap().Ensure(page, page_len);
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Allocates `bytes` of metadata storage from the global arena.
void* HugePageAwareAllocator::MetaDataAlloc(size_t bytes)
    ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
  void* storage = tc_globals.arena().Alloc(bytes);
  return storage;
}
|
||||
|
||||
// Last-resort release path: memory is needed badly enough that hugepage usage
// may be compromised, so pages are released from the filler with default
// (empty) skip-subrelease intervals and hit_limit set.
Length HugePageAwareAllocator::ReleaseAtLeastNPagesBreakingHugepages(Length n) {
  const Length released = filler_.ReleasePages(n, SkipSubreleaseIntervals{},
                                               /*hit_limit*/ true);
  return released;
}
|
||||
|
||||
bool HugePageAwareAllocator::UnbackWithoutLock(void* start, size_t length) {
|
||||
pageheap_lock.Unlock();
|
||||
const bool ret = SystemRelease(start, length);
|
||||
pageheap_lock.Lock();
|
||||
return ret;
|
||||
}
|
||||
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
GOOGLE_MALLOC_SECTION_END
|
||||
266
src/third_party/tcmalloc/dist/tcmalloc/huge_page_aware_allocator.h
vendored
Normal file
@ -0,0 +1,266 @@
|
||||
// Copyright 2019 The TCMalloc Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// https://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef TCMALLOC_HUGE_PAGE_AWARE_ALLOCATOR_H_
|
||||
#define TCMALLOC_HUGE_PAGE_AWARE_ALLOCATOR_H_
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#include "absl/base/thread_annotations.h"
|
||||
#include "tcmalloc/arena.h"
|
||||
#include "tcmalloc/common.h"
|
||||
#include "tcmalloc/huge_allocator.h"
|
||||
#include "tcmalloc/huge_cache.h"
|
||||
#include "tcmalloc/huge_pages.h"
|
||||
#include "tcmalloc/huge_region.h"
|
||||
#include "tcmalloc/internal/config.h"
|
||||
#include "tcmalloc/internal/logging.h"
|
||||
#include "tcmalloc/lifetime_based_allocator.h"
|
||||
#include "tcmalloc/page_allocator_interface.h"
|
||||
#include "tcmalloc/page_heap_allocator.h"
|
||||
#include "tcmalloc/span.h"
|
||||
#include "tcmalloc/stats.h"
|
||||
#include "tcmalloc/system-alloc.h"
|
||||
|
||||
GOOGLE_MALLOC_SECTION_BEGIN
|
||||
namespace tcmalloc {
|
||||
namespace tcmalloc_internal {
|
||||
|
||||
bool decide_subrelease();
|
||||
|
||||
// Selects which quantity decides when large allocations are served from a
// HugeRegion.
enum class HugeRegionCountOption : bool {
  // Default behavior: the decision is based on slack.  Large allocations come
  // from HugeRegion once slack is greater than 64MB (to ignore small
  // binaries) and greater than the number of small allocations.
  kSlack,
  // Used when the experiment TEST_ONLY_TCMALLOC_USE_HUGE_REGIONS_MORE_OFTEN is
  // enabled: the number of abandoned pages is considered in addition to
  // slack.  HugeRegion is used for large allocations once abandoned pages
  // plus slack exceed 64MB (to ignore small binaries).
  kAbandonedCount
};
|
||||
|
||||
// An implementation of the PageAllocator interface that is hugepage-efficient.
|
||||
// Attempts to pack allocations into full hugepages wherever possible,
|
||||
// and aggressively returns empty ones to the system.
//
// Locking requirements are stated per method through the ABSL thread-safety
// annotations on pageheap_lock.
|
||||
class HugePageAwareAllocator final : public PageAllocatorInterface {
|
||||
public:
|
||||
explicit HugePageAwareAllocator(MemoryTag tag);
|
||||
// For use in testing.
|
||||
HugePageAwareAllocator(MemoryTag tag,
|
||||
HugeRegionCountOption use_huge_region_more_often);
|
||||
HugePageAwareAllocator(MemoryTag tag,
|
||||
HugeRegionCountOption use_huge_region_more_often,
|
||||
LifetimePredictionOptions lifetime_options);
|
||||
~HugePageAwareAllocator() override = default;
|
||||
|
||||
// Allocate a run of "n" pages. Returns nullptr if out of memory.
|
||||
// Caller should not pass "n == 0" -- instead, n should have
|
||||
// been rounded up already.
|
||||
Span* New(Length n, size_t objects_per_span)
|
||||
ABSL_LOCKS_EXCLUDED(pageheap_lock) override;
|
||||
|
||||
// As New, but the returned span is aligned to a <align>-page boundary.
|
||||
// <align> must be a power of two.
|
||||
Span* NewAligned(Length n, Length align, size_t objects_per_span)
|
||||
ABSL_LOCKS_EXCLUDED(pageheap_lock) override;
|
||||
|
||||
// Delete the span "[p, p+n-1]".
|
||||
// REQUIRES: span was returned by earlier call to New() and
|
||||
// has not yet been deleted.
|
||||
void Delete(Span* span, size_t objects_per_span)
|
||||
ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) override;
|
||||
|
||||
// Returns a BackingStats value for this allocator. Caller must hold
// pageheap_lock.
BackingStats stats() const
|
||||
ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) override;
|
||||
|
||||
// Takes a SmallSpanStats out-parameter. Caller must hold pageheap_lock.
void GetSmallSpanStats(SmallSpanStats* result)
|
||||
ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) override;
|
||||
|
||||
// Takes a LargeSpanStats out-parameter. Caller must hold pageheap_lock.
void GetLargeSpanStats(LargeSpanStats* result)
|
||||
ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) override;
|
||||
|
||||
// Try to release at least num_pages for reuse by the OS. Returns
|
||||
// the actual number of pages released, which may be less than
|
||||
// num_pages if there weren't enough pages to release. The result
|
||||
// may also be larger than num_pages since page_heap might decide to
|
||||
// release one large range instead of fragmenting it into two
|
||||
// smaller released and unreleased ranges.
|
||||
Length ReleaseAtLeastNPages(Length num_pages)
|
||||
ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) override;
|
||||
|
||||
// Not an override (no `override` specifier). Caller must hold pageheap_lock.
// NOTE(review): judging by the name, this variant may release pages even
// when doing so splits a hugepage -- confirm against the definition.
Length ReleaseAtLeastNPagesBreakingHugepages(Length n)
|
||||
ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
|
||||
|
||||
// Prints stats about the page heap to *out.
|
||||
void Print(Printer* out) ABSL_LOCKS_EXCLUDED(pageheap_lock) override;
|
||||
|
||||
// Print stats to *out, excluding long/likely uninteresting things
|
||||
// unless <everything> is true.
|
||||
void Print(Printer* out, bool everything) ABSL_LOCKS_EXCLUDED(pageheap_lock);
|
||||
|
||||
// Pbtxt counterpart of Print(); takes the PbtxtRegion to write into.
// Must be called without pageheap_lock held.
void PrintInPbtxt(PbtxtRegion* region)
|
||||
ABSL_LOCKS_EXCLUDED(pageheap_lock) override;
|
||||
|
||||
// Returns the current value of donated_huge_pages_. Caller must hold
// pageheap_lock.
HugeLength DonatedHugePages() const
|
||||
ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
|
||||
return donated_huge_pages_;
|
||||
}
|
||||
|
||||
// Number of pages that have been retained on huge pages by donations that did
|
||||
// not reassemble by the time the larger allocation was deallocated.
|
||||
Length AbandonedPages() const ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
|
||||
return abandoned_pages_;
|
||||
}
|
||||
|
||||
// Read-only pointer to the huge page cache member (cache_).
const HugeCache* cache() const { return &cache_; }
|
||||
|
||||
// Mutable reference to the lifetime-based allocator member.
LifetimeBasedAllocator& lifetime_based_allocator() {
|
||||
return lifetime_allocator_;
|
||||
}
|
||||
|
||||
// Read-only reference to the set of huge regions (regions_). Caller must hold
// pageheap_lock.
const HugeRegionSet<HugeRegion>& region() const
|
||||
ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
|
||||
return regions_;
|
||||
// NOTE(review): the `;` after the closing brace below is redundant.
};
|
||||
|
||||
private:
|
||||
// The filler type, specialised on PageTracker.
typedef HugePageFiller<PageTracker> FillerType;
|
||||
FillerType filler_ ABSL_GUARDED_BY(pageheap_lock);
|
||||
|
||||
// RegionAlloc implementation handed to the lifetime-based allocator. It
// creates HugeRegion objects using the enclosing allocator's alloc_ and
// region_allocator_ members.
class RegionAllocImpl final : public LifetimeBasedAllocator::RegionAlloc {
|
||||
public:
|
||||
// `p` is the enclosing allocator; it is stored, not owned.
explicit RegionAllocImpl(HugePageAwareAllocator* p) : p_(p) {}
|
||||
|
||||
// We need to explicitly instantiate the destructor here so that it gets
|
||||
// placed within GOOGLE_MALLOC_SECTION.
|
||||
~RegionAllocImpl() override {}
|
||||
|
||||
// Returns a newly constructed HugeRegion over *range, or nullptr if no valid
// range is available. If *range is not valid on entry it is first replaced
// by the result of alloc_.Get(n). Caller must hold pageheap_lock.
HugeRegion* AllocRegion(HugeLength n, HugeRange* range) override
|
||||
ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
|
||||
if (!range->valid()) {
|
||||
*range = p_->alloc_.Get(n);
|
||||
}
|
||||
if (!range->valid()) return nullptr;
|
||||
// Storage comes from region_allocator_; the object is built in place.
HugeRegion* region = p_->region_allocator_.New();
|
||||
new (region) HugeRegion(*range, MemoryModifyFunction(SystemRelease));
|
||||
return region;
|
||||
}
|
||||
|
||||
private:
|
||||
// Enclosing allocator (non-owning).
HugePageAwareAllocator* p_;
|
||||
};
|
||||
|
||||
// Calls SystemRelease, but with dropping of pageheap_lock around the call.
|
||||
static ABSL_MUST_USE_RESULT bool UnbackWithoutLock(void* start, size_t length)
|
||||
ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
|
||||
|
||||
HugeRegionSet<HugeRegion> regions_ ABSL_GUARDED_BY(pageheap_lock);
|
||||
|
||||
// Allocators for the tracker and HugeRegion metadata objects.
PageHeapAllocator<FillerType::Tracker> tracker_allocator_
|
||||
ABSL_GUARDED_BY(pageheap_lock);
|
||||
PageHeapAllocator<HugeRegion> region_allocator_
|
||||
ABSL_GUARDED_BY(pageheap_lock);
|
||||
|
||||
// NOTE(review): presumably looks up the tracker recorded for hugepage p --
// confirm against the definition.
FillerType::Tracker* GetTracker(HugePage p);
|
||||
|
||||
// NOTE(review): presumably records pt as the tracker for hugepage p --
// confirm against the definition.
void SetTracker(HugePage p, FillerType::Tracker* pt);
|
||||
|
||||
template <MemoryTag tag>
|
||||
static AddressRange AllocAndReport(size_t bytes, size_t align)
|
||||
ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
|
||||
static void* MetaDataAlloc(size_t bytes);
|
||||
HugeAllocator alloc_ ABSL_GUARDED_BY(pageheap_lock);
|
||||
HugeCache cache_ ABSL_GUARDED_BY(pageheap_lock);
|
||||
|
||||
// donated_huge_pages_ measures the number of huge pages contributed to the
|
||||
// filler from left overs of large huge page allocations. When the large
|
||||
// allocation is deallocated, we decrement this count *if* we were able to
|
||||
// fully reassemble the address range (that is, the partial hugepage did not
|
||||
// get stuck in the filler).
|
||||
HugeLength donated_huge_pages_ ABSL_GUARDED_BY(pageheap_lock);
|
||||
// abandoned_pages_ tracks the number of pages contributed to the filler after
|
||||
// a donating allocation is deallocated but the entire huge page has not been
|
||||
// reassembled.
|
||||
Length abandoned_pages_ ABSL_GUARDED_BY(pageheap_lock);
|
||||
|
||||
// Performs lifetime predictions for large objects and places short-lived
|
||||
// objects into a separate region to reduce filler contention.
//
// Members are initialized in declaration order, so
// lifetime_allocator_region_alloc_ is constructed before lifetime_allocator_.
// NOTE(review): presumably lifetime_allocator_ is given a pointer to it in
// the constructor -- keep this order unless the .cc says otherwise.
|
||||
RegionAllocImpl lifetime_allocator_region_alloc_;
|
||||
LifetimeBasedAllocator lifetime_allocator_;
|
||||
|
||||
// Determines if the experiment is enabled. If enabled, we use
|
||||
// abandoned_pages_ in addition to slack in determining when to use
|
||||
// HugeRegion.
|
||||
const HugeRegionCountOption use_huge_region_more_often_;
|
||||
// True iff the option selected at construction is kAbandonedCount.
bool UseHugeRegionMoreOften() const {
|
||||
return use_huge_region_more_often_ ==
|
||||
HugeRegionCountOption::kAbandonedCount;
|
||||
}
|
||||
|
||||
// Takes out-parameters for small-span stats, large-span stats and page age
// histograms. Caller must hold pageheap_lock.
void GetSpanStats(SmallSpanStats* small, LargeSpanStats* large,
|
||||
PageAgeHistograms* ages)
|
||||
ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
|
||||
|
||||
PageId RefillFiller(Length n, size_t num_objects, bool* from_released)
|
||||
ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
|
||||
|
||||
// Allocate the first <n> from p, and contribute the rest to the filler. If
|
||||
// "donated" is true, the contribution will be marked as coming from the
|
||||
// tail of a multi-hugepage alloc. Returns the allocated section.
|
||||
PageId AllocAndContribute(HugePage p, Length n, size_t num_objects,
|
||||
bool donated)
|
||||
ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
|
||||
// Helpers for New().
|
||||
|
||||
// Carries no lock annotation, unlike the Alloc* helpers below.
// NOTE(review): the name suggests it acquires pageheap_lock itself -- confirm.
Span* LockAndAlloc(Length n, size_t objects_per_span, bool* from_released);
|
||||
|
||||
Span* AllocSmall(Length n, size_t objects_per_span, bool* from_released)
|
||||
ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
|
||||
Span* AllocLarge(Length n, size_t objects_per_span, bool* from_released,
|
||||
LifetimeStats* lifetime_context)
|
||||
ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
|
||||
Span* AllocEnormous(Length n, size_t objects_per_span, bool* from_released)
|
||||
ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
|
||||
|
||||
Span* AllocRawHugepages(Length n, size_t num_objects, bool* from_released)
|
||||
ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
|
||||
|
||||
// Allocates a span and adds a tracker. This span has to be associated with a
|
||||
// filler donation and have an associated page tracker. A tracker will only be
|
||||
// added if there is an associated lifetime prediction.
|
||||
Span* AllocRawHugepagesAndMaybeTrackLifetime(
|
||||
Length n, size_t num_objects,
|
||||
const LifetimeBasedAllocator::AllocationResult& lifetime_alloc,
|
||||
bool* from_released) ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
|
||||
|
||||
bool AddRegion() ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
|
||||
|
||||
void ReleaseHugepage(FillerType::Tracker* pt)
|
||||
ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
|
||||
// Return an allocation from a single hugepage.
|
||||
void DeleteFromHugepage(FillerType::Tracker* pt, PageId p, Length n,
|
||||
size_t num_objects, bool might_abandon)
|
||||
ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
|
||||
|
||||
// Finish an allocation request - give it a span and mark it in the pagemap.
|
||||
Span* Finalize(Length n, size_t num_objects, PageId page);
|
||||
};
|
||||
|
||||
} // namespace tcmalloc_internal
|
||||
} // namespace tcmalloc
|
||||
GOOGLE_MALLOC_SECTION_END
|
||||
|
||||
#endif // TCMALLOC_HUGE_PAGE_AWARE_ALLOCATOR_H_
|
||||