SERVER-85737 add latest TCMalloc from google as allocator option (#18870)

GitOrigin-RevId: 23c89085da2424a0fb91913c42c5d356b6a860df
2024-02-16 13:47:20 -06:00 · 2024-02-16 13:47:20 -06:00 · 13401a4bfe
commit 13401a4bfe
parent bd01a61df2
324 changed files with 77664 additions and 360 deletions
--- a/.bazelignore
+++ b/.bazelignore
@ -3,6 +3,7 @@ src/third_party/grpc
 src/third_party/abseil-cpp
 src/third_party/protobuf
 src/third_party/re2
+src/third_party/tcmalloc

 # Ignore node_modules due to the following error
 # ERROR: in verify_node_modules_ignored:
--- a/84
+++ b/84
@ -105,7 +105,7 @@ def use_system_version_of_library(name):
 # add a new C++ library dependency that may be shimmed out to the system, add it to the below
 # list.
 def using_system_version_of_cxx_libraries():
-    cxx_library_names = ["tcmalloc", "boost"]
+    cxx_library_names = ["tcmalloc-google", "boost", "tcmalloc-gperf"]
    return True in [use_system_version_of_library(x) for x in cxx_library_names]


@ -415,7 +415,7 @@ add_option(

 add_option(
    'allocator',
-    choices=["auto", "system", "tcmalloc", "tcmalloc-experimental"],
+    choices=["auto", "system", "tcmalloc-google", "tcmalloc-gperf"],
    default=build_profile.allocator,
    help='allocator to use (use "auto" for best choice for current platform)',
    type='choice',
@ -485,7 +485,8 @@ for pack in [
    ('protobuf', "Protocol Buffers"),
    ('snappy', ),
    ('stemmer', ),
-    ('tcmalloc', ),
+    ('tcmalloc-google', ),
+    ('tcmalloc-gperf', ),
    ('libunwind', ),
    ('valgrind', ),
    ('wiredtiger', ),
@ -2124,17 +2125,48 @@ env['TARGET_OS_FAMILY'] = 'posix' if env.TargetOSIs('posix') else env.GetTargetO
 # would be nicer to use SetOption here, but you can't reset user
 # options for some strange reason in SCons. Instead, we store this
 # option as a new variable in the environment.
+try:
+    kernel_version = platform.release().split(".")
+    kernel_major = int(kernel_version[0])
+    kernel_minor = int(kernel_version[1])
+except (ValueError, IndexError):
+    print(
+        f"Failed to extract kernel major and minor versions, tcmalloc-google will not be available for use: {kernel_version}"
+    )
+    kernel_major = 0
+    kernel_minor = 0
+
 if get_option('allocator') == "auto":
-    # using an allocator besides system on android would require either fixing or disabling
-    # gperftools on android
-    if env.TargetOSIs('windows') or \
-       env.TargetOSIs('linux') and not env.TargetOSIs('android'):
-        env['MONGO_ALLOCATOR'] = "tcmalloc"
+    if env.TargetOSIs('linux') and env['TARGET_ARCH'] in ('x86_64', 'aarch64'):
+
+        # TODO SERVER-86472 make bazel support both tcmalloc implementations
+        if env.get("BAZEL_BUILD_ENABLED"):
+            env['MONGO_ALLOCATOR'] = "tcmalloc-gperf"
+        else:
+            env['MONGO_ALLOCATOR'] = "tcmalloc-google"
+
+        # Google's tcmalloc uses the membarrier() system call, which was added in Linux 4.3,
+        # so fall back to gperf implementation for older kernels
+        if kernel_major < 4 or (kernel_major == 4 and kernel_minor < 3):
+            env['MONGO_ALLOCATOR'] = "tcmalloc-gperf"
+
+    elif env.TargetOSIs('windows') or (env.TargetOSIs('linux')
+                                       and env['TARGET_ARCH'] in ('ppc64le', 's390x')):
+        env['MONGO_ALLOCATOR'] = "tcmalloc-gperf"
    else:
        env['MONGO_ALLOCATOR'] = "system"
 else:
    env['MONGO_ALLOCATOR'] = get_option('allocator')

+    if env['MONGO_ALLOCATOR'] == "tcmalloc-google":  # explicitly requested: error out instead of falling back
+        if kernel_major < 4 or (kernel_major == 4 and kernel_minor < 3):
+            env.ConfError(
+                f"tcmalloc-google allocator only supported on linux kernel 4.3 or greater: kernel version={platform.release()}"
+            )
+
+if env['MONGO_ALLOCATOR'] == "tcmalloc-google":
+    env.Append(CPPDEFINES=["ABSL_ALLOCATOR_NOTHROW"])
+
 if has_option("cache"):
    if has_option("gcov"):
        env.FatalError("Mixing --cache and --gcov doesn't work correctly yet. See SERVER-11084")
@ -2445,6 +2477,13 @@ if not env.TargetOSIs('windows'):
    env["LINKCOM"] = env["LINKCOM"].replace("$LINKFLAGS", "$PROGLINKFLAGS")
    env["PROGLINKFLAGS"] = ['$LINKFLAGS']

+    # CPPFLAGS is used for assembler commands; the condition below assumes assembler files
+    # will only be assembled directly in libraries and not programs
+    if link_model.startswith("dynamic"):
+        env.Append(CPPFLAGS=["-fPIC"])
+    else:
+        env.Append(CPPFLAGS=["-fPIE"])
+
 # When it is necessary to supply additional SHLINKFLAGS without modifying the toolset default,
 # following appends contents of SHLINKFLAGS_EXTRA variable to the linker command
 env.AppendUnique(SHLINKFLAGS=['$SHLINKFLAGS_EXTRA'])
@ -3070,7 +3109,9 @@ if env.TargetOSIs('posix'):
            # If runtime hardening is requested, then build anything
            # destined for an executable with the necessary flags for PIE.
            env.AppendUnique(
+                PROGCFLAGS=['-fPIE'],
                PROGCCFLAGS=['-fPIE'],
+                PROGCXXFLAGS=['-fPIE'],
                PROGLINKFLAGS=['-pie'],
            )

@ -3102,7 +3143,8 @@ if env.TargetOSIs('posix'):

    # For debug builds with tcmalloc, we need the frame pointer so it can
    # record the stack of allocations.
-    can_nofp &= not (debugBuild and (env['MONGO_ALLOCATOR'] == 'tcmalloc'))
+    can_nofp &= not (debugBuild and
+                     (env['MONGO_ALLOCATOR'] in ['tcmalloc-google', 'tcmalloc-gperf']))

    # Only disable frame pointers if requested
    can_nofp &= ("nofp" in selected_experimental_optimizations)
@ -4116,6 +4158,10 @@ def doConfigure(myenv):
        if not myenv.ToolchainIs('clang', 'gcc'):
            env.FatalError('sanitize is only supported with clang or gcc')

+        # sanitizer libs may inject undefined refs (for hooks) at link time, but
+        # the symbols will be available at runtime via the compiler runtime lib.
+        env.Append(LINKFLAGS='-Wl,--allow-shlib-undefined')
+
        if myenv.ToolchainIs('gcc'):
            # GCC's implementation of ASAN depends on libdl.
            env.Append(LIBS=['dl'])
@ -4157,11 +4203,14 @@ def doConfigure(myenv):
                get_san_lib_path(sanitizer) for sanitizer in sanitizer_list
            ]

+        if 'thread' not in sanitizer_list:
+            env.Append(LINKFLAGS=['-rtlib=compiler-rt', '-unwindlib=libgcc'])
+
        if using_lsan:
            env.FatalError("Please use --sanitize=address instead of --sanitize=leak")

        if (using_asan
-                or using_msan) and env['MONGO_ALLOCATOR'] in ['tcmalloc', 'tcmalloc-experimental']:
+                or using_msan) and env['MONGO_ALLOCATOR'] in ['tcmalloc-google', 'tcmalloc-gperf']:
            # There are multiply defined symbols between the sanitizer and
            # our vendorized tcmalloc.
            env.FatalError("Cannot use --sanitize=address or --sanitize=memory with tcmalloc")
@ -4236,7 +4285,7 @@ def doConfigure(myenv):
        else:
            myenv.ConfError('Failed to enable sanitizers with flag: {0}', sanitizer_option)

-        if get_option('shared-libsan') == 'on':
+        if get_option("shared-libsan") == "on":
            shared_libsan_option = '-shared-libsan'
            if myenv.AddToCCFLAGSIfSupported(shared_libsan_option):
                myenv.Append(LINKFLAGS=[shared_libsan_option])
@ -5279,13 +5328,16 @@ def doConfigure(myenv):

    # 'tcmalloc' needs to be the last library linked. Please, add new libraries before this
    # point.
-    if myenv['MONGO_ALLOCATOR'] == 'tcmalloc':
-        if use_system_version_of_library('tcmalloc'):
-            conf.FindSysLibDep("tcmalloc", ["tcmalloc"])
-    elif myenv['MONGO_ALLOCATOR'] in ['system', 'tcmalloc-experimental']:
+    if myenv['MONGO_ALLOCATOR'] == 'tcmalloc-google':
+        if use_system_version_of_library('tcmalloc-google'):
+            conf.FindSysLibDep("tcmalloc-google", ["tcmalloc"])
+    elif myenv['MONGO_ALLOCATOR'] == 'tcmalloc-gperf':
+        if use_system_version_of_library('tcmalloc-gperf'):
+            conf.FindSysLibDep("tcmalloc-gperf", ["tcmalloc"])
+    elif myenv['MONGO_ALLOCATOR'] in ['system']:
        pass
    else:
-        myenv.FatalError("Invalid --allocator parameter: $MONGO_ALLOCATOR")
+        myenv.FatalError(f"Invalid --allocator parameter: {env['MONGO_ALLOCATOR']}")

    def CheckStdAtomic(context, base_type, extra_message):
        test_body = """
--- a/etc/evergreen_yml_components/variants/sanitizer/test_dev.yml
+++ b/etc/evergreen_yml_components/variants/sanitizer/test_dev.yml
@ -348,6 +348,7 @@ buildvariants:
      archive-mongocryptd-debug
    lang_environment: LANG=C
    san_options: *ubsan_options
+    # TODO SERVER-86610 set --allocator=tcmalloc-google
    compile_flags: >-
      --variables-files=etc/scons/mongodbtoolchain_stable_clang.vars
      --dbg=on
@ -355,6 +356,7 @@ buildvariants:
      --sanitize=undefined
      --ssl
      --ocsp-stapling=off
+      --allocator=tcmalloc-gperf
      -j$(grep -c ^processor /proc/cpuinfo)
      --link-model=dynamic
      --use-diagnostic-latches=on
--- a/etc/evergreen_yml_components/variants/sanitizer/test_dev_master_branch_only.yml
+++ b/etc/evergreen_yml_components/variants/sanitizer/test_dev_master_branch_only.yml
@ -469,12 +469,14 @@ buildvariants:
      archive-mongocryptd-debug
    lang_environment: LANG=C
    san_options: *ubsan_options
+    # TODO SERVER-86610 add tcmalloc-google as the allocator for ubsan
    compile_flags: >-
      --variables-files=etc/scons/mongodbtoolchain_${toolchain_version}_clang.vars
      --dbg=on
      --opt=on
      --sanitize=undefined
      --ssl
+      --allocator=tcmalloc-gperf
      --ocsp-stapling=off
      -j$(grep -c ^processor /proc/cpuinfo)
      --use-diagnostic-latches=on
--- a/site_scons/site_tools/integrate_bazel.py
+++ b/site_scons/site_tools/integrate_bazel.py
@ -522,9 +522,13 @@ def generate(env: SCons.Environment.Environment) -> None:
        else:
            build_mode = f"opt_{mongo_generators.get_opt_options(env)}"  # one of "on", "size", "off"

-        # Deprecate tcmalloc-experimental
-        allocator = "tcmalloc" if env.GetOption(
-            "allocator") == "tcmalloc-experimental" else env.GetOption("allocator")
+        # TODO SERVER-86472 make bazel support both tcmalloc implementations
+        if env.GetOption("allocator") == "tcmalloc-google":
+            env.ConfError("Bazel build currently does not support tcmalloc-google allocator.")
+        if env.GetOption("allocator") == "tcmalloc-gperf":
+            allocator = "tcmalloc"
+        else:
+            allocator = env.GetOption("allocator")

        bazel_internal_flags = [
            f'--//bazel/config:compiler_type={env.ToolchainName()}',
--- a/src/mongo/tools/mongo_tidy_checks/SConscript
+++ b/src/mongo/tools/mongo_tidy_checks/SConscript
@ -38,6 +38,7 @@ env['LINK'] = [f'{base_toolchain_bin}/g++']
 env['SHLINK'] = [f'{base_toolchain_bin}/g++']
 env['CPPPATH'] = [str(tidy_include)]
 env['LIBPATH'] = []
+env['CPPFLAGS'] = []
 env['CCFLAGS'] = [
    '-DGTEST_HAS_RTTI=0',
    '-D_GNU_SOURCE',
--- a/src/mongo/transport/grpc/server.cpp
+++ b/src/mongo/transport/grpc/server.cpp
@ -37,7 +37,7 @@

 #include <src/core/lib/security/credentials/ssl/ssl_credentials.h>
 #include <src/core/lib/security/security_connector/ssl_utils.h>
-#include <src/core/tsi/ssl_transport_security.cc>
+#include <src/core/tsi/ssl_transport_security.h>
 #include <src/cpp/server/secure_server_credentials.h>

 #include "mongo/base/error_codes.h"
--- a/src/mongo/util/SConscript
+++ b/src/mongo/util/SConscript
@ -361,20 +361,38 @@ env.CppUnitTest(
    ],
 )

-if env['MONGO_ALLOCATOR'] in ['tcmalloc', 'tcmalloc-experimental']:
-    tcmspEnv = env.Clone()
-
-    if not use_system_version_of_library('tcmalloc'):
-
-        # Add in the include path for our vendored tcmalloc.
-        tcmspEnv.InjectThirdParty('gperftools')
-
+tcmallocAttrs = None
+for impl in [
+    {
+        # Modern standalone tcmalloc (not gperftools)
+        'options': ['tcmalloc-google'],
+        'sys_name': 'tcmalloc-google',
+        'inject': 'tcmalloc',
+        'cppdefs': ['MONGO_HAVE_GOOGLE_TCMALLOC'],
+    },
+    {
+        # Old gperftools tcmalloc
+        'options': ['tcmalloc-gperf'],
+        'sys_name':
+            'tcmalloc-gperf',
+        'inject':
+            'gperftools',
        # If our changes to tcmalloc are ever upstreamed, this should become set based on a top
        # level configure check, though its effects should still be scoped just to these files.
-        tcmspEnv.Append(CPPDEFINES=[
+        'cppdefs': [
+            'MONGO_HAVE_GPERF_TCMALLOC',
            'MONGO_HAVE_GPERFTOOLS_GET_THREAD_CACHE_SIZE',
            'MONGO_HAVE_GPERFTOOLS_SIZE_CLASS_STATS',
-        ])
+        ],
+    },
+]:
+    if env['MONGO_ALLOCATOR'] in impl['options']:
+        tcmallocAttrs = impl
+if tcmallocAttrs:
+    tcmspEnv = env.Clone()
+    if not use_system_version_of_library(tcmallocAttrs['sys_name']):
+        tcmspEnv.InjectThirdParty(tcmallocAttrs['inject'])
+    tcmspEnv.Append(CPPDEFINES=tcmallocAttrs['cppdefs'])

    if not use_system_version_of_library('valgrind'):
        # Include valgrind since tcmalloc disables itself while running under valgrind
--- a/src/mongo/util/heap_profiler.cpp
+++ b/src/mongo/util/heap_profiler.cpp
@ -27,8 +27,6 @@
 *    it in the license file.
 */

-#include <gperftools/malloc_hook.h>
-
 #include <absl/hash/hash.h>

 // IWYU pragma: no_include "cxxabi.h"
@ -60,6 +58,7 @@
 #include "mongo/logv2/log.h"
 #include "mongo/logv2/log_attr.h"
 #include "mongo/logv2/log_component.h"
+#include "mongo/stdx/unordered_map.h"
 #include "mongo/util/stacktrace.h"
 #include "mongo/util/tcmalloc_parameters_gen.h"

@ -67,11 +66,54 @@
 #include <unistd.h>
 #endif

+#include <MurmurHash3.h>
+
+#ifdef MONGO_HAVE_GPERF_TCMALLOC
+#include <gperftools/malloc_hook.h>
+#endif
+
+#ifdef MONGO_HAVE_GOOGLE_TCMALLOC
+#include <absl/debugging/symbolize.h>
+#include <tcmalloc/malloc_extension.h>
+#endif
+
+#if defined(_POSIX_VERSION) && defined(MONGO_CONFIG_HAVE_EXECINFO_BACKTRACE)
+#include <dlfcn.h>
+#include <execinfo.h>
+#endif
+

 #define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kDefault

-// for dlfcn.h and backtrace
-#if defined(_POSIX_VERSION) && defined(MONGO_CONFIG_HAVE_EXECINFO_BACKTRACE)
+#if defined(_POSIX_VERSION) && defined(MONGO_CONFIG_HAVE_EXECINFO_BACKTRACE) && \
+    (defined(MONGO_HAVE_GPERF_TCMALLOC) || defined(MONGO_HAVE_GOOGLE_TCMALLOC))
+
+
+namespace mongo {
+namespace {
+
+/** Simple wrapper for the demangler, particularly its buffer space. */
+class Demangler {
+public:
+    Demangler() = default;
+    Demangler(const Demangler&) = delete;
+    ~Demangler() {
+        free(_buf);
+    }
+
+    char* operator()(const char* sym) {
+        char* dm = abi::__cxa_demangle(sym, _buf, &_bufSize, &_status);
+        if (dm)
+            _buf = dm;
+        return dm;
+    }
+
+private:
+    size_t _bufSize = 0;
+    char* _buf = nullptr;
+    int _status = 0;
+};
+

 //
 // Sampling heap profiler
@ -149,33 +191,6 @@
 // and acceptable size overhead for the hash tables.
 //

-namespace mongo {
-namespace {
-
-// Simple wrapper for the demangler, particularly its buffer space.
-class Demangler {
-public:
-    Demangler() = default;
-
-    Demangler(const Demangler&) = delete;
-
-    ~Demangler() {
-        free(_buf);
-    }
-
-    char* operator()(const char* sym) {
-        char* dm = abi::__cxa_demangle(sym, _buf, &_bufSize, &_status);
-        if (dm)
-            _buf = dm;
-        return dm;
-    }
-
-private:
-    size_t _bufSize = 0;
-    char* _buf = nullptr;
-    int _status = 0;
-};
-
 // TODO SERVER-44010: Consider replacing this custom implementation with a generic one.
 //
 // Simple hash table maps Key->Value.
@ -201,29 +216,6 @@ using Hash = size_t;

 template <class Key, class Value>
 class HashTable {
-    HashTable(const HashTable&) = delete;
-    HashTable& operator=(const HashTable&) = delete;
-
-private:
-    struct Entry {
-        Key key{};
-        Value value{};
-        std::atomic<Entry*> next{nullptr};  // NOLINT
-        std::atomic<bool> valid{false};     // NOLINT
-        Entry() {}
-    };
-
-    const size_t maxEntries;        // we allocate storage for this many entries on creation
-    std::atomic_size_t numEntries;  // number of entries currently in use  NOLINT
-    size_t numBuckets;              // number of buckets, computed as numEntries * loadFactor
-
-    // pre-allocate buckets and entries
-    std::unique_ptr<std::atomic<Entry*>[]> buckets;  // NOLINT
-    std::unique_ptr<Entry[]> entries;
-
-    std::atomic_size_t nextEntry;  // first entry that's never been used  NOLINT
-    Entry* freeEntry;              // linked list of entries returned to us by removeEntry
-
 public:
    HashTable(size_t maxEntries, int loadFactor)
        : maxEntries(maxEntries),
@ -246,17 +238,15 @@ public:
        } else if (nextEntry < maxEntries) {
            entry = &entries[nextEntry++];
        }
-        if (entry) {
-            entry->next = buckets[hash].load();
-            buckets[hash] = entry;
-            entry->key = key;
-            entry->value = value;
-            entry->valid = true;  // signal that the entry is well-formed and may be traversed
-            numEntries++;
-            return &entry->value;
-        } else {
+        if (!entry)
            return nullptr;
-        }
+        entry->next = buckets[hash].load();
+        buckets[hash] = entry;
+        entry->key = key;
+        entry->value = value;
+        entry->valid = true;  // signal that the entry is well-formed and may be traversed
+        ++numEntries;
+        return &entry->value;
    }

    // Find the entry containing Key in the specified hash bucket.
@ -279,7 +269,7 @@ public:
                entry->valid = false;  // first signal entry is invalid as it may get reused
                entry->next = freeEntry;
                freeEntry = entry;
-                numEntries--;
+                --numEntries;
                break;
            }
        }
@ -292,76 +282,105 @@ public:
    // Note however it is not guaranteed to provide snapshot semantics wrt the set of entries,
    // and caller must ensure safety wrt concurrent updates to the Value of an entry
    template <typename F>
-    void forEach(F f) {
+    void forEach(const F& f) {
        for (size_t i = 0; i < nextEntry; i++) {
-            Entry& entry = entries[i];
-            if (entry.valid)  // only traverse well-formed entries
-                f(entry.key, entry.value);
+            Entry& e = entries[i];
+            if (e.valid)  // only traverse well-formed entries
+                f(e.key, e.value);
        }
    }

    // Determines whether the specified hash bucket is empty. May be called concurrently with
    // insert() and remove(). Concurrent visibility on other threads is guaranteed because
    // buckets[hash] is atomic.
-    bool isEmptyBucket(Hash hash) {
-        hash %= numBuckets;
-        return buckets[hash] == nullptr;
+    bool isEmptyBucket(Hash hash) const {
+        return !buckets[hash % numBuckets];
    }

    // Number of entries.
-    size_t size() {
+    size_t size() const {
        return numEntries;
    }

    // Highwater mark of number of entries used, for reporting stats.
-    size_t maxSizeSeen() {
+    size_t maxSizeSeen() const {
        return nextEntry;
    }

    // Returns total allocated size of the hash table, for reporting stats.
-    size_t memorySizeBytes() {
+    size_t memorySizeBytes() const {
        return numBuckets * sizeof(buckets[0]) + maxEntries * sizeof(entries[0]);
    }
+
+private:
+    struct Entry {
+        Key key{};
+        Value value{};
+        std::atomic<Entry*> next{nullptr};  // NOLINT
+        std::atomic<bool> valid{false};     // NOLINT
+    };
+
+    const size_t maxEntries;        // we allocate storage for this many entries on creation
+    std::atomic_size_t numEntries;  // number of entries currently in use  NOLINT
+    size_t numBuckets;              // number of buckets, computed as numEntries * loadFactor
+
+    // pre-allocate buckets and entries
+    std::unique_ptr<std::atomic<Entry*>[]> buckets;  // NOLINT
+    std::unique_ptr<Entry[]> entries;
+
+    std::atomic_size_t nextEntry;  // first entry that's never been used  NOLINT
+    Entry* freeEntry;              // linked list of entries returned to us by removeEntry
 };

-
+namespace heap_profiler_detail_gperf_tcmalloc {
 class HeapProfiler {
+public:
+    static inline HeapProfiler* heapProfiler;
+
+    HeapProfiler() {
+        // Set sample interval from the parameter.
+        sampleIntervalBytes = HeapProfilingSampleIntervalBytes;
+
+        // This is our only allocator dependency - ifdef and change as
+        // appropriate for other allocators, using hooks or shims.
+        // For tcmalloc we skip two frames that are internal to the allocator
+        // so that the top frame is the public tc_* function.
+        skipStartFrames = 2;
+        skipEndFrames = 0;
+#ifdef MONGO_HAVE_GPERF_TCMALLOC
+        MallocHook::AddNewHook(+[](const void* p, size_t sz) { heapProfiler->_alloc(p, sz); });
+        MallocHook::AddDeleteHook(+[](const void* p) { heapProfiler->_free(p); });
+#endif
+    }
+
+    static void generateServerStatusSection(BSONObjBuilder& builder) {
+        if (heapProfiler)
+            heapProfiler->_generateServerStatusSection(builder);
+    }
+
+    static void start() {
+        heapProfiler = new HeapProfiler();
+    }
+
 private:
-    // 0: sampling internally disabled
-    // 1: sample every allocation - byte accurate but slow and big
-    // >1: sample ever sampleIntervalBytes bytes allocated - less accurate but fast and small
-    std::atomic_size_t sampleIntervalBytes;  // NOLINT
-
-    // guards updates to both object and stack hash tables
-    stdx::mutex hashtable_mutex;  // NOLINT
-    // guards against races updating the StackInfo bson representation
-    stdx::mutex stackinfo_mutex;  // NOLINT
-
-    // cumulative bytes allocated - determines when samples are taken
-    std::atomic_size_t bytesAllocated{0};  // NOLINT
-
-    // estimated currently active bytes - sum of activeBytes for all stacks
-    size_t totalActiveBytes = 0;
-
-    //
-    // Hash table of stacks
-    //
-
    using FrameInfo = void*;  // per-frame information is just the IP

    static const int kMaxStackInfos = 20000;         // max number of unique call sites we handle
    static const int kStackHashTableLoadFactor = 2;  // keep loading <50%
    static const size_t kMaxFramesPerStack = 100;    // max depth of stack

+    static const int kMaxObjInfos = 1024 * 1024;   // maximum tracked allocations
+    static const int kObjHashTableLoadFactor = 4;  // keep hash table loading <25%
+
+    static const int kMaxImportantSamples = 4 * 3600;  // reset every 4 hours at 1Hz
+
    // stack HashTable Key
    struct Stack {
-        size_t numFrames = 0;
-        std::array<FrameInfo, kMaxFramesPerStack> frames;
-        Stack() {}
+        Stack() = default;

-        bool operator==(const Stack& that) {
-            return this->numFrames == that.numFrames &&
-                std::equal(frames.begin(), frames.begin() + numFrames, that.frames.begin());
+        friend bool operator==(const Stack& a, const Stack& b) {
+            return a.numFrames == b.numFrames &&
+                std::equal(a.frames.begin(), a.frames.begin() + a.numFrames, b.frames.begin());
        }

        Hash hash() {
@ -371,61 +390,51 @@ private:
                                     numFrames * sizeof(FrameInfo)};
            return absl::HashOf(dataRange);
        }
+
+        size_t numFrames = 0;
+        std::array<FrameInfo, kMaxFramesPerStack> frames;
    };

    // Stack HashTable Value.
    struct StackInfo {
+        StackInfo() = default;
+        explicit StackInfo(int stackNum) : stackNum(stackNum) {}
+
        int stackNum = 0;        // used for stack short name
        size_t activeBytes = 0;  // number of live allocated bytes charged to this stack
        bool logged = false;     // true when stack has been logged once.
-
-        explicit StackInfo(int stackNum) : stackNum(stackNum) {}
-        StackInfo() {}
    };

-    // The stack HashTable itself.
-    HashTable<Stack, StackInfo> stackHashTable{kMaxStackInfos, kStackHashTableLoadFactor};
+    struct ByPointeeStackNum {
+        bool operator()(const StackInfo* a, const StackInfo* b) const {
+            return a->stackNum < b->stackNum;
+        }
+    };

-    // frames to skip at top and bottom of backtrace when reporting stacks
-    size_t skipStartFrames = 0;
-    size_t skipEndFrames = 0;
-
-
-    //
-    // Hash table of allocated objects.
-    //
-
-    static const int kMaxObjInfos = 1024 * 1024;   // maximum tracked allocations
-    static const int kObjHashTableLoadFactor = 4;  // keep hash table loading <25%
-
-    // Obj HashTable Key.
    struct Obj {
-        const void* objPtr = nullptr;
+        Obj() = default;
        explicit Obj(const void* objPtr) : objPtr(objPtr) {}
-        Obj() {}

-        bool operator==(const Obj& that) {
-            return this->objPtr == that.objPtr;
+        friend bool operator==(const Obj& a, const Obj& b) {
+            return a.objPtr == b.objPtr;
        }

        Hash hash() {
            return absl::HashOf(objPtr);
        }
+
+        const void* objPtr = nullptr;
    };

-    // Obj HashTable Value.
    struct ObjInfo {
-        size_t accountedLen = 0;
-        StackInfo* stackInfo = nullptr;
+        ObjInfo() = default;
        ObjInfo(size_t accountedLen, StackInfo* stackInfo)
            : accountedLen(accountedLen), stackInfo(stackInfo) {}
-        ObjInfo() {}
+
+        size_t accountedLen = 0;
+        StackInfo* stackInfo = nullptr;
    };

-    // The obj HashTable itself.
-    HashTable<Obj, ObjInfo> objHashTable{kMaxObjInfos, kObjHashTableLoadFactor};
-
-
    // If we encounter an error that doesn't allow us to proceed, for
    // example out of space for new hash table entries, we internally
    // disable profiling and then log an error message.
@ -562,26 +571,6 @@ private:
              "stackObj"_attr = builder.done());
    }

-    //
-    // Generate serverStatus section.
-    //
-
-    bool logGeneralStats = true;  // first time only
-
-    // In order to reduce load on ftdc we track the stacks we deem important enough to emit
-    // once a stack is deemed "important" it remains important from that point on.
-    // "Important" is a sticky quality to improve the stability of the set of stacks we emit,
-    // and we always emit them in stackNum order, greatly improving ftdc compression efficiency.
-    struct ImportantStacksOrder {
-        bool operator()(const StackInfo* a, const StackInfo* b) const {
-            return a->stackNum < b->stackNum;
-        }
-    };
-    std::set<const StackInfo*, ImportantStacksOrder> importantStacks;
-
-    int numImportantSamples = 0;                // samples currently included in importantStacks
-    const int kMaxImportantSamples = 4 * 3600;  // reset every 4 hours at default 1 sample / sec
-
    void _generateServerStatusSection(BSONObjBuilder& builder) {
        // compute and log some informational stats first time through
        if (logGeneralStats) {
@ -681,44 +670,218 @@ private:
        }
    }

-    //
-    // Static hooks to give to the allocator.
-    //
+    // 0: sampling internally disabled
+    // 1: sample every allocation - byte accurate but slow and big
+    // >1: sample every sampleIntervalBytes bytes allocated - less accurate but fast and small
+    std::atomic_size_t sampleIntervalBytes;  // NOLINT

-    static void alloc(const void* obj, size_t objLen) {
-        heapProfiler->_alloc(obj, objLen);
-    }
+    // guards updates to both object and stack hash tables
+    stdx::mutex hashtable_mutex;  // NOLINT
+    // guards against races updating the StackInfo bson representation
+    stdx::mutex stackinfo_mutex;  // NOLINT

-    static void free(const void* obj) {
-        heapProfiler->_free(obj);
-    }
+    // cumulative bytes allocated - determines when samples are taken
+    std::atomic_size_t bytesAllocated{0};  // NOLINT

+    // estimated currently active bytes - sum of activeBytes for all stacks
+    size_t totalActiveBytes = 0;
+
+    // The stack HashTable itself.
+    HashTable<Stack, StackInfo> stackHashTable{kMaxStackInfos, kStackHashTableLoadFactor};
+
+    // frames to skip at top and bottom of backtrace when reporting stacks
+    size_t skipStartFrames = 0;
+    size_t skipEndFrames = 0;
+
+    // The obj HashTable itself.
+    HashTable<Obj, ObjInfo> objHashTable{kMaxObjInfos, kObjHashTableLoadFactor};
+
+    bool logGeneralStats = true;  // first time only
+
+    // In order to reduce load on ftdc we track the stacks we deem important enough to emit
+    // once a stack is deemed "important" it remains important from that point on.
+    // "Important" is a sticky quality to improve the stability of the set of stacks we emit,
+    // and we always emit them in stackNum order, greatly improving ftdc compression efficiency.
+    std::set<const StackInfo*, ByPointeeStackNum> importantStacks;
+
+    int numImportantSamples = 0;  // samples currently included in importantStacks
+};
+}  // namespace heap_profiler_detail_gperf_tcmalloc
+
+namespace heap_profiler_detail_tcmalloc {
+#ifdef MONGO_HAVE_GOOGLE_TCMALLOC
+class HeapProfiler {
 public:
-    static HeapProfiler* heapProfiler;
+    static const int kMaxImportantSamples = 4 * 3600;  // reset every 4 hours at 1Hz
+    static inline HeapProfiler* heapProfiler;

    HeapProfiler() {
-        // Set sample interval from the parameter.
        sampleIntervalBytes = HeapProfilingSampleIntervalBytes;
-
-        // This is our only allocator dependency - ifdef and change as
-        // appropriate for other allocators, using hooks or shims.
-        // For tcmalloc we skip two frames that are internal to the allocator
-        // so that the top frame is the public tc_* function.
-        skipStartFrames = 2;
-        skipEndFrames = 0;
-        MallocHook::AddNewHook(alloc);
-        MallocHook::AddDeleteHook(free);
+        // Hand the configured interval to tcmalloc as its profile sampling rate.
+        tcmalloc::MallocExtension::SetProfileSamplingRate(sampleIntervalBytes);
+        // Open the first allocation-profiling session. _generateServerStatusSection() stops the
+        // stored token and immediately starts a replacement on every call, so profileTokens
+        // always holds exactly one live token between calls.
+        auto profileToken = tcmalloc::MallocExtension::StartAllocationProfiling();
+        profileTokens.push_back(std::move(profileToken));
    }

    static void generateServerStatusSection(BSONObjBuilder& builder) {
+        // Leaves builder untouched while the heapProfiler singleton pointer is still null;
+        // otherwise delegates to the instance.
        if (heapProfiler)
            heapProfiler->_generateServerStatusSection(builder);
    }
-};

-//
-// serverStatus section
-//
+    static void start() {
+        // Creates the singleton consulted by generateServerStatusSection(). A second call would
+        // overwrite the pointer without releasing the previous instance.
+        // NOTE(review): nothing in the visible code deletes the instance - presumably it is
+        // meant to live for the whole process; confirm.
+        heapProfiler = new HeapProfiler();
+    }
+
+private:
+    // Bookkeeping for one distinct allocation backtrace seen in a tcmalloc profile sample.
+    struct StackInfo {
+        // Symbolizes every frame of stackSample, stores the result in stackObj and logs it once
+        // under the short name `id`. A frame that cannot be symbolized is recorded as its raw
+        // address instead.
+        StackInfo(const tcmalloc::Profile::Sample& stackSample, int id) {
+            stackNum = id;
+            numFrames = stackSample.depth;
+            // Generate a bson representation of our new stack.
+            BSONArrayBuilder builder;
+            std::string frameString(256, '\0');
+            for (int i = 0; i < stackSample.depth; ++i) {
+                char buf[256];
+                if (!absl::Symbolize(stackSample.stack[i], buf, sizeof(buf))) {
+                    frameString = fmt::format("{}", stackSample.stack[i]);
+                } else {
+                    frameString.assign(buf);
+                }
+                builder.append(frameString);
+            }
+            // Keep the symbolized stack in the member so that stackObj really holds the
+            // representation that is logged, instead of staying empty.
+            stackObj = builder.obj();
+            LOGV2(8592501,
+                  "heapProfile stack",
+                  "stackNum"_attr = stackNum,
+                  "stackObj"_attr = stackObj);
+        }
+
+        int stackNum = 0;  // used for stack short name
+        BSONObj stackObj;  // symbolized representation
+        int numFrames = 0;
+        uint64_t activeBytes = 0;
+    };
+
+    // Comparator ordering StackInfo pointers by ascending stackNum.
+    struct ByStackNum {
+        bool operator()(StackInfo* lhs, StackInfo* rhs) const {
+            return rhs->stackNum > lhs->stackNum;
+        }
+    };
+
+    // Returns the 32-bit MurmurHash3 (seed 0) of the sample's raw frame addresses.
+    uint32_t StackHash(const tcmalloc::Profile::Sample& stackSample) {
+        const auto frameBytes = stackSample.depth * sizeof(void*);
+        uint32_t digest;
+        MurmurHash3_x86_32(stackSample.stack, frameBytes, 0, &digest);
+        return digest;
+    }
+
+    // Appends the heap profiler serverStatus content to `builder`:
+    //   - "stats": active bytes in the current heap snapshot, cumulative sampled bytes
+    //     allocated, and the number of distinct stacks seen so far;
+    //   - "stacks": one "stack<N>" entry, in stackNum order, for every important stack.
+    // As side effects it registers (and logs) stacks seen for the first time, and stops the
+    // running allocation-profiling session and starts a new one.
+    void _generateServerStatusSection(BSONObjBuilder& builder) {
+        // Compute and log some informational stats first time through
+        if (logGeneralStats) {
+            LOGV2(8592504,
+                  "Generating heap profiler serverStatus",
+                  "heapProfilingSampleIntervalBytes"_attr = HeapProfilingSampleIntervalBytes);
+            LOGV2(8592503, "Following stack trace is for heap profiler informational purposes");
+            printStackTrace();
+            logGeneralStats = false;
+        }
+
+        // importantStacks is sticky, but activeBytes is only rewritten for stacks that show up
+        // in the snapshot below. Zero the important stacks first so that one which is no longer
+        // live reports 0 instead of the byte count left over from an earlier call.
+        for (auto&& stackInfo : importantStacks)
+            stackInfo->activeBytes = 0;
+
+        // Get a live snapshot profile of the current heap usage
+        int64_t totalActiveBytes = 0;
+        std::vector<StackInfo*> stackInfos;
+        std::set<StackInfo*, ByStackNum> activeStacks;
+        tcmalloc::MallocExtension::SnapshotCurrent(tcmalloc::ProfileType::kHeap)
+            .Iterate([&](const auto& sample) {
+                totalActiveBytes += sample.sum;
+                // Compute backtrace hash of sample stack
+                uint32_t stackHash = StackHash(sample);
+                StackInfo* stackInfo = stackInfoMap[stackHash];
+                // If this is a new stack, store in our stack map
+                if (!stackInfo) {
+                    stackInfo = new StackInfo(sample, stackInfoMap.size());
+                    stackInfoMap[stackHash] = stackInfo;
+                }
+                // The first sample of a stack in this snapshot overwrites activeBytes; further
+                // samples of the same stack accumulate on top of it.
+                auto activeStackSearch = activeStacks.find(stackInfo);
+                if (activeStackSearch != activeStacks.end()) {
+                    stackInfo->activeBytes += sample.sum;
+                } else {
+                    activeStacks.insert(stackInfo);
+                    stackInfos.push_back(stackInfo);
+                    stackInfo->activeBytes = sample.sum;
+                }
+            });
+
+        // Get the series of allocation samples to this point
+        auto currentToken = std::move(profileTokens.back());
+        profileTokens.pop_back();
+        auto allocProfile = std::move(currentToken).Stop();
+        // Start a new allocation profile session for the next invocation
+        auto newToken = tcmalloc::MallocExtension::StartAllocationProfiling();
+        profileTokens.push_back(std::move(newToken));
+
+        // Sum all the allocations performed (of what we sampled)
+        int64_t allocatedBytes = 0;
+        allocProfile.Iterate(
+            [&](const tcmalloc::Profile::Sample& sample) { allocatedBytes += sample.sum; });
+        sampleBytesAllocated += allocatedBytes;
+
+        BSONObjBuilder(builder.subobjStart("stats"))
+            .appendNumber("totalActiveBytes", static_cast<long long>(totalActiveBytes))
+            .appendNumber("bytesAllocated", static_cast<long long>(sampleBytesAllocated))
+            .appendNumber("numStacks", static_cast<long long>(stackInfoMap.size()));
+
+        // Sort the stacks by descending active bytes and take stacks from the top until they
+        // account for more than 99% of the active bytes. Any stack that has ever met this
+        // criterion is deemed "important".
+        std::stable_sort(stackInfos.begin(), stackInfos.end(), [](StackInfo* a, StackInfo* b) {
+            return a->activeBytes > b->activeBytes;
+        });
+        size_t threshold = totalActiveBytes * 0.99;
+        size_t cumulative = 0;
+        for (auto&& stackInfo : stackInfos) {
+            importantStacks.insert(stackInfo);
+            cumulative += stackInfo->activeBytes;
+            if (cumulative > threshold)
+                break;
+        }
+
+        // Build the stacks subsection: emit every important stack, in stackNum order, with the
+        // active bytes it holds in the snapshot taken above.
+        {
+            BSONObjBuilder stacks(builder.subobjStart("stacks"));
+            for (auto&& stackInfo : importantStacks)
+                BSONObjBuilder{stacks.subobjStart(fmt::format("stack{}", stackInfo->stackNum))}
+                    .appendNumber("activeBytes", static_cast<long long>(stackInfo->activeBytes));
+        }
+
+        // importantStacks grows monotonically, so it can accumulate unneeded stacks,
+        // so we clear it periodically.
+        if (++numImportantSamples >= kMaxImportantSamples) {
+            LOGV2(8592502, "Clearing importantStacks");
+            importantStacks.clear();
+            numImportantSamples = 0;
+        }
+    }
+
+    std::vector<tcmalloc::MallocExtension::AllocationProfilingToken> profileTokens;
+    std::atomic_size_t sampleIntervalBytes;
+    std::atomic_size_t sampleBytesAllocated{0};
+
+    bool logGeneralStats = true;  // first time only
+    stdx::unordered_map<uint32_t, StackInfo*> stackInfoMap;
+
+    // In order to reduce load on ftdc we track the stacks we deem important enough to emit
+    // once a stack is deemed "important" it remains important from that point on.
+    // "Important" is a sticky quality to improve the stability of the set of stacks we emit,
+    // and we always emit them in stackNum order, greatly improving ftdc compression efficiency.
+    std::set<StackInfo*, ByStackNum> importantStacks;
+
+    int numImportantSamples = 0;  // samples currently included in importantStacks
+};
+#endif  // MONGO_HAVE_GOOGLE_TCMALLOC
+}  // namespace heap_profiler_detail_tcmalloc
+
+#if defined(MONGO_HAVE_GOOGLE_TCMALLOC)
+using heap_profiler_detail_tcmalloc::HeapProfiler;
+#elif defined(MONGO_HAVE_GPERF_TCMALLOC)
+using heap_profiler_detail_gperf_tcmalloc::HeapProfiler;
+#endif

 class HeapProfilerServerStatusSection final : public ServerStatusSection {
 public:
@ -728,27 +891,26 @@ public:
        return HeapProfilingEnabled;
    }

-    BSONObj generateSection(OperationContext* opCtx,
-                            const BSONElement& configElement) const override {
+    BSONObj generateSection(OperationContext*, const BSONElement&) const override {
        BSONObjBuilder builder;
        HeapProfiler::generateServerStatusSection(builder);
        return builder.obj();
    }
-} heapProfilerServerStatusSection;
+};

-//
-// startup
-//
+#ifdef MONGO_HAVE_HEAP_PROFILER

-HeapProfiler* HeapProfiler::heapProfiler;
+HeapProfilerServerStatusSection heapProfilerServerStatusSection;

 MONGO_INITIALIZER_GENERAL(StartHeapProfiling, ("EndStartupOptionHandling"), ("default"))
-(InitializerContext* context) {
+(InitializerContext*) {
+    // Create the profiler singleton only when heap profiling was enabled.
+    // NOTE(review): listing "EndStartupOptionHandling" as prerequisite presumably guarantees
+    // HeapProfilingEnabled already holds its configured value here - confirm.
    if (HeapProfilingEnabled)
-        HeapProfiler::heapProfiler = new HeapProfiler();
+        HeapProfiler::start();
 }

+#endif  // MONGO_HAVE_HEAP_PROFILER
+
 }  // namespace
 }  // namespace mongo

-#endif  // MONGO_HAVE_HEAP_PROFILER
+#endif  //_POSIX_VERSION
--- a/src/mongo/util/tcmalloc_server_status_section.cpp
+++ b/src/mongo/util/tcmalloc_server_status_section.cpp
@ -28,15 +28,17 @@
 */


+#include "mongo/base/string_data_comparator.h"
 #ifdef _WIN32
 #define NVALGRIND
 #endif

 #include <cstddef>
-#include <gperftools/malloc_extension.h>
 #include <memory>
 #include <utility>

+#include <valgrind/valgrind.h>
+
 #include <boost/optional/optional.hpp>

 #include "mongo/base/error_codes.h"
@ -51,6 +53,14 @@
 #include "mongo/db/tenant_id.h"
 #include "mongo/util/tcmalloc_parameters_gen.h"

+#ifdef MONGO_HAVE_GOOGLE_TCMALLOC
+#include <tcmalloc/malloc_extension.h>
+auto static tcmallocProperties = tcmalloc::MallocExtension::GetProperties();
+#elif defined(MONGO_HAVE_GPERF_TCMALLOC)
+#include <gperftools/malloc_extension.h>
+auto static mallocExtensionAPI = MallocExtension::instance();
+#endif
+
 #define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kDefault


@ -93,74 +103,89 @@ public:

        BSONObjBuilder builder;

+        // Allocator property lookup helpers used to build this section.
+#ifdef MONGO_HAVE_GOOGLE_TCMALLOC
+        // GetProperties() returns a point-in-time copy, so take a fresh one for every section
+        // instead of reading the file-scope tcmallocProperties, which is captured once during
+        // static initialization and would keep reporting startup-time values.
+        const auto currentProperties = tcmalloc::MallocExtension::GetProperties();
+#endif
+
+        // Returns the current value of the allocator property `property`, or boost::none when
+        // the allocator in use does not publish it.
+        auto getValueIfExists = [&](StringData property) -> boost::optional<size_t> {
+#ifdef MONGO_HAVE_GOOGLE_TCMALLOC
+            if (auto value = currentProperties.find(property.toString());
+                value != currentProperties.end()) {
+                return {value->second.value};
+            }
+#elif defined(MONGO_HAVE_GPERF_TCMALLOC)
+            size_t value;
+            if (mallocExtensionAPI->GetNumericProperty(property.rawData(), &value)) {
+                return {value};
+            }
+#endif
+            return boost::none;
+        };
+
+        // Appends `property` to `builder` under `bsonName`; appends nothing when it is missing.
+        auto tryAppend = [&](BSONObjBuilder& builder, StringData bsonName, StringData property) {
+            if (auto value = getValueIfExists(property); !!value) {
+                builder.appendNumber(bsonName, static_cast<long long>(*value));
+            }
+        };
+
+        // Appends the property named "<topic>.<base>" under the field name `base`.
+        auto tryStat = [&](BSONObjBuilder& builder, StringData topic, StringData base) {
+            tryAppend(builder, base, fmt::format("{}.{}", topic, base));
+        };
+
        // For a list of properties see the "Generic Tcmalloc Status" section of
        // http://google-perftools.googlecode.com/svn/trunk/doc/tcmalloc.html and
        // http://code.google.com/p/gperftools/source/browse/src/gperftools/malloc_extension.h
        {
            BSONObjBuilder sub(builder.subobjStart("generic"));
-            appendNumericPropertyIfAvailable(
-                sub, "current_allocated_bytes", "generic.current_allocated_bytes");
-            appendNumericPropertyIfAvailable(sub, "heap_size", "generic.heap_size");
+            tryStat(sub, "generic", "current_allocated_bytes");
+            tryStat(sub, "generic", "heap_size");
        }
        {
            BSONObjBuilder sub(builder.subobjStart("tcmalloc"));
+            auto tryTc = [&](StringData key) {
+                tryStat(sub, "tcmalloc", key);
+            };

-            appendNumericPropertyIfAvailable(
-                sub, "pageheap_free_bytes", "tcmalloc.pageheap_free_bytes");
-            appendNumericPropertyIfAvailable(
-                sub, "pageheap_unmapped_bytes", "tcmalloc.pageheap_unmapped_bytes");
-            appendNumericPropertyIfAvailable(
-                sub, "max_total_thread_cache_bytes", "tcmalloc.max_total_thread_cache_bytes");
-            appendNumericPropertyIfAvailable(sub,
-                                             "current_total_thread_cache_bytes",
-                                             "tcmalloc.current_total_thread_cache_bytes");
-            // Not including tcmalloc.slack_bytes since it is deprecated.
+            tryTc("pageheap_free_bytes");
+            tryTc("pageheap_unmapped_bytes");
+            tryTc("max_total_thread_cache_bytes");
+            tryTc("current_total_thread_cache_bytes");

-            // Calculate total free bytes, *excluding the page heap*
-            size_t central;
-            size_t transfer;
-            size_t thread;
-            if (MallocExtension::instance()->GetNumericProperty("tcmalloc.central_cache_free_bytes",
-                                                                &central) &&
-                MallocExtension::instance()->GetNumericProperty(
-                    "tcmalloc.transfer_cache_free_bytes", &transfer) &&
-                MallocExtension::instance()->GetNumericProperty("tcmalloc.thread_cache_free_bytes",
-                                                                &thread)) {
-                sub.appendNumber("total_free_bytes",
-                                 static_cast<long long>(central) +
-                                     static_cast<long long>(transfer) +
-                                     static_cast<long long>(thread));
+            {
+                // Calculate total free bytes, *excluding the page heap*.
+                // The per-cache counters are published as "tcmalloc.<name>" by google tcmalloc
+                // but as "tcmalloc.<name>_bytes" by gperftools, so probe both spellings;
+                // otherwise these fields disappear on gperftools builds.
+                auto getFreeBytes = [&](StringData property) -> boost::optional<size_t> {
+                    if (auto value = getValueIfExists(property); !!value) {
+                        return value;
+                    }
+                    return getValueIfExists(fmt::format("{}_bytes", property));
+                };
+
+                long long total = 0;
+                if (auto central = getFreeBytes("tcmalloc.central_cache_free"); !!central) {
+                    sub.appendNumber("central_cache_free_bytes", static_cast<long long>(*central));
+                    total += *central;
+                }
+                if (auto transfer = getFreeBytes("tcmalloc.transfer_cache_free"); !!transfer) {
+                    sub.appendNumber("transfer_cache_free_bytes",
+                                     static_cast<long long>(*transfer));
+                    total += *transfer;
+                }
+                if (auto thread = getFreeBytes("tcmalloc.thread_cache_free"); !!thread) {
+                    sub.appendNumber("thread_cache_free_bytes", static_cast<long long>(*thread));
+                    total += *thread;
+                }
+                // No gperftools counterpart is looked up elsewhere in this file; the field is
+                // simply omitted when the allocator does not publish it.
+                if (auto cpu = getFreeBytes("tcmalloc.cpu_free"); !!cpu) {
+                    sub.appendNumber("cpu_cache_free_bytes", static_cast<long long>(*cpu));
+                    total += *cpu;
+                }
+                // Sum of whichever of the counters above were available.
+                sub.appendNumber("total_free_bytes", total);
            }
-            appendNumericPropertyIfAvailable(
-                sub, "central_cache_free_bytes", "tcmalloc.central_cache_free_bytes");
-            appendNumericPropertyIfAvailable(
-                sub, "transfer_cache_free_bytes", "tcmalloc.transfer_cache_free_bytes");
-            appendNumericPropertyIfAvailable(
-                sub, "thread_cache_free_bytes", "tcmalloc.thread_cache_free_bytes");
-            appendNumericPropertyIfAvailable(
-                sub, "aggressive_memory_decommit", "tcmalloc.aggressive_memory_decommit");

-            appendNumericPropertyIfAvailable(
-                sub, "pageheap_committed_bytes", "tcmalloc.pageheap_committed_bytes");
-            appendNumericPropertyIfAvailable(
-                sub, "pageheap_scavenge_count", "tcmalloc.pageheap_scavenge_count");
-            appendNumericPropertyIfAvailable(
-                sub, "pageheap_commit_count", "tcmalloc.pageheap_commit_count");
-            appendNumericPropertyIfAvailable(
-                sub, "pageheap_total_commit_bytes", "tcmalloc.pageheap_total_commit_bytes");
-            appendNumericPropertyIfAvailable(
-                sub, "pageheap_decommit_count", "tcmalloc.pageheap_decommit_count");
-            appendNumericPropertyIfAvailable(
-                sub, "pageheap_total_decommit_bytes", "tcmalloc.pageheap_total_decommit_bytes");
-            appendNumericPropertyIfAvailable(
-                sub, "pageheap_reserve_count", "tcmalloc.pageheap_reserve_count");
-            appendNumericPropertyIfAvailable(
-                sub, "pageheap_total_reserve_bytes", "tcmalloc.pageheap_total_reserve_bytes");
-            appendNumericPropertyIfAvailable(
-                sub, "spinlock_total_delay_ns", "tcmalloc.spinlock_total_delay_ns");
+            tryTc("aggressive_memory_decommit");

-            auto tcmallocReleaseRate = MallocExtension::instance()->GetMemoryReleaseRate();
-            sub.appendNumber("release_rate", tcmallocReleaseRate);
+            tryTc("pageheap_committed_bytes");
+            tryTc("pageheap_scavenge_count");
+            tryTc("pageheap_commit_count");
+            tryTc("pageheap_total_commit_bytes");
+            tryTc("pageheap_decommit_count");
+            tryTc("pageheap_total_decommit_bytes");
+            tryTc("pageheap_reserve_count");
+            tryTc("pageheap_total_reserve_bytes");
+            tryTc("spinlock_total_delay_ns");
+
+#ifdef MONGO_HAVE_GOOGLE_TCMALLOC
+            // google tcmalloc: background release rate, reported as an integer.
+            sub.appendNumber(
+                "release_rate",
+                static_cast<long long>(tcmalloc::MallocExtension::GetBackgroundReleaseRate()));
+#elif defined(MONGO_HAVE_GPERF_TCMALLOC)
+            // gperftools: keep reporting the memory release rate as before.
+            sub.appendNumber("release_rate", mallocExtensionAPI->GetMemoryReleaseRate());
+#endif

 #if MONGO_HAVE_GPERFTOOLS_SIZE_CLASS_STATS
            if (verbosity >= 2) {
@ -170,31 +195,25 @@ public:

                // Size classes and page heap info is dumped in 1 call so that the performance
                // sensitive tcmalloc page heap lock is only taken once
-                MallocExtension::instance()->SizeClasses(
-                    &builders, appendSizeClassInfo, appendPageHeapInfo);
+                mallocExtensionAPI->SizeClasses(&builders, appendSizeClassInfo, appendPageHeapInfo);

                builders.first.done();
                builder.append("page_heap", builders.second.arr());
            }
 #endif
-
+#ifdef MONGO_HAVE_GOOGLE_TCMALLOC
+            builder.append("formattedString", tcmalloc::MallocExtension::GetStats());
+#elif defined(MONGO_HAVE_GPERF_TCMALLOC)
            char buffer[4096];
-            MallocExtension::instance()->GetStats(buffer, sizeof buffer);
+            mallocExtensionAPI->GetStats(buffer, sizeof buffer);
            builder.append("formattedString", buffer);
+#endif
        }

        return builder.obj();
    }

 private:
-    static void appendNumericPropertyIfAvailable(BSONObjBuilder& builder,
-                                                 StringData bsonName,
-                                                 const char* property) {
-        size_t value;
-        if (MallocExtension::instance()->GetNumericProperty(property, &value))
-            builder.appendNumber(bsonName, static_cast<long long>(value));
-    }
-
 #if MONGO_HAVE_GPERFTOOLS_SIZE_CLASS_STATS
    static void appendSizeClassInfo(void* bsonarr_builder, const base::MallocSizeClass* stats) {
        BSONArrayBuilder& builder =
--- a/src/mongo/util/tcmalloc_set_parameter.cpp
+++ b/src/mongo/util/tcmalloc_set_parameter.cpp
@ -33,7 +33,6 @@

 #include <algorithm>
 #include <cstdlib>
-#include <gperftools/malloc_extension.h>
 #include <limits>
 #include <string>
 #include <valgrind/valgrind.h>
@ -58,11 +57,23 @@
 #include "mongo/util/str.h"
 #include "mongo/util/tcmalloc_parameters_gen.h"

+#ifdef MONGO_HAVE_GOOGLE_TCMALLOC
+#include <tcmalloc/malloc_extension.h>
+#elif defined(MONGO_HAVE_GPERF_TCMALLOC)
+#include <gperftools/malloc_extension.h>
+#endif
+
 namespace mongo {
 namespace {
+
+// Returns an absl::string_view over the same characters as `s`; nothing is copied.
+constexpr absl::string_view toStringView(StringData s) {
+    return absl::string_view(s.rawData(), s.size());
+}
+
 constexpr auto kMaxTotalThreadCacheBytesPropertyName = "tcmalloc.max_total_thread_cache_bytes"_sd;
 constexpr auto kAggressiveMemoryDecommitPropertyName = "tcmalloc.aggressive_memory_decommit"_sd;

+#if defined(MONGO_HAVE_GPERF_TCMALLOC)
 StatusWith<size_t> getProperty(StringData propname) {
    size_t value;
    if (!MallocExtension::instance()->GetNumericProperty(propname.toString().c_str(), &value)) {
@ -81,6 +92,66 @@ Status setProperty(StringData propname, size_t value) {
    }
    return Status::OK();
 }
+#endif
+
+// Sets the allocator's cap on the total bytes held in thread caches.
+// - google tcmalloc: calls the MallocExtension setter directly.
+// - gperftools: goes through setProperty() and throws (uassert) if that fails.
+// When neither allocator macro is defined the function does nothing.
+void setMaxTotalThreadCacheBytes(size_t cacheSize) {
+#if defined(MONGO_HAVE_GOOGLE_TCMALLOC)
+    tcmalloc::MallocExtension::SetMaxTotalThreadCacheBytes(cacheSize);
+#elif defined(MONGO_HAVE_GPERF_TCMALLOC)
+    uassertStatusOK(setProperty(kMaxTotalThreadCacheBytesPropertyName, cacheSize));
+#endif  // MONGO_HAVE_GPERF_TCMALLOC
+}
+
+
+#ifdef MONGO_HAVE_GOOGLE_TCMALLOC
+// Implement abstraction for the differences between gperftools and new tcmalloc.
+// Wraps MallocExtension::GetNumericProperty in a "bool + out-parameter" shape: returns true
+// and writes *val when the property exists, returns false and leaves *val alone otherwise.
+bool getNumericProperty(absl::string_view key, size_t* val) {
+    if (auto found = tcmalloc::MallocExtension::GetNumericProperty(key); found) {
+        *val = *found;
+        return true;
+    }
+    return false;
+}
+
+// Reads the numeric tcmalloc property `propname`; a failed lookup becomes InternalError.
+StatusWith<size_t> getProperty(StringData propname) {
+    size_t result;
+    const bool found = getNumericProperty(propname.toString().c_str(), &result);
+    if (found) {
+        return result;
+    }
+    return {ErrorCodes::InternalError,
+            str::stream() << "Failed to retreive tcmalloc prop: " << propname};
+}
+
+// Applies `val` to the property `key`. Only the max-total-thread-cache-bytes property is
+// recognised; returns false for every other key without changing anything.
+bool setNumericProperty(absl::string_view key, size_t val) {
+    const bool supported = (key == toStringView(kMaxTotalThreadCacheBytesPropertyName));
+    if (supported) {
+        setMaxTotalThreadCacheBytes(val);
+    }
+    return supported;
+}
+
+// Sets the tcmalloc property `propname` to `value`. Under valgrind nothing is set and OK is
+// returned; otherwise a rejected property is reported as InternalError.
+Status setProperty(StringData propname, size_t value) {
+    if (RUNNING_ON_VALGRIND) {  // NOLINT
+        return Status::OK();
+    }
+    if (setNumericProperty(propname.toString().c_str(), value)) {
+        return Status::OK();
+    }
+    return {ErrorCodes::InternalError,
+            str::stream() << "Failed to set internal tcmalloc property " << propname};
+}
+
+// Returns tcmalloc's background release rate as a plain integer.
+long long getMemoryReleaseRate() {
+    // Cast straight to the declared return type instead of detouring through size_t.
+    return static_cast<long long>(tcmalloc::MallocExtension::GetBackgroundReleaseRate());
+}
+
+// Sets tcmalloc's background release rate to `val`; always returns true.
+bool setMemoryReleaseRate(size_t val) {
+    const tcmalloc::MallocExtension::BytesPerSecond rate{val};
+    tcmalloc::MallocExtension::SetBackgroundReleaseRate(rate);
+    return true;
+}
+
+#endif
+

 StatusWith<size_t> validateTCMallocValue(StringData name, const BSONElement& newValueElement) {
    if (!newValueElement.isNumber()) {
@ -152,7 +223,20 @@ MONGO_INITIALIZER_GENERAL(TcmallocConfigurationDefaults, (), ("BeginStartupOptio
        (systemMemorySizeMB / 8) * 1024 * 1024;  // 1/8 of system memory in bytes
    size_t cacheSize = std::min(defaultTcMallocCacheSize, derivedTcMallocCacheSize);

-    uassertStatusOK(setProperty(kMaxTotalThreadCacheBytesPropertyName, cacheSize));
+    setMaxTotalThreadCacheBytes(cacheSize);
+
+#if defined(MONGO_HAVE_GOOGLE_TCMALLOC)
+    size_t numCores = pi.getNumAvailableCores();
+    // 1024MB in bytes spread across cores.
+    size_t defaultTcMallocPerCPUCacheSize = (1024 * 1024 * 1024) / numCores;
+    size_t derivedTcMallocPerCPUCacheSize =
+        ((systemMemorySizeMB / 8) * 2 * 1024 * 1024) / numCores;  // 1/4 of system memory in bytes
+
+    size_t perCPUCacheSize =
+        std::min(defaultTcMallocPerCPUCacheSize, derivedTcMallocPerCPUCacheSize);
+
+    tcmalloc::MallocExtension::SetMaxPerCpuCacheSize(perCPUCacheSize);
+#endif  // MONGO_HAVE_GOOGLE_TCMALLOC
 }

 }  // namespace
@ -162,7 +246,11 @@ void TCMallocReleaseRateServerParameter::append(OperationContext*,
                                                BSONObjBuilder* builder,
                                                StringData fieldName,
                                                const boost::optional<TenantId>&) {
+#if defined(MONGO_HAVE_GOOGLE_TCMALLOC)
+    auto value = getMemoryReleaseRate();
+#elif defined(MONGO_HAVE_GPERF_TCMALLOC)
    auto value = MallocExtension::instance()->GetMemoryReleaseRate();
+#endif
    builder->append(fieldName, value);
 }

@ -178,8 +266,11 @@ Status TCMallocReleaseRateServerParameter::setFromString(StringData tcmalloc_rel
                str::stream() << "tcmallocReleaseRate cannot be negative: "
                              << tcmalloc_release_rate};
    }
-
+#if defined(MONGO_HAVE_GOOGLE_TCMALLOC)
+    setMemoryReleaseRate(value);
+#elif defined(MONGO_HAVE_GPERF_TCMALLOC)
    MallocExtension::instance()->SetMemoryReleaseRate(value);
+#endif
    return Status::OK();
 }

--- a/src/third_party/SConscript
+++ b/src/third_party/SConscript
@ -83,10 +83,15 @@ def injectMozJS(thisEnv):

 env.AddMethod(injectMozJS, 'InjectMozJS')

-if not use_system_version_of_library('tcmalloc'):
+if not use_system_version_of_library('tcmalloc-gperf'):
    # GPerftools does this slightly differently than the others.
    thirdPartyEnvironmentModifications['gperftools'] = {}

+if not use_system_version_of_library('tcmalloc-google'):
+    thirdPartyEnvironmentModifications['tcmalloc'] = {
+        'CPPPATH': ['#/src/third_party/tcmalloc/dist'],
+    }
+
 if not use_system_version_of_library('pcre2'):
    thirdPartyEnvironmentModifications['pcre2'] = {
        'CPPPATH': ['#/src/third_party/pcre2/src'],
@ -422,6 +427,12 @@ boostEnv.ShimLibrary(name="boost")

 abseilDirectory = 'abseil-cpp'
 abseilEnv = env.Clone()
+
+# We can't depend on the allocator if we are using tcmalloc, as it depends
+# on us (abseil-cpp).
+if abseilEnv['MONGO_ALLOCATOR'] in ['tcmalloc-google']:
+    abseilEnv = abseilEnv.Clone(LIBDEPS_NO_INHERIT=['$BUILD_DIR/third_party/shim_allocator'])
+
 abseilEnv.InjectThirdParty(libraries=['abseil-cpp'])
 abseilEnv.SConscript(abseilDirectory + '/SConscript', exports={'env': abseilEnv})
 abseilEnv = abseilEnv.Clone(LIBDEPS_INTERFACE=[
@ -510,17 +521,34 @@ if "tom" in env["MONGO_CRYPTO"]:

    tomcryptEnv.ShimLibrary(name="tomcrypt", )

-gperftoolsEnv = env.Clone(LIBDEPS_NO_INHERIT=[
+# tcmallocEnv implements this shim, so it rejects the implicit dependency.
+tcmallocEnv = env.Clone(LIBDEPS_NO_INHERIT=[
    '$BUILD_DIR/third_party/shim_allocator',
 ], )
-if gperftoolsEnv['MONGO_ALLOCATOR'] in ["tcmalloc", "tcmalloc-experimental"]:
-    if use_system_version_of_library("tcmalloc"):
-        gperftoolsEnv = gperftoolsEnv.Clone(SYSLIBDEPS=[
+if tcmallocEnv['MONGO_ALLOCATOR'] in ["tcmalloc-google"]:
+    if use_system_version_of_library("tcmalloc-google"):
+        tcmallocEnv = tcmallocEnv.Clone(SYSLIBDEPS=[
            env['LIBDEPS_TCMALLOC_SYSLIBDEP'],
        ])
    else:
-        gperftoolsEnv = gperftoolsEnv.Clone()
-        gperftoolsEnv.InjectThirdParty(libraries=['gperftools'])
+        tcmallocDirectory = 'tcmalloc'
+        tcmallocEnv = tcmallocEnv.Clone()
+        tcmallocEnv.InjectThirdParty(libraries=['tcmalloc'])
+        tcmallocEnv.SConscript(
+            tcmallocDirectory + '/SConscript',
+            exports={'env': tcmallocEnv},
+        )
+        tcmallocEnv = tcmallocEnv.Clone(LIBDEPS_INTERFACE=[
+            'tcmalloc/tcmalloc',
+        ])
+elif tcmallocEnv['MONGO_ALLOCATOR'] in ["tcmalloc-gperf"]:
+    if use_system_version_of_library("tcmalloc-gperf"):
+        tcmallocEnv = tcmallocEnv.Clone(SYSLIBDEPS=[
+            env['LIBDEPS_TCMALLOC_SYSLIBDEP'],
+        ])
+    else:
+        tcmallocEnv = tcmallocEnv.Clone()
+        tcmallocEnv.InjectThirdParty(libraries=['gperftools'])

        # Allow gperftools to determine its own consumer-side include/ dirs.
        # Needed because those are in a platform-specific subdirectory.
@ -528,16 +556,16 @@ if gperftoolsEnv['MONGO_ALLOCATOR'] in ["tcmalloc", "tcmalloc-experimental"]:
            for k, v in kwargs.items():
                thirdPartyEnvironmentModifications['gperftools'][k] = v

-        gperftoolsEnv.AddMethod(registerConsumerModifications, 'RegisterConsumerModifications')
-        gperftoolsEnv.SConscript(
+        tcmallocEnv.AddMethod(registerConsumerModifications, 'RegisterConsumerModifications')
+        tcmallocEnv.SConscript(
            'gperftools' + '/SConscript',
-            exports={'env': gperftoolsEnv},
+            exports={'env': tcmallocEnv},
        )
-        gperftoolsEnv = gperftoolsEnv.Clone(LIBDEPS_INTERFACE=[
+        tcmallocEnv = tcmallocEnv.Clone(LIBDEPS_INTERFACE=[
            'gperftools/tcmalloc_minimal',
        ])

-gperftoolsEnv.ShimLibrary(
+tcmallocEnv.ShimLibrary(
    name="allocator",
    LIBDEPS_TAGS=[
        # TODO: Remove when SERVER-48291 is merged into stable build tools.
--- a/src/third_party/abseil-cpp/SConscript
+++ b/src/third_party/abseil-cpp/SConscript
@ -10,27 +10,6 @@ if env.ToolchainIs('msvc'):
        CCFLAGS=[],
    )

-if env.GetOption('sanitize') and 'undefined' in env.GetOption('sanitize').split(','):
-    # UBSAN causes the __muloti4 reference to be in the library. This is not defined in libgcc, so
-    # we will just opt out of this check in this third party library. Related issues below:
-    #
-    # abseil issue showing the commit it was introduced
-    # https://github.com/abseil/abseil-cpp/issues/841
-    #
-    # GCC bug saying the symbol is missing
-    # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103034
-    #
-    # LLVM bug saying the symbol requires extra linkage
-    # https://bugs.llvm.org/show_bug.cgi?id=16404
-    env.Append(
-        CCFLAGS=[
-            '-fno-sanitize=signed-integer-overflow',
-        ],
-        LINKFLAGS=[
-            '-fno-sanitize=signed-integer-overflow',
-        ],
-    )
-
 if env.ToolchainIs('gcc'):
    env.Append(CCFLAGS=[
        '-Wno-error=ignored-attributes',
--- a/src/third_party/abseil-cpp/dist/absl/base/config.h
+++ b/src/third_party/abseil-cpp/dist/absl/base/config.h
@ -335,11 +335,7 @@ static_assert(ABSL_INTERNAL_INLINE_NAMESPACE_STR[0] != 'h' ||
 #if (defined(__clang__) && !defined(_WIN32)) || \
    (defined(__CUDACC__) && __CUDACC_VER_MAJOR__ >= 9) ||                \
    (defined(__GNUC__) && !defined(__clang__) && !defined(__CUDACC__))
-#if !ABSL_HAVE_FEATURE(address_sanitizer) && !ABSL_HAVE_FEATURE(memory_sanitizer) && \
-    !ABSL_HAVE_FEATURE(thread_sanitizer) && !ABSL_HAVE_FEATURE(undefined_behavior_sanitizer)
 #define ABSL_HAVE_INTRINSIC_INT128 1
-#endif  // !ABSL_HAVE_FEATURE(address_sanitizer) && !ABSL_HAVE_FEATURE(memory_sanitizer) &&
-        // !ABSL_HAVE_FEATURE(thread_sanitizer) && !ABSL_HAVE_FEATURE(undefined_behavior_sanitizer)
 #elif defined(__CUDACC__)
 // __CUDACC_VER__ is a full version number before CUDA 9, and is defined to a
 // string explaining that it has been removed starting with CUDA 9. We use
--- a/src/third_party/abseil-cpp/scripts/import.sh
+++ b/src/third_party/abseil-cpp/scripts/import.sh
@ -8,7 +8,7 @@ IFS=$'\n\t'
 set -vx

 NAME=abseil-cpp
-REVISION="20230802.1-mongo-20240205"
+REVISION="20230802.1-SERVER-85737"
 VERSION="20230802.1"

 DEST_DIR=$(git rev-parse --show-toplevel)/src/third_party/abseil-cpp
--- a/src/third_party/abseil-cpp/scripts/parse_libs_from_ninja.py
+++ b/src/third_party/abseil-cpp/scripts/parse_libs_from_ninja.py
@ -96,27 +96,6 @@ if env.ToolchainIs('msvc'):
        CCFLAGS=[],
    )

-if env.GetOption('sanitize') and 'undefined' in env.GetOption('sanitize').split(','):
-    # UBSAN causes the __muloti4 reference to be in the library. This is not defined in libgcc, so
-    # we will just opt out of this check in this third party library. Related issues below:
-    #
-    # abseil issue showing the commit it was introduced
-    # https://github.com/abseil/abseil-cpp/issues/841
-    #
-    # GCC bug saying the symbol is missing
-    # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103034
-    #
-    # LLVM bug saying the symbol requires extra linkage
-    # https://bugs.llvm.org/show_bug.cgi?id=16404
-    env.Append(
-        CCFLAGS=[
-            '-fno-sanitize=signed-integer-overflow',
-        ],
-        LINKFLAGS=[
-            '-fno-sanitize=signed-integer-overflow',
-        ],
-    )
-
 if env.ToolchainIs('gcc'):
    env.Append(
        CCFLAGS=[
--- a/src/third_party/grpc/SConscript
+++ b/src/third_party/grpc/SConscript
@ -152,13 +152,6 @@ upb_env.Library(
    ],
 )

-upb_generated_protobuf_descriptor_obj = upb_env.LibraryObject(
-    target="upb_generated_protobuf_descriptor",
-    source=[
-        "dist/src/core/ext/upb-generated/google/protobuf/descriptor.upb.c",
-    ],
-)[0]
-
 upb_env.Library(
    target="upb_wire",
    source=[
@ -230,7 +223,7 @@ upb_env.Library(
        "dist/third_party/upb/upb/reflection/method_def.c",
        "dist/third_party/upb/upb/reflection/oneof_def.c",
        "dist/third_party/upb/upb/reflection/service_def.c",
-        upb_generated_protobuf_descriptor_obj,
+        "dist/src/core/ext/upb-generated/google/protobuf/descriptor.upb.c",
    ],
    LIBDEPS=[
        "upb_collections",
@ -1231,7 +1224,6 @@ grpc_env.Library(
        "dist/src/core/tsi/ssl_transport_security_utils.cc",
        "dist/src/core/tsi/transport_security.cc",
        "dist/src/core/tsi/transport_security_grpc.cc",
-        upb_generated_protobuf_descriptor_obj,
    ],
    OBJPREFIX=env.get('OBJPREFIX', '') + 'grpc_',
    LIBDEPS=[
--- a/src/third_party/tcmalloc/SConscript
+++ b/src/third_party/tcmalloc/SConscript
@ -0,0 +1,173 @@
+# Project: com_google_tcmalloc
+import json
+import re
+import sys
+
+import SCons
+
+Import("env")
+Import("has_option")
+
+# Rebind ``env`` to a clone of the imported environment, created with the
+# tcmalloc-specific construction variables below.
+env = env.Clone(
+    # Building with hidden visibility interferes with intercepting the
+    # libc allocation functions.
+    DISALLOW_VISHIDDEN=True,
+    NINJA_GENSOURCE_INDEPENDENT=True,
+)
+
+if env.Verbose():
+
+    def tcmalloc_scons_print(msg, *args, **kwargs):
+        print("[TCMALLOC_TO_SCONS]: " + msg, *args, **kwargs)
+else:
+
+    def tcmalloc_scons_print(msg, *args, **kwargs):
+        pass
+
+
+# manually switch this for all the debugging
+tcmalloc_extra_debug = False
+
+if tcmalloc_extra_debug:
+
+    def tcmalloc_scons_debug(msg, *args, **kwargs):
+        print("[TCMALLOC_TO_SCONS][DEBUG]: " + msg, *args, **kwargs)
+else:
+
+    def tcmalloc_scons_debug(msg, *args, **kwargs):
+        pass
+
+
+_bazelToSconsMap = dict(
+    (f'@com_google_absl//absl/{k}', [f'$BUILD_DIR/third_party/abseil-cpp/absl_{ve}' for ve in v])
+    for k, v in {
+        'algorithm:container': [],
+        'base:config': [],
+        'base:core_headers': [],
+        'base:dynamic_annotations': [],
+        'container:btree': [],
+        'container:fixed_array': [],
+        'container:flat_hash_map': ['raw_hash_set'],
+        'debugging:leak_check': [],
+        'debugging:stacktrace': ['stacktrace'],
+        'debugging:symbolize': [],
+        'functional:function_ref': [],
+        'base:malloc_internal': ['malloc_internal'],
+        'memory': [],
+        'numeric:bits': [],
+        'numeric:int128': [],
+        'strings:str_format': [],
+        'types:optional': [],
+        'types:span': [],
+    }.items())
+
+sys.path.append(env.Dir('scripts/site-scons').srcnode().abspath)
+from bazel_to_scons import BazelEnv, Label
+
+
+def dumpBazelLibs(baz, target):
+    if tcmalloc_extra_debug:
+        tcmalloc_scons_debug(f"Dumping tcmalloc deps to: '{target}'", file=sys.stderr)
+        with open(target.abspath, 'w') as dump:
+            tcmalloc_scons_debug(
+                json.dumps({'libraries': baz}, sort_keys=True, indent=4), file=dump)
+    else:
+        pass
+
+
+def _remapAbseilDep(label: Label) -> 'list[str]':
+    tcmalloc_scons_print(f'Remap abseilDep {label}', file=sys.stderr)
+    if str(label) in _bazelToSconsMap:
+        out = _bazelToSconsMap[str(label)]
+        tcmalloc_scons_print(f'Remap {label} to {out}', file=sys.stderr)
+        return out
+
+    pkg = label.package().replace('/', '_')
+    tgt = label.target()
+    # bazel expands //foo/bar => //foo/bar:bar implicitly. Use short form
+    if tgt and not pkg.endswith('/' + tgt):
+        tgt = "_" + tgt.replace('/', '_')
+    else:
+        tgt = ''
+    return [f'$BUILD_DIR/third_party/abseil-cpp/{pkg}{tgt}']
+
+
+def findAbslLibs():
+    abslSconscript = env.File('$BUILD_DIR/third_party/abseil-cpp/SConscript').srcnode().abspath
+    tcmalloc_scons_debug(f'abslSconscript={abslSconscript}', file=sys.stderr)
+    abslLibs = []
+    with open(abslSconscript) as inf:
+        lines = (s.strip() for s in inf.readlines())
+        targetRe = re.compile(r"\s*target=['\"](.*)['\"],")
+        for line in lines:
+            m = targetRe.match(line)
+            if m:
+                fq = f'$BUILD_DIR/third_party/abseil-cpp/{m[1]}'
+                tcmalloc_scons_debug(f"found {fq} in {line}", file=sys.stderr)
+                abslLibs.append(fq)
+    return sorted(abslLibs)
+
+
+def _mapDepToScons(lab: str, base: str = '') -> str:
+    if re.match(r'^@com_google_absl//', lab):
+        return _remapAbseilDep(Label(lab))
+    lab = re.sub(r'^:', f'//{Label(base).package()}:', lab)
+    lab = re.sub(r'^//', '', lab)
+    lab = re.sub(r'(.*):(.*)', r'\1_\2', lab)
+    lab = lab.replace("/", "_")
+    return [lab]
+
+
+def slurpBlaze(target, source, exports, env):
+    """Turn the bazel build description under ``dist`` into env.Library calls.
+
+    Runs a ``BazelEnv`` over the ``dist`` source directory, resolves the
+    dependencies of ``exports`` and declares one SCons library per resolved
+    label that is local (no remote), is not listed in ``_bazelToSconsMap``
+    and has a definition in the parsed library table.
+
+    Args:
+        target: node passed to ``dumpBazelLibs`` as the debug dump location.
+        source: unused by this function.
+        exports: bazel labels whose dependencies are resolved.
+        env: SCons environment used to locate ``dist`` and declare libraries.
+
+    Returns:
+        Always 0.
+    """
+    bazel = BazelEnv(env, env.Dir("dist").srcnode().abspath, debug=tcmalloc_scons_debug)
+    bazel.run()
+    bazel.pruneTestOnlyLibraries()
+    bazel.eliminateHeadersFromSources()
+    bazel.eliminateSourcelessDeps()
+    bzl = bazel.libraries()
+    dumpBazelLibs(bzl, target)
+    resolved = bazel.resolveDeps(exports)
+
+    # Debug aid only: show how the abseil labels whose resolved entry contains
+    # 'unknown' would be remapped. abslImports is not used after it is logged.
+    unknowns = [(x, resolved[x]) for x in resolved if 'unknown' in resolved[x]]
+    abslImports = {}
+    for unk in sorted(unknowns):
+        lab = Label(unk[0])
+        if lab.remote() == 'com_google_absl':
+            abslImports[str(lab)] = _remapAbseilDep(lab)
+    tcmalloc_scons_debug(f"{json.dumps({'abslImports': abslImports}, indent=4)}", file=sys.stderr)
+
+    tcmalloc_scons_print('Final render into env.Library calls', file=sys.stderr)
+    for libName in sorted(resolved.keys()):
+        # Skip remote labels, labels covered by the explicit abseil mapping,
+        # and labels with no entry in the parsed library table.
+        if Label(libName).remote() or libName in _bazelToSconsMap or libName not in bzl:
+            continue
+        libDef = bzl[libName]
+        # SCons target name derived from the bazel label.
+        lab = _mapDepToScons(libName)[0]
+        tcmalloc_scons_debug(f'libName: {libName:60s} => {lab}', file=sys.stderr)
+        tcmalloc_scons_debug(f'    {json.dumps(list(libDef), indent=4)}', file=sys.stderr)
+        kwargs = {'target': lab}
+        # Sources are listed relative to the library's package under dist/.
+        for src in libDef.get('srcs', []):
+            src = f'dist/{Label(libName).package()}/{src}'
+            tcmalloc_scons_debug(f'srcs for lib={libName} -> src={src}', file=sys.stderr)
+            kwargs.setdefault('source', []).append(src)
+        for dep in libDef.get('deps', set()):
+            scons_deps = _mapDepToScons(dep, base=libName)
+            tcmalloc_scons_debug(f'lib={libName}: dep={dep} => {scons_deps}', file=sys.stderr)
+            kwargs.setdefault('LIBDEPS', []).extend(scons_deps)
+        if 'LIBDEPS' in kwargs:
+            # De-duplicate and sort the collected dependencies.
+            kwargs['LIBDEPS'] = sorted(list(set(kwargs['LIBDEPS'])))
+
+        for cf in libDef.get('copts', []):
+            # Seed CCFLAGS with a copy of the environment's flags so that
+            # appending copts does not modify the environment's own list.
+            kwargs.setdefault('CCFLAGS', [e for e in env.get('CCFLAGS', [])]).append(cf)
+        tcmalloc_scons_print(f'env.Library(**{json.dumps(kwargs, indent=4)})', file=sys.stderr)
+        env.Library(**kwargs)
+
+    return 0
+
+
+# Take a fresh clone before injecting abseil-cpp.
+# NOTE(review): InjectThirdParty presumably exposes the abseil-cpp headers
+# and settings to the libraries declared below -- confirm.
+env = env.Clone()
+env.InjectThirdParty(libraries=['abseil-cpp'])
+
+# Declare the tcmalloc libraries while this SConscript is being read.
+slurpBlaze(
+    target=env.File('tcmalloc_deps.json').srcnode(), source=[],
+    exports=['//tcmalloc', '//tcmalloc:tcmalloc_extension'], env=env)
--- a/src/third_party/tcmalloc/dist/CONTRIBUTING.md
+++ b/src/third_party/tcmalloc/dist/CONTRIBUTING.md
@ -0,0 +1,74 @@
+# How to Contribute to TCMalloc
+
+We'd love to accept your patches and contributions to this project. There are
+just a few small guidelines you need to follow.
+
+NOTE: If you are new to GitHub, please start by reading [Pull Request
+howto](https://help.github.com/articles/about-pull-requests/)
+
+## Contributor License Agreement
+
+Contributions to this project must be accompanied by a Contributor License
+Agreement. You (or your employer) retain the copyright to your contribution;
+this simply gives us permission to use and redistribute your contributions as
+part of the project. Head over to <https://cla.developers.google.com/> to see
+your current agreements on file or to sign a new one.
+
+You generally only need to submit a CLA once, so if you've already submitted one
+(even if it was for a different project), you probably don't need to do it
+again.
+
+## Guidelines for Pull Requests
+
+*  All submissions, including submissions by project members, require review.
+   We use GitHub pull requests for this purpose. Consult
+   [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
+   information on using pull requests.
+
+*  If you are a Googler, it is preferable to first create an internal CL and
+   have it reviewed and submitted. The code propagation process will deliver
+   the change to GitHub.
+
+*  Create **small PRs** that are narrowly focused on **addressing a single concern**.
+   When PRs try to fix several things at a time, if only one fix is considered
+   acceptable, nothing gets merged and both the author's and the reviewer's time is wasted.
+   Create more PRs to address different concerns and everyone will be happy.
+
+*  Provide a good **PR description** as a record of **what** change is being
+   made and **why** it was made. Link to a GitHub issue if it exists.
+
+*  Don't fix code style and formatting unless you are already changing that line
+   to address an issue. Formatting of modified lines may be done using
+   `git clang-format`. PRs with irrelevant changes won't be merged. If you do
+   want to fix formatting or style, do that in a separate PR.
+
+*  Unless your PR is trivial, you should expect there will be reviewer comments
+   that you'll need to address before merging. We expect you to be reasonably
+   responsive to those comments, otherwise the PR will be closed after 2-3 weeks
+   of inactivity.
+
+*  Maintain **clean commit history** and use **meaningful commit messages**.
+   PRs with messy commit history are difficult to review and won't be merged.
+   Use `rebase -i upstream/master` to curate your commit history and/or to
+   bring in latest changes from master (but avoid rebasing in the middle of a
+   code review).
+
+*  Keep your PR up to date with upstream/master (if there are merge conflicts,
+   we can't really merge your change).
+
+*  **All tests need to be passing** before your change can be merged. We
+   recommend that you **run tests locally** before opening the PR.
+
+*  Exceptions to the rules can be made if there's a compelling reason for doing
+   so. That is - the rules are here to serve us, not the other way around, and
+   the rules need to be serving their intended purpose to be valuable.
+
+## TCMalloc Committers
+
+The current members of the TCMalloc engineering team are the only committers at
+present.
+
+## Community Guidelines
+
+This project follows
+[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
--- a/src/third_party/tcmalloc/dist/LICENSE
+++ b/src/third_party/tcmalloc/dist/LICENSE
@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        https://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       https://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/src/third_party/tcmalloc/dist/README.md
+++ b/src/third_party/tcmalloc/dist/README.md
@ -0,0 +1,45 @@
+# TCMalloc
+
+This repository contains the TCMalloc C++ code.
+
+TCMalloc is Google's customized implementation of C's `malloc()` and C++'s
+`operator new` used for memory allocation within our C and C++ code. TCMalloc is
+a fast, multi-threaded malloc implementation.
+
+## Building TCMalloc
+
+[Bazel](https://bazel.build) is the official build system for TCMalloc.
+
+The [TCMalloc Platforms Guide](docs/platforms.md) contains information on
+platform support for TCMalloc.
+
+## Documentation
+
+All users of TCMalloc should consult the following documentation resources:
+
+*   The [TCMalloc Quickstart](docs/quickstart.md) covers downloading,
+    installing, building, and testing TCMalloc, including incorporating within
+    your codebase.
+*   The [TCMalloc Overview](docs/overview.md) covers the basic architecture of
+    TCMalloc, and how that may affect configuration choices.
+*   The [TCMalloc Reference](docs/reference.md) covers the C and C++ TCMalloc
+    API endpoints.
+
+More advanced users of TCMalloc may find the following documentation useful:
+
+*   The [TCMalloc Tuning Guide](docs/tuning.md) covers the configuration
+    choices in more depth, and also illustrates other ways to customize
+    TCMalloc. This also covers important operating system-level properties for
+    improving TCMalloc performance.
+*   The [TCMalloc Design Doc](docs/design.md) covers how TCMalloc works
+    underneath the hood, and why certain design choices were made. Most
+    developers will not need this level of implementation detail.
+*   The [TCMalloc Compatibility Guide](docs/compatibility.md) documents our
+    expectations for how our APIs are used.
+
+## License
+
+The TCMalloc library is licensed under the terms of the Apache license. See
+LICENSE for more information.
+
+Disclaimer: This is not an officially supported Google product.
--- a/src/third_party/tcmalloc/dist/WORKSPACE
+++ b/src/third_party/tcmalloc/dist/WORKSPACE
@ -0,0 +1,111 @@
+# Copyright 2019 The TCMalloc Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+workspace(name = "com_google_tcmalloc")
+
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+# Load a recent version of skylib in case our dependencies have obsolete
+# versions. This is needed for bazel 6 compatibility.
+http_archive(
+    name = "bazel_skylib", # 2022-09-01
+    urls = ["https://github.com/bazelbuild/bazel-skylib/archive/refs/tags/1.3.0.zip"],
+    strip_prefix = "bazel-skylib-1.3.0",
+    sha256 = "4756ab3ec46d94d99e5ed685d2d24aece484015e45af303eb3a11cab3cdc2e71",
+)
+
+# Abseil
+http_archive(
+    name = "com_google_absl",
+    urls = ["https://github.com/abseil/abseil-cpp/archive/b3162b1da62711c663d0025e2eabeb83fd1f2728.zip"],
+    strip_prefix = "abseil-cpp-b3162b1da62711c663d0025e2eabeb83fd1f2728",
+    sha256 = "d5c91248c33269fcc7ab35897315a45cfa2c37abb4c6d4ed36cb5c82f366367a",
+)
+
+# GoogleTest/GoogleMock framework. Used by most unit-tests.
+http_archive(
+    name = "com_google_googletest",  # 2021-05-19T20:10:13Z
+    urls = ["https://github.com/google/googletest/archive/aa9b44a18678dfdf57089a5ac22c1edb69f35da5.zip"],
+    strip_prefix = "googletest-aa9b44a18678dfdf57089a5ac22c1edb69f35da5",
+    sha256 = "8cf4eaab3a13b27a95b7e74c58fb4c0788ad94d1f7ec65b20665c4caf1d245e8",
+)
+
+# Google benchmark.
+http_archive(
+    name = "com_github_google_benchmark",
+    urls = ["https://github.com/google/benchmark/archive/0baacde3618ca617da95375e0af13ce1baadea47.zip"],
+    strip_prefix = "benchmark-0baacde3618ca617da95375e0af13ce1baadea47",
+    sha256 = "62e2f2e6d8a744d67e4bbc212fcfd06647080de4253c97ad5c6749e09faf2cb0",
+)
+
+# C++ rules for Bazel.
+http_archive(
+    name = "rules_cc",  # 2021-05-14T14:51:14Z
+    urls = ["https://github.com/bazelbuild/rules_cc/archive/68cb652a71e7e7e2858c50593e5a9e3b94e5b9a9.zip"],
+    strip_prefix = "rules_cc-68cb652a71e7e7e2858c50593e5a9e3b94e5b9a9",
+    sha256 = "1e19e9a3bc3d4ee91d7fcad00653485ee6c798efbbf9588d40b34cbfbded143d",
+)
+
+# Python rules
+#
+# This is explicitly added to work around
+# https://github.com/bazelbuild/rules_fuzzing/issues/207
+# and https://github.com/google/tcmalloc/issues/127
+http_archive(
+    name = "rules_python",
+    urls = ["https://github.com/bazelbuild/rules_python/archive/refs/tags/0.11.0.tar.gz"],
+    sha256 = "c03246c11efd49266e8e41e12931090b613e12a59e6f55ba2efd29a7cb8b4258",
+    strip_prefix = "rules_python-0.11.0",
+)
+
+# Proto rules for Bazel and Protobuf
+http_archive(
+    name = "com_google_protobuf",
+    urls = ["https://github.com/protocolbuffers/protobuf/archive/13d559beb6967033a467a7517c35d8ad970f8afb.zip"],
+    strip_prefix = "protobuf-13d559beb6967033a467a7517c35d8ad970f8afb",
+    sha256 = "9ca59193fcfe52c54e4c2b4584770acd1a6528fc35efad363f8513c224490c50",
+)
+load("@com_google_protobuf//:protobuf_deps.bzl", "protobuf_deps")
+protobuf_deps()
+
+http_archive(
+    name = "rules_proto",
+    sha256 = "66bfdf8782796239d3875d37e7de19b1d94301e8972b3cbd2446b332429b4df1",
+    strip_prefix = "rules_proto-4.0.0",
+    urls = [
+        "https://mirror.bazel.build/github.com/bazelbuild/rules_proto/archive/refs/tags/4.0.0.tar.gz",
+        "https://github.com/bazelbuild/rules_proto/archive/refs/tags/4.0.0.tar.gz",
+    ],
+)
+
+load("@rules_proto//proto:repositories.bzl", "rules_proto_dependencies", "rules_proto_toolchains")
+rules_proto_dependencies()
+rules_proto_toolchains()
+
+# Fuzzing
+http_archive(
+    name = "rules_fuzzing",
+    sha256 = "a5734cb42b1b69395c57e0bbd32ade394d5c3d6afbfe782b24816a96da24660d",
+    strip_prefix = "rules_fuzzing-0.1.1",
+    urls = ["https://github.com/bazelbuild/rules_fuzzing/archive/v0.1.1.zip"],
+)
+
+# Load and initialize the rules_fuzzing dependencies.
+load("@rules_fuzzing//fuzzing:repositories.bzl", "rules_fuzzing_dependencies")
+
+rules_fuzzing_dependencies()
+
+load("@rules_fuzzing//fuzzing:init.bzl", "rules_fuzzing_init")
+
+rules_fuzzing_init()
--- a/src/third_party/tcmalloc/dist/docs/README.md
+++ b/src/third_party/tcmalloc/dist/docs/README.md
@ -0,0 +1,58 @@
+# TCMalloc
+
+This repository contains the TCMalloc C++ code.
+
+TCMalloc is Google's customized implementation of C's `malloc()` and C++'s
+`operator new` used for memory allocation within our C and C++ code. TCMalloc is
+a fast, multi-threaded malloc implementation.
+
+## Building TCMalloc
+
+[Bazel](https://bazel.build) is the official build system for TCMalloc.
+
+The [TCMalloc Platforms Guide](platforms.md) contains information on platform
+support for TCMalloc.
+
+## Documentation
+
+All users of TCMalloc should consult the following documentation resources:
+
+*   The [TCMalloc Quickstart](quickstart.md) covers downloading, installing,
+    building, and testing TCMalloc, including incorporating within your
+    codebase.
+*   The [TCMalloc Overview](overview.md) covers the basic architecture of
+    TCMalloc, and how that may affect configuration choices.
+*   The [TCMalloc Reference](reference.md) covers the C and C++ TCMalloc API
+    endpoints.
+
+More advanced users of TCMalloc may find the following documentation useful:
+
+*   The [TCMalloc Tuning Guide](tuning.md) covers the configuration choices in
+    more depth, and also illustrates other ways to customize TCMalloc.
+*   The [TCMalloc Design Doc](design.md) covers how TCMalloc works underneath
+    the hood, and why certain design choices were made. Most developers will not
+    need this level of implementation detail.
+*   The [TCMalloc Compatibility Guide](compatibility.md) documents our
+    expectations for how our APIs are used.
+*   The [history and differences](gperftools.md) between this repository and
+    gperftools.
+
+## Publications
+
+We've published several papers relating to TCMalloc optimizations:
+
+*   ["Beyond malloc efficiency to fleet efficiency: a hugepage-aware memory
+    allocator" (OSDI 2021)](https://research.google/pubs/pub50370/) relating to
+    the development and rollout of [Temeraire](temeraire.md), TCMalloc's
+    hugepage-aware page heap implementation.
+*   ["Adaptive Hugepage Subrelease for Non-moving Memory Allocators in
+    Warehouse-Scale Computers" (ISMM
+    2021)](https://research.google/pubs/pub50436/) relating to optimizations for
+    releasing partial hugepages to the operating system.
+
+## License
+
+The TCMalloc library is licensed under the terms of the Apache license. See
+LICENSE for more information.
+
+Disclaimer: This is not an officially supported Google product.
--- a/src/third_party/tcmalloc/dist/docs/compatibility.md
+++ b/src/third_party/tcmalloc/dist/docs/compatibility.md
@ -0,0 +1,44 @@
+# TCMalloc Compatibility Guidelines
+
+This document details what we expect from well-behaved users. Any usage of
+TCMalloc libraries outside of these technical boundaries may result in breakage
+when upgrading to newer versions of TCMalloc.
+
+Put another way: don't do things that make TCMalloc API maintenance tasks
+harder. If you misuse TCMalloc APIs, you're on your own.
+
+Additionally, because TCMalloc depends on Abseil, Abseil's
+[compatibility guidelines](https://abseil.io/about/compatibility) also apply.
+
+## What Users Must (And Must Not) Do
+
+*   **Do not depend on a compiled representation of TCMalloc.** We do not
+    promise any ABI compatibility &mdash; we intend for TCMalloc to be built
+    from source, hopefully from head. The internal layout of our types may
+    change at any point, without notice. Building TCMalloc in the presence of
+    different C++ standard library types may change Abseil types, especially for
+    pre-adopted types (`string_view`, `variant`, etc) &mdash; these will become
+    typedefs and their ABI will change accordingly.
+*   **Do not rely on dynamic loading/unloading.** TCMalloc does not support
+    dynamic loading and unloading.
+*   **You may not open namespace `tcmalloc`.** You are not allowed to define
+    additional names in namespace `tcmalloc`, nor are you allowed to specialize
+    anything we provide.
+*   **You may not depend on the signatures of TCMalloc APIs.** You cannot take
+    the address of APIs in TCMalloc (that would prevent us from adding overloads
+    without breaking you). You cannot use metaprogramming tricks to depend on
+    those signatures either. (This is also similar to the restrictions in the
+    C++ standard.)
+*   **You may not forward declare TCMalloc APIs.** This is actually a sub-point
+    of "do not depend on the signatures of TCMalloc APIs" as well as "do not
+    open namespace `tcmalloc`", but can be surprising. Any refactoring that
+    changes template parameters, default parameters, or namespaces will be a
+    breaking change in the face of forward-declarations.
+*   **Do not depend upon internal details.** This should go without saying: if
+    something is in a namespace or filename/path that includes the word
+    "internal", you are not allowed to depend upon it. It's an implementation
+    detail. You cannot friend it, you cannot include it, you cannot mention it
+    or refer to it in any way.
+*   **Include What You Use.** We may make changes to the internal `#include`
+    graph for TCMalloc headers - if you use an API, please include the relevant
+    header file directly.
--- a/src/third_party/tcmalloc/dist/docs/design.md
+++ b/src/third_party/tcmalloc/dist/docs/design.md
@ -0,0 +1,470 @@
+# TCMalloc : Thread-Caching Malloc
+
+## Motivation
+
+TCMalloc is a memory allocator designed as an alternative to the system default
+allocator that has the following characteristics:
+
+*   Fast, uncontended allocation and deallocation for most objects. Objects are
+    cached, depending on mode, either per-thread, or per-logical-CPU. Most
+    allocations do not need to take locks, so there is low contention and good
+    scaling for multi-threaded applications.
+*   Flexible use of memory, so freed memory can be reused for different object
+    sizes, or returned to the OS.
+*   Low per-object memory overhead by allocating "pages" of objects of the same
+    size, leading to a space-efficient representation of small objects.
+*   Low-overhead sampling, enabling detailed insight into an application's
+    memory usage.
+
+## Usage
+
+You use TCMalloc by specifying it as the `malloc` attribute on your binary rules in Bazel.
+
+## Overview
+
+The following block diagram shows the rough internal structure of TCMalloc:
+
+![Diagram of TCMalloc internal structure](images/tcmalloc_internals.png "TCMalloc internal structure")
+
+We can break TCMalloc into three components: the front-end, middle-end, and
+back-end. We will discuss these in more detail in the following sections. A
+rough breakdown of responsibilities is:
+
+*   The front-end is a cache that provides fast allocation and deallocation of
+    memory to the application.
+*   The middle-end is responsible for refilling the front-end cache.
+*   The back-end handles fetching memory from the OS.
+
+Note that the front-end can be run in either per-CPU or legacy per-thread mode,
+and the back-end can support either the hugepage aware pageheap or the legacy
+pageheap.
+
+## The TCMalloc Front-end
+
+The front-end handles a request for memory of a particular size. The front-end
+has a cache of memory that it can use for allocation or to hold free memory.
+This cache is only accessible by a single thread at a time, so it does not
+require any locks, hence most allocations and deallocations are fast.
+
+The front-end will satisfy any request if it has cached memory of the
+appropriate size. If the cache for that particular size is empty, the front-end
+will request a batch of memory from the middle-end to refill the cache. The
+middle-end comprises the CentralFreeList and the TransferCache.
+
+If the middle-end is exhausted, or if the requested size is greater than the
+maximum size that the front-end caches handle, a request will go to the back-end
+to either satisfy the large allocation, or to refill the caches in the
+middle-end. The back-end is also referred to as the PageHeap.
+
+There are two implementations of the TCMalloc front-end:
+
+*   Originally it supported per-thread caches of objects (hence the name Thread
+    Caching Malloc). However, this resulted in memory footprints that scaled
+    with the number of threads. Modern applications can have large thread
+    counts, which result in either large amounts of aggregate per-thread memory,
+    or many threads having minuscule per-thread caches.
+*   More recently TCMalloc has supported per-CPU mode. In this mode each logical
+    CPU in the system has its own cache from which to allocate memory. Note: On
+    x86 a logical CPU is equivalent to a hyperthread.
+
+The differences between per-thread and per-CPU modes are entirely confined to
+the implementations of malloc/new and free/delete.
+
+## Small and Large Object Allocation
+
+Allocations of "small" objects are mapped onto one of
+[60-80 allocatable size-classes](https://github.com/google/tcmalloc/blob/master/tcmalloc/size_classes.cc).
+For example, an allocation of 12 bytes will get rounded up to the 16 byte
+size-class. The size-classes are designed to minimize the amount of memory that
+is wasted when rounding to the next largest size-class.
+
+When compiled with `__STDCPP_DEFAULT_NEW_ALIGNMENT__ <= 8`, we use a set of
+sizes aligned to 8 bytes for raw storage allocated with `::operator new`. This
+smaller alignment minimizes wasted memory for many common allocation sizes (24,
+40, etc.) which are otherwise rounded up to a multiple of 16 bytes. On many
+compilers, this behavior is controlled by the `-fnew-alignment=...` flag.
+When
+`__STDCPP_DEFAULT_NEW_ALIGNMENT__` is not specified (or is larger than 8 bytes),
+we use standard 16 byte alignments for `::operator new`. However, for
+allocations under 16 bytes, we may return an object with a lower alignment, as
+no object with a larger alignment requirement can be allocated in the space.
+
+When an object of a given size is requested, that request is mapped to a request
+of a particular size-class using the
+[`SizeMap::GetSizeClass()` function](https://github.com/google/tcmalloc/blob/master/tcmalloc/common.h),
+and the returned memory is from that size-class. This means that the returned
+memory is at least as large as the requested size. Allocations from size-classes
+are handled by the front-end.
+
+Objects of size greater than the limit defined by
+[`kMaxSize`](https://github.com/google/tcmalloc/blob/master/tcmalloc/common.h)
+are allocated directly from the [backend](#tcmalloc-backend). As such they are
+not cached in either the front or middle ends. Allocation requests for large
+object sizes are rounded up to the [TCMalloc page size](#tcmalloc-page-sizes).
+
+## Deallocation
+
+When an object is deallocated, the compiler will provide the size of the object
+if it is known at compile time. If the size is not known, it will be looked up
+in the [pagemap](#pagemap-and-spans). If the object is small it will be put
+back into the front-end cache. If the object is larger than `kMaxSize` it is
+returned directly to the pageheap.
+
+### Per-CPU Mode
+
+In per-CPU mode a single large block of memory is allocated. The following
+diagram shows how this slab of memory is divided between CPUs and how each CPU
+uses a part of the slab to hold metadata as well as pointers to available
+objects.
+
+![Memory layout of per-cpu data structures](images/per-cpu-cache-internals.png "Memory layout of per-cpu data structures")
+
+Each logical CPU is assigned a section of this memory to hold metadata and
+pointers to available objects of particular size-classes. The metadata comprises
+one *header* block per size-class. The header has a pointer to the start of the
+per-size-class array of pointers to objects, as well as a pointer to the
+current, dynamic, maximum capacity and the current position within that array
+segment. The static maximum capacity of each per-size-class array of pointers is
+[determined at start time](https://github.com/google/tcmalloc/blob/master/tcmalloc/internal/percpu_tcmalloc.h)
+by the difference between the start of the array for this size-class and the
+start of the array for the next size-class.
+
+At runtime the maximum number of items of a particular size-class that can be
+stored in the per-cpu block will vary, but it can never exceed the statically
+determined maximum capacity assigned at start up.
+
+When an object of a particular size-class is requested it is removed from this
+array; when the object is freed it is added to the array. If the array is
+[exhausted](https://github.com/google/tcmalloc/blob/master/tcmalloc/cpu_cache.h)
+the array is refilled using a batch of objects from the middle-end. If the array
+would
+[overflow](https://github.com/google/tcmalloc/blob/master/tcmalloc/cpu_cache.h),
+a batch of objects are removed from the array and returned to the middle-end.
+
+The amount of memory that can be cached is limited per-cpu by the parameter
+`MallocExtension::SetMaxPerCpuCacheSize`. This means that the total amount of
+cached memory depends on the number of active per-cpu caches. Consequently
+machines with higher CPU counts can cache more memory.
+
+To avoid holding memory on CPUs where the application no longer runs,
+`MallocExtension::ReleaseCpuMemory` frees objects held in a specified CPU's
+caches.
+
+Within a CPU, the distribution of memory is managed across all the size-classes
+so as to keep the maximum amount of cached memory below the limit. Notice that
+it is managing the maximum amount that can be cached, and not the amount that is
+currently cached. On average the amount actually cached should be about half the
+limit.
+
+The maximum capacity is increased when a size-class
+[runs out of objects](https://github.com/google/tcmalloc/blob/master/tcmalloc/cpu_cache.h),
+and when fetching more objects, it also considers
+[increasing the capacity](https://github.com/google/tcmalloc/blob/master/tcmalloc/cpu_cache.h)
+of the size-class. It can increase the capacity of the size-class up until the
+total memory (for all size-classes) that the cache could hold reaches the
+per-cpu limit or until the capacity of that size-class reaches the hard-coded
+size limit for that size-class. If the size-class has not reached the hard-coded
+limit, then in order to increase the capacity it can
+[steal](https://github.com/google/tcmalloc/blob/master/tcmalloc/cpu_cache.h)
+capacity from another size-class on the same CPU.
+
+### Restartable Sequences and Per-CPU TCMalloc
+
+To work correctly, per-CPU mode relies on restartable sequences (man rseq(2)). A
+restartable sequence is just a block of (assembly language) instructions,
+largely like a typical function. A restriction of restartable sequences is that
+they cannot write partial state to memory, the final instruction must be a
+single write of the updated state. The idea of restartable sequences is that if
+a thread is removed from a CPU (e.g. context switched) while it is executing a
+restartable sequence, the sequence will be restarted from the top. Hence the
+sequence will either complete without interruption, or be repeatedly restarted
+until it completes without interruption. This is achieved without using any
+locking or atomic instructions, thereby avoiding any contention in the sequence
+itself.
+
+The practical implication of this for TCMalloc is that the code can use a
+restartable sequence like
+[TcmallocSlab_Internal_Push](https://github.com/google/tcmalloc/blob/master/tcmalloc/internal/percpu_tcmalloc.h)
+to fetch from or return an element to a per-CPU array without needing locking.
+The restartable sequence ensures that either the array is updated without the
+thread being interrupted, or the sequence is restarted if the thread was
+interrupted (for example, by a context switch that enables a different thread to
+run on that CPU).
+
+Additional information about the design choices and implementation is discussed
+in a specific [design doc](rseq.md) for it.
+
+### Legacy Per-Thread mode
+
+In per-thread mode, TCMalloc assigns each thread a thread-local cache. Small
+allocations are satisfied from this thread-local cache. Objects are moved
+between the middle-end and the thread-local cache as needed.
+
+A thread cache contains one singly linked list of free objects per size-class
+(so if there are N size-classes, there will be N corresponding linked lists), as
+shown in the following diagram.
+
+![Structure of per-thread cache](images/per-thread-structure.png "Structure of per-thread cache")
+
+On allocation an object is removed from the appropriate size-class of the
+per-thread caches. On deallocation, the object is prepended to the appropriate
+size-class. Underflow and overflow are handled by accessing the middle-end to
+either fetch more objects, or to return some objects.
+
+The maximum capacity of the per-thread caches is set by the parameter
+`MallocExtension::SetMaxTotalThreadCacheBytes`.
+However, it is possible for the
+total size to exceed that limit as each per-thread cache has a minimum size
+[kMinThreadCacheSize](https://github.com/google/tcmalloc/blob/master/tcmalloc/common.h)
+which is usually 512KiB. In the event that a thread wishes to increase its
+capacity, it needs to
+[scavenge](https://github.com/google/tcmalloc/blob/master/tcmalloc/thread_cache.cc)
+capacity from other threads.
+
+When threads exit, their cached memory is
+[returned](https://github.com/google/tcmalloc/blob/master/tcmalloc/thread_cache.cc)
+to the middle-end.
+
+### Runtime Sizing of Front-end Caches
+
+It is important for the size of the front-end cache free lists to adjust
+optimally. If the free list is too small, we'll need to go to the central free
+list too often. If the free list is too big, we'll waste memory as objects sit
+idle in there.
+
+Note that the caches are just as important for deallocation as they are for
+allocation. Without a cache, each deallocation would require moving the memory
+to the central free list.
+
+Per-CPU and per-thread modes have different implementations of a dynamic cache
+sizing algorithm.
+
+*   In per-thread mode the maximum number of objects that can be stored is
+    [increased](https://github.com/google/tcmalloc/blob/master/tcmalloc/thread_cache.cc)
+    up to a limit whenever more objects need to be fetched from the middle-end.
+    Similarly the capacity is
+    [decreased](https://github.com/google/tcmalloc/blob/master/tcmalloc/thread_cache.cc)
+    when we find that we have cached too many objects. The size of the cache is
+    also
+    [reduced](https://github.com/google/tcmalloc/blob/master/tcmalloc/thread_cache.cc)
+    should the total size of the cached objects exceed the per-thread limit.
+*   In per-CPU mode the
+    [capacity](https://github.com/google/tcmalloc/blob/master/tcmalloc/cpu_cache.h)
+    of the free list is increased depending on whether we are alternating
+    between underflows and overflows (indicating that a larger cache might stop
+    this alternation). The capacity is
+    [reduced](https://github.com/google/tcmalloc/blob/master/tcmalloc/cpu_cache.h)
+    when it has not been grown for a time and may therefore be over capacity.
+
+## TCMalloc Middle-end
+
+The middle-end is responsible for providing memory to the front-end and
+returning memory to the back-end. The middle-end comprises the Transfer cache
+and the Central free list. Although these are often referred to as singular,
+there is one transfer cache and one central free list per size-class. These
+caches are each protected by a mutex lock - so there is a serialization cost to
+accessing them.
+
+### Transfer Cache
+
+When the front-end requests memory, or returns memory, it will reach out to the
+transfer cache.
+
+The transfer cache holds an array of pointers to free memory, and it is quick to
+move objects into this array, or fetch objects from this array on behalf of the
+front-end.
+
+The transfer cache gets its name from situations where one CPU (or thread) is
+allocating memory that is deallocated by another CPU (or thread). The transfer
+cache allows memory to rapidly flow between two different CPUs (or threads).
+
+If the transfer cache is unable to satisfy the memory request, or has
+insufficient space to hold the returned objects, it will access the central free
+list.
+
+### Central Free List
+
+The central free list manages memory in "[spans](#pagemap-and-spans)". A span
+is a collection of one or more "[TCMalloc pages](#tcmalloc-page-sizes)" of
+memory. These terms will be explained in the next couple of sections.
+
+A request for one or more objects is satisfied by the central free list by
+[extracting](https://github.com/google/tcmalloc/blob/master/tcmalloc/central_freelist.cc)
+objects from spans until the request is satisfied. If there are insufficient
+available objects in the spans, more spans are requested from the back-end.
+
+When objects are
+[returned to the central free list](https://github.com/google/tcmalloc/blob/master/tcmalloc/central_freelist.cc),
+each object is mapped to the span to which it belongs (using the
+[pagemap](#pagemap-and-spans)) and then released into that span. If all the
+objects that reside in a particular span are returned to it, the entire span
+gets returned to the back-end.
+
+### Pagemap and Spans
+
+The heap managed by TCMalloc is divided into [pages](#tcmalloc-page-sizes) of a
+compile-time determined size. A run of contiguous pages is represented by a
+`Span` object. A span can be used to manage a large object that has been handed
+off to the application, or a run of pages that have been split up into a
+sequence of small objects. If the span manages small objects, the size-class of
+the objects is recorded in the span.
+
+The pagemap is used to look up the span to which an object belongs, or to
+identify the size-class for a given object.
+
+TCMalloc uses a 2-level or 3-level
+[radix tree](https://github.com/google/tcmalloc/blob/master/tcmalloc/pagemap.h)
+in order to map all possible memory locations onto spans.
+
+The following diagram shows how a 2-level pagemap is used to map the address of
+objects onto the spans that control the pages where the objects reside. In the
+diagram **span A** covers two pages, and **span B** covers 3 pages.
+
+![The pagemap maps objects to spans.](images/pagemap.png "The pagemap maps objects to spans.")
+
+Spans are used in the middle-end to determine where to place returned objects,
+and in the back-end to manage the handling of page ranges.
+
+### Storing Small Objects in Spans
+
+A span contains a pointer to the base of the TCMalloc pages that the span
+controls. For small objects those pages are divided into at most 2<sup>16</sup>
+objects. This value is selected so that within the span we can refer to objects
+by a two-byte index.
+
+This means that we can use an
+[unrolled linked list](https://en.wikipedia.org/wiki/Unrolled_linked_list) to
+hold the objects. For example, if we have eight byte objects we can store the
+indexes of three ready-to-use objects, and use the fourth slot to store the index
+of the next object in the chain. This data structure reduces cache misses over a
+fully linked list.
+
+The other advantage of using two byte indexes is that we're able to use spare
+capacity in the span itself to
+[cache four objects](https://github.com/google/tcmalloc/blob/master/tcmalloc/span.h).
+
+When we have
+[no available objects](https://github.com/google/tcmalloc/blob/master/tcmalloc/central_freelist.cc)
+for a size-class, we need to fetch a new span from the pageheap and
+[populate](https://github.com/google/tcmalloc/blob/master/tcmalloc/central_freelist.cc)
+it.
+
+## TCMalloc Page Sizes
+
+TCMalloc can be built with various
+["page sizes"](https://github.com/google/tcmalloc/blob/master/tcmalloc/common.h).
+Note that these do not correspond to the page size used in the TLB of the
+underlying hardware. These TCMalloc page sizes are currently 4KiB, 8KiB, 32KiB,
+and 256KiB.
+
+A TCMalloc page either holds multiple objects of a particular size, or is used
+as part of a group to hold an object of size greater than a single page. If an
+entire page becomes free it will be returned to the back-end (the pageheap) and
+can later be repurposed to hold objects of a different size (or returned to the
+OS).
+
+Small pages are better able to handle the memory requirements of the application
+with less overhead. For example, a half-used 4KiB page will have 2KiB left over
+versus a 32KiB page which would have 16KiB. Small pages are also more likely to
+become free. For example, a 4KiB page can hold eight 512-byte objects versus 64
+objects on a 32KiB page; and there is much less chance of 64 objects being free
+at the same time than there is of eight becoming free.
+
+Large pages result in less need to fetch and return memory from the back-end. A
+single 32KiB page can hold eight times the objects of a 4KiB page, and this can
+result in the costs of managing the larger pages being smaller. It also takes
+fewer large pages to map the entire virtual address space. TCMalloc has a
+[pagemap](https://github.com/google/tcmalloc/blob/master/tcmalloc/pagemap.h)
+which maps a virtual address onto the structures that manage the objects in that
+address range. Larger pages mean that the pagemap needs fewer entries and is
+therefore smaller.
+
+Consequently, it makes sense for applications with small memory footprints, or
+that are sensitive to memory footprint size to use smaller TCMalloc page sizes.
+Applications with large memory footprints are likely to benefit from larger
+TCMalloc page sizes.
+
+## TCMalloc Backend
+
+The back-end of TCMalloc has three jobs:
+
+*   It manages large chunks of unused memory.
+*   It is responsible for fetching memory from the OS when there is no suitably
+    sized memory available to fulfill an allocation request.
+*   It is responsible for returning unneeded memory back to the OS.
+
+There are two backends for TCMalloc:
+
+*   The Legacy pageheap which manages memory in TCMalloc page sized chunks.
+*   The hugepage aware pageheap which manages memory in chunks of hugepage
+    sizes. Managing memory in hugepage chunks enables the allocator to improve
+    application performance by reducing TLB misses.
+
+### Legacy Pageheap
+
+The legacy pageheap is an array of free lists for particular lengths of
+contiguous pages of available memory. For `k < 256`, the `k`th entry is a free
+list of runs that consist of `k` TCMalloc pages. The `256`th entry is a free
+list of runs that have length `>= 256` pages:
+
+![Layout of legacy pageheap.](images/legacy_pageheap.png "Layout of legacy pageheap.")
+
+An allocation for `k` pages is satisfied by looking in the `k`th free list. If
+that free list is empty, we look in the next free list, and so forth.
+Eventually, we look in the last free list if necessary. If that fails, we fetch
+memory from the system `mmap`.
+
+If an allocation for `k` pages is satisfied by a run of pages of length `> k`,
+the remainder of the run is re-inserted back into the appropriate free list in
+the pageheap.
+
+When a range of pages is returned to the pageheap, the adjacent pages are
+checked to determine if they now form a contiguous region. If that is the case,
+the pages are concatenated and placed into the appropriate free list.
+
+### Hugepage Aware Allocator
+
+The objective of the hugepage aware allocator is to hold memory in hugepage size
+chunks. On x86 a hugepage is 2MiB in size. To do this the back-end has three
+different caches:
+
+*   The filler cache holds hugepages which have had some memory allocated from
+    them. This can be considered to be similar to the legacy pageheap in that it
+    holds linked lists of memory of a particular number of TCMalloc pages.
+    Allocation requests for sizes of less than a hugepage in size are
+    (typically) returned from the filler cache. If the filler cache does not
+    have sufficient available memory it will request additional hugepages from
+    which to allocate.
+*   The region cache handles allocations larger than a hugepage. This
+    cache allows allocations to straddle multiple hugepages, and packs multiple
+    such allocations into a contiguous region. This is particularly useful for
+    allocations that slightly exceed the size of a hugepage (for example, 2.1
+    MiB).
+*   The hugepage cache handles large allocations of at least a hugepage. There
+    is overlap in usage with the region cache, but the region cache is only
+    enabled when it is determined (at runtime) that the allocation pattern would
+    benefit from it.
+
+Additional information about the design choices made in HPAA is discussed in a
+specific [design doc](temeraire.md) for it.
+
+## Caveats
+
+TCMalloc will reserve some memory for metadata at start up. The amount of
+metadata will grow as the heap grows. In particular the pagemap will grow with
+the virtual address range that TCMalloc uses, and the spans will grow as the
+number of active pages of memory grows. In per-CPU mode, TCMalloc will reserve a
+slab of memory per-CPU (typically 256 KiB), which, on systems with large numbers
+of logical CPUs, can lead to a multi-mebibyte footprint.
+
+It is worth noting that TCMalloc requests memory from the OS in large chunks
+(typically 1 GiB regions). The address space is reserved, but not backed by
+physical memory until it is used. Because of this approach the VSS of the
+application can be substantially larger than the RSS. A side effect of this is
+that trying to limit an application's memory use by restricting VSS will fail
+long before the application has used that much physical memory.
+
+Don't try to load TCMalloc into a running binary (e.g., using JNI in Java
+programs). The binary will have allocated some objects using the system malloc,
+and may try to pass them to TCMalloc for deallocation. TCMalloc will not be able
+to handle such objects.
--- a/src/third_party/tcmalloc/dist/docs/gperftools.md
+++ b/src/third_party/tcmalloc/dist/docs/gperftools.md
@ -0,0 +1,70 @@
+# TCMalloc and gperftools
+
+There are two projects on Github that are based on Google’s internal TCMalloc:
+This repository and [gperftools](https://github.com/gperftools/gperftools). Both
+are fast C/C++ memory allocators designed around a fast path that avoids
+synchronizing with other threads for most allocations.
+
+This repository is Google's current implementation of TCMalloc, used by ~all of
+our C++ programs in production. The code is limited to the memory allocator
+implementation itself.
+
+## History
+
+Google open-sourced its memory allocator as part of "Google Performance Tools"
+in 2005. At the time, it became easy to externalize code, but more difficult to
+keep it in-sync with our internal usage, as discussed by Titus Winters in
+[his 2017 CppCon Talk](https://www.youtube.com/watch?v=tISy7EJQPzI) and the
+"Software Engineering at Google" book. Subsequently, our internal implementation
+diverged from the code externally. This project eventually was adopted by the
+community as "gperftools."
+
+## Differences
+
+Since
+[“Profiling a Warehouse-Scale Computer” (Kanev 2015)](https://research.google/pubs/pub44271/),
+we have invested in improving application productivity via optimizations to the
+implementation (per-CPU caches, sized delete, fast/slow path improvements,
+[hugepage-aware backend](temeraire.md)).
+
+Because this repository reflects our day-to-day usage, we've focused on the
+platforms that we regularly use and that see extensive testing and optimization.
+
+This implementation is based on [Abseil](https://github.com/abseil/abseil-cpp).
+Like Abseil, we do not attempt to provide ABI stability. Providing a stable ABI
+could require compromising performance or adding otherwise unneeded complexity
+to maintain stability. These caveats are noted in our
+[Compatibility Guidelines](compatibility.md).
+
+In addition to a memory allocator, the gperftools project contains a number of
+other tools:
+
+*   An All-Allocation Memory Profiler: We have found this prohibitively costly
+    to use regularly, and instead focus on using low-overhead, always-on
+    sampling profilers. This sampling based profiler is exposed in our
+    `malloc_extension.h`.
+*   A SIGPROF-based CPU Profiler: The Linux `perf` tool is decreasing our
+    internal need for signal-based profiling. Additionally, with restartable
+    sequences, signals interrupt the fastpath, leading to skew between the
+    observed instruction pointer and where we actually spend CPU time.
+*   A Heap Checker/Debug Allocator: The LeakSanitizer, AddressSanitizer, and
+    MemorySanitizer suite provide higher accuracy and better performance.
+*   A perl-based `pprof` tool: This project is now developed in Go and is
+    [available on Github](https://github.com/google/pprof).
+
+## Differences From Google's Implementation of TCMalloc
+
+The configuration on Github mirrors our production defaults, with two notable
+exceptions:
+
+*   Many of our production servers start a background thread (via
+    `tcmalloc::MallocExtension::ProcessBackgroundActions`) to regularly call
+    `tcmalloc::MallocExtension::ReleaseMemoryToSystem`, while others never
+    release memory in favor of better CPU performance. These tradeoffs are
+    discussed in our [tuning page](tuning.md).
+*   We do not activate [GWP-ASan](gwp-asan.md) by default, but it can be
+    activated via `MallocExtension`.
+
+Over time, we have found that configurability carries a maintenance burden.
+While a knob can provide immediate flexibility, the increased complexity can
+cause subtle problems for more rarely used combinations.
--- a/src/third_party/tcmalloc/dist/docs/gwp-asan.md
+++ b/src/third_party/tcmalloc/dist/docs/gwp-asan.md
@ -0,0 +1,87 @@
+# GWP-ASan
+
+GWP-ASan is a low-overhead sampling-based utility for finding
+heap-use-after-frees and heap-buffer-overflows in production.
+GWP-ASan is a recursive acronym: "**G**WP-ASan **W**ill **P**rovide
+**A**llocation **San**ity".
+
+## Why not just use ASan?
+
+For many cases you **should** use [ASan](https://clang.llvm.org/docs/AddressSanitizer.html)
+(e.g., on your tests). However, ASan comes with average execution slowdown of 2x
+(compared to `-O2`), binary size increase of 2x, and significant memory
+overhead. For these reasons, ASan is generally impractical for use in production
+(other than in dedicated canaries). GWP-ASan is a minimal-overhead alternative
+designed for widespread use in production.
+
+## How to use GWP-ASan
+
+You can enable GWP-ASan by calling `tcmalloc::MallocExtension::ActivateGuardedSampling()`.
+To adjust GWP-ASan's sampling rate, see
+[below](#what-should-i-set-the-sampling-rate-to).
+
+When GWP-ASan detects a heap memory error, it prints stack traces for the point
+of the memory error, as well as the points where the memory was allocated and
+(if applicable) freed. These stack traces can then be
+symbolized offline to get file names and line
+numbers.
+
+GWP-ASan will crash after printing stack traces.
+
+## CPU and RAM Overhead
+
+For guarded sampling rates above 100M (the default), CPU overhead is negligible. For sampling rates as low as 8M, CPU overhead is under 0.5%.
+
+RAM overhead is up to 512 KB on x86\_64, or 4 MB on PowerPC.
+
+## What should I set the sampling rate to?
+
+`tcmalloc::MallocExtension::SetGuardedSamplingRate` sets the sampling rate for
+GWP-ASan. GWP-ASan will guard allocations approximately every
+`GuardedSamplingRate` bytes allocated. Thus, lower values will generally
+increase the chance of finding bugs but will also have higher CPU overhead.
+
+For applications that cannot tolerate any CPU overhead, we recommend
+using TCMalloc's default sampling rate.  If your application can tolerate some
+CPU overhead, we recommend a sampling rate of 8MB.
+
+## Limitations
+
+-   The current version of GWP-ASan will only find bugs in allocations of 8 KB
+    or less. This restriction was made to limit the CPU/RAM overhead required by
+    GWP-ASan.
+
+-   GWP-ASan has limited diagnostic information for buffer overflows within
+    alignment padding, since overflows of this type will not touch a guard
+    page. For write-overflows,
+    GWP-ASan will still be able to detect the overflow during deallocation by
+    checking whether magic bytes have been overwritten, but the stack trace of
+    the overflow itself will not be available.
+
+## FAQs
+
+### Does GWP-ASan report false positives?
+
+No. GWP-ASan crashes because your program accessed unmapped memory, which is
+always a true bug, or a sign of hardware failure (see below).
+
+### How do I know a GWP-ASan report isn't caused by hardware failure?
+
+The vast majority of GWP-ASan reports we see are true bugs, but occasionally
+faulty hardware will be the actual cause of the crash. In general, if you see
+the same GWP-ASan crash on multiple machines, it is very likely there's a true
+software bug.
+
+### Can GWP-ASan cause queries of death (QoD) in my production?
+
+Since GWP-ASan finds bugs with very low probability, QoD is generally not a
+concern. Even if there is a reliable way to trigger a bug, GWP-ASan will only
+detect it and crash on a tiny fraction of actual occurrences, allowing the other
+99.9% to continue without crashing.
+
+## Other versions of GWP-ASan
+
+Separate implementations of GWP-ASan exist for Chromium and Android. For
+GWP-ASan for Chromium see
+[here](https://chromium.googlesource.com/chromium/src/+/lkgr/docs/gwp_asan.md).
+For Android, see [here](https://developer.android.com/ndk/guides/gwp-asan).
--- a/src/third_party/tcmalloc/dist/docs/images/legacy_pageheap.png
+++ b/src/third_party/tcmalloc/dist/docs/images/legacy_pageheap.png
--- a/src/third_party/tcmalloc/dist/docs/images/lifetimes-counterfactual.png
+++ b/src/third_party/tcmalloc/dist/docs/images/lifetimes-counterfactual.png
--- a/src/third_party/tcmalloc/dist/docs/images/lifetimes-enabled.png
+++ b/src/third_party/tcmalloc/dist/docs/images/lifetimes-enabled.png
--- a/src/third_party/tcmalloc/dist/docs/images/pagemap.png
+++ b/src/third_party/tcmalloc/dist/docs/images/pagemap.png
--- a/src/third_party/tcmalloc/dist/docs/images/per-cpu-cache-internals.png
+++ b/src/third_party/tcmalloc/dist/docs/images/per-cpu-cache-internals.png
--- a/src/third_party/tcmalloc/dist/docs/images/per-thread-structure.png
+++ b/src/third_party/tcmalloc/dist/docs/images/per-thread-structure.png
--- a/src/third_party/tcmalloc/dist/docs/images/spanmap.gif
+++ b/src/third_party/tcmalloc/dist/docs/images/spanmap.gif
--- a/src/third_party/tcmalloc/dist/docs/images/tcmalloc_internals.png
+++ b/src/third_party/tcmalloc/dist/docs/images/tcmalloc_internals.png
--- a/src/third_party/tcmalloc/dist/docs/lifetime-based-allocator.md
+++ b/src/third_party/tcmalloc/dist/docs/lifetime-based-allocator.md
@ -0,0 +1,102 @@
+# Lifetime-based Memory Allocation
+
+TCMalloc contains an experimental feature that leverages object lifetime
+information for managing memory allocations. [Temeraire](temeraire.md)'s default
+allocation policy binpacks medium-sized allocations into the last hugepage
+associated with a large allocation. If the large allocation is short-lived, this
+can cause persistent fragmentation from long-lived medium-sized allocations that
+get binpacked into this region.
+
+The lifetime-based allocator attempts to side-step this problem by predicting
+the lifetime of large allocations and allocating short-lived large objects from
+a special [HugeRegion](regions-are-not-optional.md) instead. Lifetimes are
+treated as binary (short, long) and are predicted based on the stack trace at
+the time of allocation. While the application is running, we are recording
+statistics about all large allocations that we encounter and once we have enough
+samples, we use these statistics to make a prediction when we encounter that
+same stack trace again. If a large allocation is predicted to be short-lived, it
+is placed into a special short-lived HugeRegion, otherwise it is handled as
+usual. We call this region the "lifetime region".
+
+The allocator can run in two different modes:
+
+*   **Enabled**: The allocator will execute the allocation policy described
+    above.
+*   **Counterfactual**: The allocator will execute the lifetime-based policy on
+    the side but not affect the actual allocation behavior. Instead, it will
+    collect statistics about the correctness of its decisions as well as the
+    size of the lifetime region had the lifetime-based allocator been enabled.
+
+The lifetime-based allocator has one configuration parameter (T), which is the
+cutoff below which an object is considered short-lived (T = 0.5s by default).
+Note that setting T = infinity causes all large allocations to be placed into
+the separate region.
+
+## Lifetime Profiling
+
+Lifetime profiling is implemented through two components:
+
+*   `LifetimeDatabase`: This component stores a dictionary of lifetime
+    statistics, indexed by allocation stack trace. The size of the dictionary is
+    limited to avoid memory blow-up. Entries are managed through a combination
+    of LRU and reference counting. Each entry stores the number of long-lived
+    (lifetime > T) and short-lived objects with this allocation stack trace that
+    were encountered. The lifetime database does not track lifetimes itself but
+    gets called from other components to 1) record lifetimes, and 2) look up
+    lifetime predictions for a given stack trace. The latter works by looking up
+    the statistics associated with that stack trace and predicting the object as
+    long-lived if the number of long-lived allocations emanating from this stack
+    trace exceeds the number of short-lived allocations by a significant margin.
+
+*   `LifetimeTracker`: A lifetime tracker is a small amount of meta-data that
+    can be associated with an allocation and is used to track its lifetime. The
+    tracker (among other information) stores a pointer to the lifetime
+    statistics associated with this allocation, a timestamp, and a (possibly
+    unused) counterfactual pointer whose purpose will be explained later in this
+    document. Active trackers are strung together in a linked list sorted by
+    allocation timestamp. The timestamp associated with the tracker at the front
+    of this list is checked on every operation and if the lifetime of this
+    object exceeds T, it is classified as long-lived. In this case, all trackers
+    whose lifetime exceeds T are removed from the list (i.e., their trackers
+    become inactive) and their associated lifetime statistics are updated to
+    reflect that a long-lived allocation was encountered. If an object is
+    deallocated before its tracker becomes inactive, its tracker is removed from
+    the list and a short-lived allocation is recorded.
+
+The use of trackers differs between enabled and counterfactual mode. In enabled
+mode, a tracker is associated with every large allocation that is placed in the
+regular hugepage-aware allocator and results in a filler donation. This tracker
+is allocated with the remaining meta-data that is already associated with any
+such donation. If an object is allocated in the lifetime region, its tracker is
+allocated in a special meta-data region associated with the lifetime region.
+This ensures that lifetimes continue to be tracked even if the allocator has
+decided to treat a particular allocation site as short-lived.
+
+![Lifetime Tracking (enabled)](images/lifetimes-enabled.png "Lifetime tracking in enabled mode")
+
+In counterfactual mode, no actual objects are allocated in the short-lived
+region. Instead, the lifetime region is a HugeRegion that is not backed by
+actual memory but otherwise executes the same logic. This means that for any
+object that would have been placed in the lifetime region had it been enabled,
+the real backing object is allocated in the existing hugepage-aware allocator.
+In this case, the tracker will store a `counterfactual_ptr` that points towards
+the address that the object would have had if it were actually allocated in the
+lifetime region. Otherwise, the object is tracked just like any other object in
+the hugepage-aware allocator.
+
+![Lifetime Tracking (counterfactual)](images/lifetimes-counterfactual.png "Lifetime tracking in counterfactual mode")
+
+## Lifetime-based Allocation
+
+The lifetime-based allocator uses the existing HugeRegion implementation for all
+objects that are predicted short-lived. Whenever a large allocation is
+encountered, the current stack trace is collected, and the lifetime is looked up
+in the lifetime database. In regular enabled mode, the object is placed in the
+lifetime region or the regular allocator, depending on this prediction, and a
+tracker is installed. In counterfactual mode, the object is always allocated in
+the regular allocator and if the prediction called for the allocation to be
+placed in the lifetime region, an *additional* allocation call is placed to the
+lifetime region (which, in counterfactual mode, is not backed by actual memory).
+In this case, the tracker's `counterfactual_ptr` is set to the address that the
+object would have been allocated at, so that on deallocation, a corresponding
+call can be made to the lifetime region to deallocate the object.
--- a/src/third_party/tcmalloc/dist/docs/overview.md
+++ b/src/third_party/tcmalloc/dist/docs/overview.md
@ -0,0 +1,99 @@
+# TCMalloc Overview
+
+TCMalloc is Google's customized implementation of C's `malloc()` and C++'s
+`operator new` used for memory allocation within our C and C++ code. This custom
+memory allocation framework is an alternative to the one provided by the C
+standard library (on Linux usually through `glibc`) and C++ standard library.
+TCMalloc is designed to be more efficient at scale than other implementations.
+
+Specifically, TCMalloc provides the following benefits:
+
+*   Performance scales with highly parallel applications.
+*   Optimizations brought about with recent C++14 and C++17 standard
+    enhancements, and by diverging slightly from the standard where performance
+    benefits warrant. (These are noted within the
+    [TCMalloc Reference](reference.md).)
+*   Extensions to allow performance improvements under certain architectures,
+    and additional behavior such as metric gathering.
+
+## TCMalloc Cache Operation Mode
+
+TCMalloc may operate in one of two fashions:
+
+*   (default) per-CPU caching, where TCMalloc maintains memory caches local to
+    individual logical cores. Per-CPU caching is enabled when running TCMalloc
+    on any Linux kernel that utilizes restartable sequences (RSEQ). Support for
+    RSEQ was merged in Linux 4.18.
+*   per-thread caching, where TCMalloc maintains memory caches local to each
+    application thread. If RSEQ is unavailable, TCMalloc reverts to using this
+    legacy behavior.
+
+NOTE: the "TC" in TCMalloc refers to Thread Caching, which was originally a
+distinguishing feature of TCMalloc; the name remains as a legacy.
+
+In both cases, these cache implementations allow TCMalloc to avoid requiring
+locks for most memory allocations and deallocations.
+
+## TCMalloc Features
+
+TCMalloc provides APIs for dynamic memory allocation: `malloc()` using the C
+API, and `::operator new` using the C++ API. TCMalloc, like most allocation
+frameworks, manages this memory better than raw memory requests (such as through
+`mmap()`) by providing several optimizations:
+
+*   Performing allocations from the operating system by managing
+    specifically-sized chunks of memory (called "pages"). Having all of these
+    chunks of memory the same size allows TCMalloc to simplify bookkeeping.
+*   Devoting separate pages (or runs of pages called "Spans" in TCMalloc) to
+    specific object sizes. For example, all 16-byte objects are placed within a
+    "Span" specifically allocated for objects of that size. Operations to get or
+    release memory in such cases are much simpler.
+*   Holding memory in *caches* to speed up access of commonly-used objects.
+    Holding such caches even after deallocation also helps avoid costly system
+    calls if such memory is later re-allocated.
+
+The cache size can also affect performance. The larger the cache, the less any
+given cache will overflow or get exhausted, and therefore require a lock to get
+more memory. TCMalloc extensions allow you to modify this cache size, though the
+default behavior should be preferred in most cases. For more information,
+consult the [TCMalloc Tuning Guide](tuning.md).
+
+Additionally, TCMalloc exposes telemetry about the state of the application's
+heap via `MallocExtension`. This can be used for gathering profiles of the live
+heap, as well as a snapshot taken near the heap's highwater mark size (a peak
+heap profile).
+
+## The TCMalloc API
+
+TCMalloc implements the C and C++ dynamic memory API endpoints from the C11,
+C++11, C++14, and C++17 standards.
+
+From C++, this includes:
+
+*   The basic `::operator new`, `::operator delete`, and array variant
+    functions.
+*   C++14's sized `::operator delete`
+*   C++17's overaligned `::operator new` and `::operator delete` functions.
+
+Unlike in the standard implementations, TCMalloc does not throw an exception
+when allocations fail, but instead crashes directly. Such behavior can be used
+as a performance optimization for move constructors not currently marked
+`noexcept`; such move operations can be allowed to fail directly due to
+allocation failures. In [Abseil](https://abseil.io/docs/cpp/guides/base), these
+are enabled with `-DABSL_ALLOCATOR_NOTHROW`.
+
+From C, this includes `malloc`, `calloc`, `realloc`, and `free`.
+
+The TCMalloc API obeys the behavior of C90 DR075 and
+[DR445](http://www.open-std.org/jtc1/sc22/wg14/www/docs/summary.htm#dr_445)
+which states:
+
+> The alignment requirement still applies even if the size is too small for any
+> object requiring the given alignment.
+
+In other words, `malloc(1)` returns an `alignof(std::max_align_t)`-aligned
+pointer. Based on the progress of
+[N2293](http://www.open-std.org/jtc1/sc22/wg14/www/docs/n2293.htm), we may relax
+this alignment in the future.
+
+For more complete information, consult the [TCMalloc Reference](reference.md).
--- a/src/third_party/tcmalloc/dist/docs/platforms.md
+++ b/src/third_party/tcmalloc/dist/docs/platforms.md
@ -0,0 +1,75 @@
+# TCMalloc Platforms
+
+The TCMalloc code is supported on the following platforms. By "platforms", we
+mean the union of operating system, architecture (e.g. little-endian vs.
+big-endian), compiler, and standard library.
+
+## Language Requirements
+
+TCMalloc requires a code base that supports C++17 and our code is
+C++17-compliant. C code is required to be compliant to C11.
+
+We guarantee that our code will compile under the following compilation flags:
+
+Linux:
+
+*   gcc 9.2+, clang 9.0+: `-std=c++17`
+
+(TL;DR; All code at this time must be built under C++17. We will update this
+list if circumstances change.)
+
+## Supported Platforms
+
+The document below lists each platform, broken down by Operating System,
+Architecture, Specific Compiler, and Standard Library implementation.
+
+### Linux
+
+**Supported**
+
+<table width="80%">
+  <col width="360">
+  <col width="120">
+  <tbody>
+    <tr>
+      <th>Operating System</th>
+      <th>Endianness/Word Size</th>
+      <th>Processor Architectures</th>
+      <th>Compilers*</th>
+      <th>Standard Libraries</th>
+    </tr>
+    <tr>
+      <td>Linux</td>
+      <td>little-endian, 64-bit</td>
+      <td>x86, AArch64</td>
+      <td>gcc 9.2+<br/>clang 9.0+</td>
+      <td>libstdc++<br/>libc++</td>
+    </tr>
+  </tbody>
+</table>
+
+\* We test on gcc 9.2, though gcc versions (which support C++17) prior to that
+release should also work.
+
+**Best Effort**
+
+<table width="80%">
+  <col width="360">
+  <col width="120">
+  <tbody>
+    <tr>
+      <th>Operating System</th>
+      <th>Endianness/Word Size</th>
+      <th>Processor Architectures</th>
+      <th>Compilers*</th>
+      <th>Standard Libraries</th>
+    </tr>
+    <tr>
+      <td>Linux</td>
+      <td>little-endian, 64-bit</td>
+      <td>PPC</td>
+      <td>gcc 9.2+<br/>clang 9.0+</td>
+      <td>libstdc++<br/>libc++</td>
+    </tr>
+  </tbody>
+</table>
--- a/src/third_party/tcmalloc/dist/docs/quickstart.md
+++ b/src/third_party/tcmalloc/dist/docs/quickstart.md
@ -0,0 +1,267 @@
+# TCMalloc Quickstart
+
+Note: this Quickstart uses Bazel as the official build system for TCMalloc,
+which is supported on Linux, and compatible with most major compilers. The
+TCMalloc source code assumes you are using Bazel and contains `BUILD.bazel`
+files for that purpose.
+
+This document is designed to allow you to get TCMalloc set up as your custom
+allocator within a C++ development environment. We recommend that each person
+starting development using TCMalloc at least run through this quick tutorial.
+
+## Prerequisites
+
+Running the code within this tutorial requires:
+
+*   A compatible platform (E.g. Linux). Consult the
+    [Platforms Guide](platforms.md) for more information.
+*   A compatible C++ compiler *supporting at least C++17*. Most major compilers
+    are supported.
+*   [Git](https://git-scm.com/) for interacting with the TCMalloc source code
+    repository, which is contained on [GitHub](http://github.com). To install
+    Git, consult the [Set Up Git](https://help.github.com/articles/set-up-git/)
+    guide on GitHub.
+
+Although you are free to use your own build system, most of the documentation
+within this guide will assume you are using [Bazel](https://bazel.build/),
+version 4.0 or newer.
+
+To download and install Bazel (and any of its dependencies), consult the
+[Bazel Installation Guide](https://docs.bazel.build/versions/master/install.html).
+
+## Getting the TCMalloc Code
+
+You can obtain the TCMalloc code from its repository on GitHub:
+
+```
+# Change to the directory where you want to create the code repository
+$ cd ~
+$ mkdir Source; cd Source
+$ git clone https://github.com/google/tcmalloc.git
+Cloning into 'tcmalloc'...
+remote: Total 1935 (delta 1083), reused 1935 (delta 1083)
+Receiving objects: 100% (1935/1935), 1.06 MiB | 0 bytes/s, done.
+Resolving deltas: 100% (1083/1083), done.
+$
+```
+
+Git will create the repository within a directory named `tcmalloc`. Navigate
+into this directory and run all tests:
+
+```
+$ cd tcmalloc
+$ bazel test //tcmalloc/...
+INFO: Analyzed 112 targets (12 packages loaded, 606 targets configured).
+...
+INFO: Build completed successfully, 827 total actions
+$
+```
+
+Congratulations! You've installed TCMalloc.
+
+## Running the TCMalloc Hello World
+
+Once you've verified you have TCMalloc installed correctly, you can compile and
+run the
+[tcmalloc-hello](https://github.com/google/tcmalloc/blob/master/tcmalloc/testing/hello_main.cc)
+sample binary to see how TCMalloc is linked into a sample binary. This tiny
+project features proper configuration and a simple `hello_main` to demonstrate
+how TCMalloc works.
+
+First, build the `tcmalloc/testing:hello_main` target:
+
+```
+tcmalloc$ bazel build tcmalloc/testing:hello_main
+Extracting Bazel installation...
+Starting local Bazel server and connecting to it...
+INFO: Analyzed target //tcmalloc/testing:hello_main (31 packages loaded ...
+...
+INFO: Build completed successfully, 102 total actions
+PASSED in 0.1s
+tcmalloc$
+```
+
+Now, run the compiled program:
+
+```
+tcmalloc$ bazel run tcmalloc/testing:hello_main
+...
+INFO: Found 1 target...
+...
+INFO: Build completed successfully, 1 total action
+Current heap size = 73728 bytes
+hello world!
+new'd 1073741824 bytes at 0x14ea40000000
+Current heap size = 1073816576 bytes
+malloc'd 1073741824 bytes at 0x14eac0000000
+Current heap size = 2147558400 bytes
+$
+```
+
+You can inspect this code within
+[`tcmalloc/testing/hello_main.cc`](https://github.com/google/tcmalloc/blob/master/tcmalloc/testing/hello_main.cc)
+
+Happy Coding!
+
+## Creating and Running TCMalloc
+
+Now that you've obtained the TCMalloc code and verified that you can build,
+test, and run it, you're ready to use it within your own project.
+
+### Linking Your Code to the TCMalloc Repository
+
+First, create (or select) a source code directory for your work. This directory
+should generally not be the `tcmalloc` directory itself; instead, you will link
+into that repository from your own source directory.
+
+```
+# Change to your main development directory and create a new development
+# directory. (If you already have a development directory you'd wish to use,
+# you can use that.)
+$ cd ~/Source
+$ mkdir TestProject; cd TestProject
+```
+
+Bazel allows you to link other Bazel projects using `WORKSPACE` files in the
+root of your development directories. To add a link to your local TCMalloc
+repository within your new project, add the following into a `WORKSPACE` file:
+
+```
+local_repository(
+  # Name of the TCMalloc repository. This name is defined within your
+  # WORKSPACE file, in its `workspace()` metadata
+  name = "com_google_tcmalloc",
+
+  # NOTE: Bazel paths must be absolute paths. E.g., you can't use ~/Source
+  path = "/PATH_TO_SOURCE/Source/tcmalloc",
+)
+```
+
+The "name" in the `WORKSPACE` file identifies the name you will use in Bazel
+`BUILD` files to refer to the linked repository (in this case
+"com_google_tcmalloc").
+
+Note that your path to the TCMalloc source code must be an absolute path.
+
+### Adding Abseil
+
+TCMalloc requires [Abseil](https://abseil.io) which you will also need to
+provide as a `local_repository`, or link to a specific commit (we always
+recommend the latest commit) using an `http_archive` declaration in the
+`WORKSPACE` file:
+
+<pre>
+# Abseil HTTP Archive to specific commit
+#
+# Consult https://github.com/abseil/abseil-cpp/commits/master for the latest
+# commit. But DO NOT use master.zip for that purpose. (Sha256 values are not
+# stable across master versions.) Click on that specific commit.
+#
+# Click "Browse Files" on the commit and click on "Clone or Download Code."
+#
+# Right click on "Download ZIP" to copy the HTTP Archive URL, which you will
+# use within the http_archive "urls" field.
+#
+# Note that you will need to generate a sha256 value for Bazel's http_archive
+# to ensure this code is secure. On Linux you can do so with a downloaded .zip
+# file using the sha256sum command line:
+#
+# $ sha256sum github_zip_file.zip
+http_archive(
+    name = "com_google_absl",
+    urls = ["https://github.com/abseil/abseil-cpp/archive/<i>commit_value</i>.zip"],
+    strip_prefix = "abseil-cpp-<i>commit_value</i>",
+    sha256 = "<i>sha256_of_commit_value</i>",
+)
+</pre>
+
+### Creating Your Test Code
+
+Within your `TestProject` create an `examples` directory:
+
+```
+$ cd TestProject; mkdir examples; cd examples
+```
+
+Now, create a `hello_world.cc` C++ file within your `examples` directory:
+
+```
+#include <iostream>
+#include <cstddef>
+
+int main() {
+    std::cout << "Standard Alignment: " << alignof(std::max_align_t) << '\n';
+
+    double *ptr = (double*) malloc(sizeof(double));
+    std::cout << "Double Alignment: " << alignof(*ptr) << '\n';
+
+    char *ptr2 = (char*) malloc(1);
+    std::cout << "Char Alignment: " << alignof(*ptr2) << '\n';
+
+    void *ptr3;
+    std::cout << "Sizeof void*: " << sizeof(ptr3) << '\n';
+    return 0;
+}
+```
+
+### Creating Your BUILD File
+
+Now, create a `BUILD` file within your `examples` directory like the following:
+
+```
+cc_binary(
+    name = "hello_world",
+    srcs = ["hello_world.cc"],
+    malloc = "@com_google_tcmalloc//tcmalloc",
+)
+```
+
+NOTE: For more information on how to create Bazel BUILD files, consult the
+[Bazel Tutorial](https://docs.bazel.build/versions/master/tutorial/cpp.html).
+
+We declare TCMalloc as our own custom allocation framework using the `malloc`
+keyword and set this to the library name (`//tcmalloc`) within our `WORKSPACE`
+file (`@com_google_tcmalloc`).
+
+Build our target ("hello_world") and run it:
+
+```
+# It's often good practice to build files from the workspace root
+$ cd ~/Source/TestProject
+Source/TestProject$ bazel build //examples:hello_world --cxxopt='-std=c++17'
+INFO: Analysed target //examples:hello_world (12 packages loaded).
+INFO: Found 1 target...
+Target //examples:hello_world up-to-date:
+  bazel-bin/examples/hello_world
+INFO: Elapsed time: 0.180s, Critical Path: 0.00s
+INFO: Build completed successfully, 1 total action
+
+Source/TestProject$ bazel run //examples:hello_world
+INFO: Running command line: bazel-bin/examples/hello_world
+Standard Alignment: 16
+Double Alignment: 8
+Char Alignment: 1
+Sizeof void*: 8
+Source/TestProject$
+```
+
+Note that we passed `--cxxopt='-std=c++17'` to build using C++17. Instead of
+passing this flag you can add this line to your root `.bazelrc` file:
+
+```
+build --cxxopt='-std=c++17'
+```
+
+Congratulations! You've created your first binary using TCMalloc.
+
+## What's Next
+
+*   Read our [overview](overview.md), if you haven't already. The overview
+    covers memory allocation concepts and best practices for using TCMalloc.
+*   Read through the TCMalloc [reference](reference.md) for information on the
+    behavior of `malloc()`, `::operator new`, and other allocation/deallocation
+    routines in TCMalloc.
+*   Consult the TCMalloc C++ `malloc_extension.h` header file, which contains
+    information on TCMalloc's supported extensions.
+*   Read our [contribution guidelines](../CONTRIBUTING.md), if you intend to
+    submit code to our repository.
--- a/src/third_party/tcmalloc/dist/docs/reference.md
+++ b/src/third_party/tcmalloc/dist/docs/reference.md
@ -0,0 +1,244 @@
+# TCMalloc Basic Reference
+
+TCMalloc provides implementations for C and C++ library memory management
+routines (`malloc()`, etc.) provided within the C and C++ standard libraries.
+
+Currently, TCMalloc requires code that conforms to the C11 C standard library
+and the C++11, C++14, or C++17 C++ standard library.
+
+NOTE: although the C API in this document is specific to the C language, the
+entire TCMalloc API itself is designed to be callable directly within C++ code
+(and we expect most usage to be from C++). The documentation in this section
+assumes C constructs (e.g. `size_t`) though invocations using equivalent C++
+constructs of aliased types (e.g. `std::size_t`) are intrinsically supported.
+
+## C++ API
+
+We implement the variants of `operator new` and `operator delete` from the
+C++11, C++14, C++17 standards exposed within the `<new>` header file. This
+includes:
+
+*   The basic `::operator new()`, `::operator delete()`, and array variant
+    functions.
+*   C++14's sized `::operator delete()`
+*   C++17's overaligned `::operator new()` and `::operator delete()` functions.
+    As required by the C++ standard, memory allocated using an aligned `operator
+    new` function must be deallocated with an aligned `operator delete`.
+
+### `::operator new` / `::operator new[]`
+
+```
+void* operator new(std::size_t count);
+void* operator new(std::size_t count, const std::nothrow_t& tag) noexcept;
+void* operator new(std::size_t count, std::align_val_t al);  // C++17
+void* operator new(std::size_t count,
+                   std::align_val_t al, const std::nothrow_t&) noexcept;  // C++17
+
+void* operator new[](std::size_t count);
+void* operator new[](std::size_t count, const std::nothrow_t& tag) noexcept;
+void* operator new[](std::size_t count, std::align_val_t al);  // C++17
+void* operator new[](std::size_t count,
+                     std::align_val_t al, const std::nothrow_t&) noexcept;  // C++17
+```
+
+`operator new`/`operator new[]` allocates `count` bytes. They may be invoked
+directly but are more commonly invoked as part of a *new*-expression.
+
+When `__STDCPP_DEFAULT_NEW_ALIGNMENT__` is not specified (or is larger than 8
+bytes), we use standard 16 byte alignments for `::operator new` without a
+`std::align_val_t` argument. However, for allocations under 16 bytes, we may
+return an object with a lower alignment, as no object with a larger alignment
+requirement can be allocated in the space. When compiled with
+`__STDCPP_DEFAULT_NEW_ALIGNMENT__ <= 8`, we use a set of sizes aligned to 8
+bytes for raw storage allocated with `::operator new`.
+
+NOTE: On many platforms, the value of `__STDCPP_DEFAULT_NEW_ALIGNMENT__` can be
+configured by the `-fnew-alignment=...` flag.
+
+The `std::align_val_t` variants provide storage suitably aligned to the
+requested alignment.
+
+If the allocation is unsuccessful, a failure terminates the program.
+
+NOTE: unlike in the C++ standard, we do not throw an exception in case of
+allocation failure, or invoke `std::get_new_handler()` repeatedly in an attempt
+to successfully allocate, but instead crash directly. Such behavior can be used
+as a performance optimization for move constructors not currently marked
+`noexcept`; such move operations can be allowed to fail directly due to
+allocation failures. Within Abseil code, these direct allocation failures are
+enabled with the Abseil build-time configuration macro
+[`ABSL_ALLOCATOR_NOTHROW`](https://abseil.io/docs/cpp/guides/base#abseil-exception-policy).
+
+If the `std::nothrow_t` variant is utilized, upon failure, `::operator new`
+will return `nullptr` instead.
+
+### `::operator delete` / `::operator delete[]`
+
+```
+void operator delete(void* ptr) noexcept;
+void operator delete(void* ptr, std::size_t sz) noexcept;       // C++14
+void operator delete(void* ptr, std::align_val_t al) noexcept;  // C++17
+void operator delete(void* ptr, std::size_t sz,
+                     std::align_val_t al) noexcept;             // C++17
+
+void operator delete[](void* ptr) noexcept;
+void operator delete[](void* ptr, std::size_t sz) noexcept;       // C++14
+void operator delete[](void* ptr, std::align_val_t al) noexcept;  // C++17
+void operator delete[](void* ptr, std::size_t sz,
+                       std::align_val_t al) noexcept;             // C++17
+```
+
+`::operator delete`/`::operator delete[]` deallocate memory previously allocated
+by a corresponding `::operator new`/`::operator new[]` call respectively. They
+are commonly invoked as part of a *delete*-expression.
+
+Sized delete is used as a critical performance optimization, eliminating the
+need to perform a costly pointer-to-size lookup.
+
+### Extensions
+
+We also expose a prototype of
+[P0901](http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2019/p0901r5.html) in
+https://github.com/google/tcmalloc/blob/master/tcmalloc/malloc_extension.h with
+`tcmalloc_size_returning_operator_new()`. This returns both memory and the size
+of the allocation in bytes. It can be freed with `::operator delete`.
+
+## C API
+
+The C standard library specifies the API for dynamic memory management within
+the `<stdlib.h>` header file. Implementations require C11 or greater.
+
+TCMalloc provides implementation for the following C API functions:
+
+*   `malloc()`
+*   `calloc()`
+*   `realloc()`
+*   `free()`
+*   `aligned_alloc()`
+
+For `malloc`, `calloc`, and `realloc`, we obey the behavior of C90 DR075 and
+[DR445](http://www.open-std.org/jtc1/sc22/wg14/www/docs/summary.htm#dr_445)
+which states:
+
+> The alignment requirement still applies even if the size is too small for any
+> object requiring the given alignment.
+
+In other words, `malloc(1)` returns an `alignof(std::max_align_t)`-aligned
+pointer. Based on the progress of
+[N2293](http://www.open-std.org/jtc1/sc22/wg14/www/docs/n2293.htm), we may relax
+this alignment in the future.
+
+Additionally, TCMalloc provides an implementation for the following POSIX
+standard library function, available within glibc:
+
+*   `posix_memalign()`
+
+TCMalloc also provides implementations for the following obsolete functions
+typically provided within libc implementations:
+
+*   `cfree()`
+*   `memalign()`
+*   `valloc()`
+*   `pvalloc()`
+
+Documentation is not provided for these obsolete functions. The implementations
+are provided only for compatibility purposes.
+
+### `malloc()`
+
+```
+void* malloc(size_t size);
+```
+
+`malloc` allocates `size` bytes of memory and returns a `void *` pointer to the
+start of that memory.
+
+`malloc(0)` returns a non-NULL zero-sized pointer. (Attempting to access memory
+at this location is undefined.) If `malloc()` fails for some reason, it returns
+NULL.
+
+### `calloc()`
+
+```
+void* calloc(size_t num, size_t size);
+```
+
+`calloc()` allocates memory for an array of objects, zero-initializes all bytes
+in allocated storage, and if allocation succeeds, returns a pointer to the first
+byte in the allocated memory block.
+
+`calloc(num, 0)` or `calloc(0, size)` returns a non-NULL zero-sized pointer.
+(Attempting to access memory at this location is undefined.) If `calloc()` fails
+for some reason, it returns NULL.
+
+### `realloc()`
+
+```
+void* realloc(void *ptr, size_t new_size);
+```
+
+`realloc()` re-allocates memory for an existing region of memory by either
+expanding or contracting the memory based on the passed `new_size` in bytes,
+returning a `void*` pointer to the start of that memory (which may not change);
+it does not perform any initialization of new areas of memory.
+
+`realloc(OBJ*, 0)` returns a NULL pointer. If `realloc()` fails for some reason,
+it also returns NULL.
+
+### `aligned_alloc()`
+
+```
+void* aligned_alloc(size_t alignment, size_t size);
+```
+
+`aligned_alloc()` allocates `size` bytes of memory with alignment of size
+`alignment` and returns a `void *` pointer to the start of that memory; it does
+not perform any initialization.
+
+The `size` parameter must be an integral multiple of `alignment` and `alignment`
+must be a power of two. If either of these cases is not satisfied,
+`aligned_alloc()` will fail and return a NULL pointer.
+
+`aligned_alloc` with `size=0` returns a non-NULL zero-sized pointer. (Attempting
+to access memory at this location is undefined.)
+
+### `posix_memalign()`
+
+```
+int posix_memalign(void **memptr, size_t alignment, size_t size);
+```
+
+`posix_memalign()`, like `aligned_alloc()`, allocates `size` bytes of memory
+with alignment of size `alignment`, storing a pointer to the start of that
+memory in `*memptr`; it does not perform any initialization. This pointer can be
+cast to the desired type of data pointer in order to be dereferenceable. If the
+alignment allocation succeeds, `posix_memalign()` returns `0`; otherwise it
+returns an error value.
+
+`posix_memalign` is similar to `aligned_alloc()` but `alignment` must be a
+power-of-two multiple of `sizeof(void *)`. If the constraints are not satisfied,
+`posix_memalign()` will fail.
+
+`posix_memalign` with `size=0` returns a non-NULL zero-sized pointer.
+(Attempting to access memory at this location is undefined.)
+
+### `free()`
+
+```
+void free(void* ptr);
+```
+
+`free()` deallocates memory previously allocated by `malloc()`, `calloc()`,
+`aligned_alloc()`, `posix_memalign()`, or `realloc()`. If `free()` is passed a
+null pointer, the function does nothing.
+
+### Extensions
+
+These are contained in
+https://github.com/google/tcmalloc/blob/master/tcmalloc/malloc_extension.h.
+
+*   `nallocx(size_t size, int flags)` - Returns the number of bytes that would
+    be allocated by `malloc(size)`, subject to the alignment specified in
+    `flags`.
+*   `sdallocx(void* ptr, size_t size, int flags)` - Deallocates memory allocated
+    by `malloc` or `memalign`. It takes a size parameter to pass the original
+    allocation size, improving deallocation performance.
--- a/src/third_party/tcmalloc/dist/docs/regions-are-not-optional.md
+++ b/src/third_party/tcmalloc/dist/docs/regions-are-not-optional.md
@ -0,0 +1,154 @@
+# Regions Are Not Optional!
+
+Andrew Hunter
+
+Discussion on the design of [Temeraire](temeraire.md) posited that `HugeRegion`
+is a weird/complex feature that possibly is a premature optimization.
+`HugeRegion` is neither optional, nor really all that complex. We claim this is
+actually a fairly simple approach that fixes what would otherwise be a very
+serious flaw.
+
+This expands on the description of `HugeRegion` in the main design doc.
+
+## Our Trilemma
+
+`HugeRegion` exists because of three key framing requirements for a
+Temeraire-enabled TCMalloc:
+
+1.  We must support allocations of any (reasonable) size, and in particular a
+    heap composed of any set of reasonable sizes in any ratio; "sorry, tcmalloc
+    detonates if you mostly use requests of size X" is not acceptable.
+1.  We must be able to back (most, ideally all) of our heap with hugepages.
+1.  We would like to tightly bound global space overhead[^1] on our heap.
+
+Consider requests R<sub>i</sub> that are larger than a hugepage, but small
+enough that the rounding error from extending to a hugepage boundary is
+significant by (3). (Note that rounding up to a hugepage boundary would
+introduce a significant amount of overhead for allocations between 1 and 10
+hugepages, and the overhead could still be considered significant for
+allocations larger than that.)
+
+*   We *cannot* unback the unused tail of the last hugepage (requirement (2)
+    would be violated).
+*   We *cannot* assume these requests are necessarily rare and we will have many
+    smaller ones to fill the unused tail (requirement (1) would be violated).
+    Moreover this is **empirically false** for widely used
+    binaries.
+
+In summary, we must be able to use the unused tail of a hugepage from one
+R<sub>i</sub> as space for another large R<sub>j</sub>. If we do not enable such
+usage in our allocator, we will either potentially have space overhead of up to
+100%, or dramatically reduce our hugepage usage. The conclusion we came to is
+that we **must support**, in some form, allocating multiple such R<sub>i</sub>
+contiguously; that is, using the unused tail from R<sub>1</sub> as the beginning
+of R<sub>2</sub> and so on.
+
+**This is all `HugeRegion{,Set}` does.**
+
+## The "Simple" Truth
+
+The above argument is why we have `HugeRegion`: we need a way to allocate
+multiple large (>1 hugepage) allocations on overlapping hugepages. So how can we
+do that? Clearly, we need some range of hugepages, large enough for several such
+R<sub>i</sub>, from which we allocate. What should we do in that space? A
+best-fit algorithm that tracks the free lengths seems appropriate.
+
+As allocations become free, it seems reasonable (by requirement (3) above) that
+we unback empty hugepages.
+
+Finally, what happens if the range we allocated is full? We could do two
+things:
+
+1.  extend it
+1.  obtain a new one and do allocations from there as needed.
+
+(1) is an interesting choice, but not actually possible with the `SysAllocator`
+interface. We might get lucky with `sbrk` (or even `mmap`, though it is less
+likely) placement choice, but we also might not; we cannot rely on it. So we
+must be able to fall back to (2) anyway, and given that there are very few
+disadvantages to having multiple such ranges (we won’t need very many in any
+case), why not just only do that?
+
+It should not be surprising that we have just described the algorithm
+`HugeRegion{,Set}` uses: inside some fixed-size range, do best-fit allocation
+for large allocations, backing and unbacking hugepages on demand. When one
+region fills, obtain another; fill from the most fragmented to bound total
+overhead (a policy derived from `HugePageFiller`).
+
+That is *really it*. We do not see this as particularly complicated. The only
+thing left is the implementation of that policy: We used `RangeTracker` because
+it was convenient, supported exactly the API we needed, and was fast enough
+(even though we're tracking fairly large bitsets).
+
+## But what about...
+
+There are some reasonable objections to particular details, which we are happy
+to address.
+
+### Why are regions so big?
+
+Because it worked. Virtual address space is virtually free. :) We can easily
+justify why they aren’t 32 MiB (our original choice, as it happens):
+[Temeraire](temeraire.md) contains a simple argument: it is trivial to waste a
+full hugepage per region, and this scales down nicely with increasing region
+size. Why did we go to a gigabyte? Because it worked. :) It had an added
+advantage: even large binaries would only use a handful of regions, and thus
+walking the list was cheap and we could print a lot of info about each in
+mallocz.
+
+We've run more tests; 128 MiB and 512 MiB both perform reasonably, but this
+isn't a compelling reason to change the size. We don't really support VSS limits
+(and in practice we don't have them, outside badly behaved sandbox programs and
+some daemons that use `SMALL_BUT_SLOW` anyway, which we're not currently
+changing).
+
+### How did we pick the current policy for what goes to regions?
+
+Because it worked. The arguments above make it clear that anything larger than
+one hugepage and smaller than &lt;some value we can agree is many&gt; hugepages
+must go there. It seemed reasonable to allow slightly smaller ones to slip into
+the region if we had space and it was needed; we saw no reason not to allow
+many-hugepage allocations there if they fit. In practice, this seems to work
+well. There really isn’t more thought than that.
+
+### Can’t we fix binaries with problematic allocation patterns?
+
+Yes, we can. We probably should. It'd be good to do anyway. However: doing so
+doesn’t stop us from needing Regions:
+
+*   Changing workloads takes a long time.
+*   We cannot successfully change all the programs that make any significant
+    use of allocations &gt;2 MiB and less than (say) 50 MiB. We cannot tell
+    users "Eh, no, tcmalloc does terribly if you allocate a couple megabytes at
+    a time." Requirement (1) above is our expression of how we don't think
+    that's reasonable at all: we should be able to handle 3 MiB allocations
+    without embarrassing ourselves.
+
+Recall that the trilemma leading to regions applies for **anything more than 2
+MiB which we can't just ignore the tail on**. It's easiest to show the potential
+huge problems with the canonical "2.1 MiB" allocation, but 5 MiB or 6.1 MiB or
+even 10.1 MiB allocations, if they're a significant component of heap usage,
+will lead to unacceptable overhead without `HugeRegion`, and we don't think we
+can say "don't do that."
+
+## Conclusion
+
+`HugeRegion` is the simplest possible solution we've found to a pressing problem
+in a hugepage-oriented allocator. When you read the [design doc](temeraire.md),
+please don't assume that HugeRegion is a speculative fix for a potential
+problem that we might not need, nor that it's a roughed-out attempt. This is a
+key part of the algorithm, and one we've thought a lot about the best fix for.
+We don't claim that it is perfect or that we have surely hit on the best fix,
+but "nothing" is not an acceptable solution. This gets reasonable space
+performance with badly sized allocations.
+
+**In short, `HugeRegion` is neither optional nor particularly complex. Having it
+produces dramatic savings in a number of realistic scenarios, and costs us very
+little.**
+
+## Notes
+
+[^1]: What our designed bound of overhead is...a very interesting question.
+    Different places accept different forms of overhead. While we could target
+    the current overhead, we can and must do better than this. One goal of
+    Temeraire is to dramatically cut this (in the pageheap).
--- a/src/third_party/tcmalloc/dist/docs/rseq.md
+++ b/src/third_party/tcmalloc/dist/docs/rseq.md
@ -0,0 +1,424 @@
+# Restartable Sequence Mechanism for TCMalloc
+
+<!--*
+# Document freshness: For more information, see go/fresh-source.
+freshness: { owner: 'ckennelly' reviewed: '2022-12-14' }
+*-->
+
+## per-CPU Caches
+
+TCMalloc implements its per-CPU caches using restartable sequences (`man
+rseq(2)`) on Linux. This kernel feature was developed by
+[Paul Turner and Andrew Hunter at Google](http://www.linuxplumbersconf.net/2013/ocw//system/presentations/1695/original/LPC%20-%20PerCpu%20Atomics.pdf)
+and Mathieu Desnoyers at EfficiOS. Restartable sequences let us execute a region
+to completion (atomically with respect to other threads on the same CPU) or have
+it aborted if interrupted by the kernel by preemption, interrupts, or signal
+handling.
+
+Choosing to restart on migration across cores or preemption allows us to
+optimize the common case - we stay on the same core - by avoiding atomics, over
+the more rare case - we are actually preempted. As a consequence of this
+tradeoff, we need to make our code paths actually support being restarted. The
+entire sequence, except for its final store to memory which *commits* the
+change, must be capable of starting over.
+
+This carries a few implementation challenges:
+
+*   We need fine-grained control over the generated assembly, to ensure stores
+    are not reordered in unsuitable ways.
+*   The restart sequence is triggered if the kernel detects a context switch
+    occurred with the PC in the restartable sequence code. If this happens,
+    instead of resuming at this PC, the kernel restarts the thread at an abort
+    sequence; the abort sequence determines the interrupted restartable
+    sequence, and then returns control to the entry point of this sequence.
+
+    We must preserve adequate state to successfully restart the code sequence.
+    In particular, we must preserve the function parameters so that we can
+    restart the sequence with the same conditions; next we must reload any
+    parameters like the CPU ID, and recompute any necessary values.
+
+## Structure of the `TcmallocSlab`
+
+In per-CPU mode, we allocate an array of `N` `TcmallocSlab::Slabs`. For all
+operations, we index into the array with the logical CPU ID.
+
+Each slab has a header region of control data (one 8-byte header per-size
+class). These index into the remainder of the slab, which contains pointers to
+free listed objects.
+
+![Memory layout of per-cpu data structures](images/per-cpu-cache-internals.png "Memory layout of per-cpu data structures")
+
+In
+[C++ code](https://github.com/google/tcmalloc/blob/master/tcmalloc/internal/percpu_tcmalloc.h),
+these are represented as:
+
+```
+struct Slabs {
+  std::atomic<int64_t> header[NumClasses];
+  void* mem[((1ul << Shift) - sizeof(header)) / sizeof(void*)];
+};
+
+// Slab header (packed, atomically updated 64-bit).
+// All {begin, current, end} values are pointer offsets from per-CPU region
+// start. The slot array is in [begin, end), and the occupied slots are in
+// [begin, current).
+struct Header {
+  // The end offset of the currently occupied slots.
+  uint16_t current;
+  // Copy of end. Updated by Shrink/Grow, but is not overwritten by Drain.
+  uint16_t end_copy;
+  // Lock updates only begin and end with a 32-bit write.
+
+  // The begin offset of the slot array for this size class.
+  uint16_t begin;
+  // The end offset of the slot array for this size class.
+  uint16_t end;
+
+  // Lock is used by Drain to stop concurrent mutations of the Header.
+  // Lock sets begin to 0xffff and end to 0, which makes Push and Pop fail
+  // regardless of current value.
+  bool IsLocked() const;
+  void Lock();
+};
+
+```
+
+The atomic `header` allows us to read the state (esp. for telemetry purposes) of
+a core without undefined behavior.
+
+The fields in `Header` are indexed in `sizeof(void*)` strides into the slab. For
+the default value of `Shift=18`, this allows us to cache nearly 32K objects per
+CPU. Ongoing work encodes `Slabs*` and `Shift` into a single pointer, allowing
+it to be dynamically updated at runtime.
+
+We have allocated capacity for `end-begin` objects for a given size-class.
+`begin` is chosen via static partitioning at initialization time. `end` is
+chosen dynamically at a higher level (in `tcmalloc::CPUCache`), so as to:
+
+*   Avoid running into the next size class's `begin`
+*   Balance cached object capacity across size-classes, according to the
+    specified byte limit.
+
+## Usage: Allocation
+
+As the first operation, we can look at allocation, which needs to read the
+pointer at index `current-1`, return that object, and decrement `current`.
+Decrementing `current` is the *commit* operation.
+
+In pseudo-C++, this looks like:
+
+```
+void* TcmallocSlab_Pop(
+    void *slabs,
+    size_t size_class,
+    UnderflowHandler underflow_handler) {
+  // Expanded START_RSEQ macro...
+restart:
+  __rseq_abi.rseq_cs = &__rseq_cs_TcmallocSlab_Pop;
+start:
+  // Actual sequence
+  uint64_t cpu_id = __rseq_abi.cpu_id;
+  Header* hdr = &slabs[cpu_id].header[size_class];
+  uint64_t current = hdr->current;
+  uint64_t begin = hdr->begin;
+  if (ABSL_PREDICT_FALSE(current <= begin)) {
+    goto underflow;
+  }
+
+  void* next = *(&slabs[cpu_id] + current * sizeof(void*) - 2 * sizeof(void*));
+  prefetcht0(next);
+
+  void* ret = *(&slabs[cpu_id] + current * sizeof(void*) - sizeof(void*));
+  --current;
+  hdr->current = current;
+commit:
+  return ret;
+underflow:
+  return underflow_handler(cpu_id, size_class);
+}
+
+// This is implemented in assembly, but for exposition.
+ABSL_CONST_INIT kernel_rseq_cs __rseq_cs_TcmallocSlab_Pop = {
+  .version = 0,
+  .flags = 0,
+  .start_ip = &&start,
+  .post_commit_offset = &&commit - &&start,
+  .abort_ip = &&abort,
+};
+```
+
+`__rseq_cs_TcmallocSlab_Pop` is a read-only data structure, which contains
+metadata about this particular restartable sequence. When the kernel preempts
+the current thread, it examines this data structure. If the current instruction
+pointer is between `[start, commit)`, it returns control to a specified,
+per-sequence restart header at `abort`.
+
+Since the *next* object is frequently allocated soon after the current object,
+the allocation path prefetches the pointed-to object. To avoid prefetching a
+wild address, we populate `slabs[cpu][begin]` for each CPU/size-class with a
+pointer-to-self.
+
+This sequence terminates with the *single* committing store to `hdr->current`.
+If we are migrated or otherwise interrupted, we restart the preparatory steps,
+as the values of `cpu_id`, `current`, `begin` may have changed.
+
+These operations work on a single core's data and are executed on that core, so
+from a memory ordering perspective, loads and stores need to appear on that core
+in program order.
+
+### Restart Handling
+
+The `abort` label is distinct from `restart`. The `rseq` API provided by the
+kernel (see below) requires a "signature" (typically an intentionally invalid
+opcode) in the 4 bytes prior to the restart handler. We form a small
+trampoline - properly signed - to jump back to `restart`.
+
+In x86 assembly, this looks like:
+
+```
+  // Encode nop with RSEQ_SIGNATURE in its padding.
+  .byte 0x0f, 0x1f, 0x05
+  .long RSEQ_SIGNATURE
+  .local TcmallocSlab_Push_trampoline
+  .type TcmallocSlab_Push_trampoline,@function
+  TcmallocSlab_Push_trampoline:
+abort:
+  jmp restart
+```
+
+This ensures that the 4 bytes prior to `abort` match up with the signature that
+was configured with the `rseq` syscall.
+
+On x86, we can represent this with a nop which would allow for interleaving in
+the main implementation. On other platforms - with fixed width instructions -
+the signature is often chosen to be an illegal/trap instruction, so it has to be
+disjoint from the function's body.
+
+## Usage: Deallocation
+
+Deallocation uses two stores, one to store the deallocated object and another to
+update `current`. This is still compatible with the restartable sequence
+technique, as there is a *single* commit step, updating `current`. Any preempted
+sequences will overwrite the value of the deallocated object until a successful
+sequence commits it by updating `current`.
+
+```
+int TcmallocSlab_Push(
+    void *slabs,
+    size_t size_class,
+    void* item,
+    OverflowHandler overflow_handler) {
+  // Expanded START_RSEQ macro...
+restart:
+  __rseq_abi.rseq_cs = &__rseq_cs_TcmallocSlab_Push;
+start:
+  // Actual sequence
+  uint64_t cpu_id = __rseq_abi.cpu_id;
+  Header* hdr = &slabs[cpu_id].header[size_class];
+  uint64_t current = hdr->current;
+  uint64_t end = hdr->end;
+  if (ABSL_PREDICT_FALSE(current >= end)) {
+    goto overflow;
+  }
+
+  *(&slabs[cpu_id] + current * sizeof(void*) - sizeof(void*)) = item;
+  current++;
+  hdr->current = current;
+commit:
+  return;
+overflow:
+  return overflow_handler(cpu_id, size_class, item);
+}
+```
+
+## Initialization of the Slab
+
+To reduce metadata demands, we lazily initialize the slabs, relying on the
+kernel to provide zeroed pages from the `mmap` call to obtain memory for the
+slab metadata.
+
+At startup, this leaves the `Header` of each initialized to `current = begin =
+end = 0`. The initial push or pop will trigger the overflow or underflow paths
+(respectively), so that we can populate these values.
+
+## More Complex Operations: Batches
+
+When the cache under or overflows, we populate or remove a full batch of objects
+obtained from inner caches. This amortizes some of the lock acquisition/logic
+for those caches. Using a similar approach to push and pop, we read/write a
+batch of `N` items and we update `current` to commit the operation.
+
+## Kernel API and implementation
+
+This section contains notes on the rseq API provided by the kernel, which is not
+well documented, and code pointers for how it is implemented.
+
+The `rseq` syscall is implemented by
+[`sys_rseq`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L304-L366).
+It starts by
+[handling](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L312-L328)
+the case where the thread wants to unregister, implementing that by clearing the
+[rseq information](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/include/linux/sched.h#L1188-L1189)
+out of the `task_struct` for the thread running
+[on the current CPU](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/arch/x86/include/asm/current.h#L11-L18).
+It then moves on to
+[return an error](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L333-L345)
+if the thread is already registered for rseq. Then it
+[validates](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L347-L355)
+and
+[saves](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L356-L357)
+the input from the user, and
+[sets](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L358-L363)
+the
+[`TIF_NOTIFY_RESUME` flag](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/include/linux/sched.h#L2044-L2048)
+for the thread.
+
+### Restarts
+
+Among other things, the user's input to the `rseq` syscall is used by
+`rseq_ip_fixup` to
+[decide](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L232-L238)
+whether we're in a critical section and if so
+[restart](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L247)
+at the abort point. That function is
+[called](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L271)
+by `__rseq_handle_notify_resume`, which is
+[documented](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L251-L261)
+as needing to be called after preemption or signal delivery before returning to
+the user. That in turn is called by
+[`rseq_handle_notify_resume`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/include/linux/sched.h#L2052-L2057),
+a simple wrapper that bails if rseq is not enabled for the thread.
+
+Here is one path that causes us to wind up here on x86:
+
+*   [`rseq_signal_deliver`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/include/linux/sched.h#L2065)
+*   [`setup_rt_frame`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/arch/x86/kernel/signal.c#L690-L691)
+*   [`handle_signal`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/arch/x86/kernel/signal.c#L746)
+*   [`arch_do_signal_or_restart`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/arch/x86/kernel/signal.c#L812-L813)
+*   [`handle_signal_work`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/entry/common.c#L147)
+*   [`exit_to_user_mode_loop`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/entry/common.c#L171)
+*   [`exit_to_user_mode_prepare`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/entry/common.c#L208)
+
+So the choke point is the code that returns to user space. Here are some notes
+on how the restart logic varies based on user input:
+
+*   `rseq_ip_fixup`
+    [calls](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L228)
+    `rseq_get_rseq_cs` every time. That means it
+    [reads](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L123-L124)
+    the
+    [pointer](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/include/uapi/linux/rseq.h#L91-L124)
+    to `struct rseq_cs` and then
+    [indirects](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L131-L133)
+    through it fresh from user memory each time. It
+    [checks](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L135-L145)
+    for invalid cases (which
+    [cause](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L278-L280)
+    a segfault for the user process) and then does
+    [validation](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L147-L157)
+    of the abort IP signature discussed below.
+
+*   Signature validation: from
+    [the code](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L147-L157)
+    linked above we can see that the requirement is that the abort handler
+    specified by `rseq_cs::abort_ip` be preceded by a 32-bit magic integer that
+    [matches](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L152)
+    the one originally provided to and
+    [saved by](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L357)
+    the `rseq` syscall.
+
+    The intent is to avoid turning buffer overflows into arbitrary code
+    execution: if an attacker can write into memory then they can control
+    `rseq_cs::abort_ip`, which is kind of like writing a jump instruction into
+    memory, which can be seen as breaking
+    [W^X](https://en.wikipedia.org/wiki/W%5EX) protections. Instead the kernel
+    has the caller pre-register a magic value from the executable memory that
+    they want to run, under the assumption that an attacker is unlikely to be
+    able to find other usable "gadgets" in executable memory that happen to be
+    preceded by that value.
+
+It's also worth noting that signals and preemption always
+[result in](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L238-L242)
+[clearing](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L197-L210)
+`rseq::rseq_cs::ptr64` from user space memory on the way out, except in error
+cases that cause a segfault.
+
+### CPU IDs
+
+The other thing `rseq.c` takes care of is writing CPU IDs to user space memory.
+
+There are two fields in user space that get this information:
+[`rseq::cpu_id_start`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/include/uapi/linux/rseq.h#L63-L75)
+and
+[`rseq::cpu_id`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/include/uapi/linux/rseq.h#L76-L90).
+The difference between the two is that `cpu_id_start` is always in range,
+whereas `cpu_id` may contain error values. The kernel provides both in order to
+support computation of values derived from the CPU ID that happens before
+entering the critical section. We could do this with one CPU ID, but it would
+require an extra branch to distinguish "not initialized" from "CPU ID changed
+after fetching it". On the other hand if (like tcmalloc) you only fetch the CPU
+ID within a critical section, then you need only one field because you have only
+one branch: am I initialized. There is no such thing as a CPU mismatch because
+instead you are just restarted when the CPU ID changes.
+
+The two CPU ID fields are maintained as follows:
+
+*   [`rseq_update_cpu_id`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L84-L94)
+    writes a CPU ID into each. This is
+    [called](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L274-L275)
+    by `__rseq_handle_notify_resume`, which is discussed above.
+
+*   [`rseq_reset_rseq_cpu_id`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L96-L113)
+    sets the `cpu_id_start` field to zero and the `cpu_id` field to
+    [`RSEQ_CPU_ID_UNINITIALIZED`](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/include/uapi/linux/rseq.h#L17)
+    (an out of range value). It is
+    [called](https://github.com/torvalds/linux/blob/414eece95b98b209cef0f49cfcac108fd00b8ced/kernel/rseq.c#L322)
+    in the unregister path discussed above.
+
+## Cross-CPU Operations
+
+With restartable sequences, we've optimized the fast path for same-CPU
+operations at the expense of costlier cross-CPU operations. Cross-CPU operations
+are rare&mdash;typically done only to facilitate periodic drains of idle
+caches&mdash;so this is a desirable tradeoff.
+
+Cross-CPU operations rely on operating system assistance (wrapped in
+`tcmalloc::tcmalloc_internal::subtle::percpu::FenceCpu`) to interrupt any
+running restartable sequences on the remote core. When control is returned to
+the thread running on that core, we have guaranteed that either the restartable
+sequence that was running has completed *or* that the restartable sequence was
+preempted.
+
+We use preemption and "locks" (`TcmallocSlab::Header::Lock`) to ensure that
+during a particular period, all accesses to the fast path will fail&mdash;the
+cache is both simultaneously "full" and "empty" so all inserts and removes will
+go to the slow path. Unlike using `sched_setaffinity` to run on a remote core, this
+approach allows us to perform longer operations, such as taking elements from
+the cache and inserting them into the `TransferCache` as part of `Drain`, while
+still maintaining correctness.
+
+Since we are using relaxed loads and stores, potentially with word-level
+granularity, our operations need to potentially store part of the needed data to
+`Header`, fence, and then write additional fields. For example, at the end of
+`Drain`, we:
+
+*   Store `hdr.current`, with `hdr.begin = 0xFFFF` and `hdr.end = 0x0`, ensuring
+    insert and remove operations continue to fail.
+*   `FenceCpu`
+*   Store `hdr.begin` and `hdr.end` to their proper values.
+
+This sequence ensures that a thread running on the remote core can only see one
+of:
+
+*   `hdr.current = X`; `hdr.begin = 0xFFFF`; `hdr.end = 0x0`
+*   `hdr.current = Y`; `hdr.begin = 0xFFFF`; `hdr.end = 0x0`
+*   `hdr.current = Y`; `hdr.begin = Y`; `hdr.end = Y`
+
+`FenceCpu` ensures that after it completes, no thread can see `current=X` any
+longer.
+
+If we did a single store or omitted the intervening fence operation, a thread on
+the remote core could potentially see `hdr.begin = Y < hdr.current = X` and
+attempt to remove an element from the cache. (This failure would lead to data
+corruption as the element had already been "deallocated" to the `TransferCache`,
+essentially triggering a double-free.)
--- a/src/third_party/tcmalloc/dist/docs/sampling.md
+++ b/src/third_party/tcmalloc/dist/docs/sampling.md
@ -0,0 +1,64 @@
+# How sampling in TCMalloc works.
+
+## Introduction
+
+TCMalloc uses sampling to get representative data on memory usage and
+allocation. How this works is not well documented. This doc attempts to at least
+partially fix this.
+
+## Sampling
+
+We chose to sample an allocation every N bytes where N is a random value using
+[Sampler::PickNextSamplingPoint()](https://github.com/google/tcmalloc/blob/master/tcmalloc/sampler.cc)
+with a mean set by the profile sample rate using
+[MallocExtension::SetProfileSamplingRate()](https://github.com/google/tcmalloc/blob/master/tcmalloc/malloc_extension.h).
+By default this is every 2MiB.
+
+## How We Sample Allocations
+
+When we pick an allocation such as
+[Sampler::RecordAllocationSlow()](https://github.com/google/tcmalloc/blob/master/tcmalloc/sampler.cc)
+to sample we do some additional processing around that allocation using
+[SampleifyAllocation()](https://github.com/google/tcmalloc/blob/master/tcmalloc/allocation_sampling.h) -
+recording stack, alignment, request size, and allocation size. Then we go
+through all the active samplers using
+[ReportMalloc()](https://github.com/google/tcmalloc/blob/master/tcmalloc/allocation_sample.h)
+and tell them about the allocation. We also tell the span that we're sampling
+it - we can do this because we do sampling at tcmalloc page sizes, so each
+sample corresponds to a particular page in the pagemap.
+
+## How We Free Sampled Objects
+
+Each sampled allocation is tagged. So we can quickly test whether a particular
+allocation might be a sample.
+
+When we are done with the sampled span we release it using
+[tcmalloc::Span::Unsample()](https://github.com/google/tcmalloc/blob/master/tcmalloc/span.cc).
+
+## How Do We Handle Heap and Fragmentation Profiling
+
+To handle heap and fragmentation profiling we just need to traverse the list of
+sampled objects and compute either their degree of fragmentation, or the amount
+of heap they consume.
+
+## How Do We Handle Allocation Profiling
+
+Allocation profiling reports a list of sampled allocations during a length of
+time. We start an allocation profile using
+[MallocExtension::StartAllocationProfiling()](https://github.com/google/tcmalloc/blob/master/tcmalloc/malloc_extension.h),
+then wait until time has elapsed, then call `Stop` on the token, and report the
+profile.
+
+While the allocation sampler is active it is added to the list of samplers for
+allocations and removed from the list when it is claimed.
+
+## How Do We Handle Lifetime Profiling
+
+Lifetime profiling reports a list of object lifetimes as pairs of allocation and
+deallocation records. Profiling is initiated by calling
+[MallocExtension::StartLifetimeProfiling()](https://github.com/google/tcmalloc/blob/master/tcmalloc/malloc_extension.h).
+Profiling continues until `Stop` is invoked on the token. Lifetimes are only
+reported for objects where allocation *and* deallocation are observed while
+profiling is active. A description of the sampling based lifetime profiler can
+be found in Section 4 of
+["Learning-based Memory Allocation for C++ Server Workloads, ASPLOS 2020"](https://research.google/pubs/pub49008/).
--- a/src/third_party/tcmalloc/dist/docs/stats.md
+++ b/src/third_party/tcmalloc/dist/docs/stats.md
@ -0,0 +1,961 @@
+# Understanding Malloc Stats
+
+## Getting Malloc Stats
+
+Human-readable statistics can be obtained by calling
+`tcmalloc::MallocExtension::GetStats()`.
+
+## Understanding Malloc Stats Output
+
+### It's A Lot Of Information
+
+The output contains a lot of information. Much of it can be considered debug
+info that's interesting to folks who are passingly familiar with the internals
+of TCMalloc, but potentially not that useful for most people.
+
+### Summary Section
+
+The most generally useful section is the first few lines:
+
+```
+See https://github.com/google/tcmalloc/tree/master/docs/stats.md for an explanation of this page
+------------------------------------------------
+MALLOC:    10858234672 (10355.2 MiB) Bytes in use by application
+MALLOC: +    827129856 (  788.8 MiB) Bytes in page heap freelist
+MALLOC: +    386098400 (  368.2 MiB) Bytes in central cache freelist
+MALLOC: +    105330688 (  100.5 MiB) Bytes in per-CPU cache freelist
+MALLOC: +      9095680 (    8.7 MiB) Bytes in transfer cache freelist
+MALLOC: +       660976 (    0.6 MiB) Bytes in thread cache freelists
+MALLOC: +     49333930 (   47.0 MiB) Bytes in malloc metadata
+MALLOC: +       629440 (    0.6 MiB) Bytes in malloc metadata Arena unallocated
+MALLOC: +      1599704 (    1.5 MiB) Bytes in malloc metadata Arena unavailable
+MALLOC:   ------------
+MALLOC: =  12238113346 (11671.2 MiB) Actual memory used (physical + swap)
+MALLOC: +    704643072 (  672.0 MiB) Bytes released to OS (aka unmapped)
+MALLOC:   ------------
+MALLOC: =  12942756418 (12343.2 MiB) Virtual address space used
+```
+
+*   **Bytes in use by application:** Number of bytes that the application is
+    actively using to hold data. This is computed by the bytes requested from
+    the OS minus any bytes that are held in caches and other internal data
+    structures.
+*   **Bytes in page heap freelist:** The pageheap is a structure that holds
+    memory ready for TCMalloc to use. This memory is not actively being used,
+    and could be returned to the OS. [See TCMalloc tuning](tuning.md)
+*   **Bytes in central cache freelist:** This is the amount of memory currently
+    held in the central freelist. This is a structure that holds partially used
+    "[spans](#more-detail-on-metadata)" of memory. The spans are partially used
+    because some memory has been allocated from them, but not entirely used -
+    since they have some free memory on them.
+*   **Bytes in per-CPU cache freelist:** In per-cpu mode (which is the default)
+    each CPU holds some memory ready to quickly hand to the application. The
+    maximum size of this per-cpu cache is tunable.
+    [See TCMalloc tuning](tuning.md)
+*   **Bytes in transfer cache freelist:** The transfer cache can be considered
+    another part of the central freelist. It holds memory that is ready to be
+    provided to the application for use.
+*   **Bytes in thread cache freelists:** The TC in TCMalloc stands for thread
+    cache. Originally each thread held its own cache of memory to provide to the
+    application. Since the change of default to per-cpu caches, the thread
+    caches are used by very few applications. However, TCMalloc starts in
+    per-thread mode, so there may be some memory left in per-thread caches from
+    before it switches into per-cpu mode.
+*   **Bytes in malloc metadata:** the size of the data structures used for
+    tracking memory allocation. This will grow as the amount of memory used
+    grows.
+*   **Bytes in malloc metadata Arena unallocated:** Metadata is allocated in an
+    internal Arena. Memory requests to the OS are made in blocks which amortize
+    several Arena allocations and this captures memory that is not yet allocated
+    but could be by future Arena allocations.
+*   **Bytes in malloc metadata Arena unavailable:** The Arena allocator may fail
+    to allocate a block fully when a subsequent Arena allocation request is made
+    that is larger than the block's remaining space. This memory is currently
+    unavailable for allocation.
+
+There are a couple of summary lines:
+
+*   **Actual memory used:** This is the total amount of memory that TCMalloc
+    thinks it is using in the various categories. This is computed from the size
+    of the various areas, the actual contribution to RSS may be larger or
+    smaller than this value. The true RSS may be less if memory is not mapped
+    in. In some cases RSS can be larger if small regions end up being mapped
+    with huge pages. This does not count memory that TCMalloc is not aware of
+    (e.g. memory-mapped files, text segments, etc.)
+*   **Bytes released to OS:** TCMalloc can release memory back to the OS (see
+    [tcmalloc tuning](tuning.md)), and this is the upper bound on the amount of
+    released memory. However, it is up to the OS as to whether the act of
+    releasing the memory actually reduces the RSS of the application. The code
+    uses `MADV_DONTNEED`/`MADV_REMOVE` which tells the OS that the memory is no
+    longer needed.
+*   **Virtual address space used:** This is the amount of virtual address space
+    that TCMalloc believes it is using. This should match the later section on
+    requested memory. There are other ways that an application can increase its
+    virtual address space, and this statistic does not capture them.
+
+### More Detail On Metadata
+
+The next section gives some insight into the amount of metadata that TCMalloc is
+using. This is really debug information, and not very actionable.
+
+```
+MALLOC:         236176               Spans in use
+MALLOC:         238709 (   10.9 MiB) Spans created
+MALLOC:              8               Thread heaps in use
+MALLOC:             46 (    0.0 MiB) Thread heaps created
+MALLOC:          13517               Stack traces in use
+MALLOC:          13742 (    7.2 MiB) Stack traces created
+MALLOC:              0               Table buckets in use
+MALLOC:           2808 (    0.0 MiB) Table buckets created
+MALLOC:       11665416 (   11.1 MiB) Pagemap bytes used
+MALLOC:        4067336 (    3.9 MiB) Pagemap root resident bytes
+```
+
+*   **Spans:** structures that hold multiple [pages](#page-sizes) of allocatable
+    objects.
+*   **Thread heaps:** These are the per-thread structures used in per-thread
+    mode.
+*   **Stack traces:** These hold metadata for each sampled object.
+*   **Table buckets:** These hold data for stack traces for sampled events.
+*   **Pagemap:** This data structure supports the mapping of object addresses to
+    information about the objects held on the page. The pagemap root is a
+    potentially large array, and it is useful to know how much of it is actually
+    memory resident.
+
+### Realized Fragmentation
+
+```
+MALLOC:    12238113346 (11671.2 MiB) Actual memory used at peak
+MALLOC:    11626207678 (11087.6 MiB) Estimated in-use at peak
+MALLOC:              5.2632          Realized fragmentation (%)
+```
+
+Memory overhead at peak demand is more important than off-peak, since we need to
+provision a process with sufficient memory to run during its peak requirements
+without OOM'ing. After a peak in demand, memory may be deallocated and held in
+caches in anticipation of future reuse. Overhead as a fraction of the remaining
+live allocations rises, but no additional memory is required.
+
+This metric is called "realized fragmentation" and described in ["Adaptive
+Hugepage Subrelease for Non-moving Memory Allocators in Warehouse-Scale
+Computers"](https://research.google/pubs/pub50436/) (ISMM 2021). The realized
+fragmentation metric computed here is a snapshot over the life of the entire
+process.
+
+These realized fragmentation stats in the summary table indicate a snapshot of
+conditions when TCMalloc used a peak in its physical memory. As of April 2022,
+the in-use at peak number is estimated from TCMalloc's periodic allocation
+sampling.
+
+### Page Sizes
+
+There are three relevant "page" sizes for systems and TCMalloc. It's important
+to be able to disambiguate them.
+
+*   **System default page size:** this is not reported by TCMalloc. This is 4KiB
+    on x86. It's not referred to in TCMalloc, and it's not important, but it's
+    important to know that it is different from the sizes of pages used in
+    TCMalloc.
+*   **TCMalloc page size:** This is the basic unit of memory management for
+    TCMalloc. Objects on the same page are the same number of bytes in size.
+    Internally TCMalloc manages memory in chunks of this size. TCMalloc supports
+    4 sizes: 4KiB (small but slow), 8KiB (the default), 32 KiB (large), 256 KiB
+    (256 KiB pages). There are trade-offs around the page sizes:
+    *   Smaller page sizes are more memory efficient because we have less
+        fragmentation (ie left over space) when trying to provide the requested
+        amount of memory using 4KiB chunks. It's also more likely that all the
+        objects on a 4KiB page will be freed allowing the page to be returned
+        and used for a different size of data.
+    *   Larger pages result in fewer fetches from the page heap to provide a
+        given amount of memory. They also keep allocated objects of the same
+        size in closer proximity.
+*   **TCMalloc hugepage size:** This is the size of a hugepage on the system,
+    for x86 this is 2MiB. This size is used as a unit of management by
+    temeraire, but not used by the pre-temeraire pageheap.
+
+```
+MALLOC:          32768               Tcmalloc page size
+MALLOC:        2097152               Tcmalloc hugepage size
+```
+
+### Experiments
+
+There is an experiment framework embedded into TCMalloc.
+The enabled experiments are reported as part of the statistics.
+
+```
+MALLOC EXPERIMENTS: TCMALLOC_TEMERAIRE=0 TCMALLOC_TEMERAIRE_WITH_SUBRELEASE_V3=0
+```
+
+### Actual Memory Footprint
+
+The output also reports the memory size information recorded by the OS:
+
+*   Bytes resident is the amount of physical memory in use by the application
+    (RSS). This includes things like program text which is excluded from the
+    information that TCMalloc presents.
+*   Bytes mapped is the size of the virtual address space in use by the
+    application (VSS). This can be substantially larger than the virtual memory
+    reported by TCMalloc as applications can increase VSS in other ways. It's
+    also not that useful as a metric since the VSS is a limit to the RSS, but
+    not directly related to the amount of physical memory that the application
+    uses.
+
+```
+Total process stats (inclusive of non-malloc sources):
+TOTAL:  86880677888 (82855.9 MiB) Bytes resident (physical memory used)
+TOTAL:  89124790272 (84996.0 MiB) Bytes mapped (virtual memory used)
+```
+
+### Per Size-Class Information
+
+Requests for memory are rounded to convenient sizes. For example a request for
+15 bytes could be rounded to 16 bytes. These sizes are referred to as class
+sizes. There are various caches in TCMalloc where memory gets held, and the per
+size-class section reports how much memory is being used by cached objects of
+each size. The columns reported for each size-class are:
+
+*   The size of each object in that size-class.
+*   The number of objects of that size currently held in the per-cpu,
+    per-thread, transfer, and central caches.
+*   The total size of those objects in MiB - ie size of each object multiplied
+    by the number of objects.
+*   The cumulative size of that size-class plus all smaller size-classes.
+*   The number of live pages dedicated to this size-class.
+*   The number of returned and requested spans of this size-class.
+
+```
+Total size of freelists for per-thread and per-CPU caches,
+transfer cache, and central cache, as well as number of
+live pages, returned/requested spans by size-class
+------------------------------------------------
+class   1 [        8 bytes ] :    45645 objs;   0.3 MiB;   0.3 cum MiB;       73 live pages; spans:     19 ret /     92 req = 0.2065;
+class   2 [       16 bytes ] :    39942 objs;   0.6 MiB;   1.0 cum MiB;      120 live pages; spans:      3 ret /    123 req = 0.0244;
+class   3 [       24 bytes ] :    84130 objs;   1.9 MiB;   2.9 cum MiB;      807 live pages; spans:   1330 ret /   2137 req = 0.6224;
+class   4 [       32 bytes ] :   107271 objs;   3.3 MiB;   6.2 cum MiB;     1048 live pages; spans:    420 ret /   1468 req = 0.2861;
+class   5 [       40 bytes ] :    82230 objs;   3.1 MiB;   9.3 cum MiB;      790 live pages; spans:    962 ret /   1752 req = 0.5491;
+...
+```
+
+### Central Cache Free List Span Utilization
+
+Central cache free list manages memory in spans, where each span is a collection
+of one or more TCMalloc pages. We track a histogram of span utilization, where
+each column refers to the number of spans with allocated objects less than N.
+
+```
+------------------------------------------------
+Central cache freelist: Span utilization histogram
+Non-cumulative number of spans with allocated objects < N
+------------------------------------------------
+class   1 [        8 bytes ] :      0 < 1,     0 < 2,     0 < 4,     0 < 8,     0 < 16,     1 < 32,     0 < 64,     1 < 128,     1 < 256,     1 < 512,     0 < 1024,     0 < 2048,     4 < 4096,    16 < 8192,     0 < 16384,     0 < 32768,     0 < 65536
+class   2 [       16 bytes ] :      0 < 1,     0 < 2,     0 < 4,     0 < 8,     0 < 16,     0 < 32,     0 < 64,     0 < 128,     0 < 256,     0 < 512,     1 < 1024,     0 < 2048,    47 < 4096,     0 < 8192,     0 < 16384,     0 < 32768,     0 < 65536
+class   3 [       24 bytes ] :      0 < 1,     0 < 2,     0 < 4,     0 < 8,     0 < 16,     0 < 32,     0 < 64,     2 < 128,     1 < 256,     3 < 512,     5 < 1024,   127 < 2048,     0 < 4096,     0 < 8192,     0 < 16384,     0 < 32768,     0 < 65536
+class   4 [       32 bytes ] :      0 < 1,     0 < 2,     0 < 4,     0 < 8,     0 < 16,     0 < 32,     0 < 64,     0 < 128,     0 < 256,     1 < 512,     0 < 1024,   129 < 2048,     0 < 4096,     0 < 8192,     0 < 16384,     0 < 32768,     0 < 65536
+class   5 [       40 bytes ] :      0 < 1,     1 < 2,     1 < 4,     0 < 8,     0 < 16,     0 < 32,     1 < 64,     1 < 128,     4 < 256,     5 < 512,    80 < 1024,     0 < 2048,     0 < 4096,     0 < 8192,     0 < 16384,     0 < 32768,     0 < 65536
+...
+```
+
+### Transfer Cache Information
+
+Transfer cache is used by TCMalloc, before going to central free list. For each
+size-class, we track and report the following statistics:
+
+*   The size of each object in that size-class.
+*   The number of objects of that size currently held in the transfer cache.
+*   The total size of those objects in MiB - i.e. size of each object multiplied
+    by the number of objects in the freelist.
+*   The cumulative size of that size-class plus all smaller size-classes.
+*   The current capacity of the freelist.
+*   The maximum capacity to which the freelist is allowed to grow.
+*   The number of hits observed during inserts to the transfer cache.
+*   The total number of batched and non-batched misses observed during insert
+    operations.
+*   The number of partial (i.e. non-batch-sized) misses observed during insert
+    operations.
+*   The number of hits observed during removes from the transfer cache.
+*   The total number of batched and non-batched misses observed during remove
+    operations.
+*   The number of partial (i.e. non-batch-sized) misses observed during remove
+    operations.
+
+```
+------------------------------------------------
+Used bytes, current capacity, and maximum allowed capacity
+of the transfer cache freelists.
+It also reports insert/remove hits/misses by size class.
+------------------------------------------------
+class   1 [        8 bytes ] :     1472 objs;   0.0 MiB;    0.0 cum MiB;  2048 capacity;  2048 max_capacity;      935 insert hits;     8543 insert misses (    4507 partial);      889 remove hits;     6612 remove misses (      86 partial);
+class   2 [       16 bytes ] :      608 objs;   0.0 MiB;    0.0 cum MiB;  2048 capacity;  2048 max_capacity;      575 insert hits;     3739 insert misses (    3602 partial);      556 remove hits;     3368 remove misses (      70 partial);
+class   3 [       24 bytes ] :      864 objs;   0.0 MiB;    0.0 cum MiB;  2048 capacity;  2048 max_capacity;     1533 insert hits;    15594 insert misses (    9417 partial);     1506 remove hits;    11939 remove misses (      74 partial);
+class   4 [       32 bytes ] :       96 objs;   0.0 MiB;    0.0 cum MiB;  2048 capacity;  2048 max_capacity;     1065 insert hits;    21772 insert misses (   19918 partial);     1061 remove hits;     6403 remove misses (     119 partial);
+class   5 [       40 bytes ] :     1408 objs;   0.1 MiB;    0.1 cum MiB;  2048 capacity;  2048 max_capacity;     1475 insert hits;    16018 insert misses (   14943 partial);     1431 remove hits;     3293 remove misses (      60 partial);
+class   6 [       48 bytes ] :     1664 objs;   0.1 MiB;    0.2 cum MiB;  2048 capacity;  2048 max_capacity;     1213 insert hits;    39140 insert misses (   37096 partial);     1160 remove hits;     5909 remove misses (      80 partial);
+class   7 [       56 bytes ] :     1792 objs;   0.1 MiB;    0.3 cum MiB;  2048 capacity;  2048 max_capacity;      466 insert hits;      650 insert misses (     375 partial);      410 remove hits;     1264 remove misses (      55 partial);
+class   8 [       64 bytes ] :     1408 objs;   0.1 MiB;    0.4 cum MiB;  2048 capacity;  2048 max_capacity;     2181 insert hits;     8816 insert misses (    8069 partial);     2137 remove hits;     2024 remove misses (      74 partial);
+class   9 [       72 bytes ] :      960 objs;   0.1 MiB;    0.4 cum MiB;  1600 capacity;  2048 max_capacity;      104 insert hits;      463 insert misses (     463 partial);       74 remove hits;      287 remove misses (      62 partial);
+class  10 [       80 bytes ] :     1056 objs;   0.1 MiB;    0.5 cum MiB;  2048 capacity;  2048 max_capacity;      372 insert hits;     3334 insert misses (    3287 partial);      339 remove hits;      562 remove misses (      80 partial);
+...
+```
+
+As of July 2021, the `TransferCache` misses when inserting or removing a
+non-batch size number of objects from the cache. These are reflected in the
+"partial" column. The insert and remove miss column is *inclusive* of misses for
+both batch size and non-batch size numbers of objects.
+
+### Per-CPU Information
+
+If the per-cpu cache is enabled then we get a report of the memory currently
+being cached on each CPU.
+
+The first number reported is the maximum size of the per-cpu cache on each CPU.
+This corresponds to the parameter `MallocExtension::GetMaxPerCpuCacheSize()`,
+which defaults to 1.5MiB. [See tuning](tuning.md)
+
+The following columns are reported for each CPU:
+
+*   The cpu ID
+*   The total size of the objects held in the CPU's cache in bytes.
+*   The total size of the objects held in the CPU's cache in MiB.
+*   The total number of unallocated bytes.
+
+The concept of unallocated bytes needs to be explained because the definition is
+not obvious.
+
+The per-cpu cache is an array of pointers to available memory. Each size-class
+has a number of entries that it can use in the array. These entries can be used
+to hold memory, or be empty.
+
+To control the maximum memory that the per-cpu cache can use we sum up the
+number of slots that can be used by a size-class multiplied by the size of
+objects in that size-class. This gives us the total memory that could be held in
+the cache. This is not what is reported by unallocated memory.
+
+Unallocated memory is the amount of memory left over from the per cpu limit
+after we have subtracted the total memory that could be held in the cache.
+
+The in use memory is calculated from the sum of the number of populated entries
+in the per-cpu array multiplied by the size of the objects held in those
+entries.
+
+To summarise, the per-cpu limit (which is reported before the per-cpu data) is
+equal to the number of bytes in use (which is reported in the second column)
+plus the number of bytes that could be used (which is not reported) plus the
+unallocated "spare" bytes (which is reported as the last column).
+
+```
+Bytes in per-CPU caches (per cpu limit: 3145728 bytes)
+------------------------------------------------
+cpu   0:      2168200 bytes (    2.1 MiB) with       52536 bytes unallocated active
+cpu   1:      1734880 bytes (    1.7 MiB) with      258944 bytes unallocated active
+cpu   2:      1779352 bytes (    1.7 MiB) with        8384 bytes unallocated active
+cpu   3:      1414224 bytes (    1.3 MiB) with      112432 bytes unallocated active
+cpu   4:      1260016 bytes (    1.2 MiB) with      179800 bytes unallocated
+...
+```
+
+Some CPU caches may be marked `active`, indicating that the process is currently
+runnable on that CPU.
+
+### Size Class Capacity Information in Per-CPU Caches
+
+In per-CPU caches, TCMalloc caches objects of discrete sizes. These are referred
+to as size classes. Memory requests for a particular object size are rounded off
+to a convenient size class. TCMalloc populates objects in each size class based
+on their demand, but also imposes an upper limit on the number of objects that
+may be cached per size class. The statistics below measure the capacity of each
+size class freelist, where capacity represents the total number of objects
+currently cached by the freelist. The columns below report number of objects
+cached by TCMalloc per size class:
+
+*   Size class.
+*   The size of each object in that size class.
+*   Minimum capacity of the size class freelist summarized over all per-CPU
+    caches.
+*   Average capacity of the size class freelist summarized over all per-CPU
+    caches.
+*   Maximum capacity of the size class freelist summarized over all per-CPU
+    caches.
+*   The upper limit imposed by TCMalloc on the number of objects that can be
+    cached in a per-CPU cache for that size class.
+
+```
+------------------------------------------------
+Size class capacity statistics in per-cpu caches
+------------------------------------------------
+class   0 [        0 bytes ] :      0 (minimum),    0.0 (average),     0 (maximum),     0 maximum allowed capacity
+class   1 [        8 bytes ] :      0 (minimum),  133.1 (average),   636 (maximum),  2048 maximum allowed capacity
+class   2 [       16 bytes ] :      0 (minimum),   51.8 (average),   378 (maximum),  2048 maximum allowed capacity
+class   3 [       24 bytes ] :      0 (minimum),  119.3 (average),   510 (maximum),  2048 maximum allowed capacity
+class   4 [       32 bytes ] :      0 (minimum),  100.0 (average),   542 (maximum),  2048 maximum allowed capacity
+class   5 [       40 bytes ] :      0 (minimum),   80.6 (average),   467 (maximum),  2048 maximum allowed capacity
+```
+
+### Number of per-CPU cache underflows, overflows, and reclaims
+
+We also keep track of cache miss counts. Underflows are when the user allocates
+and the cache does not have any pointers to return. Overflows are when the user
+deallocates and the cache is full. The ratio of overflows to underflows gives a
+rough indication of whether the cache is large enough. If the cache had infinite
+capacity, then we would expect to have 0 overflows whereas if the cache had 0
+capacity, we would expect to see roughly equal numbers of overflows and
+underflows. Therefore, if the ratio is close to 1.0, then the cache may not be
+large enough. Reclaims are when we empty out a cache for a specific CPU because
+it has been idle for a period of time. In this section, we report the total
+numbers of each of these metrics across all CPUs as well as the numbers for each
+individual CPU.
+
+```
+------------------------------------------------
+Number of per-CPU cache underflows, overflows, and reclaims
+------------------------------------------------
+Total  :         242 underflows,          12 overflows, overflows / underflows:  0.05,          168 reclaims
+cpu   0:          69 underflows,           5 overflows, overflows / underflows:  0.07,           46 reclaims
+cpu   1:          58 underflows,           0 overflows, overflows / underflows:  0.00,           42 reclaims
+cpu   2:          62 underflows,           7 overflows, overflows / underflows:  0.11,           42 reclaims
+cpu   3:          40 underflows,           0 overflows, overflows / underflows:  0.00,           27 reclaims
+cpu   4:          13 underflows,           0 overflows, overflows / underflows:  0.00,           11 reclaims
+cpu   5:           0 underflows,           0 overflows, overflows / underflows:  0.00,            0 reclaims
+```
+
+### Pageheap Information
+
+The pageheap holds pages of memory that are not currently being used either by
+the application or by TCMalloc's internal caches. These pages are grouped into
+spans - which are ranges of contiguous pages, and these spans can be either
+mapped (backed by physical memory) or unmapped (not necessarily backed by
+physical memory).
+
+Memory from the pageheap is used either to replenish the per-thread or per-cpu
+caches, or to directly satisfy requests that are larger than the sizes supported
+by the per-thread or per-cpu caches.
+
+**Note:** TCMalloc cannot tell whether a span of memory is actually backed by
+physical memory, but it uses *unmapped* to indicate that it has told the OS that
+the span is not used and does not need the associated physical memory. For this
+reason the physical memory of an application may be larger than the amount that
+TCMalloc reports.
+
+The pageheap section contains the following information:
+
+*   The first line reports the number of sizes of spans, the total memory that
+    these spans cover, and the total amount of that memory that is unmapped.
+*   The size of the span in number of pages.
+*   The number of spans of that size.
+*   The total memory consumed by those spans in MiB.
+*   The cumulative total memory held in spans of that size and fewer pages.
+*   The amount of that memory that has been unmapped.
+*   The cumulative amount of unmapped memory for spans of that size and smaller.
+
+```
+PageHeap: 30 sizes;  480.1 MiB free;  318.4 MiB unmapped
+------------------------------------------------
+  1 pages *  341 spans ~  10.7 MiB;   10.7 MiB cum; unmapped:    1.9 MiB;    1.9 MiB cum
+  2 pages *  469 spans ~  29.3 MiB;   40.0 MiB cum; unmapped:    0.0 MiB;    1.9 MiB cum
+  3 pages *  462 spans ~  43.3 MiB;   83.3 MiB cum; unmapped:    3.3 MiB;    5.2 MiB cum
+  4 pages *  119 spans ~  14.9 MiB;   98.2 MiB cum; unmapped:    0.1 MiB;    5.3 MiB cum
+...
+```
+
+### Pageheap Cache Age
+
+The next section gives some indication of the age of the various spans in the
+pageheap. Live (i.e., backed by physical memory) and unmapped spans are
+reported separately.
+
+The columns indicate roughly how long the span has been in the pageheap, ranging
+from less than a second to more than 8 hours.
+
+```
+------------------------------------------------
+PageHeap cache entry age (count of pages in spans of a given size that have been idle for up to the given period of time)
+------------------------------------------------
+                            mean     <1s      1s     30s      1m     30m      1h     8+h
+Live span     TOTAL PAGES:   9.1     533   13322      26    1483       0       0       0
+Live span,        1 pages:   7.4       0     256       0      24       0       0       0
+Live span,        2 pages:   1.6      38     900       0       0       0       0       0
+...
+Unmapped span TOTAL PAGES: 153.9     153    2245    1801    5991       0       0       0
+Unmapped span,    1 pages:  34.6       0      35      15      11       0       0       0
+Unmapped span,    3 pages:  28.4       0      60      42       3       0       0       0
+...
+```
+
+### Pageheap Allocation Summary
+
+This reports some stats on the number of pages allocated.
+
+*   The number of live (i.e., not on page heap) pages that were "small"
+    allocations. Small allocations are ones that are tracked in the pageheap by
+    size (e.g., a region of two pages in size). Larger allocations are just kept
+    in an array that has to be scanned linearly.
+*   The pages of slack result from situations where allocation is rounded up to
+    hugepages, and this leaves some spare pages.
+*   The largest seen allocation is self-explanatory.
+
+```
+PageHeap: stats on allocation sizes
+PageHeap: 344420 pages live small allocation
+PageHeap: 12982 pages of slack on large allocations
+PageHeap: largest seen allocation 29184 pages
+```
+
+### Pageheap Per Number Of Pages In Range
+
+This starts off reporting the activity for small ranges of pages, but at the end
+of the list starts aggregating information for groups of page ranges.
+
+*   The first column contains the number of pages (or the range of pages if the
+    bucket is wider than a single page).
+*   The second and third columns are the number of allocated and freed pages we
+    have seen of this size.
+*   The fourth column is the number of live allocations of this size.
+*   The fifth column is the size of those live allocations in MiB.
+*   The sixth column is the allocation rate in pages per second since the start
+    of the application.
+*   The seventh column is the allocation rate in MiB per second since the start
+    of the application.
+
+```
+PageHeap: per-size information:
+PageHeap: 1 page info: 23978897 / 23762891 a/f, 216006 (6750.2 MiB) live, 2.43e+03 allocs/s ( 76.1 MiB/s)
+PageHeap: 2 page info: 21442844 / 21436331 a/f,   6513 ( 407.1 MiB) live, 2.18e+03 allocs/s (136.0 MiB/s)
+PageHeap: 3 page info:  2333686 /  2329225 a/f,   4461 ( 418.2 MiB) live,      237 allocs/s ( 22.2 MiB/s)
+PageHeap: 4 page info: 21509168 / 21508751 a/f,    417 (  52.1 MiB) live, 2.18e+03 allocs/s (272.9 MiB/s)
+PageHeap: 5 page info:  3356076 /  3354188 a/f,   1888 ( 295.0 MiB) live,      341 allocs/s ( 53.2 MiB/s)
+PageHeap: 6 page info:  1718534 /  1718486 a/f,     48 (   9.0 MiB) live,      174 allocs/s ( 32.7 MiB/s)
+...
+```
+
+### GWP-ASan Status
+
+The GWP-ASan section displays information about allocations guarded by
+[GWP-ASan](gwp-asan.md).
+
+*   The number of successful and failed GWP-ASan allocations. If there are 0
+    successful and 0 failed allocations, GWP-ASan is probably disabled on your
+    binary. If there are a large number of failed allocations, it probably means
+    your sampling rate is too high, causing the guarded slots to be exhausted.
+    See
+    [GWP-ASan sampling rate](gwp-asan.md#what-should-i-set-the-sampling-rate-to).
+*   The number of "slots" currently allocated and quarantined. An allocated slot
+    contains an allocation that is still active (i.e., not freed) while a
+    quarantined slot has either not been used yet or contains an allocation that
+    was freed.
+*   The maximum number of slots that have been allocated at the same time. This
+    number is printed along with the allocated slot limit. If the maximum slots
+    allocated matches the limit, you may want to reduce your sampling rate to
+    avoid failed GWP-ASan allocations.
+
+```
+------------------------------------------------
+GWP-ASan Status
+------------------------------------------------
+Successful Allocations: 1823
+Failed Allocations: 0
+Slots Currently Allocated: 33
+Slots Currently Quarantined: 95
+Maximum Slots Allocated: 51 / 64
+```
+
+### Memory Requested From The OS
+
+The stats also report the amount of memory requested from the OS by mmap.
+
+Memory is also requested, but may not actually be backed by physical memory, so
+these stats should resemble the VSS of the application, not the RSS.
+
+```
+Low-level allocator stats:
+MmapSysAllocator: 18083741696 bytes (17246.0 MiB) allocated
+```
+
+## Temeraire
+
+### Introduction
+
+Temeraire (or Huge Page Aware Allocator) is a new page heap for TCMalloc that is
+hugepage aware. It is designed to better handle memory backed by hugepages -
+avoiding breaking them up. Since it is more elaborate code, it reports
+additional information.
+
+See the [Temeraire design doc](temeraire.md) for more complete information.
+
+### Summary Statistics
+
+The initial set of statistics from the Huge Page Aware Allocator are similar to
+the old page heap, and show a summary of the number of instances of each range
+of contiguous pages.
+
+```
+------------------------------------------------
+HugePageAware: 75 sizes;  938.8 MiB free; 1154.0 MiB unmapped
+------------------------------------------------
+ 1 pages * 86655 spans ~ 677.0 MiB;  677.0 MiB cum; unmapped:    0.0 MiB;    0.0 MiB cum
+ 2 pages *  3632 spans ~  56.8 MiB;  733.7 MiB cum; unmapped:    0.0 MiB;    0.0 MiB cum
+ 3 pages *   288 spans ~   6.8 MiB;  740.5 MiB cum; unmapped:    0.0 MiB;    0.0 MiB cum
+ 4 pages *   250 spans ~   7.8 MiB;  748.3 MiB cum; unmapped:    0.0 MiB;    0.0 MiB cum
+...
+```
+
+The first line indicates the number of different sizes of ranges, the total MiB
+available, and the total MiB of unmapped ranges. The next lines are per number
+of contiguous pages:
+
+*   The number of contiguous pages
+*   The number of spans of that number of pages
+*   The total number of MiB of that span size that are mapped.
+*   The cumulative total of the mapped pages.
+*   The total number of MiB of that span size that are unmapped.
+*   The cumulative total of the unmapped pages.
+
+### Per Component Information
+
+The Huge Page Aware Allocator has multiple places where pages of memory are
+held. More details of its workings can be found in
+[the Temeraire design doc](temeraire.md). There are four caches where pages of
+memory can be located:
+
+*   The filler, used for allocating ranges of a few TCMalloc pages in size.
+*   The region cache, used for allocating ranges of multiple pages.
+*   The huge cache which contains huge pages that are backed with memory.
+*   The huge page allocator which contains huge pages that are not backed by
+    memory.
+
+We get some summary information for the various caches, before we report
+detailed information for each of the caches.
+
+```
+Huge page aware allocator components:
+------------------------------------------------
+HugePageAware: breakdown of free / unmapped / used space:
+HugePageAware: filler 38825.2 MiB used,  938.8 MiB free,    0.0 MiB unmapped
+HugePageAware: region     0.0 MiB used,    0.0 MiB free,    0.0 MiB unmapped
+HugePageAware: cache    908.0 MiB used,    0.0 MiB free,    0.0 MiB unmapped
+HugePageAware: alloc      0.0 MiB used,    0.0 MiB free, 1154.0 MiB unmapped
+```
+
+The summary information tells us:
+
+*   The first column shows how much memory has been allocated from each of the
+    caches
+*   The second column indicates how much backed memory is available in each
+    cache.
+*   The third column indicates how much unmapped memory is available in each
+    cache.
+
+### Filler Cache
+
+The filler cache contains TCMalloc sized pages from within a single hugepage. So
+if we want a single TCMalloc page we will look for it in the filler.
+
+There are three sections of stats around the filler cache. The first section
+gives an indication of the number and state of the hugepages in the filler
+cache.
+
+```
+HugePageFiller: densely pack small requests into hugepages
+HugePageFiller: 19882 total, 8083 full, 11799 partial, 0 released (0 partially), 0 quarantined
+HugePageFiller: 120168 pages free in 19882 hugepages, 0.0236 free
+HugePageFiller: among non-fulls, 0.0398 free
+HugePageFiller: 499 used pages in subreleased hugepages (0 of them in partially released)
+HugePageFiller: 0 hugepages partially released, 0.0000 released
+HugePageFiller: 1.0000 of used pages hugepageable
+HugePageFiller: Since startup, 26159 pages subreleased, 345 hugepages broken
+```
+
+The summary stats are as follows:
+
+*   "total" refers to the total number of hugepages in the filler cache.
+*   "full" is the number of those hugepages that have no free pages (every
+    TCMalloc page in them is in use).
+*   "partial" is the remaining number of hugepages, i.e., those that still have
+    some free pages.
+*   "released" is the number of hugepages that are released - i.e., partially
+    unmapped. If partially released hugepages are enabled, the number in
+    parentheses shows the number of hugepages in this category.
+*   "quarantined" is a feature that has been disabled, so the result is
+    currently zero.
+
+The second section gives an indication of the number of pages in various states
+in the filler cache:
+
+*   "pages free" refers to the number of free TCMalloc pages in the filler, as
+    well as the ratio to the total number of hugepages.
+*   "among non-fulls" states this ratio to the number of non-full hugepages.
+*   "used pages" refers to the number of occupied pages in the different types
+    of partially unmapped hugepages.
+
+```
+HugePageFiller: fullness histograms
+
+HugePageFiller: # of regular hps with a<= # of free pages <b
+HugePageFiller: <  0<=  8083 <  1<=     6 <  2<=     1 <  3<=     1 <  4<=     0 < 16<=   103
+HugePageFiller: < 32<=     1 < 48<=     0 < 64<=     3 < 80<=     1 < 96<=     0 <112<=     0
+HugePageFiller: <128<=    28 <144<=     0 <160<=     0 <176<=     1 <192<=     0 <208<=     0
+HugePageFiller: <224<=     2 <240<=     0 <252<=     0 <253<=     0 <254<=     0 <255<=     0
+
+HugePageFiller: # of donated hps with a<= # of free pages <b
+HugePageFiller: <  0<=     0 <  1<=     0 <  2<=     0 <  3<=     0 <  4<=     0 < 16<=     0
+HugePageFiller: < 32<=     0 < 48<=     0 < 64<=     0 < 80<=     0 < 96<=     0 <112<=     0
+HugePageFiller: <128<=     1 <144<=     0 <160<=     0 <176<=     0 <192<=     0 <208<=     0
+HugePageFiller: <224<=     0 <240<=     0 <252<=     0 <253<=     0 <254<=     0 <255<=     0
+
+HugePageFiller: # of released hps with a<= # of free pages <b
+...
+
+HugePageFiller: # of regular hps with a<= longest free range <b
+HugePageFiller: <  0<=  8083 <  1<=     6 <  2<=     1 <  3<=     1 <  4<=     0 < 16<=   103
+HugePageFiller: < 32<=     1 < 48<=     0 < 64<=     4 < 80<=     0 < 96<=     0 <112<=     0
+HugePageFiller: <128<=    29 <144<=     0 <160<=     0 <176<=     0 <192<=     0 <208<=     1
+HugePageFiller: <224<=     1 <240<=     0 <252<=     0 <253<=     0 <254<=     0 <255<=     0
+
+HugePageFiller: # of released hps with a<= longest free range <b
+...
+
+HugePageFiller: # of regular hps with a<= # of allocations <b
+HugePageFiller: <  1<=     8 <  2<=     7 <  3<=    10 <  4<=    10 <  5<=    12 < 17<=    15
+HugePageFiller: < 33<=    12 < 49<=     2 < 65<=     0 < 81<=     2 < 97<=    17 <113<=   166
+HugePageFiller: <129<=    42 <145<=     6 <161<=    20 <177<=    48 <193<=   398 <209<=  1968
+HugePageFiller: <225<=  5062 <241<=   425 <253<=     0 <254<=     0 <255<=     0 <256<=     0
+
+HugePageFiller: # of released hps with a<= # of allocations <b
+...
+```
+
+Some sections have been elided here for space.
+
+There are three sections, split by three tracker types. They use the same
+reporting format and indicate:
+
+*   The available TCMalloc pages in the hugepages of the given type.
+*   The longest contiguous range of available TCMalloc pages in the hugepages of
+    the given type.
+*   The number of current allocations from each of the hugepages of the given
+    type. The ranges are offset by one here, because a hugepage can't have zero
+    allocations.
+
+The reporting format is the number of hugepages that are between a particular
+range for the characteristic of interest. For example:
+
+*   There are 3 regular hugepages with TCMalloc free pages >= 64 and < 80.
+*   There are 6 regular hugepages with a longest contiguous length of exactly 1
+    page.
+*   There are 2 regular hugepages with between 81 and 96 allocations.
+
+The three tracker types are "regular," "donated," and "released." "Regular" is
+by far the most common, and indicates regular memory in the filler.
+
+"Donated" is hugepages that have been donated to the filler from the tail of
+large (multi-hugepage) allocations, so that the leftover space can be packed
+with smaller allocations. But we prefer to use up all useable regular hugepages
+before touching the donated ones, which devolve to "regular" type once they are
+used. Because of this last property, donated hugepages always have only one
+allocation and their longest range equals their free space, so those histograms
+aren't shown.
+
+"Released" is partially released hugepages. Normally the entirety of a hugepage
+is backed by real RAM, but in partially released hugepages most of it has been
+returned to the OS. Because this defeats the primary goal of the hugepage-aware
+allocator, this is done rarely, and we only reuse partially-released hugepages
+for new allocations as a last resort.
+
+The final section shows a summary of the filler's state over the past 5 minute
+time period:
+
+```
+HugePageFiller: time series over 5 min interval
+
+HugePageFiller: realized fragmentation: 0.0 MiB
+HugePageFiller: minimum free pages: 0 (0 backed)
+HugePageFiller: at peak demand: 1774 pages (and 261 free, 13 unmapped)
+HugePageFiller: at peak demand: 8 hps (5 regular, 1 donated, 0 partial, 2 released)
+HugePageFiller: at peak hps: 1774 pages (and 261 free, 13 unmapped)
+HugePageFiller: at peak hps: 8 hps (5 regular, 1 donated, 0 partial, 2 released)
+```
+
+The first line shows the minimum number of free pages over the time interval,
+which is an indication of how much memory could have been "usefully" reclaimed
+(i.e., free for long enough that the OS would likely be able to use the memory
+for another process). The line shows both the total number of free pages in the
+filler (whether or not released to the OS) as well as only those that were
+backed by physical memory for the full 5-min interval. The realized
+fragmentation metric computed here uses a bounded window.
+
+The next two sections show the state of the filler at peak demand (i.e., when
+the maximum number of pages was in use) and at peak hps (i.e., when the maximum
+number of hugepages was in use). For each, we show the number of free (backed)
+pages as well as unmapped pages, and the number of the four different types of
+hugepages active at that time. If there are multiple peaks, we return the state
+at the latest one of them.
+
+If applicable, an additional section tracks the behavior that skips subreleasing
+hugepages if behind the recent demand requirement, which is either the peak
+within `--tcmalloc_skip_subrelease_interval`, or the sum of short-term
+fluctuation peak within `--tcmalloc_skip_subrelease_short_interval` and
+long-term trend within `--tcmalloc_skip_subrelease_long_interval`.
+
+**Note:** Conducting skip-subrelease using both short-term and long-term
+intervals is an experimental feature, and should not be enabled without
+understanding its performance tradeoffs.
+
+```
+HugePageFiller: Since the start of the execution, 0 subreleases (0 pages) were skipped due to either recent (0s) peaks, or the sum of short-term (0s) fluctuations and long-term (0s) trends..
+HugePageFiller: 100.0000% of decisions confirmed correct, 0 pending (100.0000% of pages, 0 pending), as per anticipated 300s realized fragmentation.
+```
+
+This shows how many times a page that was meant to be subreleased was not (note
+that this can refer to the same page multiple times if subrelease of this page
+would have been triggered multiple times). The percentage shows what fraction of
+times this decision would have been correct (i.e., if we decided not to
+subrelease a page because of the calculated demand requirement, did memory
+consumption increase again within the *next* five minutes?). "Pending" refers to
+subrelease decisions that were less than five minutes in the past and we
+therefore do not know yet whether or not they were correct. The correctness
+evaluation chooses to use the five minutes interval as it is the interval used
+for realized fragmentation.
+
+The skip-subrelease feature prioritizes using the recent peak if
+`--tcmalloc_skip_subrelease_interval` is configured, otherwise it uses the
+combination of the recent short-term fluctuation peak and long-term trend. The
+feature is disabled if all three intervals are zero.
+
+### Region Cache
+
+The region cache holds a chunk of memory from which spans of multiple TCMalloc
+pages can be allocated. The region cache may not be populated, and it can
+contain multiple regions.
+
+```
+HugeRegionSet: 1 MiB+ allocations best-fit into 1024 MiB slabs
+HugeRegionSet: 0 total regions
+HugeRegionSet: 0 hugepages backed out of 0 total
+HugeRegionSet: 0 pages free in backed region, 0.0000 free
+```
+
+The lines of output indicate:
+
+*   The size of each region in MiB - this is currently 1GiB.
+*   The total number of regions in the region cache, in the example above there
+    are no regions in the cache.
+*   The number of backed hugepages in the cache out of the total number of
+    hugepages in the region cache.
+*   The number of free TCMalloc pages in the regions, and as a ratio of the
+    number of backed pages.
+
+### Huge Cache
+
+The huge cache contains backed hugepages; it grows and shrinks in size depending
+on runtime conditions, attempting to hold onto backed memory ready to be
+provided to the application.
+
+```
+HugeCache: contains unused, backed hugepage(s)
+HugeCache: 0 / 10 hugepages cached / cache limit (0.053 hit rate, 0.436 overflow rate)
+HugeCache: 88880 MiB fast unbacked, 6814 MiB periodic
+HugeCache: 1234 MiB*s cached since startup
+HugeCache: recent usage range: 40672 min - 40672 curr -  40672 max MiB
+HugeCache: recent offpeak range: 0 min - 0 curr - 0 max MiB
+HugeCache: recent cache range: 0 min - 0 curr - 0 max MiB
+```
+
+The output shows the following information:
+
+*   The number of hugepages out of the maximum number of hugepages we will hold
+    in the huge cache. The hit rate is how often we get pages from the huge
+    cache vs getting them from the huge allocator. The overflow rate is the
+    number of times we added something to the huge cache causing it to exceed
+    its size limit.
+*   The fast unbacked is the cumulative amount of memory unbacked due to size
+    limitations; the periodic count is the cumulative amount of memory unbacked
+    by periodic calls to release unused memory.
+*   The amount of cumulative memory stored in HugeCache since the startup of the
+    process. In other words, the area under the cached-memory-vs-time curve.
+*   The usage range is the range minimum, current, maximum in MiB of memory
+    obtained from the huge cache.
+*   The off-peak range is the minimum, current, maximum cache size in MiB
+    compared to the peak cache size.
+*   The recent range is the minimum, current, maximum size of memory in MiB in
+    the huge cache.
+
+### Huge Allocator
+
+The huge allocator holds unmapped memory ranges. We allocate from here if we are
+unable to allocate from any of the caches.
+
+```
+HugeAllocator: contiguous, unbacked hugepage(s)
+HugeAddressMap: treap 5 / 10 nodes used / created
+HugeAddressMap: 256 contiguous hugepages available
+HugeAllocator: 20913 requested - 20336 in use = 577 hugepages free
+```
+
+The information reported here is:
+
+*   The number of nodes used and created to handle regions of memory.
+*   The size of the longest contiguous region of available hugepages.
+*   The number of hugepages requested from the system, the number of hugepages
+    in use, and the number of hugepages available in the cache.
+
+### Pageheap Summary Information
+
+The new pageheap reports some summary information:
+
+```
+HugePageAware: stats on allocation sizes
+HugePageAware: 4969003 pages live small allocation
+HugePageAware: 659 pages of slack on large allocations
+HugePageAware: largest seen allocation 45839 pages
+```
+
+These are:
+
+*   The number of live "small" TCMalloc pages allocated (these are less than
+    2MiB in size). [Note: the 2MiB size distinction is separate from the size
+    of hugepages]
+*   The number of TCMalloc pages which are left over from "large" allocations.
+    These allocations are larger than 2MiB in size, and are rounded to a
+    hugepage - the slack being the amount left over after rounding.
+*   The largest seen allocation request in TCMalloc pages.
+
+### Per Size Range Info:
+
+The per size range info is the same format as the old pageheap:
+
+*   The first column contains the number of pages (or the range of pages if the
+    bucket is wider than a single page).
+*   The second and third columns are the number of allocated and freed pages we
+    have seen of this size.
+*   The fourth column is the number of live allocations of this size.
+*   The fifth column is the size of those live allocations in MiB.
+*   The sixth column is the allocation rate in pages per second since the start
+    of the application.
+*   The seventh column is the allocation rate in MiB per second since the start
+    of the application.
+
+```
+HugePageAware: per-size information:
+HugePageAware: 1 page info: 5817510 / 3863506 a/f, 1954004 (15265.7 MiB) live,  16    allocs/s (   0.1 MiB/s)
+HugePageAware: 2 page info: 1828473 / 1254096 a/f,  574377 ( 8974.6 MiB) live,   5.03 allocs/s (   0.1 MiB/s)
+HugePageAware: 3 page info: 1464568 / 1227253 a/f,  237315 ( 5562.1 MiB) live,   4.03 allocs/s (   0.1 MiB/s)
+...
+```
+
+### Pageheap Age Information:
+
+The new pageheap allocator also reports information on the age of the various
+page ranges. In this example you can see that there was a large number of
+unmapped pages in the last minute.
+
+```
+------------------------------------------------
+HugePageAware cache entry age (count of pages in spans of a given size that have been idle for up to the given period of time)
+------------------------------------------------
+                              mean    <1s     1s     30s      1m     30m      1h     8+h
+Live span     TOTAL PAGES:  29317.6   145    549    1775   13059   13561   58622   32457
+Live span,        1 pages:  35933.7     0     55     685    6354    8111   43853   27597
+...
+Unmapped span TOTAL PAGES:     51.3     0      0  131072   16640       0       0       0
+Unmapped span,   >=64 pages:   51.3     0      0  131072   16640       0       0       0
+...
+```
--- a/src/third_party/tcmalloc/dist/docs/temeraire.md
+++ b/src/third_party/tcmalloc/dist/docs/temeraire.md
@ -0,0 +1,267 @@
+# Temeraire: Hugepage-Aware Allocator
+
+Andrew Hunter, [Chris Kennelly](mailto:ckennelly@google.com)
+
+*Notes on the name*[^cutie]*: the French word for "reckless" or "rash" :), and
+also the name of several large and powerful English warships. So: giant and
+powerful, but maybe a little dangerous. :)*
+
+This is a description of the design of the Hugepage-Aware Allocator. We have
+also published ["Beyond malloc efficiency to fleet efficiency: a hugepage-aware
+memory allocator" at OSDI 2021](https://research.google/pubs/pub50370/), which
+provides further details on the design, implementation, and rollout of
+Temeraire.
+
+## GOALS
+
+What do we want out of this redesign?
+
+*   Dramatic reduction in pageheap size. The pageheap in TCMalloc holds
+    substantial amounts of memory *after* its attempts to `MADV_DONTNEED` memory
+    back to the OS, due to internal fragmentation. We can recover a useful
+    fraction of this. In optimal cases, we see savings of over 90%. We do not
+    expect to achieve this generally, but a variety of synthetic loads suggest
+    50% of pageheap is a reasonable target savings.
+*   Dramatic increase in hugepage usage. The `madvise()` in
+    `ReleaseMemoryToSystem` is made without any thought to transparent
+    hugepages, and in practice prevents most fleet RAM from remaining as intact
+    hugepages. Services have seen substantial performance gains **from
+    disabling release** (and going to various other lengths to maximize hugepage
+    usage).
+*   *reasonable* allocation speed. This is really stating a non-goal: speed
+    parity with `PageHeap::New`. PageHeap is a relatively light consumer of
+    cycles. We are willing to accept a speed hit in actual page allocation in
+    exchange for better hugepage usage and space overhead. This is not free but
+    we think is well justified. Our goal is more to avoid catastrophic
+    regressions in speed. We intentionally accept two particular time hits:
+
+    *   much more aggressive releasing (of entire hugepages), leading to
+        increased costs for *backing* memory.
+    *   much more detailed (and expensive) choices of where to fulfill a
+        particular request.
+
+## DESIGN
+
+The algorithm -- as usual here, really, the data structures, which neatly
+determine our algorithm -- are nicely divided into components. Essentially, the
+path of an allocation goes like this:
+
+1.  If it is sufficiently small and we have the space we take an existing,
+    backed, partially empty hugepage and fit our allocation within it.
+1.  If it is too large to fit in a single hugepage, but too small to simply
+    round up to an integral number of hugepages, we best-fit it into one of
+    several larger slabs (whose allocations can cross hugepage boundaries). We
+    will back hugepages as needed for the allocation.
+1.  Sufficiently large allocations are rounded up to the nearest hugepage; the
+    extra space may be used for smaller allocations.
+
+Deallocation simply determines which of 1), 2), or 3) happened, and marks the
+corresponding object we allocated from as free.
+
+We will sketch the purpose and approach of each important part. Note that we
+have fairly detailed unit tests for each of these; one consequence on the
+implementations is that most components are templated on the
+`tcmalloc::SystemRelease` functions[^templated] as we make a strong attempt to
+be zero initializable where possible (sadly not everywhere).
+
+### `RangeTracker`
+
+`RangeTracker` and `Bitmap`, its underlying implementation, are helper classes
+used throughout the components below. They are both quite simple: `Bitmap` is a
+fixed-size (templated) bitmap with fast operations to set and clear bits and
+ranges of bits, with extensive support for searching and iterating. (Search and
+iteration support is why `std::bitset` is not usable here.)
+
+`RangeTracker` is essentially a `Bitmap` augmented with statistics on usage, in
+particular the longest range of contiguous free (false) bits. It provides
+methods to do best-fit allocation from free ranges (keeping the statistics
+correct).
+
+Both of these need to be quite fast as they're on nearly every
+allocation/deallocation path in `HugePageAwareAllocator` (in multiple ways)!
+They are reasonably optimized but probably still have more headroom.
+
+### HugeAllocator/HugeCache (the backing...)
+
+This is a set of classes that fulfills requests for backed (or unbacked) aligned
+hugepage ranges. We use this for sufficiently large (or nicely sized) requests,
+and to provide memory for the other components to break up into smaller chunks.
+
+#### `HugeAllocator`
+
+`HugeAllocator` is (nearly) trivial: it requests arbitrarily large
+hugepage-sized chunks from `SysAllocator`, keeps them unbacked, and tracks the
+available (unbacked) regions. Note that we do not need to be perfectly space
+efficient here: we only pay virtual memory and metadata, since *none* of the
+contents are backed. (We do make our best efforts to be relatively frugal,
+however, since there’s no need to inflate VSS by large factors.) Nor do we have
+to be particularly fast; this is well off any hot path, and we’re going to incur
+non-trivial backing costs as soon as we’re done assigning a range.
+
+The one tricky bit here is that we have to write some fiddly data structures by
+hand. We would have liked to implement this by grabbing large (gigabyte+) ranges
+from SysAllocator and using bitmaps or the like within them; however, too many
+tests have brittle reliance on details of `SysAllocator` that break if TCMalloc
+consistently requests (any considerable amount) more than the minimum needed to
+back current usage. So instead we need to track relatively small ranges. We've
+implemented a balanced tree that merges adjacent ranges; it is, as we said,
+fiddly, but reasonably efficient and not stunningly complicated.
+
+#### `HugeCache`
+
+This is a very simple wrapper on top of HugeAllocator. Its only purpose is to
+store some number of backed *single* hugepage ranges as a hot cache (in case we
+rapidly allocate and deallocate a 2 MiB chunk).
+
+It is not clear whether the cache is necessary, but we have it and it's not
+costing us much in complexity, and will help significantly in some potential
+antagonistic scenarios, so we favor keeping it.
+
+It currently attempts to estimate the optimal cache size based on past behavior.
+This may not really be needed, but it's a very minor feature to keep *or* drop.
+
+### `HugePageFiller` (the core…)
+
+`HugePageFiller` takes small requests (less than a hugepage) and attempts to
+pack them efficiently into hugepages. The vast majority of binaries use almost
+entirely small allocations[^conditional], so this is the dominant consumer of
+space and the most important component.
+
+Our goal here is to make our live allocations fit within the smallest set of
+hugepages possible, so that we can afford to keep all used hugepages fully
+backed (and aggressively free empty ones).
+
+The key challenge is avoiding fragmentation of free space within a hugepage:
+requests for 1 page are (usually) the most common, but 4, 8, or even 50+ page
+requests aren't unheard of. Many 1-page free regions won’t be useful here, and
+we'll have to request enormous numbers of new hugepages for anything large.
+
+Our solution is to build a heap-ordered data structure on *fragmentation*, not
+total amount free, in each hugepage. We use the **longest free range** (the
+biggest allocation a hugepage can fulfill!) as a measurement of fragmentation.
+In other words: if a hugepage has a free range of length 8, we *never* allocate
+from it for a smaller request (unless all hugepages available have equally long
+ranges). This carefully husbands long ranges for the requests that need them,
+and allows them to grow (as neighboring allocations are freed).
+
+Inside each equal-longest-free-range group, we order our heap by the **number of
+allocations** (chunked logarithmically). This helps favor allocating from fuller
+hugepages (of equal fragmented status). Number of allocations handily
+outperforms the total number of allocated pages here; our hypothesis is that
+since allocations of any size are equally likely[^radioactive] to become free at
+any given time, and we need all allocations on a hugepage to become free to make
+the hugepage empty, we’re better off hoping for 1 10-page allocation to become
+free (with some probability P) than 5 1-page allocations (with probability P^5).
+
+The `HugePageFiller` contains support for releasing parts of mostly-empty
+hugepages as a last resort.
+
+The actual implementation uses a fixed set of lists and a bitmap for
+acceleration.
+
+### `HugeRegion` (big but not enormous...)
+
+`HugeAllocator` covers very large requests and `HugePageFiller` tiny ones; what
+about the middle? In particular, requests that cannot fit into a hugepage, but
+should not be rounded to multiples? (For instance, 2.1 MiB.) These are woefully
+common.
+
+In any case, we certainly have to do something with "2.1 MiB"-type allocations,
+and rounding them up to 4 MiB will produce unacceptable slack (see below for
+what we can do with the filler here; it is wildly insufficient in current
+binaries which have the majority of their allocation in these large chunks.)
+
+The solution is a much larger "region" that best-fits these chunks into a large
+range of hugepages (i.e. allows them to cross a hugepage boundary). We keep a
+set of these regions, and allocate from the most fragmented one (much as with
+Filler above)! The main difference is that these regions are kept **un-backed**
+by default (whereas the Filler deals almost entirely with backed hugepages). We
+back hugepages on demand when they are used by a request hitting the region (and
+aggressively _unback_ them when they become empty again).
+
+A few important details:
+
+*   These regions are currently 1 GiB, which is very large!
+
+    The reason is this: suppose our entire binary allocates a huge number `N` of
+    requests of size `S` that are too big for the filler, but that don’t evenly
+    divide the region size `M` (say, 2.1 MiB :)) How much space will we waste?
+    Answer: we will allocate about `R = N / (M / S)` regions, with each region
+    storing `floor(M/S)` allocations. The tail will be unused. We can unback any
+    totally untouched hugepages, but suppose that `M/S` allocations just barely
+    touch the last hugepage in the region: we will then waste ~a full hugepage
+    per region, and thus waste `R` hugepages. Conclusion: the larger a region we
+    use, the less waste (in this case). Originally regions were 32 MiB, and this
+    effect was very noticeable. This also allows us to use very few regions in a
+    given binary, which means we can be less careful about how we organize the
+    set of regions.
+
+*   We don’t make *any* attempt, when allocating from a given region, to find an
+    already-backed but unused range. Nor do we prefer regions that have such
+    ranges.
+
+    This is basically a question of effort. We'd like to do this, but we don't
+    see any way to do it without making the data structure more complicated and
+    cumbersome. So far in tests it hasn't proved a major problem. (Note that
+    `RangeTracker` has a low-address bias, which will help somewhat here by
+    compacting allocations towards the low end of any region).
+
+Additional details on the design goals/tradeoffs are in the
+[Regions Are Not Optional](regions-are-not-optional.md) design doc.
+
+### `HugePageAwareAllocator` (putting it all together...)
+
+This class houses the above components and routes between them, in addition to
+interfacing with the rest of TCMalloc (the above classes don’t need or use
+Spans, for instance). This is mostly straightforward; two points are worth
+discussing.
+
+*   How do we choose which sub-allocator for a given request?
+
+    We use a size-based policy.
+
+    1.  Small allocations are handed directly to the filler; we add hugepages to
+        the filler as needed.
+    1.  For slightly larger allocations (still under a full hugepage), we *try*
+        the filler, but don’t grow it if there’s not currently space. Instead,
+        we look in the regions for free space. If neither the regions nor the
+        filler has space, we prefer growing the filler (since it comes in
+        smaller chunks!) The reasoning here is that if our binary only has
+        allocations of (say) ¾ a hugepage, we don’t want the filler to be giant
+        but ¼ empty; but in a more reasonable binary where we can easily pack
+        such allocations near smaller ones, we’d prefer to do so over using the
+        region.
+    1.  Allocations that won’t fit in a hugepage are just given to the regions
+        (or, for truly enormous ones, to `HugeAllocator` directly).
+
+The changeover point between 1) and 2) is just a tuning decision (any choice
+would produce a usable binary). Half a hugepage was picked arbitrarily; this
+seems to work well.
+
+*   How do we handle backing?
+
+Allocations from `HugeAllocator` or `HugeRegion` (some of the time) need to be
+backed; so do hugepages that grow the `HugePageFiller`. This isn’t free. Page
+heap allocation isn’t hugely expensive in practice, but it is under a lock and
+contention matters. We currently rely on access by the application to back
+memory, and assume returned memory has been backed.
+
+For accounting purposes, we do a bit of tracking whether a given allocation is
+being fulfilled from previously-unbacked memory.
+
+We do wire that information to the point we drop the pageheap lock; we then back
+it without producing lock contention. This made a noticeable performance
+difference when explicitly backing memory before returning it to the
+application.
+
+## Notes
+
+[^cutie]: Also the name of
+    [this cutie](https://lh3.googleusercontent.com/VXENOSfqH1L84VMwLVAUA7JIqQh7TYH-IZHLBalvVVuMUeD3w5rOVHPsIp97nYEgmKpQoxsHO-lieGouheNmifA2X6tOPTBleTbQc_WCZIrI_roU2K37iiHg9go6omp2ys0Y7cxYc9c6EWNaCYtKG1dEPyyYLULUarCex4oqwt8KgRl95rd3yKXC6YQeW-TWkDpK786ZaAA3vKJXqT5E-ArPxQccyPH13EAmHrltKatqihC7L4Ym5IfP42u58IJwC5bRnKMczm2WwUfipGDEOvymf63mPNKmGMka50AQV4VGrE7hW_Ateb2roCTGISgZIooBSRwK0PMjqV9hBLP5DmUG4ITSV4FlOI5iWOyMSNZV6Gz5T2FgNez08Wdn98tsEsN4_lPcjdZXyJuHeVRKxAawDwjkbWP3aieXDckHY-bJMt0QfyDhPWzSOpTxTALcZiwoC069K9SrBDVKEKowJ2Zag7OlbpROhqbagM5Wuo_nn6O27yWXpihc8Lptt-Vo_e8kQZ4N2RReby3bxNPdRyv2L8BrDCIWBO-iFk7GcYRd9ox7HSD-7Y0yH1FtMP0FZKD5a2raVmabMQrolhsjc-AfYHgD3xBkNo-uTJ8YnFpqjpTdZz_1=w2170-h1446-no),
+    the real reason for the choice.
+[^templated]: It will be possible, given recent improvements in constexpr usage,
+    to eliminate this in followups.
+[^conditional]: Here we mean "requests to the pageheap as filtered through
+    sampling, the central cache, etc"
+[^radioactive]: Well, no, this is false in our empirical data, but to first
+    order.
--- a/src/third_party/tcmalloc/dist/docs/tuning.md
+++ b/src/third_party/tcmalloc/dist/docs/tuning.md
@ -0,0 +1,214 @@
+# Performance Tuning TCMalloc
+
+## User-Accessible Controls
+
+There are three user accessible controls that we can use to performance tune
+TCMalloc:
+
+*   The logical page size for TCMalloc (4KiB, 8KiB, 32KiB, 256KiB)
+*   The per-thread or per-cpu cache sizes
+*   The rate at which memory is released to the OS
+
+None of these tuning parameters are clear wins, otherwise they would be the
+default. We'll discuss the advantages and disadvantages of changing them.
+
+### The Logical Page Size for TCMalloc:
+
+This is determined at compile time by linking in the appropriate version of
+TCMalloc. The page size indicates the unit in which TCMalloc manages memory. The
+default is in 8KiB chunks; there are larger options of 32KiB and 256KiB. There
+is also the 4KiB page size used by the small-but-slow allocator.
+
+A smaller page size allows TCMalloc to provide memory to an application with
+less waste. Waste comes about through two issues:
+
+*   Left-over memory when rounding larger requests to the page size (eg a
+    request for 62 KiB might get rounded to 64 KiB).
+*   Pages of memory that are stuck because they have a single in use allocation
+    on the page, and therefore cannot be repurposed to hold a different size of
+    allocation.
+
+The second of these points is worth elucidating. For small allocations TCMalloc
+will fit multiple objects onto a single page.
+
+So if you request 512 bytes, then an entire page will be devoted to 512 byte
+objects. If the size of that page is 4KiB we get 8 objects, if the size of that
+page is 256KiB we get 512 objects. That page can only be used for 512 byte
+objects until all the objects on the page have been freed.
+
+If you have 8 objects on a page, there's a reasonable chance that all 8 will
+become free at the same time, and we can repurpose the page for objects of a
+different size. If there's 512 objects on that page, then it is very unlikely
+that all the objects will become freed at the same time, so that page will
+probably never become entirely free and will probably hang around, potentially
+containing only a few in-use objects.
+
+The consequence of this is that large pages tend to lead to a larger memory
+footprint. There's also the issue that if you want one object of a size, you
+need to allocate a whole page.
+
+The advantages of managing objects using larger page sizes are:
+
+*   Objects of the same size are better clustered in memory. If you need 512 KiB
+    of 8 byte objects, then that's two 256 KiB pages, or 128 x 4 KiB pages. If
+    memory is largely backed by hugepages, then with large pages in the worst
+    case we can map the entire demand with two large pages, whereas small pages
+    could take up to 128 entries in the TLB.
+*   There's a structure called the `PageMap` which enables TCMalloc to lookup
+    information about any allocated memory. If we use large pages the pagemap
+    needs fewer entries and can be much smaller. This makes it more likely that
+    it is cache resident. However, sized delete substantially reduced the number
+    of times that we need to consult the pagemap, so the benefit from larger
+    pages is reduced.
+
+**Suggestion:** The default of 8KiB page sizes is probably good enough for most
+applications. However, if an application has a heap measured in GiB it may be
+worth looking at using large page sizes.
+
+**Suggestion:** Small-but-slow is *extremely* slow and should be used only where
+it is absolutely vital to minimize memory footprint over performance at all
+costs. Small-but-slow works by turning off and shrinking several of TCMalloc's
+caches, but this comes at a significant performance penalty.
+
+**Note:** Size-classes are determined on a per-page-size basis. So changing the
+page size will implicitly change the size-classes used. Size-classes are
+selected to be memory-efficient for the applications using that page size. If an
+application changes page size, there may be a performance or memory impact from
+the different selection of size-classes.
+
+### Per-thread/per-cpu Cache Sizes
+
+The default is for TCMalloc to run in per-cpu mode as this is faster; however,
+there are a few applications which have not yet transitioned. The plan is to
+move these across at some point soon.
+
+Increasing the size of the cache is an obvious way to improve performance. The
+larger the cache the less frequently memory needs to be fetched from the central
+caches. Returning memory from the cache is substantially faster than fetching
+from the central cache.
+
+The size of the per-cpu caches is controlled by
+`tcmalloc::MallocExtension::SetMaxPerCpuCacheSize`. This controls the limit for
+each CPU, so the total amount of memory for the application could be much
+larger than this. Memory on CPUs where the application is no longer able to run
+can be freed by calling `tcmalloc::MallocExtension::ReleaseCpuMemory`.
+
+The heterogeneous per-cpu cache optimization in TCMalloc dynamically sizes
+per-cpu caches so as to balance the miss rate across all the active and
+populated caches. It shuffles and reassigns the capacity from lightly used
+caches to the heavily used caches, using miss rate as the proxy for their usage.
+When enabled, the heavily used per-cpu caches may steal capacity from lightly
+used caches and grow beyond the limit set by `tcmalloc_max_per_cpu_cache_size`
+flag. This optimization is enabled by default in TCMalloc.
+
+Releasing memory held by unusable CPU caches is handled by
+`tcmalloc::MallocExtension::ProcessBackgroundActions`.
+
+In contrast `tcmalloc::MallocExtension::SetMaxTotalThreadCacheBytes` controls
+the *total* size of all thread caches in the application.
+
+**Suggestion:** The default cache size is typically sufficient, but cache size
+can be increased (or decreased) depending on the amount of time spent in
+TCMalloc code, and depending on the overall size of the application (a larger
+application can afford to cache more memory without noticeably increasing its
+overall size).
+
+### Memory Releasing
+
+`tcmalloc::MallocExtension::ReleaseMemoryToSystem` makes a request to release
+`n` bytes of memory to TCMalloc. This can keep the memory footprint of the
+application down to a minimal amount, however it should be considered that this
+just reduces the application down from its peak memory footprint over time, and
+does not make that peak memory footprint smaller.
+
+Using a background thread running
+`tcmalloc::MallocExtension::ProcessBackgroundActions()`, memory will be released
+from the page heap at the specified rate.
+
+There are two disadvantages of releasing memory aggressively:
+
+*   Memory that is unmapped may be immediately needed, and there is a cost to
+    faulting unmapped memory back into the application.
+*   Memory that is unmapped at small granularity will break up hugepages, and
+    this will cause some performance loss due to increased TLB misses.
+
+**Note:** Release rate is not a panacea for memory usage. Jobs should be
+provisioned for peak memory usage to avoid OOM errors. Setting a release rate
+may enable an application to exceed the memory limit for short periods of time
+without triggering an OOM. A release rate is also a good citizen behavior as it
+will enable the system to use spare capacity memory for applications which are
+under provisioned. However, it is not a substitute for setting appropriate
+memory requirements for the job.
+
+**Note:** Memory is released from the `PageHeap` and stranded per-cpu caches. It
+is not possible to release memory from other internal structures, like the
+`CentralFreeList`.
+
+**Suggestion:** The default release rate is probably appropriate for most
+applications. In situations where it is tempting to set a faster rate it is
+worth considering why there are memory spikes, since those spikes are likely to
+cause an OOM at some point.
+
+## System-Level Optimizations
+
+*   TCMalloc heavily relies on Transparent Huge Pages (THP). As of February
+    2020, we build and test with
+
+```
+/sys/kernel/mm/transparent_hugepage/enabled:
+    [always] madvise never
+
+/sys/kernel/mm/transparent_hugepage/defrag:
+    always defer [defer+madvise] madvise never
+
+/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none:
+    0
+```
+
+*   TCMalloc makes assumptions about the availability of virtual address space,
+    so that we can lay out allocations in certain ways. We build and test with
+
+```
+/proc/sys/vm/overcommit_memory:
+    1
+```
+
+## Build-Time Optimizations
+
+TCMalloc is built and tested in certain ways. These build-time options can
+improve performance:
+
+*   Statically-linking TCMalloc reduces function call overhead, by obviating the
+    need to call procedure linkage stubs in the procedure linkage table (PLT).
+*   Enabling
+    [sized deallocation from C++14](http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2013/n3778.html)
+    reduces deallocation costs when the size can be determined. Sized
+    deallocation is enabled with the `-fsized-deallocation` flag. This behavior
+    is enabled by default in GCC, but as of early 2020, is not enabled by
+    default on Clang even when compiling for C++14/C++17.
+
+    Some standard C++ libraries (such as
+    [libc++](https://reviews.llvm.org/rCXX345214)) will take advantage of sized
+    deallocation for their allocators as well, improving deallocation
+    performance in C++ containers.
+
+*   Aligning raw storage allocated with `::operator new` to 8 bytes by compiling
+    with `__STDCPP_DEFAULT_NEW_ALIGNMENT__ <= 8`. This smaller alignment
+    minimizes wasted memory for many common allocation sizes (24, 40, etc.)
+    which are otherwise rounded up to a multiple of 16 bytes. On many compilers,
+    this behavior is controlled by the `-fnew-alignment=...` flag.
+
+    When `__STDCPP_DEFAULT_NEW_ALIGNMENT__` is not specified (or is larger than
+    8 bytes), we use standard 16 byte alignments for `::operator new`. However,
+    for allocations under 16 bytes, we may return an object with a lower
+    alignment, as no object with a larger alignment requirement can be allocated
+    in the space.
+
+*   Optimizing failures of `operator new` by directly failing instead of
+    throwing exceptions. Because TCMalloc does not throw exceptions when
+    `operator new` fails, this can be used as a performance optimization for
+    many move constructors.
+
+    Within Abseil code, these direct allocation failures are enabled with the
+    Abseil build-time configuration macro
+    [`ABSL_ALLOCATOR_NOTHROW`](https://abseil.io/docs/cpp/guides/base#abseil-exception-policy).
--- a/src/third_party/tcmalloc/dist/tcmalloc/.clang-format
+++ b/src/third_party/tcmalloc/dist/tcmalloc/.clang-format
@ -0,0 +1,6 @@
+---
+Language:        Cpp
+BasedOnStyle:  Google
+DerivePointerAlignment: false
+PointerAlignment: Left
+...
--- a/src/third_party/tcmalloc/dist/tcmalloc/.github/CODEOWNERS
+++ b/src/third_party/tcmalloc/dist/tcmalloc/.github/CODEOWNERS
@ -0,0 +1,5 @@
+# Default owners
+* @ckennelly
+
+# Documentation
+docs/* @manshreck
--- a/src/third_party/tcmalloc/dist/tcmalloc/.github/workflows/ci.yml
+++ b/src/third_party/tcmalloc/dist/tcmalloc/.github/workflows/ci.yml
@ -0,0 +1,63 @@
+# Copyright 2022 The TCMalloc Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# CI workflow: runs the Bazel test suite once per compiler in the matrix.
+name: ci
+
+# Triggered by pushes to master and by every pull request.
+on:
+  push:
+    branches:
+    - master
+
+  pull_request:
+
+jobs:
+  Linux:
+    runs-on: ubuntu-latest
+    strategy:
+      # Keep the other compiler's job running when one of them fails.
+      fail-fast: false
+      matrix:
+        compiler:
+          - g++
+          - clang++
+
+    name: "Build/Test ${{matrix.compiler}}"
+    steps:
+    - name: Cancel previous
+      uses: styfle/cancel-workflow-action@0.8.0
+      with:
+        access_token: ${{ github.token }}
+
+    # Install both compilers used by the matrix.
+    - name: Prepare
+      run: |
+        sudo apt-get update -qq
+        sudo apt install -y g++ clang
+
+    - uses: actions/checkout@v2
+      with:
+        # 0 requests the full history rather than a shallow clone.
+        fetch-depth: 0
+
+    # The timestamp is only used to build the cache key below.
+    - name: Create Cache Timestamp
+      id: cache_timestamp
+      uses: nanzm/get-time-action@v1.1
+      with:
+        format: 'YYYY-MM-DD-HH-mm-ss'
+
+    # The key embeds the timestamp, so it differs from run to run; restore-keys
+    # lets a run fall back to an earlier cache with the same compiler prefix.
+    - name: Mount bazel cache
+      uses: actions/cache@v2
+      with:
+        path: "/home/runner/.cache/bazel"
+        key: bazelcache_${{matrix.compiler}}_${{ steps.cache_timestamp.outputs.time }}
+        restore-keys: bazelcache_${{matrix.compiler}}_
+
+    # CXX selects the matrix compiler for the Bazel build.
+    - name: Tests
+      run: CXX=${{matrix.compiler}} bazel test --test_output=errors //...
--- a/src/third_party/tcmalloc/dist/tcmalloc/BUILD
+++ b/src/third_party/tcmalloc/dist/tcmalloc/BUILD
--- a/src/third_party/tcmalloc/dist/tcmalloc/allocation_sample.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/allocation_sample.cc
@ -0,0 +1,52 @@
+// Copyright 2022 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tcmalloc/allocation_sample.h"
+
+#include <memory>
+
+#include "absl/time/clock.h"
+
+GOOGLE_MALLOC_SECTION_BEGIN
+namespace tcmalloc::tcmalloc_internal {
+
+// Builds the trace table for this sample and then registers the sample with
+// `list`. Registration comes last so that `mallocs_` is already set by the
+// time the list can reach this object.
+AllocationSample::AllocationSample(AllocationSampleList* list, absl::Time start)
+    : list_(list),
+      mallocs_(std::make_unique<StackTraceTable>(ProfileType::kAllocations)),
+      start_(start) {
+  list_->Add(this);
+}
+
+// Unregisters the sample from its list when the profile was never stopped.
+AllocationSample::~AllocationSample() {
+  // A non-null `mallocs_` means the token is being deleted before the profile
+  // was ended, so the sample is still linked into `list_`; unlink it here.
+  if (mallocs_ != nullptr) {
+    list_->Remove(this);
+  }
+}
+
+// Ends the profile and returns the collected traces as a Profile.
+//
+// If `mallocs_` is still set, the sample is unlinked from `list_` and the
+// table's duration is set to the time elapsed since `start_`. `mallocs_` is
+// then moved into the returned Profile.
+Profile AllocationSample::Stop() && {
+  // We need to remove ourselves from list_ before we mutate mallocs_;
+  //
+  // A concurrent call to AllocationSampleList::ReportMalloc can access mallocs_
+  // until we remove it from list_.
+  if (mallocs_) {
+    list_->Remove(this);
+    mallocs_->SetDuration(absl::Now() - start_);
+  }
+  return ProfileAccessor::MakeProfile(std::move(mallocs_));
+}
+
+}  // namespace tcmalloc::tcmalloc_internal
+GOOGLE_MALLOC_SECTION_END
--- a/src/third_party/tcmalloc/dist/tcmalloc/allocation_sample.h
+++ b/src/third_party/tcmalloc/dist/tcmalloc/allocation_sample.h
@ -0,0 +1,87 @@
+// Copyright 2022 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef TCMALLOC_ALLOCATION_SAMPLE_H_
+#define TCMALLOC_ALLOCATION_SAMPLE_H_
+
+#include "absl/base/dynamic_annotations.h"
+#include "absl/base/internal/spinlock.h"
+#include "absl/time/time.h"
+#include "tcmalloc/internal/logging.h"
+#include "tcmalloc/stack_trace_table.h"
+
+GOOGLE_MALLOC_SECTION_BEGIN
+namespace tcmalloc::tcmalloc_internal {
+
+class AllocationSampleList;
+
+// Token for one in-progress allocation profile. Construction registers the
+// sample with an AllocationSampleList; Stop() (or destruction, if the profile
+// was never stopped) unregisters it again.
+class AllocationSample final : public AllocationProfilingTokenBase {
+ public:
+  // Creates the trace table and adds this sample to `list`. `start` is the
+  // time that Stop() subtracts from absl::Now() to compute the duration.
+  AllocationSample(AllocationSampleList* list, absl::Time start);
+  ~AllocationSample() override;
+
+  // Unregisters from the list and returns the collected traces as a Profile.
+  Profile Stop() && override;
+
+ private:
+  // List this sample registers itself with and removes itself from.
+  AllocationSampleList* list_;
+  // Trace table filled in by AllocationSampleList::ReportMalloc; moved out by
+  // Stop().
+  std::unique_ptr<StackTraceTable> mallocs_;
+  // Start time of the profile, used by Stop() to compute its duration.
+  absl::Time start_;
+  // Intrusive link to the next sample; read and written by
+  // AllocationSampleList.
+  AllocationSample* next_ = nullptr;
+  friend class AllocationSampleList;
+};
+
+// Intrusive singly linked list of the AllocationSample objects that are
+// currently registered. Every operation holds `lock_` for its duration.
+class AllocationSampleList {
+ public:
+  constexpr AllocationSampleList() = default;
+
+  // Pushes `as` onto the front of the list.
+  void Add(AllocationSample* as) {
+    absl::base_internal::SpinLockHolder holder(&lock_);
+    as->next_ = first_;
+    first_ = as;
+  }
+
+  // Unlinks `as` from the list. The list is very short and this is nowhere
+  // near a hot path, so a linear walk is good enough. `as` must be in the
+  // list: the CHECK fires if the end is reached without finding it.
+  void Remove(AllocationSample* as) {
+    absl::base_internal::SpinLockHolder holder(&lock_);
+    // `link` always points at the pointer that refers to `cur`.
+    AllocationSample** link = &first_;
+    for (AllocationSample* cur = first_; cur != as; cur = cur->next_) {
+      CHECK_CONDITION(cur != nullptr);
+      link = &cur->next_;
+    }
+    *link = as->next_;
+  }
+
+  // Calls AddTrace(1.0, sample) on the trace table of every registered sample.
+  void ReportMalloc(const struct StackTrace& sample) {
+    absl::base_internal::SpinLockHolder holder(&lock_);
+    for (AllocationSample* cur = first_; cur != nullptr; cur = cur->next_) {
+      cur->mallocs_->AddTrace(1.0, sample);
+    }
+  }
+
+ private:
+  // Guard against any concurrent modifications on the list of allocation
+  // samples. Invoking `new` while holding this lock can lead to deadlock.
+  absl::base_internal::SpinLock lock_{
+      absl::kConstInit, absl::base_internal::SCHEDULE_KERNEL_ONLY};
+  // Head of the list; nullptr when no sample is registered.
+  AllocationSample* first_ ABSL_GUARDED_BY(lock_) = nullptr;
+};
+
+}  // namespace tcmalloc::tcmalloc_internal
+GOOGLE_MALLOC_SECTION_END
+
+#endif  // TCMALLOC_ALLOCATION_SAMPLE_H_
--- a/src/third_party/tcmalloc/dist/tcmalloc/allocation_sample_test.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/allocation_sample_test.cc
@ -0,0 +1,132 @@
+// Copyright 2022 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tcmalloc/allocation_sample.h"
+
+#include <stddef.h>
+
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "absl/base/thread_annotations.h"
+#include "absl/random/bit_gen_ref.h"
+#include "absl/random/random.h"
+#include "absl/synchronization/mutex.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "tcmalloc/internal/logging.h"
+#include "tcmalloc/static_vars.h"
+#include "tcmalloc/testing/thread_manager.h"
+
+namespace tcmalloc::tcmalloc_internal {
+namespace {
+
+// Stress test: several threads concurrently create, destroy and Stop()
+// AllocationSample objects attached to one AllocationSampleList while other
+// iterations report allocations to that list.
+TEST(AllocationSample, Threaded) {
+  // StackTraceTable uses a global allocator.  It must be initialized.
+  tc_globals.InitIfNecessary();
+
+  // This test exercises b/143623146 by ensuring that the state of the sample is
+  // not modified before it is removed from the linked list.
+  AllocationSampleList list;
+
+  // kMaxSamplers bounds the test's own bookkeeping vector; kMaxAllocations
+  // bounds how many linked-sample allocations may be live before the callback
+  // stops reporting.
+  const int kThreads = 5;
+  const int kMaxSamplers = 3;
+  const int kMaxAllocations = 100;
+  ThreadManager m;
+  // One random generator per thread, indexed by the thread id passed to the
+  // callback below.
+  std::vector<absl::BitGen> thread_states(kThreads);
+
+  struct GlobalState {
+    absl::Mutex mu;
+    std::vector<std::unique_ptr<AllocationSample>> samplers ABSL_GUARDED_BY(mu);
+  } global;
+
+  // Removes one sampler from `global.samplers` and returns it, or returns a
+  // null pointer when none are stored.
+  auto PopSample = [&](absl::BitGenRef rng) {
+    std::unique_ptr<AllocationSample> ret;
+
+    // Do our test bookkeeping separately, so we don't synchronize list
+    // externally.
+    absl::MutexLock l(&global.mu);
+    if (global.samplers.empty()) {
+      return ret;
+    }
+    // NOTE(review): absl::Uniform without an interval tag draws from the
+    // half-open range [lo, hi), so this looks like it never picks the last
+    // index directly (the last element is still the swap partner).  Harmless
+    // for the test, but confirm whether `size()` was intended as the bound.
+    size_t index = absl::Uniform<size_t>(rng, 0, global.samplers.size() - 1u);
+    std::swap(global.samplers[index], global.samplers.back());
+    ret = std::move(global.samplers.back());
+    global.samplers.pop_back();
+
+    CHECK_CONDITION(ret != nullptr);
+    return ret;
+  };
+
+  // NOTE(review): ThreadManager presumably invokes this callback repeatedly on
+  // each of the kThreads threads until Stop() is called -- confirm against
+  // tcmalloc/testing/thread_manager.h.  Each invocation picks one action at
+  // random.
+  m.Start(kThreads, [&](int thread) {
+    auto& state = thread_states[thread];
+    const double coin = absl::Uniform(state, 0., 1.0);
+
+    if (coin < 0.1) {
+      // Add a sampler.  This occurs implicitly in the AllocationSample
+      // constructor.
+      auto sampler = std::make_unique<AllocationSample>(&list, absl::Now());
+
+      // Do our test bookkeeping separately, so we don't synchronize list
+      // externally.
+      {
+        absl::MutexLock l(&global.mu);
+        if (global.samplers.size() < kMaxSamplers) {
+          // Add to the list.
+          global.samplers.push_back(std::move(sampler));
+        }
+      }
+
+      // If we didn't push it, we will unregister in ~AllocationSample.
+    } else if (coin < 0.2) {
+      std::unique_ptr<AllocationSample> sampler = PopSample(state);
+
+      // Remove a sample and allow its destructor to handle unregistering.
+      sampler.reset();
+    } else if (coin < 0.25) {
+      // Call Stop occasionally.
+      std::unique_ptr<AllocationSample> sampler = PopSample(state);
+
+      if (sampler) {
+        // Stop() is rvalue-qualified, hence the std::move; the returned
+        // Profile is discarded.
+        std::move(*sampler).Stop();
+      }
+    } else {
+      int allocations;
+      {
+        // StackTraceTable uses a global allocator, rather than one that is
+        // injected.  Consult the global state to see how many allocations are
+        // active.
+        absl::base_internal::SpinLockHolder h(&pageheap_lock);
+        allocations = tc_globals.linked_sample_allocator().stats().in_use;
+      }
+      // Stop reporting once enough entries are live so the tables stay small.
+      if (allocations >= kMaxAllocations) {
+        return;
+      }
+
+      // Report a synthetic sample to every registered AllocationSample.
+      StackTrace s{};
+      s.requested_size = 16;
+      s.allocated_size = 32;
+      list.ReportMalloc(s);
+    }
+  });
+
+  // Let the worker threads run briefly before stopping them.
+  absl::SleepFor(absl::Milliseconds(1));
+
+  m.Stop();
+}
+
+}  // namespace
+}  // namespace tcmalloc::tcmalloc_internal
--- a/src/third_party/tcmalloc/dist/tcmalloc/allocation_sampling.h
+++ b/src/third_party/tcmalloc/dist/tcmalloc/allocation_sampling.h
@ -0,0 +1,383 @@
+// Copyright 2022 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef TCMALLOC_ALLOCATION_SAMPLING_H_
+#define TCMALLOC_ALLOCATION_SAMPLING_H_
+
+#include <memory>
+#include <utility>
+
+#include "tcmalloc/cpu_cache.h"
+#include "tcmalloc/guarded_page_allocator.h"
+#include "tcmalloc/internal/logging.h"
+#include "tcmalloc/malloc_extension.h"
+#include "tcmalloc/pagemap.h"
+#include "tcmalloc/sampler.h"
+#include "tcmalloc/span.h"
+#include "tcmalloc/stack_trace_table.h"
+#include "tcmalloc/tcmalloc_policy.h"
+#include "tcmalloc/thread_cache.h"
+
+GOOGLE_MALLOC_SECTION_BEGIN
+namespace tcmalloc::tcmalloc_internal {
+
+// This function computes a profile that maps a live stack trace to
+// the number of bytes of central-cache memory pinned by an allocation
+// at that stack trace.
+// In the case when span is hosting >= 1 number of small objects (t.proxy !=
+// nullptr), we call span::Fragmentation() and read `span->allocated_`. It is
+// safe to do so since we hold the per-sample lock while iterating over sampled
+// allocations. It prevents the sampled allocation that has the proxy object to
+// complete deallocation, thus `proxy` can not be returned to the span yet. It
+// thus prevents the central free list to return the span to the page heap.
+template <typename State>
+static std::unique_ptr<const ProfileBase> DumpFragmentationProfile(
+    State& state) {
+  auto table = std::make_unique<StackTraceTable>(ProfileType::kFragmentation);
+  state.sampled_allocation_recorder().Iterate(
+      [&state, &table](const SampledAllocation& sample) {
+        const StackTrace& trace = sample.sampled_stack;
+
+        // Without a proxy there is just one object per span, and neighboring
+        // spans can be released back to the system, so no fragmentation is
+        // charged to this sampled object.
+        if (t_has_no_proxy(trace)) return;
+
+        // Look up the span on which the proxy lives so its co-residents can
+        // be examined.
+        Span* span =
+            state.pagemap().GetDescriptor(PageIdContaining(trace.proxy));
+        if (span == nullptr) {
+          // Avoid crashes in production mode code, but report in tests.
+          ASSERT(span != nullptr);
+          return;
+        }
+
+        const double fragmentation = span->Fragmentation(trace.allocated_size);
+        if (!(fragmentation > 0)) return;
+
+        // Associate the memory warmth with the actual object, not the proxy.
+        // The residency information (trace.span_start_address) is likely not
+        // very useful, but we might as well pass it along.
+        table->AddTrace(fragmentation, trace);
+      });
+  return table;
+}
+
+// Builds a kHeap profile containing the stack of every currently recorded
+// sampled allocation, each added with a count of 1.0.
+template <typename State>
+static std::unique_ptr<const ProfileBase> DumpHeapProfile(State& state) {
+  auto heap_table = std::make_unique<StackTraceTable>(ProfileType::kHeap);
+  auto&& recorder = state.sampled_allocation_recorder();
+  recorder.Iterate([&heap_table](const SampledAllocation& sample) {
+    heap_table->AddTrace(1.0, sample.sampled_stack);
+  });
+  return heap_table;
+}
+
+// Per-thread Sampler instance.  ABSL_CONST_INIT requires it to be
+// constant-initialized and ABSL_ATTRIBUTE_INITIAL_EXEC requests the
+// initial-exec TLS model for it.
+ABSL_CONST_INIT static thread_local Sampler thread_sampler_
+    ABSL_ATTRIBUTE_INITIAL_EXEC;
+
+// Returns the address of the calling thread's `thread_sampler_`.
+inline Sampler* GetThreadSampler() { return &thread_sampler_; }
+
+// Maps a GuardedStatus to whether a guarded allocation should be attempted.
+// The switch stays exhaustive so that a newly added enumerator has to be
+// classified here explicitly.
+inline bool ShouldGuardingBeAttempted(
+    Profile::Sample::GuardedStatus guarded_status) {
+  using GuardedStatus = Profile::Sample::GuardedStatus;
+  switch (guarded_status) {
+    // Statuses for which guarding goes ahead.
+    case GuardedStatus::Guarded:
+    case GuardedStatus::Required:
+    case GuardedStatus::Requested:
+      return true;
+    // Every remaining status is a reason not to guard.
+    case GuardedStatus::NotAttempted:
+    case GuardedStatus::Unknown:
+    case GuardedStatus::Filtered:
+    case GuardedStatus::MProtectFailed:
+    case GuardedStatus::NoAvailableSlots:
+    case GuardedStatus::TooSmall:
+    case GuardedStatus::RateLimited:
+    case GuardedStatus::Disabled:
+    case GuardedStatus::LargerThanOnePage:
+      return false;
+  }
+  // Values outside the enumerators are treated as "do not guard".
+  return false;
+}
+
+// Attempts a guarded allocation for a sampled object.  Returns an
+// AllocWithStatus: when guarding is not attempted (allocation is not exactly
+// one page, or the thread sampler gives a reason not to guard) the pointer is
+// nullptr and the status names the reason; otherwise the result of
+// GuardedPageAllocator::Allocate() is returned as-is.
+template <typename State>
+static GuardedPageAllocator::AllocWithStatus TrySampleGuardedAllocation(
+    State& state, size_t size, size_t alignment, Length num_pages) {
+  using GuardedStatus = Profile::Sample::GuardedStatus;
+
+  if (num_pages != Length(1)) {
+    return {nullptr, GuardedStatus::LargerThanOnePage};
+  }
+
+  const GuardedStatus sampler_verdict =
+      GetThreadSampler()->ShouldSampleGuardedAllocation();
+  if (ShouldGuardingBeAttempted(sampler_verdict)) {
+    // The num_pages == 1 constraint ensures that size <= kPageSize.  And
+    // since alignments above kPageSize cause size_class == 0, we're also
+    // guaranteed alignment <= kPageSize
+    //
+    // In all cases kPageSize <= GPA::page_size_, so Allocate's preconditions
+    // are met.
+    return state.guardedpage_allocator().Allocate(size, alignment);
+  }
+
+  // The sampler supplied a reason not to guard; pass it back to the caller.
+  return {nullptr, sampler_verdict};
+}
+
+// ShouldSampleAllocation() is called when an allocation of the given requested
+// size is in progress. It returns the sampling weight of the allocation if it
+// should be "sampled," and 0 otherwise. See SampleifyAllocation().
+//
+// Sampling is done based on requested sizes and later unskewed during profile
+// generation.
+inline size_t ShouldSampleAllocation(size_t size) {
+  // Forward the requested size to the calling thread's sampler and hand its
+  // answer straight back.
+  Sampler* const sampler = GetThreadSampler();
+  return sampler->RecordAllocation(size);
+}
+
+// Returns the object `ptr` of class `size_class` to the allocator's caches:
+// the per-CPU cache when in use, else the calling thread's ThreadCache if it
+// exists, else directly the transfer cache.
+template <typename State>
+ABSL_ATTRIBUTE_NOINLINE static inline void FreeProxyObject(State& state,
+                                                           void* ptr,
+                                                           size_t size_class) {
+  if (ABSL_PREDICT_TRUE(UsePerCpuCache(state))) {
+    state.cpu_cache().Deallocate(ptr, size_class);
+    return;
+  }
+
+  ThreadCache* const thread_cache = ThreadCache::GetCacheIfPresent();
+  if (ABSL_PREDICT_TRUE(thread_cache)) {
+    thread_cache->Deallocate(ptr, size_class);
+    return;
+  }
+
+  // This thread has no thread cache (not created yet, or already gone), so
+  // hand the object directly to the transfer cache.
+  state.transfer_cache().InsertRange(size_class, absl::Span<void*>(&ptr, 1));
+}
+
+// Performs sampling for already occurred allocation of object.
+//
+// For very small object sizes, object is used as 'proxy' and full
+// page with sampled marked is allocated instead.
+//
+// For medium-sized objects that have single instance per span,
+// they're simply freed and fresh page span is allocated to represent
+// sampling.
+//
+// For large objects (i.e. allocated with do_malloc_pages) they are
+// also fully reused and their span is marked as sampled.
+//
+// Note that do_free_with_size assumes sampled objects have
+// page-aligned addresses. Please change both functions if need to
+// invalidate the assumption.
+//
+// Note that size_class might not match requested_size in case of
+// memalign. I.e. when larger than requested allocation is done to
+// satisfy alignment constraint.
+//
+// In case of out-of-memory condition when allocating span or
+// stacktrace struct, this function simply cheats and returns original
+// object. As if no sampling was requested.
+// Parameters (as enforced by the CHECK_CONDITION below):
+//   * small path: size_class != 0, obj != nullptr, span == nullptr;
+//   * large path: size_class == 0, obj == nullptr, span != nullptr.
+// `weight` is recorded in the stack trace and used to estimate how many
+// allocations this sample represents.  `capacity`, when non-null, receives the
+// allocated size that is reported for this sample.
+//
+// Returns the guarded allocation when one was made, otherwise the start
+// address of the sampled span; on span allocation failure returns `obj`
+// unchanged.
+template <typename State, typename Policy>
+static void* SampleifyAllocation(State& state, Policy policy,
+                                 size_t requested_size, size_t weight,
+                                 size_t size_class, void* obj, Span* span,
+                                 size_t* capacity) {
+  CHECK_CONDITION((size_class != 0 && obj != nullptr && span == nullptr) ||
+                  (size_class == 0 && obj == nullptr && span != nullptr));
+
+  StackTrace stack_trace;
+  stack_trace.proxy = nullptr;
+  stack_trace.requested_size = requested_size;
+  // Grab the stack trace outside the heap lock.
+  stack_trace.depth = absl::GetStackTrace(stack_trace.stack, kMaxStackDepth, 0);
+
+  // requested_alignment = 1 means 'small size table alignment was used'
+  // Historically this is reported as requested_alignment = 0
+  stack_trace.requested_alignment = policy.align();
+  if (stack_trace.requested_alignment == 1) {
+    stack_trace.requested_alignment = 0;
+  }
+
+  // A non-null `capacity` marks the allocation as size-returning.
+  stack_trace.requested_size_returning = capacity != nullptr;
+  stack_trace.access_hint = static_cast<uint8_t>(policy.access());
+  stack_trace.weight = weight;
+
+  // Stays {nullptr, NotAttempted} on the large path, where guarding is never
+  // tried.
+  GuardedPageAllocator::AllocWithStatus alloc_with_status{
+      nullptr, Profile::Sample::GuardedStatus::NotAttempted};
+
+  if (size_class != 0) {
+    ASSERT(size_class == state.pagemap().sizeclass(PageIdContaining(obj)));
+
+    stack_trace.allocated_size = state.sizemap().class_to_size(size_class);
+    stack_trace.cold_allocated = IsExpandedSizeClass(size_class);
+
+    // If the caller didn't provide a span, allocate one:
+    Length num_pages = BytesToLengthCeil(stack_trace.allocated_size);
+    alloc_with_status = TrySampleGuardedAllocation(
+        state, requested_size, stack_trace.requested_alignment, num_pages);
+    if (alloc_with_status.status == Profile::Sample::GuardedStatus::Guarded) {
+      ASSERT(IsSampledMemory(alloc_with_status.alloc));
+      const PageId p = PageIdContaining(alloc_with_status.alloc);
+      // Creating the Span and publishing it in the pagemap is done under
+      // pageheap_lock.
+      absl::base_internal::SpinLockHolder h(&pageheap_lock);
+      span = Span::New(p, num_pages);
+      state.pagemap().Set(p, span);
+      // If we report capacity back from a size returning allocation, we can not
+      // report the allocated_size, as we guard the size to 'requested_size',
+      // and we maintain the invariant that GetAllocatedSize() must match the
+      // returned size from size returning allocations. So in that case, we
+      // report the requested size for both capacity and GetAllocatedSize().
+      if (capacity) stack_trace.allocated_size = requested_size;
+    } else if ((span = state.page_allocator().New(
+                    num_pages, 1, MemoryTag::kSampled)) == nullptr) {
+      // Could not get a span for the sample: give up on sampling and return
+      // the original object, reporting its size-class size as the capacity.
+      if (capacity) *capacity = stack_trace.allocated_size;
+      return obj;
+    }
+
+    size_t span_size =
+        Length(state.sizemap().class_to_pages(size_class)).in_bytes();
+    size_t objects_per_span = span_size / stack_trace.allocated_size;
+
+    // When the size class packs several objects per span, keep `obj` alive as
+    // the proxy; clearing `obj` prevents it from being freed at the end.
+    if (objects_per_span != 1) {
+      ASSERT(objects_per_span > 1);
+      stack_trace.proxy = obj;
+      obj = nullptr;
+    }
+  } else {
+    // Set allocated_size to the exact size for a page allocation.
+    // NOTE: if we introduce gwp-asan sampling / guarded allocations
+    // for page allocations, then we need to revisit do_malloc_pages as
+    // the current assumption is that only class sized allocs are sampled
+    // for gwp-asan.
+    stack_trace.allocated_size = span->bytes_in_span();
+    stack_trace.cold_allocated = IsColdMemory(span->start_address());
+  }
+  if (capacity) *capacity = stack_trace.allocated_size;
+
+  ASSERT(span != nullptr);
+
+  // fetch_add returns the previous counter value; the handle is that value
+  // plus one.
+  stack_trace.sampled_alloc_handle =
+      state.sampled_alloc_handle_generator.fetch_add(
+          1, std::memory_order_relaxed) +
+      1;
+  stack_trace.span_start_address = span->start_address();
+  stack_trace.allocation_time = absl::Now();
+  stack_trace.guarded_status = static_cast<int>(alloc_with_status.status);
+
+  // How many allocations does this sample represent, given the sampling
+  // frequency (weight) and its size.
+  const double allocation_estimate =
+      static_cast<double>(weight) / (requested_size + 1);
+
+  // Adjust our estimate of internal fragmentation.
+  ASSERT(requested_size <= stack_trace.allocated_size);
+  if (requested_size < stack_trace.allocated_size) {
+    state.sampled_internal_fragmentation_.Add(
+        allocation_estimate * (stack_trace.allocated_size - requested_size));
+  }
+
+  // Both the allocation and the deallocation sample collections are told
+  // about the new sample.
+  state.allocation_samples.ReportMalloc(stack_trace);
+
+  state.deallocation_samples.ReportMalloc(stack_trace);
+
+  // The SampledAllocation object is visible to readers after this. Readers only
+  // care about its various metadata (e.g. stack trace, weight) to generate the
+  // heap profile, and won't need any information from Span::Sample() next.
+  SampledAllocation* sampled_allocation =
+      state.sampled_allocation_recorder().Register(std::move(stack_trace));
+  // No pageheap_lock required. The span is freshly allocated and no one else
+  // can access it. It is visible after we return from this allocation path.
+  span->Sample(sampled_allocation);
+
+  state.peak_heap_tracker().MaybeSaveSample();
+
+  // `obj` is still set only when it was not kept as the proxy; in that case
+  // the original object is no longer needed and is returned to the caches.
+  if (obj != nullptr) {
+    // We are not maintaining precise statistics on malloc hit/miss rates at our
+    // cache tiers.  We can deallocate into our ordinary cache.
+    ASSERT(size_class != 0);
+    FreeProxyObject(state, obj, size_class);
+  }
+  return (alloc_with_status.alloc != nullptr) ? alloc_with_status.alloc
+                                              : span->start_address();
+}
+
+// If `span` carries a sampled allocation, removes the sample: unregisters it
+// from the recorder, reverses its contribution to the internal-fragmentation
+// estimate, reports the free to the deallocation samples and, when the sample
+// had a proxy object, frees that proxy.  Does nothing when `span->Unsample()`
+// returns null.  `ptr` is not read by this function.
+template <typename State>
+inline void MaybeUnsampleAllocation(State& state, void* ptr, Span* span) {
+  // No pageheap_lock required. The sampled span should be unmarked and have its
+  // state cleared only once. External synchronization when freeing is required;
+  // otherwise, concurrent writes here would likely report a double-free.
+  if (SampledAllocation* sampled_allocation = span->Unsample()) {
+    // Copy everything needed out of the sample before it is unregistered
+    // below.
+    void* const proxy = sampled_allocation->sampled_stack.proxy;
+    const size_t weight = sampled_allocation->sampled_stack.weight;
+    const size_t requested_size =
+        sampled_allocation->sampled_stack.requested_size;
+    const size_t allocated_size =
+        sampled_allocation->sampled_stack.allocated_size;
+    const size_t alignment =
+        sampled_allocation->sampled_stack.requested_alignment;
+    // How many allocations does this sample represent, given the sampling
+    // frequency (weight) and its size.
+    const double allocation_estimate =
+        static_cast<double>(weight) / (requested_size + 1);
+    AllocHandle sampled_alloc_handle =
+        sampled_allocation->sampled_stack.sampled_alloc_handle;
+    state.sampled_allocation_recorder().Unregister(sampled_allocation);
+
+    // Adjust our estimate of internal fragmentation.
+    ASSERT(requested_size <= allocated_size);
+    if (requested_size < allocated_size) {
+      // Same formula as used when the sample was created, here converted to
+      // size_t.
+      const size_t sampled_fragmentation =
+          allocation_estimate * (allocated_size - requested_size);
+
+      // Check against wraparound
+      ASSERT(state.sampled_internal_fragmentation_.value() >=
+             sampled_fragmentation);
+      // NOTE(review): unary minus on a size_t yields the wrapped unsigned
+      // value; this relies on Add()'s parameter type turning it back into a
+      // subtraction -- confirm against the counter's declaration.
+      state.sampled_internal_fragmentation_.Add(-sampled_fragmentation);
+    }
+
+    state.deallocation_samples.ReportFree(sampled_alloc_handle);
+
+    if (proxy) {
+      // Recompute the proxy's size class from its recorded size, alignment,
+      // NUMA partition and hot/cold access so it can be freed normally.
+      const auto policy = CppPolicy().InSameNumaPartitionAs(proxy);
+      size_t size_class;
+      if (AccessFromPointer(proxy) == AllocationAccess::kCold) {
+        size_class = state.sizemap().SizeClass(
+            policy.AccessAsCold().AlignAs(alignment), allocated_size);
+      } else {
+        size_class = state.sizemap().SizeClass(
+            policy.AccessAsHot().AlignAs(alignment), allocated_size);
+      }
+      ASSERT(size_class == state.pagemap().sizeclass(PageIdContaining(proxy)));
+      FreeProxyObject(state, proxy, size_class);
+    }
+  }
+}
+
+// Samples an allocation that already owns `span`: forwards to
+// SampleifyAllocation() with no size class and no object.
+template <typename State, typename Policy, typename CapacityPtr>
+static void* SampleLargeAllocation(State& state, Policy policy,
+                                   size_t requested_size, size_t weight,
+                                   Span* span, CapacityPtr capacity) {
+  constexpr size_t kNoSizeClass = 0;
+  void* const no_object = nullptr;
+  return SampleifyAllocation(state, policy, requested_size, weight,
+                             kNoSizeClass, no_object, span, capacity);
+}
+
+// Samples a size-class allocation `obj`: forwards to SampleifyAllocation()
+// with no span, leaving it to that function to obtain one.
+template <typename State, typename Policy, typename CapacityPtr>
+static void* SampleSmallAllocation(State& state, Policy policy,
+                                   size_t requested_size, size_t weight,
+                                   size_t size_class, void* obj,
+                                   CapacityPtr capacity) {
+  Span* const no_span = nullptr;
+  return SampleifyAllocation(state, policy, requested_size, weight, size_class,
+                             obj, no_span, capacity);
+}
+
+}  // namespace tcmalloc::tcmalloc_internal
+GOOGLE_MALLOC_SECTION_END
+
+#endif  // TCMALLOC_ALLOCATION_SAMPLING_H_
--- a/src/third_party/tcmalloc/dist/tcmalloc/arena.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/arena.cc
@ -0,0 +1,85 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tcmalloc/arena.h"
+
+#include <new>
+
+#include "tcmalloc/internal/logging.h"
+#include "tcmalloc/static_vars.h"
+#include "tcmalloc/system-alloc.h"
+
+GOOGLE_MALLOC_SECTION_BEGIN
+namespace tcmalloc {
+namespace tcmalloc_internal {
+
+// Carves `bytes` bytes, aligned to `alignment`, out of the current arena
+// block.  When the block cannot satisfy the request a new block of at least
+// kAllocIncrement bytes is obtained from the system; whatever was left of the
+// old block is accounted as unavailable.  Crashes when the system allocation
+// fails, so the result is never null.
+void* Arena::Alloc(size_t bytes, std::align_val_t alignment) {
+  size_t align = static_cast<size_t>(alignment);
+  ASSERT(align > 0);
+  // Set when the current block is too small to even hold the alignment
+  // padding, in which case a fresh block is required.
+  bool padding_exhausts_block = false;
+  {  // First we need to move up to the correct alignment.
+    const size_t misalignment =
+        reinterpret_cast<uintptr_t>(free_area_) % align;
+    const size_t alignment_bytes =
+        misalignment != 0 ? align - misalignment : 0;
+    if (alignment_bytes <= free_avail_) {
+      free_area_ += alignment_bytes;
+      free_avail_ -= alignment_bytes;
+      bytes_allocated_ += alignment_bytes;
+    } else {
+      // free_avail_ is unsigned: subtracting more than it holds would wrap
+      // around, make the exhausted block look enormous and hand out memory
+      // past its end.  Leave the block untouched and replace it below; its
+      // leftover bytes are then counted as unavailable.
+      padding_exhausts_block = true;
+    }
+  }
+  char* result;
+  if (padding_exhausts_block || free_avail_ < bytes) {
+    size_t ask = bytes > kAllocIncrement ? bytes : kAllocIncrement;
+    // TODO(b/171081864): Arena allocations should be made relatively
+    // infrequently.  Consider tagging this memory with sampled objects which
+    // are also infrequently allocated.
+    //
+    // In the meantime it is important that we use the current NUMA partition
+    // rather than always using a particular one because it's possible that any
+    // single partition we choose might only contain nodes that the process is
+    // unable to allocate from due to cgroup restrictions.
+    MemoryTag tag;
+    const auto& numa_topology = tc_globals.numa_topology();
+    if (numa_topology.numa_aware()) {
+      tag = NumaNormalTag(numa_topology.GetCurrentPartition());
+    } else {
+      tag = MemoryTag::kNormal;
+    }
+
+    // NOTE(review): the new block is requested with kPageSize alignment only;
+    // an `align` that the returned address is not a multiple of is not
+    // re-padded here (unchanged from before) and trips the ASSERT below.
+    auto [ptr, actual_size] = SystemAlloc(ask, kPageSize, tag);
+    free_area_ = reinterpret_cast<char*>(ptr);
+    if (ABSL_PREDICT_FALSE(free_area_ == nullptr)) {
+      Crash(kCrash, __FILE__, __LINE__,
+            "FATAL ERROR: Out of memory trying to allocate internal tcmalloc "
+            "data (bytes, object-size); is something preventing mmap from "
+            "succeeding (sandbox, VSS limitations)?",
+            kAllocIncrement, bytes);
+    }
+    SystemBack(free_area_, actual_size);
+
+    // We've discarded the previous free_area_, so any bytes that were
+    // unallocated are effectively inaccessible to future allocations.
+    bytes_unavailable_ += free_avail_;
+    blocks_++;
+
+    free_avail_ = actual_size;
+  }
+
+  ASSERT(reinterpret_cast<uintptr_t>(free_area_) % align == 0);
+  result = free_area_;
+  free_area_ += bytes;
+  free_avail_ -= bytes;
+  bytes_allocated_ += bytes;
+  return reinterpret_cast<void*>(result);
+}
+
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
+GOOGLE_MALLOC_SECTION_END
--- a/src/third_party/tcmalloc/dist/tcmalloc/arena.h
+++ b/src/third_party/tcmalloc/dist/tcmalloc/arena.h
@ -0,0 +1,107 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef TCMALLOC_ARENA_H_
+#define TCMALLOC_ARENA_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <new>
+
+#include "absl/base/attributes.h"
+#include "absl/base/thread_annotations.h"
+#include "tcmalloc/common.h"
+
+GOOGLE_MALLOC_SECTION_BEGIN
+namespace tcmalloc {
+namespace tcmalloc_internal {
+
+// Snapshot of an Arena's accounting counters, as returned by Arena::stats().
+// All fields are byte counts except `blocks`.
+struct ArenaStats {
+  // The number of bytes allocated and in-use by calls to Alloc().
+  size_t bytes_allocated;
+  // The number of bytes currently reserved for future calls to Alloc().
+  size_t bytes_unallocated;
+  // The number of bytes lost and unavailable to calls to Alloc() due to
+  // inefficiencies in Arena.
+  size_t bytes_unavailable;
+  // The number of allocated bytes that have subsequently become non-resident,
+  // e.g. due to the slab being resized. Note that these bytes are disjoint from
+  // the ones counted in `bytes_allocated`.
+  size_t bytes_nonresident;
+
+  // The number of blocks allocated by the Arena.
+  size_t blocks;
+};
+
+// Bump allocator for tcmalloc's own internal data structures (spans,
+// profiles, etc.).  There is no way to give memory back: the arena only ever
+// grows.  All state is guarded by pageheap_lock.
+class Arena {
+ public:
+  constexpr Arena() {}
+
+  // Hands out "bytes" bytes aligned to "alignment".  Never returns null; a
+  // failed allocation crashes instead.  pageheap_lock must be held.
+  ABSL_ATTRIBUTE_RETURNS_NONNULL void* Alloc(
+      size_t bytes, std::align_val_t alignment = kAlignment)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
+
+  // Applies signed deltas to the allocated and non-resident byte counters.
+  // Neither counter may be driven below zero.
+  void UpdateAllocatedAndNonresident(int64_t allocated, int64_t nonresident)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
+    ASSERT(static_cast<int64_t>(bytes_allocated_) + allocated >= 0);
+    ASSERT(static_cast<int64_t>(bytes_nonresident_) + nonresident >= 0);
+    bytes_allocated_ += allocated;
+    bytes_nonresident_ += nonresident;
+  }
+
+  // Takes a snapshot of the arena's counters.
+  ArenaStats stats() const ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
+    // Initializers follow ArenaStats' member order: bytes_allocated,
+    // bytes_unallocated, bytes_unavailable, bytes_nonresident, blocks.
+    return ArenaStats{bytes_allocated_, free_avail_, bytes_unavailable_,
+                      bytes_nonresident_, blocks_};
+  }
+
+ private:
+  // Minimum size of each request made to the system.
+  static constexpr int kAllocIncrement = 128 << 10;
+
+  // Start and size of the not-yet-handed-out tail of the current block.
+  char* free_area_ ABSL_GUARDED_BY(pageheap_lock) = nullptr;
+  size_t free_avail_ ABSL_GUARDED_BY(pageheap_lock) = 0;
+
+  // Running total of bytes handed out by this arena.
+  size_t bytes_allocated_ ABSL_GUARDED_BY(pageheap_lock) = 0;
+  // Bytes that can no longer be handed out because they sat at the end of a
+  // block that has since been replaced.
+  size_t bytes_unavailable_ ABSL_GUARDED_BY(pageheap_lock) = 0;
+  // Bytes on the arena that have been MADV_DONTNEEDed away; disjoint from the
+  // bytes counted in `bytes_allocated_`.
+  size_t bytes_nonresident_ ABSL_GUARDED_BY(pageheap_lock) = 0;
+  // Number of blocks obtained from the system so far.
+  size_t blocks_ ABSL_GUARDED_BY(pageheap_lock) = 0;
+
+  Arena(const Arena&) = delete;
+  Arena& operator=(const Arena&) = delete;
+};
+
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
+GOOGLE_MALLOC_SECTION_END
+
+#endif  // TCMALLOC_ARENA_H_
--- a/src/third_party/tcmalloc/dist/tcmalloc/arena_test.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/arena_test.cc
@ -0,0 +1,158 @@
+// Copyright 2021 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tcmalloc/arena.h"
+
+#include <stdint.h>
+
+#include <new>
+
+#include "gtest/gtest.h"
+
+namespace tcmalloc {
+namespace tcmalloc_internal {
+namespace {
+
+// Test helper: converts a plain integer into a std::align_val_t.
+std::align_val_t Align(int align) {
+  const auto as_alignment = static_cast<std::align_val_t>(align);
+  return as_alignment;
+}
+
+// Verifies that the address returned by Arena::Alloc() is a multiple of the
+// requested alignment.
+TEST(Arena, AlignedAlloc) {
+  Arena arena;
+  // Arena::Alloc() requires pageheap_lock to be held.
+  absl::base_internal::SpinLockHolder h(&pageheap_lock);
+  EXPECT_EQ(reinterpret_cast<uintptr_t>(arena.Alloc(64, Align(64))) % 64, 0);
+  // No alignment argument: the default (kAlignment) applies, which this
+  // expectation assumes yields 8-byte-aligned addresses.
+  EXPECT_EQ(reinterpret_cast<uintptr_t>(arena.Alloc(7)) % 8, 0);
+  EXPECT_EQ(reinterpret_cast<uintptr_t>(arena.Alloc(128, Align(64))) % 64, 0);
+  // Sweep every alignment from 1 to 99, including non-powers-of-two.
+  for (int alignment = 1; alignment < 100; ++alignment) {
+    EXPECT_EQ(reinterpret_cast<uintptr_t>(arena.Alloc(7, Align(alignment))) %
+                  alignment,
+              0);
+  }
+}
+
+// Verifies the counters reported by Arena::stats() for a fresh arena, after a
+// first allocation, and after an allocation that forces a second block.
+TEST(Arena, Stats) {
+  Arena arena;
+
+  // A freshly constructed arena reports all-zero statistics.
+  ArenaStats stats;
+  {
+    absl::base_internal::SpinLockHolder h(&pageheap_lock);
+    stats = arena.stats();
+  }
+  EXPECT_EQ(stats.bytes_allocated, 0);
+  EXPECT_EQ(stats.bytes_unallocated, 0);
+  EXPECT_EQ(stats.bytes_unavailable, 0);
+  EXPECT_EQ(stats.bytes_nonresident, 0);
+  EXPECT_EQ(stats.blocks, 0);
+
+  // Trigger an allocation and grab new stats.
+  ArenaStats stats_after_alloc;
+  void* ptr;
+  {
+    absl::base_internal::SpinLockHolder h(&pageheap_lock);
+    ptr = arena.Alloc(1, Align(1));
+    stats_after_alloc = arena.stats();
+  }
+  EXPECT_NE(ptr, nullptr);
+
+  EXPECT_EQ(stats_after_alloc.bytes_allocated, 1);
+  EXPECT_GE(stats_after_alloc.bytes_unallocated, 0);
+  EXPECT_EQ(stats_after_alloc.bytes_unavailable, 0);
+  EXPECT_EQ(stats_after_alloc.bytes_nonresident, 0);
+  EXPECT_EQ(stats_after_alloc.blocks, 1);
+
+  // Trigger an allocation that is larger than the remaining free bytes.
+  //
+  // TODO(b/201694482): Optimize this.
+  ArenaStats stats_after_alloc2;
+  {
+    absl::base_internal::SpinLockHolder h(&pageheap_lock);
+    ptr = arena.Alloc(stats_after_alloc.bytes_unallocated + 1, Align(1));
+    stats_after_alloc2 = arena.stats();
+  }
+  EXPECT_NE(ptr, nullptr);
+
+  // 1 byte from the first allocation plus (unallocated + 1) from the second.
+  EXPECT_EQ(stats_after_alloc2.bytes_allocated,
+            stats_after_alloc.bytes_unallocated + 2);
+  EXPECT_GE(stats_after_alloc2.bytes_unallocated, 0);
+  // The remainder of the first block was discarded when the second block was
+  // obtained.
+  EXPECT_EQ(stats_after_alloc2.bytes_unavailable,
+            stats_after_alloc.bytes_unallocated);
+  // Check the snapshot taken after the second allocation (the first snapshot
+  // was already verified above).
+  EXPECT_EQ(stats_after_alloc2.bytes_nonresident, 0);
+  EXPECT_EQ(stats_after_alloc2.blocks, 2);
+}
+
+// Verifies that UpdateAllocatedAndNonresident() moves bytes between the
+// allocated and non-resident counters in both directions.
+TEST(Arena, ReportUnmapped) {
+  Arena arena;
+  ArenaStats stats_after_alloc;
+  void* ptr;
+  {
+    absl::base_internal::SpinLockHolder h(&pageheap_lock);
+    ptr = arena.Alloc(10, Align(1));
+    stats_after_alloc = arena.stats();
+  }
+  EXPECT_NE(ptr, nullptr);
+
+  EXPECT_EQ(stats_after_alloc.bytes_allocated, 10);
+  EXPECT_EQ(stats_after_alloc.bytes_nonresident, 0);
+
+  // Move 5 bytes from allocated to non-resident: 10 -> 5 and 0 -> 5.
+  {
+    absl::base_internal::SpinLockHolder h(&pageheap_lock);
+    arena.UpdateAllocatedAndNonresident(-5, 5);
+    stats_after_alloc = arena.stats();
+  }
+
+  EXPECT_EQ(stats_after_alloc.bytes_allocated, 5);
+  EXPECT_EQ(stats_after_alloc.bytes_nonresident, 5);
+
+  // Move 3 bytes back: 5 -> 8 and 5 -> 2.
+  {
+    absl::base_internal::SpinLockHolder h(&pageheap_lock);
+    arena.UpdateAllocatedAndNonresident(3, -3);
+    stats_after_alloc = arena.stats();
+  }
+
+  EXPECT_EQ(stats_after_alloc.bytes_allocated, 8);
+  EXPECT_EQ(stats_after_alloc.bytes_nonresident, 2);
+}
+
+// Verifies that bytes_allocated can be raised ahead of an allocation via
+// UpdateAllocatedAndNonresident() and lowered again right before the real
+// Alloc() without the total drifting.
+TEST(Arena, BytesImpending) {
+  Arena arena;
+
+  ArenaStats stats;
+  {
+    absl::base_internal::SpinLockHolder h(&pageheap_lock);
+    stats = arena.stats();
+  }
+  EXPECT_EQ(stats.bytes_allocated, 0);
+
+  // Account for 100 bytes before any memory has been handed out.
+  {
+    absl::base_internal::SpinLockHolder h(&pageheap_lock);
+    arena.UpdateAllocatedAndNonresident(100, 0);
+    stats = arena.stats();
+  }
+
+  EXPECT_EQ(stats.bytes_allocated, 100);
+
+  // Undo the advance accounting and perform the actual 100-byte allocation
+  // under the same lock hold; the total must again be exactly 100.
+  void* ptr;
+  {
+    absl::base_internal::SpinLockHolder h(&pageheap_lock);
+    arena.UpdateAllocatedAndNonresident(-100, 0);
+    ptr = arena.Alloc(100, Align(1));
+    stats = arena.stats();
+  }
+  EXPECT_NE(ptr, nullptr);
+  EXPECT_EQ(stats.bytes_allocated, 100);
+}
+
+}  // namespace
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
--- a/src/third_party/tcmalloc/dist/tcmalloc/background.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/background.cc
@ -0,0 +1,131 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+
+#include "absl/base/internal/sysinfo.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "tcmalloc/cpu_cache.h"
+#include "tcmalloc/internal/logging.h"
+#include "tcmalloc/internal/percpu.h"
+#include "tcmalloc/internal_malloc_extension.h"
+#include "tcmalloc/malloc_extension.h"
+#include "tcmalloc/parameters.h"
+#include "tcmalloc/static_vars.h"
+
+// Background maintenance loop: periodically trims TCMalloc's caches and
+// releases memory to the system at a constant rate.
+//
+// The loop below has no exit path, so this function does not return normally.
+// Each iteration ends with a kSleepTime sleep; the individual actions are
+// gated on their own periods, measured against absl::Now().
+void MallocExtension_Internal_ProcessBackgroundActions() {
+  using ::tcmalloc::tcmalloc_internal::Parameters;
+  using ::tcmalloc::tcmalloc_internal::tc_globals;
+
+  tcmalloc::MallocExtension::MarkThreadIdle();
+
+  // Timestamp of the previous iteration; used to scale the release rate by the
+  // time that actually elapsed.
+  absl::Time prev_time = absl::Now();
+  constexpr absl::Duration kSleepTime = absl::Seconds(1);
+
+  // Reclaim inactive per-cpu caches once per kCpuCacheReclaimPeriod.
+  //
+  // We use a longer 30 sec reclaim period to make sure that caches are indeed
+  // idle. Reclaim drains entire cache, as opposed to cache shuffle for instance
+  // that only shrinks a cache by a few objects at a time. So, we might have
+  // larger performance degradation if we use a shorter reclaim interval and
+  // drain caches that weren't supposed to.
+  constexpr absl::Duration kCpuCacheReclaimPeriod = absl::Seconds(30);
+  absl::Time last_reclaim = absl::Now();
+
+  // Shuffle per-cpu caches once per kCpuCacheShufflePeriod.
+  constexpr absl::Duration kCpuCacheShufflePeriod = absl::Seconds(5);
+  absl::Time last_shuffle = absl::Now();
+
+  // See if we should resize the slab once per kCpuCacheSlabResizePeriod. This
+  // period is coprime to kCpuCacheShufflePeriod and kCpuCacheReclaimPeriod.
+  constexpr absl::Duration kCpuCacheSlabResizePeriod = absl::Seconds(29);
+  absl::Time last_slab_resize_check = absl::Now();
+
+#ifndef TCMALLOC_SMALL_BUT_SLOW
+  // We reclaim unused objects from the transfer caches once per
+  // kTransferCachePlunderPeriod.
+  constexpr absl::Duration kTransferCachePlunderPeriod = absl::Seconds(5);
+  absl::Time last_transfer_cache_plunder_check = absl::Now();
+
+  // Resize transfer caches once per kTransferCacheResizePeriod.
+  constexpr absl::Duration kTransferCacheResizePeriod = absl::Seconds(2);
+  absl::Time last_transfer_cache_resize_check = absl::Now();
+#endif
+
+  while (true) {
+    absl::Time now = absl::Now();
+
+    // We follow the cache hierarchy in TCMalloc from outermost (per-CPU) to
+    // innermost (the page heap).  Freeing up objects at one layer can help aid
+    // memory coalescing for inner caches.
+
+    if (tcmalloc::MallocExtension::PerCpuCachesActive()) {
+      // Accelerate fences as part of this operation by registering this thread
+      // with rseq.  While this is not strictly required to succeed, we do not
+      // expect an inconsistent state for rseq (some threads registered and some
+      // threads unable to).
+      CHECK_CONDITION(tcmalloc::tcmalloc_internal::subtle::percpu::IsFast());
+
+      // Try to reclaim per-cpu caches once every kCpuCacheReclaimPeriod
+      // when enabled.
+      if (now - last_reclaim >= kCpuCacheReclaimPeriod) {
+        tc_globals.cpu_cache().TryReclaimingCaches();
+        last_reclaim = now;
+      }
+
+      // Shuffle only when the shuffle_per_cpu_caches parameter is set.
+      if (Parameters::shuffle_per_cpu_caches() &&
+          now - last_shuffle >= kCpuCacheShufflePeriod) {
+        tc_globals.cpu_cache().ShuffleCpuCaches();
+        last_shuffle = now;
+      }
+
+      // See if we need to grow the slab once every kCpuCacheSlabResizePeriod
+      // when enabled.
+      if (Parameters::per_cpu_caches_dynamic_slab_enabled() &&
+          now - last_slab_resize_check >= kCpuCacheSlabResizePeriod) {
+        tc_globals.cpu_cache().ResizeSlabIfNeeded();
+        last_slab_resize_check = now;
+      }
+    }
+
+    // Not gated on a period: runs on every iteration of the loop.
+    tc_globals.sharded_transfer_cache().Plunder();
+
+#ifndef TCMALLOC_SMALL_BUT_SLOW
+    // Try to plunder and reclaim unused objects from transfer caches.
+    if (now - last_transfer_cache_plunder_check >=
+            kTransferCachePlunderPeriod &&
+        Parameters::partial_transfer_cache()) {
+      tc_globals.transfer_cache().TryPlunder();
+      last_transfer_cache_plunder_check = now;
+    }
+
+    if (now - last_transfer_cache_resize_check >= kTransferCacheResizePeriod) {
+      tc_globals.transfer_cache().TryResizingCaches();
+      last_transfer_cache_resize_check = now;
+    }
+#endif
+
+    // Scale the configured release rate by the time elapsed since the previous
+    // iteration to get this iteration's release budget.
+    const ssize_t bytes_to_release =
+        static_cast<size_t>(Parameters::background_release_rate()) *
+        absl::ToDoubleSeconds(now - prev_time);
+    if (bytes_to_release > 0) {  // may be negative if time goes backwards
+      tcmalloc::MallocExtension::ReleaseMemoryToSystem(bytes_to_release);
+    }
+
+    prev_time = now;
+    absl::SleepFor(kSleepTime);
+  }
+}
--- a/src/third_party/tcmalloc/dist/tcmalloc/central_freelist.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/central_freelist.cc
@ -0,0 +1,116 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tcmalloc/central_freelist.h"
+
+#include <stdint.h>
+
+#include "tcmalloc/internal/linked_list.h"
+#include "tcmalloc/internal/logging.h"
+#include "tcmalloc/internal/optimization.h"
+#include "tcmalloc/internal/prefetch.h"
+#include "tcmalloc/page_heap.h"
+#include "tcmalloc/pagemap.h"
+#include "tcmalloc/pages.h"
+#include "tcmalloc/static_vars.h"
+
+GOOGLE_MALLOC_SECTION_BEGIN
+namespace tcmalloc {
+namespace tcmalloc_internal {
+namespace central_freelist_internal {
+
+// Picks the memory tag used for spans of `size_class`: expanded size classes
+// are cold; otherwise the tag is kNormal unless the NUMA topology is active,
+// in which case it is derived from size_class / kNumBaseClasses.
+static MemoryTag MemoryTagFromSizeClass(size_t size_class) {
+  const bool is_cold = IsExpandedSizeClass(size_class);
+  if (is_cold) {
+    return MemoryTag::kCold;
+  }
+
+  const bool numa_aware = tc_globals.numa_topology().numa_aware();
+  if (numa_aware) {
+    return NumaNormalTag(size_class / kNumBaseClasses);
+  }
+  return MemoryTag::kNormal;
+}
+
+// Forwards to class_to_size() on the global size map.
+size_t StaticForwarder::class_to_size(int size_class) {
+  auto&& size_map = tc_globals.sizemap();
+  return size_map.class_to_size(size_class);
+}
+
+// Forwards to class_to_pages() on the global size map and wraps the result in
+// a Length.
+Length StaticForwarder::class_to_pages(int size_class) {
+  auto&& size_map = tc_globals.sizemap();
+  const auto raw_pages = size_map.class_to_pages(size_class);
+  return Length(raw_pages);
+}
+
+// Looks up, in the global page map, the existing span descriptor for the page
+// that contains `object`.
+Span* StaticForwarder::MapObjectToSpan(const void* object) {
+  return tc_globals.pagemap().GetExistingDescriptor(PageIdContaining(object));
+}
+
+// Requests a span of `pages_per_span` pages from the page allocator, tagged
+// according to `size_class`, and registers the size class in the page map.
+// Returns nullptr when the page allocator cannot provide a span.
+Span* StaticForwarder::AllocateSpan(int size_class, size_t objects_per_span,
+                                    Length pages_per_span) {
+  const MemoryTag tag = MemoryTagFromSizeClass(size_class);
+  auto&& allocator = tc_globals.page_allocator();
+  Span* span = allocator.New(pages_per_span, objects_per_span, tag);
+
+  // Only a successfully allocated span is validated and registered.
+  if (ABSL_PREDICT_TRUE(span != nullptr)) {
+    ASSERT(tag == GetMemoryTag(span->start_address()));
+    ASSERT(span->num_pages() == pages_per_span);
+    tc_globals.pagemap().RegisterSizeClass(span, size_class);
+  }
+  return span;
+}
+
+// Hands every span in `free_spans` back to the page allocator while holding
+// pageheap_lock for the whole batch.
+static void ReturnSpansToPageHeap(MemoryTag tag, absl::Span<Span*> free_spans,
+                                  size_t objects_per_span)
+    ABSL_LOCKS_EXCLUDED(pageheap_lock) {
+  absl::base_internal::SpinLockHolder pageheap_guard(&pageheap_lock);
+  const size_t num_free_spans = free_spans.size();
+  for (size_t i = 0; i < num_free_spans; ++i) {
+    Span* const free_span = free_spans[i];
+    ASSERT(tag == GetMemoryTag(free_span->start_address()));
+    tc_globals.page_allocator().Delete(free_span, objects_per_span, tag);
+  }
+}
+
+// Unregisters every span in `free_spans` from the page map and then returns
+// the whole batch to the page heap.
+void StaticForwarder::DeallocateSpans(int size_class, size_t objects_per_span,
+                                      absl::Span<Span*> free_spans) {
+  // Unregistering a size class does not need any lock to be held.
+  const size_t num_free_spans = free_spans.size();
+  for (size_t i = 0; i < num_free_spans; ++i) {
+    Span* const free_span = free_spans[i];
+    ASSERT(IsNormalMemory(free_span->start_address()) ||
+           IsColdMemory(free_span->start_address()));
+    tc_globals.pagemap().UnregisterSizeClass(free_span);
+
+    // Prefetch the PageTracker each span lives on before pageheap_lock is
+    // taken.
+    //
+    // Small-but-slow does not use the HugePageAwareAllocator (by default), so
+    // the prefetch is compiled out for that configuration.
+#ifndef TCMALLOC_SMALL_BUT_SLOW
+    // huge_page_filler.h static_asserts that the PageTracker fields needed for
+    // deallocation sit within its first two cachelines, so both are fetched.
+    void* tracker = tc_globals.pagemap().GetHugepage(free_span->first_page());
+    const uintptr_t tracker_addr = reinterpret_cast<uintptr_t>(tracker);
+    // Prefetch for writing: the PageTracker will be stored to.
+    PrefetchW(tracker);
+    PrefetchW(reinterpret_cast<void*>(tracker_addr + ABSL_CACHELINE_SIZE));
+#endif  // TCMALLOC_SMALL_BUT_SLOW
+  }
+
+  ReturnSpansToPageHeap(MemoryTagFromSizeClass(size_class), free_spans,
+                        objects_per_span);
+}
+
+}  // namespace central_freelist_internal
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
+GOOGLE_MALLOC_SECTION_END
--- a/src/third_party/tcmalloc/dist/tcmalloc/central_freelist.h
+++ b/src/third_party/tcmalloc/dist/tcmalloc/central_freelist.h
@ -0,0 +1,581 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef TCMALLOC_CENTRAL_FREELIST_H_
+#define TCMALLOC_CENTRAL_FREELIST_H_
+
+#include <stddef.h>
+
+#include <algorithm>
+#include <cstddef>
+
+#include "absl/base/attributes.h"
+#include "absl/base/const_init.h"
+#include "absl/base/internal/spinlock.h"
+#include "absl/base/macros.h"
+#include "absl/base/thread_annotations.h"
+#include "tcmalloc/common.h"
+#include "tcmalloc/hinted_tracker_lists.h"
+#include "tcmalloc/internal/atomic_stats_counter.h"
+#include "tcmalloc/internal/optimization.h"
+#include "tcmalloc/pages.h"
+#include "tcmalloc/parameters.h"
+#include "tcmalloc/span.h"
+#include "tcmalloc/span_stats.h"
+
+GOOGLE_MALLOC_SECTION_BEGIN
+namespace tcmalloc {
+namespace tcmalloc_internal {
+
+namespace central_freelist_internal {
+
+// StaticForwarder provides access to the PageMap and page heap.
+//
+// This is a class, rather than namespaced globals, so that it can be mocked for
+// testing. CentralFreeList takes its forwarder type as a template parameter.
+class StaticForwarder {
+ public:
+  // Size-map lookups for a size class.
+  static size_t class_to_size(int size_class);
+  static Length class_to_pages(int size_class);
+  // Returns the span descriptor for the page containing `object`.
+  static Span* MapObjectToSpan(const void* object);
+  // Allocates a span for `size_class`; must be called without pageheap_lock
+  // held.
+  static Span* AllocateSpan(int size_class, size_t objects_per_span,
+                            Length pages_per_span)
+      ABSL_LOCKS_EXCLUDED(pageheap_lock);
+  // Returns a batch of spans of `size_class`; must be called without
+  // pageheap_lock held.
+  static void DeallocateSpans(int size_class, size_t objects_per_span,
+                              absl::Span<Span*> free_spans)
+      ABSL_LOCKS_EXCLUDED(pageheap_lock);
+};
+
+// Number of nonempty_ lists that keep track of non-empty spans. Spans are
+// assigned to one of these lists by CentralFreeList::IndexFor().
+static constexpr size_t kNumLists = 8;
+
+// Data kept per size-class in central cache.
+//
+// ForwarderT supplies the size-map lookups and the span allocation and
+// deallocation entry points (see StaticForwarder).
+template <typename ForwarderT>
+class CentralFreeList {
+ public:
+  using Forwarder = ForwarderT;
+
+  constexpr CentralFreeList()
+      : lock_(absl::kConstInit, absl::base_internal::SCHEDULE_KERNEL_ONLY),
+        size_class_(0),
+        object_size_(0),
+        objects_per_span_(0),
+        first_nonempty_index_(0),
+        pages_per_span_(0),
+        nonempty_() {}
+
+  CentralFreeList(const CentralFreeList&) = delete;
+  CentralFreeList& operator=(const CentralFreeList&) = delete;
+
+  // Binds this freelist to `size_class` and caches the per-class geometry
+  // (object size, pages per span, objects per span).
+  void Init(size_t size_class) ABSL_LOCKS_EXCLUDED(lock_);
+
+  // These methods all do internal locking.
+
+  // Insert batch into the central freelist.
+  // REQUIRES: batch.size() > 0 && batch.size() <= kMaxObjectsToMove.
+  void InsertRange(absl::Span<void*> batch) ABSL_LOCKS_EXCLUDED(lock_);
+
+  // Fill a prefix of batch[0..N-1] with up to N elements removed from central
+  // freelist.  Return the number of elements removed.
+  ABSL_MUST_USE_RESULT int RemoveRange(void** batch, int N)
+      ABSL_LOCKS_EXCLUDED(lock_);
+
+  // Returns the number of free objects in cache.
+  size_t length() const { return static_cast<size_t>(counter_.value()); }
+
+  // Returns the memory overhead (internal fragmentation) attributable
+  // to the freelist.  This is memory lost when the size of elements
+  // in a freelist doesn't exactly divide the page-size (an 8192-byte
+  // page full of 5-byte objects would have 2 bytes memory overhead).
+  size_t OverheadBytes() const;
+
+  // Returns number of live spans currently in the nonempty_[n] list.
+  // REQUIRES: n >= 0 && n < kNumLists.
+  size_t NumSpansInList(int n) ABSL_LOCKS_EXCLUDED(lock_);
+  SpanStats GetSpanStats() const;
+
+  // Reports span utilization histogram stats.
+  void PrintSpanUtilStats(Printer* out) const;
+  void PrintSpanUtilStatsInPbtxt(PbtxtRegion* region) const;
+
+  // Get number of spans in the histogram bucket. We record spans in the
+  // histogram indexed by absl::bit_width(allocated). So, instead of using the
+  // absolute number of allocated objects, it uses absl::bit_width(allocated),
+  // passed as <bitwidth>, to index and return the number of spans in the
+  // histogram.
+  size_t NumSpansWith(uint16_t bitwidth) const;
+
+  Forwarder& forwarder() { return forwarder_; }
+
+ private:
+  // Release an object to spans.
+  // Returns object's span if it becomes completely free.
+  Span* ReleaseToSpans(void* object, Span* span, size_t object_size)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
+  // Populate cache by fetching from the page heap.
+  // May temporarily release lock_.
+  // Fill a prefix of batch[0..N-1] with up to N elements removed from central
+  // freelist. Returns the number of elements removed.
+  int Populate(void** batch, int N) ABSL_EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
+  // Parses nonempty_ lists and returns span from the list with the lowest
+  // possible index.
+  // Returns the span if one exists in the nonempty_ lists. Else, returns
+  // nullptr.
+  Span* FirstNonEmptySpan() ABSL_EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
+  // Returns first index to the nonempty_ lists that may record spans.
+  uint8_t GetFirstNonEmptyIndex() const;
+
+  // Returns index into nonempty_ based on the number of allocated objects for
+  // the span. Instead of using the absolute number of allocated objects, it
+  // uses absl::bit_width(allocated), passed as bitwidth, to calculate the list
+  // index.
+  static uint8_t IndexFor(uint8_t bitwidth);
+
+  // Records span utilization in objects_to_spans_ map. Instead of using the
+  // absolute number of allocated objects, it uses
+  // absl::bit_width(allocated), passed as <bitwidth>, to index this map.
+  //
+  // If increase is set to true, includes the span by incrementing the count
+  // in the map. Otherwise, removes the span by decrementing the count in
+  // the map.
+  void RecordSpanUtil(uint8_t bitwidth, bool increase)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
+    ASSUME(bitwidth > 0);
+    // Updates to objects_to_spans_ are guarded by lock_, so writes may be
+    // performed using LossyAdd.
+    objects_to_spans_[bitwidth - 1].LossyAdd(increase ? 1 : -1);
+  }
+
+  // This lock protects all the mutable data members.
+  absl::base_internal::SpinLock lock_;
+
+  size_t size_class_;  // My size class (immutable after Init())
+  size_t object_size_;       // Forwarder::class_to_size(size_class_)
+  size_t objects_per_span_;  // Span bytes divided by object_size_; see Init()
+  // Hint used for parsing through the nonempty_ lists. This prevents us from
+  // parsing the lists with an index starting zero, if the lowest possible index
+  // is higher than that.
+  size_t first_nonempty_index_;
+  Length pages_per_span_;  // Forwarder::class_to_pages(size_class_)
+
+  // Number of spans requested minus spans returned, clamped at zero because
+  // the two counters are read separately and without holding lock_.
+  size_t num_spans() const {
+    size_t requested = num_spans_requested_.value();
+    size_t returned = num_spans_returned_.value();
+    if (requested < returned) return 0;
+    return (requested - returned);
+  }
+
+  // Accounts for one newly allocated span: all of its objects are added to the
+  // free-object count.
+  void RecordSpanAllocated() ABSL_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
+    counter_.LossyAdd(objects_per_span_);
+    num_spans_requested_.LossyAdd(1);
+  }
+
+  // Accounts for `num_spans_returned` spans leaving this freelist.
+  void RecordMultiSpansDeallocated(size_t num_spans_returned)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
+    // NOTE(review): both operands are size_t, so the negated product wraps
+    // around as an unsigned value; this presumably relies on LossyAdd() taking
+    // a signed argument so the wrapped value converts to the intended negative
+    // delta -- confirm against StatsCounter.
+    counter_.LossyAdd(-num_spans_returned * objects_per_span_);
+    num_spans_returned_.LossyAdd(num_spans_returned);
+  }
+
+  // Adjusts the free-object count by `num`, which may be negative.
+  void UpdateObjectCounts(int num) ABSL_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
+    counter_.LossyAdd(num);
+  }
+
+  // The following are kept as StatsCounters so that they can be read without
+  // acquiring a lock. Updates to these variables are guarded by lock_
+  // so writes are performed using LossyAdd for speed, the lock still
+  // guarantees accuracy.
+
+  // Num free objects in cache entry
+  StatsCounter counter_;
+
+  StatsCounter num_spans_requested_;
+  StatsCounter num_spans_returned_;
+
+  // Records histogram of span utilization.
+  //
+  // Each bucket in the histogram records number of live spans with
+  // corresponding number of allocated objects. Instead of using the absolute
+  // value of number of allocated objects, we use absl::bit_width(allocated) to
+  // index this map. A bucket in the histogram corresponds to power-of-two
+  // number of objects. That is, bucket N tracks number of spans with allocated
+  // objects < 2^(N+1). For instance, objects_to_spans_ map tracks number of
+  // spans with allocated objects in the range [a,b), indexed as: [1,2) in
+  // objects_to_spans_[0], [2,4) in objects_to_spans_[1], [4, 8) in
+  // objects_to_spans_[2] and so on. We can query the objects_to_spans_ map
+  // using NumSpansWith(bitwidth) to obtain the number of spans associated
+  // with the corresponding bucket in the histogram.
+  //
+  // As the actual value of objects_per_span_ is not known at compile time, we
+  // use maximum value that it can be to size this array, and
+  // kSpanUtilBucketCapacity determines this value. We also check during Init
+  // that absl::bit_width(objects_per_span_) is indeed less than or equal to
+  // kSpanUtilBucketCapacity.
+  //
+  // We disable collection of histogram stats for TCMalloc small-but-slow due to
+  // performance issues. See b/227362263.
+  static constexpr size_t kSpanUtilBucketCapacity = 16;
+  StatsCounter objects_to_spans_[kSpanUtilBucketCapacity];
+
+  // Non-empty lists that distinguish spans based on the number of objects
+  // allocated from them. As we prioritize spans, spans may be added to any of
+  // the kNumLists nonempty_ lists based on their allocated objects. If span
+  // prioritization is disabled, we add spans to the nonempty_[kNumLists-1]
+  // list, leaving other lists unused.
+  //
+  // We do not enable multiple nonempty lists for small-but-slow yet due to
+  // performance issues. See b/227362263.
+#ifdef TCMALLOC_SMALL_BUT_SLOW
+  SpanList nonempty_ ABSL_GUARDED_BY(lock_);
+#else
+  HintedTrackerLists<Span, kNumLists> nonempty_ ABSL_GUARDED_BY(lock_);
+#endif
+
+  TCMALLOC_NO_UNIQUE_ADDRESS Forwarder forwarder_;
+};
+
+// Behaves like a constructor, which is why thread safety analysis is disabled.
+//
+// Caches the geometry of `size_class` (object size, pages per span, objects
+// per span) and the first nonempty_ list index a span of this class can use.
+template <class Forwarder>
+inline void CentralFreeList<Forwarder>::Init(size_t size_class)
+    ABSL_NO_THREAD_SAFETY_ANALYSIS {
+  size_class_ = size_class;
+  object_size_ = Forwarder::class_to_size(size_class);
+  pages_per_span_ = Forwarder::class_to_pages(size_class);
+
+  // A zero object size is replaced by 1 so the division below is well defined.
+  const size_t divisor = object_size_ != 0 ? object_size_ : 1;
+  objects_per_span_ = pages_per_span_.in_bytes() / divisor;
+
+  // List index associated with a span that has objects_per_span_ allocated
+  // objects; IndexFor() applies the same clamped mapping.
+  const size_t span_bitwidth = absl::bit_width(objects_per_span_);
+  first_nonempty_index_ =
+      kNumLists - std::min<size_t>(span_bitwidth, kNumLists);
+
+  ASSERT(absl::bit_width(objects_per_span_) <= kSpanUtilBucketCapacity);
+}
+
+// Pushes `object` onto `span`'s freelist and keeps the nonempty_ lists (and,
+// outside small-but-slow builds, the utilization histogram) in sync.
+//
+// Returns `span` when FreelistPush() returns false, which this code treats as
+// the span having become completely free; the span has then already been
+// removed from nonempty_. Returns nullptr otherwise.
+template <class Forwarder>
+inline Span* CentralFreeList<Forwarder>::ReleaseToSpans(void* object,
+                                                        Span* span,
+                                                        size_t object_size) {
+  // A span whose freelist is currently empty is not on any nonempty_ list, so
+  // it is added before the object is pushed.
+  if (ABSL_PREDICT_FALSE(span->FreelistEmpty(object_size))) {
+#ifdef TCMALLOC_SMALL_BUT_SLOW
+    nonempty_.prepend(span);
+#else
+    const uint8_t index = GetFirstNonEmptyIndex();
+    nonempty_.Add(span, index);
+    span->set_nonempty_index(index);
+#endif
+  }
+
+#ifdef TCMALLOC_SMALL_BUT_SLOW
+  // We maintain a single nonempty list for small-but-slow. Also, we do not
+  // collect histogram stats due to performance issues.
+  if (ABSL_PREDICT_TRUE(span->FreelistPush(object, object_size))) {
+    return nullptr;
+  }
+  nonempty_.remove(span);
+  return span;
+#else
+  // Capture the list index and histogram bucket before the push changes the
+  // span's allocation count.
+  const uint8_t prev_index = span->nonempty_index();
+  const uint8_t prev_bitwidth = absl::bit_width(span->Allocated());
+  if (ABSL_PREDICT_FALSE(!span->FreelistPush(object, object_size))) {
+    // Update the histogram as the span is now completely free and will be
+    // removed from the nonempty_ list.
+    RecordSpanUtil(prev_bitwidth, /*increase=*/false);
+    nonempty_.Remove(span, prev_index);
+    return span;
+  }
+  // As the objects are being added to the span, its utilization might change.
+  // We remove the stale utilization from the histogram and add the new
+  // utilization to the histogram after we release objects to the span.
+  const uint8_t cur_bitwidth = absl::bit_width(span->Allocated());
+  if (cur_bitwidth != prev_bitwidth) {
+    RecordSpanUtil(prev_bitwidth, /*increase=*/false);
+    RecordSpanUtil(cur_bitwidth, /*increase=*/true);
+    // If span allocation changes so that it moved to a different nonempty_
+    // list, we remove it from the previous list and add it to the desired
+    // list indexed by cur_index.
+    const uint8_t cur_index = IndexFor(cur_bitwidth);
+    if (cur_index != prev_index) {
+      nonempty_.Remove(span, prev_index);
+      nonempty_.Add(span, cur_index);
+      span->set_nonempty_index(cur_index);
+    }
+  }
+  return nullptr;
+#endif
+}
+
+// Returns a span that still has free objects, or nullptr when there is none.
+//
+// With multiple lists, the search covers [first_nonempty_index_, kNumLists);
+// small-but-slow builds keep a single list and return its first span.
+template <class Forwarder>
+inline Span* CentralFreeList<Forwarder>::FirstNonEmptySpan() {
+#ifdef TCMALLOC_SMALL_BUT_SLOW
+  return ABSL_PREDICT_FALSE(nonempty_.empty()) ? nullptr : nonempty_.first();
+#else
+  const uint8_t start_index = GetFirstNonEmptyIndex();
+  return nonempty_.PeekLeast(start_index);
+#endif
+}
+
+// Returns the lowest nonempty_ list index that may hold spans, as computed by
+// Init().
+template <class Forwarder>
+inline uint8_t CentralFreeList<Forwarder>::GetFirstNonEmptyIndex() const {
+  return static_cast<uint8_t>(first_nonempty_index_);
+}
+
+// Maps absl::bit_width(allocated objects) to a nonempty_ list index.
+//
+// Spans with fewer allocated objects (i.e. more likely to become free) get a
+// higher index. Bit widths of kNumLists or more are all clamped into index 0:
+// telling apart spans with 128 vs 256 allocated objects matters less than
+// telling apart spans with 16 vs 32.
+//
+// Example with objects_per_span = 1024 and kNumLists = 8, using the notation
+// [a, b) -> idx for spans with allocated objects in [a, b):
+// [1, 2) -> 7, [2, 4) -> 6, [4, 8) -> 5, [8, 16) -> 4, [16, 32) -> 3,
+// [32, 64) -> 2, [64, 128) -> 1, [128, 1024) -> 0.
+template <class Forwarder>
+inline uint8_t CentralFreeList<Forwarder>::IndexFor(uint8_t bitwidth) {
+  ASSUME(bitwidth > 0);
+  const size_t clamped_width = bitwidth < kNumLists ? bitwidth : kNumLists;
+  const uint8_t index = static_cast<uint8_t>(kNumLists - clamped_width);
+  ASSUME(index < kNumLists);
+  return index;
+}
+
+// Returns the number of spans in nonempty_[n], read under lock_. In
+// small-but-slow builds there is a single list and `n` is ignored.
+template <class Forwarder>
+inline size_t CentralFreeList<Forwarder>::NumSpansInList(int n) {
+  ASSUME(n >= 0);
+  ASSUME(n < kNumLists);
+  absl::base_internal::SpinLockHolder list_guard(&lock_);
+#ifdef TCMALLOC_SMALL_BUT_SLOW
+  const size_t num_spans_in_list = nonempty_.length();
+#else
+  const size_t num_spans_in_list = nonempty_.SizeOfList(n);
+#endif
+  return num_spans_in_list;
+}
+
+// Returns the objects in `batch` to their spans.
+//
+// Works in two phases so that lock_ and pageheap_lock are never held together:
+// objects are released into spans under lock_, and any spans that became
+// completely free are handed to the forwarder after lock_ has been dropped.
+template <class Forwarder>
+inline void CentralFreeList<Forwarder>::InsertRange(absl::Span<void*> batch) {
+  CHECK_CONDITION(!batch.empty() && batch.size() <= kMaxObjectsToMove);
+  Span* spans[kMaxObjectsToMove];
+  // Safe to store free spans into freed up space in span array: at most one
+  // entry is written per loop iteration below, and only after spans[i] has
+  // been read, so free_count never runs ahead of i.
+  Span** free_spans = spans;
+  int free_count = 0;
+
+  // Prefetch Span objects to reduce cache misses.
+  for (int i = 0; i < batch.size(); ++i) {
+    Span* span = forwarder_.MapObjectToSpan(batch[i]);
+    ASSERT(span != nullptr);
+    span->Prefetch();
+    spans[i] = span;
+  }
+
+  // First, release all individual objects into spans under our mutex
+  // and collect spans that become completely free.
+  {
+    // Use local copy of variable to ensure that it is not reloaded.
+    size_t object_size = object_size_;
+    absl::base_internal::SpinLockHolder h(&lock_);
+    for (int i = 0; i < batch.size(); ++i) {
+      Span* span = ReleaseToSpans(batch[i], spans[i], object_size);
+      if (ABSL_PREDICT_FALSE(span)) {
+        free_spans[free_count] = span;
+        free_count++;
+      }
+    }
+
+    // Drop the objects of the spans that are leaving this freelist from the
+    // free-object count, then add the objects that were just inserted.
+    RecordMultiSpansDeallocated(free_count);
+    UpdateObjectCounts(batch.size());
+  }
+
+  // Then, release all free spans into page heap under its mutex.
+  if (ABSL_PREDICT_FALSE(free_count)) {
+    forwarder_.DeallocateSpans(size_class_, objects_per_span_,
+                               absl::MakeSpan(free_spans, free_count));
+  }
+}
+
+// Fills a prefix of batch[0..N-1] with objects taken from non-empty spans and
+// returns how many were written.
+//
+// When no non-empty span is left, Populate() is called once to fetch a new
+// span and the loop stops, so fewer than N objects may be returned.
+template <class Forwarder>
+inline int CentralFreeList<Forwarder>::RemoveRange(void** batch, int N) {
+  ASSUME(N > 0);
+  // Use local copy of variable to ensure that it is not reloaded.
+  size_t object_size = object_size_;
+  int result = 0;
+  absl::base_internal::SpinLockHolder h(&lock_);
+
+  do {
+    Span* span = FirstNonEmptySpan();
+    if (ABSL_PREDICT_FALSE(!span)) {
+      // Nothing cached: fall back to a fresh span and stop afterwards, whether
+      // or not the request was fully satisfied.
+      result += Populate(batch + result, N - result);
+      break;
+    }
+
+#ifdef TCMALLOC_SMALL_BUT_SLOW
+    // We do not collect histogram stats for small-but-slow.
+    int here = span->FreelistPopBatch(batch + result, N - result, object_size);
+    ASSERT(here > 0);
+    if (span->FreelistEmpty(object_size)) {
+      nonempty_.remove(span);
+    }
+#else
+    // Capture the histogram bucket and list index before popping changes the
+    // span's allocation count.
+    const uint8_t prev_bitwidth = absl::bit_width(span->Allocated());
+    const uint8_t prev_index = span->nonempty_index();
+    int here = span->FreelistPopBatch(batch + result, N - result, object_size);
+    ASSERT(here > 0);
+    // As the objects are being popped from the span, its utilization might
+    // change. So, we remove the stale utilization from the histogram here and
+    // add it again once we pop the objects.
+    const uint8_t cur_bitwidth = absl::bit_width(span->Allocated());
+    if (cur_bitwidth != prev_bitwidth) {
+      RecordSpanUtil(prev_bitwidth, /*increase=*/false);
+      RecordSpanUtil(cur_bitwidth, /*increase=*/true);
+    }
+    if (span->FreelistEmpty(object_size)) {
+      // No free objects remain in the span, so it leaves the nonempty_ lists.
+      nonempty_.Remove(span, prev_index);
+    } else if (cur_bitwidth != prev_bitwidth) {
+      // If span allocation changes so that it must be moved to a different
+      // nonempty_ list, we remove it from the previous list and add it to the
+      // desired list indexed by cur_index.
+      const uint8_t cur_index = IndexFor(cur_bitwidth);
+      if (cur_index != prev_index) {
+        nonempty_.Remove(span, prev_index);
+        nonempty_.Add(span, cur_index);
+        span->set_nonempty_index(cur_index);
+      }
+    }
+#endif
+    result += here;
+  } while (result < N);
+  // The objects handed out are no longer free objects of this freelist.
+  UpdateObjectCounts(-result);
+  return result;
+}
+
+// Fetch memory from the system and add to the central cache freelist.
+//
+// Called with lock_ held. lock_ is released while the span is allocated and
+// its freelist is built, and is held again on every return path. Returns the
+// number of objects written to `batch`, or 0 if no span could be allocated.
+template <class Forwarder>
+inline int CentralFreeList<Forwarder>::Populate(void** batch, int N)
+    ABSL_NO_THREAD_SAFETY_ANALYSIS {
+  // Release central list lock while operating on pageheap
+  // Note, this could result in multiple calls to populate each allocating
+  // a new span and then pushing those partially full spans onto nonempty.
+  lock_.Unlock();
+  Span* span =
+      forwarder_.AllocateSpan(size_class_, objects_per_span_, pages_per_span_);
+  if (ABSL_PREDICT_FALSE(span == nullptr)) {
+    Log(kLog, __FILE__, __LINE__, "tcmalloc: allocation failed",
+        pages_per_span_.in_bytes());
+
+    // Restore the caller's locking expectation before reporting failure.
+    lock_.Lock();
+    return 0;
+  }
+
+  int result = span->BuildFreelist(object_size_, objects_per_span_, batch, N);
+  ASSERT(result > 0);
+  // This is a cheaper check than using FreelistEmpty().
+  bool span_empty = result == objects_per_span_;
+
+  lock_.Lock();
+
+#ifdef TCMALLOC_SMALL_BUT_SLOW
+  // We do not collect histogram stats for small-but-slow. Moreover, we maintain
+  // a single nonempty list to which we prepend the span.
+  if (!span_empty) {
+    nonempty_.prepend(span);
+  }
+#else
+  // Update the histogram once we populate the span.
+  const uint8_t bitwidth = absl::bit_width(span->Allocated());
+  RecordSpanUtil(bitwidth, /*increase=*/true);
+  // Only a span that still has free objects goes onto a nonempty_ list.
+  if (!span_empty) {
+    const uint8_t index = IndexFor(bitwidth);
+    nonempty_.Add(span, index);
+    span->set_nonempty_index(index);
+  }
+#endif
+  RecordSpanAllocated();
+  return result;
+}
+
+// Returns the bytes lost to internal fragmentation across all live spans: the
+// per-span remainder of span size modulo object size, times the span count.
+// Returns 0 when the object size is zero.
+template <class Forwarder>
+inline size_t CentralFreeList<Forwarder>::OverheadBytes() const {
+  if (ABSL_PREDICT_TRUE(object_size_ != 0)) {
+    const size_t span_bytes = pages_per_span_.in_bytes();
+    const size_t wasted_per_span = span_bytes % object_size_;
+    return num_spans() * wasted_per_span;
+  }
+  return 0;
+}
+
+// Returns a snapshot of the span counters for this size class. The result is
+// left default-constructed when objects_per_span_ is zero.
+template <class Forwarder>
+inline SpanStats CentralFreeList<Forwarder>::GetSpanStats() const {
+  SpanStats stats;
+  if (ABSL_PREDICT_TRUE(objects_per_span_ != 0)) {
+    stats.num_spans_requested =
+        static_cast<size_t>(num_spans_requested_.value());
+    stats.num_spans_returned =
+        static_cast<size_t>(num_spans_returned_.value());
+    // Capacity is derived from the live-span count computed by SpanStats.
+    stats.obj_capacity = stats.num_live_spans() * objects_per_span_;
+  }
+  return stats;
+}
+
+// Returns the number of spans recorded in the histogram bucket for `bitwidth`,
+// i.e. objects_to_spans_[bitwidth - 1].
+// REQUIRES: bitwidth > 0 && bitwidth <= kSpanUtilBucketCapacity.
+template <class Forwarder>
+inline size_t CentralFreeList<Forwarder>::NumSpansWith(
+    uint16_t bitwidth) const {
+  ASSERT(bitwidth > 0);
+  // Buckets live at index bitwidth - 1, so a larger bit width would index past
+  // the end of objects_to_spans_.
+  ASSERT(bitwidth <= kSpanUtilBucketCapacity);
+  const int bucket = bitwidth - 1;
+  return objects_to_spans_[bucket].value();
+}
+
+// Prints one line of the span-utilization histogram for this size class: a
+// header with the class and object size, then "<span count> < <upper bound>"
+// for every bucket, separated by commas.
+template <class Forwarder>
+inline void CentralFreeList<Forwarder>::PrintSpanUtilStats(Printer* out) const {
+  // size_class_ is a size_t, so it is formatted with %zu rather than %d. The
+  // printed text is unchanged.
+  out->printf("class %3zu [ %8zu bytes ] : ", size_class_, object_size_);
+  for (size_t i = 1; i <= kSpanUtilBucketCapacity; ++i) {
+    // Shift a size_t so the argument has the type %zu expects; `1 << i` would
+    // be an int.
+    out->printf("%6zu < %zu", NumSpansWith(i), size_t{1} << i);
+    if (i < kSpanUtilBucketCapacity) {
+      out->printf(",");
+    }
+  }
+  out->printf("\n");
+}
+
+// Emits one "span_util_histogram" sub-region per histogram bucket, carrying
+// the bucket's lower bound, upper bound and span count.
+template <class Forwarder>
+inline void CentralFreeList<Forwarder>::PrintSpanUtilStatsInPbtxt(
+    PbtxtRegion* region) const {
+  for (size_t bitwidth = 1; bitwidth <= kSpanUtilBucketCapacity; ++bitwidth) {
+    PbtxtRegion histogram = region->CreateSubRegion("span_util_histogram");
+    const int lower_bound = 1 << (bitwidth - 1);
+    const int upper_bound = 1 << bitwidth;
+    histogram.PrintI64("lower_bound", lower_bound);
+    histogram.PrintI64("upper_bound", upper_bound);
+    histogram.PrintI64("value", NumSpansWith(bitwidth));
+  }
+}
+
+}  // namespace central_freelist_internal
+
+// The central freelist instantiated with StaticForwarder, which provides
+// access to the PageMap and page heap.
+using CentralFreeList = central_freelist_internal::CentralFreeList<
+    central_freelist_internal::StaticForwarder>;
+
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
+GOOGLE_MALLOC_SECTION_END
+
+#endif  // TCMALLOC_CENTRAL_FREELIST_H_
--- a/src/third_party/tcmalloc/dist/tcmalloc/central_freelist_benchmark.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/central_freelist_benchmark.cc
@ -0,0 +1,201 @@
+// Copyright 2021 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include "absl/algorithm/container.h"
+#include "absl/random/random.h"
+#include "benchmark/benchmark.h"
+#include "tcmalloc/central_freelist.h"
+#include "tcmalloc/common.h"
+#include "tcmalloc/static_vars.h"
+#include "tcmalloc/tcmalloc_policy.h"
+
+namespace tcmalloc {
+namespace tcmalloc_internal {
+namespace {
+
+// This benchmark measures how long it takes to populate multiple
+// spans. The spans are freed in the same order as they were populated
+// to minimize the time it takes to free them.
+//
+// state.range(0) is the object size in bytes. Every iteration batch fetches
+// 64 MiB worth of objects of that size and then, with the timer paused,
+// hands them back in fetch order.
+void BM_Populate(benchmark::State& state) {
+  size_t object_size = state.range(0);
+  size_t size_class = tc_globals.sizemap().SizeClass(CppPolicy(), object_size);
+  int batch_size = tc_globals.sizemap().num_objects_to_move(size_class);
+  int num_objects = 64 * 1024 * 1024 / object_size;
+  const int num_batches = num_objects / batch_size;
+  CentralFreeList cfl;
+  // Initialize the span to contain the appropriate size of object.
+  cfl.Init(size_class);
+
+  // Allocate an array large enough to hold 64 MiB of objects.
+  std::vector<void*> buffer(num_objects);
+  int64_t items_processed = 0;
+
+  while (state.KeepRunningBatch(num_batches)) {
+    int index = 0;
+    // The cost of fetching objects will include the cost of fetching and
+    // populating the span.
+    while (index < num_objects) {
+      int count = std::min(batch_size, num_objects - index);
+      int got = cfl.RemoveRange(&buffer[index], count);
+      index += got;
+    }
+
+    // Don't include the cost of returning the objects to the span, and the
+    // span to the pageheap.
+    state.PauseTiming();
+    index = 0;
+    while (index < num_objects) {
+      // uint64_t keeps the braced {pointer, length} argument free of a
+      // narrowing conversion from int.
+      uint64_t count = std::min(batch_size, num_objects - index);
+      cfl.InsertRange({&buffer[index], count});
+      index += count;
+    }
+    items_processed += index;
+    state.ResumeTiming();
+  }
+  state.SetItemsProcessed(items_processed);
+}
+BENCHMARK(BM_Populate)
+    ->DenseRange(8, 64, 16)
+    ->DenseRange(64, 1024, 64)
+    ->DenseRange(4096, 28 * 1024, 4096)
+    ->DenseRange(32 * 1024, 256 * 1024, 32 * 1024);
+
+// Fills a large array with objects from the central freelist, shuffles the
+// array, and returns the objects in the shuffled order.
+// This should be relatively representative of what happens at runtime:
+// objects are usually fetched from the CFL in batches, but returned spread
+// over many active spans.
+void BM_MixAndReturn(benchmark::State& state) {
+  const size_t object_size = state.range(0);
+  const size_t size_class =
+      tc_globals.sizemap().SizeClass(CppPolicy(), object_size);
+  const int batch_size = tc_globals.sizemap().num_objects_to_move(size_class);
+  const int num_objects = 64 * 1024 * 1024 / object_size;
+  const int num_batches = num_objects / batch_size;
+
+  CentralFreeList cfl;
+  // Bind the freelist to the size class selected above.
+  cfl.Init(size_class);
+
+  // Room for 64 MiB worth of objects.
+  std::vector<void*> buffer(num_objects);
+  int64_t items_processed = 0;
+  absl::BitGen rnd;
+
+  while (state.KeepRunningBatch(num_batches)) {
+    // Timed: fetch batch by batch until the buffer is full.
+    for (int filled = 0; filled < num_objects;) {
+      const int want = std::min(batch_size, num_objects - filled);
+      const int got = cfl.RemoveRange(&buffer[filled], want);
+      filled += got;
+    }
+
+    // Untimed: shuffle so that objects are not returned in the order they
+    // were allocated.
+    state.PauseTiming();
+    absl::c_shuffle(buffer, rnd);
+    state.ResumeTiming();
+
+    // Timed: hand the shuffled objects back.
+    int returned = 0;
+    while (returned < num_objects) {
+      unsigned int n = std::min(batch_size, num_objects - returned);
+      cfl.InsertRange({&buffer[returned], n});
+      returned += n;
+    }
+    items_processed += returned;
+  }
+  state.SetItemsProcessed(items_processed);
+}
+BENCHMARK(BM_MixAndReturn)
+    ->DenseRange(8, 64, 16)
+    ->DenseRange(64, 1024, 64)
+    ->DenseRange(4096, 28 * 1024, 4096)
+    ->DenseRange(32 * 1024, 256 * 1024, 32 * 1024);
+
+// Keeps half of the allocated objects live for the whole run so that (except
+// for single object spans) no span is allocated or freed while the benchmark
+// is running. This isolates the span handling code and keeps pageheap work
+// out of the measurement.
+void BM_SpanReuse(benchmark::State& state) {
+  const size_t object_size = state.range(0);
+  const size_t size_class =
+      tc_globals.sizemap().SizeClass(CppPolicy(), object_size);
+  const int batch_size = tc_globals.sizemap().num_objects_to_move(size_class);
+  const int num_objects = 64 * 1024 * 1024 / object_size;
+  const int num_batches = num_objects / batch_size;
+
+  CentralFreeList cfl;
+  // Bind the freelist to the size class selected above.
+  cfl.Init(size_class);
+
+  // Fetch twice as many objects as the benchmark loop needs.
+  const int num_held = 2 * num_objects;
+  std::vector<void*> held_objects(num_held);
+  for (int pos = 0; pos < num_held;) {
+    const int want = std::min(batch_size, num_held - pos);
+    const int got = cfl.RemoveRange(&held_objects[pos], want);
+    pos += got;
+  }
+
+  // Give back every even-indexed object. The odd-indexed ones stay live, which
+  // stops the spans from being returned to the pageheap, so later operations
+  // will not touch the pageheap.
+  for (int pos = 0; pos < num_held; pos += 2) {
+    cfl.InsertRange({&held_objects[pos], 1});
+  }
+
+  // Room for 64 MiB worth of objects.
+  std::vector<void*> buffer(num_objects);
+  int64_t items_processed = 0;
+  absl::BitGen rnd;
+
+  while (state.KeepRunningBatch(num_batches)) {
+    // Timed: fetch batch by batch until the buffer is full.
+    for (int filled = 0; filled < num_objects;) {
+      const int want = std::min(batch_size, num_objects - filled);
+      const int got = cfl.RemoveRange(&buffer[filled], want);
+      filled += got;
+    }
+
+    // Untimed: shuffle so that objects are not returned in the order they
+    // were allocated.
+    state.PauseTiming();
+    absl::c_shuffle(buffer, rnd);
+    state.ResumeTiming();
+
+    // Timed: hand the shuffled objects back.
+    int returned = 0;
+    while (returned < num_objects) {
+      uint64_t n = std::min(batch_size, num_objects - returned);
+      cfl.InsertRange({&buffer[returned], n});
+      returned += n;
+    }
+    items_processed += returned;
+  }
+  state.SetItemsProcessed(items_processed);
+
+  // Finally release the odd-indexed objects that were held throughout.
+  for (int pos = 1; pos < num_held; pos += 2) {
+    cfl.InsertRange({&held_objects[pos], 1});
+  }
+}
+// Want to avoid benchmarking spans where there is a single object per span.
+BENCHMARK(BM_SpanReuse)
+    ->DenseRange(8, 64, 16)
+    ->DenseRange(64, 1024, 64)
+    ->DenseRange(1024, 4096, 512);
+
+}  // namespace
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
--- a/src/third_party/tcmalloc/dist/tcmalloc/central_freelist_test.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/central_freelist_test.cc
@ -0,0 +1,792 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tcmalloc/central_freelist.h"
+
+#include <math.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "benchmark/benchmark.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/algorithm/container.h"
+#include "absl/base/thread_annotations.h"
+#include "absl/container/fixed_array.h"
+#include "absl/memory/memory.h"
+#include "absl/numeric/bits.h"
+#include "absl/random/random.h"
+#include "absl/synchronization/mutex.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "absl/types/span.h"
+#include "tcmalloc/common.h"
+#include "tcmalloc/internal/logging.h"
+#include "tcmalloc/mock_static_forwarder.h"
+#include "tcmalloc/pagemap.h"
+#include "tcmalloc/static_vars.h"
+#include "tcmalloc/testing/thread_manager.h"
+
+namespace tcmalloc {
+namespace tcmalloc_internal {
+
+namespace central_freelist_internal {
+
+// Fixture parameterized on a size class index.  SetUp() looks up the sizemap
+// parameters for that class and skips the test for expanded size classes that
+// cannot be exercised and for empty size classes.
+class StaticForwarderTest : public testing::TestWithParam<size_t> {
+ protected:
+  // All of the following are filled in by SetUp() for the size class under
+  // test.
+  size_t size_class_;
+  size_t object_size_;
+  Length pages_per_span_;
+  size_t batch_size_;
+  // Span size in bytes divided by the object size.
+  size_t objects_per_span_;
+
+ private:
+  void SetUp() override {
+    size_class_ = GetParam();
+    // Expanded size classes are skipped under TSan and whenever the cold
+    // feature is inactive.
+    if (IsExpandedSizeClass(size_class_)) {
+#if ABSL_HAVE_THREAD_SANITIZER
+      GTEST_SKIP() << "Skipping test under sanitizers that conflict with "
+                      "address placement";
+#endif
+
+      if (!ColdFeatureActive()) {
+        // If !ColdFeatureActive(), we will use the normal page heap, which will
+        // keep us from seeing memory get the expected tags.
+        GTEST_SKIP()
+            << "Skipping expanded size classes without cold experiment";
+      }
+    }
+    object_size_ = tc_globals.sizemap().class_to_size(size_class_);
+    // A zero object size marks a size class that is not in use; skipping here
+    // also keeps the division below from dividing by zero.
+    if (object_size_ == 0) {
+      GTEST_SKIP() << "Skipping empty size class.";
+    }
+
+    pages_per_span_ = Length(tc_globals.sizemap().class_to_pages(size_class_));
+    batch_size_ = tc_globals.sizemap().num_objects_to_move(size_class_);
+    objects_per_span_ = pages_per_span_.in_bytes() / object_size_;
+  }
+};
+
+// Allocates a single span through StaticForwarder, carves all of its objects
+// into a batch, checks the pagemap entries for the span and for every object,
+// then pushes the objects back and deallocates the span.
+TEST_P(StaticForwarderTest, Simple) {
+  Span* span = StaticForwarder::AllocateSpan(size_class_, objects_per_span_,
+                                             pages_per_span_);
+  ASSERT_NE(span, nullptr);
+
+  // Request the whole span's worth of objects in one call.
+  absl::FixedArray<void*> batch(objects_per_span_);
+  size_t allocated = span->BuildFreelist(object_size_, objects_per_span_,
+                                         &batch[0], objects_per_span_);
+  ASSERT_EQ(allocated, objects_per_span_);
+
+  // Both ends of the span must be tagged with the size class.
+  EXPECT_EQ(size_class_, tc_globals.pagemap().sizeclass(span->first_page()));
+  EXPECT_EQ(size_class_, tc_globals.pagemap().sizeclass(span->last_page()));
+
+  // span_test.cc provides test coverage for Span, but we need to obtain several
+  // objects to confirm we can map back to the Span pointer from the PageMap.
+  for (void* ptr : batch) {
+    EXPECT_EQ(span, StaticForwarder::MapObjectToSpan(ptr));
+  }
+
+  // Return every object to the span's freelist before releasing the span.
+  for (void* ptr : batch) {
+    span->FreelistPush(ptr, object_size_);
+  }
+
+  StaticForwarder::DeallocateSpans(size_class_, objects_per_span_,
+                                   absl::MakeSpan(&span, 1));
+}
+
+// Mutex-protected pool of Spans obtained from StaticForwarder.  Callers use
+// RandomlyPoke() to grow, shrink, or shuffle the pool; whatever is still held
+// when the environment is destroyed is drained.
+class StaticForwarderEnvironment {
+  // One allocated Span together with the objects BuildFreelist() produced
+  // for it.
+  struct SpanData {
+    Span* span;
+    void* batch[kMaxObjectsToMove];
+  };
+
+  using SpanDataList = std::vector<std::unique_ptr<SpanData>>;
+
+ public:
+  StaticForwarderEnvironment(int size_class, size_t object_size,
+                             size_t objects_per_span, Length pages_per_span,
+                             int batch_size)
+      : size_class_(size_class),
+        object_size_(object_size),
+        objects_per_span_(objects_per_span),
+        pages_per_span_(pages_per_span),
+        batch_size_(batch_size) {}
+
+  // Hands back every Span that is still held.
+  ~StaticForwarderEnvironment() { Drain(); }
+
+  // Performs one randomly chosen action: Grow() when the coin is below 0.5,
+  // Shrink() when it is below 0.9, Shuffle() otherwise.
+  void RandomlyPoke() {
+    absl::BitGen rng;
+    const double coin = absl::Uniform(rng, 0.0, 1.0);
+
+    if (coin < 0.5) {
+      Grow();
+      return;
+    }
+    if (coin < 0.9) {
+      // Deallocate Spans.  We may deallocate more than 1 span, so we bias
+      // towards allocating Spans more often than we deallocate.
+      Shrink();
+      return;
+    }
+    Shuffle(rng);
+  }
+
+  // Takes every held Span out of the pool, verifies it, and deallocates it.
+  void Drain() {
+    SpanDataList drained;
+
+    {
+      absl::MutexLock l(&mu_);
+      if (data_.empty()) {
+        return;
+      }
+
+      // `drained` starts empty, so swapping leaves data_ empty.
+      drained.swap(data_);
+    }
+
+    VerifyAndDeallocate(drained);
+  }
+
+  // Allocates one Span, builds its first batch of objects, checks the pagemap
+  // entries, and adds the Span to the pool.
+  void Grow() {
+    Span* span = StaticForwarder::AllocateSpan(size_class_, objects_per_span_,
+                                               pages_per_span_);
+    ASSERT_NE(span, nullptr);
+
+    auto entry = absl::make_unique<SpanData>();
+    entry->span = span;
+
+    const size_t allocated = span->BuildFreelist(
+        object_size_, objects_per_span_, entry->batch, batch_size_);
+    EXPECT_LE(allocated, objects_per_span_);
+
+    EXPECT_EQ(size_class_, tc_globals.pagemap().sizeclass(span->first_page()));
+    EXPECT_EQ(size_class_, tc_globals.pagemap().sizeclass(span->last_page()));
+    // Confirm we can map at least one object back.
+    EXPECT_EQ(span, StaticForwarder::MapObjectToSpan(entry->batch[0]));
+
+    absl::MutexLock l(&mu_);
+    spans_allocated_++;
+    data_.push_back(std::move(entry));
+  }
+
+  // Removes a log-uniformly chosen number of Spans (at least one) from the
+  // back of the pool, then verifies and deallocates them outside the lock.
+  void Shrink() {
+    absl::BitGen rng;
+    SpanDataList victims;
+
+    {
+      absl::MutexLock l(&mu_);
+      if (data_.empty()) {
+        return;
+      }
+
+      const size_t count = absl::LogUniform<size_t>(rng, 1, data_.size());
+      victims.reserve(count);
+
+      for (size_t taken = 0; taken < count; ++taken) {
+        victims.push_back(std::move(data_.back()));
+        data_.pop_back();
+      }
+    }
+
+    VerifyAndDeallocate(victims);
+  }
+
+  // Randomly reorders the pool while holding the lock.
+  void Shuffle(absl::BitGen& rng) {
+    absl::MutexLock l(&mu_);
+    absl::c_shuffle(data_, rng);
+  }
+
+  // Bytes covered by every Span ever allocated through Grow(), including the
+  // ones that have since been deallocated.
+  int64_t BytesAllocated() {
+    absl::MutexLock l(&mu_);
+    return pages_per_span_.in_bytes() * spans_allocated_;
+  }
+
+ private:
+  // Checks the pagemap entries of every Span in `spans`, then returns all of
+  // them to StaticForwarder with a single DeallocateSpans call.
+  void VerifyAndDeallocate(const SpanDataList& spans) {
+    std::vector<Span*> free_spans;
+    for (const auto& data : spans) {
+      EXPECT_EQ(size_class_,
+                tc_globals.pagemap().sizeclass(data->span->first_page()));
+      EXPECT_EQ(size_class_,
+                tc_globals.pagemap().sizeclass(data->span->last_page()));
+      // Confirm we can map at least one object back.
+      EXPECT_EQ(data->span, StaticForwarder::MapObjectToSpan(data->batch[0]));
+
+      free_spans.push_back(data->span);
+    }
+
+    StaticForwarder::DeallocateSpans(size_class_, objects_per_span_,
+                                     absl::MakeSpan(free_spans));
+  }
+
+  int size_class_;
+  size_t object_size_;
+  size_t objects_per_span_;
+  Length pages_per_span_;
+  int batch_size_;
+
+  absl::Mutex mu_;
+  int64_t spans_allocated_ ABSL_GUARDED_BY(mu_) = 0;
+  SpanDataList data_ ABSL_GUARDED_BY(mu_);
+};
+
+// Reads the page allocator's BackingStats while holding pageheap_lock.
+static BackingStats PageHeapStats() {
+  absl::base_internal::SpinLockHolder pageheap_guard(&pageheap_lock);
+  BackingStats snapshot = tc_globals.page_allocator().stats();
+  return snapshot;
+}
+
+// Pokes a shared StaticForwarderEnvironment from several threads for a short
+// time, then checks that the page heap grew by at most half of the bytes the
+// environment ever allocated.
+TEST_P(StaticForwarderTest, Fuzz) {
+#if ABSL_HAVE_THREAD_SANITIZER
+  // TODO(b/193887621):  Enable this test under TSan after addressing benign
+  // true positives.
+  GTEST_SKIP() << "Skipping test under Thread Sanitizer.";
+#endif  // ABSL_HAVE_THREAD_SANITIZER
+
+  const auto page_heap_before = PageHeapStats();
+
+  StaticForwarderEnvironment env(size_class_, object_size_, objects_per_span_,
+                                 pages_per_span_, batch_size_);
+  ThreadManager threads;
+  // Presumably starts 10 worker threads that call the lambda repeatedly until
+  // Stop() -- confirm against ThreadManager.
+  threads.Start(10, [&](int) { env.RandomlyPoke(); });
+
+  // Let the workers run for 0.2 seconds.
+  absl::SleepFor(absl::Seconds(0.2));
+
+  threads.Stop();
+
+  const auto page_heap_after = PageHeapStats();
+  // Confirm we did not leak Spans by ensuring the page heap did not grow nearly
+  // 1:1 by the total number of Spans we ever allocated.
+  //
+  // Since we expect to allocate a significant number of spans, we apply a
+  // factor of 1/2 (which is unlikely to be flaky) to avoid false negatives
+  // if/when a background thread triggers a deallocation.
+  const int64_t bytes_allocated = env.BytesAllocated();
+  EXPECT_GT(bytes_allocated, 0);
+  // The casts make the subtraction signed, so a shrinking page heap yields a
+  // negative delta instead of wrapping around.
+  EXPECT_LE(static_cast<int64_t>(page_heap_after.system_bytes) -
+                static_cast<int64_t>(page_heap_before.system_bytes),
+            bytes_allocated / 2);
+}
+
+// Instantiates StaticForwarderTest once per size class, starting at 1.
+// testing::Range excludes its end value, so kNumClasses itself is not used.
+INSTANTIATE_TEST_SUITE_P(All, StaticForwarderTest,
+                         testing::Range(size_t(1), kNumClasses));
+
+}  // namespace central_freelist_internal
+
+namespace {
+
+using central_freelist_internal::kNumLists;
+// Type-parameterized suite: each test constructs its own environment as
+// `TypeParam e`, so the fixture itself is just ::testing::Test.
+template <typename Env>
+using CentralFreeListTest = ::testing::Test;
+TYPED_TEST_SUITE_P(CentralFreeListTest);
+
+// Fetches a single batch from a fresh central freelist, checks the forwarder
+// calls, span stats, and utilization histogram, then returns the batch and
+// checks that the span has been released.
+TYPED_TEST_P(CentralFreeListTest, IsolatedSmoke) {
+  TypeParam e;
+
+  // The first RemoveRange on an empty freelist should allocate exactly one
+  // span.
+  EXPECT_CALL(e.forwarder(), AllocateSpan).Times(1);
+
+  absl::FixedArray<void*> batch(TypeParam::kBatchSize);
+  int allocated =
+      e.central_freelist().RemoveRange(&batch[0], TypeParam::kBatchSize);
+  ASSERT_GT(allocated, 0);
+  EXPECT_LE(allocated, TypeParam::kBatchSize);
+
+  // We should observe span's utilization captured in the histogram. The number
+  // of spans in rest of the buckets should be zero.
+  const int bitwidth = absl::bit_width(static_cast<unsigned>(allocated));
+  for (int i = 1; i <= absl::bit_width(TypeParam::kObjectsPerSpan); ++i) {
+    if (i == bitwidth) {
+      EXPECT_EQ(e.central_freelist().NumSpansWith(i), 1);
+    } else {
+      EXPECT_EQ(e.central_freelist().NumSpansWith(i), 0);
+    }
+  }
+
+  // Expectations for the InsertRange below: one span lookup per returned
+  // object and a single span deallocation.
+  EXPECT_CALL(e.forwarder(), MapObjectToSpan).Times(allocated);
+  EXPECT_CALL(e.forwarder(), DeallocateSpans).Times(1);
+
+  SpanStats stats = e.central_freelist().GetSpanStats();
+  EXPECT_EQ(stats.num_spans_requested, 1);
+  EXPECT_EQ(stats.num_spans_returned, 0);
+  // NOTE(review): 1024 is hard-coded; presumably it equals
+  // TypeParam::kObjectsPerSpan for every registered environment -- confirm.
+  EXPECT_EQ(stats.obj_capacity, 1024);
+
+  e.central_freelist().InsertRange(absl::MakeSpan(&batch[0], allocated));
+
+  stats = e.central_freelist().GetSpanStats();
+  EXPECT_EQ(stats.num_spans_requested, 1);
+  EXPECT_EQ(stats.num_spans_returned, 1);
+  EXPECT_EQ(stats.obj_capacity, 0);
+
+  // Span captured in the histogram with the earlier utilization should have
+  // been removed.
+  for (int i = 1; i <= absl::bit_width(TypeParam::kObjectsPerSpan); ++i) {
+    EXPECT_EQ(e.central_freelist().NumSpansWith(i), 0);
+  }
+}
+
+// Fetches kNumSpans spans' worth of objects, then returns them in random
+// order one batch at a time, recomputing the expected span-utilization
+// histogram after every batch and comparing it with NumSpansWith().
+TYPED_TEST_P(CentralFreeListTest, SpanUtilizationHistogram) {
+  TypeParam e;
+
+  constexpr size_t kNumSpans = 10;
+
+  // Request kNumSpans spans.
+  void* batch[kMaxObjectsToMove];
+  const int num_objects_to_fetch = kNumSpans * TypeParam::kObjectsPerSpan;
+  int total_fetched = 0;
+  // Tracks object and corresponding span idx from which it was allocated.
+  std::vector<std::pair<void*, int>> objects_to_span_idx;
+  // Tracks number of objects allocated per span.
+  std::vector<size_t> allocated_per_span(kNumSpans, 0);
+  int span_idx = 0;
+
+  while (total_fetched < num_objects_to_fetch) {
+    size_t n = num_objects_to_fetch - total_fetched;
+    int got = e.central_freelist().RemoveRange(
+        batch, std::min(n, TypeParam::kBatchSize));
+    total_fetched += got;
+
+    // Increment span_idx if current objects have been fetched from the new
+    // span.
+    // NOTE(review): the whole batch is attributed to one span index, which
+    // assumes a batch never straddles two spans -- confirm.
+    if (total_fetched > (span_idx + 1) * TypeParam::kObjectsPerSpan) {
+      ++span_idx;
+    }
+    // Record fetched object and associated span index.
+    for (int i = 0; i < got; ++i) {
+      objects_to_span_idx.push_back(std::make_pair(batch[i], span_idx));
+    }
+    ASSERT(span_idx < kNumSpans);
+    allocated_per_span[span_idx] += got;
+  }
+
+  // Make sure that we have fetched exactly from kNumSpans spans.
+  EXPECT_EQ(span_idx + 1, kNumSpans);
+
+  // We should have kNumSpans spans in the histogram with number of allocated
+  // objects equal to TypeParam::kObjectsPerSpan (i.e. in the last bucket).
+  // Rest of the buckets should be empty.
+  const int expected_bitwidth = absl::bit_width(TypeParam::kObjectsPerSpan);
+  EXPECT_EQ(e.central_freelist().NumSpansWith(expected_bitwidth), kNumSpans);
+  for (int i = 1; i < expected_bitwidth; ++i) {
+    EXPECT_EQ(e.central_freelist().NumSpansWith(i), 0);
+  }
+
+  // Shuffle.
+  absl::BitGen rng;
+  std::shuffle(objects_to_span_idx.begin(), objects_to_span_idx.end(), rng);
+
+  // Return objects, a fraction at a time, each time checking that histogram is
+  // correct.
+  int total_returned = 0;
+  // Zero-based index of the highest bucket in `expected` below.
+  const int last_bucket = absl::bit_width(TypeParam::kObjectsPerSpan) - 1;
+  while (total_returned < num_objects_to_fetch) {
+    uint64_t size_to_pop = std::min(objects_to_span_idx.size() - total_returned,
+                                    TypeParam::kBatchSize);
+
+    for (int i = 0; i < size_to_pop; ++i) {
+      // This span_idx shadows the outer one; it is the span recorded for the
+      // object being returned.
+      const auto [ptr, span_idx] = objects_to_span_idx[i + total_returned];
+      batch[i] = ptr;
+      ASSERT(span_idx < kNumSpans);
+      --allocated_per_span[span_idx];
+    }
+    total_returned += size_to_pop;
+    e.central_freelist().InsertRange({batch, size_to_pop});
+
+    // Calculate expected histogram.
+    size_t expected[absl::bit_width(TypeParam::kObjectsPerSpan)] = {0};
+    for (int i = 0; i < kNumSpans; ++i) {
+      // If span has non-zero allocated objects, include it in the histogram.
+      if (allocated_per_span[i]) {
+        const size_t bucket = absl::bit_width(allocated_per_span[i]) - 1;
+        ASSERT(bucket <= last_bucket);
+        ++expected[bucket];
+      }
+    }
+
+    // Fetch number of spans logged in the histogram and compare it with the
+    // expected histogram that we calculated using the tracked allocated
+    // objects per span.
+    // NOTE(review): the loop stops at last_bucket, so expected[last_bucket]
+    // (bitwidth last_bucket + 1) is never compared -- confirm this is
+    // intended.
+    for (int i = 1; i <= last_bucket; ++i) {
+      EXPECT_EQ(e.central_freelist().NumSpansWith(i), expected[i - 1]);
+    }
+  }
+
+  // Since no span is live here, histogram must be empty.
+  for (int i = 1; i <= last_bucket; ++i) {
+    EXPECT_EQ(e.central_freelist().NumSpansWith(i), 0);
+  }
+}
+
+// Confirms that a call to RemoveRange returns at most kObjectsPerSpan objects
+// in cases when there are no non-empty spans in the central freelist. This
+// makes sure that we populate, and subsequently allocate from a single span.
+// This avoids memory regression due to multiple Populate calls observed in
+// b/225880278.
+TYPED_TEST_P(CentralFreeListTest, SinglePopulate) {
+  TypeParam e;
+  // Ask for ten spans' worth of objects from a freelist that holds no spans
+  // yet.
+  const int num_objects_to_fetch = 10 * TypeParam::kObjectsPerSpan;
+  void* objects[num_objects_to_fetch];
+  const size_t got =
+      e.central_freelist().RemoveRange(objects, num_objects_to_fetch);
+  // The request must be served from a single span, so at most
+  // kObjectsPerSpan objects come back.
+  EXPECT_GT(got, 0);
+  EXPECT_LE(got, TypeParam::kObjectsPerSpan);
+  // Hand everything back in batches of at most kBatchSize objects.
+  for (size_t returned = 0; returned < got;) {
+    const size_t to_return = std::min(got - returned, TypeParam::kBatchSize);
+    e.central_freelist().InsertRange({&objects[returned], to_return});
+    returned += to_return;
+  }
+}
+
+// Checks if we are indexing a span in the nonempty_ lists as expected.
+// One span's objects are fetched one at a time and then returned one at a
+// time; after each step the test checks which nonempty_ list holds the span.
+TYPED_TEST_P(CentralFreeListTest, MultiNonEmptyLists) {
+  TypeParam e;
+
+  ASSERT(kNumLists > 0);
+  const int num_objects_to_fetch = TypeParam::kObjectsPerSpan;
+  std::vector<void*> objects(num_objects_to_fetch);
+  size_t fetched = 0;
+  // The span is expected to start in the last list and move towards list 0 as
+  // more objects are allocated from it.
+  int expected_idx = kNumLists - 1;
+  int prev_bitwidth = 1;
+
+  // Fetch one object at a time from a span and confirm that the span is moved
+  // through the nonempty_ lists as we allocate more objects from it.
+  while (fetched < num_objects_to_fetch) {
+    // Try to fetch one object from the span.
+    int got = e.central_freelist().RemoveRange(&objects[fetched], 1);
+    fetched += got;
+    ASSERT(fetched);
+    size_t cur_bitwidth = absl::bit_width(fetched);
+    // We index nonempty_ lists based on log2(allocated) and so, we update the
+    // index when the bit_width changes.
+    if (cur_bitwidth != prev_bitwidth) {
+      // We ceil spans to nonempty_[0] when allocated objects from the span
+      // increases above 2^(kNumLists-1).
+      expected_idx = expected_idx > 0 ? expected_idx - 1 : 0;
+      prev_bitwidth = cur_bitwidth;
+    }
+    ASSERT(expected_idx >= 0);
+    ASSERT(expected_idx < kNumLists);
+    if (fetched % num_objects_to_fetch == 0) {
+      // Span should have been removed from nonempty_ lists because we have
+      // allocated all the objects from it.
+      EXPECT_EQ(e.central_freelist().NumSpansInList(expected_idx), 0);
+    } else {
+      // Check that the span exists in the corresponding nonempty_ list.
+      EXPECT_EQ(e.central_freelist().NumSpansInList(expected_idx), 1);
+    }
+  }
+
+  // Similar to our previous test, we now make sure that the span is moved
+  // through the nonempty_ lists when we deallocate objects back to it.
+  size_t remaining = fetched;
+
+  // We ceil spans to nonempty_[0] when allocated objects from the span
+  // increases above 2^(kNumLists-1).
+  const size_t threshold = pow(2, kNumLists - 1);
+  // Returns objects[fetched - 1] down to objects[1]; objects[0] is returned
+  // after the loop.
+  while (--remaining > 0) {
+    // Return objects back to the span one at a time.
+    e.central_freelist().InsertRange({&objects[remaining], 1});
+    ASSERT(remaining);
+    const size_t cur_bitwidth = absl::bit_width(remaining);
+    // If we cross pow2 boundaries, update the expected index into nonempty_
+    // lists.
+    if (cur_bitwidth != prev_bitwidth) {
+      // When allocated objects are more than the threshold, the span is indexed
+      // to nonempty_ list 0.
+      expected_idx = remaining < threshold ? expected_idx + 1 : 0;
+      prev_bitwidth = cur_bitwidth;
+    }
+    EXPECT_LT(expected_idx, kNumLists);
+    EXPECT_EQ(e.central_freelist().NumSpansInList(expected_idx), 1);
+  }
+
+  // When the last object is returned, we release the span to the page heap. So,
+  // nonempty_[0] should also be empty.
+  e.central_freelist().InsertRange({&objects[remaining], 1});
+  EXPECT_EQ(e.central_freelist().NumSpansInList(0), 0);
+}
+
+// Checks if we are indexing a span in the nonempty_ lists as expected. We also
+// check if the spans are correctly being prioritized. That is, we create a
+// scenario where we have two live spans, and one span has more allocated
+// objects than the other span. On subsequent allocations, we confirm that the
+// objects are allocated from the span with a higher number of allocated objects
+// as enforced by our prioritization scheme.
+TYPED_TEST_P(CentralFreeListTest, SpanPriority) {
+  TypeParam e;
+
+  // If the number of objects per span is less than 3, we do not use more than
+  // one nonempty_ lists. So, we can not prioritize the spans based on how many
+  // objects were allocated from them.
+  const int objects_per_span = TypeParam::kObjectsPerSpan;
+  if (objects_per_span < 3 || kNumLists < 2) return;
+
+  constexpr int kNumSpans = 2;
+
+  // Track objects allocated per span.
+  absl::FixedArray<std::vector<void*>> objects(kNumSpans);
+  void* batch[kMaxObjectsToMove];
+
+  const size_t to_fetch = objects_per_span;
+  // Allocate all objects from kNumSpans.
+  for (int span = 0; span < kNumSpans; ++span) {
+    size_t fetched = 0;
+    while (fetched < to_fetch) {
+      const size_t n = to_fetch - fetched;
+      int got = e.central_freelist().RemoveRange(
+          batch, std::min(n, TypeParam::kBatchSize));
+      for (int i = 0; i < got; ++i) {
+        objects[span].push_back(batch[i]);
+      }
+      fetched += got;
+    }
+  }
+
+  // Perform deallocations so that each span contains only two objects.
+  size_t to_release = to_fetch - 2;
+  for (int span = 0; span < kNumSpans; ++span) {
+    size_t released = 0;
+    while (released < to_release) {
+      uint64_t n = std::min(to_release - released, TypeParam::kBatchSize);
+      for (int i = 0; i < n; ++i) {
+        batch[i] = objects[span][i + released];
+      }
+      released += n;
+      e.central_freelist().InsertRange({batch, n});
+    }
+    // Drop the released objects from the tracking vector so that it holds only
+    // the objects that are still allocated.
+    objects[span].erase(objects[span].begin(),
+                        objects[span].begin() + released);
+  }
+
+  // Make sure we have kNumSpans in the expected second-last nonempty_ list.
+  EXPECT_EQ(e.central_freelist().NumSpansInList(kNumLists - 2), kNumSpans);
+
+  // Release an additional object from all but one spans so that they are
+  // deprioritized for subsequent allocations.
+  to_release = 1;
+  // Starts at span 1, so span 0 keeps its two allocated objects.
+  for (int span = 1; span < kNumSpans; ++span) {
+    size_t released = 0;
+    while (released < to_release) {
+      uint64_t n = std::min(to_release - released, TypeParam::kBatchSize);
+      for (int i = 0; i < n; ++i) {
+        batch[i] = objects[span][i + released];
+      }
+      released += n;
+      e.central_freelist().InsertRange({batch, n});
+    }
+    objects[span].erase(objects[span].begin(),
+                        objects[span].begin() + released);
+  }
+
+  // Make sure we have kNumSpans-1 spans in the last nonempty_ list and just one
+  // span in the second-last list.
+  EXPECT_EQ(e.central_freelist().NumSpansInList(kNumLists - 1), kNumSpans - 1);
+  EXPECT_EQ(e.central_freelist().NumSpansInList(kNumLists - 2), 1);
+
+  // Allocate one object to ensure that it is being allocated from the span with
+  // the highest number of allocated objects.
+  int got = e.central_freelist().RemoveRange(batch, 1);
+  EXPECT_EQ(got, 1);
+  // Number of spans in the last nonempty_ list should be unchanged (i.e.
+  // kNumSpans-1).
+  EXPECT_EQ(e.central_freelist().NumSpansInList(kNumLists - 1), kNumSpans - 1);
+  // We should have only one span in the second-last nonempty_ list; this is the
+  // span from which we should have allocated the last object.
+  EXPECT_EQ(e.central_freelist().NumSpansInList(kNumLists - 2), 1);
+  // Return previously allocated object.
+  e.central_freelist().InsertRange({batch, 1});
+
+  // Return rest of the objects.
+  for (int span = 0; span < kNumSpans; ++span) {
+    for (int i = 0; i < objects[span].size(); ++i) {
+      e.central_freelist().InsertRange({&objects[span][i], 1});
+    }
+  }
+}
+
+// Fetches kNumSpans spans' worth of objects, returns them in random order,
+// and checks the span stats and utilization histogram at the start, once
+// about halfway through the returns, and at the end.
+TYPED_TEST_P(CentralFreeListTest, MultipleSpans) {
+  TypeParam e;
+  std::vector<void*> all_objects;
+
+  constexpr size_t kNumSpans = 10;
+
+  // Request kNumSpans spans.
+  void* batch[kMaxObjectsToMove];
+  const int num_objects_to_fetch = kNumSpans * TypeParam::kObjectsPerSpan;
+  int total_fetched = 0;
+  while (total_fetched < num_objects_to_fetch) {
+    size_t n = num_objects_to_fetch - total_fetched;
+    int got = e.central_freelist().RemoveRange(
+        batch, std::min(n, TypeParam::kBatchSize));
+    for (int i = 0; i < got; ++i) {
+      all_objects.push_back(batch[i]);
+    }
+    total_fetched += got;
+  }
+
+  // We should have kNumSpans spans in the histogram with number of
+  // allocated objects equal to TypeParam::kObjectsPerSpan (i.e. in the last
+  // bucket). Rest of the buckets should be empty.
+  const int expected_bitwidth = absl::bit_width(TypeParam::kObjectsPerSpan);
+  EXPECT_EQ(e.central_freelist().NumSpansWith(expected_bitwidth), kNumSpans);
+  for (int i = 1; i < expected_bitwidth; ++i) {
+    EXPECT_EQ(e.central_freelist().NumSpansWith(i), 0);
+  }
+
+  SpanStats stats = e.central_freelist().GetSpanStats();
+  EXPECT_EQ(stats.num_spans_requested, kNumSpans);
+  EXPECT_EQ(stats.num_spans_returned, 0);
+
+  EXPECT_EQ(all_objects.size(), num_objects_to_fetch);
+
+  // Shuffle
+  absl::BitGen rng;
+  std::shuffle(all_objects.begin(), all_objects.end(), rng);
+
+  // Return all
+  int total_returned = 0;
+  // Ensures the halfway checks below run exactly once.
+  bool checked_half = false;
+  while (total_returned < num_objects_to_fetch) {
+    uint64_t size_to_pop =
+        std::min(all_objects.size() - total_returned, TypeParam::kBatchSize);
+    for (int i = 0; i < size_to_pop; ++i) {
+      batch[i] = all_objects[i + total_returned];
+    }
+    total_returned += size_to_pop;
+    e.central_freelist().InsertRange({batch, size_to_pop});
+    // sanity check
+    if (!checked_half && total_returned >= (num_objects_to_fetch / 2)) {
+      stats = e.central_freelist().GetSpanStats();
+      EXPECT_GT(stats.num_spans_requested, stats.num_spans_returned);
+      EXPECT_NE(stats.obj_capacity, 0);
+      // Total spans recorded in the histogram must be equal to the number of
+      // live spans.
+      size_t spans_in_histogram = 0;
+      for (int i = 1; i <= absl::bit_width(TypeParam::kObjectsPerSpan); ++i) {
+        spans_in_histogram += e.central_freelist().NumSpansWith(i);
+      }
+      EXPECT_EQ(spans_in_histogram, stats.num_live_spans());
+      checked_half = true;
+    }
+  }
+
+  // Every object is back, so every requested span should have been returned.
+  stats = e.central_freelist().GetSpanStats();
+  EXPECT_EQ(stats.num_spans_requested, stats.num_spans_returned);
+  // Since no span is live, histogram must be empty.
+  for (int i = 1; i <= absl::bit_width(TypeParam::kObjectsPerSpan); ++i) {
+    EXPECT_EQ(e.central_freelist().NumSpansWith(i), 0);
+  }
+  EXPECT_EQ(stats.obj_capacity, 0);
+}
+
+// Verifies that the central freelist passes the size class's objects-per-span
+// count to the forwarder when a span is allocated and when it is deallocated,
+// independently of how many objects the caller fetches from that span.
+TYPED_TEST_P(CentralFreeListTest, PassSpanObjectCountToPageheap) {
+  ASSERT_GT(TypeParam::kObjectsPerSpan, 1);
+  // Runs one scenario in a fresh environment: fetch `num_objects` objects
+  // (capped at a single batch), then return all of them again.
+  auto test_function = [&](size_t num_objects) {
+    TypeParam e;
+    std::vector<void*> objects(TypeParam::kObjectsPerSpan);
+    EXPECT_CALL(
+        e.forwarder(),
+        AllocateSpan(testing::_, TypeParam::kObjectsPerSpan, testing::_))
+        .Times(1);
+    // Honor the requested object count.  Previously this used
+    // kObjectsPerSpan unconditionally, which left `num_objects` unused and
+    // made both invocations below exercise the identical scenario.
+    const size_t to_fetch = std::min(num_objects, TypeParam::kBatchSize);
+    const size_t fetched =
+        e.central_freelist().RemoveRange(&objects[0], to_fetch);
+    size_t returned = 0;
+    // `fetched` never exceeds one batch, so this loop body runs at most once
+    // and registers a single DeallocateSpans expectation.
+    while (returned < fetched) {
+      EXPECT_CALL(
+          e.forwarder(),
+          DeallocateSpans(testing::_, TypeParam::kObjectsPerSpan, testing::_))
+          .Times(1);
+      const size_t to_return =
+          std::min(fetched - returned, TypeParam::kBatchSize);
+      e.central_freelist().InsertRange({&objects[returned], to_return});
+      returned += to_return;
+    }
+  };
+  // A single object and a full span's worth (subject to the batch cap).
+  test_function(1);
+  test_function(TypeParam::kObjectsPerSpan);
+}
+
+// Calls Span::Fragmentation() on a live span while another worker allocates
+// from and returns objects to the same central freelist.
+TYPED_TEST_P(CentralFreeListTest, SpanFragmentation) {
+  // This test is primarily exercising Span itself to model how tcmalloc.cc uses
+  // it, but this gives us a self-contained (and sanitizable) implementation of
+  // the CentralFreeList.
+  TypeParam e;
+
+  // Allocate one object from the CFL to allocate a span.
+  void* initial;
+  int got = e.central_freelist().RemoveRange(&initial, 1);
+  ASSERT_EQ(got, 1);
+
+  // Look up the span backing `initial` and the object size of the size class
+  // under test; both are inputs to the fragmentation query below.
+  Span* const span = e.central_freelist().forwarder().MapObjectToSpan(initial);
+  const size_t object_size =
+      e.central_freelist().forwarder().class_to_size(TypeParam::kSizeClass);
+
+  // Worker 1: query the span's fragmentation.
+  // NOTE(review): presumably ThreadManager invokes the callback repeatedly
+  // until Stop() is called -- confirm against its definition.
+  ThreadManager fragmentation;
+  fragmentation.Start(1, [&](int) {
+    benchmark::DoNotOptimize(span->Fragmentation(object_size));
+  });
+
+  // Worker 2: take one object from the freelist and hand back whatever was
+  // obtained (`got` may be less than 1, hence the sized span).
+  ThreadManager cfl;
+  cfl.Start(1, [&](int) {
+    void* next;
+    int got = e.central_freelist().RemoveRange(&next, 1);
+    e.central_freelist().InsertRange(absl::MakeSpan(&next, got));
+  });
+
+  // Give both workers a short window to run concurrently.
+  absl::SleepFor(absl::Seconds(0.1));
+
+  fragmentation.Stop();
+  cfl.Stop();
+
+  // Return the object fetched at the start of the test.
+  e.central_freelist().InsertRange(absl::MakeSpan(&initial, 1));
+}
+
+// Registers the type-parameterized test cases of CentralFreeListTest by name
+// so that they can be instantiated for concrete environment types.
+REGISTER_TYPED_TEST_SUITE_P(CentralFreeListTest, IsolatedSmoke,
+                            MultiNonEmptyLists, SpanPriority,
+                            SpanUtilizationHistogram, MultipleSpans,
+                            SinglePopulate, PassSpanObjectCountToPageheap,
+                            SpanFragmentation);
+
+// Instantiates the CentralFreeListTest suite for a fake environment that wraps
+// a CentralFreeList parameterized on MockStaticForwarder.
+namespace unit_tests {
+
+// Environment type the typed tests receive as TypeParam.
+using Env = FakeCentralFreeListEnvironment<
+    central_freelist_internal::CentralFreeList<MockStaticForwarder>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(CentralFreeList, CentralFreeListTest,
+                               ::testing::Types<Env>);
+
+}  // namespace unit_tests
+
+}  // namespace
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
--- a/src/third_party/tcmalloc/dist/tcmalloc/common.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/common.cc
@ -0,0 +1,53 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tcmalloc/common.h"
+
+#include <algorithm>
+
+#include "tcmalloc/experiment.h"
+#include "tcmalloc/internal/environment.h"
+#include "tcmalloc/internal/optimization.h"
+#include "tcmalloc/pages.h"
+#include "tcmalloc/sampler.h"
+#include "tcmalloc/span.h"
+
+GOOGLE_MALLOC_SECTION_BEGIN
+namespace tcmalloc {
+namespace tcmalloc_internal {
+
+// Returns the upper-case label string for `tag`.
+//
+// `tag` must be one of the enumerators handled below; any other value is
+// declared impossible via ASSUME.
+absl::string_view MemoryTagToLabel(MemoryTag tag) {
+  switch (tag) {
+    case MemoryTag::kNormal:
+      return "NORMAL";
+    case MemoryTag::kNormalP1:
+      return "NORMAL_P1";
+    case MemoryTag::kSampled:
+      return "SAMPLED";
+    case MemoryTag::kCold:
+      return "COLD";
+    default:
+      ASSUME(false);
+      // Same pattern as NumaNormalTag() in common.h: state explicitly that
+      // control never reaches the end of this value-returning function,
+      // however ASSUME happens to expand in the current build mode.
+      __builtin_unreachable();
+  }
+}
+
+// C-linkage entry point that forwards to IsColdMemory(ptr).
+//
+// This only provides a correct answer for TCMalloc-allocated memory,
+// and may give a false positive for a non-allocated block.
+extern "C" bool TCMalloc_Internal_PossiblyCold(const void* ptr) {
+  return IsColdMemory(ptr);
+}
+
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
+GOOGLE_MALLOC_SECTION_END
--- a/src/third_party/tcmalloc/dist/tcmalloc/common.h
+++ b/src/third_party/tcmalloc/dist/tcmalloc/common.h
@ -0,0 +1,380 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Common definitions for tcmalloc code.
+
+#ifndef TCMALLOC_COMMON_H_
+#define TCMALLOC_COMMON_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <limits>
+#include <new>
+#include <type_traits>
+
+#include "absl/base/attributes.h"
+#include "absl/base/dynamic_annotations.h"
+#include "absl/base/internal/spinlock.h"
+#include "absl/base/macros.h"
+#include "absl/base/optimization.h"
+#include "absl/numeric/bits.h"
+#include "absl/strings/string_view.h"
+#include "absl/types/span.h"
+#include "tcmalloc/experiment.h"
+#include "tcmalloc/internal/config.h"
+#include "tcmalloc/internal/logging.h"
+#include "tcmalloc/internal/optimization.h"
+#include "tcmalloc/malloc_extension.h"
+#include "tcmalloc/size_class_info.h"
+
+GOOGLE_MALLOC_SECTION_BEGIN
+namespace tcmalloc {
+namespace tcmalloc_internal {
+
+static_assert(sizeof(void*) == 8);
+
+//-------------------------------------------------------------------
+// Configuration
+//-------------------------------------------------------------------
+
+// There are four different models for tcmalloc which are created by defining a
+// set of constant variables differently:
+//
+// DEFAULT:
+//   The default configuration strives for good performance while trying to
+//   minimize fragmentation.  It uses a smaller page size to reduce
+//   fragmentation, but allocates per-thread and per-cpu capacities similar to
+//   TCMALLOC_LARGE_PAGES / TCMALLOC_256K_PAGES.
+//
+// TCMALLOC_LARGE_PAGES:
+//   Larger page sizes increase the bookkeeping granularity used by TCMalloc for
+//   its allocations.  This can reduce PageMap size and traffic to the
+//   innermost cache (the page heap), but can increase memory footprints.  As
+//   TCMalloc will not reuse a page for a different allocation size until the
+//   entire page is deallocated, this can be a source of increased memory
+//   fragmentation.
+//
+//   Historically, larger page sizes improved lookup performance for the
+//   pointer-to-size lookup in the PageMap that was part of the critical path.
+//   With most deallocations leveraging C++14's sized delete feature
+//   (https://isocpp.org/files/papers/n3778.html), this optimization is less
+//   significant.
+//
+// TCMALLOC_256K_PAGES
+//   This configuration uses an even larger page size (256KB) as the unit of
+//   accounting granularity.
+//
+// TCMALLOC_SMALL_BUT_SLOW:
+//   Used for situations where minimizing the memory footprint is the most
+//   desirable attribute, even at the cost of performance.
+//
+// The constants that vary between models are:
+//
+//   kPageShift - Shift amount used to compute the page size.
+//   kNumBaseClasses - Number of size classes serviced by bucket allocators
+//   kMaxSize - Maximum size serviced by bucket allocators (thread/cpu/central)
+//   kMinThreadCacheSize - The minimum size in bytes of each ThreadCache.
+//   kMaxThreadCacheSize - The maximum size in bytes of each ThreadCache.
+//   kDefaultOverallThreadCacheSize - The maximum combined size in bytes of all
+//     ThreadCaches for an executable.
+//   kStealAmount - The number of bytes one ThreadCache will steal from another
+//     when the first ThreadCache is forced to Scavenge(), delaying the next
+//     call to Scavenge for this thread.
+
+// Older configurations had their own customized macros.  Convert them into
+// a page-shift parameter that is checked below.
+
+// Derive TCMALLOC_PAGE_SHIFT from the legacy model macros.  Defining
+// TCMALLOC_PAGE_SHIFT from the outside is rejected with an #error.
+#ifndef TCMALLOC_PAGE_SHIFT
+#ifdef TCMALLOC_SMALL_BUT_SLOW
+// 4 KiB pages (1 << 12).  This model additionally defines
+// TCMALLOC_USE_PAGEMAP3.
+#define TCMALLOC_PAGE_SHIFT 12
+#define TCMALLOC_USE_PAGEMAP3
+#elif defined(TCMALLOC_256K_PAGES)
+// 256 KiB pages (1 << 18).
+#define TCMALLOC_PAGE_SHIFT 18
+#elif defined(TCMALLOC_LARGE_PAGES)
+// 32 KiB pages (1 << 15).
+#define TCMALLOC_PAGE_SHIFT 15
+#else
+// Default model: 8 KiB pages (1 << 13).
+#define TCMALLOC_PAGE_SHIFT 13
+#endif
+#else
+#error "TCMALLOC_PAGE_SHIFT is an internal macro!"
+#endif
+
+// Per-model tuning constants, keyed on the page shift chosen above.  Every
+// branch defines the same set of names; an unknown shift is a compile error.
+#if TCMALLOC_PAGE_SHIFT == 12
+// TCMALLOC_SMALL_BUT_SLOW: the only model without expanded size classes.
+inline constexpr size_t kPageShift = 12;
+inline constexpr size_t kNumBaseClasses = 46;
+inline constexpr bool kHasExpandedClasses = false;
+inline constexpr size_t kMaxSize = 8 << 10;
+inline constexpr size_t kMinThreadCacheSize = 4 * 1024;
+inline constexpr size_t kMaxThreadCacheSize = 64 * 1024;
+inline constexpr size_t kMaxCpuCacheSize = 10 * 1024;
+inline constexpr size_t kDefaultOverallThreadCacheSize = kMaxThreadCacheSize;
+inline constexpr size_t kStealAmount = kMinThreadCacheSize;
+inline constexpr size_t kDefaultProfileSamplingRate = 1 << 19;
+inline constexpr size_t kMinPages = 2;
+#elif TCMALLOC_PAGE_SHIFT == 15
+// TCMALLOC_LARGE_PAGES.
+inline constexpr size_t kPageShift = 15;
+inline constexpr size_t kNumBaseClasses = 78;
+inline constexpr bool kHasExpandedClasses = true;
+inline constexpr size_t kMaxSize = 256 * 1024;
+inline constexpr size_t kMinThreadCacheSize = kMaxSize * 2;
+inline constexpr size_t kMaxThreadCacheSize = 4 << 20;
+// Floating-point expression converted to size_t (1.5 MiB).
+inline constexpr size_t kMaxCpuCacheSize = 1.5 * 1024 * 1024;
+inline constexpr size_t kDefaultOverallThreadCacheSize =
+    8u * kMaxThreadCacheSize;
+inline constexpr size_t kStealAmount = 1 << 16;
+inline constexpr size_t kDefaultProfileSamplingRate = 1 << 21;
+inline constexpr size_t kMinPages = 8;
+#elif TCMALLOC_PAGE_SHIFT == 18
+// TCMALLOC_256K_PAGES.
+inline constexpr size_t kPageShift = 18;
+inline constexpr size_t kNumBaseClasses = 89;
+inline constexpr bool kHasExpandedClasses = true;
+inline constexpr size_t kMaxSize = 256 * 1024;
+inline constexpr size_t kMinThreadCacheSize = kMaxSize * 2;
+inline constexpr size_t kMaxThreadCacheSize = 4 << 20;
+// Floating-point expression converted to size_t (1.5 MiB).
+inline constexpr size_t kMaxCpuCacheSize = 1.5 * 1024 * 1024;
+inline constexpr size_t kDefaultOverallThreadCacheSize =
+    8u * kMaxThreadCacheSize;
+inline constexpr size_t kStealAmount = 1 << 16;
+inline constexpr size_t kDefaultProfileSamplingRate = 1 << 21;
+inline constexpr size_t kMinPages = 8;
+#elif TCMALLOC_PAGE_SHIFT == 13
+// Default model.
+inline constexpr size_t kPageShift = 13;
+inline constexpr size_t kNumBaseClasses = 86;
+inline constexpr bool kHasExpandedClasses = true;
+inline constexpr size_t kMaxSize = 256 * 1024;
+inline constexpr size_t kMinThreadCacheSize = kMaxSize * 2;
+inline constexpr size_t kMaxThreadCacheSize = 4 << 20;
+// Floating-point expression converted to size_t (1.5 MiB).
+inline constexpr size_t kMaxCpuCacheSize = 1.5 * 1024 * 1024;
+inline constexpr size_t kDefaultOverallThreadCacheSize =
+    8u * kMaxThreadCacheSize;
+inline constexpr size_t kStealAmount = 1 << 16;
+inline constexpr size_t kDefaultProfileSamplingRate = 1 << 21;
+inline constexpr size_t kMinPages = 8;
+#else
+#error "Unsupported TCMALLOC_PAGE_SHIFT value!"
+#endif
+
+// Sanitizers constrain the memory layout which causes problems with the
+// enlarged tags required to represent NUMA partitions. Disable NUMA awareness
+// to avoid failing to mmap memory.
+//
+// kNumaPartitions is 2 only when TCMALLOC_NUMA_AWARE is defined and neither
+// MEMORY_SANITIZER nor THREAD_SANITIZER is; otherwise it is 1.
+#if defined(TCMALLOC_NUMA_AWARE) && !defined(MEMORY_SANITIZER) && \
+    !defined(THREAD_SANITIZER)
+inline constexpr size_t kNumaPartitions = 2;
+#else
+inline constexpr size_t kNumaPartitions = 1;
+#endif
+
+// We have copies of kNumBaseClasses size classes for each NUMA node, followed
+// by any expanded classes.
+//
+// Resulting index layout:
+//   [0, kExpandedClassesStart)            base classes, one copy per partition
+//   [kExpandedClassesStart, kNumClasses)  expanded classes (kNumBaseClasses of
+//                                         them when kHasExpandedClasses is set,
+//                                         none otherwise)
+inline constexpr size_t kExpandedClassesStart =
+    kNumBaseClasses * kNumaPartitions;
+inline constexpr size_t kNumClasses =
+    kExpandedClassesStart + (kHasExpandedClasses ? kNumBaseClasses : 0);
+
+// Size classes are often stored as uint32_t values, but there are some
+// situations where we need to store a size class with as compact a
+// representation as possible (e.g. in PageMap). Here we determine the integer
+// type to use in these situations - i.e. the smallest integer type large
+// enough to store values in the range [0,kNumClasses).
+constexpr size_t kMaxClass = kNumClasses - 1;
+// uint8_t when every class index fits in 8 bits, uint16_t otherwise.
+using CompactSizeClass =
+    std::conditional_t<kMaxClass <= std::numeric_limits<uint8_t>::max(),
+                       uint8_t, uint16_t>;
+
+// ~64K classes ought to be enough for anybody, but let's be sure.
+static_assert(kMaxClass <= std::numeric_limits<CompactSizeClass>::max());
+
+// Minimum/maximum number of batches in TransferCache per size class.
+// Actual numbers depend on a number of factors, see TransferCache::Init
+// for details.
+// NOTE(review): the constant names refer to objects moved rather than to a
+// number of batches, so the sentence above may belong to different
+// constants -- confirm against TransferCache::Init.
+inline constexpr size_t kMinObjectsToMove = 2;
+inline constexpr size_t kMaxObjectsToMove = 128;
+
+// Size in bytes of one TCMalloc page (2^kPageShift).
+inline constexpr size_t kPageSize = 1 << kPageShift;
+// Verify that the page size used is at least 8x smaller than the maximum
+// element size in the thread cache.  This guarantees at most 12.5% internal
+// fragmentation (1/8). When page size is 256k (kPageShift == 18), the benefit
+// of increasing kMaxSize to be multiple of kPageSize is unclear. Object size
+// profile data indicates that the number of simultaneously live objects (of
+// size >= 256k) tends to be very small. Keeping those objects as 'large'
+// objects won't cause too much memory waste, while heap memory reuse can be
+// improved. Increasing kMaxSize to be too large has another bad side effect --
+// the thread cache pressure is increased, which will in turn increase traffic
+// between central cache and thread cache, leading to performance degradation.
+//
+// The actual bound checked is the per-model kMinPages; page shifts of 18 and
+// above are exempt from the check.
+static_assert((kMaxSize / kPageSize) >= kMinPages || kPageShift >= 18,
+              "Ratio of kMaxSize / kPageSize is too small");
+
+// Alignment value of 8 bytes, expressed as std::align_val_t.
+inline constexpr std::align_val_t kAlignment{8};
+// log2 (kAlignment)
+// Computed as bit_width(kAlignment - 1), i.e. bit_width(7) == 3.
+inline constexpr size_t kAlignmentShift =
+    absl::bit_width(static_cast<size_t>(kAlignment) - 1u);
+
+// The number of times that a deallocation can cause a freelist to
+// go over its max_length() before shrinking max_length().
+inline constexpr int kMaxOverages = 3;
+
+// Maximum length we allow a per-thread free-list to have before we
+// move objects from it into the corresponding central free-list.  We
+// want this big to avoid locking the central free-list too often.  It
+// should not hurt to make this list somewhat big because the
+// scavenging code will shrink it down when its contents are not in use.
+inline constexpr int kMaxDynamicFreeListLength = 8192;
+
+// Tag stored in the address bits selected by kTagMask / kTagShift (see
+// GetMemoryTag()).  The values of kNormalP1 and kCold depend on whether more
+// than one NUMA partition is configured.
+enum class MemoryTag : uint8_t {
+  // Sampled, infrequently allocated
+  kSampled = 0x0,
+  // Not sampled, NUMA partition 0
+  kNormalP0 = 0x1,
+  // Not sampled, NUMA partition 1
+  // With a single partition this is 0xff, a value that does not fit in the
+  // 2-bit tag field used in that configuration.
+  kNormalP1 = (kNumaPartitions > 1) ? 0x2 : 0xff,
+  // Not sampled
+  kNormal = kNormalP0,
+  // Cold
+  // Takes the next bit above the normal-partition bits: 0x4 with NUMA
+  // partitions, 0x2 without.
+  kCold = (kNumaPartitions > 1) ? 0x4 : 0x2,
+};
+
+// We make kNormal and kCold disjoint so that IsCold implies IsSampled.  This
+// allows us to avoid modifying the fast delete path in any way when cold-tagged
+// memory allocations are absent.  We can overload the IsSampled check and then
+// do a second check for whether the possibly-sampled allocation is actually
+// IsCold.
+static_assert((static_cast<uint8_t>(MemoryTag::kNormal) &
+               static_cast<uint8_t>(MemoryTag::kCold)) == 0,
+              "kNormal and kCold should have disjoint bit patterns");
+
+// Bit position of the tag field within a pointer: kAddressBits - 4, capped
+// at 42.
+inline constexpr uintptr_t kTagShift = std::min(kAddressBits - 4, 42);
+// Mask selecting the tag field: three bits wide with multiple NUMA partitions,
+// two bits wide otherwise.
+inline constexpr uintptr_t kTagMask = uintptr_t{kNumaPartitions > 1 ? 0x7 : 0x3}
+                                      << kTagShift;
+
+// Reports whether the tag bits of `ptr` mark it as sampled, i.e. whether none
+// of the "normal" tag bits are set in its address.
+inline bool IsSampledMemory(const void* ptr) {
+  // Bits of the tag that identify a normal (non-sampled) partition.
+  constexpr uintptr_t kSampledNormalMask = (kNumaPartitions > 1) ? 0x3 : 0x1;
+
+  // Both normal tags must overlap the mask, otherwise they would be
+  // indistinguishable from kSampled below.
+  static_assert(static_cast<uintptr_t>(MemoryTag::kNormalP0) &
+                kSampledNormalMask);
+  static_assert(static_cast<uintptr_t>(MemoryTag::kNormalP1) &
+                kSampledNormalMask);
+
+  const uintptr_t address = reinterpret_cast<uintptr_t>(ptr);
+  const uintptr_t tag_bits = (address & kTagMask) >> kTagShift;
+  const uintptr_t normal_bits = tag_bits & kSampledNormalMask;
+  return normal_bits == static_cast<uintptr_t>(MemoryTag::kSampled);
+}
+
+// Returns the negation of IsSampledMemory(ptr).
+inline bool IsNormalMemory(const void* ptr) { return !IsSampledMemory(ptr); }
+
+// Reports whether the tag field of `ptr` equals MemoryTag::kCold.
+inline bool IsColdMemory(const void* ptr) {
+  const uintptr_t masked_address = reinterpret_cast<uintptr_t>(ptr) & kTagMask;
+  const uintptr_t cold_pattern = static_cast<uintptr_t>(MemoryTag::kCold)
+                                 << kTagShift;
+  bool r = (masked_address == cold_pattern);
+  // A cold pointer must also read as sampled.  That invariant is what lets
+  // the delete fast path stay free of extra branches while cold memory tags
+  // are not in use.
+  ASSERT(!r || IsSampledMemory(ptr));
+  return r;
+}
+
+// Compile-time query that returns kHasExpandedClasses.
+inline constexpr bool ColdFeatureActive() { return kHasExpandedClasses; }
+
+// Extracts the tag field from the address of `ptr` and returns it as a
+// MemoryTag.
+inline MemoryTag GetMemoryTag(const void* ptr) {
+  const uintptr_t address = reinterpret_cast<uintptr_t>(ptr);
+  const uintptr_t tag_bits = (address & kTagMask) >> kTagShift;
+  return static_cast<MemoryTag>(tag_bits);
+}
+
+absl::string_view MemoryTagToLabel(MemoryTag tag);
+
+// Reports whether `size_class` lies in the expanded range, i.e. at or above
+// kExpandedClassesStart.  Always false when expanded classes are compiled
+// out.
+inline constexpr bool IsExpandedSizeClass(unsigned size_class) {
+  if (!kHasExpandedClasses) {
+    return false;
+  }
+  return size_class >= kExpandedClassesStart;
+}
+
+// Minimum request sizes, chosen by build model and pointer width.
+#if !defined(TCMALLOC_SMALL_BUT_SLOW) && __SIZEOF_POINTER__ != 4
+// Always allocate at least a huge page
+inline constexpr size_t kMinSystemAlloc = kHugePageSize;
+inline constexpr size_t kMinMmapAlloc = 1 << 30;  // mmap() in 1GiB ranges.
+#else
+// Allocate in units of 2MiB. This is the size of a huge page for x86, but
+// not for Power.
+inline constexpr size_t kMinSystemAlloc = 2 << 20;
+// mmap() in units of 32MiB. This is a multiple of huge page size for
+// both x86 (2MiB) and Power (16MiB)
+inline constexpr size_t kMinMmapAlloc = 32 << 20;
+#endif
+
+// The mmap granularity must be a whole multiple of the system allocation
+// granularity.
+static_assert(kMinMmapAlloc % kMinSystemAlloc == 0,
+              "Minimum mmap allocation size is not a multiple of"
+              " minimum system allocation size");
+
+// Hot/cold classification of an allocation, as returned by
+// AccessFromPointer().
+enum class AllocationAccess {
+  kHot,
+  kCold,
+};
+
+// Reports whether `hint` falls in the cold half of the hint range: its value
+// converted to uint8_t is below 128.
+inline bool IsColdHint(hot_cold_t hint) {
+  const uint8_t hint_value = static_cast<uint8_t>(hint);
+  return hint_value < uint8_t{128};
+}
+
+// Classifies `ptr` as hot or cold based on its memory tag.  Without expanded
+// size classes every pointer is hot, and a cold tag is asserted never to
+// occur.
+inline AllocationAccess AccessFromPointer(void* ptr) {
+  if (kHasExpandedClasses) {
+    // Cold pointers are the unlikely case.
+    if (ABSL_PREDICT_FALSE(IsColdMemory(ptr))) {
+      return AllocationAccess::kCold;
+    }
+    return AllocationAccess::kHot;
+  }
+
+  ASSERT(!IsColdMemory(ptr));
+  return AllocationAccess::kHot;
+}
+
+// Maps a NUMA partition index (0 or 1) to the matching "normal" memory tag.
+// Any other index is declared impossible.
+inline MemoryTag NumaNormalTag(size_t numa_partition) {
+  if (numa_partition == 0) {
+    return MemoryTag::kNormalP0;
+  }
+  if (numa_partition == 1) {
+    return MemoryTag::kNormalP1;
+  }
+  ASSUME(false);
+  __builtin_unreachable();
+}
+
+// Returns the NUMA partition encoded in the tag of `ptr`: 1 for kNormalP1,
+// 0 for every other tag.  With a single partition the answer is always 0.
+inline size_t NumaPartitionFromPointer(void* ptr) {
+  if constexpr (kNumaPartitions == 1) {
+    return 0;
+  }
+
+  const bool in_partition_one = GetMemoryTag(ptr) == MemoryTag::kNormalP1;
+  return in_partition_one ? 1 : 0;
+}
+
+// Linker initialized, so this lock can be accessed at any time.
+// Note: `CpuCache::ResizeInfo::lock` must be taken before the `pageheap_lock`
+// if both are going to be held simultaneously.
+extern absl::base_internal::SpinLock pageheap_lock;
+
+// Returns the quotient a / b, or 0 when the divisor compares equal to zero.
+inline double safe_div(double a, double b) {
+  return (b == 0) ? 0. : a / b;
+}
+
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
+GOOGLE_MALLOC_SECTION_END
+
+#endif  // TCMALLOC_COMMON_H_
--- a/src/third_party/tcmalloc/dist/tcmalloc/copts.bzl
+++ b/src/third_party/tcmalloc/dist/tcmalloc/copts.bzl
@ -0,0 +1,47 @@
+# Copyright 2019 The TCMalloc Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This package provides default compiler warning flags for the OSS release"""
+
+# Warning flags used when the "//tcmalloc:llvm" condition matches: -Werror
+# plus the warnings that are explicitly switched off.
+TCMALLOC_LLVM_FLAGS = [
+    # Ensure TCMalloc itself builds without errors, even if its dependencies
+    # aren't necessarily -Werror clean.
+    "-Werror",
+    "-Wno-deprecated-declarations",
+    "-Wno-deprecated-volatile",
+    "-Wno-implicit-int-float-conversion",
+    "-Wno-sign-compare",
+    "-Wno-uninitialized",
+    "-Wno-unused-function",
+    "-Wno-unused-variable",
+]
+
+# Warning flags used for every configuration that does not match
+# "//tcmalloc:llvm": -Werror plus the warnings that are explicitly switched
+# off.
+TCMALLOC_GCC_FLAGS = [
+    # Ensure TCMalloc itself builds without errors, even if its dependencies
+    # aren't necessarily -Werror clean.
+    "-Werror",
+    "-Wno-attribute-alias",
+    "-Wno-sign-compare",
+    "-Wno-stringop-overflow",
+    "-Wno-uninitialized",
+    "-Wno-unused-function",
+    # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66425
+    "-Wno-unused-result",
+    "-Wno-unused-variable",
+]
+
+# Default copts: the LLVM flag list when "//tcmalloc:llvm" matches, the GCC
+# flag list for every other configuration.
+TCMALLOC_DEFAULT_COPTS = select({
+    "//tcmalloc:llvm": TCMALLOC_LLVM_FLAGS,
+    "//conditions:default": TCMALLOC_GCC_FLAGS,
+})
--- a/src/third_party/tcmalloc/dist/tcmalloc/cpu_cache.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/cpu_cache.cc
@ -0,0 +1,82 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tcmalloc/cpu_cache.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <algorithm>
+#include <atomic>
+
+#include "absl/base/dynamic_annotations.h"
+#include "absl/base/macros.h"
+#include "absl/base/thread_annotations.h"
+#include "absl/container/fixed_array.h"
+#include "tcmalloc/arena.h"
+#include "tcmalloc/common.h"
+#include "tcmalloc/internal/logging.h"
+#include "tcmalloc/internal_malloc_extension.h"
+#include "tcmalloc/parameters.h"
+#include "tcmalloc/static_vars.h"
+#include "tcmalloc/transfer_cache.h"
+
+GOOGLE_MALLOC_SECTION_BEGIN
+namespace tcmalloc {
+namespace tcmalloc_internal {
+
+// Switches the allocator to per-CPU caches if they are not active yet.
+//
+// Does nothing when the CPU cache is already active, or when either
+// Parameters::per_cpu_caches() or subtle::percpu::IsFast() is false.
+static void ActivatePerCpuCaches() {
+  if (tcmalloc::tcmalloc_internal::tc_globals.CpuCacheActive()) {
+    // Already active.
+    return;
+  }
+
+  if (Parameters::per_cpu_caches() && subtle::percpu::IsFast()) {
+    // The globals are initialized first, then the cache itself is activated,
+    // and only then is the global "CPU cache active" state set.
+    tc_globals.InitIfNecessary();
+    tc_globals.cpu_cache().Activate();
+    tc_globals.ActivateCpuCache();
+    // no need for this thread cache anymore, I guess.
+    ThreadCache::BecomeIdle();
+    // If there's a problem with this code, let's notice it right away:
+    ::operator delete(::operator new(1));
+  }
+}
+
+// Helper whose constructor calls ActivatePerCpuCaches().  The file-scope
+// instance below makes that call happen when this translation unit's static
+// objects are constructed.
+class PerCPUInitializer {
+ public:
+  PerCPUInitializer() {
+   ActivatePerCpuCaches();
+  }
+};
+static PerCPUInitializer module_enter_exit;
+
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
+GOOGLE_MALLOC_SECTION_END
+
+// C-linkage entry point that forwards to ActivatePerCpuCaches().
+extern "C" void TCMalloc_Internal_ForceCpuCacheActivation() {
+  tcmalloc::tcmalloc_internal::ActivatePerCpuCaches();
+}
+
+// C-linkage accessor that returns tc_globals.CpuCacheActive().
+extern "C" bool MallocExtension_Internal_GetPerCpuCachesActive() {
+  return tcmalloc::tcmalloc_internal::tc_globals.CpuCacheActive();
+}
+
+// C-linkage accessor that returns Parameters::max_per_cpu_cache_size().
+extern "C" int32_t MallocExtension_Internal_GetMaxPerCpuCacheSize() {
+  return tcmalloc::tcmalloc_internal::Parameters::max_per_cpu_cache_size();
+}
+
+// C-linkage setter that forwards `value` to
+// Parameters::set_max_per_cpu_cache_size().
+extern "C" void MallocExtension_Internal_SetMaxPerCpuCacheSize(int32_t value) {
+  tcmalloc::tcmalloc_internal::Parameters::set_max_per_cpu_cache_size(value);
+}
--- a/src/third_party/tcmalloc/dist/tcmalloc/cpu_cache.h
+++ b/src/third_party/tcmalloc/dist/tcmalloc/cpu_cache.h
--- a/src/third_party/tcmalloc/dist/tcmalloc/cpu_cache_activate_test.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/cpu_cache_activate_test.cc
@ -0,0 +1,86 @@
+// Copyright 2021 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdint.h>
+
+#include <string>
+#include <thread>  // NOLINT(build/c++11)
+
+#include "benchmark/benchmark.h"
+#include "gtest/gtest.h"
+#include "absl/base/internal/sysinfo.h"
+#include "absl/random/random.h"
+#include "absl/synchronization/notification.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "tcmalloc/cpu_cache.h"
+#include "tcmalloc/internal/percpu.h"
+#include "tcmalloc/internal_malloc_extension.h"
+#include "tcmalloc/static_vars.h"
+
+namespace tcmalloc {
+namespace tcmalloc_internal {
+namespace {
+
+// This test mutates global state, including triggering the activation of the
+// per-CPU caches.  It should not be run alongside other tests in the same
+// process that may rely on an isolated global instance.
+//
+// A background thread keeps querying the global CpuCache while the main
+// thread forces activation, so the accessors are exercised both before and
+// after the cache becomes active.
+TEST(CpuCacheActivateTest, GlobalInstance) {
+  // Nothing to test when subtle::percpu::IsFast() reports false.
+  if (!subtle::percpu::IsFast()) {
+    return;
+  }
+
+  CpuCache& cache = tc_globals.cpu_cache();
+
+  absl::Notification done;
+
+  // Reader thread: randomly picks one accessor per iteration until `done` is
+  // notified.  The per-CPU and total-usage queries are only issued once the
+  // cache reports active; otherwise CacheLimit() is called.
+  std::thread t([&]() {
+    const int num_cpus = absl::base_internal::NumCPUs();
+    absl::BitGen rng;
+
+    while (!done.HasBeenNotified()) {
+      const double coin = absl::Uniform(rng, 0., 1.);
+      const bool ready = tc_globals.CpuCacheActive();
+
+      if (ready && coin < 0.25) {
+        const int cpu = absl::Uniform(rng, 0, num_cpus);
+        benchmark::DoNotOptimize(cache.UsedBytes(cpu));
+      } else if (ready && coin < 0.5) {
+        const int cpu = absl::Uniform(rng, 0, num_cpus);
+        benchmark::DoNotOptimize(cache.Capacity(cpu));
+      } else if (ready && coin < 0.75) {
+        benchmark::DoNotOptimize(cache.TotalUsedBytes());
+      } else {
+        benchmark::DoNotOptimize(cache.CacheLimit());
+      }
+    }
+  });
+
+  // Trigger initialization of the CpuCache, confirming it was not initialized
+  // at the start of the test and is afterwards.
+  EXPECT_FALSE(tc_globals.CpuCacheActive());
+  // NOTE(review): a failing ASSERT_NE returns from the test body while `t` is
+  // still joinable -- confirm this assertion cannot fire in practice.
+  ASSERT_NE(&TCMalloc_Internal_ForceCpuCacheActivation, nullptr);
+  Parameters::set_per_cpu_caches(true);
+  TCMalloc_Internal_ForceCpuCacheActivation();
+  EXPECT_TRUE(tc_globals.CpuCacheActive());
+
+  // Let the reader thread run against the now-active cache for a while.
+  absl::SleepFor(absl::Seconds(0.2));
+
+  done.Notify();
+  t.join();
+}
+
+}  // namespace
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
--- a/src/third_party/tcmalloc/dist/tcmalloc/cpu_cache_test.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/cpu_cache_test.cc
--- a/src/third_party/tcmalloc/dist/tcmalloc/deallocation_profiler.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/deallocation_profiler.cc
@ -0,0 +1,606 @@
+// Copyright 2022 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tcmalloc/deallocation_profiler.h"
+
+#include <algorithm>
+#include <cmath>    // for std::lround
+#include <cstdint>  // for uintptr_t
+#include <functional>
+#include <limits>
+#include <memory>
+#include <string>  // for memset
+#include <type_traits>
+#include <utility>
+
+#include "absl/base/attributes.h"
+#include "absl/base/internal/low_level_alloc.h"
+#include "absl/base/internal/spinlock.h"
+#include "absl/base/internal/sysinfo.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/debugging/stacktrace.h"  // for GetStackTrace
+#include "absl/hash/hash.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "tcmalloc/internal/logging.h"
+#include "tcmalloc/internal/percpu.h"
+#include "tcmalloc/internal_malloc_extension.h"
+#include "tcmalloc/malloc_extension.h"
+#include "tcmalloc/sampled_allocation.h"
+#include "tcmalloc/static_vars.h"
+
+GOOGLE_MALLOC_SECTION_BEGIN
+namespace tcmalloc {
+namespace deallocationz {
+namespace {
+using ::absl::base_internal::SpinLock;
+using ::absl::base_internal::SpinLockHolder;
+
+// STL adaptor for an arena based allocator which provides the following:
+//   static void* Alloc::Allocate(size_t size);
+//   static void Alloc::Free(void* ptr, size_t size);
+template <typename T, class Alloc>
+class AllocAdaptor final {
+ public:
+  using value_type = T;
+
+  // The adaptor has no data members, so construction and copying do nothing.
+  AllocAdaptor() {}
+  AllocAdaptor(const AllocAdaptor&) {}
+
+  // Rebinding keeps the same backing Alloc and only changes the element type.
+  template <class T1>
+  using rebind = AllocAdaptor<T1, Alloc>;
+
+  // Conversion from an adaptor for another element type over the same Alloc.
+  template <class T1>
+  explicit AllocAdaptor(const AllocAdaptor<T1, Alloc>&) {}
+
+  // Returns storage for n objects of type T obtained from Alloc::Allocate().
+  T* allocate(size_t n) {
+    // Check if n is too big to allocate.
+    ASSERT((n * sizeof(T)) / sizeof(T) == n);
+    return static_cast<T*>(Alloc::Allocate(n * sizeof(T)));
+  }
+  // Hands p back to Alloc::Free() together with the byte size of n objects.
+  void deallocate(T* p, size_t n) { Alloc::Free(p, n * sizeof(T)); }
+
+  // There's no state, so these allocators are always equal
+  bool operator==(const AllocAdaptor&) const { return true; }
+  bool operator!=(const AllocAdaptor&) const { return false; }
+};
+
+const int64_t kMaxStackDepth = 64;
+
+// Stores stack traces and metadata for any allocation or deallocation
+// encountered by the profiler.
+struct DeallocationSampleRecord {
+  double weight = 0.0;
+  size_t requested_size = 0;
+  size_t requested_alignment = 0;
+  size_t allocated_size = 0;  // size after sizeclass/page rounding
+
+  int depth = 0;  // Number of PC values stored in array below
+  void* stack[kMaxStackDepth];
+
+  // creation_time is used to capture the life_time of sampled allocations
+  absl::Time creation_time;
+  int cpu_id = -1;
+  pid_t thread_id = 0;
+
+  // Hash over the first `depth` stack entries, then depth and the three size
+  // fields.  weight, creation_time, cpu_id and thread_id do not contribute.
+  template <typename H>
+  friend H AbslHashValue(H h, const DeallocationSampleRecord& c) {
+    H with_stack = H::combine_contiguous(std::move(h), c.stack, c.depth);
+    return H::combine(std::move(with_stack), c.depth, c.requested_size,
+                      c.requested_alignment, c.allocated_size);
+  }
+
+  // Two records are equal when depth and the size fields agree and the first
+  // `depth` stack entries match; this uses the same fields as the hash.
+  bool operator==(const DeallocationSampleRecord& other) const {
+    const bool same_shape = depth == other.depth &&
+                            requested_size == other.requested_size &&
+                            requested_alignment == other.requested_alignment &&
+                            allocated_size == other.allocated_size;
+    // Only compare the stacks once the depths are known to agree.
+    return same_shape && std::equal(stack, stack + depth, other.stack);
+  }
+};
+
+// Tracks whether an object was allocated/deallocated by the same CPU/thread.
+struct CpuThreadMatchingStatus {
+  // `value` packs both flags into two bits: bit 1 is the CPU match and bit 0
+  // is the thread match, giving a value in the range [0, 3].
+  constexpr CpuThreadMatchingStatus(bool cpu_matched, bool thread_matched)
+      : cpu_matched(cpu_matched),
+        thread_matched(thread_matched),
+        value((cpu_matched ? 2 : 0) | (thread_matched ? 1 : 0)) {}
+  bool cpu_matched;
+  bool thread_matched;
+  int value;
+};
+
+struct RpcMatchingStatus {
+  // Classifies a pair of RPC ids: 2 when either id is 0, otherwise 1 when the
+  // two ids are equal and 0 when they differ.
+  static constexpr int ComputeValue(uint64_t alloc, uint64_t dealloc) {
+    if (alloc == 0 || dealloc == 0) {
+      return 2;
+    }
+    return alloc == dealloc ? 1 : 0;
+  }
+
+  constexpr RpcMatchingStatus(uint64_t alloc, uint64_t dealloc)
+      : value(ComputeValue(alloc, dealloc)) {}
+
+  int value;
+};
+
+// Flattens a (CPU/thread status, RPC status) pair into a single array index:
+// each CPU/thread status value owns three consecutive slots, one per RPC
+// status value.
+int ComputeIndex(CpuThreadMatchingStatus status, RpcMatchingStatus rpc_status) {
+  const int first_slot = status.value * 3;
+  return first_slot + rpc_status.value;
+}
+
+// Every combination of CPU/thread matching status and RPC matching status.
+// Via RpcMatchingStatus::ComputeValue the id pairs used below evaluate to 2
+// for (0, 0), 0 for (1, 2) and 1 for (1, 1).
+constexpr std::pair<CpuThreadMatchingStatus, RpcMatchingStatus> kAllCases[] = {
+    // At least one RPC id is zero.
+    {CpuThreadMatchingStatus(false, false), RpcMatchingStatus(0, 0)},
+    {CpuThreadMatchingStatus(false, true), RpcMatchingStatus(0, 0)},
+    {CpuThreadMatchingStatus(true, false), RpcMatchingStatus(0, 0)},
+    {CpuThreadMatchingStatus(true, true), RpcMatchingStatus(0, 0)},
+
+    // Both RPC ids are non-zero and differ.
+    {CpuThreadMatchingStatus(false, false), RpcMatchingStatus(1, 2)},
+    {CpuThreadMatchingStatus(false, true), RpcMatchingStatus(1, 2)},
+    {CpuThreadMatchingStatus(true, false), RpcMatchingStatus(1, 2)},
+    {CpuThreadMatchingStatus(true, true), RpcMatchingStatus(1, 2)},
+
+    // Both RPC ids are non-zero and equal.
+    {CpuThreadMatchingStatus(false, false), RpcMatchingStatus(1, 1)},
+    {CpuThreadMatchingStatus(false, true), RpcMatchingStatus(1, 1)},
+    {CpuThreadMatchingStatus(true, false), RpcMatchingStatus(1, 1)},
+    {CpuThreadMatchingStatus(true, true), RpcMatchingStatus(1, 1)},
+};
+}  // namespace
+
+class DeallocationProfiler {
+ private:
+  // Arena and allocator used to back STL objects used by DeallocationProfiler
+  // Shared between all instances of DeallocationProfiler
+  // TODO(b/248332543): Use TCMalloc's own arena allocator instead of defining a
+  // new one here. The need for refcount management could be the reason for
+  // using a custom allocator in the first place.
+  class MyAllocator {
+   public:
+    // Allocates n bytes from the shared arena_.  Note that arena_ is read
+    // here without taking arena_lock_.
+    static void* Allocate(size_t n) {
+      return absl::base_internal::LowLevelAlloc::AllocWithArena(n, arena_);
+    }
+    // Releases p through LowLevelAlloc::Free(); the size argument is unused.
+    static void Free(const void* p, size_t /* n */) {
+      absl::base_internal::LowLevelAlloc::Free(const_cast<void*>(p));
+    }
+
+    // The lifetime of the arena is managed using a reference count and
+    // determined by how long at least one emitted Profile remains alive.
+    struct LowLevelArenaReference {
+      // Takes a reference; the arena is created when the count leaves zero.
+      LowLevelArenaReference() {
+        SpinLockHolder h(&arena_lock_);
+        if ((refcount_++) == 0) {
+          CHECK_CONDITION(arena_ == nullptr);
+          arena_ = absl::base_internal::LowLevelAlloc::NewArena(0);
+        }
+      }
+
+      // Drops a reference; the arena is deleted when the count reaches zero.
+      ~LowLevelArenaReference() {
+        SpinLockHolder h(&arena_lock_);
+        if ((--refcount_) == 0) {
+          CHECK_CONDITION(
+              absl::base_internal::LowLevelAlloc::DeleteArena(arena_));
+          arena_ = nullptr;
+        }
+      }
+    };
+
+   private:
+    // We need to protect the arena with a mutex and ensure that every thread
+    // acquires that mutex before it uses the arena for the first time. Once
+    // it has acquired the mutex, it is guaranteed that arena won't change
+    // between that point in time and when the thread stops accessing it (as
+    // enforced by LowLevelArenaReference above).
+    ABSL_CONST_INIT static SpinLock arena_lock_;
+    static absl::base_internal::LowLevelAlloc::Arena* arena_;
+
+    // We assume that launching a new deallocation profiler takes too long
+    // to cause this to overflow within the sampling period. The reason this
+    // is not using std::shared_ptr is that we do not only need to protect the
+    // value of the reference count but also the pointer itself (and therefore
+    // need a separate mutex either way).
+    static uint32_t refcount_;
+  };
+
+  // This must be the first member of the class to be initialized. The
+  // underlying arena must stay alive as long as the profiler.
+  MyAllocator::LowLevelArenaReference arena_ref_;
+
+  // All active profilers are stored in a list.
+  DeallocationProfiler* next_;
+  DeallocationProfilerList* list_ = nullptr;
+  friend class DeallocationProfilerList;
+
+  using AllocsTable = absl::flat_hash_map<
+      tcmalloc_internal::AllocHandle, DeallocationSampleRecord,
+      absl::Hash<tcmalloc_internal::AllocHandle>,
+      std::equal_to<tcmalloc_internal::AllocHandle>,
+      AllocAdaptor<std::pair<const tcmalloc_internal::AllocHandle,
+                             DeallocationSampleRecord>,
+                   MyAllocator>>;
+
+  class DeallocationStackTraceTable final
+      : public tcmalloc_internal::ProfileBase {
+   public:
+    // We define the dtor to ensure it is placed in the desired text section.
+    ~DeallocationStackTraceTable() override = default;
+    // Folds one (allocation, deallocation) observation into the statistics
+    // kept for that pair of records.
+    void AddTrace(const DeallocationSampleRecord& alloc_trace,
+                  const DeallocationSampleRecord& dealloc_trace);
+
+    // Invokes func for the samples derived from the aggregated table.
+    void Iterate(
+        absl::FunctionRef<void(const Profile::Sample&)> func) const override;
+
+    ProfileType Type() const override {
+      return tcmalloc::ProfileType::kLifetimes;
+    }
+
+    // Time between construction of this table and StopAndRecord().
+    absl::Duration Duration() const override {
+      return stop_time_ - start_time_;
+    }
+
+    // Records the stop time and adds every allocation still in `allocs` as a
+    // trace paired with a placeholder deallocation record.
+    void StopAndRecord(const AllocsTable& allocs);
+
+   private:
+    // This must be the first member of the class to be initialized. The
+    // underlying arena must stay alive as long as the profile.
+    MyAllocator::LowLevelArenaReference arena_ref_;
+
+    static constexpr int kNumCases =
+        12;  // CPUthreadMatchingStatus({T,F},{T,F}) x RPCMatchingStatus
+
+    // Table key: the allocation record together with the deallocation record.
+    struct Key {
+      DeallocationSampleRecord alloc;
+      DeallocationSampleRecord dealloc;
+
+      Key(const DeallocationSampleRecord& alloc,
+          const DeallocationSampleRecord& dealloc)
+          : alloc(alloc), dealloc(dealloc) {}
+
+      template <typename H>
+      friend H AbslHashValue(H h, const Key& c) {
+        return H::combine(std::move(h), c.alloc, c.dealloc);
+      }
+
+      bool operator==(const Key& other) const {
+        return (alloc == other.alloc) && (dealloc == other.dealloc);
+      }
+    };
+
+    struct Value {
+      // For each possible case, we collect the repetition count and the
+      // average lifetime.  We also collect the minimum and maximum lifetimes,
+      // as well as the sum of squared differences from the mean (to
+      // calculate the standard deviation).
+      double counts[kNumCases] = {0.0};
+      double mean_life_times_ns[kNumCases] = {0.0};
+      double variance_life_times_ns[kNumCases] = {0.0};
+      double min_life_times_ns[kNumCases] = {0.0};
+      double max_life_times_ns[kNumCases] = {0.0};
+
+      // Minimums start at the largest double so the first observed lifetime
+      // always replaces them.
+      Value() {
+        std::fill_n(min_life_times_ns, kNumCases,
+                    std::numeric_limits<double>::max());
+      }
+    };
+
+    absl::flat_hash_map<Key, Value, absl::Hash<Key>, std::equal_to<Key>,
+                        AllocAdaptor<std::pair<const Key, Value>, MyAllocator>>
+        table_;
+
+    absl::Time start_time_ = absl::Now();
+    absl::Time stop_time_;
+  };
+
+  // Keep track of allocations that are in flight
+  AllocsTable allocs_;
+
+  // Table to store lifetime information collected by this profiler
+  std::unique_ptr<DeallocationStackTraceTable> reports_ = nullptr;
+
+ public:
+  // Creates the report table and registers this profiler with `list`.
+  explicit DeallocationProfiler(DeallocationProfilerList* list)
+      : list_(list),
+        reports_(std::make_unique<DeallocationStackTraceTable>()) {
+    list_->Add(this);
+  }
+
+  // Stops the profiler if it still owns its report table; the Profile
+  // returned by Stop() is discarded.
+  ~DeallocationProfiler() {
+    if (reports_) {
+      Stop();
+    }
+  }
+
+  // Unregisters the profiler, records the allocations that are still in
+  // flight and hands the report table over as a Profile.  Returns an empty
+  // Profile when there is no report table (reports_ is null).
+  const tcmalloc::Profile Stop() {
+    if (reports_ == nullptr) {
+      return tcmalloc::Profile();
+    }
+
+    // We first remove the profiler from the list to avoid racing with
+    // potential allocations which may modify the allocs_ table.
+    list_->Remove(this);
+    reports_->StopAndRecord(allocs_);
+    return tcmalloc_internal::ProfileAccessor::MakeProfile(
+        std::move(reports_));
+  }
+
+  // Records (or overwrites) the in-flight entry for a sampled allocation,
+  // keyed by the allocation handle carried in `stack_trace`.
+  void ReportMalloc(const tcmalloc_internal::StackTrace& stack_trace) {
+    // store sampled alloc in the hashmap
+    DeallocationSampleRecord& allocation =
+        allocs_[stack_trace.sampled_alloc_handle];
+
+    allocation.allocated_size = stack_trace.allocated_size;
+    allocation.requested_size = stack_trace.requested_size;
+    allocation.requested_alignment = stack_trace.requested_alignment;
+    // Clamp the depth to the capacity of allocation.stack and store the
+    // clamped value.  The recorded depth is later used to hash, compare and
+    // copy the stack, so it must never exceed the number of entries that
+    // were actually copied below.
+    const int64_t depth = std::max<int64_t>(
+        0, std::min(static_cast<int64_t>(stack_trace.depth), kMaxStackDepth));
+    allocation.depth = static_cast<int>(depth);
+    memcpy(allocation.stack, stack_trace.stack, sizeof(void*) * depth);
+    // TODO(mmaas): Do we need to worry about b/65384231 anymore?
+    allocation.creation_time = stack_trace.allocation_time;
+    allocation.cpu_id = tcmalloc_internal::subtle::percpu::GetCurrentCpu();
+    allocation.thread_id = absl::base_internal::GetTID();
+    // We divide by the requested size to obtain the number of allocations.
+    // The + 1 in the divisor keeps it non-zero for a zero-byte request.
+    // TODO(b/248332543): Consider using AllocatedBytes from sampler.h.
+    allocation.weight = static_cast<double>(stack_trace.weight) /
+                        (stack_trace.requested_size + 1);
+  }
+
+  // Pairs a sampled deallocation with its recorded allocation and adds the
+  // pair to the report.  Handles without a recorded allocation are ignored.
+  void ReportFree(tcmalloc_internal::AllocHandle handle) {
+    auto entry = allocs_.find(handle);
+    if (entry == allocs_.end()) {
+      // Handle the case that we observed the deallocation but not the
+      // allocation
+      return;
+    }
+
+    // Copy the record out first: the table entry is erased right away.
+    const DeallocationSampleRecord alloc_record = entry->second;
+    allocs_.erase(entry);
+
+    // The deallocation record inherits the size fields of the allocation and
+    // captures when, where and from which call stack the free happened.
+    DeallocationSampleRecord freed;
+    freed.requested_size = alloc_record.requested_size;
+    freed.requested_alignment = alloc_record.requested_alignment;
+    freed.allocated_size = alloc_record.allocated_size;
+    freed.creation_time = absl::Now();
+    freed.cpu_id = tcmalloc_internal::subtle::percpu::GetCurrentCpu();
+    freed.thread_id = absl::base_internal::GetTID();
+    freed.depth = absl::GetStackTrace(freed.stack, kMaxStackDepth, 1);
+
+    reports_->AddTrace(alloc_record, freed);
+  }
+};
+
+// Pushes `profiler` onto the front of the list and, still holding
+// profilers_lock_, replays every allocation known to the sampled allocation
+// recorder into it.
+void DeallocationProfilerList::Add(DeallocationProfiler* profiler) {
+  SpinLockHolder h(&profilers_lock_);
+  // The new profiler becomes the head; the previous head becomes its next_.
+  profiler->next_ = std::exchange(first_, profiler);
+
+  // Whenever a new profiler is created, we seed it with live allocations.
+  tcmalloc_internal::tc_globals.sampled_allocation_recorder().Iterate(
+      [profiler](const tcmalloc_internal::SampledAllocation& live) {
+        profiler->ReportMalloc(live.sampled_stack);
+      });
+}
+
+// Unlinks `profiler` from the list.  The list is very short and this is
+// nowhere near a hot path, so a linear walk is fine.  Reaching the end of
+// the list without finding `profiler` trips CHECK_CONDITION.
+void DeallocationProfilerList::Remove(DeallocationProfiler* profiler) {
+  SpinLockHolder h(&profilers_lock_);
+  // `slot` always addresses the pointer that refers to the node under
+  // inspection: first_ to begin with, then the next_ field of its predecessor.
+  DeallocationProfiler** slot = &first_;
+  while (*slot != profiler) {
+    CHECK_CONDITION(*slot != nullptr);
+    slot = &(*slot)->next_;
+  }
+  *slot = profiler->next_;
+}
+
+// Forwards a sampled allocation to every registered profiler while holding
+// profilers_lock_.
+void DeallocationProfilerList::ReportMalloc(
+    const tcmalloc_internal::StackTrace& stack_trace) {
+  SpinLockHolder h(&profilers_lock_);
+  for (DeallocationProfiler* p = first_; p != nullptr; p = p->next_) {
+    p->ReportMalloc(stack_trace);
+  }
+}
+
+// Forwards a sampled deallocation to every registered profiler while holding
+// profilers_lock_.
+void DeallocationProfilerList::ReportFree(
+    tcmalloc_internal::AllocHandle handle) {
+  SpinLockHolder h(&profilers_lock_);
+  for (DeallocationProfiler* p = first_; p != nullptr; p = p->next_) {
+    p->ReportFree(handle);
+  }
+}
+
+// Initialize static variables
+// No arena exists and no references are held until the first
+// LowLevelArenaReference is constructed.
+absl::base_internal::LowLevelAlloc::Arena*
+    DeallocationProfiler::MyAllocator::arena_ = nullptr;
+uint32_t DeallocationProfiler::MyAllocator::refcount_ = 0;
+// Constant-initialized spin lock that guards arena_ and refcount_.
+ABSL_CONST_INIT SpinLock DeallocationProfiler::MyAllocator::arena_lock_(
+    absl::kConstInit, absl::base_internal::SCHEDULE_KERNEL_ONLY);
+
+void DeallocationProfiler::DeallocationStackTraceTable::StopAndRecord(
+    const AllocsTable& allocs) {
+  stop_time_ = absl::Now();
+
+  // Allocations that are still live have no deallocation to pair with.  The
+  // table stores pairs, so each one is paired with a placeholder record that
+  // only carries the stop time (its depth stays 0).  This keeps the rest of
+  // the sample processing free of special cases and lets censored samples be
+  // aggregated exactly like regular deallocation samples.
+  DeallocationSampleRecord censored{};
+  censored.creation_time = stop_time_;
+
+  for (const auto& entry : allocs) {
+    AddTrace(entry.second, censored);
+  }
+}
+
+// Folds one (allocation, deallocation) observation into the statistics kept
+// for that pair of records: count, running mean, accumulated squared
+// deviation, minimum and maximum lifetime.
+void DeallocationProfiler::DeallocationStackTraceTable::AddTrace(
+    const DeallocationSampleRecord& alloc_trace,
+    const DeallocationSampleRecord& dealloc_trace) {
+  CpuThreadMatchingStatus status =
+      CpuThreadMatchingStatus(alloc_trace.cpu_id == dealloc_trace.cpu_id,
+                              alloc_trace.thread_id == dealloc_trace.thread_id);
+
+  // Initialize a default rpc matched status.
+  RpcMatchingStatus rpc_status(/*alloc=*/0, /*dealloc=*/0);
+
+  const int index = ComputeIndex(status, rpc_status);
+
+  // operator[] creates a default-constructed Value the first time a key is
+  // seen.
+  DeallocationStackTraceTable::Value& v =
+      table_[DeallocationStackTraceTable::Key(alloc_trace, dealloc_trace)];
+
+  // Lifetime is the gap between the two records' creation_time stamps.
+  const absl::Duration life_time =
+      dealloc_trace.creation_time - alloc_trace.creation_time;
+  double life_time_ns = absl::ToDoubleNanoseconds(life_time);
+
+  // Update mean and variance using Welford’s online algorithm.
+  // variance_life_times_ns accumulates the sum of squared deviations; it is
+  // not divided by the count here.
+  double old_mean_ns = v.mean_life_times_ns[index];
+  v.mean_life_times_ns[index] +=
+      (life_time_ns - old_mean_ns) / static_cast<double>(v.counts[index] + 1);
+  v.variance_life_times_ns[index] +=
+      (life_time_ns - v.mean_life_times_ns[index]) *
+      (v.mean_life_times_ns[index] - old_mean_ns);
+
+  v.min_life_times_ns[index] =
+      std::min(v.min_life_times_ns[index], life_time_ns);
+  v.max_life_times_ns[index] =
+      std::max(v.max_life_times_ns[index], life_time_ns);
+  // The count is bumped last: the mean update above relies on the old count.
+  v.counts[index]++;
+}
+
+// Walks the aggregated table and calls func for every populated matching
+// case of every key: once with the allocation stack and a positive count
+// and, unless the entry is censored, once more with the deallocation stack
+// and the negated count.
+void DeallocationProfiler::DeallocationStackTraceTable::Iterate(
+    absl::FunctionRef<void(const Profile::Sample&)> func) const {
+  // Incremented for every emitted case and stored in Sample::profile_id; the
+  // allocation and deallocation samples of one case share the same id.
+  uint64_t pair_id = 1;
+
+  for (auto& it : table_) {
+    const Key& k = it.first;
+    const Value& v = it.second;
+
+    // Report total bytes that are a multiple of the object size.
+    size_t allocated_size = k.alloc.allocated_size;
+
+    for (const auto& matching_case : kAllCases) {
+      const int index = ComputeIndex(matching_case.first, matching_case.second);
+      // Skip cases that were never observed for this key.
+      if (v.counts[index] == 0) {
+        continue;
+      }
+
+      // Scale the observation count by the allocation's weight, then round
+      // the byte total up to a whole number of objects.
+      uintptr_t bytes =
+          std::lround(v.counts[index] * k.alloc.weight * allocated_size);
+      int64_t count = (bytes + allocated_size - 1) / allocated_size;
+      int64_t sum = count * allocated_size;
+
+      // The variance should be >= 0, but it's not impossible that it drops
+      // below 0 for numerical reasons. We don't want to crash in this case,
+      // so we ensure to return 0 if this happens.
+      double stddev_life_time_ns =
+          sqrt(std::max(0.0, v.variance_life_times_ns[index] /
+                                 static_cast<double>((v.counts[index]))));
+
+      const auto bucketize = internal::LifetimeNsToBucketedDuration;
+      Profile::Sample sample{
+          .sum = sum,
+          .requested_size = k.alloc.requested_size,
+          .requested_alignment = k.alloc.requested_alignment,
+          .allocated_size = allocated_size,
+          .profile_id = pair_id++,
+          // Set the is_censored flag so that when we create a proto
+          // sample later we can treat the *_lifetime accordingly.
+          .is_censored = (k.dealloc.depth == 0),
+          .avg_lifetime = bucketize(v.mean_life_times_ns[index]),
+          .stddev_lifetime = bucketize(stddev_life_time_ns),
+          .min_lifetime = bucketize(v.min_life_times_ns[index]),
+          .max_lifetime = bucketize(v.max_life_times_ns[index])};
+      // Only set the cpu and thread matched flags if the sample is not
+      // censored.
+      if (!sample.is_censored) {
+        sample.allocator_deallocator_cpu_matched =
+            matching_case.first.cpu_matched;
+        sample.allocator_deallocator_thread_matched =
+            matching_case.first.thread_matched;
+      }
+
+      // first for allocation
+      sample.count = count;
+      sample.depth = k.alloc.depth;
+      std::copy(k.alloc.stack, k.alloc.stack + k.alloc.depth, sample.stack);
+      func(sample);
+
+      // If this is a right-censored allocation (i.e. we did not observe the
+      // deallocation) then do not emit a deallocation sample pair.
+      if (sample.is_censored) {
+        continue;
+      }
+
+      // second for deallocation
+      static_assert(
+          std::is_signed<decltype(tcmalloc::Profile::Sample::count)>::value,
+          "Deallocation samples are tagged with negative count values.");
+      sample.count = -1 * count;
+      sample.depth = k.dealloc.depth;
+      std::copy(k.dealloc.stack, k.dealloc.stack + k.dealloc.depth,
+                sample.stack);
+      func(sample);
+    }
+  }
+}
+
+// Starts profiling by creating a DeallocationProfiler attached to `list`.
+DeallocationSample::DeallocationSample(DeallocationProfilerList* list)
+    : profiler_(std::make_unique<DeallocationProfiler>(list)) {}
+
+// Stops the owned profiler, destroys it and returns the collected profile.
+// Returns an empty Profile when there is no profiler.
+tcmalloc::Profile DeallocationSample::Stop() && {
+  if (profiler_ == nullptr) {
+    return tcmalloc::Profile();
+  }
+
+  tcmalloc::Profile profile = profiler_->Stop();
+  profiler_.reset();
+  return profile;
+}
+
+namespace internal {
+
+// Buckets a lifetime given in nanoseconds:
+//   * values of at most 1ns (including negatives) become 1ns;
+//   * values below 1ms are rounded down to a power of ten (1ns, 10ns, ...,
+//     100us);
+//   * everything else is rounded down to a whole millisecond.
+absl::Duration LifetimeNsToBucketedDuration(double lifetime_ns) {
+  const bool below_one_ms = lifetime_ns < 1000000.0;
+  if (below_one_ms) {
+    if (lifetime_ns <= 1) {
+      // Avoid negatives.  We can't allocate in a negative amount of time or
+      // even as quickly as a nanosecond (microbenchmarks of
+      // allocation/deallocation in a tight loop are several nanoseconds), so
+      // results this small indicate probable clock skew or other confounding
+      // factors in the data.
+      return absl::Nanoseconds(1);
+    }
+
+    // Walk the decades [1, 10), [10, 100), ..., [100000, 1000000) and return
+    // the lower bound of the one that contains the lifetime.
+    uint64_t bucket_ns = 1;
+    while (bucket_ns < 1000000) {
+      const uint64_t upper_ns = bucket_ns * 10;
+      if (lifetime_ns < upper_ns) {
+        return absl::Nanoseconds(bucket_ns);
+      }
+      bucket_ns = upper_ns;
+    }
+  }
+
+  // Round down to nearest millisecond.
+  const uint64_t whole_ms = static_cast<uint64_t>(lifetime_ns / 1000000.0);
+  return absl::Nanoseconds(whole_ms * 1000000L);
+}
+
+}  // namespace internal
+}  // namespace deallocationz
+}  // namespace tcmalloc
+GOOGLE_MALLOC_SECTION_END
--- a/src/third_party/tcmalloc/dist/tcmalloc/deallocation_profiler.h
+++ b/src/third_party/tcmalloc/dist/tcmalloc/deallocation_profiler.h
@ -0,0 +1,67 @@
+// Copyright 2022 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef TCMALLOC_DEALLOCATION_PROFILER_H_
+#define TCMALLOC_DEALLOCATION_PROFILER_H_
+
+#include <memory>
+
+#include "absl/base/const_init.h"
+#include "absl/base/internal/spinlock.h"
+#include "tcmalloc/internal/config.h"
+#include "tcmalloc/internal/logging.h"
+#include "tcmalloc/malloc_extension.h"
+
+GOOGLE_MALLOC_SECTION_BEGIN
+namespace tcmalloc {
+namespace deallocationz {
+
+class DeallocationProfiler;
+
+// Singly linked list of the DeallocationProfiler instances that are
+// currently registered.  All four operations take profilers_lock_.
+class DeallocationProfilerList {
+ public:
+  constexpr DeallocationProfilerList() = default;
+
+  // Forward a sampled allocation / deallocation to every profiler in the
+  // list.
+  void ReportMalloc(const tcmalloc_internal::StackTrace& stack_trace);
+  void ReportFree(tcmalloc_internal::AllocHandle handle);
+  // Inserts `profiler` at the head of the list and seeds it with the
+  // allocations known to the sampled allocation recorder.
+  void Add(DeallocationProfiler* profiler);
+  // Unlinks `profiler`; it is checked to be present in the list.
+  void Remove(DeallocationProfiler* profiler);
+
+ private:
+  // Head of the list; nullptr when no profiler is registered.
+  DeallocationProfiler* first_ = nullptr;
+  absl::base_internal::SpinLock profilers_lock_{
+      absl::kConstInit, absl::base_internal::SCHEDULE_KERNEL_ONLY};
+};
+
+// Profiling token that owns a DeallocationProfiler for as long as the
+// profile is being collected.
+class DeallocationSample final
+    : public tcmalloc_internal::AllocationProfilingTokenBase {
+ public:
+  // Creates a profiler registered with `list`.
+  explicit DeallocationSample(DeallocationProfilerList* list);
+  // We define the dtor to ensure it is placed in the desired text section.
+  ~DeallocationSample() override = default;
+
+  // Stops the profiler, releases it and returns the collected profile; an
+  // empty Profile is returned when no profiler is owned.
+  tcmalloc::Profile Stop() && override;
+
+ private:
+  std::unique_ptr<DeallocationProfiler> profiler_;
+};
+
+namespace internal {
+absl::Duration LifetimeNsToBucketedDuration(double lifetime_ns);
+}  // namespace internal
+}  // namespace deallocationz
+}  // namespace tcmalloc
+GOOGLE_MALLOC_SECTION_END
+
+#endif  // TCMALLOC_DEALLOCATION_PROFILER_H_
--- a/src/third_party/tcmalloc/dist/tcmalloc/experiment.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/experiment.cc
@ -0,0 +1,140 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tcmalloc/experiment.h"
+
+#include <string.h>
+
+#include <algorithm>
+#include <string>
+
+#include "absl/base/macros.h"
+#include "absl/strings/match.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
+#include "tcmalloc/internal/environment.h"
+#include "tcmalloc/internal/logging.h"
+
+GOOGLE_MALLOC_SECTION_BEGIN
+namespace tcmalloc {
+namespace tcmalloc_internal {
+namespace {
+
+const char kDelimiter = ',';
+const char kExperiments[] = "BORG_EXPERIMENTS";
+const char kDisableExperiments[] = "BORG_DISABLE_EXPERIMENTS";
+constexpr absl::string_view kEnableAll = "enable-all-known-experiments";
+constexpr absl::string_view kDisableAll = "all";
+
+// Looks `label` up in the `experiments` table.  On a match the experiment id
+// is written to *exp and true is returned; otherwise *exp is left untouched
+// and false is returned.
+bool LookupExperimentID(absl::string_view label, Experiment* exp) {
+  for (const auto& entry : experiments) {
+    if (entry.name != label) {
+      continue;
+    }
+    *exp = entry.id;
+    return true;
+  }
+
+  return false;
+}
+
+// Returns the per-experiment activation table.  The table is filled once,
+// on the first call, from the BORG_EXPERIMENTS and BORG_DISABLE_EXPERIMENTS
+// environment variables; an unset variable is treated as an empty list.
+const bool* GetSelectedExperiments() {
+  static bool by_id[kNumExperiments];
+
+  static const bool* status = [&]() {
+    const char* enabled_env = thread_safe_getenv(kExperiments);
+    if (enabled_env == nullptr) {
+      enabled_env = "";
+    }
+    const char* disabled_env = thread_safe_getenv(kDisableExperiments);
+    if (disabled_env == nullptr) {
+      disabled_env = "";
+    }
+    return SelectExperiments(by_id, enabled_env, disabled_env);
+  }();
+  return status;
+}
+
+// Splits `labels` at every kDelimiter and calls f once per piece, in order.
+// Empty pieces are passed through as well, so an empty `labels` yields one
+// call with an empty token.
+template <typename F>
+void ParseExperiments(absl::string_view labels, F f) {
+  absl::string_view::size_type cursor = 0;
+  for (;;) {
+    const auto delimiter_at = labels.find(kDelimiter, cursor);
+    if (delimiter_at == absl::string_view::npos) {
+      // Last piece: everything from the cursor to the end of the string.
+      f(labels.substr(cursor));
+      return;
+    }
+
+    f(labels.substr(cursor, delimiter_at - cursor));
+    cursor = delimiter_at + 1;
+  }
+}
+
+}  // namespace
+
+// Fills buffer[0, kNumExperiments) from the two label lists and returns
+// `buffer`.  Order of application: start all-off (or all-on when `active` is
+// exactly the enable-all keyword), switch on every known label in `active`,
+// clear everything when `disabled` is exactly the disable-all keyword, then
+// switch off every known label in `disabled`.  Unknown labels are ignored.
+const bool* SelectExperiments(bool* buffer, absl::string_view active,
+                              absl::string_view disabled) {
+  std::fill(buffer, buffer + kNumExperiments, active == kEnableAll);
+
+  // Sets the entry of every known label in `labels` to `state`.
+  const auto set_listed = [buffer](absl::string_view labels, bool state) {
+    ParseExperiments(labels, [buffer, state](absl::string_view token) {
+      Experiment id;
+      if (LookupExperimentID(token, &id)) {
+        buffer[static_cast<int>(id)] = state;
+      }
+    });
+  };
+
+  set_listed(active, true);
+
+  if (disabled == kDisableAll) {
+    std::fill(buffer, buffer + kNumExperiments, false);
+  }
+  set_listed(disabled, false);
+
+  return buffer;
+}
+
+}  // namespace tcmalloc_internal
+
+// Returns the activation state recorded for `exp` in the table produced by
+// GetSelectedExperiments().  `exp` is asserted to be a real experiment id.
+bool IsExperimentActive(Experiment exp) {
+  const int index = static_cast<int>(exp);
+  ASSERT(index >= 0);
+  ASSERT(exp < Experiment::kMaxExperimentID);
+
+  const bool* selected = tcmalloc_internal::GetSelectedExperiments();
+  return selected[index];
+}
+
+// Returns the id of the experiment whose name equals `name`, or
+// absl::nullopt when the `experiments` table has no such entry.
+absl::optional<Experiment> FindExperimentByName(absl::string_view name) {
+  for (const auto& entry : experiments) {
+    if (name != entry.name) {
+      continue;
+    }
+    return entry.id;
+  }
+
+  return absl::nullopt;
+}
+
+// Calls `callback` once per entry of the `experiments` table, in table
+// order, with the experiment's name and whether it is currently active.
+void WalkExperiments(
+    absl::FunctionRef<void(absl::string_view name, bool active)> callback) {
+  for (const auto& entry : experiments) {
+    const bool active = IsExperimentActive(entry.id);
+    callback(entry.name, active);
+  }
+}
+
+}  // namespace tcmalloc
+GOOGLE_MALLOC_SECTION_END
--- a/src/third_party/tcmalloc/dist/tcmalloc/experiment.h
+++ b/src/third_party/tcmalloc/dist/tcmalloc/experiment.h
@ -0,0 +1,68 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef TCMALLOC_EXPERIMENT_H_
+#define TCMALLOC_EXPERIMENT_H_
+
+#include <stddef.h>
+
+#include <string>
+
+#include "absl/functional/function_ref.h"
+#include "absl/strings/string_view.h"
+#include "absl/types/optional.h"
+#include "tcmalloc/experiment_config.h"
+#include "tcmalloc/internal/config.h"
+
+// TCMalloc Experiment Controller
+//
+// This consumes environment variables to decide whether to activate experiments
+// to control TCMalloc behavior.  It avoids memory allocations when making
+// experiment decisions to allow experiments to be used in critical TCMalloc
+// initialization paths.
+//
+// If an experiment is causing difficulty, all experiments can be disabled by
+// setting the environment variable:
+//     BORG_DISABLE_EXPERIMENTS=all *or*
+//     BORG_DISABLE_EXPERIMENTS=BAD_EXPERIMENT_LABEL
+
+GOOGLE_MALLOC_SECTION_BEGIN
+namespace tcmalloc {
+namespace tcmalloc_internal {
+
+constexpr size_t kNumExperiments =
+    static_cast<size_t>(Experiment::kMaxExperimentID);
+
+// SelectExperiments parses the experiments enumerated by active and disabled
+// and updates buffer[experiment_id] accordingly.
+//
+// buffer must be sized for kMaxExperimentID entries.
+//
+// This is exposed for testing purposes only.
+const bool* SelectExperiments(bool* buffer, absl::string_view active,
+                              absl::string_view disabled);
+
+}  // namespace tcmalloc_internal
+
+// Returns whether `exp` is active according to the BORG_EXPERIMENTS and
+// BORG_DISABLE_EXPERIMENTS environment variables.
+bool IsExperimentActive(Experiment exp);
+
+// Returns the experiment whose name equals `name`, or absl::nullopt if there
+// is none.
+absl::optional<Experiment> FindExperimentByName(absl::string_view name);
+
+// Invokes `callback` once for every known experiment with its name and its
+// current activation state.
+void WalkExperiments(
+    absl::FunctionRef<void(absl::string_view name, bool active)> callback);
+
+}  // namespace tcmalloc
+GOOGLE_MALLOC_SECTION_END
+
+#endif  // TCMALLOC_EXPERIMENT_H_
--- a/src/third_party/tcmalloc/dist/tcmalloc/experiment_config.h
+++ b/src/third_party/tcmalloc/dist/tcmalloc/experiment_config.h
@ -0,0 +1,55 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef TCMALLOC_EXPERIMENT_CONFIG_H_
+#define TCMALLOC_EXPERIMENT_CONFIG_H_
+
+#include "absl/strings/string_view.h"
+
+// Autogenerated by experiments_proto_test --experiments_generate_config=true
+namespace tcmalloc {
+
+// Identifiers for the known TCMalloc experiments.  No enumerator carries an
+// explicit value, so they are numbered consecutively from 0; kMaxExperimentID
+// is listed last and therefore equals the number of real experiments.
+enum class Experiment : int {
+  TEST_ONLY_TCMALLOC_POW2_SIZECLASS,
+  TEST_ONLY_TCMALLOC_SHARDED_TRANSFER_CACHE,
+  TEST_ONLY_TCMALLOC_GENERIC_SHARDED_TRANSFER_CACHE,
+  TEST_ONLY_TCMALLOC_USE_HUGE_REGIONS_MORE_OFTEN,
+  TEST_ONLY_TCMALLOC_SEPARATE_ALLOCS_FOR_FEW_AND_MANY_OBJECTS_SPANS,
+  TCMALLOC_GENERIC_SHARDED_TRANSFER_CACHE,
+  TCMALLOC_USE_HUGE_REGIONS_MORE_OFTEN,
+  TCMALLOC_SEPARATE_ALLOCS_FOR_FEW_AND_MANY_OBJECTS_SPANS2,
+  // Sentinel: not an experiment; must stay last.
+  kMaxExperimentID,
+};
+
+// Associates an experiment ID with its textual name.
+struct ExperimentConfig {
+  // Enumerator identifying the experiment.
+  Experiment id;
+  // Name of the experiment as a (non-owning) string view.
+  absl::string_view name;
+};
+
+// Table of all experiments: one entry per Experiment enumerator (the
+// kMaxExperimentID sentinel excluded), in enum order.  Each name is the
+// enumerator's identifier spelled out as a string.
+// clang-format off
+inline constexpr ExperimentConfig experiments[] = {
+    {Experiment::TEST_ONLY_TCMALLOC_POW2_SIZECLASS, "TEST_ONLY_TCMALLOC_POW2_SIZECLASS"},
+    {Experiment::TEST_ONLY_TCMALLOC_SHARDED_TRANSFER_CACHE, "TEST_ONLY_TCMALLOC_SHARDED_TRANSFER_CACHE"},
+    {Experiment::TEST_ONLY_TCMALLOC_GENERIC_SHARDED_TRANSFER_CACHE, "TEST_ONLY_TCMALLOC_GENERIC_SHARDED_TRANSFER_CACHE"},
+    {Experiment::TEST_ONLY_TCMALLOC_USE_HUGE_REGIONS_MORE_OFTEN, "TEST_ONLY_TCMALLOC_USE_HUGE_REGIONS_MORE_OFTEN"},
+    {Experiment::TEST_ONLY_TCMALLOC_SEPARATE_ALLOCS_FOR_FEW_AND_MANY_OBJECTS_SPANS, "TEST_ONLY_TCMALLOC_SEPARATE_ALLOCS_FOR_FEW_AND_MANY_OBJECTS_SPANS"},
+    {Experiment::TCMALLOC_GENERIC_SHARDED_TRANSFER_CACHE, "TCMALLOC_GENERIC_SHARDED_TRANSFER_CACHE"},
+    {Experiment::TCMALLOC_USE_HUGE_REGIONS_MORE_OFTEN, "TCMALLOC_USE_HUGE_REGIONS_MORE_OFTEN"},
+    {Experiment::TCMALLOC_SEPARATE_ALLOCS_FOR_FEW_AND_MANY_OBJECTS_SPANS2, "TCMALLOC_SEPARATE_ALLOCS_FOR_FEW_AND_MANY_OBJECTS_SPANS2"},
+};
+// clang-format on
+
+}  // namespace tcmalloc
+
+#endif  // TCMALLOC_EXPERIMENT_CONFIG_H_
--- a/src/third_party/tcmalloc/dist/tcmalloc/experiment_config_test.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/experiment_config_test.cc
@ -0,0 +1,31 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tcmalloc/experiment_config.h"
+
+#include "gtest/gtest.h"
+
+namespace tcmalloc {
+namespace {
+
+// Every entry of `experiments` must carry an ID in [0, kMaxExperimentID).
+TEST(ExperimentConfigTest, ValidateIDs) {
+  for (const ExperimentConfig& config : experiments) {
+    // Lower bound is checked on the underlying integer value.
+    const int raw_id = static_cast<int>(config.id);
+    ASSERT_LE(0, raw_id);
+    // Upper bound is checked on the enum itself; the sentinel is excluded.
+    ASSERT_LT(config.id, Experiment::kMaxExperimentID);
+  }
+}
+
+}  // namespace
+}  // namespace tcmalloc
--- a/src/third_party/tcmalloc/dist/tcmalloc/experiment_fuzz.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/experiment_fuzz.cc
@ -0,0 +1,38 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "absl/strings/string_view.h"
+#include "tcmalloc/experiment.h"
+
+// libFuzzer entry point.  The input is interpreted as "<active>[;<disabled>]":
+// everything before the first ';' is the active list and everything after it
+// is the disabled list.  Without a ';' the whole input is the active list and
+// the disabled list is empty.  Always returns 0.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* d, size_t size) {
+  const char* const input = reinterpret_cast<const char*>(d);
+  const char* const separator =
+      static_cast<const char*>(memchr(input, ';', size));
+
+  absl::string_view active(input, size);
+  absl::string_view disabled;
+  if (separator != nullptr) {
+    const size_t active_len = static_cast<size_t>(separator - input);
+    active = absl::string_view(input, active_len);
+    // Skip the ';' itself; the remainder may be empty.
+    disabled = absl::string_view(separator + 1, size - (active_len + 1));
+  }
+
+  bool buffer[tcmalloc::tcmalloc_internal::kNumExperiments];
+  tcmalloc::tcmalloc_internal::SelectExperiments(buffer, active, disabled);
+  return 0;
+}
--- a/src/third_party/tcmalloc/dist/tcmalloc/experimental_pow2_size_class.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/experimental_pow2_size_class.cc
@ -0,0 +1,240 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tcmalloc/common.h"
+#include "tcmalloc/sizemap.h"
+
+GOOGLE_MALLOC_SECTION_BEGIN
+namespace tcmalloc {
+
+namespace tcmalloc_internal {
+
+// <fixed> is fixed per-size-class overhead due to end-of-span fragmentation
+// and other factors. For instance, if we have a 96 byte size class, and use a
+// single 8KiB page, then we will hold 85 objects per span, and have 32 bytes
+// left over. There is also a fixed component of 48 bytes of TCMalloc metadata
+// per span. Together, the fixed overhead would be wasted/allocated =
+// (32 + 48) / (8192 - 32) ~= 0.98%.
+// There is also a dynamic component to overhead based on mismatches between the
+// number of bytes requested and the number of bytes provided by the size class.
+// Together they sum to the total overhead; for instance if you asked for a
+// 50-byte allocation that rounds up to a 64-byte size class, the dynamic
+// overhead would be 28%, and if <fixed> were 22% it would mean (on average)
+// 25 bytes of overhead for allocations of that size.
+
+// clang-format off
+// Exactly one table below is compiled in.  It is selected first by whether
+// aligned new is available with a default new alignment of at most 8, then by
+// TCMALLOC_PAGE_SHIFT (13, 15, 18 or 12; anything else is a build error).
+// Every table lists power-of-two sizes from 8 bytes up to kMaxSize, preceded
+// by a zero entry at index 0.
+// NOTE(review): in this revision the four tables in the #if branch are
+// identical, entry for entry, to the corresponding tables in the #else branch.
+#if defined(__cpp_aligned_new) && __STDCPP_DEFAULT_NEW_ALIGNMENT__ <= 8
+#if TCMALLOC_PAGE_SHIFT == 13
+static_assert(kMaxSize == 262144, "kMaxSize mismatch");
+static const int kCount = 17;
+static_assert(kCount <= kNumClasses);
+static constexpr SizeClassInfo kExperimentalPow2SizeClassesList[kCount] = {
+    // <bytes>, <pages>, <batch size>    <fixed>
+    {        0,       0,           0},  // +Inf%
+    {        8,       1,          32},  // 0.59%
+    {       16,       1,          32},  // 0.59%
+    {       32,       1,          32},  // 0.59%
+    {       64,       1,          32},  // 0.59%
+    {      128,       1,          32},  // 0.59%
+    {      256,       1,          32},  // 0.59%
+    {      512,       1,          32},  // 0.59%
+    {     1024,       1,          32},  // 0.59%
+    {     2048,       2,          32},  // 0.29%
+    {     4096,       1,          16},  // 0.59%
+    {     8192,       1,           8},  // 0.59%
+    {    16384,       2,           4},  // 0.29%
+    {    32768,       4,           2},  // 0.15%
+    {    65536,       8,           2},  // 0.07%
+    {   131072,      16,           2},  // 0.04%
+    {   262144,      32,           2},  // 0.02%
+};
+constexpr absl::Span<const SizeClassInfo> kExperimentalPow2SizeClasses(kExperimentalPow2SizeClassesList);
+#elif TCMALLOC_PAGE_SHIFT == 15
+static_assert(kMaxSize == 262144, "kMaxSize mismatch");
+static const int kCount = 17;
+static_assert(kCount <= kNumClasses);
+static constexpr SizeClassInfo kExperimentalPow2SizeClassesList[kCount] = {
+    // <bytes>, <pages>, <batch size>    <fixed>
+    {        0,       0,           0},  // +Inf%
+    {        8,       1,          32},  // 0.15%
+    {       16,       1,          32},  // 0.15%
+    {       32,       1,          32},  // 0.15%
+    {       64,       1,          32},  // 0.15%
+    {      128,       1,          32},  // 0.15%
+    {      256,       1,          32},  // 0.15%
+    {      512,       1,          32},  // 0.15%
+    {     1024,       1,          32},  // 0.15%
+    {     2048,       1,          32},  // 0.15%
+    {     4096,       1,          16},  // 0.15%
+    {     8192,       1,           8},  // 0.15%
+    {    16384,       1,           4},  // 0.15%
+    {    32768,       1,           2},  // 0.15%
+    {    65536,       2,           2},  // 0.07%
+    {   131072,       4,           2},  // 0.04%
+    {   262144,       8,           2},  // 0.02%
+};
+constexpr absl::Span<const SizeClassInfo> kExperimentalPow2SizeClasses(kExperimentalPow2SizeClassesList);
+#elif TCMALLOC_PAGE_SHIFT == 18
+static_assert(kMaxSize == 262144, "kMaxSize mismatch");
+static const int kCount = 17;
+static_assert(kCount <= kNumClasses);
+static constexpr SizeClassInfo kExperimentalPow2SizeClassesList[kCount] = {
+    // <bytes>, <pages>, <batch size>    <fixed>
+    {        0,       0,           0},  // +Inf%
+    {        8,       1,          32},  // 0.02%
+    {       16,       1,          32},  // 0.02%
+    {       32,       1,          32},  // 0.02%
+    {       64,       1,          32},  // 0.02%
+    {      128,       1,          32},  // 0.02%
+    {      256,       1,          32},  // 0.02%
+    {      512,       1,          32},  // 0.02%
+    {     1024,       1,          32},  // 0.02%
+    {     2048,       1,          32},  // 0.02%
+    {     4096,       1,          16},  // 0.02%
+    {     8192,       1,           8},  // 0.02%
+    {    16384,       1,           4},  // 0.02%
+    {    32768,       1,           2},  // 0.02%
+    {    65536,       1,           2},  // 0.02%
+    {   131072,       1,           2},  // 0.02%
+    {   262144,       1,           2},  // 0.02%
+};
+constexpr absl::Span<const SizeClassInfo> kExperimentalPow2SizeClasses(kExperimentalPow2SizeClassesList);
+#elif TCMALLOC_PAGE_SHIFT == 12
+static_assert(kMaxSize == 8192, "kMaxSize mismatch");
+static const int kCount = 12;
+static_assert(kCount <= kNumClasses);
+static constexpr SizeClassInfo kExperimentalPow2SizeClassesList[kCount] = {
+    // <bytes>, <pages>, <batch size>    <fixed>
+    {        0,       0,           0},  // +Inf%
+    {        8,       1,          32},  // 1.17%
+    {       16,       1,          32},  // 1.17%
+    {       32,       1,          32},  // 1.17%
+    {       64,       1,          32},  // 1.17%
+    {      128,       1,          32},  // 1.17%
+    {      256,       1,          32},  // 1.17%
+    {      512,       1,          32},  // 1.17%
+    {     1024,       2,          32},  // 0.59%
+    {     2048,       4,          32},  // 0.29%
+    {     4096,       4,          16},  // 0.29%
+    {     8192,       4,           8},  // 0.29%
+};
+constexpr absl::Span<const SizeClassInfo> kExperimentalPow2SizeClasses(kExperimentalPow2SizeClassesList);
+#else
+#error "Unsupported TCMALLOC_PAGE_SHIFT value!"
+#endif
+// Taken when the aligned-new condition above does not hold.
+#else
+#if TCMALLOC_PAGE_SHIFT == 13
+static_assert(kMaxSize == 262144, "kMaxSize mismatch");
+static const int kCount = 17;
+static_assert(kCount <= kNumClasses);
+static constexpr SizeClassInfo kExperimentalPow2SizeClassesList[kCount] = {
+    // <bytes>, <pages>, <batch size>    <fixed>
+    {        0,       0,           0},  // +Inf%
+    {        8,       1,          32},  // 0.59%
+    {       16,       1,          32},  // 0.59%
+    {       32,       1,          32},  // 0.59%
+    {       64,       1,          32},  // 0.59%
+    {      128,       1,          32},  // 0.59%
+    {      256,       1,          32},  // 0.59%
+    {      512,       1,          32},  // 0.59%
+    {     1024,       1,          32},  // 0.59%
+    {     2048,       2,          32},  // 0.29%
+    {     4096,       1,          16},  // 0.59%
+    {     8192,       1,           8},  // 0.59%
+    {    16384,       2,           4},  // 0.29%
+    {    32768,       4,           2},  // 0.15%
+    {    65536,       8,           2},  // 0.07%
+    {   131072,      16,           2},  // 0.04%
+    {   262144,      32,           2},  // 0.02%
+};
+constexpr absl::Span<const SizeClassInfo> kExperimentalPow2SizeClasses(kExperimentalPow2SizeClassesList);
+#elif TCMALLOC_PAGE_SHIFT == 15
+static_assert(kMaxSize == 262144, "kMaxSize mismatch");
+static const int kCount = 17;
+static_assert(kCount <= kNumClasses);
+static constexpr SizeClassInfo kExperimentalPow2SizeClassesList[kCount] = {
+    // <bytes>, <pages>, <batch size>    <fixed>
+    {        0,       0,           0},  // +Inf%
+    {        8,       1,          32},  // 0.15%
+    {       16,       1,          32},  // 0.15%
+    {       32,       1,          32},  // 0.15%
+    {       64,       1,          32},  // 0.15%
+    {      128,       1,          32},  // 0.15%
+    {      256,       1,          32},  // 0.15%
+    {      512,       1,          32},  // 0.15%
+    {     1024,       1,          32},  // 0.15%
+    {     2048,       1,          32},  // 0.15%
+    {     4096,       1,          16},  // 0.15%
+    {     8192,       1,           8},  // 0.15%
+    {    16384,       1,           4},  // 0.15%
+    {    32768,       1,           2},  // 0.15%
+    {    65536,       2,           2},  // 0.07%
+    {   131072,       4,           2},  // 0.04%
+    {   262144,       8,           2},  // 0.02%
+};
+constexpr absl::Span<const SizeClassInfo> kExperimentalPow2SizeClasses(kExperimentalPow2SizeClassesList);
+#elif TCMALLOC_PAGE_SHIFT == 18
+static_assert(kMaxSize == 262144, "kMaxSize mismatch");
+static const int kCount = 17;
+static_assert(kCount <= kNumClasses);
+static constexpr SizeClassInfo kExperimentalPow2SizeClassesList[kCount] = {
+    // <bytes>, <pages>, <batch size>    <fixed>
+    {        0,       0,           0},  // +Inf%
+    {        8,       1,          32},  // 0.02%
+    {       16,       1,          32},  // 0.02%
+    {       32,       1,          32},  // 0.02%
+    {       64,       1,          32},  // 0.02%
+    {      128,       1,          32},  // 0.02%
+    {      256,       1,          32},  // 0.02%
+    {      512,       1,          32},  // 0.02%
+    {     1024,       1,          32},  // 0.02%
+    {     2048,       1,          32},  // 0.02%
+    {     4096,       1,          16},  // 0.02%
+    {     8192,       1,           8},  // 0.02%
+    {    16384,       1,           4},  // 0.02%
+    {    32768,       1,           2},  // 0.02%
+    {    65536,       1,           2},  // 0.02%
+    {   131072,       1,           2},  // 0.02%
+    {   262144,       1,           2},  // 0.02%
+};
+constexpr absl::Span<const SizeClassInfo> kExperimentalPow2SizeClasses(kExperimentalPow2SizeClassesList);
+#elif TCMALLOC_PAGE_SHIFT == 12
+static_assert(kMaxSize == 8192, "kMaxSize mismatch");
+static const int kCount = 12;
+static_assert(kCount <= kNumClasses);
+static constexpr SizeClassInfo kExperimentalPow2SizeClassesList[kCount] = {
+    // <bytes>, <pages>, <batch size>    <fixed>
+    {        0,       0,           0},  // +Inf%
+    {        8,       1,          32},  // 1.17%
+    {       16,       1,          32},  // 1.17%
+    {       32,       1,          32},  // 1.17%
+    {       64,       1,          32},  // 1.17%
+    {      128,       1,          32},  // 1.17%
+    {      256,       1,          32},  // 1.17%
+    {      512,       1,          32},  // 1.17%
+    {     1024,       2,          32},  // 0.59%
+    {     2048,       4,          32},  // 0.29%
+    {     4096,       4,          16},  // 0.29%
+    {     8192,       4,           8},  // 0.29%
+};
+constexpr absl::Span<const SizeClassInfo> kExperimentalPow2SizeClasses(kExperimentalPow2SizeClassesList);
+#else
+#error "Unsupported TCMALLOC_PAGE_SHIFT value!"
+#endif
+#endif
+// clang-format on
+
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
+GOOGLE_MALLOC_SECTION_END
--- a/src/third_party/tcmalloc/dist/tcmalloc/explicitly_constructed.h
+++ b/src/third_party/tcmalloc/dist/tcmalloc/explicitly_constructed.h
@ -0,0 +1,62 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef TCMALLOC_EXPLICITLY_CONSTRUCTED_H_
+#define TCMALLOC_EXPLICITLY_CONSTRUCTED_H_
+
+#include <stdint.h>
+
+#include <utility>
+
+#include "tcmalloc/internal/config.h"
+
+GOOGLE_MALLOC_SECTION_BEGIN
+namespace tcmalloc {
+namespace tcmalloc_internal {
+
+// Raw, suitably aligned storage for a T whose constructor is invoked by hand
+// through Construct() instead of automatically.  The wrapper itself never
+// runs T's constructor or destructor, which makes it suitable for globals
+// that must not take part in static initialization/destruction order (the
+// "initialization order fiasco") while the storage address stays a
+// compile-time constant.
+//
+// The caller is responsible for tracking the initialization state:
+// 1. The storage starts out "uninitialized".
+// 2. Construct() may only be called while uninitialized; afterwards the
+//    object is "initialized".
+// 3. get_mutable() may only be called once the object is initialized.
+template <typename T>
+class ExplicitlyConstructed {
+ public:
+  // Constructs the T in place, forwarding the arguments to its constructor.
+  template <typename... CtorArgs>
+  void Construct(CtorArgs&&... ctor_args) {
+    new (&storage_) T(std::forward<CtorArgs>(ctor_args)...);
+  }
+
+  // Returns a reference to the T living in the storage.
+  T& get_mutable() { return *reinterpret_cast<T*>(&storage_); }
+
+ private:
+  // Storage sized and aligned for T; the extra members additionally force at
+  // least int64_t and pointer alignment.
+  union AlignedUnion {
+    constexpr AlignedUnion() = default;
+    alignas(T) char space[sizeof(T)];
+    int64_t align_to_int64;
+    void* align_to_ptr;
+  } storage_;
+};
+
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
+GOOGLE_MALLOC_SECTION_END
+
+#endif  // TCMALLOC_EXPLICITLY_CONSTRUCTED_H_
--- a/src/third_party/tcmalloc/dist/tcmalloc/global_stats.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/global_stats.cc
@ -0,0 +1,800 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tcmalloc/global_stats.h"
+
+#include "absl/strings/match.h"
+#include "absl/strings/strip.h"
+#include "tcmalloc/central_freelist.h"
+#include "tcmalloc/common.h"
+#include "tcmalloc/cpu_cache.h"
+#include "tcmalloc/experiment.h"
+#include "tcmalloc/guarded_page_allocator.h"
+#include "tcmalloc/internal/logging.h"
+#include "tcmalloc/internal/memory_stats.h"
+#include "tcmalloc/page_allocator.h"
+#include "tcmalloc/page_heap.h"
+#include "tcmalloc/page_heap_allocator.h"
+#include "tcmalloc/pagemap.h"
+#include "tcmalloc/pages.h"
+#include "tcmalloc/parameters.h"
+#include "tcmalloc/sampled_allocation.h"
+#include "tcmalloc/sampler.h"
+#include "tcmalloc/span.h"
+#include "tcmalloc/static_vars.h"
+#include "tcmalloc/stats.h"
+#include "tcmalloc/system-alloc.h"
+#include "tcmalloc/thread_cache.h"
+#include "tcmalloc/transfer_cache.h"
+#include "tcmalloc/transfer_cache_stats.h"
+
+GOOGLE_MALLOC_SECTION_BEGIN
+namespace tcmalloc {
+namespace tcmalloc_internal {
+
+// Get stats into "r".  Also, if class_count != NULL, class_count[k]
+// will be set to the total number of objects of size class k in the
+// central cache, transfer cache, and per-thread and per-CPU caches.
+// If span_stats is non-NULL, span_stats[k] is set to the central freelist's
+// span stats for size class k.  Both arrays are indexed up to kNumClasses.
+// If small_spans is non-NULL, it is filled.  Same for large_spans.
+// The boolean report_residence determines whether residence information
+// should be captured or not. Residence info requires a potentially
+// costly OS call, and is not necessary in all situations.
+void ExtractStats(TCMallocStats* r, uint64_t* class_count,
+                  SpanStats* span_stats, SmallSpanStats* small_spans,
+                  LargeSpanStats* large_spans, bool report_residence) {
+  // Pass 1: per-size-class totals from the central freelists and the
+  // transfer cache.
+  r->central_bytes = 0;
+  r->transfer_bytes = 0;
+  for (int size_class = 0; size_class < kNumClasses; ++size_class) {
+    const size_t length = tc_globals.central_freelist(size_class).length();
+    const size_t tc_length = tc_globals.transfer_cache().tc_length(size_class);
+    const size_t cache_overhead =
+        tc_globals.central_freelist(size_class).OverheadBytes();
+    const size_t size = tc_globals.sizemap().class_to_size(size_class);
+    r->central_bytes += (size * length) + cache_overhead;
+    r->transfer_bytes += (size * tc_length);
+    if (class_count) {
+      // Sum the lengths of all per-class freelists, except the per-thread
+      // freelists, which get counted when we call GetThreadStats(), below.
+      class_count[size_class] = length + tc_length;
+      if (UsePerCpuCache(tc_globals)) {
+        class_count[size_class] +=
+            tc_globals.cpu_cache().TotalObjectsOfClass(size_class);
+      }
+    }
+    if (span_stats) {
+      span_stats[size_class] =
+          tc_globals.central_freelist(size_class).GetSpanStats();
+    }
+  }
+
+  // Add stats from per-thread heaps
+  r->thread_bytes = 0;
+  {  // scope
+    // Everything in this scope is read while holding pageheap_lock.
+    absl::base_internal::SpinLockHolder h(&pageheap_lock);
+    ThreadCache::GetThreadStats(&r->thread_bytes, class_count);
+    r->tc_stats = ThreadCache::HeapStats();
+    r->span_stats = tc_globals.span_allocator().stats();
+    r->stack_stats = tc_globals.sampledallocation_allocator().stats();
+    r->linked_sample_stats = tc_globals.linked_sample_allocator().stats();
+    r->metadata_bytes = tc_globals.metadata_bytes();
+    r->pagemap_bytes = tc_globals.pagemap().bytes();
+    r->pageheap = tc_globals.page_allocator().stats();
+    r->peak_stats = tc_globals.page_allocator().peak_stats();
+    if (small_spans != nullptr) {
+      tc_globals.page_allocator().GetSmallSpanStats(small_spans);
+    }
+    if (large_spans != nullptr) {
+      tc_globals.page_allocator().GetLargeSpanStats(large_spans);
+    }
+
+    r->arena = tc_globals.arena().stats();
+    // Without residence reporting, the arena's non-resident bytes are folded
+    // into the metadata total.
+    if (!report_residence) {
+      r->metadata_bytes += r->arena.bytes_nonresident;
+    }
+  }
+  // We can access the pagemap without holding the pageheap_lock since it
+  // is static data, and we are only taking address and size which are
+  // constants.
+  if (report_residence) {
+    auto resident_bytes = tc_globals.pagemap_residence();
+    r->pagemap_root_bytes_res = resident_bytes;
+    ASSERT(r->metadata_bytes >= r->pagemap_bytes);
+    // Replace the pagemap's contribution to metadata_bytes with its resident
+    // portion.
+    r->metadata_bytes = r->metadata_bytes - r->pagemap_bytes + resident_bytes;
+  } else {
+    r->pagemap_root_bytes_res = 0;
+  }
+
+  // Per-CPU figures stay zero unless the per-CPU cache is in use.
+  r->per_cpu_bytes = 0;
+  r->sharded_transfer_bytes = 0;
+  r->percpu_metadata_bytes_res = 0;
+  r->percpu_metadata_bytes = 0;
+  if (UsePerCpuCache(tc_globals)) {
+    r->per_cpu_bytes = tc_globals.cpu_cache().TotalUsedBytes();
+    r->sharded_transfer_bytes =
+        tc_globals.sharded_transfer_cache().TotalBytes();
+
+    if (report_residence) {
+      auto percpu_metadata = tc_globals.cpu_cache().MetadataMemoryUsage();
+      r->percpu_metadata_bytes_res = percpu_metadata.resident_size;
+      r->percpu_metadata_bytes = percpu_metadata.virtual_size;
+
+      ASSERT(r->metadata_bytes >= r->percpu_metadata_bytes);
+      // Same adjustment as for the pagemap: swap the virtual size of the
+      // per-CPU metadata for its resident size.
+      r->metadata_bytes = r->metadata_bytes - r->percpu_metadata_bytes +
+                          r->percpu_metadata_bytes_res;
+    }
+  }
+}
+
+// Convenience form of ExtractStats() that fills only `r`: no per-class
+// counts, per-class span stats, or small/large span stats are requested.
+void ExtractTCMallocStats(TCMallocStats* r, bool report_residence) {
+  uint64_t* const no_class_count = nullptr;
+  SpanStats* const no_span_stats = nullptr;
+  SmallSpanStats* const no_small_spans = nullptr;
+  LargeSpanStats* const no_large_spans = nullptr;
+  ExtractStats(r, no_class_count, no_span_stats, no_small_spans,
+               no_large_spans, report_residence);
+}
+
+// Saturating subtraction: returns a - b, or 0 when b exceeds a.  Stats fields
+// are computed under different locks and may be mutually inconsistent, so a
+// plain subtraction could wrap around to a gigantic value.
+static uint64_t StatSub(uint64_t a, uint64_t b) {
+  if (a < b) {
+    return 0;
+  }
+  return a - b;
+}
+
+// Approximate number of bytes in use by the application: the page heap's
+// system bytes minus everything sitting in a cache tier or in the page heap's
+// free/unmapped lists (clamped at zero).
+uint64_t InUseByApp(const TCMallocStats& stats) {
+  const uint64_t cached_bytes = stats.thread_bytes + stats.central_bytes +
+                                stats.transfer_bytes + stats.per_cpu_bytes +
+                                stats.sharded_transfer_bytes;
+  const uint64_t idle_page_heap_bytes =
+      stats.pageheap.free_bytes + stats.pageheap.unmapped_bytes;
+  return StatSub(stats.pageheap.system_bytes,
+                 cached_bytes + idle_page_heap_bytes);
+}
+
+// Sum of the page heap's system bytes, the metadata bytes, and the arena's
+// unallocated, unavailable and non-resident bytes.
+uint64_t VirtualMemoryUsed(const TCMallocStats& stats) {
+  uint64_t total = stats.pageheap.system_bytes;
+  total += stats.metadata_bytes;
+  total += stats.arena.bytes_unallocated;
+  total += stats.arena.bytes_unavailable;
+  total += stats.arena.bytes_nonresident;
+  return total;
+}
+
+// Page heap unmapped bytes plus the arena's non-resident bytes.
+uint64_t UnmappedBytes(const TCMallocStats& stats) {
+  const uint64_t page_heap_unmapped = stats.pageheap.unmapped_bytes;
+  const uint64_t arena_nonresident = stats.arena.bytes_nonresident;
+  return page_heap_unmapped + arena_nonresident;
+}
+
+// Virtual memory used minus unmapped bytes, clamped at zero.
+uint64_t PhysicalMemoryUsed(const TCMallocStats& stats) {
+  const uint64_t virtual_bytes = VirtualMemoryUsed(stats);
+  const uint64_t unmapped = UnmappedBytes(stats);
+  return StatSub(virtual_bytes, unmapped);
+}
+
+// Bytes that are either in use by the app or fragmented such that they cannot
+// be (arbitrarily) reused: physical memory used minus the page heap's free
+// bytes, clamped at zero.
+uint64_t RequiredBytes(const TCMallocStats& stats) {
+  const uint64_t physical = PhysicalMemoryUsed(stats);
+  const uint64_t page_heap_free = stats.pageheap.free_bytes;
+  return StatSub(physical, page_heap_free);
+}
+
+// Sum of page heap free bytes, all cache tiers (central, per-CPU, sharded
+// transfer, transfer, thread), metadata bytes, and the arena's unavailable
+// and unallocated bytes.
+size_t ExternalBytes(const TCMallocStats& stats) {
+  const auto cache_bytes = stats.pageheap.free_bytes + stats.central_bytes +
+                           stats.per_cpu_bytes +
+                           stats.sharded_transfer_bytes +
+                           stats.transfer_bytes + stats.thread_bytes;
+  const auto with_metadata = cache_bytes + stats.metadata_bytes;
+  return with_metadata + stats.arena.bytes_unavailable +
+         stats.arena.bytes_unallocated;
+}
+
+// System bytes minus unmapped bytes of the given backing stats, clamped at
+// zero.
+size_t HeapSizeBytes(const BackingStats& stats) {
+  const uint64_t from_system = stats.system_bytes;
+  const uint64_t unmapped = stats.unmapped_bytes;
+  return StatSub(from_system, unmapped);
+}
+
+// Bytes held in the thread caches, per-CPU caches and sharded transfer cache.
+size_t LocalBytes(const TCMallocStats& stats) {
+  const auto thread_and_cpu = stats.thread_bytes + stats.per_cpu_bytes;
+  return thread_and_cpu + stats.sharded_transfer_bytes;
+}
+
+// Free bytes plus unmapped bytes of the given backing stats.
+size_t SlackBytes(const BackingStats& stats) {
+  const auto slack = stats.free_bytes + stats.unmapped_bytes;
+  return slack;
+}
+
+// Returns the number of CPUs set in the affinity mask obtained from
+// sched_getaffinity(0, ...), or 0 if that call fails.
+static int CountAllowedCpus() {
+  cpu_set_t affinity_mask;
+  const int rc = sched_getaffinity(0, sizeof(affinity_mask), &affinity_mask);
+  return rc == 0 ? CPU_COUNT(&affinity_mask) : 0;
+}
+
+void DumpStats(Printer* out, int level) {
+  TCMallocStats stats;
+  uint64_t class_count[kNumClasses];
+  SpanStats span_stats[kNumClasses];
+  if (level >= 2) {
+    ExtractStats(&stats, class_count, span_stats, nullptr, nullptr, true);
+  } else {
+    ExtractTCMallocStats(&stats, true);
+  }
+
+  static const double MiB = 1048576.0;
+
+  out->printf(
+      "See https://github.com/google/tcmalloc/tree/master/docs/stats.md for an explanation of "
+      "this page\n");
+
+  const uint64_t virtual_memory_used = VirtualMemoryUsed(stats);
+  const uint64_t physical_memory_used = PhysicalMemoryUsed(stats);
+  const uint64_t unmapped_bytes = UnmappedBytes(stats);
+  const uint64_t bytes_in_use_by_app = InUseByApp(stats);
+
+#ifdef TCMALLOC_SMALL_BUT_SLOW
+  out->printf("NOTE:  SMALL MEMORY MODEL IS IN USE, PERFORMANCE MAY SUFFER.\n");
+#endif
+  // clang-format off
+  // Avoid clang-format complaining about the way that this text is laid out.
+  out->printf(
+      "------------------------------------------------\n"
+      "MALLOC:   %12u (%7.1f MiB) Bytes in use by application\n"
+      "MALLOC: + %12u (%7.1f MiB) Bytes in page heap freelist\n"
+      "MALLOC: + %12u (%7.1f MiB) Bytes in central cache freelist\n"
+      "MALLOC: + %12u (%7.1f MiB) Bytes in per-CPU cache freelist\n"
+      "MALLOC: + %12u (%7.1f MiB) Bytes in Sharded cache freelist\n"
+      "MALLOC: + %12u (%7.1f MiB) Bytes in transfer cache freelist\n"
+      "MALLOC: + %12u (%7.1f MiB) Bytes in thread cache freelists\n"
+      "MALLOC: + %12u (%7.1f MiB) Bytes in malloc metadata\n"
+      "MALLOC: + %12u (%7.1f MiB) Bytes in malloc metadata Arena unallocated\n"
+      "MALLOC: + %12u (%7.1f MiB) Bytes in malloc metadata Arena unavailable\n"
+
+      "MALLOC:   ------------\n"
+      "MALLOC: = %12u (%7.1f MiB) Actual memory used (physical + swap)\n"
+      "MALLOC: + %12u (%7.1f MiB) Bytes released to OS (aka unmapped)\n"
+      "MALLOC:   ------------\n"
+      "MALLOC: = %12u (%7.1f MiB) Virtual address space used\n"
+      "MALLOC:\n"
+      "MALLOC:   %12u               Spans in use\n"
+      "MALLOC:   %12u (%7.1f MiB) Spans created\n"
+      "MALLOC:   %12u               Thread heaps in use\n"
+      "MALLOC:   %12u (%7.1f MiB) Thread heaps created\n"
+      "MALLOC:   %12u               Stack traces in use\n"
+      "MALLOC:   %12u (%7.1f MiB) Stack traces created\n"
+      "MALLOC:   %12u               Table buckets in use\n"
+      "MALLOC:   %12u (%7.1f MiB) Table buckets created\n"
+      "MALLOC:   %12u (%7.1f MiB) Pagemap bytes used\n"
+      "MALLOC:   %12u (%7.1f MiB) Pagemap root resident bytes\n"
+      "MALLOC:   %12u (%7.1f MiB) per-CPU slab bytes used\n"
+      "MALLOC:   %12u (%7.1f MiB) per-CPU slab resident bytes\n"
+      "MALLOC:   %12u (%7.1f MiB) malloc metadata Arena non-resident bytes\n"
+      "MALLOC:   %12u (%7.1f MiB) Actual memory used at peak\n"
+      "MALLOC:   %12u (%7.1f MiB) Estimated in-use at peak\n"
+      "MALLOC:   %12.4f               Realized fragmentation (%%)\n"
+      "MALLOC:   %12u               Tcmalloc page size\n"
+      "MALLOC:   %12u               Tcmalloc hugepage size\n"
+      "MALLOC:   %12u               CPUs Allowed in Mask\n"
+      "MALLOC:   %12u               Arena blocks\n",
+      bytes_in_use_by_app, bytes_in_use_by_app / MiB,
+      stats.pageheap.free_bytes, stats.pageheap.free_bytes / MiB,
+      stats.central_bytes, stats.central_bytes / MiB,
+      stats.per_cpu_bytes, stats.per_cpu_bytes / MiB,
+      stats.sharded_transfer_bytes, stats.sharded_transfer_bytes / MiB,
+      stats.transfer_bytes, stats.transfer_bytes / MiB,
+      stats.thread_bytes, stats.thread_bytes / MiB,
+      stats.metadata_bytes, stats.metadata_bytes / MiB,
+      stats.arena.bytes_unallocated, stats.arena.bytes_unallocated / MiB,
+      stats.arena.bytes_unavailable, stats.arena.bytes_unavailable / MiB,
+      physical_memory_used, physical_memory_used / MiB,
+      unmapped_bytes, unmapped_bytes / MiB,
+      virtual_memory_used, virtual_memory_used / MiB,
+      uint64_t(stats.span_stats.in_use),
+      uint64_t(stats.span_stats.total),
+      (stats.span_stats.total * sizeof(Span)) / MiB,
+      uint64_t(stats.tc_stats.in_use),
+      uint64_t(stats.tc_stats.total),
+      (stats.tc_stats.total * sizeof(ThreadCache)) / MiB,
+      uint64_t(stats.stack_stats.in_use),
+      uint64_t(stats.stack_stats.total),
+      (stats.stack_stats.total * sizeof(StackTrace)) / MiB,
+      uint64_t(stats.linked_sample_stats.in_use),
+      uint64_t(stats.linked_sample_stats.total),
+      (stats.linked_sample_stats.total * sizeof(StackTraceTable::LinkedSample)) / MiB,
+      uint64_t(stats.pagemap_bytes),
+      stats.pagemap_bytes / MiB,
+      stats.pagemap_root_bytes_res, stats.pagemap_root_bytes_res / MiB,
+      uint64_t(stats.percpu_metadata_bytes),
+      stats.percpu_metadata_bytes / MiB,
+      stats.percpu_metadata_bytes_res, stats.percpu_metadata_bytes_res / MiB,
+      stats.arena.bytes_nonresident, stats.arena.bytes_nonresident / MiB,
+      uint64_t(stats.peak_stats.backed_bytes),
+      stats.peak_stats.backed_bytes / MiB,
+      uint64_t(stats.peak_stats.sampled_application_bytes),
+      stats.peak_stats.sampled_application_bytes / MiB,
+      100. * safe_div(stats.peak_stats.backed_bytes - stats.peak_stats.sampled_application_bytes, stats.peak_stats.sampled_application_bytes),
+      uint64_t(kPageSize),
+      uint64_t(kHugePageSize),
+      CountAllowedCpus(),
+      stats.arena.blocks
+  );
+  // clang-format on
+
+  out->printf("MALLOC EXPERIMENTS:");
+  WalkExperiments([&](absl::string_view name, bool active) {
+    const char* value = active ? "1" : "0";
+    out->printf(" %s=%s", name, value);
+  });
+  out->printf("\n");
+
+  out->printf(
+      "MALLOC SAMPLED PROFILES: %zu bytes (current), %zu bytes (internal "
+      "fragmentation), %zu bytes (peak), %zu count (total)\n",
+      static_cast<size_t>(tc_globals.sampled_objects_size_.value()),
+      tc_globals.sampled_internal_fragmentation_.value(),
+      tc_globals.peak_heap_tracker().CurrentPeakSize(),
+      tc_globals.total_sampled_count_.value());
+
+  MemoryStats memstats;
+  if (GetMemoryStats(&memstats)) {
+    uint64_t rss = memstats.rss;
+    uint64_t vss = memstats.vss;
+    // clang-format off
+    out->printf(
+        "\n"
+        "Total process stats (inclusive of non-malloc sources):\n"
+        "TOTAL: %12u (%7.1f MiB) Bytes resident (physical memory used)\n"
+        "TOTAL: %12u (%7.1f MiB) Bytes mapped (virtual memory used)\n",
+        rss, rss / MiB, vss, vss / MiB);
+    // clang-format on
+  }
+
+  out->printf(
+      "------------------------------------------------\n"
+      "Call ReleaseMemoryToSystem() to release freelist memory to the OS"
+      " (via madvise()).\n"
+      "Bytes released to the OS take up virtual address space"
+      " but no physical memory.\n");
+  if (level >= 2) {
+    out->printf("------------------------------------------------\n");
+    out->printf("Total size of freelists for per-thread and per-CPU caches,\n");
+    out->printf("transfer cache, and central cache, as well as number of\n");
+    out->printf("live pages, returned/requested spans by size class\n");
+    out->printf("------------------------------------------------\n");
+
+    uint64_t cumulative = 0;
+    for (int size_class = 1; size_class < kNumClasses; ++size_class) {
+      uint64_t class_bytes = class_count[size_class] *
+                             tc_globals.sizemap().class_to_size(size_class);
+
+      cumulative += class_bytes;
+      out->printf(
+          // clang-format off
+          "class %3d [ %8zu bytes ] : %8u objs; %5.1f MiB; %6.1f cum MiB; "
+          "%8u live pages; spans: %10zu ret / %10zu req = %5.4f;\n",
+          // clang-format on
+          size_class, tc_globals.sizemap().class_to_size(size_class),
+          class_count[size_class], class_bytes / MiB, cumulative / MiB,
+          span_stats[size_class].num_live_spans() *
+              tc_globals.sizemap().class_to_pages(size_class),
+          span_stats[size_class].num_spans_returned,
+          span_stats[size_class].num_spans_requested,
+          span_stats[size_class].prob_returned());
+    }
+
+#ifndef TCMALLOC_SMALL_BUT_SLOW
+    out->printf("------------------------------------------------\n");
+    out->printf("Central cache freelist: Span utilization histogram\n");
+    out->printf("Non-cumulative number of spans with allocated objects < N\n");
+    out->printf("------------------------------------------------\n");
+    for (int size_class = 1; size_class < kNumClasses; ++size_class) {
+      tc_globals.central_freelist(size_class).PrintSpanUtilStats(out);
+    }
+#endif
+
+    tc_globals.transfer_cache().Print(out);
+    tc_globals.sharded_transfer_cache().Print(out);
+
+    if (UsePerCpuCache(tc_globals)) {
+      tc_globals.cpu_cache().Print(out);
+    }
+
+    tc_globals.page_allocator().Print(out, MemoryTag::kNormal);
+    if (tc_globals.numa_topology().active_partitions() > 1) {
+      tc_globals.page_allocator().Print(out, MemoryTag::kNormalP1);
+    }
+    tc_globals.page_allocator().Print(out, MemoryTag::kSampled);
+    tc_globals.page_allocator().Print(out, MemoryTag::kCold);
+    tc_globals.guardedpage_allocator().Print(out);
+
+    uint64_t limit_bytes;
+    bool is_hard;
+    std::tie(limit_bytes, is_hard) = tc_globals.page_allocator().limit();
+    out->printf("PARAMETER desired_usage_limit_bytes %u %s\n", limit_bytes,
+                is_hard ? "(hard)" : "");
+    out->printf("Number of times limit was hit: %lld\n",
+                tc_globals.page_allocator().limit_hits());
+
+    out->printf("PARAMETER tcmalloc_per_cpu_caches %d\n",
+                Parameters::per_cpu_caches() ? 1 : 0);
+    out->printf("PARAMETER tcmalloc_max_per_cpu_cache_size %d\n",
+                Parameters::max_per_cpu_cache_size());
+    out->printf("PARAMETER tcmalloc_max_total_thread_cache_bytes %lld\n",
+                Parameters::max_total_thread_cache_bytes());
+    out->printf("PARAMETER malloc_release_bytes_per_sec %llu\n",
+                Parameters::background_release_rate());
+    out->printf(
+        "PARAMETER tcmalloc_skip_subrelease_interval %s\n",
+        absl::FormatDuration(Parameters::filler_skip_subrelease_interval()));
+    out->printf("PARAMETER tcmalloc_skip_subrelease_short_interval %s\n",
+                absl::FormatDuration(
+                    Parameters::filler_skip_subrelease_short_interval()));
+    out->printf("PARAMETER tcmalloc_skip_subrelease_long_interval %s\n",
+                absl::FormatDuration(
+                    Parameters::filler_skip_subrelease_long_interval()));
+    out->printf("PARAMETER flat vcpus %d\n",
+                subtle::percpu::UsingFlatVirtualCpus() ? 1 : 0);
+    out->printf("PARAMETER tcmalloc_shuffle_per_cpu_caches %d\n",
+                Parameters::shuffle_per_cpu_caches() ? 1 : 0);
+    out->printf("PARAMETER tcmalloc_partial_transfer_cache %d\n",
+                Parameters::partial_transfer_cache() ? 1 : 0);
+    out->printf(
+        "PARAMETER tcmalloc_separate_allocs_for_few_and_many_objects_spans "
+        "%d\n",
+        Parameters::separate_allocs_for_few_and_many_objects_spans());
+  }
+}
+
+// Writes allocator statistics to `out` as pbtxt, using a top-level
+// PbtxtRegion.  When `level` >= 2, per-size-class "freelist" entries and the
+// transfer-cache / per-CPU-cache sections are emitted in addition to the
+// summary fields, page allocator sections, limits and parameters.
+void DumpStatsInPbtxt(Printer* out, int level) {
+  TCMallocStats stats;
+  // class_count and span_stats are only handed to ExtractStats() when
+  // level >= 2; the level >= 2 section below is their only reader.
+  uint64_t class_count[kNumClasses];
+  SpanStats span_stats[kNumClasses];
+  if (level >= 2) {
+    ExtractStats(&stats, class_count, span_stats, nullptr, nullptr, true);
+  } else {
+    ExtractTCMallocStats(&stats, true);
+  }
+
+  // Derived totals computed once from the extracted stats.
+  const uint64_t bytes_in_use_by_app = InUseByApp(stats);
+  const uint64_t virtual_memory_used = VirtualMemoryUsed(stats);
+  const uint64_t physical_memory_used = PhysicalMemoryUsed(stats);
+  const uint64_t unmapped_bytes = UnmappedBytes(stats);
+
+  // Summary fields.
+  PbtxtRegion region(out, kTop);
+  region.PrintI64("in_use_by_app", bytes_in_use_by_app);
+  region.PrintI64("page_heap_freelist", stats.pageheap.free_bytes);
+  region.PrintI64("central_cache_freelist", stats.central_bytes);
+  region.PrintI64("per_cpu_cache_freelist", stats.per_cpu_bytes);
+  region.PrintI64("sharded_transfer_cache_freelist",
+                  stats.sharded_transfer_bytes);
+  region.PrintI64("transfer_cache_freelist", stats.transfer_bytes);
+  region.PrintI64("thread_cache_freelists", stats.thread_bytes);
+  region.PrintI64("malloc_metadata", stats.metadata_bytes);
+  region.PrintI64("malloc_metadata_arena_unavailable",
+                  stats.arena.bytes_unavailable);
+  region.PrintI64("malloc_metadata_arena_unallocated",
+                  stats.arena.bytes_unallocated);
+  region.PrintI64("actual_mem_used", physical_memory_used);
+  region.PrintI64("unmapped", unmapped_bytes);
+  region.PrintI64("virtual_address_space_used", virtual_memory_used);
+  region.PrintI64("num_spans", uint64_t(stats.span_stats.in_use));
+  region.PrintI64("num_spans_created", uint64_t(stats.span_stats.total));
+  region.PrintI64("num_thread_heaps", uint64_t(stats.tc_stats.in_use));
+  region.PrintI64("num_thread_heaps_created", uint64_t(stats.tc_stats.total));
+  region.PrintI64("num_stack_traces", uint64_t(stats.stack_stats.in_use));
+  region.PrintI64("num_stack_traces_created",
+                  uint64_t(stats.stack_stats.total));
+  region.PrintI64("num_table_buckets",
+                  uint64_t(stats.linked_sample_stats.in_use));
+  region.PrintI64("num_table_buckets_created",
+                  uint64_t(stats.linked_sample_stats.total));
+  region.PrintI64("pagemap_size", uint64_t(stats.pagemap_bytes));
+  region.PrintI64("pagemap_root_residence", stats.pagemap_root_bytes_res);
+  region.PrintI64("percpu_slab_size", stats.percpu_metadata_bytes);
+  region.PrintI64("percpu_slab_residence", stats.percpu_metadata_bytes_res);
+  region.PrintI64("peak_backed", stats.peak_stats.backed_bytes);
+  region.PrintI64("peak_application_demand",
+                  stats.peak_stats.sampled_application_bytes);
+  region.PrintI64("tcmalloc_page_size", uint64_t(kPageSize));
+  region.PrintI64("tcmalloc_huge_page_size", uint64_t(kHugePageSize));
+  region.PrintI64("cpus_allowed", CountAllowedCpus());
+  region.PrintI64("arena_blocks", stats.arena.blocks);
+
+  // Sampled-profile counters go into their own sub-region; the braces bound
+  // the sub-region object's lifetime.
+  {
+    auto sampled_profiles = region.CreateSubRegion("sampled_profiles");
+    sampled_profiles.PrintI64("current_bytes",
+                              tc_globals.sampled_objects_size_.value());
+    sampled_profiles.PrintI64(
+        "current_fragmentation_bytes",
+        tc_globals.sampled_internal_fragmentation_.value());
+    sampled_profiles.PrintI64("peak_bytes",
+                              tc_globals.peak_heap_tracker().CurrentPeakSize());
+  }
+
+  // Print total process stats (inclusive of non-malloc sources).
+  // These two fields are omitted when GetMemoryStats() reports failure.
+  MemoryStats memstats;
+  if (GetMemoryStats(&memstats)) {
+    region.PrintI64("total_resident", uint64_t(memstats.rss));
+    region.PrintI64("total_mapped", uint64_t(memstats.vss));
+  }
+
+  region.PrintI64("total_sampled_count",
+                  tc_globals.total_sampled_count_.value());
+
+  // Detailed section: per-size-class freelists and cache internals.
+  if (level >= 2) {
+    {
+      // The per-class entries are compiled out under TCMALLOC_SMALL_BUT_SLOW.
+#ifndef TCMALLOC_SMALL_BUT_SLOW
+      // The loop starts at size class 1; class 0 is not reported.
+      for (int size_class = 1; size_class < kNumClasses; ++size_class) {
+        uint64_t class_bytes = class_count[size_class] *
+                               tc_globals.sizemap().class_to_size(size_class);
+        PbtxtRegion entry = region.CreateSubRegion("freelist");
+        entry.PrintI64("sizeclass",
+                       tc_globals.sizemap().class_to_size(size_class));
+        entry.PrintI64("bytes", class_bytes);
+        entry.PrintI64("num_spans_requested",
+                       span_stats[size_class].num_spans_requested);
+        entry.PrintI64("num_spans_returned",
+                       span_stats[size_class].num_spans_returned);
+        entry.PrintI64("obj_capacity", span_stats[size_class].obj_capacity);
+        tc_globals.central_freelist(size_class)
+            .PrintSpanUtilStatsInPbtxt(&entry);
+      }
+#endif
+    }
+
+    tc_globals.transfer_cache().PrintInPbtxt(&region);
+    tc_globals.sharded_transfer_cache().PrintInPbtxt(&region);
+
+    region.PrintRaw("transfer_cache_implementation",
+                    TransferCacheImplementationToLabel(
+                        tc_globals.transfer_cache().implementation()));
+
+    if (UsePerCpuCache(tc_globals)) {
+      tc_globals.cpu_cache().PrintInPbtxt(&region);
+    }
+  }
+  // Page allocator sections are emitted at every level, one per memory tag.
+  // kNormalP1 is only reported when more than one NUMA partition is active.
+  tc_globals.page_allocator().PrintInPbtxt(&region, MemoryTag::kNormal);
+  if (tc_globals.numa_topology().active_partitions() > 1) {
+    tc_globals.page_allocator().PrintInPbtxt(&region, MemoryTag::kNormalP1);
+  }
+  tc_globals.page_allocator().PrintInPbtxt(&region, MemoryTag::kSampled);
+  tc_globals.page_allocator().PrintInPbtxt(&region, MemoryTag::kCold);
+  // We do not collect tracking information in pbtxt.
+
+  // Usage limit: limit() yields the byte limit and whether it is a hard limit.
+  size_t limit_bytes;
+  bool is_hard;
+  std::tie(limit_bytes, is_hard) = tc_globals.page_allocator().limit();
+  region.PrintI64("desired_usage_limit_bytes", limit_bytes);
+  region.PrintBool("hard_limit", is_hard);
+  region.PrintI64("limit_hits", tc_globals.page_allocator().limit_hits());
+
+  {
+    auto gwp_asan = region.CreateSubRegion("gwp_asan");
+    tc_globals.guardedpage_allocator().PrintInPbtxt(&gwp_asan);
+  }
+
+  region.PrintI64("memory_release_failures", SystemReleaseErrors());
+
+  // Current values of the runtime parameters.
+  region.PrintBool("tcmalloc_per_cpu_caches", Parameters::per_cpu_caches());
+  region.PrintI64("tcmalloc_max_per_cpu_cache_size",
+                  Parameters::max_per_cpu_cache_size());
+  region.PrintI64("tcmalloc_max_total_thread_cache_bytes",
+                  Parameters::max_total_thread_cache_bytes());
+  region.PrintI64("malloc_release_bytes_per_sec",
+                  static_cast<int64_t>(Parameters::background_release_rate()));
+  // Durations are reported as integer nanoseconds.
+  region.PrintI64(
+      "tcmalloc_skip_subrelease_interval_ns",
+      absl::ToInt64Nanoseconds(Parameters::filler_skip_subrelease_interval()));
+  region.PrintI64("tcmalloc_skip_subrelease_short_interval_ns",
+                  absl::ToInt64Nanoseconds(
+                      Parameters::filler_skip_subrelease_short_interval()));
+  region.PrintI64("tcmalloc_skip_subrelease_long_interval_ns",
+                  absl::ToInt64Nanoseconds(
+                      Parameters::filler_skip_subrelease_long_interval()));
+  region.PrintBool("tcmalloc_shuffle_per_cpu_caches",
+                   Parameters::shuffle_per_cpu_caches());
+  region.PrintI64("profile_sampling_rate", Parameters::profile_sampling_rate());
+  region.PrintRaw("percpu_vcpu_type",
+                  subtle::percpu::UsingFlatVirtualCpus() ? "FLAT" : "NONE");
+  region.PrintBool("tcmalloc_partial_transfer_cache",
+                   Parameters::partial_transfer_cache());
+  region.PrintI64("separate_allocs_for_few_and_many_objects_spans",
+                  Parameters::separate_allocs_for_few_and_many_objects_spans());
+}
+
+// Looks up the numeric property whose name is the `name_size` bytes starting
+// at `name_data` and stores its current value in `*value`.
+//
+// Returns true if the name was recognized (and `*value` was written), false
+// otherwise (in which case `*value` is left untouched).  Both pointers must be
+// non-null.  Several names are accepted as aliases for the same value.
+bool GetNumericProperty(const char* name_data, size_t name_size,
+                        size_t* value) {
+  // LINT.IfChange
+  ASSERT(name_data != nullptr);
+  ASSERT(value != nullptr);
+  const absl::string_view name(name_data, name_size);
+
+  // This is near the top since ReleasePerCpuMemoryToOS() calls it frequently.
+  if (name == "tcmalloc.per_cpu_caches_active") {
+    *value = tc_globals.CpuCacheActive();
+    return true;
+  }
+
+  // Most branches below take a fresh TCMallocStats snapshot; the second
+  // argument to ExtractTCMallocStats() is its `report_residence` flag.
+  if (name == "generic.virtual_memory_used") {
+    TCMallocStats stats;
+    ExtractTCMallocStats(&stats, false);
+    *value = VirtualMemoryUsed(stats);
+    return true;
+  }
+
+  if (name == "generic.physical_memory_used") {
+    TCMallocStats stats;
+    ExtractTCMallocStats(&stats, false);
+    *value = PhysicalMemoryUsed(stats);
+    return true;
+  }
+
+  if (name == "generic.current_allocated_bytes" ||
+      name == "generic.bytes_in_use_by_app") {
+    TCMallocStats stats;
+    ExtractTCMallocStats(&stats, false);
+    *value = InUseByApp(stats);
+    return true;
+  }
+
+  if (name == "generic.peak_memory_usage") {
+    TCMallocStats stats;
+    ExtractTCMallocStats(&stats, false);
+    *value = static_cast<uint64_t>(stats.peak_stats.sampled_application_bytes);
+    return true;
+  }
+
+  // Reported as a percentage, truncated to an integer by the cast.
+  if (name == "generic.realized_fragmentation") {
+    TCMallocStats stats;
+    ExtractTCMallocStats(&stats, false);
+    *value = static_cast<uint64_t>(
+        100. * safe_div(stats.peak_stats.backed_bytes -
+                            stats.peak_stats.sampled_application_bytes,
+                        stats.peak_stats.sampled_application_bytes));
+
+    return true;
+  }
+
+  // The page allocator's stats are read while holding pageheap_lock.
+  if (name == "generic.heap_size") {
+    absl::base_internal::SpinLockHolder l(&pageheap_lock);
+    BackingStats stats = tc_globals.page_allocator().stats();
+    *value = HeapSizeBytes(stats);
+    return true;
+  }
+
+  if (name == "tcmalloc.central_cache_free") {
+    TCMallocStats stats;
+    ExtractTCMallocStats(&stats, false);
+    *value = stats.central_bytes;
+    return true;
+  }
+
+  if (name == "tcmalloc.cpu_free") {
+    TCMallocStats stats;
+    ExtractTCMallocStats(&stats, false);
+    *value = stats.per_cpu_bytes;
+    return true;
+  }
+
+  if (name == "tcmalloc.sharded_transfer_cache_free") {
+    TCMallocStats stats;
+    ExtractTCMallocStats(&stats, false);
+    *value = stats.sharded_transfer_bytes;
+    return true;
+  }
+
+  if (name == "tcmalloc.slack_bytes") {
+    // Kept for backwards compatibility.  Now defined externally as:
+    //    pageheap_free_bytes + pageheap_unmapped_bytes.
+    absl::base_internal::SpinLockHolder l(&pageheap_lock);
+    BackingStats stats = tc_globals.page_allocator().stats();
+    *value = SlackBytes(stats);
+    return true;
+  }
+
+  if (name == "tcmalloc.pageheap_free_bytes" ||
+      name == "tcmalloc.page_heap_free") {
+    absl::base_internal::SpinLockHolder l(&pageheap_lock);
+    *value = tc_globals.page_allocator().stats().free_bytes;
+    return true;
+  }
+
+  if (name == "tcmalloc.pageheap_unmapped_bytes" ||
+      name == "tcmalloc.page_heap_unmapped") {
+    absl::base_internal::SpinLockHolder l(&pageheap_lock);
+    // Arena non-resident bytes aren't on the page heap, but they are unmapped.
+    *value = tc_globals.page_allocator().stats().unmapped_bytes +
+             tc_globals.arena().stats().bytes_nonresident;
+    return true;
+  }
+
+  if (name == "tcmalloc.sampled_internal_fragmentation") {
+    *value = tc_globals.sampled_internal_fragmentation_.value();
+    return true;
+  }
+
+  if (name == "tcmalloc.page_algorithm") {
+    absl::base_internal::SpinLockHolder l(&pageheap_lock);
+    *value = tc_globals.page_allocator().algorithm();
+    return true;
+  }
+
+  if (name == "tcmalloc.max_total_thread_cache_bytes") {
+    absl::base_internal::SpinLockHolder l(&pageheap_lock);
+    *value = ThreadCache::overall_thread_cache_size();
+    return true;
+  }
+
+  if (name == "tcmalloc.current_total_thread_cache_bytes" ||
+      name == "tcmalloc.thread_cache_free") {
+    TCMallocStats stats;
+    ExtractTCMallocStats(&stats, false);
+    *value = stats.thread_bytes;
+    return true;
+  }
+
+  if (name == "tcmalloc.thread_cache_count") {
+    TCMallocStats stats;
+    ExtractTCMallocStats(&stats, false);
+    *value = stats.tc_stats.in_use;
+    return true;
+  }
+
+  if (name == "tcmalloc.local_bytes") {
+    TCMallocStats stats;
+    ExtractTCMallocStats(&stats, false);
+    *value = LocalBytes(stats);
+    return true;
+  }
+
+  if (name == "tcmalloc.external_fragmentation_bytes") {
+    TCMallocStats stats;
+    ExtractTCMallocStats(&stats, false);
+    *value = ExternalBytes(stats);
+    return true;
+  }
+
+  // Unlike the other branches, this one extracts stats with
+  // report_residence set to true.
+  if (name == "tcmalloc.metadata_bytes") {
+    TCMallocStats stats;
+    ExtractTCMallocStats(&stats, true);
+    *value = stats.metadata_bytes;
+    return true;
+  }
+
+  if (name == "tcmalloc.transfer_cache_free") {
+    TCMallocStats stats;
+    ExtractTCMallocStats(&stats, false);
+    *value = stats.transfer_bytes;
+    return true;
+  }
+
+  // limit() yields a single (amount, is_hard) pair.  If the configured limit
+  // is not of the kind being asked for, report the maximum size_t value
+  // instead of the configured amount.
+  bool want_hard_limit = (name == "tcmalloc.hard_usage_limit_bytes");
+  if (want_hard_limit || name == "tcmalloc.desired_usage_limit_bytes") {
+    size_t amount;
+    bool is_hard;
+    std::tie(amount, is_hard) = tc_globals.page_allocator().limit();
+    if (want_hard_limit != is_hard) {
+      amount = std::numeric_limits<size_t>::max();
+    }
+    *value = amount;
+    return true;
+  }
+
+  if (name == "tcmalloc.required_bytes") {
+    TCMallocStats stats;
+    ExtractTCMallocStats(&stats, false);
+    *value = RequiredBytes(stats);
+    return true;
+  }
+
+  // "tcmalloc.experiment.<name>" reports 1 or 0 for a known experiment.  An
+  // unknown experiment name falls through to the `return false` below.
+  const absl::string_view kExperimentPrefix = "tcmalloc.experiment.";
+  if (absl::StartsWith(name, kExperimentPrefix)) {
+    absl::optional<Experiment> exp =
+        FindExperimentByName(absl::StripPrefix(name, kExperimentPrefix));
+    if (exp.has_value()) {
+      *value = IsExperimentActive(*exp) ? 1 : 0;
+      return true;
+    }
+  }
+
+  // LINT.ThenChange(//depot/google3/tcmalloc/malloc_extension_test.cc)
+  return false;
+}
+
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
+GOOGLE_MALLOC_SECTION_END
--- a/src/third_party/tcmalloc/dist/tcmalloc/global_stats.h
+++ b/src/third_party/tcmalloc/dist/tcmalloc/global_stats.h
@ -0,0 +1,82 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef TCMALLOC_GLOBAL_STATS_H_
+#define TCMALLOC_GLOBAL_STATS_H_
+
+#include <cstdint>
+
+#include "tcmalloc/internal/logging.h"
+#include "tcmalloc/page_allocator.h"
+#include "tcmalloc/span_stats.h"
+#include "tcmalloc/stats.h"
+#include "tcmalloc/transfer_cache_stats.h"
+
+GOOGLE_MALLOC_SECTION_BEGIN
+namespace tcmalloc {
+namespace tcmalloc_internal {
+
+// Extract interesting stats
+// Aggregate of allocator-wide statistics.  Instances are passed by pointer to
+// ExtractStats() / ExtractTCMallocStats() (declared below) and then read by
+// the reporting helpers that take a `const TCMallocStats&`.
+struct TCMallocStats {
+  uint64_t thread_bytes;               // Bytes in thread caches
+  uint64_t central_bytes;              // Bytes in central cache
+  uint64_t transfer_bytes;             // Bytes in central transfer cache
+  uint64_t metadata_bytes;             // Bytes alloced for metadata
+  uint64_t sharded_transfer_bytes;     // Bytes in per-CCX cache
+  uint64_t per_cpu_bytes;              // Bytes in per-CPU cache
+  uint64_t pagemap_root_bytes_res;     // Resident bytes of pagemap root node
+  uint64_t percpu_metadata_bytes_res;  // Resident bytes of the per-CPU metadata
+  AllocatorStats tc_stats;             // ThreadCache objects
+  AllocatorStats span_stats;           // Span objects
+  AllocatorStats stack_stats;          // StackTrace objects
+  AllocatorStats linked_sample_stats;  // StackTraceTable::LinkedSample objects
+  size_t pagemap_bytes;                // included in metadata bytes
+  size_t percpu_metadata_bytes;        // included in metadata bytes
+  BackingStats pageheap;               // Stats from page heap
+  PageAllocator::PeakStats peak_stats;  // Peak stats from the page allocator
+
+  ArenaStats arena;  // Stats from the metadata Arena
+
+  // Explicitly declare the ctor to put it in the google_malloc section.
+  // NOTE: being defaulted, it does not itself assign values to the scalar
+  // fields above.
+  TCMallocStats() = default;
+};
+
+// Gathers statistics into `*r` and the optional per-class / span output
+// arguments.
+// NOTE(review): this declaration has seven parameters (including `tc_stats`),
+// while the call in DumpStatsInPbtxt() passes six arguments -- confirm that
+// this declaration still matches the definition in global_stats.cc.
+void ExtractStats(TCMallocStats* r, uint64_t* class_count,
+                  SpanStats* span_stats, SmallSpanStats* small_spans,
+                  LargeSpanStats* large_spans, TransferCacheStats* tc_stats,
+                  bool report_residence);
+
+// Gathers only the TCMallocStats aggregate into `*r`.
+void ExtractTCMallocStats(TCMallocStats* r, bool report_residence);
+
+// Derived quantities computed from an already-extracted stats snapshot.
+uint64_t InUseByApp(const TCMallocStats& stats);
+uint64_t VirtualMemoryUsed(const TCMallocStats& stats);
+uint64_t UnmappedBytes(const TCMallocStats& stats);
+uint64_t PhysicalMemoryUsed(const TCMallocStats& stats);
+uint64_t RequiredBytes(const TCMallocStats& stats);
+size_t ExternalBytes(const TCMallocStats& stats);
+size_t HeapSizeBytes(const BackingStats& stats);
+size_t LocalBytes(const TCMallocStats& stats);
+size_t SlackBytes(const BackingStats& stats);
+
+// WRITE stats to "out"
+// `level` selects the amount of detail; level >= 2 adds per-size-class data.
+void DumpStats(Printer* out, int level);
+void DumpStatsInPbtxt(Printer* out, int level);
+
+// Looks up a numeric property by name (`name_data`/`name_size`); returns true
+// and writes `*value` when the name is recognized.
+bool GetNumericProperty(const char* name_data, size_t name_size, size_t* value);
+
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
+GOOGLE_MALLOC_SECTION_END
+
+#endif  // TCMALLOC_GLOBAL_STATS_H_
--- a/src/third_party/tcmalloc/dist/tcmalloc/guarded_page_allocator.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/guarded_page_allocator.cc
@ -0,0 +1,569 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tcmalloc/guarded_page_allocator.h"
+
+#include <fcntl.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <csignal>
+#include <tuple>
+#include <utility>
+
+#include "absl/base/call_once.h"
+#include "absl/base/internal/spinlock.h"
+#include "absl/base/internal/sysinfo.h"
+#include "absl/debugging/stacktrace.h"
+#include "absl/numeric/bits.h"
+#include "absl/strings/string_view.h"
+#include "tcmalloc/common.h"
+#include "tcmalloc/internal/environment.h"
+#include "tcmalloc/internal/logging.h"
+#include "tcmalloc/internal/page_size.h"
+#include "tcmalloc/internal/util.h"
+#include "tcmalloc/pagemap.h"
+#include "tcmalloc/sampler.h"
+#include "tcmalloc/static_vars.h"
+#include "tcmalloc/system-alloc.h"
+
+GOOGLE_MALLOC_SECTION_BEGIN
+namespace tcmalloc {
+namespace tcmalloc_internal {
+
+const size_t GuardedPageAllocator::kMagicSize;  // NOLINT
+
+// Configures the allocator for `total_pages` slots, of which at most
+// `max_alloced_pages` may be allocated at once, then maps the backing pages.
+void GuardedPageAllocator::Init(size_t max_alloced_pages, size_t total_pages) {
+  // Validate the requested geometry before recording anything.
+  CHECK_CONDITION(max_alloced_pages > 0);
+  CHECK_CONDITION(max_alloced_pages <= total_pages);
+  CHECK_CONDITION(total_pages <= kGpaMaxPages);
+  total_pages_ = total_pages;
+  max_alloced_pages_ = max_alloced_pages;
+
+  // mprotect operates on full system pages only, so when the system page size
+  // exceeds kPageSize (as happens on PPC) the system page size must be used.
+  const size_t system_page_size = static_cast<size_t>(GetPageSize());
+  page_size_ = std::max(kPageSize, system_page_size);
+  ASSERT(page_size_ % kPageSize == 0);
+
+  // Seed the RNG from this object's address, then map the page pool.
+  rand_ = reinterpret_cast<uint64_t>(this);
+  MapPages();
+}
+
+// Unmaps the whole guarded-page region and marks the allocator as
+// uninitialized.  Does nothing if the allocator is not initialized.
+void GuardedPageAllocator::Destroy() {
+  absl::base_internal::SpinLockHolder h(&guarded_page_lock_);
+  if (!initialized_) return;
+
+  void* const region_start = reinterpret_cast<void*>(pages_base_addr_);
+  const size_t region_len = pages_end_addr_ - pages_base_addr_;
+  const int rc = munmap(region_start, region_len);
+  ASSERT(rc != -1);
+  (void)rc;  // Only inspected by the ASSERT above.
+  initialized_ = false;
+}
+
+// Tries to serve an allocation of `size` bytes with the given `alignment`
+// from a guarded slot.
+//
+// Returns the pointer together with a GuardedStatus.  On failure the pointer
+// is nullptr and the status gives the reason: TooSmall for a zero `size`,
+// NoAvailableSlots when no slot could be reserved, or MProtectFailed.
+// `size` and `alignment` must not exceed page_size_, and a non-zero
+// `alignment` must be a power of two.
+GuardedPageAllocator::AllocWithStatus GuardedPageAllocator::Allocate(
+    size_t size, size_t alignment) {
+  if (size == 0) {
+    return {nullptr, Profile::Sample::GuardedStatus::TooSmall};
+  }
+  ssize_t free_slot = ReserveFreeSlot();
+  // All slots are reserved.
+  if (free_slot == -1) {
+    return {nullptr, Profile::Sample::GuardedStatus::NoAvailableSlots};
+  }
+
+  ASSERT(size <= page_size_);
+  ASSERT(alignment <= page_size_);
+  ASSERT(alignment == 0 || absl::has_single_bit(alignment));
+  void* result = reinterpret_cast<void*>(SlotToAddr(free_slot));
+  // Make the slot's page readable and writable before handing it out.
+  if (mprotect(result, page_size_, PROT_READ | PROT_WRITE) == -1) {
+    ASSERT(false && "mprotect failed");
+    // Undo the reservation: the failure counter and the slot release are
+    // updated under guarded_page_lock_.
+    absl::base_internal::SpinLockHolder h(&guarded_page_lock_);
+    num_failed_allocations_++;
+    FreeSlot(free_slot);
+    return {nullptr, Profile::Sample::GuardedStatus::MProtectFailed};
+  }
+
+  // Place some allocations at end of page for better overflow detection.
+  MaybeRightAlign(free_slot, size, alignment, &result);
+
+  // Record stack trace.
+  SlotMetadata& d = data_[free_slot];
+  // Clear any deallocation trace left in this slot's metadata.
+  d.dealloc_trace.depth = 0;
+  d.alloc_trace.depth = absl::GetStackTrace(d.alloc_trace.stack, kMaxStackDepth,
+                                            /*skip_count=*/3);
+  d.alloc_trace.tid = absl::base_internal::GetTID();
+  d.requested_size = size;
+  d.allocation_start = reinterpret_cast<uintptr_t>(result);
+
+  ASSERT(!alignment || d.allocation_start % alignment == 0);
+  return {result, Profile::Sample::GuardedStatus::Guarded};
+}
+
+// Releases the guarded slot containing `ptr`, which must belong to this
+// allocator.  The slot's page is made inaccessible (PROT_NONE).  If a double
+// free or a write overflow is detected, a write to the now-protected page is
+// issued on purpose to trigger the SEGV handler.
+void GuardedPageAllocator::Deallocate(void* ptr) {
+  ASSERT(PointerIsMine(ptr));
+  const uintptr_t page_addr = GetPageAddr(reinterpret_cast<uintptr_t>(ptr));
+  size_t slot = AddrToSlot(page_addr);
+
+  absl::base_internal::SpinLockHolder h(&guarded_page_lock_);
+  // A double free takes precedence; overflow is only checked for live slots.
+  if (IsFreed(slot)) {
+    double_free_detected_ = true;
+  } else if (WriteOverflowOccurred(slot)) {
+    write_overflow_detected_ = true;
+  }
+
+  // Protect the page before the deliberate faulting write below.
+  CHECK_CONDITION(mprotect(reinterpret_cast<void*>(page_addr), page_size_,
+                           PROT_NONE) != -1);
+
+  if (write_overflow_detected_ || double_free_detected_) {
+    *reinterpret_cast<char*>(ptr) = 'X';  // Trigger SEGV handler.
+    CHECK_CONDITION(false);               // Unreachable.
+  }
+
+  // Record stack trace.
+  GpaStackTrace& trace = data_[slot].dealloc_trace;
+  trace.depth = absl::GetStackTrace(trace.stack, kMaxStackDepth,
+                                    /*skip_count=*/2);
+  trace.tid = absl::base_internal::GetTID();
+
+  FreeSlot(slot);
+}
+
+// Returns the size recorded for the slot whose page contains `ptr`.  `ptr`
+// must belong to this allocator.
+size_t GuardedPageAllocator::GetRequestedSize(const void* ptr) const {
+  ASSERT(PointerIsMine(ptr));
+  const uintptr_t page_addr = GetPageAddr(reinterpret_cast<uintptr_t>(ptr));
+  const size_t slot_index = AddrToSlot(page_addr);
+  return data_[slot_index].requested_size;
+}
+
+// For the slot nearest to `ptr`, returns the offset of `ptr` from the start
+// of that slot's recorded allocation, paired with the recorded requested
+// size.  `ptr` must belong to this allocator.
+std::pair<off_t, size_t> GuardedPageAllocator::GetAllocationOffsetAndSize(
+    const void* ptr) const {
+  ASSERT(PointerIsMine(ptr));
+  const uintptr_t address = reinterpret_cast<uintptr_t>(ptr);
+  const auto& metadata = data_[GetNearestSlot(address)];
+  return {address - metadata.allocation_start, metadata.requested_size};
+}
+
+// Copies the allocation and deallocation stack traces recorded for the slot
+// nearest to `ptr` into `*alloc_trace` and `*dealloc_trace`, and returns the
+// error type computed for that address.  `ptr` must belong to this allocator.
+GuardedPageAllocator::ErrorType GuardedPageAllocator::GetStackTraces(
+    const void* ptr, GpaStackTrace* alloc_trace,
+    GpaStackTrace* dealloc_trace) const {
+  ASSERT(PointerIsMine(ptr));
+  const uintptr_t address = reinterpret_cast<uintptr_t>(ptr);
+  auto& metadata = data_[GetNearestSlot(address)];
+  *alloc_trace = metadata.alloc_trace;
+  *dealloc_trace = metadata.dealloc_trace;
+  return GetErrorType(address, metadata);
+}
+
+// Guarded samples are taken as part of periodic profiling samples.  This
+// returns the mean number of profiling samples per guarded sample: the guarded
+// rate divided by the profile sampling rate, rounded up.  When the guarded rate
+// is negative or the profile rate is not positive, the guarded rate is
+// returned unchanged.
+static int GetChainedRate() {
+  const auto guarded = Parameters::guarded_sampling_rate();
+  const auto profiled = Parameters::profile_sampling_rate();
+  const bool ratio_undefined = guarded < 0 || profiled <= 0;
+  if (ratio_undefined) {
+    return guarded;
+  }
+  const double ratio =
+      static_cast<double>(guarded) / static_cast<double>(profiled);
+  return std::ceil(ratio);
+}
+
+// Writes a human-readable "GWP-ASan Status" section to `out`.  The counters
+// are read while holding guarded_page_lock_.
+void GuardedPageAllocator::Print(Printer* out) {
+  absl::base_internal::SpinLockHolder h(&guarded_page_lock_);
+
+  // Derived counters; the remaining values are printed straight from members.
+  const auto successful_allocations =
+      num_allocation_requests_ - num_failed_allocations_;
+  const auto quarantined_slots = total_pages_ - num_alloced_pages_;
+
+  out->printf(
+      "\n"
+      "------------------------------------------------\n"
+      "GWP-ASan Status\n"
+      "------------------------------------------------\n"
+      "Successful Allocations: %zu\n"
+      "Failed Allocations: %zu\n"
+      "Slots Currently Allocated: %zu\n"
+      "Slots Currently Quarantined: %zu\n"
+      "Maximum Slots Allocated: %zu / %zu\n"
+      "PARAMETER tcmalloc_guarded_sample_parameter %d\n",
+      successful_allocations, num_failed_allocations_, num_alloced_pages_,
+      quarantined_slots, num_alloced_pages_max_, max_alloced_pages_,
+      GetChainedRate());
+}
+
+// Writes the same counters as Print() into the pbtxt region `gwp_asan`.  The
+// counters are read while holding guarded_page_lock_.
+void GuardedPageAllocator::PrintInPbtxt(PbtxtRegion* gwp_asan) {
+  absl::base_internal::SpinLockHolder h(&guarded_page_lock_);
+
+  const auto successful_allocations =
+      num_allocation_requests_ - num_failed_allocations_;
+  gwp_asan->PrintI64("successful_allocations", successful_allocations);
+  gwp_asan->PrintI64("failed_allocations", num_failed_allocations_);
+  gwp_asan->PrintI64("current_slots_allocated", num_alloced_pages_);
+
+  const auto quarantined_slots = total_pages_ - num_alloced_pages_;
+  gwp_asan->PrintI64("current_slots_quarantined", quarantined_slots);
+  gwp_asan->PrintI64("max_slots_allocated", num_alloced_pages_max_);
+  gwp_asan->PrintI64("allocated_slot_limit", max_alloced_pages_);
+  gwp_asan->PrintI64("tcmalloc_guarded_sample_parameter", GetChainedRate());
+}
+
+// Maps 2 * total_pages_ + 1 pages so that there are total_pages_ unique pages
+// we can return from Allocate with guard pages before and after them.
+//
+// On success this also registers the range with the page map, constructs one
+// SlotMetadata per slot, marks every slot free and sets initialized_.  If the
+// mapping or the page map registration fails, the function returns early and
+// initialized_ is left untouched.
+void GuardedPageAllocator::MapPages() {
+  absl::base_internal::SpinLockHolder h(&guarded_page_lock_);
+  // Must not already have been mapped, and our page size must be a whole
+  // multiple of the size reported by GetPageSize().
+  ASSERT(!first_page_addr_);
+  ASSERT(page_size_ % GetPageSize() == 0);
+  size_t len = (2 * total_pages_ + 1) * page_size_;
+  auto base_addr = reinterpret_cast<uintptr_t>(
+      MmapAligned(len, page_size_, MemoryTag::kSampled));
+  ASSERT(base_addr);
+  // With ASSERT compiled out, a failed mapping is handled by bailing out.
+  if (!base_addr) return;
+
+  // Tell TCMalloc's PageMap about the memory we own.
+  const PageId page = PageIdContaining(reinterpret_cast<void*>(base_addr));
+  const Length page_len = BytesToLengthFloor(len);
+  if (!tc_globals.pagemap().Ensure(page, page_len)) {
+    ASSERT(false && "Failed to notify page map of page-guarded memory.");
+    return;
+  }
+
+  // Allocate memory for slot metadata.
+  data_ = reinterpret_cast<SlotMetadata*>(
+      tc_globals.arena().Alloc(sizeof(*data_) * total_pages_));
+  // The arena hands back raw bytes; default-construct each entry in place.
+  for (size_t i = 0; i < total_pages_; ++i) {
+    new (&data_[i]) SlotMetadata;
+  }
+
+  pages_base_addr_ = base_addr;
+  pages_end_addr_ = pages_base_addr_ + len;
+
+  // Align first page to page_size_.
+  // Skipping one page_size_ past the base leaves room for the leading guard
+  // page.
+  first_page_addr_ = GetPageAddr(pages_base_addr_ + page_size_);
+
+  // Every slot starts out free.
+  std::fill_n(free_pages_, total_pages_, true);
+  initialized_ = true;
+}
+
+// Picks a free slot at random and marks it reserved, in O(total_pages_) time.
+// Returns the slot index, or -1 when the allocator is not initialized,
+// allocations have not been allowed yet, or max_alloced_pages_ slots are
+// already in use (only that last case counts as a failed allocation).
+ssize_t GuardedPageAllocator::ReserveFreeSlot() {
+  absl::base_internal::SpinLockHolder h(&guarded_page_lock_);
+  const bool usable = initialized_ && allow_allocations_;
+  if (!usable) return -1;
+
+  ++num_allocation_requests_;
+  if (num_alloced_pages_ == max_alloced_pages_) {
+    ++num_failed_allocations_;
+    return -1;
+  }
+
+  // Advance the RNG and use it to choose among the currently free slots.
+  rand_ = Sampler::NextRandom(rand_);
+  const size_t free_count = total_pages_ - num_alloced_pages_;
+  const size_t chosen = GetIthFreeSlot(rand_ % free_count);
+  ASSERT(free_pages_[chosen]);
+  free_pages_[chosen] = false;
+
+  // Update the in-use count and its high-water mark.
+  ++num_alloced_pages_;
+  if (num_alloced_pages_max_ < num_alloced_pages_) {
+    num_alloced_pages_max_ = num_alloced_pages_;
+  }
+  return chosen;
+}
+
+// Returns the index in free_pages_ of the ith_free_slot-th free entry
+// (0-based).  The scan has no upper bound of its own: it relies on the caller
+// passing an index smaller than the number of free slots, which is checked
+// with ASSERT only.
+size_t GuardedPageAllocator::GetIthFreeSlot(size_t ith_free_slot) {
+  ASSERT(ith_free_slot < total_pages_ - num_alloced_pages_);
+  size_t still_to_skip = ith_free_slot;
+  size_t idx = 0;
+  while (true) {
+    if (free_pages_[idx]) {
+      if (still_to_skip == 0) return idx;
+      --still_to_skip;
+    }
+    ++idx;
+  }
+}
+
+// Returns a reserved slot to the free pool and decrements the in-use count.
+// slot must be in range and currently reserved (checked with ASSERT only).
+void GuardedPageAllocator::FreeSlot(size_t slot) {
+  ASSERT(slot < total_pages_);
+  ASSERT(!free_pages_[slot]);
+  --num_alloced_pages_;
+  free_pages_[slot] = true;
+}
+
+// Rounds addr down to a multiple of page_size_ by clearing the low bits.  The
+// mask is only meaningful when page_size_ is a power of two.
+uintptr_t GuardedPageAllocator::GetPageAddr(uintptr_t addr) const {
+  return addr & ~(page_size_ - 1ULL);
+}
+
+// Returns an address on the valid (non-guard) page nearest to addr.
+// Addresses before the first valid page or after the start of the last one
+// are clamped to those page starts.  An address inside a guard page is moved
+// by half a page towards whichever neighbouring valid page is closer.
+uintptr_t GuardedPageAllocator::GetNearestValidPage(uintptr_t addr) const {
+  const uintptr_t lowest = first_page_addr_;
+  const uintptr_t highest = lowest + 2 * (total_pages_ - 1) * page_size_;
+  if (addr < lowest) return lowest;
+  if (addr > highest) return highest;
+
+  // Valid pages sit at even page offsets from the first valid page.
+  const uintptr_t offset = addr - lowest;
+  const bool on_valid_page = (offset / page_size_) % 2 == 0;
+  if (on_valid_page) return addr;
+
+  // addr points into a guard page: the lower half rounds down, the upper
+  // half rounds up.
+  const size_t half_page = page_size_ / 2;
+  const bool in_lower_half = (offset / half_page) % 2 == 0;
+  return in_lower_half ? addr - half_page : addr + half_page;
+}
+
+// Returns the slot index of the valid page nearest to addr.
+size_t GuardedPageAllocator::GetNearestSlot(uintptr_t addr) const {
+  const uintptr_t on_valid_page = GetNearestValidPage(addr);
+  const uintptr_t page_start = GetPageAddr(on_valid_page);
+  return AddrToSlot(page_start);
+}
+
+// Reports whether slot is currently marked free in free_pages_.  No bounds
+// check is performed on slot.
+bool GuardedPageAllocator::IsFreed(size_t slot) const {
+  const bool marked_free = free_pages_[slot];
+  return marked_free;
+}
+
+// Returns true if any magic byte following slot's allocation no longer holds
+// the expected value.  Only right-aligned slots carry magic bytes, so other
+// slots always report false.  At most kMagicSize bytes are inspected, and
+// never past the end of the slot's page.
+bool GuardedPageAllocator::WriteOverflowOccurred(size_t slot) const {
+  if (!ShouldRightAlign(slot)) return false;
+
+  const uint8_t expected = GetWriteOverflowMagic(slot);
+  const uintptr_t scan_begin =
+      data_[slot].allocation_start + data_[slot].requested_size;
+  const uintptr_t slot_end = SlotToAddr(slot) + page_size_;
+  const uintptr_t scan_end = std::min(slot_end, scan_begin + kMagicSize);
+
+  uintptr_t cursor = scan_begin;
+  while (cursor < scan_end) {
+    if (*reinterpret_cast<uint8_t*>(cursor) != expected) return true;
+    ++cursor;
+  }
+  return false;
+}
+
+// Classifies an access at addr against the metadata d of its nearest slot.
+// The checks run in priority order: a slot with no recorded allocation is
+// kUnknown; then the allocator-wide double-free and write-overflow flags;
+// then a recorded deallocation trace (use-after-free); finally the position
+// of addr relative to the allocation.  An in-bounds access on a live
+// allocation yields kUnknown.
+GuardedPageAllocator::ErrorType GuardedPageAllocator::GetErrorType(
+    uintptr_t addr, const SlotMetadata& d) const {
+  const uintptr_t begin = d.allocation_start;
+  if (!begin) return ErrorType::kUnknown;
+  if (double_free_detected_) return ErrorType::kDoubleFree;
+  if (write_overflow_detected_) return ErrorType::kBufferOverflowOnDealloc;
+  if (d.dealloc_trace.depth) return ErrorType::kUseAfterFree;
+  if (addr < begin) return ErrorType::kBufferUnderflow;
+
+  const bool past_end = addr >= begin + d.requested_size;
+  return past_end ? ErrorType::kBufferOverflow : ErrorType::kUnknown;
+}
+
+// Returns the start address of the page backing slot.  Consecutive slots are
+// two pages apart, counted from first_page_addr_.
+uintptr_t GuardedPageAllocator::SlotToAddr(size_t slot) const {
+  ASSERT(slot < total_pages_);
+  const size_t stride = 2 * page_size_;
+  return first_page_addr_ + slot * stride;
+}
+
+// Converts the start address of a valid (non-guard) page into its slot index.
+// addr must be a multiple of page_size_ away from first_page_addr_ and lie on
+// an even-numbered page counted from there; both conditions, and the range of
+// the resulting slot, are checked with ASSERT only.
+size_t GuardedPageAllocator::AddrToSlot(uintptr_t addr) const {
+  const uintptr_t offset = addr - first_page_addr_;
+  ASSERT(offset % page_size_ == 0);
+  ASSERT((offset / page_size_) % 2 == 0);
+  // Keep the index unsigned and full-width.  Narrowing the quotient to int
+  // could truncate an out-of-range offset (e.g. addr below first_page_addr_,
+  // which wraps around) into a value that slips past the range check below,
+  // and it mixed signed and unsigned operands in the comparison.
+  const size_t slot = offset / page_size_ / 2;
+  ASSERT(slot < total_pages_);
+  return slot;
+}
+
+// For slots selected by ShouldRightAlign(), moves *ptr so the allocation of
+// `size` bytes ends as close to the end of its page as the alignment allows,
+// and fills up to kMagicSize bytes of the padding after the allocation with
+// the slot's magic value.  Other slots are left untouched.
+//
+// *ptr is read as the start of the slot's page on entry (page_size_ is added
+// to it to find the page end).
+void GuardedPageAllocator::MaybeRightAlign(size_t slot, size_t size,
+                                           size_t alignment, void** ptr) {
+  if (!ShouldRightAlign(slot)) return;
+  // Start with the allocation flush against the end of the page.
+  uintptr_t adjusted_ptr =
+      reinterpret_cast<uintptr_t>(*ptr) + page_size_ - size;
+
+  // If alignment == 0, the necessary alignment is never larger than the size
+  // rounded up to the next power of 2.  We use this fact to minimize alignment
+  // padding between the end of small allocations and their guard pages.
+  //
+  // For allocations larger than the greater of kAlignment and
+  // __STDCPP_DEFAULT_NEW_ALIGNMENT__, we're safe aligning to that value.
+  size_t default_alignment =
+      std::min(absl::bit_ceil(size),
+               std::max(static_cast<size_t>(kAlignment),
+                        static_cast<size_t>(__STDCPP_DEFAULT_NEW_ALIGNMENT__)));
+
+  // Ensure valid alignment.
+  alignment = std::max(alignment, default_alignment);
+  // Round the start down to the alignment; the bytes given up here become
+  // padding between the end of the allocation and the end of the page.
+  uintptr_t alignment_padding = adjusted_ptr & (alignment - 1);
+  adjusted_ptr -= alignment_padding;
+
+  // Write magic bytes in alignment padding to detect small overflow writes.
+  size_t magic_size = std::min(alignment_padding, kMagicSize);
+  memset(reinterpret_cast<void*>(adjusted_ptr + size),
+         GetWriteOverflowMagic(slot), magic_size);
+  *ptr = reinterpret_cast<void*>(adjusted_ptr);
+}
+
+// If this failure occurs during "bazel test", writes a warning for Bazel to
+// display.
+static void RecordBazelWarning(absl::string_view error) {
+  const char* warning_file = thread_safe_getenv("TEST_WARNINGS_OUTPUT_FILE");
+  if (!warning_file) return;  // Not a bazel test.
+
+  constexpr char warning[] = "GWP-ASan error detected: ";
+  int fd = open(warning_file, O_CREAT | O_WRONLY | O_APPEND, 0644);
+  if (fd == -1) return;
+  (void)write(fd, warning, sizeof(warning) - 1);
+  (void)write(fd, error.data(), error.size());
+  (void)write(fd, "\n", 1);
+  close(fd);
+}
+
+// If this failure occurs during a gUnit test, writes an XML file describing the
+// error type to the path named by the XML_OUTPUT_FILE environment variable,
+// replacing any existing contents.  We cannot use
+// ::testing::Test::RecordProperty() because it doesn't write the XML file if a
+// test crashes (which we're about to do here), so the file is written
+// directly.  Does nothing if the variable is unset or the file cannot be
+// opened; write errors are ignored.
+static void RecordTestFailure(absl::string_view error) {
+  const char* output_path = thread_safe_getenv("XML_OUTPUT_FILE");
+  if (output_path == nullptr) return;  // Not a gUnit test.
+
+  // Record test failure for Sponge.  The error text is placed between the two
+  // fragments below, as the value of the gwp-asan-report property.
+  constexpr char before_error[] =
+      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
+      "<testsuites><testsuite><testcase>"
+      "  <properties>"
+      "    <property name=\"gwp-asan-report\" value=\"";
+  constexpr char after_error[] =
+      "\"/>"
+      "  </properties>"
+      "  <failure message=\"MemoryError\">"
+      "    GWP-ASan detected a memory error.  See the test log for full report."
+      "  </failure>"
+      "</testcase></testsuite></testsuites>";
+
+  const int fd = open(output_path, O_CREAT | O_WRONLY | O_TRUNC, 0644);
+  if (fd == -1) return;
+  (void)write(fd, before_error, sizeof(before_error) - 1);
+  (void)write(fd, error.data(), error.size());
+  (void)write(fd, after_error, sizeof(after_error) - 1);
+  close(fd);
+}
+// If this crash occurs in a test, records test failure summaries: first a
+// Bazel warning (RecordBazelWarning), then a gUnit XML report
+// (RecordTestFailure).  Each step is a no-op outside its environment.
+//
+// error contains the type of error to record.
+static void RecordCrash(absl::string_view error) {
+
+  RecordBazelWarning(error);
+  RecordTestFailure(error);
+}
+
+// Logs each of the first `depth` entries of stack_frames on its own line,
+// prefixed with "  @  ".
+static void PrintStackTrace(void** stack_frames, size_t depth) {
+  void** const frames_end = stack_frames + depth;
+  for (void** frame = stack_frames; frame != frames_end; ++frame) {
+    Log(kLog, __FILE__, __LINE__, "  @  ", *frame);
+  }
+}
+
+// Captures up to kMaxStackDepth frames using the signal context passed to the
+// handler (skipping one frame) and logs them with PrintStackTrace().
+static void PrintStackTraceFromSignalHandler(void* context) {
+  void* frames[kMaxStackDepth];
+  const size_t captured = absl::GetStackTraceWithContext(
+      frames, kMaxStackDepth, 1, context, nullptr);
+  PrintStackTrace(frames, captured);
+}
+
+// A SEGV handler that prints stack traces for the allocation and deallocation
+// of relevant memory as well as the location of the memory error.
+//
+// Returns without printing anything when the signal is not SIGSEGV, when the
+// faulting address is not inside the guarded page allocator's range, or when
+// the error cannot be classified.  This function only reports; it does not
+// terminate the process or forward the signal itself.
+static void SegvHandler(int signo, siginfo_t* info, void* context) {
+  if (signo != SIGSEGV) return;
+  void* fault = info->si_addr;
+  if (!tc_globals.guardedpage_allocator().PointerIsMine(fault)) return;
+  GuardedPageAllocator::GpaStackTrace alloc_trace, dealloc_trace;
+  GuardedPageAllocator::ErrorType error =
+      tc_globals.guardedpage_allocator().GetStackTraces(fault, &alloc_trace,
+                                                        &dealloc_trace);
+  if (error == GuardedPageAllocator::ErrorType::kUnknown) return;
+  pid_t current_thread = absl::base_internal::GetTID();
+  off_t offset;
+  size_t size;
+  std::tie(offset, size) =
+      tc_globals.guardedpage_allocator().GetAllocationOffsetAndSize(fault);
+
+  // Report header: what was accessed and where the memory was allocated.
+  Log(kLog, __FILE__, __LINE__,
+      "*** GWP-ASan "
+      "(https://google.github.io/tcmalloc/gwp-asan.html)  "
+      "has detected a memory error ***");
+  Log(kLog, __FILE__, __LINE__, ">>> Access at offset", offset,
+      "into buffer of length", size);
+  Log(kLog, __FILE__, __LINE__,
+      "Error originates from memory allocated in thread", alloc_trace.tid,
+      "at:");
+  PrintStackTrace(alloc_trace.stack, alloc_trace.depth);
+
+  // Error-specific details.  Each case ends with an "... at:" line that the
+  // stack trace printed after the switch completes, and records the error
+  // type for test infrastructure via RecordCrash().
+  switch (error) {
+    case GuardedPageAllocator::ErrorType::kUseAfterFree:
+      Log(kLog, __FILE__, __LINE__, "The memory was freed in thread",
+          dealloc_trace.tid, "at:");
+      PrintStackTrace(dealloc_trace.stack, dealloc_trace.depth);
+      Log(kLog, __FILE__, __LINE__, "Use-after-free occurs in thread",
+          current_thread, "at:");
+      RecordCrash("use-after-free");
+      break;
+    case GuardedPageAllocator::ErrorType::kBufferUnderflow:
+      Log(kLog, __FILE__, __LINE__, "Buffer underflow occurs in thread",
+          current_thread, "at:");
+      RecordCrash("buffer-underflow");
+      break;
+    case GuardedPageAllocator::ErrorType::kBufferOverflow:
+      Log(kLog, __FILE__, __LINE__, "Buffer overflow occurs in thread",
+          current_thread, "at:");
+      RecordCrash("buffer-overflow");
+      break;
+    case GuardedPageAllocator::ErrorType::kDoubleFree:
+      Log(kLog, __FILE__, __LINE__, "The memory was freed in thread",
+          dealloc_trace.tid, "at:");
+      PrintStackTrace(dealloc_trace.stack, dealloc_trace.depth);
+      Log(kLog, __FILE__, __LINE__, "Double free occurs in thread",
+          current_thread, "at:");
+      RecordCrash("double-free");
+      break;
+    case GuardedPageAllocator::ErrorType::kBufferOverflowOnDealloc:
+      Log(kLog, __FILE__, __LINE__,
+          "Buffer overflow (write) detected in thread", current_thread,
+          "at free:");
+      RecordCrash("buffer-overflow-detected-at-free");
+      break;
+    case GuardedPageAllocator::ErrorType::kUnknown:
+      // Not expected here: kUnknown already returned early above.
+      Crash(kCrash, __FILE__, __LINE__, "Unexpected ErrorType::kUnknown");
+  }
+  // Stack of the faulting access, taken from the signal context.
+  PrintStackTraceFromSignalHandler(context);
+  if (error == GuardedPageAllocator::ErrorType::kBufferOverflowOnDealloc) {
+    // The overflow was only noticed at free time, so the trace above is that
+    // of the free, not of the overflowing write.
+    Log(kLog, __FILE__, __LINE__,
+        "*** Try rerunning with --config=asan to get stack trace of overflow "
+        "***");
+  }
+}
+
+// Signal disposition that was in place before our handler was installed;
+// filled in by the sigaction() call that installs HandleSegvAndForward.
+static struct sigaction old_sa;
+
+// Hands the signal on to the disposition recorded in old_sa: calls the
+// previous handler (in whichever form it was registered), does nothing if the
+// signal was previously ignored, or restores the default disposition and
+// re-raises the signal.
+static void ForwardSignal(int signo, siginfo_t* info, void* context) {
+  // Previous handler was registered in the three-argument form.
+  if (old_sa.sa_flags & SA_SIGINFO) {
+    old_sa.sa_sigaction(signo, info, context);
+    return;
+  }
+
+  if (old_sa.sa_handler == SIG_IGN) {
+    return;  // Previous sigaction ignored signal, so do the same.
+  }
+
+  if (old_sa.sa_handler == SIG_DFL) {
+    // No previous handler registered.  Re-raise signal for core dump.
+    const int rc = sigaction(signo, &old_sa, nullptr);
+    if (rc == -1) {
+      Log(kLog, __FILE__, __LINE__, "Couldn't restore previous sigaction!");
+    }
+    raise(signo);
+    return;
+  }
+
+  // Previous handler was registered in the one-argument form.
+  old_sa.sa_handler(signo);
+}
+
+// Signal entry point installed for SIGSEGV: first gives SegvHandler() the
+// chance to print a GWP-ASan report, then always forwards the signal to the
+// previously installed disposition via ForwardSignal().
+static void HandleSegvAndForward(int signo, siginfo_t* info, void* context) {
+  SegvHandler(signo, info, context);
+  ForwardSignal(signo, info, context);
+}
+
+// Installs HandleSegvAndForward as the SIGSEGV handler (saving the previous
+// disposition in old_sa) and then allows the guarded page allocator to hand
+// out allocations.  The setup runs at most once, however many times this
+// function is called.  The return value of sigaction() is not checked.
+extern "C" void MallocExtension_Internal_ActivateGuardedSampling() {
+  static absl::once_flag once;
+  absl::call_once(once, [] {
+    struct sigaction segv_action = {};
+    segv_action.sa_flags = SA_SIGINFO;
+    sigemptyset(&segv_action.sa_mask);
+    segv_action.sa_sigaction = HandleSegvAndForward;
+    sigaction(SIGSEGV, &segv_action, &old_sa);
+    tc_globals.guardedpage_allocator().AllowAllocations();
+  });
+}
+
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
+GOOGLE_MALLOC_SECTION_END
--- a/src/third_party/tcmalloc/dist/tcmalloc/guarded_page_allocator.h
+++ b/src/third_party/tcmalloc/dist/tcmalloc/guarded_page_allocator.h
@ -0,0 +1,315 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef TCMALLOC_GUARDED_PAGE_ALLOCATOR_H_
+#define TCMALLOC_GUARDED_PAGE_ALLOCATOR_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/types.h>
+
+#include <utility>
+
+#include "absl/base/attributes.h"
+#include "absl/base/internal/spinlock.h"
+#include "absl/base/thread_annotations.h"
+#include "tcmalloc/common.h"
+#include "tcmalloc/internal/logging.h"
+
+GOOGLE_MALLOC_SECTION_BEGIN
+namespace tcmalloc {
+namespace tcmalloc_internal {
+
+// An allocator that gives each allocation a new region, with guard pages on
+// either side of the allocated region.  If a buffer is overflowed to the next
+// guard page or underflowed to the previous guard page, a segfault occurs.
+// After an allocation is freed, the underlying page is marked as inaccessible,
+// and any future accesses to it will also cause segfaults until the page is
+// reallocated.
+//
+// Is safe to use with static storage duration and is thread safe with the
+// exception of calls to Init() and Destroy() (see corresponding function
+// comments).
+//
+// Example (Allocate() returns an AllocWithStatus; the pointer is its 'alloc'
+// member):
+//   ABSL_CONST_INIT GuardedPageAllocator gpa;
+//
+//   void foo() {
+//     char *buf = reinterpret_cast<char *>(gpa.Allocate(8000, 1).alloc);
+//     buf[0] = 'A';            // OK. No segfault occurs.
+//     memset(buf, 'A', 8000);  // OK. No segfault occurs.
+//     buf[-300] = 'A';         // Segfault!
+//     buf[9000] = 'A';         // Segfault!
+//     gpa.Deallocate(buf);
+//     buf[0] = 'B';            // Segfault!
+//   }
+//
+//   int main() {
+//     // Call Init() only once.  Init() is annotated as requiring
+//     // pageheap_lock to be held.
+//     gpa.Init(64, GuardedPageAllocator::kGpaMaxPages);
+//     gpa.AllowAllocations();
+//     for (int i = 0; i < 1000; i++) foo();
+//     return 0;
+//   }
+class GuardedPageAllocator {
+ public:
+  // A captured stack trace: up to kMaxStackDepth frames, the number of frames
+  // actually recorded, and the id of the thread it was recorded on.
+  struct GpaStackTrace {
+    void* stack[kMaxStackDepth];
+    size_t depth = 0;
+    pid_t tid = 0;
+  };
+
+  // Maximum number of pages this class can allocate.
+  static constexpr size_t kGpaMaxPages = 512;
+
+  // Classification of a detected memory error, as returned by
+  // GetStackTraces().
+  enum class ErrorType {
+    kUseAfterFree,
+    kBufferUnderflow,
+    kBufferOverflow,
+    kDoubleFree,
+    kBufferOverflowOnDealloc,
+    kUnknown,
+  };
+
+  // Zero-initializes every member; the allocator is not usable until Init()
+  // and AllowAllocations() have been called.
+  constexpr GuardedPageAllocator()
+      : guarded_page_lock_(absl::kConstInit,
+                           absl::base_internal::SCHEDULE_KERNEL_ONLY),
+        free_pages_{},
+        num_alloced_pages_(0),
+        num_alloced_pages_max_(0),
+        num_allocation_requests_(0),
+        num_failed_allocations_(0),
+        data_(nullptr),
+        pages_base_addr_(0),
+        pages_end_addr_(0),
+        first_page_addr_(0),
+        max_alloced_pages_(0),
+        total_pages_(0),
+        page_size_(0),
+        rand_(0),
+        initialized_(false),
+        allow_allocations_(false),
+        double_free_detected_(false),
+        write_overflow_detected_(false) {}
+
+  GuardedPageAllocator(const GuardedPageAllocator&) = delete;
+  GuardedPageAllocator& operator=(const GuardedPageAllocator&) = delete;
+
+  ~GuardedPageAllocator() = default;
+
+  // Configures this allocator to allocate up to max_alloced_pages pages at a
+  // time from a pool of total_pages pages, where:
+  //   1 <= max_alloced_pages <= total_pages <= kGpaMaxPages
+  //
+  // This method should be called non-concurrently and only once to complete
+  // initialization.  Dynamic initialization is deliberately done here and not
+  // in the constructor, thereby allowing the constructor to be constexpr and
+  // avoiding static initialization order issues.
+  void Init(size_t max_alloced_pages, size_t total_pages)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
+
+  // Unmaps memory allocated by this class.
+  //
+  // This method should be called non-concurrently and only once to complete
+  // destruction.  Destruction is deliberately done here and not in the
+  // destructor, thereby allowing the destructor to be trivial (i.e. a no-op)
+  // and avoiding use-after-destruction issues for static/global instances.
+  void Destroy();
+
+  // Result of Allocate(): the allocation (nullptr on failure) together with a
+  // status describing the outcome.
+  struct AllocWithStatus {
+    void* alloc = nullptr;
+    Profile::Sample::GuardedStatus status =
+        Profile::Sample::GuardedStatus::Unknown;
+  };
+
+  // On success, returns an instance of AllocWithStatus which includes a pointer
+  // to size bytes of page-guarded memory, aligned to alignment.  The member
+  // 'alloc' is a pointer that is guaranteed to be tagged.
+  // The 'status' member is set to GuardedStatus::Guarded.
+  // On failure, returns an instance of AllocWithStatus (the 'alloc' member is
+  // set to 'nullptr').  Failure can occur if memory could not be mapped or
+  // protected, if all guarded pages are already allocated, or if size is 0.
+  // These conditions are reflected in the 'status' member of the
+  // AllocWithStatus return value.
+  //
+  // Precondition:  size and alignment <= page_size_
+  // Precondition:  alignment is 0 or a power of 2
+  AllocWithStatus Allocate(size_t size, size_t alignment)
+      ABSL_LOCKS_EXCLUDED(guarded_page_lock_);
+
+  // Deallocates memory pointed to by ptr.  ptr must have been previously
+  // returned by a call to Allocate.
+  void Deallocate(void* ptr) ABSL_LOCKS_EXCLUDED(guarded_page_lock_);
+
+  // Returns the size requested when ptr was allocated.  ptr must have been
+  // previously returned by a call to Allocate.
+  size_t GetRequestedSize(const void* ptr) const;
+
+  // Returns ptr's offset from the beginning of its allocation along with the
+  // allocation's size.
+  std::pair<off_t, size_t> GetAllocationOffsetAndSize(const void* ptr) const;
+
+  // Records stack traces in alloc_trace and dealloc_trace for the page nearest
+  // to ptr.  alloc_trace is the trace at the time the page was allocated.  If
+  // the page is still allocated, dealloc_trace->depth will be 0. If the page
+  // has been deallocated, dealloc_trace is the trace at the time the page was
+  // deallocated.
+  //
+  // Returns the likely error type for an access at ptr.
+  //
+  // Requires that ptr points to memory mapped by this class.
+  ErrorType GetStackTraces(const void* ptr, GpaStackTrace* alloc_trace,
+                           GpaStackTrace* dealloc_trace) const;
+
+  // Writes a human-readable summary of GuardedPageAllocator's internal state to
+  // *out.
+  void Print(Printer* out) ABSL_LOCKS_EXCLUDED(guarded_page_lock_);
+  // Writes the same counters as Print() into *gwp_asan as named integer
+  // entries.
+  void PrintInPbtxt(PbtxtRegion* gwp_asan)
+      ABSL_LOCKS_EXCLUDED(guarded_page_lock_);
+
+  // Returns true if ptr points to memory managed by this class, i.e. lies in
+  // [pages_base_addr_, pages_end_addr_).  Reads those members without taking
+  // guarded_page_lock_.
+  inline bool ABSL_ATTRIBUTE_ALWAYS_INLINE
+  PointerIsMine(const void* ptr) const {
+    uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
+    return pages_base_addr_ <= addr && addr < pages_end_addr_;
+  }
+
+  // Allows Allocate() to start returning allocations.
+  void AllowAllocations() ABSL_LOCKS_EXCLUDED(guarded_page_lock_) {
+    absl::base_internal::SpinLockHolder h(&guarded_page_lock_);
+    allow_allocations_ = true;
+  }
+
+  // Returns the number of pages available for allocation, based on how many are
+  // currently in use.  (Should only be used in testing.)
+  size_t GetNumAvailablePages() ABSL_LOCKS_EXCLUDED(guarded_page_lock_) {
+    absl::base_internal::SpinLockHolder h(&guarded_page_lock_);
+    return max_alloced_pages_ - num_alloced_pages_;
+  }
+
+ private:
+  // Structure for storing data about a slot.
+  struct SlotMetadata {
+    GpaStackTrace alloc_trace;
+    GpaStackTrace dealloc_trace;
+    size_t requested_size = 0;
+    uintptr_t allocation_start = 0;
+  };
+
+  // Max number of magic bytes we use to detect write-overflows at deallocation.
+  static constexpr size_t kMagicSize = 32;
+
+  // Maps pages into memory.
+  void MapPages() ABSL_LOCKS_EXCLUDED(guarded_page_lock_)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
+
+  // Reserves and returns a slot randomly selected from the free slots in
+  // free_pages_.  Returns -1 if no slots available, or if AllowAllocations()
+  // hasn't been called yet.
+  ssize_t ReserveFreeSlot() ABSL_LOCKS_EXCLUDED(guarded_page_lock_);
+
+  // Returns the i-th free slot of free_pages_.  i must be in the range [0,
+  // total_pages_ - num_alloced_pages_).
+  size_t GetIthFreeSlot(size_t i)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(guarded_page_lock_);
+
+  // Marks the specified slot as unreserved.
+  void FreeSlot(size_t slot) ABSL_EXCLUSIVE_LOCKS_REQUIRED(guarded_page_lock_);
+
+  // Returns the address of the page that addr resides on.
+  uintptr_t GetPageAddr(uintptr_t addr) const;
+
+  // Returns an address somewhere on the valid page nearest to addr.
+  uintptr_t GetNearestValidPage(uintptr_t addr) const;
+
+  // Returns the slot number for the page nearest to addr.
+  size_t GetNearestSlot(uintptr_t addr) const;
+
+  // Returns true if the specified slot has already been freed.
+  bool IsFreed(size_t slot) const
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(guarded_page_lock_);
+
+  // Returns true if magic bytes for slot were overwritten.
+  bool WriteOverflowOccurred(size_t slot) const;
+
+  // Returns the likely error type for the given access address and metadata
+  // associated with the nearest slot.
+  ErrorType GetErrorType(uintptr_t addr, const SlotMetadata& d) const;
+
+  // Magic constant used for detecting write-overflows at deallocation time.
+  // The product is reduced to 8 bits on return; slots 0 and 1 yield 0.
+  static uint8_t GetWriteOverflowMagic(size_t slot) {
+    // Only even slots get magic bytes, so use slot / 2 for more unique magics.
+    return uint8_t{0xcd} * static_cast<uint8_t>(slot / 2);
+  }
+
+  // Returns true if slot should be right aligned.
+  static bool ShouldRightAlign(size_t slot) { return slot % 2 == 0; }
+
+  // If slot is marked for right alignment, moves the allocation in *ptr to the
+  // right end of the slot, maintaining the specified size and alignment.  Magic
+  // bytes are written in any alignment padding.
+  void MaybeRightAlign(size_t slot, size_t size, size_t alignment, void** ptr);
+
+  // Conversions between a slot index and the start address of the page that
+  // backs that slot.
+  uintptr_t SlotToAddr(size_t slot) const;
+  size_t AddrToSlot(uintptr_t addr) const;
+
+  absl::base_internal::SpinLock guarded_page_lock_;
+
+  // Maps each bool to one page.
+  // true: Free.  false: Reserved.
+  bool free_pages_[kGpaMaxPages] ABSL_GUARDED_BY(guarded_page_lock_);
+
+  // Number of currently-allocated pages.
+  size_t num_alloced_pages_ ABSL_GUARDED_BY(guarded_page_lock_);
+
+  // The high-water mark for num_alloced_pages_.
+  size_t num_alloced_pages_max_ ABSL_GUARDED_BY(guarded_page_lock_);
+
+  // Number of calls to Allocate.
+  size_t num_allocation_requests_ ABSL_GUARDED_BY(guarded_page_lock_);
+
+  // Number of times Allocate has failed.
+  size_t num_failed_allocations_ ABSL_GUARDED_BY(guarded_page_lock_);
+
+  // A dynamically-allocated array of stack trace data captured when each page
+  // is allocated/deallocated.  Printed by the SEGV handler when a memory error
+  // is detected.
+  SlotMetadata* data_;
+
+  uintptr_t pages_base_addr_;  // Points to start of mapped region.
+  uintptr_t pages_end_addr_;   // Points to the end of mapped region.
+  uintptr_t first_page_addr_;  // Points to first page returnable by Allocate.
+  size_t max_alloced_pages_;   // Max number of pages to allocate at once.
+  size_t total_pages_;         // Size of the page pool to allocate from.
+  size_t page_size_;           // Size of pages we allocate.
+  uint64_t rand_;              // RNG seed.
+
+  // True if this object has been fully initialized.
+  bool initialized_ ABSL_GUARDED_BY(guarded_page_lock_);
+
+  // Flag to control whether we can return allocations or not.
+  bool allow_allocations_ ABSL_GUARDED_BY(guarded_page_lock_);
+
+  // Set to true if a double free has occurred.
+  bool double_free_detected_;
+
+  // Set to true if a write overflow was detected on deallocation.
+  bool write_overflow_detected_;
+};
+
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
+GOOGLE_MALLOC_SECTION_END
+
+#endif  // TCMALLOC_GUARDED_PAGE_ALLOCATOR_H_
--- a/src/third_party/tcmalloc/dist/tcmalloc/guarded_page_allocator_benchmark.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/guarded_page_allocator_benchmark.cc
@ -0,0 +1,63 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <unistd.h>
+
+#include <algorithm>
+
+#include "absl/base/internal/spinlock.h"
+#include "benchmark/benchmark.h"
+#include "tcmalloc/guarded_page_allocator.h"
+#include "tcmalloc/internal/logging.h"
+#include "tcmalloc/internal/page_size.h"
+
+GOOGLE_MALLOC_SECTION_BEGIN
+namespace tcmalloc {
+namespace tcmalloc_internal {
+namespace {
+
+// Pool size used by the benchmark: the largest the allocator supports.
+static constexpr size_t kMaxGpaPages = GuardedPageAllocator::kGpaMaxPages;
+
+// Size of pages used by GuardedPageAllocator: the larger of kPageSize and the
+// size reported by GetPageSize().  Computed once and cached.
+static size_t PageSize() {
+  static const size_t cached = [] {
+    const size_t reported = static_cast<size_t>(GetPageSize());
+    return kPageSize < reported ? reported : kPageSize;
+  }();
+  return cached;
+}
+
+// Benchmarks one Allocate/Deallocate round trip of state.range(0) bytes,
+// touching the first and last byte of each allocation.  All benchmark threads
+// share a single allocator, created on first use and never destroyed.
+void BM_AllocDealloc(benchmark::State& state) {
+  static GuardedPageAllocator* const allocator = [] {
+    auto* fresh = new GuardedPageAllocator;
+    // Init() is annotated as requiring pageheap_lock.
+    absl::base_internal::SpinLockHolder h(&pageheap_lock);
+    fresh->Init(kMaxGpaPages, kMaxGpaPages);
+    fresh->AllowAllocations();
+    return fresh;
+  }();
+  const size_t alloc_size = state.range(0);
+  for (auto _ : state) {
+    void* raw = allocator->Allocate(alloc_size, 0).alloc;
+    char* bytes = reinterpret_cast<char*>(raw);
+    CHECK_CONDITION(bytes != nullptr);
+    bytes[0] = 'X';               // Page fault first page.
+    bytes[alloc_size - 1] = 'X';  // Page fault last page.
+    allocator->Deallocate(bytes);
+  }
+}
+
+BENCHMARK(BM_AllocDealloc)->Range(1, PageSize());
+BENCHMARK(BM_AllocDealloc)->Arg(1)->ThreadRange(1, kMaxGpaPages);
+
+}  // namespace
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
+GOOGLE_MALLOC_SECTION_END
--- a/src/third_party/tcmalloc/dist/tcmalloc/guarded_page_allocator_profile_test.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/guarded_page_allocator_profile_test.cc
@ -0,0 +1,266 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "benchmark/benchmark.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/functional/function_ref.h"
+#include "tcmalloc/malloc_extension.h"
+#include "tcmalloc/static_vars.h"
+#include "tcmalloc/testing/testutil.h"
+
+namespace tcmalloc {
+namespace tcmalloc_internal {
+namespace {
+
+// Fixture for tests that inspect the guarded_status attached to the samples of
+// an allocation profile.  Guarded sampling is activated before every test.
+class GuardedPageAllocatorProfileTest : public testing::Test {
+ public:
+  // What AllocateUntil() should do once its callback has looked at an
+  // allocation.
+  struct NextSteps {
+    bool stop = true;  // stop allocating
+    bool free = true;  // free allocation
+  };
+
+  void SetUp() override { MallocExtension::ActivateGuardedSampling(); }
+
+  // Allocates `size` bytes with ::operator new over and over, handing each
+  // allocation to `evaluate_alloc`, until the callback asks to stop.
+  // Returns the number of allocations made.
+  int AllocateUntil(size_t size,
+                    absl::FunctionRef<NextSteps(void*)> evaluate_alloc) {
+    int alloc_count = 0;
+    while (true) {
+      void* alloc = ::operator new(size);
+      ++alloc_count;
+      benchmark::DoNotOptimize(alloc);
+      auto result = evaluate_alloc(alloc);
+      // evaluate_alloc takes responsibility for delete/free if result.free is
+      // set to false.
+      if (result.free) {
+        ::operator delete(alloc);
+      }
+      if (result.stop) {
+        break;
+      }
+    }
+    return alloc_count;
+  }
+
+  // Allocate until sample is guarded
+  // Called to reduce the internal counter to -1, which will trigger resetting
+  // the counter to the configured rate.
+  void AllocateUntilGuarded() {
+    AllocateUntil(968, [&](void* alloc) -> NextSteps {
+      return {IsSampledMemory(alloc) &&
+                  Static::guardedpage_allocator().PointerIsMine(alloc),
+              true};
+    });
+  }
+
+  // Walks every sample in `profile`, passing each one to `verify`, and then
+  // expects that `sought_status` was seen and that every other guarded status
+  // seen is a member of `allowable_statuses`.
+  void ExamineSamples(
+      Profile& profile, Profile::Sample::GuardedStatus sought_status,
+      absl::flat_hash_set<Profile::Sample::GuardedStatus> allowable_statuses,
+      absl::FunctionRef<void(const Profile::Sample& s)> verify =
+          [](const Profile::Sample& s) { /* do nothing */ }) {
+    absl::flat_hash_set<Profile::Sample::GuardedStatus> found_statuses;
+    profile.Iterate([&](const Profile::Sample& s) {
+      found_statuses.insert(s.guarded_status);
+      verify(s);
+    });
+    EXPECT_THAT(found_statuses, ::testing::Contains(sought_status));
+    // The sought status was checked above; only the remaining statuses are
+    // held against the allow-list.
+    found_statuses.erase(sought_status);
+    EXPECT_THAT(found_statuses, ::testing::IsSubsetOf(allowable_statuses));
+  }
+};
+
+// With sampling always on and the guarded sampler primed, one 1051-byte
+// allocation made while profiling must yield a Guarded sample and no sample
+// with any other guarded status.
+TEST_F(GuardedPageAllocatorProfileTest, Guarded) {
+  ScopedAlwaysSample sas;
+  AllocateUntilGuarded();
+  auto token = MallocExtension::StartAllocationProfiling();
+
+  constexpr size_t kAllocSize = 1051;
+  const auto stop_after_first = [](void*) -> NextSteps {
+    return {true, true};
+  };
+  AllocateUntil(kAllocSize, stop_after_first);
+
+  auto profile = std::move(token).Stop();
+  ExamineSamples(profile, Profile::Sample::GuardedStatus::Guarded, {});
+}
+
+// With the profile sampling rate set to 4096, allocates 2 MiB once while
+// profiling and expects a NotAttempted sample.  Guarded samples are tolerated
+// only when their requested size differs from the 2 MiB request.
+TEST_F(GuardedPageAllocatorProfileTest, NotAttempted) {
+  ScopedProfileSamplingRate spsr(4096);
+  auto token = MallocExtension::StartAllocationProfiling();
+
+  constexpr size_t alloc_size = 2 * 1024 * 1024;
+  const auto stop_after_first = [](void*) -> NextSteps {
+    return {true, true};
+  };
+  AllocateUntil(alloc_size, stop_after_first);
+
+  auto profile = std::move(token).Stop();
+  const auto check_sample = [&](const Profile::Sample& s) {
+    if (s.guarded_status == Profile::Sample::GuardedStatus::Guarded) {
+      EXPECT_NE(alloc_size, s.requested_size);
+    }
+  };
+  ExamineSamples(profile, Profile::Sample::GuardedStatus::NotAttempted,
+                 {Profile::Sample::GuardedStatus::Guarded}, check_sample);
+}
+
+// With sampling always on and the guarded sampler primed, one allocation of
+// kPageSize + 1 bytes must yield a LargerThanOnePage sample.  Guarded samples
+// are tolerated only when their requested size differs from that request.
+TEST_F(GuardedPageAllocatorProfileTest, LargerThanOnePage) {
+  ScopedAlwaysSample sas;
+  AllocateUntilGuarded();
+  auto token = MallocExtension::StartAllocationProfiling();
+
+  constexpr size_t alloc_size = kPageSize + 1;
+  const auto stop_after_first = [](void*) -> NextSteps {
+    return {true, true};
+  };
+  AllocateUntil(alloc_size, stop_after_first);
+
+  auto profile = std::move(token).Stop();
+  const auto check_sample = [&](const Profile::Sample& s) {
+    if (s.guarded_status == Profile::Sample::GuardedStatus::Guarded) {
+      EXPECT_NE(alloc_size, s.requested_size);
+    }
+  };
+  ExamineSamples(profile, Profile::Sample::GuardedStatus::LargerThanOnePage,
+                 {Profile::Sample::GuardedStatus::Guarded}, check_sample);
+}
+
+// With the guarded sampling rate set to -1 and the profile sampling rate set
+// to 1, one 1024-byte allocation made while profiling must yield a Disabled
+// sample and no sample with any other guarded status.
+TEST_F(GuardedPageAllocatorProfileTest, Disabled) {
+  ScopedGuardedSamplingRate sgsr(-1);
+  ScopedProfileSamplingRate spsr(1);
+  auto token = MallocExtension::StartAllocationProfiling();
+
+  constexpr size_t kAllocSize = 1024;
+  const auto stop_after_first = [](void*) -> NextSteps {
+    return {true, true};
+  };
+  AllocateUntil(kAllocSize, stop_after_first);
+
+  auto profile = std::move(token).Stop();
+  ExamineSamples(profile, Profile::Sample::GuardedStatus::Disabled, {});
+}
+
+// With both the guarded and the profile sampling rate set to 1, keeps
+// allocating 1033 bytes until one sampled allocation was served by the guarded
+// page allocator and another one was not.  The profile must then contain both
+// a Guarded and a RateLimited sample for that size.
+TEST_F(GuardedPageAllocatorProfileTest, RateLimited) {
+  ScopedGuardedSamplingRate sgsr(1);
+  ScopedProfileSamplingRate spsr(1);
+  auto token = MallocExtension::StartAllocationProfiling();
+
+  // Keep allocating until something is sampled
+  constexpr size_t alloc_size = 1033;
+  bool guarded_found = false;
+  bool unguarded_found = false;
+  AllocateUntil(alloc_size, [&](void* alloc) -> NextSteps {
+    // Unsampled allocations tell us nothing; free them and keep going.
+    if (!IsSampledMemory(alloc)) {
+      return {false, true};
+    }
+    const bool is_guarded =
+        Static::guardedpage_allocator().PointerIsMine(alloc);
+    (is_guarded ? guarded_found : unguarded_found) = true;
+    return {guarded_found && unguarded_found, true};
+  });
+
+  // Ensure Guarded and RateLimited both occur for the alloc_size
+  bool success_found = false;
+  bool ratelimited_found = false;
+  auto profile = std::move(token).Stop();
+  const auto record_status = [&](const Profile::Sample& s) {
+    if (s.requested_size != alloc_size) {
+      return;
+    }
+    if (s.guarded_status == Profile::Sample::GuardedStatus::Guarded) {
+      success_found = true;
+    } else if (s.guarded_status ==
+               Profile::Sample::GuardedStatus::RateLimited) {
+      ratelimited_found = true;
+    }
+  };
+  ExamineSamples(profile, Profile::Sample::GuardedStatus::RateLimited,
+                 {Profile::Sample::GuardedStatus::Guarded}, record_status);
+  EXPECT_TRUE(success_found);
+  EXPECT_TRUE(ratelimited_found);
+}
+
+// With sampling always on and the guarded sampler primed, one zero-byte
+// allocation must yield a TooSmall sample.  RateLimited and Guarded samples
+// are tolerated; a Guarded sample must not have the zero-byte request size and
+// a TooSmall sample must have it.
+TEST_F(GuardedPageAllocatorProfileTest, TooSmall) {
+  ScopedAlwaysSample sas;
+  AllocateUntilGuarded();
+  auto token = MallocExtension::StartAllocationProfiling();
+
+  // Next sampled allocation should be too small
+  constexpr size_t alloc_size = 0;
+  const auto stop_after_first = [](void*) -> NextSteps {
+    return {true, true};
+  };
+  AllocateUntil(alloc_size, stop_after_first);
+
+  auto profile = std::move(token).Stop();
+  const auto check_sample = [&](const Profile::Sample& s) {
+    if (s.guarded_status == Profile::Sample::GuardedStatus::Guarded) {
+      EXPECT_NE(alloc_size, s.requested_size);
+    } else if (s.guarded_status == Profile::Sample::GuardedStatus::TooSmall) {
+      EXPECT_EQ(alloc_size, s.requested_size);
+    }
+  };
+  ExamineSamples(profile, Profile::Sample::GuardedStatus::TooSmall,
+                 {Profile::Sample::GuardedStatus::RateLimited,
+                  Profile::Sample::GuardedStatus::Guarded},
+                 check_sample);
+}
+
+// Holds on to guarded 1039-byte allocations until the guarded page allocator
+// reports no available pages, then checks that one further 1055-byte
+// allocation made while profiling is sampled as NoAvailableSlots.
+TEST_F(GuardedPageAllocatorProfileTest, NoAvailableSlots) {
+  ScopedAlwaysSample sas;
+  AllocateUntilGuarded();
+
+  // AllocateUntil() obtains memory with ::operator new, so it must be returned
+  // with ::operator delete.  std::unique_ptr<char>'s default deleter would run
+  // a delete-expression on a pointer that no new-expression produced.
+  struct OperatorDelete {
+    void operator()(void* ptr) const { ::operator delete(ptr); }
+  };
+  std::vector<std::unique_ptr<void, OperatorDelete>> allocs;
+  // Guard until there are no slots available.
+  AllocateUntil(1039, [&](void* alloc) -> NextSteps {
+    if (Static::guardedpage_allocator().PointerIsMine(alloc)) {
+      // Ownership moves to `allocs`, so AllocateUntil() must not free it.
+      allocs.emplace_back(alloc);
+      return {Static::guardedpage_allocator().GetNumAvailablePages() == 0,
+              false};
+    }
+    return {false, true};
+  });
+
+  auto token = MallocExtension::StartAllocationProfiling();
+  // This should fail for lack of slots
+  constexpr size_t alloc_size = 1055;
+  AllocateUntil(alloc_size, [&](void* alloc) -> NextSteps {
+    return {true, true};
+  });
+
+  auto profile = std::move(token).Stop();
+  ExamineSamples(profile, Profile::Sample::GuardedStatus::NoAvailableSlots, {});
+}
+
+}  // namespace
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
--- a/src/third_party/tcmalloc/dist/tcmalloc/guarded_page_allocator_test.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/guarded_page_allocator_test.cc
@ -0,0 +1,275 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tcmalloc/guarded_page_allocator.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <algorithm>
+#include <array>
+#include <memory>
+#include <string>
+#include <thread>  // NOLINT(build/c++11)
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/base/attributes.h"
+#include "absl/base/casts.h"
+#include "absl/base/internal/spinlock.h"
+#include "absl/base/internal/sysinfo.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/memory/memory.h"
+#include "absl/numeric/bits.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "tcmalloc/common.h"
+#include "tcmalloc/internal/logging.h"
+#include "tcmalloc/internal/page_size.h"
+#include "tcmalloc/malloc_extension.h"
+#include "tcmalloc/static_vars.h"
+
+namespace tcmalloc {
+namespace tcmalloc_internal {
+namespace {
+
+static constexpr size_t kMaxGpaPages = GuardedPageAllocator::kGpaMaxPages;
+
+// Returns the page size GuardedPageAllocator operates on: the larger of
+// kPageSize and the value reported by GetPageSize().  The result is computed
+// on the first call and cached for every later call.
+static size_t PageSize() {
+  static const size_t cached_page_size = [] {
+    const size_t reported_page_size = static_cast<size_t>(GetPageSize());
+    return std::max(kPageSize, reported_page_size);
+  }();
+  return cached_page_size;
+}
+
+// Fixture owning a private GuardedPageAllocator.  The allocator is initialized
+// and opened for allocations in the constructor and destroyed again in the
+// destructor.
+class GuardedPageAllocatorTest : public testing::Test {
+ protected:
+  // Default setup: kMaxGpaPages for both Init() arguments.
+  GuardedPageAllocatorTest() : GuardedPageAllocatorTest(kMaxGpaPages) {}
+
+  // Setup with `num_pages` as the first Init() argument and kMaxGpaPages as
+  // the second.  Init() and AllowAllocations() run under pageheap_lock.
+  explicit GuardedPageAllocatorTest(size_t num_pages) {
+    absl::base_internal::SpinLockHolder h(&pageheap_lock);
+    gpa_.Init(num_pages, kMaxGpaPages);
+    gpa_.AllowAllocations();
+  }
+
+  ~GuardedPageAllocatorTest() override { gpa_.Destroy(); }
+
+  GuardedPageAllocator gpa_;
+};
+
+// Value-parameterized variant of the fixture: the test parameter is forwarded
+// to the fixture as its `num_pages` constructor argument.
+class GuardedPageAllocatorParamTest
+    : public GuardedPageAllocatorTest,
+      public testing::WithParamInterface<size_t> {
+ protected:
+  GuardedPageAllocatorParamTest() : GuardedPageAllocatorTest(GetParam()) {}
+};
+
+// Allocates one PageSize()-sized buffer and checks that it is fully writable,
+// that writes just before and just after it die, and that after Deallocate()
+// writes anywhere inside it die as well.
+TEST_F(GuardedPageAllocatorTest, SingleAllocDealloc) {
+  const size_t page_size = PageSize();
+  auto result = gpa_.Allocate(page_size, 0);
+  EXPECT_EQ(result.status, Profile::Sample::GuardedStatus::Guarded);
+  char* buf = static_cast<char*>(result.alloc);
+  EXPECT_NE(buf, nullptr);
+  EXPECT_TRUE(gpa_.PointerIsMine(buf));
+  memset(buf, 'A', page_size);
+
+  // One byte before the start and one byte past the end.
+  EXPECT_DEATH(buf[-1] = 'A', "");
+  EXPECT_DEATH(buf[page_size] = 'A', "");
+
+  gpa_.Deallocate(buf);
+
+  // First, middle and last byte of the released buffer.
+  EXPECT_DEATH(buf[0] = 'B', "");
+  EXPECT_DEATH(buf[page_size / 2] = 'B', "");
+  EXPECT_DEATH(buf[page_size - 1] = 'B', "");
+}
+
+// Allocates small objects (sizes 1, 2, 3, 4, 5, 8, 9, ... 64, 65) without an
+// explicit alignment and checks that every returned pointer is aligned to at
+// least min(size, kLargeObjectAlignment).
+TEST_F(GuardedPageAllocatorTest, NoAlignmentProvided) {
+  constexpr size_t kLargeObjectAlignment =
+      std::max(static_cast<size_t>(kAlignment),
+               static_cast<size_t>(__STDCPP_DEFAULT_NEW_ALIGNMENT__));
+
+  for (size_t base_size = 1; base_size <= 64; base_size <<= 1) {
+    for (size_t size : {base_size, base_size + 1}) {
+      SCOPED_TRACE(size);
+
+      constexpr int kElements = 10;
+      std::array<void*, kElements> ptrs;
+
+      // Make several allocation attempts to encounter left/right-alignment in
+      // the guarded region.
+      for (int i = 0; i < kElements; i++) {
+        auto alloc_with_status = gpa_.Allocate(size, 0);
+        EXPECT_EQ(alloc_with_status.status,
+                  Profile::Sample::GuardedStatus::Guarded);
+        ptrs[i] = alloc_with_status.alloc;
+        EXPECT_NE(ptrs[i], nullptr);
+        EXPECT_TRUE(gpa_.PointerIsMine(ptrs[i]));
+
+        // Largest power of two dividing the address.  The shift is done on a
+        // size_t so that an address with 31 or more trailing zero bits cannot
+        // overflow an int.
+        size_t observed_alignment =
+            size_t{1} << absl::countr_zero(absl::bit_cast<uintptr_t>(ptrs[i]));
+        EXPECT_GE(observed_alignment, std::min(size, kLargeObjectAlignment));
+      }
+
+      for (void* ptr : ptrs) {
+        gpa_.Deallocate(ptr);
+      }
+    }
+  }
+}
+
+// Requests a 1-byte allocation for every power-of-two alignment from 1 up to
+// PageSize() and checks that each returned address honours the alignment.
+// The allocations are not released individually here; the fixture destructor
+// calls gpa_.Destroy().
+TEST_F(GuardedPageAllocatorTest, AllocDeallocAligned) {
+  constexpr size_t alloc_size = 1;
+  const size_t max_align = PageSize();
+  for (size_t align = 1; align <= max_align; align *= 2) {
+    auto result = gpa_.Allocate(alloc_size, align);
+    EXPECT_EQ(result.status, Profile::Sample::GuardedStatus::Guarded);
+    void* const ptr = result.alloc;
+    EXPECT_NE(ptr, nullptr);
+    EXPECT_TRUE(gpa_.PointerIsMine(ptr));
+    EXPECT_EQ(reinterpret_cast<uintptr_t>(ptr) % align, 0);
+  }
+}
+
+// Takes GetParam() allocations, expects the next request to fail with
+// NoAvailableSlots, then frees one allocation, expects the following request
+// to succeed, and finally writes to and releases every buffer.
+TEST_P(GuardedPageAllocatorParamTest, AllocDeallocAllPages) {
+  const size_t num_pages = GetParam();
+  char* bufs[kMaxGpaPages];
+
+  // Take num_pages allocations; each one is expected to be guarded.
+  for (size_t i = 0; i < num_pages; i++) {
+    auto taken = gpa_.Allocate(1, 0);
+    EXPECT_EQ(taken.status, Profile::Sample::GuardedStatus::Guarded);
+    bufs[i] = reinterpret_cast<char*>(taken.alloc);
+    EXPECT_NE(bufs[i], nullptr);
+    EXPECT_TRUE(gpa_.PointerIsMine(bufs[i]));
+  }
+
+  // One more request must be refused.
+  auto refused = gpa_.Allocate(1, 0);
+  EXPECT_EQ(refused.status, Profile::Sample::GuardedStatus::NoAvailableSlots);
+  EXPECT_EQ(refused.alloc, nullptr);
+
+  // Releasing a single buffer makes room for exactly this next request.
+  gpa_.Deallocate(bufs[0]);
+  auto retaken = gpa_.Allocate(1, 0);
+  EXPECT_EQ(retaken.status, Profile::Sample::GuardedStatus::Guarded);
+  bufs[0] = reinterpret_cast<char*>(retaken.alloc);
+  EXPECT_NE(bufs[0], nullptr);
+  EXPECT_TRUE(gpa_.PointerIsMine(bufs[0]));
+
+  // Every buffer must be writable before it is handed back.
+  for (size_t i = 0; i < num_pages; i++) {
+    char* const buf = bufs[i];
+    buf[0] = 'A';
+    gpa_.Deallocate(buf);
+  }
+}
+INSTANTIATE_TEST_SUITE_P(VaryNumPages, GuardedPageAllocatorParamTest,
+                         testing::Values(1, kMaxGpaPages / 2, kMaxGpaPages));
+
+// PointerIsMine() must accept a pointer returned by Allocate() and reject both
+// the address of a stack variable and a pointer from absl::make_unique.
+TEST_F(GuardedPageAllocatorTest, PointerIsMine) {
+  auto result = gpa_.Allocate(1, 0);
+  EXPECT_EQ(result.status, Profile::Sample::GuardedStatus::Guarded);
+  void* guarded_ptr = result.alloc;
+
+  int stack_var;
+  auto malloc_ptr = absl::make_unique<char>();
+
+  EXPECT_TRUE(gpa_.PointerIsMine(guarded_ptr));
+  EXPECT_FALSE(gpa_.PointerIsMine(&stack_var));
+  EXPECT_FALSE(gpa_.PointerIsMine(malloc_ptr.get()));
+}
+
+// Print() must write text matching "GWP-ASan Status" into the Printer's
+// buffer.
+TEST_F(GuardedPageAllocatorTest, Print) {
+  constexpr size_t kBufferSize = 1024;
+  char buf[kBufferSize] = {};
+  Printer out(buf, kBufferSize);
+  gpa_.Print(&out);
+  EXPECT_THAT(buf, testing::ContainsRegex("GWP-ASan Status"));
+}
+
+// Two threads each issue kMaxGpaPages allocation requests concurrently.  The
+// distinct non-null results across both threads must number exactly
+// kMaxGpaPages: no page is handed out twice, none is left unused, and no extra
+// page is handed out.
+TEST_F(GuardedPageAllocatorTest, ThreadedAllocCount) {
+  constexpr size_t kNumThreads = 2;
+  void* allocations[kNumThreads][kMaxGpaPages];
+  {
+    std::vector<std::thread> workers;
+    workers.reserve(kNumThreads);
+    for (size_t i = 0; i < kNumThreads; i++) {
+      workers.emplace_back([this, &allocations, i]() {
+        for (size_t j = 0; j < kMaxGpaPages; j++) {
+          allocations[i][j] = gpa_.Allocate(1, 0).alloc;
+        }
+      });
+    }
+
+    for (auto& worker : workers) {
+      worker.join();
+    }
+  }
+
+  // Collapse all results into a set; failed requests all map to nullptr.
+  absl::flat_hash_set<void*> unique_allocations;
+  for (const auto& per_thread : allocations) {
+    for (void* ptr : per_thread) {
+      unique_allocations.insert(ptr);
+    }
+  }
+  unique_allocations.erase(nullptr);
+  EXPECT_EQ(unique_allocations.size(), kMaxGpaPages);
+}
+
+// Runs 4 * NumCPUs() threads that each obtain one guarded allocation (retrying
+// until a slot is free), check that nobody else has marked the page, mark it,
+// wait, unmark it and release it.  Afterwards kMaxGpaPages allocations must
+// succeed, showing that the allocator stayed consistent and every page was
+// deallocated.
+TEST_F(GuardedPageAllocatorTest, ThreadedHighContention) {
+  const size_t kNumThreads = 4 * absl::base_internal::NumCPUs();
+  {
+    std::vector<std::thread> workers;
+    workers.reserve(kNumThreads);
+    for (size_t i = 0; i < kNumThreads; i++) {
+      workers.emplace_back([this]() {
+        // Retry, with a short sleep in between, until a slot is granted.
+        char* buf = nullptr;
+        for (;;) {
+          auto result = gpa_.Allocate(1, 0);
+          if (result.status == Profile::Sample::GuardedStatus::Guarded) {
+            buf = reinterpret_cast<char*>(result.alloc);
+            EXPECT_NE(buf, nullptr);
+            break;
+          }
+          absl::SleepFor(absl::Nanoseconds(5000));
+        }
+
+        // Verify that no other thread has access to this page.
+        EXPECT_EQ(buf[0], 0);
+
+        // Mark this page and allow some time for another thread to potentially
+        // gain access to this page.
+        buf[0] = 'A';
+        absl::SleepFor(absl::Nanoseconds(5000));
+
+        // Unmark this page and deallocate.
+        buf[0] = 0;
+        gpa_.Deallocate(buf);
+      });
+    }
+
+    for (auto& worker : workers) {
+      worker.join();
+    }
+  }
+  // Verify all pages have been deallocated now that all threads are done.
+  for (size_t i = 0; i < kMaxGpaPages; i++) {
+    auto result = gpa_.Allocate(1, 0);
+    EXPECT_EQ(result.status, Profile::Sample::GuardedStatus::Guarded);
+    EXPECT_NE(result.alloc, nullptr);
+  }
+}
+
+// Never referenced (hence ABSL_ATTRIBUTE_UNUSED).  Presumably this exists only
+// so that ABSL_CONST_INIT makes the build verify that GuardedPageAllocator can
+// be constant-initialized -- TODO confirm against the Abseil attribute docs.
+ABSL_CONST_INIT ABSL_ATTRIBUTE_UNUSED GuardedPageAllocator
+    gpa_is_constant_initializable;
+
+}  // namespace
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
--- a/src/third_party/tcmalloc/dist/tcmalloc/heap_profiling_test.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/heap_profiling_test.cc
@ -0,0 +1,239 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <sys/mman.h>
+
+#include <functional>
+#include <memory>
+#include <optional>
+#include <string>
+
+#include "tcmalloc/internal/profile.pb.h"
+#include "gtest/gtest.h"
+#include "absl/base/attributes.h"
+#include "absl/base/const_init.h"
+#include "absl/base/internal/low_level_alloc.h"
+#include "absl/base/internal/spinlock.h"
+#include "absl/base/thread_annotations.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/str_format.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "tcmalloc/internal/logging.h"
+#include "tcmalloc/internal/profile_builder.h"
+#include "tcmalloc/malloc_extension.h"
+#include "tcmalloc/sampled_allocation.h"
+#include "tcmalloc/static_vars.h"
+#include "tcmalloc/testing/test_allocator_harness.h"
+#include "tcmalloc/testing/thread_manager.h"
+
+namespace tcmalloc {
+namespace {
+
+// Parameterized by the profile sampling rate each test instance runs with.
+class HeapProfilingTest : public ::testing::TestWithParam<int64_t> {};
+
+// Verify that heap profiling sessions concurrent with allocations/deallocations
+// do not crash, as they all use `tc_globals.sampled_allocation_recorder_`. Also
+// check that the data in the sample make sense. Here the
+// allocations/deallocations can happen on the same thread or the object is
+// allocated in one thread, transferred to another thread and deleted there.
+TEST_P(HeapProfilingTest, GetHeapProfileWhileAllocAndDealloc) {
+  ScopedProfileSamplingRate s(GetParam());
+  const int kThreads = 10;
+  ThreadManager manager;
+  AllocatorHarness harness(kThreads);
+
+  // Some threads are busy with allocating and deallocating.
+  manager.Start(kThreads, [&](int thread_id) { harness.Run(thread_id); });
+
+  absl::Time start = absl::Now();
+  // Another few threads busy with iterating different kinds of heap profiles.
+  for (auto t : {
+           ProfileType::kHeap,
+           ProfileType::kFragmentation,
+           ProfileType::kPeakHeap,
+       }) {
+    // `t` is captured by value: the started threads keep running until
+    // manager.Stop() below, long after this loop iteration (and with it the
+    // loop variable) is gone.  `start` lives until the end of the test and can
+    // stay captured by reference.
+    manager.Start(2, [&, t](int) {
+      MallocExtension::SnapshotCurrent(t).Iterate(
+          [&](const Profile::Sample& sample) {
+            // Inspect a few fields in the sample.
+            EXPECT_GE(sample.sum, 0);
+            EXPECT_GT(sample.depth, 0);
+            EXPECT_GT(sample.requested_size, 0);
+            EXPECT_GT(sample.allocated_size, 0);
+            EXPECT_GT(sample.allocation_time, start - absl::Seconds(10));
+            EXPECT_LT(sample.allocation_time, start + absl::Seconds(10));
+          });
+    });
+  }
+
+  absl::SleepFor(absl::Seconds(1));
+  manager.Stop();
+}
+
+// Test at different sampling rates, from always sampling to lower sampling
+// probabilities. This is stress testing and attempts to expose potential
+// failure modes when we only have sampled allocations and when we have a mix of
+// sampled/unsampled allocations.
+INSTANTIATE_TEST_SUITE_P(SamplingRates, HeapProfilingTest,
+                         testing::Values(1, 1 << 7, 1 << 14, 1 << 21),
+                         testing::PrintToStringParamName());
+
+// Allocates 1000 objects of one large size, checks that the heap profile shows
+// that size but not a second one, frees them, and then repeats the exercise
+// with the two sizes swapped.
+TEST(HeapProfilingTest, AllocateDifferentSizes) {
+  const int num_allocations = 1000;
+  const size_t requested_size1 = (1 << 19) + 1;
+  const size_t requested_size2 = (1 << 20) + 1;
+  int requested_size1_count = 0;
+  int requested_size2_count = 0;
+
+  // Adds the number of heap-profile samples of each requested size to the two
+  // counters above.
+  const auto tally_heap_samples = [&]() {
+    MallocExtension::SnapshotCurrent(ProfileType::kHeap)
+        .Iterate([&](const Profile::Sample& s) {
+          if (s.requested_size == requested_size1) requested_size1_count++;
+          if (s.requested_size == requested_size2) requested_size2_count++;
+        });
+  };
+
+  // First allocate some large objects at a specific size, verify through heap
+  // profile, and deallocate them.
+  void* allocations1[num_allocations];
+  for (void*& slot : allocations1) {
+    slot = ::operator new(requested_size1);
+  }
+
+  tally_heap_samples();
+
+  EXPECT_GT(requested_size1_count, 0);
+  EXPECT_EQ(requested_size2_count, 0);
+  requested_size1_count = 0;
+
+  for (void* ptr : allocations1) {
+    ::operator delete(ptr);
+  }
+
+  // Next allocate some large objects at a different size, verify through heap
+  // profile, and deallocate them.
+  void* allocations2[num_allocations];
+  for (void*& slot : allocations2) {
+    slot = ::operator new(requested_size2);
+  }
+
+  tally_heap_samples();
+
+  EXPECT_EQ(requested_size1_count, 0);
+  EXPECT_GT(requested_size2_count, 0);
+
+  for (void* ptr : allocations2) {
+    ::operator delete(ptr);
+  }
+}
+
+// With every allocation sampled, allocates 1000 large objects, makes them
+// resident (mlock, or a write to every byte when mlock fails) and checks that
+// the "sampled_resident_bytes" labels in the converted heap profile add up to
+// between one and two times the requested bytes.
+TEST(HeapProfilingTest, CheckResidency) {
+  ScopedProfileSamplingRate s(1);
+  const int num_allocations = 1000;
+  const size_t requested_size = (1 << 19) + 1;
+
+  void* allocations[num_allocations];
+  for (int i = 0; i < num_allocations; i++) {
+    allocations[i] = ::operator new(requested_size);
+  }
+
+  bool mlock_failure = false;
+  for (int i = 0; i < num_allocations; i++) {
+    if (::mlock(allocations[i], requested_size) != 0) {
+      mlock_failure = true;
+      // Fall back to writing every byte of the allocation.  The index is a
+      // size_t so that it has the same type as the bound it is compared with.
+      for (size_t j = 0; j < requested_size; ++j) {
+        static_cast<volatile char*>(allocations[i])[j] = 0x20;
+      }
+    }
+  }
+  if (mlock_failure) {
+    absl::FPrintF(
+        stderr,
+        "one or more mlocks failed, which could cause test flakiness\n");
+  }
+
+  // Collect the heap profile and look for residency info.
+  auto converted_or = tcmalloc_internal::MakeProfileProto(
+      MallocExtension::SnapshotCurrent(ProfileType::kHeap));
+  ASSERT_TRUE(converted_or.ok());
+  const auto& converted = **converted_or;
+
+  // Look for "sampled_resident_bytes" string in string table.
+  std::optional<int> sampled_resident_bytes_id;
+  for (int i = 0, n = converted.string_table().size(); i < n; ++i) {
+    if (converted.string_table(i) == "sampled_resident_bytes") {
+      sampled_resident_bytes_id = i;
+    }
+  }
+  ASSERT_TRUE(sampled_resident_bytes_id.has_value());
+
+  // Sum the value of every label whose key is that string-table index.
+  size_t resident_size = 0;
+  for (const auto& sample : converted.sample()) {
+    for (const auto& label : sample.label()) {
+      if (label.key() == sampled_resident_bytes_id) {
+        resident_size += label.num();
+      }
+    }
+  }
+
+  EXPECT_GE(resident_size, num_allocations * requested_size);
+  EXPECT_LE(resident_size, num_allocations * requested_size * 2);
+
+  for (int i = 0; i < num_allocations; i++) {
+    // throw away the error
+    ::munlock(allocations[i], requested_size);
+  }
+  for (int i = 0; i < num_allocations; i++) {
+    ::operator delete(allocations[i]);
+  }
+}
+
+// Make sure users can allocate when iterating over the heap samples. For now
+// `MallocExtension::SnapshotCurrent()` uses `StackTraceTable` to make a copy of
+// the sampled allocations from `tc_globals.sampled_allocation_recorder()` and
+// then iterate from the `StackTraceTable`. Ideally, we would want to avoid the
+// extra copy and iterate over sampled allocations directly. However, this would
+// result in deadlocks for the test case below. If we `Iterate()` directly on
+// `tc_globals.sampled_allocation_recorder()`, we hold the per-sample lock. As
+// we add data to a hashtable that stores allocations (always sampled here), the
+// hashtable can decide to `resize()`, deallocates the same sampled allocation
+// it is iterating at, wants to get the per-sample lock and ends up with a
+// deadlock. At the current state, making copies over sampled allocations and
+// iterate over those copies would not deadlock and the test case below passes.
+TEST(HeapProfilingTest, AllocateWhileIterating) {
+  ScopedProfileSamplingRate s(1);
+  absl::flat_hash_set<void*> live_objects;
+  // This fills up the slots in hashtable and so there is a good chance it would
+  // call `resize()` when inserting new entries later. This makes it easier for
+  // the deadlock to happen (>95% of the cases when directly iterating over
+  // `tc_globals.sampled_allocation_recorder()`).
+  live_objects.reserve(1);
+  live_objects.insert(::operator new(100));
+
+  // Invoked once per sample; the sample itself is ignored, only the
+  // allocation and the insertion matter.
+  const auto allocate_and_track = [&](const Profile::Sample&) {
+    live_objects.insert(::operator new(100));
+  };
+  constexpr int kRounds = 3;
+  for (int round = 0; round < kRounds; ++round) {
+    MallocExtension::SnapshotCurrent(ProfileType::kHeap)
+        .Iterate(allocate_and_track);
+  }
+
+  for (void* object : live_objects) {
+    ::operator delete(object);
+  }
+}
+
+}  // namespace
+}  // namespace tcmalloc
--- a/src/third_party/tcmalloc/dist/tcmalloc/hinted_tracker_lists.h
+++ b/src/third_party/tcmalloc/dist/tcmalloc/hinted_tracker_lists.h
@ -0,0 +1,126 @@
+// Copyright 2022 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef TCMALLOC_HINTED_TRACKER_LIST_H_
+#define TCMALLOC_HINTED_TRACKER_LIST_H_
+
+#include "tcmalloc/internal/linked_list.h"
+#include "tcmalloc/internal/range_tracker.h"
+
+GOOGLE_MALLOC_SECTION_BEGIN
+namespace tcmalloc {
+namespace tcmalloc_internal {
+
+// Bundles an array of N TrackerLists with a Bitmap that records which of the
+// lists are non-empty, plus a running count of all trackers stored.
+template <class TrackerType, size_t N>
+class HintedTrackerLists {
+ public:
+  using TrackerList = TList<TrackerType>;
+
+  constexpr HintedTrackerLists() : size_{} {}
+
+  // Detaches and returns the first tracker of the lowest-indexed non-empty
+  // list whose index is at least n.  Returns nullptr if there is none.
+  // REQUIRES: n < N.
+  TrackerType* GetLeast(const size_t n) {
+    ASSERT(n < N);
+    const size_t index = nonempty_.FindSet(n);
+    if (index == N) {
+      return nullptr;
+    }
+    TrackerList& list = lists_[index];
+    ASSERT(!list.empty());
+    TrackerType* const tracker = list.first();
+    // A true result from remove() is treated as "nothing left in this list".
+    const bool clear_hint = list.remove(tracker);
+    if (clear_hint) {
+      nonempty_.ClearBit(index);
+    }
+    --size_;
+    return tracker;
+  }
+
+  // Returns the first tracker of the lowest-indexed non-empty list whose index
+  // is at least n, or nullptr if there is none.
+  //
+  // Unlike GetLeast, the tracker stays in its list.
+  // REQUIRES: n < N.
+  TrackerType* PeekLeast(const size_t n) {
+    ASSERT(n < N);
+    const size_t index = nonempty_.FindSet(n);
+    if (index == N) {
+      return nullptr;
+    }
+    TrackerList& list = lists_[index];
+    ASSERT(!list.empty());
+    return list.first();
+  }
+
+  // Prepends pointer <pt> to lists_[i] and marks that list as non-empty.
+  // REQUIRES: i < N && pt != nullptr.
+  void Add(TrackerType* pt, const size_t i) {
+    ASSERT(i < N);
+    ASSERT(pt != nullptr);
+    lists_[i].prepend(pt);
+    ++size_;
+    nonempty_.SetBit(i);
+  }
+
+  // Removes pointer <pt> from lists_[i], clearing the list's non-empty bit
+  // when remove() returns true.
+  // REQUIRES: i < N && pt != nullptr.
+  void Remove(TrackerType* pt, const size_t i) {
+    ASSERT(i < N);
+    ASSERT(pt != nullptr);
+    const bool clear_hint = lists_[i].remove(pt);
+    if (clear_hint) {
+      nonempty_.ClearBit(i);
+    }
+    --size_;
+  }
+
+  // Read-only access to the list at index <n>.
+  // REQUIRES: n < N.
+  const TrackerList& operator[](const size_t n) const {
+    ASSERT(n < N);
+    return lists_[n];
+  }
+
+  // Total number of trackers held across all lists.
+  size_t size() const { return size_; }
+  bool empty() const { return size_ == 0; }
+
+  // Returns length of the list at an index <n>.
+  // REQUIRES: n < N.
+  size_t SizeOfList(const size_t n) const {
+    ASSERT(n < N);
+    return lists_[n].length();
+  }
+
+  // Runs a functor on all pointers in the TrackerLists whose list index is at
+  // least <start>, visiting only the lists marked non-empty.
+  // This method is const but the Functor gets passed a non-const pointer.
+  // This quirk is inherited from TrackerList.
+  template <typename Functor>
+  void Iter(const Functor& func, size_t start) const {
+    for (size_t index = nonempty_.FindSet(start); index < N;) {
+      const TrackerList& list = lists_[index];
+      ASSERT(!list.empty());
+      for (TrackerType* pt : list) {
+        func(pt);
+      }
+      // Step past this list; FindSet is only consulted while in range.
+      ++index;
+      if (index < N) {
+        index = nonempty_.FindSet(index);
+      }
+    }
+  }
+
+ private:
+  TrackerList lists_[N];
+  size_t size_;
+  Bitmap<N> nonempty_;
+};
+
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
+GOOGLE_MALLOC_SECTION_END
+
+#endif  // TCMALLOC_HINTED_TRACKER_LIST_H_
--- a/src/third_party/tcmalloc/dist/tcmalloc/huge_address_map.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/huge_address_map.cc
@ -0,0 +1,374 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tcmalloc/huge_address_map.h"
+
+#include <stdlib.h>
+
+#include <algorithm>
+#include <new>
+
+#include "absl/base/internal/cycleclock.h"
+#include "tcmalloc/internal/logging.h"
+
+GOOGLE_MALLOC_SECTION_BEGIN
+namespace tcmalloc {
+namespace tcmalloc_internal {
+
+const HugeAddressMap::Node* HugeAddressMap::Node::next() const {
+  // With a right subtree, the successor is that subtree's leftmost node.
+  if (right_ != nullptr) {
+    const Node* walk = right_;
+    while (walk->left_ != nullptr) walk = walk->left_;
+    return walk;
+  }
+
+  // Otherwise climb until we arrive at an ancestor from its left child; that
+  // ancestor is the successor. Running off the root means there is none.
+  const Node* child = this;
+  for (const Node* up = parent_; up != nullptr; up = up->parent_) {
+    if (up->left_ == child) return up;
+    child = up;
+  }
+
+  return nullptr;
+}
+
+HugeAddressMap::Node* HugeAddressMap::Node::next() {
+  // Reuse the const overload and strip constness from its result.
+  const Node* self = this;
+  return const_cast<Node*>(self->next());
+}
+
+// Recursively validates the subtree rooted at this node, failing via
+// CHECK_CONDITION on the first violated invariant.  For every node visited,
+// adds one to *num_nodes and the node's range length to *size.
+void HugeAddressMap::Node::Check(size_t* num_nodes, HugeLength* size) const {
+  // Recomputed maximum range length over this subtree; compared against the
+  // cached longest_ at the end.
+  HugeLength longest = range_.len();
+  *num_nodes += 1;
+  *size += range_.len();
+
+  if (left_) {
+    // tree: the left child starts before this node
+    CHECK_CONDITION(left_->range_.start() < range_.start());
+    // disjoint: strict inequality, so the child does not even touch this node
+    CHECK_CONDITION(left_->range_.end_addr() < range_.start_addr());
+    // well-formed: the child's parent link points back here
+    CHECK_CONDITION(left_->parent_ == this);
+    // heap: a child's priority never exceeds its parent's
+    CHECK_CONDITION(left_->prio_ <= prio_);
+    left_->Check(num_nodes, size);
+    if (left_->longest_ > longest) longest = left_->longest_;
+  }
+
+  if (right_) {
+    // tree: the right child starts after this node
+    CHECK_CONDITION(right_->range_.start() > range_.start());
+    // disjoint: strict inequality, so the child does not even touch this node
+    CHECK_CONDITION(right_->range_.start_addr() > range_.end_addr());
+    // well-formed: the child's parent link points back here
+    CHECK_CONDITION(right_->parent_ == this);
+    // heap: a child's priority never exceeds its parent's
+    CHECK_CONDITION(right_->prio_ <= prio_);
+    right_->Check(num_nodes, size);
+    if (right_->longest_ > longest) longest = right_->longest_;
+  }
+
+  // The cached augmentation must match what we just recomputed.
+  CHECK_CONDITION(longest_ == longest);
+}
+
+const HugeAddressMap::Node* HugeAddressMap::first() const {
+  // The first node is the leftmost one; an empty tree has none.
+  const Node* leftmost = root();
+  if (leftmost == nullptr) return nullptr;
+  while (leftmost->left_ != nullptr) {
+    leftmost = leftmost->left_;
+  }
+
+  return leftmost;
+}
+
+HugeAddressMap::Node* HugeAddressMap::first() {
+  // Reuse the const overload and strip constness from its result.
+  const HugeAddressMap* self = this;
+  return const_cast<Node*>(self->first());
+}
+
+// Expensive whole-map consistency check: validates every node recursively and
+// cross-checks the node count, total size and freelist bookkeeping against
+// the map's counters.
+void HugeAddressMap::Check() {
+  size_t nodes = 0;
+  HugeLength size = NHugePages(0);
+  if (root_) {
+    CHECK_CONDITION(root_->parent_ == nullptr);
+    root_->Check(&nodes, &size);
+  }
+  // Counts gathered by the traversal must agree with the running counters.
+  CHECK_CONDITION(nodes == nranges());
+  CHECK_CONDITION(size == total_mapped());
+  // Every node ever created is either in the tree or parked on the freelist.
+  CHECK_CONDITION(total_nodes_ == used_nodes_ + freelist_size_);
+}
+
+// Number of nodes currently in use (used_nodes_).
+size_t HugeAddressMap::nranges() const { return used_nodes_; }
+
+// Running total of the lengths held in the map (total_size_).
+HugeLength HugeAddressMap::total_mapped() const { return total_size_; }
+
+void HugeAddressMap::Print(Printer* out) const {
+  out->printf("HugeAddressMap: treap %zu / %zu nodes used / created\n",
+              used_nodes_, total_nodes_);
+  // The root's cached longest_ covers the whole tree; an empty map reports 0.
+  size_t longest = 0;
+  if (root_) longest = root_->longest_.raw_num();
+  out->printf("HugeAddressMap: %zu contiguous hugepages available\n", longest);
+}
+
+void HugeAddressMap::PrintInPbtxt(PbtxtRegion* hpaa) const {
+  hpaa->PrintI64("num_huge_address_map_treap_nodes_used", used_nodes_);
+  hpaa->PrintI64("num_huge_address_map_treap_nodes_created", total_nodes_);
+  // The root's cached longest_ covers the whole tree; an empty map reports 0.
+  size_t longest = 0;
+  if (root_) longest = root_->longest_.in_bytes();
+  hpaa->PrintI64("contiguous_free_bytes", longest);
+}
+
+HugeAddressMap::Node* HugeAddressMap::Predecessor(HugePage p) {
+  // Last node seen whose range lies before p; returned if no node contains p.
+  Node* candidate = nullptr;
+  for (Node* cursor = root(); cursor != nullptr;) {
+    HugeRange range = cursor->range_;
+    if (range.contains(p)) return cursor;
+    if (!(p < range.start())) {
+      // p lies past this node: remember it, then see whether the right
+      // subtree holds something closer.
+      candidate = cursor;
+      cursor = cursor->right_;
+    } else {
+      // p lies before this node: neither it nor its right subtree qualifies.
+      cursor = cursor->left_;
+    }
+  }
+
+  return candidate;
+}
+
+// Folds range r into its neighbouring node(s) instead of creating a new node.
+// b is the node joined on the low side of r and a the node joined on the high
+// side (Insert() passes its `before` / `after` nodes).  Either may be nullptr
+// to request a two-way merge, but not both: each two-way path dereferences the
+// other pointer.
+void HugeAddressMap::Merge(Node* b, HugeRange r, Node* a) {
+  // Length-weighted average of two timestamps; becomes the merged node's
+  // when_ value.
+  auto merge_when = [](HugeRange x, int64_t x_when, HugeRange y,
+                       int64_t y_when) {
+    // avoid overflow with floating-point
+    const size_t x_len = x.len().raw_num();
+    const size_t y_len = y.len().raw_num();
+    const double x_weight = static_cast<double>(x_len) * x_when;
+    const double y_weight = static_cast<double>(y_len) * y_when;
+    return static_cast<int64_t>((x_weight + y_weight) / (x_len + y_len));
+  };
+
+  // r itself is stamped with the current cycle count.
+  int64_t when = absl::base_internal::CycleClock::Now();
+  // Two way merges are easy.
+  if (a == nullptr) {
+    // Extend b upward over r.
+    b->when_ = merge_when(b->range_, b->when(), r, when);
+    b->range_ = Join(b->range_, r);
+    FixLongest(b);
+    return;
+  } else if (b == nullptr) {
+    // Extend a downward over r.
+    a->when_ = merge_when(r, when, a->range_, a->when());
+    a->range_ = Join(r, a->range_);
+    FixLongest(a);
+    return;
+  }
+
+  // Three way merge: slightly harder.  We must remove one node
+  // (arbitrarily picking next).
+  HugeRange partial = Join(r, a->range_);
+  int64_t partial_when = merge_when(r, when, a->range_, a->when());
+  HugeRange full = Join(b->range_, partial);
+  int64_t full_when = merge_when(b->range_, b->when(), partial, partial_when);
+  // Removing a will reduce total_size_ by that length, but since we're merging
+  // we actually don't change lengths at all; undo that.
+  total_size_ += a->range_.len();
+  Remove(a);
+  // b now stands for the whole joined range b + r + a.
+  b->range_ = full;
+  b->when_ = full_when;
+  FixLongest(b);
+}
+
+// Adds r to the map.  If the node before and/or after r satisfies
+// HugeRange::precedes with respect to r (presumably: is directly adjacent --
+// confirm in huge_pages.h), r is merged into it via Merge(); otherwise a new
+// node is obtained from Get() and linked into the treap at the depth dictated
+// by its random priority.  r must not intersect any range already in the map.
+void HugeAddressMap::Insert(HugeRange r) {
+  total_size_ += r.len();
+  // First, try to merge if necessary. Note there are three possibilities:
+  // we might need to merge before with r, r with after, or all three together.
+  Node* before = Predecessor(r.start());
+  CHECK_CONDITION(!before || !before->range_.intersects(r));
+  // With no predecessor, the candidate successor is the lowest node overall.
+  Node* after = before ? before->next() : first();
+  CHECK_CONDITION(!after || !after->range_.intersects(r));
+  if (before && before->range_.precedes(r)) {
+    if (after && r.precedes(after->range_)) {
+      Merge(before, r, after);
+    } else {
+      Merge(before, r, nullptr);
+    }
+    return;
+  } else if (after && r.precedes(after->range_)) {
+    Merge(nullptr, r, after);
+    return;
+  }
+  CHECK_CONDITION(!before || !before->range_.precedes(r));
+  CHECK_CONDITION(!after || !r.precedes(after->range_));
+  // No merging possible; just add a new node.
+  Node* n = Get(r);
+  Node* curr = root();
+  Node* parent = nullptr;
+  // Address of the child pointer (or root_) that will end up pointing at n.
+  Node** link = &root_;
+  // Walk down the tree to our correct location
+  while (curr != nullptr && curr->prio_ >= n->prio_) {
+    // Every node passed on the way down becomes an ancestor of n, so its
+    // cached longest must account for r.
+    curr->longest_ = std::max(curr->longest_, r.len());
+    parent = curr;
+    if (curr->range_.start() < r.start()) {
+      link = &curr->right_;
+      curr = curr->right_;
+    } else {
+      link = &curr->left_;
+      curr = curr->left_;
+    }
+  }
+  *link = n;
+  n->parent_ = parent;
+  n->left_ = n->right_ = nullptr;
+  n->longest_ = r.len();
+  // curr, if non-null, is the subtree displaced by n (its root has a lower
+  // priority than n).
+  if (curr) {
+    HugePage p = r.start();
+    // We need to split the treap at curr into n's children.
+    // This will be two treaps: one less than p, one greater, and has
+    // a nice recursive structure.
+    // less/more: the open slot at the bottom of each new subtree;
+    // lp/mp: the node owning that slot (initially n itself).
+    Node** less = &n->left_;
+    Node* lp = n;
+    Node** more = &n->right_;
+    Node* mp = n;
+    while (curr) {
+      if (curr->range_.start() < p) {
+        *less = curr;
+        curr->parent_ = lp;
+        less = &curr->right_;
+        lp = curr;
+        curr = curr->right_;
+      } else {
+        *more = curr;
+        curr->parent_ = mp;
+        more = &curr->left_;
+        mp = curr;
+        curr = curr->left_;
+      }
+    }
+    // Terminate both spines.
+    *more = *less = nullptr;
+    // We ripped apart the tree along these two paths--fix longest pointers.
+    FixLongest(lp);
+    FixLongest(mp);
+  }
+}
+
+void HugeAddressMap::Node::FixLongest() {
+  // Recompute the cached maximum from this node's own range and the cached
+  // values of whichever children exist; a missing child contributes zero.
+  HugeLength left_longest = NHugePages(0);
+  if (left_) left_longest = left_->longest_;
+  HugeLength right_longest = NHugePages(0);
+  if (right_) right_longest = right_->longest_;
+  longest_ = std::max({left_longest, right_longest, range_.len()});
+}
+
+void HugeAddressMap::FixLongest(HugeAddressMap::Node* n) {
+  // Refresh the cached longest on n and on each ancestor up to the root.
+  for (Node* cursor = n; cursor != nullptr; cursor = cursor->parent_) {
+    cursor->FixLongest();
+  }
+}
+
+// Unlinks n from the treap, merging its two child subtrees into the slot n
+// occupied, subtracts n's length from total_size_, and hands the node to
+// Put() for reuse.
+void HugeAddressMap::Remove(HugeAddressMap::Node* n) {
+  total_size_ -= n->range_.len();
+  // We need to merge the left and right children of n into one
+  // treap, then glue it into place wherever n was.
+  // link: address of the pointer (root_ or a child slot) that currently
+  // refers to n and will receive the merged subtree.
+  Node** link;
+  Node* parent = n->parent_;
+  Node* top = n->left_;
+  Node* bottom = n->right_;
+
+  const HugeLength child_longest =
+      std::max(top ? top->longest_ : NHugePages(0),
+               bottom ? bottom->longest_ : NHugePages(0));
+  if (!parent) {
+    link = &root_;
+  } else {
+    // Account for the removed child--might change longests.
+    // Easiest way: update this subtree to ignore the removed node,
+    // then fix the chain of parents.
+    n->longest_ = child_longest;
+    FixLongest(parent);
+    // Work out which of parent's child slots holds n by comparing start
+    // addresses.
+    if (parent->range_.start() > n->range_.start()) {
+      link = &parent->left_;
+    } else {
+      link = &parent->right_;
+    }
+  }
+
+  // A routine op we'll need a lot: given two (possibly null)
+  // children, put the root-ier one into top.
+  auto reorder_maybe = [](Node** top, Node** bottom) {
+    Node *b = *bottom, *t = *top;
+    if (b && (!t || t->prio_ < b->prio_)) {
+      *bottom = t;
+      *top = b;
+    }
+  };
+
+  reorder_maybe(&top, &bottom);
+  // if we have two treaps to merge (top is always non-null if bottom is)
+  // Invariant: top, bottom are two valid (longest included)
+  // treaps. parent (and all above/elsewhere) have the correct longest
+  // values, though parent does not have the correct children (will be the
+  // merged value of top and bottom.)
+  while (bottom) {
+    *link = top;
+    top->parent_ = parent;
+    // We're merging bottom into top, so top might contain a longer
+    // chunk than it thinks.
+    top->longest_ = std::max(top->longest_, bottom->longest_);
+    parent = top;
+    // Descend into the side of top where bottom belongs by address.
+    if (bottom->range_.start() < top->range_.start()) {
+      link = &top->left_;
+      top = top->left_;
+    } else {
+      link = &top->right_;
+      top = top->right_;
+    }
+    reorder_maybe(&top, &bottom);
+  }
+  // Attach whatever single subtree (possibly none) is left.
+  *link = top;
+  if (top) top->parent_ = parent;
+  Put(n);
+}
+
+void HugeAddressMap::Put(Node* n) {
+  // Push n on the front of the freelist, which is chained through left_.
+  n->left_ = freelist_;
+  freelist_ = n;
+  ++freelist_size_;
+  --used_nodes_;
+}
+
+HugeAddressMap::Node* HugeAddressMap::Get(HugeRange r) {
+  CHECK_CONDITION((freelist_ == nullptr) == (freelist_size_ == 0));
+  ++used_nodes_;
+  const int priority = rand_r(&seed_);
+
+  // Choose the storage for the node: pop a recycled one from the freelist
+  // (chained through left_) when available, otherwise request fresh memory
+  // from the metadata allocator.
+  Node* storage;
+  if (freelist_size_ != 0) {
+    --freelist_size_;
+    storage = freelist_;
+    freelist_ = storage->left_;
+  } else {
+    ++total_nodes_;
+    storage = reinterpret_cast<Node*>(meta_(sizeof(Node)));
+  }
+
+  // Construct the node in place in the chosen storage.
+  return new (storage) Node(r, priority);
+}
+
+// Builds a node for range r with treap priority prio; when_ is stamped with
+// the current CycleClock value.  The tree links and longest_ are not set
+// here: Insert() assigns them after obtaining the node from Get().
+HugeAddressMap::Node::Node(HugeRange r, int prio)
+    : range_(r), prio_(prio), when_(absl::base_internal::CycleClock::Now()) {}
+
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
+GOOGLE_MALLOC_SECTION_END
--- a/src/third_party/tcmalloc/dist/tcmalloc/huge_address_map.h
+++ b/src/third_party/tcmalloc/dist/tcmalloc/huge_address_map.h
@ -0,0 +1,147 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef TCMALLOC_HUGE_ADDRESS_MAP_H_
+#define TCMALLOC_HUGE_ADDRESS_MAP_H_
+#include <stddef.h>
+#include <stdint.h>
+
+#include "tcmalloc/huge_pages.h"
+#include "tcmalloc/internal/logging.h"
+
+GOOGLE_MALLOC_SECTION_BEGIN
+namespace tcmalloc {
+namespace tcmalloc_internal {
+
+// Maintains a set of disjoint HugeRanges, merging adjacent ranges into one.
+// Exposes a balanced (somehow) binary tree of free ranges on address,
+// augmented with the largest range in each subtree (this allows fairly simple
+// allocation algorithms from the contained ranges).
+//
+// This class scales well and is *reasonably* performant, but it is not intended
+// for use on extremely hot paths.
+class HugeAddressMap {
+ public:
+  // Callback used to obtain raw storage for tree nodes.
+  typedef void* (*MetadataAllocFunction)(size_t bytes);
+  explicit constexpr HugeAddressMap(MetadataAllocFunction meta);
+
+  // IMPORTANT: DESTROYING A HUGE ADDRESS MAP DOES NOT MAKE ANY ATTEMPT
+  // AT FREEING ALLOCATED METADATA.
+  ~HugeAddressMap() = default;
+
+  // One entry of the map: a range plus the treap links and cached
+  // per-subtree data.
+  class Node {
+   public:
+    // the range stored at this point
+    HugeRange range() const;
+    // Tree structure
+    Node* left();
+    Node* right();
+    // Iterate to the next node in address order
+    const Node* next() const;
+    Node* next();
+    // When was this node's content added (in
+    // absl::base_internal::CycleClock::Now units)?
+    int64_t when() const;
+
+    // What is the length of the longest range in the subtree rooted here?
+    HugeLength longest() const;
+
+   private:
+    // Only HugeAddressMap (a friend) constructs nodes.
+    Node(HugeRange r, int prio);
+    friend class HugeAddressMap;
+    HugeRange range_;
+    int prio_;  // chosen randomly
+    Node *left_, *right_;
+    Node* parent_;
+    // Cached length of the longest range in this subtree.
+    HugeLength longest_;
+    int64_t when_;
+    // Expensive, recursive consistency check.
+    // Accumulates node count and range sizes into passed arguments.
+    void Check(size_t* num_nodes, HugeLength* size) const;
+
+    // We've broken longest invariants somehow; fix them here.
+    void FixLongest();
+  };
+
+  // Get root of the tree.
+  Node* root();
+  const Node* root() const;
+
+  // Get lowest-addressed node
+  const Node* first() const;
+  Node* first();
+
+  // Returns the highest-addressed range that does not lie completely
+  // after p (if any).
+  Node* Predecessor(HugePage p);
+
+  // Expensive consistency check.
+  void Check();
+
+  // Statistics
+  size_t nranges() const;
+  HugeLength total_mapped() const;
+  void Print(Printer* out) const;
+  void PrintInPbtxt(PbtxtRegion* hpaa) const;
+
+  // Add <r> to the map, merging with adjacent ranges as needed.
+  void Insert(HugeRange r);
+
+  // Delete n from the map.
+  void Remove(Node* n);
+
+ private:
+  // our tree
+  Node* root_{nullptr};
+  size_t used_nodes_{0};
+  HugeLength total_size_{NHugePages(0)};
+
+  // cache of unused nodes
+  Node* freelist_{nullptr};
+  size_t freelist_size_{0};
+  // How we get more
+  MetadataAllocFunction meta_;
+  // Get: produce a node for r, reusing a cached one when available.
+  // Put: return n to the cache of unused nodes.
+  Node* Get(HugeRange r);
+  void Put(Node* n);
+
+  // Count of nodes ever obtained from meta_ (in use + cached).
+  size_t total_nodes_{0};
+
+  // Merge: fold r into neighbouring node(s) b (before) and/or a (after).
+  // FixLongest: recompute cached longest values from n up to the root.
+  void Merge(Node* b, HugeRange r, Node* a);
+  void FixLongest(Node* n);
+  // Note that we always use the same seed, currently; this isn't very random.
+  // In practice we're not worried about adversarial input and this works well
+  // enough.
+  unsigned int seed_{0};
+};
+
+// Stores the metadata allocation callback; every other member keeps its
+// in-class default (empty tree, empty freelist, zero counters).
+inline constexpr HugeAddressMap::HugeAddressMap(MetadataAllocFunction meta)
+    : meta_(meta) {}
+
+// Trivial accessors: each returns the correspondingly named member.
+inline HugeRange HugeAddressMap::Node::range() const { return range_; }
+inline HugeAddressMap::Node* HugeAddressMap::Node::left() { return left_; }
+inline HugeAddressMap::Node* HugeAddressMap::Node::right() { return right_; }
+
+inline int64_t HugeAddressMap::Node::when() const { return when_; }
+inline HugeLength HugeAddressMap::Node::longest() const { return longest_; }
+
+// root() is nullptr while the map is empty (root_ defaults to nullptr).
+inline HugeAddressMap::Node* HugeAddressMap::root() { return root_; }
+inline const HugeAddressMap::Node* HugeAddressMap::root() const {
+  return root_;
+}
+
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
+GOOGLE_MALLOC_SECTION_END
+
+#endif  // TCMALLOC_HUGE_ADDRESS_MAP_H_
--- a/src/third_party/tcmalloc/dist/tcmalloc/huge_address_map_test.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/huge_address_map_test.cc
@ -0,0 +1,86 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tcmalloc/huge_address_map.h"
+
+#include <stddef.h>
+#include <stdlib.h>
+
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace tcmalloc {
+namespace tcmalloc_internal {
+namespace {
+
+// Fixture owning a HugeAddressMap whose node storage comes from malloc; every
+// allocation is logged so it can be freed when the fixture is destroyed.
+class HugeAddressMapTest : public ::testing::Test {
+ protected:
+  // Starts each test with an empty allocation log.
+  HugeAddressMapTest() : map_(MallocMetadata) { metadata_allocs_.clear(); }
+
+  // Frees every block handed out by MallocMetadata.
+  ~HugeAddressMapTest() override {
+    for (void* block : metadata_allocs_) free(block);
+  }
+
+  // Snapshot of the map's ranges, gathered by walking first()/next().
+  std::vector<HugeRange> Contents() {
+    std::vector<HugeRange> ranges;
+    for (auto node = map_.first(); node; node = node->next()) {
+      ranges.push_back(node->range());
+    }
+
+    return ranges;
+  }
+
+  // Shorthands for building hugepage indices and lengths.
+  HugePage hp(size_t i) { return {i}; }
+  HugeLength hl(size_t i) { return NHugePages(i); }
+
+  HugeAddressMap map_;
+
+ private:
+  // Metadata allocator passed to the map: malloc plus bookkeeping.
+  static void* MallocMetadata(size_t size) {
+    void* block = malloc(size);
+    metadata_allocs_.push_back(block);
+    return block;
+  }
+
+  static std::vector<void*> metadata_allocs_;
+};
+
+std::vector<void*> HugeAddressMapTest::metadata_allocs_;
+
+// This test verifies that HugeAddressMap merges properly.
+TEST_F(HugeAddressMapTest, Merging) {
+  // Three consecutive single-hugepage ranges at indices 0, 1 and 2.
+  const HugeRange r1 = HugeRange::Make(hp(0), hl(1));
+  const HugeRange r2 = HugeRange::Make(hp(1), hl(1));
+  const HugeRange r3 = HugeRange::Make(hp(2), hl(1));
+  const HugeRange all = Join(r1, Join(r2, r3));
+  map_.Insert(r1);
+  map_.Check();
+  EXPECT_THAT(Contents(), testing::ElementsAre(r1));
+  // r3 is separated from r1 by a gap, so it stays a separate entry.
+  map_.Insert(r3);
+  map_.Check();
+  EXPECT_THAT(Contents(), testing::ElementsAre(r1, r3));
+  // r2 fills the gap: all three collapse into a single range.
+  map_.Insert(r2);
+  map_.Check();
+  EXPECT_THAT(Contents(), testing::ElementsAre(all));
+}
+
+}  // namespace
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
--- a/src/third_party/tcmalloc/dist/tcmalloc/huge_allocator.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/huge_allocator.cc
@ -0,0 +1,174 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tcmalloc/huge_allocator.h"
+
+#include <string.h>
+
+#include "tcmalloc/huge_address_map.h"
+#include "tcmalloc/internal/logging.h"
+
+GOOGLE_MALLOC_SECTION_BEGIN
+namespace tcmalloc {
+namespace tcmalloc_internal {
+
+void HugeAllocator::Print(Printer* out) {
+  out->printf("HugeAllocator: contiguous, unbacked hugepage(s)\n");
+  free_.Print(out);
+  // Free hugepages are whatever was requested from the system and is not
+  // currently handed out.
+  const HugeLength unused = from_system_ - in_use_;
+  out->printf(
+      "HugeAllocator: %zu requested - %zu in use = %zu hugepages free\n",
+      from_system_.raw_num(), in_use_.raw_num(), unused.raw_num());
+}
+
+void HugeAllocator::PrintInPbtxt(PbtxtRegion* hpaa) const {
+  // Free-map statistics first, then this allocator's own two counters.
+  free_.PrintInPbtxt(hpaa);
+  const auto requested = from_system_.raw_num();
+  hpaa->PrintI64("num_total_requested_huge_pages", requested);
+  const auto used = in_use_.raw_num();
+  hpaa->PrintI64("num_in_use_huge_pages", used);
+}
+
+// Searches the free map for a node whose range holds at least n hugepages.
+// Follows a single root-to-leaf path (it does not search the whole tree) and
+// returns the shortest adequate node seen on that path, or nullptr if the
+// tree has no range of length >= n.
+HugeAddressMap::Node* HugeAllocator::Find(HugeLength n) {
+  HugeAddressMap::Node* curr = free_.root();
+  // Loop condition below: curr != nullptr && curr->longest() >= n, i.e. the
+  // subtree at curr still contains an adequate range.
+  // we favor smaller gaps and lower nodes and lower addresses, in that
+  // order. The net effect is that we are neither a best-fit nor a
+  // lowest-address allocator but vaguely close to both.
+  HugeAddressMap::Node* best = nullptr;
+  while (curr && curr->longest() >= n) {
+    if (curr->range().len() >= n) {
+      // Strict '>' keeps the earlier (shallower) node when lengths tie.
+      if (!best || best->range().len() > curr->range().len()) {
+        best = curr;
+      }
+    }
+
+    // Either subtree could contain a better fit and we don't want to
+    // search the whole tree. Pick a reasonable child to look at.
+    auto left = curr->left();
+    auto right = curr->right();
+    // Left subtree can't satisfy n: the right is the only option (if right is
+    // also inadequate or null, the loop condition ends the search).
+    if (!left || left->longest() < n) {
+      curr = right;
+      continue;
+    }
+
+    // Left is adequate and right is not: go left.
+    if (!right || right->longest() < n) {
+      curr = left;
+      continue;
+    }
+
+    // Here, we have a nontrivial choice.
+    if (left->range().len() == right->range().len()) {
+      // The child roots are equally long: prefer the subtree whose longest
+      // range is smaller (ties go left).
+      if (left->longest() <= right->longest()) {
+        curr = left;
+      } else {
+        curr = right;
+      }
+    } else if (left->range().len() < right->range().len()) {
+      // The child roots differ in length: look in the subtree with the
+      // smaller root, as that's slightly more likely to be our best.
+      curr = left;
+    } else {
+      curr = right;
+    }
+  }
+  return best;
+}
+
+// Expensive consistency check: validates the free map's structure, then
+// cross-checks its node count and total size against this allocator's
+// counters and against the span statistics reported by AddSpanStats().
+void HugeAllocator::CheckFreelist() {
+  // One structural validation is enough: nranges(), total_mapped() and
+  // AddSpanStats() are all const, so nothing below mutates free_.
+  free_.Check();
+  size_t num_nodes = free_.nranges();
+  HugeLength n = free_.total_mapped();
+  // Everything obtained from the system and not handed out must be free.
+  CHECK_CONDITION(n == from_system_ - in_use_);
+  LargeSpanStats large;
+  AddSpanStats(nullptr, &large, nullptr);
+  CHECK_CONDITION(num_nodes == large.spans);
+  CHECK_CONDITION(n.in_pages() == large.returned_pages);
+}
+
+// Requests at least n hugepages from the system allocator callback and
+// records what was actually received in from_system_.  Returns
+// HugeRange::Nil() if n overflows or the callback returns a null pointer;
+// otherwise returns the range covering everything the callback handed back,
+// which may be longer than n.
+HugeRange HugeAllocator::AllocateRange(HugeLength n) {
+  if (n.overflows()) return HugeRange::Nil();
+  size_t bytes = n.in_bytes();
+  size_t align = kHugePageSize;
+  auto [ptr, actual] = allocate_(bytes, align);
+  if (ptr == nullptr) {
+    // OOM...
+    return HugeRange::Nil();
+  }
+  // ptr is known to be non-null from here on, so no further check is needed.
+  // It's possible for a request to return extra hugepages.
+  CHECK_CONDITION(actual % kHugePageSize == 0);
+  n = HLFromBytes(actual);
+  from_system_ += n;
+  return HugeRange::Make(HugePageContaining(ptr), n);
+}
+
+// Hands out a range of exactly n hugepages, taken from the free map when an
+// adequate range exists there and from the system otherwise.  Returns an
+// invalid range if the system allocation fails.
+// REQUIRES: n > NHugePages(0).
+HugeRange HugeAllocator::Get(HugeLength n) {
+  CHECK_CONDITION(n > NHugePages(0));
+  auto* node = Find(n);
+  if (!node) {
+    // Get more memory, then "delete" it
+    HugeRange r = AllocateRange(n);
+    if (!r.valid()) return r;
+    // Release() subtracts r.len() from in_use_; pre-add it so the net effect
+    // of routing the fresh range through the free map is zero.
+    in_use_ += r.len();
+    Release(r);
+    node = Find(n);
+    CHECK_CONDITION(node != nullptr);
+  }
+  in_use_ += n;
+
+  HugeRange r = node->range();
+  free_.Remove(node);
+  if (r.len() > n) {
+    // The chosen range is larger than needed: keep the first n hugepages and
+    // give the tail back to the free map.
+    HugeLength before = r.len();
+    HugeRange extra = HugeRange::Make(r.start() + n, before - n);
+    r = HugeRange::Make(r.start(), n);
+    ASSERT(r.precedes(extra));
+    ASSERT(r.len() + extra.len() == before);
+    // Same pre-add trick as above to cancel Release()'s subtraction.
+    in_use_ += extra.len();
+    Release(extra);
+  } else {
+    // Release does this for us
+    DebugCheckFreelist();
+  }
+
+  return r;
+}
+
+// Returns r to the free map: in_use_ shrinks by r.len() and r is inserted
+// into free_, which merges it with neighbouring free ranges as needed.
+// Finishes with the debug-only freelist consistency check.
+void HugeAllocator::Release(HugeRange r) {
+  in_use_ -= r.len();
+
+  free_.Insert(r);
+  DebugCheckFreelist();
+}
+
+void HugeAllocator::AddSpanStats(SmallSpanStats* small, LargeSpanStats* large,
+                                 PageAgeHistograms* ages) const {
+  // Walk the free ranges in address order.  Each one counts as a single large
+  // span made of returned pages; <small> is never touched.
+  const HugeAddressMap::Node* node = free_.first();
+  while (node != nullptr) {
+    HugeLength len = node->range().len();
+    if (large != nullptr) {
+      large->spans++;
+      large->returned_pages += len.in_pages();
+    }
+
+    if (ages != nullptr) {
+      ages->RecordRange(len.in_pages(), true, node->when());
+    }
+    node = node->next();
+  }
+}
+
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
+GOOGLE_MALLOC_SECTION_END
--- a/src/third_party/tcmalloc/dist/tcmalloc/huge_allocator.h
+++ b/src/third_party/tcmalloc/dist/tcmalloc/huge_allocator.h
@ -0,0 +1,108 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Tracking information for the available range of hugepages,
+// and a basic allocator for unmapped hugepages.
+#ifndef TCMALLOC_HUGE_ALLOCATOR_H_
+#define TCMALLOC_HUGE_ALLOCATOR_H_
+
+#include <stddef.h>
+
+#include "tcmalloc/common.h"
+#include "tcmalloc/huge_address_map.h"
+#include "tcmalloc/huge_pages.h"
+#include "tcmalloc/stats.h"
+#include "tcmalloc/system-alloc.h"
+
+GOOGLE_MALLOC_SECTION_BEGIN
+namespace tcmalloc {
+namespace tcmalloc_internal {
+
+// these typedefs allow replacement of tcmalloc::System* for tests.
+// Backing-memory callback: takes a byte count and an alignment and returns an
+// AddressRange (HugeAllocator::AllocateRange unpacks it as a pointer plus the
+// number of bytes actually obtained).
+using MemoryAllocFunction = AddressRange (*)(size_t bytes, size_t align);
+// Metadata callback: returns raw storage of the requested size; forwarded to
+// the HugeAddressMap that tracks free ranges.
+using MetadataAllocFunction = void* (*)(size_t bytes);
+
+// This tracks available ranges of hugepages and fulfills requests for
+// usable memory, allocating more from the system as needed.  All
+// hugepages are treated as (and assumed to be) unbacked.
+class HugeAllocator {
+ public:
+  // <allocate> supplies backing address space; <meta_allocate> supplies
+  // storage for the free map's nodes.
+  constexpr HugeAllocator(MemoryAllocFunction allocate,
+                          MetadataAllocFunction meta_allocate)
+      : free_(meta_allocate), allocate_(allocate) {}
+
+  // Obtain a range of n unbacked hugepages, distinct from all other
+  // calls to Get (other than those that have been Released.)
+  HugeRange Get(HugeLength n);
+
+  // Returns a range of hugepages for reuse by subsequent Gets().
+  // REQUIRES: <r> is the return value (or a subrange thereof) of a previous
+  // call to Get(); neither <r> nor any overlapping range has been released
+  // since that Get().
+  void Release(HugeRange r);
+
+  // Total memory requested from the system, whether in use or not.
+  HugeLength system() const { return from_system_; }
+  // Unused memory in the allocator.
+  HugeLength size() const { return from_system_ - in_use_; }
+
+  // Accumulates one large span of returned pages per free range into <large>
+  // and records each range's age into <ages>; null arguments are skipped.
+  void AddSpanStats(SmallSpanStats* small, LargeSpanStats* large,
+                    PageAgeHistograms* ages) const;
+
+  // Byte-level summary: everything free in this allocator is reported as
+  // unmapped, and free_bytes is always zero.
+  BackingStats stats() const {
+    BackingStats s;
+    s.system_bytes = system().in_bytes();
+    s.free_bytes = 0;
+    s.unmapped_bytes = size().in_bytes();
+    return s;
+  }
+
+  void Print(Printer* out);
+  void PrintInPbtxt(PbtxtRegion* hpaa) const;
+
+ private:
+  // We're constrained in several ways by existing code.  Hard requirements:
+  // * no radix tree or similar O(address space) external space tracking
+  // * support sub releasing
+  // * low metadata overhead
+  // * no pre-allocation.
+  // * reasonable space overhead
+  //
+  // We use a treap ordered on addresses to track.  This isn't the most
+  // efficient thing ever but we're about to hit 100usec+/hugepage
+  // backing costs if we've gotten this far; the last few bits of performance
+  // don't matter, and most of the simple ideas can't hit all of the above
+  // requirements.
+  HugeAddressMap free_;
+  // Picks a free node with at least n hugepages, or nullptr if none exists.
+  HugeAddressMap::Node* Find(HugeLength n);
+
+  // CheckFreelist() is the expensive consistency check; DebugCheckFreelist()
+  // runs it only when NDEBUG is not defined.
+  void CheckFreelist();
+  void DebugCheckFreelist() {
+#ifndef NDEBUG
+    CheckFreelist();
+#endif
+  }
+
+  // Hugepages obtained from allocate_ so far, and the portion currently
+  // handed out through Get().
+  HugeLength from_system_{NHugePages(0)};
+  HugeLength in_use_{NHugePages(0)};
+
+  MemoryAllocFunction allocate_;
+  // Requests at least n fresh hugepages from allocate_.
+  HugeRange AllocateRange(HugeLength n);
+};
+
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
+GOOGLE_MALLOC_SECTION_END
+
+#endif  // TCMALLOC_HUGE_ALLOCATOR_H_
--- a/src/third_party/tcmalloc/dist/tcmalloc/huge_allocator_test.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/huge_allocator_test.cc
@ -0,0 +1,448 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tcmalloc/huge_allocator.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "absl/base/internal/cycleclock.h"
+#include "absl/random/random.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "tcmalloc/huge_pages.h"
+#include "tcmalloc/internal/config.h"
+#include "tcmalloc/internal/logging.h"
+
+namespace tcmalloc {
+namespace tcmalloc_internal {
+namespace {
+
+// Parameterized fixture for HugeAllocator.  The bool parameter is copied into
+// should_overallocate_, which makes the fake system allocator hand back one
+// more hugepage than requested.  Bookkeeping lives in static members, which
+// the static allocator callbacks (AllocateFake, MallocMetadata) update.
+class HugeAllocatorTest : public testing::TestWithParam<bool> {
+ private:
+  // Use a tiny fraction of actual size so we can test aggressively.
+  static AddressRange AllocateFake(size_t bytes, size_t align);
+
+  // Upper bound on the number of words (one per fake hugepage) in backing_.
+  static constexpr size_t kMaxBacking = 1024 * 1024;
+  // This isn't super good form but we'll never have more than one test
+  // fixture extant at once.
+  static std::vector<size_t> backing_;
+
+  // We use actual malloc for metadata allocations, but we track them so they
+  // can be deleted.
+  static void* MallocMetadata(size_t size);
+  static std::vector<void*> metadata_allocs_;
+  static size_t metadata_bytes_;
+  static bool should_overallocate_;
+  // Hugepages asked of AllocateFake vs. hugepages it actually handed back
+  // (the latter is larger when overallocating).
+  static HugeLength huge_pages_requested_;
+  static HugeLength huge_pages_received_;
+
+ protected:
+  HugeLength HugePagesRequested() { return huge_pages_requested_; }
+  HugeLength HugePagesReceived() { return huge_pages_received_; }
+
+  // Resets the static bookkeeping for the test that is about to run.
+  HugeAllocatorTest() {
+    should_overallocate_ = GetParam();
+    huge_pages_requested_ = NHugePages(0);
+    huge_pages_received_ = NHugePages(0);
+    // We don't use the first few hugepage slots, because things might get
+    // weird given zero pointers: AllocateFake starts handing out addresses at
+    // index backing_.size(), so reserving 1024 words keeps them non-null.
+    backing_.resize(1024);
+    metadata_bytes_ = 0;
+  }
+
+  // Frees every tracked metadata allocation and drops the fake backing.
+  ~HugeAllocatorTest() override {
+    for (void* p : metadata_allocs_) {
+      free(p);
+    }
+    metadata_allocs_.clear();
+    backing_.clear();
+  }
+
+  // Returns the backing word that stands in for hugepage p.
+  size_t* GetActual(HugePage p) { return &backing_[p.index()]; }
+
+  // We're dealing with a lot of (fake) memory, so we don't memset it and
+  // check every byte for corruption.  Instead each hugepage is represented
+  // by a single word in backing_: MarkPages() stamps it with a label and
+  // CheckPages() verifies the label is still there.
+  void CheckPages(HugeRange r, size_t c) {
+    for (HugePage p = r.first; p < r.first + r.n; ++p) {
+      EXPECT_EQ(c, *GetActual(p));
+    }
+  }
+
+  void MarkPages(HugeRange r, size_t c) {
+    for (HugePage p = r.first; p < r.first + r.n; ++p) {
+      *GetActual(p) = c;
+    }
+  }
+
+  // Checks that the allocator's system() matches what the fake system
+  // allocator handed out, and that system() minus size() equals expected_use.
+  void CheckStats(HugeLength expected_use) {
+    const HugeLength received = HugePagesReceived();
+    EXPECT_EQ(received, allocator_.system());
+    HugeLength used = received - allocator_.size();
+    EXPECT_EQ(used, expected_use);
+  }
+
+  // Allocator under test, wired to the fake system and metadata allocators.
+  HugeAllocator allocator_{AllocateFake, MallocMetadata};
+};
+
+// Use a tiny fraction of actual size so we can test aggressively.
+AddressRange HugeAllocatorTest::AllocateFake(size_t bytes, size_t align) {
+  // Fake system allocator.  Hands out made-up addresses (index *
+  // kHugePageSize) and backs each hugepage with one word of backing_ rather
+  // than real memory.  Both bytes and align must be multiples of
+  // kHugePageSize.  Returns {nullptr, 0} once more than kMaxBacking words
+  // would be needed.
+  CHECK_CONDITION(bytes % kHugePageSize == 0);
+  CHECK_CONDITION(align % kHugePageSize == 0);
+  HugeLength req = HLFromBytes(bytes);
+  huge_pages_requested_ += req;
+  // Test the case where our sys allocator provides too much.
+  if (should_overallocate_) ++req;
+  huge_pages_received_ += req;
+  // we'll actually provide hidden backing, one word per hugepage.
+  bytes = req / NHugePages(1);
+  align /= kHugePageSize;
+  size_t index = backing_.size();
+  // Round index up to the next multiple of align.  The remainder has to be
+  // computed with %, not &: (index & align) only tests one bit and is not the
+  // distance to the next aligned slot.  An alignment of 0 or 1 needs no
+  // adjustment (and 0 must not reach the % below).
+  const size_t misalignment = align > 1 ? index % align : 0;
+  if (misalignment != 0) {
+    index += align - misalignment;
+  }
+  if (index + bytes > kMaxBacking) return {nullptr, 0};
+  backing_.resize(index + bytes);
+  void* ptr = reinterpret_cast<void*>(index * kHugePageSize);
+  return {ptr, req.in_bytes()};
+}
+
+// We use actual malloc for metadata allocations, but we track them so they
+// can be deleted.
+void* HugeAllocatorTest::MallocMetadata(size_t size) {
+  // Allocate with plain malloc, tally the requested bytes, and remember the
+  // block in metadata_allocs_ so it can be freed later.
+  void* const block = malloc(size);
+  metadata_bytes_ += size;
+  metadata_allocs_.push_back(block);
+  return block;
+}
+
+// Out-of-class definitions of the fixture's static data members.
+std::vector<size_t> HugeAllocatorTest::backing_;
+std::vector<void*> HugeAllocatorTest::metadata_allocs_;
+size_t HugeAllocatorTest::metadata_bytes_;
+bool HugeAllocatorTest::should_overallocate_;
+HugeLength HugeAllocatorTest::huge_pages_requested_;
+HugeLength HugeAllocatorTest::huge_pages_received_;
+
+// Randomized churn test: fill kSize slots with log-uniformly sized ranges,
+// then repeatedly release a random slot and replace it with a fresh range,
+// verifying page labels and allocator stats after every step.
+TEST_P(HugeAllocatorTest, Basic) {
+  std::vector<std::pair<HugeRange, size_t>> allocs;
+  absl::BitGen rng;
+  size_t label = 0;
+  HugeLength total = NHugePages(0);
+  static const size_t kSize = 1000;
+  // NOTE(review): peak is updated below but never asserted on.
+  HugeLength peak = total;
+  // Phase 1: populate every slot, stamping each range with a unique label.
+  for (int i = 0; i < kSize; ++i) {
+    HugeLength len =
+        NHugePages(absl::LogUniform<int32_t>(rng, 0, (1 << 12) - 1) + 1);
+    auto r = allocator_.Get(len);
+    ASSERT_TRUE(r.valid());
+    total += len;
+    peak = std::max(peak, total);
+    CheckStats(total);
+    MarkPages(r, label);
+    allocs.push_back({r, label});
+    label++;
+  }
+
+  // Phase 2: swap a random victim into the last slot, check its label is
+  // intact, release it, and refill the last slot with a new range.
+  for (int i = 0; i < 1000 * 25; ++i) {
+    size_t index = absl::Uniform<int32_t>(rng, 0, kSize);
+    std::swap(allocs[index], allocs[kSize - 1]);
+    auto p = allocs[kSize - 1];
+    CheckPages(p.first, p.second);
+    total -= p.first.len();
+    allocator_.Release(p.first);
+    CheckStats(total);
+
+    HugeLength len =
+        NHugePages(absl::LogUniform<int32_t>(rng, 0, (1 << 12) - 1) + 1);
+    auto r = allocator_.Get(len);
+    ASSERT_TRUE(r.valid());
+    ASSERT_EQ(r.len(), len);
+    total += len;
+    peak = std::max(peak, total);
+    CheckStats(total);
+    MarkPages(r, label);
+    allocs[kSize - 1] = {r, label};
+    label++;
+  }
+  // Phase 3: everything still outstanding must be uncorrupted; release it.
+  for (auto p : allocs) {
+    CheckPages(p.first, p.second);
+    allocator_.Release(p.first);
+  }
+}
+
+// Check that releasing small chunks of allocations works OK.
+TEST_P(HugeAllocatorTest, Subrelease) {
+  size_t label = 1;
+  const HugeLength kLen = NHugePages(8);
+  // Each round performs kLen - 1 allocations of kLen hugepages.  The
+  // expectation below is checked on every round, so rounds after the first
+  // must be served without asking the system for more.
+  const HugeLength kTotal = kLen * (kLen / NHugePages(1) - 1);
+  for (int i = 0; i < 100; ++i) {
+    std::vector<std::pair<HugeRange, size_t>> allocs;
+    // get allocs of kLen and release different sized sub-chunks of them -
+    // make sure that doesn't break anything else.
+    for (HugeLength j = NHugePages(1); j < kLen; ++j) {
+      auto r = allocator_.Get(kLen);
+      ASSERT_TRUE(r.valid());
+      MarkPages(r, label);
+      // Release the first j hugepages; keep (and later verify) the tail.
+      allocator_.Release({r.start(), j});
+      allocs.push_back({{r.start() + j, kLen - j}, label});
+      label++;
+    }
+    EXPECT_EQ(kTotal, HugePagesRequested());
+    for (auto p : allocs) {
+      CheckPages(p.first, p.second);
+      allocator_.Release(p.first);
+    }
+  }
+}
+
+// Does subreleasing work OK for absurdly large allocations?
+TEST_P(HugeAllocatorTest, SubreleaseLarge) {
+  absl::BitGen rng;
+  std::vector<std::pair<HugeRange, size_t>> allocs;
+  size_t label = 1;
+  // 1 TiB expressed as a hugepage count; request sizes double from 2
+  // hugepages up to (but not including) this limit.
+  const HugeLength kLimit = HLFromBytes(1024ul * 1024 * 1024 * 1024);
+  for (HugeLength n = NHugePages(2); n < kLimit; n *= 2) {
+    auto r = allocator_.Get(n);
+    ASSERT_TRUE(r.valid());
+    MarkPages(r, label);
+    // chunk of less than half
+    HugeLength chunk =
+        NHugePages(absl::Uniform<int32_t>(rng, 0, n / NHugePages(2)) + 1);
+    // Release the leading chunk; keep the remainder for verification.
+    allocator_.Release({r.start(), chunk});
+    allocs.push_back({{r.start() + chunk, n - chunk}, label});
+    label++;
+  }
+  // reuse the released space: keep making small requests until one of them
+  // forces a new request to the fake system allocator.
+  const HugeLength total = HugePagesRequested();
+  while (total == HugePagesRequested()) {
+    HugeLength n =
+        NHugePages(absl::LogUniform<int32_t>(rng, 0, (1 << 8) - 1) + 1);
+    auto r = allocator_.Get(n);
+    ASSERT_TRUE(r.valid());
+    MarkPages(r, label);
+    allocs.push_back({r, label});
+    label++;
+  }
+  // No retained range may have been overwritten by a later allocation.
+  for (auto p : allocs) {
+    CheckPages(p.first, p.second);
+    allocator_.Release(p.first);
+  }
+}
+
+// We don't care *that* much about vaddress space, but let's not be crazy.
+// Don't fill tiny requests from big spaces.
+TEST_P(HugeAllocatorTest, Fragmentation) {
+  // Prime the pump with some random allocations.
+  absl::BitGen rng;
+
+  std::vector<HugeRange> free;
+  constexpr int kSlots = 50;
+
+  // Plan to insert a large allocation at the big_slot'th index, then free it
+  // during the initial priming step (so the free list holds a contiguous
+  // region of at least big hugepages).
+  HugeLength big = NHugePages(8);
+  const int big_slot = absl::Uniform(rng, 0, kSlots);
+
+  for (int i = 0; i < kSlots; ++i) {
+    if (i == big_slot) {
+      auto r = allocator_.Get(big);
+      ASSERT_TRUE(r.valid());
+      free.push_back(r);
+    }
+
+    // Each single-hugepage allocation is freed again with probability 1/2;
+    // the rest stay allocated and fragment the address space.
+    auto r = allocator_.Get(NHugePages(1));
+    ASSERT_TRUE(r.valid());
+    if (absl::Bernoulli(rng, 1.0 / 2)) {
+      free.push_back(r);
+    }
+  }
+  // Number of single-hugepage holes about to be freed: everything in free
+  // except the one big range.
+  size_t slots = free.size() - 1;
+  for (auto r : free) {
+    allocator_.Release(r);
+  }
+  free.clear();
+  static const size_t kReps = 5;
+  for (int i = 0; i < kReps; ++i) {
+    SCOPED_TRACE(i);
+
+    // Ensure we have a range of this size.
+    HugeRange r = allocator_.Get(big);
+    ASSERT_TRUE(r.valid());
+    if (NHugePages(slots) > allocator_.size()) {
+      // Fewer than slots hugepages remain free after allocating big, so
+      // grow the free space: allocate slots single hugepages and release
+      // them straight away.
+      for (int i = 0; i < slots; ++i) {
+        HugeRange f = allocator_.Get(NHugePages(1));
+        ASSERT_TRUE(f.valid());
+        free.push_back(f);
+      }
+      for (auto f : free) {
+        allocator_.Release(f);
+      }
+      free.clear();
+    }
+    allocator_.Release(r);
+    // We should definitely have at least this many small spaces...
+    for (int i = 0; i < slots; ++i) {
+      r = allocator_.Get(NHugePages(1));
+      ASSERT_TRUE(r.valid());
+      free.push_back(r);
+    }
+    // that don't interfere with the available big space: fetching big again
+    // must not require any new memory from the system.
+    auto before = allocator_.system();
+    r = allocator_.Get(big);
+    ASSERT_TRUE(r.valid());
+    EXPECT_EQ(before, allocator_.system());
+    allocator_.Release(r);
+    for (auto r : free) {
+      allocator_.Release(r);
+    }
+    free.clear();
+    // Next round: the old big region counts as small slots and big doubles.
+    slots += big.raw_num();
+    big += big;
+  }
+}
+
+// Check that we only request as much as we actually need from the system.
+TEST_P(HugeAllocatorTest, Frugal) {
+  // Request ranges of 1, 2, ..., kSize - 1 hugepages without ever releasing
+  // them.  After each request the running sum must match both the
+  // allocator's stats and the amount asked of the fake system allocator.
+  static const size_t kSize = 1000;
+  HugeLength expected = NHugePages(0);
+  for (int pages = 1; pages < kSize; ++pages) {
+    const HugeLength want = NHugePages(pages);
+    // The returned range is deliberately dropped; only validity matters.
+    ASSERT_TRUE(allocator_.Get(want).valid());
+
+    expected += want;
+    CheckStats(expected);
+    EXPECT_EQ(expected, HugePagesRequested());
+  }
+}
+
+// Checks the span statistics (span count, returned pages, average age) that
+// HugeAllocator::AddSpanStats reports as pieces of one range are released.
+TEST_P(HugeAllocatorTest, Stats) {
+  struct Helper {
+    // Collects span stats from huge, expects nothing to be reported as small
+    // spans or as normal large pages, and outputs the number of large spans,
+    // the returned page count and the average age from the total histogram.
+    static void Stats(const HugeAllocator* huge, size_t* num_spans,
+                      Length* pages, absl::Duration* avg_age) {
+      SmallSpanStats small;
+      LargeSpanStats large;
+      PageAgeHistograms ages(absl::base_internal::CycleClock::Now());
+      huge->AddSpanStats(&small, &large, &ages);
+      for (auto i = Length(0); i < kMaxPages; ++i) {
+        EXPECT_EQ(0, small.normal_length[i.raw_num()]);
+        EXPECT_EQ(0, small.returned_length[i.raw_num()]);
+      }
+      *num_spans = large.spans;
+      EXPECT_EQ(Length(0), large.normal_pages);
+      *pages = large.returned_pages;
+      const PageAgeHistograms::Histogram* hist = ages.GetTotalHistogram(true);
+      *avg_age = absl::Seconds(hist->avg_age());
+    }
+  };
+
+  if (GetParam()) {
+    // Ensure overallocation doesn't skew our measurements below.
+    allocator_.Release(allocator_.Get(NHugePages(7)));
+  }
+  const HugeRange r = allocator_.Get(NHugePages(8));
+  ASSERT_TRUE(r.valid());
+  const HugePage p = r.start();
+  // Break it into 3 ranges, separated by one-page regions,
+  // so we can easily track the internal state in stats.
+  const HugeRange r1 = {p, NHugePages(1)};
+  const HugeRange b1 = {p + NHugePages(1), NHugePages(1)};
+  const HugeRange r2 = {p + NHugePages(2), NHugePages(2)};
+  const HugeRange b2 = {p + NHugePages(4), NHugePages(1)};
+  const HugeRange r3 = {p + NHugePages(5), NHugePages(3)};
+
+  size_t num_spans;
+  Length pages;
+  absl::Duration avg_age;
+
+  // Nothing has been released yet, so no free spans are expected.
+  Helper::Stats(&allocator_, &num_spans, &pages, &avg_age);
+  EXPECT_EQ(0, num_spans);
+  EXPECT_EQ(Length(0), pages);
+  EXPECT_EQ(absl::ZeroDuration(), avg_age);
+
+  allocator_.Release(r1);
+  constexpr absl::Duration kDelay = absl::Milliseconds(500);
+  absl::SleepFor(kDelay);
+  Helper::Stats(&allocator_, &num_spans, &pages, &avg_age);
+  EXPECT_EQ(1, num_spans);
+  EXPECT_EQ(NHugePages(1).in_pages(), pages);
+  // We can only do >= testing, because we might be arbitrarily delayed.
+  // Since avg_age is computed in floating point, we may have round-off from
+  // TCMalloc's internal use of absl::base_internal::CycleClock down through
+  // computing the average age of the spans.  kEpsilon allows for a tiny amount
+  // of slop.
+  constexpr absl::Duration kEpsilon = absl::Microseconds(500);
+  EXPECT_LE(kDelay - kEpsilon, avg_age);
+
+  // The expected averages below weight each released range's age by its
+  // length in hugepages (r1 = 1, r2 = 2, r3 = 3, b1 + b2 = 2).
+  allocator_.Release(r2);
+  absl::SleepFor(absl::Milliseconds(250));
+  Helper::Stats(&allocator_, &num_spans, &pages, &avg_age);
+  EXPECT_EQ(2, num_spans);
+  EXPECT_EQ(NHugePages(3).in_pages(), pages);
+  EXPECT_LE(
+      (absl::Seconds(0.75) * 1 + absl::Seconds(0.25) * 2) / (1 + 2) - kEpsilon,
+      avg_age);
+
+  allocator_.Release(r3);
+  absl::SleepFor(absl::Milliseconds(125));
+  Helper::Stats(&allocator_, &num_spans, &pages, &avg_age);
+  EXPECT_EQ(3, num_spans);
+  EXPECT_EQ(NHugePages(6).in_pages(), pages);
+  EXPECT_LE((absl::Seconds(0.875) * 1 + absl::Seconds(0.375) * 2 +
+             absl::Seconds(0.125) * 3) /
+                    (1 + 2 + 3) -
+                kEpsilon,
+            avg_age);
+
+  // Releasing the two separator pages is expected to leave a single
+  // 8-hugepage free span.
+  allocator_.Release(b1);
+  allocator_.Release(b2);
+  absl::SleepFor(absl::Milliseconds(100));
+  Helper::Stats(&allocator_, &num_spans, &pages, &avg_age);
+  EXPECT_EQ(1, num_spans);
+  EXPECT_EQ(NHugePages(8).in_pages(), pages);
+  EXPECT_LE((absl::Seconds(0.975) * 1 + absl::Seconds(0.475) * 2 +
+             absl::Seconds(0.225) * 3 + absl::Seconds(0.1) * 2) /
+                    (1 + 2 + 3 + 2) -
+                kEpsilon,
+            avg_age);
+}
+
+// Make sure we're well-behaved in the presence of OOM (and that we do
+// OOM at some point...)
+TEST_P(HugeAllocatorTest, OOM) {
+  // Keep doubling the request size until Get() returns an invalid range.  The
+  // test only finishes once the allocator actually reports failure.
+  for (HugeLength n = NHugePages(1); allocator_.Get(n).valid(); n *= 2) {
+  }
+}
+
+// Runs every HugeAllocatorTest twice: once with the parameter false (named
+// "normal") and once with true (named "overallocates").  The leading unary +
+// converts the captureless lambda to a plain function pointer.
+INSTANTIATE_TEST_SUITE_P(
+    NormalOverAlloc, HugeAllocatorTest, testing::Values(false, true),
+    +[](const testing::TestParamInfo<bool>& info) {
+      return info.param ? "overallocates" : "normal";
+    });
+
+}  // namespace
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
--- a/src/third_party/tcmalloc/dist/tcmalloc/huge_cache.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/huge_cache.cc
@ -0,0 +1,497 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tcmalloc/huge_cache.h"
+
+#include <algorithm>
+#include <cmath>
+#include <tuple>
+
+#include "absl/time/time.h"
+#include "tcmalloc/common.h"
+#include "tcmalloc/huge_address_map.h"
+#include "tcmalloc/huge_pages.h"
+#include "tcmalloc/internal/logging.h"
+#include "tcmalloc/stats.h"
+
+GOOGLE_MALLOC_SECTION_BEGIN
+namespace tcmalloc {
+namespace tcmalloc_internal {
+
+// Forwards val to the underlying time series.
+template <size_t kEpochs>
+void MinMaxTracker<kEpochs>::Report(HugeLength val) {
+  timeseries_.Report(val);
+}
+
+// Returns the largest Extrema::max among the entries IterBackwards visits for
+// a window of ceil(t / kEpochLength) epochs, or zero hugepages if it visits
+// none.
+template <size_t kEpochs>
+HugeLength MinMaxTracker<kEpochs>::MaxOverTime(absl::Duration t) const {
+  HugeLength m = NHugePages(0);
+  // Use std::ceil from <cmath> rather than relying on an unqualified ceil
+  // being visible through some transitive include.
+  const size_t num_epochs =
+      static_cast<size_t>(std::ceil(absl::FDivDuration(t, kEpochLength)));
+  timeseries_.IterBackwards(
+      [&](size_t /*offset*/, int64_t /*ts*/, const Extrema& e) {
+        m = std::max(m, e.max);
+      },
+      num_epochs);
+  return m;
+}
+
+// Returns the smallest Extrema::min among the entries IterBackwards visits
+// for a window of ceil(t / kEpochLength) epochs, or kMaxVal if it visits
+// none.
+template <size_t kEpochs>
+HugeLength MinMaxTracker<kEpochs>::MinOverTime(absl::Duration t) const {
+  HugeLength m = kMaxVal;
+  // Use std::ceil from <cmath> rather than relying on an unqualified ceil
+  // being visible through some transitive include.
+  const size_t num_epochs =
+      static_cast<size_t>(std::ceil(absl::FDivDuration(t, kEpochLength)));
+  timeseries_.IterBackwards(
+      [&](size_t /*offset*/, int64_t /*ts*/, const Extrema& e) {
+        m = std::min(m, e.min);
+      },
+      num_epochs);
+  return m;
+}
+
+template <size_t kEpochs>
+void MinMaxTracker<kEpochs>::Print(Printer* out) const {
+  // Output format: a header with the epoch length and epoch count, then
+  // offset:min_pages:max_pages triples for every window that has records.
+  // A fresh "Usage timeseries" line is started before every 100th triple.
+  // Timestamp == kEpochs - 1 is the most recent measurement.
+  const int64_t epoch_ms = absl::ToInt64Milliseconds(kEpochLength);
+  out->printf("\nHugeCache: window %lldms * %zu", epoch_ms, kEpochs);
+  int emitted = 0;
+  timeseries_.Iter(
+      [&](size_t offset, int64_t ts, const Extrema& e) {
+        const bool start_new_line = (emitted % 100 == 0);
+        ++emitted;
+        if (start_new_line) {
+          out->printf("\nHugeCache: Usage timeseries ");
+        }
+        out->printf("%zu:%zu:%zd,", offset, e.min.raw_num(), e.max.raw_num());
+      },
+      timeseries_.kSkipEmptyEntries);
+  out->printf("\n");
+}
+
+template <size_t kEpochs>
+void MinMaxTracker<kEpochs>::PrintInPbtxt(PbtxtRegion* hpaa) const {
+  // Emits a "huge_cache_history" sub-region holding the window parameters
+  // followed by one "measurements" entry per non-empty epoch, from oldest to
+  // most recent data.
+  auto history = hpaa->CreateSubRegion("huge_cache_history");
+  const int64_t window_ms = absl::ToInt64Milliseconds(kEpochLength);
+  history.PrintI64("window_ms", window_ms);
+  history.PrintI64("epochs", kEpochs);
+
+  timeseries_.Iter(
+      [&](size_t offset, int64_t ts, const Extrema& e) {
+        auto entry = history.CreateSubRegion("measurements");
+        entry.PrintI64("epoch", offset);
+        entry.PrintI64("min_bytes", e.min.in_bytes());
+        entry.PrintI64("max_bytes", e.max.in_bytes());
+      },
+      timeseries_.kSkipEmptyEntries);
+}
+
+template <size_t kEpochs>
+bool MinMaxTracker<kEpochs>::Extrema::operator==(const Extrema& other) const {
+  // Two extrema are equal only when both bounds match.
+  if (!(other.max == max)) return false;
+  return other.min == min;
+}
+
+// Explicit instantiations of the template for the default epoch count and
+// for 600 epochs; the member definitions above live in this .cc file.
+template class MinMaxTracker<>;
+template class MinMaxTracker<600>;
+
+// The logic for actually allocating from the cache or backing, and keeping
+// the hit rates specified.
+HugeRange HugeCache::DoGet(HugeLength n, bool* from_released) {
+  auto* node = Find(n);
+  if (!node) {
+    // Cache miss: fall through to the backing allocator.  *from_released is
+    // set to true only when that allocation succeeds; on failure it is left
+    // untouched, so callers must check the returned range's validity first.
+    misses_++;
+    weighted_misses_ += n.raw_num();
+    HugeRange res = allocator_->Get(n);
+    if (res.valid()) {
+      *from_released = true;
+    }
+
+    return res;
+  }
+  // Cache hit: account for it and carve n hugepages out of the found node.
+  hits_++;
+  weighted_hits_ += n.raw_num();
+  *from_released = false;
+  size_ -= n;
+  UpdateSize(size());
+  HugeRange result, leftover;
+  // Put back whatever we have left (or nothing, if it's exact.)
+  std::tie(result, leftover) = Split(node->range(), n);
+  cache_.Remove(node);
+  if (leftover.valid()) {
+    cache_.Insert(leftover);
+  }
+  return result;
+}
+
+// Raises limit_ (never lowers it) to the estimated size of the largest recent
+// "brief dip" in usage plus 10% slack.  The `missed` argument is not
+// consulted; the estimate comes purely from the usage trackers.
+void HugeCache::MaybeGrowCacheLimit(HugeLength missed) {
+  // Our goal is to make the cache size = the largest "brief dip."
+  //
+  // A "dip" being a case where usage shrinks, then increases back up
+  // to previous levels (at least partially).
+  //
+  // "brief" is "returns to normal usage in < kCacheTime." (In
+  // other words, we ideally want to be willing to cache memory for
+  // kCacheTime before expecting it to be used again--we are loose
+  // on the timing..)
+  //
+  // The interesting part is finding those dips.
+
+  // This is the downward slope: we lost some usage. (This in theory could
+  // be as much as 2 * kCacheTime old, which is fine.)
+  const HugeLength shrink = off_peak_tracker_.MaxOverTime(kCacheTime);
+
+  // This is the upward slope: we are coming back up.
+  const HugeLength grow = usage_ - usage_tracker_.MinOverTime(kCacheTime);
+
+  // Ideally we now know that we dipped down by some amount, then came
+  // up.  Sadly our stats aren't quite good enough to guarantee things
+  // happened in the proper order.  Suppose our usage takes the
+  // following path (in essentially zero time):
+  // 0, 10000, 5000, 5500.
+  //
+  // Clearly the proven dip here is 500.  But we'll compute shrink = 5000,
+  // grow = 5500--we'd prefer to measure from a min *after* that shrink.
+  //
+  // It's difficult to ensure this, and hopefully this case is rare.
+  // TODO(b/134690209): figure out if we can solve that problem.
+  const HugeLength dip = std::min(shrink, grow);
+
+  // Fragmentation: we may need to cache a little more than the actual
+  // usage jump. 10% seems to be a reasonable addition that doesn't waste
+  // much space, but gets good performance on tests.
+  const HugeLength slack = dip / 10;
+
+  const HugeLength lim = dip + slack;
+
+  // Only ever grow here, and record when the limit last changed.
+  if (lim > limit()) {
+    last_limit_change_ = clock_.now();
+    limit_ = lim;
+  }
+}
+
+// Adds n to the current usage, reports the new usage to both usage trackers,
+// and reports a zero sample to the off-peak tracker.
+void HugeCache::IncUsage(HugeLength n) {
+  usage_ += n;
+  usage_tracker_.Report(usage_);
+  detailed_tracker_.Report(usage_);
+  off_peak_tracker_.Report(NHugePages(0));
+}
+
+// Subtracts n from the current usage, reports the new usage to both usage
+// trackers, and reports how far usage now sits below its maximum over
+// kCacheTime to the off-peak tracker.
+void HugeCache::DecUsage(HugeLength n) {
+  usage_ -= n;
+  usage_tracker_.Report(usage_);
+  detailed_tracker_.Report(usage_);
+  const HugeLength max = usage_tracker_.MaxOverTime(kCacheTime);
+  // usage_ was just reported, so the windowed max is expected to cover it.
+  ASSERT(max >= usage_);
+  const HugeLength off_peak = max - usage_;
+  off_peak_tracker_.Report(off_peak);
+}
+
+void HugeCache::UpdateSize(HugeLength size) {
+  // Record the cache size sample, then accumulate regret as
+  // (hugepages cached) * (clock ticks since the previous update).
+  size_tracker_.Report(size);
+
+  // TODO(b/134691947): moving this inside the MinMaxTracker would save one call
+  // to clock_.now() but all MinMaxTrackers would track regret instead.
+  const int64_t ticks = clock_.now();
+  // Nothing to add unless the clock has advanced past the previous update.
+  if (ticks <= last_regret_update_) return;
+
+  const auto elapsed = ticks - last_regret_update_;
+  regret_ += size.raw_num() * elapsed;
+  last_regret_update_ = ticks;
+}
+
+HugeRange HugeCache::Get(HugeLength n, bool* from_released) {
+  HugeRange range = DoGet(n, from_released);
+  // Failure to get a range should essentially never happen (VSS limits
+  // or wildly incorrect allocation sizes only...), so cache size accounting
+  // does not special-case it.
+  IncUsage(range.len());
+
+  // A miss is a valid range that came from the backing allocator rather than
+  // the cache.  *from_released is only read once the range is known valid.
+  if (range.valid() && *from_released) {
+    MaybeGrowCacheLimit(n);
+  }
+  return range;
+}
+
+// Puts r into the cache, then trims the cache back down to its limit.
+void HugeCache::Release(HugeRange r) {
+  DecUsage(r.len());
+
+  cache_.Insert(r);
+  size_ += r.len();
+  // Count whether this release fit under the limit (a fill) or pushed the
+  // cache past it (an overflow).
+  if (size_ <= limit()) {
+    fills_++;
+  } else {
+    overflows_++;
+  }
+
+  // Shrink the limit, if we're going to do it, before we shrink to
+  // the max size.  (This could reduce the number of regions we break
+  // in half to avoid overshrinking.)
+  if ((clock_.now() - last_limit_change_) > (cache_time_ticks_ * 2)) {
+    total_fast_unbacked_ += MaybeShrinkCacheLimit();
+  }
+  // Unback whatever still exceeds the (possibly reduced) limit.
+  total_fast_unbacked_ += ShrinkCache(limit());
+
+  UpdateSize(size());
+}
+
+// Accounts for r no longer being in use and hands it directly to the
+// underlying allocator, bypassing the cache.
+void HugeCache::ReleaseUnbacked(HugeRange r) {
+  DecUsage(r.len());
+  // No point in trying to cache it, just hand it back.
+  allocator_->Release(r);
+}
+
+// Lowers limit_ when the cache has stayed well filled over the last
+// 2 * kCacheTime, then shrinks the cache to the new limit.  Returns the
+// number of hugepages ShrinkCache removed (zero if the limit was left
+// alone).  Always resets last_limit_change_, even when nothing changes.
+HugeLength HugeCache::MaybeShrinkCacheLimit() {
+  last_limit_change_ = clock_.now();
+
+  const HugeLength min = size_tracker_.MinOverTime(kCacheTime * 2);
+  // If cache size has dipped below 20% of the limit, we assume
+  // we're close enough to the optimal size--we don't want to fiddle
+  // too much/too often unless we have large gaps in usage.
+  if (min < limit() / 5) return NHugePages(0);
+
+  // Take away half of the unused portion (at least one hugepage), without
+  // going below MinCacheLimit().
+  HugeLength drop = std::max(min / 2, NHugePages(1));
+  limit_ = std::max(limit() <= drop ? NHugePages(0) : limit() - drop,
+                    MinCacheLimit());
+  return ShrinkCache(limit());
+}
+
+// Unbacks cached ranges and returns them to the allocator until size_ is no
+// larger than target or an unback attempt fails.  Returns the total length
+// removed from the cache.
+HugeLength HugeCache::ShrinkCache(HugeLength target) {
+  HugeLength removed = NHugePages(0);
+  while (size_ > target) {
+    // Remove smallest-ish nodes, to avoid fragmentation where possible.
+    auto* node = Find(NHugePages(1));
+    CHECK_CONDITION(node);
+    HugeRange r = node->range();
+    cache_.Remove(node);
+    // Suppose we're 10 MiB over target but the smallest available node
+    // is 100 MiB.  Don't go overboard--split up the range.
+    // In particular - this prevents disastrous results if we've decided
+    // the cache should be 99 MiB but the actual hot usage is 100 MiB
+    // (and it is unfragmented).
+    const HugeLength delta = size() - target;
+    if (r.len() > delta) {
+      HugeRange to_remove, leftover;
+      std::tie(to_remove, leftover) = Split(r, delta);
+      ASSERT(leftover.valid());
+      cache_.Insert(leftover);
+      r = to_remove;
+    }
+
+    // Deduct r before unbacking; the failure path below adds it back.
+    size_ -= r.len();
+    // Note, actual unback implementation is temporarily dropping and
+    // re-acquiring the page heap lock here.
+    if (ABSL_PREDICT_FALSE(!unback_(r.start_addr(), r.byte_len()))) {
+      // We failed to release r.  Retain it in the cache instead of returning it
+      // to the HugeAllocator.
+      size_ += r.len();
+      cache_.Insert(r);
+      break;
+    }
+    allocator_->Release(r);
+    removed += r.len();
+  }
+
+  return removed;
+}
+
+// First gives the cache limit a chance to shrink; if that released fewer
+// than n hugepages, shrinks the cache by the remainder (or empties it if the
+// remainder exceeds the cache size).  Returns the total released, which is
+// also added to total_periodic_unbacked_.
+HugeLength HugeCache::ReleaseCachedPages(HugeLength n) {
+  // This is a good time to check: is our cache going persistently unused?
+  HugeLength released = MaybeShrinkCacheLimit();
+
+  if (released < n) {
+    n -= released;
+    // Guard against underflow when asked for more than is cached.
+    const HugeLength target = n > size() ? NHugePages(0) : size() - n;
+    released += ShrinkCache(target);
+  }
+
+  UpdateSize(size());
+  total_periodic_unbacked_ += released;
+  return released;
+}
+
+void HugeCache::AddSpanStats(SmallSpanStats* small, LargeSpanStats* large,
+                             PageAgeHistograms* ages) const {
+  static_assert(kPagesPerHugePage >= kMaxPages);
+  // Walk every cached range in order.  Each one is tallied as a large span of
+  // normal pages and recorded in the age histogram; `small` is never written.
+  // Either output pointer may be null, in which case it is skipped.
+  const HugeAddressMap::Node* cur = cache_.first();
+  while (cur != nullptr) {
+    const HugeLength len = cur->range().len();
+    if (large != nullptr) {
+      large->spans++;
+      large->normal_pages += len.in_pages();
+    }
+
+    if (ages != nullptr) {
+      ages->RecordRange(len.in_pages(), false, cur->when());
+    }
+    cur = cur->next();
+  }
+}
+
+// Returns a cached node whose range is at least n long, or nullptr if the
+// single root-to-leaf descent below finds none.  Only one path is walked, so
+// the result is a good fit rather than a guaranteed best fit.
+HugeAddressMap::Node* HugeCache::Find(HugeLength n) {
+  HugeAddressMap::Node* curr = cache_.root();
+  // invariant: curr != nullptr && curr->longest >= n
+  // we favor smaller gaps and lower nodes and lower addresses, in that
+  // order. The net effect is that we are neither a best-fit nor a
+  // lowest-address allocator but vaguely close to both.
+  HugeAddressMap::Node* best = nullptr;
+  while (curr && curr->longest() >= n) {
+    // Remember the shortest adequate range seen so far on this path.
+    if (curr->range().len() >= n) {
+      if (!best || best->range().len() > curr->range().len()) {
+        best = curr;
+      }
+    }
+
+    // Either subtree could contain a better fit and we don't want to
+    // search the whole tree. Pick a reasonable child to look at.
+    auto left = curr->left();
+    auto right = curr->right();
+    // If only one child can satisfy n, there is no choice to make.  (When
+    // neither can, curr becomes right and the loop condition ends the walk.)
+    if (!left || left->longest() < n) {
+      curr = right;
+      continue;
+    }
+
+    if (!right || right->longest() < n) {
+      curr = left;
+      continue;
+    }
+
+    // Here, we have a nontrivial choice.
+    if (left->range().len() == right->range().len()) {
+      // The children's own ranges are equally long: descend into the one
+      // whose subtree's longest range is smaller (ties go left).
+      if (left->longest() <= right->longest()) {
+        curr = left;
+      } else {
+        curr = right;
+      }
+    } else if (left->range().len() < right->range().len()) {
+      // Otherwise look in the subtree with the smaller root, as that's
+      // slightly more likely to be our best.
+      curr = left;
+    } else {
+      curr = right;
+    }
+  }
+  return best;
+}
+
+// Writes a human-readable summary of the cache to out.  Not a pure read: it
+// refreshes the regret accounting via UpdateSize() and reports current
+// samples to the usage and off-peak trackers before querying them.
+void HugeCache::Print(Printer* out) {
+  const int64_t millis = absl::ToInt64Milliseconds(kCacheTime);
+  out->printf(
+      "HugeCache: contains unused, backed hugepage(s) "
+      "(kCacheTime = %lldms)\n",
+      millis);
+  // a / (a + b), avoiding division by zero
+  auto safe_ratio = [](double a, double b) {
+    const double total = a + b;
+    if (total == 0) return 0.0;
+    return a / total;
+  };
+
+  const double hit_rate = safe_ratio(hits_, misses_);
+  const double overflow_rate = safe_ratio(overflows_, fills_);
+
+  out->printf(
+      "HugeCache: %zu / %zu hugepages cached / cache limit "
+      "(%.3f hit rate, %.3f overflow rate)\n",
+      size_.raw_num(), limit().raw_num(), hit_rate, overflow_rate);
+  out->printf("HugeCache: %zu MiB fast unbacked, %zu MiB periodic\n",
+              total_fast_unbacked_.in_bytes() / 1024 / 1024,
+              total_periodic_unbacked_.in_bytes() / 1024 / 1024);
+  // Bring regret_ up to date before printing it.  regret_ is accumulated in
+  // hugepages * clock ticks, hence the division by the clock frequency.
+  UpdateSize(size());
+  out->printf(
+      "HugeCache: %zu MiB*s cached since startup\n",
+      NHugePages(regret_).in_mib() / static_cast<size_t>(clock_.freq()));
+
+  // Report the current usage first so the windowed min/max include it.
+  usage_tracker_.Report(usage_);
+  const HugeLength usage_min = usage_tracker_.MinOverTime(kCacheTime);
+  const HugeLength usage_max = usage_tracker_.MaxOverTime(kCacheTime);
+  out->printf(
+      "HugeCache: recent usage range: %zu min - %zu curr -  %zu max MiB\n",
+      usage_min.in_mib(), usage_.in_mib(), usage_max.in_mib());
+
+  const HugeLength off_peak = usage_max - usage_;
+  off_peak_tracker_.Report(off_peak);
+  const HugeLength off_peak_min = off_peak_tracker_.MinOverTime(kCacheTime);
+  const HugeLength off_peak_max = off_peak_tracker_.MaxOverTime(kCacheTime);
+  out->printf(
+      "HugeCache: recent offpeak range: %zu min - %zu curr - %zu max MiB\n",
+      off_peak_min.in_mib(), off_peak.in_mib(), off_peak_max.in_mib());
+
+  const HugeLength cache_min = size_tracker_.MinOverTime(kCacheTime);
+  const HugeLength cache_max = size_tracker_.MaxOverTime(kCacheTime);
+  out->printf(
+      "HugeCache: recent cache range: %zu min - %zu curr - %zu max MiB\n",
+      cache_min.in_mib(), size_.in_mib(), cache_max.in_mib());
+
+  detailed_tracker_.Print(out);
+}
+
+// Writes the cache statistics into hpaa as pbtxt fields and sub-regions.  Not
+// a pure read: it refreshes the regret accounting via UpdateSize() and
+// reports current samples to the usage and off-peak trackers before querying
+// them.
+void HugeCache::PrintInPbtxt(PbtxtRegion* hpaa) {
+  hpaa->PrintI64("huge_cache_time_const",
+                 absl::ToInt64Milliseconds(kCacheTime));
+
+  // a / (a + b), avoiding division by zero
+  auto safe_ratio = [](double a, double b) {
+    const double total = a + b;
+    if (total == 0) return 0.0;
+    return a / total;
+  };
+
+  const double hit_rate = safe_ratio(hits_, misses_);
+  const double overflow_rate = safe_ratio(overflows_, fills_);
+
+  // number of bytes in HugeCache
+  hpaa->PrintI64("cached_huge_page_bytes", size_.in_bytes());
+  // max allowed bytes in HugeCache
+  hpaa->PrintI64("max_cached_huge_page_bytes", limit().in_bytes());
+  // lifetime cache hit rate
+  hpaa->PrintDouble("huge_cache_hit_rate", hit_rate);
+  // lifetime cache overflow rate
+  hpaa->PrintDouble("huge_cache_overflow_rate", overflow_rate);
+  // bytes eagerly unbacked by HugeCache
+  hpaa->PrintI64("fast_unbacked_bytes", total_fast_unbacked_.in_bytes());
+  // bytes unbacked by periodic releaser thread
+  hpaa->PrintI64("periodic_unbacked_bytes",
+                 total_periodic_unbacked_.in_bytes());
+  // Bring regret_ up to date before printing it.
+  UpdateSize(size());
+  // memory cached since startup (in MiB*s)
+  hpaa->PrintI64("huge_cache_regret", NHugePages(regret_).in_mib() /
+                                          static_cast<size_t>(clock_.freq()));
+
+  // Report the current usage first so the windowed min/max include it.
+  usage_tracker_.Report(usage_);
+  const HugeLength usage_min = usage_tracker_.MinOverTime(kCacheTime);
+  const HugeLength usage_max = usage_tracker_.MaxOverTime(kCacheTime);
+  {
+    auto usage_stats = hpaa->CreateSubRegion("huge_cache_usage_stats");
+    usage_stats.PrintI64("min_bytes", usage_min.in_bytes());
+    usage_stats.PrintI64("current_bytes", usage_.in_bytes());
+    usage_stats.PrintI64("max_bytes", usage_max.in_bytes());
+  }
+
+  const HugeLength off_peak = usage_max - usage_;
+  off_peak_tracker_.Report(off_peak);
+  const HugeLength off_peak_min = off_peak_tracker_.MinOverTime(kCacheTime);
+  const HugeLength off_peak_max = off_peak_tracker_.MaxOverTime(kCacheTime);
+  {
+    auto usage_stats = hpaa->CreateSubRegion("huge_cache_offpeak_stats");
+    usage_stats.PrintI64("min_bytes", off_peak_min.in_bytes());
+    usage_stats.PrintI64("current_bytes", off_peak.in_bytes());
+    usage_stats.PrintI64("max_bytes", off_peak_max.in_bytes());
+  }
+
+  const HugeLength cache_min = size_tracker_.MinOverTime(kCacheTime);
+  const HugeLength cache_max = size_tracker_.MaxOverTime(kCacheTime);
+  {
+    auto usage_stats = hpaa->CreateSubRegion("huge_cache_cache_stats");
+    usage_stats.PrintI64("min_bytes", cache_min.in_bytes());
+    usage_stats.PrintI64("current_bytes", size_.in_bytes());
+    usage_stats.PrintI64("max_bytes", cache_max.in_bytes());
+  }
+
+  detailed_tracker_.PrintInPbtxt(hpaa);
+}
+
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
+GOOGLE_MALLOC_SECTION_END
--- a/src/third_party/tcmalloc/dist/tcmalloc/huge_cache.h
+++ b/src/third_party/tcmalloc/dist/tcmalloc/huge_cache.h
@ -0,0 +1,263 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Wrapping interface for HugeAllocator that handles backing and
+// unbacking, including a hot cache of backed single hugepages.
+#ifndef TCMALLOC_HUGE_CACHE_H_
+#define TCMALLOC_HUGE_CACHE_H_
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <limits>
+
+#include "absl/time/time.h"
+#include "tcmalloc/common.h"
+#include "tcmalloc/experiment.h"
+#include "tcmalloc/experiment_config.h"
+#include "tcmalloc/huge_allocator.h"
+#include "tcmalloc/huge_pages.h"
+#include "tcmalloc/internal/config.h"
+#include "tcmalloc/internal/logging.h"
+#include "tcmalloc/internal/timeseries_tracker.h"
+#include "tcmalloc/stats.h"
+
+GOOGLE_MALLOC_SECTION_BEGIN
+namespace tcmalloc {
+namespace tcmalloc_internal {
+
+// Small callable wrapper around a raw function pointer of the form
+// bool(void* start, size_t len) that modifies a range of memory.
+class MemoryModifyFunction {
+  typedef bool (*ReleaseFunction)(void*, size_t);
+
+ public:
+  explicit MemoryModifyFunction(ReleaseFunction func) : func_(func) {}
+
+  // Forwards the range [start, start + len) to the wrapped function and hands
+  // back its status, which the caller is required to consume.
+  ABSL_MUST_USE_RESULT bool operator()(void* start, size_t len) {
+    const bool status = func_(start, len);
+    return status;
+  }
+
+ private:
+  ReleaseFunction func_;
+};
+
+// Track the extreme values of a HugeLength value over the past window <w>
+// given at construction (time ranges approximate.)  The window is divided
+// into kEpochs epochs of length w / kEpochs each.
+template <size_t kEpochs = 16>
+class MinMaxTracker {
+ public:
+  // <clock> supplies the time source handed to the underlying time series;
+  // <w> is the total duration of history to keep.
+  explicit constexpr MinMaxTracker(Clock clock, absl::Duration w)
+      : kEpochLength(w / kEpochs), timeseries_(clock, w) {}
+
+  // Record a new observation.
+  void Report(HugeLength val);
+  void Print(Printer* out) const;
+  void PrintInPbtxt(PbtxtRegion* hpaa) const;
+
+  // If t < kEpochLength, these functions return statistics for last epoch. The
+  // granularity is kEpochLength (rounded up).
+  HugeLength MaxOverTime(absl::Duration t) const;
+  HugeLength MinOverTime(absl::Duration t) const;
+
+ private:
+  // Duration covered by a single epoch (window / kEpochs).
+  const absl::Duration kEpochLength;
+
+  // Largest representable HugeLength; used as the "no minimum seen yet"
+  // sentinel in Extrema::Nil().
+  static constexpr HugeLength kMaxVal =
+      NHugePages(std::numeric_limits<size_t>::max());
+  // Running minimum/maximum for one epoch.
+  struct Extrema {
+    HugeLength min, max;
+
+    // The empty value: max is 0 and min is kMaxVal, so that the first
+    // Report() overwrites both.
+    static Extrema Nil() {
+      Extrema e;
+      e.max = NHugePages(0);
+      e.min = kMaxVal;
+      return e;
+    }
+
+    // Fold a new observation into the running extremes.
+    void Report(HugeLength n) {
+      max = std::max(max, n);
+      min = std::min(min, n);
+    }
+
+    // True if nothing has been reported (i.e. still equal to Nil()).
+    bool empty() const { return (*this == Nil()); }
+
+    bool operator==(const Extrema& other) const;
+  };
+
+  TimeSeriesTracker<Extrema, HugeLength, kEpochs> timeseries_;
+};
+
+// Explicit instantiations are defined in huge_cache.cc.
+extern template class MinMaxTracker<>;
+extern template class MinMaxTracker<600>;
+
+// Out-of-class definition of the static constexpr member declared above.
+template <size_t kEpochs>
+constexpr HugeLength MinMaxTracker<kEpochs>::kMaxVal;
+
+// Wraps a HugeAllocator with a cache of backed hugepages.  Tracks how much
+// memory callers currently hold (usage), how much backed memory is cached
+// (size), the current cache limit, and statistics about both over time.
+class HugeCache {
+ public:
+  // For use in production
+  // Delegates to the clock-taking constructor below using the real
+  // CycleClock.
+  HugeCache(HugeAllocator* allocator, MetadataAllocFunction meta_allocate,
+            MemoryModifyFunction unback)
+      : HugeCache(allocator, meta_allocate, unback,
+                  Clock{.now = absl::base_internal::CycleClock::Now,
+                        .freq = absl::base_internal::CycleClock::Frequency}) {}
+
+  // For testing with mock clock.
+  //
+  // 2s (kCacheTime * 2) looks like an arbitrary window; it mostly is.
+  //
+  // Suffice to say that the below code (see MaybeGrowCacheLimit)
+  // tries to make sure the cache is sized to protect a working set
+  // that ebbs for 1 second, as a reasonable heuristic. This means it
+  // needs 1s of historical data to examine.
+  //
+  // Why 2s duration, then? Two reasons:
+  //
+  // - (minor) granularity of epoch boundaries make me want to err towards
+  //   keeping a bit too much data over a bit too little.
+  //
+  // - (major) hysteresis: in ReleaseCachedPages we try to detect
+  //   mistaken cache expansion and reverse it. I hope that using a
+  //   longer timescale than our expansion will increase stability
+  //   here: I will take some caches staying a bit too big over caches
+  //   oscillating back and forth between two size estimates, so we
+  //   require stronger evidence (longer time) to reverse an expansion
+  //   than to make it.
+  //
+  // We also tried other algorithms, but this one is simple and suffices to
+  // capture the empirical dynamics we've seen.  See "Beyond Malloc
+  // Efficiency..." (https://research.google/pubs/pub50370/) for more
+  // information.
+  //
+  // NOTE: the initializers below read clock_ and kCacheTime, so both must be
+  // declared before cache_time_ticks_ (members are initialized in declaration
+  // order, not in initializer-list order).
+  HugeCache(HugeAllocator* allocator, MetadataAllocFunction meta_allocate,
+            MemoryModifyFunction unback, Clock clock)
+      : allocator_(allocator),
+        cache_(meta_allocate),
+        clock_(clock),
+        cache_time_ticks_(clock_.freq() * absl::ToDoubleSeconds(kCacheTime)),
+        nanoseconds_per_tick_(absl::ToInt64Nanoseconds(absl::Seconds(1)) /
+                              clock_.freq()),
+        last_limit_change_(clock.now()),
+        last_regret_update_(clock.now()),
+        detailed_tracker_(clock, absl::Minutes(10)),
+        usage_tracker_(clock, kCacheTime * 2),
+        off_peak_tracker_(clock, kCacheTime * 2),
+        size_tracker_(clock, kCacheTime * 2),
+        unback_(unback) {}
+  // Allocate a usable set of <n> contiguous hugepages.  Try to give out
+  // memory that's currently backed from the kernel if we have it available.
+  // *from_released is set to false if the return range is already backed;
+  // otherwise, it is set to true (and the caller should back it.)
+  HugeRange Get(HugeLength n, bool* from_released);
+
+  // Deallocate <r> (assumed to be backed by the kernel.)
+  void Release(HugeRange r);
+  // As Release, but the range is assumed to _not_ be backed.
+  void ReleaseUnbacked(HugeRange r);
+
+  // Release to the system up to <n> hugepages of cache contents; returns
+  // the number of hugepages released.
+  HugeLength ReleaseCachedPages(HugeLength n);
+
+  // Backed memory available.
+  HugeLength size() const { return size_; }
+  // Total memory cached (in HugeLength * nanoseconds)
+  // regret_ is accumulated in clock ticks and converted to nanoseconds here.
+  uint64_t regret() const { return regret_ * nanoseconds_per_tick_; }
+  // Current limit for how much backed memory we'll cache.
+  HugeLength limit() const { return limit_; }
+  // Sum total of unreleased requests.
+  HugeLength usage() const { return usage_; }
+
+  void AddSpanStats(SmallSpanStats* small, LargeSpanStats* large,
+                    PageAgeHistograms* ages) const;
+
+  // Summarizes the cache as BackingStats: system bytes are usage plus cached
+  // size, free bytes are the cached size, and unmapped bytes are always 0.
+  BackingStats stats() const {
+    BackingStats s;
+    s.system_bytes = (usage() + size()).in_bytes();
+    s.free_bytes = size().in_bytes();
+    s.unmapped_bytes = 0;
+    return s;
+  }
+
+  // Non-const: printing also refreshes internal statistics.
+  void Print(Printer* out);
+  void PrintInPbtxt(PbtxtRegion* hpaa);
+
+ private:
+  HugeAllocator* allocator_;
+
+  // We just cache-missed a request for <missed> pages;
+  // should we grow?
+  void MaybeGrowCacheLimit(HugeLength missed);
+  // Check if the cache seems consistently too big.  Returns the
+  // number of pages *evicted* (not the change in limit).
+  HugeLength MaybeShrinkCacheLimit();
+
+  // Ensure the cache contains at most <target> hugepages,
+  // returning the number removed.
+  HugeLength ShrinkCache(HugeLength target);
+
+  HugeRange DoGet(HugeLength n, bool* from_released);
+
+  HugeAddressMap::Node* Find(HugeLength n);
+
+  HugeAddressMap cache_;
+  HugeLength size_{NHugePages(0)};
+
+  HugeLength limit_{NHugePages(10)};
+  // Timescale used for cache sizing decisions; the trackers below keep twice
+  // this much history.
+  const absl::Duration kCacheTime = absl::Seconds(1);
+
+  // Event counters (presumably reported by Print/PrintInPbtxt; their updates
+  // live in the .cc file).
+  size_t hits_{0};
+  size_t misses_{0};
+  size_t fills_{0};
+  size_t overflows_{0};
+  uint64_t weighted_hits_{0};
+  uint64_t weighted_misses_{0};
+
+  // Sum(size of Gets) - Sum(size of Releases), i.e. amount of backed
+  // hugepages our user currently wants to have.
+  void IncUsage(HugeLength n);
+  void DecUsage(HugeLength n);
+  HugeLength usage_{NHugePages(0)};
+
+  // This is CycleClock, except overridable for tests.
+  Clock clock_;
+  // kCacheTime expressed in clock ticks.
+  const int64_t cache_time_ticks_;
+  // Conversion factor from clock ticks to nanoseconds.
+  const double nanoseconds_per_tick_;
+
+  // Clock reading (in ticks) taken at construction; presumably refreshed
+  // whenever the limit changes.
+  int64_t last_limit_change_;
+
+  // 10 hugepages is a good baseline for our cache--easily wiped away
+  // by periodic release, and not that much memory on any real server.
+  // However, we can go below it if we haven't used that much for 30 seconds.
+  HugeLength MinCacheLimit() const { return NHugePages(10); }
+
+  uint64_t regret_{0};  // overflows if we cache 585 hugepages for 1 year
+  int64_t last_regret_update_;
+  void UpdateSize(HugeLength size);
+
+  // Ten minutes of history split into 600 epochs.
+  MinMaxTracker<600> detailed_tracker_;
+
+  // Each of these keeps kCacheTime * 2 of history with the default epoch
+  // count.
+  MinMaxTracker<> usage_tracker_;
+  MinMaxTracker<> off_peak_tracker_;
+  MinMaxTracker<> size_tracker_;
+
+  HugeLength total_fast_unbacked_{NHugePages(0)};
+  HugeLength total_periodic_unbacked_{NHugePages(0)};
+
+  // Callback used to unback memory.
+  MemoryModifyFunction unback_;
+};
+
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
+GOOGLE_MALLOC_SECTION_END
+
+#endif  // TCMALLOC_HUGE_CACHE_H_
--- a/src/third_party/tcmalloc/dist/tcmalloc/huge_cache_test.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/huge_cache_test.cc
@ -0,0 +1,622 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tcmalloc/huge_cache.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <memory>
+#include <random>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/base/internal/cycleclock.h"
+#include "absl/memory/memory.h"
+#include "absl/random/random.h"
+#include "absl/strings/str_cat.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "tcmalloc/huge_pages.h"
+#include "tcmalloc/internal/clock.h"
+#include "tcmalloc/internal/config.h"
+#include "tcmalloc/internal/logging.h"
+#include "tcmalloc/stats.h"
+
+namespace tcmalloc {
+namespace tcmalloc_internal {
+namespace {
+
+using testing::Return;
+
+// Test fixture that wires a HugeCache to a fake address-space allocator, a
+// malloc-backed metadata allocator, a mockable unback function and a clock
+// that tests can move forward with Advance().
+class HugeCacheTest : public testing::Test {
+ private:
+  // Allow tests to modify the clock used by the cache.
+  // Offset, in nanoseconds, added on top of the real cycle clock.
+  static int64_t clock_offset_;
+  static double GetClockFrequency() {
+    return absl::base_internal::CycleClock::Frequency();
+  }
+  // Real cycle counter plus clock_offset_ converted from nanoseconds to
+  // ticks.
+  static int64_t GetClock() {
+    return absl::base_internal::CycleClock::Now() +
+           clock_offset_ * GetClockFrequency() /
+               absl::ToDoubleNanoseconds(absl::Seconds(1));
+  }
+
+  // Use a tiny fraction of actual size so we can test aggressively.
+  // Both <bytes> and <align> must be multiples of kHugePageSize; the returned
+  // range is a fake address computed from an index into `backing`.
+  static AddressRange AllocateFake(size_t bytes, size_t align) {
+    if (bytes % kHugePageSize != 0) {
+      Crash(kCrash, __FILE__, __LINE__, "not aligned", bytes, kHugePageSize);
+    }
+    if (align % kHugePageSize != 0) {
+      Crash(kCrash, __FILE__, __LINE__, "not aligned", align, kHugePageSize);
+    }
+    // we'll actually provide hidden backing, one word per hugepage.
+    bytes /= kHugePageSize;
+    align /= kHugePageSize;
+    size_t index = backing.size();
+    if (index % align != 0) {
+      // Round up to the next multiple of <align>.  This must use the
+      // remainder (not a bitwise AND with <align>) to land on an aligned
+      // index.
+      index += (align - (index % align));
+    }
+    backing.resize(index + bytes);
+    void* ptr = reinterpret_cast<void*>(index * kHugePageSize);
+    return {ptr, bytes * kHugePageSize};
+  }
+  // This isn't super good form but we'll never have more than one HAT
+  // extant at once.
+  static std::vector<size_t> backing;
+
+  // We use actual malloc for metadata allocations, but we track them so they
+  // can be deleted.  (TODO make this an arena if we care, which I doubt)
+  static void* MallocMetadata(size_t size) {
+    metadata_bytes += size;
+    void* ptr = calloc(size, 1);
+    metadata_allocs.push_back(ptr);
+    return ptr;
+  }
+  static std::vector<void*> metadata_allocs;
+  static size_t metadata_bytes;
+
+  // This is wordy, but necessary for mocking:
+  class BackingInterface {
+   public:
+    virtual bool Unback(void* p, size_t len) = 0;
+    virtual ~BackingInterface() {}
+  };
+
+  class MockBackingInterface : public BackingInterface {
+   public:
+    MOCK_METHOD(bool, Unback, (void* p, size_t len), (override));
+  };
+
+  // Plain function (as MemoryModifyFunction requires) that forwards to the
+  // current mock.
+  static bool MockUnback(void* p, size_t len) { return mock_->Unback(p, len); }
+
+ protected:
+  static std::unique_ptr<testing::NiceMock<MockBackingInterface>> mock_;
+
+  HugeCacheTest() {
+    // We don't use the first few bytes, because things might get weird
+    // given zero pointers.
+    backing.resize(1024);
+    metadata_bytes = 0;
+    mock_ = absl::make_unique<testing::NiceMock<MockBackingInterface>>();
+  }
+
+  // Frees all tracked metadata and resets the shared static state so the next
+  // test starts clean.
+  ~HugeCacheTest() override {
+    for (void* p : metadata_allocs) {
+      free(p);
+    }
+    metadata_allocs.clear();
+    backing.clear();
+    mock_.reset(nullptr);
+
+    clock_offset_ = 0;
+  }
+
+  // Moves the cache's clock forward by <d>.
+  void Advance(absl::Duration d) {
+    clock_offset_ += absl::ToInt64Nanoseconds(d);
+  }
+
+  HugeAllocator alloc_{AllocateFake, MallocMetadata};
+  HugeCache cache_{&alloc_, MallocMetadata, MemoryModifyFunction(MockUnback),
+                   Clock{.now = GetClock, .freq = GetClockFrequency}};
+};
+
+std::vector<size_t> HugeCacheTest::backing;
+std::vector<void*> HugeCacheTest::metadata_allocs;
+size_t HugeCacheTest::metadata_bytes;
+std::unique_ptr<testing::NiceMock<HugeCacheTest::MockBackingInterface>>
+    HugeCacheTest::mock_;
+
+int64_t HugeCacheTest::clock_offset_ = 0;
+
+// Smoke test: repeatedly fetch a single hugepage and hand it straight back.
+TEST_F(HugeCacheTest, Basic) {
+  bool from_released;
+  constexpr int kIterations = 100 * 1000;
+  for (int iter = 0; iter != kIterations; ++iter) {
+    HugeRange r = cache_.Get(NHugePages(1), &from_released);
+    cache_.Release(r);
+  }
+}
+
+// Checks that a cached range can be split into smaller backed pieces and
+// that those pieces merge again after being released.
+TEST_F(HugeCacheTest, Backing) {
+  bool from_released;
+  // The very first request cannot be served from the (empty) cache.
+  HugeRange initial = cache_.Get(NHugePages(4), &from_released);
+  cache_.Release(initial);
+  EXPECT_TRUE(from_released);
+
+  // We should be able to split up a large range...
+  HugeRange three = cache_.Get(NHugePages(3), &from_released);
+  EXPECT_FALSE(from_released);
+  HugeRange single = cache_.Get(NHugePages(1), &from_released);
+  EXPECT_FALSE(from_released);
+
+  // and then merge it back.
+  cache_.Release(three);
+  cache_.Release(single);
+  HugeRange whole = cache_.Get(NHugePages(4), &from_released);
+  EXPECT_FALSE(from_released);
+  cache_.Release(whole);
+}
+
+// Verifies that ReleaseCachedPages unbacks exactly the cached ranges and
+// reports how many hugepages it released when unbacking succeeds.
+TEST_F(HugeCacheTest, Release) {
+  bool from;
+  const HugeLength one = NHugePages(1);
+  cache_.Release(cache_.Get(NHugePages(5), &from));
+
+  // Take the five cached hugepages one at a time, then return them all.
+  HugeRange r[5];
+  for (HugeRange& range : r) {
+    range = cache_.Get(one, &from);
+  }
+  for (const HugeRange& range : r) {
+    cache_.Release(range);
+  }
+
+  // A second pass must be served entirely from backed memory.
+  for (HugeRange& range : r) {
+    range = cache_.Get(one, &from);
+    ASSERT_EQ(false, from);
+  }
+  cache_.Release(r[0]);
+  cache_.Release(r[1]);
+  cache_.Release(r[4]);
+
+  ASSERT_EQ(NHugePages(3), cache_.size());
+  EXPECT_CALL(*mock_, Unback(r[4].start_addr(), kHugePageSize * 1))
+      .WillOnce(Return(true));
+  EXPECT_EQ(NHugePages(1), cache_.ReleaseCachedPages(NHugePages(1)));
+  cache_.Release(r[2]);
+  cache_.Release(r[3]);
+
+  // The four remaining hugepages are contiguous and go out in one call.
+  EXPECT_CALL(*mock_, Unback(r[0].start_addr(), 4 * kHugePageSize))
+      .WillOnce(Return(true));
+  EXPECT_EQ(NHugePages(4), cache_.ReleaseCachedPages(NHugePages(200)));
+}
+
+// Same scenario as Release, but the unback callback fails: nothing may be
+// counted as released and the pages stay in the cache.
+TEST_F(HugeCacheTest, ReleaseFailure) {
+  bool from;
+  const HugeLength one = NHugePages(1);
+  cache_.Release(cache_.Get(NHugePages(5), &from));
+
+  // Take the five cached hugepages one at a time, then return them all.
+  HugeRange r[5];
+  for (HugeRange& range : r) {
+    range = cache_.Get(one, &from);
+  }
+  for (const HugeRange& range : r) {
+    cache_.Release(range);
+  }
+
+  // A second pass must be served entirely from backed memory.
+  for (HugeRange& range : r) {
+    range = cache_.Get(one, &from);
+    ASSERT_EQ(false, from);
+  }
+  cache_.Release(r[0]);
+  cache_.Release(r[1]);
+  cache_.Release(r[4]);
+
+  ASSERT_EQ(NHugePages(3), cache_.size());
+  EXPECT_CALL(*mock_, Unback(r[4].start_addr(), 1 * kHugePageSize))
+      .WillOnce(Return(false));
+  EXPECT_EQ(NHugePages(0), cache_.ReleaseCachedPages(NHugePages(1)));
+  cache_.Release(r[2]);
+  cache_.Release(r[3]);
+
+  // All five hugepages are still cached, so one five-page unback is tried.
+  EXPECT_CALL(*mock_, Unback(r[0].start_addr(), 5 * kHugePageSize))
+      .WillOnce(Return(false));
+  EXPECT_EQ(NHugePages(0), cache_.ReleaseCachedPages(NHugePages(200)));
+}
+
+// Checks that regret grows as (cached hugepages) x (elapsed nanoseconds)
+// while memory sits unused in the cache.
+TEST_F(HugeCacheTest, Regret) {
+  bool from;
+  HugeRange r = cache_.Get(NHugePages(20), &from);
+  cache_.Release(r);
+  // Whatever the cache kept of the 20 released hugepages.
+  HugeLength cached = cache_.size();
+  absl::Duration d = absl::Seconds(20);
+  Advance(d);
+  char buf[512];
+  Printer out(buf, 512);
+  cache_.Print(&out);  // To update the regret
+  uint64_t expected_regret = absl::ToInt64Nanoseconds(d) * cached.raw_num();
+  // Not exactly accurate since the mock clock advances with real time, and
+  // when we measure regret will be updated.
+  // Hence: within 1% of the expectation, and never below it.
+  EXPECT_NEAR(cache_.regret(), expected_regret, expected_regret / 100);
+  EXPECT_GE(cache_.regret(), expected_regret);
+}
+
+// Checks the span statistics (span count, backed/unbacked pages, average
+// age) reported by AddSpanStats as ranges are released into the cache.
+TEST_F(HugeCacheTest, Stats) {
+  bool from;
+  // Carve one 8-hugepage range into r1 (1), r2 (2) and r3 (3), separated by
+  // single-hugepage spacers that are never released, so the three ranges
+  // cannot merge in the cache.
+  HugeRange r = cache_.Get(NHugePages(1 + 1 + 2 + 1 + 3), &from);
+  HugeRange r1, r2, r3, spacer1, spacer2;
+  std::tie(r1, spacer1) = Split(r, NHugePages(1));
+  std::tie(spacer1, r2) = Split(spacer1, NHugePages(1));
+  std::tie(r2, spacer2) = Split(r2, NHugePages(2));
+  std::tie(spacer2, r3) = Split(spacer2, NHugePages(1));
+  cache_.Release(r1);
+  cache_.Release(r2);
+  cache_.Release(r3);
+
+  ASSERT_EQ(NHugePages(6), cache_.size());
+  r1 = cache_.Get(NHugePages(1), &from);
+  ASSERT_EQ(false, from);
+  r2 = cache_.Get(NHugePages(2), &from);
+  ASSERT_EQ(false, from);
+  r3 = cache_.Get(NHugePages(3), &from);
+  ASSERT_EQ(false, from);
+
+  // Collects the large-span statistics and the total average age from the
+  // cache into the output parameters.
+  struct Helper {
+    static void Stat(const HugeCache& cache, size_t* spans,
+                     Length* pages_backed, Length* pages_unbacked,
+                     double* avg_age) {
+      PageAgeHistograms ages(absl::base_internal::CycleClock::Now());
+      LargeSpanStats large;
+      cache.AddSpanStats(nullptr, &large, &ages);
+
+      const PageAgeHistograms::Histogram* hist = ages.GetTotalHistogram(false);
+      *spans = large.spans;
+      *pages_backed = large.normal_pages;
+      *pages_unbacked = large.returned_pages;
+      *avg_age = hist->avg_age();
+    }
+  };
+
+  double avg_age;
+  size_t spans;
+  Length pages_backed;
+  Length pages_unbacked;
+
+  // The age checks below are lower bounds only: the sleeps may overshoot.
+  cache_.Release(r1);
+  absl::SleepFor(absl::Microseconds(5000));
+  Helper::Stat(cache_, &spans, &pages_backed, &pages_unbacked, &avg_age);
+  EXPECT_EQ(Length(0), pages_unbacked);
+  EXPECT_EQ(1, spans);
+  EXPECT_EQ(NHugePages(1).in_pages(), pages_backed);
+  EXPECT_LE(0.005, avg_age);
+
+  // r1 is now at least 7.5ms old and r2 at least 2.5ms; the expected average
+  // is weighted by hugepage count (1 and 2).
+  cache_.Release(r2);
+  absl::SleepFor(absl::Microseconds(2500));
+  Helper::Stat(cache_, &spans, &pages_backed, &pages_unbacked, &avg_age);
+  EXPECT_EQ(Length(0), pages_unbacked);
+  EXPECT_EQ(2, spans);
+  EXPECT_EQ(NHugePages(3).in_pages(), pages_backed);
+  EXPECT_LE((0.0075 * 1 + 0.0025 * 2) / (1 + 2), avg_age);
+
+  // After another 1.25ms: ages of at least 8.75ms, 3.75ms and 1.25ms,
+  // weighted by 1, 2 and 3 hugepages.
+  cache_.Release(r3);
+  absl::SleepFor(absl::Microseconds(1250));
+  Helper::Stat(cache_, &spans, &pages_backed, &pages_unbacked, &avg_age);
+  EXPECT_EQ(Length(0), pages_unbacked);
+  EXPECT_EQ(3, spans);
+  EXPECT_EQ(NHugePages(6).in_pages(), pages_backed);
+  EXPECT_LE((0.00875 * 1 + 0.00375 * 2 + 0.00125 * 3) / (1 + 2 + 3), avg_age);
+}
+
+// Returns num / denom as a floating-point ratio of raw hugepage counts.
+static double Frac(HugeLength num, HugeLength denom) {
+  const double numerator = static_cast<double>(num.raw_num());
+  return numerator / denom.raw_num();
+}
+
+// Checks that, for a range of working-set sizes, the cache grows enough to
+// serve most requests from backed memory without growing far beyond the
+// working set.
+TEST_F(HugeCacheTest, Growth) {
+  EXPECT_CALL(*mock_, Unback(testing::_, testing::_))
+      .WillRepeatedly(Return(true));
+
+  bool released;
+  absl::BitGen rng;
+  // fragmentation is a bit of a challenge
+  std::uniform_int_distribution<size_t> sizes(1, 5);
+  // fragment the cache badly.
+  std::vector<HugeRange> keep;
+  std::vector<HugeRange> drop;
+  for (int i = 0; i < 1000; ++i) {
+    // Randomly assign each allocation to the kept or the dropped set.
+    auto& l = std::bernoulli_distribution()(rng) ? keep : drop;
+    l.push_back(cache_.Get(NHugePages(sizes(rng)), &released));
+  }
+
+  for (auto r : drop) {
+    cache_.Release(r);
+  }
+
+  // See the TODO in HugeCache::MaybeGrowCacheLimit; without this delay,
+  // the above fragmentation plays merry havoc with our instrumentation.
+  Advance(absl::Seconds(30));
+
+  // Test that our cache can grow to fit a working set.
+  HugeLength hot_set_sizes[] = {NHugePages(5), NHugePages(10), NHugePages(100),
+                                NHugePages(10000)};
+
+  for (const HugeLength hot : hot_set_sizes) {
+    SCOPED_TRACE(absl::StrCat("cache size = ", hot.in_bytes() / 1024.0 / 1024.0,
+                              " MiB"));
+    // Exercise the cache allocating about <hot> worth of data. After
+    // a brief warmup phase, we should do this without needing to back much.
+    // Returns {hugepages that had to be freshly backed, hugepages requested}.
+    auto alloc = [&]() -> std::pair<HugeLength, HugeLength> {
+      HugeLength got = NHugePages(0);
+      HugeLength needed_backing = NHugePages(0);
+      std::vector<HugeRange> items;
+      while (got < hot) {
+        HugeLength rest = hot - got;
+        HugeLength l = std::min(rest, NHugePages(sizes(rng)));
+        got += l;
+        items.push_back(cache_.Get(l, &released));
+        if (released) needed_backing += l;
+      }
+      for (auto r : items) {
+        cache_.Release(r);
+      }
+      return {needed_backing, got};
+    };
+
+    // warmup - we're allowed to incur misses and be too big.
+    for (int i = 0; i < 2; ++i) {
+      alloc();
+    }
+
+    HugeLength needed_backing = NHugePages(0);
+    HugeLength total = NHugePages(0);
+    for (int i = 0; i < 16; ++i) {
+      auto r = alloc();
+      needed_backing += r.first;
+      total += r.second;
+      // Cache shouldn't have just grown arbitrarily
+      const HugeLength cached = cache_.size();
+      // Allow us 10% slop, but don't get out of bed for tiny caches anyway.
+      const double ratio = Frac(cached, hot);
+      SCOPED_TRACE(
+          absl::StrCat(cached.raw_num(), "hps ", Frac(r.first, r.second)));
+      if (ratio > 1 && cached > NHugePages(16)) {
+        EXPECT_LE(ratio, 1.1);
+      }
+    }
+    // approximately, given the randomized sizing...
+
+    // At most 30% of the requested hugepages may have needed fresh backing.
+    const double ratio = Frac(needed_backing, total);
+    EXPECT_LE(ratio, 0.3);
+  }
+}
+
+// A working set that grows and shrinks only very slowly should *not* leave
+// its large variation sitting in the cache.
+TEST_F(HugeCacheTest, SlowGrowthUncached) {
+  EXPECT_CALL(*mock_, Unback(testing::_, testing::_))
+      .WillRepeatedly(Return(true));
+
+  absl::BitGen rng;
+  std::uniform_int_distribution<size_t> sizes(1, 10);
+  const absl::Duration step = absl::Milliseconds(600);
+  for (int round = 0; round < 20; ++round) {
+    // Grow slowly: one allocation per step.
+    std::vector<HugeRange> held;
+    for (int k = 0; k < 20; ++k) {
+      Advance(step);
+      bool released;
+      const HugeLength n = NHugePages(sizes(rng));
+      held.push_back(cache_.Get(n, &released));
+    }
+    // Shrink just as slowly, recording the largest cache size seen.
+    HugeLength max_cached = NHugePages(0);
+    for (const HugeRange& r : held) {
+      Advance(step);
+      cache_.Release(r);
+      max_cached = std::max(max_cached, cache_.size());
+    }
+    EXPECT_GE(NHugePages(10), max_cached);
+  }
+}
+
+// A rare, very large burst of usage should not be retained in the cache.
+TEST_F(HugeCacheTest, SpikesUncached) {
+  EXPECT_CALL(*mock_, Unback(testing::_, testing::_))
+      .WillRepeatedly(Return(true));
+
+  absl::BitGen rng;
+  std::uniform_int_distribution<size_t> sizes(1, 10);
+  for (int round = 0; round < 20; ++round) {
+    // Spike: grab a large number of ranges back to back...
+    std::vector<HugeRange> spike;
+    for (int k = 0; k < 2000; ++k) {
+      bool released;
+      const HugeLength n = NHugePages(sizes(rng));
+      spike.push_back(cache_.Get(n, &released));
+    }
+    // ...then hand them all back, watching how much the cache holds on to.
+    HugeLength max_cached = NHugePages(0);
+    for (const HugeRange& r : spike) {
+      cache_.Release(r);
+      max_cached = std::max(max_cached, cache_.size());
+    }
+    EXPECT_GE(NHugePages(10), max_cached);
+    Advance(absl::Seconds(30));
+  }
+}
+
+// A rare, very large *drop* in usage, on the other hand, *should* be cached.
+TEST_F(HugeCacheTest, DipsCached) {
+  absl::BitGen rng;
+  std::uniform_int_distribution<size_t> sizes(1, 10);
+  for (int round = 0; round < 20; ++round) {
+    std::vector<HugeRange> held;
+    HugeLength requested = NHugePages(0);
+    HugeLength missed = NHugePages(0);
+    for (int k = 0; k < 2000; ++k) {
+      bool released;
+      const HugeLength n = NHugePages(sizes(rng));
+      held.push_back(cache_.Get(n, &released));
+      requested += n;
+      if (released) {
+        missed += n;
+      }
+    }
+    // Most of our time is at high usage...
+    Advance(absl::Seconds(30));
+    // Now immediately release and reallocate.
+    for (const HugeRange& r : held) {
+      cache_.Release(r);
+    }
+
+    // The first two rounds are warmup and are not checked.
+    if (round < 2) continue;
+    EXPECT_GE(0.07, Frac(missed, requested));
+  }
+}
+
+// If an earlier phase of behavior needed a giant cache but the current one
+// does not, the cache limit should come back down promptly.
+TEST_F(HugeCacheTest, Shrink) {
+  absl::BitGen rng;
+  std::uniform_int_distribution<size_t> sizes(1, 10);
+  // Phase 1: a large working set, allocated and released in bulk.
+  for (int round = 0; round < 20; ++round) {
+    std::vector<HugeRange> held;
+    for (int k = 0; k < 2000; ++k) {
+      const HugeLength n = NHugePages(sizes(rng));
+      bool released;
+      held.push_back(cache_.Get(n, &released));
+    }
+    for (const HugeRange& r : held) {
+      cache_.Release(r);
+    }
+  }
+
+  ASSERT_LE(NHugePages(10000), cache_.size());
+
+  // Phase 2: thirty seconds of a much smaller working set.
+  for (int tick = 0; tick < 30; ++tick) {
+    // New working set <= 20 pages.
+    Advance(absl::Seconds(1));
+
+    // And do some work.
+    for (int k = 0; k < 100; ++k) {
+      bool released;
+      const HugeLength first_len = NHugePages(sizes(rng));
+      HugeRange first = cache_.Get(first_len, &released);
+      const HugeLength second_len = NHugePages(sizes(rng));
+      HugeRange second = cache_.Get(second_len, &released);
+      cache_.Release(first);
+      cache_.Release(second);
+    }
+  }
+
+  ASSERT_GE(NHugePages(25), cache_.limit());
+}
+
+// usage() must follow the sum of outstanding Gets, whether ranges come back
+// through Release or ReleaseUnbacked.
+TEST_F(HugeCacheTest, Usage) {
+  bool released;
+  const HugeLength ten = NHugePages(10);
+  const HugeLength hundred = NHugePages(100);
+
+  HugeRange ten_pages = cache_.Get(ten, &released);
+  EXPECT_EQ(NHugePages(10), cache_.usage());
+
+  HugeRange hundred_pages = cache_.Get(hundred, &released);
+  EXPECT_EQ(NHugePages(110), cache_.usage());
+
+  cache_.Release(ten_pages);
+  EXPECT_EQ(NHugePages(100), cache_.usage());
+
+  // Pretend we unbacked this.
+  cache_.ReleaseUnbacked(hundred_pages);
+  EXPECT_EQ(NHugePages(0), cache_.usage());
+}
+
+// Fixture providing a purely synthetic clock: it only moves when a test
+// calls Advance().
+class MinMaxTrackerTest : public testing::Test {
+ protected:
+  // Tick rate of the fake clock.
+  static double GetFakeClockFrequency() {
+    return absl::ToDoubleNanoseconds(absl::Seconds(2));
+  }
+
+  // Current reading of the fake clock, in ticks.
+  static int64_t FakeClock() { return clock_; }
+
+  // Moves the fake clock forward by <d>, converted to ticks.
+  void Advance(absl::Duration d) {
+    const double ticks = absl::ToDoubleSeconds(d) * GetFakeClockFrequency();
+    clock_ += ticks;
+  }
+
+ private:
+  static int64_t clock_;
+};
+
+int64_t MinMaxTrackerTest::clock_{0};
+
+// Exercises MinMaxTracker with a fake clock: extremes within one epoch, then
+// across epochs as time advances, then after the whole window has expired.
+TEST_F(MinMaxTrackerTest, Works) {
+  const absl::Duration kDuration = absl::Seconds(2);
+  MinMaxTracker<> tracker{
+      Clock{.now = FakeClock, .freq = GetFakeClockFrequency}, kDuration};
+
+  // No time passes between these reports, so min and max simply accumulate.
+  tracker.Report(NHugePages(0));
+  EXPECT_EQ(NHugePages(0), tracker.MaxOverTime(kDuration));
+  EXPECT_EQ(NHugePages(0), tracker.MinOverTime(kDuration));
+
+  tracker.Report(NHugePages(10));
+  EXPECT_EQ(NHugePages(10), tracker.MaxOverTime(kDuration));
+  EXPECT_EQ(NHugePages(0), tracker.MinOverTime(kDuration));
+
+  tracker.Report(NHugePages(5));
+  EXPECT_EQ(NHugePages(10), tracker.MaxOverTime(kDuration));
+  EXPECT_EQ(NHugePages(0), tracker.MinOverTime(kDuration));
+
+  tracker.Report(NHugePages(100));
+  EXPECT_EQ(NHugePages(100), tracker.MaxOverTime(kDuration));
+  EXPECT_EQ(NHugePages(0), tracker.MinOverTime(kDuration));
+
+  // Some tests for advancing time
+  // A 1ns query only sees the latest epoch; longer queries still include the
+  // earlier reports.
+  Advance(kDuration / 3);
+  tracker.Report(NHugePages(2));
+  EXPECT_EQ(NHugePages(2), tracker.MaxOverTime(absl::Nanoseconds(1)));
+  EXPECT_EQ(NHugePages(100), tracker.MaxOverTime(kDuration / 2));
+  EXPECT_EQ(NHugePages(100), tracker.MaxOverTime(kDuration));
+  EXPECT_EQ(NHugePages(2), tracker.MinOverTime(absl::Nanoseconds(1)));
+  EXPECT_EQ(NHugePages(0), tracker.MinOverTime(kDuration / 2));
+  EXPECT_EQ(NHugePages(0), tracker.MinOverTime(kDuration));
+
+  // The initial reports are now 2/3 of the window old: outside a
+  // half-window query but still inside a full-window one.
+  Advance(kDuration / 3);
+  tracker.Report(NHugePages(5));
+  EXPECT_EQ(NHugePages(5), tracker.MaxOverTime(absl::Nanoseconds(1)));
+  EXPECT_EQ(NHugePages(5), tracker.MaxOverTime(kDuration / 2));
+  EXPECT_EQ(NHugePages(100), tracker.MaxOverTime(kDuration));
+  EXPECT_EQ(NHugePages(5), tracker.MinOverTime(absl::Nanoseconds(1)));
+  EXPECT_EQ(NHugePages(2), tracker.MinOverTime(kDuration / 2));
+  EXPECT_EQ(NHugePages(0), tracker.MinOverTime(kDuration));
+
+  // This should annihilate everything.
+  Advance(kDuration * 2);
+  tracker.Report(NHugePages(1));
+  EXPECT_EQ(NHugePages(1), tracker.MaxOverTime(absl::Nanoseconds(1)));
+  EXPECT_EQ(NHugePages(1), tracker.MinOverTime(absl::Nanoseconds(1)));
+  EXPECT_EQ(NHugePages(1), tracker.MaxOverTime(kDuration));
+  EXPECT_EQ(NHugePages(1), tracker.MinOverTime(kDuration));
+}
+
+}  // namespace
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
--- a/src/third_party/tcmalloc/dist/tcmalloc/huge_page_aware_allocator.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/huge_page_aware_allocator.cc
@ -0,0 +1,847 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tcmalloc/huge_page_aware_allocator.h"
+
+#include <stdint.h>
+#include <string.h>
+
+#include <new>
+
+#include "absl/base/internal/cycleclock.h"
+#include "absl/base/internal/spinlock.h"
+#include "absl/base/thread_annotations.h"
+#include "absl/time/time.h"
+#include "tcmalloc/common.h"
+#include "tcmalloc/experiment.h"
+#include "tcmalloc/experiment_config.h"
+#include "tcmalloc/huge_allocator.h"
+#include "tcmalloc/huge_page_filler.h"
+#include "tcmalloc/huge_pages.h"
+#include "tcmalloc/internal/environment.h"
+#include "tcmalloc/internal/lifetime_predictions.h"
+#include "tcmalloc/internal/logging.h"
+#include "tcmalloc/internal/optimization.h"
+#include "tcmalloc/internal/prefetch.h"
+#include "tcmalloc/lifetime_based_allocator.h"
+#include "tcmalloc/pagemap.h"
+#include "tcmalloc/parameters.h"
+#include "tcmalloc/span.h"
+#include "tcmalloc/static_vars.h"
+#include "tcmalloc/stats.h"
+#include "tcmalloc/system-alloc.h"
+
+GOOGLE_MALLOC_SECTION_BEGIN
+namespace tcmalloc {
+namespace tcmalloc_internal {
+
+bool decide_want_hpaa();
+ABSL_ATTRIBUTE_WEAK int default_want_hpaa();
+ABSL_ATTRIBUTE_WEAK int default_subrelease();
+
+// Decides whether subrelease should be enabled.
+//
+// Order of precedence:
+//   1. If decide_want_hpaa() is false, subrelease is off.
+//   2. TCMALLOC_HPAA_CONTROL, when set: '1' disables, '2' enables, '0' disables
+//      only if default_want_hpaa() is linked in and returns a negative value
+//      (otherwise a message is logged and the decision falls through); any
+//      other leading character crashes.
+//   3. default_subrelease(), when linked in and non-zero: its sign decides.
+//   4. Otherwise subrelease is on.
+bool decide_subrelease() {
+  // Subrelease is off if HPAA is off.
+  if (!decide_want_hpaa()) return false;
+
+  const char* control = thread_safe_getenv("TCMALLOC_HPAA_CONTROL");
+  if (control != nullptr) {
+    const char mode = control[0];
+    if (mode == '1') return false;
+    if (mode == '2') return true;
+    if (mode != '0') {
+      Crash(kCrash, __FILE__, __LINE__, "bad env var", control);
+      return false;
+    }
+
+    // mode == '0': honoured only when the weak default_want_hpaa() hook is
+    // present and reports a negative value.
+    if (default_want_hpaa != nullptr && default_want_hpaa() < 0) {
+      return false;
+    }
+    Log(kLog, __FILE__, __LINE__,
+        "Runtime opt-out from HPAA requires building with "
+        "//tcmalloc:want_no_hpaa.");
+  }
+
+  // Consult the optional weak hook; a zero result defers to the default below.
+  if (default_subrelease != nullptr) {
+    const int verdict = default_subrelease();
+    if (verdict > 0) return true;
+    if (verdict < 0) return false;
+  }
+
+  return true;
+}
+
+// Selects the filler's partial re-release policy from the
+// TCMALLOC_PARTIAL_RELEASE_CONTROL environment variable: a leading '0' yields
+// Return, a leading '1' yields Retain, and any other value is reported through
+// Crash().  Retain is the result when the variable is unset.
+FillerPartialRerelease decide_partial_rerelease() {
+  const char* setting = thread_safe_getenv("TCMALLOC_PARTIAL_RELEASE_CONTROL");
+  if (setting != nullptr) {
+    switch (setting[0]) {
+      case '0':
+        return FillerPartialRerelease::Return;
+      case '1':
+        return FillerPartialRerelease::Retain;
+      default:
+        Crash(kCrash, __FILE__, __LINE__, "bad env var", setting);
+    }
+  }
+
+  return FillerPartialRerelease::Retain;
+}
+
+// Builds the lifetime-prediction options from the TCMALLOC_LIFETIMES_CONTROL
+// environment variable, or returns the defaults when it is unset.
+LifetimePredictionOptions decide_lifetime_predictions() {
+  // See LifetimePredictionOptions::FromFlag for a description of the format.
+  const char* flag = thread_safe_getenv("TCMALLOC_LIFETIMES_CONTROL");
+  if (flag == nullptr) {
+    return LifetimePredictionOptions::Default();
+  }
+  return LifetimePredictionOptions::FromFlag(flag);
+}
+
+// Returns kAbandonedCount when either "use huge regions more often" experiment
+// is active, and kSlack otherwise.
+HugeRegionCountOption use_huge_region_for_often() {
+  const bool experiment_on =
+      IsExperimentActive(
+          Experiment::TEST_ONLY_TCMALLOC_USE_HUGE_REGIONS_MORE_OFTEN) ||
+      IsExperimentActive(Experiment::TCMALLOC_USE_HUGE_REGIONS_MORE_OFTEN);
+  if (experiment_on) {
+    return HugeRegionCountOption::kAbandonedCount;
+  }
+  return HugeRegionCountOption::kSlack;
+}
+
+// Some notes: locking discipline here is a bit funny, because
+// we want to *not* hold the pageheap lock while backing memory.
+
+// We have here a collection of slightly different allocators each
+// optimized for slightly different purposes.  This file has two main purposes:
+// - pick the right one for a given allocation
+// - provide enough data to figure out what we picked last time!
+
+// Constructs an allocator for `tag`, taking the huge-region policy from
+// use_huge_region_for_often() and the lifetime options from
+// decide_lifetime_predictions().
+HugePageAwareAllocator::HugePageAwareAllocator(MemoryTag tag)
+    : HugePageAwareAllocator(tag, use_huge_region_for_often(),
+                             decide_lifetime_predictions()) {}
+
+// Same, but with a caller-supplied huge-region policy; lifetime options still
+// come from decide_lifetime_predictions().
+HugePageAwareAllocator::HugePageAwareAllocator(
+    MemoryTag tag, HugeRegionCountOption use_huge_region_more_often)
+    : HugePageAwareAllocator(tag, use_huge_region_more_often,
+                             decide_lifetime_predictions()) {}
+
+// Fully-specified constructor that the other overloads delegate to.  Wires up
+// the filler, the huge allocator, the huge cache and the lifetime-based
+// allocator, then initialises the tracker and region metadata allocators from
+// the global arena.
+HugePageAwareAllocator::HugePageAwareAllocator(
+    MemoryTag tag, HugeRegionCountOption use_huge_region_more_often,
+    LifetimePredictionOptions lifetime_options)
+    : PageAllocatorInterface("HugePageAware", tag),
+      filler_(decide_partial_rerelease(),
+              Parameters::separate_allocs_for_few_and_many_objects_spans(),
+              MemoryModifyFunction(SystemRelease)),
+      alloc_(
+          // Map the runtime tag onto the matching AllocAndReport<>
+          // instantiation; any tag not listed is treated as unreachable.
+          [](MemoryTag tag) {
+            // TODO(ckennelly): Remove the template parameter.
+            switch (tag) {
+              case MemoryTag::kNormal:
+                return AllocAndReport<MemoryTag::kNormal>;
+              case MemoryTag::kNormalP1:
+                return AllocAndReport<MemoryTag::kNormalP1>;
+              case MemoryTag::kSampled:
+                return AllocAndReport<MemoryTag::kSampled>;
+              case MemoryTag::kCold:
+                return AllocAndReport<MemoryTag::kCold>;
+              default:
+                ASSUME(false);
+                __builtin_unreachable();
+            }
+          }(tag),
+          MetaDataAlloc),
+      // The cache unbacks memory through UnbackWithoutLock, which drops
+      // pageheap_lock around the system call.
+      cache_(HugeCache{&alloc_, MetaDataAlloc,
+                       MemoryModifyFunction(UnbackWithoutLock)}),
+      lifetime_allocator_region_alloc_(this),
+      lifetime_allocator_(lifetime_options, &lifetime_allocator_region_alloc_),
+      use_huge_region_more_often_(use_huge_region_more_often) {
+  tracker_allocator_.Init(&tc_globals.arena());
+  region_allocator_.Init(&tc_globals.arena());
+}
+
+// Looks up the filler tracker registered in the pagemap for hugepage `p`.
+// Returns nullptr when no tracker is registered.
+HugePageAwareAllocator::FillerType::Tracker* HugePageAwareAllocator::GetTracker(
+    HugePage p) {
+  void* entry = tc_globals.pagemap().GetHugepage(p.first_page());
+  auto* tracker = reinterpret_cast<FillerType::Tracker*>(entry);
+  // A registered tracker must describe the hugepage it is registered under.
+  ASSERT(tracker == nullptr || tracker->location() == p);
+  return tracker;
+}
+
+// Registers `pt` in the pagemap as the tracker for hugepage `p`.  Callers pass
+// nullptr to clear the entry.
+void HugePageAwareAllocator::SetTracker(
+    HugePage p, HugePageAwareAllocator::FillerType::Tracker* pt) {
+  tc_globals.pagemap().SetHugepage(p.first_page(), pt);
+}
+
+// Creates a tracker for the fresh hugepage `p`, takes the first `n` pages from
+// it, registers the tracker in the pagemap and contributes it to the filler.
+// `donated` is recorded on the tracker and forwarded to the filler.  Returns
+// the first page of the range that was taken, which is p.first_page().
+PageId HugePageAwareAllocator::AllocAndContribute(HugePage p, Length n,
+                                                  size_t num_objects,
+                                                  bool donated) {
+  CHECK_CONDITION(p.start_addr() != nullptr);
+  // Storage comes from the tracker allocator; construct the tracker in place.
+  FillerType::Tracker* pt = tracker_allocator_.New();
+  new (pt)
+      FillerType::Tracker(p, absl::base_internal::CycleClock::Now(), donated);
+  ASSERT(pt->longest_free_range() >= n);
+  ASSERT(pt->was_donated() == donated);
+  // if the page was donated, we track its size so that we can potentially
+  // measure it in abandoned_count_ once this large allocation gets deallocated.
+  if (pt->was_donated()) {
+    pt->set_abandoned_count(n);
+  }
+  PageId page = pt->Get(n).page;
+  ASSERT(page == p.first_page());
+  SetTracker(p, pt);
+  filler_.Contribute(pt, donated, num_objects);
+  ASSERT(pt->was_donated() == donated);
+  return page;
+}
+
+// Pulls one hugepage from the cache, hands it to the filler and allocates `n`
+// pages from it.  Returns PageId{0} when the cache cannot supply a hugepage.
+PageId HugePageAwareAllocator::RefillFiller(Length n, size_t num_objects,
+                                            bool* from_released) {
+  HugeRange fresh = cache_.Get(NHugePages(1), from_released);
+  if (fresh.valid()) {
+    // This is duplicate to Finalize, but if we need to break up
+    // hugepages to get to our usage limit it would be very bad to break
+    // up what's left of the new hugepage after we allocate from there--while
+    // it is mostly empty, clearly what's left in the filler is too fragmented
+    // to be very useful, and we would rather release those
+    // pages. Otherwise, we're nearly guaranteed to release the new hugepage
+    // (if n isn't very large), and the next allocation will just repeat this
+    // process.
+    tc_globals.page_allocator().ShrinkToUsageLimit(n);
+    return AllocAndContribute(fresh.start(), n, num_objects,
+                              /*donated=*/false);
+  }
+  return PageId{0};
+}
+
+// Wraps the `n` pages starting at `page` in a new Span, stores that span in
+// the pagemap entry for `page`, records the allocation in the usage
+// statistics, and asks the page allocator to shrink to its usage limit.
+// `page` must not be PageId{0}.  Requires pageheap_lock.
+Span* HugePageAwareAllocator::Finalize(Length n, size_t num_objects,
+                                       PageId page)
+    ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
+  ASSERT(page != PageId{0});
+  Span* ret = Span::New(page, n);
+  tc_globals.pagemap().Set(page, ret);
+  ASSERT(!ret->sampled());
+  info_.RecordAlloc(page, n, num_objects);
+  tc_globals.page_allocator().ShrinkToUsageLimit(n);
+  return ret;
+}
+
+// Small allocations (at most half a hugepage) always go through the filler so
+// they are packed onto a single hugepage.  When the filler has no room, one
+// more hugepage is pulled in via RefillFiller; nullptr is returned if that
+// fails.  `*from_released` is set to false when the filler satisfies the
+// request directly.
+Span* HugePageAwareAllocator::AllocSmall(Length n, size_t objects_per_span,
+                                         bool* from_released) {
+  auto [tracker, page] = filler_.TryGet(n, objects_per_span);
+  if (ABSL_PREDICT_FALSE(tracker == nullptr)) {
+    // Nothing suitable in the filler: bring in a fresh hugepage.
+    page = RefillFiller(n, objects_per_span, from_released);
+    if (ABSL_PREDICT_FALSE(page == PageId{0})) {
+      return nullptr;
+    }
+  } else {
+    *from_released = false;
+  }
+  return Finalize(n, objects_per_span, page);
+}
+
+// Allocates `n` pages for a request that LockAndAlloc classified as too big
+// for AllocSmall but no larger than a HugeRegion.  Sources are tried in this
+// order: raw hugepages (exact hugepage multiples only), the filler (requests
+// below one hugepage), the lifetime-based allocator, existing regions, and
+// finally either raw hugepages or a newly added region depending on the
+// slack/abandoned-page heuristics below.  Returns nullptr if the chosen
+// backing source cannot satisfy the request.
+Span* HugePageAwareAllocator::AllocLarge(Length n, size_t objects_per_span,
+                                         bool* from_released,
+                                         LifetimeStats* lifetime_context) {
+  // If it's an exact page multiple, just pull it from pages directly.
+  HugeLength hl = HLFromPages(n);
+  if (hl.in_pages() == n) {
+    return AllocRawHugepages(n, objects_per_span, from_released);
+  }
+
+  PageId page;
+  // If we fit in a single hugepage, try the Filler first.
+  if (n < kPagesPerHugePage) {
+    // Note: this structured binding declares a new `page` that shadows the
+    // outer one for the duration of this block.
+    auto [pt, page] = filler_.TryGet(n, objects_per_span);
+    if (ABSL_PREDICT_TRUE(pt != nullptr)) {
+      *from_released = false;
+      return Finalize(n, objects_per_span, page);
+    }
+  }
+
+  // Try to perform a lifetime-based allocation.
+  LifetimeBasedAllocator::AllocationResult lifetime =
+      lifetime_allocator_.MaybeGet(n, from_released, lifetime_context);
+
+  // TODO(mmaas): Implement tracking if this is subsequently put into a
+  // conventional region (currently ignored).
+
+  // Was an object allocated in the lifetime region? If so, we return it.
+  if (lifetime.TryGetAllocation(&page)) {
+    return Finalize(n, objects_per_span, page);
+  }
+
+  // If we're using regions in this binary (see below comment), is
+  // there currently available space there?
+  if (regions_.MaybeGet(n, &page, from_released)) {
+    return Finalize(n, objects_per_span, page);
+  }
+
+  // We have two choices here: allocate a new region or go to
+  // hugepages directly (hoping that slack will be filled by small
+  // allocation.) The second strategy is preferable, as it's
+  // typically faster and usually more space efficient, but it's sometimes
+  // catastrophic.
+  //
+  // See https://github.com/google/tcmalloc/tree/master/docs/regions-are-not-optional.md
+  //
+  // So test directly if we're in the bad case--almost no binaries are.
+  // If not, just fall back to direct allocation (and hope we do hit that case!)
+  const Length slack = info_.slack();
+  const Length donated =
+      UseHugeRegionMoreOften() ? abandoned_pages_ + slack : slack;
+  // Don't bother at all until the binary is reasonably sized.
+  if (donated < HLFromBytes(64 * 1024 * 1024).in_pages()) {
+    return AllocRawHugepagesAndMaybeTrackLifetime(n, objects_per_span, lifetime,
+                                                  from_released);
+  }
+
+  // In the vast majority of binaries, we have many small allocations which
+  // will nicely fill slack.  (Fleetwide, the average ratio is 15:1; only
+  // a handful of binaries fall below 1:1.)
+  //
+  // If we enable an experiment that tries to use huge regions more frequently,
+  // we skip the check.
+  const Length small = info_.small();
+  if (slack < small && !UseHugeRegionMoreOften()) {
+    return AllocRawHugepagesAndMaybeTrackLifetime(n, objects_per_span, lifetime,
+                                                  from_released);
+  }
+
+  // We couldn't allocate a new region. They're oversized, so maybe we'd get
+  // lucky with a smaller request?
+  if (!AddRegion()) {
+    return AllocRawHugepagesAndMaybeTrackLifetime(n, objects_per_span, lifetime,
+                                                  from_released);
+  }
+
+  // A region was just added, so this request is required to succeed.
+  CHECK_CONDITION(regions_.MaybeGet(n, &page, from_released));
+  return Finalize(n, objects_per_span, page);
+}
+
+// Handles requests larger than a HugeRegion (see LockAndAlloc) by allocating a
+// run of raw hugepages directly.
+Span* HugePageAwareAllocator::AllocEnormous(Length n, size_t objects_per_span,
+                                            bool* from_released) {
+  return AllocRawHugepages(n, objects_per_span, from_released);
+}
+
+// Allocates HLFromPages(n) hugepages from the cache and returns a span of `n`
+// pages starting at the first of them, or nullptr if the cache cannot supply
+// the range.  When `n` is not a whole number of hugepages, the last hugepage
+// is contributed to the filler as a donated page (its in-use prefix counted as
+// an allocation) and the returned span is marked donated.
+Span* HugePageAwareAllocator::AllocRawHugepages(Length n, size_t num_objects,
+                                                bool* from_released) {
+  HugeLength hl = HLFromPages(n);
+
+  HugeRange r = cache_.Get(hl, from_released);
+  if (!r.valid()) return nullptr;
+
+  // We now have a huge page range that covers our request.  There
+  // might be some slack in it if n isn't a multiple of
+  // kPagesPerHugePage. Add the hugepage with slack to the filler,
+  // pretending the non-slack portion is a smaller allocation.
+  Length total = hl.in_pages();
+  Length slack = total - n;
+  HugePage first = r.start();
+  // Clear the tracker entry for the first hugepage; Delete() relies on
+  // GetTracker() returning nullptr to recognise raw-hugepage spans.
+  SetTracker(first, nullptr);
+  HugePage last = first + r.len() - NHugePages(1);
+  if (slack == Length(0)) {
+    SetTracker(last, nullptr);
+    return Finalize(total, num_objects, r.start().first_page());
+  }
+
+  ++donated_huge_pages_;
+
+  // Number of pages of the request that land on the last hugepage.
+  Length here = kPagesPerHugePage - slack;
+  ASSERT(here > Length(0));
+  AllocAndContribute(last, here, num_objects, /*donated=*/true);
+  Span* span = Finalize(n, num_objects, r.start().first_page());
+  span->set_donated(/*value=*/true);
+  return span;
+}
+
+// Allocates raw hugepages exactly like AllocRawHugepages and, on success,
+// offers the tracker of the span's last hugepage to the lifetime allocator so
+// that the lifetime of the donating allocation can be observed.  Returns
+// nullptr when the underlying allocation fails.
+Span* HugePageAwareAllocator::AllocRawHugepagesAndMaybeTrackLifetime(
+    Length n, size_t num_objects,
+    const LifetimeBasedAllocator::AllocationResult& lifetime_alloc,
+    bool* from_released) ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
+  Span* span = AllocRawHugepages(n, num_objects, from_released);
+  if (span == nullptr) {
+    return nullptr;
+  }
+
+  // If this is an object with a lifetime prediction and led to a donation,
+  // add it to the tracker so that we can track its lifetime.
+  FillerType::Tracker* tracker =
+      GetTracker(HugePageContaining(span->last_page()));
+  ASSERT(tracker != nullptr);
+
+  // The allocator may shrink the heap in response to allocations, which may
+  // cause the page to be subreleased and not donated anymore once we get
+  // here. If it still is, we attach a lifetime tracker (if enabled).
+  if (ABSL_PREDICT_TRUE(tracker->donated())) {
+    lifetime_allocator_.MaybeAddTracker(lifetime_alloc,
+                                        tracker->lifetime_tracker());
+  }
+
+  return span;
+}
+
+// Calls SystemBack over the full byte range covered by `span`.
+static void BackSpan(Span* span) {
+  SystemBack(span->start_address(), span->bytes_in_span());
+}
+
+// public
+// Allocates a span of `n` pages (`n` must be positive).  Returns nullptr when
+// the allocation fails.  If the allocation path reports that the memory came
+// from released pages, the span is re-backed with BackSpan after LockAndAlloc
+// has returned.
+Span* HugePageAwareAllocator::New(Length n, size_t objects_per_span) {
+  CHECK_CONDITION(n > Length(0));
+  bool from_released;
+  Span* s = LockAndAlloc(n, objects_per_span, &from_released);
+  if (s) {
+    // Prefetch for writing, as we anticipate using the memory soon.
+    PrefetchW(s->start_address());
+    // TODO(b/256233439):  Improve accuracy of from_released value.  The filler
+    // may have subreleased pages and is returning them now.
+    if (from_released) BackSpan(s);
+  }
+  ASSERT(!s || GetMemoryTag(s->start_address()) == tag_);
+  return s;
+}
+
+// Takes pageheap_lock and dispatches on the request size:
+//   n <= kPagesPerHugePage / 2          -> AllocSmall
+//   n <= HugeRegion::size().in_pages()  -> AllocLarge
+//   otherwise                           -> AllocEnormous
+// The lock is held until the chosen allocation path returns.
+Span* HugePageAwareAllocator::LockAndAlloc(Length n, size_t objects_per_span,
+                                           bool* from_released) {
+  // Check whether we may perform lifetime-based allocation, and if so, collect
+  // the allocation context without holding the lock.
+  LifetimeStats* lifetime_ctx = lifetime_allocator_.CollectLifetimeContext(n);
+
+  absl::base_internal::SpinLockHolder h(&pageheap_lock);
+  // Our policy depends on size.  For small things, we will pack them
+  // into single hugepages.
+  if (n <= kPagesPerHugePage / 2) {
+    return AllocSmall(n, objects_per_span, from_released);
+  }
+
+  // For anything too big for the filler, we use either a direct hugepage
+  // allocation, or possibly the regions if we are worried about slack.
+  if (n <= HugeRegion::size().in_pages()) {
+    return AllocLarge(n, objects_per_span, from_released, lifetime_ctx);
+  }
+
+  // In the worst case, we just fall back to directly allocating a run
+  // of hugepages.
+  return AllocEnormous(n, objects_per_span, from_released);
+}
+
+// public
+// Allocates `n` pages with an alignment of `align` pages.  Alignments of at
+// most one page are forwarded to New().  Larger alignments must not exceed
+// kPagesPerHugePage and are served from raw hugepages.
+// NOTE(review): this presumably relies on hugepage-aligned memory satisfying
+// any alignment up to a hugepage -- confirm.
+Span* HugePageAwareAllocator::NewAligned(Length n, Length align,
+                                         size_t objects_per_span) {
+  if (align <= Length(1)) {
+    return New(n, objects_per_span);
+  }
+
+  // we can do better than this, but...
+  // TODO(b/134690769): support higher align.
+  CHECK_CONDITION(align <= kPagesPerHugePage);
+  bool from_released;
+  Span* s;
+  {
+    // Hold the lock only for the allocation itself; re-backing happens after.
+    absl::base_internal::SpinLockHolder h(&pageheap_lock);
+    s = AllocRawHugepages(n, objects_per_span, &from_released);
+  }
+  if (s && from_released) BackSpan(s);
+  ASSERT(!s || GetMemoryTag(s->start_address()) == tag_);
+  return s;
+}
+
+// Returns the `n` pages starting at `p` to tracker `pt` through the filler.
+// If the filler keeps the hugepage (Put returns nullptr) and `might_abandon`
+// is set, the tracker's abandoned count is added to abandoned_pages_ and the
+// tracker is flagged abandoned.  If the filler hands the tracker back, the
+// donation/abandonment counters are unwound and the hugepage is released.
+void HugePageAwareAllocator::DeleteFromHugepage(FillerType::Tracker* pt,
+                                                PageId p, Length n,
+                                                size_t num_objects,
+                                                bool might_abandon) {
+  if (ABSL_PREDICT_TRUE(filler_.Put(pt, p, n, num_objects) == nullptr)) {
+    // If this allocation had resulted in a donation to the filler, we record
+    // these pages as abandoned.
+    if (ABSL_PREDICT_FALSE(might_abandon)) {
+      ASSERT(pt->was_donated());
+      abandoned_pages_ += pt->abandoned_count();
+      pt->set_abandoned(true);
+    }
+    return;
+  }
+  // From here on the filler has returned the tracker to us.
+  if (pt->was_donated()) {
+    --donated_huge_pages_;
+    if (pt->abandoned()) {
+      abandoned_pages_ -= pt->abandoned_count();
+      pt->set_abandoned(false);
+    }
+  } else {
+    ASSERT(pt->abandoned_count() == Length(0));
+  }
+  lifetime_allocator_.MaybePutTracker(pt->lifetime_tracker(), n);
+  ReleaseHugepage(pt);
+}
+
+// Obtains HugeRegion::size() hugepages from the huge allocator, constructs a
+// HugeRegion over them and contributes it to the region set.  Returns false if
+// the huge allocator cannot supply the range.
+bool HugePageAwareAllocator::AddRegion() {
+  HugeRange backing = alloc_.Get(HugeRegion::size());
+  if (backing.valid()) {
+    // Storage comes from the region allocator; construct the region in place.
+    HugeRegion* fresh = region_allocator_.New();
+    new (fresh) HugeRegion(backing, MemoryModifyFunction(SystemRelease));
+    regions_.Contribute(fresh);
+    return true;
+  }
+  return false;
+}
+
+// Frees `span` and returns its pages to whichever component they came from:
+//   a) the filler, when the span's first hugepage has a tracker;
+//   b) a region or the lifetime allocator, when one of them claims the pages;
+//   c) the huge cache, for spans allocated from raw hugepages.  If such a span
+//      donated the tail of its last hugepage to the filler, that tail is
+//      returned to the filler first and the range handed back to the cache is
+//      shortened when the last hugepage is still in use or was subreleased.
+void HugePageAwareAllocator::Delete(Span* span, size_t objects_per_span) {
+  ASSERT(!span || GetMemoryTag(span->start_address()) == tag_);
+  PageId p = span->first_page();
+  HugePage hp = HugePageContaining(p);
+  Length n = span->num_pages();
+  info_.RecordFree(p, n, objects_per_span);
+
+  // Read the donation flag before the span object is destroyed.
+  bool might_abandon = span->donated();
+  Span::Delete(span);
+  // Clear the descriptor of the page so a second pass through the same page
+  // could trigger the check on `span != nullptr` in do_free_pages.
+  tc_globals.pagemap().Set(p, nullptr);
+
+  // The tricky part, as with so many allocators: where did we come from?
+  // There are several possibilities.
+  FillerType::Tracker* pt = GetTracker(hp);
+  // a) We got packed by the filler onto a single hugepage - return our
+  //    allocation to that hugepage in the filler.
+  if (ABSL_PREDICT_TRUE(pt != nullptr)) {
+    ASSERT(hp == HugePageContaining(p + n - Length(1)));
+    DeleteFromHugepage(pt, p, n, objects_per_span, might_abandon);
+    return;
+  }
+
+  // b) We got put into a region, possibly crossing hugepages -
+  //    return our allocation to the region.
+  if (regions_.MaybePut(p, n)) return;
+  if (lifetime_allocator_.MaybePut(p, n)) return;
+
+  // c) we came straight from the HugeCache - return straight there.  (We
+  //    might have had slack put into the filler - if so, return that virtual
+  //    allocation to the filler too!)
+  ASSERT(n >= kPagesPerHugePage);
+  HugeLength hl = HLFromPages(n);
+  HugePage last = hp + hl - NHugePages(1);
+  Length slack = hl.in_pages() - n;
+  if (slack == Length(0)) {
+    ASSERT(GetTracker(last) == nullptr);
+  } else {
+    pt = GetTracker(last);
+    // Validate the tracker before its first dereference; a missing tracker
+    // must fail the check rather than crash on a null pointer.
+    CHECK_CONDITION(pt != nullptr);
+    lifetime_allocator_.MaybePutTracker(pt->lifetime_tracker(), n);
+    ASSERT(pt->was_donated());
+    // We put the slack into the filler (see AllocEnormous.)
+    // Handle this page separately as a virtual allocation
+    // onto the last hugepage.
+    PageId virt = last.first_page();
+    Length virt_len = kPagesPerHugePage - slack;
+    // We may have used the slack, which would prevent us from returning
+    // the entire range now.  If filler returned a Tracker, we are fully empty.
+    if (filler_.Put(pt, virt, virt_len, objects_per_span) == nullptr) {
+      // Last page isn't empty -- pretend the range was shorter.
+      --hl;
+
+      // Note that we abandoned virt_len pages with pt.  These can be reused for
+      // other allocations, but this can contribute to excessive slack in the
+      // filler.
+      abandoned_pages_ += pt->abandoned_count();
+      pt->set_abandoned(true);
+    } else {
+      // Last page was empty - but if we sub-released it, we still
+      // have to split it off and release it independently.)
+      //
+      // We were able to reclaim the donated slack.
+      --donated_huge_pages_;
+      ASSERT(!pt->abandoned());
+
+      if (pt->released()) {
+        --hl;
+        ReleaseHugepage(pt);
+      } else {
+        // Get rid of the tracker *object*, but not the *hugepage* (which is
+        // still part of our range.)
+        SetTracker(pt->location(), nullptr);
+        ASSERT(!pt->lifetime_tracker()->is_tracked());
+        tracker_allocator_.Delete(pt);
+      }
+    }
+  }
+  cache_.Release({hp, hl});
+}
+
+// Gives the hugepage described by `pt` back to the huge cache and frees the
+// tracker object.  The tracker must have no pages in use.  The pagemap entry
+// for the hugepage is cleared first; the hugepage is returned as unbacked when
+// the tracker reports it as released.
+void HugePageAwareAllocator::ReleaseHugepage(FillerType::Tracker* pt) {
+  ASSERT(pt->used_pages() == Length(0));
+  HugeRange r = {pt->location(), NHugePages(1)};
+  SetTracker(pt->location(), nullptr);
+
+  if (pt->released()) {
+    cache_.ReleaseUnbacked(r);
+  } else {
+    cache_.Release(r);
+  }
+
+  ASSERT(!pt->lifetime_tracker()->is_tracked());
+  tracker_allocator_.Delete(pt);
+}
+
+// public
+// Returns the sum of the backing statistics of the huge allocator, cache,
+// filler, regions and (when present) the lifetime allocator's region, with
+// system_bytes reset to the huge allocator's own figure.
+BackingStats HugePageAwareAllocator::stats() const {
+  BackingStats stats = alloc_.stats();
+  // Remember the allocator's own total before the other components are added.
+  const auto actual_system = stats.system_bytes;
+  stats += cache_.stats();
+  stats += filler_.stats();
+  stats += regions_.stats();
+  stats += lifetime_allocator_.GetRegionStats().value_or(BackingStats());
+  // the "system" (total managed) byte count is wildly double counted,
+  // since it all comes from HugeAllocator but is then managed by
+  // cache/regions/filler. Adjust for that.
+  stats.system_bytes = actual_system;
+  return stats;
+}
+
+// public
+// Fills `*result` with small-span statistics only, via GetSpanStats.
+void HugePageAwareAllocator::GetSmallSpanStats(SmallSpanStats* result) {
+  GetSpanStats(result, nullptr, nullptr);
+}
+
+// public
+// Fills `*result` with large-span statistics only, via GetSpanStats.
+void HugePageAwareAllocator::GetLargeSpanStats(LargeSpanStats* result) {
+  GetSpanStats(nullptr, result, nullptr);
+}
+
+// Collects span statistics from the huge allocator, filler, regions and cache
+// into whichever of `small`, `large` and `ages` the caller supplied.  The
+// `small` and `large` outputs, when non-null, are reset before accumulation.
+void HugePageAwareAllocator::GetSpanStats(SmallSpanStats* small,
+                                          LargeSpanStats* large,
+                                          PageAgeHistograms* ages) {
+  // Start the requested outputs from a clean slate.
+  if (small != nullptr) *small = SmallSpanStats();
+  if (large != nullptr) *large = LargeSpanStats();
+
+  // Each component adds its own contribution.
+  alloc_.AddSpanStats(small, large, ages);
+  filler_.AddSpanStats(small, large, ages);
+  regions_.AddSpanStats(small, large, ages);
+  cache_.AddSpanStats(small, large, ages);
+}
+
+// public
+// Tries to release at least `num_pages` pages.  Cached hugepages are released
+// first; if that falls short and the hpaa_subrelease parameter is enabled, the
+// remainder is requested from the filler.  The request and the outcome are
+// recorded in the usage statistics.  Returns the number of pages released.
+Length HugePageAwareAllocator::ReleaseAtLeastNPages(Length num_pages) {
+  Length released;
+  released += cache_.ReleaseCachedPages(HLFromPages(num_pages)).in_pages();
+
+  // This is our long term plan but in current state will lead to insufficient
+  // THP coverage. It is however very useful to have the ability to turn this on
+  // for testing.
+  // TODO(b/134690769): make this work, remove the flag guard.
+  if (Parameters::hpaa_subrelease()) {
+    if (released < num_pages) {
+      released += filler_.ReleasePages(
+          num_pages - released,
+          SkipSubreleaseIntervals{
+              .peak_interval = Parameters::filler_skip_subrelease_interval(),
+              .short_interval =
+                  Parameters::filler_skip_subrelease_short_interval(),
+              .long_interval =
+                  Parameters::filler_skip_subrelease_long_interval()},
+          /*hit_limit*/ false);
+    }
+  }
+
+  // TODO(b/134690769):
+  // - perhaps release region?
+  // - refuse to release if we're too close to zero?
+  info_.RecordRelease(num_pages, released);
+  return released;
+}
+
+// Converts a byte count to mebibytes (1 MiB = 1048576 bytes).
+static double BytesToMiB(size_t bytes) {
+  constexpr double kBytesPerMiB = 1048576.0;
+  return static_cast<double>(bytes) / kBytesPerMiB;
+}
+
+// Prints one line for `label` with the used, free and unmapped sizes of `s` in
+// MiB, where "used" is system bytes minus free and unmapped bytes.
+static void BreakdownStats(Printer* out, const BackingStats& s,
+                           const char* label) {
+  const auto used_bytes = s.system_bytes - s.free_bytes - s.unmapped_bytes;
+  out->printf("%s %6.1f MiB used, %6.1f MiB free, %6.1f MiB unmapped\n", label,
+              BytesToMiB(used_bytes), BytesToMiB(s.free_bytes),
+              BytesToMiB(s.unmapped_bytes));
+}
+
+// Emits a sub-region named `key` under `hpaa` with the "used", "free" and
+// "unmapped" byte counts of `s`, where "used" is system bytes minus free and
+// unmapped bytes.
+static void BreakdownStatsInPbtxt(PbtxtRegion* hpaa, const BackingStats& s,
+                                  const char* key) {
+  auto section = hpaa->CreateSubRegion(key);
+  const auto used_bytes = s.system_bytes - s.free_bytes - s.unmapped_bytes;
+  section.PrintI64("used", used_bytes);
+  section.PrintI64("free", s.free_bytes);
+  section.PrintI64("unmapped", s.unmapped_bytes);
+}
+
+// public
+// Prints the full report; equivalent to Print(out, /*everything=*/true).
+void HugePageAwareAllocator::Print(Printer* out) { Print(out, true); }
+
+// Writes a human-readable report to `out` while holding pageheap_lock: overall
+// stats, a used/free/unmapped breakdown per component, donation counters and
+// the filler's own report.  With `everything` set, the regions, cache,
+// lifetime allocator, huge allocator, usage statistics and age histograms are
+// printed as well.
+void HugePageAwareAllocator::Print(Printer* out, bool everything) {
+  SmallSpanStats small;
+  LargeSpanStats large;
+  BackingStats bstats;
+  PageAgeHistograms ages(absl::base_internal::CycleClock::Now());
+  absl::base_internal::SpinLockHolder h(&pageheap_lock);
+  bstats = stats();
+  GetSpanStats(&small, &large, &ages);
+  PrintStats("HugePageAware", out, bstats, small, large, everything);
+  out->printf(
+      "\nHuge page aware allocator components:\n"
+      "------------------------------------------------\n");
+  out->printf("HugePageAware: breakdown of used / free / unmapped space:\n");
+
+  auto fstats = filler_.stats();
+  BreakdownStats(out, fstats, "HugePageAware: filler  ");
+
+  auto rstats = regions_.stats();
+  BreakdownStats(out, rstats, "HugePageAware: region  ");
+
+  // Report short-lived region allocations when enabled.
+  auto lstats = lifetime_allocator_.GetRegionStats();
+  if (lstats.has_value()) {
+    BreakdownStats(out, lstats.value(), "HugePageAware: lifetime");
+  }
+
+  auto cstats = cache_.stats();
+  // Everything in the filler came from the cache -
+  // adjust the totals so we see the amount used by the mutator.
+  cstats.system_bytes -= fstats.system_bytes;
+  BreakdownStats(out, cstats, "HugePageAware: cache   ");
+
+  auto astats = alloc_.stats();
+  // Everything in *all* components came from here -
+  // so again adjust the totals.
+  astats.system_bytes -=
+      (fstats + rstats + lstats.value_or(BackingStats()) + cstats).system_bytes;
+  BreakdownStats(out, astats, "HugePageAware: alloc   ");
+  out->printf("\n");
+
+  out->printf(
+      "HugePageAware: filler donations %zu (%zu pages from abandoned "
+      "donations)\n",
+      donated_huge_pages_.raw_num(), abandoned_pages_.raw_num());
+
+  // Component debug output
+  // Filler is by far the most important; print (some) of it
+  // unconditionally.
+  filler_.Print(out, everything);
+  out->printf("\n");
+  if (everything) {
+    regions_.Print(out);
+    out->printf("\n");
+    cache_.Print(out);
+    lifetime_allocator_.Print(out);
+    out->printf("\n");
+    alloc_.Print(out);
+    out->printf("\n");
+
+    // Use statistics
+    info_.Print(out);
+
+    // and age tracking.
+    ages.Print("HugePageAware", out);
+  }
+
+  out->printf("PARAMETER hpaa_subrelease %d\n",
+              Parameters::hpaa_subrelease() ? 1 : 0);
+}
+
+// Writes the allocator's statistics into `region` in pbtxt form while holding
+// pageheap_lock: overall span stats, then a "huge_page_allocator" sub-region
+// with per-component usage breakdowns, each component's own output, usage
+// statistics and the donation counters.
+void HugePageAwareAllocator::PrintInPbtxt(PbtxtRegion* region) {
+  SmallSpanStats small;
+  LargeSpanStats large;
+  PageAgeHistograms ages(absl::base_internal::CycleClock::Now());
+  absl::base_internal::SpinLockHolder h(&pageheap_lock);
+  GetSpanStats(&small, &large, &ages);
+  PrintStatsInPbtxt(region, small, large, ages);
+  {
+    auto hpaa = region->CreateSubRegion("huge_page_allocator");
+    hpaa.PrintBool("using_hpaa", true);
+    hpaa.PrintBool("using_hpaa_subrelease", Parameters::hpaa_subrelease());
+
+    // Fill HPAA Usage
+    auto fstats = filler_.stats();
+    BreakdownStatsInPbtxt(&hpaa, fstats, "filler_usage");
+
+    auto rstats = regions_.stats();
+    BreakdownStatsInPbtxt(&hpaa, rstats, "region_usage");
+
+    auto cstats = cache_.stats();
+    // Everything in the filler came from the cache -
+    // adjust the totals so we see the amount used by the mutator.
+    cstats.system_bytes -= fstats.system_bytes;
+    BreakdownStatsInPbtxt(&hpaa, cstats, "cache_usage");
+
+    auto astats = alloc_.stats();
+    // Everything in *all* components came from here -
+    // so again adjust the totals.
+    astats.system_bytes -= (fstats + rstats + cstats).system_bytes;
+
+    // The lifetime region, when present, is subtracted from the allocator's
+    // total as well and reported under its own key.
+    auto lstats = lifetime_allocator_.GetRegionStats();
+    if (lstats.has_value()) {
+      astats.system_bytes -= lstats.value().system_bytes;
+      BreakdownStatsInPbtxt(&hpaa, lstats.value(), "lifetime_region_usage");
+    }
+
+    BreakdownStatsInPbtxt(&hpaa, astats, "alloc_usage");
+
+    filler_.PrintInPbtxt(&hpaa);
+    regions_.PrintInPbtxt(&hpaa);
+    cache_.PrintInPbtxt(&hpaa);
+    alloc_.PrintInPbtxt(&hpaa);
+    lifetime_allocator_.PrintInPbtxt(&hpaa);
+
+    // Use statistics
+    info_.PrintInPbtxt(&hpaa, "hpaa_stat");
+
+    hpaa.PrintI64("filler_donated_huge_pages", donated_huge_pages_.raw_num());
+    hpaa.PrintI64("filler_abandoned_pages", abandoned_pages_.raw_num());
+  }
+}
+
+// Allocates `bytes` of memory tagged `tag` from the system with the requested
+// alignment and, on success, makes sure the pagemap covers the returned range.
+// The SystemAlloc result is returned unchanged, including on failure.
+template <MemoryTag tag>
+AddressRange HugePageAwareAllocator::AllocAndReport(size_t bytes,
+                                                    size_t align) {
+  auto range = SystemAlloc(bytes, align, tag);
+  if (range.ptr != nullptr) {
+    const PageId first = PageIdContaining(range.ptr);
+    const Length span_len = BytesToLengthFloor(range.bytes);
+    tc_globals.pagemap().Ensure(first, span_len);
+  }
+  return range;
+}
+
+// Allocates `bytes` of metadata storage from the global arena.  Requires
+// pageheap_lock.
+void* HugePageAwareAllocator::MetaDataAlloc(size_t bytes)
+    ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
+  return tc_globals.arena().Alloc(bytes);
+}
+
+// Asks the filler to release `n` pages with default (empty) skip-subrelease
+// intervals and hit_limit set.  Returns the number of pages the filler
+// reports as released.
+Length HugePageAwareAllocator::ReleaseAtLeastNPagesBreakingHugepages(Length n) {
+  // We desperately need to release memory, and are willing to
+  // compromise on hugepage usage. That means releasing from the filler.
+  return filler_.ReleasePages(n, SkipSubreleaseIntervals{},
+                              /*hit_limit*/ true);
+}
+
+// Releases [start, start + length) back to the system with pageheap_lock
+// dropped for the duration of the SystemRelease call, then re-acquires the
+// lock.  Returns SystemRelease's result.  Since it unlocks first, this expects
+// the caller to hold pageheap_lock on entry.
+bool HugePageAwareAllocator::UnbackWithoutLock(void* start, size_t length) {
+  pageheap_lock.Unlock();
+  const bool ret = SystemRelease(start, length);
+  pageheap_lock.Lock();
+  return ret;
+}
+
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
+GOOGLE_MALLOC_SECTION_END
--- a/src/third_party/tcmalloc/dist/tcmalloc/huge_page_aware_allocator.h
+++ b/src/third_party/tcmalloc/dist/tcmalloc/huge_page_aware_allocator.h
@ -0,0 +1,266 @@
+// Copyright 2019 The TCMalloc Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef TCMALLOC_HUGE_PAGE_AWARE_ALLOCATOR_H_
+#define TCMALLOC_HUGE_PAGE_AWARE_ALLOCATOR_H_
+
+#include <stddef.h>
+
+#include "absl/base/thread_annotations.h"
+#include "tcmalloc/arena.h"
+#include "tcmalloc/common.h"
+#include "tcmalloc/huge_allocator.h"
+#include "tcmalloc/huge_cache.h"
+#include "tcmalloc/huge_pages.h"
+#include "tcmalloc/huge_region.h"
+#include "tcmalloc/internal/config.h"
+#include "tcmalloc/internal/logging.h"
+#include "tcmalloc/lifetime_based_allocator.h"
+#include "tcmalloc/page_allocator_interface.h"
+#include "tcmalloc/page_heap_allocator.h"
+#include "tcmalloc/span.h"
+#include "tcmalloc/stats.h"
+#include "tcmalloc/system-alloc.h"
+
+GOOGLE_MALLOC_SECTION_BEGIN
+namespace tcmalloc {
+namespace tcmalloc_internal {
+
+// Declaration only; the definition is not part of this header.
+// NOTE(review): presumably reports whether subrelease should be enabled --
+// confirm against the definition.
+bool decide_subrelease();
+
+// Selects the signal used to decide when large allocations are served from
+// HugeRegion.
+enum class HugeRegionCountOption : bool {
+  // Default behavior: the decision is driven by slack alone. Large
+  // allocations come from HugeRegion once slack is greater than 64MB (so that
+  // small binaries are ignored) and greater than the number of small
+  // allocations.
+  kSlack = false,
+  // Used when the experiment TEST_ONLY_TCMALLOC_USE_HUGE_REGIONS_MORE_OFTEN is
+  // enabled: the number of abandoned pages is considered in addition to
+  // slack. HugeRegion is used for large allocations once abandoned pages plus
+  // slack exceed 64MB (again, to ignore small binaries).
+  kAbandonedCount = true
+};
+
+// An implementation of the PageAllocator interface that is hugepage-efficient.
+// Attempts to pack allocations into full hugepages wherever possible,
+// and aggressively returns empty ones to the system.
+class HugePageAwareAllocator final : public PageAllocatorInterface {
+ public:
+  explicit HugePageAwareAllocator(MemoryTag tag);
+  // For use in testing.
+  HugePageAwareAllocator(MemoryTag tag,
+                         HugeRegionCountOption use_huge_region_more_often);
+  HugePageAwareAllocator(MemoryTag tag,
+                         HugeRegionCountOption use_huge_region_more_often,
+                         LifetimePredictionOptions lifetime_options);
+  ~HugePageAwareAllocator() override = default;
+
+  // Allocate a run of "n" pages.  Returns nullptr if out of memory.
+  // Caller should not pass "n == 0" -- instead, n should have
+  // been rounded up already.
+  Span* New(Length n, size_t objects_per_span)
+      ABSL_LOCKS_EXCLUDED(pageheap_lock) override;
+
+  // As New, but the returned span is aligned to a <align>-page boundary.
+  // <align> must be a power of two.
+  Span* NewAligned(Length n, Length align, size_t objects_per_span)
+      ABSL_LOCKS_EXCLUDED(pageheap_lock) override;
+
+  // Delete the span "[p, p+n-1]".
+  // REQUIRES: span was returned by earlier call to New() and
+  //           has not yet been deleted.
+  void Delete(Span* span, size_t objects_per_span)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) override;
+
+  // Statistics accessors.  All of them require pageheap_lock to be held.
+  BackingStats stats() const
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) override;
+
+  void GetSmallSpanStats(SmallSpanStats* result)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) override;
+
+  void GetLargeSpanStats(LargeSpanStats* result)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) override;
+
+  // Try to release at least num_pages for reuse by the OS.  Returns
+  // the actual number of pages released, which may be less than
+  // num_pages if there weren't enough pages to release. The result
+  // may also be larger than num_pages since page_heap might decide to
+  // release one large range instead of fragmenting it into two
+  // smaller released and unreleased ranges.
+  Length ReleaseAtLeastNPages(Length num_pages)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) override;
+
+  // As ReleaseAtLeastNPages, but for when memory is desperately needed: this
+  // releases from the filler and is willing to compromise on hugepage usage.
+  Length ReleaseAtLeastNPagesBreakingHugepages(Length n)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
+
+  // Prints stats about the page heap to *out.
+  void Print(Printer* out) ABSL_LOCKS_EXCLUDED(pageheap_lock) override;
+
+  // Print stats to *out, excluding long/likely uninteresting things
+  // unless <everything> is true.
+  void Print(Printer* out, bool everything) ABSL_LOCKS_EXCLUDED(pageheap_lock);
+
+  void PrintInPbtxt(PbtxtRegion* region)
+      ABSL_LOCKS_EXCLUDED(pageheap_lock) override;
+
+  // Current value of donated_huge_pages_ (see the member's comment below).
+  HugeLength DonatedHugePages() const
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
+    return donated_huge_pages_;
+  }
+
+  // Number of pages that have been retained on huge pages by donations that did
+  // not reassemble by the time the larger allocation was deallocated.
+  Length AbandonedPages() const ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
+    return abandoned_pages_;
+  }
+
+  // Read-only access to the huge page cache.
+  const HugeCache* cache() const { return &cache_; }
+
+  // Mutable access to the lifetime-based allocator.
+  LifetimeBasedAllocator& lifetime_based_allocator() {
+    return lifetime_allocator_;
+  }
+
+  // Read-only access to the set of huge regions.
+  const HugeRegionSet<HugeRegion>& region() const
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
+    return regions_;
+  }
+
+ private:
+  using FillerType = HugePageFiller<PageTracker>;
+  FillerType filler_ ABSL_GUARDED_BY(pageheap_lock);
+
+  // Adapter that lets the LifetimeBasedAllocator create HugeRegions backed by
+  // the enclosing allocator's alloc_ and region_allocator_.
+  class RegionAllocImpl final : public LifetimeBasedAllocator::RegionAlloc {
+   public:
+    explicit RegionAllocImpl(HugePageAwareAllocator* p) : p_(p) {}
+
+    // We need to explicitly define the destructor here so that it gets
+    // placed within GOOGLE_MALLOC_SECTION.
+    ~RegionAllocImpl() override {}
+
+    // Builds a HugeRegion over *range.  If *range is not valid on entry, it is
+    // first filled in with n hugepages requested from the enclosing
+    // allocator's alloc_.  Returns nullptr if no valid range could be
+    // obtained.
+    HugeRegion* AllocRegion(HugeLength n, HugeRange* range) override
+        ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
+      if (!range->valid()) {
+        *range = p_->alloc_.Get(n);
+      }
+      if (!range->valid()) return nullptr;
+      // Storage comes from region_allocator_; the region is constructed in
+      // place on top of it.
+      HugeRegion* region = p_->region_allocator_.New();
+      new (region) HugeRegion(*range, MemoryModifyFunction(SystemRelease));
+      return region;
+    }
+
+   private:
+    HugePageAwareAllocator* p_;
+  };
+
+  // Calls SystemRelease, but with dropping of pageheap_lock around the call.
+  static ABSL_MUST_USE_RESULT bool UnbackWithoutLock(void* start, size_t length)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
+
+  HugeRegionSet<HugeRegion> regions_ ABSL_GUARDED_BY(pageheap_lock);
+
+  PageHeapAllocator<FillerType::Tracker> tracker_allocator_
+      ABSL_GUARDED_BY(pageheap_lock);
+  PageHeapAllocator<HugeRegion> region_allocator_
+      ABSL_GUARDED_BY(pageheap_lock);
+
+  FillerType::Tracker* GetTracker(HugePage p);
+
+  void SetTracker(HugePage p, FillerType::Tracker* pt);
+
+  template <MemoryTag tag>
+  static AddressRange AllocAndReport(size_t bytes, size_t align)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
+  static void* MetaDataAlloc(size_t bytes);
+  HugeAllocator alloc_ ABSL_GUARDED_BY(pageheap_lock);
+  HugeCache cache_ ABSL_GUARDED_BY(pageheap_lock);
+
+  // donated_huge_pages_ measures the number of huge pages contributed to the
+  // filler from left overs of large huge page allocations.  When the large
+  // allocation is deallocated, we decrement this count *if* we were able to
+  // fully reassemble the address range (that is, the partial hugepage did not
+  // get stuck in the filler).
+  HugeLength donated_huge_pages_ ABSL_GUARDED_BY(pageheap_lock);
+  // abandoned_pages_ tracks the number of pages contributed to the filler after
+  // a donating allocation is deallocated but the entire huge page has not been
+  // reassembled.
+  Length abandoned_pages_ ABSL_GUARDED_BY(pageheap_lock);
+
+  // Performs lifetime predictions for large objects and places short-lived
+  // objects into a separate region to reduce filler contention.
+  RegionAllocImpl lifetime_allocator_region_alloc_;
+  LifetimeBasedAllocator lifetime_allocator_;
+
+  // Determines if the experiment is enabled. If enabled, we use
+  // abandoned_pages_ in addition to slack in determining when to use
+  // HugeRegion.
+  const HugeRegionCountOption use_huge_region_more_often_;
+  bool UseHugeRegionMoreOften() const {
+    return use_huge_region_more_often_ ==
+           HugeRegionCountOption::kAbandonedCount;
+  }
+
+  void GetSpanStats(SmallSpanStats* small, LargeSpanStats* large,
+                    PageAgeHistograms* ages)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
+
+  PageId RefillFiller(Length n, size_t num_objects, bool* from_released)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
+
+  // Allocate the first <n> from p, and contribute the rest to the filler.  If
+  // "donated" is true, the contribution will be marked as coming from the
+  // tail of a multi-hugepage alloc.  Returns the allocated section.
+  PageId AllocAndContribute(HugePage p, Length n, size_t num_objects,
+                            bool donated)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
+
+  // Helpers for New().
+  Span* LockAndAlloc(Length n, size_t objects_per_span, bool* from_released);
+
+  Span* AllocSmall(Length n, size_t objects_per_span, bool* from_released)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
+  Span* AllocLarge(Length n, size_t objects_per_span, bool* from_released,
+                   LifetimeStats* lifetime_context)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
+  Span* AllocEnormous(Length n, size_t objects_per_span, bool* from_released)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
+
+  Span* AllocRawHugepages(Length n, size_t num_objects, bool* from_released)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
+
+  // Allocates a span and adds a tracker. This span has to be associated with a
+  // filler donation and have an associated page tracker. A tracker will only be
+  // added if there is an associated lifetime prediction.
+  Span* AllocRawHugepagesAndMaybeTrackLifetime(
+      Length n, size_t num_objects,
+      const LifetimeBasedAllocator::AllocationResult& lifetime_alloc,
+      bool* from_released) ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
+
+  bool AddRegion() ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
+
+  void ReleaseHugepage(FillerType::Tracker* pt)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
+  // Return an allocation from a single hugepage.
+  void DeleteFromHugepage(FillerType::Tracker* pt, PageId p, Length n,
+                          size_t num_objects, bool might_abandon)
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);
+
+  // Finish an allocation request - give it a span and mark it in the pagemap.
+  Span* Finalize(Length n, size_t num_objects, PageId page);
+};
+
+}  // namespace tcmalloc_internal
+}  // namespace tcmalloc
+GOOGLE_MALLOC_SECTION_END
+
+#endif  // TCMALLOC_HUGE_PAGE_AWARE_ALLOCATOR_H_
--- a/src/third_party/tcmalloc/dist/tcmalloc/huge_page_aware_allocator_test.cc
+++ b/src/third_party/tcmalloc/dist/tcmalloc/huge_page_aware_allocator_test.cc
--- a/src/third_party/tcmalloc/dist/tcmalloc/huge_page_filler.h
+++ b/src/third_party/tcmalloc/dist/tcmalloc/huge_page_filler.h
--- a/Show More
+++ b/Show More