author    Android Build Coastguard Worker <android-build-coastguard-worker@google.com>  2024-05-10 15:43:44 +0000
committer Android Build Coastguard Worker <android-build-coastguard-worker@google.com>  2024-05-10 15:43:44 +0000
commit    f3de6d992cb37d6ff14a1907d544d5e9121ddf0e
tree      6b5bf6fe73d6621b9e2f398896b213e793adf226
parent    06dad597df8cbe3970c5d588545c8102cdc738d8
parent    3fd49b0c0bc8410604521d097e01408b351736e7
Snap for 11819167 from 3fd49b0c0bc8410604521d097e01408b351736e7 to busytown-mac-infra-release
Change-Id: I51bc6619147639898f6888a56430bf82da74ee6d
95 files changed, 7882 insertions, 2686 deletions
diff --git a/.clang-format b/.clang-format new file mode 100644 index 00000000000..9b3aa8b7213 --- /dev/null +++ b/.clang-format @@ -0,0 +1 @@ +BasedOnStyle: LLVM diff --git a/Android.bp b/Android.bp index 5395d48ccba..88435b917bc 100644 --- a/Android.bp +++ b/Android.bp @@ -48,6 +48,7 @@ license { cc_library_headers { name: "scudo_headers", + ramdisk_available: true, recovery_available: true, vendor_ramdisk_available: true, @@ -57,6 +58,7 @@ cc_library_headers { apex_available: [ "com.android.runtime", + "//apex_available:platform", ], visibility: [ @@ -65,7 +67,26 @@ cc_library_headers { } cc_defaults { + name: "scudo_config_defaults", + cflags: [ + // Use a custom Android configuration. + "-DSCUDO_USE_CUSTOM_CONFIG", + ], + + include_dirs: [ + "external/scudo/config", + ], + + product_variables: { + malloc_low_memory: { + cflags: ["-DSCUDO_LOW_MEMORY"], + }, + }, +} + +cc_defaults { name: "libscudo_defaults", + defaults: ["scudo_config_defaults"], native_coverage: false, ramdisk_available: true, vendor_ramdisk_available: true, @@ -81,12 +102,12 @@ cc_defaults { "-fno-rtti", // This option speeds up alloc/free code paths by about 5% to 7%. "-fno-stack-protector", - "-fno-emulated-tls", "-Wall", "-Wextra", "-Wunused", "-Wno-unused-result", + "-Wconversion", "-Werror=pointer-to-int-cast", "-Werror=int-to-pointer-cast", @@ -115,13 +136,17 @@ cc_defaults { srcs: [ "standalone/checksum.cpp", "standalone/common.cpp", + "standalone/condition_variable_linux.cpp", "standalone/flags.cpp", "standalone/flags_parser.cpp", "standalone/linux.cpp", + "standalone/mem_map.cpp", + "standalone/mem_map_linux.cpp", "standalone/release.cpp", "standalone/report.cpp", - "standalone/rss_limit_checker.cpp", + "standalone/report_linux.cpp", "standalone/string_utils.cpp", + "standalone/timing.cpp", "standalone/wrappers_c_bionic.cpp" ], arch: { @@ -171,9 +196,12 @@ cc_library_static { ], visibility: [ "//bionic:__subpackages__", - "//frameworks/libs/native_bridge_support/libc:__subpackages__", + "//frameworks/libs/native_bridge_support/android_api/libc:__subpackages__", "//system/core/debuggerd:__subpackages__", ], + apex_available: [ + "com.android.runtime", + ], } cc_library_static { @@ -186,15 +214,22 @@ cc_library_static { cc_defaults { name: "scudo_unit_tests_default", + defaults: ["scudo_config_defaults"], + isolated: true, static_libs: ["libscudo_for_testing"], include_dirs: [ "external/scudo/standalone", "external/scudo/standalone/include", ], cflags: [ - "-Wno-unused-parameter", - "-fno-emulated-tls", + "-Wconversion", + // In memtag_test.cpp, some tests are disabled by GTEST_SKIP() so that + // they won't be run. However, for those disabled tests, it may contain + // unreachable code paths which will mislead some compiler checks. Given + // this flag won't be impacted too much, disable it only in the test. + "-Wno-unreachable-code-loop-increment", "-DSCUDO_DEBUG", + "-DSCUDO_NO_TEST_MAIN", ], target: { bionic: { @@ -203,45 +238,54 @@ cc_defaults { }, test_suites: ["general-tests"], bootstrap: true, + srcs: [ + "standalone/tests/scudo_unit_test_main.cpp", + ], } cc_test { name: "scudo_unit_tests", defaults: ["scudo_unit_tests_default"], - // Temporarily disabled on host due to a 15-20s per-test timeout, - // which is currently exceeded by ScudoCombinedTest.BasicCombined. 
- host_supported: false, + host_supported: true, srcs: [ + "standalone/tests/allocator_config_test.cpp", "standalone/tests/atomic_test.cpp", "standalone/tests/bytemap_test.cpp", "standalone/tests/checksum_test.cpp", "standalone/tests/chunk_test.cpp", "standalone/tests/combined_test.cpp", + "standalone/tests/condition_variable_test.cpp", "standalone/tests/flags_test.cpp", "standalone/tests/list_test.cpp", "standalone/tests/map_test.cpp", + "standalone/tests/memtag_test.cpp", "standalone/tests/mutex_test.cpp", "standalone/tests/primary_test.cpp", "standalone/tests/quarantine_test.cpp", "standalone/tests/release_test.cpp", "standalone/tests/report_test.cpp", - "standalone/tests/scudo_unit_test_main.cpp", "standalone/tests/secondary_test.cpp", "standalone/tests/size_class_map_test.cpp", "standalone/tests/stats_test.cpp", "standalone/tests/strings_test.cpp", + "standalone/tests/timing_test.cpp", "standalone/tests/tsd_test.cpp", "standalone/tests/vector_test.cpp", ], } cc_test { - name: "scudo_hooks_unit_tests", + name: "scudo_wrappers_unit_tests", defaults: ["scudo_unit_tests_default"], - host_supported: true, + // These are wrapper tests, disable the host tests since they would run + // against glibc. + host_supported: false, + cflags: [ + "-Wno-mismatched-new-delete", + ], srcs: [ - "standalone/tests/scudo_hooks_test.cpp", - "standalone/tests/scudo_unit_test_main.cpp", + "standalone/tests/wrappers_c_test.cpp", + "standalone/tests/wrappers_cpp_test.cpp", ], } @@ -262,3 +306,73 @@ cc_fuzz { componentid: 87896 }, } + +cc_test { + name: "size_map_verify_unit_tests", + host_supported: true, + static_libs: ["libscudo"], + + include_dirs: [ + "external/scudo/android/tools", + "external/scudo/standalone", + "external/scudo/standalone/include", + "external/scudo/standalone/tools", + ], + srcs: [ + "android/tests/size_map_verify_unit_tests.cpp", + ], + +} + +cc_binary { + name: "size_map_gen", + defaults: ["scudo_config_defaults"], + host_supported: true, + static_libs: ["libscudo"], + include_dirs: [ + "external/scudo/android/tools", + "external/scudo/standalone", + "external/scudo/standalone/include", + ], + srcs: ["android/tools/size_map_gen.cpp"], +} + +// The targets below verify that all configuration is set up properly for +// the library or tests. +cc_defaults { + name: "scudo_verify_defaults", + host_supported: true, + srcs: ["config/config_build_check.cpp"], + + include_dirs: [ + "external/scudo/standalone", + ], + + product_variables: { + malloc_low_memory: { + cflags: ["-DSCUDO_LOW_MEMORY_CHECK"], + }, + }, +} + +cc_library { + name: "libscudo_verify_config", + stl: "libc++", + defaults: [ + "scudo_verify_defaults", + "libscudo_defaults", + ], + target: { + bionic: { + system_shared_libs: ["libc"], + }, + }, +} + +cc_test { + name: "scudo_verify_config", + defaults: [ + "scudo_verify_defaults", + "scudo_unit_tests_default", + ], +} diff --git a/PREUPLOAD.cfg b/PREUPLOAD.cfg new file mode 100644 index 00000000000..dcf92be1eb8 --- /dev/null +++ b/PREUPLOAD.cfg @@ -0,0 +1,8 @@ +[Builtin Hooks] +clang_format = true + +[Builtin Hooks Options] +clang_format = --commit ${PREUPLOAD_COMMIT} --style file --extensions c,h,cc,cpp + +[Hook Scripts] +aosp_hook = ${REPO_ROOT}/frameworks/base/tools/aosp/aosp_sha.sh ${PREUPLOAD_COMMIT} "." 
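The new `scudo_config_defaults` block above wires the custom configuration into every Scudo target: `-DSCUDO_USE_CUSTOM_CONFIG` makes `allocator_config.h` pull in `config/custom_scudo_config.h`, and the `malloc_low_memory` product variable adds `-DSCUDO_LOW_MEMORY`. A minimal sketch of how those defines select the active config, condensed from the `custom_scudo_config.h` and `allocator_config.h` hunks later in this change (nothing new, just the selection path pulled out for clarity):

    // In standalone/allocator_config.h (changed in this snap):
    #ifdef SCUDO_USE_CUSTOM_CONFIG
    #include "custom_scudo_config.h"
    #endif

    // In config/custom_scudo_config.h (new in this snap):
    namespace scudo {
    #if defined(SCUDO_LOW_MEMORY)
    typedef AndroidLowMemoryConfig Config;  // malloc_low_memory builds
    #else
    typedef AndroidNormalConfig Config;     // everything else
    #endif
    typedef Config DefaultConfig;           // tests exercise the same config
    } // namespace scudo

`config/config_build_check.cpp` (added further down) static_asserts the same mapping, so a mismatched build flag fails at compile time rather than at runtime.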
diff --git a/TEST_MAPPING b/TEST_MAPPING index 32f13f0954f..5de14076e5e 100644 --- a/TEST_MAPPING +++ b/TEST_MAPPING @@ -4,7 +4,7 @@ "name": "scudo_unit_tests" }, { - "name": "scudo_hooks_unit_tests" + "name": "scudo_wrappers_unit_tests" }, { "name": "memunreachable_unit_test" diff --git a/android/tests/size_map_verify_unit_tests.cpp b/android/tests/size_map_verify_unit_tests.cpp new file mode 100644 index 00000000000..5aac2c7b900 --- /dev/null +++ b/android/tests/size_map_verify_unit_tests.cpp @@ -0,0 +1,289 @@ +//===-- size_map_verify_unit_tests.cpp ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "libsize_map_verify.h" +#include "tests/scudo_unit_test.h" + +namespace scudo { + +class SmokeConfigTest { + // This test is the base test config. +public: + static constexpr u32 Classes[] = { + 32, 48, 64, 80, 96, 112, 144, 176, 192, 224, 288, + 352, 448, 592, 800, 1104, 1648, 2096, 2576, 3120, 4112, 4624, + 7120, 8720, 11664, 14224, 16400, 18448, 23056, 29456, 33296, 65552, + }; + static const u32 MinSizeLog = 4; + static const u32 MidSizeLog = 6; + static const u32 MaxSizeLog = 16; + static const u32 NumBits = 7; + + static const u32 SizeDelta = 16; + + static const u32 MaxNumCachedHint = 13; + static const u32 MaxBytesCachedLog = 13; +}; +TEST(ScudoToolSizeMapVerifyTest, generate_smoke_config) { + std::string NumBitsMessage; + EXPECT_TRUE(generateNumBits<SmokeConfigTest>(NumBitsMessage)); + EXPECT_EQ("NumBits = 7\n", NumBitsMessage); +} +TEST(ScudoToolSizeMapVerifyTest, verify_smoke_config) { + std::string verifySizeMessage; + EXPECT_TRUE(verifySizeClass<SmokeConfigTest>(verifySizeMessage)); + EXPECT_EQ("MidSizeLog non-szTable formula is used until: 80\n", + verifySizeMessage); +} +class SizeIncreaseConfigPass { + // Test shows that when every size and Min/Mid/Max changes that + // NumBits remains the same. + // Demonstrating NumBits changes based on how close the sizes + // are to each other. +public: + static constexpr u32 Classes[] = { + 80, 144, 208, 272, 336, 400, 528, 656, + 720, 848, 1104, 1360, 1744, 2320, 3152, 4368, + 6544, 8336, 10256, 12432, 16400, 18448, 28432, 34832, + 46608, 56848, 65552, 73744, 92176, 117776, 133136, 262160, + }; + static const u32 MinSizeLog = 6; + static const u32 MidSizeLog = 8; + static const u32 MaxSizeLog = 18; + static const u32 NumBits = 7; + + static const u32 SizeDelta = 16; + + static const u32 MaxNumCachedHint = 13; + static const u32 MaxBytesCachedLog = 13; +}; +TEST(ScudoToolSizeMapVerifyTest, generate_size_increase_config) { + std::string NumBitsMessage; + EXPECT_TRUE(generateNumBits<SizeIncreaseConfigPass>(NumBitsMessage)); + EXPECT_EQ("NumBits = 7\n", NumBitsMessage); +} +TEST(ScudoToolSizeMapVerifyTest, verify_size_increase_config) { + std::string verifySizeMessage; + EXPECT_TRUE(verifySizeClass<SizeIncreaseConfigPass>(verifySizeMessage)); + EXPECT_EQ("MidSizeLog non-szTable formula is used until: 272\n", + verifySizeMessage); +} +class MaxSizeConfigPass { + // This config uses the largest sizes permitted in size_class_map + // showing that NumBits does not need to increase due to sizes being + // too large and also shows the limit for MaxSizeLog. + // Primary allocator works up to 524304. 
+public: + static constexpr u32 Classes[] = { + 144, 272, 400, 528, 656, 784, 1040, 1296, + 1424, 1680, 2192, 2704, 3472, 4624, 6288, 8720, + 13072, 16656, 20496, 24848, 32784, 36880, 56848, 69648, + 93200, 113680, 131088, 147472, 184336, 235536, 266256, 524304, + }; + static const u32 MinSizeLog = 7; + static const u32 MidSizeLog = 9; + static const u32 MaxSizeLog = 19; + static const u32 NumBits = 7; + + static const u32 SizeDelta = 16; + + static const u32 MaxNumCachedHint = 13; + static const u32 MaxBytesCachedLog = 13; +}; +TEST(ScudoToolSizeMapVerifyTest, generate_max_size_config) { + std::string NumBitsMessage; + EXPECT_TRUE(generateNumBits<MaxSizeConfigPass>(NumBitsMessage)); + EXPECT_EQ("NumBits = 7\n", NumBitsMessage); +} +TEST(ScudoToolSizeMapVerifyTest, verify_max_size_config) { + std::string verifySizeMessage; + EXPECT_TRUE(verifySizeClass<MaxSizeConfigPass>(verifySizeMessage)); + EXPECT_EQ("MidSizeLog non-szTable formula is used until: 528\n", + verifySizeMessage); +} +class SizeDecreaseConfigFail { + // The NumBits decreasing causes a failure: + // NumBits not large enough to notice bit difference between numbers. + // NumBits cannot be increased due to MidSizeLog - 1 being the limit. +public: + static constexpr u32 Classes[] = { + 24, 32, 40, 48, 56, 64, 80, 96, 104, 120, 152, + 184, 232, 304, 408, 560, 832, 1056, 1296, 1568, 2064, 2320, + 3568, 4368, 5840, 7120, 8208, 9232, 11536, 14736, 16656, 32784, + }; + static const u32 MinSizeLog = 3; + static const u32 MidSizeLog = 5; + static const u32 MaxSizeLog = 15; + static const u32 NumBits = 6; + + static const u32 SizeDelta = 16; + + static const u32 MaxNumCachedHint = 13; + static const u32 MaxBytesCachedLog = 13; +}; +TEST(ScudoToolSizeMapVerifyTest, generate_size_decrease_config) { + std::string NumBitsMessage; + EXPECT_FALSE(generateNumBits<SizeDecreaseConfigFail>(NumBitsMessage)); + EXPECT_NE("NumBits = 7\n", NumBitsMessage); +} +TEST(ScudoToolSizeMapVerifyTest, verify_size_decrease_config) { + std::string verifySizeMessage; + EXPECT_FALSE(verifySizeClass<SizeDecreaseConfigFail>(verifySizeMessage)); + EXPECT_EQ( + "MidSizeLog non-szTable formula is used until: 48\n\nNumBits not " + "large enough to distinguish between values. \nHard max NumBits - 1 " + "cannot exceed MidSizeLog.\nIf NumBits is at max then increase " + "Min/Mid/Max sizelogs and increase the sizes accordingly.\n\n\n", + verifySizeMessage); +} +class MidSizeLog10ConfigPass { + // Expands the use of the non-table formula to 1040. + // Shows how to expand the non-table formula by increasing MidSizeLog and + // by ensuring an equal step between sizes up to MidSizeLog. + // Shows the tool's ability to use a larger MidSizeLog and a smaller szTable. + // Demonstrates how many sizes are needed to increase the MidSizeLog. 
+public: + static constexpr u32 Classes[] = { + 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, + 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, + 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, + 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, + 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, + 912, 928, 944, 960, 976, 992, 1008, 1024, 1040, 1104, 1648, + 2096, 2576, 3120, 4112, 4624, 7120, 8720, 11664, 14224, 16400, 18448, + 23056, 29456, 33296, 65552, + }; + static const u32 MinSizeLog = 4; + static const u32 MidSizeLog = 10; + static const u32 MaxSizeLog = 16; + static const u32 NumBits = 7; + + static const u32 SizeDelta = 16; + + static const u32 MaxNumCachedHint = 13; + static const u32 MaxBytesCachedLog = 13; +}; +TEST(ScudoToolSizeMapVerifyTest, generate_midsizelog_10_config) { + std::string NumBitsMessage; + EXPECT_TRUE(generateNumBits<MidSizeLog10ConfigPass>(NumBitsMessage)); + EXPECT_EQ("NumBits = 7\n", NumBitsMessage); +} +TEST(ScudoToolSizeMapVerifyTest, verify_midsizelog_10_config) { + std::string verifySizeMessage; + EXPECT_TRUE(verifySizeClass<MidSizeLog10ConfigPass>(verifySizeMessage)); + EXPECT_EQ("MidSizeLog non-szTable formula is used until: 1040\n", + verifySizeMessage); +} +class NumBitsIncreaseConfigPass { + // Demonstrates when to increase NumBits and how to do it. + // Ensure NumBits - 1 <= MidSizeLog, with an equal step until MidSizeLog. + // Increasing NumBits allows more bits to be checked when analyzing sizes + // NumBits 8 checks 7 bits from the most-significant-bit-index. + // Here NumBits 8 is needed for the sizes 288 and 290. + // Shows NumBits increases szTable's flexibility for new sizes. + // Another condition to remember: + // Sizes cannot be just 1 larger than previous size. +public: + static constexpr u32 Classes[] = { + 32, 48, 64, 80, 96, 112, 128, 144, 176, + 192, 224, 288, 290, 352, 448, 592, 800, 1104, + 1648, 2096, 2576, 3120, 4112, 4624, 7120, 8720, 11664, + 14224, 16400, 18448, 23056, 29456, 33296, 65552, + }; + static const u32 MinSizeLog = 4; + static const u32 MidSizeLog = 7; + static const u32 MaxSizeLog = 16; + static const u32 NumBits = 8; + + static const u32 SizeDelta = 16; + + static const u32 MaxNumCachedHint = 13; + static const u32 MaxBytesCachedLog = 13; +}; +TEST(ScudoToolSizeMapVerifyTest, generate_numbits_increase_config) { + std::string NumBitsMessage; + EXPECT_TRUE(generateNumBits<NumBitsIncreaseConfigPass>(NumBitsMessage)); + EXPECT_EQ("NumBits = 8\n", NumBitsMessage); +} +TEST(ScudoToolSizeMapVerifyTest, verify_numbits_increase_config) { + std::string verifySizeMessage; + EXPECT_TRUE(verifySizeClass<NumBitsIncreaseConfigPass>(verifySizeMessage)); + EXPECT_EQ("MidSizeLog non-szTable formula is used until: 144\n", + verifySizeMessage); +} +class MidEqualMaxConfigPass { + // The equality of MidSizeLog and MaxSizeLog shows how the szTable + // does not need to be used, which makes NumBits obselete. + // The test shows that the formula can be used for every size. 
+public: + static constexpr u32 Classes[] = { + 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, + 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, + 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, + 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, + 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024, 1040, + }; + static const u32 MinSizeLog = 4; + static const u32 MidSizeLog = 10; + static const u32 MaxSizeLog = 10; + static const u32 NumBits = 7; + + static const u32 SizeDelta = 16; + + static const u32 MaxNumCachedHint = 13; + static const u32 MaxBytesCachedLog = 13; +}; +TEST(ScudoToolSizeMapVerifyTest, generate_mid_equal_max_config) { + std::string NumBitsMessage; + EXPECT_TRUE(generateNumBits<MidEqualMaxConfigPass>(NumBitsMessage)); + EXPECT_EQ( + "MidSizeLog = MaxSizeLog, NumBits not used for these sizes. Only uses " + "the formula without szTable.\n", + NumBitsMessage); +} +TEST(ScudoToolSizeMapVerifyTest, verify_mid_equal_max_config) { + std::string verifySizeMessage; + EXPECT_TRUE(verifySizeClass<MidEqualMaxConfigPass>(verifySizeMessage)); + EXPECT_EQ("MidSizeLog non-szTable formula is used until: 1040\nMidSizeLog = " + "MaxSizeLog, szTable and NumBits are not used at all.\n", + verifySizeMessage); +} +class SizeDeltaConfigPass { + // Test shows that when changing SizeDelta, min and max have to + // change to match. + // Every size needs to change by whatever the SizeDelta changed by. + // Sizes need to be added to make Mid match. +public: + static constexpr u32 Classes[] = { + 16, 24, 32, 40, 48, 56, 64, 72, 88, + 104, 136, 168, 184, 216, 280, 344, 440, 584, + 792, 1096, 1640, 2088, 2568, 3112, 4104, 4616, 7112, + 8712, 11656, 14216, 16392, 18440, 23048, 29448, 33288, 65544, + }; + static const u32 MinSizeLog = 3; + static const u32 MidSizeLog = 6; + static const u32 MaxSizeLog = 16; + static const u32 NumBits = 7; + + static const u32 SizeDelta = 8; + + static const u32 MaxNumCachedHint = 13; + static const u32 MaxBytesCachedLog = 13; +}; +TEST(ScudoToolSizeMapVerifyTest, generate_size_delta_config) { + std::string NumBitsMessage; + EXPECT_TRUE(generateNumBits<SizeDeltaConfigPass>(NumBitsMessage)); + EXPECT_EQ("NumBits = 7\n", NumBitsMessage); +} +TEST(ScudoToolSizeMapVerifyTest, verify_size_delta_config) { + std::string verifySizeMessage; + EXPECT_TRUE(verifySizeClass<SizeDeltaConfigPass>(verifySizeMessage)); + EXPECT_EQ("MidSizeLog non-szTable formula is used until: 72\n", + verifySizeMessage); +} +} // namespace scudo
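To make the expected strings in these tests concrete: for `SmokeConfigTest` (MinSizeLog 4, MidSizeLog 6, SizeDelta 16), the "formula is used until" value is (1 << MidSizeLog) + SizeDelta = 64 + 16 = 80, i.e. every class up to 80 bytes sits on a fixed 2^MinSizeLog = 16-byte ladder and never touches the szTable. A rough, self-contained sketch of that direct mapping; `formulaClassId` is a hypothetical helper for illustration, not part of `libsize_map_verify.h`:

    #include <cstdint>

    // Constants mirror SmokeConfigTest above.
    constexpr uint32_t MinSizeLog = 4, MidSizeLog = 6, SizeDelta = 16;

    constexpr uint32_t formulaClassId(uint32_t Size) {
      // Classes below MidSize form an even 16-byte ladder: 32, 48, 64, 80.
      return (Size - SizeDelta + (1u << MinSizeLog) - 1) >> MinSizeLog;
    }

    static_assert(formulaClassId(32) == 1, "first class");
    static_assert(formulaClassId(80) == 4, "last size reached by the formula");
    // Anything larger than 80 is classified through the szTable, which is
    // where NumBits and generateNumBits() come into play.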
\ No newline at end of file diff --git a/android/tools/libsize_map_verify.h b/android/tools/libsize_map_verify.h new file mode 100644 index 00000000000..98b1835d48a --- /dev/null +++ b/android/tools/libsize_map_verify.h @@ -0,0 +1,308 @@ +//===-- libsize_map_verify.h -------------------------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SCUDO_LIBSIZE_MAP_VERIFY_H_ +#define SCUDO_LIBSIZE_MAP_VERIFY_H_ + +#include "common.h" +#include "size_class_map.h" +#include <string> +#include <vector> + +namespace scudo { +typedef u8 szTableT; + +// Returns the index of each size class. +// Attempting to find the smallest size that fits within each size class. +// Example: +// If a size class is 8 then 4,5,6,7,8 return the index of size 8 +// but 9 would return the index of the next size class, 16. +u8 computeClassId(uptr Size, u32 ClassesSize, u32 Classes[]) { + for (uptr i = 0; i != ClassesSize; ++i) + if (Size <= Classes[i]) + return static_cast<u8>(i + 1); + return static_cast<u8>(-1); +} +// Function returns a vector that contains the classIds for all the sizes. +// Needed to check if the NumBits generated assigns the indexes to +// classIds correctly. +std::vector<u8> szTableCreate(u32 NumBits, u32 MidSizeLog, u32 MaxSizeLog, + u32 SizeDelta, u32 ClassesSize, u32 Classes[]) { + std::vector<u8> Tab((MaxSizeLog - MidSizeLog) << NumBits); + // Pos starts at the MidSize, which ignores the sizes not used in szTable. + // Inc uses NumBits - 1 to determine the starting incrementing value. + // Tab gets the classId of each size based on computeClassId. + uptr Pos = 1 << MidSizeLog; + uptr Inc = 1 << (MidSizeLog - NumBits); + for (uptr i = 0; i != Tab.size(); ++i) { + Pos += Inc; + if ((Pos & (Pos - 1)) == 0) + Inc *= 2; + Tab[i] = computeClassId(Pos + SizeDelta, ClassesSize, Classes); + } + return Tab; +}; + +// Function returns the index of the first value greater than MidSizeLog. +template <typename Config> uptr findMidSizeIndex() { + const u32 len = sizeof(Config::Classes) / sizeof(Config::Classes[0]); + u32 largerMid = 0; + for (uptr i = 0; i < len; ++i) { + if (Config::Classes[i] > (1 << Config::MidSizeLog) + Config::SizeDelta) { + largerMid = i; + break; + } + } + return largerMid; +} + +// Calculates the minimum NumBits that can be used for the given sizes and +// Min/Mid/Max. Smaller NumBits creates a szTable nearly half the size and +// quickens navigation of the table. The sizes smaller than MidSizeLog do not +// use NumBits or szTable, instead using a formula. This method is faster but +// requires sizes to have the exact same spacing of 2^MinSizeLog; therefore, +// having an efficient NumBits allows for the table to be more flexible than the +// formula while still moving at a reasonable speed. +template <typename Config> bool generateNumBits(std::string &manipMessage) { + // In size_class_map S is used, so it is used for consistency. + u32 S = Config::NumBits - 1; + const u32 len = sizeof(Config::Classes) / sizeof(Config::Classes[0]); + // This is used to display the NumBits calculated + u32 minNumBits = S; + // largerMid equals the index of the first value greater than MidSizeLog. + // These sizes are the only ones used with NumBits, smaller sizes are + // ignored. 
+ const u32 largerMid = findMidSizeIndex<Config>(); + if (largerMid == 0) { + manipMessage += + "MidSizeLog = MaxSizeLog, NumBits not used for these sizes. " + "Only uses the formula without szTable.\n"; + return true; + } + + // Create Classes array that can be inputed into functions and referenced. + u32 ClassesFunc[len]; + for (uptr i = 0; i < len; ++i) + ClassesFunc[i] = Config::Classes[i]; + // Create smaller Classes array that can be manipulated to calculate NumBits. + u32 ClassesManip[len - largerMid]; + for (uptr i = 0; i < len - largerMid; ++i) + ClassesManip[i] = ClassesFunc[i + largerMid] - Config::SizeDelta; + + u32 holdIndex[len - largerMid]; + bool failed = false; + // Starting at intial S, it decreases to find the smallest working S + // for the current config. + for (; S > 0; --S) { + // For each size it calls the on the algorithm which retuns an index. + for (uptr i = 0; i < len - largerMid; ++i) + holdIndex[i] = scaledLog2(ClassesManip[i] - 1, Config::MidSizeLog, S); + + // Vector that holds classIds for sizes that is navigated using indexes + // stored in holdIndex. + std::vector szTableT = + szTableCreate(S, Config::MidSizeLog, Config::MaxSizeLog, + Config::SizeDelta, len, ClassesFunc); + + // Checks that each index in holdIndex should refer to a unique classId, + // therefore a unique size a duplicate means that the calculated index + // for two different sizes refers to the same classId. + for (uptr i = 1; i < len - largerMid; ++i) { + if (szTableT[holdIndex[i]] == szTableT[holdIndex[i - 1]]) { + failed = true; + break; + } + } + if (failed == true) + break; + } + // Setting minNumBits to the last working NumBits and Numbits = S + 1. + minNumBits = S + 2; + // Adds a check to ensure NumBits calculated is not too large. + if (minNumBits - 1 > Config::MidSizeLog) { + manipMessage += + "Calculated Numbits too large. The max size for NumBits is: " + "NumBits - 1 = MidSizeLog.\n" + "NumBits = " + + std::to_string(minNumBits) + "\n"; + return false; + } + manipMessage += "NumBits = " + std::to_string(minNumBits) + "\n"; + return true; +} + +// Verify the sizes and variables entered are functional. +// If not, gives a brief explaination of the error. +template <typename Config> bool verifySizeClass(std::string &manipMessage) { + const u32 len = sizeof(Config::Classes) / sizeof(Config::Classes[0]); + u32 ClassesFunc[len]; + for (uptr i = 0; i < len; ++i) + ClassesFunc[i] = Config::Classes[i]; + + // Verify smallest size = MinSizeLog and largest size = MaxSizeLog. + // Log base 2 of (smallest size - SizeDelta) and + // Log base 2 of (largest size - SizeDelta). + const u32 MinSize = (1 << Config::MinSizeLog); + const u32 MaxSize = (1 << Config::MaxSizeLog); + if (ClassesFunc[0] - Config::SizeDelta != MinSize) { + manipMessage += "MinSizeLog + SizeDelta not equal to the smallest size. " + + std::to_string(MinSize) + + " != " + std::to_string(ClassesFunc[0]) + "\n\n"; + return false; + } + if (ClassesFunc[len - 1] - Config::SizeDelta != MaxSize) { + manipMessage += "Largest size (" + std::to_string(ClassesFunc[len - 1]) + + ") - SizeDelta (" + std::to_string(Config::SizeDelta) + + ") != MaxSize (" + std::to_string(MaxSize) + ")\n\n"; + return false; + } + // Verify MidSizeLog is greater than MinSizeLog. 
+ const u32 MidSize = (1 << Config::MidSizeLog); + if (MidSize <= MinSize) { + manipMessage += + "MidSizeLog needs to be greater than MinSizeLog\n" + "If the MidSizeLog is equal to MinSizeLog then the szTable will be " + "used for every size.\nMin size = " + + std::to_string(MinSize) + "\tMid size = " + std::to_string(MidSize) + + "\n\n"; + return false; + } + + // Displays why MidSizeLog is not working. + for (uptr i = 1; i < len; ++i) { + // If the step ends prior to MidSize, the step needs extending. + if (ClassesFunc[i] - ClassesFunc[i - 1] != 1 << Config::MinSizeLog && + ClassesFunc[i - 1] - Config::SizeDelta < MidSize) { + manipMessage += + "MidSizeLog non-table formula can be used until: " + + std::to_string(ClassesFunc[i - 1]) + + "\n\nCurrently stops at: " + std::to_string(MidSize) + + "\nFor size_map to work, formula must work for a number >= " + "the current MidSize.\nMidSizeLog is either too large or their " + "is not an equal step between desired sizes." + "\nThe step between sizes should equal 2^MinSizeLog.\n\n"; + return false; + } else if (ClassesFunc[i] - ClassesFunc[i - 1] != 1 << Config::MinSizeLog || + MidSize == MaxSize) { + manipMessage += "MidSizeLog non-szTable formula is used until: " + + std::to_string(MidSize + Config::SizeDelta) + "\n"; + break; + } + } + // Verifying if the MidSizeLog and MaxSizeLog. + if (MidSize == MaxSize) { + manipMessage += + "MidSizeLog = MaxSizeLog, szTable and NumBits are not used at " + "all.\n"; + return true; + } + + // Recreates NumBits arrays/vectors to verify the NumBits. + // Explained in generateNumBits. + u32 S = Config::NumBits - 1; + std::vector szTableT = + szTableCreate(S, Config::MidSizeLog, Config::MaxSizeLog, + Config::SizeDelta, len, ClassesFunc); + const u32 largerMid = findMidSizeIndex<Config>(); + u32 ClassesManip[len - largerMid]; + for (uptr i = 0; i < len - largerMid; ++i) + ClassesManip[i] = Config::Classes[i + largerMid] - Config::SizeDelta; + u32 holdIndex[len - largerMid]; + for (uptr i = 0; i < len - largerMid; ++i) + holdIndex[i] = scaledLog2(ClassesManip[i] - 1, Config::MidSizeLog, S); + + for (uptr i = 1; i < len - largerMid; ++i) { + if (szTableT[holdIndex[i]] == szTableT[holdIndex[i - 1]]) { + manipMessage += + "\nNumBits not large enough to distinguish between values. " + "\nHard max NumBits - 1 cannot exceed MidSizeLog.\n" + "If NumBits is at max then increase Min/Mid/Max sizelogs and " + "increase the sizes accordingly.\n\n\n"; + return false; + } + } + return true; +} + +// Display to what size MidSizeLog will work with and most efficient numbers. +// MidSizeLog uses a formula, not a table. 
+template <typename Config> void optimizeMidSizeLog(std::string &manipMessage) { + const u32 len = sizeof(Config::Classes) / sizeof(Config::Classes[0]); + u32 ClassesFunc[len]; + for (uptr i = 0; i < len; ++i) + ClassesFunc[i] = Config::Classes[i]; + + const u32 MaxSize = (1 << Config::MaxSizeLog); + const u32 MidSize = (1 << Config::MidSizeLog); + for (uptr i = 1; i < len; ++i) { + if (ClassesFunc[i] - ClassesFunc[i - 1] == 1 << Config::MinSizeLog) + continue; + manipMessage += + "MidSizeLog non-table formula can be used until: " + + std::to_string(ClassesFunc[i - 1]) + + "\nCurrently stops at: " + std::to_string(MidSize + Config::SizeDelta) + + "\n"; + if (MidSize == ClassesFunc[i - 1] - Config::SizeDelta) { + manipMessage += + "MidSizeLog is used efficiently and fully for current config\n"; + } else { + manipMessage += + "For size_map to work, formula must work for a number " + ">= the current MidSize.\nMax efficiency is achieved if they " + "are equal.\n"; + if (ClassesFunc[i - 1] - Config::SizeDelta > MidSize) { + manipMessage += + "In order to match numbers, increase MidSizeLog.\nEnsure " + "each size up to the new MidSize has an equal step between " + "each size.\nThe step equals 2^MinSizeLog.\n"; + } else if (ClassesFunc[i - 1] - Config::SizeDelta < MidSize) { + manipMessage += + "MidSizeLog is either too large or their is not an equal " + "step between desired sizes.\nThe step between sizes " + "should equal 2^MinSizeLog.\n"; + } + } + break; + } + if (ClassesFunc[len - 1] - Config::SizeDelta == MidSize) { + manipMessage += + "MidSizeLog non-table formula can be used until: " + + std::to_string(ClassesFunc[len - 1]) + + "\nCurrently stops at: " + std::to_string(MidSize) + + "\n" + "MidSizeLog is used efficiently and fully for current config\n"; + } +} + +// Dumps the size of szTable in elements and bits. +template <typename Config> bool dumpszTableInfo(std::string &manipMessage) { + u32 S = Config::NumBits - 1; + const u32 len = sizeof(Config::Classes) / sizeof(Config::Classes[0]); + u32 minNumBits = S; + const u32 largerMid = findMidSizeIndex<Config>(); + bool failed = false; + + if (largerMid == 0) { + manipMessage += "Does not use NumBits. MidSizeLog = MaxsizeLog."; + return true; + } + u32 ClassesFunc[len]; + for (uptr i = 0; i < len; ++i) + ClassesFunc[i] = Config::Classes[i]; + std::vector szTableT = + szTableCreate(S, Config::MidSizeLog, Config::MaxSizeLog, + Config::SizeDelta, len, ClassesFunc); + manipMessage += + "szTable Number of Elements: " + std::to_string(szTableT.size()) + + "\nSize of szTable in Bits: " + + std::to_string(szTableT.size() * sizeof(u8)) + "\n"; + return true; +} +} // namespace scudo + +#endif // SCUDO_LIBSIZE_MAP_VERIFY_H_ diff --git a/android/tools/size_map_gen.cpp b/android/tools/size_map_gen.cpp new file mode 100644 index 00000000000..177b8556624 --- /dev/null +++ b/android/tools/size_map_gen.cpp @@ -0,0 +1,57 @@ +//===-- size_map_gen.cpp --------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "allocator_config.h" +#include "libsize_map_verify.h" +#include "size_class_map.h" +#include <iostream> + +int main() { + bool fullyPassed = true; + std::string NumBitsMessage; + std::string verifySizeMessage; + std::string optimizeMessage; + std::string dumpMessage; + + fullyPassed = fullyPassed && + scudo::generateNumBits<scudo::AndroidNormalSizeClassConfig>( + NumBitsMessage); + fullyPassed = fullyPassed && + scudo::verifySizeClass<scudo::AndroidNormalSizeClassConfig>( + verifySizeMessage); + scudo::optimizeMidSizeLog<scudo::AndroidNormalSizeClassConfig>( + optimizeMessage); + scudo::dumpszTableInfo<scudo::AndroidNormalSizeClassConfig>(dumpMessage); + + if (!NumBitsMessage.empty()) { + std::cout << "NumBits Calculator:" << std::endl; + std::cout << NumBitsMessage << std::endl; + } + if (!verifySizeMessage.empty()) { + std::cout << "Sizes Verification:" << std::endl; + std::cout << verifySizeMessage << std::endl; + } + if (!verifySizeMessage.empty()) { + std::cout << "Optimizations:" << std::endl; + std::cout << optimizeMessage << std::endl; + } + if (!verifySizeMessage.empty()) { + std::cout << "szTable Dump:" << std::endl; + std::cout << dumpMessage << std::endl; + } + + if (fullyPassed == true) + std::cout << "All Parameters Passed.\n\n"; + else + std::cout << "Errors Detected. Check NumBits Calculator or Size " + "Verification\n\n"; + + scudo::validateMap<scudo::AndroidNormalSizeClassMap>(); + + return fullyPassed ? 0 : 1; +} diff --git a/config/config_build_check.cpp b/config/config_build_check.cpp new file mode 100644 index 00000000000..3f97fc7514e --- /dev/null +++ b/config/config_build_check.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2023 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <stddef.h> +#include <stdint.h> + +#include <type_traits> + +#include "allocator_config.h" + +#if defined(SCUDO_LOW_MEMORY_CHECK) +static_assert( + std::is_same<scudo::Config, scudo::AndroidLowMemoryConfig>() == true, + "Low Memory is enabled, but AndroidLowMemoryConfig is not the default"); +#else +static_assert(std::is_same<scudo::Config, scudo::AndroidNormalConfig>() == true, + "Not using AndrodNormalConfig as the default"); +#endif + +static_assert(std::is_same<scudo::Config, scudo::DefaultConfig>() == true, + "DefaultConfig and Config are not the same"); diff --git a/config/custom_scudo_config.h b/config/custom_scudo_config.h new file mode 100644 index 00000000000..a0fa59230ce --- /dev/null +++ b/config/custom_scudo_config.h @@ -0,0 +1,184 @@ +/* + * Copyright (C) 2023 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +//===-- custom_scudo-config.h -----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +// Use a custom config instead of the config found in allocator_config.h +namespace scudo { + +struct AndroidNormalSizeClassConfig { +#if SCUDO_WORDSIZE == 64U + static const uptr NumBits = 7; + static const uptr MinSizeLog = 4; + static const uptr MidSizeLog = 6; + static const uptr MaxSizeLog = 16; + static const u16 MaxNumCachedHint = 13; + static const uptr MaxBytesCachedLog = 13; + + static constexpr uptr Classes[] = { + 0x00020, 0x00030, 0x00040, 0x00050, 0x00060, 0x00070, 0x00090, 0x000b0, + 0x000c0, 0x000e0, 0x00120, 0x00160, 0x001c0, 0x00250, 0x00320, 0x00450, + 0x00670, 0x00830, 0x00a10, 0x00c30, 0x01010, 0x01210, 0x01bd0, 0x02210, + 0x02d90, 0x03790, 0x04010, 0x04810, 0x05a10, 0x07310, 0x08210, 0x10010, + }; + static const uptr SizeDelta = 16; +#else + static const uptr NumBits = 8; + static const uptr MinSizeLog = 4; + static const uptr MidSizeLog = 7; + static const uptr MaxSizeLog = 16; + static const u16 MaxNumCachedHint = 14; + static const uptr MaxBytesCachedLog = 13; + + static constexpr uptr Classes[] = { + 0x00020, 0x00030, 0x00040, 0x00050, 0x00060, 0x00070, 0x00080, 0x00090, + 0x000a0, 0x000b0, 0x000c0, 0x000e0, 0x000f0, 0x00110, 0x00120, 0x00130, + 0x00150, 0x00160, 0x00170, 0x00190, 0x001d0, 0x00210, 0x00240, 0x002a0, + 0x00330, 0x00370, 0x003a0, 0x00400, 0x00430, 0x004a0, 0x00530, 0x00610, + 0x00730, 0x00840, 0x00910, 0x009c0, 0x00a60, 0x00b10, 0x00ca0, 0x00e00, + 0x00fb0, 0x01030, 0x01130, 0x011f0, 0x01490, 0x01650, 0x01930, 0x02010, + 0x02190, 0x02490, 0x02850, 0x02d50, 0x03010, 0x03210, 0x03c90, 0x04090, + 0x04510, 0x04810, 0x05c10, 0x06f10, 0x07310, 0x08010, 0x0c010, 0x10010, + }; + static const uptr SizeDelta = 16; +#endif +}; + +typedef TableSizeClassMap<AndroidNormalSizeClassConfig> + AndroidNormalSizeClassMap; + +#if defined(__LP64__) +static_assert(AndroidNormalSizeClassMap::usesCompressedLSBFormat(), ""); +#endif + +struct AndroidNormalConfig { +#if defined(__aarch64__) + static const bool MaySupportMemoryTagging = true; +#else + static const bool MaySupportMemoryTagging = false; +#endif + template <class A> + using TSDRegistryT = TSDRegistrySharedT<A, 8U, 2U>; // Shared, max 8 TSDs. 
+ + struct Primary { + using SizeClassMap = AndroidNormalSizeClassMap; +#if SCUDO_CAN_USE_PRIMARY64 + static const uptr RegionSizeLog = 28U; + typedef u32 CompactPtrT; + static const uptr CompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG; + static const uptr GroupSizeLog = 20U; + static const bool EnableRandomOffset = true; + static const uptr MapSizeIncrement = 1UL << 18; +#else + static const uptr RegionSizeLog = 18U; + static const uptr GroupSizeLog = 18U; + typedef uptr CompactPtrT; +#endif + static const s32 MinReleaseToOsIntervalMs = -1; + static const s32 MaxReleaseToOsIntervalMs = 1000; + static const s32 DefaultReleaseToOsIntervalMs = 1000; + }; +#if SCUDO_CAN_USE_PRIMARY64 + template <typename Config> using PrimaryT = SizeClassAllocator64<Config>; +#else + template <typename Config> using PrimaryT = SizeClassAllocator32<Config>; +#endif + + struct Secondary { + struct Cache { + static const u32 EntriesArraySize = 256U; + static const u32 QuarantineSize = 32U; + static const u32 DefaultMaxEntriesCount = 32U; + static const uptr DefaultMaxEntrySize = 2UL << 20; + static const s32 MinReleaseToOsIntervalMs = -1; + static const s32 MaxReleaseToOsIntervalMs = 1000; + static const s32 DefaultReleaseToOsIntervalMs = 0; + }; + template <typename Config> using CacheT = MapAllocatorCache<Config>; + }; + + template <typename Config> using SecondaryT = MapAllocator<Config>; +}; + +struct AndroidLowMemoryConfig { +#if defined(__aarch64__) + static const bool MaySupportMemoryTagging = true; +#else + static const bool MaySupportMemoryTagging = false; +#endif + template <class A> using TSDRegistryT = TSDRegistrySharedT<A, 1U, 1U>; + + struct Primary { + // Use the same size class map as the normal config. + using SizeClassMap = AndroidNormalSizeClassMap; +#if SCUDO_CAN_USE_PRIMARY64 + static const uptr RegionSizeLog = 28U; + typedef u32 CompactPtrT; + static const uptr CompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG; + static const uptr GroupSizeLog = 18U; + static const bool EnableRandomOffset = true; + static const uptr MapSizeIncrement = 1UL << 18; +#else + static const uptr RegionSizeLog = 20U; + static const uptr GroupSizeLog = 20U; + typedef uptr CompactPtrT; +#endif + static const s32 MinReleaseToOsIntervalMs = 100; + static const s32 MaxReleaseToOsIntervalMs = 1000; + }; +#if SCUDO_CAN_USE_PRIMARY64 + template <typename Config> using PrimaryT = SizeClassAllocator64<Config>; +#else + template <typename Config> using PrimaryT = SizeClassAllocator32<Config>; +#endif + + struct Secondary { + // TODO(cferris): After secondary caching tuned, re-add a cache config. + template <typename Config> using CacheT = MapAllocatorNoCache<Config>; + }; + + template <typename Config> using SecondaryT = MapAllocator<Config>; +}; + +#if defined(SCUDO_LOW_MEMORY) +typedef AndroidLowMemoryConfig Config; +#else +typedef AndroidNormalConfig Config; +#endif + +typedef Config DefaultConfig; + +} // namespace scudo diff --git a/standalone/allocator_common.h b/standalone/allocator_common.h new file mode 100644 index 00000000000..2b77516ad11 --- /dev/null +++ b/standalone/allocator_common.h @@ -0,0 +1,92 @@ +//===-- allocator_common.h --------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SCUDO_ALLOCATOR_COMMON_H_ +#define SCUDO_ALLOCATOR_COMMON_H_ + +#include "common.h" +#include "list.h" + +namespace scudo { + +template <class SizeClassAllocator> struct TransferBatch { + typedef typename SizeClassAllocator::SizeClassMap SizeClassMap; + typedef typename SizeClassAllocator::CompactPtrT CompactPtrT; + + static const u16 MaxNumCached = SizeClassMap::MaxNumCachedHint; + void setFromArray(CompactPtrT *Array, u16 N) { + DCHECK_LE(N, MaxNumCached); + Count = N; + memcpy(Batch, Array, sizeof(Batch[0]) * Count); + } + void appendFromArray(CompactPtrT *Array, u16 N) { + DCHECK_LE(N, MaxNumCached - Count); + memcpy(Batch + Count, Array, sizeof(Batch[0]) * N); + // u16 will be promoted to int by arithmetic type conversion. + Count = static_cast<u16>(Count + N); + } + void appendFromTransferBatch(TransferBatch *B, u16 N) { + DCHECK_LE(N, MaxNumCached - Count); + DCHECK_GE(B->Count, N); + // Append from the back of `B`. + memcpy(Batch + Count, B->Batch + (B->Count - N), sizeof(Batch[0]) * N); + // u16 will be promoted to int by arithmetic type conversion. + Count = static_cast<u16>(Count + N); + B->Count = static_cast<u16>(B->Count - N); + } + void clear() { Count = 0; } + bool empty() { return Count == 0; } + void add(CompactPtrT P) { + DCHECK_LT(Count, MaxNumCached); + Batch[Count++] = P; + } + void moveToArray(CompactPtrT *Array) { + memcpy(Array, Batch, sizeof(Batch[0]) * Count); + clear(); + } + + void moveNToArray(CompactPtrT *Array, u16 N) { + DCHECK_LE(N, Count); + memcpy(Array, Batch + Count - N, sizeof(Batch[0]) * N); + Count = static_cast<u16>(Count - N); + } + u16 getCount() const { return Count; } + bool isEmpty() const { return Count == 0U; } + CompactPtrT get(u16 I) const { + DCHECK_LE(I, Count); + return Batch[I]; + } + TransferBatch *Next; + +private: + CompactPtrT Batch[MaxNumCached]; + u16 Count; +}; + +// A BatchGroup is used to collect blocks. Each group has a group id to +// identify the group kind of contained blocks. +template <class SizeClassAllocator> struct BatchGroup { + // `Next` is used by IntrusiveList. + BatchGroup *Next; + // The compact base address of each group + uptr CompactPtrGroupBase; + // Cache value of SizeClassAllocatorLocalCache::getMaxCached() + u16 MaxCachedPerBatch; + // Number of blocks pushed into this group. This is an increment-only + // counter. + uptr PushedBlocks; + // This is used to track how many bytes are not in-use since last time we + // tried to release pages. + uptr BytesInBGAtLastCheckpoint; + // Blocks are managed by TransferBatch in a list. + SinglyLinkedList<TransferBatch<SizeClassAllocator>> Batches; +}; + +} // namespace scudo + +#endif // SCUDO_ALLOCATOR_COMMON_H_ diff --git a/standalone/allocator_config.def b/standalone/allocator_config.def new file mode 100644 index 00000000000..dcd130ac449 --- /dev/null +++ b/standalone/allocator_config.def @@ -0,0 +1,131 @@ +//===-- allocator_config.def ------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines all the flags and types supported in Scudo. 
For optional +// flags and types, only explicitly define them when interested (i.e., unused +// optional flags or types can be skipped). + +#ifndef BASE_REQUIRED_TEMPLATE_TYPE +#define BASE_REQUIRED_TEMPLATE_TYPE(...) +#endif +#ifndef BASE_OPTIONAL +#define BASE_OPTIONAL(...) +#endif +#ifndef PRIMARY_REQUIRED_TYPE +#define PRIMARY_REQUIRED_TYPE(...) +#endif +#ifndef PRIMARY_REQUIRED +#define PRIMARY_REQUIRED(...) +#endif +#ifndef PRIMARY_OPTIONAL +#define PRIMARY_OPTIONAL(...) +#endif +#ifndef PRIMARY_OPTIONAL_TYPE +#define PRIMARY_OPTIONAL_TYPE(...) +#endif +#ifndef SECONDARY_REQUIRED_TEMPLATE_TYPE +#define SECONDARY_REQUIRED_TEMPLATE_TYPE(...) +#endif +#ifndef SECONDARY_CACHE_OPTIONAL +#define SECONDARY_CACHE_OPTIONAL(...) +#endif + +// BASE_REQUIRED_TEMPLATE_TYPE(NAME) +// +// Thread-Specific Data Registry used, shared or exclusive. +BASE_REQUIRED_TEMPLATE_TYPE(TSDRegistryT) + +// Defines the type of Primary allocator to use. +BASE_REQUIRED_TEMPLATE_TYPE(PrimaryT) + +// Defines the type of Secondary allocator to use. +BASE_REQUIRED_TEMPLATE_TYPE(SecondaryT) + +// BASE_OPTIONAL(TYPE, NAME, DEFAULT) +// +// Indicates possible support for Memory Tagging. +BASE_OPTIONAL(const bool, MaySupportMemoryTagging, false) + +// PRIMARY_REQUIRED_TYPE(NAME) +// +// SizeClassMap to use with the Primary. +PRIMARY_REQUIRED_TYPE(SizeClassMap) + +// Defines the type and scale of a compact pointer. A compact pointer can +// be understood as the offset of a pointer within the region it belongs +// to, in increments of a power-of-2 scale. See `CompactPtrScale` also. +PRIMARY_REQUIRED_TYPE(CompactPtrT) + +// PRIMARY_REQUIRED(TYPE, NAME) +// +// The scale of a compact pointer. E.g., Ptr = Base + (CompactPtr << Scale). +PRIMARY_REQUIRED(const uptr, CompactPtrScale) + +// Log2 of the size of a size class region, as used by the Primary. +PRIMARY_REQUIRED(const uptr, RegionSizeLog) + +// Conceptually, a region will be divided into groups based on the address +// range. Each allocation consumes blocks in the same group until exhaustion +// then it pops out blocks in a new group. Therefore, `GroupSizeLog` is always +// smaller or equal to `RegionSizeLog`. Note that `GroupSizeLog` needs to be +// equal to `RegionSizeLog` for SizeClassAllocator32 because of certain +// constraints. +PRIMARY_REQUIRED(const uptr, GroupSizeLog) + +// Call map for user memory with at least this size. Only used with primary64. +PRIMARY_REQUIRED(const uptr, MapSizeIncrement) + +// Defines the minimal & maximal release interval that can be set. +PRIMARY_REQUIRED(const s32, MinReleaseToOsIntervalMs) +PRIMARY_REQUIRED(const s32, MaxReleaseToOsIntervalMs) + +// PRIMARY_OPTIONAL(TYPE, NAME, DEFAULT) +// +// Indicates support for offsetting the start of a region by a random number of +// pages. This is only used if `EnableContiguousRegions` is enabled. +PRIMARY_OPTIONAL(const bool, EnableRandomOffset, false) +PRIMARY_OPTIONAL(const s32, DefaultReleaseToOsIntervalMs, INT32_MIN) + +// When `EnableContiguousRegions` is true, all regions will be be arranged in +// adjacency. This will reduce the fragmentation caused by region allocations +// but may require a huge amount of contiguous pages at initialization. +PRIMARY_OPTIONAL(const bool, EnableContiguousRegions, true) + +// PRIMARY_OPTIONAL_TYPE(NAME, DEFAULT) +// +// Use condition variable to shorten the waiting time of refillment of +// freelist. 
Note that this depends on the implementation of condition +// variable on each platform and the performance may vary so that it does not +// guarantee a performance benefit. +PRIMARY_OPTIONAL_TYPE(ConditionVariableT, ConditionVariableDummy) + +// SECONDARY_REQUIRED_TEMPLATE_TYPE(NAME) +// +// Defines the type of Secondary Cache to use. +SECONDARY_REQUIRED_TEMPLATE_TYPE(CacheT) + +// SECONDARY_CACHE_OPTIONAL(TYPE, NAME, DEFAULT) +// +// Defines the type of cache used by the Secondary. Some additional +// configuration entries can be necessary depending on the Cache. +SECONDARY_CACHE_OPTIONAL(const u32, EntriesArraySize, 0) +SECONDARY_CACHE_OPTIONAL(const u32, QuarantineSize, 0) +SECONDARY_CACHE_OPTIONAL(const u32, DefaultMaxEntriesCount, 0) +SECONDARY_CACHE_OPTIONAL(const uptr, DefaultMaxEntrySize, 0) +SECONDARY_CACHE_OPTIONAL(const s32, MinReleaseToOsIntervalMs, INT32_MIN) +SECONDARY_CACHE_OPTIONAL(const s32, MaxReleaseToOsIntervalMs, INT32_MAX) +SECONDARY_CACHE_OPTIONAL(const s32, DefaultReleaseToOsIntervalMs, INT32_MIN) + +#undef SECONDARY_CACHE_OPTIONAL +#undef SECONDARY_REQUIRED_TEMPLATE_TYPE +#undef PRIMARY_OPTIONAL_TYPE +#undef PRIMARY_OPTIONAL +#undef PRIMARY_REQUIRED +#undef PRIMARY_REQUIRED_TYPE +#undef BASE_OPTIONAL +#undef BASE_REQUIRED_TEMPLATE_TYPE diff --git a/standalone/allocator_config.h b/standalone/allocator_config.h index 64306066123..60f59bdd2f4 100644 --- a/standalone/allocator_config.h +++ b/standalone/allocator_config.h @@ -11,6 +11,7 @@ #include "combined.h" #include "common.h" +#include "condition_variable.h" #include "flags.h" #include "primary32.h" #include "primary64.h" @@ -19,192 +20,180 @@ #include "tsd_exclusive.h" #include "tsd_shared.h" +// To import a custom configuration, define `SCUDO_USE_CUSTOM_CONFIG` and +// aliasing the `Config` like: +// +// namespace scudo { +// // The instance of Scudo will be initiated with `Config`. +// typedef CustomConfig Config; +// // Aliasing as default configuration to run the tests with this config. +// typedef CustomConfig DefaultConfig; +// } // namespace scudo +// +// Put them in the header `custom_scudo_config.h` then you will be using the +// custom configuration and able to run all the tests as well. +#ifdef SCUDO_USE_CUSTOM_CONFIG +#include "custom_scudo_config.h" +#endif + namespace scudo { -// The combined allocator uses a structure as a template argument that -// specifies the configuration options for the various subcomponents of the -// allocator. -// -// struct ExampleConfig { -// // SizeClassMap to use with the Primary. -// using SizeClassMap = DefaultSizeClassMap; -// // Indicates possible support for Memory Tagging. -// static const bool MaySupportMemoryTagging = false; -// // Defines the Primary allocator to use. -// typedef SizeClassAllocator64<ExampleConfig> Primary; -// // Log2 of the size of a size class region, as used by the Primary. -// static const uptr PrimaryRegionSizeLog = 30U; -// // Log2 of the size of block group, as used by the Primary. Each group -// // contains a range of memory addresses, blocks in the range will belong to -// // the same group. In general, single region may have 1 or 2MB group size. -// // Multiple regions will have the group size equal to the region size -// // because the region size is usually smaller than 1 MB. -// // Smaller value gives fine-grained control of memory usage but the trade -// // off is that it may take longer time of deallocation. -// static const uptr PrimaryGroupSizeLog = 20U; -// // Defines the type and scale of a compact pointer. 
A compact pointer can -// // be understood as the offset of a pointer within the region it belongs -// // to, in increments of a power-of-2 scale. -// // eg: Ptr = Base + (CompactPtr << Scale). -// typedef u32 PrimaryCompactPtrT; -// static const uptr PrimaryCompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG; -// // Indicates support for offsetting the start of a region by -// // a random number of pages. Only used with primary64. -// static const bool PrimaryEnableRandomOffset = true; -// // Call map for user memory with at least this size. Only used with -// // primary64. -// static const uptr PrimaryMapSizeIncrement = 1UL << 18; -// // Defines the minimal & maximal release interval that can be set. -// static const s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN; -// static const s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX; -// // Defines the type of cache used by the Secondary. Some additional -// // configuration entries can be necessary depending on the Cache. -// typedef MapAllocatorNoCache SecondaryCache; -// // Thread-Specific Data Registry used, shared or exclusive. -// template <class A> using TSDRegistryT = TSDRegistrySharedT<A, 8U, 4U>; -// }; - -// Default configurations for various platforms. +// Scudo uses a structure as a template argument that specifies the +// configuration options for the various subcomponents of the allocator. See the +// following configs as examples and check `allocator_config.def` for all the +// available options. +#ifndef SCUDO_USE_CUSTOM_CONFIG + +// Default configurations for various platforms. Note this is only enabled when +// there's no custom configuration in the build system. struct DefaultConfig { - using SizeClassMap = DefaultSizeClassMap; static const bool MaySupportMemoryTagging = true; + template <class A> using TSDRegistryT = TSDRegistryExT<A>; // Exclusive + struct Primary { + using SizeClassMap = DefaultSizeClassMap; #if SCUDO_CAN_USE_PRIMARY64 - typedef SizeClassAllocator64<DefaultConfig> Primary; - static const uptr PrimaryRegionSizeLog = 32U; - static const uptr PrimaryGroupSizeLog = 21U; - typedef uptr PrimaryCompactPtrT; - static const uptr PrimaryCompactPtrScale = 0; - static const bool PrimaryEnableRandomOffset = true; - static const uptr PrimaryMapSizeIncrement = 1UL << 18; + static const uptr RegionSizeLog = 32U; + static const uptr GroupSizeLog = 21U; + typedef uptr CompactPtrT; + static const uptr CompactPtrScale = 0; + static const bool EnableRandomOffset = true; + static const uptr MapSizeIncrement = 1UL << 18; #else - typedef SizeClassAllocator32<DefaultConfig> Primary; - static const uptr PrimaryRegionSizeLog = 19U; - static const uptr PrimaryGroupSizeLog = 19U; - typedef uptr PrimaryCompactPtrT; + static const uptr RegionSizeLog = 19U; + static const uptr GroupSizeLog = 19U; + typedef uptr CompactPtrT; #endif - static const s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN; - static const s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX; - - typedef MapAllocatorCache<DefaultConfig> SecondaryCache; - static const u32 SecondaryCacheEntriesArraySize = 32U; - static const u32 SecondaryCacheQuarantineSize = 0U; - static const u32 SecondaryCacheDefaultMaxEntriesCount = 32U; - static const uptr SecondaryCacheDefaultMaxEntrySize = 1UL << 19; - static const s32 SecondaryCacheMinReleaseToOsIntervalMs = INT32_MIN; - static const s32 SecondaryCacheMaxReleaseToOsIntervalMs = INT32_MAX; - - template <class A> using TSDRegistryT = TSDRegistryExT<A>; // Exclusive -}; -struct AndroidConfig { - using SizeClassMap = AndroidSizeClassMap; - 
static const bool MaySupportMemoryTagging = true; - + static const s32 MinReleaseToOsIntervalMs = INT32_MIN; + static const s32 MaxReleaseToOsIntervalMs = INT32_MAX; + }; #if SCUDO_CAN_USE_PRIMARY64 - typedef SizeClassAllocator64<AndroidConfig> Primary; - static const uptr PrimaryRegionSizeLog = 28U; - typedef u32 PrimaryCompactPtrT; - static const uptr PrimaryCompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG; - static const uptr PrimaryGroupSizeLog = 20U; - static const bool PrimaryEnableRandomOffset = true; - static const uptr PrimaryMapSizeIncrement = 1UL << 18; + template <typename Config> using PrimaryT = SizeClassAllocator64<Config>; #else - typedef SizeClassAllocator32<AndroidConfig> Primary; - static const uptr PrimaryRegionSizeLog = 18U; - static const uptr PrimaryGroupSizeLog = 18U; - typedef uptr PrimaryCompactPtrT; + template <typename Config> using PrimaryT = SizeClassAllocator32<Config>; #endif - static const s32 PrimaryMinReleaseToOsIntervalMs = 1000; - static const s32 PrimaryMaxReleaseToOsIntervalMs = 1000; - typedef MapAllocatorCache<AndroidConfig> SecondaryCache; - static const u32 SecondaryCacheEntriesArraySize = 256U; - static const u32 SecondaryCacheQuarantineSize = 32U; - static const u32 SecondaryCacheDefaultMaxEntriesCount = 32U; - static const uptr SecondaryCacheDefaultMaxEntrySize = 2UL << 20; - static const s32 SecondaryCacheMinReleaseToOsIntervalMs = 0; - static const s32 SecondaryCacheMaxReleaseToOsIntervalMs = 1000; + struct Secondary { + struct Cache { + static const u32 EntriesArraySize = 32U; + static const u32 QuarantineSize = 0U; + static const u32 DefaultMaxEntriesCount = 32U; + static const uptr DefaultMaxEntrySize = 1UL << 19; + static const s32 MinReleaseToOsIntervalMs = INT32_MIN; + static const s32 MaxReleaseToOsIntervalMs = INT32_MAX; + }; + template <typename Config> using CacheT = MapAllocatorCache<Config>; + }; + + template <typename Config> using SecondaryT = MapAllocator<Config>; +}; + +#endif // SCUDO_USE_CUSTOM_CONFIG +struct AndroidConfig { + static const bool MaySupportMemoryTagging = true; template <class A> using TSDRegistryT = TSDRegistrySharedT<A, 8U, 2U>; // Shared, max 8 TSDs. 
-}; - -struct AndroidSvelteConfig { - using SizeClassMap = SvelteSizeClassMap; - static const bool MaySupportMemoryTagging = false; + struct Primary { + using SizeClassMap = AndroidSizeClassMap; #if SCUDO_CAN_USE_PRIMARY64 - typedef SizeClassAllocator64<AndroidSvelteConfig> Primary; - static const uptr PrimaryRegionSizeLog = 27U; - typedef u32 PrimaryCompactPtrT; - static const uptr PrimaryCompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG; - static const uptr PrimaryGroupSizeLog = 18U; - static const bool PrimaryEnableRandomOffset = true; - static const uptr PrimaryMapSizeIncrement = 1UL << 18; + static const uptr RegionSizeLog = 28U; + typedef u32 CompactPtrT; + static const uptr CompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG; + static const uptr GroupSizeLog = 20U; + static const bool EnableRandomOffset = true; + static const uptr MapSizeIncrement = 1UL << 18; #else - typedef SizeClassAllocator32<AndroidSvelteConfig> Primary; - static const uptr PrimaryRegionSizeLog = 16U; - static const uptr PrimaryGroupSizeLog = 16U; - typedef uptr PrimaryCompactPtrT; + static const uptr RegionSizeLog = 18U; + static const uptr GroupSizeLog = 18U; + typedef uptr CompactPtrT; +#endif + static const s32 MinReleaseToOsIntervalMs = 1000; + static const s32 MaxReleaseToOsIntervalMs = 1000; + }; +#if SCUDO_CAN_USE_PRIMARY64 + template <typename Config> using PrimaryT = SizeClassAllocator64<Config>; +#else + template <typename Config> using PrimaryT = SizeClassAllocator32<Config>; #endif - static const s32 PrimaryMinReleaseToOsIntervalMs = 1000; - static const s32 PrimaryMaxReleaseToOsIntervalMs = 1000; - - typedef MapAllocatorCache<AndroidSvelteConfig> SecondaryCache; - static const u32 SecondaryCacheEntriesArraySize = 16U; - static const u32 SecondaryCacheQuarantineSize = 32U; - static const u32 SecondaryCacheDefaultMaxEntriesCount = 4U; - static const uptr SecondaryCacheDefaultMaxEntrySize = 1UL << 18; - static const s32 SecondaryCacheMinReleaseToOsIntervalMs = 0; - static const s32 SecondaryCacheMaxReleaseToOsIntervalMs = 0; - template <class A> - using TSDRegistryT = TSDRegistrySharedT<A, 2U, 1U>; // Shared, max 2 TSDs. + struct Secondary { + struct Cache { + static const u32 EntriesArraySize = 256U; + static const u32 QuarantineSize = 32U; + static const u32 DefaultMaxEntriesCount = 32U; + static const uptr DefaultMaxEntrySize = 2UL << 20; + static const s32 MinReleaseToOsIntervalMs = 0; + static const s32 MaxReleaseToOsIntervalMs = 1000; + }; + template <typename Config> using CacheT = MapAllocatorCache<Config>; + }; + + template <typename Config> using SecondaryT = MapAllocator<Config>; }; #if SCUDO_CAN_USE_PRIMARY64 struct FuchsiaConfig { - using SizeClassMap = FuchsiaSizeClassMap; static const bool MaySupportMemoryTagging = false; - - typedef SizeClassAllocator64<FuchsiaConfig> Primary; - static const uptr PrimaryRegionSizeLog = 30U; - static const uptr PrimaryGroupSizeLog = 21U; - typedef u32 PrimaryCompactPtrT; - static const bool PrimaryEnableRandomOffset = true; - static const uptr PrimaryMapSizeIncrement = 1UL << 18; - static const uptr PrimaryCompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG; - static const s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN; - static const s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX; - - typedef MapAllocatorNoCache SecondaryCache; template <class A> using TSDRegistryT = TSDRegistrySharedT<A, 8U, 4U>; // Shared, max 8 TSDs. 
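For readers wiring up SCUDO_USE_CUSTOM_CONFIG as described earlier in this file, the following is a minimal sketch of what a tree-local custom_scudo_config.h could look like under the new nested layout. The struct name MyPlatformConfig and every constant below are illustrative placeholders rather than tuning recommendations, and the header is assumed to be included from allocator_config.h, which already pulls in the size class maps, the primaries, the secondary and the TSD registries.

namespace scudo {

struct MyPlatformConfig {
  static const bool MaySupportMemoryTagging = false;
  template <class A>
  using TSDRegistryT = TSDRegistrySharedT<A, 8U, 4U>; // Shared, max 8 TSDs.

  struct Primary {
    using SizeClassMap = DefaultSizeClassMap;
    static const uptr RegionSizeLog = 30U;
    static const uptr GroupSizeLog = 20U;
    typedef uptr CompactPtrT;
    static const uptr CompactPtrScale = 0;
    static const bool EnableRandomOffset = true;
    static const uptr MapSizeIncrement = 1UL << 18;
    static const s32 MinReleaseToOsIntervalMs = INT32_MIN;
    static const s32 MaxReleaseToOsIntervalMs = INT32_MAX;
  };
  template <typename Config> using PrimaryT = SizeClassAllocator64<Config>;

  struct Secondary {
    template <typename Config> using CacheT = MapAllocatorNoCache<Config>;
  };
  template <typename Config> using SecondaryT = MapAllocator<Config>;
};

// The allocator instance is built from `Config`; aliasing `DefaultConfig` as
// well lets the unit tests exercise the same configuration.
typedef MyPlatformConfig Config;
typedef MyPlatformConfig DefaultConfig;

} // namespace scudo

Optional entries that a custom config omits fall back to the defaults listed in allocator_config.def, via the wrapper header introduced below.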
+ + struct Primary { + using SizeClassMap = FuchsiaSizeClassMap; +#if SCUDO_RISCV64 + // Support 39-bit VMA for riscv-64 + static const uptr RegionSizeLog = 28U; + static const uptr GroupSizeLog = 19U; + static const bool EnableContiguousRegions = false; +#else + static const uptr RegionSizeLog = 30U; + static const uptr GroupSizeLog = 21U; +#endif + typedef u32 CompactPtrT; + static const bool EnableRandomOffset = true; + static const uptr MapSizeIncrement = 1UL << 18; + static const uptr CompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG; + static const s32 MinReleaseToOsIntervalMs = INT32_MIN; + static const s32 MaxReleaseToOsIntervalMs = INT32_MAX; + }; + template <typename Config> using PrimaryT = SizeClassAllocator64<Config>; + + struct Secondary { + template <typename Config> using CacheT = MapAllocatorNoCache<Config>; + }; + template <typename Config> using SecondaryT = MapAllocator<Config>; }; struct TrustyConfig { - using SizeClassMap = TrustySizeClassMap; - static const bool MaySupportMemoryTagging = false; - - typedef SizeClassAllocator64<TrustyConfig> Primary; - // Some apps have 1 page of heap total so small regions are necessary. - static const uptr PrimaryRegionSizeLog = 10U; - static const uptr PrimaryGroupSizeLog = 10U; - typedef u32 PrimaryCompactPtrT; - static const bool PrimaryEnableRandomOffset = false; - // Trusty is extremely memory-constrained so minimally round up map calls. - static const uptr PrimaryMapSizeIncrement = 1UL << 4; - static const uptr PrimaryCompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG; - static const s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN; - static const s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX; - - typedef MapAllocatorNoCache SecondaryCache; + static const bool MaySupportMemoryTagging = true; template <class A> using TSDRegistryT = TSDRegistrySharedT<A, 1U, 1U>; // Shared, max 1 TSD. + + struct Primary { + using SizeClassMap = TrustySizeClassMap; + static const uptr RegionSizeLog = 28U; + static const uptr GroupSizeLog = 20U; + typedef u32 CompactPtrT; + static const bool EnableRandomOffset = false; + static const uptr MapSizeIncrement = 1UL << 12; + static const uptr CompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG; + static const s32 MinReleaseToOsIntervalMs = INT32_MIN; + static const s32 MaxReleaseToOsIntervalMs = INT32_MAX; + }; + template <typename Config> using PrimaryT = SizeClassAllocator64<Config>; + + struct Secondary { + template <typename Config> using CacheT = MapAllocatorNoCache<Config>; + }; + + template <typename Config> using SecondaryT = MapAllocator<Config>; }; #endif +#ifndef SCUDO_USE_CUSTOM_CONFIG + #if SCUDO_ANDROID typedef AndroidConfig Config; #elif SCUDO_FUCHSIA @@ -215,6 +204,8 @@ typedef TrustyConfig Config; typedef DefaultConfig Config; #endif +#endif // SCUDO_USE_CUSTOM_CONFIG + } // namespace scudo #endif // SCUDO_ALLOCATOR_CONFIG_H_ diff --git a/standalone/allocator_config_wrapper.h b/standalone/allocator_config_wrapper.h new file mode 100644 index 00000000000..5477236ac1f --- /dev/null +++ b/standalone/allocator_config_wrapper.h @@ -0,0 +1,149 @@ +//===-- allocator_config_wrapper.h ------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SCUDO_ALLOCATOR_CONFIG_WRAPPER_H_ +#define SCUDO_ALLOCATOR_CONFIG_WRAPPER_H_ + +#include "condition_variable.h" +#include "internal_defs.h" +#include "secondary.h" + +namespace { + +template <typename T> struct removeConst { + using type = T; +}; +template <typename T> struct removeConst<const T> { + using type = T; +}; + +// This is only used for SFINAE when detecting if a type is defined. +template <typename T> struct voidAdaptor { + using type = void; +}; + +// This is used for detecting the case that defines the flag with wrong type and +// it'll be viewed as undefined optional flag. +template <typename L, typename R> struct assertSameType { + template <typename, typename> struct isSame { + static constexpr bool value = false; + }; + template <typename T> struct isSame<T, T> { + static constexpr bool value = true; + }; + static_assert(isSame<L, R>::value, "Flag type mismatches"); + using type = R; +}; + +} // namespace + +namespace scudo { + +#define OPTIONAL_TEMPLATE(TYPE, NAME, DEFAULT, MEMBER) \ + template <typename Config, typename = TYPE> struct NAME##State { \ + static constexpr removeConst<TYPE>::type getValue() { return DEFAULT; } \ + }; \ + template <typename Config> \ + struct NAME##State< \ + Config, typename assertSameType<decltype(Config::MEMBER), TYPE>::type> { \ + static constexpr removeConst<TYPE>::type getValue() { \ + return Config::MEMBER; \ + } \ + }; + +#define OPTIONAL_TYPE_TEMPLATE(NAME, DEFAULT, MEMBER) \ + template <typename Config, typename Void = void> struct NAME##Type { \ + static constexpr bool enabled() { return false; } \ + using NAME = DEFAULT; \ + }; \ + template <typename Config> \ + struct NAME##Type<Config, \ + typename voidAdaptor<typename Config::MEMBER>::type> { \ + static constexpr bool enabled() { return true; } \ + using NAME = typename Config::MEMBER; \ + }; + +template <typename AllocatorConfig> struct BaseConfig { +#define BASE_REQUIRED_TEMPLATE_TYPE(NAME) \ + template <typename T> using NAME = typename AllocatorConfig::template NAME<T>; + +#define BASE_OPTIONAL(TYPE, NAME, DEFAULT) \ + OPTIONAL_TEMPLATE(TYPE, NAME, DEFAULT, NAME) \ + static constexpr removeConst<TYPE>::type get##NAME() { \ + return NAME##State<AllocatorConfig>::getValue(); \ + } + +#include "allocator_config.def" +}; // BaseConfig + +template <typename AllocatorConfig> struct PrimaryConfig { + // TODO: Pass this flag through template argument to remove this hard-coded + // function. 
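The OPTIONAL_TEMPLATE machinery above is dense, so here is a standalone sketch of the same detection idiom outside of Scudo (all names are hypothetical, only the standard library is used): the partial specialization is selected when the config defines the member with the expected type, and SFINAE falls back to the primary template, that is, to the default value, when the member is absent.

#include <cstdint>

template <typename L, typename R> struct AssertSameType {
  template <typename, typename> struct IsSame {
    static constexpr bool value = false;
  };
  template <typename T> struct IsSame<T, T> {
    static constexpr bool value = true;
  };
  static_assert(IsSame<L, R>::value, "Flag type mismatches");
  using type = R;
};

// Primary template: used when `Config::EntriesArraySize` does not exist.
template <typename Config, typename = const std::uint32_t>
struct EntriesArraySizeState {
  static constexpr std::uint32_t getValue() { return 0U; } // default value
};
// Specialization: only viable when the member exists with the expected type.
template <typename Config>
struct EntriesArraySizeState<
    Config, typename AssertSameType<decltype(Config::EntriesArraySize),
                                    const std::uint32_t>::type> {
  static constexpr std::uint32_t getValue() { return Config::EntriesArraySize; }
};

struct WithEntries { static const std::uint32_t EntriesArraySize = 32U; };
struct WithoutEntries {};

static_assert(EntriesArraySizeState<WithEntries>::getValue() == 32U, "");
static_assert(EntriesArraySizeState<WithoutEntries>::getValue() == 0U, "");

OPTIONAL_TYPE_TEMPLATE plays the same trick for nested types, using voidAdaptor instead of a value comparison.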
+ static constexpr bool getMaySupportMemoryTagging() { + return BaseConfig<AllocatorConfig>::getMaySupportMemoryTagging(); + } + +#define PRIMARY_REQUIRED_TYPE(NAME) \ + using NAME = typename AllocatorConfig::Primary::NAME; + +#define PRIMARY_REQUIRED(TYPE, NAME) \ + static constexpr removeConst<TYPE>::type get##NAME() { \ + return AllocatorConfig::Primary::NAME; \ + } + +#define PRIMARY_OPTIONAL(TYPE, NAME, DEFAULT) \ + OPTIONAL_TEMPLATE(TYPE, NAME, DEFAULT, NAME) \ + static constexpr removeConst<TYPE>::type get##NAME() { \ + return NAME##State<typename AllocatorConfig::Primary>::getValue(); \ + } + +#define PRIMARY_OPTIONAL_TYPE(NAME, DEFAULT) \ + OPTIONAL_TYPE_TEMPLATE(NAME, DEFAULT, NAME) \ + static constexpr bool has##NAME() { \ + return NAME##Type<typename AllocatorConfig::Primary>::enabled(); \ + } \ + using NAME = typename NAME##Type<typename AllocatorConfig::Primary>::NAME; + +#include "allocator_config.def" + +}; // PrimaryConfig + +template <typename AllocatorConfig> struct SecondaryConfig { + // TODO: Pass this flag through template argument to remove this hard-coded + // function. + static constexpr bool getMaySupportMemoryTagging() { + return BaseConfig<AllocatorConfig>::getMaySupportMemoryTagging(); + } + +#define SECONDARY_REQUIRED_TEMPLATE_TYPE(NAME) \ + template <typename T> \ + using NAME = typename AllocatorConfig::Secondary::template NAME<T>; +#include "allocator_config.def" + + struct CacheConfig { + // TODO: Pass this flag through template argument to remove this hard-coded + // function. + static constexpr bool getMaySupportMemoryTagging() { + return BaseConfig<AllocatorConfig>::getMaySupportMemoryTagging(); + } + +#define SECONDARY_CACHE_OPTIONAL(TYPE, NAME, DEFAULT) \ + OPTIONAL_TEMPLATE(TYPE, NAME, DEFAULT, Cache::NAME) \ + static constexpr removeConst<TYPE>::type get##NAME() { \ + return NAME##State<typename AllocatorConfig::Secondary>::getValue(); \ + } +#include "allocator_config.def" + }; // CacheConfig +}; // SecondaryConfig + +#undef OPTIONAL_TEMPLATE +#undef OPTIONAL_TEMPLATE_TYPE + +} // namespace scudo + +#endif // SCUDO_ALLOCATOR_CONFIG_WRAPPER_H_ diff --git a/standalone/atomic_helpers.h b/standalone/atomic_helpers.h index d88f5d7be64..a68ffd16291 100644 --- a/standalone/atomic_helpers.h +++ b/standalone/atomic_helpers.h @@ -133,10 +133,10 @@ inline void atomic_store_relaxed(volatile T *A, typename T::Type V) { } template <typename T> -inline typename T::Type atomic_compare_exchange(volatile T *A, - typename T::Type Cmp, - typename T::Type Xchg) { - atomic_compare_exchange_strong(A, &Cmp, Xchg, memory_order_acquire); +inline typename T::Type +atomic_compare_exchange_strong(volatile T *A, typename T::Type Cmp, + typename T::Type Xchg, memory_order MO) { + atomic_compare_exchange_strong(A, &Cmp, Xchg, MO); return Cmp; } diff --git a/standalone/benchmarks/malloc_benchmark.cpp b/standalone/benchmarks/malloc_benchmark.cpp index 2adec88da3e..4fb05b7614c 100644 --- a/standalone/benchmarks/malloc_benchmark.cpp +++ b/standalone/benchmarks/malloc_benchmark.cpp @@ -52,8 +52,6 @@ static const size_t MaxSize = 128 * 1024; // cleanly. BENCHMARK_TEMPLATE(BM_malloc_free, scudo::AndroidConfig) ->Range(MinSize, MaxSize); -BENCHMARK_TEMPLATE(BM_malloc_free, scudo::AndroidSvelteConfig) - ->Range(MinSize, MaxSize); #if SCUDO_CAN_USE_PRIMARY64 BENCHMARK_TEMPLATE(BM_malloc_free, scudo::FuchsiaConfig) ->Range(MinSize, MaxSize); @@ -99,8 +97,6 @@ static const size_t MaxIters = 32 * 1024; // cleanly. 
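Stepping back to the atomic_helpers.h hunk above: the renamed three-operand helper now takes an explicit memory order and still returns the value that was observed at the target, so callers detect success by comparing the return value against the expected one. A hedged usage sketch, assuming the types and enumerators from Scudo's atomic_helpers.h:

scudo::atomic_u32 Flag = {};
// Try to move Flag from 0 to 1; Old is whatever was actually stored there.
const scudo::u32 Old = scudo::atomic_compare_exchange_strong(
    &Flag, /*Cmp=*/0U, /*Xchg=*/1U, scudo::memory_order_acquire);
const bool Exchanged = (Old == 0U); // true only if this caller won the race.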
BENCHMARK_TEMPLATE(BM_malloc_free_loop, scudo::AndroidConfig) ->Range(MinIters, MaxIters); -BENCHMARK_TEMPLATE(BM_malloc_free_loop, scudo::AndroidSvelteConfig) - ->Range(MinIters, MaxIters); #if SCUDO_CAN_USE_PRIMARY64 BENCHMARK_TEMPLATE(BM_malloc_free_loop, scudo::FuchsiaConfig) ->Range(MinIters, MaxIters); diff --git a/standalone/checksum.cpp b/standalone/checksum.cpp index 2c277391a2e..efa4055bcbc 100644 --- a/standalone/checksum.cpp +++ b/standalone/checksum.cpp @@ -19,6 +19,8 @@ #else #include <sys/auxv.h> #endif +#elif defined(__loongarch__) +#include <sys/auxv.h> #endif namespace scudo { @@ -75,6 +77,20 @@ bool hasHardwareCRC32() { return !!(getauxval(AT_HWCAP) & HWCAP_CRC32); #endif // SCUDO_FUCHSIA } +#elif defined(__loongarch__) +// The definition is only pulled in by <sys/auxv.h> since glibc 2.38, so +// supply it if missing. +#ifndef HWCAP_LOONGARCH_CRC32 +#define HWCAP_LOONGARCH_CRC32 (1 << 6) +#endif +// Query HWCAP for platform capability, according to *Software Development and +// Build Convention for LoongArch Architectures* v0.1, Section 9.1. +// +// Link: +// https://github.com/loongson/la-softdev-convention/blob/v0.1/la-softdev-convention.adoc#kernel-development +bool hasHardwareCRC32() { + return !!(getauxval(AT_HWCAP) & HWCAP_LOONGARCH_CRC32); +} #else // No hardware CRC32 implemented in Scudo for other architectures. bool hasHardwareCRC32() { return false; } diff --git a/standalone/checksum.h b/standalone/checksum.h index f8eda81fd91..32ca372b097 100644 --- a/standalone/checksum.h +++ b/standalone/checksum.h @@ -30,6 +30,10 @@ #include <arm_acle.h> #define CRC32_INTRINSIC FIRST_32_SECOND_64(__crc32cw, __crc32cd) #endif +#ifdef __loongarch__ +#include <larchintrin.h> +#define CRC32_INTRINSIC FIRST_32_SECOND_64(__crcc_w_w_w, __crcc_w_d_w) +#endif namespace scudo { diff --git a/standalone/chunk.h b/standalone/chunk.h index 32874a8df64..9228df04718 100644 --- a/standalone/chunk.h +++ b/standalone/chunk.h @@ -128,19 +128,6 @@ inline void loadHeader(u32 Cookie, const void *Ptr, reportHeaderCorruption(const_cast<void *>(Ptr)); } -inline void compareExchangeHeader(u32 Cookie, void *Ptr, - UnpackedHeader *NewUnpackedHeader, - UnpackedHeader *OldUnpackedHeader) { - NewUnpackedHeader->Checksum = - computeHeaderChecksum(Cookie, Ptr, NewUnpackedHeader); - PackedHeader NewPackedHeader = bit_cast<PackedHeader>(*NewUnpackedHeader); - PackedHeader OldPackedHeader = bit_cast<PackedHeader>(*OldUnpackedHeader); - if (UNLIKELY(!atomic_compare_exchange_strong( - getAtomicHeader(Ptr), &OldPackedHeader, NewPackedHeader, - memory_order_relaxed))) - reportHeaderRace(Ptr); -} - inline bool isValid(u32 Cookie, const void *Ptr, UnpackedHeader *NewUnpackedHeader) { PackedHeader NewPackedHeader = atomic_load_relaxed(getConstAtomicHeader(Ptr)); diff --git a/standalone/combined.h b/standalone/combined.h index 52dbf6b1452..927513dea92 100644 --- a/standalone/combined.h +++ b/standalone/combined.h @@ -9,16 +9,19 @@ #ifndef SCUDO_COMBINED_H_ #define SCUDO_COMBINED_H_ +#include "allocator_config_wrapper.h" +#include "atomic_helpers.h" #include "chunk.h" #include "common.h" #include "flags.h" #include "flags_parser.h" #include "local_cache.h" +#include "mem_map.h" #include "memtag.h" +#include "mutex.h" #include "options.h" #include "quarantine.h" #include "report.h" -#include "rss_limit_checker.h" #include "secondary.h" #include "stack_depot.h" #include "string_utils.h" @@ -43,13 +46,17 @@ extern "C" size_t android_unsafe_frame_pointer_chase(scudo::uptr *buf, namespace scudo { -template <class Params, 
void (*PostInitCallback)(void) = EmptyCallback> +template <class Config, void (*PostInitCallback)(void) = EmptyCallback> class Allocator { public: - using PrimaryT = typename Params::Primary; + using AllocatorConfig = BaseConfig<Config>; + using PrimaryT = + typename AllocatorConfig::template PrimaryT<PrimaryConfig<Config>>; + using SecondaryT = + typename AllocatorConfig::template SecondaryT<SecondaryConfig<Config>>; using CacheT = typename PrimaryT::CacheT; - typedef Allocator<Params, PostInitCallback> ThisT; - typedef typename Params::template TSDRegistryT<ThisT> TSDRegistryT; + typedef Allocator<Config, PostInitCallback> ThisT; + typedef typename AllocatorConfig::template TSDRegistryT<ThisT> TSDRegistryT; void callPostInitCallback() { pthread_once(&PostInitNonce, PostInitCallback); @@ -67,14 +74,13 @@ public: if (UNLIKELY(Header.State != Chunk::State::Quarantined)) reportInvalidChunkState(AllocatorAction::Recycling, Ptr); - Chunk::UnpackedHeader NewHeader = Header; - NewHeader.State = Chunk::State::Available; - Chunk::compareExchangeHeader(Allocator.Cookie, Ptr, &NewHeader, &Header); + Header.State = Chunk::State::Available; + Chunk::storeHeader(Allocator.Cookie, Ptr, &Header); - if (allocatorSupportsMemoryTagging<Params>()) + if (allocatorSupportsMemoryTagging<AllocatorConfig>()) Ptr = untagPointer(Ptr); - void *BlockBegin = Allocator::getBlockBegin(Ptr, &NewHeader); - Cache.deallocate(NewHeader.ClassId, BlockBegin); + void *BlockBegin = Allocator::getBlockBegin(Ptr, &Header); + Cache.deallocate(Header.ClassId, BlockBegin); } // We take a shortcut when allocating a quarantine batch by working with the @@ -98,7 +104,8 @@ public: // Reset tag to 0 as this chunk may have been previously used for a tagged // user allocation. - if (UNLIKELY(useMemoryTagging<Params>(Allocator.Primary.Options.load()))) + if (UNLIKELY(useMemoryTagging<AllocatorConfig>( + Allocator.Primary.Options.load()))) storeTags(reinterpret_cast<uptr>(Ptr), reinterpret_cast<uptr>(Ptr) + sizeof(QuarantineBatch)); @@ -117,9 +124,8 @@ public: DCHECK_EQ(Header.Offset, 0); DCHECK_EQ(Header.SizeOrUnusedBytes, sizeof(QuarantineBatch)); - Chunk::UnpackedHeader NewHeader = Header; - NewHeader.State = Chunk::State::Available; - Chunk::compareExchangeHeader(Allocator.Cookie, Ptr, &NewHeader, &Header); + Header.State = Chunk::State::Available; + Chunk::storeHeader(Allocator.Cookie, Ptr, &Header); Cache.deallocate(QuarantineClassId, reinterpret_cast<void *>(reinterpret_cast<uptr>(Ptr) - Chunk::getHeaderSize())); @@ -148,9 +154,6 @@ public: initFlags(); reportUnrecognizedFlags(); - RssChecker.init(scudo::getFlags()->soft_rss_limit_mb, - scudo::getFlags()->hard_rss_limit_mb); - // Store some flags locally. if (getFlags()->may_return_null) Primary.Options.set(OptionBit::MayReturnNull); @@ -162,23 +165,37 @@ public: Primary.Options.set(OptionBit::DeallocTypeMismatch); if (getFlags()->delete_size_mismatch) Primary.Options.set(OptionBit::DeleteSizeMismatch); - if (allocatorSupportsMemoryTagging<Params>() && + if (allocatorSupportsMemoryTagging<AllocatorConfig>() && systemSupportsMemoryTagging()) Primary.Options.set(OptionBit::UseMemoryTagging); - Primary.Options.set(OptionBit::UseOddEvenTags); QuarantineMaxChunkSize = static_cast<u32>(getFlags()->quarantine_max_chunk_size); Stats.init(); + // TODO(chiahungduan): Given that we support setting the default value in + // the PrimaryConfig and CacheConfig, consider to deprecate the use of + // `release_to_os_interval_ms` flag. 
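To make the new plumbing above concrete: the combined allocator is now parameterized by a single config struct, and the BaseConfig/PrimaryConfig/SecondaryConfig wrappers carve out the slice that each subcomponent sees. Below is a hedged usage sketch only; the production C wrappers use a global, zero-initialized instance rather than the free function shown here, and DefaultConfig is the stock configuration when no custom config is supplied.

#include "combined.h"

using AllocatorT = scudo::Allocator<scudo::DefaultConfig>;
static AllocatorT Instance; // Static storage: zero-initialized, as the wrappers rely on.

void exampleRoundTrip() {
  // allocate() lazily initializes the thread-specific data on first use.
  void *P = Instance.allocate(128, scudo::Chunk::Origin::Malloc);
  if (P)
    Instance.deallocate(P, scudo::Chunk::Origin::Malloc);
  // ForceAll drains the per-thread caches before releasing pages to the OS.
  Instance.releaseToOS(scudo::ReleaseToOS::ForceAll);
}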
const s32 ReleaseToOsIntervalMs = getFlags()->release_to_os_interval_ms; Primary.init(ReleaseToOsIntervalMs); Secondary.init(&Stats, ReleaseToOsIntervalMs); Quarantine.init( static_cast<uptr>(getFlags()->quarantine_size_kb << 10), static_cast<uptr>(getFlags()->thread_local_quarantine_size_kb << 10)); + } + + void enableRingBuffer() NO_THREAD_SAFETY_ANALYSIS { + AllocationRingBuffer *RB = getRingBuffer(); + if (RB) + RB->Depot->enable(); + RingBufferInitLock.unlock(); + } - initRingBuffer(); + void disableRingBuffer() NO_THREAD_SAFETY_ANALYSIS { + RingBufferInitLock.lock(); + AllocationRingBuffer *RB = getRingBuffer(); + if (RB) + RB->Depot->disable(); } // Initialize the embedded GWP-ASan instance. Requires the main allocator to @@ -228,6 +245,7 @@ public: } void unmapTestOnly() { + unmapRingBuffer(); TSDRegistry.unmapTestOnly(this); Primary.unmapTestOnly(); Secondary.unmapTestOnly(); @@ -239,6 +257,7 @@ public: } TSDRegistryT *getTSDRegistry() { return &TSDRegistry; } + QuarantineT *getQuarantine() { return &Quarantine; } // The Cache must be provided zero-initialized. void initCache(CacheT *Cache) { Cache->init(&Stats, &Primary); } @@ -249,13 +268,22 @@ public: // - unlinking the local stats from the global ones (destroying the cache does // the last two items). void commitBack(TSD<ThisT> *TSD) { + TSD->assertLocked(/*BypassCheck=*/true); Quarantine.drain(&TSD->getQuarantineCache(), QuarantineCallback(*this, TSD->getCache())); TSD->getCache().destroy(&Stats); } + void drainCache(TSD<ThisT> *TSD) { + TSD->assertLocked(/*BypassCheck=*/true); + Quarantine.drainAndRecycle(&TSD->getQuarantineCache(), + QuarantineCallback(*this, TSD->getCache())); + TSD->getCache().drain(); + } + void drainCaches() { TSDRegistry.drainCaches(this); } + ALWAYS_INLINE void *getHeaderTaggedPointer(void *Ptr) { - if (!allocatorSupportsMemoryTagging<Params>()) + if (!allocatorSupportsMemoryTagging<AllocatorConfig>()) return Ptr; auto UntaggedPtr = untagPointer(Ptr); if (UntaggedPtr != Ptr) @@ -267,7 +295,7 @@ public: } ALWAYS_INLINE uptr addHeaderTag(uptr Ptr) { - if (!allocatorSupportsMemoryTagging<Params>()) + if (!allocatorSupportsMemoryTagging<AllocatorConfig>()) return Ptr; return addFixedTag(Ptr, 2); } @@ -276,7 +304,7 @@ public: return reinterpret_cast<void *>(addHeaderTag(reinterpret_cast<uptr>(Ptr))); } - NOINLINE u32 collectStackTrace() { + NOINLINE u32 collectStackTrace(UNUSED StackDepot *Depot) { #ifdef HAVE_ANDROID_UNSAFE_FRAME_POINTER_CHASE // Discard collectStackTrace() frame and allocator function frame. 
constexpr uptr DiscardFrames = 2; @@ -284,13 +312,13 @@ public: uptr Size = android_unsafe_frame_pointer_chase(Stack, MaxTraceSize + DiscardFrames); Size = Min<uptr>(Size, MaxTraceSize + DiscardFrames); - return Depot.insert(Stack + Min<uptr>(DiscardFrames, Size), Stack + Size); + return Depot->insert(Stack + Min<uptr>(DiscardFrames, Size), Stack + Size); #else return 0; #endif } - uptr computeOddEvenMaskForPointerMaybe(Options Options, uptr Ptr, + uptr computeOddEvenMaskForPointerMaybe(const Options &Options, uptr Ptr, uptr ClassId) { if (!Options.get(OptionBit::UseOddEvenTags)) return 0; @@ -320,8 +348,6 @@ public: #ifdef GWP_ASAN_HOOKS if (UNLIKELY(GuardedAlloc.shouldSample())) { if (void *Ptr = GuardedAlloc.allocate(Size, Alignment)) { - if (UNLIKELY(&__scudo_allocate_hook)) - __scudo_allocate_hook(Ptr, Size); Stats.lock(); Stats.add(StatAllocated, GuardedAllocSlotSize); Stats.sub(StatFree, GuardedAllocSlotSize); @@ -354,40 +380,23 @@ public: } DCHECK_LE(Size, NeededSize); - switch (RssChecker.getRssLimitExceeded()) { - case RssLimitChecker::Neither: - break; - case RssLimitChecker::Soft: - if (Options.get(OptionBit::MayReturnNull)) - return nullptr; - reportSoftRSSLimit(RssChecker.getSoftRssLimit()); - break; - case RssLimitChecker::Hard: - reportHardRSSLimit(RssChecker.getHardRssLimit()); - break; - } - void *Block = nullptr; uptr ClassId = 0; uptr SecondaryBlockEnd = 0; if (LIKELY(PrimaryT::canAllocate(NeededSize))) { ClassId = SizeClassMap::getClassIdBySize(NeededSize); DCHECK_NE(ClassId, 0U); - bool UnlockRequired; - auto *TSD = TSDRegistry.getTSDAndLock(&UnlockRequired); + typename TSDRegistryT::ScopedTSD TSD(TSDRegistry); Block = TSD->getCache().allocate(ClassId); - // If the allocation failed, the most likely reason with a 32-bit primary - // is the region being full. In that event, retry in each successively - // larger class until it fits. If it fails to fit in the largest class, - // fallback to the Secondary. + // If the allocation failed, retry in each successively larger class until + // it fits. If it fails to fit in the largest class, fallback to the + // Secondary. if (UNLIKELY(!Block)) { while (ClassId < SizeClassMap::LargestClassId && !Block) Block = TSD->getCache().allocate(++ClassId); if (!Block) ClassId = 0; } - if (UnlockRequired) - TSD->unlock(); } if (UNLIKELY(ClassId == 0)) { Block = Secondary.allocate(Options, Size, Alignment, &SecondaryBlockEnd, @@ -397,6 +406,7 @@ public: if (UNLIKELY(!Block)) { if (Options.get(OptionBit::MayReturnNull)) return nullptr; + printStats(); reportOutOfMemory(NeededSize); } @@ -418,7 +428,7 @@ public: // // When memory tagging is enabled, zeroing the contents is done as part of // setting the tag. 
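The allocate() hunk above swaps the manual getTSDAndLock()/unlock() pairing for `typename TSDRegistryT::ScopedTSD TSD(TSDRegistry)`. The real guard lives in the TSD registry headers; the following is only a sketch of the RAII shape it provides, with assumed names and signatures, to show why the UnlockRequired bookkeeping can disappear from the fast path.

// Sketch only: acquire a locked TSD on construction, release it on scope exit,
// so early returns between the two points can no longer leak the lock.
template <typename RegistryT, typename TSDT> class ScopedTSDSketch {
public:
  explicit ScopedTSDSketch(RegistryT &Registry) {
    CurrentTSD = Registry.getTSDAndLock(&UnlockRequired); // assumed signature
  }
  ~ScopedTSDSketch() {
    if (UnlockRequired)
      CurrentTSD->unlock();
  }
  TSDT *operator->() { return CurrentTSD; }

private:
  TSDT *CurrentTSD = nullptr;
  bool UnlockRequired = false;
};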
- if (UNLIKELY(useMemoryTagging<Params>(Options))) { + if (UNLIKELY(useMemoryTagging<AllocatorConfig>(Options))) { uptr PrevUserPtr; Chunk::UnpackedHeader Header; const uptr BlockSize = PrimaryT::getSizeByClassId(ClassId); @@ -500,7 +510,7 @@ public: } else { Block = addHeaderTag(Block); Ptr = addHeaderTag(Ptr); - if (UNLIKELY(useMemoryTagging<Params>(Options))) { + if (UNLIKELY(useMemoryTagging<AllocatorConfig>(Options))) { storeTags(reinterpret_cast<uptr>(Block), reinterpret_cast<uptr>(Ptr)); storeSecondaryAllocationStackMaybe(Options, Ptr, Size); } @@ -526,14 +536,14 @@ public: Chunk::SizeOrUnusedBytesMask; Chunk::storeHeader(Cookie, Ptr, &Header); - if (UNLIKELY(&__scudo_allocate_hook)) - __scudo_allocate_hook(TaggedPtr, Size); - return TaggedPtr; } NOINLINE void deallocate(void *Ptr, Chunk::Origin Origin, uptr DeleteSize = 0, UNUSED uptr Alignment = MinAlignment) { + if (UNLIKELY(!Ptr)) + return; + // For a deallocation, we only ensure minimal initialization, meaning thread // local data will be left uninitialized for now (when using ELF TLS). The // fallback cache will be used instead. This is a workaround for a situation @@ -542,12 +552,6 @@ public: // being destroyed properly. Any other heap operation will do a full init. initThreadMaybe(/*MinimalInit=*/true); - if (UNLIKELY(&__scudo_deallocate_hook)) - __scudo_deallocate_hook(Ptr); - - if (UNLIKELY(!Ptr)) - return; - #ifdef GWP_ASAN_HOOKS if (UNLIKELY(GuardedAlloc.pointerIsMine(Ptr))) { GuardedAlloc.deallocate(Ptr); @@ -626,48 +630,47 @@ public: if (UNLIKELY(!isAligned(reinterpret_cast<uptr>(OldPtr), MinAlignment))) reportMisalignedPointer(AllocatorAction::Reallocating, OldPtr); - Chunk::UnpackedHeader OldHeader; - Chunk::loadHeader(Cookie, OldPtr, &OldHeader); + Chunk::UnpackedHeader Header; + Chunk::loadHeader(Cookie, OldPtr, &Header); - if (UNLIKELY(OldHeader.State != Chunk::State::Allocated)) + if (UNLIKELY(Header.State != Chunk::State::Allocated)) reportInvalidChunkState(AllocatorAction::Reallocating, OldPtr); // Pointer has to be allocated with a malloc-type function. Some // applications think that it is OK to realloc a memalign'ed pointer, which // will trigger this check. It really isn't. if (Options.get(OptionBit::DeallocTypeMismatch)) { - if (UNLIKELY(OldHeader.OriginOrWasZeroed != Chunk::Origin::Malloc)) + if (UNLIKELY(Header.OriginOrWasZeroed != Chunk::Origin::Malloc)) reportDeallocTypeMismatch(AllocatorAction::Reallocating, OldPtr, - OldHeader.OriginOrWasZeroed, + Header.OriginOrWasZeroed, Chunk::Origin::Malloc); } - void *BlockBegin = getBlockBegin(OldTaggedPtr, &OldHeader); + void *BlockBegin = getBlockBegin(OldTaggedPtr, &Header); uptr BlockEnd; uptr OldSize; - const uptr ClassId = OldHeader.ClassId; + const uptr ClassId = Header.ClassId; if (LIKELY(ClassId)) { BlockEnd = reinterpret_cast<uptr>(BlockBegin) + SizeClassMap::getSizeByClassId(ClassId); - OldSize = OldHeader.SizeOrUnusedBytes; + OldSize = Header.SizeOrUnusedBytes; } else { BlockEnd = SecondaryT::getBlockEnd(BlockBegin); OldSize = BlockEnd - (reinterpret_cast<uptr>(OldTaggedPtr) + - OldHeader.SizeOrUnusedBytes); + Header.SizeOrUnusedBytes); } // If the new chunk still fits in the previously allocated block (with a // reasonable delta), we just keep the old block, and update the chunk // header to reflect the size change. 
if (reinterpret_cast<uptr>(OldTaggedPtr) + NewSize <= BlockEnd) { if (NewSize > OldSize || (OldSize - NewSize) < getPageSizeCached()) { - Chunk::UnpackedHeader NewHeader = OldHeader; - NewHeader.SizeOrUnusedBytes = + Header.SizeOrUnusedBytes = (ClassId ? NewSize : BlockEnd - (reinterpret_cast<uptr>(OldTaggedPtr) + NewSize)) & Chunk::SizeOrUnusedBytesMask; - Chunk::compareExchangeHeader(Cookie, OldPtr, &NewHeader, &OldHeader); - if (UNLIKELY(useMemoryTagging<Params>(Options))) { + Chunk::storeHeader(Cookie, OldPtr, &Header); + if (UNLIKELY(useMemoryTagging<AllocatorConfig>(Options))) { if (ClassId) { resizeTaggedChunk(reinterpret_cast<uptr>(OldTaggedPtr) + OldSize, reinterpret_cast<uptr>(OldTaggedPtr) + NewSize, @@ -688,9 +691,7 @@ public: void *NewPtr = allocate(NewSize, Chunk::Origin::Malloc, Alignment); if (LIKELY(NewPtr)) { memcpy(NewPtr, OldTaggedPtr, Min(NewSize, OldSize)); - if (UNLIKELY(&__scudo_deallocate_hook)) - __scudo_deallocate_hook(OldTaggedPtr); - quarantineOrDeallocateChunk(Options, OldTaggedPtr, &OldHeader, OldSize); + quarantineOrDeallocateChunk(Options, OldTaggedPtr, &Header, OldSize); } return NewPtr; } @@ -708,10 +709,12 @@ public: Quarantine.disable(); Primary.disable(); Secondary.disable(); + disableRingBuffer(); } void enable() NO_THREAD_SAFETY_ANALYSIS { initThreadMaybe(); + enableRingBuffer(); Secondary.enable(); Primary.enable(); Quarantine.enable(); @@ -745,9 +748,18 @@ public: Str.output(); } - void releaseToOS() { + void printFragmentationInfo() { + ScopedString Str; + Primary.getFragmentationInfo(&Str); + // Secondary allocator dumps the fragmentation data in getStats(). + Str.output(); + } + + void releaseToOS(ReleaseToOS ReleaseType) { initThreadMaybe(); - Primary.releaseToOS(); + if (ReleaseType == ReleaseToOS::ForceAll) + drainCaches(); + Primary.releaseToOS(ReleaseType); Secondary.releaseToOS(); } @@ -761,8 +773,9 @@ public: Base = untagPointer(Base); const uptr From = Base; const uptr To = Base + Size; - bool MayHaveTaggedPrimary = allocatorSupportsMemoryTagging<Params>() && - systemSupportsMemoryTagging(); + bool MayHaveTaggedPrimary = + allocatorSupportsMemoryTagging<AllocatorConfig>() && + systemSupportsMemoryTagging(); auto Lambda = [this, From, To, MayHaveTaggedPrimary, Callback, Arg](uptr Block) { if (Block < From || Block >= To) @@ -783,9 +796,9 @@ public: } if (Header.State == Chunk::State::Allocated) { uptr TaggedChunk = Chunk; - if (allocatorSupportsMemoryTagging<Params>()) + if (allocatorSupportsMemoryTagging<AllocatorConfig>()) TaggedChunk = untagPointer(TaggedChunk); - if (useMemoryTagging<Params>(Primary.Options.load())) + if (useMemoryTagging<AllocatorConfig>(Primary.Options.load())) TaggedChunk = loadTag(Chunk); Callback(TaggedChunk, getSize(reinterpret_cast<void *>(Chunk), &Header), Arg); @@ -836,10 +849,15 @@ public: // for it, which then forces realloc to copy the usable size of a chunk as // opposed to its actual size. uptr getUsableSize(const void *Ptr) { - initThreadMaybe(); if (UNLIKELY(!Ptr)) return 0; + return getAllocSize(Ptr); + } + + uptr getAllocSize(const void *Ptr) { + initThreadMaybe(); + #ifdef GWP_ASAN_HOOKS if (UNLIKELY(GuardedAlloc.pointerIsMine(Ptr))) return GuardedAlloc.getSize(Ptr); @@ -848,9 +866,11 @@ public: Ptr = getHeaderTaggedPointer(const_cast<void *>(Ptr)); Chunk::UnpackedHeader Header; Chunk::loadHeader(Cookie, Ptr, &Header); - // Getting the usable size of a chunk only makes sense if it's allocated. + + // Getting the alloc size of a chunk only makes sense if it's allocated. 
if (UNLIKELY(Header.State != Chunk::State::Allocated)) reportInvalidChunkState(AllocatorAction::Sizing, const_cast<void *>(Ptr)); + return getSize(Ptr, &Header); } @@ -876,15 +896,8 @@ public: Header.State == Chunk::State::Allocated; } - void setRssLimitsTestOnly(int SoftRssLimitMb, int HardRssLimitMb, - bool MayReturnNull) { - RssChecker.init(SoftRssLimitMb, HardRssLimitMb); - if (MayReturnNull) - Primary.Options.set(OptionBit::MayReturnNull); - } - bool useMemoryTaggingTestOnly() const { - return useMemoryTagging<Params>(Primary.Options.load()); + return useMemoryTagging<AllocatorConfig>(Primary.Options.load()); } void disableMemoryTagging() { // If we haven't been initialized yet, we need to initialize now in order to @@ -894,7 +907,7 @@ public: // callback), which may cause mappings to be created with memory tagging // enabled. TSDRegistry.initOnceMaybe(this); - if (allocatorSupportsMemoryTagging<Params>()) { + if (allocatorSupportsMemoryTagging<AllocatorConfig>()) { Secondary.disableMemoryTagging(); Primary.Options.clear(OptionBit::UseMemoryTagging); } @@ -902,13 +915,15 @@ public: void setTrackAllocationStacks(bool Track) { initThreadMaybe(); - if (getFlags()->allocation_ring_buffer_size == 0) { + if (getFlags()->allocation_ring_buffer_size <= 0) { DCHECK(!Primary.Options.load().get(OptionBit::TrackAllocationStacks)); return; } - if (Track) + + if (Track) { + initRingBufferMaybe(); Primary.Options.set(OptionBit::TrackAllocationStacks); - else + } else Primary.Options.clear(OptionBit::TrackAllocationStacks); } @@ -925,8 +940,16 @@ public: Primary.Options.clear(OptionBit::AddLargeAllocationSlack); } - const char *getStackDepotAddress() const { - return reinterpret_cast<const char *>(&Depot); + const char *getStackDepotAddress() { + initThreadMaybe(); + AllocationRingBuffer *RB = getRingBuffer(); + return RB ? reinterpret_cast<char *>(RB->Depot) : nullptr; + } + + uptr getStackDepotSize() { + initThreadMaybe(); + AllocationRingBuffer *RB = getRingBuffer(); + return RB ? RB->StackDepotSize : 0; } const char *getRegionInfoArrayAddress() const { @@ -939,26 +962,15 @@ public: const char *getRingBufferAddress() { initThreadMaybe(); - return RawRingBuffer; + return reinterpret_cast<char *>(getRingBuffer()); } uptr getRingBufferSize() { initThreadMaybe(); - auto *RingBuffer = getRingBuffer(); - return RingBuffer ? ringBufferSizeInBytes(RingBuffer->Size) : 0; - } - - static bool setRingBufferSizeForBuffer(char *Buffer, size_t Size) { - // Need at least one entry. - if (Size < sizeof(AllocationRingBuffer) + - sizeof(typename AllocationRingBuffer::Entry)) { - return false; - } - AllocationRingBuffer *RingBuffer = - reinterpret_cast<AllocationRingBuffer *>(Buffer); - RingBuffer->Size = (Size - sizeof(AllocationRingBuffer)) / - sizeof(typename AllocationRingBuffer::Entry); - return true; + AllocationRingBuffer *RB = getRingBuffer(); + return RB && RB->RingBufferElements + ? 
ringBufferSizeInBytes(RB->RingBufferElements) + : 0; } static const uptr MaxTraceSize = 64; @@ -969,20 +981,35 @@ public: if (!Depot->find(Hash, &RingPos, &Size)) return; for (unsigned I = 0; I != Size && I != MaxTraceSize; ++I) - Trace[I] = static_cast<uintptr_t>((*Depot)[RingPos + I]); + Trace[I] = static_cast<uintptr_t>(Depot->at(RingPos + I)); } static void getErrorInfo(struct scudo_error_info *ErrorInfo, uintptr_t FaultAddr, const char *DepotPtr, - const char *RegionInfoPtr, const char *RingBufferPtr, + size_t DepotSize, const char *RegionInfoPtr, + const char *RingBufferPtr, size_t RingBufferSize, const char *Memory, const char *MemoryTags, uintptr_t MemoryAddr, size_t MemorySize) { + // N.B. we need to support corrupted data in any of the buffers here. We get + // this information from an external process (the crashing process) that + // should not be able to crash the crash dumper (crash_dump on Android). + // See also the get_error_info_fuzzer. *ErrorInfo = {}; - if (!allocatorSupportsMemoryTagging<Params>() || + if (!allocatorSupportsMemoryTagging<AllocatorConfig>() || MemoryAddr + MemorySize < MemoryAddr) return; - auto *Depot = reinterpret_cast<const StackDepot *>(DepotPtr); + const StackDepot *Depot = nullptr; + if (DepotPtr) { + // check for corrupted StackDepot. First we need to check whether we can + // read the metadata, then whether the metadata matches the size. + if (DepotSize < sizeof(*Depot)) + return; + Depot = reinterpret_cast<const StackDepot *>(DepotPtr); + if (!Depot->isValid(DepotSize)) + return; + } + size_t NextErrorReport = 0; // Check for OOB in the current block and the two surrounding blocks. Beyond @@ -995,7 +1022,7 @@ public: // Check the ring buffer. For primary allocations this will only find UAF; // for secondary allocations we can find either UAF or OOB. getRingBufferErrorInfo(ErrorInfo, NextErrorReport, FaultAddr, Depot, - RingBufferPtr); + RingBufferPtr, RingBufferSize); // Check for OOB in the 28 blocks surrounding the 3 we checked earlier. // Beyond that we are likely to hit false positives. @@ -1006,7 +1033,6 @@ public: } private: - using SecondaryT = MapAllocator<Params>; typedef typename PrimaryT::SizeClassMap SizeClassMap; static const uptr MinAlignmentLog = SCUDO_MIN_ALIGNMENT_LOG; @@ -1018,7 +1044,7 @@ private: static_assert(MinAlignment >= sizeof(Chunk::PackedHeader), "Minimal alignment must at least cover a chunk header."); - static_assert(!allocatorSupportsMemoryTagging<Params>() || + static_assert(!allocatorSupportsMemoryTagging<AllocatorConfig>() || MinAlignment >= archMemoryTagGranuleSize(), ""); @@ -1043,15 +1069,12 @@ private: QuarantineT Quarantine; TSDRegistryT TSDRegistry; pthread_once_t PostInitNonce = PTHREAD_ONCE_INIT; - RssLimitChecker RssChecker; #ifdef GWP_ASAN_HOOKS gwp_asan::GuardedPoolAllocator GuardedAlloc; uptr GuardedAllocSlotSize = 0; #endif // GWP_ASAN_HOOKS - StackDepot Depot; - struct AllocationRingBuffer { struct Entry { atomic_uptr Ptr; @@ -1061,15 +1084,31 @@ private: atomic_u32 DeallocationTrace; atomic_u32 DeallocationTid; }; - + StackDepot *Depot = nullptr; + uptr StackDepotSize = 0; + MemMapT RawRingBufferMap; + MemMapT RawStackDepotMap; + u32 RingBufferElements = 0; atomic_uptr Pos; - u32 Size; // An array of Size (at least one) elements of type Entry is immediately // following to this struct. 
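As the comment above notes, the entries are not a declared member: the header struct and its entries live in a single mapping, with the entry array starting right past the header. A small illustrative sketch of the size and addressing math this layout implies; it mirrors ringBufferSizeInBytes() and getRingBufferEntry() defined further down, and RB, N and I are placeholders.

// Layout of the scudo:ring_buffer mapping, for RingBufferElements == N:
//
//   [ AllocationRingBuffer header | Entry 0 | Entry 1 | ... | Entry N-1 ]
//
// Total bytes needed for N elements, before rounding up to a page:
//   sizeof(AllocationRingBuffer) + N * sizeof(AllocationRingBuffer::Entry)
//
// Address of entry I, with RB pointing at the header and I already reduced
// modulo N by the caller:
char *EntryBase = reinterpret_cast<char *>(RB) + sizeof(AllocationRingBuffer);
auto *Entry =
    reinterpret_cast<AllocationRingBuffer::Entry *>(EntryBase) + I;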
}; + static_assert(sizeof(AllocationRingBuffer) % + alignof(typename AllocationRingBuffer::Entry) == + 0, + "invalid alignment"); + + // Lock to initialize the RingBuffer + HybridMutex RingBufferInitLock; + // Pointer to memory mapped area starting with AllocationRingBuffer struct, // and immediately followed by Size elements of type Entry. - char *RawRingBuffer = {}; + atomic_uptr RingBufferAddress = {}; + + AllocationRingBuffer *getRingBuffer() { + return reinterpret_cast<AllocationRingBuffer *>( + atomic_load(&RingBufferAddress, memory_order_acquire)); + } // The following might get optimized out by the compiler. NOINLINE void performSanityChecks() { @@ -1118,41 +1157,40 @@ private: const uptr SizeOrUnusedBytes = Header->SizeOrUnusedBytes; if (LIKELY(Header->ClassId)) return SizeOrUnusedBytes; - if (allocatorSupportsMemoryTagging<Params>()) + if (allocatorSupportsMemoryTagging<AllocatorConfig>()) Ptr = untagPointer(const_cast<void *>(Ptr)); return SecondaryT::getBlockEnd(getBlockBegin(Ptr, Header)) - reinterpret_cast<uptr>(Ptr) - SizeOrUnusedBytes; } - void quarantineOrDeallocateChunk(Options Options, void *TaggedPtr, + void quarantineOrDeallocateChunk(const Options &Options, void *TaggedPtr, Chunk::UnpackedHeader *Header, uptr Size) NO_THREAD_SAFETY_ANALYSIS { void *Ptr = getHeaderTaggedPointer(TaggedPtr); - Chunk::UnpackedHeader NewHeader = *Header; // If the quarantine is disabled, the actual size of a chunk is 0 or larger // than the maximum allowed, we return a chunk directly to the backend. // This purposefully underflows for Size == 0. const bool BypassQuarantine = !Quarantine.getCacheSize() || ((Size - 1) >= QuarantineMaxChunkSize) || - !NewHeader.ClassId; + !Header->ClassId; if (BypassQuarantine) - NewHeader.State = Chunk::State::Available; + Header->State = Chunk::State::Available; else - NewHeader.State = Chunk::State::Quarantined; - NewHeader.OriginOrWasZeroed = useMemoryTagging<Params>(Options) && - NewHeader.ClassId && - !TSDRegistry.getDisableMemInit(); - Chunk::compareExchangeHeader(Cookie, Ptr, &NewHeader, Header); + Header->State = Chunk::State::Quarantined; + Header->OriginOrWasZeroed = useMemoryTagging<AllocatorConfig>(Options) && + Header->ClassId && + !TSDRegistry.getDisableMemInit(); + Chunk::storeHeader(Cookie, Ptr, Header); - if (UNLIKELY(useMemoryTagging<Params>(Options))) { + if (UNLIKELY(useMemoryTagging<AllocatorConfig>(Options))) { u8 PrevTag = extractTag(reinterpret_cast<uptr>(TaggedPtr)); storeDeallocationStackMaybe(Options, Ptr, PrevTag, Size); - if (NewHeader.ClassId) { + if (Header->ClassId) { if (!TSDRegistry.getDisableMemInit()) { uptr TaggedBegin, TaggedEnd; const uptr OddEvenMask = computeOddEvenMaskForPointerMaybe( - Options, reinterpret_cast<uptr>(getBlockBegin(Ptr, &NewHeader)), - NewHeader.ClassId); + Options, reinterpret_cast<uptr>(getBlockBegin(Ptr, Header)), + Header->ClassId); // Exclude the previous tag so that immediate use after free is // detected 100% of the time. 
setRandomTag(Ptr, Size, OddEvenMask | (1UL << PrevTag), &TaggedBegin, @@ -1161,29 +1199,32 @@ private: } } if (BypassQuarantine) { - if (allocatorSupportsMemoryTagging<Params>()) + if (allocatorSupportsMemoryTagging<AllocatorConfig>()) Ptr = untagPointer(Ptr); - void *BlockBegin = getBlockBegin(Ptr, &NewHeader); - const uptr ClassId = NewHeader.ClassId; + void *BlockBegin = getBlockBegin(Ptr, Header); + const uptr ClassId = Header->ClassId; if (LIKELY(ClassId)) { - bool UnlockRequired; - auto *TSD = TSDRegistry.getTSDAndLock(&UnlockRequired); - TSD->getCache().deallocate(ClassId, BlockBegin); - if (UnlockRequired) - TSD->unlock(); + bool CacheDrained; + { + typename TSDRegistryT::ScopedTSD TSD(TSDRegistry); + CacheDrained = TSD->getCache().deallocate(ClassId, BlockBegin); + } + // When we have drained some blocks back to the Primary from TSD, that + // implies that we may have the chance to release some pages as well. + // Note that in order not to block other thread's accessing the TSD, + // release the TSD first then try the page release. + if (CacheDrained) + Primary.tryReleaseToOS(ClassId, ReleaseToOS::Normal); } else { - if (UNLIKELY(useMemoryTagging<Params>(Options))) + if (UNLIKELY(useMemoryTagging<AllocatorConfig>(Options))) storeTags(reinterpret_cast<uptr>(BlockBegin), reinterpret_cast<uptr>(Ptr)); Secondary.deallocate(Options, BlockBegin); } } else { - bool UnlockRequired; - auto *TSD = TSDRegistry.getTSDAndLock(&UnlockRequired); + typename TSDRegistryT::ScopedTSD TSD(TSDRegistry); Quarantine.put(&TSD->getQuarantineCache(), QuarantineCallback(*this, TSD->getCache()), Ptr, Size); - if (UnlockRequired) - TSD->unlock(); } } @@ -1256,20 +1297,24 @@ private: storeEndMarker(RoundNewPtr, NewSize, BlockEnd); } - void storePrimaryAllocationStackMaybe(Options Options, void *Ptr) { + void storePrimaryAllocationStackMaybe(const Options &Options, void *Ptr) { if (!UNLIKELY(Options.get(OptionBit::TrackAllocationStacks))) return; + AllocationRingBuffer *RB = getRingBuffer(); + if (!RB) + return; auto *Ptr32 = reinterpret_cast<u32 *>(Ptr); - Ptr32[MemTagAllocationTraceIndex] = collectStackTrace(); + Ptr32[MemTagAllocationTraceIndex] = collectStackTrace(RB->Depot); Ptr32[MemTagAllocationTidIndex] = getThreadID(); } - void storeRingBufferEntry(void *Ptr, u32 AllocationTrace, u32 AllocationTid, + void storeRingBufferEntry(AllocationRingBuffer *RB, void *Ptr, + u32 AllocationTrace, u32 AllocationTid, uptr AllocationSize, u32 DeallocationTrace, u32 DeallocationTid) { - uptr Pos = atomic_fetch_add(&getRingBuffer()->Pos, 1, memory_order_relaxed); + uptr Pos = atomic_fetch_add(&RB->Pos, 1, memory_order_relaxed); typename AllocationRingBuffer::Entry *Entry = - getRingBufferEntry(RawRingBuffer, Pos % getRingBuffer()->Size); + getRingBufferEntry(RB, Pos % RB->RingBufferElements); // First invalidate our entry so that we don't attempt to interpret a // partially written state in getSecondaryErrorInfo(). 
The fences below @@ -1288,34 +1333,38 @@ private: atomic_store_relaxed(&Entry->Ptr, reinterpret_cast<uptr>(Ptr)); } - void storeSecondaryAllocationStackMaybe(Options Options, void *Ptr, + void storeSecondaryAllocationStackMaybe(const Options &Options, void *Ptr, uptr Size) { if (!UNLIKELY(Options.get(OptionBit::TrackAllocationStacks))) return; - - u32 Trace = collectStackTrace(); + AllocationRingBuffer *RB = getRingBuffer(); + if (!RB) + return; + u32 Trace = collectStackTrace(RB->Depot); u32 Tid = getThreadID(); auto *Ptr32 = reinterpret_cast<u32 *>(Ptr); Ptr32[MemTagAllocationTraceIndex] = Trace; Ptr32[MemTagAllocationTidIndex] = Tid; - storeRingBufferEntry(untagPointer(Ptr), Trace, Tid, Size, 0, 0); + storeRingBufferEntry(RB, untagPointer(Ptr), Trace, Tid, Size, 0, 0); } - void storeDeallocationStackMaybe(Options Options, void *Ptr, u8 PrevTag, - uptr Size) { + void storeDeallocationStackMaybe(const Options &Options, void *Ptr, + u8 PrevTag, uptr Size) { if (!UNLIKELY(Options.get(OptionBit::TrackAllocationStacks))) return; - + AllocationRingBuffer *RB = getRingBuffer(); + if (!RB) + return; auto *Ptr32 = reinterpret_cast<u32 *>(Ptr); u32 AllocationTrace = Ptr32[MemTagAllocationTraceIndex]; u32 AllocationTid = Ptr32[MemTagAllocationTidIndex]; - u32 DeallocationTrace = collectStackTrace(); + u32 DeallocationTrace = collectStackTrace(RB->Depot); u32 DeallocationTid = getThreadID(); - storeRingBufferEntry(addFixedTag(untagPointer(Ptr), PrevTag), + storeRingBufferEntry(RB, addFixedTag(untagPointer(Ptr), PrevTag), AllocationTrace, AllocationTid, Size, DeallocationTrace, DeallocationTid); } @@ -1391,8 +1440,10 @@ private: UntaggedFaultAddr < ChunkAddr ? BUFFER_UNDERFLOW : BUFFER_OVERFLOW; R->allocation_address = ChunkAddr; R->allocation_size = Header.SizeOrUnusedBytes; - collectTraceMaybe(Depot, R->allocation_trace, - Data[MemTagAllocationTraceIndex]); + if (Depot) { + collectTraceMaybe(Depot, R->allocation_trace, + Data[MemTagAllocationTraceIndex]); + } R->allocation_tid = Data[MemTagAllocationTidIndex]; return NextErrorReport == NumErrorReports; }; @@ -1410,17 +1461,19 @@ private: size_t &NextErrorReport, uintptr_t FaultAddr, const StackDepot *Depot, - const char *RingBufferPtr) { + const char *RingBufferPtr, + size_t RingBufferSize) { auto *RingBuffer = reinterpret_cast<const AllocationRingBuffer *>(RingBufferPtr); - if (!RingBuffer || RingBuffer->Size == 0) + size_t RingBufferElements = ringBufferElementsFromBytes(RingBufferSize); + if (!RingBuffer || RingBufferElements == 0 || !Depot) return; uptr Pos = atomic_load_relaxed(&RingBuffer->Pos); - for (uptr I = Pos - 1; - I != Pos - 1 - RingBuffer->Size && NextErrorReport != NumErrorReports; + for (uptr I = Pos - 1; I != Pos - 1 - RingBufferElements && + NextErrorReport != NumErrorReports; --I) { - auto *Entry = getRingBufferEntry(RingBufferPtr, I % RingBuffer->Size); + auto *Entry = getRingBufferEntry(RingBuffer, I % RingBufferElements); uptr EntryPtr = atomic_load_relaxed(&Entry->Ptr); if (!EntryPtr) continue; @@ -1483,47 +1536,110 @@ private: Primary.getStats(Str); Secondary.getStats(Str); Quarantine.getStats(Str); + TSDRegistry.getStats(Str); return Str->length(); } static typename AllocationRingBuffer::Entry * - getRingBufferEntry(char *RawRingBuffer, uptr N) { + getRingBufferEntry(AllocationRingBuffer *RB, uptr N) { + char *RBEntryStart = + &reinterpret_cast<char *>(RB)[sizeof(AllocationRingBuffer)]; return &reinterpret_cast<typename AllocationRingBuffer::Entry *>( - &RawRingBuffer[sizeof(AllocationRingBuffer)])[N]; + RBEntryStart)[N]; 
} static const typename AllocationRingBuffer::Entry * - getRingBufferEntry(const char *RawRingBuffer, uptr N) { + getRingBufferEntry(const AllocationRingBuffer *RB, uptr N) { + const char *RBEntryStart = + &reinterpret_cast<const char *>(RB)[sizeof(AllocationRingBuffer)]; return &reinterpret_cast<const typename AllocationRingBuffer::Entry *>( - &RawRingBuffer[sizeof(AllocationRingBuffer)])[N]; + RBEntryStart)[N]; + } + + void initRingBufferMaybe() { + ScopedLock L(RingBufferInitLock); + if (getRingBuffer() != nullptr) + return; + + int ring_buffer_size = getFlags()->allocation_ring_buffer_size; + if (ring_buffer_size <= 0) + return; + + u32 AllocationRingBufferSize = static_cast<u32>(ring_buffer_size); + + // We store alloc and free stacks for each entry. + constexpr u32 kStacksPerRingBufferEntry = 2; + constexpr u32 kMaxU32Pow2 = ~(UINT32_MAX >> 1); + static_assert(isPowerOfTwo(kMaxU32Pow2)); + // On Android we always have 3 frames at the bottom: __start_main, + // __libc_init, main, and 3 at the top: malloc, scudo_malloc and + // Allocator::allocate. This leaves 10 frames for the user app. The next + // smallest power of two (8) would only leave 2, which is clearly too + // little. + constexpr u32 kFramesPerStack = 16; + static_assert(isPowerOfTwo(kFramesPerStack)); + + if (AllocationRingBufferSize > kMaxU32Pow2 / kStacksPerRingBufferEntry) + return; + u32 TabSize = static_cast<u32>(roundUpPowerOfTwo(kStacksPerRingBufferEntry * + AllocationRingBufferSize)); + if (TabSize > UINT32_MAX / kFramesPerStack) + return; + u32 RingSize = static_cast<u32>(TabSize * kFramesPerStack); + + uptr StackDepotSize = sizeof(StackDepot) + sizeof(atomic_u64) * RingSize + + sizeof(atomic_u32) * TabSize; + MemMapT DepotMap; + DepotMap.map( + /*Addr=*/0U, roundUp(StackDepotSize, getPageSizeCached()), + "scudo:stack_depot"); + auto *Depot = reinterpret_cast<StackDepot *>(DepotMap.getBase()); + Depot->init(RingSize, TabSize); + + MemMapT MemMap; + MemMap.map( + /*Addr=*/0U, + roundUp(ringBufferSizeInBytes(AllocationRingBufferSize), + getPageSizeCached()), + "scudo:ring_buffer"); + auto *RB = reinterpret_cast<AllocationRingBuffer *>(MemMap.getBase()); + RB->RawRingBufferMap = MemMap; + RB->RingBufferElements = AllocationRingBufferSize; + RB->Depot = Depot; + RB->StackDepotSize = StackDepotSize; + RB->RawStackDepotMap = DepotMap; + + atomic_store(&RingBufferAddress, reinterpret_cast<uptr>(RB), + memory_order_release); } - void initRingBuffer() { - u32 AllocationRingBufferSize = - static_cast<u32>(getFlags()->allocation_ring_buffer_size); - if (AllocationRingBufferSize < 1) + void unmapRingBuffer() { + AllocationRingBuffer *RB = getRingBuffer(); + if (RB == nullptr) return; - MapPlatformData Data = {}; - RawRingBuffer = static_cast<char *>( - map(/*Addr=*/nullptr, - roundUp(ringBufferSizeInBytes(AllocationRingBufferSize), - getPageSizeCached()), - "AllocatorRingBuffer", /*Flags=*/0, &Data)); - auto *RingBuffer = reinterpret_cast<AllocationRingBuffer *>(RawRingBuffer); - RingBuffer->Size = AllocationRingBufferSize; - static_assert(sizeof(AllocationRingBuffer) % - alignof(typename AllocationRingBuffer::Entry) == - 0, - "invalid alignment"); - } - - static constexpr size_t ringBufferSizeInBytes(u32 AllocationRingBufferSize) { + // N.B. because RawStackDepotMap is part of RawRingBufferMap, the order + // is very important. + RB->RawStackDepotMap.unmap(RB->RawStackDepotMap.getBase(), + RB->RawStackDepotMap.getCapacity()); + // Note that the `RB->RawRingBufferMap` is stored on the pages managed by + // itself. 
Take over the ownership before calling unmap() so that any + // operation along with unmap() won't touch inaccessible pages. + MemMapT RawRingBufferMap = RB->RawRingBufferMap; + RawRingBufferMap.unmap(RawRingBufferMap.getBase(), + RawRingBufferMap.getCapacity()); + atomic_store(&RingBufferAddress, 0, memory_order_release); + } + + static constexpr size_t ringBufferSizeInBytes(u32 RingBufferElements) { return sizeof(AllocationRingBuffer) + - AllocationRingBufferSize * - sizeof(typename AllocationRingBuffer::Entry); + RingBufferElements * sizeof(typename AllocationRingBuffer::Entry); } - inline AllocationRingBuffer *getRingBuffer() { - return reinterpret_cast<AllocationRingBuffer *>(RawRingBuffer); + static constexpr size_t ringBufferElementsFromBytes(size_t Bytes) { + if (Bytes < sizeof(AllocationRingBuffer)) { + return 0; + } + return (Bytes - sizeof(AllocationRingBuffer)) / + sizeof(typename AllocationRingBuffer::Entry); } }; diff --git a/standalone/common.cpp b/standalone/common.cpp index 9f14faeef28..06e930638f6 100644 --- a/standalone/common.cpp +++ b/standalone/common.cpp @@ -21,22 +21,4 @@ uptr getPageSizeSlow() { return PageSizeCached; } -// Fatal internal map() or unmap() error (potentially OOM related). -void NORETURN dieOnMapUnmapError(uptr SizeIfOOM) { - char Error[128] = "Scudo ERROR: internal map or unmap failure\n"; - if (SizeIfOOM) { - formatString( - Error, sizeof(Error), - "Scudo ERROR: internal map failure (NO MEMORY) requesting %zuKB\n", - SizeIfOOM >> 10); - } - outputRaw(Error); - setAbortMessage(Error); - die(); -} - -#if !SCUDO_LINUX -uptr GetRSS() { return 0; } -#endif - } // namespace scudo diff --git a/standalone/common.h b/standalone/common.h index aa15e9e787e..151fbd317e7 100644 --- a/standalone/common.h +++ b/standalone/common.h @@ -17,6 +17,7 @@ #include <stddef.h> #include <string.h> +#include <unistd.h> namespace scudo { @@ -27,7 +28,11 @@ template <class Dest, class Source> inline Dest bit_cast(const Source &S) { return D; } -inline constexpr bool isPowerOfTwo(uptr X) { return (X & (X - 1)) == 0; } +inline constexpr bool isPowerOfTwo(uptr X) { + if (X == 0) + return false; + return (X & (X - 1)) == 0; +} inline constexpr uptr roundUp(uptr X, uptr Boundary) { DCHECK(isPowerOfTwo(Boundary)); @@ -111,19 +116,19 @@ template <typename T> inline void shuffle(T *A, u32 N, u32 *RandState) { *RandState = State; } -// Hardware specific inlinable functions. +inline void computePercentage(uptr Numerator, uptr Denominator, uptr *Integral, + uptr *Fractional) { + constexpr uptr Digits = 100; + if (Denominator == 0) { + *Integral = 100; + *Fractional = 0; + return; + } -inline void yieldProcessor(UNUSED u8 Count) { -#if defined(__i386__) || defined(__x86_64__) - __asm__ __volatile__("" ::: "memory"); - for (u8 I = 0; I < Count; I++) - __asm__ __volatile__("pause"); -#elif defined(__aarch64__) || defined(__arm__) - __asm__ __volatile__("" ::: "memory"); - for (u8 I = 0; I < Count; I++) - __asm__ __volatile__("yield"); -#endif - __asm__ __volatile__("" ::: "memory"); + *Integral = Numerator * Digits / Denominator; + *Fractional = + (((Numerator * Digits) % Denominator) * Digits + Denominator / 2) / + Denominator; } // Platform specific functions. @@ -131,9 +136,10 @@ inline void yieldProcessor(UNUSED u8 Count) { extern uptr PageSizeCached; uptr getPageSizeSlow(); inline uptr getPageSizeCached() { - // Bionic uses a hardcoded value. - if (SCUDO_ANDROID) - return 4096U; +#if SCUDO_ANDROID && defined(PAGE_SIZE) + // Most Android builds have a build-time constant page size. 
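The computePercentage() helper added to common.h above splits a ratio into an integral part and a two-digit fractional part without floating point, rounding the fraction to the nearest unit; callers typically print the two values as a percentage. A small worked example of the intended use:

scudo::uptr Integral, Fractional;
// 3 pages in use out of 8: 3 / 8 = 0.375, reported as 37.50%.
scudo::computePercentage(/*Numerator=*/3, /*Denominator=*/8, &Integral,
                         &Fractional);
// Integral == 37, Fractional == 50.

// A zero denominator is reported as 100.0% instead of dividing by zero.
scudo::computePercentage(1, 0, &Integral, &Fractional);
// Integral == 100, Fractional == 0.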
+ return PAGE_SIZE; +#endif if (LIKELY(PageSizeCached)) return PageSizeCached; return getPageSizeSlow(); @@ -144,9 +150,10 @@ u32 getNumberOfCPUs(); const char *getEnv(const char *Name); -uptr GetRSS(); - u64 getMonotonicTime(); +// Gets the time faster but with less accuracy. Can call getMonotonicTime +// if no fast version is available. +u64 getMonotonicTimeFast(); u32 getThreadID(); @@ -187,10 +194,6 @@ void setMemoryPermission(uptr Addr, uptr Size, uptr Flags, void releasePagesToOS(uptr BaseAddress, uptr Offset, uptr Size, MapPlatformData *Data = nullptr); -// Internal map & unmap fatal error. This must not call map(). SizeIfOOM shall -// hold the requested size on an out-of-memory error, 0 otherwise. -void NORETURN dieOnMapUnmapError(uptr SizeIfOOM = 0); - // Logging related functions. void setAbortMessage(const char *Message); @@ -212,6 +215,13 @@ enum class Option : u8 { MaxTSDsCount, // Number of usable TSDs for the shared registry. }; +enum class ReleaseToOS : u8 { + Normal, // Follow the normal rules for releasing pages to the OS + Force, // Force release pages to the OS, but avoid cases that take too long. + ForceAll, // Force release every page possible regardless of how long it will + // take. +}; + constexpr unsigned char PatternFillByte = 0xAB; enum FillContentsMode { diff --git a/standalone/condition_variable.h b/standalone/condition_variable.h new file mode 100644 index 00000000000..3f16c86651e --- /dev/null +++ b/standalone/condition_variable.h @@ -0,0 +1,44 @@ +//===-- condition_variable.h ------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SCUDO_CONDITION_VARIABLE_H_ +#define SCUDO_CONDITION_VARIABLE_H_ + +#include "condition_variable_base.h" + +#include "common.h" +#include "platform.h" + +#include "condition_variable_linux.h" + +namespace scudo { + +// A default implementation of default condition variable. It doesn't do a real +// `wait`, instead it spins a short amount of time only. +class ConditionVariableDummy + : public ConditionVariableBase<ConditionVariableDummy> { +public: + void notifyAllImpl(UNUSED HybridMutex &M) REQUIRES(M) {} + + void waitImpl(UNUSED HybridMutex &M) REQUIRES(M) { + M.unlock(); + + constexpr u32 SpinTimes = 64; + volatile u32 V = 0; + for (u32 I = 0; I < SpinTimes; ++I) { + u32 Tmp = V + 1; + V = Tmp; + } + + M.lock(); + } +}; + +} // namespace scudo + +#endif // SCUDO_CONDITION_VARIABLE_H_ diff --git a/standalone/condition_variable_base.h b/standalone/condition_variable_base.h new file mode 100644 index 00000000000..416c327fed4 --- /dev/null +++ b/standalone/condition_variable_base.h @@ -0,0 +1,56 @@ +//===-- condition_variable_base.h -------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SCUDO_CONDITION_VARIABLE_BASE_H_ +#define SCUDO_CONDITION_VARIABLE_BASE_H_ + +#include "mutex.h" +#include "thread_annotations.h" + +namespace scudo { + +template <typename Derived> class ConditionVariableBase { +public: + constexpr ConditionVariableBase() = default; + + void bindTestOnly(HybridMutex &Mutex) { +#if SCUDO_DEBUG + boundMutex = &Mutex; +#else + (void)Mutex; +#endif + } + + void notifyAll(HybridMutex &M) REQUIRES(M) { +#if SCUDO_DEBUG + CHECK_EQ(&M, boundMutex); +#endif + getDerived()->notifyAllImpl(M); + } + + void wait(HybridMutex &M) REQUIRES(M) { +#if SCUDO_DEBUG + CHECK_EQ(&M, boundMutex); +#endif + getDerived()->waitImpl(M); + } + +protected: + Derived *getDerived() { return static_cast<Derived *>(this); } + +#if SCUDO_DEBUG + // Because thread-safety analysis doesn't support pointer aliasing, we are not + // able to mark the proper annotations without false positive. Instead, we + // pass the lock and do the same-lock check separately. + HybridMutex *boundMutex = nullptr; +#endif +}; + +} // namespace scudo + +#endif // SCUDO_CONDITION_VARIABLE_BASE_H_ diff --git a/standalone/condition_variable_linux.cpp b/standalone/condition_variable_linux.cpp new file mode 100644 index 00000000000..e6d9bd1771a --- /dev/null +++ b/standalone/condition_variable_linux.cpp @@ -0,0 +1,52 @@ +//===-- condition_variable_linux.cpp ----------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "platform.h" + +#if SCUDO_LINUX + +#include "condition_variable_linux.h" + +#include "atomic_helpers.h" + +#include <limits.h> +#include <linux/futex.h> +#include <sys/syscall.h> +#include <unistd.h> + +namespace scudo { + +void ConditionVariableLinux::notifyAllImpl(UNUSED HybridMutex &M) { + const u32 V = atomic_load_relaxed(&Counter); + atomic_store_relaxed(&Counter, V + 1); + + // TODO(chiahungduan): Move the waiters from the futex waiting queue + // `Counter` to futex waiting queue `M` so that the awoken threads won't be + // blocked again due to locked `M` by current thread. + if (LastNotifyAll != V) { + syscall(SYS_futex, reinterpret_cast<uptr>(&Counter), FUTEX_WAKE_PRIVATE, + INT_MAX, nullptr, nullptr, 0); + } + + LastNotifyAll = V + 1; +} + +void ConditionVariableLinux::waitImpl(HybridMutex &M) { + const u32 V = atomic_load_relaxed(&Counter) + 1; + atomic_store_relaxed(&Counter, V); + + // TODO: Use ScopedUnlock when it's supported. + M.unlock(); + syscall(SYS_futex, reinterpret_cast<uptr>(&Counter), FUTEX_WAIT_PRIVATE, V, + nullptr, nullptr, 0); + M.lock(); +} + +} // namespace scudo + +#endif // SCUDO_LINUX diff --git a/standalone/condition_variable_linux.h b/standalone/condition_variable_linux.h new file mode 100644 index 00000000000..cd073287326 --- /dev/null +++ b/standalone/condition_variable_linux.h @@ -0,0 +1,38 @@ +//===-- condition_variable_linux.h ------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SCUDO_CONDITION_VARIABLE_LINUX_H_ +#define SCUDO_CONDITION_VARIABLE_LINUX_H_ + +#include "platform.h" + +#if SCUDO_LINUX + +#include "atomic_helpers.h" +#include "condition_variable_base.h" +#include "thread_annotations.h" + +namespace scudo { + +class ConditionVariableLinux + : public ConditionVariableBase<ConditionVariableLinux> { +public: + void notifyAllImpl(HybridMutex &M) REQUIRES(M); + + void waitImpl(HybridMutex &M) REQUIRES(M); + +private: + u32 LastNotifyAll = 0; + atomic_u32 Counter = {}; +}; + +} // namespace scudo + +#endif // SCUDO_LINUX + +#endif // SCUDO_CONDITION_VARIABLE_LINUX_H_ diff --git a/standalone/crc32_hw.cpp b/standalone/crc32_hw.cpp index 73f2ae000c6..910cf946031 100644 --- a/standalone/crc32_hw.cpp +++ b/standalone/crc32_hw.cpp @@ -17,4 +17,13 @@ u32 computeHardwareCRC32(u32 Crc, uptr Data) { #endif // defined(__CRC32__) || defined(__SSE4_2__) || // defined(__ARM_FEATURE_CRC32) +#if defined(__loongarch__) +u32 computeHardwareCRC32(u32 Crc, uptr Data) { + // The LoongArch CRC intrinsics have the two input arguments swapped, and + // expect them to be signed. + return static_cast<u32>( + CRC32_INTRINSIC(static_cast<long>(Data), static_cast<int>(Crc))); +} +#endif // defined(__loongarch__) + } // namespace scudo diff --git a/standalone/flags.cpp b/standalone/flags.cpp index de5153b288b..f498edfbd32 100644 --- a/standalone/flags.cpp +++ b/standalone/flags.cpp @@ -68,6 +68,9 @@ void initFlags() { Parser.parseString(getCompileDefinitionScudoDefaultOptions()); Parser.parseString(getScudoDefaultOptions()); Parser.parseString(getEnv("SCUDO_OPTIONS")); + if (const char *V = getEnv("SCUDO_ALLOCATION_RING_BUFFER_SIZE")) { + Parser.parseStringPair("allocation_ring_buffer_size", V); + } } } // namespace scudo diff --git a/standalone/flags.inc b/standalone/flags.inc index c1f153bafdd..ff0c28e1db7 100644 --- a/standalone/flags.inc +++ b/standalone/flags.inc @@ -42,18 +42,10 @@ SCUDO_FLAG(bool, may_return_null, true, "returning NULL in otherwise non-fatal error scenarios, eg: OOM, " "invalid allocation alignments, etc.") -SCUDO_FLAG(int, release_to_os_interval_ms, SCUDO_ANDROID ? INT32_MIN : 5000, +SCUDO_FLAG(int, release_to_os_interval_ms, 5000, "Interval (in milliseconds) at which to attempt release of unused " "memory to the OS. Negative values disable the feature.") -SCUDO_FLAG(int, hard_rss_limit_mb, 0, - "Hard RSS Limit in Mb. If non-zero, once the limit is achieved, " - "abort the process") - -SCUDO_FLAG(int, soft_rss_limit_mb, 0, - "Soft RSS Limit in Mb. If non-zero, once the limit is reached, all " - "subsequent calls will fail or return NULL until the RSS goes below " - "the soft limit") - SCUDO_FLAG(int, allocation_ring_buffer_size, 32768, - "Entries to keep in the allocation ring buffer for scudo.") + "Entries to keep in the allocation ring buffer for scudo. 
" + "Values less or equal to zero disable the buffer.") diff --git a/standalone/flags_parser.cpp b/standalone/flags_parser.cpp index be39fcd4f88..3d8c6f3789b 100644 --- a/standalone/flags_parser.cpp +++ b/standalone/flags_parser.cpp @@ -10,6 +10,8 @@ #include "common.h" #include "report.h" +#include <errno.h> +#include <limits.h> #include <stdlib.h> #include <string.h> @@ -80,7 +82,7 @@ void FlagParser::parseFlag() { ++Pos; Value = Buffer + ValueStart; } - if (!runHandler(Name, Value)) + if (!runHandler(Name, Value, '=')) reportError("flag parsing failed."); } @@ -122,10 +124,16 @@ inline bool parseBool(const char *Value, bool *b) { return false; } -bool FlagParser::runHandler(const char *Name, const char *Value) { +void FlagParser::parseStringPair(const char *Name, const char *Value) { + if (!runHandler(Name, Value, '\0')) + reportError("flag parsing failed."); +} + +bool FlagParser::runHandler(const char *Name, const char *Value, + const char Sep) { for (u32 I = 0; I < NumberOfFlags; ++I) { const uptr Len = strlen(Flags[I].Name); - if (strncmp(Name, Flags[I].Name, Len) != 0 || Name[Len] != '=') + if (strncmp(Name, Flags[I].Name, Len) != 0 || Name[Len] != Sep) continue; bool Ok = false; switch (Flags[I].Type) { @@ -136,12 +144,18 @@ bool FlagParser::runHandler(const char *Name, const char *Value) { break; case FlagType::FT_int: char *ValueEnd; - *reinterpret_cast<int *>(Flags[I].Var) = - static_cast<int>(strtol(Value, &ValueEnd, 10)); - Ok = - *ValueEnd == '"' || *ValueEnd == '\'' || isSeparatorOrNull(*ValueEnd); - if (!Ok) + errno = 0; + long V = strtol(Value, &ValueEnd, 10); + if (errno != 0 || // strtol failed (over or underflow) + V > INT_MAX || V < INT_MIN || // overflows integer + // contains unexpected characters + (*ValueEnd != '"' && *ValueEnd != '\'' && + !isSeparatorOrNull(*ValueEnd))) { reportInvalidFlag("int", Value); + break; + } + *reinterpret_cast<int *>(Flags[I].Var) = static_cast<int>(V); + Ok = true; break; } return Ok; diff --git a/standalone/flags_parser.h b/standalone/flags_parser.h index ba832adbd90..ded496fda3b 100644 --- a/standalone/flags_parser.h +++ b/standalone/flags_parser.h @@ -27,6 +27,7 @@ public: void *Var); void parseString(const char *S); void printFlagDescriptions(); + void parseStringPair(const char *Name, const char *Value); private: static const u32 MaxFlags = 20; @@ -45,7 +46,7 @@ private: void skipWhitespace(); void parseFlags(); void parseFlag(); - bool runHandler(const char *Name, const char *Value); + bool runHandler(const char *Name, const char *Value, char Sep); }; void reportUnrecognizedFlags(); diff --git a/standalone/fuchsia.cpp b/standalone/fuchsia.cpp index da684e7f1de..2144f1b63f8 100644 --- a/standalone/fuchsia.cpp +++ b/standalone/fuchsia.cpp @@ -19,6 +19,7 @@ #include <zircon/compiler.h> #include <zircon/process.h> #include <zircon/sanitizer.h> +#include <zircon/status.h> #include <zircon/syscalls.h> namespace scudo { @@ -31,6 +32,15 @@ void NORETURN die() { __builtin_trap(); } // with ZX_HANDLE_INVALID. static_assert(ZX_HANDLE_INVALID == 0, ""); +static void NORETURN dieOnError(zx_status_t Status, const char *FnName, + uptr Size) { + ScopedString Error; + Error.append("SCUDO ERROR: %s failed with size %zuKB (%s)", FnName, + Size >> 10, zx_status_get_string(Status)); + outputRaw(Error.data()); + die(); +} + static void *allocateVmar(uptr Size, MapPlatformData *Data, bool AllowNoMem) { // Only scenario so far. 
DCHECK(Data); @@ -42,7 +52,7 @@ static void *allocateVmar(uptr Size, MapPlatformData *Data, bool AllowNoMem) { Size, &Data->Vmar, &Data->VmarBase); if (UNLIKELY(Status != ZX_OK)) { if (Status != ZX_ERR_NO_MEMORY || !AllowNoMem) - dieOnMapUnmapError(Status == ZX_ERR_NO_MEMORY ? Size : 0); + dieOnError(Status, "zx_vmar_allocate", Size); return nullptr; } return reinterpret_cast<void *>(Data->VmarBase); @@ -73,7 +83,7 @@ void *map(void *Addr, uptr Size, const char *Name, uptr Flags, Status = _zx_vmo_set_size(Vmo, VmoSize + Size); if (Status != ZX_OK) { if (Status != ZX_ERR_NO_MEMORY || !AllowNoMem) - dieOnMapUnmapError(Status == ZX_ERR_NO_MEMORY ? Size : 0); + dieOnError(Status, "zx_vmo_set_size", VmoSize + Size); return nullptr; } } else { @@ -81,7 +91,7 @@ void *map(void *Addr, uptr Size, const char *Name, uptr Flags, Status = _zx_vmo_create(Size, ZX_VMO_RESIZABLE, &Vmo); if (UNLIKELY(Status != ZX_OK)) { if (Status != ZX_ERR_NO_MEMORY || !AllowNoMem) - dieOnMapUnmapError(Status == ZX_ERR_NO_MEMORY ? Size : 0); + dieOnError(Status, "zx_vmo_create", Size); return nullptr; } _zx_object_set_property(Vmo, ZX_PROP_NAME, Name, strlen(Name)); @@ -99,7 +109,7 @@ void *map(void *Addr, uptr Size, const char *Name, uptr Flags, Status = _zx_vmar_map(Vmar, MapFlags, Offset, Vmo, VmoSize, Size, &P); if (UNLIKELY(Status != ZX_OK)) { if (Status != ZX_ERR_NO_MEMORY || !AllowNoMem) - dieOnMapUnmapError(Status == ZX_ERR_NO_MEMORY ? Size : 0); + dieOnError(Status, "zx_vmar_map", Size); return nullptr; } @@ -120,7 +130,7 @@ void *map(void *Addr, uptr Size, const char *Name, uptr Flags, } if (UNLIKELY(Status != ZX_OK)) { if (Status != ZX_ERR_NO_MEMORY || !AllowNoMem) - dieOnMapUnmapError(Status == ZX_ERR_NO_MEMORY ? Size : 0); + dieOnError(Status, "zx_vmar_op_range", Size); return nullptr; } @@ -145,7 +155,7 @@ void unmap(void *Addr, uptr Size, uptr Flags, MapPlatformData *Data) { const zx_status_t Status = _zx_vmar_unmap(Vmar, reinterpret_cast<uintptr_t>(Addr), Size); if (UNLIKELY(Status != ZX_OK)) - dieOnMapUnmapError(); + dieOnError(Status, "zx_vmar_unmap", Size); } if (Data) { if (Data->Vmo != ZX_HANDLE_INVALID) @@ -160,12 +170,15 @@ void setMemoryPermission(UNUSED uptr Addr, UNUSED uptr Size, UNUSED uptr Flags, (Flags & MAP_NOACCESS) ? 0 : (ZX_VM_PERM_READ | ZX_VM_PERM_WRITE); DCHECK(Data); DCHECK_NE(Data->Vmar, ZX_HANDLE_INVALID); - if (_zx_vmar_protect(Data->Vmar, Prot, Addr, Size) != ZX_OK) - dieOnMapUnmapError(); + const zx_status_t Status = _zx_vmar_protect(Data->Vmar, Prot, Addr, Size); + if (Status != ZX_OK) + dieOnError(Status, "zx_vmar_protect", Size); } void releasePagesToOS(UNUSED uptr BaseAddress, uptr Offset, uptr Size, MapPlatformData *Data) { + // TODO: DCHECK the BaseAddress is consistent with the data in + // MapPlatformData. 
DCHECK(Data); DCHECK_NE(Data->Vmar, ZX_HANDLE_INVALID); DCHECK_NE(Data->Vmo, ZX_HANDLE_INVALID); @@ -198,6 +211,7 @@ void HybridMutex::unlock() __TA_NO_THREAD_SAFETY_ANALYSIS { void HybridMutex::assertHeldImpl() __TA_NO_THREAD_SAFETY_ANALYSIS {} u64 getMonotonicTime() { return _zx_clock_get_monotonic(); } +u64 getMonotonicTimeFast() { return _zx_clock_get_monotonic(); } u32 getNumberOfCPUs() { return _zx_system_get_num_cpus(); } diff --git a/standalone/fuzz/get_error_info_fuzzer.cpp b/standalone/fuzz/get_error_info_fuzzer.cpp index 74456450a47..2cef1c44fad 100644 --- a/standalone/fuzz/get_error_info_fuzzer.cpp +++ b/standalone/fuzz/get_error_info_fuzzer.cpp @@ -9,6 +9,7 @@ #define SCUDO_FUZZ #include "allocator_config.h" #include "combined.h" +#include "common.h" #include <fuzzer/FuzzedDataProvider.h> @@ -31,11 +32,6 @@ extern "C" int LLVMFuzzerTestOneInput(uint8_t *Data, size_t Size) { std::string StackDepotBytes = FDP.ConsumeRandomLengthString(FDP.remaining_bytes()); - std::vector<char> StackDepot(sizeof(scudo::StackDepot), 0); - for (size_t i = 0; i < StackDepotBytes.length() && i < StackDepot.size(); - ++i) { - StackDepot[i] = StackDepotBytes[i]; - } std::string RegionInfoBytes = FDP.ConsumeRandomLengthString(FDP.remaining_bytes()); @@ -46,14 +42,11 @@ extern "C" int LLVMFuzzerTestOneInput(uint8_t *Data, size_t Size) { } std::string RingBufferBytes = FDP.ConsumeRemainingBytesAsString(); - // RingBuffer is too short. - if (!AllocatorT::setRingBufferSizeForBuffer(RingBufferBytes.data(), - RingBufferBytes.size())) - return 0; scudo_error_info ErrorInfo; - AllocatorT::getErrorInfo(&ErrorInfo, FaultAddr, StackDepot.data(), - RegionInfo.data(), RingBufferBytes.data(), Memory, - MemoryTags, MemoryAddr, MemorySize); + AllocatorT::getErrorInfo(&ErrorInfo, FaultAddr, StackDepotBytes.data(), + StackDepotBytes.size(), RegionInfo.data(), + RingBufferBytes.data(), RingBufferBytes.size(), + Memory, MemoryTags, MemoryAddr, MemorySize); return 0; } diff --git a/standalone/include/scudo/interface.h b/standalone/include/scudo/interface.h index 23bcfba3982..a2dedea910c 100644 --- a/standalone/include/scudo/interface.h +++ b/standalone/include/scudo/interface.h @@ -17,10 +17,22 @@ extern "C" { __attribute__((weak)) const char *__scudo_default_options(void); // Post-allocation & pre-deallocation hooks. -// They must be thread-safe and not use heap related functions. __attribute__((weak)) void __scudo_allocate_hook(void *ptr, size_t size); __attribute__((weak)) void __scudo_deallocate_hook(void *ptr); +// `realloc` involves both deallocation and allocation but they are not reported +// atomically. In one specific case which may keep taking a snapshot right in +// the middle of `realloc` reporting the deallocation and allocation, it may +// confuse the user by missing memory from `realloc`. To alleviate that case, +// define the two `realloc` hooks to get the knowledge of the bundled hook +// calls. These hooks are optional and should only be used when a hooks user +// wants to track reallocs more closely. +// +// See more details in the comment of `realloc` in wrapper_c.inc. +__attribute__((weak)) void +__scudo_realloc_allocate_hook(void *old_ptr, void *new_ptr, size_t size); +__attribute__((weak)) void __scudo_realloc_deallocate_hook(void *old_ptr); + void __scudo_print_stats(void); typedef void (*iterate_callback)(uintptr_t base, size_t size, void *arg); @@ -73,7 +85,8 @@ typedef void (*iterate_callback)(uintptr_t base, size_t size, void *arg); // pointer. 
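// The new stack_depot_size and ring_buffer_size parameters let the caller,
// which has typically copied the depot and ring buffer out of the target
// process, pass the byte counts explicitly; the implementation derives the
// usable element count from them (see ringBufferElementsFromBytes() above)
// rather than relying on a size previously installed into the buffer via
// setRingBufferSizeForBuffer().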
void __scudo_get_error_info(struct scudo_error_info *error_info, uintptr_t fault_addr, const char *stack_depot, - const char *region_info, const char *ring_buffer, + size_t stack_depot_size, const char *region_info, + const char *ring_buffer, size_t ring_buffer_size, const char *memory, const char *memory_tags, uintptr_t memory_addr, size_t memory_size); @@ -118,6 +131,10 @@ size_t __scudo_get_ring_buffer_size(void); #define M_PURGE -101 #endif +#ifndef M_PURGE_ALL +#define M_PURGE_ALL -104 +#endif + // Tune the allocator's choice of memory tags to make it more likely that // a certain class of memory errors will be detected. The value argument should // be one of the M_MEMTAG_TUNING_* constants below. @@ -155,6 +172,11 @@ size_t __scudo_get_ring_buffer_size(void); #define M_MEMTAG_TUNING_UAF 1 #endif +// Print internal stats to the log. +#ifndef M_LOG_STATS +#define M_LOG_STATS -205 +#endif + } // extern "C" #endif // SCUDO_INTERFACE_H_ diff --git a/standalone/linux.cpp b/standalone/linux.cpp index 33757e292f2..27469510810 100644 --- a/standalone/linux.cpp +++ b/standalone/linux.cpp @@ -14,6 +14,7 @@ #include "internal_defs.h" #include "linux.h" #include "mutex.h" +#include "report_linux.h" #include "string_utils.h" #include <errno.h> @@ -43,6 +44,7 @@ uptr getPageSize() { return static_cast<uptr>(sysconf(_SC_PAGESIZE)); } void NORETURN die() { abort(); } +// TODO: Will be deprecated. Use the interfaces in MemMapLinux instead. void *map(void *Addr, uptr Size, UNUSED const char *Name, uptr Flags, UNUSED MapPlatformData *Data) { int MmapFlags = MAP_PRIVATE | MAP_ANONYMOUS; @@ -65,7 +67,7 @@ void *map(void *Addr, uptr Size, UNUSED const char *Name, uptr Flags, void *P = mmap(Addr, Size, MmapProt, MmapFlags, -1, 0); if (P == MAP_FAILED) { if (!(Flags & MAP_ALLOWNOMEM) || errno != ENOMEM) - dieOnMapUnmapError(errno == ENOMEM ? Size : 0); + reportMapError(errno == ENOMEM ? Size : 0); return nullptr; } #if SCUDO_ANDROID @@ -75,19 +77,22 @@ void *map(void *Addr, uptr Size, UNUSED const char *Name, uptr Flags, return P; } +// TODO: Will be deprecated. Use the interfaces in MemMapLinux instead. void unmap(void *Addr, uptr Size, UNUSED uptr Flags, UNUSED MapPlatformData *Data) { if (munmap(Addr, Size) != 0) - dieOnMapUnmapError(); + reportUnmapError(reinterpret_cast<uptr>(Addr), Size); } +// TODO: Will be deprecated. Use the interfaces in MemMapLinux instead. void setMemoryPermission(uptr Addr, uptr Size, uptr Flags, UNUSED MapPlatformData *Data) { int Prot = (Flags & MAP_NOACCESS) ? PROT_NONE : (PROT_READ | PROT_WRITE); if (mprotect(reinterpret_cast<void *>(Addr), Size, Prot) != 0) - dieOnMapUnmapError(); + reportProtectError(Addr, Size, Prot); } +// TODO: Will be deprecated. Use the interfaces in MemMapLinux instead. void releasePagesToOS(uptr BaseAddress, uptr Offset, uptr Size, UNUSED MapPlatformData *Data) { void *Addr = reinterpret_cast<void *>(BaseAddress + Offset); @@ -104,12 +109,14 @@ enum State : u32 { Unlocked = 0, Locked = 1, Sleeping = 2 }; } bool HybridMutex::tryLock() { - return atomic_compare_exchange(&M, Unlocked, Locked) == Unlocked; + return atomic_compare_exchange_strong(&M, Unlocked, Locked, + memory_order_acquire) == Unlocked; } // The following is based on https://akkadia.org/drepper/futex.pdf. 
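// (A compressed sketch of the scheme from that paper, using the
// Unlocked/Locked/Sleeping states above; lockSlow() below is the actual
// implementation, this is only for orientation:
//
//   // Contended lock path: mark the word Sleeping and park on it.
//   while (atomic_exchange(&M, Sleeping, memory_order_acquire) != Unlocked)
//     syscall(SYS_futex, reinterpret_cast<uptr>(&M), FUTEX_WAIT_PRIVATE,
//             Sleeping, nullptr, nullptr, 0);
//
//   // Unlock: only issue a wake if someone may be parked.
//   if (atomic_exchange(&M, Unlocked, memory_order_release) == Sleeping)
//     syscall(SYS_futex, reinterpret_cast<uptr>(&M), FUTEX_WAKE_PRIVATE, 1,
//             nullptr, nullptr, 0);
// )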
void HybridMutex::lockSlow() { - u32 V = atomic_compare_exchange(&M, Unlocked, Locked); + u32 V = atomic_compare_exchange_strong(&M, Unlocked, Locked, + memory_order_acquire); if (V == Unlocked) return; if (V != Sleeping) @@ -140,6 +147,17 @@ u64 getMonotonicTime() { static_cast<u64>(TS.tv_nsec); } +u64 getMonotonicTimeFast() { +#if defined(CLOCK_MONOTONIC_COARSE) + timespec TS; + clock_gettime(CLOCK_MONOTONIC_COARSE, &TS); + return static_cast<u64>(TS.tv_sec) * (1000ULL * 1000 * 1000) + + static_cast<u64>(TS.tv_nsec); +#else + return getMonotonicTime(); +#endif +} + u32 getNumberOfCPUs() { cpu_set_t CPUs; // sched_getaffinity can fail for a variety of legitimate reasons (lack of @@ -186,39 +204,6 @@ bool getRandom(void *Buffer, uptr Length, UNUSED bool Blocking) { extern "C" WEAK int async_safe_write_log(int pri, const char *tag, const char *msg); -static uptr GetRSSFromBuffer(const char *Buf) { - // The format of the file is: - // 1084 89 69 11 0 79 0 - // We need the second number which is RSS in pages. - const char *Pos = Buf; - // Skip the first number. - while (*Pos >= '0' && *Pos <= '9') - Pos++; - // Skip whitespaces. - while (!(*Pos >= '0' && *Pos <= '9') && *Pos != 0) - Pos++; - // Read the number. - u64 Rss = 0; - for (; *Pos >= '0' && *Pos <= '9'; Pos++) - Rss = Rss * 10 + static_cast<u64>(*Pos) - '0'; - return static_cast<uptr>(Rss * getPageSizeCached()); -} - -uptr GetRSS() { - // TODO: We currently use sanitizer_common's GetRSS which reads the - // RSS from /proc/self/statm by default. We might want to - // call getrusage directly, even if it's less accurate. - auto Fd = open("/proc/self/statm", O_RDONLY); - char Buf[64]; - s64 Len = read(Fd, Buf, sizeof(Buf) - 1); - close(Fd); - if (Len <= 0) - return 0; - Buf[Len] = 0; - - return GetRSSFromBuffer(Buf); -} - void outputRaw(const char *Buffer) { if (&async_safe_write_log) { constexpr s32 AndroidLogInfo = 4; diff --git a/standalone/local_cache.h b/standalone/local_cache.h index 6e84158659a..46d6affdc03 100644 --- a/standalone/local_cache.h +++ b/standalone/local_cache.h @@ -14,6 +14,7 @@ #include "platform.h" #include "report.h" #include "stats.h" +#include "string_utils.h" namespace scudo { @@ -21,71 +22,13 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache { typedef typename SizeClassAllocator::SizeClassMap SizeClassMap; typedef typename SizeClassAllocator::CompactPtrT CompactPtrT; - struct TransferBatch { - static const u16 MaxNumCached = SizeClassMap::MaxNumCachedHint; - void setFromArray(CompactPtrT *Array, u16 N) { - DCHECK_LE(N, MaxNumCached); - Count = N; - memcpy(Batch, Array, sizeof(Batch[0]) * Count); - } - void appendFromArray(CompactPtrT *Array, u16 N) { - DCHECK_LE(N, MaxNumCached - Count); - memcpy(Batch + Count, Array, sizeof(Batch[0]) * N); - // u16 will be promoted to int by arithmetic type conversion. - Count = static_cast<u16>(Count + N); - } - void clear() { Count = 0; } - void add(CompactPtrT P) { - DCHECK_LT(Count, MaxNumCached); - Batch[Count++] = P; - } - void copyToArray(CompactPtrT *Array) const { - memcpy(Array, Batch, sizeof(Batch[0]) * Count); - } - u16 getCount() const { return Count; } - CompactPtrT get(u16 I) const { - DCHECK_LE(I, Count); - return Batch[I]; - } - static u16 getMaxCached(uptr Size) { - return Min(MaxNumCached, SizeClassMap::getMaxCachedHint(Size)); - } - TransferBatch *Next; - - private: - CompactPtrT Batch[MaxNumCached]; - u16 Count; - }; - - // A BatchGroup is used to collect blocks. 
Each group has a group id to - // identify the group kind of contained blocks. - struct BatchGroup { - // `Next` is used by IntrusiveList. - BatchGroup *Next; - // The identifier of each group - uptr GroupId; - // Cache value of TransferBatch::getMaxCached() - u16 MaxCachedPerBatch; - // Number of blocks pushed into this group. This is an increment-only - // counter. - uptr PushedBlocks; - // This is used to track how many blocks are pushed since last time we - // checked `PushedBlocks`. It's useful for page releasing to determine the - // usage of a BatchGroup. - uptr PushedBlocksAtLastCheckpoint; - // Blocks are managed by TransferBatch in a list. - SinglyLinkedList<TransferBatch> Batches; - }; - - static_assert(sizeof(BatchGroup) <= sizeof(TransferBatch), - "BatchGroup uses the same class size as TransferBatch"); - void init(GlobalStats *S, SizeClassAllocator *A) { DCHECK(isEmpty()); Stats.init(); if (LIKELY(S)) S->link(&Stats); Allocator = A; + initCache(); } void destroy(GlobalStats *S) { @@ -98,7 +41,9 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache { DCHECK_LT(ClassId, NumClasses); PerClass *C = &PerClassArray[ClassId]; if (C->Count == 0) { - if (UNLIKELY(!refill(C, ClassId))) + // Refill half of the number of max cached. + DCHECK_GT(C->MaxCount / 2, 0U); + if (UNLIKELY(!refill(C, ClassId, C->MaxCount / 2))) return nullptr; DCHECK_GT(C->Count, 0); } @@ -112,13 +57,13 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache { return Allocator->decompactPtr(ClassId, CompactP); } - void deallocate(uptr ClassId, void *P) { + bool deallocate(uptr ClassId, void *P) { CHECK_LT(ClassId, NumClasses); PerClass *C = &PerClassArray[ClassId]; - // We still have to initialize the cache in the event that the first heap - // operation in a thread is a deallocation. - initCacheMaybe(C); - if (C->Count == C->MaxCount) + + // If the cache is full, drain half of blocks back to the main allocator. + const bool NeedToDrainCache = C->Count == C->MaxCount; + if (NeedToDrainCache) drain(C, ClassId); // See comment in allocate() about memory accesses. const uptr ClassSize = C->ClassSize; @@ -126,6 +71,8 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache { Allocator->compactPtr(ClassId, reinterpret_cast<uptr>(P)); Stats.sub(StatAllocated, ClassSize); Stats.add(StatFree, ClassSize); + + return NeedToDrainCache; } bool isEmpty() const { @@ -136,7 +83,7 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache { } void drain() { - // Drain BatchClassId last as createBatch can refill it. + // Drain BatchClassId last as it may be needed while draining normal blocks. 
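    // (Rationale: pushing blocks of a normal class back to the primary may
    // require fresh TransferBatch/BatchGroup headers, and those headers are
    // themselves allocated from BatchClassId, possibly through this very
    // cache (see getBatchClassBlock() below), so draining BatchClassId first
    // could leave it populated again.)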
for (uptr I = 0; I < NumClasses; ++I) { if (I == BatchClassId) continue; @@ -148,22 +95,42 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache { DCHECK(isEmpty()); } - TransferBatch *createBatch(uptr ClassId, void *B) { - if (ClassId != BatchClassId) - B = allocate(BatchClassId); + void *getBatchClassBlock() { + void *B = allocate(BatchClassId); if (UNLIKELY(!B)) reportOutOfMemory(SizeClassAllocator::getSizeByClassId(BatchClassId)); - return reinterpret_cast<TransferBatch *>(B); + return B; } - BatchGroup *createGroup() { - void *Ptr = allocate(BatchClassId); - if (UNLIKELY(!Ptr)) - reportOutOfMemory(SizeClassAllocator::getSizeByClassId(BatchClassId)); - return reinterpret_cast<BatchGroup *>(Ptr); + LocalStats &getStats() { return Stats; } + + void getStats(ScopedString *Str) { + bool EmptyCache = true; + for (uptr I = 0; I < NumClasses; ++I) { + if (PerClassArray[I].Count == 0) + continue; + + EmptyCache = false; + // The size of BatchClass is set to 0 intentionally. See the comment in + // initCache() for more details. + const uptr ClassSize = I == BatchClassId + ? SizeClassAllocator::getSizeByClassId(I) + : PerClassArray[I].ClassSize; + // Note that the string utils don't support printing u16 thus we cast it + // to a common use type uptr. + Str->append(" %02zu (%6zu): cached: %4zu max: %4zu\n", I, ClassSize, + static_cast<uptr>(PerClassArray[I].Count), + static_cast<uptr>(PerClassArray[I].MaxCount)); + } + + if (EmptyCache) + Str->append(" No block is cached.\n"); } - LocalStats &getStats() { return Stats; } + static u16 getMaxCached(uptr Size) { + return Min(SizeClassMap::MaxNumCachedHint, + SizeClassMap::getMaxCachedHint(Size)); + } private: static const uptr NumClasses = SizeClassMap::NumClasses; @@ -173,24 +140,17 @@ private: u16 MaxCount; // Note: ClassSize is zero for the transfer batch. 
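    // (ClassSize here only feeds the StatAllocated/StatFree updates in
    // allocate()/deallocate(); the transfer batch class holds allocator
    // internal metadata rather than user allocations, so it is deliberately
    // kept at zero, and getStats() above prints its real size via
    // getSizeByClassId() instead.)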
uptr ClassSize; - CompactPtrT Chunks[2 * TransferBatch::MaxNumCached]; + CompactPtrT Chunks[2 * SizeClassMap::MaxNumCachedHint]; }; PerClass PerClassArray[NumClasses] = {}; LocalStats Stats; SizeClassAllocator *Allocator = nullptr; - ALWAYS_INLINE void initCacheMaybe(PerClass *C) { - if (LIKELY(C->MaxCount)) - return; - initCache(); - DCHECK_NE(C->MaxCount, 0U); - } - NOINLINE void initCache() { for (uptr I = 0; I < NumClasses; I++) { PerClass *P = &PerClassArray[I]; const uptr Size = SizeClassAllocator::getSizeByClassId(I); - P->MaxCount = static_cast<u16>(2 * TransferBatch::getMaxCached(Size)); + P->MaxCount = static_cast<u16>(2 * getMaxCached(Size)); if (I != BatchClassId) { P->ClassSize = Size; } else { @@ -206,17 +166,12 @@ private: deallocate(BatchClassId, B); } - NOINLINE bool refill(PerClass *C, uptr ClassId) { - initCacheMaybe(C); - TransferBatch *B = Allocator->popBatch(this, ClassId); - if (UNLIKELY(!B)) - return false; - DCHECK_GT(B->getCount(), 0); - C->Count = B->getCount(); - B->copyToArray(C->Chunks); - B->clear(); - destroyBatch(ClassId, B); - return true; + NOINLINE bool refill(PerClass *C, uptr ClassId, u16 MaxRefill) { + const u16 NumBlocksRefilled = + Allocator->popBlocks(this, ClassId, C->Chunks, MaxRefill); + DCHECK_LE(NumBlocksRefilled, MaxRefill); + C->Count = static_cast<u16>(C->Count + NumBlocksRefilled); + return NumBlocksRefilled != 0; } NOINLINE void drain(PerClass *C, uptr ClassId) { diff --git a/standalone/mem_map.cpp b/standalone/mem_map.cpp new file mode 100644 index 00000000000..115cc34e706 --- /dev/null +++ b/standalone/mem_map.cpp @@ -0,0 +1,84 @@ +//===-- mem_map.cpp ---------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mem_map.h" + +#include "common.h" + +namespace scudo { + +bool MemMapDefault::mapImpl(uptr Addr, uptr Size, const char *Name, + uptr Flags) { + void *MappedAddr = + ::scudo::map(reinterpret_cast<void *>(Addr), Size, Name, Flags, &Data); + if (MappedAddr == nullptr) + return false; + Base = reinterpret_cast<uptr>(MappedAddr); + MappedBase = Base; + Capacity = Size; + return true; +} + +void MemMapDefault::unmapImpl(uptr Addr, uptr Size) { + if (Size == Capacity) { + Base = MappedBase = Capacity = 0; + } else { + if (Base == Addr) { + Base = Addr + Size; + MappedBase = MappedBase == 0 ? Base : Max(MappedBase, Base); + } + Capacity -= Size; + } + + ::scudo::unmap(reinterpret_cast<void *>(Addr), Size, UNMAP_ALL, &Data); +} + +bool MemMapDefault::remapImpl(uptr Addr, uptr Size, const char *Name, + uptr Flags) { + void *RemappedPtr = + ::scudo::map(reinterpret_cast<void *>(Addr), Size, Name, Flags, &Data); + const uptr RemappedAddr = reinterpret_cast<uptr>(RemappedPtr); + MappedBase = MappedBase == 0 ? 
RemappedAddr : Min(MappedBase, RemappedAddr); + return RemappedAddr == Addr; +} + +void MemMapDefault::releaseAndZeroPagesToOSImpl(uptr From, uptr Size) { + DCHECK_NE(MappedBase, 0U); + DCHECK_GE(From, MappedBase); + return ::scudo::releasePagesToOS(MappedBase, From - MappedBase, Size, &Data); +} + +void MemMapDefault::setMemoryPermissionImpl(uptr Addr, uptr Size, uptr Flags) { + return ::scudo::setMemoryPermission(Addr, Size, Flags); +} + +void ReservedMemoryDefault::releaseImpl() { + ::scudo::unmap(reinterpret_cast<void *>(Base), Capacity, UNMAP_ALL, &Data); +} + +bool ReservedMemoryDefault::createImpl(uptr Addr, uptr Size, const char *Name, + uptr Flags) { + void *Reserved = ::scudo::map(reinterpret_cast<void *>(Addr), Size, Name, + Flags | MAP_NOACCESS, &Data); + if (Reserved == nullptr) + return false; + + Base = reinterpret_cast<uptr>(Reserved); + Capacity = Size; + + return true; +} + +ReservedMemoryDefault::MemMapT ReservedMemoryDefault::dispatchImpl(uptr Addr, + uptr Size) { + ReservedMemoryDefault::MemMapT NewMap(Addr, Size); + NewMap.setMapPlatformData(Data); + return NewMap; +} + +} // namespace scudo diff --git a/standalone/mem_map.h b/standalone/mem_map.h new file mode 100644 index 00000000000..b92216cf271 --- /dev/null +++ b/standalone/mem_map.h @@ -0,0 +1,92 @@ +//===-- mem_map.h -----------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SCUDO_MEM_MAP_H_ +#define SCUDO_MEM_MAP_H_ + +#include "mem_map_base.h" + +#include "common.h" +#include "internal_defs.h" + +// TODO: This is only used for `MapPlatformData`. Remove these includes when we +// have all three platform specific `MemMap` and `ReservedMemory` +// implementations. +#include "fuchsia.h" +#include "linux.h" +#include "trusty.h" + +#include "mem_map_fuchsia.h" +#include "mem_map_linux.h" + +namespace scudo { + +// This will be deprecated when every allocator has been supported by each +// platform's `MemMap` implementation. +class MemMapDefault final : public MemMapBase<MemMapDefault> { +public: + constexpr MemMapDefault() = default; + MemMapDefault(uptr Base, uptr Capacity) : Base(Base), Capacity(Capacity) {} + + // Impls for base functions. + bool mapImpl(uptr Addr, uptr Size, const char *Name, uptr Flags); + void unmapImpl(uptr Addr, uptr Size); + bool remapImpl(uptr Addr, uptr Size, const char *Name, uptr Flags); + void setMemoryPermissionImpl(uptr Addr, uptr Size, uptr Flags); + void releasePagesToOSImpl(uptr From, uptr Size) { + return releaseAndZeroPagesToOSImpl(From, Size); + } + void releaseAndZeroPagesToOSImpl(uptr From, uptr Size); + uptr getBaseImpl() { return Base; } + uptr getCapacityImpl() { return Capacity; } + + void setMapPlatformData(MapPlatformData &NewData) { Data = NewData; } + +private: + uptr Base = 0; + uptr Capacity = 0; + uptr MappedBase = 0; + MapPlatformData Data = {}; +}; + +// This will be deprecated when every allocator has been supported by each +// platform's `MemMap` implementation. 
+class ReservedMemoryDefault final + : public ReservedMemory<ReservedMemoryDefault, MemMapDefault> { +public: + constexpr ReservedMemoryDefault() = default; + + bool createImpl(uptr Addr, uptr Size, const char *Name, uptr Flags); + void releaseImpl(); + MemMapT dispatchImpl(uptr Addr, uptr Size); + uptr getBaseImpl() { return Base; } + uptr getCapacityImpl() { return Capacity; } + +private: + uptr Base = 0; + uptr Capacity = 0; + MapPlatformData Data = {}; +}; + +#if SCUDO_LINUX +using ReservedMemoryT = ReservedMemoryLinux; +using MemMapT = ReservedMemoryT::MemMapT; +#elif SCUDO_FUCHSIA +using ReservedMemoryT = ReservedMemoryFuchsia; +using MemMapT = ReservedMemoryT::MemMapT; +#elif SCUDO_TRUSTY +using ReservedMemoryT = ReservedMemoryDefault; +using MemMapT = ReservedMemoryT::MemMapT; +#else +#error \ + "Unsupported platform, please implement the ReservedMemory for your platform!" +#endif + +} // namespace scudo + +#endif // SCUDO_MEM_MAP_H_ diff --git a/standalone/mem_map_base.h b/standalone/mem_map_base.h new file mode 100644 index 00000000000..99ab0cba604 --- /dev/null +++ b/standalone/mem_map_base.h @@ -0,0 +1,129 @@ +//===-- mem_map_base.h ------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SCUDO_MEM_MAP_BASE_H_ +#define SCUDO_MEM_MAP_BASE_H_ + +#include "common.h" + +namespace scudo { + +// In Scudo, every memory operation will be fulfilled through a +// platform-specific `MemMap` instance. The essential APIs are listed in the +// `MemMapBase` below. This is implemented in CRTP, so for each implementation, +// it has to implement all of the 'Impl' named functions. +template <class Derived> class MemMapBase { +public: + constexpr MemMapBase() = default; + + // This is used to map a new set of contiguous pages. Note that the `Addr` is + // only a suggestion to the system. + bool map(uptr Addr, uptr Size, const char *Name, uptr Flags = 0) { + DCHECK(!isAllocated()); + return invokeImpl(&Derived::mapImpl, Addr, Size, Name, Flags); + } + + // This is used to unmap partial/full pages from the beginning or the end. + // I.e., the result pages are expected to be still contiguous. + void unmap(uptr Addr, uptr Size) { + DCHECK(isAllocated()); + DCHECK((Addr == getBase()) || (Addr + Size == getBase() + getCapacity())); + invokeImpl(&Derived::unmapImpl, Addr, Size); + } + + // This is used to remap a mapped range (either from map() or dispatched from + // ReservedMemory). For example, we have reserved several pages and then we + // want to remap them with different accessibility. + bool remap(uptr Addr, uptr Size, const char *Name, uptr Flags = 0) { + DCHECK(isAllocated()); + DCHECK((Addr >= getBase()) && (Addr + Size <= getBase() + getCapacity())); + return invokeImpl(&Derived::remapImpl, Addr, Size, Name, Flags); + } + + // This is used to update the pages' access permission. For example, mark + // pages as no read/write permission. + void setMemoryPermission(uptr Addr, uptr Size, uptr Flags) { + DCHECK(isAllocated()); + DCHECK((Addr >= getBase()) && (Addr + Size <= getBase() + getCapacity())); + return invokeImpl(&Derived::setMemoryPermissionImpl, Addr, Size, Flags); + } + + // Suggest releasing a set of contiguous physical pages back to the OS. 
Note + // that only physical pages are supposed to be released. Any release of + // virtual pages may lead to undefined behavior. + void releasePagesToOS(uptr From, uptr Size) { + DCHECK(isAllocated()); + DCHECK((From >= getBase()) && (From + Size <= getBase() + getCapacity())); + invokeImpl(&Derived::releasePagesToOSImpl, From, Size); + } + // This is similar to the above one except that any subsequent access to the + // released pages will return with zero-filled pages. + void releaseAndZeroPagesToOS(uptr From, uptr Size) { + DCHECK(isAllocated()); + DCHECK((From >= getBase()) && (From + Size <= getBase() + getCapacity())); + invokeImpl(&Derived::releaseAndZeroPagesToOSImpl, From, Size); + } + + uptr getBase() { return invokeImpl(&Derived::getBaseImpl); } + uptr getCapacity() { return invokeImpl(&Derived::getCapacityImpl); } + + bool isAllocated() { return getBase() != 0U; } + +protected: + template <typename R, typename... Args> + R invokeImpl(R (Derived::*MemFn)(Args...), Args... args) { + return (static_cast<Derived *>(this)->*MemFn)(args...); + } +}; + +// `ReservedMemory` is a special memory handle which can be viewed as a page +// allocator. `ReservedMemory` will reserve a contiguous pages and the later +// page request can be fulfilled at the designated address. This is used when +// we want to ensure the virtual address of the MemMap will be in a known range. +// This is implemented in CRTP, so for each +// implementation, it has to implement all of the 'Impl' named functions. +template <class Derived, typename MemMapTy> class ReservedMemory { +public: + using MemMapT = MemMapTy; + constexpr ReservedMemory() = default; + + // Reserve a chunk of memory at a suggested address. + bool create(uptr Addr, uptr Size, const char *Name, uptr Flags = 0) { + DCHECK(!isCreated()); + return invokeImpl(&Derived::createImpl, Addr, Size, Name, Flags); + } + + // Release the entire reserved memory. + void release() { + DCHECK(isCreated()); + invokeImpl(&Derived::releaseImpl); + } + + // Dispatch a sub-range of reserved memory. Note that any fragmentation of + // the reserved pages is managed by each implementation. + MemMapT dispatch(uptr Addr, uptr Size) { + DCHECK(isCreated()); + DCHECK((Addr >= getBase()) && (Addr + Size <= getBase() + getCapacity())); + return invokeImpl(&Derived::dispatchImpl, Addr, Size); + } + + uptr getBase() { return invokeImpl(&Derived::getBaseImpl); } + uptr getCapacity() { return invokeImpl(&Derived::getCapacityImpl); } + + bool isCreated() { return getBase() != 0U; } + +protected: + template <typename R, typename... Args> + R invokeImpl(R (Derived::*MemFn)(Args...), Args... args) { + return (static_cast<Derived *>(this)->*MemFn)(args...); + } +}; + +} // namespace scudo + +#endif // SCUDO_MEM_MAP_BASE_H_ diff --git a/standalone/mem_map_fuchsia.cpp b/standalone/mem_map_fuchsia.cpp new file mode 100644 index 00000000000..fc793abf44c --- /dev/null +++ b/standalone/mem_map_fuchsia.cpp @@ -0,0 +1,258 @@ +//===-- mem_map_fuchsia.cpp -------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mem_map_fuchsia.h" + +#include "atomic_helpers.h" +#include "common.h" +#include "string_utils.h" + +#if SCUDO_FUCHSIA + +#include <zircon/process.h> +#include <zircon/status.h> +#include <zircon/syscalls.h> + +namespace scudo { + +static void NORETURN dieOnError(zx_status_t Status, const char *FnName, + uptr Size) { + ScopedString Error; + Error.append("SCUDO ERROR: %s failed with size %zuKB (%s)", FnName, + Size >> 10, _zx_status_get_string(Status)); + outputRaw(Error.data()); + die(); +} + +static void setVmoName(zx_handle_t Vmo, const char *Name) { + size_t Len = strlen(Name); + DCHECK_LT(Len, ZX_MAX_NAME_LEN); + zx_status_t Status = _zx_object_set_property(Vmo, ZX_PROP_NAME, Name, Len); + CHECK_EQ(Status, ZX_OK); +} + +// Returns the (cached) base address of the root VMAR. +static uptr getRootVmarBase() { + static atomic_uptr CachedResult = {0}; + + uptr Result = atomic_load(&CachedResult, memory_order_acquire); + if (UNLIKELY(!Result)) { + zx_info_vmar_t VmarInfo; + zx_status_t Status = + _zx_object_get_info(_zx_vmar_root_self(), ZX_INFO_VMAR, &VmarInfo, + sizeof(VmarInfo), nullptr, nullptr); + CHECK_EQ(Status, ZX_OK); + CHECK_NE(VmarInfo.base, 0); + + atomic_store(&CachedResult, VmarInfo.base, memory_order_release); + Result = VmarInfo.base; + } + + return Result; +} + +// Lazily creates and then always returns the same zero-sized VMO. +static zx_handle_t getPlaceholderVmo() { + static atomic_u32 StoredVmo = {ZX_HANDLE_INVALID}; + + zx_handle_t Vmo = atomic_load(&StoredVmo, memory_order_acquire); + if (UNLIKELY(Vmo == ZX_HANDLE_INVALID)) { + // Create a zero-sized placeholder VMO. + zx_status_t Status = _zx_vmo_create(0, 0, &Vmo); + if (UNLIKELY(Status != ZX_OK)) + dieOnError(Status, "zx_vmo_create", 0); + + setVmoName(Vmo, "scudo:reserved"); + + // Atomically store its handle. If some other thread wins the race, use its + // handle and discard ours. + zx_handle_t OldValue = atomic_compare_exchange_strong( + &StoredVmo, ZX_HANDLE_INVALID, Vmo, memory_order_acq_rel); + if (UNLIKELY(OldValue != ZX_HANDLE_INVALID)) { + Status = _zx_handle_close(Vmo); + CHECK_EQ(Status, ZX_OK); + + Vmo = OldValue; + } + } + + return Vmo; +} + +// Checks if MAP_ALLOWNOMEM allows the given error code. +static bool IsNoMemError(zx_status_t Status) { + // Note: _zx_vmar_map returns ZX_ERR_NO_RESOURCES if the VMAR does not contain + // a suitable free spot. + return Status == ZX_ERR_NO_MEMORY || Status == ZX_ERR_NO_RESOURCES; +} + +MemMapFuchsia::MemMapFuchsia(uptr Base, uptr Capacity) + : MapAddr(Base), WindowBase(Base), WindowSize(Capacity) { + // Create the VMO. + zx_status_t Status = _zx_vmo_create(Capacity, 0, &Vmo); + if (UNLIKELY(Status != ZX_OK)) + dieOnError(Status, "zx_vmo_create", Capacity); +} + +bool MemMapFuchsia::mapImpl(UNUSED uptr Addr, uptr Size, const char *Name, + uptr Flags) { + const bool AllowNoMem = !!(Flags & MAP_ALLOWNOMEM); + const bool PreCommit = !!(Flags & MAP_PRECOMMIT); + const bool NoAccess = !!(Flags & MAP_NOACCESS); + + // Create the VMO. + zx_status_t Status = _zx_vmo_create(Size, 0, &Vmo); + if (UNLIKELY(Status != ZX_OK)) { + if (AllowNoMem && IsNoMemError(Status)) + return false; + dieOnError(Status, "zx_vmo_create", Size); + } + + if (Name != nullptr) + setVmoName(Vmo, Name); + + // Map it. 
+ zx_vm_option_t MapFlags = ZX_VM_ALLOW_FAULTS; + if (!NoAccess) + MapFlags |= ZX_VM_PERM_READ | ZX_VM_PERM_WRITE; + Status = + _zx_vmar_map(_zx_vmar_root_self(), MapFlags, 0, Vmo, 0, Size, &MapAddr); + if (UNLIKELY(Status != ZX_OK)) { + if (AllowNoMem && IsNoMemError(Status)) { + Status = _zx_handle_close(Vmo); + CHECK_EQ(Status, ZX_OK); + + MapAddr = 0; + Vmo = ZX_HANDLE_INVALID; + return false; + } + dieOnError(Status, "zx_vmar_map", Size); + } + + if (PreCommit) { + Status = _zx_vmar_op_range(_zx_vmar_root_self(), ZX_VMAR_OP_COMMIT, MapAddr, + Size, nullptr, 0); + CHECK_EQ(Status, ZX_OK); + } + + WindowBase = MapAddr; + WindowSize = Size; + return true; +} + +void MemMapFuchsia::unmapImpl(uptr Addr, uptr Size) { + zx_status_t Status; + + if (Size == WindowSize) { + // NOTE: Closing first and then unmapping seems slightly faster than doing + // the same operations in the opposite order. + Status = _zx_handle_close(Vmo); + CHECK_EQ(Status, ZX_OK); + Status = _zx_vmar_unmap(_zx_vmar_root_self(), Addr, Size); + CHECK_EQ(Status, ZX_OK); + + MapAddr = WindowBase = WindowSize = 0; + Vmo = ZX_HANDLE_INVALID; + } else { + // Unmap the subrange. + Status = _zx_vmar_unmap(_zx_vmar_root_self(), Addr, Size); + CHECK_EQ(Status, ZX_OK); + + // Decommit the pages that we just unmapped. + Status = _zx_vmo_op_range(Vmo, ZX_VMO_OP_DECOMMIT, Addr - MapAddr, Size, + nullptr, 0); + CHECK_EQ(Status, ZX_OK); + + if (Addr == WindowBase) + WindowBase += Size; + WindowSize -= Size; + } +} + +bool MemMapFuchsia::remapImpl(uptr Addr, uptr Size, const char *Name, + uptr Flags) { + const bool AllowNoMem = !!(Flags & MAP_ALLOWNOMEM); + const bool PreCommit = !!(Flags & MAP_PRECOMMIT); + const bool NoAccess = !!(Flags & MAP_NOACCESS); + + // NOTE: This will rename the *whole* VMO, not only the requested portion of + // it. But we cannot do better than this given the MemMap API. In practice, + // the upper layers of Scudo always pass the same Name for a given MemMap. + if (Name != nullptr) + setVmoName(Vmo, Name); + + uptr MappedAddr; + zx_vm_option_t MapFlags = ZX_VM_ALLOW_FAULTS | ZX_VM_SPECIFIC_OVERWRITE; + if (!NoAccess) + MapFlags |= ZX_VM_PERM_READ | ZX_VM_PERM_WRITE; + zx_status_t Status = + _zx_vmar_map(_zx_vmar_root_self(), MapFlags, Addr - getRootVmarBase(), + Vmo, Addr - MapAddr, Size, &MappedAddr); + if (UNLIKELY(Status != ZX_OK)) { + if (AllowNoMem && IsNoMemError(Status)) + return false; + dieOnError(Status, "zx_vmar_map", Size); + } + DCHECK_EQ(Addr, MappedAddr); + + if (PreCommit) { + Status = _zx_vmar_op_range(_zx_vmar_root_self(), ZX_VMAR_OP_COMMIT, MapAddr, + Size, nullptr, 0); + CHECK_EQ(Status, ZX_OK); + } + + return true; +} + +void MemMapFuchsia::releaseAndZeroPagesToOSImpl(uptr From, uptr Size) { + zx_status_t Status = _zx_vmo_op_range(Vmo, ZX_VMO_OP_DECOMMIT, From - MapAddr, + Size, nullptr, 0); + CHECK_EQ(Status, ZX_OK); +} + +void MemMapFuchsia::setMemoryPermissionImpl(uptr Addr, uptr Size, uptr Flags) { + const bool NoAccess = !!(Flags & MAP_NOACCESS); + + zx_vm_option_t MapFlags = 0; + if (!NoAccess) + MapFlags |= ZX_VM_PERM_READ | ZX_VM_PERM_WRITE; + zx_status_t Status = + _zx_vmar_protect(_zx_vmar_root_self(), MapFlags, Addr, Size); + CHECK_EQ(Status, ZX_OK); +} + +bool ReservedMemoryFuchsia::createImpl(UNUSED uptr Addr, uptr Size, + UNUSED const char *Name, uptr Flags) { + const bool AllowNoMem = !!(Flags & MAP_ALLOWNOMEM); + + // Reserve memory by mapping the placeholder VMO without any permission. 
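  // (Mapping the shared zero-sized placeholder VMO with no permission bits
  // simply claims a contiguous range of the root VMAR without committing any
  // memory. dispatchImpl() below then hands out MemMapFuchsia windows over
  // sub-ranges of that reservation, and their remapImpl() overlays a real VMO
  // at the requested address via ZX_VM_SPECIFIC_OVERWRITE.)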
+ zx_status_t Status = _zx_vmar_map(_zx_vmar_root_self(), ZX_VM_ALLOW_FAULTS, 0, + getPlaceholderVmo(), 0, Size, &Base); + if (UNLIKELY(Status != ZX_OK)) { + if (AllowNoMem && IsNoMemError(Status)) + return false; + dieOnError(Status, "zx_vmar_map", Size); + } + + Capacity = Size; + return true; +} + +void ReservedMemoryFuchsia::releaseImpl() { + zx_status_t Status = _zx_vmar_unmap(_zx_vmar_root_self(), Base, Capacity); + CHECK_EQ(Status, ZX_OK); +} + +ReservedMemoryFuchsia::MemMapT ReservedMemoryFuchsia::dispatchImpl(uptr Addr, + uptr Size) { + return ReservedMemoryFuchsia::MemMapT(Addr, Size); +} + +} // namespace scudo + +#endif // SCUDO_FUCHSIA diff --git a/standalone/mem_map_fuchsia.h b/standalone/mem_map_fuchsia.h new file mode 100644 index 00000000000..2e66f89cfca --- /dev/null +++ b/standalone/mem_map_fuchsia.h @@ -0,0 +1,75 @@ +//===-- mem_map_fuchsia.h ---------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SCUDO_MEM_MAP_FUCHSIA_H_ +#define SCUDO_MEM_MAP_FUCHSIA_H_ + +#include "mem_map_base.h" + +#if SCUDO_FUCHSIA + +#include <stdint.h> +#include <zircon/types.h> + +namespace scudo { + +class MemMapFuchsia final : public MemMapBase<MemMapFuchsia> { +public: + constexpr MemMapFuchsia() = default; + + // Impls for base functions. + bool mapImpl(uptr Addr, uptr Size, const char *Name, uptr Flags); + void unmapImpl(uptr Addr, uptr Size); + bool remapImpl(uptr Addr, uptr Size, const char *Name, uptr Flags); + void setMemoryPermissionImpl(uptr Addr, uptr Size, uptr Flags); + void releasePagesToOSImpl(uptr From, uptr Size) { + return releaseAndZeroPagesToOSImpl(From, Size); + } + void releaseAndZeroPagesToOSImpl(uptr From, uptr Size); + uptr getBaseImpl() { return WindowBase; } + uptr getCapacityImpl() { return WindowSize; } + +private: + friend class ReservedMemoryFuchsia; + + // Used by ReservedMemoryFuchsia::dispatch. + MemMapFuchsia(uptr Base, uptr Capacity); + + // Virtual memory address corresponding to VMO offset 0. + uptr MapAddr = 0; + + // Virtual memory base address and size of the VMO subrange that is still in + // use. unmapImpl() can shrink this range, either at the beginning or at the + // end. + uptr WindowBase = 0; + uptr WindowSize = 0; + + zx_handle_t Vmo = ZX_HANDLE_INVALID; +}; + +class ReservedMemoryFuchsia final + : public ReservedMemory<ReservedMemoryFuchsia, MemMapFuchsia> { +public: + constexpr ReservedMemoryFuchsia() = default; + + bool createImpl(uptr Addr, uptr Size, const char *Name, uptr Flags); + void releaseImpl(); + MemMapT dispatchImpl(uptr Addr, uptr Size); + uptr getBaseImpl() { return Base; } + uptr getCapacityImpl() { return Capacity; } + +private: + uptr Base = 0; + uptr Capacity = 0; +}; + +} // namespace scudo + +#endif // SCUDO_FUCHSIA + +#endif // SCUDO_MEM_MAP_FUCHSIA_H_ diff --git a/standalone/mem_map_linux.cpp b/standalone/mem_map_linux.cpp new file mode 100644 index 00000000000..783c4f0d9ab --- /dev/null +++ b/standalone/mem_map_linux.cpp @@ -0,0 +1,153 @@ +//===-- mem_map_linux.cpp ---------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "platform.h" + +#if SCUDO_LINUX + +#include "mem_map_linux.h" + +#include "common.h" +#include "internal_defs.h" +#include "linux.h" +#include "mutex.h" +#include "report_linux.h" +#include "string_utils.h" + +#include <errno.h> +#include <fcntl.h> +#include <linux/futex.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/time.h> +#include <time.h> +#include <unistd.h> + +#if SCUDO_ANDROID +// TODO(chiahungduan): Review if we still need the followings macros. +#include <sys/prctl.h> +// Definitions of prctl arguments to set a vma name in Android kernels. +#define ANDROID_PR_SET_VMA 0x53564d41 +#define ANDROID_PR_SET_VMA_ANON_NAME 0 +#endif + +namespace scudo { + +static void *mmapWrapper(uptr Addr, uptr Size, const char *Name, uptr Flags) { + int MmapFlags = MAP_PRIVATE | MAP_ANONYMOUS; + int MmapProt; + if (Flags & MAP_NOACCESS) { + MmapFlags |= MAP_NORESERVE; + MmapProt = PROT_NONE; + } else { + MmapProt = PROT_READ | PROT_WRITE; + } +#if defined(__aarch64__) +#ifndef PROT_MTE +#define PROT_MTE 0x20 +#endif + if (Flags & MAP_MEMTAG) + MmapProt |= PROT_MTE; +#endif + if (Addr) + MmapFlags |= MAP_FIXED; + void *P = + mmap(reinterpret_cast<void *>(Addr), Size, MmapProt, MmapFlags, -1, 0); + if (P == MAP_FAILED) { + if (!(Flags & MAP_ALLOWNOMEM) || errno != ENOMEM) + reportMapError(errno == ENOMEM ? Size : 0); + return nullptr; + } +#if SCUDO_ANDROID + if (Name) + prctl(ANDROID_PR_SET_VMA, ANDROID_PR_SET_VMA_ANON_NAME, P, Size, Name); +#else + (void)Name; +#endif + + return P; +} + +bool MemMapLinux::mapImpl(uptr Addr, uptr Size, const char *Name, uptr Flags) { + void *P = mmapWrapper(Addr, Size, Name, Flags); + if (P == nullptr) + return false; + + MapBase = reinterpret_cast<uptr>(P); + MapCapacity = Size; + return true; +} + +void MemMapLinux::unmapImpl(uptr Addr, uptr Size) { + // If we unmap all the pages, also mark `MapBase` to 0 to indicate invalid + // status. + if (Size == MapCapacity) { + MapBase = MapCapacity = 0; + } else { + // This is partial unmap and is unmapping the pages from the beginning, + // shift `MapBase` to the new base. + if (MapBase == Addr) + MapBase = Addr + Size; + MapCapacity -= Size; + } + + if (munmap(reinterpret_cast<void *>(Addr), Size) != 0) + reportUnmapError(Addr, Size); +} + +bool MemMapLinux::remapImpl(uptr Addr, uptr Size, const char *Name, + uptr Flags) { + void *P = mmapWrapper(Addr, Size, Name, Flags); + if (reinterpret_cast<uptr>(P) != Addr) + reportMapError(); + return true; +} + +void MemMapLinux::setMemoryPermissionImpl(uptr Addr, uptr Size, uptr Flags) { + int Prot = (Flags & MAP_NOACCESS) ? 
PROT_NONE : (PROT_READ | PROT_WRITE); + if (mprotect(reinterpret_cast<void *>(Addr), Size, Prot) != 0) + reportProtectError(Addr, Size, Prot); +} + +void MemMapLinux::releaseAndZeroPagesToOSImpl(uptr From, uptr Size) { + void *Addr = reinterpret_cast<void *>(From); + + while (madvise(Addr, Size, MADV_DONTNEED) == -1 && errno == EAGAIN) { + } +} + +bool ReservedMemoryLinux::createImpl(uptr Addr, uptr Size, const char *Name, + uptr Flags) { + ReservedMemoryLinux::MemMapT MemMap; + if (!MemMap.map(Addr, Size, Name, Flags | MAP_NOACCESS)) + return false; + + MapBase = MemMap.getBase(); + MapCapacity = MemMap.getCapacity(); + + return true; +} + +void ReservedMemoryLinux::releaseImpl() { + if (munmap(reinterpret_cast<void *>(getBase()), getCapacity()) != 0) + reportUnmapError(getBase(), getCapacity()); +} + +ReservedMemoryLinux::MemMapT ReservedMemoryLinux::dispatchImpl(uptr Addr, + uptr Size) { + return ReservedMemoryLinux::MemMapT(Addr, Size); +} + +} // namespace scudo + +#endif // SCUDO_LINUX diff --git a/standalone/mem_map_linux.h b/standalone/mem_map_linux.h new file mode 100644 index 00000000000..7a89b3bff5e --- /dev/null +++ b/standalone/mem_map_linux.h @@ -0,0 +1,67 @@ +//===-- mem_map_linux.h -----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SCUDO_MEM_MAP_LINUX_H_ +#define SCUDO_MEM_MAP_LINUX_H_ + +#include "platform.h" + +#if SCUDO_LINUX + +#include "common.h" +#include "mem_map_base.h" + +namespace scudo { + +class MemMapLinux final : public MemMapBase<MemMapLinux> { +public: + constexpr MemMapLinux() = default; + MemMapLinux(uptr Base, uptr Capacity) + : MapBase(Base), MapCapacity(Capacity) {} + + // Impls for base functions. + bool mapImpl(uptr Addr, uptr Size, const char *Name, uptr Flags = 0); + void unmapImpl(uptr Addr, uptr Size); + bool remapImpl(uptr Addr, uptr Size, const char *Name, uptr Flags = 0); + void setMemoryPermissionImpl(uptr Addr, uptr Size, uptr Flags); + void releasePagesToOSImpl(uptr From, uptr Size) { + return releaseAndZeroPagesToOSImpl(From, Size); + } + void releaseAndZeroPagesToOSImpl(uptr From, uptr Size); + uptr getBaseImpl() { return MapBase; } + uptr getCapacityImpl() { return MapCapacity; } + +private: + uptr MapBase = 0; + uptr MapCapacity = 0; +}; + +// This will be deprecated when every allocator has been supported by each +// platform's `MemMap` implementation. +class ReservedMemoryLinux final + : public ReservedMemory<ReservedMemoryLinux, MemMapLinux> { +public: + // The following two are the Impls for function in `MemMapBase`. + uptr getBaseImpl() { return MapBase; } + uptr getCapacityImpl() { return MapCapacity; } + + // These threes are specific to `ReservedMemory`. 
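  // (Illustrative call sequence for the public API these implement, assuming
  // SCUDO_LINUX, a page-aligned Size, and a made-up "scudo:example" mapping
  // name; a sketch of the reserve-then-dispatch pattern, not code from this
  // patch:
  //
  //   ReservedMemoryT Reserved;
  //   if (!Reserved.create(/*Addr=*/0U, Size, "scudo:example"))
  //     return false;
  //   MemMapT MemMap = Reserved.dispatch(Reserved.getBase(), Size);
  //   // The reservation was created MAP_NOACCESS, so remap() is what makes
  //   // the range readable/writable.
  //   MemMap.remap(MemMap.getBase(), Size, "scudo:example");
  //   ...
  //   MemMap.releasePagesToOS(MemMap.getBase(), Size);
  //   MemMap.unmap(MemMap.getBase(), MemMap.getCapacity());
  // )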
+ bool createImpl(uptr Addr, uptr Size, const char *Name, uptr Flags); + void releaseImpl(); + MemMapT dispatchImpl(uptr Addr, uptr Size); + +private: + uptr MapBase = 0; + uptr MapCapacity = 0; +}; + +} // namespace scudo + +#endif // SCUDO_LINUX + +#endif // SCUDO_MEM_MAP_LINUX_H_ diff --git a/standalone/memtag.h b/standalone/memtag.h index 7f14a30fee1..1f6983e9940 100644 --- a/standalone/memtag.h +++ b/standalone/memtag.h @@ -11,7 +11,7 @@ #include "internal_defs.h" -#if SCUDO_LINUX +#if SCUDO_CAN_USE_MTE #include <sys/auxv.h> #include <sys/prctl.h> #endif @@ -25,7 +25,7 @@ namespace scudo { // tagging. Not all operating systems enable TBI, so we only claim architectural // support for memory tagging if the operating system enables TBI. // HWASan uses the top byte for its own purpose and Scudo should not touch it. -#if SCUDO_LINUX && !defined(SCUDO_DISABLE_TBI) && \ +#if SCUDO_CAN_USE_MTE && !defined(SCUDO_DISABLE_TBI) && \ !__has_feature(hwaddress_sanitizer) inline constexpr bool archSupportsMemoryTagging() { return true; } #else @@ -60,7 +60,7 @@ inline NORETURN uint8_t extractTag(uptr Ptr) { #if __clang_major__ >= 12 && defined(__aarch64__) && !defined(__ILP32__) -#if SCUDO_LINUX +#if SCUDO_CAN_USE_MTE inline bool systemSupportsMemoryTagging() { #ifndef HWCAP2_MTE @@ -106,7 +106,7 @@ inline void enableSystemMemoryTaggingTestOnly() { 0, 0, 0); } -#else // !SCUDO_LINUX +#else // !SCUDO_CAN_USE_MTE inline bool systemSupportsMemoryTagging() { return false; } @@ -118,7 +118,7 @@ inline NORETURN void enableSystemMemoryTaggingTestOnly() { UNREACHABLE("memory tagging not supported"); } -#endif // SCUDO_LINUX +#endif // SCUDO_CAN_USE_MTE class ScopedDisableMemoryTagChecks { uptr PrevTCO; @@ -326,7 +326,7 @@ inline void *addFixedTag(void *Ptr, uptr Tag) { template <typename Config> inline constexpr bool allocatorSupportsMemoryTagging() { - return archSupportsMemoryTagging() && Config::MaySupportMemoryTagging && + return archSupportsMemoryTagging() && Config::getMaySupportMemoryTagging() && (1 << SCUDO_MIN_ALIGNMENT_LOG) >= archMemoryTagGranuleSize(); } diff --git a/standalone/mutex.h b/standalone/mutex.h index 05340de3e12..4caa945219b 100644 --- a/standalone/mutex.h +++ b/standalone/mutex.h @@ -35,7 +35,7 @@ public: #pragma nounroll #endif for (u8 I = 0U; I < NumberOfTries; I++) { - yieldProcessor(NumberOfYields); + delayLoop(); if (tryLock()) return; } @@ -53,10 +53,23 @@ public: } private: + void delayLoop() { + // The value comes from the average time spent in accessing caches (which + // are the fastest operations) so that we are unlikely to wait too long for + // fast operations. + constexpr u32 SpinTimes = 16; + volatile u32 V = 0; + for (u32 I = 0; I < SpinTimes; ++I) { + u32 Tmp = V + 1; + V = Tmp; + } + } + void assertHeldImpl(); - static constexpr u8 NumberOfTries = 8U; - static constexpr u8 NumberOfYields = 8U; + // TODO(chiahungduan): Adapt this value based on scenarios. E.g., primary and + // secondary allocator have different allocation times. 
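For a rough sense of how much spinning the new constants allow before the mutex falls back to blocking, the arithmetic below (an editorial sketch, values copied from the HybridMutex changes in this patch) multiplies the retry count by the iterations in delayLoop().

// Spin budget implied by the constants above (sketch only).
constexpr unsigned NumberOfTries = 32;   // retries before the blocking path
constexpr unsigned SpinTimes = 16;       // volatile increments per delayLoop() call
constexpr unsigned MaxSpinWork = NumberOfTries * SpinTimes; // 512 trivial iterations
static_assert(MaxSpinWork == 512, "32 tries x 16 delay iterations");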
+ static constexpr u8 NumberOfTries = 32U; #if SCUDO_LINUX atomic_u32 M = {}; diff --git a/standalone/options.h b/standalone/options.h index 4e678651333..b20142a4159 100644 --- a/standalone/options.h +++ b/standalone/options.h @@ -38,7 +38,7 @@ struct Options { } }; -template <typename Config> bool useMemoryTagging(Options Options) { +template <typename Config> bool useMemoryTagging(const Options &Options) { return allocatorSupportsMemoryTagging<Config>() && Options.get(OptionBit::UseMemoryTagging); } diff --git a/standalone/platform.h b/standalone/platform.h index db4217ddab9..5af1275e32d 100644 --- a/standalone/platform.h +++ b/standalone/platform.h @@ -37,6 +37,12 @@ #define SCUDO_TRUSTY 0 #endif +#if defined(__riscv) && (__riscv_xlen == 64) +#define SCUDO_RISCV64 1 +#else +#define SCUDO_RISCV64 0 +#endif + #if defined(__LP64__) #define SCUDO_WORDSIZE 64U #else @@ -53,6 +59,14 @@ #define SCUDO_CAN_USE_PRIMARY64 (SCUDO_WORDSIZE == 64U) #endif +#ifndef SCUDO_CAN_USE_MTE +#define SCUDO_CAN_USE_MTE (SCUDO_LINUX || SCUDO_TRUSTY) +#endif + +#ifndef SCUDO_ENABLE_HOOKS +#define SCUDO_ENABLE_HOOKS 0 +#endif + #ifndef SCUDO_MIN_ALIGNMENT_LOG // We force malloc-type functions to be aligned to std::max_align_t, but there // is no reason why the minimum alignment for all other functions can't be 8 diff --git a/standalone/primary32.h b/standalone/primary32.h index 0edc40d7e6c..ebfb8dfe0a3 100644 --- a/standalone/primary32.h +++ b/standalone/primary32.h @@ -9,6 +9,7 @@ #ifndef SCUDO_PRIMARY32_H_ #define SCUDO_PRIMARY32_H_ +#include "allocator_common.h" #include "bytemap.h" #include "common.h" #include "list.h" @@ -42,22 +43,25 @@ namespace scudo { template <typename Config> class SizeClassAllocator32 { public: - typedef typename Config::PrimaryCompactPtrT CompactPtrT; + typedef typename Config::CompactPtrT CompactPtrT; typedef typename Config::SizeClassMap SizeClassMap; - static const uptr GroupSizeLog = Config::PrimaryGroupSizeLog; + static const uptr GroupSizeLog = Config::getGroupSizeLog(); // The bytemap can only track UINT8_MAX - 1 classes. static_assert(SizeClassMap::LargestClassId <= (UINT8_MAX - 1), ""); // Regions should be large enough to hold the largest Block. - static_assert((1UL << Config::PrimaryRegionSizeLog) >= SizeClassMap::MaxSize, + static_assert((1UL << Config::getRegionSizeLog()) >= SizeClassMap::MaxSize, ""); typedef SizeClassAllocator32<Config> ThisT; typedef SizeClassAllocatorLocalCache<ThisT> CacheT; - typedef typename CacheT::TransferBatch TransferBatch; - typedef typename CacheT::BatchGroup BatchGroup; + typedef TransferBatch<ThisT> TransferBatchT; + typedef BatchGroup<ThisT> BatchGroupT; + + static_assert(sizeof(BatchGroupT) <= sizeof(TransferBatchT), + "BatchGroupT uses the same class size as TransferBatchT"); static uptr getSizeByClassId(uptr ClassId) { return (ClassId == SizeClassMap::BatchClassId) - ? sizeof(TransferBatch) + ? sizeof(TransferBatchT) : SizeClassMap::getSizeByClassId(ClassId); } @@ -73,7 +77,7 @@ public: DCHECK(isAligned(reinterpret_cast<uptr>(this), alignof(ThisT))); PossibleRegions.init(); u32 Seed; - const u64 Time = getMonotonicTime(); + const u64 Time = getMonotonicTimeFast(); if (!getRandom(reinterpret_cast<void *>(&Seed), sizeof(Seed))) Seed = static_cast<u32>( Time ^ (reinterpret_cast<uptr>(SizeClassInfoArray) >> 6)); @@ -84,6 +88,10 @@ public: Sci->MinRegionIndex = NumRegions; Sci->ReleaseInfo.LastReleaseAtNs = Time; } + + // The default value in the primary config has the higher priority. 
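The precedence rule stated in the comment above is simply that any config-provided default other than INT32_MIN (the "no default configured" sentinel) overrides the interval passed to init(). A minimal sketch of that rule, with a hypothetical helper name:

// Sketch of the release-interval precedence (hypothetical helper, not scudo API).
#include <cstdint>

static int pickReleaseIntervalMs(int FromInit, int FromConfig) {
  // INT32_MIN means the config supplies no default of its own.
  return FromConfig != INT32_MIN ? FromConfig : FromInit;
}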
+ if (Config::getDefaultReleaseToOsIntervalMs() != INT32_MIN) + ReleaseToOsInterval = Config::getDefaultReleaseToOsIntervalMs(); setOption(Option::ReleaseInterval, static_cast<sptr>(ReleaseToOsInterval)); } @@ -108,12 +116,57 @@ public: } ScopedLock L(ByteMapMutex); - for (uptr I = MinRegionIndex; I < MaxRegionIndex; I++) + for (uptr I = MinRegionIndex; I <= MaxRegionIndex; I++) if (PossibleRegions[I]) unmap(reinterpret_cast<void *>(I * RegionSize), RegionSize); PossibleRegions.unmapTestOnly(); } + // When all blocks are freed, it has to be the same size as `AllocatedUser`. + void verifyAllBlocksAreReleasedTestOnly() { + // `BatchGroup` and `TransferBatch` also use the blocks from BatchClass. + uptr BatchClassUsedInFreeLists = 0; + for (uptr I = 0; I < NumClasses; I++) { + // We have to count BatchClassUsedInFreeLists in other regions first. + if (I == SizeClassMap::BatchClassId) + continue; + SizeClassInfo *Sci = getSizeClassInfo(I); + ScopedLock L1(Sci->Mutex); + uptr TotalBlocks = 0; + for (BatchGroupT &BG : Sci->FreeListInfo.BlockList) { + // `BG::Batches` are `TransferBatches`. +1 for `BatchGroup`. + BatchClassUsedInFreeLists += BG.Batches.size() + 1; + for (const auto &It : BG.Batches) + TotalBlocks += It.getCount(); + } + + const uptr BlockSize = getSizeByClassId(I); + DCHECK_EQ(TotalBlocks, Sci->AllocatedUser / BlockSize); + DCHECK_EQ(Sci->FreeListInfo.PushedBlocks, Sci->FreeListInfo.PoppedBlocks); + } + + SizeClassInfo *Sci = getSizeClassInfo(SizeClassMap::BatchClassId); + ScopedLock L1(Sci->Mutex); + uptr TotalBlocks = 0; + for (BatchGroupT &BG : Sci->FreeListInfo.BlockList) { + if (LIKELY(!BG.Batches.empty())) { + for (const auto &It : BG.Batches) + TotalBlocks += It.getCount(); + } else { + // `BatchGroup` with empty freelist doesn't have `TransferBatch` record + // itself. 
+ ++TotalBlocks; + } + } + + const uptr BlockSize = getSizeByClassId(SizeClassMap::BatchClassId); + DCHECK_EQ(TotalBlocks + BatchClassUsedInFreeLists, + Sci->AllocatedUser / BlockSize); + const uptr BlocksInUse = + Sci->FreeListInfo.PoppedBlocks - Sci->FreeListInfo.PushedBlocks; + DCHECK_EQ(BlocksInUse, BatchClassUsedInFreeLists); + } + CompactPtrT compactPtr(UNUSED uptr ClassId, uptr Ptr) const { return static_cast<CompactPtrT>(Ptr); } @@ -122,26 +175,40 @@ public: return reinterpret_cast<void *>(static_cast<uptr>(CompactPtr)); } - uptr compactPtrGroup(CompactPtrT CompactPtr) { - return CompactPtr >> GroupSizeLog; + uptr compactPtrGroupBase(CompactPtrT CompactPtr) { + const uptr Mask = (static_cast<uptr>(1) << GroupSizeLog) - 1; + return CompactPtr & ~Mask; + } + + uptr decompactGroupBase(uptr CompactPtrGroupBase) { + return CompactPtrGroupBase; + } + + ALWAYS_INLINE static bool isSmallBlock(uptr BlockSize) { + const uptr PageSize = getPageSizeCached(); + return BlockSize < PageSize / 16U; } - uptr batchGroupBase(uptr GroupId) { return GroupId << GroupSizeLog; } + ALWAYS_INLINE static bool isLargeBlock(uptr BlockSize) { + const uptr PageSize = getPageSizeCached(); + return BlockSize > PageSize; + } - TransferBatch *popBatch(CacheT *C, uptr ClassId) { + u16 popBlocks(CacheT *C, uptr ClassId, CompactPtrT *ToArray, + const u16 MaxBlockCount) { DCHECK_LT(ClassId, NumClasses); SizeClassInfo *Sci = getSizeClassInfo(ClassId); ScopedLock L(Sci->Mutex); - TransferBatch *B = popBatchImpl(C, ClassId, Sci); - if (UNLIKELY(!B)) { + + u16 PopCount = popBlocksImpl(C, ClassId, Sci, ToArray, MaxBlockCount); + if (UNLIKELY(PopCount == 0)) { if (UNLIKELY(!populateFreeList(C, ClassId, Sci))) - return nullptr; - B = popBatchImpl(C, ClassId, Sci); - // if `populateFreeList` succeeded, we are supposed to get free blocks. - DCHECK_NE(B, nullptr); + return 0U; + PopCount = popBlocksImpl(C, ClassId, Sci, ToArray, MaxBlockCount); + DCHECK_NE(PopCount, 0U); } - Sci->Stats.PoppedBlocks += B->getCount(); - return B; + + return PopCount; } // Push the array of free blocks to the designated batch group. @@ -152,16 +219,7 @@ public: SizeClassInfo *Sci = getSizeClassInfo(ClassId); if (ClassId == SizeClassMap::BatchClassId) { ScopedLock L(Sci->Mutex); - // Constructing a batch group in the free list will use two blocks in - // BatchClassId. If we are pushing BatchClassId blocks, we will use the - // blocks in the array directly (can't delegate local cache which will - // cause a recursive allocation). However, The number of free blocks may - // be less than two. Therefore, populate the free list before inserting - // the blocks. - if (Size == 1 && !populateFreeList(C, ClassId, Sci)) - return; - pushBlocksImpl(C, ClassId, Sci, Array, Size); - Sci->Stats.PushedBlocks += Size; + pushBatchClassBlocks(Sci, Array, Size); return; } @@ -172,11 +230,12 @@ public: // together. 
bool SameGroup = true; for (u32 I = 1; I < Size; ++I) { - if (compactPtrGroup(Array[I - 1]) != compactPtrGroup(Array[I])) + if (compactPtrGroupBase(Array[I - 1]) != compactPtrGroupBase(Array[I])) SameGroup = false; CompactPtrT Cur = Array[I]; u32 J = I; - while (J > 0 && compactPtrGroup(Cur) < compactPtrGroup(Array[J - 1])) { + while (J > 0 && + compactPtrGroupBase(Cur) < compactPtrGroupBase(Array[J - 1])) { Array[J] = Array[J - 1]; --J; } @@ -185,10 +244,6 @@ public: ScopedLock L(Sci->Mutex); pushBlocksImpl(C, ClassId, Sci, Array, Size, SameGroup); - - Sci->Stats.PushedBlocks += Size; - if (ClassId != SizeClassMap::BatchClassId) - releaseToOSMaybe(Sci, ClassId); } void disable() NO_THREAD_SAFETY_ANALYSIS { @@ -252,8 +307,8 @@ public: SizeClassInfo *Sci = getSizeClassInfo(I); ScopedLock L(Sci->Mutex); TotalMapped += Sci->AllocatedUser; - PoppedBlocks += Sci->Stats.PoppedBlocks; - PushedBlocks += Sci->Stats.PushedBlocks; + PoppedBlocks += Sci->FreeListInfo.PoppedBlocks; + PushedBlocks += Sci->FreeListInfo.PushedBlocks; } Str->append("Stats: SizeClassAllocator32: %zuM mapped in %zu allocations; " "remains %zu\n", @@ -261,15 +316,27 @@ public: for (uptr I = 0; I < NumClasses; I++) { SizeClassInfo *Sci = getSizeClassInfo(I); ScopedLock L(Sci->Mutex); - getStats(Str, I, Sci, 0); + getStats(Str, I, Sci); + } + } + + void getFragmentationInfo(ScopedString *Str) { + Str->append( + "Fragmentation Stats: SizeClassAllocator32: page size = %zu bytes\n", + getPageSizeCached()); + + for (uptr I = 1; I < NumClasses; I++) { + SizeClassInfo *Sci = getSizeClassInfo(I); + ScopedLock L(Sci->Mutex); + getSizeClassFragmentationInfo(Sci, I, Str); } } bool setOption(Option O, sptr Value) { if (O == Option::ReleaseInterval) { const s32 Interval = Max( - Min(static_cast<s32>(Value), Config::PrimaryMaxReleaseToOsIntervalMs), - Config::PrimaryMinReleaseToOsIntervalMs); + Min(static_cast<s32>(Value), Config::getMaxReleaseToOsIntervalMs()), + Config::getMinReleaseToOsIntervalMs()); atomic_store_relaxed(&ReleaseToOsIntervalMs, Interval); return true; } @@ -277,14 +344,22 @@ public: return true; } - uptr releaseToOS() { + uptr tryReleaseToOS(uptr ClassId, ReleaseToOS ReleaseType) { + SizeClassInfo *Sci = getSizeClassInfo(ClassId); + // TODO: Once we have separate locks like primary64, we may consider using + // tryLock() as well. + ScopedLock L(Sci->Mutex); + return releaseToOSMaybe(Sci, ClassId, ReleaseType); + } + + uptr releaseToOS(ReleaseToOS ReleaseType) { uptr TotalReleasedBytes = 0; for (uptr I = 0; I < NumClasses; I++) { if (I == SizeClassMap::BatchClassId) continue; SizeClassInfo *Sci = getSizeClassInfo(I); ScopedLock L(Sci->Mutex); - TotalReleasedBytes += releaseToOSMaybe(Sci, I, /*Force=*/true); + TotalReleasedBytes += releaseToOSMaybe(Sci, I, ReleaseType); } return TotalReleasedBytes; } @@ -301,30 +376,30 @@ public: private: static const uptr NumClasses = SizeClassMap::NumClasses; - static const uptr RegionSize = 1UL << Config::PrimaryRegionSizeLog; - static const uptr NumRegions = - SCUDO_MMAP_RANGE_SIZE >> Config::PrimaryRegionSizeLog; + static const uptr RegionSize = 1UL << Config::getRegionSizeLog(); + static const uptr NumRegions = SCUDO_MMAP_RANGE_SIZE >> + Config::getRegionSizeLog(); static const u32 MaxNumBatches = SCUDO_ANDROID ? 
4U : 8U; typedef FlatByteMap<NumRegions> ByteMap; - struct SizeClassStats { - uptr PoppedBlocks; - uptr PushedBlocks; - }; - struct ReleaseToOsInfo { - uptr PushedBlocksAtLastRelease; + uptr BytesInFreeListAtLastCheckpoint; uptr RangesReleased; uptr LastReleasedBytes; u64 LastReleaseAtNs; }; + struct BlocksInfo { + SinglyLinkedList<BatchGroupT> BlockList = {}; + uptr PoppedBlocks = 0; + uptr PushedBlocks = 0; + }; + struct alignas(SCUDO_CACHE_LINE_SIZE) SizeClassInfo { HybridMutex Mutex; - SinglyLinkedList<BatchGroup> FreeList GUARDED_BY(Mutex); + BlocksInfo FreeListInfo GUARDED_BY(Mutex); uptr CurrentRegion GUARDED_BY(Mutex); uptr CurrentRegionAllocated GUARDED_BY(Mutex); - SizeClassStats Stats GUARDED_BY(Mutex); u32 RandState; uptr AllocatedUser GUARDED_BY(Mutex); // Lowest & highest region index allocated for this size class, to avoid @@ -336,7 +411,7 @@ private: static_assert(sizeof(SizeClassInfo) % SCUDO_CACHE_LINE_SIZE == 0, ""); uptr computeRegionId(uptr Mem) { - const uptr Id = Mem >> Config::PrimaryRegionSizeLog; + const uptr Id = Mem >> Config::getRegionSizeLog(); CHECK_LT(Id, NumRegions); return Id; } @@ -363,6 +438,11 @@ private: const uptr End = Region + MapSize; if (End != MapEnd) unmap(reinterpret_cast<void *>(End), MapEnd - End); + + DCHECK_EQ(Region % RegionSize, 0U); + static_assert(Config::getRegionSizeLog() == GroupSizeLog, + "Memory group should be the same size as Region"); + return Region; } @@ -394,15 +474,125 @@ private: return &SizeClassInfoArray[ClassId]; } + void pushBatchClassBlocks(SizeClassInfo *Sci, CompactPtrT *Array, u32 Size) + REQUIRES(Sci->Mutex) { + DCHECK_EQ(Sci, getSizeClassInfo(SizeClassMap::BatchClassId)); + + // Free blocks are recorded by TransferBatch in freelist for all + // size-classes. In addition, TransferBatch is allocated from BatchClassId. + // In order not to use additional block to record the free blocks in + // BatchClassId, they are self-contained. I.e., A TransferBatch records the + // block address of itself. See the figure below: + // + // TransferBatch at 0xABCD + // +----------------------------+ + // | Free blocks' addr | + // | +------+------+------+ | + // | |0xABCD|... |... | | + // | +------+------+------+ | + // +----------------------------+ + // + // When we allocate all the free blocks in the TransferBatch, the block used + // by TransferBatch is also free for use. We don't need to recycle the + // TransferBatch. Note that the correctness is maintained by the invariant, + // + // Each popBlocks() request returns the entire TransferBatch. Returning + // part of the blocks in a TransferBatch is invalid. + // + // This ensures that TransferBatch won't leak the address itself while it's + // still holding other valid data. + // + // Besides, BatchGroup is also allocated from BatchClassId and has its + // address recorded in the TransferBatch too. To maintain the correctness, + // + // The address of BatchGroup is always recorded in the last TransferBatch + // in the freelist (also imply that the freelist should only be + // updated with push_front). Once the last TransferBatch is popped, + // the block used by BatchGroup is also free for use. + // + // With this approach, the blocks used by BatchGroup and TransferBatch are + // reusable and don't need additional space for them. + + Sci->FreeListInfo.PushedBlocks += Size; + BatchGroupT *BG = Sci->FreeListInfo.BlockList.front(); + + if (BG == nullptr) { + // Construct `BatchGroup` on the last element. 
+ BG = reinterpret_cast<BatchGroupT *>( + decompactPtr(SizeClassMap::BatchClassId, Array[Size - 1])); + --Size; + BG->Batches.clear(); + // BatchClass hasn't enabled memory group. Use `0` to indicate there's no + // memory group here. + BG->CompactPtrGroupBase = 0; + // `BG` is also the block of BatchClassId. Note that this is different + // from `CreateGroup` in `pushBlocksImpl` + BG->PushedBlocks = 1; + BG->BytesInBGAtLastCheckpoint = 0; + BG->MaxCachedPerBatch = + CacheT::getMaxCached(getSizeByClassId(SizeClassMap::BatchClassId)); + + Sci->FreeListInfo.BlockList.push_front(BG); + } + + if (UNLIKELY(Size == 0)) + return; + + // This happens under 2 cases. + // 1. just allocated a new `BatchGroup`. + // 2. Only 1 block is pushed when the freelist is empty. + if (BG->Batches.empty()) { + // Construct the `TransferBatch` on the last element. + TransferBatchT *TB = reinterpret_cast<TransferBatchT *>( + decompactPtr(SizeClassMap::BatchClassId, Array[Size - 1])); + TB->clear(); + // As mentioned above, addresses of `TransferBatch` and `BatchGroup` are + // recorded in the TransferBatch. + TB->add(Array[Size - 1]); + TB->add( + compactPtr(SizeClassMap::BatchClassId, reinterpret_cast<uptr>(BG))); + --Size; + DCHECK_EQ(BG->PushedBlocks, 1U); + // `TB` is also the block of BatchClassId. + BG->PushedBlocks += 1; + BG->Batches.push_front(TB); + } + + TransferBatchT *CurBatch = BG->Batches.front(); + DCHECK_NE(CurBatch, nullptr); + + for (u32 I = 0; I < Size;) { + u16 UnusedSlots = + static_cast<u16>(BG->MaxCachedPerBatch - CurBatch->getCount()); + if (UnusedSlots == 0) { + CurBatch = reinterpret_cast<TransferBatchT *>( + decompactPtr(SizeClassMap::BatchClassId, Array[I])); + CurBatch->clear(); + // Self-contained + CurBatch->add(Array[I]); + ++I; + // TODO(chiahungduan): Avoid the use of push_back() in `Batches` of + // BatchClassId. + BG->Batches.push_front(CurBatch); + UnusedSlots = static_cast<u16>(BG->MaxCachedPerBatch - 1); + } + // `UnusedSlots` is u16 so the result will be also fit in u16. + const u16 AppendSize = static_cast<u16>(Min<u32>(UnusedSlots, Size - I)); + CurBatch->appendFromArray(&Array[I], AppendSize); + I += AppendSize; + } + + BG->PushedBlocks += Size; + } // Push the blocks to their batch group. The layout will be like, // - // FreeList - > BG -> BG -> BG - // | | | - // v v v - // TB TB TB - // | - // v - // TB + // FreeListInfo.BlockList - > BG -> BG -> BG + // | | | + // v v v + // TB TB TB + // | + // v + // TB // // Each BlockGroup(BG) will associate with unique group id and the free blocks // are managed by a list of TransferBatch(TB). To reduce the time of inserting @@ -415,81 +605,29 @@ private: void pushBlocksImpl(CacheT *C, uptr ClassId, SizeClassInfo *Sci, CompactPtrT *Array, u32 Size, bool SameGroup = false) REQUIRES(Sci->Mutex) { + DCHECK_NE(ClassId, SizeClassMap::BatchClassId); DCHECK_GT(Size, 0U); - auto CreateGroup = [&](uptr GroupId) { - BatchGroup *BG = nullptr; - TransferBatch *TB = nullptr; - if (ClassId == SizeClassMap::BatchClassId) { - DCHECK_GE(Size, 2U); - - // Free blocks are recorded by TransferBatch in freelist, blocks of - // BatchClassId are included. In order not to use additional memory to - // record blocks of BatchClassId, they are self-contained. I.e., A - // TransferBatch may record the block address of itself. See the figure - // below: - // - // TransferBatch at 0xABCD - // +----------------------------+ - // | Free blocks' addr | - // | +------+------+------+ | - // | |0xABCD|... |... 
| | - // | +------+------+------+ | - // +----------------------------+ - // - // The safeness of manipulating TransferBatch is kept by the invariant, - // - // The unit of each pop-block request is a TransferBatch. Return - // part of the blocks in a TransferBatch is not allowed. - // - // This ensures that TransferBatch won't leak the address itself while - // it's still holding other valid data. - // - // Besides, BatchGroup uses the same size-class as TransferBatch does - // and its address is recorded in the TransferBatch too. To maintain the - // safeness, the invariant to keep is, - // - // The address of itself is always recorded in the last TransferBatch - // of the freelist (also imply that the freelist should only be - // updated with push_front). Once the last TransferBatch is popped, - // the BatchGroup becomes invalid. - // - // As a result, the blocks used by BatchGroup and TransferBatch are - // reusable and don't need additional space for them. - BG = reinterpret_cast<BatchGroup *>( - decompactPtr(ClassId, Array[Size - 1])); - BG->Batches.clear(); - - TB = reinterpret_cast<TransferBatch *>( - decompactPtr(ClassId, Array[Size - 2])); - TB->clear(); - - // Append the blocks used by BatchGroup and TransferBatch immediately so - // that we ensure that they are in the last TransBatch. - TB->appendFromArray(Array + Size - 2, 2); - Size -= 2; - } else { - BG = C->createGroup(); - BG->Batches.clear(); + auto CreateGroup = [&](uptr CompactPtrGroupBase) { + BatchGroupT *BG = + reinterpret_cast<BatchGroupT *>(C->getBatchClassBlock()); + BG->Batches.clear(); + TransferBatchT *TB = + reinterpret_cast<TransferBatchT *>(C->getBatchClassBlock()); + TB->clear(); - TB = C->createBatch(ClassId, nullptr); - TB->clear(); - } - - BG->GroupId = GroupId; - // TODO(chiahungduan): Avoid the use of push_back() in `Batches`. + BG->CompactPtrGroupBase = CompactPtrGroupBase; BG->Batches.push_front(TB); BG->PushedBlocks = 0; - BG->PushedBlocksAtLastCheckpoint = 0; - BG->MaxCachedPerBatch = - TransferBatch::getMaxCached(getSizeByClassId(ClassId)); + BG->BytesInBGAtLastCheckpoint = 0; + BG->MaxCachedPerBatch = TransferBatchT::MaxNumCached; return BG; }; - auto InsertBlocks = [&](BatchGroup *BG, CompactPtrT *Array, u32 Size) { - SinglyLinkedList<TransferBatch> &Batches = BG->Batches; - TransferBatch *CurBatch = Batches.front(); + auto InsertBlocks = [&](BatchGroupT *BG, CompactPtrT *Array, u32 Size) { + SinglyLinkedList<TransferBatchT> &Batches = BG->Batches; + TransferBatchT *CurBatch = Batches.front(); DCHECK_NE(CurBatch, nullptr); for (u32 I = 0; I < Size;) { @@ -497,9 +635,8 @@ private: u16 UnusedSlots = static_cast<u16>(BG->MaxCachedPerBatch - CurBatch->getCount()); if (UnusedSlots == 0) { - CurBatch = C->createBatch( - ClassId, - reinterpret_cast<void *>(decompactPtr(ClassId, Array[I]))); + CurBatch = + reinterpret_cast<TransferBatchT *>(C->getBatchClassBlock()); CurBatch->clear(); Batches.push_front(CurBatch); UnusedSlots = BG->MaxCachedPerBatch; @@ -513,40 +650,33 @@ private: BG->PushedBlocks += Size; }; - BatchGroup *Cur = Sci->FreeList.front(); - - if (ClassId == SizeClassMap::BatchClassId) { - if (Cur == nullptr) { - // Don't need to classify BatchClassId. - Cur = CreateGroup(/*GroupId=*/0); - Sci->FreeList.push_front(Cur); - } - InsertBlocks(Cur, Array, Size); - return; - } + Sci->FreeListInfo.PushedBlocks += Size; + BatchGroupT *Cur = Sci->FreeListInfo.BlockList.front(); // In the following, `Cur` always points to the BatchGroup for blocks that // will be pushed next. 
`Prev` is the element right before `Cur`. - BatchGroup *Prev = nullptr; + BatchGroupT *Prev = nullptr; - while (Cur != nullptr && compactPtrGroup(Array[0]) > Cur->GroupId) { + while (Cur != nullptr && + compactPtrGroupBase(Array[0]) > Cur->CompactPtrGroupBase) { Prev = Cur; Cur = Cur->Next; } - if (Cur == nullptr || compactPtrGroup(Array[0]) != Cur->GroupId) { - Cur = CreateGroup(compactPtrGroup(Array[0])); + if (Cur == nullptr || + compactPtrGroupBase(Array[0]) != Cur->CompactPtrGroupBase) { + Cur = CreateGroup(compactPtrGroupBase(Array[0])); if (Prev == nullptr) - Sci->FreeList.push_front(Cur); + Sci->FreeListInfo.BlockList.push_front(Cur); else - Sci->FreeList.insert(Prev, Cur); + Sci->FreeListInfo.BlockList.insert(Prev, Cur); } // All the blocks are from the same group, just push without checking group // id. if (SameGroup) { for (u32 I = 0; I < Size; ++I) - DCHECK_EQ(compactPtrGroup(Array[I]), Cur->GroupId); + DCHECK_EQ(compactPtrGroupBase(Array[I]), Cur->CompactPtrGroupBase); InsertBlocks(Cur, Array, Size); return; @@ -556,19 +686,21 @@ private: // push them to their group together. u32 Count = 1; for (u32 I = 1; I < Size; ++I) { - if (compactPtrGroup(Array[I - 1]) != compactPtrGroup(Array[I])) { - DCHECK_EQ(compactPtrGroup(Array[I - 1]), Cur->GroupId); + if (compactPtrGroupBase(Array[I - 1]) != compactPtrGroupBase(Array[I])) { + DCHECK_EQ(compactPtrGroupBase(Array[I - 1]), Cur->CompactPtrGroupBase); InsertBlocks(Cur, Array + I - Count, Count); - while (Cur != nullptr && compactPtrGroup(Array[I]) > Cur->GroupId) { + while (Cur != nullptr && + compactPtrGroupBase(Array[I]) > Cur->CompactPtrGroupBase) { Prev = Cur; Cur = Cur->Next; } - if (Cur == nullptr || compactPtrGroup(Array[I]) != Cur->GroupId) { - Cur = CreateGroup(compactPtrGroup(Array[I])); + if (Cur == nullptr || + compactPtrGroupBase(Array[I]) != Cur->CompactPtrGroupBase) { + Cur = CreateGroup(compactPtrGroupBase(Array[I])); DCHECK_NE(Prev, nullptr); - Sci->FreeList.insert(Prev, Cur); + Sci->FreeListInfo.BlockList.insert(Prev, Cur); } Count = 1; @@ -580,37 +712,74 @@ private: InsertBlocks(Cur, Array + Size - Count, Count); } - // Pop one TransferBatch from a BatchGroup. The BatchGroup with the smallest - // group id will be considered first. - // - // The region mutex needs to be held while calling this method. - TransferBatch *popBatchImpl(CacheT *C, uptr ClassId, SizeClassInfo *Sci) + u16 popBlocksImpl(CacheT *C, uptr ClassId, SizeClassInfo *Sci, + CompactPtrT *ToArray, const u16 MaxBlockCount) REQUIRES(Sci->Mutex) { - if (Sci->FreeList.empty()) - return nullptr; + if (Sci->FreeListInfo.BlockList.empty()) + return 0U; - SinglyLinkedList<TransferBatch> &Batches = Sci->FreeList.front()->Batches; - DCHECK(!Batches.empty()); + SinglyLinkedList<TransferBatchT> &Batches = + Sci->FreeListInfo.BlockList.front()->Batches; - TransferBatch *B = Batches.front(); - Batches.pop_front(); + if (Batches.empty()) { + DCHECK_EQ(ClassId, SizeClassMap::BatchClassId); + BatchGroupT *BG = Sci->FreeListInfo.BlockList.front(); + Sci->FreeListInfo.BlockList.pop_front(); + + // Block used by `BatchGroup` is from BatchClassId. Turn the block into + // `TransferBatch` with single block. 
+ TransferBatchT *TB = reinterpret_cast<TransferBatchT *>(BG); + ToArray[0] = + compactPtr(SizeClassMap::BatchClassId, reinterpret_cast<uptr>(TB)); + Sci->FreeListInfo.PoppedBlocks += 1; + return 1U; + } + + // So far, instead of always filling the blocks to `MaxBlockCount`, we only + // examine single `TransferBatch` to minimize the time spent on the primary + // allocator. Besides, the sizes of `TransferBatch` and + // `CacheT::getMaxCached()` may also impact the time spent on accessing the + // primary allocator. + // TODO(chiahungduan): Evaluate if we want to always prepare `MaxBlockCount` + // blocks and/or adjust the size of `TransferBatch` according to + // `CacheT::getMaxCached()`. + TransferBatchT *B = Batches.front(); DCHECK_NE(B, nullptr); DCHECK_GT(B->getCount(), 0U); - if (Batches.empty()) { - BatchGroup *BG = Sci->FreeList.front(); - Sci->FreeList.pop_front(); - - // We don't keep BatchGroup with zero blocks to avoid empty-checking while - // allocating. Note that block used by constructing BatchGroup is recorded - // as free blocks in the last element of BatchGroup::Batches. Which means, - // once we pop the last TransferBatch, the block is implicitly - // deallocated. + // BachClassId should always take all blocks in the TransferBatch. Read the + // comment in `pushBatchClassBlocks()` for more details. + const u16 PopCount = ClassId == SizeClassMap::BatchClassId + ? B->getCount() + : Min(MaxBlockCount, B->getCount()); + B->moveNToArray(ToArray, PopCount); + + // TODO(chiahungduan): The deallocation of unused BatchClassId blocks can be + // done without holding `Mutex`. + if (B->empty()) { + Batches.pop_front(); + // `TransferBatch` of BatchClassId is self-contained, no need to + // deallocate. Read the comment in `pushBatchClassBlocks()` for more + // details. if (ClassId != SizeClassMap::BatchClassId) - C->deallocate(SizeClassMap::BatchClassId, BG); + C->deallocate(SizeClassMap::BatchClassId, B); + + if (Batches.empty()) { + BatchGroupT *BG = Sci->FreeListInfo.BlockList.front(); + Sci->FreeListInfo.BlockList.pop_front(); + + // We don't keep BatchGroup with zero blocks to avoid empty-checking + // while allocating. Note that block used for constructing BatchGroup is + // recorded as free blocks in the last element of BatchGroup::Batches. + // Which means, once we pop the last TransferBatch, the block is + // implicitly deallocated. + if (ClassId != SizeClassMap::BatchClassId) + C->deallocate(SizeClassMap::BatchClassId, BG); + } } - return B; + Sci->FreeListInfo.PoppedBlocks += PopCount; + return PopCount; } NOINLINE bool populateFreeList(CacheT *C, uptr ClassId, SizeClassInfo *Sci) @@ -636,7 +805,7 @@ private: } const uptr Size = getSizeByClassId(ClassId); - const u16 MaxCount = TransferBatch::getMaxCached(Size); + const u16 MaxCount = CacheT::getMaxCached(Size); DCHECK_GT(MaxCount, 0U); // The maximum number of blocks we should carve in the region is dictated // by the maximum number of batches we want to fill, and the amount of @@ -649,7 +818,7 @@ private: DCHECK_GT(NumberOfBlocks, 0U); constexpr u32 ShuffleArraySize = - MaxNumBatches * TransferBatch::MaxNumCached; + MaxNumBatches * TransferBatchT::MaxNumCached; // Fill the transfer batches and put them in the size-class freelist. We // need to randomize the blocks for security purposes, so we first fill a // local array that we then shuffle before populating the batches. 
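The randomization mentioned in the comment above is done by scudo's shuffle() helper (defined in standalone/common.h, outside this diff). As an illustration of the idea only, a generic Fisher-Yates pass over the local block array looks like this; the RNG step here is a stand-in, not scudo's:

// Illustrative Fisher-Yates shuffle of a block array (sketch only; the real
// helper is scudo::shuffle() and uses scudo's own RandState logic).
#include <cstdint>
#include <utility>

template <typename T>
void shuffleSketch(T *Array, uint32_t N, uint32_t *RandState) {
  if (N <= 1)
    return;
  for (uint32_t I = N - 1; I > 0; I--) {
    // Stand-in LCG step; only meant to show where the randomness comes in.
    *RandState = *RandState * 1664525u + 1013904223u;
    const uint32_t J = *RandState % (I + 1);
    std::swap(Array[I], Array[J]);
  }
}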
@@ -662,14 +831,14 @@ private: if (ClassId != SizeClassMap::BatchClassId) { u32 N = 1; - uptr CurGroup = compactPtrGroup(ShuffleArray[0]); + uptr CurGroup = compactPtrGroupBase(ShuffleArray[0]); for (u32 I = 1; I < NumberOfBlocks; I++) { - if (UNLIKELY(compactPtrGroup(ShuffleArray[I]) != CurGroup)) { + if (UNLIKELY(compactPtrGroupBase(ShuffleArray[I]) != CurGroup)) { shuffle(ShuffleArray + I - N, N, &Sci->RandState); pushBlocksImpl(C, ClassId, Sci, ShuffleArray + I - N, N, /*SameGroup=*/true); N = 1; - CurGroup = compactPtrGroup(ShuffleArray[I]); + CurGroup = compactPtrGroupBase(ShuffleArray[I]); } else { ++N; } @@ -679,10 +848,15 @@ private: pushBlocksImpl(C, ClassId, Sci, &ShuffleArray[NumberOfBlocks - N], N, /*SameGroup=*/true); } else { - pushBlocksImpl(C, ClassId, Sci, ShuffleArray, NumberOfBlocks, - /*SameGroup=*/true); + pushBatchClassBlocks(Sci, ShuffleArray, NumberOfBlocks); } + // Note that `PushedBlocks` and `PoppedBlocks` are supposed to only record + // the requests from `PushBlocks` and `PopBatch` which are external + // interfaces. `populateFreeList` is the internal interface so we should set + // the values back to avoid incorrectly setting the stats. + Sci->FreeListInfo.PushedBlocks -= NumberOfBlocks; + const uptr AllocatedUser = Size * NumberOfBlocks; C->getStats().add(StatFree, AllocatedUser); DCHECK_LE(Sci->CurrentRegionAllocated + AllocatedUser, RegionSize); @@ -700,54 +874,102 @@ private: return true; } - void getStats(ScopedString *Str, uptr ClassId, SizeClassInfo *Sci, uptr Rss) + void getStats(ScopedString *Str, uptr ClassId, SizeClassInfo *Sci) REQUIRES(Sci->Mutex) { if (Sci->AllocatedUser == 0) return; - const uptr InUse = Sci->Stats.PoppedBlocks - Sci->Stats.PushedBlocks; - const uptr AvailableChunks = Sci->AllocatedUser / getSizeByClassId(ClassId); + const uptr BlockSize = getSizeByClassId(ClassId); + const uptr InUse = + Sci->FreeListInfo.PoppedBlocks - Sci->FreeListInfo.PushedBlocks; + const uptr BytesInFreeList = Sci->AllocatedUser - InUse * BlockSize; + uptr PushedBytesDelta = 0; + if (BytesInFreeList >= Sci->ReleaseInfo.BytesInFreeListAtLastCheckpoint) { + PushedBytesDelta = + BytesInFreeList - Sci->ReleaseInfo.BytesInFreeListAtLastCheckpoint; + } + const uptr AvailableChunks = Sci->AllocatedUser / BlockSize; Str->append(" %02zu (%6zu): mapped: %6zuK popped: %7zu pushed: %7zu " - "inuse: %6zu avail: %6zu rss: %6zuK releases: %6zu\n", + "inuse: %6zu avail: %6zu releases: %6zu last released: %6zuK " + "latest pushed bytes: %6zuK\n", ClassId, getSizeByClassId(ClassId), Sci->AllocatedUser >> 10, - Sci->Stats.PoppedBlocks, Sci->Stats.PushedBlocks, InUse, - AvailableChunks, Rss >> 10, Sci->ReleaseInfo.RangesReleased); + Sci->FreeListInfo.PoppedBlocks, Sci->FreeListInfo.PushedBlocks, + InUse, AvailableChunks, Sci->ReleaseInfo.RangesReleased, + Sci->ReleaseInfo.LastReleasedBytes >> 10, + PushedBytesDelta >> 10); } - NOINLINE uptr releaseToOSMaybe(SizeClassInfo *Sci, uptr ClassId, - bool Force = false) REQUIRES(Sci->Mutex) { + void getSizeClassFragmentationInfo(SizeClassInfo *Sci, uptr ClassId, + ScopedString *Str) REQUIRES(Sci->Mutex) { const uptr BlockSize = getSizeByClassId(ClassId); + const uptr First = Sci->MinRegionIndex; + const uptr Last = Sci->MaxRegionIndex; + const uptr Base = First * RegionSize; + const uptr NumberOfRegions = Last - First + 1U; + auto SkipRegion = [this, First, ClassId](uptr RegionIndex) { + ScopedLock L(ByteMapMutex); + return (PossibleRegions[First + RegionIndex] - 1U) != ClassId; + }; + + FragmentationRecorder Recorder; + if 
(!Sci->FreeListInfo.BlockList.empty()) { + PageReleaseContext Context = + markFreeBlocks(Sci, ClassId, BlockSize, Base, NumberOfRegions, + ReleaseToOS::ForceAll); + releaseFreeMemoryToOS(Context, Recorder, SkipRegion); + } + const uptr PageSize = getPageSizeCached(); + const uptr TotalBlocks = Sci->AllocatedUser / BlockSize; + const uptr InUseBlocks = + Sci->FreeListInfo.PoppedBlocks - Sci->FreeListInfo.PushedBlocks; + uptr AllocatedPagesCount = 0; + if (TotalBlocks != 0U) { + for (uptr I = 0; I < NumberOfRegions; ++I) { + if (SkipRegion(I)) + continue; + AllocatedPagesCount += RegionSize / PageSize; + } + + DCHECK_NE(AllocatedPagesCount, 0U); + } + + DCHECK_GE(AllocatedPagesCount, Recorder.getReleasedPagesCount()); + const uptr InUsePages = + AllocatedPagesCount - Recorder.getReleasedPagesCount(); + const uptr InUseBytes = InUsePages * PageSize; + + uptr Integral; + uptr Fractional; + computePercentage(BlockSize * InUseBlocks, InUsePages * PageSize, &Integral, + &Fractional); + Str->append(" %02zu (%6zu): inuse/total blocks: %6zu/%6zu inuse/total " + "pages: %6zu/%6zu inuse bytes: %6zuK util: %3zu.%02zu%%\n", + ClassId, BlockSize, InUseBlocks, TotalBlocks, InUsePages, + AllocatedPagesCount, InUseBytes >> 10, Integral, Fractional); + } + + NOINLINE uptr releaseToOSMaybe(SizeClassInfo *Sci, uptr ClassId, + ReleaseToOS ReleaseType = ReleaseToOS::Normal) + REQUIRES(Sci->Mutex) { + const uptr BlockSize = getSizeByClassId(ClassId); - DCHECK_GE(Sci->Stats.PoppedBlocks, Sci->Stats.PushedBlocks); + DCHECK_GE(Sci->FreeListInfo.PoppedBlocks, Sci->FreeListInfo.PushedBlocks); const uptr BytesInFreeList = Sci->AllocatedUser - - (Sci->Stats.PoppedBlocks - Sci->Stats.PushedBlocks) * BlockSize; - if (BytesInFreeList < PageSize) - return 0; // No chance to release anything. - const uptr BytesPushed = - (Sci->Stats.PushedBlocks - Sci->ReleaseInfo.PushedBlocksAtLastRelease) * - BlockSize; - if (BytesPushed < PageSize) - return 0; // Nothing new to release. - - const bool CheckDensity = BlockSize < PageSize / 16U; - // Releasing smaller blocks is expensive, so we want to make sure that a - // significant amount of bytes are free, and that there has been a good - // amount of batches pushed to the freelist before attempting to release. - if (CheckDensity) { - if (!Force && BytesPushed < Sci->AllocatedUser / 16U) - return 0; - } + (Sci->FreeListInfo.PoppedBlocks - Sci->FreeListInfo.PushedBlocks) * + BlockSize; - if (!Force) { - const s32 IntervalMs = atomic_load_relaxed(&ReleaseToOsIntervalMs); - if (IntervalMs < 0) - return 0; - if (Sci->ReleaseInfo.LastReleaseAtNs + - static_cast<u64>(IntervalMs) * 1000000 > - getMonotonicTime()) { - return 0; // Memory was returned recently. - } + if (UNLIKELY(BytesInFreeList == 0)) + return 0; + + // ====================================================================== // + // 1. Check if we have enough free blocks and if it's worth doing a page + // release. 
+ // ====================================================================== // + if (ReleaseType != ReleaseToOS::ForceAll && + !hasChanceToReleasePages(Sci, BlockSize, BytesInFreeList, + ReleaseType)) { + return 0; } const uptr First = Sci->MinRegionIndex; @@ -757,26 +979,122 @@ private: uptr TotalReleasedBytes = 0; const uptr Base = First * RegionSize; const uptr NumberOfRegions = Last - First + 1U; - const uptr GroupSize = (1U << GroupSizeLog); - const uptr CurRegionGroupId = - compactPtrGroup(compactPtr(ClassId, Sci->CurrentRegion)); + // ==================================================================== // + // 2. Mark the free blocks and we can tell which pages are in-use by + // querying `PageReleaseContext`. + // ==================================================================== // + PageReleaseContext Context = markFreeBlocks(Sci, ClassId, BlockSize, Base, + NumberOfRegions, ReleaseType); + if (!Context.hasBlockMarked()) + return 0; + + // ==================================================================== // + // 3. Release the unused physical pages back to the OS. + // ==================================================================== // ReleaseRecorder Recorder(Base); - PageReleaseContext Context(BlockSize, RegionSize, NumberOfRegions, + auto SkipRegion = [this, First, ClassId](uptr RegionIndex) { + ScopedLock L(ByteMapMutex); + return (PossibleRegions[First + RegionIndex] - 1U) != ClassId; + }; + releaseFreeMemoryToOS(Context, Recorder, SkipRegion); + + if (Recorder.getReleasedRangesCount() > 0) { + Sci->ReleaseInfo.BytesInFreeListAtLastCheckpoint = BytesInFreeList; + Sci->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount(); + Sci->ReleaseInfo.LastReleasedBytes = Recorder.getReleasedBytes(); + TotalReleasedBytes += Sci->ReleaseInfo.LastReleasedBytes; + } + Sci->ReleaseInfo.LastReleaseAtNs = getMonotonicTimeFast(); + + return TotalReleasedBytes; + } + + bool hasChanceToReleasePages(SizeClassInfo *Sci, uptr BlockSize, + uptr BytesInFreeList, ReleaseToOS ReleaseType) + REQUIRES(Sci->Mutex) { + DCHECK_GE(Sci->FreeListInfo.PoppedBlocks, Sci->FreeListInfo.PushedBlocks); + const uptr PageSize = getPageSizeCached(); + + if (BytesInFreeList <= Sci->ReleaseInfo.BytesInFreeListAtLastCheckpoint) + Sci->ReleaseInfo.BytesInFreeListAtLastCheckpoint = BytesInFreeList; + + // Always update `BytesInFreeListAtLastCheckpoint` with the smallest value + // so that we won't underestimate the releasable pages. For example, the + // following is the region usage, + // + // BytesInFreeListAtLastCheckpoint AllocatedUser + // v v + // |---------------------------------------> + // ^ ^ + // BytesInFreeList ReleaseThreshold + // + // In general, if we have collected enough bytes and the amount of free + // bytes meets the ReleaseThreshold, we will try to do page release. If we + // don't update `BytesInFreeListAtLastCheckpoint` when the current + // `BytesInFreeList` is smaller, we may take longer time to wait for enough + // freed blocks because we miss the bytes between + // (BytesInFreeListAtLastCheckpoint - BytesInFreeList). + const uptr PushedBytesDelta = + BytesInFreeList - Sci->ReleaseInfo.BytesInFreeListAtLastCheckpoint; + if (PushedBytesDelta < PageSize) + return false; + + // Releasing smaller blocks is expensive, so we want to make sure that a + // significant amount of bytes are free, and that there has been a good + // amount of batches pushed to the freelist before attempting to release. 
+ if (isSmallBlock(BlockSize) && ReleaseType == ReleaseToOS::Normal) + if (PushedBytesDelta < Sci->AllocatedUser / 16U) + return false; + + if (ReleaseType == ReleaseToOS::Normal) { + const s32 IntervalMs = atomic_load_relaxed(&ReleaseToOsIntervalMs); + if (IntervalMs < 0) + return false; + + // The constant 8 here is selected from profiling some apps and the number + // of unreleased pages in the large size classes is around 16 pages or + // more. Choose half of it as a heuristic and which also avoids page + // release every time for every pushBlocks() attempt by large blocks. + const bool ByPassReleaseInterval = + isLargeBlock(BlockSize) && PushedBytesDelta > 8 * PageSize; + if (!ByPassReleaseInterval) { + if (Sci->ReleaseInfo.LastReleaseAtNs + + static_cast<u64>(IntervalMs) * 1000000 > + getMonotonicTimeFast()) { + // Memory was returned recently. + return false; + } + } + } // if (ReleaseType == ReleaseToOS::Normal) + + return true; + } + + PageReleaseContext markFreeBlocks(SizeClassInfo *Sci, const uptr ClassId, + const uptr BlockSize, const uptr Base, + const uptr NumberOfRegions, + ReleaseToOS ReleaseType) + REQUIRES(Sci->Mutex) { + const uptr PageSize = getPageSizeCached(); + const uptr GroupSize = (1UL << GroupSizeLog); + const uptr CurGroupBase = + compactPtrGroupBase(compactPtr(ClassId, Sci->CurrentRegion)); + + PageReleaseContext Context(BlockSize, NumberOfRegions, /*ReleaseSize=*/RegionSize); auto DecompactPtr = [](CompactPtrT CompactPtr) { return reinterpret_cast<uptr>(CompactPtr); }; - for (BatchGroup &BG : Sci->FreeList) { - const uptr PushedBytesDelta = - BG.PushedBlocks - BG.PushedBlocksAtLastCheckpoint; - if (PushedBytesDelta * BlockSize < PageSize) - continue; - - uptr AllocatedGroupSize = BG.GroupId == CurRegionGroupId + for (BatchGroupT &BG : Sci->FreeListInfo.BlockList) { + const uptr GroupBase = decompactGroupBase(BG.CompactPtrGroupBase); + // The `GroupSize` may not be divided by `BlockSize`, which means there is + // an unused space at the end of Region. Exclude that space to avoid + // unused page map entry. + uptr AllocatedGroupSize = GroupBase == CurGroupBase ? Sci->CurrentRegionAllocated - : GroupSize; + : roundDownSlow(GroupSize, BlockSize); if (AllocatedGroupSize == 0) continue; @@ -785,65 +1103,59 @@ private: const uptr NumBlocks = (BG.Batches.size() - 1) * BG.MaxCachedPerBatch + BG.Batches.front()->getCount(); const uptr BytesInBG = NumBlocks * BlockSize; - // Given the randomness property, we try to release the pages only if the - // bytes used by free blocks exceed certain proportion of allocated - // spaces. - if (CheckDensity && (BytesInBG * 100U) / AllocatedGroupSize < - (100U - 1U - BlockSize / 16U)) { - continue; + + if (ReleaseType != ReleaseToOS::ForceAll) { + if (BytesInBG <= BG.BytesInBGAtLastCheckpoint) { + BG.BytesInBGAtLastCheckpoint = BytesInBG; + continue; + } + + const uptr PushedBytesDelta = BytesInBG - BG.BytesInBGAtLastCheckpoint; + if (PushedBytesDelta < PageSize) + continue; + + // Given the randomness property, we try to release the pages only if + // the bytes used by free blocks exceed certain proportion of allocated + // spaces. + if (isSmallBlock(BlockSize) && (BytesInBG * 100U) / AllocatedGroupSize < + (100U - 1U - BlockSize / 16U)) { + continue; + } } - BG.PushedBlocksAtLastCheckpoint = BG.PushedBlocks; + // TODO: Consider updating this after page release if `ReleaseRecorder` + // can tell the released bytes in each group. 
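To make the small-block density check above concrete: a group is only considered for release once its free bytes reach 100 - 1 - BlockSize / 16 percent of the allocated group size. A couple of worked values, as an editorial sketch:

// Worked examples of the release-density threshold used above (sketch only).
constexpr unsigned releaseThresholdPercent(unsigned BlockSize) {
  return 100u - 1u - BlockSize / 16u;
}
static_assert(releaseThresholdPercent(32) == 97,
              "32-byte blocks: ~97% of the group's bytes must be free");
static_assert(releaseThresholdPercent(128) == 91,
              "128-byte blocks: ~91% of the group's bytes must be free");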
+ BG.BytesInBGAtLastCheckpoint = BytesInBG; const uptr MaxContainedBlocks = AllocatedGroupSize / BlockSize; - // The first condition to do range marking is that all the blocks in the - // range need to be from the same region. In SizeClassAllocator32, this is - // true when GroupSize and RegionSize are the same. Another tricky case, - // while range marking, the last block in a region needs the logic to mark - // the last page. However, in SizeClassAllocator32, the RegionSize - // recorded in PageReleaseContext may be different from - // `CurrentRegionAllocated` of the current region. This exception excludes - // the chance of doing range marking for the current region. - const bool CanDoRangeMark = - GroupSize == RegionSize && BG.GroupId != CurRegionGroupId; - - if (CanDoRangeMark && NumBlocks == MaxContainedBlocks) { + const uptr RegionIndex = (GroupBase - Base) / RegionSize; + + if (NumBlocks == MaxContainedBlocks) { for (const auto &It : BG.Batches) for (u16 I = 0; I < It.getCount(); ++I) - DCHECK_EQ(compactPtrGroup(It.get(I)), BG.GroupId); + DCHECK_EQ(compactPtrGroupBase(It.get(I)), BG.CompactPtrGroupBase); - const uptr From = batchGroupBase(BG.GroupId); - const uptr To = batchGroupBase(BG.GroupId) + AllocatedGroupSize; - Context.markRangeAsAllCounted(From, To, Base); + const uptr To = GroupBase + AllocatedGroupSize; + Context.markRangeAsAllCounted(GroupBase, To, GroupBase, RegionIndex, + AllocatedGroupSize); } else { - if (CanDoRangeMark) - DCHECK_LT(NumBlocks, MaxContainedBlocks); + DCHECK_LT(NumBlocks, MaxContainedBlocks); // Note that we don't always visit blocks in each BatchGroup so that we // may miss the chance of releasing certain pages that cross // BatchGroups. - Context.markFreeBlocks(BG.Batches, DecompactPtr, Base); + Context.markFreeBlocksInRegion(BG.Batches, DecompactPtr, GroupBase, + RegionIndex, AllocatedGroupSize, + /*MayContainLastBlockInRegion=*/true); } - } - - if (!Context.hasBlockMarked()) - return 0; - auto SkipRegion = [this, First, ClassId](uptr RegionIndex) { - ScopedLock L(ByteMapMutex); - return (PossibleRegions[First + RegionIndex] - 1U) != ClassId; - }; - releaseFreeMemoryToOS(Context, Recorder, SkipRegion); - - if (Recorder.getReleasedRangesCount() > 0) { - Sci->ReleaseInfo.PushedBlocksAtLastRelease = Sci->Stats.PushedBlocks; - Sci->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount(); - Sci->ReleaseInfo.LastReleasedBytes = Recorder.getReleasedBytes(); - TotalReleasedBytes += Sci->ReleaseInfo.LastReleasedBytes; + // We may not be able to do the page release In a rare case that we may + // fail on PageMap allocation. 
+ if (UNLIKELY(!Context.hasBlockMarked())) + break; } - Sci->ReleaseInfo.LastReleaseAtNs = getMonotonicTime(); - return TotalReleasedBytes; + return Context; } SizeClassInfo SizeClassInfoArray[NumClasses] = {}; diff --git a/standalone/primary64.h b/standalone/primary64.h index 4caf8eb1751..bed2ccb8b99 100644 --- a/standalone/primary64.h +++ b/standalone/primary64.h @@ -9,10 +9,13 @@ #ifndef SCUDO_PRIMARY64_H_ #define SCUDO_PRIMARY64_H_ +#include "allocator_common.h" #include "bytemap.h" #include "common.h" +#include "condition_variable.h" #include "list.h" #include "local_cache.h" +#include "mem_map.h" #include "memtag.h" #include "options.h" #include "release.h" @@ -44,96 +47,226 @@ namespace scudo { template <typename Config> class SizeClassAllocator64 { public: - typedef typename Config::PrimaryCompactPtrT CompactPtrT; - static const uptr CompactPtrScale = Config::PrimaryCompactPtrScale; - static const uptr GroupSizeLog = Config::PrimaryGroupSizeLog; + typedef typename Config::CompactPtrT CompactPtrT; typedef typename Config::SizeClassMap SizeClassMap; + typedef typename Config::ConditionVariableT ConditionVariableT; + static const uptr CompactPtrScale = Config::getCompactPtrScale(); + static const uptr RegionSizeLog = Config::getRegionSizeLog(); + static const uptr GroupSizeLog = Config::getGroupSizeLog(); + static_assert(RegionSizeLog >= GroupSizeLog, + "Group size shouldn't be greater than the region size"); + static const uptr GroupScale = GroupSizeLog - CompactPtrScale; typedef SizeClassAllocator64<Config> ThisT; typedef SizeClassAllocatorLocalCache<ThisT> CacheT; - typedef typename CacheT::TransferBatch TransferBatch; - typedef typename CacheT::BatchGroup BatchGroup; + typedef TransferBatch<ThisT> TransferBatchT; + typedef BatchGroup<ThisT> BatchGroupT; + + static_assert(sizeof(BatchGroupT) <= sizeof(TransferBatchT), + "BatchGroupT uses the same class size as TransferBatchT"); static uptr getSizeByClassId(uptr ClassId) { return (ClassId == SizeClassMap::BatchClassId) - ? roundUp(sizeof(TransferBatch), 1U << CompactPtrScale) + ? roundUp(sizeof(TransferBatchT), 1U << CompactPtrScale) : SizeClassMap::getSizeByClassId(ClassId); } static bool canAllocate(uptr Size) { return Size <= SizeClassMap::MaxSize; } + static bool conditionVariableEnabled() { + return Config::hasConditionVariableT(); + } + void init(s32 ReleaseToOsInterval) NO_THREAD_SAFETY_ANALYSIS { DCHECK(isAligned(reinterpret_cast<uptr>(this), alignof(ThisT))); - DCHECK_EQ(PrimaryBase, 0U); - // Reserve the space required for the Primary. - PrimaryBase = reinterpret_cast<uptr>(map( - nullptr, PrimarySize, "scudo:primary_reserve", MAP_NOACCESS, &Data)); + + const uptr PageSize = getPageSizeCached(); + const uptr GroupSize = (1UL << GroupSizeLog); + const uptr PagesInGroup = GroupSize / PageSize; + const uptr MinSizeClass = getSizeByClassId(1); + // When trying to release pages back to memory, visiting smaller size + // classes is expensive. Therefore, we only try to release smaller size + // classes when the amount of free blocks goes over a certain threshold (See + // the comment in releaseToOSMaybe() for more details). For example, for + // size class 32, we only do the release when the size of free blocks is + // greater than 97% of pages in a group. However, this may introduce another + // issue that if the number of free blocks is bouncing between 97% ~ 100%. + // Which means we may try many page releases but only release very few of + // them (less than 3% in a group). 
Even though we have + // `&ReleaseToOsIntervalMs` which slightly reduce the frequency of these + // calls but it will be better to have another guard to mitigate this issue. + // + // Here we add another constraint on the minimum size requirement. The + // constraint is determined by the size of in-use blocks in the minimal size + // class. Take size class 32 as an example, + // + // +- one memory group -+ + // +----------------------+------+ + // | 97% of free blocks | | + // +----------------------+------+ + // \ / + // 3% in-use blocks + // + // * The release size threshold is 97%. + // + // The 3% size in a group is about 7 pages. For two consecutive + // releaseToOSMaybe(), we require the difference between `PushedBlocks` + // should be greater than 7 pages. This mitigates the page releasing + // thrashing which is caused by memory usage bouncing around the threshold. + // The smallest size class takes longest time to do the page release so we + // use its size of in-use blocks as a heuristic. + SmallerBlockReleasePageDelta = + PagesInGroup * (1 + MinSizeClass / 16U) / 100; u32 Seed; - const u64 Time = getMonotonicTime(); + const u64 Time = getMonotonicTimeFast(); if (!getRandom(reinterpret_cast<void *>(&Seed), sizeof(Seed))) - Seed = static_cast<u32>(Time ^ (PrimaryBase >> 12)); - const uptr PageSize = getPageSizeCached(); - for (uptr I = 0; I < NumClasses; I++) { - RegionInfo *Region = getRegionInfo(I); - // The actual start of a region is offset by a random number of pages - // when PrimaryEnableRandomOffset is set. - Region->RegionBeg = getRegionBaseByClassId(I) + - (Config::PrimaryEnableRandomOffset - ? ((getRandomModN(&Seed, 16) + 1) * PageSize) - : 0); - Region->RandState = getRandomU32(&Seed); - Region->ReleaseInfo.LastReleaseAtNs = Time; + Seed = static_cast<u32>(Time ^ (reinterpret_cast<uptr>(&Seed) >> 12)); + + for (uptr I = 0; I < NumClasses; I++) + getRegionInfo(I)->RandState = getRandomU32(&Seed); + + if (Config::getEnableContiguousRegions()) { + ReservedMemoryT ReservedMemory = {}; + // Reserve the space required for the Primary. + CHECK(ReservedMemory.create(/*Addr=*/0U, RegionSize * NumClasses, + "scudo:primary_reserve")); + const uptr PrimaryBase = ReservedMemory.getBase(); + + for (uptr I = 0; I < NumClasses; I++) { + MemMapT RegionMemMap = ReservedMemory.dispatch( + PrimaryBase + (I << RegionSizeLog), RegionSize); + RegionInfo *Region = getRegionInfo(I); + + initRegion(Region, I, RegionMemMap, Config::getEnableRandomOffset()); + } + shuffle(RegionInfoArray, NumClasses, &Seed); } + + // The binding should be done after region shuffling so that it won't bind + // the FLLock from the wrong region. + for (uptr I = 0; I < NumClasses; I++) + getRegionInfo(I)->FLLockCV.bindTestOnly(getRegionInfo(I)->FLLock); + + // The default value in the primary config has the higher priority. 
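The "about 7 pages" figure in the SmallerBlockReleasePageDelta comment above follows from the formula under a typical configuration. The numbers below are an editorial sketch assuming a 4 KiB page size, a 1 MiB memory group, and a smallest size class of 32 bytes; other configurations give different values.

// Sketch of the SmallerBlockReleasePageDelta arithmetic (assumed parameters,
// not taken from the patch: 4 KiB pages, 1 MiB groups, MinSizeClass = 32).
constexpr unsigned long PageSize = 4096;
constexpr unsigned long GroupSize = 1UL << 20;
constexpr unsigned long PagesInGroup = GroupSize / PageSize;            // 256
constexpr unsigned long MinSizeClass = 32;
constexpr unsigned long Delta = PagesInGroup * (1 + MinSizeClass / 16) / 100;
static_assert(Delta == 7, "matches the 'about 7 pages' figure in the comment");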
+ if (Config::getDefaultReleaseToOsIntervalMs() != INT32_MIN) + ReleaseToOsInterval = Config::getDefaultReleaseToOsIntervalMs(); setOption(Option::ReleaseInterval, static_cast<sptr>(ReleaseToOsInterval)); } - void unmapTestOnly() NO_THREAD_SAFETY_ANALYSIS { + void unmapTestOnly() { for (uptr I = 0; I < NumClasses; I++) { RegionInfo *Region = getRegionInfo(I); + { + ScopedLock ML(Region->MMLock); + MemMapT MemMap = Region->MemMapInfo.MemMap; + if (MemMap.isAllocated()) + MemMap.unmap(MemMap.getBase(), MemMap.getCapacity()); + } *Region = {}; } - if (PrimaryBase) - unmap(reinterpret_cast<void *>(PrimaryBase), PrimarySize, UNMAP_ALL, - &Data); - PrimaryBase = 0U; } - TransferBatch *popBatch(CacheT *C, uptr ClassId) { + // When all blocks are freed, it has to be the same size as `AllocatedUser`. + void verifyAllBlocksAreReleasedTestOnly() { + // `BatchGroup` and `TransferBatch` also use the blocks from BatchClass. + uptr BatchClassUsedInFreeLists = 0; + for (uptr I = 0; I < NumClasses; I++) { + // We have to count BatchClassUsedInFreeLists in other regions first. + if (I == SizeClassMap::BatchClassId) + continue; + RegionInfo *Region = getRegionInfo(I); + ScopedLock ML(Region->MMLock); + ScopedLock FL(Region->FLLock); + const uptr BlockSize = getSizeByClassId(I); + uptr TotalBlocks = 0; + for (BatchGroupT &BG : Region->FreeListInfo.BlockList) { + // `BG::Batches` are `TransferBatches`. +1 for `BatchGroup`. + BatchClassUsedInFreeLists += BG.Batches.size() + 1; + for (const auto &It : BG.Batches) + TotalBlocks += It.getCount(); + } + + DCHECK_EQ(TotalBlocks, Region->MemMapInfo.AllocatedUser / BlockSize); + DCHECK_EQ(Region->FreeListInfo.PushedBlocks, + Region->FreeListInfo.PoppedBlocks); + } + + RegionInfo *Region = getRegionInfo(SizeClassMap::BatchClassId); + ScopedLock ML(Region->MMLock); + ScopedLock FL(Region->FLLock); + const uptr BlockSize = getSizeByClassId(SizeClassMap::BatchClassId); + uptr TotalBlocks = 0; + for (BatchGroupT &BG : Region->FreeListInfo.BlockList) { + if (LIKELY(!BG.Batches.empty())) { + for (const auto &It : BG.Batches) + TotalBlocks += It.getCount(); + } else { + // `BatchGroup` with empty freelist doesn't have `TransferBatch` record + // itself. + ++TotalBlocks; + } + } + DCHECK_EQ(TotalBlocks + BatchClassUsedInFreeLists, + Region->MemMapInfo.AllocatedUser / BlockSize); + DCHECK_GE(Region->FreeListInfo.PoppedBlocks, + Region->FreeListInfo.PushedBlocks); + const uptr BlocksInUse = + Region->FreeListInfo.PoppedBlocks - Region->FreeListInfo.PushedBlocks; + DCHECK_EQ(BlocksInUse, BatchClassUsedInFreeLists); + } + + u16 popBlocks(CacheT *C, uptr ClassId, CompactPtrT *ToArray, + const u16 MaxBlockCount) { DCHECK_LT(ClassId, NumClasses); RegionInfo *Region = getRegionInfo(ClassId); - bool PrintStats = false; + u16 PopCount = 0; + { - ScopedLock L(Region->Mutex); - TransferBatch *B = popBatchImpl(C, ClassId, Region); - if (LIKELY(B)) { - Region->Stats.PoppedBlocks += B->getCount(); - return B; - } + ScopedLock L(Region->FLLock); + PopCount = popBlocksImpl(C, ClassId, Region, ToArray, MaxBlockCount); + if (PopCount != 0U) + return PopCount; + } - const bool RegionIsExhausted = Region->Exhausted; - if (UNLIKELY(RegionIsExhausted || - !populateFreeList(C, ClassId, Region))) { - PrintStats = !RegionIsExhausted && Region->Exhausted; - } else { - B = popBatchImpl(C, ClassId, Region); - // if `populateFreeList` succeeded, we are supposed to get free blocks. 
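The replacement pop path in the lines that follow relies on a double-checked pattern across the two new locks: the freelist is consulted under FLLock first, and only the thread holding MMLock maps new pages, re-checking the freelist before doing so. A self-contained sketch of that shape using std::mutex, with hypothetical names, for illustration only:

// Double-checked pop pattern, sketched with std::mutex (names are illustrative;
// the real code uses RegionInfo::FLLock/MMLock and scudo's ScopedLock).
#include <mutex>
#include <vector>

struct RegionSketch {
  std::mutex MMLock;            // serializes mapping new pages
  std::mutex FLLock;            // guards the freelist
  std::vector<int> FreeList;
};

static int popOne(RegionSketch &R) {
  if (R.FreeList.empty())
    return -1;
  const int Block = R.FreeList.back();
  R.FreeList.pop_back();
  return Block;
}

int popBlockSketch(RegionSketch &R) {
  {
    std::scoped_lock FL(R.FLLock);      // fast path: freelist only
    if (int B = popOne(R); B >= 0)
      return B;
  }
  std::scoped_lock ML(R.MMLock);        // slow path: at most one mapper at a time
  {
    std::scoped_lock FL(R.FLLock);      // another thread may have refilled it
    if (int B = popOne(R); B >= 0)
      return B;
  }
  std::scoped_lock FL(R.FLLock);        // stand-in for populating fresh blocks
  R.FreeList = {1, 2, 3};
  return popOne(R);
}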
- DCHECK_NE(B, nullptr); - Region->Stats.PoppedBlocks += B->getCount(); - return B; + bool ReportRegionExhausted = false; + + if (conditionVariableEnabled()) { + PopCount = popBlocksWithCV(C, ClassId, Region, ToArray, MaxBlockCount, + ReportRegionExhausted); + } else { + while (true) { + // When two threads compete for `Region->MMLock`, we only want one of + // them to call populateFreeListAndPopBatch(). To avoid both of them + // doing that, always check the freelist before mapping new pages. + ScopedLock ML(Region->MMLock); + { + ScopedLock FL(Region->FLLock); + PopCount = popBlocksImpl(C, ClassId, Region, ToArray, MaxBlockCount); + if (PopCount != 0U) + return PopCount; + } + + const bool RegionIsExhausted = Region->Exhausted; + if (!RegionIsExhausted) { + PopCount = populateFreeListAndPopBlocks(C, ClassId, Region, ToArray, + MaxBlockCount); + } + ReportRegionExhausted = !RegionIsExhausted && Region->Exhausted; + break; } } - // Note that `getStats()` requires locking each region so we can't call it - // while locking the Region->Mutex in the above. - if (UNLIKELY(PrintStats)) { - ScopedString Str; - getStats(&Str); - Str.append( - "Scudo OOM: The process has exhausted %zuM for size class %zu.\n", - RegionSize >> 20, getSizeByClassId(ClassId)); - Str.output(); + if (UNLIKELY(ReportRegionExhausted)) { + Printf("Can't populate more pages for size class %zu.\n", + getSizeByClassId(ClassId)); + + // Theoretically, BatchClass shouldn't be used up. Abort immediately when + // it happens. + if (ClassId == SizeClassMap::BatchClassId) + reportOutOfBatchClass(); } - return nullptr; + + return PopCount; } // Push the array of free blocks to the designated batch group. @@ -143,68 +276,39 @@ public: RegionInfo *Region = getRegionInfo(ClassId); if (ClassId == SizeClassMap::BatchClassId) { - bool PrintStats = false; - { - ScopedLock L(Region->Mutex); - // Constructing a batch group in the free list will use two blocks in - // BatchClassId. If we are pushing BatchClassId blocks, we will use the - // blocks in the array directly (can't delegate local cache which will - // cause a recursive allocation). However, The number of free blocks may - // be less than two. Therefore, populate the free list before inserting - // the blocks. - if (Size >= 2U) { - pushBlocksImpl(C, SizeClassMap::BatchClassId, Region, Array, Size); - Region->Stats.PushedBlocks += Size; - } else { - const bool RegionIsExhausted = Region->Exhausted; - if (UNLIKELY( - RegionIsExhausted || - !populateFreeList(C, SizeClassMap::BatchClassId, Region))) { - PrintStats = !RegionIsExhausted && Region->Exhausted; - } - } - } - - // Note that `getStats()` requires the lock of each region so we can't - // call it while locking the Region->Mutex in the above. - if (UNLIKELY(PrintStats)) { - ScopedString Str; - getStats(&Str); - Str.append( - "Scudo OOM: The process has exhausted %zuM for size class %zu.\n", - RegionSize >> 20, getSizeByClassId(ClassId)); - Str.output(); - // Theoretically, BatchClass shouldn't be used up. Abort immediately - // when it happens. - reportOutOfBatchClass(); - } + ScopedLock L(Region->FLLock); + pushBatchClassBlocks(Region, Array, Size); + if (conditionVariableEnabled()) + Region->FLLockCV.notifyAll(Region->FLLock); return; } // TODO(chiahungduan): Consider not doing grouping if the group size is not // greater than the block size with a certain scale. - // Sort the blocks so that blocks belonging to the same group can be pushed - // together. 
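[Illustrative sketch, not part of the patch] The new popBlocks() path above follows a two-lock shape: try the freelist under the freelist lock first, and only fall back to the memory-map lock, re-checking the freelist before populating, when the fast path finds nothing. A minimal model of that shape with standard mutexes; scudo's HybridMutex and block types are replaced with placeholders.

#include <cstdio>
#include <mutex>
#include <vector>

struct ToyRegion {
  std::mutex FLLock;          // guards the freelist
  std::mutex MMLock;          // guards (simulated) mapping work
  std::vector<int> FreeList;  // toy freelist of block ids
  int NextBlockId = 0;
};

static int tryPopLocked(ToyRegion &R) {  // caller holds FLLock
  if (R.FreeList.empty())
    return -1;
  int B = R.FreeList.back();
  R.FreeList.pop_back();
  return B;
}

static int popBlock(ToyRegion &R) {
  {
    std::lock_guard<std::mutex> FL(R.FLLock);  // fast path
    if (int B = tryPopLocked(R); B >= 0)
      return B;
  }
  std::lock_guard<std::mutex> MM(R.MMLock);    // slow path: serialize populators
  {
    std::lock_guard<std::mutex> FL(R.FLLock);  // re-check before mapping pages
    if (int B = tryPopLocked(R); B >= 0)
      return B;
  }
  std::lock_guard<std::mutex> FL(R.FLLock);    // simulate refilling the freelist
  for (int I = 0; I < 4; ++I)
    R.FreeList.push_back(R.NextBlockId++);
  return tryPopLocked(R);
}

int main() {
  ToyRegion R;
  std::printf("got block %d\n", popBlock(R));
  return 0;
}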
bool SameGroup = true; - for (u32 I = 1; I < Size; ++I) { - if (compactPtrGroup(Array[I - 1]) != compactPtrGroup(Array[I])) - SameGroup = false; - CompactPtrT Cur = Array[I]; - u32 J = I; - while (J > 0 && compactPtrGroup(Cur) < compactPtrGroup(Array[J - 1])) { - Array[J] = Array[J - 1]; - --J; + if (GroupSizeLog < RegionSizeLog) { + // Sort the blocks so that blocks belonging to the same group can be + // pushed together. + for (u32 I = 1; I < Size; ++I) { + if (compactPtrGroup(Array[I - 1]) != compactPtrGroup(Array[I])) + SameGroup = false; + CompactPtrT Cur = Array[I]; + u32 J = I; + while (J > 0 && compactPtrGroup(Cur) < compactPtrGroup(Array[J - 1])) { + Array[J] = Array[J - 1]; + --J; + } + Array[J] = Cur; } - Array[J] = Cur; } - ScopedLock L(Region->Mutex); - pushBlocksImpl(C, ClassId, Region, Array, Size, SameGroup); - - Region->Stats.PushedBlocks += Size; - if (ClassId != SizeClassMap::BatchClassId) - releaseToOSMaybe(Region, ClassId); + { + ScopedLock L(Region->FLLock); + pushBlocksImpl(C, ClassId, Region, Array, Size, SameGroup); + if (conditionVariableEnabled()) + Region->FLLockCV.notifyAll(Region->FLLock); + } } void disable() NO_THREAD_SAFETY_ANALYSIS { @@ -212,17 +316,21 @@ public: for (sptr I = static_cast<sptr>(NumClasses) - 1; I >= 0; I--) { if (static_cast<uptr>(I) == SizeClassMap::BatchClassId) continue; - getRegionInfo(static_cast<uptr>(I))->Mutex.lock(); + getRegionInfo(static_cast<uptr>(I))->MMLock.lock(); + getRegionInfo(static_cast<uptr>(I))->FLLock.lock(); } - getRegionInfo(SizeClassMap::BatchClassId)->Mutex.lock(); + getRegionInfo(SizeClassMap::BatchClassId)->MMLock.lock(); + getRegionInfo(SizeClassMap::BatchClassId)->FLLock.lock(); } void enable() NO_THREAD_SAFETY_ANALYSIS { - getRegionInfo(SizeClassMap::BatchClassId)->Mutex.unlock(); + getRegionInfo(SizeClassMap::BatchClassId)->FLLock.unlock(); + getRegionInfo(SizeClassMap::BatchClassId)->MMLock.unlock(); for (uptr I = 0; I < NumClasses; I++) { if (I == SizeClassMap::BatchClassId) continue; - getRegionInfo(I)->Mutex.unlock(); + getRegionInfo(I)->FLLock.unlock(); + getRegionInfo(I)->MMLock.unlock(); } } @@ -234,10 +342,11 @@ public: // TODO: The call of `iterateOverBlocks` requires disabling // SizeClassAllocator64. We may consider locking each region on demand // only. - Region->Mutex.assertHeld(); + Region->FLLock.assertHeld(); + Region->MMLock.assertHeld(); const uptr BlockSize = getSizeByClassId(I); const uptr From = Region->RegionBeg; - const uptr To = From + Region->AllocatedUser; + const uptr To = From + Region->MemMapInfo.AllocatedUser; for (uptr Block = From; Block < To; Block += BlockSize) Callback(Block); } @@ -250,29 +359,47 @@ public: uptr PushedBlocks = 0; for (uptr I = 0; I < NumClasses; I++) { RegionInfo *Region = getRegionInfo(I); - ScopedLock L(Region->Mutex); - if (Region->MappedUser) - TotalMapped += Region->MappedUser; - PoppedBlocks += Region->Stats.PoppedBlocks; - PushedBlocks += Region->Stats.PushedBlocks; + { + ScopedLock L(Region->MMLock); + TotalMapped += Region->MemMapInfo.MappedUser; + } + { + ScopedLock L(Region->FLLock); + PoppedBlocks += Region->FreeListInfo.PoppedBlocks; + PushedBlocks += Region->FreeListInfo.PushedBlocks; + } } + const s32 IntervalMs = atomic_load_relaxed(&ReleaseToOsIntervalMs); Str->append("Stats: SizeClassAllocator64: %zuM mapped (%uM rss) in %zu " - "allocations; remains %zu\n", + "allocations; remains %zu; ReleaseToOsIntervalMs = %d\n", TotalMapped >> 20, 0U, PoppedBlocks, - PoppedBlocks - PushedBlocks); + PoppedBlocks - PushedBlocks, IntervalMs >= 0 ? 
IntervalMs : -1); for (uptr I = 0; I < NumClasses; I++) { RegionInfo *Region = getRegionInfo(I); - ScopedLock L(Region->Mutex); - getStats(Str, I, Region, 0); + ScopedLock L1(Region->MMLock); + ScopedLock L2(Region->FLLock); + getStats(Str, I, Region); + } + } + + void getFragmentationInfo(ScopedString *Str) { + Str->append( + "Fragmentation Stats: SizeClassAllocator64: page size = %zu bytes\n", + getPageSizeCached()); + + for (uptr I = 1; I < NumClasses; I++) { + RegionInfo *Region = getRegionInfo(I); + ScopedLock L(Region->MMLock); + getRegionFragmentationInfo(Region, I, Str); } } bool setOption(Option O, sptr Value) { if (O == Option::ReleaseInterval) { const s32 Interval = Max( - Min(static_cast<s32>(Value), Config::PrimaryMaxReleaseToOsIntervalMs), - Config::PrimaryMinReleaseToOsIntervalMs); + Min(static_cast<s32>(Value), Config::getMaxReleaseToOsIntervalMs()), + Config::getMinReleaseToOsIntervalMs()); atomic_store_relaxed(&ReleaseToOsIntervalMs, Interval); return true; } @@ -280,14 +407,27 @@ public: return true; } - uptr releaseToOS() { + uptr tryReleaseToOS(uptr ClassId, ReleaseToOS ReleaseType) { + RegionInfo *Region = getRegionInfo(ClassId); + // Note that the tryLock() may fail spuriously, given that it should rarely + // happen and page releasing is fine to skip, we don't take certain + // approaches to ensure one page release is done. + if (Region->MMLock.tryLock()) { + uptr BytesReleased = releaseToOSMaybe(Region, ClassId, ReleaseType); + Region->MMLock.unlock(); + return BytesReleased; + } + return 0; + } + + uptr releaseToOS(ReleaseToOS ReleaseType) { uptr TotalReleasedBytes = 0; for (uptr I = 0; I < NumClasses; I++) { if (I == SizeClassMap::BatchClassId) continue; RegionInfo *Region = getRegionInfo(I); - ScopedLock L(Region->Mutex); - TotalReleasedBytes += releaseToOSMaybe(Region, I, /*Force=*/true); + ScopedLock L(Region->MMLock); + TotalReleasedBytes += releaseToOSMaybe(Region, I, ReleaseType); } return TotalReleasedBytes; } @@ -299,9 +439,6 @@ public: static uptr getRegionInfoArraySize() { return sizeof(RegionInfoArray); } uptr getCompactPtrBaseByClassId(uptr ClassId) { - // If we are not compacting pointers, base everything off of 0. - if (sizeof(CompactPtrT) == sizeof(uptr) && CompactPtrScale == 0) - return 0; return getRegionInfo(ClassId)->RegionBeg; } @@ -327,13 +464,13 @@ public: if (I == SizeClassMap::BatchClassId) continue; uptr Begin = RegionInfoArray[I].RegionBeg; - // TODO(chiahungduan): In fact, We need to lock the RegionInfo::Mutex. + // TODO(chiahungduan): In fact, We need to lock the RegionInfo::MMLock. // However, the RegionInfoData is passed with const qualifier and lock the // mutex requires modifying RegionInfoData, which means we need to remove // the const qualifier. This may lead to another undefined behavior (The // first one is accessing `AllocatedUser` without locking. It's better to // pass `RegionInfoData` as `void *` then we can lock the mutex properly. 
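[Illustrative sketch, not part of the patch] Two details from the hunk above in isolation: setOption() clamps the requested release interval between the config's minimum and maximum, and tryReleaseToOS() simply skips the release when the region's map lock is contended rather than blocking on it. Sketched with standard primitives and placeholder names.

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <mutex>

// Clamp a requested release interval the way setOption() does.
static int clampIntervalMs(int Requested, int MinMs, int MaxMs) {
  return std::max(std::min(Requested, MaxMs), MinMs);
}

// Opportunistic release: if another thread holds the lock, skip this round;
// missing one page-release attempt is harmless.
static std::size_t tryRelease(std::mutex &MMLock, std::size_t (*DoRelease)()) {
  if (!MMLock.try_lock())
    return 0;
  const std::size_t Released = DoRelease();
  MMLock.unlock();
  return Released;
}

int main() {
  std::printf("clamped = %d\n", clampIntervalMs(5000, 0, 1000));  // 1000
  std::mutex M;
  std::printf("released = %zu\n", tryRelease(M, [] { return std::size_t(4096); }));
  return 0;
}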
- uptr End = Begin + RegionInfoArray[I].AllocatedUser; + uptr End = Begin + RegionInfoArray[I].MemMapInfo.AllocatedUser; if (Begin > End || End - Begin < SizeClassMap::getSizeByClassId(I)) continue; uptr RegionDistance; @@ -355,7 +492,8 @@ public: BlockInfo B = {}; if (MinDistance <= 8192) { B.RegionBegin = RegionInfoArray[ClassId].RegionBeg; - B.RegionEnd = B.RegionBegin + RegionInfoArray[ClassId].AllocatedUser; + B.RegionEnd = + B.RegionBegin + RegionInfoArray[ClassId].MemMapInfo.AllocatedUser; B.BlockSize = SizeClassMap::getSizeByClassId(ClassId); B.BlockBegin = B.RegionBegin + uptr(sptr(Ptr - B.RegionBegin) / sptr(B.BlockSize) * @@ -371,40 +509,50 @@ public: AtomicOptions Options; private: - static const uptr RegionSize = 1UL << Config::PrimaryRegionSizeLog; + static const uptr RegionSize = 1UL << RegionSizeLog; static const uptr NumClasses = SizeClassMap::NumClasses; - static const uptr PrimarySize = RegionSize * NumClasses; - static const uptr MapSizeIncrement = Config::PrimaryMapSizeIncrement; + static const uptr MapSizeIncrement = Config::getMapSizeIncrement(); // Fill at most this number of batches from the newly map'd memory. static const u32 MaxNumBatches = SCUDO_ANDROID ? 4U : 8U; - struct RegionStats { - uptr PoppedBlocks; - uptr PushedBlocks; - }; - struct ReleaseToOsInfo { - uptr PushedBlocksAtLastRelease; + uptr BytesInFreeListAtLastCheckpoint; uptr RangesReleased; uptr LastReleasedBytes; u64 LastReleaseAtNs; }; - struct UnpaddedRegionInfo { - HybridMutex Mutex; - SinglyLinkedList<BatchGroup> FreeList GUARDED_BY(Mutex); - // This is initialized before thread creation. - uptr RegionBeg = 0; - RegionStats Stats GUARDED_BY(Mutex) = {}; - u32 RandState GUARDED_BY(Mutex) = 0; + struct BlocksInfo { + SinglyLinkedList<BatchGroupT> BlockList = {}; + uptr PoppedBlocks = 0; + uptr PushedBlocks = 0; + }; + + struct PagesInfo { + MemMapT MemMap = {}; // Bytes mapped for user memory. - uptr MappedUser GUARDED_BY(Mutex) = 0; + uptr MappedUser = 0; // Bytes allocated for user memory. - uptr AllocatedUser GUARDED_BY(Mutex) = 0; - MapPlatformData Data GUARDED_BY(Mutex) = {}; - ReleaseToOsInfo ReleaseInfo GUARDED_BY(Mutex) = {}; - bool Exhausted GUARDED_BY(Mutex) = false; + uptr AllocatedUser = 0; + }; + + struct UnpaddedRegionInfo { + // Mutex for operations on freelist + HybridMutex FLLock; + ConditionVariableT FLLockCV GUARDED_BY(FLLock); + // Mutex for memmap operations + HybridMutex MMLock ACQUIRED_BEFORE(FLLock); + // `RegionBeg` is initialized before thread creation and won't be changed. + uptr RegionBeg = 0; + u32 RandState GUARDED_BY(MMLock) = 0; + BlocksInfo FreeListInfo GUARDED_BY(FLLock); + PagesInfo MemMapInfo GUARDED_BY(MMLock); + // The minimum size of pushed blocks to trigger page release. 
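[Illustrative sketch, not part of the patch] The reworked RegionInfo groups its fields by which mutex guards them: freelist counters and the block list under FLLock, mapping and release state under MMLock, with MMLock documented as acquired before FLLock, and the whole struct padded out to a cache-line multiple (as the padding that follows shows). A stripped-down model of that layout with standard types:

#include <cstddef>
#include <cstdint>
#include <mutex>

constexpr std::size_t kCacheLineSize = 64;  // assumed cache-line size

// The real code keeps an UnpaddedRegionInfo plus an explicit Padding[] member
// sized from SCUDO_CACHE_LINE_SIZE; alignas() is used here only to keep the
// toy version short.
struct alignas(kCacheLineSize) ToyRegionInfo {
  // Freelist state, guarded by FLLock.
  std::mutex FLLock;
  std::uint64_t PoppedBlocks = 0;
  std::uint64_t PushedBlocks = 0;

  // Mapping/release state, guarded by MMLock.
  // Documented lock order: MMLock is acquired before FLLock.
  std::mutex MMLock;
  std::uint64_t MappedUser = 0;
  std::uint64_t AllocatedUser = 0;
};

static_assert(sizeof(ToyRegionInfo) % kCacheLineSize == 0,
              "each region occupies whole cache lines");

int main() {
  ToyRegionInfo Regions[4];
  (void)Regions;
  return 0;
}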
+ uptr TryReleaseThreshold GUARDED_BY(MMLock) = 0; + ReleaseToOsInfo ReleaseInfo GUARDED_BY(MMLock) = {}; + bool Exhausted GUARDED_BY(MMLock) = false; + bool isPopulatingFreeList GUARDED_BY(FLLock) = false; }; struct RegionInfo : UnpaddedRegionInfo { char Padding[SCUDO_CACHE_LINE_SIZE - @@ -412,18 +560,20 @@ private: }; static_assert(sizeof(RegionInfo) % SCUDO_CACHE_LINE_SIZE == 0, ""); - uptr PrimaryBase = 0; - MapPlatformData Data = {}; - atomic_s32 ReleaseToOsIntervalMs = {}; - alignas(SCUDO_CACHE_LINE_SIZE) RegionInfo RegionInfoArray[NumClasses]; - RegionInfo *getRegionInfo(uptr ClassId) { DCHECK_LT(ClassId, NumClasses); return &RegionInfoArray[ClassId]; } - uptr getRegionBaseByClassId(uptr ClassId) const { - return PrimaryBase + (ClassId << Config::PrimaryRegionSizeLog); + uptr getRegionBaseByClassId(uptr ClassId) { + RegionInfo *Region = getRegionInfo(ClassId); + Region->MMLock.assertHeld(); + + if (!Config::getEnableContiguousRegions() && + !Region->MemMapInfo.MemMap.isAllocated()) { + return 0U; + } + return Region->MemMapInfo.MemMap.getBase(); } static CompactPtrT compactPtrInternal(uptr Base, uptr Ptr) { @@ -435,21 +585,168 @@ private: } static uptr compactPtrGroup(CompactPtrT CompactPtr) { - return static_cast<uptr>(CompactPtr) >> (GroupSizeLog - CompactPtrScale); + const uptr Mask = (static_cast<uptr>(1) << GroupScale) - 1; + return static_cast<uptr>(CompactPtr) & ~Mask; + } + static uptr decompactGroupBase(uptr Base, uptr CompactPtrGroupBase) { + DCHECK_EQ(CompactPtrGroupBase % (static_cast<uptr>(1) << (GroupScale)), 0U); + return Base + (CompactPtrGroupBase << CompactPtrScale); + } + + ALWAYS_INLINE static bool isSmallBlock(uptr BlockSize) { + const uptr PageSize = getPageSizeCached(); + return BlockSize < PageSize / 16U; } - static uptr batchGroupBase(uptr Base, uptr GroupId) { - return (GroupId << GroupSizeLog) + Base; + + ALWAYS_INLINE static bool isLargeBlock(uptr BlockSize) { + const uptr PageSize = getPageSizeCached(); + return BlockSize > PageSize; + } + + ALWAYS_INLINE void initRegion(RegionInfo *Region, uptr ClassId, + MemMapT MemMap, bool EnableRandomOffset) + REQUIRES(Region->MMLock) { + DCHECK(!Region->MemMapInfo.MemMap.isAllocated()); + DCHECK(MemMap.isAllocated()); + + const uptr PageSize = getPageSizeCached(); + + Region->MemMapInfo.MemMap = MemMap; + + Region->RegionBeg = MemMap.getBase(); + if (EnableRandomOffset) { + Region->RegionBeg += + (getRandomModN(&Region->RandState, 16) + 1) * PageSize; + } + + // Releasing small blocks is expensive, set a higher threshold to avoid + // frequent page releases. + if (isSmallBlock(getSizeByClassId(ClassId))) + Region->TryReleaseThreshold = PageSize * SmallerBlockReleasePageDelta; + else + Region->TryReleaseThreshold = PageSize; + } + + void pushBatchClassBlocks(RegionInfo *Region, CompactPtrT *Array, u32 Size) + REQUIRES(Region->FLLock) { + DCHECK_EQ(Region, getRegionInfo(SizeClassMap::BatchClassId)); + + // Free blocks are recorded by TransferBatch in freelist for all + // size-classes. In addition, TransferBatch is allocated from BatchClassId. + // In order not to use additional block to record the free blocks in + // BatchClassId, they are self-contained. I.e., A TransferBatch records the + // block address of itself. See the figure below: + // + // TransferBatch at 0xABCD + // +----------------------------+ + // | Free blocks' addr | + // | +------+------+------+ | + // | |0xABCD|... |... 
| | + // | +------+------+------+ | + // +----------------------------+ + // + // When we allocate all the free blocks in the TransferBatch, the block used + // by TransferBatch is also free for use. We don't need to recycle the + // TransferBatch. Note that the correctness is maintained by the invariant, + // + // Each popBlocks() request returns the entire TransferBatch. Returning + // part of the blocks in a TransferBatch is invalid. + // + // This ensures that TransferBatch won't leak the address itself while it's + // still holding other valid data. + // + // Besides, BatchGroup is also allocated from BatchClassId and has its + // address recorded in the TransferBatch too. To maintain the correctness, + // + // The address of BatchGroup is always recorded in the last TransferBatch + // in the freelist (also imply that the freelist should only be + // updated with push_front). Once the last TransferBatch is popped, + // the block used by BatchGroup is also free for use. + // + // With this approach, the blocks used by BatchGroup and TransferBatch are + // reusable and don't need additional space for them. + + Region->FreeListInfo.PushedBlocks += Size; + BatchGroupT *BG = Region->FreeListInfo.BlockList.front(); + + if (BG == nullptr) { + // Construct `BatchGroup` on the last element. + BG = reinterpret_cast<BatchGroupT *>( + decompactPtr(SizeClassMap::BatchClassId, Array[Size - 1])); + --Size; + BG->Batches.clear(); + // BatchClass hasn't enabled memory group. Use `0` to indicate there's no + // memory group here. + BG->CompactPtrGroupBase = 0; + // `BG` is also the block of BatchClassId. Note that this is different + // from `CreateGroup` in `pushBlocksImpl` + BG->PushedBlocks = 1; + BG->BytesInBGAtLastCheckpoint = 0; + BG->MaxCachedPerBatch = + CacheT::getMaxCached(getSizeByClassId(SizeClassMap::BatchClassId)); + + Region->FreeListInfo.BlockList.push_front(BG); + } + + if (UNLIKELY(Size == 0)) + return; + + // This happens under 2 cases. + // 1. just allocated a new `BatchGroup`. + // 2. Only 1 block is pushed when the freelist is empty. + if (BG->Batches.empty()) { + // Construct the `TransferBatch` on the last element. + TransferBatchT *TB = reinterpret_cast<TransferBatchT *>( + decompactPtr(SizeClassMap::BatchClassId, Array[Size - 1])); + TB->clear(); + // As mentioned above, addresses of `TransferBatch` and `BatchGroup` are + // recorded in the TransferBatch. + TB->add(Array[Size - 1]); + TB->add( + compactPtr(SizeClassMap::BatchClassId, reinterpret_cast<uptr>(BG))); + --Size; + DCHECK_EQ(BG->PushedBlocks, 1U); + // `TB` is also the block of BatchClassId. + BG->PushedBlocks += 1; + BG->Batches.push_front(TB); + } + + TransferBatchT *CurBatch = BG->Batches.front(); + DCHECK_NE(CurBatch, nullptr); + + for (u32 I = 0; I < Size;) { + u16 UnusedSlots = + static_cast<u16>(BG->MaxCachedPerBatch - CurBatch->getCount()); + if (UnusedSlots == 0) { + CurBatch = reinterpret_cast<TransferBatchT *>( + decompactPtr(SizeClassMap::BatchClassId, Array[I])); + CurBatch->clear(); + // Self-contained + CurBatch->add(Array[I]); + ++I; + // TODO(chiahungduan): Avoid the use of push_back() in `Batches` of + // BatchClassId. + BG->Batches.push_front(CurBatch); + UnusedSlots = static_cast<u16>(BG->MaxCachedPerBatch - 1); + } + // `UnusedSlots` is u16 so the result will be also fit in u16. 
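[Illustrative sketch, not part of the patch] The "self-contained" invariant described above can be seen in a toy model: the batch object is constructed inside one of the free blocks it records and lists that block's own address as an entry, so draining the batch also hands out the batch's own storage and nothing needs a separate deallocation. Made-up types, not scudo's TransferBatch:

#include <cstdint>
#include <cstdio>
#include <new>

struct ToyBatch {
  static constexpr int MaxCached = 4;
  std::uintptr_t Blocks[MaxCached] = {};
  int Count = 0;
  void add(std::uintptr_t B) { Blocks[Count++] = B; }
};

int main() {
  // Pretend these are three free blocks of the batch size class.
  alignas(ToyBatch) unsigned char Slab[sizeof(ToyBatch) * 3] = {};
  const std::uintptr_t B0 = reinterpret_cast<std::uintptr_t>(&Slab[0]);
  const std::uintptr_t B1 = B0 + sizeof(ToyBatch);
  const std::uintptr_t B2 = B1 + sizeof(ToyBatch);

  // Construct the batch inside block B2 and record B2 itself as a free block.
  ToyBatch *TB = new (reinterpret_cast<void *>(B2)) ToyBatch();
  TB->add(B2);  // self-contained: the batch records its own storage
  TB->add(B0);
  TB->add(B1);

  // Draining the whole batch also gives out B2, i.e. the batch's own storage,
  // which is why the real TransferBatch never needs to be recycled separately.
  for (int I = TB->Count - 1; I >= 0; --I)
    std::printf("popped block at %p\n", reinterpret_cast<void *>(TB->Blocks[I]));
  return 0;
}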
+ const u16 AppendSize = static_cast<u16>(Min<u32>(UnusedSlots, Size - I)); + CurBatch->appendFromArray(&Array[I], AppendSize); + I += AppendSize; + } + + BG->PushedBlocks += Size; } // Push the blocks to their batch group. The layout will be like, // - // FreeList - > BG -> BG -> BG - // | | | - // v v v - // TB TB TB - // | - // v - // TB + // FreeListInfo.BlockList - > BG -> BG -> BG + // | | | + // v v v + // TB TB TB + // | + // v + // TB // // Each BlockGroup(BG) will associate with unique group id and the free blocks // are managed by a list of TransferBatch(TB). To reduce the time of inserting @@ -457,86 +754,32 @@ private: // that we can get better performance of maintaining sorted property. // Use `SameGroup=true` to indicate that all blocks in the array are from the // same group then we will skip checking the group id of each block. - // - // The region mutex needs to be held while calling this method. void pushBlocksImpl(CacheT *C, uptr ClassId, RegionInfo *Region, CompactPtrT *Array, u32 Size, bool SameGroup = false) - REQUIRES(Region->Mutex) { + REQUIRES(Region->FLLock) { + DCHECK_NE(ClassId, SizeClassMap::BatchClassId); DCHECK_GT(Size, 0U); - auto CreateGroup = [&](uptr GroupId) { - BatchGroup *BG = nullptr; - TransferBatch *TB = nullptr; - if (ClassId == SizeClassMap::BatchClassId) { - DCHECK_GE(Size, 2U); - - // Free blocks are recorded by TransferBatch in freelist, blocks of - // BatchClassId are included. In order not to use additional memory to - // record blocks of BatchClassId, they are self-contained. I.e., A - // TransferBatch may record the block address of itself. See the figure - // below: - // - // TransferBatch at 0xABCD - // +----------------------------+ - // | Free blocks' addr | - // | +------+------+------+ | - // | |0xABCD|... |... | | - // | +------+------+------+ | - // +----------------------------+ - // - // The safeness of manipulating TransferBatch is kept by the invariant, - // - // The unit of each pop-block request is a TransferBatch. Return - // part of the blocks in a TransferBatch is not allowed. - // - // This ensures that TransferBatch won't leak the address itself while - // it's still holding other valid data. - // - // Besides, BatchGroup uses the same size-class as TransferBatch does - // and its address is recorded in the TransferBatch too. To maintain the - // safeness, the invariant to keep is, - // - // The address of itself is always recorded in the last TransferBatch - // of the freelist (also imply that the freelist should only be - // updated with push_front). Once the last TransferBatch is popped, - // the BatchGroup becomes invalid. - // - // As a result, the blocks used by BatchGroup and TransferBatch are - // reusable and don't need additional space for them. - BG = reinterpret_cast<BatchGroup *>( - decompactPtr(ClassId, Array[Size - 1])); - BG->Batches.clear(); - - TB = reinterpret_cast<TransferBatch *>( - decompactPtr(ClassId, Array[Size - 2])); - TB->clear(); - - // Append the blocks used by BatchGroup and TransferBatch immediately so - // that we ensure that they are in the last TransBatch. 
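[Illustrative sketch, not part of the patch] The layout in the comment above, a per-region block list of BatchGroups, each owning a list of TransferBatches, written out as a plain skeleton with simplified stand-in types:

#include <cstdint>
#include <forward_list>
#include <vector>

struct ToyTransferBatch {
  std::vector<std::uintptr_t> Blocks;  // up to MaxCachedPerBatch entries
};

struct ToyBatchGroup {
  std::uintptr_t CompactPtrGroupBase = 0;       // which memory group this is
  std::uint32_t PushedBlocks = 0;               // blocks pushed into this group
  std::uint32_t MaxCachedPerBatch = 0;
  std::forward_list<ToyTransferBatch> Batches;  // newest batch at the front
};

struct ToyRegionFreeList {
  // Kept sorted by CompactPtrGroupBase so pushes can walk to the right group.
  std::forward_list<ToyBatchGroup> BlockList;
  std::uint64_t PoppedBlocks = 0;
  std::uint64_t PushedBlocks = 0;
};

int main() {
  ToyRegionFreeList FL;
  FL.BlockList.push_front(ToyBatchGroup{});
  return 0;
}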
- TB->appendFromArray(Array + Size - 2, 2); - Size -= 2; - } else { - BG = C->createGroup(); - BG->Batches.clear(); - - TB = C->createBatch(ClassId, nullptr); - TB->clear(); - } + auto CreateGroup = [&](uptr CompactPtrGroupBase) { + BatchGroupT *BG = + reinterpret_cast<BatchGroupT *>(C->getBatchClassBlock()); + BG->Batches.clear(); + TransferBatchT *TB = + reinterpret_cast<TransferBatchT *>(C->getBatchClassBlock()); + TB->clear(); - BG->GroupId = GroupId; - // TODO(chiahungduan): Avoid the use of push_back() in `Batches`. + BG->CompactPtrGroupBase = CompactPtrGroupBase; BG->Batches.push_front(TB); BG->PushedBlocks = 0; - BG->PushedBlocksAtLastCheckpoint = 0; - BG->MaxCachedPerBatch = - TransferBatch::getMaxCached(getSizeByClassId(ClassId)); + BG->BytesInBGAtLastCheckpoint = 0; + BG->MaxCachedPerBatch = TransferBatchT::MaxNumCached; return BG; }; - auto InsertBlocks = [&](BatchGroup *BG, CompactPtrT *Array, u32 Size) { - SinglyLinkedList<TransferBatch> &Batches = BG->Batches; - TransferBatch *CurBatch = Batches.front(); + auto InsertBlocks = [&](BatchGroupT *BG, CompactPtrT *Array, u32 Size) { + SinglyLinkedList<TransferBatchT> &Batches = BG->Batches; + TransferBatchT *CurBatch = Batches.front(); DCHECK_NE(CurBatch, nullptr); for (u32 I = 0; I < Size;) { @@ -544,9 +787,8 @@ private: u16 UnusedSlots = static_cast<u16>(BG->MaxCachedPerBatch - CurBatch->getCount()); if (UnusedSlots == 0) { - CurBatch = C->createBatch( - ClassId, - reinterpret_cast<void *>(decompactPtr(ClassId, Array[I]))); + CurBatch = + reinterpret_cast<TransferBatchT *>(C->getBatchClassBlock()); CurBatch->clear(); Batches.push_front(CurBatch); UnusedSlots = BG->MaxCachedPerBatch; @@ -560,40 +802,33 @@ private: BG->PushedBlocks += Size; }; - BatchGroup *Cur = Region->FreeList.front(); - - if (ClassId == SizeClassMap::BatchClassId) { - if (Cur == nullptr) { - // Don't need to classify BatchClassId. - Cur = CreateGroup(/*GroupId=*/0); - Region->FreeList.push_front(Cur); - } - InsertBlocks(Cur, Array, Size); - return; - } + Region->FreeListInfo.PushedBlocks += Size; + BatchGroupT *Cur = Region->FreeListInfo.BlockList.front(); // In the following, `Cur` always points to the BatchGroup for blocks that // will be pushed next. `Prev` is the element right before `Cur`. - BatchGroup *Prev = nullptr; + BatchGroupT *Prev = nullptr; - while (Cur != nullptr && compactPtrGroup(Array[0]) > Cur->GroupId) { + while (Cur != nullptr && + compactPtrGroup(Array[0]) > Cur->CompactPtrGroupBase) { Prev = Cur; Cur = Cur->Next; } - if (Cur == nullptr || compactPtrGroup(Array[0]) != Cur->GroupId) { + if (Cur == nullptr || + compactPtrGroup(Array[0]) != Cur->CompactPtrGroupBase) { Cur = CreateGroup(compactPtrGroup(Array[0])); if (Prev == nullptr) - Region->FreeList.push_front(Cur); + Region->FreeListInfo.BlockList.push_front(Cur); else - Region->FreeList.insert(Prev, Cur); + Region->FreeListInfo.BlockList.insert(Prev, Cur); } // All the blocks are from the same group, just push without checking group // id. 
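[Illustrative sketch, not part of the patch] The push path above walks the sorted group list with a Prev/Cur pair and splices in a new group when no existing one matches the incoming block's group base. The same walk condensed over a toy singly linked list:

#include <cstddef>
#include <cstdint>
#include <cstdio>

struct ToyGroup {
  std::uintptr_t GroupBase;
  ToyGroup *Next;
};

// Find the group for `Base` in a list sorted by GroupBase, creating and
// splicing in a new node when it's missing. Returns the matching group.
static ToyGroup *findOrCreate(ToyGroup *&Head, std::uintptr_t Base) {
  ToyGroup *Prev = nullptr;
  ToyGroup *Cur = Head;
  while (Cur != nullptr && Base > Cur->GroupBase) {
    Prev = Cur;
    Cur = Cur->Next;
  }
  if (Cur == nullptr || Base != Cur->GroupBase) {
    ToyGroup *New = new ToyGroup{Base, Cur};
    if (Prev == nullptr)
      Head = New;        // becomes the new front
    else
      Prev->Next = New;  // spliced in after Prev
    Cur = New;
  }
  return Cur;
}

int main() {
  ToyGroup *Head = nullptr;
  findOrCreate(Head, 0x200000);
  findOrCreate(Head, 0x100000);
  for (ToyGroup *G = Head; G; G = G->Next)
    std::printf("group base %#zx\n", static_cast<std::size_t>(G->GroupBase));
  // Nodes intentionally leaked in this tiny illustration.
  return 0;
}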
if (SameGroup) { for (u32 I = 0; I < Size; ++I) - DCHECK_EQ(compactPtrGroup(Array[I]), Cur->GroupId); + DCHECK_EQ(compactPtrGroup(Array[I]), Cur->CompactPtrGroupBase); InsertBlocks(Cur, Array, Size); return; @@ -604,18 +839,20 @@ private: u32 Count = 1; for (u32 I = 1; I < Size; ++I) { if (compactPtrGroup(Array[I - 1]) != compactPtrGroup(Array[I])) { - DCHECK_EQ(compactPtrGroup(Array[I - 1]), Cur->GroupId); + DCHECK_EQ(compactPtrGroup(Array[I - 1]), Cur->CompactPtrGroupBase); InsertBlocks(Cur, Array + I - Count, Count); - while (Cur != nullptr && compactPtrGroup(Array[I]) > Cur->GroupId) { + while (Cur != nullptr && + compactPtrGroup(Array[I]) > Cur->CompactPtrGroupBase) { Prev = Cur; Cur = Cur->Next; } - if (Cur == nullptr || compactPtrGroup(Array[I]) != Cur->GroupId) { + if (Cur == nullptr || + compactPtrGroup(Array[I]) != Cur->CompactPtrGroupBase) { Cur = CreateGroup(compactPtrGroup(Array[I])); DCHECK_NE(Prev, nullptr); - Region->FreeList.insert(Prev, Cur); + Region->FreeListInfo.BlockList.insert(Prev, Cur); } Count = 1; @@ -627,48 +864,180 @@ private: InsertBlocks(Cur, Array + Size - Count, Count); } - // Pop one TransferBatch from a BatchGroup. The BatchGroup with the smallest - // group id will be considered first. - // - // The region mutex needs to be held while calling this method. - TransferBatch *popBatchImpl(CacheT *C, uptr ClassId, RegionInfo *Region) - REQUIRES(Region->Mutex) { - if (Region->FreeList.empty()) - return nullptr; - - SinglyLinkedList<TransferBatch> &Batches = - Region->FreeList.front()->Batches; - DCHECK(!Batches.empty()); - - TransferBatch *B = Batches.front(); - Batches.pop_front(); + u16 popBlocksWithCV(CacheT *C, uptr ClassId, RegionInfo *Region, + CompactPtrT *ToArray, const u16 MaxBlockCount, + bool &ReportRegionExhausted) { + u16 PopCount = 0; + + while (true) { + // We only expect one thread doing the freelist refillment and other + // threads will be waiting for either the completion of the + // `populateFreeListAndPopBatch()` or `pushBlocks()` called by other + // threads. + bool PopulateFreeList = false; + { + ScopedLock FL(Region->FLLock); + if (!Region->isPopulatingFreeList) { + Region->isPopulatingFreeList = true; + PopulateFreeList = true; + } + } + + if (PopulateFreeList) { + ScopedLock ML(Region->MMLock); + + const bool RegionIsExhausted = Region->Exhausted; + if (!RegionIsExhausted) { + PopCount = populateFreeListAndPopBlocks(C, ClassId, Region, ToArray, + MaxBlockCount); + } + ReportRegionExhausted = !RegionIsExhausted && Region->Exhausted; + + { + // Before reacquiring the `FLLock`, the freelist may be used up again + // and some threads are waiting for the freelist refillment by the + // current thread. It's important to set + // `Region->isPopulatingFreeList` to false so the threads about to + // sleep will notice the status change. + ScopedLock FL(Region->FLLock); + Region->isPopulatingFreeList = false; + Region->FLLockCV.notifyAll(Region->FLLock); + } + + break; + } + + // At here, there are two preconditions to be met before waiting, + // 1. The freelist is empty. + // 2. Region->isPopulatingFreeList == true, i.e, someone is still doing + // `populateFreeListAndPopBatch()`. + // + // Note that it has the chance that freelist is empty but + // Region->isPopulatingFreeList == false because all the new populated + // blocks were used up right after the refillment. Therefore, we have to + // check if someone is still populating the freelist. 
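[Illustrative sketch, not part of the patch] popBlocksWithCV() above elects a single refiller via the isPopulatingFreeList flag while other threads wait on the freelist condition variable and re-check after every wake-up. The same coordination pattern with std::condition_variable; the real refill drops FLLock and takes MMLock to map pages, which is collapsed into the loop body here.

#include <condition_variable>
#include <deque>
#include <mutex>

struct ToyRegion {
  std::mutex FLLock;
  std::condition_variable FLLockCV;
  std::deque<int> FreeList;
  bool IsPopulatingFreeList = false;
  int NextId = 0;
};

static int popBlock(ToyRegion &R) {
  while (true) {
    std::unique_lock<std::mutex> FL(R.FLLock);
    if (!R.FreeList.empty()) {
      int B = R.FreeList.front();
      R.FreeList.pop_front();
      return B;
    }
    if (!R.IsPopulatingFreeList) {
      // Become the single refiller; in the real code this is where FLLock is
      // released and MMLock taken to map pages before refilling.
      R.IsPopulatingFreeList = true;
      for (int I = 0; I < 4; ++I)
        R.FreeList.push_back(R.NextId++);
      R.IsPopulatingFreeList = false;
      R.FLLockCV.notify_all();
      continue;  // loop around and pop from the refilled list
    }
    // Someone else is refilling: sleep until the freelist changes, then
    // re-check from the top of the loop.
    R.FLLockCV.wait(FL);
  }
}

int main() { ToyRegion R; return popBlock(R) >= 0 ? 0 : 1; }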
+ ScopedLock FL(Region->FLLock); + PopCount = popBlocksImpl(C, ClassId, Region, ToArray, MaxBlockCount); + if (PopCount != 0U) + break; + + if (!Region->isPopulatingFreeList) + continue; + + // Now the freelist is empty and someone's doing the refillment. We will + // wait until anyone refills the freelist or someone finishes doing + // `populateFreeListAndPopBatch()`. The refillment can be done by + // `populateFreeListAndPopBatch()`, `pushBlocks()`, + // `pushBatchClassBlocks()` and `mergeGroupsToReleaseBack()`. + Region->FLLockCV.wait(Region->FLLock); + + PopCount = popBlocksImpl(C, ClassId, Region, ToArray, MaxBlockCount); + if (PopCount != 0U) + break; + } + + return PopCount; + } + + u16 popBlocksImpl(CacheT *C, uptr ClassId, RegionInfo *Region, + CompactPtrT *ToArray, const u16 MaxBlockCount) + REQUIRES(Region->FLLock) { + if (Region->FreeListInfo.BlockList.empty()) + return 0U; + + SinglyLinkedList<TransferBatchT> &Batches = + Region->FreeListInfo.BlockList.front()->Batches; + + if (Batches.empty()) { + DCHECK_EQ(ClassId, SizeClassMap::BatchClassId); + BatchGroupT *BG = Region->FreeListInfo.BlockList.front(); + Region->FreeListInfo.BlockList.pop_front(); + + // Block used by `BatchGroup` is from BatchClassId. Turn the block into + // `TransferBatch` with single block. + TransferBatchT *TB = reinterpret_cast<TransferBatchT *>(BG); + ToArray[0] = + compactPtr(SizeClassMap::BatchClassId, reinterpret_cast<uptr>(TB)); + Region->FreeListInfo.PoppedBlocks += 1; + return 1U; + } + + // So far, instead of always filling blocks to `MaxBlockCount`, we only + // examine single `TransferBatch` to minimize the time spent in the primary + // allocator. Besides, the sizes of `TransferBatch` and + // `CacheT::getMaxCached()` may also impact the time spent on accessing the + // primary allocator. + // TODO(chiahungduan): Evaluate if we want to always prepare `MaxBlockCount` + // blocks and/or adjust the size of `TransferBatch` according to + // `CacheT::getMaxCached()`. + TransferBatchT *B = Batches.front(); DCHECK_NE(B, nullptr); DCHECK_GT(B->getCount(), 0U); - if (Batches.empty()) { - BatchGroup *BG = Region->FreeList.front(); - Region->FreeList.pop_front(); - - // We don't keep BatchGroup with zero blocks to avoid empty-checking while - // allocating. Note that block used by constructing BatchGroup is recorded - // as free blocks in the last element of BatchGroup::Batches. Which means, - // once we pop the last TransferBatch, the block is implicitly - // deallocated. + // BachClassId should always take all blocks in the TransferBatch. Read the + // comment in `pushBatchClassBlocks()` for more details. + const u16 PopCount = ClassId == SizeClassMap::BatchClassId + ? B->getCount() + : Min(MaxBlockCount, B->getCount()); + B->moveNToArray(ToArray, PopCount); + + // TODO(chiahungduan): The deallocation of unused BatchClassId blocks can be + // done without holding `FLLock`. + if (B->empty()) { + Batches.pop_front(); + // `TransferBatch` of BatchClassId is self-contained, no need to + // deallocate. Read the comment in `pushBatchClassBlocks()` for more + // details. if (ClassId != SizeClassMap::BatchClassId) - C->deallocate(SizeClassMap::BatchClassId, BG); + C->deallocate(SizeClassMap::BatchClassId, B); + + if (Batches.empty()) { + BatchGroupT *BG = Region->FreeListInfo.BlockList.front(); + Region->FreeListInfo.BlockList.pop_front(); + + // We don't keep BatchGroup with zero blocks to avoid empty-checking + // while allocating. 
Note that block used for constructing BatchGroup is + // recorded as free blocks in the last element of BatchGroup::Batches. + // Which means, once we pop the last TransferBatch, the block is + // implicitly deallocated. + if (ClassId != SizeClassMap::BatchClassId) + C->deallocate(SizeClassMap::BatchClassId, BG); + } } - return B; + Region->FreeListInfo.PoppedBlocks += PopCount; + + return PopCount; } - NOINLINE bool populateFreeList(CacheT *C, uptr ClassId, RegionInfo *Region) - REQUIRES(Region->Mutex) { - const uptr Size = getSizeByClassId(ClassId); - const u16 MaxCount = TransferBatch::getMaxCached(Size); + NOINLINE u16 populateFreeListAndPopBlocks(CacheT *C, uptr ClassId, + RegionInfo *Region, + CompactPtrT *ToArray, + const u16 MaxBlockCount) + REQUIRES(Region->MMLock) EXCLUDES(Region->FLLock) { + if (!Config::getEnableContiguousRegions() && + !Region->MemMapInfo.MemMap.isAllocated()) { + ReservedMemoryT ReservedMemory; + if (UNLIKELY(!ReservedMemory.create(/*Addr=*/0U, RegionSize, + "scudo:primary_reserve", + MAP_ALLOWNOMEM))) { + Printf("Can't reserve pages for size class %zu.\n", + getSizeByClassId(ClassId)); + return 0U; + } + initRegion(Region, ClassId, + ReservedMemory.dispatch(ReservedMemory.getBase(), + ReservedMemory.getCapacity()), + /*EnableRandomOffset=*/false); + } + DCHECK(Region->MemMapInfo.MemMap.isAllocated()); + const uptr Size = getSizeByClassId(ClassId); + const u16 MaxCount = CacheT::getMaxCached(Size); const uptr RegionBeg = Region->RegionBeg; - const uptr MappedUser = Region->MappedUser; - const uptr TotalUserBytes = Region->AllocatedUser + MaxCount * Size; + const uptr MappedUser = Region->MemMapInfo.MappedUser; + const uptr TotalUserBytes = + Region->MemMapInfo.AllocatedUser + MaxCount * Size; // Map more space for blocks, if necessary. if (TotalUserBytes > MappedUser) { // Do the mmap for the user memory. @@ -677,37 +1046,39 @@ private: const uptr RegionBase = RegionBeg - getRegionBaseByClassId(ClassId); if (UNLIKELY(RegionBase + MappedUser + MapSize > RegionSize)) { Region->Exhausted = true; - return false; + return 0U; } - if (MappedUser == 0) - Region->Data = Data; - if (UNLIKELY(!map( - reinterpret_cast<void *>(RegionBeg + MappedUser), MapSize, - "scudo:primary", + + if (UNLIKELY(!Region->MemMapInfo.MemMap.remap( + RegionBeg + MappedUser, MapSize, "scudo:primary", MAP_ALLOWNOMEM | MAP_RESIZABLE | - (useMemoryTagging<Config>(Options.load()) ? MAP_MEMTAG : 0), - &Region->Data))) { - return false; + (useMemoryTagging<Config>(Options.load()) ? 
MAP_MEMTAG + : 0)))) { + return 0U; } - Region->MappedUser += MapSize; + Region->MemMapInfo.MappedUser += MapSize; C->getStats().add(StatMapped, MapSize); } - const u32 NumberOfBlocks = Min( - MaxNumBatches * MaxCount, - static_cast<u32>((Region->MappedUser - Region->AllocatedUser) / Size)); + const u32 NumberOfBlocks = + Min(MaxNumBatches * MaxCount, + static_cast<u32>((Region->MemMapInfo.MappedUser - + Region->MemMapInfo.AllocatedUser) / + Size)); DCHECK_GT(NumberOfBlocks, 0); constexpr u32 ShuffleArraySize = - MaxNumBatches * TransferBatch::MaxNumCached; + MaxNumBatches * TransferBatchT::MaxNumCached; CompactPtrT ShuffleArray[ShuffleArraySize]; DCHECK_LE(NumberOfBlocks, ShuffleArraySize); const uptr CompactPtrBase = getCompactPtrBaseByClassId(ClassId); - uptr P = RegionBeg + Region->AllocatedUser; + uptr P = RegionBeg + Region->MemMapInfo.AllocatedUser; for (u32 I = 0; I < NumberOfBlocks; I++, P += Size) ShuffleArray[I] = compactPtrInternal(CompactPtrBase, P); + ScopedLock L(Region->FLLock); + if (ClassId != SizeClassMap::BatchClassId) { u32 N = 1; uptr CurGroup = compactPtrGroup(ShuffleArray[0]); @@ -727,161 +1098,354 @@ private: pushBlocksImpl(C, ClassId, Region, &ShuffleArray[NumberOfBlocks - N], N, /*SameGroup=*/true); } else { - pushBlocksImpl(C, ClassId, Region, ShuffleArray, NumberOfBlocks, - /*SameGroup=*/true); + pushBatchClassBlocks(Region, ShuffleArray, NumberOfBlocks); } + const u16 PopCount = + popBlocksImpl(C, ClassId, Region, ToArray, MaxBlockCount); + DCHECK_NE(PopCount, 0U); + + // Note that `PushedBlocks` and `PoppedBlocks` are supposed to only record + // the requests from `PushBlocks` and `PopBatch` which are external + // interfaces. `populateFreeListAndPopBatch` is the internal interface so we + // should set the values back to avoid incorrectly setting the stats. + Region->FreeListInfo.PushedBlocks -= NumberOfBlocks; + const uptr AllocatedUser = Size * NumberOfBlocks; C->getStats().add(StatFree, AllocatedUser); - Region->AllocatedUser += AllocatedUser; + Region->MemMapInfo.AllocatedUser += AllocatedUser; - return true; + return PopCount; } - void getStats(ScopedString *Str, uptr ClassId, RegionInfo *Region, uptr Rss) - REQUIRES(Region->Mutex) { - if (Region->MappedUser == 0) + void getStats(ScopedString *Str, uptr ClassId, RegionInfo *Region) + REQUIRES(Region->MMLock, Region->FLLock) { + if (Region->MemMapInfo.MappedUser == 0) return; - const uptr InUse = Region->Stats.PoppedBlocks - Region->Stats.PushedBlocks; - const uptr TotalChunks = Region->AllocatedUser / getSizeByClassId(ClassId); - Str->append("%s %02zu (%6zu): mapped: %6zuK popped: %7zu pushed: %7zu " - "inuse: %6zu total: %6zu rss: %6zuK releases: %6zu last " - "released: %6zuK region: 0x%zx (0x%zx)\n", - Region->Exhausted ? 
"F" : " ", ClassId, - getSizeByClassId(ClassId), Region->MappedUser >> 10, - Region->Stats.PoppedBlocks, Region->Stats.PushedBlocks, InUse, - TotalChunks, Rss >> 10, Region->ReleaseInfo.RangesReleased, - Region->ReleaseInfo.LastReleasedBytes >> 10, Region->RegionBeg, - getRegionBaseByClassId(ClassId)); + const uptr BlockSize = getSizeByClassId(ClassId); + const uptr InUseBlocks = + Region->FreeListInfo.PoppedBlocks - Region->FreeListInfo.PushedBlocks; + const uptr BytesInFreeList = + Region->MemMapInfo.AllocatedUser - InUseBlocks * BlockSize; + uptr RegionPushedBytesDelta = 0; + if (BytesInFreeList >= + Region->ReleaseInfo.BytesInFreeListAtLastCheckpoint) { + RegionPushedBytesDelta = + BytesInFreeList - Region->ReleaseInfo.BytesInFreeListAtLastCheckpoint; + } + const uptr TotalChunks = Region->MemMapInfo.AllocatedUser / BlockSize; + Str->append( + "%s %02zu (%6zu): mapped: %6zuK popped: %7zu pushed: %7zu " + "inuse: %6zu total: %6zu releases: %6zu last " + "released: %6zuK latest pushed bytes: %6zuK region: 0x%zx (0x%zx)\n", + Region->Exhausted ? "E" : " ", ClassId, getSizeByClassId(ClassId), + Region->MemMapInfo.MappedUser >> 10, Region->FreeListInfo.PoppedBlocks, + Region->FreeListInfo.PushedBlocks, InUseBlocks, TotalChunks, + Region->ReleaseInfo.RangesReleased, + Region->ReleaseInfo.LastReleasedBytes >> 10, + RegionPushedBytesDelta >> 10, Region->RegionBeg, + getRegionBaseByClassId(ClassId)); + } + + void getRegionFragmentationInfo(RegionInfo *Region, uptr ClassId, + ScopedString *Str) REQUIRES(Region->MMLock) { + const uptr BlockSize = getSizeByClassId(ClassId); + const uptr AllocatedUserEnd = + Region->MemMapInfo.AllocatedUser + Region->RegionBeg; + + SinglyLinkedList<BatchGroupT> GroupsToRelease; + { + ScopedLock L(Region->FLLock); + GroupsToRelease = Region->FreeListInfo.BlockList; + Region->FreeListInfo.BlockList.clear(); + } + + FragmentationRecorder Recorder; + if (!GroupsToRelease.empty()) { + PageReleaseContext Context = + markFreeBlocks(Region, BlockSize, AllocatedUserEnd, + getCompactPtrBaseByClassId(ClassId), GroupsToRelease); + auto SkipRegion = [](UNUSED uptr RegionIndex) { return false; }; + releaseFreeMemoryToOS(Context, Recorder, SkipRegion); + + mergeGroupsToReleaseBack(Region, GroupsToRelease); + } + + ScopedLock L(Region->FLLock); + const uptr PageSize = getPageSizeCached(); + const uptr TotalBlocks = Region->MemMapInfo.AllocatedUser / BlockSize; + const uptr InUseBlocks = + Region->FreeListInfo.PoppedBlocks - Region->FreeListInfo.PushedBlocks; + const uptr AllocatedPagesCount = + roundUp(Region->MemMapInfo.AllocatedUser, PageSize) / PageSize; + DCHECK_GE(AllocatedPagesCount, Recorder.getReleasedPagesCount()); + const uptr InUsePages = + AllocatedPagesCount - Recorder.getReleasedPagesCount(); + const uptr InUseBytes = InUsePages * PageSize; + + uptr Integral; + uptr Fractional; + computePercentage(BlockSize * InUseBlocks, InUsePages * PageSize, &Integral, + &Fractional); + Str->append(" %02zu (%6zu): inuse/total blocks: %6zu/%6zu inuse/total " + "pages: %6zu/%6zu inuse bytes: %6zuK util: %3zu.%02zu%%\n", + ClassId, BlockSize, InUseBlocks, TotalBlocks, InUsePages, + AllocatedPagesCount, InUseBytes >> 10, Integral, Fractional); } NOINLINE uptr releaseToOSMaybe(RegionInfo *Region, uptr ClassId, - bool Force = false) REQUIRES(Region->Mutex) { + ReleaseToOS ReleaseType = ReleaseToOS::Normal) + REQUIRES(Region->MMLock) EXCLUDES(Region->FLLock) { const uptr BlockSize = getSizeByClassId(ClassId); + uptr BytesInFreeList; + const uptr AllocatedUserEnd = + 
Region->MemMapInfo.AllocatedUser + Region->RegionBeg; + SinglyLinkedList<BatchGroupT> GroupsToRelease; + + { + ScopedLock L(Region->FLLock); + + BytesInFreeList = Region->MemMapInfo.AllocatedUser - + (Region->FreeListInfo.PoppedBlocks - + Region->FreeListInfo.PushedBlocks) * + BlockSize; + if (UNLIKELY(BytesInFreeList == 0)) + return false; + + // ==================================================================== // + // 1. Check if we have enough free blocks and if it's worth doing a page + // release. + // ==================================================================== // + if (ReleaseType != ReleaseToOS::ForceAll && + !hasChanceToReleasePages(Region, BlockSize, BytesInFreeList, + ReleaseType)) { + return 0; + } + + // ==================================================================== // + // 2. Determine which groups can release the pages. Use a heuristic to + // gather groups that are candidates for doing a release. + // ==================================================================== // + if (ReleaseType == ReleaseToOS::ForceAll) { + GroupsToRelease = Region->FreeListInfo.BlockList; + Region->FreeListInfo.BlockList.clear(); + } else { + GroupsToRelease = + collectGroupsToRelease(Region, BlockSize, AllocatedUserEnd, + getCompactPtrBaseByClassId(ClassId)); + } + if (GroupsToRelease.empty()) + return 0; + } + + // Note that we have extracted the `GroupsToRelease` from region freelist. + // It's safe to let pushBlocks()/popBlocks() access the remaining region + // freelist. In the steps 3 and 4, we will temporarily release the FLLock + // and lock it again before step 5. + + // ==================================================================== // + // 3. Mark the free blocks in `GroupsToRelease` in the `PageReleaseContext`. + // Then we can tell which pages are in-use by querying + // `PageReleaseContext`. + // ==================================================================== // + PageReleaseContext Context = + markFreeBlocks(Region, BlockSize, AllocatedUserEnd, + getCompactPtrBaseByClassId(ClassId), GroupsToRelease); + if (UNLIKELY(!Context.hasBlockMarked())) { + mergeGroupsToReleaseBack(Region, GroupsToRelease); + return 0; + } + + // ==================================================================== // + // 4. Release the unused physical pages back to the OS. + // ==================================================================== // + RegionReleaseRecorder<MemMapT> Recorder(&Region->MemMapInfo.MemMap, + Region->RegionBeg, + Context.getReleaseOffset()); + auto SkipRegion = [](UNUSED uptr RegionIndex) { return false; }; + releaseFreeMemoryToOS(Context, Recorder, SkipRegion); + if (Recorder.getReleasedRangesCount() > 0) { + Region->ReleaseInfo.BytesInFreeListAtLastCheckpoint = BytesInFreeList; + Region->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount(); + Region->ReleaseInfo.LastReleasedBytes = Recorder.getReleasedBytes(); + } + Region->ReleaseInfo.LastReleaseAtNs = getMonotonicTimeFast(); + + // ====================================================================== // + // 5. Merge the `GroupsToRelease` back to the freelist. 
+ // ====================================================================== // + mergeGroupsToReleaseBack(Region, GroupsToRelease); + + return Recorder.getReleasedBytes(); + } + + bool hasChanceToReleasePages(RegionInfo *Region, uptr BlockSize, + uptr BytesInFreeList, ReleaseToOS ReleaseType) + REQUIRES(Region->MMLock, Region->FLLock) { + DCHECK_GE(Region->FreeListInfo.PoppedBlocks, + Region->FreeListInfo.PushedBlocks); const uptr PageSize = getPageSizeCached(); - DCHECK_GE(Region->Stats.PoppedBlocks, Region->Stats.PushedBlocks); - const uptr BytesInFreeList = - Region->AllocatedUser - - (Region->Stats.PoppedBlocks - Region->Stats.PushedBlocks) * BlockSize; - if (BytesInFreeList < PageSize) - return 0; // No chance to release anything. - const uptr BytesPushed = (Region->Stats.PushedBlocks - - Region->ReleaseInfo.PushedBlocksAtLastRelease) * - BlockSize; - if (BytesPushed < PageSize) - return 0; // Nothing new to release. - - bool CheckDensity = BlockSize < PageSize / 16U; + // Always update `BytesInFreeListAtLastCheckpoint` with the smallest value + // so that we won't underestimate the releasable pages. For example, the + // following is the region usage, + // + // BytesInFreeListAtLastCheckpoint AllocatedUser + // v v + // |---------------------------------------> + // ^ ^ + // BytesInFreeList ReleaseThreshold + // + // In general, if we have collected enough bytes and the amount of free + // bytes meets the ReleaseThreshold, we will try to do page release. If we + // don't update `BytesInFreeListAtLastCheckpoint` when the current + // `BytesInFreeList` is smaller, we may take longer time to wait for enough + // freed blocks because we miss the bytes between + // (BytesInFreeListAtLastCheckpoint - BytesInFreeList). + if (BytesInFreeList <= + Region->ReleaseInfo.BytesInFreeListAtLastCheckpoint) { + Region->ReleaseInfo.BytesInFreeListAtLastCheckpoint = BytesInFreeList; + } + + const uptr RegionPushedBytesDelta = + BytesInFreeList - Region->ReleaseInfo.BytesInFreeListAtLastCheckpoint; + if (RegionPushedBytesDelta < PageSize) + return false; + // Releasing smaller blocks is expensive, so we want to make sure that a // significant amount of bytes are free, and that there has been a good // amount of batches pushed to the freelist before attempting to release. - if (CheckDensity) { - if (!Force && BytesPushed < Region->AllocatedUser / 16U) - return 0; - } + if (isSmallBlock(BlockSize) && ReleaseType == ReleaseToOS::Normal) + if (RegionPushedBytesDelta < Region->TryReleaseThreshold) + return false; - if (!Force) { + if (ReleaseType == ReleaseToOS::Normal) { const s32 IntervalMs = atomic_load_relaxed(&ReleaseToOsIntervalMs); if (IntervalMs < 0) - return 0; - if (Region->ReleaseInfo.LastReleaseAtNs + - static_cast<u64>(IntervalMs) * 1000000 > - getMonotonicTime()) { - return 0; // Memory was returned recently. - } - } + return false; - const uptr GroupSize = (1U << GroupSizeLog); - const uptr AllocatedUserEnd = Region->AllocatedUser + Region->RegionBeg; - const uptr CompactPtrBase = getCompactPtrBaseByClassId(ClassId); - auto DecompactPtr = [CompactPtrBase](CompactPtrT CompactPtr) { - return decompactPtrInternal(CompactPtrBase, CompactPtr); - }; + // The constant 8 here is selected from profiling some apps and the number + // of unreleased pages in the large size classes is around 16 pages or + // more. Choose half of it as a heuristic and which also avoids page + // release every time for every pushBlocks() attempt by large blocks. 
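[Illustrative sketch, not part of the patch] Condensed into one predicate, the checkpoint update above plus the small-block threshold and the large-block interval bypass that follows look roughly like this; same decision shape, placeholder signature, not a drop-in copy of hasChanceToReleasePages().

#include <cstdint>

enum class ReleaseToOS { Normal, ForceAll };

struct ReleaseState {
  std::uint64_t BytesInFreeListAtLastCheckpoint = 0;
  std::uint64_t TryReleaseThreshold = 0;
  std::uint64_t LastReleaseAtNs = 0;
};

// Rough shape of the "is a page release worth attempting?" decision.
static bool hasChanceToReleasePages(ReleaseState &S, std::uint64_t BytesInFreeList,
                                    bool SmallBlock, bool LargeBlock,
                                    std::uint64_t PageSize, std::int64_t IntervalMs,
                                    std::uint64_t NowNs, ReleaseToOS Type) {
  // Track the smallest BytesInFreeList seen so the delta below is never
  // underestimated.
  if (BytesInFreeList <= S.BytesInFreeListAtLastCheckpoint)
    S.BytesInFreeListAtLastCheckpoint = BytesInFreeList;
  const std::uint64_t Delta = BytesInFreeList - S.BytesInFreeListAtLastCheckpoint;

  if (Delta < PageSize)
    return false;  // not even one page worth of newly freed bytes
  if (SmallBlock && Type == ReleaseToOS::Normal && Delta < S.TryReleaseThreshold)
    return false;  // small blocks use the adaptive threshold
  if (Type == ReleaseToOS::Normal) {
    if (IntervalMs < 0)
      return false;  // periodic release disabled
    // Large blocks with plenty of newly freed bytes may bypass the interval.
    const bool Bypass = LargeBlock && Delta > 8 * PageSize;
    if (!Bypass && S.LastReleaseAtNs + std::uint64_t(IntervalMs) * 1000000 > NowNs)
      return false;  // released too recently
  }
  return true;
}

int main() {
  ReleaseState S;
  S.TryReleaseThreshold = 7 * 4096;
  const bool Go = hasChanceToReleasePages(S, /*BytesInFreeList=*/64 * 4096,
                                          /*SmallBlock=*/true, /*LargeBlock=*/false,
                                          /*PageSize=*/4096, /*IntervalMs=*/0,
                                          /*NowNs=*/1, ReleaseToOS::Normal);
  return Go ? 0 : 1;
}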
+ const bool ByPassReleaseInterval = + isLargeBlock(BlockSize) && RegionPushedBytesDelta > 8 * PageSize; + if (!ByPassReleaseInterval) { + if (Region->ReleaseInfo.LastReleaseAtNs + + static_cast<u64>(IntervalMs) * 1000000 > + getMonotonicTimeFast()) { + // Memory was returned recently. + return false; + } + } + } // if (ReleaseType == ReleaseToOS::Normal) - // Instead of always preparing PageMap for the entire region, we only do it - // for the range of releasing groups. To do that, the free-block marking - // process includes visiting BlockGroups twice. + return true; + } - // The first visit is to determine the range of BatchGroups we are going to - // release. And we will extract those BatchGroups out and push into - // `GroupToRelease`. - SinglyLinkedList<BatchGroup> GroupToRelease; - GroupToRelease.clear(); + SinglyLinkedList<BatchGroupT> + collectGroupsToRelease(RegionInfo *Region, const uptr BlockSize, + const uptr AllocatedUserEnd, const uptr CompactPtrBase) + REQUIRES(Region->MMLock, Region->FLLock) { + const uptr GroupSize = (1UL << GroupSizeLog); + const uptr PageSize = getPageSizeCached(); + SinglyLinkedList<BatchGroupT> GroupsToRelease; - // This is only used for debugging to ensure the consistency of the number - // of groups. - uptr NumberOfBatchGroups = Region->FreeList.size(); + // We are examining each group and will take the minimum distance to the + // release threshold as the next Region::TryReleaseThreshold(). Note that if + // the size of free blocks has reached the release threshold, the distance + // to the next release will be PageSize * SmallerBlockReleasePageDelta. See + // the comment on `SmallerBlockReleasePageDelta` for more details. + uptr MinDistToThreshold = GroupSize; - for (BatchGroup *BG = Region->FreeList.front(), *Prev = nullptr; + for (BatchGroupT *BG = Region->FreeListInfo.BlockList.front(), + *Prev = nullptr; BG != nullptr;) { - const uptr PushedBytesDelta = - BG->PushedBlocks - BG->PushedBlocksAtLastCheckpoint; - if (PushedBytesDelta * BlockSize < PageSize) { - Prev = BG; - BG = BG->Next; - continue; - } - - // Group boundary does not necessarily have the same alignment as Region. - // It may sit across a Region boundary. Which means that we may have the - // following two cases, + // Group boundary is always GroupSize-aligned from CompactPtr base. The + // layout of memory groups is like, // - // 1. Group boundary sits before RegionBeg. + // (CompactPtrBase) + // #1 CompactPtrGroupBase #2 CompactPtrGroupBase ... + // | | | + // v v v + // +-----------------------+-----------------------+ + // \ / \ / + // --- GroupSize --- --- GroupSize --- // - // (BatchGroupBase) - // batchGroupBase RegionBeg BatchGroupEnd - // | | | - // v v v - // +------------+----------------+ - // \ / - // ------ GroupSize ------ - // - // 2. Group boundary sits after RegionBeg. - // - // (BatchGroupBase) - // RegionBeg batchGroupBase BatchGroupEnd - // | | | - // v v v - // +-----------+-----------------------------+ - // \ / - // ------ GroupSize ------ - // - // Note that in the first case, the group range before RegionBeg is never - // used. Therefore, while calculating the used group size, we should - // exclude that part to get the correct size. + // After decompacting the CompactPtrGroupBase, we expect the alignment + // property is held as well. 
const uptr BatchGroupBase = - Max(batchGroupBase(CompactPtrBase, BG->GroupId), Region->RegionBeg); + decompactGroupBase(CompactPtrBase, BG->CompactPtrGroupBase); + DCHECK_LE(Region->RegionBeg, BatchGroupBase); DCHECK_GE(AllocatedUserEnd, BatchGroupBase); - const uptr BatchGroupEnd = - batchGroupBase(CompactPtrBase, BG->GroupId) + GroupSize; - const uptr AllocatedGroupSize = AllocatedUserEnd >= BatchGroupEnd - ? BatchGroupEnd - BatchGroupBase - : AllocatedUserEnd - BatchGroupBase; - if (AllocatedGroupSize == 0) { - Prev = BG; - BG = BG->Next; - continue; - } - + DCHECK_EQ((Region->RegionBeg - BatchGroupBase) % GroupSize, 0U); // TransferBatches are pushed in front of BG.Batches. The first one may // not have all caches used. const uptr NumBlocks = (BG->Batches.size() - 1) * BG->MaxCachedPerBatch + BG->Batches.front()->getCount(); const uptr BytesInBG = NumBlocks * BlockSize; + + if (BytesInBG <= BG->BytesInBGAtLastCheckpoint) { + BG->BytesInBGAtLastCheckpoint = BytesInBG; + Prev = BG; + BG = BG->Next; + continue; + } + + const uptr PushedBytesDelta = BG->BytesInBGAtLastCheckpoint - BytesInBG; + // Given the randomness property, we try to release the pages only if the // bytes used by free blocks exceed certain proportion of group size. Note // that this heuristic only applies when all the spaces in a BatchGroup // are allocated. - if (CheckDensity && (BytesInBG * 100U) / AllocatedGroupSize < - (100U - 1U - BlockSize / 16U)) { - Prev = BG; - BG = BG->Next; - continue; + if (isSmallBlock(BlockSize)) { + const uptr BatchGroupEnd = BatchGroupBase + GroupSize; + const uptr AllocatedGroupSize = AllocatedUserEnd >= BatchGroupEnd + ? GroupSize + : AllocatedUserEnd - BatchGroupBase; + const uptr ReleaseThreshold = + (AllocatedGroupSize * (100 - 1U - BlockSize / 16U)) / 100U; + const bool HighDensity = BytesInBG >= ReleaseThreshold; + const bool MayHaveReleasedAll = NumBlocks >= (GroupSize / BlockSize); + // If all blocks in the group are released, we will do range marking + // which is fast. Otherwise, we will wait until we have accumulated + // a certain amount of free memory. + const bool ReachReleaseDelta = + MayHaveReleasedAll + ? true + : PushedBytesDelta >= PageSize * SmallerBlockReleasePageDelta; + + if (!HighDensity) { + DCHECK_LE(BytesInBG, ReleaseThreshold); + // The following is the usage of a memroy group, + // + // BytesInBG ReleaseThreshold + // / \ v + // +---+---------------------------+-----+ + // | | | | | + // +---+---------------------------+-----+ + // \ / ^ + // PushedBytesDelta GroupEnd + MinDistToThreshold = + Min(MinDistToThreshold, + ReleaseThreshold - BytesInBG + PushedBytesDelta); + } else { + // If it reaches high density at this round, the next time we will try + // to release is based on SmallerBlockReleasePageDelta + MinDistToThreshold = + Min(MinDistToThreshold, PageSize * SmallerBlockReleasePageDelta); + } + + if (!HighDensity || !ReachReleaseDelta) { + Prev = BG; + BG = BG->Next; + continue; + } } - // If `BG` is the first BatchGroup in the list, we only need to advance - // `BG` and call FreeList::pop_front(). No update is needed for `Prev`. + // If `BG` is the first BatchGroupT in the list, we only need to advance + // `BG` and call FreeListInfo.BlockList::pop_front(). No update is needed + // for `Prev`. // // (BG) (BG->Next) // Prev Cur BG @@ -892,7 +1456,7 @@ private: // +--+ +--+ // // Otherwise, `Prev` will be used to extract the `Cur` from the - // `FreeList`. + // `FreeListInfo.BlockList`. 
// // (BG) (BG->Next) // Prev Cur BG @@ -902,7 +1466,7 @@ private: // | | -> |X | -> | | -> ... // +--+ +--+ +--+ // - // After FreeList::extract(), + // After FreeListInfo.BlockList::extract(), // // Prev Cur BG // | | | @@ -913,26 +1477,53 @@ private: // +--------+ // // Note that we need to advance before pushing this BatchGroup to - // GroupToRelease because it's a destructive operation. + // GroupsToRelease because it's a destructive operation. - BatchGroup *Cur = BG; + BatchGroupT *Cur = BG; BG = BG->Next; + // Ideally, we may want to update this only after successful release. + // However, for smaller blocks, each block marking is a costly operation. + // Therefore, we update it earlier. + // TODO: Consider updating this after releasing pages if `ReleaseRecorder` + // can tell the released bytes in each group. + Cur->BytesInBGAtLastCheckpoint = BytesInBG; + if (Prev != nullptr) - Region->FreeList.extract(Prev, Cur); + Region->FreeListInfo.BlockList.extract(Prev, Cur); else - Region->FreeList.pop_front(); - GroupToRelease.push_back(Cur); + Region->FreeListInfo.BlockList.pop_front(); + GroupsToRelease.push_back(Cur); } - if (GroupToRelease.empty()) - return 0; + // Only small blocks have the adaptive `TryReleaseThreshold`. + if (isSmallBlock(BlockSize)) { + // If the MinDistToThreshold is not updated, that means each memory group + // may have only pushed less than a page size. In that case, just set it + // back to normal. + if (MinDistToThreshold == GroupSize) + MinDistToThreshold = PageSize * SmallerBlockReleasePageDelta; + Region->TryReleaseThreshold = MinDistToThreshold; + } - const uptr ReleaseBase = - Max(batchGroupBase(CompactPtrBase, GroupToRelease.front()->GroupId), - Region->RegionBeg); + return GroupsToRelease; + } + + PageReleaseContext + markFreeBlocks(RegionInfo *Region, const uptr BlockSize, + const uptr AllocatedUserEnd, const uptr CompactPtrBase, + SinglyLinkedList<BatchGroupT> &GroupsToRelease) + REQUIRES(Region->MMLock) EXCLUDES(Region->FLLock) { + const uptr GroupSize = (1UL << GroupSizeLog); + auto DecompactPtr = [CompactPtrBase](CompactPtrT CompactPtr) { + return decompactPtrInternal(CompactPtrBase, CompactPtr); + }; + + const uptr ReleaseBase = decompactGroupBase( + CompactPtrBase, GroupsToRelease.front()->CompactPtrGroupBase); const uptr LastGroupEnd = - Min(batchGroupBase(CompactPtrBase, GroupToRelease.back()->GroupId) + + Min(decompactGroupBase(CompactPtrBase, + GroupsToRelease.back()->CompactPtrGroupBase) + GroupSize, AllocatedUserEnd); // The last block may straddle the group boundary. Rounding up to BlockSize @@ -941,26 +1532,25 @@ private: roundUpSlow(LastGroupEnd - Region->RegionBeg, BlockSize) + Region->RegionBeg; const uptr ReleaseRangeSize = ReleaseEnd - ReleaseBase; + const uptr ReleaseOffset = ReleaseBase - Region->RegionBeg; - ReleaseRecorder Recorder(ReleaseBase, &Region->Data); - PageReleaseContext Context( - BlockSize, Region->AllocatedUser, /*NumberOfRegions=*/1U, - ReleaseRangeSize, /*ReleaseOffset=*/ReleaseBase - Region->RegionBeg); - - for (BatchGroup &BG : GroupToRelease) { - BG.PushedBlocksAtLastCheckpoint = BG.PushedBlocks; + PageReleaseContext Context(BlockSize, /*NumberOfRegions=*/1U, + ReleaseRangeSize, ReleaseOffset); + // We may not be able to do the page release in a rare case that we may + // fail on PageMap allocation. 
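[Illustrative sketch, not part of the patch] A worked instance of the density check in collectGroupsToRelease(): for a fully allocated group and a 32-byte class the threshold lands at the 97% figure quoted in the earlier SmallerBlockReleasePageDelta comment. The 1 MiB group size below is an assumption for the arithmetic, not a value read from the patch.

#include <cstdio>

int main() {
  const unsigned long AllocatedGroupSize = 1UL << 20;  // assume a fully used 1 MiB group
  const unsigned long BlockSize = 32;

  // Same shape as: (AllocatedGroupSize * (100 - 1U - BlockSize / 16U)) / 100U
  const unsigned long ReleaseThreshold =
      (AllocatedGroupSize * (100 - 1 - BlockSize / 16)) / 100;

  std::printf("release when free bytes in group >= %lu (%.0f%% of %lu)\n",
              ReleaseThreshold, 100.0 * ReleaseThreshold / AllocatedGroupSize,
              AllocatedGroupSize);
  return 0;
}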
+ if (UNLIKELY(!Context.ensurePageMapAllocated())) + return Context; - // TODO(chiahungduan): Replace GroupId with BatchGroupBase to simplify the - // calculation here and above (where we determine the set of groups to - // release). + for (BatchGroupT &BG : GroupsToRelease) { const uptr BatchGroupBase = - Max(batchGroupBase(CompactPtrBase, BG.GroupId), Region->RegionBeg); - const uptr BatchGroupEnd = - batchGroupBase(CompactPtrBase, BG.GroupId) + GroupSize; + decompactGroupBase(CompactPtrBase, BG.CompactPtrGroupBase); + const uptr BatchGroupEnd = BatchGroupBase + GroupSize; const uptr AllocatedGroupSize = AllocatedUserEnd >= BatchGroupEnd - ? BatchGroupEnd - BatchGroupBase + ? GroupSize : AllocatedUserEnd - BatchGroupBase; const uptr BatchGroupUsedEnd = BatchGroupBase + AllocatedGroupSize; + const bool MayContainLastBlockInRegion = + BatchGroupUsedEnd == AllocatedUserEnd; const bool BlockAlignedWithUsedEnd = (BatchGroupUsedEnd - Region->RegionBeg) % BlockSize == 0; @@ -972,84 +1562,174 @@ private: BG.Batches.front()->getCount(); if (NumBlocks == MaxContainedBlocks) { - for (const auto &It : BG.Batches) + for (const auto &It : BG.Batches) { + if (&It != BG.Batches.front()) + DCHECK_EQ(It.getCount(), BG.MaxCachedPerBatch); for (u16 I = 0; I < It.getCount(); ++I) - DCHECK_EQ(compactPtrGroup(It.get(I)), BG.GroupId); + DCHECK_EQ(compactPtrGroup(It.get(I)), BG.CompactPtrGroupBase); + } Context.markRangeAsAllCounted(BatchGroupBase, BatchGroupUsedEnd, - Region->RegionBeg); + Region->RegionBeg, /*RegionIndex=*/0, + Region->MemMapInfo.AllocatedUser); } else { DCHECK_LT(NumBlocks, MaxContainedBlocks); // Note that we don't always visit blocks in each BatchGroup so that we // may miss the chance of releasing certain pages that cross // BatchGroups. - Context.markFreeBlocks(BG.Batches, DecompactPtr, Region->RegionBeg); + Context.markFreeBlocksInRegion( + BG.Batches, DecompactPtr, Region->RegionBeg, /*RegionIndex=*/0, + Region->MemMapInfo.AllocatedUser, MayContainLastBlockInRegion); } } DCHECK(Context.hasBlockMarked()); - auto SkipRegion = [](UNUSED uptr RegionIndex) { return false; }; - releaseFreeMemoryToOS(Context, Recorder, SkipRegion); + return Context; + } - if (Recorder.getReleasedRangesCount() > 0) { - Region->ReleaseInfo.PushedBlocksAtLastRelease = - Region->Stats.PushedBlocks; - Region->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount(); - Region->ReleaseInfo.LastReleasedBytes = Recorder.getReleasedBytes(); - } - Region->ReleaseInfo.LastReleaseAtNs = getMonotonicTime(); - - // Merge GroupToRelease back to the Region::FreeList. Note that both - // `Region->FreeList` and `GroupToRelease` are sorted. - for (BatchGroup *BG = Region->FreeList.front(), *Prev = nullptr;;) { - if (BG == nullptr || GroupToRelease.empty()) { - if (!GroupToRelease.empty()) - Region->FreeList.append_back(&GroupToRelease); + void mergeGroupsToReleaseBack(RegionInfo *Region, + SinglyLinkedList<BatchGroupT> &GroupsToRelease) + REQUIRES(Region->MMLock) EXCLUDES(Region->FLLock) { + ScopedLock L(Region->FLLock); + + // After merging two freelists, we may have redundant `BatchGroup`s that + // need to be recycled. The number of unused `BatchGroup`s is expected to be + // small. Pick a constant which is inferred from real programs. 
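// (Illustrative reading of the recycling below: up to MaxUnusedSize compact pointers are staged in the local Blocks[] array; once one more BatchGroup, plus possibly an emptied TransferBatch, would no longer fit, the staged pointers are flushed to the BatchClass freelist via pushBatchClassBlocks() and the array is reused.)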
+ constexpr uptr MaxUnusedSize = 8; + CompactPtrT Blocks[MaxUnusedSize]; + u32 Idx = 0; + RegionInfo *BatchClassRegion = getRegionInfo(SizeClassMap::BatchClassId); + // We can't call pushBatchClassBlocks() to recycle the unused `BatchGroup`s + // when we are manipulating the freelist of `BatchClassRegion`. Instead, we + // should just push it back to the freelist when we merge two `BatchGroup`s. + // This logic hasn't been implemented because we haven't supported releasing + // pages in `BatchClassRegion`. + DCHECK_NE(BatchClassRegion, Region); + + // Merge GroupsToRelease back to the Region::FreeListInfo.BlockList. Note + // that both `Region->FreeListInfo.BlockList` and `GroupsToRelease` are + // sorted. + for (BatchGroupT *BG = Region->FreeListInfo.BlockList.front(), + *Prev = nullptr; + ;) { + if (BG == nullptr || GroupsToRelease.empty()) { + if (!GroupsToRelease.empty()) + Region->FreeListInfo.BlockList.append_back(&GroupsToRelease); break; } - DCHECK_NE(BG->GroupId, GroupToRelease.front()->GroupId); + DCHECK(!BG->Batches.empty()); + + if (BG->CompactPtrGroupBase < + GroupsToRelease.front()->CompactPtrGroupBase) { + Prev = BG; + BG = BG->Next; + continue; + } + + BatchGroupT *Cur = GroupsToRelease.front(); + TransferBatchT *UnusedTransferBatch = nullptr; + GroupsToRelease.pop_front(); + + if (BG->CompactPtrGroupBase == Cur->CompactPtrGroupBase) { + BG->PushedBlocks += Cur->PushedBlocks; + // We have updated `BatchGroup::BytesInBGAtLastCheckpoint` while + // collecting the `GroupsToRelease`. + BG->BytesInBGAtLastCheckpoint = Cur->BytesInBGAtLastCheckpoint; + const uptr MaxCachedPerBatch = BG->MaxCachedPerBatch; + + // Note that the first TransferBatches in both `Batches` may not be + // full and only the first TransferBatch can have non-full blocks. Thus + // we have to merge them before appending one to another. + if (Cur->Batches.front()->getCount() == MaxCachedPerBatch) { + BG->Batches.append_back(&Cur->Batches); + } else { + TransferBatchT *NonFullBatch = Cur->Batches.front(); + Cur->Batches.pop_front(); + const u16 NonFullBatchCount = NonFullBatch->getCount(); + // The remaining Batches in `Cur` are full. + BG->Batches.append_back(&Cur->Batches); + + if (BG->Batches.front()->getCount() == MaxCachedPerBatch) { + // Only 1 non-full TransferBatch, push it to the front. + BG->Batches.push_front(NonFullBatch); + } else { + const u16 NumBlocksToMove = static_cast<u16>( + Min(static_cast<u16>(MaxCachedPerBatch - + BG->Batches.front()->getCount()), + NonFullBatchCount)); + BG->Batches.front()->appendFromTransferBatch(NonFullBatch, + NumBlocksToMove); + if (NonFullBatch->isEmpty()) + UnusedTransferBatch = NonFullBatch; + else + BG->Batches.push_front(NonFullBatch); + } + } - if (BG->GroupId < GroupToRelease.front()->GroupId) { + const u32 NeededSlots = UnusedTransferBatch == nullptr ? 1U : 2U; + if (UNLIKELY(Idx + NeededSlots > MaxUnusedSize)) { + ScopedLock L(BatchClassRegion->FLLock); + pushBatchClassBlocks(BatchClassRegion, Blocks, Idx); + if (conditionVariableEnabled()) + BatchClassRegion->FLLockCV.notifyAll(BatchClassRegion->FLLock); + Idx = 0; + } + Blocks[Idx++] = + compactPtr(SizeClassMap::BatchClassId, reinterpret_cast<uptr>(Cur)); + if (UnusedTransferBatch) { + Blocks[Idx++] = + compactPtr(SizeClassMap::BatchClassId, + reinterpret_cast<uptr>(UnusedTransferBatch)); + } Prev = BG; BG = BG->Next; continue; } - // At here, the `BG` is the first BatchGroup with GroupId larger than the - // first element in `GroupToRelease`. 
We need to insert - // `GroupToRelease::front()` (which is `Cur` below) before `BG`. + // At here, the `BG` is the first BatchGroup with CompactPtrGroupBase + // larger than the first element in `GroupsToRelease`. We need to insert + // `GroupsToRelease::front()` (which is `Cur` below) before `BG`. // // 1. If `Prev` is nullptr, we simply push `Cur` to the front of - // FreeList. + // FreeListInfo.BlockList. // 2. Otherwise, use `insert()` which inserts an element next to `Prev`. // // Afterwards, we don't need to advance `BG` because the order between - // `BG` and the new `GroupToRelease::front()` hasn't been checked. - BatchGroup *Cur = GroupToRelease.front(); - GroupToRelease.pop_front(); + // `BG` and the new `GroupsToRelease::front()` hasn't been checked. if (Prev == nullptr) - Region->FreeList.push_front(Cur); + Region->FreeListInfo.BlockList.push_front(Cur); else - Region->FreeList.insert(Prev, Cur); + Region->FreeListInfo.BlockList.insert(Prev, Cur); DCHECK_EQ(Cur->Next, BG); Prev = Cur; } - DCHECK_EQ(Region->FreeList.size(), NumberOfBatchGroups); - (void)NumberOfBatchGroups; + if (Idx != 0) { + ScopedLock L(BatchClassRegion->FLLock); + pushBatchClassBlocks(BatchClassRegion, Blocks, Idx); + if (conditionVariableEnabled()) + BatchClassRegion->FLLockCV.notifyAll(BatchClassRegion->FLLock); + } if (SCUDO_DEBUG) { - BatchGroup *Prev = Region->FreeList.front(); - for (BatchGroup *Cur = Prev->Next; Cur != nullptr; + BatchGroupT *Prev = Region->FreeListInfo.BlockList.front(); + for (BatchGroupT *Cur = Prev->Next; Cur != nullptr; Prev = Cur, Cur = Cur->Next) { - CHECK_LT(Prev->GroupId, Cur->GroupId); + CHECK_LT(Prev->CompactPtrGroupBase, Cur->CompactPtrGroupBase); } } - return Recorder.getReleasedBytes(); + if (conditionVariableEnabled()) + Region->FLLockCV.notifyAll(Region->FLLock); } + + // The minimum size of pushed blocks that we will try to release the pages in + // that size class. + uptr SmallerBlockReleasePageDelta = 0; + atomic_s32 ReleaseToOsIntervalMs = {}; + alignas(SCUDO_CACHE_LINE_SIZE) RegionInfo RegionInfoArray[NumClasses]; }; } // namespace scudo diff --git a/standalone/quarantine.h b/standalone/quarantine.h index e65a733ced7..b5f8db0e87c 100644 --- a/standalone/quarantine.h +++ b/standalone/quarantine.h @@ -192,6 +192,12 @@ public: uptr getMaxSize() const { return atomic_load_relaxed(&MaxSize); } uptr getCacheSize() const { return atomic_load_relaxed(&MaxCacheSize); } + // This is supposed to be used in test only. 
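// For example (illustrative, assuming the existing drainAndRecycle() helper and gtest macros), a test can drain the quarantine and then assert: Quarantine.drainAndRecycle(&Cache, Cb); EXPECT_TRUE(Quarantine.isEmpty());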
+ bool isEmpty() { + ScopedLock L(CacheMutex); + return Cache.getSize() == 0U; + } + void put(CacheT *C, Callback Cb, Node *Ptr, uptr Size) { C->enqueue(Cb, Ptr, Size); if (C->getSize() > getCacheSize()) diff --git a/standalone/release.cpp b/standalone/release.cpp index 3f40dbec6d7..875a2b0c1c5 100644 --- a/standalone/release.cpp +++ b/standalone/release.cpp @@ -10,7 +10,8 @@ namespace scudo { -HybridMutex RegionPageMap::Mutex = {}; -uptr RegionPageMap::StaticBuffer[RegionPageMap::StaticBufferCount]; +BufferPool<RegionPageMap::StaticBufferCount, + RegionPageMap::StaticBufferNumElements> + RegionPageMap::Buffers; } // namespace scudo diff --git a/standalone/release.h b/standalone/release.h index d29d1c1f53f..b6f76a4d205 100644 --- a/standalone/release.h +++ b/standalone/release.h @@ -11,15 +11,46 @@ #include "common.h" #include "list.h" +#include "mem_map.h" #include "mutex.h" #include "thread_annotations.h" namespace scudo { +template <typename MemMapT> class RegionReleaseRecorder { +public: + RegionReleaseRecorder(MemMapT *RegionMemMap, uptr Base, uptr Offset = 0) + : RegionMemMap(RegionMemMap), Base(Base), Offset(Offset) {} + + uptr getReleasedRangesCount() const { return ReleasedRangesCount; } + + uptr getReleasedBytes() const { return ReleasedBytes; } + + uptr getBase() const { return Base; } + + // Releases [From, To) range of pages back to OS. Note that `From` and `To` + // are offsets from `Base` + Offset. + void releasePageRangeToOS(uptr From, uptr To) { + const uptr Size = To - From; + RegionMemMap->releasePagesToOS(getBase() + Offset + From, Size); + ReleasedRangesCount++; + ReleasedBytes += Size; + } + +private: + uptr ReleasedRangesCount = 0; + uptr ReleasedBytes = 0; + MemMapT *RegionMemMap = nullptr; + uptr Base = 0; + // The release offset from Base. This is used when we know a given range after + // Base will not be released. + uptr Offset = 0; +}; + class ReleaseRecorder { public: - ReleaseRecorder(uptr Base, MapPlatformData *Data = nullptr) - : Base(Base), Data(Data) {} + ReleaseRecorder(uptr Base, uptr Offset = 0, MapPlatformData *Data = nullptr) + : Base(Base), Offset(Offset), Data(Data) {} uptr getReleasedRangesCount() const { return ReleasedRangesCount; } @@ -30,7 +61,7 @@ public: // Releases [From, To) range of pages back to OS. void releasePageRangeToOS(uptr From, uptr To) { const uptr Size = To - From; - releasePagesToOS(Base, From, Size, Data); + releasePagesToOS(Base, From + Offset, Size, Data); ReleasedRangesCount++; ReleasedBytes += Size; } @@ -38,10 +69,129 @@ public: private: uptr ReleasedRangesCount = 0; uptr ReleasedBytes = 0; + // The starting address to release. Note that we may want to combine (Base + + // Offset) as a new Base. However, the Base is retrieved from + // `MapPlatformData` on Fuchsia, which would not be aware of the offset. + // Therefore, store them separately to make it work on all the platforms. uptr Base = 0; + // The release offset from Base. This is used when we know a given range after + // Base will not be released. 
+ uptr Offset = 0; MapPlatformData *Data = nullptr; }; +class FragmentationRecorder { +public: + FragmentationRecorder() = default; + + uptr getReleasedPagesCount() const { return ReleasedPagesCount; } + + void releasePageRangeToOS(uptr From, uptr To) { + DCHECK_EQ((To - From) % getPageSizeCached(), 0U); + ReleasedPagesCount += (To - From) / getPageSizeCached(); + } + +private: + uptr ReleasedPagesCount = 0; +}; + +// A buffer pool which holds a fixed number of static buffers of `uptr` elements +// for fast buffer allocation. If the request size is greater than +// `StaticBufferNumElements` or if all the static buffers are in use, it'll +// delegate the allocation to map(). +template <uptr StaticBufferCount, uptr StaticBufferNumElements> +class BufferPool { +public: + // Preserve 1 bit in the `Mask` so that we don't need to do zero-check while + // extracting the least significant bit from the `Mask`. + static_assert(StaticBufferCount < SCUDO_WORDSIZE, ""); + static_assert(isAligned(StaticBufferNumElements * sizeof(uptr), + SCUDO_CACHE_LINE_SIZE), + ""); + + struct Buffer { + // Pointer to the buffer's memory, or nullptr if no buffer was allocated. + uptr *Data = nullptr; + + // The index of the underlying static buffer, or StaticBufferCount if this + // buffer was dynamically allocated. This value is initially set to a poison + // value to aid debugging. + uptr BufferIndex = ~static_cast<uptr>(0); + + // Only valid if BufferIndex == StaticBufferCount. + MemMapT MemMap = {}; + }; + + // Return a zero-initialized buffer which can contain at least the given + // number of elements, or a Buffer with null Data on failure. + Buffer getBuffer(const uptr NumElements) { + if (UNLIKELY(NumElements > StaticBufferNumElements)) + return getDynamicBuffer(NumElements); + + uptr index; + { + // TODO: In general, we expect this operation to be fast, so the + // waiting thread won't be put to sleep. The HybridMutex does implement + // busy-waiting, but we may want to review the performance and see if + // we need an explicit spin lock here. + ScopedLock L(Mutex); + index = getLeastSignificantSetBitIndex(Mask); + if (index < StaticBufferCount) + Mask ^= static_cast<uptr>(1) << index; + } + + if (index >= StaticBufferCount) + return getDynamicBuffer(NumElements); + + Buffer Buf; + Buf.Data = &RawBuffer[index * StaticBufferNumElements]; + Buf.BufferIndex = index; + memset(Buf.Data, 0, StaticBufferNumElements * sizeof(uptr)); + return Buf; + } + + void releaseBuffer(Buffer Buf) { + DCHECK_NE(Buf.Data, nullptr); + DCHECK_LE(Buf.BufferIndex, StaticBufferCount); + if (Buf.BufferIndex != StaticBufferCount) { + ScopedLock L(Mutex); + DCHECK_EQ((Mask & (static_cast<uptr>(1) << Buf.BufferIndex)), 0U); + Mask |= static_cast<uptr>(1) << Buf.BufferIndex; + } else { + Buf.MemMap.unmap(Buf.MemMap.getBase(), Buf.MemMap.getCapacity()); + } + } + + bool isStaticBufferTestOnly(const Buffer &Buf) { + DCHECK_NE(Buf.Data, nullptr); + DCHECK_LE(Buf.BufferIndex, StaticBufferCount); + return Buf.BufferIndex != StaticBufferCount; + } + +private: + Buffer getDynamicBuffer(const uptr NumElements) { + // When using a heap-based buffer, precommit the pages backing the + // Vmar by passing |MAP_PRECOMMIT| flag. This allows an optimization + // where page fault exceptions are skipped as the allocated memory + // is accessed. So far, this is only enabled on Fuchsia. It hasn't proven a + // performance benefit on other platforms. + const uptr MmapFlags = MAP_ALLOWNOMEM | (SCUDO_FUCHSIA ? 
MAP_PRECOMMIT : 0); + const uptr MappedSize = + roundUp(NumElements * sizeof(uptr), getPageSizeCached()); + Buffer Buf; + if (Buf.MemMap.map(/*Addr=*/0, MappedSize, "scudo:counters", MmapFlags)) { + Buf.Data = reinterpret_cast<uptr *>(Buf.MemMap.getBase()); + Buf.BufferIndex = StaticBufferCount; + } + return Buf; + } + + HybridMutex Mutex; + // '1' means that buffer index is not used. '0' means the buffer is in use. + uptr Mask GUARDED_BY(Mutex) = ~static_cast<uptr>(0); + uptr RawBuffer[StaticBufferCount * StaticBufferNumElements] GUARDED_BY(Mutex); +}; + // A Region page map is used to record the usage of pages in the regions. It // implements a packed array of Counters. Each counter occupies 2^N bits, enough // to store counter's MaxValue. Ctor will try to use a static buffer first, and @@ -54,35 +204,24 @@ private: class RegionPageMap { public: RegionPageMap() - : Regions(0), - NumCounters(0), - CounterSizeBitsLog(0), - CounterMask(0), - PackingRatioLog(0), - BitOffsetMask(0), - SizePerRegion(0), - BufferSize(0), - Buffer(nullptr) {} + : Regions(0), NumCounters(0), CounterSizeBitsLog(0), CounterMask(0), + PackingRatioLog(0), BitOffsetMask(0), SizePerRegion(0), + BufferNumElements(0) {} RegionPageMap(uptr NumberOfRegions, uptr CountersPerRegion, uptr MaxValue) { reset(NumberOfRegions, CountersPerRegion, MaxValue); } ~RegionPageMap() { if (!isAllocated()) return; - if (Buffer == &StaticBuffer[0]) - Mutex.unlock(); - else - unmap(reinterpret_cast<void *>(Buffer), - roundUp(BufferSize, getPageSizeCached())); - Buffer = nullptr; + Buffers.releaseBuffer(Buffer); + Buffer = {}; } // Lock of `StaticBuffer` is acquired conditionally and there's no easy way to // specify the thread-safety attribute properly in current code structure. // Besides, it's the only place we may want to check thread safety. Therefore, // it's fine to bypass the thread-safety analysis now. - void reset(uptr NumberOfRegion, uptr CountersPerRegion, - uptr MaxValue) NO_THREAD_SAFETY_ANALYSIS { + void reset(uptr NumberOfRegion, uptr CountersPerRegion, uptr MaxValue) { DCHECK_GT(NumberOfRegion, 0); DCHECK_GT(CountersPerRegion, 0); DCHECK_GT(MaxValue, 0); @@ -90,7 +229,7 @@ public: Regions = NumberOfRegion; NumCounters = CountersPerRegion; - constexpr uptr MaxCounterBits = sizeof(*Buffer) * 8UL; + constexpr uptr MaxCounterBits = sizeof(*Buffer.Data) * 8UL; // Rounding counter storage size up to the power of two allows for using // bit shifts calculating particular counter's Index and offset. const uptr CounterSizeBits = @@ -107,25 +246,11 @@ public: SizePerRegion = roundUp(NumCounters, static_cast<uptr>(1U) << PackingRatioLog) >> PackingRatioLog; - BufferSize = SizePerRegion * sizeof(*Buffer) * Regions; - if (BufferSize <= (StaticBufferCount * sizeof(Buffer[0])) && - Mutex.tryLock()) { - Buffer = &StaticBuffer[0]; - memset(Buffer, 0, BufferSize); - } else { - // When using a heap-based buffer, precommit the pages backing the - // Vmar by passing |MAP_PRECOMMIT| flag. This allows an optimization - // where page fault exceptions are skipped as the allocated memory - // is accessed. - const uptr MmapFlags = - MAP_ALLOWNOMEM | (SCUDO_FUCHSIA ? 
MAP_PRECOMMIT : 0); - Buffer = reinterpret_cast<uptr *>( - map(nullptr, roundUp(BufferSize, getPageSizeCached()), - "scudo:counters", MmapFlags, &MapData)); - } + BufferNumElements = SizePerRegion * Regions; + Buffer = Buffers.getBuffer(BufferNumElements); } - bool isAllocated() const { return !!Buffer; } + bool isAllocated() const { return Buffer.Data != nullptr; } uptr getCount() const { return NumCounters; } @@ -134,7 +259,8 @@ public: DCHECK_LT(I, NumCounters); const uptr Index = I >> PackingRatioLog; const uptr BitOffset = (I & BitOffsetMask) << CounterSizeBitsLog; - return (Buffer[Region * SizePerRegion + Index] >> BitOffset) & CounterMask; + return (Buffer.Data[Region * SizePerRegion + Index] >> BitOffset) & + CounterMask; } void inc(uptr Region, uptr I) const { @@ -143,8 +269,8 @@ public: const uptr BitOffset = (I & BitOffsetMask) << CounterSizeBitsLog; DCHECK_LT(BitOffset, SCUDO_WORDSIZE); DCHECK_EQ(isAllCounted(Region, I), false); - Buffer[Region * SizePerRegion + Index] += static_cast<uptr>(1U) - << BitOffset; + Buffer.Data[Region * SizePerRegion + Index] += static_cast<uptr>(1U) + << BitOffset; } void incN(uptr Region, uptr I, uptr N) const { @@ -155,7 +281,7 @@ public: const uptr BitOffset = (I & BitOffsetMask) << CounterSizeBitsLog; DCHECK_LT(BitOffset, SCUDO_WORDSIZE); DCHECK_EQ(isAllCounted(Region, I), false); - Buffer[Region * SizePerRegion + Index] += N << BitOffset; + Buffer.Data[Region * SizePerRegion + Index] += N << BitOffset; } void incRange(uptr Region, uptr From, uptr To) const { @@ -174,7 +300,7 @@ public: const uptr Index = I >> PackingRatioLog; const uptr BitOffset = (I & BitOffsetMask) << CounterSizeBitsLog; DCHECK_LT(BitOffset, SCUDO_WORDSIZE); - Buffer[Region * SizePerRegion + Index] |= CounterMask << BitOffset; + Buffer.Data[Region * SizePerRegion + Index] |= CounterMask << BitOffset; } void setAsAllCountedRange(uptr Region, uptr From, uptr To) const { DCHECK_LE(From, To); @@ -197,11 +323,16 @@ public: return get(Region, I) == CounterMask; } - uptr getBufferSize() const { return BufferSize; } - - static const uptr StaticBufferCount = 2048U; + uptr getBufferNumElements() const { return BufferNumElements; } private: + // We may consider making this configurable if there are cases which may + // benefit from this. 
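// With the defaults below (illustrative sizing, assuming a 64-bit uptr): each static buffer spans 512 * 8 = 4096 bytes, so the pool reserves 2 * 4 KB of scratch space up front; larger requests, or requests arriving while both static buffers are in use, fall back to map().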
+ static const uptr StaticBufferCount = 2U; + static const uptr StaticBufferNumElements = 512U; + using BufferPoolT = BufferPool<StaticBufferCount, StaticBufferNumElements>; + static BufferPoolT Buffers; + uptr Regions; uptr NumCounters; uptr CounterSizeBitsLog; @@ -210,12 +341,8 @@ private: uptr BitOffsetMask; uptr SizePerRegion; - uptr BufferSize; - uptr *Buffer; - [[no_unique_address]] MapPlatformData MapData = {}; - - static HybridMutex Mutex; - static uptr StaticBuffer[StaticBufferCount] GUARDED_BY(Mutex); + uptr BufferNumElements; + BufferPoolT::Buffer Buffer; }; template <class ReleaseRecorderT> class FreePagesRangeTracker { @@ -259,10 +386,9 @@ private: }; struct PageReleaseContext { - PageReleaseContext(uptr BlockSize, uptr RegionSize, uptr NumberOfRegions, - uptr ReleaseSize, uptr ReleaseOffset = 0) - : BlockSize(BlockSize), RegionSize(RegionSize), - NumberOfRegions(NumberOfRegions) { + PageReleaseContext(uptr BlockSize, uptr NumberOfRegions, uptr ReleaseSize, + uptr ReleaseOffset = 0) + : BlockSize(BlockSize), NumberOfRegions(NumberOfRegions) { PageSize = getPageSizeCached(); if (BlockSize <= PageSize) { if (PageSize % BlockSize == 0) { @@ -298,15 +424,11 @@ struct PageReleaseContext { // region marking (which includes the complexity of how to handle the last // block in a region). We may consider this after markFreeBlocks() accepts // only free blocks from the same region. - if (NumberOfRegions != 1) { - DCHECK_EQ(ReleaseSize, RegionSize); + if (NumberOfRegions != 1) DCHECK_EQ(ReleaseOffset, 0U); - } PagesCount = roundUp(ReleaseSize, PageSize) / PageSize; PageSizeLog = getLog2(PageSize); - RoundedRegionSize = roundUp(RegionSize, PageSize); - RoundedSize = NumberOfRegions * RoundedRegionSize; ReleasePageOffset = ReleaseOffset >> PageSizeLog; } @@ -315,41 +437,35 @@ struct PageReleaseContext { return PageMap.isAllocated(); } - void ensurePageMapAllocated() { + bool ensurePageMapAllocated() { if (PageMap.isAllocated()) - return; + return true; PageMap.reset(NumberOfRegions, PagesCount, FullPagesBlockCountMax); - DCHECK(PageMap.isAllocated()); + // TODO: Log some message when we fail on PageMap allocation. + return PageMap.isAllocated(); } // Mark all the blocks in the given range [From, To). Instead of visiting all // the blocks, we will just mark the page as all counted. Note that `From` and // `To` have to be page aligned, with one exception: if `To` is equal to the // RegionSize, it does not need to be page aligned. - void markRangeAsAllCounted(uptr From, uptr To, uptr Base) { + bool markRangeAsAllCounted(uptr From, uptr To, uptr Base, + const uptr RegionIndex, const uptr RegionSize) { DCHECK_LT(From, To); + DCHECK_LE(To, Base + RegionSize); DCHECK_EQ(From % PageSize, 0U); + DCHECK_LE(To - From, RegionSize); - ensurePageMapAllocated(); - - const uptr FromOffset = From - Base; - const uptr ToOffset = To - Base; - - const uptr RegionIndex = - NumberOfRegions == 1U ? 0 : FromOffset / RegionSize; - if (SCUDO_DEBUG) { - const uptr ToRegionIndex = - NumberOfRegions == 1U ? 0 : (ToOffset - 1) / RegionSize; - CHECK_EQ(RegionIndex, ToRegionIndex); - } + if (!ensurePageMapAllocated()) + return false; - uptr FromInRegion = FromOffset - RegionIndex * RegionSize; - uptr ToInRegion = ToOffset - RegionIndex * RegionSize; + uptr FromInRegion = From - Base; + uptr ToInRegion = To - Base; uptr FirstBlockInRange = roundUpSlow(FromInRegion, BlockSize); // The straddling block sits across the entire range. 
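// For instance (illustrative numbers): with BlockSize = 8192, FromInRegion = 4096 and ToInRegion = 8192, the first block boundary at or after 4096 is 8192, so the whole range lies inside a single block and there is nothing to mark as all counted; we simply return below.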
if (FirstBlockInRange >= ToInRegion) - return; + return true; // First block may not sit at the first page in the range, move // `FromInRegion` to the first block page. @@ -380,8 +496,9 @@ struct PageReleaseContext { } uptr LastBlockInRange = roundDownSlow(ToInRegion - 1, BlockSize); - if (LastBlockInRange < FromInRegion) - return; + + // Note that LastBlockInRange may be smaller than `FromInRegion` at this + // point because the range may contain only one block. // When the last block sits across `To`, we can't just mark the pages // occupied by the last block as all counted. Instead, we increment the @@ -415,25 +532,52 @@ struct PageReleaseContext { PageMap.setAsAllCountedRange(RegionIndex, getPageIndex(FromInRegion), getPageIndex(ToInRegion - 1)); } + + return true; } - template<class TransferBatchT, typename DecompactPtrT> - void markFreeBlocks(const IntrusiveList<TransferBatchT> &FreeList, - DecompactPtrT DecompactPtr, uptr Base) { - ensurePageMapAllocated(); - - const uptr LastBlockInRegion = ((RegionSize / BlockSize) - 1U) * BlockSize; - - // The last block in a region may not use the entire page, so if it's free, - // we mark the following "pretend" memory block(s) as free. - auto markLastBlock = [this, LastBlockInRegion](const uptr RegionIndex) { - uptr PInRegion = LastBlockInRegion + BlockSize; - while (PInRegion < RoundedRegionSize) { - PageMap.incRange(RegionIndex, getPageIndex(PInRegion), - getPageIndex(PInRegion + BlockSize - 1)); - PInRegion += BlockSize; + template <class TransferBatchT, typename DecompactPtrT> + bool markFreeBlocksInRegion(const IntrusiveList<TransferBatchT> &FreeList, + DecompactPtrT DecompactPtr, const uptr Base, + const uptr RegionIndex, const uptr RegionSize, + bool MayContainLastBlockInRegion) { + if (!ensurePageMapAllocated()) + return false; + + if (MayContainLastBlockInRegion) { + const uptr LastBlockInRegion = + ((RegionSize / BlockSize) - 1U) * BlockSize; + // The last block in a region may not use the entire page, so we mark the + // following "pretend" memory block(s) as free in advance. + // + // Region Boundary + // v + // -----+-----------------------+ + // | Last Page | <- Rounded Region Boundary + // -----+-----------------------+ + // |-----||- trailing blocks -| + // ^ + // last block + const uptr RoundedRegionSize = roundUp(RegionSize, PageSize); + const uptr TrailingBlockBase = LastBlockInRegion + BlockSize; + // If the difference between `RoundedRegionSize` and + // `TrailingBlockBase` is larger than a page, that implies the reported + // `RegionSize` may not be accurate. + DCHECK_LT(RoundedRegionSize - TrailingBlockBase, PageSize); + + // Only the last page touched by the last block needs to have the trailing + // blocks marked. Note that if the last "pretend" block straddles the boundary, + // we still have to count it in so that the logic of counting the number + // of blocks on a page is consistent. + uptr NumTrailingBlocks = + (roundUpSlow(RoundedRegionSize - TrailingBlockBase, BlockSize) + + BlockSize - 1) / + BlockSize; + if (NumTrailingBlocks > 0) { + PageMap.incN(RegionIndex, getPageIndex(TrailingBlockBase), + NumTrailingBlocks); } - }; + } // Iterate over free chunks and count how many free chunks affect each // allocated page. @@ -441,14 +585,9 @@ struct PageReleaseContext { // Each chunk affects one page only. for (const auto &It : FreeList) { for (u16 I = 0; I < It.getCount(); I++) { - const uptr P = DecompactPtr(It.get(I)) - Base; - if (P >= RoundedSize) - continue; - const uptr RegionIndex = NumberOfRegions == 1U ? 
0 : P / RegionSize; - const uptr PInRegion = P - RegionIndex * RegionSize; + const uptr PInRegion = DecompactPtr(It.get(I)) - Base; + DCHECK_LT(PInRegion, RegionSize); PageMap.inc(RegionIndex, getPageIndex(PInRegion)); - if (PInRegion == LastBlockInRegion) - markLastBlock(RegionIndex); } } } else { @@ -456,24 +595,20 @@ struct PageReleaseContext { DCHECK_GE(RegionSize, BlockSize); for (const auto &It : FreeList) { for (u16 I = 0; I < It.getCount(); I++) { - const uptr P = DecompactPtr(It.get(I)) - Base; - if (P >= RoundedSize) - continue; - const uptr RegionIndex = NumberOfRegions == 1U ? 0 : P / RegionSize; - uptr PInRegion = P - RegionIndex * RegionSize; + const uptr PInRegion = DecompactPtr(It.get(I)) - Base; PageMap.incRange(RegionIndex, getPageIndex(PInRegion), getPageIndex(PInRegion + BlockSize - 1)); - if (PInRegion == LastBlockInRegion) - markLastBlock(RegionIndex); } } } + + return true; } uptr getPageIndex(uptr P) { return (P >> PageSizeLog) - ReleasePageOffset; } + uptr getReleaseOffset() { return ReleasePageOffset << PageSizeLog; } uptr BlockSize; - uptr RegionSize; uptr NumberOfRegions; // For partial region marking, some pages in front are not needed to be // counted. @@ -481,8 +616,6 @@ struct PageReleaseContext { uptr PageSize; uptr PagesCount; uptr PageSizeLog; - uptr RoundedRegionSize; - uptr RoundedSize; uptr FullPagesBlockCountMax; bool SameBlockCountPerPage; RegionPageMap PageMap; @@ -563,21 +696,6 @@ releaseFreeMemoryToOS(PageReleaseContext &Context, RangeTracker.finish(); } -// An overload releaseFreeMemoryToOS which doesn't require the page usage -// information after releasing. -template <class TransferBatchT, class ReleaseRecorderT, typename DecompactPtrT, - typename SkipRegionT> -NOINLINE void -releaseFreeMemoryToOS(const IntrusiveList<TransferBatchT> &FreeList, - uptr RegionSize, uptr NumberOfRegions, uptr BlockSize, - ReleaseRecorderT &Recorder, DecompactPtrT DecompactPtr, - SkipRegionT SkipRegion) { - PageReleaseContext Context(BlockSize, /*ReleaseSize=*/RegionSize, RegionSize, - NumberOfRegions); - Context.markFreeBlocks(FreeList, DecompactPtr, Recorder.getBase()); - releaseFreeMemoryToOS(Context, Recorder, SkipRegion); -} - } // namespace scudo #endif // SCUDO_RELEASE_H_ diff --git a/standalone/report.cpp b/standalone/report.cpp index 16eae8c3136..9cef0adc0bb 100644 --- a/standalone/report.cpp +++ b/standalone/report.cpp @@ -21,14 +21,10 @@ public: void append(const char *Format, ...) { va_list Args; va_start(Args, Format); - Message.append(Format, Args); + Message.vappend(Format, Args); va_end(Args); } - NORETURN ~ScopedErrorReport() { - outputRaw(Message.data()); - setAbortMessage(Message.data()); - die(); - } + NORETURN ~ScopedErrorReport() { reportRawError(Message.data()); } private: ScopedString Message; @@ -36,18 +32,6 @@ private: inline void NORETURN trap() { __builtin_trap(); } -void NORETURN reportSoftRSSLimit(uptr RssLimitMb) { - ScopedErrorReport Report; - Report.append("Soft RSS limit of %zu MB exhausted, current RSS is %zu MB\n", - RssLimitMb, GetRSS() >> 20); -} - -void NORETURN reportHardRSSLimit(uptr RssLimitMb) { - ScopedErrorReport Report; - Report.append("Hard RSS limit of %zu MB exhausted, current RSS is %zu MB\n", - RssLimitMb, GetRSS() >> 20); -} - // This could potentially be called recursively if a CHECK fails in the reports. 
void NORETURN reportCheckFailed(const char *File, int Line, const char *Condition, u64 Value1, u64 Value2) { @@ -67,6 +51,13 @@ void NORETURN reportError(const char *Message) { Report.append("%s\n", Message); } +// Generic fatal error message without ScopedString. +void NORETURN reportRawError(const char *Message) { + outputRaw(Message); + setAbortMessage(Message); + die(); +} + void NORETURN reportInvalidFlag(const char *FlagType, const char *Value) { ScopedErrorReport Report; Report.append("invalid value for %s option: '%s'\n", FlagType, Value); @@ -79,14 +70,6 @@ void NORETURN reportHeaderCorruption(void *Ptr) { Report.append("corrupted chunk header at address %p\n", Ptr); } -// Two threads have attempted to modify a chunk header at the same time. This is -// symptomatic of a race-condition in the application code, or general lack of -// proper locking. -void NORETURN reportHeaderRace(void *Ptr) { - ScopedErrorReport Report; - Report.append("race on chunk header at address %p\n", Ptr); -} - // The allocator was compiled with parameters that conflict with field size // requirements. void NORETURN reportSanityCheckError(const char *Field) { diff --git a/standalone/report.h b/standalone/report.h index 3a78ab64b13..a510fdaebb6 100644 --- a/standalone/report.h +++ b/standalone/report.h @@ -15,15 +15,17 @@ namespace scudo { // Reports are *fatal* unless stated otherwise. -// Generic error. +// Generic error, adds newline to end of message. void NORETURN reportError(const char *Message); +// Generic error, but the message is not modified. +void NORETURN reportRawError(const char *Message); + // Flags related errors. void NORETURN reportInvalidFlag(const char *FlagType, const char *Value); // Chunk header related errors. void NORETURN reportHeaderCorruption(void *Ptr); -void NORETURN reportHeaderRace(void *Ptr); // Sanity checks related error. void NORETURN reportSanityCheckError(const char *Field); @@ -34,8 +36,6 @@ void NORETURN reportAllocationSizeTooBig(uptr UserSize, uptr TotalSize, uptr MaxSize); void NORETURN reportOutOfBatchClass(); void NORETURN reportOutOfMemory(uptr RequestedSize); -void NORETURN reportSoftRSSLimit(uptr RssLimitMb); -void NORETURN reportHardRSSLimit(uptr RssLimitMb); enum class AllocatorAction : u8 { Recycling, Deallocating, diff --git a/standalone/report_linux.cpp b/standalone/report_linux.cpp new file mode 100644 index 00000000000..432f6a01696 --- /dev/null +++ b/standalone/report_linux.cpp @@ -0,0 +1,55 @@ +//===-- report_linux.cpp ----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "platform.h" + +#if SCUDO_LINUX || SCUDO_TRUSTY + +#include "common.h" +#include "internal_defs.h" +#include "report.h" +#include "report_linux.h" +#include "string_utils.h" + +#include <errno.h> +#include <stdlib.h> +#include <string.h> + +namespace scudo { + +// Fatal internal map() error (potentially OOM related). 
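// Example message (illustrative, assuming errno == ENOMEM and a 2 MB request): "Scudo ERROR: internal map failure (error desc=Cannot allocate memory) requesting 2048KB"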
+void NORETURN reportMapError(uptr SizeIfOOM) { + ScopedString Error; + Error.append("Scudo ERROR: internal map failure (error desc=%s)", + strerror(errno)); + if (SizeIfOOM) + Error.append(" requesting %zuKB", SizeIfOOM >> 10); + Error.append("\n"); + reportRawError(Error.data()); +} + +void NORETURN reportUnmapError(uptr Addr, uptr Size) { + ScopedString Error; + Error.append("Scudo ERROR: internal unmap failure (error desc=%s) Addr 0x%zx " + "Size %zu\n", + strerror(errno), Addr, Size); + reportRawError(Error.data()); +} + +void NORETURN reportProtectError(uptr Addr, uptr Size, int Prot) { + ScopedString Error; + Error.append( + "Scudo ERROR: internal protect failure (error desc=%s) Addr 0x%zx " + "Size %zu Prot %x\n", + strerror(errno), Addr, Size, Prot); + reportRawError(Error.data()); +} + +} // namespace scudo + +#endif // SCUDO_LINUX || SCUDO_TRUSTY diff --git a/standalone/report_linux.h b/standalone/report_linux.h new file mode 100644 index 00000000000..aa0bb247e67 --- /dev/null +++ b/standalone/report_linux.h @@ -0,0 +1,34 @@ +//===-- report_linux.h ------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SCUDO_REPORT_LINUX_H_ +#define SCUDO_REPORT_LINUX_H_ + +#include "platform.h" + +#if SCUDO_LINUX || SCUDO_TRUSTY + +#include "internal_defs.h" + +namespace scudo { + +// Report a fatal error when a map call fails. SizeIfOOM shall +// hold the requested size on an out-of-memory error, 0 otherwise. +void NORETURN reportMapError(uptr SizeIfOOM = 0); + +// Report a fatal error when an unmap call fails. +void NORETURN reportUnmapError(uptr Addr, uptr Size); + +// Report a fatal error when a mprotect call fails. +void NORETURN reportProtectError(uptr Addr, uptr Size, int Prot); + +} // namespace scudo + +#endif // SCUDO_LINUX || SCUDO_TRUSTY + +#endif // SCUDO_REPORT_LINUX_H_ diff --git a/standalone/rss_limit_checker.cpp b/standalone/rss_limit_checker.cpp deleted file mode 100644 index f428386b755..00000000000 --- a/standalone/rss_limit_checker.cpp +++ /dev/null @@ -1,37 +0,0 @@ -//===-- common.cpp ----------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "rss_limit_checker.h" -#include "atomic_helpers.h" -#include "string_utils.h" - -namespace scudo { - -void RssLimitChecker::check(u64 NextCheck) { - // The interval for the checks is 250ms. - static constexpr u64 CheckInterval = 250 * 1000000; - - // Early return in case another thread already did the calculation. 
- if (!atomic_compare_exchange_strong(&RssNextCheckAtNS, &NextCheck, - getMonotonicTime() + CheckInterval, - memory_order_relaxed)) { - return; - } - - const uptr CurrentRssMb = GetRSS() >> 20; - - RssLimitExceeded Result = RssLimitExceeded::Neither; - if (UNLIKELY(HardRssLimitMb && HardRssLimitMb < CurrentRssMb)) - Result = RssLimitExceeded::Hard; - else if (UNLIKELY(SoftRssLimitMb && SoftRssLimitMb < CurrentRssMb)) - Result = RssLimitExceeded::Soft; - - atomic_store_relaxed(&RssLimitStatus, static_cast<u8>(Result)); -} - -} // namespace scudo diff --git a/standalone/rss_limit_checker.h b/standalone/rss_limit_checker.h deleted file mode 100644 index 29dc063f3fc..00000000000 --- a/standalone/rss_limit_checker.h +++ /dev/null @@ -1,63 +0,0 @@ -//===-- common.h ------------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef SCUDO_RSS_LIMIT_CHECKER_H_ -#define SCUDO_RSS_LIMIT_CHECKER_H_ - -#include "atomic_helpers.h" -#include "common.h" -#include "internal_defs.h" - -namespace scudo { - -class RssLimitChecker { -public: - enum RssLimitExceeded { - Neither, - Soft, - Hard, - }; - - void init(int SoftRssLimitMb, int HardRssLimitMb) { - CHECK_GE(SoftRssLimitMb, 0); - CHECK_GE(HardRssLimitMb, 0); - this->SoftRssLimitMb = static_cast<uptr>(SoftRssLimitMb); - this->HardRssLimitMb = static_cast<uptr>(HardRssLimitMb); - } - - // Opportunistic RSS limit check. This will update the RSS limit status, if - // it can, every 250ms, otherwise it will just return the current one. - RssLimitExceeded getRssLimitExceeded() { - if (!HardRssLimitMb && !SoftRssLimitMb) - return RssLimitExceeded::Neither; - - u64 NextCheck = atomic_load_relaxed(&RssNextCheckAtNS); - u64 Now = getMonotonicTime(); - - if (UNLIKELY(Now >= NextCheck)) - check(NextCheck); - - return static_cast<RssLimitExceeded>(atomic_load_relaxed(&RssLimitStatus)); - } - - uptr getSoftRssLimit() const { return SoftRssLimitMb; } - uptr getHardRssLimit() const { return HardRssLimitMb; } - -private: - void check(u64 NextCheck); - - uptr SoftRssLimitMb = 0; - uptr HardRssLimitMb = 0; - - atomic_u64 RssNextCheckAtNS = {}; - atomic_u8 RssLimitStatus = {}; -}; - -} // namespace scudo - -#endif // SCUDO_RSS_LIMIT_CHECKER_H_ diff --git a/standalone/secondary.h b/standalone/secondary.h index b3128877222..d8c9f5bcfca 100644 --- a/standalone/secondary.h +++ b/standalone/secondary.h @@ -12,6 +12,7 @@ #include "chunk.h" #include "common.h" #include "list.h" +#include "mem_map.h" #include "memtag.h" #include "mutex.h" #include "options.h" @@ -37,9 +38,7 @@ struct alignas(Max<uptr>(archSupportsMemoryTagging() LargeBlock::Header *Next; uptr CommitBase; uptr CommitSize; - uptr MapBase; - uptr MapSize; - [[no_unique_address]] MapPlatformData Data; + MemMapT MemMap; }; static_assert(sizeof(Header) % (1U << SCUDO_MIN_ALIGNMENT_LOG) == 0, ""); @@ -65,16 +64,34 @@ template <typename Config> static Header *getHeader(const void *Ptr) { } // namespace LargeBlock -static void unmap(LargeBlock::Header *H) { - MapPlatformData Data = H->Data; - unmap(reinterpret_cast<void *>(H->MapBase), H->MapSize, UNMAP_ALL, &Data); +static inline void unmap(LargeBlock::Header *H) { + // Note that the `H->MemMap` is stored on the pages it manages. 
Take + // over the ownership before unmap() so that any operation along with unmap() + // won't touch inaccessible pages. + MemMapT MemMap = H->MemMap; + MemMap.unmap(MemMap.getBase(), MemMap.getCapacity()); } -class MapAllocatorNoCache { +namespace { +struct CachedBlock { + uptr CommitBase = 0; + uptr CommitSize = 0; + uptr BlockBegin = 0; + MemMapT MemMap = {}; + u64 Time = 0; + + bool isValid() { return CommitBase != 0; } + + void invalidate() { CommitBase = 0; } +}; +} // namespace + +template <typename Config> class MapAllocatorNoCache { public: void init(UNUSED s32 ReleaseToOsInterval) {} bool retrieve(UNUSED Options Options, UNUSED uptr Size, UNUSED uptr Alignment, - UNUSED LargeBlock::Header **H, UNUSED bool *Zeroed) { + UNUSED uptr HeadersSize, UNUSED LargeBlock::Header **H, + UNUSED bool *Zeroed) { return false; } void store(UNUSED Options Options, LargeBlock::Header *H) { unmap(H); } @@ -91,26 +108,53 @@ public: // Not supported by the Secondary Cache, but not an error either. return true; } + + void getStats(UNUSED ScopedString *Str) { + Str->append("Secondary Cache Disabled\n"); + } }; static const uptr MaxUnusedCachePages = 4U; template <typename Config> -void mapSecondary(Options Options, uptr CommitBase, uptr CommitSize, - uptr AllocPos, uptr Flags, MapPlatformData *Data) { - const uptr MaxUnusedCacheBytes = MaxUnusedCachePages * getPageSizeCached(); +bool mapSecondary(const Options &Options, uptr CommitBase, uptr CommitSize, + uptr AllocPos, uptr Flags, MemMapT &MemMap) { + Flags |= MAP_RESIZABLE; + Flags |= MAP_ALLOWNOMEM; + + const uptr PageSize = getPageSizeCached(); + if (SCUDO_TRUSTY) { + /* + * On Trusty we need AllocPos to be usable for shared memory, which cannot + * cross multiple mappings. This means we need to split around AllocPos + * and not over it. We can only do this if the address is page-aligned. + */ + const uptr TaggedSize = AllocPos - CommitBase; + if (useMemoryTagging<Config>(Options) && isAligned(TaggedSize, PageSize)) { + DCHECK_GT(TaggedSize, 0); + return MemMap.remap(CommitBase, TaggedSize, "scudo:secondary", + MAP_MEMTAG | Flags) && + MemMap.remap(AllocPos, CommitSize - TaggedSize, "scudo:secondary", + Flags); + } else { + const uptr RemapFlags = + (useMemoryTagging<Config>(Options) ? MAP_MEMTAG : 0) | Flags; + return MemMap.remap(CommitBase, CommitSize, "scudo:secondary", + RemapFlags); + } + } + + const uptr MaxUnusedCacheBytes = MaxUnusedCachePages * PageSize; if (useMemoryTagging<Config>(Options) && CommitSize > MaxUnusedCacheBytes) { const uptr UntaggedPos = Max(AllocPos, CommitBase + MaxUnusedCacheBytes); - map(reinterpret_cast<void *>(CommitBase), UntaggedPos - CommitBase, - "scudo:secondary", MAP_RESIZABLE | MAP_MEMTAG | Flags, Data); - map(reinterpret_cast<void *>(UntaggedPos), - CommitBase + CommitSize - UntaggedPos, "scudo:secondary", - MAP_RESIZABLE | Flags, Data); + return MemMap.remap(CommitBase, UntaggedPos - CommitBase, "scudo:secondary", + MAP_MEMTAG | Flags) && + MemMap.remap(UntaggedPos, CommitBase + CommitSize - UntaggedPos, + "scudo:secondary", Flags); } else { - map(reinterpret_cast<void *>(CommitBase), CommitSize, "scudo:secondary", - MAP_RESIZABLE | (useMemoryTagging<Config>(Options) ? MAP_MEMTAG : 0) | - Flags, - Data); + const uptr RemapFlags = + (useMemoryTagging<Config>(Options) ? 
MAP_MEMTAG : 0) | Flags; + return MemMap.remap(CommitBase, CommitSize, "scudo:secondary", RemapFlags); } } @@ -129,36 +173,62 @@ public: template <typename Config> class MapAllocatorCache { public: + void getStats(ScopedString *Str) { + ScopedLock L(Mutex); + uptr Integral; + uptr Fractional; + computePercentage(SuccessfulRetrieves, CallsToRetrieve, &Integral, + &Fractional); + const s32 Interval = atomic_load_relaxed(&ReleaseToOsIntervalMs); + Str->append( + "Stats: MapAllocatorCache: EntriesCount: %d, " + "MaxEntriesCount: %u, MaxEntrySize: %zu, ReleaseToOsIntervalMs = %d\n", + EntriesCount, atomic_load_relaxed(&MaxEntriesCount), + atomic_load_relaxed(&MaxEntrySize), Interval >= 0 ? Interval : -1); + Str->append("Stats: CacheRetrievalStats: SuccessRate: %u/%u " + "(%zu.%02zu%%)\n", + SuccessfulRetrieves, CallsToRetrieve, Integral, Fractional); + for (CachedBlock Entry : Entries) { + if (!Entry.isValid()) + continue; + Str->append("StartBlockAddress: 0x%zx, EndBlockAddress: 0x%zx, " + "BlockSize: %zu %s\n", + Entry.CommitBase, Entry.CommitBase + Entry.CommitSize, + Entry.CommitSize, Entry.Time == 0 ? "[R]" : ""); + } + } + // Ensure the default maximum specified fits the array. - static_assert(Config::SecondaryCacheDefaultMaxEntriesCount <= - Config::SecondaryCacheEntriesArraySize, + static_assert(Config::getDefaultMaxEntriesCount() <= + Config::getEntriesArraySize(), ""); void init(s32 ReleaseToOsInterval) NO_THREAD_SAFETY_ANALYSIS { DCHECK_EQ(EntriesCount, 0U); setOption(Option::MaxCacheEntriesCount, - static_cast<sptr>(Config::SecondaryCacheDefaultMaxEntriesCount)); + static_cast<sptr>(Config::getDefaultMaxEntriesCount())); setOption(Option::MaxCacheEntrySize, - static_cast<sptr>(Config::SecondaryCacheDefaultMaxEntrySize)); + static_cast<sptr>(Config::getDefaultMaxEntrySize())); + // The default value in the cache config has the higher priority. + if (Config::getDefaultReleaseToOsIntervalMs() != INT32_MIN) + ReleaseToOsInterval = Config::getDefaultReleaseToOsIntervalMs(); setOption(Option::ReleaseInterval, static_cast<sptr>(ReleaseToOsInterval)); } - void store(Options Options, LargeBlock::Header *H) EXCLUDES(Mutex) { + void store(const Options &Options, LargeBlock::Header *H) EXCLUDES(Mutex) { if (!canCache(H->CommitSize)) return unmap(H); bool EntryCached = false; bool EmptyCache = false; const s32 Interval = atomic_load_relaxed(&ReleaseToOsIntervalMs); - const u64 Time = getMonotonicTime(); + const u64 Time = getMonotonicTimeFast(); const u32 MaxCount = atomic_load_relaxed(&MaxEntriesCount); CachedBlock Entry; Entry.CommitBase = H->CommitBase; Entry.CommitSize = H->CommitSize; - Entry.MapBase = H->MapBase; - Entry.MapSize = H->MapSize; Entry.BlockBegin = reinterpret_cast<uptr>(H + 1); - Entry.Data = H->Data; + Entry.MemMap = H->MemMap; Entry.Time = Time; if (useMemoryTagging<Config>(Options)) { if (Interval == 0 && !SCUDO_FUCHSIA) { @@ -168,13 +238,13 @@ public: // on top so we just do the two syscalls there. 
Entry.Time = 0; mapSecondary<Config>(Options, Entry.CommitBase, Entry.CommitSize, - Entry.CommitBase, MAP_NOACCESS, &Entry.Data); + Entry.CommitBase, MAP_NOACCESS, Entry.MemMap); } else { - setMemoryPermission(Entry.CommitBase, Entry.CommitSize, MAP_NOACCESS, - &Entry.Data); + Entry.MemMap.setMemoryPermission(Entry.CommitBase, Entry.CommitSize, + MAP_NOACCESS); } } else if (Interval == 0) { - releasePagesToOS(Entry.CommitBase, 0, Entry.CommitSize, &Entry.Data); + Entry.MemMap.releaseAndZeroPagesToOS(Entry.CommitBase, Entry.CommitSize); Entry.Time = 0; } do { @@ -186,11 +256,10 @@ public: // just unmap it. break; } - if (Config::SecondaryCacheQuarantineSize && - useMemoryTagging<Config>(Options)) { + if (Config::getQuarantineSize() && useMemoryTagging<Config>(Options)) { QuarantinePos = - (QuarantinePos + 1) % Max(Config::SecondaryCacheQuarantineSize, 1u); - if (!Quarantine[QuarantinePos].CommitBase) { + (QuarantinePos + 1) % Max(Config::getQuarantineSize(), 1u); + if (!Quarantine[QuarantinePos].isValid()) { Quarantine[QuarantinePos] = Entry; return; } @@ -205,7 +274,7 @@ public: EmptyCache = true; } else { for (u32 I = 0; I < MaxCount; I++) { - if (Entries[I].CommitBase) + if (Entries[I].isValid()) continue; if (I != 0) Entries[I] = Entries[0]; @@ -223,30 +292,34 @@ public: else if (Interval >= 0) releaseOlderThan(Time - static_cast<u64>(Interval) * 1000000); if (!EntryCached) - unmap(reinterpret_cast<void *>(Entry.MapBase), Entry.MapSize, UNMAP_ALL, - &Entry.Data); + Entry.MemMap.unmap(Entry.MemMap.getBase(), Entry.MemMap.getCapacity()); } - bool retrieve(Options Options, uptr Size, uptr Alignment, + bool retrieve(Options Options, uptr Size, uptr Alignment, uptr HeadersSize, LargeBlock::Header **H, bool *Zeroed) EXCLUDES(Mutex) { const uptr PageSize = getPageSizeCached(); const u32 MaxCount = atomic_load_relaxed(&MaxEntriesCount); + // 10% of the requested size proved to be the optimal choice for + // retrieving cached blocks after testing several options. + constexpr u32 FragmentedBytesDivisor = 10; bool Found = false; CachedBlock Entry; - uptr HeaderPos = 0; + uptr EntryHeaderPos = 0; { ScopedLock L(Mutex); + CallsToRetrieve++; if (EntriesCount == 0) return false; + u32 OptimalFitIndex = 0; + uptr MinDiff = UINTPTR_MAX; for (u32 I = 0; I < MaxCount; I++) { - const uptr CommitBase = Entries[I].CommitBase; - if (!CommitBase) + if (!Entries[I].isValid()) continue; + const uptr CommitBase = Entries[I].CommitBase; const uptr CommitSize = Entries[I].CommitSize; const uptr AllocPos = roundDown(CommitBase + CommitSize - Size, Alignment); - HeaderPos = - AllocPos - Chunk::getHeaderSize() - LargeBlock::getHeaderSize(); + const uptr HeaderPos = AllocPos - HeadersSize; if (HeaderPos > CommitBase + CommitSize) continue; if (HeaderPos < CommitBase || @@ -254,38 +327,54 @@ public: continue; } Found = true; - Entry = Entries[I]; - Entries[I].CommitBase = 0; - break; + const uptr Diff = HeaderPos - CommitBase; + // immediately use a cached block if its size is close enough to the + // requested size. 
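// For instance (illustrative numbers): if an entry's span past the header, CommitBase + CommitSize - HeaderPos, is 64 KB, then up to roughly 6.4 KB of leading waste (Diff) is accepted immediately; otherwise the loop keeps scanning for the entry with the smallest Diff.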
+ const uptr MaxAllowedFragmentedBytes = + (CommitBase + CommitSize - HeaderPos) / FragmentedBytesDivisor; + if (Diff <= MaxAllowedFragmentedBytes) { + OptimalFitIndex = I; + EntryHeaderPos = HeaderPos; + break; + } + // keep track of the smallest cached block + // that is greater than (AllocSize + HeaderSize) + if (Diff > MinDiff) + continue; + OptimalFitIndex = I; + MinDiff = Diff; + EntryHeaderPos = HeaderPos; } - } - if (Found) { - *H = reinterpret_cast<LargeBlock::Header *>( - LargeBlock::addHeaderTag<Config>(HeaderPos)); - *Zeroed = Entry.Time == 0; - if (useMemoryTagging<Config>(Options)) - setMemoryPermission(Entry.CommitBase, Entry.CommitSize, 0, &Entry.Data); - uptr NewBlockBegin = reinterpret_cast<uptr>(*H + 1); - if (useMemoryTagging<Config>(Options)) { - if (*Zeroed) - storeTags(LargeBlock::addHeaderTag<Config>(Entry.CommitBase), - NewBlockBegin); - else if (Entry.BlockBegin < NewBlockBegin) - storeTags(Entry.BlockBegin, NewBlockBegin); - else - storeTags(untagPointer(NewBlockBegin), - untagPointer(Entry.BlockBegin)); + if (Found) { + Entry = Entries[OptimalFitIndex]; + Entries[OptimalFitIndex].invalidate(); + EntriesCount--; + SuccessfulRetrieves++; } - (*H)->CommitBase = Entry.CommitBase; - (*H)->CommitSize = Entry.CommitSize; - (*H)->MapBase = Entry.MapBase; - (*H)->MapSize = Entry.MapSize; - (*H)->Data = Entry.Data; + } + if (!Found) + return false; - ScopedLock L(Mutex); - EntriesCount--; + *H = reinterpret_cast<LargeBlock::Header *>( + LargeBlock::addHeaderTag<Config>(EntryHeaderPos)); + *Zeroed = Entry.Time == 0; + if (useMemoryTagging<Config>(Options)) + Entry.MemMap.setMemoryPermission(Entry.CommitBase, Entry.CommitSize, 0); + uptr NewBlockBegin = reinterpret_cast<uptr>(*H + 1); + if (useMemoryTagging<Config>(Options)) { + if (*Zeroed) { + storeTags(LargeBlock::addHeaderTag<Config>(Entry.CommitBase), + NewBlockBegin); + } else if (Entry.BlockBegin < NewBlockBegin) { + storeTags(Entry.BlockBegin, NewBlockBegin); + } else { + storeTags(untagPointer(NewBlockBegin), untagPointer(Entry.BlockBegin)); + } } - return Found; + (*H)->CommitBase = Entry.CommitBase; + (*H)->CommitSize = Entry.CommitSize; + (*H)->MemMap = Entry.MemMap; + return true; } bool canCache(uptr Size) { @@ -295,16 +384,15 @@ public: bool setOption(Option O, sptr Value) { if (O == Option::ReleaseInterval) { - const s32 Interval = - Max(Min(static_cast<s32>(Value), - Config::SecondaryCacheMaxReleaseToOsIntervalMs), - Config::SecondaryCacheMinReleaseToOsIntervalMs); + const s32 Interval = Max( + Min(static_cast<s32>(Value), Config::getMaxReleaseToOsIntervalMs()), + Config::getMinReleaseToOsIntervalMs()); atomic_store_relaxed(&ReleaseToOsIntervalMs, Interval); return true; } if (O == Option::MaxCacheEntriesCount) { const u32 MaxCount = static_cast<u32>(Value); - if (MaxCount > Config::SecondaryCacheEntriesArraySize) + if (MaxCount > Config::getEntriesArraySize()) return false; atomic_store_relaxed(&MaxEntriesCount, MaxCount); return true; @@ -321,18 +409,20 @@ public: void disableMemoryTagging() EXCLUDES(Mutex) { ScopedLock L(Mutex); - for (u32 I = 0; I != Config::SecondaryCacheQuarantineSize; ++I) { - if (Quarantine[I].CommitBase) { - unmap(reinterpret_cast<void *>(Quarantine[I].MapBase), - Quarantine[I].MapSize, UNMAP_ALL, &Quarantine[I].Data); - Quarantine[I].CommitBase = 0; + for (u32 I = 0; I != Config::getQuarantineSize(); ++I) { + if (Quarantine[I].isValid()) { + MemMapT &MemMap = Quarantine[I].MemMap; + MemMap.unmap(MemMap.getBase(), MemMap.getCapacity()); + Quarantine[I].invalidate(); } } const u32 
MaxCount = atomic_load_relaxed(&MaxEntriesCount); - for (u32 I = 0; I < MaxCount; I++) - if (Entries[I].CommitBase) - setMemoryPermission(Entries[I].CommitBase, Entries[I].CommitSize, 0, - &Entries[I].Data); + for (u32 I = 0; I < MaxCount; I++) { + if (Entries[I].isValid()) { + Entries[I].MemMap.setMemoryPermission(Entries[I].CommitBase, + Entries[I].CommitSize, 0); + } + } QuarantinePos = -1U; } @@ -344,50 +434,35 @@ public: private: void empty() { - struct { - void *MapBase; - uptr MapSize; - MapPlatformData Data; - } MapInfo[Config::SecondaryCacheEntriesArraySize]; + MemMapT MapInfo[Config::getEntriesArraySize()]; uptr N = 0; { ScopedLock L(Mutex); - for (uptr I = 0; I < Config::SecondaryCacheEntriesArraySize; I++) { - if (!Entries[I].CommitBase) + for (uptr I = 0; I < Config::getEntriesArraySize(); I++) { + if (!Entries[I].isValid()) continue; - MapInfo[N].MapBase = reinterpret_cast<void *>(Entries[I].MapBase); - MapInfo[N].MapSize = Entries[I].MapSize; - MapInfo[N].Data = Entries[I].Data; - Entries[I].CommitBase = 0; + MapInfo[N] = Entries[I].MemMap; + Entries[I].invalidate(); N++; } EntriesCount = 0; IsFullEvents = 0; } - for (uptr I = 0; I < N; I++) - unmap(MapInfo[I].MapBase, MapInfo[I].MapSize, UNMAP_ALL, - &MapInfo[I].Data); + for (uptr I = 0; I < N; I++) { + MemMapT &MemMap = MapInfo[I]; + MemMap.unmap(MemMap.getBase(), MemMap.getCapacity()); + } } - struct CachedBlock { - uptr CommitBase; - uptr CommitSize; - uptr MapBase; - uptr MapSize; - uptr BlockBegin; - [[no_unique_address]] MapPlatformData Data; - u64 Time; - }; - void releaseIfOlderThan(CachedBlock &Entry, u64 Time) REQUIRES(Mutex) { - if (!Entry.CommitBase || !Entry.Time) + if (!Entry.isValid() || !Entry.Time) return; if (Entry.Time > Time) { if (OldestTime == 0 || Entry.Time < OldestTime) OldestTime = Entry.Time; return; } - releasePagesToOS(Entry.CommitBase, 0, Entry.CommitSize, &Entry.Data); + Entry.MemMap.releaseAndZeroPagesToOS(Entry.CommitBase, Entry.CommitSize); Entry.Time = 0; } @@ -396,9 +471,9 @@ private: if (!EntriesCount || OldestTime == 0 || OldestTime > Time) return; OldestTime = 0; - for (uptr I = 0; I < Config::SecondaryCacheQuarantineSize; I++) + for (uptr I = 0; I < Config::getQuarantineSize(); I++) releaseIfOlderThan(Quarantine[I], Time); - for (uptr I = 0; I < Config::SecondaryCacheEntriesArraySize; I++) + for (uptr I = 0; I < Config::getEntriesArraySize(); I++) releaseIfOlderThan(Entries[I], Time); } @@ -410,10 +485,11 @@ private: u64 OldestTime GUARDED_BY(Mutex) = 0; u32 IsFullEvents GUARDED_BY(Mutex) = 0; atomic_s32 ReleaseToOsIntervalMs = {}; + u32 CallsToRetrieve GUARDED_BY(Mutex) = 0; + u32 SuccessfulRetrieves GUARDED_BY(Mutex) = 0; - CachedBlock - Entries[Config::SecondaryCacheEntriesArraySize] GUARDED_BY(Mutex) = {}; - NonZeroLengthArray<CachedBlock, Config::SecondaryCacheQuarantineSize> + CachedBlock Entries[Config::getEntriesArraySize()] GUARDED_BY(Mutex) = {}; + NonZeroLengthArray<CachedBlock, Config::getQuarantineSize()> Quarantine GUARDED_BY(Mutex) = {}; }; @@ -429,11 +505,11 @@ public: S->link(&Stats); } - void *allocate(Options Options, uptr Size, uptr AlignmentHint = 0, + void *allocate(const Options &Options, uptr Size, uptr AlignmentHint = 0, uptr *BlockEnd = nullptr, FillContentsMode FillContents = NoFill); - void deallocate(Options Options, void *Ptr); + void deallocate(const Options &Options, void *Ptr); static uptr getBlockEnd(void *Ptr) { auto *B = LargeBlock::getHeader<Config>(Ptr); @@ -444,7 +520,9 @@ public: return getBlockEnd(Ptr) - reinterpret_cast<uptr>(Ptr); } - void 
getStats(ScopedString *Str); + static constexpr uptr getHeadersSize() { + return Chunk::getHeaderSize() + LargeBlock::getHeaderSize(); + } void disable() NO_THREAD_SAFETY_ANALYSIS { Mutex.lock(); @@ -477,13 +555,16 @@ public: void unmapTestOnly() { Cache.unmapTestOnly(); } + void getStats(ScopedString *Str); + private: - typename Config::SecondaryCache Cache; + typename Config::template CacheT<typename Config::CacheConfig> Cache; mutable HybridMutex Mutex; DoublyLinkedList<LargeBlock::Header> InUseBlocks GUARDED_BY(Mutex); uptr AllocatedBytes GUARDED_BY(Mutex) = 0; uptr FreedBytes GUARDED_BY(Mutex) = 0; + uptr FragmentedBytes GUARDED_BY(Mutex) = 0; uptr LargestSize GUARDED_BY(Mutex) = 0; u32 NumberOfAllocs GUARDED_BY(Mutex) = 0; u32 NumberOfFrees GUARDED_BY(Mutex) = 0; @@ -502,24 +583,23 @@ private: // the committed memory will amount to something close to Size - AlignmentHint // (pending rounding and headers). template <typename Config> -void *MapAllocator<Config>::allocate(Options Options, uptr Size, uptr Alignment, - uptr *BlockEndPtr, +void *MapAllocator<Config>::allocate(const Options &Options, uptr Size, + uptr Alignment, uptr *BlockEndPtr, FillContentsMode FillContents) { if (Options.get(OptionBit::AddLargeAllocationSlack)) Size += 1UL << SCUDO_MIN_ALIGNMENT_LOG; Alignment = Max(Alignment, uptr(1U) << SCUDO_MIN_ALIGNMENT_LOG); const uptr PageSize = getPageSizeCached(); - uptr RoundedSize = - roundUp(roundUp(Size, Alignment) + LargeBlock::getHeaderSize() + - Chunk::getHeaderSize(), - PageSize); - if (Alignment > PageSize) - RoundedSize += Alignment - PageSize; - if (Alignment < PageSize && Cache.canCache(RoundedSize)) { + // Note that cached blocks may have aligned address already. Thus we simply + // pass the required size (`Size` + `getHeadersSize()`) to do cache look up. + const uptr MinNeededSizeForCache = roundUp(Size + getHeadersSize(), PageSize); + + if (Alignment < PageSize && Cache.canCache(MinNeededSizeForCache)) { LargeBlock::Header *H; bool Zeroed; - if (Cache.retrieve(Options, Size, Alignment, &H, &Zeroed)) { + if (Cache.retrieve(Options, Size, Alignment, getHeadersSize(), &H, + &Zeroed)) { const uptr BlockEnd = H->CommitBase + H->CommitSize; if (BlockEndPtr) *BlockEndPtr = BlockEnd; @@ -531,25 +611,35 @@ void *MapAllocator<Config>::allocate(Options Options, uptr Size, uptr Alignment, if (FillContents && !Zeroed) memset(Ptr, FillContents == ZeroFill ? 0 : PatternFillByte, BlockEnd - PtrInt); - const uptr BlockSize = BlockEnd - HInt; { ScopedLock L(Mutex); InUseBlocks.push_back(H); - AllocatedBytes += BlockSize; + AllocatedBytes += H->CommitSize; + FragmentedBytes += H->MemMap.getCapacity() - H->CommitSize; NumberOfAllocs++; - Stats.add(StatAllocated, BlockSize); - Stats.add(StatMapped, H->MapSize); + Stats.add(StatAllocated, H->CommitSize); + Stats.add(StatMapped, H->MemMap.getCapacity()); } return Ptr; } } - MapPlatformData Data = {}; + uptr RoundedSize = + roundUp(roundUp(Size, Alignment) + getHeadersSize(), PageSize); + if (Alignment > PageSize) + RoundedSize += Alignment - PageSize; + + ReservedMemoryT ReservedMemory; const uptr MapSize = RoundedSize + 2 * PageSize; - uptr MapBase = reinterpret_cast<uptr>( - map(nullptr, MapSize, nullptr, MAP_NOACCESS | MAP_ALLOWNOMEM, &Data)); - if (UNLIKELY(!MapBase)) + if (UNLIKELY(!ReservedMemory.create(/*Addr=*/0U, MapSize, nullptr, + MAP_ALLOWNOMEM))) { return nullptr; + } + + // Take the entire ownership of reserved region. 
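// A worked example of the size arithmetic used in MapAllocator<Config>::allocate():
// the formulas are the ones in this function, while the header sizes and page
// size below are placeholders (the real values are target-dependent).
//
//   getHeadersSize()      == Chunk::getHeaderSize() + LargeBlock::getHeaderSize()
//   MinNeededSizeForCache == roundUp(Size + getHeadersSize(), PageSize)
//   HeaderPos             == AllocPos - getHeadersSize()
//
// With a hypothetical 4 KiB page, a 16-byte Chunk header and a 48-byte
// LargeBlock header, a 100000-byte request gives:
//   getHeadersSize()      = 16 + 48               = 64
//   MinNeededSizeForCache = roundUp(100064, 4096) = 102400
// The cache lookup only needs this rounded size, since a cached block may
// already sit at a suitably aligned address.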
+ MemMapT MemMap = ReservedMemory.dispatch(ReservedMemory.getBase(), + ReservedMemory.getCapacity()); + uptr MapBase = MemMap.getBase(); uptr CommitBase = MapBase + PageSize; uptr MapEnd = MapBase + MapSize; @@ -565,50 +655,52 @@ void *MapAllocator<Config>::allocate(Options Options, uptr Size, uptr Alignment, // We only trim the extra memory on 32-bit platforms: 64-bit platforms // are less constrained memory wise, and that saves us two syscalls. if (SCUDO_WORDSIZE == 32U && NewMapBase != MapBase) { - unmap(reinterpret_cast<void *>(MapBase), NewMapBase - MapBase, 0, &Data); + MemMap.unmap(MapBase, NewMapBase - MapBase); MapBase = NewMapBase; } const uptr NewMapEnd = CommitBase + PageSize + roundUp(Size, PageSize) + PageSize; DCHECK_LE(NewMapEnd, MapEnd); if (SCUDO_WORDSIZE == 32U && NewMapEnd != MapEnd) { - unmap(reinterpret_cast<void *>(NewMapEnd), MapEnd - NewMapEnd, 0, &Data); + MemMap.unmap(NewMapEnd, MapEnd - NewMapEnd); MapEnd = NewMapEnd; } } const uptr CommitSize = MapEnd - PageSize - CommitBase; const uptr AllocPos = roundDown(CommitBase + CommitSize - Size, Alignment); - mapSecondary<Config>(Options, CommitBase, CommitSize, AllocPos, 0, &Data); - const uptr HeaderPos = - AllocPos - Chunk::getHeaderSize() - LargeBlock::getHeaderSize(); + if (!mapSecondary<Config>(Options, CommitBase, CommitSize, AllocPos, 0, + MemMap)) { + MemMap.unmap(MemMap.getBase(), MemMap.getCapacity()); + return nullptr; + } + const uptr HeaderPos = AllocPos - getHeadersSize(); LargeBlock::Header *H = reinterpret_cast<LargeBlock::Header *>( LargeBlock::addHeaderTag<Config>(HeaderPos)); if (useMemoryTagging<Config>(Options)) storeTags(LargeBlock::addHeaderTag<Config>(CommitBase), reinterpret_cast<uptr>(H + 1)); - H->MapBase = MapBase; - H->MapSize = MapEnd - MapBase; H->CommitBase = CommitBase; H->CommitSize = CommitSize; - H->Data = Data; + H->MemMap = MemMap; if (BlockEndPtr) *BlockEndPtr = CommitBase + CommitSize; { ScopedLock L(Mutex); InUseBlocks.push_back(H); AllocatedBytes += CommitSize; + FragmentedBytes += H->MemMap.getCapacity() - CommitSize; if (LargestSize < CommitSize) LargestSize = CommitSize; NumberOfAllocs++; Stats.add(StatAllocated, CommitSize); - Stats.add(StatMapped, H->MapSize); + Stats.add(StatMapped, H->MemMap.getCapacity()); } return reinterpret_cast<void *>(HeaderPos + LargeBlock::getHeaderSize()); } template <typename Config> -void MapAllocator<Config>::deallocate(Options Options, void *Ptr) +void MapAllocator<Config>::deallocate(const Options &Options, void *Ptr) EXCLUDES(Mutex) { LargeBlock::Header *H = LargeBlock::getHeader<Config>(Ptr); const uptr CommitSize = H->CommitSize; @@ -616,9 +708,10 @@ void MapAllocator<Config>::deallocate(Options Options, void *Ptr) ScopedLock L(Mutex); InUseBlocks.remove(H); FreedBytes += CommitSize; + FragmentedBytes -= H->MemMap.getCapacity() - CommitSize; NumberOfFrees++; Stats.sub(StatAllocated, CommitSize); - Stats.sub(StatMapped, H->MapSize); + Stats.sub(StatMapped, H->MemMap.getCapacity()); } Cache.store(Options, H); } @@ -627,10 +720,12 @@ template <typename Config> void MapAllocator<Config>::getStats(ScopedString *Str) EXCLUDES(Mutex) { ScopedLock L(Mutex); Str->append("Stats: MapAllocator: allocated %u times (%zuK), freed %u times " - "(%zuK), remains %u (%zuK) max %zuM\n", + "(%zuK), remains %u (%zuK) max %zuM, Fragmented %zuK\n", NumberOfAllocs, AllocatedBytes >> 10, NumberOfFrees, FreedBytes >> 10, NumberOfAllocs - NumberOfFrees, - (AllocatedBytes - FreedBytes) >> 10, LargestSize >> 20); + (AllocatedBytes - FreedBytes) >> 10, LargestSize 
>> 20, + FragmentedBytes >> 10); + Cache.getStats(Str); } } // namespace scudo diff --git a/standalone/size_class_map.h b/standalone/size_class_map.h index 766562495ec..4138885de33 100644 --- a/standalone/size_class_map.h +++ b/standalone/size_class_map.h @@ -254,7 +254,7 @@ struct AndroidSizeClassConfig { static const u16 MaxNumCachedHint = 13; static const uptr MaxBytesCachedLog = 13; - static constexpr u32 Classes[] = { + static constexpr uptr Classes[] = { 0x00020, 0x00030, 0x00040, 0x00050, 0x00060, 0x00070, 0x00090, 0x000b0, 0x000c0, 0x000e0, 0x00120, 0x00160, 0x001c0, 0x00250, 0x00320, 0x00450, 0x00670, 0x00830, 0x00a10, 0x00c30, 0x01010, 0x01210, 0x01bd0, 0x02210, @@ -269,7 +269,7 @@ struct AndroidSizeClassConfig { static const u16 MaxNumCachedHint = 14; static const uptr MaxBytesCachedLog = 13; - static constexpr u32 Classes[] = { + static constexpr uptr Classes[] = { 0x00020, 0x00030, 0x00040, 0x00050, 0x00060, 0x00070, 0x00080, 0x00090, 0x000a0, 0x000b0, 0x000c0, 0x000e0, 0x000f0, 0x00110, 0x00120, 0x00130, 0x00150, 0x00160, 0x00170, 0x00190, 0x001d0, 0x00210, 0x00240, 0x002a0, @@ -289,35 +289,11 @@ typedef TableSizeClassMap<AndroidSizeClassConfig> AndroidSizeClassMap; static_assert(AndroidSizeClassMap::usesCompressedLSBFormat(), ""); #endif -struct SvelteSizeClassConfig { -#if SCUDO_WORDSIZE == 64U - static const uptr NumBits = 4; - static const uptr MinSizeLog = 4; - static const uptr MidSizeLog = 8; - static const uptr MaxSizeLog = 14; - static const u16 MaxNumCachedHint = 13; - static const uptr MaxBytesCachedLog = 10; - static const uptr SizeDelta = Chunk::getHeaderSize(); -#else - static const uptr NumBits = 4; - static const uptr MinSizeLog = 3; - static const uptr MidSizeLog = 7; - static const uptr MaxSizeLog = 14; - static const u16 MaxNumCachedHint = 14; - static const uptr MaxBytesCachedLog = 10; - static const uptr SizeDelta = Chunk::getHeaderSize(); -#endif -}; - -typedef FixedSizeClassMap<SvelteSizeClassConfig> SvelteSizeClassMap; - -// Trusty is configured to only have one region containing blocks of size -// 2^7 bytes. struct TrustySizeClassConfig { static const uptr NumBits = 1; - static const uptr MinSizeLog = 7; - static const uptr MidSizeLog = 7; - static const uptr MaxSizeLog = 7; + static const uptr MinSizeLog = 5; + static const uptr MidSizeLog = 5; + static const uptr MaxSizeLog = 15; static const u16 MaxNumCachedHint = 12; static const uptr MaxBytesCachedLog = 10; static const uptr SizeDelta = 0; diff --git a/standalone/stack_depot.h b/standalone/stack_depot.h index 458198fcb7a..0176c40aa89 100644 --- a/standalone/stack_depot.h +++ b/standalone/stack_depot.h @@ -10,6 +10,7 @@ #define SCUDO_STACK_DEPOT_H_ #include "atomic_helpers.h" +#include "common.h" #include "mutex.h" namespace scudo { @@ -38,7 +39,7 @@ public: } }; -class StackDepot { +class alignas(atomic_u64) StackDepot { HybridMutex RingEndMu; u32 RingEnd = 0; @@ -62,29 +63,77 @@ class StackDepot { // This is achieved by re-checking the hash of the stack trace before // returning the trace. -#ifdef SCUDO_FUZZ - // Use smaller table sizes for fuzzing in order to reduce input size. 
- static const uptr TabBits = 4; -#else - static const uptr TabBits = 16; -#endif - static const uptr TabSize = 1 << TabBits; - static const uptr TabMask = TabSize - 1; - atomic_u32 Tab[TabSize] = {}; - -#ifdef SCUDO_FUZZ - static const uptr RingBits = 4; -#else - static const uptr RingBits = 19; -#endif - static const uptr RingSize = 1 << RingBits; - static const uptr RingMask = RingSize - 1; - atomic_u64 Ring[RingSize] = {}; + u32 RingSize = 0; + u32 RingMask = 0; + u32 TabMask = 0; + // This is immediately followed by RingSize atomic_u64 and + // (TabMask + 1) atomic_u32. + + atomic_u64 *getRing() { + return reinterpret_cast<atomic_u64 *>(reinterpret_cast<char *>(this) + + sizeof(StackDepot)); + } + + atomic_u32 *getTab() { + return reinterpret_cast<atomic_u32 *>(reinterpret_cast<char *>(this) + + sizeof(StackDepot) + + sizeof(atomic_u64) * RingSize); + } + + const atomic_u64 *getRing() const { + return reinterpret_cast<const atomic_u64 *>( + reinterpret_cast<const char *>(this) + sizeof(StackDepot)); + } + + const atomic_u32 *getTab() const { + return reinterpret_cast<const atomic_u32 *>( + reinterpret_cast<const char *>(this) + sizeof(StackDepot) + + sizeof(atomic_u64) * RingSize); + } public: + void init(u32 RingSz, u32 TabSz) { + DCHECK(isPowerOfTwo(RingSz)); + DCHECK(isPowerOfTwo(TabSz)); + RingSize = RingSz; + RingMask = RingSz - 1; + TabMask = TabSz - 1; + } + + // Ensure that RingSize, RingMask and TabMask are set up in a way that + // all accesses are within range of BufSize. + bool isValid(uptr BufSize) const { + if (!isPowerOfTwo(RingSize)) + return false; + uptr RingBytes = sizeof(atomic_u64) * RingSize; + if (RingMask + 1 != RingSize) + return false; + + if (TabMask == 0) + return false; + uptr TabSize = TabMask + 1; + if (!isPowerOfTwo(TabSize)) + return false; + uptr TabBytes = sizeof(atomic_u32) * TabSize; + + // Subtract and detect underflow. + if (BufSize < sizeof(StackDepot)) + return false; + BufSize -= sizeof(StackDepot); + if (BufSize < TabBytes) + return false; + BufSize -= TabBytes; + if (BufSize < RingBytes) + return false; + return BufSize == RingBytes; + } + // Insert hash of the stack trace [Begin, End) into the stack depot, and // return the hash. u32 insert(uptr *Begin, uptr *End) { + auto *Tab = getTab(); + auto *Ring = getRing(); + MurMur2HashBuilder B; for (uptr *I = Begin; I != End; ++I) B.add(u32(*I) >> 2); @@ -113,6 +162,9 @@ public: // accessed via operator[] passing indexes between *RingPosPtr and // *RingPosPtr + *SizePtr. bool find(u32 Hash, uptr *RingPosPtr, uptr *SizePtr) const { + auto *Tab = getTab(); + auto *Ring = getRing(); + u32 Pos = Hash & TabMask; u32 RingPos = atomic_load_relaxed(&Tab[Pos]); if (RingPos >= RingSize) @@ -134,11 +186,23 @@ public: return B.get() == Hash; } - u64 operator[](uptr RingPos) const { + u64 at(uptr RingPos) const { + auto *Ring = getRing(); return atomic_load_relaxed(&Ring[RingPos & RingMask]); } + + // This is done for the purpose of fork safety in multithreaded programs and + // does not fully disable StackDepot. In particular, find() still works and + // only insert() is blocked. + void disable() NO_THREAD_SAFETY_ANALYSIS { RingEndMu.lock(); } + + void enable() NO_THREAD_SAFETY_ANALYSIS { RingEndMu.unlock(); } }; +// We need StackDepot to be aligned to 8-bytes so the ring we store after +// is correctly assigned. 
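// Setting up a depot over a caller-provided buffer, following the accessors
// above (this mirrors the StackDepot test added in combined_test.cpp; the
// 1024/1024 sizes are only an example, both must be powers of two):
//
//   constexpr scudo::u32 RingSz = 1024, TabSz = 1024;
//   alignas(scudo::StackDepot) static char Buf[sizeof(scudo::StackDepot) +
//                                               RingSz * sizeof(scudo::atomic_u64) +
//                                               TabSz * sizeof(scudo::atomic_u32)];
//   auto *Depot = reinterpret_cast<scudo::StackDepot *>(Buf);
//   Depot->init(RingSz, TabSz);
//   CHECK(Depot->isValid(sizeof(Buf)));
//
// isValid() demands an exact fit -- header, then RingSz u64 ring slots, then
// TabSz u32 table slots -- so the buffer size has to be computed from exactly
// those terms.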
+static_assert(sizeof(StackDepot) % alignof(atomic_u64) == 0); + } // namespace scudo #endif // SCUDO_STACK_DEPOT_H_ diff --git a/standalone/string_utils.cpp b/standalone/string_utils.cpp index 13fdb9c6ca6..e584bd806e5 100644 --- a/standalone/string_utils.cpp +++ b/standalone/string_utils.cpp @@ -14,30 +14,21 @@ namespace scudo { -static int appendChar(char **Buffer, const char *BufferEnd, char C) { - if (*Buffer < BufferEnd) { - **Buffer = C; - (*Buffer)++; - } - return 1; -} - // Appends number in a given Base to buffer. If its length is less than // |MinNumberLength|, it is padded with leading zeroes or spaces, depending // on the value of |PadWithZero|. -static int appendNumber(char **Buffer, const char *BufferEnd, u64 AbsoluteValue, - u8 Base, u8 MinNumberLength, bool PadWithZero, - bool Negative, bool Upper) { +void ScopedString::appendNumber(u64 AbsoluteValue, u8 Base, u8 MinNumberLength, + bool PadWithZero, bool Negative, bool Upper) { constexpr uptr MaxLen = 30; RAW_CHECK(Base == 10 || Base == 16); RAW_CHECK(Base == 10 || !Negative); RAW_CHECK(AbsoluteValue || !Negative); RAW_CHECK(MinNumberLength < MaxLen); - int Res = 0; if (Negative && MinNumberLength) --MinNumberLength; - if (Negative && PadWithZero) - Res += appendChar(Buffer, BufferEnd, '-'); + if (Negative && PadWithZero) { + String.push_back('-'); + } uptr NumBuffer[MaxLen]; int Pos = 0; do { @@ -55,34 +46,32 @@ static int appendNumber(char **Buffer, const char *BufferEnd, u64 AbsoluteValue, Pos--; for (; Pos >= 0 && NumBuffer[Pos] == 0; Pos--) { char c = (PadWithZero || Pos == 0) ? '0' : ' '; - Res += appendChar(Buffer, BufferEnd, c); + String.push_back(c); } if (Negative && !PadWithZero) - Res += appendChar(Buffer, BufferEnd, '-'); + String.push_back('-'); for (; Pos >= 0; Pos--) { char Digit = static_cast<char>(NumBuffer[Pos]); Digit = static_cast<char>((Digit < 10) ? '0' + Digit : (Upper ? 'A' : 'a') + Digit - 10); - Res += appendChar(Buffer, BufferEnd, Digit); + String.push_back(Digit); } - return Res; } -static int appendUnsigned(char **Buffer, const char *BufferEnd, u64 Num, - u8 Base, u8 MinNumberLength, bool PadWithZero, - bool Upper) { - return appendNumber(Buffer, BufferEnd, Num, Base, MinNumberLength, - PadWithZero, /*Negative=*/false, Upper); +void ScopedString::appendUnsigned(u64 Num, u8 Base, u8 MinNumberLength, + bool PadWithZero, bool Upper) { + appendNumber(Num, Base, MinNumberLength, PadWithZero, /*Negative=*/false, + Upper); } -static int appendSignedDecimal(char **Buffer, const char *BufferEnd, s64 Num, - u8 MinNumberLength, bool PadWithZero) { +void ScopedString::appendSignedDecimal(s64 Num, u8 MinNumberLength, + bool PadWithZero) { const bool Negative = (Num < 0); const u64 UnsignedNum = (Num == INT64_MIN) ? static_cast<u64>(INT64_MAX) + 1 : static_cast<u64>(Negative ? 
-Num : Num); - return appendNumber(Buffer, BufferEnd, UnsignedNum, 10, MinNumberLength, - PadWithZero, Negative, /*Upper=*/false); + appendNumber(UnsignedNum, 10, MinNumberLength, PadWithZero, Negative, + /*Upper=*/false); } // Use the fact that explicitly requesting 0 Width (%0s) results in UB and @@ -90,44 +79,45 @@ static int appendSignedDecimal(char **Buffer, const char *BufferEnd, s64 Num, // Width == 0 - no Width requested // Width < 0 - left-justify S within and pad it to -Width chars, if necessary // Width > 0 - right-justify S, not implemented yet -static int appendString(char **Buffer, const char *BufferEnd, int Width, - int MaxChars, const char *S) { +void ScopedString::appendString(int Width, int MaxChars, const char *S) { if (!S) S = "<null>"; - int Res = 0; + int NumChars = 0; for (; *S; S++) { - if (MaxChars >= 0 && Res >= MaxChars) + if (MaxChars >= 0 && NumChars >= MaxChars) break; - Res += appendChar(Buffer, BufferEnd, *S); + String.push_back(*S); + NumChars++; + } + if (Width < 0) { + // Only left justification supported. + Width = -Width - NumChars; + while (Width-- > 0) + String.push_back(' '); } - // Only the left justified strings are supported. - while (Width < -Res) - Res += appendChar(Buffer, BufferEnd, ' '); - return Res; } -static int appendPointer(char **Buffer, const char *BufferEnd, u64 ptr_value) { - int Res = 0; - Res += appendString(Buffer, BufferEnd, 0, -1, "0x"); - Res += appendUnsigned(Buffer, BufferEnd, ptr_value, 16, - SCUDO_POINTER_FORMAT_LENGTH, /*PadWithZero=*/true, - /*Upper=*/false); - return Res; +void ScopedString::appendPointer(u64 ptr_value) { + appendString(0, -1, "0x"); + appendUnsigned(ptr_value, 16, SCUDO_POINTER_FORMAT_LENGTH, + /*PadWithZero=*/true, + /*Upper=*/false); } -static int formatString(char *Buffer, uptr BufferLength, const char *Format, - va_list Args) { +void ScopedString::vappend(const char *Format, va_list &Args) { + // Since the string contains the '\0' terminator, put our size before it + // so that push_back calls work correctly. + DCHECK(String.size() > 0); + String.resize(String.size() - 1); + static const char *PrintfFormatsHelp = - "Supported formatString formats: %([0-9]*)?(z|ll)?{d,u,x,X}; %p; " + "Supported formats: %([0-9]*)?(z|ll)?{d,u,x,X}; %p; " "%[-]([0-9]*)?(\\.\\*)?s; %c\n"; RAW_CHECK(Format); - RAW_CHECK(BufferLength > 0); - const char *BufferEnd = &Buffer[BufferLength - 1]; const char *Cur = Format; - int Res = 0; for (; *Cur; Cur++) { if (*Cur != '%') { - Res += appendChar(&Buffer, BufferEnd, *Cur); + String.push_back(*Cur); continue; } Cur++; @@ -162,7 +152,7 @@ static int formatString(char *Buffer, uptr BufferLength, const char *Format, DVal = HaveLL ? va_arg(Args, s64) : HaveZ ? va_arg(Args, sptr) : va_arg(Args, int); - Res += appendSignedDecimal(&Buffer, BufferEnd, DVal, Width, PadWithZero); + appendSignedDecimal(DVal, Width, PadWithZero); break; } case 'u': @@ -172,32 +162,50 @@ static int formatString(char *Buffer, uptr BufferLength, const char *Format, : HaveZ ? va_arg(Args, uptr) : va_arg(Args, unsigned); const bool Upper = (*Cur == 'X'); - Res += appendUnsigned(&Buffer, BufferEnd, UVal, (*Cur == 'u') ? 10 : 16, - Width, PadWithZero, Upper); + appendUnsigned(UVal, (*Cur == 'u') ? 10 : 16, Width, PadWithZero, Upper); break; } case 'p': { RAW_CHECK_MSG(!HaveFlags, PrintfFormatsHelp); - Res += appendPointer(&Buffer, BufferEnd, va_arg(Args, uptr)); + appendPointer(va_arg(Args, uptr)); break; } case 's': { RAW_CHECK_MSG(!HaveLength, PrintfFormatsHelp); // Only left-justified Width is supported. 
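// A short usage sketch of the formatter (the names and values are made up; the
// directives are the ones PrintfFormatsHelp above allows):
//
//   scudo::ScopedString Str;
//   Str.append("allocated %zu bytes at %p (class %u, name %-8s)\n",
//              Size, Ptr, ClassId, "example");
//   Str.output();                        // flushes via outputRaw()
//   scudo::Printf("free of %p\n", Ptr);  // one-shot variant, same engine
//
// As the CHECK right below enforces, %s only honours left justification
// (negative width); this change also teaches the parser the %ld/%lu spellings
// that PRId64/PRIu64 expand to on 64-bit targets.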
CHECK(!HaveWidth || LeftJustified); - Res += appendString(&Buffer, BufferEnd, LeftJustified ? -Width : Width, - Precision, va_arg(Args, char *)); + appendString(LeftJustified ? -Width : Width, Precision, + va_arg(Args, char *)); break; } case 'c': { RAW_CHECK_MSG(!HaveFlags, PrintfFormatsHelp); - Res += - appendChar(&Buffer, BufferEnd, static_cast<char>(va_arg(Args, int))); + String.push_back(static_cast<char>(va_arg(Args, int))); + break; + } + // In Scudo, `s64`/`u64` are supposed to use `lld` and `llu` respectively. + // However, `-Wformat` doesn't know we have a different parser for those + // placeholders and it keeps complaining the type mismatch on 64-bit + // platform which uses `ld`/`lu` for `s64`/`u64`. Therefore, in order to + // silence the warning, we turn to use `PRId64`/`PRIu64` for printing + // `s64`/`u64` and handle the `ld`/`lu` here. + case 'l': { + ++Cur; + RAW_CHECK(*Cur == 'd' || *Cur == 'u'); + + if (*Cur == 'd') { + DVal = va_arg(Args, s64); + appendSignedDecimal(DVal, Width, PadWithZero); + } else { + UVal = va_arg(Args, u64); + appendUnsigned(UVal, 10, Width, PadWithZero, false); + } + break; } case '%': { RAW_CHECK_MSG(!HaveFlags, PrintfFormatsHelp); - Res += appendChar(&Buffer, BufferEnd, '%'); + String.push_back('%'); break; } default: { @@ -205,41 +213,19 @@ static int formatString(char *Buffer, uptr BufferLength, const char *Format, } } } - RAW_CHECK(Buffer <= BufferEnd); - appendChar(&Buffer, BufferEnd + 1, '\0'); - return Res; -} - -int formatString(char *Buffer, uptr BufferLength, const char *Format, ...) { - va_list Args; - va_start(Args, Format); - int Res = formatString(Buffer, BufferLength, Format, Args); - va_end(Args); - return Res; -} - -void ScopedString::append(const char *Format, va_list Args) { - va_list ArgsCopy; - va_copy(ArgsCopy, Args); - // formatString doesn't currently support a null buffer or zero buffer length, - // so in order to get the resulting formatted string length, we use a one-char - // buffer. - char C[1]; - const uptr AdditionalLength = - static_cast<uptr>(formatString(C, sizeof(C), Format, Args)) + 1; - const uptr Length = length(); - String.resize(Length + AdditionalLength); - const uptr FormattedLength = static_cast<uptr>(formatString( - String.data() + Length, String.size() - Length, Format, ArgsCopy)); - RAW_CHECK(data()[length()] == '\0'); - RAW_CHECK(FormattedLength + 1 == AdditionalLength); - va_end(ArgsCopy); + String.push_back('\0'); + if (String.back() != '\0') { + // String truncated, make sure the string is terminated properly. + // This can happen if there is no more memory when trying to resize + // the string. + String.back() = '\0'; + } } void ScopedString::append(const char *Format, ...) { va_list Args; va_start(Args, Format); - append(Format, Args); + vappend(Format, Args); va_end(Args); } @@ -247,7 +233,7 @@ void Printf(const char *Format, ...) { va_list Args; va_start(Args, Format); ScopedString Msg; - Msg.append(Format, Args); + Msg.vappend(Format, Args); outputRaw(Msg.data()); va_end(Args); } diff --git a/standalone/string_utils.h b/standalone/string_utils.h index 41901194dfd..6e00b637797 100644 --- a/standalone/string_utils.h +++ b/standalone/string_utils.h @@ -25,17 +25,24 @@ public: String.clear(); String.push_back('\0'); } - void append(const char *Format, va_list Args); + void vappend(const char *Format, va_list &Args); void append(const char *Format, ...) 
FORMAT(2, 3); void output() const { outputRaw(String.data()); } void reserve(size_t Size) { String.reserve(Size + 1); } + uptr capacity() { return String.capacity() - 1; } private: + void appendNumber(u64 AbsoluteValue, u8 Base, u8 MinNumberLength, + bool PadWithZero, bool Negative, bool Upper); + void appendUnsigned(u64 Num, u8 Base, u8 MinNumberLength, bool PadWithZero, + bool Upper); + void appendSignedDecimal(s64 Num, u8 MinNumberLength, bool PadWithZero); + void appendString(int Width, int MaxChars, const char *S); + void appendPointer(u64 ptr_value); + Vector<char> String; }; -int formatString(char *Buffer, uptr BufferLength, const char *Format, ...) - FORMAT(3, 4); void Printf(const char *Format, ...) FORMAT(1, 2); } // namespace scudo diff --git a/standalone/tests/allocator_config_test.cpp b/standalone/tests/allocator_config_test.cpp new file mode 100644 index 00000000000..4c4ceb832e2 --- /dev/null +++ b/standalone/tests/allocator_config_test.cpp @@ -0,0 +1,119 @@ +//===-- allocator_config_test.cpp -------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "tests/scudo_unit_test.h" + +#include "allocator_config.h" +#include "allocator_config_wrapper.h" +#include "common.h" +#include "secondary.h" + +#include <type_traits> + +struct TestBaseConfig { + template <typename> using TSDRegistryT = void; + template <typename> using PrimaryT = void; + template <typename> using SecondaryT = void; +}; + +struct TestBaseConfigEnableOptionalFlag : public TestBaseConfig { + static const bool MaySupportMemoryTagging = true; + // Use the getter to avoid the test to `use` the address of static const + // variable (which requires additional explicit definition). 
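// The pitfall the comment above refers to, in isolation (a sketch, not code
// from this test): binding a static const data member to a const reference --
// which EXPECT_EQ does -- odr-uses it, and without an out-of-class definition
// the link can fail.
//
//   struct Cfg  { static const bool Flag = true; };
//   EXPECT_EQ(Cfg::Flag, true);        // may require "const bool Cfg::Flag;"
//                                      // at namespace scope
//
//   struct Cfg2 {
//     static const bool Flag = true;
//     static bool getFlag() { return Flag; }
//   };
//   EXPECT_EQ(Cfg2::getFlag(), true);  // returned by value, no definition needed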
+ static bool getMaySupportMemoryTagging() { return MaySupportMemoryTagging; } +}; + +struct TestBasePrimaryConfig { + using SizeClassMap = void; + static const scudo::uptr RegionSizeLog = 18U; + static const scudo::uptr GroupSizeLog = 18U; + static const scudo::s32 MinReleaseToOsIntervalMs = INT32_MIN; + static const scudo::s32 MaxReleaseToOsIntervalMs = INT32_MAX; + typedef scudo::uptr CompactPtrT; + static const scudo::uptr CompactPtrScale = 0; + static const scudo::uptr MapSizeIncrement = 1UL << 18; +}; + +struct TestPrimaryConfig : public TestBaseConfig { + struct Primary : TestBasePrimaryConfig {}; +}; + +struct TestPrimaryConfigEnableOptionalFlag : public TestBaseConfig { + struct Primary : TestBasePrimaryConfig { + static const bool EnableRandomOffset = true; + static bool getEnableRandomOffset() { return EnableRandomOffset; } + }; +}; + +struct TestPrimaryConfigEnableOptionalType : public TestBaseConfig { + struct DummyConditionVariable {}; + + struct Primary : TestBasePrimaryConfig { + using ConditionVariableT = DummyConditionVariable; + }; +}; + +struct TestSecondaryConfig : public TestPrimaryConfig { + struct Secondary { + template <typename Config> + using CacheT = scudo::MapAllocatorNoCache<Config>; + }; +}; + +struct TestSecondaryCacheConfigEnableOptionalFlag : public TestPrimaryConfig { + struct Secondary { + struct Cache { + static const scudo::u32 EntriesArraySize = 256U; + static scudo::u32 getEntriesArraySize() { return EntriesArraySize; } + }; + template <typename T> using CacheT = scudo::MapAllocatorCache<T>; + }; +}; + +TEST(ScudoAllocatorConfigTest, VerifyOptionalFlags) { + // Test the top level allocator optional config. + // + // `MaySupportMemoryTagging` is default off. + EXPECT_FALSE(scudo::BaseConfig<TestBaseConfig>::getMaySupportMemoryTagging()); + EXPECT_EQ(scudo::BaseConfig< + TestBaseConfigEnableOptionalFlag>::getMaySupportMemoryTagging(), + TestBaseConfigEnableOptionalFlag::getMaySupportMemoryTagging()); + + // Test primary optional config. + // + // `EnableRandomeOffset` is default off. + EXPECT_FALSE( + scudo::PrimaryConfig<TestPrimaryConfig>::getEnableRandomOffset()); + EXPECT_EQ( + scudo::PrimaryConfig< + TestPrimaryConfigEnableOptionalFlag>::getEnableRandomOffset(), + TestPrimaryConfigEnableOptionalFlag::Primary::getEnableRandomOffset()); + + // `ConditionVariableT` is default off. + EXPECT_FALSE( + scudo::PrimaryConfig<TestPrimaryConfig>::hasConditionVariableT()); + EXPECT_TRUE(scudo::PrimaryConfig< + TestPrimaryConfigEnableOptionalType>::hasConditionVariableT()); + EXPECT_TRUE((std::is_same_v< + typename scudo::PrimaryConfig< + TestPrimaryConfigEnableOptionalType>::ConditionVariableT, + typename TestPrimaryConfigEnableOptionalType::Primary:: + ConditionVariableT>)); + + // Test secondary cache optional config. + using NoCacheConfig = + scudo::SecondaryConfig<TestSecondaryConfig>::CacheConfig; + // `EntriesArraySize` is default 0. 
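// For contrast with the NoCache configuration checked just below, a
// hypothetical config that opts into the secondary cache could look like this
// (only knobs visible in this change are listed; a real config may need more):
//
//   struct MySecondaryConfig {
//     struct Cache {
//       static const scudo::u32 EntriesArraySize = 32U;
//       static const scudo::u32 QuarantineSize = 0U;
//       static const scudo::s32 MinReleaseToOsIntervalMs = INT32_MIN;
//       static const scudo::s32 MaxReleaseToOsIntervalMs = INT32_MAX;
//     };
//     template <typename Config> using CacheT = scudo::MapAllocatorCache<Config>;
//   };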
+ EXPECT_EQ(NoCacheConfig::getEntriesArraySize(), 0U); + + using CacheConfig = scudo::SecondaryConfig< + TestSecondaryCacheConfigEnableOptionalFlag>::CacheConfig; + EXPECT_EQ(CacheConfig::getEntriesArraySize(), + TestSecondaryCacheConfigEnableOptionalFlag::Secondary::Cache:: + getEntriesArraySize()); +} diff --git a/standalone/tests/chunk_test.cpp b/standalone/tests/chunk_test.cpp index 7a29f3c11b7..1b2c1eb5a7d 100644 --- a/standalone/tests/chunk_test.cpp +++ b/standalone/tests/chunk_test.cpp @@ -37,29 +37,6 @@ TEST(ScudoChunkDeathTest, ChunkBasic) { free(Block); } -TEST(ScudoChunkTest, ChunkCmpXchg) { - initChecksum(); - const scudo::uptr Size = 0x100U; - scudo::Chunk::UnpackedHeader OldHeader = {}; - OldHeader.OriginOrWasZeroed = scudo::Chunk::Origin::Malloc; - OldHeader.ClassId = 0x42U; - OldHeader.SizeOrUnusedBytes = Size; - OldHeader.State = scudo::Chunk::State::Allocated; - void *Block = malloc(HeaderSize + Size); - void *P = reinterpret_cast<void *>(reinterpret_cast<scudo::uptr>(Block) + - HeaderSize); - scudo::Chunk::storeHeader(Cookie, P, &OldHeader); - memset(P, 'A', Size); - scudo::Chunk::UnpackedHeader NewHeader = OldHeader; - NewHeader.State = scudo::Chunk::State::Quarantined; - scudo::Chunk::compareExchangeHeader(Cookie, P, &NewHeader, &OldHeader); - NewHeader = {}; - EXPECT_TRUE(scudo::Chunk::isValid(Cookie, P, &NewHeader)); - EXPECT_EQ(NewHeader.State, scudo::Chunk::State::Quarantined); - EXPECT_FALSE(scudo::Chunk::isValid(InvalidCookie, P, &NewHeader)); - free(Block); -} - TEST(ScudoChunkDeathTest, CorruptHeader) { initChecksum(); const scudo::uptr Size = 0x100U; diff --git a/standalone/tests/combined_test.cpp b/standalone/tests/combined_test.cpp index 6f4fa748ed9..1a36155bcd4 100644 --- a/standalone/tests/combined_test.cpp +++ b/standalone/tests/combined_test.cpp @@ -7,12 +7,17 @@ //===----------------------------------------------------------------------===// #include "memtag.h" +#include "stack_depot.h" #include "tests/scudo_unit_test.h" #include "allocator_config.h" #include "chunk.h" #include "combined.h" +#include "condition_variable.h" +#include "mem_map.h" +#include "size_class_map.h" +#include <algorithm> #include <condition_variable> #include <memory> #include <mutex> @@ -53,7 +58,7 @@ void checkMemoryTaggingMaybe(AllocatorT *Allocator, void *P, scudo::uptr Size, EXPECT_DEATH( { disableDebuggerdMaybe(); - reinterpret_cast<char *>(P)[-1] = 0xaa; + reinterpret_cast<char *>(P)[-1] = 'A'; }, ""); if (isPrimaryAllocation<AllocatorT>(Size, Alignment) @@ -62,7 +67,7 @@ void checkMemoryTaggingMaybe(AllocatorT *Allocator, void *P, scudo::uptr Size, EXPECT_DEATH( { disableDebuggerdMaybe(); - reinterpret_cast<char *>(P)[Size] = 0xaa; + reinterpret_cast<char *>(P)[Size] = 'A'; }, ""); } @@ -77,14 +82,70 @@ template <typename Config> struct TestAllocator : scudo::Allocator<Config> { } ~TestAllocator() { this->unmapTestOnly(); } - void *operator new(size_t size) { + void *operator new(size_t size); + void operator delete(void *ptr); +}; + +constexpr size_t kMaxAlign = std::max({ + alignof(scudo::Allocator<scudo::DefaultConfig>), +#if SCUDO_CAN_USE_PRIMARY64 + alignof(scudo::Allocator<scudo::FuchsiaConfig>), +#endif + alignof(scudo::Allocator<scudo::AndroidConfig>) +}); + +#if SCUDO_RISCV64 +// The allocator is over 4MB large. Rather than creating an instance of this on +// the heap, keep it in a global storage to reduce fragmentation from having to +// mmap this at the start of every test. 
+struct TestAllocatorStorage { + static constexpr size_t kMaxSize = std::max({ + sizeof(scudo::Allocator<scudo::DefaultConfig>), +#if SCUDO_CAN_USE_PRIMARY64 + sizeof(scudo::Allocator<scudo::FuchsiaConfig>), +#endif + sizeof(scudo::Allocator<scudo::AndroidConfig>) + }); + + // To alleviate some problem, let's skip the thread safety analysis here. + static void *get(size_t size) NO_THREAD_SAFETY_ANALYSIS { + CHECK(size <= kMaxSize && + "Allocation size doesn't fit in the allocator storage"); + M.lock(); + return AllocatorStorage; + } + + static void release(void *ptr) NO_THREAD_SAFETY_ANALYSIS { + M.assertHeld(); + M.unlock(); + ASSERT_EQ(ptr, AllocatorStorage); + } + + static scudo::HybridMutex M; + static uint8_t AllocatorStorage[kMaxSize]; +}; +scudo::HybridMutex TestAllocatorStorage::M; +alignas(kMaxAlign) uint8_t TestAllocatorStorage::AllocatorStorage[kMaxSize]; +#else +struct TestAllocatorStorage { + static void *get(size_t size) NO_THREAD_SAFETY_ANALYSIS { void *p = nullptr; - EXPECT_EQ(0, posix_memalign(&p, alignof(TestAllocator), size)); + EXPECT_EQ(0, posix_memalign(&p, kMaxAlign, size)); return p; } - - void operator delete(void *ptr) { free(ptr); } + static void release(void *ptr) NO_THREAD_SAFETY_ANALYSIS { free(ptr); } }; +#endif + +template <typename Config> +void *TestAllocator<Config>::operator new(size_t size) { + return TestAllocatorStorage::get(size); +} + +template <typename Config> +void TestAllocator<Config>::operator delete(void *ptr) { + TestAllocatorStorage::release(ptr); +} template <class TypeParam> struct ScudoCombinedTest : public Test { ScudoCombinedTest() { @@ -92,7 +153,7 @@ template <class TypeParam> struct ScudoCombinedTest : public Test { Allocator = std::make_unique<AllocatorT>(); } ~ScudoCombinedTest() { - Allocator->releaseToOS(); + Allocator->releaseToOS(scudo::ReleaseToOS::Force); UseQuarantine = true; } @@ -106,15 +167,59 @@ template <class TypeParam> struct ScudoCombinedTest : public Test { template <typename T> using ScudoCombinedDeathTest = ScudoCombinedTest<T>; +namespace scudo { +struct TestConditionVariableConfig { + static const bool MaySupportMemoryTagging = true; + template <class A> + using TSDRegistryT = + scudo::TSDRegistrySharedT<A, 8U, 4U>; // Shared, max 8 TSDs. 
+ + struct Primary { + using SizeClassMap = scudo::AndroidSizeClassMap; +#if SCUDO_CAN_USE_PRIMARY64 + static const scudo::uptr RegionSizeLog = 28U; + typedef scudo::u32 CompactPtrT; + static const scudo::uptr CompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG; + static const scudo::uptr GroupSizeLog = 20U; + static const bool EnableRandomOffset = true; + static const scudo::uptr MapSizeIncrement = 1UL << 18; +#else + static const scudo::uptr RegionSizeLog = 18U; + static const scudo::uptr GroupSizeLog = 18U; + typedef scudo::uptr CompactPtrT; +#endif + static const scudo::s32 MinReleaseToOsIntervalMs = 1000; + static const scudo::s32 MaxReleaseToOsIntervalMs = 1000; +#if SCUDO_LINUX + using ConditionVariableT = scudo::ConditionVariableLinux; +#else + using ConditionVariableT = scudo::ConditionVariableDummy; +#endif + }; +#if SCUDO_CAN_USE_PRIMARY64 + template <typename Config> + using PrimaryT = scudo::SizeClassAllocator64<Config>; +#else + template <typename Config> + using PrimaryT = scudo::SizeClassAllocator32<Config>; +#endif + + struct Secondary { + template <typename Config> + using CacheT = scudo::MapAllocatorNoCache<Config>; + }; + template <typename Config> using SecondaryT = scudo::MapAllocator<Config>; +}; +} // namespace scudo + #if SCUDO_FUCHSIA #define SCUDO_TYPED_TEST_ALL_TYPES(FIXTURE, NAME) \ - SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, AndroidSvelteConfig) \ SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, FuchsiaConfig) #else #define SCUDO_TYPED_TEST_ALL_TYPES(FIXTURE, NAME) \ - SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, AndroidSvelteConfig) \ SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, DefaultConfig) \ - SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, AndroidConfig) + SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, AndroidConfig) \ + SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConditionVariableConfig) #endif #define SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TYPE) \ @@ -124,11 +229,25 @@ template <typename T> using ScudoCombinedDeathTest = ScudoCombinedTest<T>; #define SCUDO_TYPED_TEST(FIXTURE, NAME) \ template <class TypeParam> \ struct FIXTURE##NAME : public FIXTURE<TypeParam> { \ + using BaseT = FIXTURE<TypeParam>; \ void Run(); \ }; \ SCUDO_TYPED_TEST_ALL_TYPES(FIXTURE, NAME) \ template <class TypeParam> void FIXTURE##NAME<TypeParam>::Run() +// Accessing `TSD->getCache()` requires `TSD::Mutex` which isn't easy to test +// using thread-safety analysis. Alternatively, we verify the thread safety +// through a runtime check in ScopedTSD and mark the test body with +// NO_THREAD_SAFETY_ANALYSIS. 
+#define SCUDO_TYPED_TEST_SKIP_THREAD_SAFETY(FIXTURE, NAME) \ + template <class TypeParam> \ + struct FIXTURE##NAME : public FIXTURE<TypeParam> { \ + using BaseT = FIXTURE<TypeParam>; \ + void Run() NO_THREAD_SAFETY_ANALYSIS; \ + }; \ + SCUDO_TYPED_TEST_ALL_TYPES(FIXTURE, NAME) \ + template <class TypeParam> void FIXTURE##NAME<TypeParam>::Run() + SCUDO_TYPED_TEST(ScudoCombinedTest, IsOwned) { auto *Allocator = this->Allocator.get(); static scudo::u8 StaticBuffer[scudo::Chunk::getHeaderSize() + 1]; @@ -153,9 +272,10 @@ void ScudoCombinedTest<Config>::BasicTest(scudo::uptr SizeLog) { for (scudo::uptr AlignLog = MinAlignLog; AlignLog <= 16U; AlignLog++) { const scudo::uptr Align = 1U << AlignLog; for (scudo::sptr Delta = -32; Delta <= 32; Delta++) { - if (static_cast<scudo::sptr>(1U << SizeLog) + Delta < 0) + if ((1LL << SizeLog) + Delta < 0) continue; - const scudo::uptr Size = (1U << SizeLog) + Delta; + const scudo::uptr Size = + static_cast<scudo::uptr>((1LL << SizeLog) + Delta); void *P = Allocator->allocate(Size, Origin, Align); EXPECT_NE(P, nullptr); EXPECT_TRUE(Allocator->isOwned(P)); @@ -166,6 +286,9 @@ void ScudoCombinedTest<Config>::BasicTest(scudo::uptr SizeLog) { Allocator->deallocate(P, Origin, Size); } } + + Allocator->printStats(); + Allocator->printFragmentationInfo(); } #define SCUDO_MAKE_BASIC_TEST(SizeLog) \ @@ -205,7 +328,7 @@ SCUDO_TYPED_TEST(ScudoCombinedTest, ZeroContents) { void *P = Allocator->allocate(Size, Origin, 1U << MinAlignLog, true); EXPECT_NE(P, nullptr); for (scudo::uptr I = 0; I < Size; I++) - ASSERT_EQ((reinterpret_cast<char *>(P))[I], 0); + ASSERT_EQ((reinterpret_cast<char *>(P))[I], '\0'); memset(P, 0xaa, Size); Allocator->deallocate(P, Origin, Size); } @@ -223,7 +346,7 @@ SCUDO_TYPED_TEST(ScudoCombinedTest, ZeroFill) { void *P = Allocator->allocate(Size, Origin, 1U << MinAlignLog, false); EXPECT_NE(P, nullptr); for (scudo::uptr I = 0; I < Size; I++) - ASSERT_EQ((reinterpret_cast<char *>(P))[I], 0); + ASSERT_EQ((reinterpret_cast<char *>(P))[I], '\0'); memset(P, 0xaa, Size); Allocator->deallocate(P, Origin, Size); } @@ -282,7 +405,7 @@ SCUDO_TYPED_TEST(ScudoCombinedTest, ReallocateLargeIncreasing) { // we preserve the data in the process. 
scudo::uptr Size = 16; void *P = Allocator->allocate(Size, Origin); - const char Marker = 0xab; + const char Marker = 'A'; memset(P, Marker, Size); while (Size < TypeParam::Primary::SizeClassMap::MaxSize * 4) { void *NewP = Allocator->reallocate(P, Size * 2); @@ -304,7 +427,7 @@ SCUDO_TYPED_TEST(ScudoCombinedTest, ReallocateLargeDecreasing) { scudo::uptr Size = TypeParam::Primary::SizeClassMap::MaxSize * 2; const scudo::uptr DataSize = 2048U; void *P = Allocator->allocate(Size, Origin); - const char Marker = 0xab; + const char Marker = 'A'; memset(P, Marker, scudo::Min(Size, DataSize)); while (Size > 1U) { Size /= 2U; @@ -327,10 +450,11 @@ SCUDO_TYPED_TEST(ScudoCombinedDeathTest, ReallocateSame) { constexpr scudo::uptr ReallocSize = TypeParam::Primary::SizeClassMap::MaxSize - 64; void *P = Allocator->allocate(ReallocSize, Origin); - const char Marker = 0xab; + const char Marker = 'A'; memset(P, Marker, ReallocSize); for (scudo::sptr Delta = -32; Delta < 32; Delta += 8) { - const scudo::uptr NewSize = ReallocSize + Delta; + const scudo::uptr NewSize = + static_cast<scudo::uptr>(static_cast<scudo::sptr>(ReallocSize) + Delta); void *NewP = Allocator->reallocate(P, NewSize); EXPECT_EQ(NewP, P); for (scudo::uptr I = 0; I < ReallocSize - 32; I++) @@ -352,11 +476,13 @@ SCUDO_TYPED_TEST(ScudoCombinedTest, IterateOverChunks) { std::vector<void *> V; for (scudo::uptr I = 0; I < 64U; I++) V.push_back(Allocator->allocate( - rand() % (TypeParam::Primary::SizeClassMap::MaxSize / 2U), Origin)); + static_cast<scudo::uptr>(std::rand()) % + (TypeParam::Primary::SizeClassMap::MaxSize / 2U), + Origin)); Allocator->disable(); Allocator->iterateOverChunks( 0U, static_cast<scudo::uptr>(SCUDO_MMAP_RANGE_SIZE - 1), - [](uintptr_t Base, size_t Size, void *Arg) { + [](uintptr_t Base, UNUSED size_t Size, void *Arg) { std::vector<void *> *V = reinterpret_cast<std::vector<void *> *>(Arg); void *P = reinterpret_cast<void *>(Base); EXPECT_NE(std::find(V->begin(), V->end(), P), V->end()); @@ -381,7 +507,7 @@ SCUDO_TYPED_TEST(ScudoCombinedDeathTest, UseAfterFree) { disableDebuggerdMaybe(); void *P = Allocator->allocate(Size, Origin); Allocator->deallocate(P, Origin); - reinterpret_cast<char *>(P)[0] = 0xaa; + reinterpret_cast<char *>(P)[0] = 'A'; }, ""); EXPECT_DEATH( @@ -389,7 +515,7 @@ SCUDO_TYPED_TEST(ScudoCombinedDeathTest, UseAfterFree) { disableDebuggerdMaybe(); void *P = Allocator->allocate(Size, Origin); Allocator->deallocate(P, Origin); - reinterpret_cast<char *>(P)[Size - 1] = 0xaa; + reinterpret_cast<char *>(P)[Size - 1] = 'A'; }, ""); } @@ -401,18 +527,18 @@ SCUDO_TYPED_TEST(ScudoCombinedDeathTest, DisableMemoryTagging) { if (Allocator->useMemoryTaggingTestOnly()) { // Check that disabling memory tagging works correctly. 
void *P = Allocator->allocate(2048, Origin); - EXPECT_DEATH(reinterpret_cast<char *>(P)[2048] = 0xaa, ""); + EXPECT_DEATH(reinterpret_cast<char *>(P)[2048] = 'A', ""); scudo::ScopedDisableMemoryTagChecks NoTagChecks; Allocator->disableMemoryTagging(); - reinterpret_cast<char *>(P)[2048] = 0xaa; + reinterpret_cast<char *>(P)[2048] = 'A'; Allocator->deallocate(P, Origin); P = Allocator->allocate(2048, Origin); EXPECT_EQ(scudo::untagPointer(P), P); - reinterpret_cast<char *>(P)[2048] = 0xaa; + reinterpret_cast<char *>(P)[2048] = 'A'; Allocator->deallocate(P, Origin); - Allocator->releaseToOS(); + Allocator->releaseToOS(scudo::ReleaseToOS::Force); } } @@ -435,23 +561,47 @@ SCUDO_TYPED_TEST(ScudoCombinedTest, Stats) { EXPECT_NE(Stats.find("Stats: Quarantine"), std::string::npos); } -SCUDO_TYPED_TEST(ScudoCombinedTest, CacheDrain) NO_THREAD_SAFETY_ANALYSIS { +SCUDO_TYPED_TEST_SKIP_THREAD_SAFETY(ScudoCombinedTest, CacheDrain) { + using AllocatorT = typename BaseT::AllocatorT; auto *Allocator = this->Allocator.get(); std::vector<void *> V; for (scudo::uptr I = 0; I < 64U; I++) V.push_back(Allocator->allocate( - rand() % (TypeParam::Primary::SizeClassMap::MaxSize / 2U), Origin)); + static_cast<scudo::uptr>(std::rand()) % + (TypeParam::Primary::SizeClassMap::MaxSize / 2U), + Origin)); for (auto P : V) Allocator->deallocate(P, Origin); - bool UnlockRequired; - auto *TSD = Allocator->getTSDRegistry()->getTSDAndLock(&UnlockRequired); + typename AllocatorT::TSDRegistryT::ScopedTSD TSD( + *Allocator->getTSDRegistry()); EXPECT_TRUE(!TSD->getCache().isEmpty()); TSD->getCache().drain(); EXPECT_TRUE(TSD->getCache().isEmpty()); - if (UnlockRequired) - TSD->unlock(); +} + +SCUDO_TYPED_TEST_SKIP_THREAD_SAFETY(ScudoCombinedTest, ForceCacheDrain) { + using AllocatorT = typename BaseT::AllocatorT; + auto *Allocator = this->Allocator.get(); + + std::vector<void *> V; + for (scudo::uptr I = 0; I < 64U; I++) + V.push_back(Allocator->allocate( + static_cast<scudo::uptr>(std::rand()) % + (TypeParam::Primary::SizeClassMap::MaxSize / 2U), + Origin)); + for (auto P : V) + Allocator->deallocate(P, Origin); + + // `ForceAll` will also drain the caches. + Allocator->releaseToOS(scudo::ReleaseToOS::ForceAll); + + typename AllocatorT::TSDRegistryT::ScopedTSD TSD( + *Allocator->getTSDRegistry()); + EXPECT_TRUE(TSD->getCache().isEmpty()); + EXPECT_EQ(TSD->getQuarantineCache().getSize(), 0U); + EXPECT_TRUE(Allocator->getQuarantine()->isEmpty()); } SCUDO_TYPED_TEST(ScudoCombinedTest, ThreadedCombined) { @@ -469,12 +619,16 @@ SCUDO_TYPED_TEST(ScudoCombinedTest, ThreadedCombined) { } std::vector<std::pair<void *, scudo::uptr>> V; for (scudo::uptr I = 0; I < 256U; I++) { - const scudo::uptr Size = std::rand() % 4096U; + const scudo::uptr Size = static_cast<scudo::uptr>(std::rand()) % 4096U; void *P = Allocator->allocate(Size, Origin); // A region could have ran out of memory, resulting in a null P. if (P) V.push_back(std::make_pair(P, Size)); } + + // Try to interleave pushBlocks(), popBatch() and releaseToOS(). + Allocator->releaseToOS(scudo::ReleaseToOS::Force); + while (!V.empty()) { auto Pair = V.back(); Allocator->deallocate(Pair.first, Origin, Pair.second); @@ -488,18 +642,19 @@ SCUDO_TYPED_TEST(ScudoCombinedTest, ThreadedCombined) { } for (auto &T : Threads) T.join(); - Allocator->releaseToOS(); + Allocator->releaseToOS(scudo::ReleaseToOS::Force); } // Test that multiple instantiations of the allocator have not messed up the // process's signal handlers (GWP-ASan used to do this). 
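// The TSD access pattern used by CacheDrain/ForceCacheDrain above, old versus
// new (both spellings appear in this diff; shown side by side as a sketch):
//
//   // Before: manual lock management.
//   bool UnlockRequired;
//   auto *TSD = Allocator->getTSDRegistry()->getTSDAndLock(&UnlockRequired);
//   TSD->getCache().drain();
//   if (UnlockRequired)
//     TSD->unlock();
//
//   // After: RAII guard, unlocked automatically when it goes out of scope; the
//   // locking is verified at runtime inside ScopedTSD, which is why the test
//   // bodies can use NO_THREAD_SAFETY_ANALYSIS.
//   typename AllocatorT::TSDRegistryT::ScopedTSD TSD(
//       *Allocator->getTSDRegistry());
//   TSD->getCache().drain();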
TEST(ScudoCombinedDeathTest, SKIP_ON_FUCHSIA(testSEGV)) { const scudo::uptr Size = 4 * scudo::getPageSizeCached(); - scudo::MapPlatformData Data = {}; - void *P = scudo::map(nullptr, Size, "testSEGV", MAP_NOACCESS, &Data); - EXPECT_NE(P, nullptr); + scudo::ReservedMemoryT ReservedMemory; + ASSERT_TRUE(ReservedMemory.create(/*Addr=*/0U, Size, "testSEGV")); + void *P = reinterpret_cast<void *>(ReservedMemory.getBase()); + ASSERT_NE(P, nullptr); EXPECT_DEATH(memset(P, 0xaa, Size), ""); - scudo::unmap(P, Size, UNMAP_ALL, &Data); + ReservedMemory.release(); } struct DeathSizeClassConfig { @@ -515,21 +670,29 @@ struct DeathSizeClassConfig { static const scudo::uptr DeathRegionSizeLog = 21U; struct DeathConfig { static const bool MaySupportMemoryTagging = false; - - // Tiny allocator, its Primary only serves chunks of four sizes. - using SizeClassMap = scudo::FixedSizeClassMap<DeathSizeClassConfig>; - typedef scudo::SizeClassAllocator64<DeathConfig> Primary; - static const scudo::uptr PrimaryRegionSizeLog = DeathRegionSizeLog; - static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN; - static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX; - typedef scudo::uptr PrimaryCompactPtrT; - static const scudo::uptr PrimaryCompactPtrScale = 0; - static const bool PrimaryEnableRandomOffset = true; - static const scudo::uptr PrimaryMapSizeIncrement = 1UL << 18; - static const scudo::uptr PrimaryGroupSizeLog = 18; - - typedef scudo::MapAllocatorNoCache SecondaryCache; template <class A> using TSDRegistryT = scudo::TSDRegistrySharedT<A, 1U, 1U>; + + struct Primary { + // Tiny allocator, its Primary only serves chunks of four sizes. + using SizeClassMap = scudo::FixedSizeClassMap<DeathSizeClassConfig>; + static const scudo::uptr RegionSizeLog = DeathRegionSizeLog; + static const scudo::s32 MinReleaseToOsIntervalMs = INT32_MIN; + static const scudo::s32 MaxReleaseToOsIntervalMs = INT32_MAX; + typedef scudo::uptr CompactPtrT; + static const scudo::uptr CompactPtrScale = 0; + static const bool EnableRandomOffset = true; + static const scudo::uptr MapSizeIncrement = 1UL << 18; + static const scudo::uptr GroupSizeLog = 18; + }; + template <typename Config> + using PrimaryT = scudo::SizeClassAllocator64<Config>; + + struct Secondary { + template <typename Config> + using CacheT = scudo::MapAllocatorNoCache<Config>; + }; + + template <typename Config> using SecondaryT = scudo::MapAllocator<Config>; }; TEST(ScudoCombinedDeathTest, DeathCombined) { @@ -574,13 +737,14 @@ TEST(ScudoCombinedTest, FullRegion) { std::vector<void *> V; scudo::uptr FailedAllocationsCount = 0; for (scudo::uptr ClassId = 1U; - ClassId <= DeathConfig::SizeClassMap::LargestClassId; ClassId++) { + ClassId <= DeathConfig::Primary::SizeClassMap::LargestClassId; + ClassId++) { const scudo::uptr Size = - DeathConfig::SizeClassMap::getSizeByClassId(ClassId); + DeathConfig::Primary::SizeClassMap::getSizeByClassId(ClassId); // Allocate enough to fill all of the regions above this one. const scudo::uptr MaxNumberOfChunks = ((1U << DeathRegionSizeLog) / Size) * - (DeathConfig::SizeClassMap::LargestClassId - ClassId + 1); + (DeathConfig::Primary::SizeClassMap::LargestClassId - ClassId + 1); void *P; for (scudo::uptr I = 0; I <= MaxNumberOfChunks; I++) { P = Allocator->allocate(Size - 64U, Origin); @@ -601,11 +765,12 @@ TEST(ScudoCombinedTest, FullRegion) { // operation without issue. 
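// The DeathConfig rewrite above is a compact illustration of the config
// migration made throughout this change: flat, prefix-named members move into
// nested structs. Roughly (a sketch of the mapping, not exhaustive):
//
//   using SizeClassMap = ...;                    ->  struct Primary { using SizeClassMap = ...; ... };
//   PrimaryRegionSizeLog                         ->  Primary::RegionSizeLog
//   PrimaryMinReleaseToOsIntervalMs              ->  Primary::MinReleaseToOsIntervalMs
//   typedef SizeClassAllocator64<...> Primary;   ->  template <typename C> using PrimaryT = SizeClassAllocator64<C>;
//   typedef MapAllocatorNoCache SecondaryCache;  ->  struct Secondary {
//                                                      template <typename C> using CacheT = MapAllocatorNoCache<C>;
//                                                    };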
SCUDO_TYPED_TEST(ScudoCombinedTest, ReleaseToOS) { auto *Allocator = this->Allocator.get(); - Allocator->releaseToOS(); + Allocator->releaseToOS(scudo::ReleaseToOS::Force); } SCUDO_TYPED_TEST(ScudoCombinedTest, OddEven) { auto *Allocator = this->Allocator.get(); + Allocator->setOption(scudo::Option::MemtagTuning, M_MEMTAG_TUNING_BUFFER_OVERFLOW); if (!Allocator->useMemoryTaggingTestOnly()) return; @@ -675,7 +840,7 @@ SCUDO_TYPED_TEST(ScudoCombinedTest, DisableMemInit) { for (unsigned I = 0; I != Ptrs.size(); ++I) { Ptrs[I] = Allocator->allocate(Size, Origin, 1U << MinAlignLog, true); for (scudo::uptr J = 0; J < Size; ++J) - ASSERT_EQ((reinterpret_cast<char *>(Ptrs[I]))[J], 0); + ASSERT_EQ((reinterpret_cast<char *>(Ptrs[I]))[J], '\0'); } } @@ -687,33 +852,120 @@ SCUDO_TYPED_TEST(ScudoCombinedTest, ReallocateInPlaceStress) { // Regression test: make realloc-in-place happen at the very right end of a // mapped region. - constexpr int nPtrs = 10000; - for (int i = 1; i < 32; ++i) { + constexpr size_t nPtrs = 10000; + for (scudo::uptr i = 1; i < 32; ++i) { scudo::uptr Size = 16 * i - 1; std::vector<void *> Ptrs; - for (int i = 0; i < nPtrs; ++i) { + for (size_t i = 0; i < nPtrs; ++i) { void *P = Allocator->allocate(Size, Origin); P = Allocator->reallocate(P, Size + 1); Ptrs.push_back(P); } - for (int i = 0; i < nPtrs; ++i) + for (size_t i = 0; i < nPtrs; ++i) Allocator->deallocate(Ptrs[i], Origin); } } +SCUDO_TYPED_TEST(ScudoCombinedTest, RingBufferDefaultDisabled) { + // The RingBuffer is not initialized until tracking is enabled for the + // first time. + auto *Allocator = this->Allocator.get(); + EXPECT_EQ(0u, Allocator->getRingBufferSize()); + EXPECT_EQ(nullptr, Allocator->getRingBufferAddress()); +} + +SCUDO_TYPED_TEST(ScudoCombinedTest, RingBufferInitOnce) { + auto *Allocator = this->Allocator.get(); + Allocator->setTrackAllocationStacks(true); + + auto RingBufferSize = Allocator->getRingBufferSize(); + ASSERT_GT(RingBufferSize, 0u); + auto *RingBufferAddress = Allocator->getRingBufferAddress(); + EXPECT_NE(nullptr, RingBufferAddress); + + // Enable tracking again to verify that the initialization only happens once. + Allocator->setTrackAllocationStacks(true); + ASSERT_EQ(RingBufferSize, Allocator->getRingBufferSize()); + EXPECT_EQ(RingBufferAddress, Allocator->getRingBufferAddress()); +} + SCUDO_TYPED_TEST(ScudoCombinedTest, RingBufferSize) { auto *Allocator = this->Allocator.get(); - auto Size = Allocator->getRingBufferSize(); - if (Size > 0) - EXPECT_EQ(Allocator->getRingBufferAddress()[Size - 1], '\0'); + Allocator->setTrackAllocationStacks(true); + + auto RingBufferSize = Allocator->getRingBufferSize(); + ASSERT_GT(RingBufferSize, 0u); + EXPECT_EQ(Allocator->getRingBufferAddress()[RingBufferSize - 1], '\0'); } SCUDO_TYPED_TEST(ScudoCombinedTest, RingBufferAddress) { auto *Allocator = this->Allocator.get(); - auto *Addr = Allocator->getRingBufferAddress(); - EXPECT_NE(Addr, nullptr); - EXPECT_EQ(Addr, Allocator->getRingBufferAddress()); + Allocator->setTrackAllocationStacks(true); + + auto *RingBufferAddress = Allocator->getRingBufferAddress(); + EXPECT_NE(RingBufferAddress, nullptr); + EXPECT_EQ(RingBufferAddress, Allocator->getRingBufferAddress()); +} + +SCUDO_TYPED_TEST(ScudoCombinedTest, StackDepotDefaultDisabled) { + // The StackDepot is not initialized until tracking is enabled for the + // first time. 
+ auto *Allocator = this->Allocator.get(); + EXPECT_EQ(0u, Allocator->getStackDepotSize()); + EXPECT_EQ(nullptr, Allocator->getStackDepotAddress()); +} + +SCUDO_TYPED_TEST(ScudoCombinedTest, StackDepotInitOnce) { + auto *Allocator = this->Allocator.get(); + Allocator->setTrackAllocationStacks(true); + + auto StackDepotSize = Allocator->getStackDepotSize(); + EXPECT_GT(StackDepotSize, 0u); + auto *StackDepotAddress = Allocator->getStackDepotAddress(); + EXPECT_NE(nullptr, StackDepotAddress); + + // Enable tracking again to verify that the initialization only happens once. + Allocator->setTrackAllocationStacks(true); + EXPECT_EQ(StackDepotSize, Allocator->getStackDepotSize()); + EXPECT_EQ(StackDepotAddress, Allocator->getStackDepotAddress()); +} + +SCUDO_TYPED_TEST(ScudoCombinedTest, StackDepotSize) { + auto *Allocator = this->Allocator.get(); + Allocator->setTrackAllocationStacks(true); + + auto StackDepotSize = Allocator->getStackDepotSize(); + EXPECT_GT(StackDepotSize, 0u); + EXPECT_EQ(Allocator->getStackDepotAddress()[StackDepotSize - 1], '\0'); +} + +SCUDO_TYPED_TEST(ScudoCombinedTest, StackDepotAddress) { + auto *Allocator = this->Allocator.get(); + Allocator->setTrackAllocationStacks(true); + + auto *StackDepotAddress = Allocator->getStackDepotAddress(); + EXPECT_NE(StackDepotAddress, nullptr); + EXPECT_EQ(StackDepotAddress, Allocator->getStackDepotAddress()); +} + +SCUDO_TYPED_TEST(ScudoCombinedTest, StackDepot) { + alignas(scudo::StackDepot) char Buf[sizeof(scudo::StackDepot) + + 1024 * sizeof(scudo::atomic_u64) + + 1024 * sizeof(scudo::atomic_u32)] = {}; + auto *Depot = reinterpret_cast<scudo::StackDepot *>(Buf); + Depot->init(1024, 1024); + ASSERT_TRUE(Depot->isValid(sizeof(Buf))); + ASSERT_FALSE(Depot->isValid(sizeof(Buf) - 1)); + scudo::uptr Stack[] = {1, 2, 3}; + scudo::u32 Elem = Depot->insert(&Stack[0], &Stack[3]); + scudo::uptr RingPosPtr = 0; + scudo::uptr SizePtr = 0; + ASSERT_TRUE(Depot->find(Elem, &RingPosPtr, &SizePtr)); + ASSERT_EQ(SizePtr, 3u); + EXPECT_EQ(Depot->at(RingPosPtr), 1u); + EXPECT_EQ(Depot->at(RingPosPtr + 1), 2u); + EXPECT_EQ(Depot->at(RingPosPtr + 2), 3u); } #if SCUDO_CAN_USE_PRIMARY64 @@ -737,49 +989,12 @@ TEST(ScudoCombinedTest, BasicTrustyConfig) { } bool UnlockRequired; - auto *TSD = Allocator->getTSDRegistry()->getTSDAndLock(&UnlockRequired); + typename AllocatorT::TSDRegistryT::ScopedTSD TSD( + *Allocator->getTSDRegistry()); TSD->getCache().drain(); - Allocator->releaseToOS(); + Allocator->releaseToOS(scudo::ReleaseToOS::Force); } #endif #endif - -#if SCUDO_LINUX - -SCUDO_TYPED_TEST(ScudoCombinedTest, SoftRssLimit) { - auto *Allocator = this->Allocator.get(); - Allocator->setRssLimitsTestOnly(1, 0, true); - - size_t Megabyte = 1024 * 1024; - size_t ChunkSize = 16; - size_t Error = 256; - - std::vector<void *> Ptrs; - for (size_t index = 0; index < Megabyte + Error; index += ChunkSize) { - void *Ptr = Allocator->allocate(ChunkSize, Origin); - Ptrs.push_back(Ptr); - } - - EXPECT_EQ(nullptr, Allocator->allocate(ChunkSize, Origin)); - - for (void *Ptr : Ptrs) - Allocator->deallocate(Ptr, Origin); -} - -SCUDO_TYPED_TEST(ScudoCombinedTest, HardRssLimit) { - auto *Allocator = this->Allocator.get(); - Allocator->setRssLimitsTestOnly(0, 1, false); - - size_t Megabyte = 1024 * 1024; - - EXPECT_DEATH( - { - disableDebuggerdMaybe(); - Allocator->allocate(Megabyte, Origin); - }, - ""); -} - -#endif diff --git a/standalone/tests/common_test.cpp b/standalone/tests/common_test.cpp index a322a01fb93..fff7c662a41 100644 --- a/standalone/tests/common_test.cpp +++ 
b/standalone/tests/common_test.cpp @@ -10,6 +10,7 @@ #include "tests/scudo_unit_test.h" #include "common.h" +#include "mem_map.h" #include <algorithm> #include <fstream> @@ -34,60 +35,41 @@ TEST(ScudoCommonTest, SKIP_ON_FUCHSIA(ResidentMemorySize)) { const uptr Size = 1ull << 30; const uptr Threshold = Size >> 3; - MapPlatformData Data = {}; - void *P = map(nullptr, Size, "ResidentMemorySize", 0, &Data); - ASSERT_NE(nullptr, P); + MemMapT MemMap; + ASSERT_TRUE(MemMap.map(/*Addr=*/0U, Size, "ResidentMemorySize")); + ASSERT_NE(MemMap.getBase(), 0U); + void *P = reinterpret_cast<void *>(MemMap.getBase()); EXPECT_LT(getResidentMemorySize(), OnStart + Threshold); memset(P, 1, Size); EXPECT_GT(getResidentMemorySize(), OnStart + Size - Threshold); - releasePagesToOS((uptr)P, 0, Size, &Data); + MemMap.releasePagesToOS(MemMap.getBase(), Size); EXPECT_LT(getResidentMemorySize(), OnStart + Threshold); memset(P, 1, Size); EXPECT_GT(getResidentMemorySize(), OnStart + Size - Threshold); - unmap(P, Size, 0, &Data); + MemMap.unmap(MemMap.getBase(), Size); } TEST(ScudoCommonTest, Zeros) { const uptr Size = 1ull << 20; - MapPlatformData Data = {}; - uptr *P = reinterpret_cast<uptr *>(map(nullptr, Size, "Zeros", 0, &Data)); - const ptrdiff_t N = Size / sizeof(*P); - ASSERT_NE(nullptr, P); + MemMapT MemMap; + ASSERT_TRUE(MemMap.map(/*Addr=*/0U, Size, "Zeros")); + ASSERT_NE(MemMap.getBase(), 0U); + uptr *P = reinterpret_cast<uptr *>(MemMap.getBase()); + const ptrdiff_t N = Size / sizeof(uptr); EXPECT_EQ(std::count(P, P + N, 0), N); memset(P, 1, Size); EXPECT_EQ(std::count(P, P + N, 0), 0); - releasePagesToOS((uptr)P, 0, Size, &Data); + MemMap.releasePagesToOS(MemMap.getBase(), Size); EXPECT_EQ(std::count(P, P + N, 0), N); - unmap(P, Size, 0, &Data); + MemMap.unmap(MemMap.getBase(), Size); } -#if SCUDO_LINUX && !defined(__powerpc__) -// This test fails intermediately on PPC, which is why this test is disabled -// for now on this platform. -TEST(ScudoCommonTest, GetRssFromBuffer) { - constexpr int64_t AllocSize = 10000000; - constexpr int64_t Error = 3000000; - constexpr size_t Runs = 10; - - int64_t Rss = scudo::GetRSS(); - EXPECT_GT(Rss, 0); - - std::vector<std::unique_ptr<char[]>> Allocs(Runs); - for (auto &Alloc : Allocs) { - Alloc.reset(new char[AllocSize]()); - int64_t Prev = Rss; - Rss = scudo::GetRSS(); - EXPECT_LE(std::abs(Rss - AllocSize - Prev), Error); - } -} -#endif // SCUDO_LINUX - } // namespace scudo diff --git a/standalone/tests/condition_variable_test.cpp b/standalone/tests/condition_variable_test.cpp new file mode 100644 index 00000000000..caba1f64ab0 --- /dev/null +++ b/standalone/tests/condition_variable_test.cpp @@ -0,0 +1,59 @@ +//===-- condition_variable_test.cpp -----------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "tests/scudo_unit_test.h" + +#include "common.h" +#include "condition_variable.h" +#include "mutex.h" + +#include <thread> + +template <typename ConditionVariableT> void simpleWaitAndNotifyAll() { + constexpr scudo::u32 NumThreads = 2; + constexpr scudo::u32 CounterMax = 1024; + std::thread Threads[NumThreads]; + + scudo::HybridMutex M; + ConditionVariableT CV; + CV.bindTestOnly(M); + scudo::u32 Counter = 0; + + for (scudo::u32 I = 0; I < NumThreads; ++I) { + Threads[I] = std::thread( + [&](scudo::u32 Id) { + do { + scudo::ScopedLock L(M); + if (Counter % NumThreads != Id && Counter < CounterMax) + CV.wait(M); + if (Counter >= CounterMax) { + break; + } else { + ++Counter; + CV.notifyAll(M); + } + } while (true); + }, + I); + } + + for (std::thread &T : Threads) + T.join(); + + EXPECT_EQ(Counter, CounterMax); +} + +TEST(ScudoConditionVariableTest, DummyCVWaitAndNotifyAll) { + simpleWaitAndNotifyAll<scudo::ConditionVariableDummy>(); +} + +#ifdef SCUDO_LINUX +TEST(ScudoConditionVariableTest, LinuxCVWaitAndNotifyAll) { + simpleWaitAndNotifyAll<scudo::ConditionVariableLinux>(); +} +#endif diff --git a/standalone/tests/map_test.cpp b/standalone/tests/map_test.cpp index ff05258db58..06a56f84803 100644 --- a/standalone/tests/map_test.cpp +++ b/standalone/tests/map_test.cpp @@ -9,6 +9,7 @@ #include "tests/scudo_unit_test.h" #include "common.h" +#include "mem_map.h" #include <string.h> #include <unistd.h> @@ -22,11 +23,15 @@ TEST(ScudoMapTest, PageSize) { TEST(ScudoMapDeathTest, MapNoAccessUnmap) { const scudo::uptr Size = 4 * scudo::getPageSizeCached(); - scudo::MapPlatformData Data = {}; - void *P = scudo::map(nullptr, Size, MappingName, MAP_NOACCESS, &Data); - EXPECT_NE(P, nullptr); - EXPECT_DEATH(memset(P, 0xaa, Size), ""); - scudo::unmap(P, Size, UNMAP_ALL, &Data); + scudo::ReservedMemoryT ReservedMemory; + + ASSERT_TRUE(ReservedMemory.create(/*Addr=*/0U, Size, MappingName)); + EXPECT_NE(ReservedMemory.getBase(), 0U); + EXPECT_DEATH( + memset(reinterpret_cast<void *>(ReservedMemory.getBase()), 0xaa, Size), + ""); + + ReservedMemory.release(); } TEST(ScudoMapDeathTest, MapUnmap) { @@ -36,11 +41,13 @@ TEST(ScudoMapDeathTest, MapUnmap) { // Repeat few time to avoid missing crash if it's mmaped by unrelated // code. 
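The new test above exercises the condition-variable API introduced by this change (bindTestOnly, wait, notifyAll under a scudo::HybridMutex). The following is a minimal sketch of the same protocol outside the test harness, assuming only those calls; waitNotifySketch and the Done flag are illustrative names, not part of the change.

#include "condition_variable.h"
#include "mutex.h"

#include <thread>

// One thread publishes a flag under the mutex, the other waits for it.
static void waitNotifySketch() {
  scudo::HybridMutex M;
  scudo::ConditionVariableDummy CV;
  CV.bindTestOnly(M);
  bool Done = false;

  std::thread Producer([&]() {
    scudo::ScopedLock L(M);
    Done = true;
    CV.notifyAll(M); // As in the test, notifyAll() is called with M held.
  });

  {
    scudo::ScopedLock L(M);
    while (!Done)
      CV.wait(M); // Releases M while blocked, reacquires before returning.
  }
  Producer.join();
}

As in the test, wait(M) has to release the mutex while blocked; otherwise the notifying thread could never acquire it to make progress.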
for (int i = 0; i < 10; ++i) { - void *P = scudo::map(nullptr, Size, MappingName, 0, nullptr); - if (!P) + scudo::MemMapT MemMap; + MemMap.map(/*Addr=*/0U, Size, MappingName); + scudo::uptr P = MemMap.getBase(); + if (P == 0U) continue; - scudo::unmap(P, Size, 0, nullptr); - memset(P, 0xbb, Size); + MemMap.unmap(MemMap.getBase(), Size); + memset(reinterpret_cast<void *>(P), 0xbb, Size); } }, ""); @@ -49,30 +56,36 @@ TEST(ScudoMapDeathTest, MapUnmap) { TEST(ScudoMapDeathTest, MapWithGuardUnmap) { const scudo::uptr PageSize = scudo::getPageSizeCached(); const scudo::uptr Size = 4 * PageSize; - scudo::MapPlatformData Data = {}; - void *P = scudo::map(nullptr, Size + 2 * PageSize, MappingName, MAP_NOACCESS, - &Data); - EXPECT_NE(P, nullptr); - void *Q = - reinterpret_cast<void *>(reinterpret_cast<scudo::uptr>(P) + PageSize); - EXPECT_EQ(scudo::map(Q, Size, MappingName, 0, &Data), Q); - memset(Q, 0xaa, Size); - EXPECT_DEATH(memset(Q, 0xaa, Size + 1), ""); - scudo::unmap(P, Size + 2 * PageSize, UNMAP_ALL, &Data); + scudo::ReservedMemoryT ReservedMemory; + ASSERT_TRUE( + ReservedMemory.create(/*Addr=*/0U, Size + 2 * PageSize, MappingName)); + ASSERT_NE(ReservedMemory.getBase(), 0U); + + scudo::MemMapT MemMap = + ReservedMemory.dispatch(ReservedMemory.getBase(), Size + 2 * PageSize); + ASSERT_TRUE(MemMap.isAllocated()); + scudo::uptr Q = MemMap.getBase() + PageSize; + ASSERT_TRUE(MemMap.remap(Q, Size, MappingName)); + memset(reinterpret_cast<void *>(Q), 0xaa, Size); + EXPECT_DEATH(memset(reinterpret_cast<void *>(Q), 0xaa, Size + 1), ""); + MemMap.unmap(MemMap.getBase(), MemMap.getCapacity()); } TEST(ScudoMapTest, MapGrowUnmap) { const scudo::uptr PageSize = scudo::getPageSizeCached(); const scudo::uptr Size = 4 * PageSize; - scudo::MapPlatformData Data = {}; - void *P = scudo::map(nullptr, Size, MappingName, MAP_NOACCESS, &Data); - EXPECT_NE(P, nullptr); - void *Q = - reinterpret_cast<void *>(reinterpret_cast<scudo::uptr>(P) + PageSize); - EXPECT_EQ(scudo::map(Q, PageSize, MappingName, 0, &Data), Q); - memset(Q, 0xaa, PageSize); - Q = reinterpret_cast<void *>(reinterpret_cast<scudo::uptr>(Q) + PageSize); - EXPECT_EQ(scudo::map(Q, PageSize, MappingName, 0, &Data), Q); - memset(Q, 0xbb, PageSize); - scudo::unmap(P, Size, UNMAP_ALL, &Data); + scudo::ReservedMemoryT ReservedMemory; + ReservedMemory.create(/*Addr=*/0U, Size, MappingName); + ASSERT_TRUE(ReservedMemory.isCreated()); + + scudo::MemMapT MemMap = + ReservedMemory.dispatch(ReservedMemory.getBase(), Size); + ASSERT_TRUE(MemMap.isAllocated()); + scudo::uptr Q = MemMap.getBase() + PageSize; + ASSERT_TRUE(MemMap.remap(Q, PageSize, MappingName)); + memset(reinterpret_cast<void *>(Q), 0xaa, PageSize); + Q += PageSize; + ASSERT_TRUE(MemMap.remap(Q, PageSize, MappingName)); + memset(reinterpret_cast<void *>(Q), 0xbb, PageSize); + MemMap.unmap(MemMap.getBase(), MemMap.getCapacity()); } diff --git a/standalone/tests/memtag_test.cpp b/standalone/tests/memtag_test.cpp index 8a40eda3a57..37a18858e67 100644 --- a/standalone/tests/memtag_test.cpp +++ b/standalone/tests/memtag_test.cpp @@ -7,16 +7,22 @@ //===----------------------------------------------------------------------===// #include "common.h" +#include "mem_map.h" #include "memtag.h" #include "platform.h" #include "tests/scudo_unit_test.h" +extern "C" void __hwasan_init() __attribute__((weak)); + #if SCUDO_LINUX namespace scudo { TEST(MemtagBasicDeathTest, Unsupported) { if (archSupportsMemoryTagging()) GTEST_SKIP(); + // Skip when running with HWASan. 
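The map_test conversions above all follow the MemMapT lifecycle that replaces the old map()/unmap() free functions. Below is a condensed sketch of that lifecycle using only the calls visible in these tests (map, getBase, releasePagesToOS, unmap, getCapacity); the function name and mapping name are illustrative.

#include "common.h"
#include "mem_map.h"

#include <string.h>

// map -> use -> releasePagesToOS -> unmap, mirroring the converted tests.
static void memMapLifecycleSketch() {
  const scudo::uptr Size = 4 * scudo::getPageSizeCached();
  scudo::MemMapT MemMap;
  if (!MemMap.map(/*Addr=*/0U, Size, "scudo:sketch"))
    return; // The tests above check the return value (or the base) as well.
  memset(reinterpret_cast<void *>(MemMap.getBase()), 0xaa, Size);
  // Return the physical pages to the OS while keeping the reservation.
  MemMap.releasePagesToOS(MemMap.getBase(), Size);
  // Tear down the whole mapping.
  MemMap.unmap(MemMap.getBase(), MemMap.getCapacity());
}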
+ if (&__hwasan_init != 0) + GTEST_SKIP(); EXPECT_DEATH(archMemoryTagGranuleSize(), "not supported"); EXPECT_DEATH(untagPointer((uptr)0), "not supported"); @@ -45,20 +51,24 @@ protected: GTEST_SKIP() << "Memory tagging is not supported"; BufferSize = getPageSizeCached(); - Buffer = reinterpret_cast<u8 *>( - map(nullptr, BufferSize, "MemtagTest", MAP_MEMTAG, &Data)); - Addr = reinterpret_cast<uptr>(Buffer); + ASSERT_FALSE(MemMap.isAllocated()); + ASSERT_TRUE(MemMap.map(/*Addr=*/0U, BufferSize, "MemtagTest", MAP_MEMTAG)); + ASSERT_NE(MemMap.getBase(), 0U); + Addr = MemMap.getBase(); + Buffer = reinterpret_cast<u8 *>(Addr); EXPECT_TRUE(isAligned(Addr, archMemoryTagGranuleSize())); EXPECT_EQ(Addr, untagPointer(Addr)); } void TearDown() override { - if (Buffer) - unmap(Buffer, BufferSize, 0, &Data); + if (Buffer) { + ASSERT_TRUE(MemMap.isAllocated()); + MemMap.unmap(MemMap.getBase(), MemMap.getCapacity()); + } } uptr BufferSize = 0; - MapPlatformData Data = {}; + scudo::MemMapT MemMap = {}; u8 *Buffer = nullptr; uptr Addr = 0; }; @@ -71,20 +81,24 @@ TEST_F(MemtagTest, ArchMemoryTagGranuleSize) { } TEST_F(MemtagTest, ExtractTag) { +// The test is already skipped on anything other than 64 bit. But +// compiling on 32 bit leads to warnings/errors, so skip compiling the test. +#if defined(__LP64__) uptr Tags = 0; // Try all value for the top byte and check the tags values are in the // expected range. for (u64 Top = 0; Top < 0x100; ++Top) Tags = Tags | (1u << extractTag(Addr | (Top << 56))); EXPECT_EQ(0xffffull, Tags); +#endif } TEST_F(MemtagDeathTest, AddFixedTag) { for (uptr Tag = 0; Tag < 0x10; ++Tag) EXPECT_EQ(Tag, extractTag(addFixedTag(Addr, Tag))); if (SCUDO_DEBUG) { - EXPECT_DEBUG_DEATH(addFixedTag(Addr, 16), ""); - EXPECT_DEBUG_DEATH(addFixedTag(~Addr, 0), ""); + EXPECT_DEATH(addFixedTag(Addr, 16), ""); + EXPECT_DEATH(addFixedTag(~Addr, 0), ""); } } @@ -111,23 +125,35 @@ TEST_F(MemtagTest, SelectRandomTag) { uptr Tags = 0; for (uptr I = 0; I < 100000; ++I) Tags = Tags | (1u << extractTag(selectRandomTag(Ptr, 0))); - EXPECT_EQ(0xfffeull, Tags); + // std::popcnt is C++20 + int PopCnt = 0; + while (Tags) { + PopCnt += Tags & 1; + Tags >>= 1; + } + // Random tags are not always very random, and this test is not about PRNG + // quality. Anything above half would be satisfactory. + EXPECT_GE(PopCnt, 8); } } TEST_F(MemtagTest, SelectRandomTagWithMask) { +// The test is already skipped on anything other than 64 bit. But +// compiling on 32 bit leads to warnings/errors, so skip compiling the test. +#if defined(__LP64__) for (uptr j = 0; j < 32; ++j) { for (uptr i = 0; i < 1000; ++i) EXPECT_NE(j, extractTag(selectRandomTag(Addr, 1ull << j))); } +#endif } TEST_F(MemtagDeathTest, SKIP_NO_DEBUG(LoadStoreTagUnaligned)) { for (uptr P = Addr; P < Addr + 4 * archMemoryTagGranuleSize(); ++P) { if (P % archMemoryTagGranuleSize() == 0) continue; - EXPECT_DEBUG_DEATH(loadTag(P), ""); - EXPECT_DEBUG_DEATH(storeTag(P), ""); + EXPECT_DEATH(loadTag(P), ""); + EXPECT_DEATH(storeTag(P), ""); } } @@ -148,11 +174,14 @@ TEST_F(MemtagDeathTest, SKIP_NO_DEBUG(StoreTagsUnaligned)) { uptr Tagged = addFixedTag(P, 5); if (Tagged % archMemoryTagGranuleSize() == 0) continue; - EXPECT_DEBUG_DEATH(storeTags(Tagged, Tagged), ""); + EXPECT_DEATH(storeTags(Tagged, Tagged), ""); } } TEST_F(MemtagTest, StoreTags) { +// The test is already skipped on anything other than 64 bit. But +// compiling on 32 bit leads to warnings/errors, so skip compiling the test. 
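The SelectRandomTag check above counts set bits with a hand-written loop because std::popcount is only available in C++20. The same loop as a standalone helper, shown only for clarity:

#include <cstdint>

// Counts set bits the same way the test does, one bit per iteration.
static int countSetBits(std::uint64_t Bits) {
  int Count = 0;
  while (Bits) {
    Count += static_cast<int>(Bits & 1);
    Bits >>= 1;
  }
  return Count;
}

An equivalent that iterates once per set bit is Bits &= Bits - 1 (Kernighan's method); for a 16-bit tag mask either variant is cheap enough.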
+#if defined(__LP64__) const uptr MaxTaggedSize = 4 * archMemoryTagGranuleSize(); for (uptr Size = 0; Size <= MaxTaggedSize; ++Size) { uptr NoTagBegin = Addr + archMemoryTagGranuleSize(); @@ -179,8 +208,9 @@ TEST_F(MemtagTest, StoreTags) { EXPECT_EQ(LoadPtr, loadTag(LoadPtr)); // Reset tags without using StoreTags. - releasePagesToOS(Addr, 0, BufferSize, &Data); + MemMap.releasePagesToOS(Addr, BufferSize); } +#endif } } // namespace scudo diff --git a/standalone/tests/primary_test.cpp b/standalone/tests/primary_test.cpp index c7ebcc3f82f..1cf3bb51db0 100644 --- a/standalone/tests/primary_test.cpp +++ b/standalone/tests/primary_test.cpp @@ -8,6 +8,9 @@ #include "tests/scudo_unit_test.h" +#include "allocator_config.h" +#include "allocator_config_wrapper.h" +#include "condition_variable.h" #include "primary32.h" #include "primary64.h" #include "size_class_map.h" @@ -25,84 +28,146 @@ // 32-bit architectures. It's not something we want to encourage, but we still // should ensure the tests pass. -struct TestConfig1 { - static const scudo::uptr PrimaryRegionSizeLog = 18U; - static const scudo::uptr PrimaryGroupSizeLog = 18U; - static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN; - static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX; +template <typename SizeClassMapT> struct TestConfig1 { static const bool MaySupportMemoryTagging = false; - typedef scudo::uptr PrimaryCompactPtrT; - static const scudo::uptr PrimaryCompactPtrScale = 0; - static const bool PrimaryEnableRandomOffset = true; - static const scudo::uptr PrimaryMapSizeIncrement = 1UL << 18; + template <typename> using TSDRegistryT = void; + template <typename> using PrimaryT = void; + template <typename> using SecondaryT = void; + + struct Primary { + using SizeClassMap = SizeClassMapT; + static const scudo::uptr RegionSizeLog = 18U; + static const scudo::uptr GroupSizeLog = 18U; + static const scudo::s32 MinReleaseToOsIntervalMs = INT32_MIN; + static const scudo::s32 MaxReleaseToOsIntervalMs = INT32_MAX; + typedef scudo::uptr CompactPtrT; + static const scudo::uptr CompactPtrScale = 0; + static const bool EnableRandomOffset = true; + static const scudo::uptr MapSizeIncrement = 1UL << 18; + }; }; -struct TestConfig2 { +template <typename SizeClassMapT> struct TestConfig2 { + static const bool MaySupportMemoryTagging = false; + template <typename> using TSDRegistryT = void; + template <typename> using PrimaryT = void; + template <typename> using SecondaryT = void; + + struct Primary { + using SizeClassMap = SizeClassMapT; #if defined(__mips__) - // Unable to allocate greater size on QEMU-user. - static const scudo::uptr PrimaryRegionSizeLog = 23U; + // Unable to allocate greater size on QEMU-user. 
+ static const scudo::uptr RegionSizeLog = 23U; #else - static const scudo::uptr PrimaryRegionSizeLog = 24U; + static const scudo::uptr RegionSizeLog = 24U; #endif - static const scudo::uptr PrimaryGroupSizeLog = 20U; - static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN; - static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX; - static const bool MaySupportMemoryTagging = false; - typedef scudo::uptr PrimaryCompactPtrT; - static const scudo::uptr PrimaryCompactPtrScale = 0; - static const bool PrimaryEnableRandomOffset = true; - static const scudo::uptr PrimaryMapSizeIncrement = 1UL << 18; + static const scudo::uptr GroupSizeLog = 20U; + static const scudo::s32 MinReleaseToOsIntervalMs = INT32_MIN; + static const scudo::s32 MaxReleaseToOsIntervalMs = INT32_MAX; + typedef scudo::uptr CompactPtrT; + static const scudo::uptr CompactPtrScale = 0; + static const bool EnableRandomOffset = true; + static const scudo::uptr MapSizeIncrement = 1UL << 18; + }; }; -struct TestConfig3 { +template <typename SizeClassMapT> struct TestConfig3 { + static const bool MaySupportMemoryTagging = true; + template <typename> using TSDRegistryT = void; + template <typename> using PrimaryT = void; + template <typename> using SecondaryT = void; + + struct Primary { + using SizeClassMap = SizeClassMapT; #if defined(__mips__) - // Unable to allocate greater size on QEMU-user. - static const scudo::uptr PrimaryRegionSizeLog = 23U; + // Unable to allocate greater size on QEMU-user. + static const scudo::uptr RegionSizeLog = 23U; #else - static const scudo::uptr PrimaryRegionSizeLog = 24U; + static const scudo::uptr RegionSizeLog = 24U; #endif - static const scudo::uptr PrimaryGroupSizeLog = 20U; - static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN; - static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX; - static const bool MaySupportMemoryTagging = true; - typedef scudo::uptr PrimaryCompactPtrT; - static const scudo::uptr PrimaryCompactPtrScale = 0; - static const bool PrimaryEnableRandomOffset = true; - static const scudo::uptr PrimaryMapSizeIncrement = 1UL << 18; + static const scudo::uptr GroupSizeLog = 20U; + static const scudo::s32 MinReleaseToOsIntervalMs = INT32_MIN; + static const scudo::s32 MaxReleaseToOsIntervalMs = INT32_MAX; + typedef scudo::uptr CompactPtrT; + static const scudo::uptr CompactPtrScale = 0; + static const bool EnableContiguousRegions = false; + static const bool EnableRandomOffset = true; + static const scudo::uptr MapSizeIncrement = 1UL << 18; + }; }; -struct TestConfig4 { +template <typename SizeClassMapT> struct TestConfig4 { + static const bool MaySupportMemoryTagging = true; + template <typename> using TSDRegistryT = void; + template <typename> using PrimaryT = void; + template <typename> using SecondaryT = void; + + struct Primary { + using SizeClassMap = SizeClassMapT; #if defined(__mips__) - // Unable to allocate greater size on QEMU-user. - static const scudo::uptr PrimaryRegionSizeLog = 23U; + // Unable to allocate greater size on QEMU-user. 
+ static const scudo::uptr RegionSizeLog = 23U; #else - static const scudo::uptr PrimaryRegionSizeLog = 24U; + static const scudo::uptr RegionSizeLog = 24U; #endif - static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN; - static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX; - static const bool MaySupportMemoryTagging = true; - static const scudo::uptr PrimaryCompactPtrScale = 3U; - static const scudo::uptr PrimaryGroupSizeLog = 20U; - typedef scudo::u32 PrimaryCompactPtrT; - static const bool PrimaryEnableRandomOffset = true; - static const scudo::uptr PrimaryMapSizeIncrement = 1UL << 18; + static const scudo::s32 MinReleaseToOsIntervalMs = INT32_MIN; + static const scudo::s32 MaxReleaseToOsIntervalMs = INT32_MAX; + static const scudo::uptr CompactPtrScale = 3U; + static const scudo::uptr GroupSizeLog = 20U; + typedef scudo::u32 CompactPtrT; + static const bool EnableRandomOffset = true; + static const scudo::uptr MapSizeIncrement = 1UL << 18; + }; }; -template <typename BaseConfig, typename SizeClassMapT> -struct Config : public BaseConfig { - using SizeClassMap = SizeClassMapT; +// This is the only test config that enables the condition variable. +template <typename SizeClassMapT> struct TestConfig5 { + static const bool MaySupportMemoryTagging = true; + template <typename> using TSDRegistryT = void; + template <typename> using PrimaryT = void; + template <typename> using SecondaryT = void; + + struct Primary { + using SizeClassMap = SizeClassMapT; +#if defined(__mips__) + // Unable to allocate greater size on QEMU-user. + static const scudo::uptr RegionSizeLog = 23U; +#else + static const scudo::uptr RegionSizeLog = 24U; +#endif + static const scudo::s32 MinReleaseToOsIntervalMs = INT32_MIN; + static const scudo::s32 MaxReleaseToOsIntervalMs = INT32_MAX; + static const scudo::uptr CompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG; + static const scudo::uptr GroupSizeLog = 18U; + typedef scudo::u32 CompactPtrT; + static const bool EnableRandomOffset = true; + static const scudo::uptr MapSizeIncrement = 1UL << 18; +#if SCUDO_LINUX + using ConditionVariableT = scudo::ConditionVariableLinux; +#else + using ConditionVariableT = scudo::ConditionVariableDummy; +#endif + }; }; -template <typename BaseConfig, typename SizeClassMapT> +template <template <typename> class BaseConfig, typename SizeClassMapT> +struct Config : public BaseConfig<SizeClassMapT> {}; + +template <template <typename> class BaseConfig, typename SizeClassMapT> struct SizeClassAllocator - : public scudo::SizeClassAllocator64<Config<BaseConfig, SizeClassMapT>> {}; + : public scudo::SizeClassAllocator64< + scudo::PrimaryConfig<Config<BaseConfig, SizeClassMapT>>> {}; template <typename SizeClassMapT> struct SizeClassAllocator<TestConfig1, SizeClassMapT> - : public scudo::SizeClassAllocator32<Config<TestConfig1, SizeClassMapT>> {}; + : public scudo::SizeClassAllocator32< + scudo::PrimaryConfig<Config<TestConfig1, SizeClassMapT>>> {}; -template <typename BaseConfig, typename SizeClassMapT> +template <template <typename> class BaseConfig, typename SizeClassMapT> struct TestAllocator : public SizeClassAllocator<BaseConfig, SizeClassMapT> { - ~TestAllocator() { this->unmapTestOnly(); } + ~TestAllocator() { + this->verifyAllBlocksAreReleasedTestOnly(); + this->unmapTestOnly(); + } void *operator new(size_t size) { void *p = nullptr; @@ -113,7 +178,8 @@ struct TestAllocator : public SizeClassAllocator<BaseConfig, SizeClassMapT> { void operator delete(void *ptr) { free(ptr); } }; -template <class BaseConfig> 
struct ScudoPrimaryTest : public Test {}; +template <template <typename> class BaseConfig> +struct ScudoPrimaryTest : public Test {}; #if SCUDO_FUCHSIA #define SCUDO_TYPED_TEST_ALL_TYPES(FIXTURE, NAME) \ @@ -124,7 +190,8 @@ template <class BaseConfig> struct ScudoPrimaryTest : public Test {}; SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig1) \ SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig2) \ SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig3) \ - SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig4) + SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig4) \ + SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig5) #endif #define SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TYPE) \ @@ -132,12 +199,13 @@ template <class BaseConfig> struct ScudoPrimaryTest : public Test {}; TEST_F(FIXTURE##NAME##_##TYPE, NAME) { FIXTURE##NAME<TYPE>::Run(); } #define SCUDO_TYPED_TEST(FIXTURE, NAME) \ - template <class TypeParam> \ + template <template <typename> class TypeParam> \ struct FIXTURE##NAME : public FIXTURE<TypeParam> { \ void Run(); \ }; \ SCUDO_TYPED_TEST_ALL_TYPES(FIXTURE, NAME) \ - template <class TypeParam> void FIXTURE##NAME<TypeParam>::Run() + template <template <typename> class TypeParam> \ + void FIXTURE##NAME<TypeParam>::Run() SCUDO_TYPED_TEST(ScudoPrimaryTest, BasicPrimary) { using Primary = TestAllocator<TypeParam, scudo::DefaultSizeClassMap>; @@ -161,30 +229,36 @@ SCUDO_TYPED_TEST(ScudoPrimaryTest, BasicPrimary) { Cache.deallocate(ClassId, Pointers[J]); } Cache.destroy(nullptr); - Allocator->releaseToOS(); + Allocator->releaseToOS(scudo::ReleaseToOS::Force); scudo::ScopedString Str; Allocator->getStats(&Str); Str.output(); } struct SmallRegionsConfig { - using SizeClassMap = scudo::DefaultSizeClassMap; - static const scudo::uptr PrimaryRegionSizeLog = 21U; - static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN; - static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX; static const bool MaySupportMemoryTagging = false; - typedef scudo::uptr PrimaryCompactPtrT; - static const scudo::uptr PrimaryCompactPtrScale = 0; - static const bool PrimaryEnableRandomOffset = true; - static const scudo::uptr PrimaryMapSizeIncrement = 1UL << 18; - static const scudo::uptr PrimaryGroupSizeLog = 20U; + template <typename> using TSDRegistryT = void; + template <typename> using PrimaryT = void; + template <typename> using SecondaryT = void; + + struct Primary { + using SizeClassMap = scudo::DefaultSizeClassMap; + static const scudo::uptr RegionSizeLog = 21U; + static const scudo::s32 MinReleaseToOsIntervalMs = INT32_MIN; + static const scudo::s32 MaxReleaseToOsIntervalMs = INT32_MAX; + typedef scudo::uptr CompactPtrT; + static const scudo::uptr CompactPtrScale = 0; + static const bool EnableRandomOffset = true; + static const scudo::uptr MapSizeIncrement = 1UL << 18; + static const scudo::uptr GroupSizeLog = 20U; + }; }; // The 64-bit SizeClassAllocator can be easily OOM'd with small region sizes. // For the 32-bit one, it requires actually exhausting memory, so we skip it. 
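Because each TestConfigN is now itself a class template over the size class map, the typed-test fixture and the SCUDO_TYPED_TEST macros take a template template parameter. A stripped-down illustration of the same idiom; ExampleConfig, ExampleFixture and ConfigT are hypothetical names used only to show the shape of the pattern.

#include "size_class_map.h"

// The config is a template over the size class map, as TestConfig1..5 are.
template <typename SizeClassMapT> struct ExampleConfig {
  using SizeClassMap = SizeClassMapT;
};

// The fixture is parameterized by the config *template*, not a concrete type,
// so each test can instantiate it with whichever size class map it needs.
template <template <typename> class BaseConfig> struct ExampleFixture {
  template <typename SizeClassMapT> using ConfigT = BaseConfig<SizeClassMapT>;
};

// Picking a concrete instantiation:
using ExampleDefaultConfig =
    ExampleFixture<ExampleConfig>::ConfigT<scudo::DefaultSizeClassMap>;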
TEST(ScudoPrimaryTest, Primary64OOM) { - using Primary = scudo::SizeClassAllocator64<SmallRegionsConfig>; - using TransferBatch = Primary::CacheT::TransferBatch; + using Primary = + scudo::SizeClassAllocator64<scudo::PrimaryConfig<SmallRegionsConfig>>; Primary Allocator; Allocator.init(/*ReleaseToOsInterval=*/-1); typename Primary::CacheT Cache; @@ -192,30 +266,28 @@ TEST(ScudoPrimaryTest, Primary64OOM) { Stats.init(); Cache.init(&Stats, &Allocator); bool AllocationFailed = false; - std::vector<TransferBatch *> Batches; + std::vector<void *> Blocks; const scudo::uptr ClassId = Primary::SizeClassMap::LargestClassId; const scudo::uptr Size = Primary::getSizeByClassId(ClassId); - typename Primary::CacheT::CompactPtrT Blocks[TransferBatch::MaxNumCached]; + const scudo::u16 MaxCachedBlockCount = Primary::CacheT::getMaxCached(Size); for (scudo::uptr I = 0; I < 10000U; I++) { - TransferBatch *B = Allocator.popBatch(&Cache, ClassId); - if (!B) { - AllocationFailed = true; - break; + for (scudo::uptr J = 0; J < MaxCachedBlockCount; ++J) { + void *Ptr = Cache.allocate(ClassId); + if (Ptr == nullptr) { + AllocationFailed = true; + break; + } + memset(Ptr, 'B', Size); + Blocks.push_back(Ptr); } - for (scudo::u16 J = 0; J < B->getCount(); J++) - memset(Allocator.decompactPtr(ClassId, B->get(J)), 'B', Size); - Batches.push_back(B); - } - while (!Batches.empty()) { - TransferBatch *B = Batches.back(); - Batches.pop_back(); - B->copyToArray(Blocks); - Allocator.pushBlocks(&Cache, ClassId, Blocks, B->getCount()); - Cache.deallocate(Primary::SizeClassMap::BatchClassId, B); } + + for (auto *Ptr : Blocks) + Cache.deallocate(ClassId, Ptr); + Cache.destroy(nullptr); - Allocator.releaseToOS(); + Allocator.releaseToOS(scudo::ReleaseToOS::Force); scudo::ScopedString Str; Allocator.getStats(&Str); Str.output(); @@ -231,7 +303,8 @@ SCUDO_TYPED_TEST(ScudoPrimaryTest, PrimaryIterate) { Cache.init(nullptr, Allocator.get()); std::vector<std::pair<scudo::uptr, void *>> V; for (scudo::uptr I = 0; I < 64U; I++) { - const scudo::uptr Size = std::rand() % Primary::SizeClassMap::MaxSize; + const scudo::uptr Size = + static_cast<scudo::uptr>(std::rand()) % Primary::SizeClassMap::MaxSize; const scudo::uptr ClassId = Primary::SizeClassMap::getClassIdBySize(Size); void *P = Cache.allocate(ClassId); V.push_back(std::make_pair(ClassId, P)); @@ -253,21 +326,21 @@ SCUDO_TYPED_TEST(ScudoPrimaryTest, PrimaryIterate) { V.pop_back(); } Cache.destroy(nullptr); - Allocator->releaseToOS(); + Allocator->releaseToOS(scudo::ReleaseToOS::Force); scudo::ScopedString Str; Allocator->getStats(&Str); Str.output(); } SCUDO_TYPED_TEST(ScudoPrimaryTest, PrimaryThreaded) { - using Primary = TestAllocator<TypeParam, scudo::SvelteSizeClassMap>; + using Primary = TestAllocator<TypeParam, scudo::Config::Primary::SizeClassMap>; std::unique_ptr<Primary> Allocator(new Primary); Allocator->init(/*ReleaseToOsInterval=*/-1); std::mutex Mutex; std::condition_variable Cv; bool Ready = false; std::thread Threads[32]; - for (scudo::uptr I = 0; I < ARRAY_SIZE(Threads); I++) + for (scudo::uptr I = 0; I < ARRAY_SIZE(Threads); I++) { Threads[I] = std::thread([&]() { static thread_local typename Primary::CacheT Cache; Cache.init(nullptr, Allocator.get()); @@ -278,21 +351,30 @@ SCUDO_TYPED_TEST(ScudoPrimaryTest, PrimaryThreaded) { Cv.wait(Lock); } for (scudo::uptr I = 0; I < 256U; I++) { - const scudo::uptr Size = - std::rand() % Primary::SizeClassMap::MaxSize / 4; + const scudo::uptr Size = static_cast<scudo::uptr>(std::rand()) % + Primary::SizeClassMap::MaxSize / 4; 
const scudo::uptr ClassId = Primary::SizeClassMap::getClassIdBySize(Size); void *P = Cache.allocate(ClassId); if (P) V.push_back(std::make_pair(ClassId, P)); } + + // Try to interleave pushBlocks(), popBlocks() and releaseToOS(). + Allocator->releaseToOS(scudo::ReleaseToOS::Force); + while (!V.empty()) { auto Pair = V.back(); Cache.deallocate(Pair.first, Pair.second); V.pop_back(); + // This increases the chance of having non-full TransferBatches and it + // will jump into the code path of merging TransferBatches. + if (std::rand() % 8 == 0) + Cache.drain(); } Cache.destroy(nullptr); }); + } { std::unique_lock<std::mutex> Lock(Mutex); Ready = true; @@ -300,9 +382,10 @@ SCUDO_TYPED_TEST(ScudoPrimaryTest, PrimaryThreaded) { } for (auto &T : Threads) T.join(); - Allocator->releaseToOS(); + Allocator->releaseToOS(scudo::ReleaseToOS::Force); scudo::ScopedString Str; Allocator->getStats(&Str); + Allocator->getFragmentationInfo(&Str); Str.output(); } @@ -322,7 +405,7 @@ SCUDO_TYPED_TEST(ScudoPrimaryTest, ReleaseToOS) { EXPECT_NE(P, nullptr); Cache.deallocate(ClassId, P); Cache.destroy(nullptr); - EXPECT_GT(Allocator->releaseToOS(), 0U); + EXPECT_GT(Allocator->releaseToOS(scudo::ReleaseToOS::ForceAll), 0U); } SCUDO_TYPED_TEST(ScudoPrimaryTest, MemoryGroup) { @@ -367,4 +450,10 @@ SCUDO_TYPED_TEST(ScudoPrimaryTest, MemoryGroup) { EXPECT_LE(*std::max_element(Blocks.begin(), Blocks.end()) - *std::min_element(Blocks.begin(), Blocks.end()), GroupSizeMem * 2); + + while (!Blocks.empty()) { + Cache.deallocate(ClassId, reinterpret_cast<void *>(Blocks.back())); + Blocks.pop_back(); + } + Cache.drain(); } diff --git a/standalone/tests/release_test.cpp b/standalone/tests/release_test.cpp index 2e738214700..14b398a91fc 100644 --- a/standalone/tests/release_test.cpp +++ b/standalone/tests/release_test.cpp @@ -22,15 +22,17 @@ TEST(ScudoReleaseTest, RegionPageMap) { for (scudo::uptr I = 0; I < SCUDO_WORDSIZE; I++) { // Various valid counter's max values packed into one word. scudo::RegionPageMap PageMap2N(1U, 1U, 1UL << I); - EXPECT_EQ(sizeof(scudo::uptr), PageMap2N.getBufferSize()); + ASSERT_TRUE(PageMap2N.isAllocated()); + EXPECT_EQ(1U, PageMap2N.getBufferNumElements()); // Check the "all bit set" values too. scudo::RegionPageMap PageMap2N1_1(1U, 1U, ~0UL >> I); - EXPECT_EQ(sizeof(scudo::uptr), PageMap2N1_1.getBufferSize()); + ASSERT_TRUE(PageMap2N1_1.isAllocated()); + EXPECT_EQ(1U, PageMap2N1_1.getBufferNumElements()); // Verify the packing ratio, the counter is Expected to be packed into the // closest power of 2 bits. scudo::RegionPageMap PageMap(1U, SCUDO_WORDSIZE, 1UL << I); - EXPECT_EQ(sizeof(scudo::uptr) * scudo::roundUpPowerOfTwo(I + 1), - PageMap.getBufferSize()); + ASSERT_TRUE(PageMap.isAllocated()); + EXPECT_EQ(scudo::roundUpPowerOfTwo(I + 1), PageMap.getBufferNumElements()); } // Go through 1, 2, 4, 8, .. {32,64} bits per counter. @@ -40,6 +42,7 @@ TEST(ScudoReleaseTest, RegionPageMap) { (scudo::getPageSizeCached() / 8) * (SCUDO_WORDSIZE >> I); scudo::RegionPageMap PageMap(1U, NumCounters, 1UL << ((1UL << I) - 1)); + ASSERT_TRUE(PageMap.isAllocated()); PageMap.inc(0U, 0U); for (scudo::uptr C = 1; C < NumCounters - 1; C++) { EXPECT_EQ(0UL, PageMap.get(0U, C)); @@ -130,8 +133,9 @@ TEST(ScudoReleaseTest, FreePagesRangeTracker) { // Strip trailing '.'-pages before comparing the results as they are not // going to be reported to range_recorder anyway. const char *LastX = strrchr(TestCase, 'x'); - std::string Expected(TestCase, - LastX == nullptr ? 
0 : (LastX - TestCase + 1)); + std::string Expected( + TestCase, + LastX == nullptr ? 0U : static_cast<size_t>(LastX - TestCase + 1)); EXPECT_STREQ(Expected.c_str(), Recorder.ReportedPages.c_str()); } } @@ -220,12 +224,12 @@ template <class SizeClassMap> void testReleaseFreeMemoryToOS() { auto SkipRegion = [](UNUSED scudo::uptr RegionIndex) { return false; }; auto DecompactPtr = [](scudo::uptr P) { return P; }; ReleasedPagesRecorder Recorder; - scudo::PageReleaseContext Context(BlockSize, - /*RegionSize=*/MaxBlocks * BlockSize, - /*NumberOfRegions=*/1U, + scudo::PageReleaseContext Context(BlockSize, /*NumberOfRegions=*/1U, /*ReleaseSize=*/MaxBlocks * BlockSize); ASSERT_FALSE(Context.hasBlockMarked()); - Context.markFreeBlocks(FreeList, DecompactPtr, Recorder.getBase()); + Context.markFreeBlocksInRegion(FreeList, DecompactPtr, Recorder.getBase(), + /*RegionIndex=*/0, MaxBlocks * BlockSize, + /*MayContainLastBlockInRegion=*/true); ASSERT_TRUE(Context.hasBlockMarked()); releaseFreeMemoryToOS(Context, Recorder, SkipRegion); scudo::RegionPageMap &PageMap = Context.PageMap; @@ -315,12 +319,13 @@ template <class SizeClassMap> void testPageMapMarkRange() { const scudo::uptr RoundedRegionSize = scudo::roundUp(RegionSize, PageSize); std::vector<scudo::uptr> Pages(RoundedRegionSize / PageSize, 0); - for (scudo::uptr Block = 0; Block + BlockSize <= RoundedRegionSize; - Block += BlockSize) { - for (scudo::uptr page = Block / PageSize; - page <= (Block + BlockSize - 1) / PageSize; ++page) { - ASSERT_LT(page, Pages.size()); - ++Pages[page]; + for (scudo::uptr Block = 0; Block < RoundedRegionSize; Block += BlockSize) { + for (scudo::uptr Page = Block / PageSize; + Page <= (Block + BlockSize - 1) / PageSize && + Page < RoundedRegionSize / PageSize; + ++Page) { + ASSERT_LT(Page, Pages.size()); + ++Pages[Page]; } } @@ -328,10 +333,10 @@ template <class SizeClassMap> void testPageMapMarkRange() { const scudo::uptr GroupBeg = GroupId * GroupSize; const scudo::uptr GroupEnd = GroupBeg + GroupSize; - scudo::PageReleaseContext Context(BlockSize, RegionSize, - /*NumberOfRegions=*/1U, + scudo::PageReleaseContext Context(BlockSize, /*NumberOfRegions=*/1U, /*ReleaseSize=*/RegionSize); - Context.markRangeAsAllCounted(GroupBeg, GroupEnd, /*Base=*/0U); + Context.markRangeAsAllCounted(GroupBeg, GroupEnd, /*Base=*/0U, + /*RegionIndex=*/0, RegionSize); scudo::uptr FirstBlock = ((GroupBeg + BlockSize - 1) / BlockSize) * BlockSize; @@ -398,10 +403,10 @@ template <class SizeClassMap> void testPageMapMarkRange() { } // Iterate each Group // Release the entire region. This is to ensure the last page is counted. 
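The release tests above now pass the region geometry to the marking calls instead of the PageReleaseContext constructor. The following is a condensed, self-contained sketch of the range-marking path using only calls that appear in these tests; the block and region sizes are arbitrary example values.

#include "common.h"
#include "release.h"

// Mark an entire region as counted, then query the page map.
static void markWholeRegionSketch() {
  const scudo::uptr PageSize = scudo::getPageSizeCached();
  const scudo::uptr BlockSize = 2 * PageSize;   // Example value.
  const scudo::uptr RegionSize = 16 * PageSize; // Example value.

  scudo::PageReleaseContext Context(BlockSize, /*NumberOfRegions=*/1U,
                                    /*ReleaseSize=*/RegionSize);
  Context.markRangeAsAllCounted(/*From=*/0U, /*To=*/RegionSize, /*Base=*/0U,
                                /*RegionIndex=*/0, RegionSize);

  for (scudo::uptr Page = 0; Page < RegionSize / PageSize; ++Page)
    if (!Context.PageMap.isAllCounted(/*Region=*/0, Page))
      break; // After the call above, every page is expected to be counted.
}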
- scudo::PageReleaseContext Context(BlockSize, RegionSize, - /*NumberOfRegions=*/1U, + scudo::PageReleaseContext Context(BlockSize, /*NumberOfRegions=*/1U, /*ReleaseSize=*/RegionSize); - Context.markRangeAsAllCounted(/*From=*/0U, /*To=*/RegionSize, /*Base=*/0); + Context.markRangeAsAllCounted(/*From=*/0U, /*To=*/RegionSize, /*Base=*/0, + /*RegionIndex=*/0, RegionSize); for (scudo::uptr Page = 0; Page < RoundedRegionSize / PageSize; ++Page) EXPECT_TRUE(Context.PageMap.isAllCounted(/*Region=*/0, Page)); } // Iterate each size class @@ -410,8 +415,6 @@ template <class SizeClassMap> void testPageMapMarkRange() { template <class SizeClassMap> void testReleasePartialRegion() { typedef FreeBatch<SizeClassMap> Batch; const scudo::uptr PageSize = scudo::getPageSizeCached(); - const scudo::uptr ReleaseBase = PageSize; - const scudo::uptr BasePageOffset = ReleaseBase / PageSize; for (scudo::uptr I = 1; I <= SizeClassMap::LargestClassId; I++) { // In the following, we want to ensure the region includes at least 2 pages @@ -419,8 +422,11 @@ template <class SizeClassMap> void testReleasePartialRegion() { // the last block is tricky, so we always test the case that includes the // last block. const scudo::uptr BlockSize = SizeClassMap::getSizeByClassId(I); + const scudo::uptr ReleaseBase = scudo::roundUp(BlockSize, PageSize); + const scudo::uptr BasePageOffset = ReleaseBase / PageSize; const scudo::uptr RegionSize = - scudo::roundUpSlow(scudo::roundUp(BlockSize, PageSize) * 2, BlockSize) + + scudo::roundUpSlow(scudo::roundUp(BlockSize, PageSize) + ReleaseBase, + BlockSize) + BlockSize; const scudo::uptr RoundedRegionSize = scudo::roundUp(RegionSize, PageSize); @@ -429,7 +435,7 @@ template <class SizeClassMap> void testReleasePartialRegion() { // Skip the blocks in the first page and add the remaining. std::vector<scudo::uptr> Pages(RoundedRegionSize / PageSize, 0); - for (scudo::uptr Block = scudo::roundUpSlow(PageSize, BlockSize); + for (scudo::uptr Block = scudo::roundUpSlow(ReleaseBase, BlockSize); Block + BlockSize <= RoundedRegionSize; Block += BlockSize) { for (scudo::uptr Page = Block / PageSize; Page <= (Block + BlockSize - 1) / PageSize; ++Page) { @@ -439,12 +445,12 @@ template <class SizeClassMap> void testReleasePartialRegion() { } // This follows the logic how we count the last page. It should be - // consistent with how markFreeBlocks() handles the last block. + // consistent with how markFreeBlocksInRegion() handles the last block. if (RoundedRegionSize % BlockSize != 0) ++Pages.back(); Batch *CurrentBatch = nullptr; - for (scudo::uptr Block = scudo::roundUpSlow(PageSize, BlockSize); + for (scudo::uptr Block = scudo::roundUpSlow(ReleaseBase, BlockSize); Block < RegionSize; Block += BlockSize) { if (CurrentBatch == nullptr || CurrentBatch->getCount() == Batch::MaxCount) { @@ -459,7 +465,7 @@ template <class SizeClassMap> void testReleasePartialRegion() { auto SkipRegion = [](UNUSED scudo::uptr RegionIndex) { return false; }; ReleasedPagesRecorder Recorder(ReleaseBase); releaseFreeMemoryToOS(Context, Recorder, SkipRegion); - const scudo::uptr FirstBlock = scudo::roundUpSlow(PageSize, BlockSize); + const scudo::uptr FirstBlock = scudo::roundUpSlow(ReleaseBase, BlockSize); for (scudo::uptr P = 0; P < RoundedRegionSize; P += PageSize) { if (P < FirstBlock) { @@ -477,10 +483,12 @@ template <class SizeClassMap> void testReleasePartialRegion() { // Test marking by visiting each block. 
{ auto DecompactPtr = [](scudo::uptr P) { return P; }; - scudo::PageReleaseContext Context( - BlockSize, RegionSize, /*NumberOfRegions=*/1U, - /*ReleaseSize=*/RegionSize - PageSize, ReleaseBase); - Context.markFreeBlocks(FreeList, DecompactPtr, /*Base=*/0U); + scudo::PageReleaseContext Context(BlockSize, /*NumberOfRegions=*/1U, + /*ReleaseSize=*/RegionSize - PageSize, + ReleaseBase); + Context.markFreeBlocksInRegion(FreeList, DecompactPtr, /*Base=*/0U, + /*RegionIndex=*/0, RegionSize, + /*MayContainLastBlockInRegion=*/true); for (const Batch &It : FreeList) { for (scudo::u16 I = 0; I < It.getCount(); I++) { scudo::uptr Block = It.get(I); @@ -497,10 +505,11 @@ template <class SizeClassMap> void testReleasePartialRegion() { // Test range marking. { - scudo::PageReleaseContext Context( - BlockSize, RegionSize, /*NumberOfRegions=*/1U, - /*ReleaseSize=*/RegionSize - PageSize, ReleaseBase); - Context.markRangeAsAllCounted(ReleaseBase, RegionSize, /*Base=*/0U); + scudo::PageReleaseContext Context(BlockSize, /*NumberOfRegions=*/1U, + /*ReleaseSize=*/RegionSize - PageSize, + ReleaseBase); + Context.markRangeAsAllCounted(ReleaseBase, RegionSize, /*Base=*/0U, + /*RegionIndex=*/0, RegionSize); for (scudo::uptr Page = ReleaseBase / PageSize; Page < RoundedRegionSize / PageSize; ++Page) { if (Context.PageMap.get(/*Region=*/0, Page - BasePageOffset) != @@ -515,16 +524,16 @@ template <class SizeClassMap> void testReleasePartialRegion() { // Check the buffer size of PageMap. { - scudo::PageReleaseContext Full(BlockSize, RegionSize, - /*NumberOfRegions=*/1U, + scudo::PageReleaseContext Full(BlockSize, /*NumberOfRegions=*/1U, /*ReleaseSize=*/RegionSize); Full.ensurePageMapAllocated(); - scudo::PageReleaseContext Partial( - BlockSize, RegionSize, /*NumberOfRegions=*/1U, - /*ReleaseSize=*/RegionSize - PageSize, ReleaseBase); + scudo::PageReleaseContext Partial(BlockSize, /*NumberOfRegions=*/1U, + /*ReleaseSize=*/RegionSize - PageSize, + ReleaseBase); Partial.ensurePageMapAllocated(); - EXPECT_GE(Full.PageMap.getBufferSize(), Partial.PageMap.getBufferSize()); + EXPECT_GE(Full.PageMap.getBufferNumElements(), + Partial.PageMap.getBufferNumElements()); } while (!FreeList.empty()) { @@ -543,20 +552,103 @@ TEST(ScudoReleaseTest, ReleaseFreeMemoryToOSAndroid) { testReleaseFreeMemoryToOS<scudo::AndroidSizeClassMap>(); } -TEST(ScudoReleaseTest, ReleaseFreeMemoryToOSSvelte) { - testReleaseFreeMemoryToOS<scudo::SvelteSizeClassMap>(); -} - TEST(ScudoReleaseTest, PageMapMarkRange) { testPageMapMarkRange<scudo::DefaultSizeClassMap>(); testPageMapMarkRange<scudo::AndroidSizeClassMap>(); testPageMapMarkRange<scudo::FuchsiaSizeClassMap>(); - testPageMapMarkRange<scudo::SvelteSizeClassMap>(); } TEST(ScudoReleaseTest, ReleasePartialRegion) { testReleasePartialRegion<scudo::DefaultSizeClassMap>(); testReleasePartialRegion<scudo::AndroidSizeClassMap>(); testReleasePartialRegion<scudo::FuchsiaSizeClassMap>(); - testReleasePartialRegion<scudo::SvelteSizeClassMap>(); +} + +template <class SizeClassMap> void testReleaseRangeWithSingleBlock() { + const scudo::uptr PageSize = scudo::getPageSizeCached(); + + // We want to test if a memory group only contains single block that will be + // handled properly. The case is like: + // + // From To + // +----------------------+ + // +------------+------------+ + // | | | + // +------------+------------+ + // ^ + // RegionSize + // + // Note that `From` will be page aligned. 
+ // + // If the second from the last block is aligned at `From`, then we expect all + // the pages after `From` will be marked as can-be-released. Otherwise, the + // pages only touched by the last blocks will be marked as can-be-released. + for (scudo::uptr I = 1; I <= SizeClassMap::LargestClassId; I++) { + const scudo::uptr BlockSize = SizeClassMap::getSizeByClassId(I); + const scudo::uptr From = scudo::roundUp(BlockSize, PageSize); + const scudo::uptr To = + From % BlockSize == 0 + ? From + BlockSize + : scudo::roundDownSlow(From + BlockSize, BlockSize) + BlockSize; + const scudo::uptr RoundedRegionSize = scudo::roundUp(To, PageSize); + + std::vector<scudo::uptr> Pages(RoundedRegionSize / PageSize, 0); + for (scudo::uptr Block = (To - BlockSize); Block < RoundedRegionSize; + Block += BlockSize) { + for (scudo::uptr Page = Block / PageSize; + Page <= (Block + BlockSize - 1) / PageSize && + Page < RoundedRegionSize / PageSize; + ++Page) { + ASSERT_LT(Page, Pages.size()); + ++Pages[Page]; + } + } + + scudo::PageReleaseContext Context(BlockSize, /*NumberOfRegions=*/1U, + /*ReleaseSize=*/To, + /*ReleaseBase=*/0U); + Context.markRangeAsAllCounted(From, To, /*Base=*/0U, /*RegionIndex=*/0, + /*RegionSize=*/To); + + for (scudo::uptr Page = 0; Page < RoundedRegionSize; Page += PageSize) { + if (Context.PageMap.get(/*Region=*/0U, Page / PageSize) != + Pages[Page / PageSize]) { + EXPECT_TRUE( + Context.PageMap.isAllCounted(/*Region=*/0U, Page / PageSize)); + } + } + } // for each size class +} + +TEST(ScudoReleaseTest, RangeReleaseRegionWithSingleBlock) { + testReleaseRangeWithSingleBlock<scudo::DefaultSizeClassMap>(); + testReleaseRangeWithSingleBlock<scudo::AndroidSizeClassMap>(); + testReleaseRangeWithSingleBlock<scudo::FuchsiaSizeClassMap>(); +} + +TEST(ScudoReleaseTest, BufferPool) { + constexpr scudo::uptr StaticBufferCount = SCUDO_WORDSIZE - 1; + constexpr scudo::uptr StaticBufferNumElements = 512U; + + // Allocate the buffer pool on the heap because it is quite large (slightly + // more than StaticBufferCount * StaticBufferNumElements * sizeof(uptr)) and + // it may not fit in the stack on some platforms. + using BufferPool = + scudo::BufferPool<StaticBufferCount, StaticBufferNumElements>; + std::unique_ptr<BufferPool> Pool(new BufferPool()); + + std::vector<BufferPool::Buffer> Buffers; + for (scudo::uptr I = 0; I < StaticBufferCount; ++I) { + BufferPool::Buffer Buffer = Pool->getBuffer(StaticBufferNumElements); + EXPECT_TRUE(Pool->isStaticBufferTestOnly(Buffer)); + Buffers.push_back(Buffer); + } + + // The static buffer is supposed to be used up. 
+ BufferPool::Buffer Buffer = Pool->getBuffer(StaticBufferNumElements); + EXPECT_FALSE(Pool->isStaticBufferTestOnly(Buffer)); + + Pool->releaseBuffer(Buffer); + for (auto &Buffer : Buffers) + Pool->releaseBuffer(Buffer); } diff --git a/standalone/tests/report_test.cpp b/standalone/tests/report_test.cpp index 81587bae6b5..6c46243053d 100644 --- a/standalone/tests/report_test.cpp +++ b/standalone/tests/report_test.cpp @@ -23,7 +23,6 @@ TEST(ScudoReportDeathTest, Generic) { EXPECT_DEATH(scudo::reportError("TEST123"), "Scudo ERROR.*TEST123"); EXPECT_DEATH(scudo::reportInvalidFlag("ABC", "DEF"), "Scudo ERROR.*ABC.*DEF"); EXPECT_DEATH(scudo::reportHeaderCorruption(P), "Scudo ERROR.*42424242"); - EXPECT_DEATH(scudo::reportHeaderRace(P), "Scudo ERROR.*42424242"); EXPECT_DEATH(scudo::reportSanityCheckError("XYZ"), "Scudo ERROR.*XYZ"); EXPECT_DEATH(scudo::reportAlignmentTooBig(123, 456), "Scudo ERROR.*123.*456"); EXPECT_DEATH(scudo::reportAllocationSizeTooBig(123, 456, 789), @@ -54,3 +53,28 @@ TEST(ScudoReportDeathTest, CSpecific) { EXPECT_DEATH(scudo::reportInvalidAlignedAllocAlignment(123, 456), "Scudo ERROR.*123.*456"); } + +#if SCUDO_LINUX || SCUDO_TRUSTY || SCUDO_ANDROID +#include "report_linux.h" + +#include <errno.h> +#include <sys/mman.h> + +TEST(ScudoReportDeathTest, Linux) { + errno = ENOMEM; + EXPECT_DEATH(scudo::reportMapError(), + "Scudo ERROR:.*internal map failure \\(error desc=.*\\)"); + errno = ENOMEM; + EXPECT_DEATH(scudo::reportMapError(1024U), + "Scudo ERROR:.*internal map failure \\(error desc=.*\\) " + "requesting 1KB"); + errno = ENOMEM; + EXPECT_DEATH(scudo::reportUnmapError(0x1000U, 100U), + "Scudo ERROR:.*internal unmap failure \\(error desc=.*\\) Addr " + "0x1000 Size 100"); + errno = ENOMEM; + EXPECT_DEATH(scudo::reportProtectError(0x1000U, 100U, PROT_READ), + "Scudo ERROR:.*internal protect failure \\(error desc=.*\\) " + "Addr 0x1000 Size 100 Prot 1"); +} +#endif diff --git a/standalone/tests/scudo_hooks_test.cpp b/standalone/tests/scudo_hooks_test.cpp deleted file mode 100644 index 7184ec12a8b..00000000000 --- a/standalone/tests/scudo_hooks_test.cpp +++ /dev/null @@ -1,114 +0,0 @@ -//===-- scudo_hooks_test.cpp ------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "tests/scudo_unit_test.h" - -#include "allocator_config.h" -#include "combined.h" - -namespace { -void *LastAllocatedPtr = nullptr; -size_t LastRequestSize = 0; -void *LastDeallocatedPtr = nullptr; -} // namespace - -// Scudo defines weak symbols that can be defined by a client binary -// to register callbacks at key points in the allocation timeline. In -// order to enforce those invariants, we provide definitions that -// update some global state every time they are called, so that tests -// can inspect their effects. An unfortunate side effect of this -// setup is that because those symbols are part of the binary, they -// can't be selectively enabled; that means that they will get called -// on unrelated tests in the same compilation unit. To mitigate this -// issue, we insulate those tests in a separate compilation unit. 
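The new Linux death tests above pin down the messages produced by reportMapError, reportUnmapError and reportProtectError. A hypothetical call-site sketch of how such a report would be reached on an mmap failure follows; mapOrReport is an illustrative name, and the real call sites live elsewhere in the allocator, not in this test.

#include "common.h"
#include "report_linux.h"

#include <sys/mman.h>

// On a mapping failure, the allocator reports and aborts with the errno
// description, which is what the death tests above pattern-match on.
static void *mapOrReport(scudo::uptr Size) {
  void *P = mmap(nullptr, Size, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (P == MAP_FAILED)
    scudo::reportMapError(Size); // Fatal report; does not return.
  return P;
}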
-extern "C" { -__attribute__((visibility("default"))) void __scudo_allocate_hook(void *Ptr, - size_t Size) { - LastAllocatedPtr = Ptr; - LastRequestSize = Size; -} -__attribute__((visibility("default"))) void __scudo_deallocate_hook(void *Ptr) { - LastDeallocatedPtr = Ptr; -} -} - -// Simple check that allocation callbacks, when registered, are called: -// 1) __scudo_allocate_hook is called when allocating. -// 2) __scudo_deallocate_hook is called when deallocating. -// 3) Both hooks are called when reallocating. -// 4) Neither are called for a no-op reallocation. -TEST(ScudoHooksTest, AllocateHooks) { - scudo::Allocator<scudo::DefaultConfig> Allocator; - constexpr scudo::uptr DefaultSize = 16U; - constexpr scudo::Chunk::Origin Origin = scudo::Chunk::Origin::Malloc; - - // Simple allocation and deallocation. - { - LastAllocatedPtr = nullptr; - LastRequestSize = 0; - - void *Ptr = Allocator.allocate(DefaultSize, Origin); - - EXPECT_EQ(Ptr, LastAllocatedPtr); - EXPECT_EQ(DefaultSize, LastRequestSize); - - LastDeallocatedPtr = nullptr; - - Allocator.deallocate(Ptr, Origin); - - EXPECT_EQ(Ptr, LastDeallocatedPtr); - } - - // Simple no-op, same size reallocation. - { - void *Ptr = Allocator.allocate(DefaultSize, Origin); - - LastAllocatedPtr = nullptr; - LastRequestSize = 0; - LastDeallocatedPtr = nullptr; - - void *NewPtr = Allocator.reallocate(Ptr, DefaultSize); - - EXPECT_EQ(Ptr, NewPtr); - EXPECT_EQ(nullptr, LastAllocatedPtr); - EXPECT_EQ(0U, LastRequestSize); - EXPECT_EQ(nullptr, LastDeallocatedPtr); - } - - // Reallocation in increasing size classes. This ensures that at - // least one of the reallocations will be meaningful. - { - void *Ptr = Allocator.allocate(0, Origin); - - for (scudo::uptr ClassId = 1U; - ClassId <= scudo::DefaultConfig::Primary::SizeClassMap::LargestClassId; - ++ClassId) { - const scudo::uptr Size = - scudo::DefaultConfig::Primary::SizeClassMap::getSizeByClassId( - ClassId); - - LastAllocatedPtr = nullptr; - LastRequestSize = 0; - LastDeallocatedPtr = nullptr; - - void *NewPtr = Allocator.reallocate(Ptr, Size); - - if (NewPtr != Ptr) { - EXPECT_EQ(NewPtr, LastAllocatedPtr); - EXPECT_EQ(Size, LastRequestSize); - EXPECT_EQ(Ptr, LastDeallocatedPtr); - } else { - EXPECT_EQ(nullptr, LastAllocatedPtr); - EXPECT_EQ(0U, LastRequestSize); - EXPECT_EQ(nullptr, LastDeallocatedPtr); - } - - Ptr = NewPtr; - } - } -} diff --git a/standalone/tests/scudo_unit_test.h b/standalone/tests/scudo_unit_test.h index 1665fa87e5f..4283416435b 100644 --- a/standalone/tests/scudo_unit_test.h +++ b/standalone/tests/scudo_unit_test.h @@ -45,4 +45,10 @@ using Test = ::testing::Test; #define SKIP_NO_DEBUG(T) DISABLED_##T #endif +#if SCUDO_FUCHSIA +// The zxtest library provides a default main function that does the same thing +// for Fuchsia builds. +#define SCUDO_NO_TEST_MAIN +#endif + extern bool UseQuarantine; diff --git a/standalone/tests/scudo_unit_test_main.cpp b/standalone/tests/scudo_unit_test_main.cpp index fbfefa5c93d..881e0265bb3 100644 --- a/standalone/tests/scudo_unit_test_main.cpp +++ b/standalone/tests/scudo_unit_test_main.cpp @@ -45,9 +45,7 @@ __scudo_default_options() { "dealloc_type_mismatch=" DEALLOC_TYPE_MISMATCH; } -// The zxtest library provides a default main function that does the same thing -// for Fuchsia builds. 
-#if !SCUDO_FUCHSIA +#if !defined(SCUDO_NO_TEST_MAIN) int main(int argc, char **argv) { EnableMemoryTaggingIfSupported(); testing::InitGoogleTest(&argc, argv); diff --git a/standalone/tests/secondary_test.cpp b/standalone/tests/secondary_test.cpp index b0319011771..8f0250e88eb 100644 --- a/standalone/tests/secondary_test.cpp +++ b/standalone/tests/secondary_test.cpp @@ -10,6 +10,7 @@ #include "tests/scudo_unit_test.h" #include "allocator_config.h" +#include "allocator_config_wrapper.h" #include "secondary.h" #include <algorithm> @@ -22,7 +23,8 @@ #include <vector> template <typename Config> static scudo::Options getOptionsForConfig() { - if (!Config::MaySupportMemoryTagging || !scudo::archSupportsMemoryTagging() || + if (!Config::getMaySupportMemoryTagging() || + !scudo::archSupportsMemoryTagging() || !scudo::systemSupportsMemoryTagging()) return {}; scudo::AtomicOptions AO; @@ -31,8 +33,9 @@ template <typename Config> static scudo::Options getOptionsForConfig() { } template <typename Config> static void testSecondaryBasic(void) { - using SecondaryT = scudo::MapAllocator<Config>; - scudo::Options Options = getOptionsForConfig<Config>(); + using SecondaryT = scudo::MapAllocator<scudo::SecondaryConfig<Config>>; + scudo::Options Options = + getOptionsForConfig<scudo::SecondaryConfig<Config>>(); scudo::GlobalStats S; S.init(); @@ -83,19 +86,35 @@ template <typename Config> static void testSecondaryBasic(void) { } struct NoCacheConfig { - typedef scudo::MapAllocatorNoCache SecondaryCache; static const bool MaySupportMemoryTagging = false; + template <typename> using TSDRegistryT = void; + template <typename> using PrimaryT = void; + template <typename Config> using SecondaryT = scudo::MapAllocator<Config>; + + struct Secondary { + template <typename Config> + using CacheT = scudo::MapAllocatorNoCache<Config>; + }; }; struct TestConfig { - typedef scudo::MapAllocatorCache<TestConfig> SecondaryCache; static const bool MaySupportMemoryTagging = false; - static const scudo::u32 SecondaryCacheEntriesArraySize = 128U; - static const scudo::u32 SecondaryCacheQuarantineSize = 0U; - static const scudo::u32 SecondaryCacheDefaultMaxEntriesCount = 64U; - static const scudo::uptr SecondaryCacheDefaultMaxEntrySize = 1UL << 20; - static const scudo::s32 SecondaryCacheMinReleaseToOsIntervalMs = INT32_MIN; - static const scudo::s32 SecondaryCacheMaxReleaseToOsIntervalMs = INT32_MAX; + template <typename> using TSDRegistryT = void; + template <typename> using PrimaryT = void; + template <typename> using SecondaryT = void; + + struct Secondary { + struct Cache { + static const scudo::u32 EntriesArraySize = 128U; + static const scudo::u32 QuarantineSize = 0U; + static const scudo::u32 DefaultMaxEntriesCount = 64U; + static const scudo::uptr DefaultMaxEntrySize = 1UL << 20; + static const scudo::s32 MinReleaseToOsIntervalMs = INT32_MIN; + static const scudo::s32 MaxReleaseToOsIntervalMs = INT32_MAX; + }; + + template <typename Config> using CacheT = scudo::MapAllocatorCache<Config>; + }; }; TEST(ScudoSecondaryTest, SecondaryBasic) { @@ -106,7 +125,7 @@ TEST(ScudoSecondaryTest, SecondaryBasic) { struct MapAllocatorTest : public Test { using Config = scudo::DefaultConfig; - using LargeAllocator = scudo::MapAllocator<Config>; + using LargeAllocator = scudo::MapAllocator<scudo::SecondaryConfig<Config>>; void SetUp() override { Allocator->init(nullptr); } @@ -114,7 +133,8 @@ struct MapAllocatorTest : public Test { std::unique_ptr<LargeAllocator> Allocator = std::make_unique<LargeAllocator>(); - scudo::Options Options = 
getOptionsForConfig<Config>(); + scudo::Options Options = + getOptionsForConfig<scudo::SecondaryConfig<Config>>(); }; // This exercises a variety of combinations of size and alignment for the @@ -128,10 +148,10 @@ TEST_F(MapAllocatorTest, SecondaryCombinations) { AlignLog++) { const scudo::uptr Align = 1U << AlignLog; for (scudo::sptr Delta = -128; Delta <= 128; Delta += 8) { - if (static_cast<scudo::sptr>(1U << SizeLog) + Delta <= 0) + if ((1LL << SizeLog) + Delta <= 0) continue; - const scudo::uptr UserSize = - scudo::roundUp((1U << SizeLog) + Delta, MinAlign); + const scudo::uptr UserSize = scudo::roundUp( + static_cast<scudo::uptr>((1LL << SizeLog) + Delta), MinAlign); const scudo::uptr Size = HeaderSize + UserSize + (Align > MinAlign ? Align - HeaderSize : 0); void *P = Allocator->allocate(Options, Size, Align); @@ -152,7 +172,8 @@ TEST_F(MapAllocatorTest, SecondaryIterate) { std::vector<void *> V; const scudo::uptr PageSize = scudo::getPageSizeCached(); for (scudo::uptr I = 0; I < 32U; I++) - V.push_back(Allocator->allocate(Options, (std::rand() % 16) * PageSize)); + V.push_back(Allocator->allocate( + Options, (static_cast<scudo::uptr>(std::rand()) % 16U) * PageSize)); auto Lambda = [&V](scudo::uptr Block) { EXPECT_NE(std::find(V.begin(), V.end(), reinterpret_cast<void *>(Block)), V.end()); @@ -207,8 +228,9 @@ struct MapAllocatorWithReleaseTest : public MapAllocatorTest { } for (scudo::uptr I = 0; I < 128U; I++) { // Deallocate 75% of the blocks. - const bool Deallocate = (rand() & 3) != 0; - void *P = Allocator->allocate(Options, (std::rand() % 16) * PageSize); + const bool Deallocate = (std::rand() & 3) != 0; + void *P = Allocator->allocate( + Options, (static_cast<scudo::uptr>(std::rand()) % 16U) * PageSize); if (Deallocate) Allocator->deallocate(Options, P); else diff --git a/standalone/tests/size_class_map_test.cpp b/standalone/tests/size_class_map_test.cpp index b11db1e9f64..05b5835ff0b 100644 --- a/standalone/tests/size_class_map_test.cpp +++ b/standalone/tests/size_class_map_test.cpp @@ -20,10 +20,6 @@ TEST(ScudoSizeClassMapTest, DefaultSizeClassMap) { testSizeClassMap<scudo::DefaultSizeClassMap>(); } -TEST(ScudoSizeClassMapTest, SvelteSizeClassMap) { - testSizeClassMap<scudo::SvelteSizeClassMap>(); -} - TEST(ScudoSizeClassMapTest, AndroidSizeClassMap) { testSizeClassMap<scudo::AndroidSizeClassMap>(); } diff --git a/standalone/tests/strings_test.cpp b/standalone/tests/strings_test.cpp index 7a69ffd9762..abb81803f65 100644 --- a/standalone/tests/strings_test.cpp +++ b/standalone/tests/strings_test.cpp @@ -66,6 +66,10 @@ TEST(ScudoStringsTest, Precision) { Str.append("%-6s", "12345"); EXPECT_EQ(Str.length(), strlen(Str.data())); EXPECT_STREQ("12345 ", Str.data()); + Str.clear(); + Str.append("%-8s", "12345"); + EXPECT_EQ(Str.length(), strlen(Str.data())); + EXPECT_STREQ("12345 ", Str.data()); } static void fillString(scudo::ScopedString &Str, scudo::uptr Size) { @@ -123,3 +127,42 @@ TEST(ScudoStringsTest, Padding) { testAgainstLibc<int>("%03d - %03d", 12, 1234); testAgainstLibc<int>("%03d - %03d", -12, -1234); } + +#if defined(__linux__) + +#include <sys/resource.h> + +TEST(ScudoStringsTest, CapacityIncreaseFails) { + scudo::ScopedString Str; + + rlimit Limit = {}; + EXPECT_EQ(0, getrlimit(RLIMIT_AS, &Limit)); + + rlimit EmptyLimit = {.rlim_cur = 0, .rlim_max = Limit.rlim_max}; + EXPECT_EQ(0, setrlimit(RLIMIT_AS, &EmptyLimit)); + + // qemu does not honor the setrlimit, so verify before proceeding. 
+ scudo::MemMapT MemMap; + if (MemMap.map(/*Addr=*/0U, scudo::getPageSizeCached(), "scudo:test", + MAP_ALLOWNOMEM)) { + MemMap.unmap(MemMap.getBase(), MemMap.getCapacity()); + setrlimit(RLIMIT_AS, &Limit); + GTEST_SKIP() << "Limiting address space does not prevent mmap."; + } + + // Test requires that the default length is at least 6 characters. + scudo::uptr MaxSize = Str.capacity(); + EXPECT_LE(6u, MaxSize); + + for (size_t i = 0; i < MaxSize - 5; i++) { + Str.append("B"); + } + + // Attempt to append past the end of the current capacity. + Str.append("%d", 12345678); + EXPECT_EQ(MaxSize, Str.capacity()); + EXPECT_STREQ("B12345", &Str.data()[MaxSize - 6]); + + EXPECT_EQ(0, setrlimit(RLIMIT_AS, &Limit)); +} +#endif diff --git a/standalone/tests/timing_test.cpp b/standalone/tests/timing_test.cpp new file mode 100644 index 00000000000..09a6c312246 --- /dev/null +++ b/standalone/tests/timing_test.cpp @@ -0,0 +1,86 @@ +//===-- timing_test.cpp -----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "tests/scudo_unit_test.h" + +#include "timing.h" + +#include <string> + +class ScudoTimingTest : public Test { +public: + void testFunc1() { scudo::ScopedTimer ST(Manager, __func__); } + + void testFunc2() { + scudo::ScopedTimer ST(Manager, __func__); + testFunc1(); + } + + void testChainedCalls() { + scudo::ScopedTimer ST(Manager, __func__); + testFunc2(); + } + + void testIgnoredTimer() { + scudo::ScopedTimer ST(Manager, __func__); + ST.ignore(); + } + + void printAllTimersStats() { Manager.printAll(); } + + scudo::TimingManager &getTimingManager() { return Manager; } + +private: + scudo::TimingManager Manager; +}; + +// Given that the output of statistics of timers are dumped through +// `scudo::Printf` which is platform dependent, so we don't have a reliable way +// to catch the output and verify the details. Now we only verify the number of +// invocations on linux. 
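ScopedTimer, added together with timing.cpp, is the RAII type these tests drive. A minimal usage sketch assuming only the constructors, ignore() and printAll() seen above; timeSomethingSketch and the timer names are illustrative.

#include "timing.h"

// Timers record into the manager when they go out of scope; a nested timer
// takes its parent as an extra constructor argument, and ignore() drops a
// measurement instead of recording it.
static void timeSomethingSketch(bool Discard) {
  scudo::TimingManager Manager;
  {
    scudo::ScopedTimer Outer(Manager, "outer_work");
    scudo::ScopedTimer Inner(Manager, Outer, "inner_work");
    if (Discard)
      Inner.ignore();
  }
  Manager.printAll(); // Dumps the collected statistics via scudo::Printf.
}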
+TEST_F(ScudoTimingTest, SimpleTimer) { +#if SCUDO_LINUX + testing::internal::LogToStderr(); + testing::internal::CaptureStderr(); +#endif + + testIgnoredTimer(); + testChainedCalls(); + printAllTimersStats(); + +#if SCUDO_LINUX + std::string output = testing::internal::GetCapturedStderr(); + EXPECT_TRUE(output.find("testIgnoredTimer (1)") == std::string::npos); + EXPECT_TRUE(output.find("testChainedCalls (1)") != std::string::npos); + EXPECT_TRUE(output.find("testFunc2 (1)") != std::string::npos); + EXPECT_TRUE(output.find("testFunc1 (1)") != std::string::npos); +#endif +} + +TEST_F(ScudoTimingTest, NestedTimer) { +#if SCUDO_LINUX + testing::internal::LogToStderr(); + testing::internal::CaptureStderr(); +#endif + + { + scudo::ScopedTimer Outer(getTimingManager(), "Outer"); + { + scudo::ScopedTimer Inner1(getTimingManager(), Outer, "Inner1"); + { scudo::ScopedTimer Inner2(getTimingManager(), Inner1, "Inner2"); } + } + } + printAllTimersStats(); + +#if SCUDO_LINUX + std::string output = testing::internal::GetCapturedStderr(); + EXPECT_TRUE(output.find("Outer (1)") != std::string::npos); + EXPECT_TRUE(output.find("Inner1 (1)") != std::string::npos); + EXPECT_TRUE(output.find("Inner2 (1)") != std::string::npos); +#endif +} diff --git a/standalone/tests/tsd_test.cpp b/standalone/tests/tsd_test.cpp index a092fdde904..851ac46b9f0 100644 --- a/standalone/tests/tsd_test.cpp +++ b/standalone/tests/tsd_test.cpp @@ -17,6 +17,7 @@ #include <mutex> #include <set> #include <thread> +#include <type_traits> // We mock out an allocator with a TSD registry, mostly using empty stubs. The // cache contains a single volatile uptr, to be able to test that several @@ -38,7 +39,7 @@ public: void unmapTestOnly() { TSDRegistry.unmapTestOnly(this); } void initCache(CacheT *Cache) { *Cache = {}; } - void commitBack(scudo::TSD<MockAllocator> *TSD) {} + void commitBack(UNUSED scudo::TSD<MockAllocator> *TSD) {} TSDRegistryT *getTSDRegistry() { return &TSDRegistry; } void callPostInitCallback() {} @@ -86,7 +87,8 @@ TEST(ScudoTSDTest, TSDRegistryInit) { EXPECT_TRUE(Allocator->isInitialized()); } -template <class AllocatorT> static void testRegistry() { +template <class AllocatorT> +static void testRegistry() NO_THREAD_SAFETY_ANALYSIS { auto Deleter = [](AllocatorT *A) { A->unmapTestOnly(); delete A; @@ -99,20 +101,17 @@ template <class AllocatorT> static void testRegistry() { Registry->initThreadMaybe(Allocator.get(), /*MinimalInit=*/true); EXPECT_TRUE(Allocator->isInitialized()); - bool UnlockRequired; - auto TSD = Registry->getTSDAndLock(&UnlockRequired); - EXPECT_NE(TSD, nullptr); - EXPECT_EQ(TSD->getCache().Canary, 0U); - if (UnlockRequired) - TSD->unlock(); + { + typename AllocatorT::TSDRegistryT::ScopedTSD TSD(*Registry); + EXPECT_EQ(TSD->getCache().Canary, 0U); + } Registry->initThreadMaybe(Allocator.get(), /*MinimalInit=*/false); - TSD = Registry->getTSDAndLock(&UnlockRequired); - EXPECT_NE(TSD, nullptr); - EXPECT_EQ(TSD->getCache().Canary, 0U); - memset(&TSD->getCache(), 0x42, sizeof(TSD->getCache())); - if (UnlockRequired) - TSD->unlock(); + { + typename AllocatorT::TSDRegistryT::ScopedTSD TSD(*Registry); + EXPECT_EQ(TSD->getCache().Canary, 0U); + memset(&TSD->getCache(), 0x42, sizeof(TSD->getCache())); + } } TEST(ScudoTSDTest, TSDRegistryBasic) { @@ -127,7 +126,12 @@ static std::mutex Mutex; static std::condition_variable Cv; static bool Ready; -template <typename AllocatorT> static void stressCache(AllocatorT *Allocator) { +// Accessing `TSD->getCache()` requires `TSD::Mutex` which isn't easy to test +// using 
thread-safety analysis. Alternatively, we verify the thread safety +// through a runtime check in ScopedTSD and mark the test body with +// NO_THREAD_SAFETY_ANALYSIS. +template <typename AllocatorT> +static void stressCache(AllocatorT *Allocator) NO_THREAD_SAFETY_ANALYSIS { auto Registry = Allocator->getTSDRegistry(); { std::unique_lock<std::mutex> Lock(Mutex); @@ -135,13 +139,13 @@ template <typename AllocatorT> static void stressCache(AllocatorT *Allocator) { Cv.wait(Lock); } Registry->initThreadMaybe(Allocator, /*MinimalInit=*/false); - bool UnlockRequired; - auto TSD = Registry->getTSDAndLock(&UnlockRequired); - EXPECT_NE(TSD, nullptr); + typename AllocatorT::TSDRegistryT::ScopedTSD TSD(*Registry); // For an exclusive TSD, the cache should be empty. We cannot guarantee the // same for a shared TSD. - if (!UnlockRequired) + if (std::is_same<typename AllocatorT::TSDRegistryT, + scudo::TSDRegistryExT<AllocatorT>>()) { EXPECT_EQ(TSD->getCache().Canary, 0U); + } // Transform the thread id to a uptr to use it as canary. const scudo::uptr Canary = static_cast<scudo::uptr>( std::hash<std::thread::id>{}(std::this_thread::get_id())); @@ -149,8 +153,6 @@ template <typename AllocatorT> static void stressCache(AllocatorT *Allocator) { // Loop a few times to make sure that a concurrent thread isn't modifying it. for (scudo::uptr I = 0; I < 4096U; I++) EXPECT_EQ(TSD->getCache().Canary, Canary); - if (UnlockRequired) - TSD->unlock(); } template <class AllocatorT> static void testRegistryThreaded() { @@ -192,13 +194,10 @@ static void stressSharedRegistry(MockAllocator<SharedCaches> *Allocator) { Cv.wait(Lock); } Registry->initThreadMaybe(Allocator, /*MinimalInit=*/false); - bool UnlockRequired; for (scudo::uptr I = 0; I < 4096U; I++) { - auto TSD = Registry->getTSDAndLock(&UnlockRequired); - EXPECT_NE(TSD, nullptr); - Set.insert(reinterpret_cast<void *>(TSD)); - if (UnlockRequired) - TSD->unlock(); + typename MockAllocator<SharedCaches>::TSDRegistryT::ScopedTSD TSD( + *Registry); + Set.insert(reinterpret_cast<void *>(&*TSD)); } { std::unique_lock<std::mutex> Lock(Mutex); diff --git a/standalone/tests/vector_test.cpp b/standalone/tests/vector_test.cpp index dc23c2a3471..b612676b7bd 100644 --- a/standalone/tests/vector_test.cpp +++ b/standalone/tests/vector_test.cpp @@ -41,3 +41,47 @@ TEST(ScudoVectorTest, ResizeReduction) { V.resize(1); EXPECT_EQ(V.size(), 1U); } + +#if defined(__linux__) + +#include <sys/resource.h> + +// Verify that if the reallocate fails, nothing new is added. +TEST(ScudoVectorTest, ReallocateFails) { + scudo::Vector<char> V; + scudo::uptr capacity = V.capacity(); + + // Get the current address space size. + rlimit Limit = {}; + EXPECT_EQ(0, getrlimit(RLIMIT_AS, &Limit)); + + rlimit EmptyLimit = {.rlim_cur = 0, .rlim_max = Limit.rlim_max}; + EXPECT_EQ(0, setrlimit(RLIMIT_AS, &EmptyLimit)); + + // qemu does not honor the setrlimit, so verify before proceeding. + scudo::MemMapT MemMap; + if (MemMap.map(/*Addr=*/0U, scudo::getPageSizeCached(), "scudo:test", + MAP_ALLOWNOMEM)) { + MemMap.unmap(MemMap.getBase(), MemMap.getCapacity()); + setrlimit(RLIMIT_AS, &Limit); + GTEST_SKIP() << "Limiting address space does not prevent mmap."; + } + + V.resize(capacity); + // Set the last element so we can check it later. + V.back() = '\0'; + + // The reallocate should fail, so the capacity should not change. + V.reserve(capacity + 1000); + EXPECT_EQ(capacity, V.capacity()); + + // Now try to do a push back and verify that the size does not change. 
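+  // A push_back at full capacity has to go through reallocate(). With
+  // RLIMIT_AS set to zero the new mapping should fail (the buffer is mapped
+  // with MAP_ALLOWNOMEM), so push_back is expected to return without
+  // appending, leaving both the size and the existing contents untouched.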
+ scudo::uptr Size = V.size(); + V.push_back('2'); + EXPECT_EQ(Size, V.size()); + // Verify that the last element in the vector did not change. + EXPECT_EQ('\0', V.back()); + + EXPECT_EQ(0, setrlimit(RLIMIT_AS, &Limit)); +} +#endif diff --git a/standalone/tests/wrappers_c_test.cpp b/standalone/tests/wrappers_c_test.cpp index 616cf5491b5..f5e17d72148 100644 --- a/standalone/tests/wrappers_c_test.cpp +++ b/standalone/tests/wrappers_c_test.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "common.h" #include "memtag.h" #include "scudo/interface.h" #include "tests/scudo_unit_test.h" @@ -15,11 +16,27 @@ #include <malloc.h> #include <stdlib.h> #include <unistd.h> +#include <vector> #ifndef __GLIBC_PREREQ #define __GLIBC_PREREQ(x, y) 0 #endif +#if SCUDO_FUCHSIA +// Fuchsia only has valloc +#define HAVE_VALLOC 1 +#elif SCUDO_ANDROID +// Android only has pvalloc/valloc on 32 bit +#if !defined(__LP64__) +#define HAVE_PVALLOC 1 +#define HAVE_VALLOC 1 +#endif // !defined(__LP64__) +#else +// All others assumed to support both functions. +#define HAVE_PVALLOC 1 +#define HAVE_VALLOC 1 +#endif + extern "C" { void malloc_enable(void); void malloc_disable(void); @@ -28,7 +45,101 @@ int malloc_iterate(uintptr_t base, size_t size, void *arg); void *valloc(size_t size); void *pvalloc(size_t size); + +#ifndef SCUDO_ENABLE_HOOKS_TESTS +#define SCUDO_ENABLE_HOOKS_TESTS 0 +#endif + +#if (SCUDO_ENABLE_HOOKS_TESTS == 1) && (SCUDO_ENABLE_HOOKS == 0) +#error "Hooks tests should have hooks enabled as well!" +#endif + +struct AllocContext { + void *Ptr; + size_t Size; +}; +struct DeallocContext { + void *Ptr; +}; +struct ReallocContext { + void *AllocPtr; + void *DeallocPtr; + size_t Size; +}; +static AllocContext AC; +static DeallocContext DC; +static ReallocContext RC; + +#if (SCUDO_ENABLE_HOOKS_TESTS == 1) +__attribute__((visibility("default"))) void __scudo_allocate_hook(void *Ptr, + size_t Size) { + AC.Ptr = Ptr; + AC.Size = Size; +} +__attribute__((visibility("default"))) void __scudo_deallocate_hook(void *Ptr) { + DC.Ptr = Ptr; } +__attribute__((visibility("default"))) void +__scudo_realloc_allocate_hook(void *OldPtr, void *NewPtr, size_t Size) { + // Verify that __scudo_realloc_deallocate_hook is called first and set the + // right pointer. + EXPECT_EQ(OldPtr, RC.DeallocPtr); + RC.AllocPtr = NewPtr; + RC.Size = Size; + + // Note that this is only used for testing. In general, only one pair of hooks + // will be invoked in `realloc`. if __scudo_realloc_*_hook are not defined, + // it'll call the general hooks only. To make the test easier, we call the + // general one here so that either case (whether __scudo_realloc_*_hook are + // defined) will be verified without separating them into different tests. + __scudo_allocate_hook(NewPtr, Size); +} +__attribute__((visibility("default"))) void +__scudo_realloc_deallocate_hook(void *Ptr) { + RC.DeallocPtr = Ptr; + + // See the comment in the __scudo_realloc_allocate_hook above. 
+ __scudo_deallocate_hook(Ptr); +} +#endif // (SCUDO_ENABLE_HOOKS_TESTS == 1) +} + +class ScudoWrappersCTest : public Test { +protected: + void SetUp() override { + if (SCUDO_ENABLE_HOOKS && !SCUDO_ENABLE_HOOKS_TESTS) + printf("Hooks are enabled but hooks tests are disabled.\n"); + } + + void invalidateHookPtrs() { + if (SCUDO_ENABLE_HOOKS_TESTS) { + void *InvalidPtr = reinterpret_cast<void *>(0xdeadbeef); + AC.Ptr = InvalidPtr; + DC.Ptr = InvalidPtr; + RC.AllocPtr = RC.DeallocPtr = InvalidPtr; + } + } + void verifyAllocHookPtr(UNUSED void *Ptr) { + if (SCUDO_ENABLE_HOOKS_TESTS) + EXPECT_EQ(Ptr, AC.Ptr); + } + void verifyAllocHookSize(UNUSED size_t Size) { + if (SCUDO_ENABLE_HOOKS_TESTS) + EXPECT_EQ(Size, AC.Size); + } + void verifyDeallocHookPtr(UNUSED void *Ptr) { + if (SCUDO_ENABLE_HOOKS_TESTS) + EXPECT_EQ(Ptr, DC.Ptr); + } + void verifyReallocHookPtrs(UNUSED void *OldPtr, void *NewPtr, size_t Size) { + if (SCUDO_ENABLE_HOOKS_TESTS) { + EXPECT_EQ(OldPtr, RC.DeallocPtr); + EXPECT_EQ(NewPtr, RC.AllocPtr); + EXPECT_EQ(Size, RC.Size); + } + } +}; +using ScudoWrappersCDeathTest = ScudoWrappersCTest; // Note that every C allocation function in the test binary will be fulfilled // by Scudo (this includes the gtest APIs, etc.), which is a test by itself. @@ -42,11 +153,13 @@ void *pvalloc(size_t size); static const size_t Size = 100U; -TEST(ScudoWrappersCDeathTest, Malloc) { +TEST_F(ScudoWrappersCDeathTest, Malloc) { void *P = malloc(Size); EXPECT_NE(P, nullptr); EXPECT_LE(Size, malloc_usable_size(P)); EXPECT_EQ(reinterpret_cast<uintptr_t>(P) % FIRST_32_SECOND_64(8U, 16U), 0U); + verifyAllocHookPtr(P); + verifyAllocHookSize(Size); // An update to this warning in Clang now triggers in this line, but it's ok // because the check is expecting a bad pointer and should fail. 
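The verify helpers above lean on Scudo's optional allocation hooks: when the allocator is built with SCUDO_ENABLE_HOOKS, the wrappers invoke __scudo_allocate_hook/__scudo_deallocate_hook whenever the embedding program defines them (see the reportAllocation/reportDeallocation helpers added to wrappers_c.inc further down in this change). A minimal sketch of client-defined hooks, assuming only the hook signatures used in this patch; the counter and its name are illustrative, not part of Scudo:

// Hypothetical client-side hooks; only the __scudo_*_hook signatures are
// taken from this patch.
#include <atomic>
#include <cstddef>

static std::atomic<size_t> LiveChunks{0};

extern "C" __attribute__((visibility("default"))) void
__scudo_allocate_hook(void *Ptr, size_t Size) {
  (void)Size;
  if (Ptr)
    LiveChunks.fetch_add(1, std::memory_order_relaxed);
}

extern "C" __attribute__((visibility("default"))) void
__scudo_deallocate_hook(void *Ptr) {
  if (Ptr)
    LiveChunks.fetch_sub(1, std::memory_order_relaxed);
}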
@@ -61,6 +174,7 @@ TEST(ScudoWrappersCDeathTest, Malloc) { #endif free(P); + verifyDeallocHookPtr(P); EXPECT_DEATH(free(P), ""); P = malloc(0U); @@ -72,13 +186,16 @@ TEST(ScudoWrappersCDeathTest, Malloc) { EXPECT_EQ(errno, ENOMEM); } -TEST(ScudoWrappersCTest, Calloc) { +TEST_F(ScudoWrappersCTest, Calloc) { void *P = calloc(1U, Size); EXPECT_NE(P, nullptr); EXPECT_LE(Size, malloc_usable_size(P)); + verifyAllocHookPtr(P); + verifyAllocHookSize(Size); for (size_t I = 0; I < Size; I++) EXPECT_EQ((reinterpret_cast<uint8_t *>(P))[I], 0U); free(P); + verifyDeallocHookPtr(P); P = calloc(1U, 0U); EXPECT_NE(P, nullptr); @@ -99,19 +216,28 @@ TEST(ScudoWrappersCTest, Calloc) { EXPECT_EQ(errno, ENOMEM); } -TEST(ScudoWrappersCTest, SmallAlign) { - void *P; - for (size_t Size = 1; Size <= 0x10000; Size <<= 1) { - for (size_t Align = 1; Align <= 0x10000; Align <<= 1) { +TEST_F(ScudoWrappersCTest, SmallAlign) { + // Allocating pointers by the powers of 2 from 1 to 0x10000 + // Using powers of 2 due to memalign using powers of 2 and test more sizes + constexpr size_t MaxSize = 0x10000; + std::vector<void *> ptrs; + // Reserving space to prevent further allocation during the test + ptrs.reserve((scudo::getLeastSignificantSetBitIndex(MaxSize) + 1) * + (scudo::getLeastSignificantSetBitIndex(MaxSize) + 1) * 3); + for (size_t Size = 1; Size <= MaxSize; Size <<= 1) { + for (size_t Align = 1; Align <= MaxSize; Align <<= 1) { for (size_t Count = 0; Count < 3; ++Count) { - P = memalign(Align, Size); + void *P = memalign(Align, Size); EXPECT_TRUE(reinterpret_cast<uintptr_t>(P) % Align == 0); + ptrs.push_back(P); } } } + for (void *ptr : ptrs) + free(ptr); } -TEST(ScudoWrappersCTest, Memalign) { +TEST_F(ScudoWrappersCTest, Memalign) { void *P; for (size_t I = FIRST_32_SECOND_64(2U, 3U); I <= 18U; I++) { const size_t Alignment = 1U << I; @@ -120,14 +246,20 @@ TEST(ScudoWrappersCTest, Memalign) { EXPECT_NE(P, nullptr); EXPECT_LE(Size, malloc_usable_size(P)); EXPECT_EQ(reinterpret_cast<uintptr_t>(P) % Alignment, 0U); + verifyAllocHookPtr(P); + verifyAllocHookSize(Size); free(P); + verifyDeallocHookPtr(P); P = nullptr; EXPECT_EQ(posix_memalign(&P, Alignment, Size), 0); EXPECT_NE(P, nullptr); EXPECT_LE(Size, malloc_usable_size(P)); EXPECT_EQ(reinterpret_cast<uintptr_t>(P) % Alignment, 0U); + verifyAllocHookPtr(P); + verifyAllocHookSize(Size); free(P); + verifyDeallocHookPtr(P); } EXPECT_EQ(memalign(4096U, SIZE_MAX), nullptr); @@ -139,18 +271,24 @@ TEST(ScudoWrappersCTest, Memalign) { for (size_t Alignment = 0U; Alignment <= 128U; Alignment++) { P = memalign(Alignment, 1024U); EXPECT_NE(P, nullptr); + verifyAllocHookPtr(P); + verifyAllocHookSize(Size); free(P); + verifyDeallocHookPtr(P); } } } -TEST(ScudoWrappersCTest, AlignedAlloc) { +TEST_F(ScudoWrappersCTest, AlignedAlloc) { const size_t Alignment = 4096U; void *P = aligned_alloc(Alignment, Alignment * 4U); EXPECT_NE(P, nullptr); EXPECT_LE(Alignment * 4U, malloc_usable_size(P)); EXPECT_EQ(reinterpret_cast<uintptr_t>(P) % Alignment, 0U); + verifyAllocHookPtr(P); + verifyAllocHookSize(Alignment * 4U); free(P); + verifyDeallocHookPtr(P); errno = 0; P = aligned_alloc(Alignment, Size); @@ -158,33 +296,60 @@ TEST(ScudoWrappersCTest, AlignedAlloc) { EXPECT_EQ(errno, EINVAL); } -TEST(ScudoWrappersCDeathTest, Realloc) { +TEST_F(ScudoWrappersCDeathTest, Realloc) { + invalidateHookPtrs(); // realloc(nullptr, N) is malloc(N) - void *P = realloc(nullptr, 0U); + void *P = realloc(nullptr, Size); EXPECT_NE(P, nullptr); + verifyAllocHookPtr(P); + verifyAllocHookSize(Size); free(P); + 
verifyDeallocHookPtr(P); + invalidateHookPtrs(); P = malloc(Size); EXPECT_NE(P, nullptr); // realloc(P, 0U) is free(P) and returns nullptr EXPECT_EQ(realloc(P, 0U), nullptr); + verifyDeallocHookPtr(P); P = malloc(Size); EXPECT_NE(P, nullptr); EXPECT_LE(Size, malloc_usable_size(P)); memset(P, 0x42, Size); + invalidateHookPtrs(); + void *OldP = P; P = realloc(P, Size * 2U); EXPECT_NE(P, nullptr); EXPECT_LE(Size * 2U, malloc_usable_size(P)); for (size_t I = 0; I < Size; I++) EXPECT_EQ(0x42, (reinterpret_cast<uint8_t *>(P))[I]); + if (OldP == P) { + verifyDeallocHookPtr(OldP); + verifyAllocHookPtr(OldP); + } else { + verifyAllocHookPtr(P); + verifyAllocHookSize(Size * 2U); + verifyDeallocHookPtr(OldP); + } + verifyReallocHookPtrs(OldP, P, Size * 2U); + invalidateHookPtrs(); + OldP = P; P = realloc(P, Size / 2U); EXPECT_NE(P, nullptr); EXPECT_LE(Size / 2U, malloc_usable_size(P)); for (size_t I = 0; I < Size / 2U; I++) EXPECT_EQ(0x42, (reinterpret_cast<uint8_t *>(P))[I]); + if (OldP == P) { + verifyDeallocHookPtr(OldP); + verifyAllocHookPtr(OldP); + } else { + verifyAllocHookPtr(P); + verifyAllocHookSize(Size / 2U); + } + verifyReallocHookPtrs(OldP, P, Size / 2U); free(P); EXPECT_DEATH(P = realloc(P, Size), ""); @@ -218,7 +383,7 @@ TEST(ScudoWrappersCDeathTest, Realloc) { } #if !SCUDO_FUCHSIA -TEST(ScudoWrappersCTest, MallOpt) { +TEST_F(ScudoWrappersCTest, MallOpt) { errno = 0; EXPECT_EQ(mallopt(-1000, 1), 0); // mallopt doesn't set errno. @@ -239,15 +404,19 @@ TEST(ScudoWrappersCTest, MallOpt) { } #endif -TEST(ScudoWrappersCTest, OtherAlloc) { -#if !SCUDO_FUCHSIA - const size_t PageSize = sysconf(_SC_PAGESIZE); +TEST_F(ScudoWrappersCTest, OtherAlloc) { +#if HAVE_PVALLOC + const size_t PageSize = static_cast<size_t>(sysconf(_SC_PAGESIZE)); void *P = pvalloc(Size); EXPECT_NE(P, nullptr); EXPECT_EQ(reinterpret_cast<uintptr_t>(P) & (PageSize - 1), 0U); EXPECT_LE(PageSize, malloc_usable_size(P)); + verifyAllocHookPtr(P); + // Size will be rounded up to PageSize. + verifyAllocHookSize(PageSize); free(P); + verifyDeallocHookPtr(P); EXPECT_EQ(pvalloc(SIZE_MAX), nullptr); @@ -257,32 +426,44 @@ TEST(ScudoWrappersCTest, OtherAlloc) { free(P); #endif +#if HAVE_VALLOC EXPECT_EQ(valloc(SIZE_MAX), nullptr); +#endif } -#if !SCUDO_FUCHSIA -TEST(ScudoWrappersCTest, MallInfo) { +template<typename FieldType> +void MallInfoTest() { // mallinfo is deprecated. #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wdeprecated-declarations" - const size_t BypassQuarantineSize = 1024U; + const FieldType BypassQuarantineSize = 1024U; struct mallinfo MI = mallinfo(); - size_t Allocated = MI.uordblks; + FieldType Allocated = MI.uordblks; void *P = malloc(BypassQuarantineSize); EXPECT_NE(P, nullptr); MI = mallinfo(); - EXPECT_GE(static_cast<size_t>(MI.uordblks), Allocated + BypassQuarantineSize); - EXPECT_GT(static_cast<size_t>(MI.hblkhd), 0U); - size_t Free = MI.fordblks; + EXPECT_GE(MI.uordblks, Allocated + BypassQuarantineSize); + EXPECT_GT(MI.hblkhd, static_cast<FieldType>(0)); + FieldType Free = MI.fordblks; free(P); MI = mallinfo(); - EXPECT_GE(static_cast<size_t>(MI.fordblks), Free + BypassQuarantineSize); + EXPECT_GE(MI.fordblks, Free + BypassQuarantineSize); #pragma clang diagnostic pop } + +#if !SCUDO_FUCHSIA +TEST_F(ScudoWrappersCTest, MallInfo) { +#if SCUDO_ANDROID + // Android accidentally set the fields to size_t instead of int. 
+ MallInfoTest<size_t>(); +#else + MallInfoTest<int>(); +#endif +} #endif -#if __GLIBC_PREREQ(2, 33) -TEST(ScudoWrappersCTest, MallInfo2) { +#if __GLIBC_PREREQ(2, 33) || SCUDO_ANDROID +TEST_F(ScudoWrappersCTest, MallInfo2) { const size_t BypassQuarantineSize = 1024U; struct mallinfo2 MI = mallinfo2(); size_t Allocated = MI.uordblks; @@ -301,7 +482,7 @@ TEST(ScudoWrappersCTest, MallInfo2) { static uintptr_t BoundaryP; static size_t Count; -static void callback(uintptr_t Base, size_t Size, void *Arg) { +static void callback(uintptr_t Base, UNUSED size_t Size, UNUSED void *Arg) { if (scudo::archSupportsMemoryTagging()) { Base = scudo::untagPointer(Base); BoundaryP = scudo::untagPointer(BoundaryP); @@ -314,13 +495,22 @@ static void callback(uintptr_t Base, size_t Size, void *Arg) { // To achieve this, we allocate a chunk for which the backing block will be // aligned on a page, then run the malloc_iterate on both the pages that the // block is a boundary for. It must only be seen once by the callback function. -TEST(ScudoWrappersCTest, MallocIterateBoundary) { - const size_t PageSize = sysconf(_SC_PAGESIZE); +TEST_F(ScudoWrappersCTest, MallocIterateBoundary) { + const size_t PageSize = static_cast<size_t>(sysconf(_SC_PAGESIZE)); +#if SCUDO_ANDROID + // Android uses a 16 byte alignment for both 32 bit and 64 bit. + const size_t BlockDelta = 16U; +#else const size_t BlockDelta = FIRST_32_SECOND_64(8U, 16U); +#endif const size_t SpecialSize = PageSize - BlockDelta; // We aren't guaranteed that any size class is exactly a page wide. So we need - // to keep making allocations until we succeed. + // to keep making allocations until we get an allocation that starts exactly + // on a page boundary. The BlockDelta value is expected to be the number of + // bytes to subtract from a returned pointer to get to the actual start of + // the pointer in the size class. In practice, this means BlockDelta should + // be set to the minimum alignment in bytes for the allocation. // // With a 16-byte block alignment and 4096-byte page size, each allocation has // a probability of (1 - (16/4096)) of failing to meet the alignment @@ -357,7 +547,7 @@ TEST(ScudoWrappersCTest, MallocIterateBoundary) { // Fuchsia doesn't have alarm, fork or malloc_info. #if !SCUDO_FUCHSIA -TEST(ScudoWrappersCDeathTest, MallocDisableDeadlock) { +TEST_F(ScudoWrappersCDeathTest, MallocDisableDeadlock) { // We expect heap operations within a disable/enable scope to deadlock. EXPECT_DEATH( { @@ -372,7 +562,7 @@ TEST(ScudoWrappersCDeathTest, MallocDisableDeadlock) { ""); } -TEST(ScudoWrappersCTest, MallocInfo) { +TEST_F(ScudoWrappersCTest, MallocInfo) { // Use volatile so that the allocations don't get optimized away. void *volatile P1 = malloc(1234); void *volatile P2 = malloc(4321); @@ -392,7 +582,7 @@ TEST(ScudoWrappersCTest, MallocInfo) { free(P2); } -TEST(ScudoWrappersCDeathTest, Fork) { +TEST_F(ScudoWrappersCDeathTest, Fork) { void *P; pid_t Pid = fork(); EXPECT_GE(Pid, 0) << strerror(errno); @@ -424,7 +614,7 @@ static pthread_mutex_t Mutex; static pthread_cond_t Conditional = PTHREAD_COND_INITIALIZER; static bool Ready; -static void *enableMalloc(void *Unused) { +static void *enableMalloc(UNUSED void *Unused) { // Initialize the allocator for this thread. 
void *P = malloc(Size); EXPECT_NE(P, nullptr); @@ -444,7 +634,7 @@ static void *enableMalloc(void *Unused) { return nullptr; } -TEST(ScudoWrappersCTest, DisableForkEnable) { +TEST_F(ScudoWrappersCTest, DisableForkEnable) { pthread_t ThreadId; Ready = false; EXPECT_EQ(pthread_create(&ThreadId, nullptr, &enableMalloc, nullptr), 0); diff --git a/standalone/tests/wrappers_cpp_test.cpp b/standalone/tests/wrappers_cpp_test.cpp index a88dc4aacd5..c802ed22fba 100644 --- a/standalone/tests/wrappers_cpp_test.cpp +++ b/standalone/tests/wrappers_cpp_test.cpp @@ -17,49 +17,120 @@ #include <thread> #include <vector> +// Android does not support checking for new/delete mismatches. +#if SCUDO_ANDROID +#define SKIP_MISMATCH_TESTS 1 +#else +#define SKIP_MISMATCH_TESTS 0 +#endif + void operator delete(void *, size_t) noexcept; void operator delete[](void *, size_t) noexcept; -// Note that every Cxx allocation function in the test binary will be fulfilled -// by Scudo. See the comment in the C counterpart of this file. +extern "C" { +#ifndef SCUDO_ENABLE_HOOKS_TESTS +#define SCUDO_ENABLE_HOOKS_TESTS 0 +#endif + +#if (SCUDO_ENABLE_HOOKS_TESTS == 1) && (SCUDO_ENABLE_HOOKS == 0) +#error "Hooks tests should have hooks enabled as well!" +#endif -template <typename T> static void testCxxNew() { - T *P = new T; - EXPECT_NE(P, nullptr); - memset(P, 0x42, sizeof(T)); - EXPECT_DEATH(delete[] P, ""); - delete P; - EXPECT_DEATH(delete P, ""); - - P = new T; - EXPECT_NE(P, nullptr); - memset(P, 0x42, sizeof(T)); - operator delete(P, sizeof(T)); - - P = new (std::nothrow) T; - EXPECT_NE(P, nullptr); - memset(P, 0x42, sizeof(T)); - delete P; - - const size_t N = 16U; - T *A = new T[N]; - EXPECT_NE(A, nullptr); - memset(A, 0x42, sizeof(T) * N); - EXPECT_DEATH(delete A, ""); - delete[] A; - EXPECT_DEATH(delete[] A, ""); - - A = new T[N]; - EXPECT_NE(A, nullptr); - memset(A, 0x42, sizeof(T) * N); - operator delete[](A, sizeof(T) * N); - - A = new (std::nothrow) T[N]; - EXPECT_NE(A, nullptr); - memset(A, 0x42, sizeof(T) * N); - delete[] A; +struct AllocContext { + void *Ptr; + size_t Size; +}; +struct DeallocContext { + void *Ptr; +}; +static AllocContext AC; +static DeallocContext DC; + +#if (SCUDO_ENABLE_HOOKS_TESTS == 1) +__attribute__((visibility("default"))) void __scudo_allocate_hook(void *Ptr, + size_t Size) { + AC.Ptr = Ptr; + AC.Size = Size; +} +__attribute__((visibility("default"))) void __scudo_deallocate_hook(void *Ptr) { + DC.Ptr = Ptr; +} +#endif // (SCUDO_ENABLE_HOOKS_TESTS == 1) } +class ScudoWrappersCppTest : public Test { +protected: + void SetUp() override { + if (SCUDO_ENABLE_HOOKS && !SCUDO_ENABLE_HOOKS_TESTS) + printf("Hooks are enabled but hooks tests are disabled.\n"); + } + + void verifyAllocHookPtr(UNUSED void *Ptr) { + if (SCUDO_ENABLE_HOOKS_TESTS) + EXPECT_EQ(Ptr, AC.Ptr); + } + void verifyAllocHookSize(UNUSED size_t Size) { + if (SCUDO_ENABLE_HOOKS_TESTS) + EXPECT_EQ(Size, AC.Size); + } + void verifyDeallocHookPtr(UNUSED void *Ptr) { + if (SCUDO_ENABLE_HOOKS_TESTS) + EXPECT_EQ(Ptr, DC.Ptr); + } + + template <typename T> void testCxxNew() { + T *P = new T; + EXPECT_NE(P, nullptr); + verifyAllocHookPtr(P); + verifyAllocHookSize(sizeof(T)); + memset(P, 0x42, sizeof(T)); + EXPECT_DEATH(delete[] P, ""); + delete P; + verifyDeallocHookPtr(P); + EXPECT_DEATH(delete P, ""); + + P = new T; + EXPECT_NE(P, nullptr); + memset(P, 0x42, sizeof(T)); + operator delete(P, sizeof(T)); + verifyDeallocHookPtr(P); + + P = new (std::nothrow) T; + verifyAllocHookPtr(P); + verifyAllocHookSize(sizeof(T)); + EXPECT_NE(P, 
nullptr); + memset(P, 0x42, sizeof(T)); + delete P; + verifyDeallocHookPtr(P); + + const size_t N = 16U; + T *A = new T[N]; + EXPECT_NE(A, nullptr); + verifyAllocHookPtr(A); + verifyAllocHookSize(sizeof(T) * N); + memset(A, 0x42, sizeof(T) * N); + EXPECT_DEATH(delete A, ""); + delete[] A; + verifyDeallocHookPtr(A); + EXPECT_DEATH(delete[] A, ""); + + A = new T[N]; + EXPECT_NE(A, nullptr); + memset(A, 0x42, sizeof(T) * N); + operator delete[](A, sizeof(T) * N); + verifyDeallocHookPtr(A); + + A = new (std::nothrow) T[N]; + verifyAllocHookPtr(A); + verifyAllocHookSize(sizeof(T) * N); + EXPECT_NE(A, nullptr); + memset(A, 0x42, sizeof(T) * N); + delete[] A; + verifyDeallocHookPtr(A); + } +}; +using ScudoWrappersCppDeathTest = ScudoWrappersCppTest; + class Pixel { public: enum class Color { Red, Green, Blue }; @@ -68,8 +139,11 @@ public: Color C = Color::Red; }; -TEST(ScudoWrappersCppDeathTest, New) { - if (getenv("SKIP_TYPE_MISMATCH")) { +// Note that every Cxx allocation function in the test binary will be fulfilled +// by Scudo. See the comment in the C counterpart of this file. + +TEST_F(ScudoWrappersCppDeathTest, New) { + if (getenv("SKIP_TYPE_MISMATCH") || SKIP_MISMATCH_TESTS) { printf("Skipped type mismatch tests.\n"); return; } @@ -96,7 +170,7 @@ static void stressNew() { Cv.wait(Lock); } for (size_t I = 0; I < 256U; I++) { - const size_t N = std::rand() % 128U; + const size_t N = static_cast<size_t>(std::rand()) % 128U; uintptr_t *P = new uintptr_t[N]; if (P) { memset(P, 0x42, sizeof(uintptr_t) * N); @@ -109,7 +183,7 @@ static void stressNew() { } } -TEST(ScudoWrappersCppTest, ThreadedNew) { +TEST_F(ScudoWrappersCppTest, ThreadedNew) { // TODO: Investigate why libc sometimes crashes with tag missmatch in // __pthread_clockjoin_ex. std::unique_ptr<scudo::ScopedDisableMemoryTagChecks> NoTags; @@ -131,7 +205,7 @@ TEST(ScudoWrappersCppTest, ThreadedNew) { } #if !SCUDO_FUCHSIA -TEST(ScudoWrappersCppTest, AllocAfterFork) { +TEST_F(ScudoWrappersCppTest, AllocAfterFork) { // This test can fail flakily when ran as a part of large number of // other tests if the maxmimum number of mappings allowed is low. // We tried to reduce the number of iterations of the loops with diff --git a/standalone/timing.cpp b/standalone/timing.cpp new file mode 100644 index 00000000000..59ae21d10f0 --- /dev/null +++ b/standalone/timing.cpp @@ -0,0 +1,29 @@ +//===-- timing.cpp ----------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "timing.h" + +namespace scudo { + +Timer::~Timer() { + if (Manager) + Manager->report(*this); +} + +ScopedTimer::ScopedTimer(TimingManager &Manager, const char *Name) + : Timer(Manager.getOrCreateTimer(Name)) { + start(); +} + +ScopedTimer::ScopedTimer(TimingManager &Manager, const Timer &Nest, + const char *Name) + : Timer(Manager.nest(Nest, Name)) { + start(); +} + +} // namespace scudo diff --git a/standalone/timing.h b/standalone/timing.h new file mode 100644 index 00000000000..84caa79e5c3 --- /dev/null +++ b/standalone/timing.h @@ -0,0 +1,221 @@ +//===-- timing.h ------------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SCUDO_TIMING_H_ +#define SCUDO_TIMING_H_ + +#include "common.h" +#include "mutex.h" +#include "string_utils.h" +#include "thread_annotations.h" + +#include <inttypes.h> +#include <string.h> + +namespace scudo { + +class TimingManager; + +// A simple timer for evaluating execution time of code snippets. It can be used +// along with TimingManager or standalone. +class Timer { +public: + // The use of Timer without binding to a TimingManager is supposed to do the + // timer logging manually. Otherwise, TimingManager will do the logging stuff + // for you. + Timer() = default; + Timer(Timer &&Other) + : StartTime(0), AccTime(Other.AccTime), Manager(Other.Manager), + HandleId(Other.HandleId) { + Other.Manager = nullptr; + } + + Timer(const Timer &) = delete; + + ~Timer(); + + void start() { + CHECK_EQ(StartTime, 0U); + StartTime = getMonotonicTime(); + } + void stop() { + AccTime += getMonotonicTime() - StartTime; + StartTime = 0; + } + u64 getAccumulatedTime() const { return AccTime; } + + // Unset the bound TimingManager so that we don't report the data back. This + // is useful if we only want to track subset of certain scope events. + void ignore() { + StartTime = 0; + AccTime = 0; + Manager = nullptr; + } + +protected: + friend class TimingManager; + Timer(TimingManager &Manager, u32 HandleId) + : Manager(&Manager), HandleId(HandleId) {} + + u64 StartTime = 0; + u64 AccTime = 0; + TimingManager *Manager = nullptr; + u32 HandleId; +}; + +// A RAII-style wrapper for easy scope execution measurement. Note that in order +// not to take additional space for the message like `Name`. It only works with +// TimingManager. +class ScopedTimer : public Timer { +public: + ScopedTimer(TimingManager &Manager, const char *Name); + ScopedTimer(TimingManager &Manager, const Timer &Nest, const char *Name); + ~ScopedTimer() { stop(); } +}; + +// In Scudo, the execution time of single run of code snippets may not be +// useful, we are more interested in the average time from several runs. +// TimingManager lets the registered timer report their data and reports the +// average execution time for each timer periodically. +class TimingManager { +public: + TimingManager(u32 PrintingInterval = DefaultPrintingInterval) + : PrintingInterval(PrintingInterval) {} + ~TimingManager() { + if (NumAllocatedTimers != 0) + printAll(); + } + + Timer getOrCreateTimer(const char *Name) EXCLUDES(Mutex) { + ScopedLock L(Mutex); + + CHECK_LT(strlen(Name), MaxLenOfTimerName); + for (u32 I = 0; I < NumAllocatedTimers; ++I) { + if (strncmp(Name, Timers[I].Name, MaxLenOfTimerName) == 0) + return Timer(*this, I); + } + + CHECK_LT(NumAllocatedTimers, MaxNumberOfTimers); + strncpy(Timers[NumAllocatedTimers].Name, Name, MaxLenOfTimerName); + TimerRecords[NumAllocatedTimers].AccumulatedTime = 0; + TimerRecords[NumAllocatedTimers].Occurrence = 0; + return Timer(*this, NumAllocatedTimers++); + } + + // Add a sub-Timer associated with another Timer. This is used when we want to + // detail the execution time in the scope of a Timer. + // For example, + // void Foo() { + // // T1 records the time spent in both first and second tasks. + // ScopedTimer T1(getTimingManager(), "Task1"); + // { + // // T2 records the time spent in first task + // ScopedTimer T2(getTimingManager, T1, "Task2"); + // // Do first task. + // } + // // Do second task. 
+ // } + // + // The report will show proper indents to indicate the nested relation like, + // -- Average Operation Time -- -- Name (# of Calls) -- + // 10.0(ns) Task1 (1) + // 5.0(ns) Task2 (1) + Timer nest(const Timer &T, const char *Name) EXCLUDES(Mutex) { + CHECK_EQ(T.Manager, this); + Timer Nesting = getOrCreateTimer(Name); + + ScopedLock L(Mutex); + CHECK_NE(Nesting.HandleId, T.HandleId); + Timers[Nesting.HandleId].Nesting = T.HandleId; + return Nesting; + } + + void report(const Timer &T) EXCLUDES(Mutex) { + ScopedLock L(Mutex); + + const u32 HandleId = T.HandleId; + CHECK_LT(HandleId, MaxNumberOfTimers); + TimerRecords[HandleId].AccumulatedTime += T.getAccumulatedTime(); + ++TimerRecords[HandleId].Occurrence; + ++NumEventsReported; + if (NumEventsReported % PrintingInterval == 0) + printAllImpl(); + } + + void printAll() EXCLUDES(Mutex) { + ScopedLock L(Mutex); + printAllImpl(); + } + +private: + void printAllImpl() REQUIRES(Mutex) { + static char NameHeader[] = "-- Name (# of Calls) --"; + static char AvgHeader[] = "-- Average Operation Time --"; + ScopedString Str; + Str.append("%-15s %-15s\n", AvgHeader, NameHeader); + + for (u32 I = 0; I < NumAllocatedTimers; ++I) { + if (Timers[I].Nesting != MaxNumberOfTimers) + continue; + printImpl(Str, I); + } + + Str.output(); + } + + void printImpl(ScopedString &Str, const u32 HandleId, + const u32 ExtraIndent = 0) REQUIRES(Mutex) { + const u64 AccumulatedTime = TimerRecords[HandleId].AccumulatedTime; + const u64 Occurrence = TimerRecords[HandleId].Occurrence; + const u64 Integral = Occurrence == 0 ? 0 : AccumulatedTime / Occurrence; + // Only keep single digit of fraction is enough and it enables easier layout + // maintenance. + const u64 Fraction = + Occurrence == 0 ? 0 + : ((AccumulatedTime % Occurrence) * 10) / Occurrence; + + Str.append("%14" PRId64 ".%" PRId64 "(ns) %-11s", Integral, Fraction, " "); + + for (u32 I = 0; I < ExtraIndent; ++I) + Str.append("%s", " "); + Str.append("%s (%" PRId64 ")\n", Timers[HandleId].Name, Occurrence); + + for (u32 I = 0; I < NumAllocatedTimers; ++I) + if (Timers[I].Nesting == HandleId) + printImpl(Str, I, ExtraIndent + 1); + } + + // Instead of maintaining pages for timer registration, a static buffer is + // sufficient for most use cases in Scudo. + static constexpr u32 MaxNumberOfTimers = 50; + static constexpr u32 MaxLenOfTimerName = 50; + static constexpr u32 DefaultPrintingInterval = 100; + + struct Record { + u64 AccumulatedTime = 0; + u64 Occurrence = 0; + }; + + struct TimerInfo { + char Name[MaxLenOfTimerName + 1]; + u32 Nesting = MaxNumberOfTimers; + }; + + HybridMutex Mutex; + // The frequency of proactively dumping the timer statistics. For example, the + // default setting is to dump the statistics every 100 reported events. 
+ u32 PrintingInterval GUARDED_BY(Mutex); + u64 NumEventsReported GUARDED_BY(Mutex) = 0; + u32 NumAllocatedTimers GUARDED_BY(Mutex) = 0; + TimerInfo Timers[MaxNumberOfTimers] GUARDED_BY(Mutex); + Record TimerRecords[MaxNumberOfTimers] GUARDED_BY(Mutex); +}; + +} // namespace scudo + +#endif // SCUDO_TIMING_H_ diff --git a/standalone/trusty.cpp b/standalone/trusty.cpp index 592514d4c3a..26b349c6e50 100644 --- a/standalone/trusty.cpp +++ b/standalone/trusty.cpp @@ -12,17 +12,18 @@ #include "common.h" #include "mutex.h" -#include "string_utils.h" +#include "report_linux.h" #include "trusty.h" #include <errno.h> // for errno +#include <lk/err_ptr.h> // for PTR_ERR and IS_ERR #include <stdio.h> // for printf() #include <stdlib.h> // for getenv() #include <sys/auxv.h> // for getauxval() #include <time.h> // for clock_gettime() +#include <trusty_err.h> // for lk_err_to_errno() #include <trusty_syscalls.h> // for _trusty_brk() - -#define SBRK_ALIGN 32 +#include <uapi/mm.h> // for MMAP flags namespace scudo { @@ -30,35 +31,39 @@ uptr getPageSize() { return getauxval(AT_PAGESZ); } void NORETURN die() { abort(); } -void *map(UNUSED void *Addr, uptr Size, UNUSED const char *Name, uptr Flags, +void *map(void *Addr, uptr Size, const char *Name, uptr Flags, UNUSED MapPlatformData *Data) { - // Calling _trusty_brk(0) returns the current program break. - uptr ProgramBreak = reinterpret_cast<uptr>(_trusty_brk(0)); - uptr Start; - uptr End; - - Start = roundUp(ProgramBreak, SBRK_ALIGN); - // Don't actually extend the heap if MAP_NOACCESS flag is set since this is - // the case where Scudo tries to reserve a memory region without mapping - // physical pages. + uint32_t MmapFlags = + MMAP_FLAG_ANONYMOUS | MMAP_FLAG_PROT_READ | MMAP_FLAG_PROT_WRITE; + + // If the MAP_NOACCESS flag is set, Scudo tries to reserve + // a memory region without mapping physical pages. This corresponds + // to MMAP_FLAG_NO_PHYSICAL in Trusty. if (Flags & MAP_NOACCESS) - return reinterpret_cast<void *>(Start); - - // Attempt to extend the heap by Size bytes using _trusty_brk. - End = roundUp(Start + Size, SBRK_ALIGN); - ProgramBreak = - reinterpret_cast<uptr>(_trusty_brk(reinterpret_cast<void *>(End))); - if (ProgramBreak < End) { - errno = ENOMEM; - dieOnMapUnmapError(Size); + MmapFlags |= MMAP_FLAG_NO_PHYSICAL; + if (Addr) + MmapFlags |= MMAP_FLAG_FIXED_NOREPLACE; + + if (Flags & MAP_MEMTAG) + MmapFlags |= MMAP_FLAG_PROT_MTE; + + void *P = (void *)_trusty_mmap(Addr, Size, MmapFlags, 0); + + if (IS_ERR(P)) { + errno = lk_err_to_errno(PTR_ERR(P)); + if (!(Flags & MAP_ALLOWNOMEM) || errno != ENOMEM) + reportMapError(Size); return nullptr; } - return reinterpret_cast<void *>(Start); // Base of new reserved region. + + return P; } -// Unmap is a no-op since Trusty uses sbrk instead of memory mapping. 
void unmap(UNUSED void *Addr, UNUSED uptr Size, UNUSED uptr Flags, - UNUSED MapPlatformData *Data) {} + UNUSED MapPlatformData *Data) { + if (_trusty_munmap(Addr, Size) != 0) + reportUnmapError(reinterpret_cast<uptr>(Addr), Size); +} void setMemoryPermission(UNUSED uptr Addr, UNUSED uptr Size, UNUSED uptr Flags, UNUSED MapPlatformData *Data) {} @@ -85,6 +90,17 @@ u64 getMonotonicTime() { static_cast<u64>(TS.tv_nsec); } +u64 getMonotonicTimeFast() { +#if defined(CLOCK_MONOTONIC_COARSE) + timespec TS; + clock_gettime(CLOCK_MONOTONIC_COARSE, &TS); + return static_cast<u64>(TS.tv_sec) * (1000ULL * 1000 * 1000) + + static_cast<u64>(TS.tv_nsec); +#else + return getMonotonicTime(); +#endif +} + u32 getNumberOfCPUs() { return 0; } u32 getThreadID() { return 0; } diff --git a/standalone/tsd.h b/standalone/tsd.h index c5ed6ddfa12..72773f2f72b 100644 --- a/standalone/tsd.h +++ b/standalone/tsd.h @@ -53,8 +53,14 @@ template <class Allocator> struct alignas(SCUDO_CACHE_LINE_SIZE) TSD { inline void unlock() NO_THREAD_SAFETY_ANALYSIS { Mutex.unlock(); } inline uptr getPrecedence() { return atomic_load_relaxed(&Precedence); } - void commitBack(Allocator *Instance) ASSERT_CAPABILITY(Mutex) { - Instance->commitBack(this); + void commitBack(Allocator *Instance) { Instance->commitBack(this); } + + // As the comments attached to `getCache()`, the TSD doesn't always need to be + // locked. In that case, we would only skip the check before we have all TSDs + // locked in all paths. + void assertLocked(bool BypassCheck) ASSERT_CAPABILITY(Mutex) { + if (SCUDO_DEBUG && !BypassCheck) + Mutex.assertHeld(); } // Ideally, we may want to assert that all the operations on @@ -66,11 +72,8 @@ template <class Allocator> struct alignas(SCUDO_CACHE_LINE_SIZE) TSD { // TODO(chiahungduan): Ideally, we want to do `Mutex.assertHeld` but acquiring // TSD doesn't always require holding the lock. Add this assertion while the // lock is always acquired. 
- typename Allocator::CacheT &getCache() ASSERT_CAPABILITY(Mutex) { - return Cache; - } - typename Allocator::QuarantineCacheT &getQuarantineCache() - ASSERT_CAPABILITY(Mutex) { + typename Allocator::CacheT &getCache() REQUIRES(Mutex) { return Cache; } + typename Allocator::QuarantineCacheT &getQuarantineCache() REQUIRES(Mutex) { return QuarantineCache; } diff --git a/standalone/tsd_exclusive.h b/standalone/tsd_exclusive.h index 62da8aeb537..a58ba650508 100644 --- a/standalone/tsd_exclusive.h +++ b/standalone/tsd_exclusive.h @@ -11,6 +11,8 @@ #include "tsd.h" +#include "string_utils.h" + namespace scudo { struct ThreadState { @@ -25,6 +27,31 @@ struct ThreadState { template <class Allocator> void teardownThread(void *Ptr); template <class Allocator> struct TSDRegistryExT { + using ThisT = TSDRegistryExT<Allocator>; + + struct ScopedTSD { + ALWAYS_INLINE ScopedTSD(ThisT &TSDRegistry) { + CurrentTSD = TSDRegistry.getTSDAndLock(&UnlockRequired); + DCHECK_NE(CurrentTSD, nullptr); + } + + ~ScopedTSD() { + if (UNLIKELY(UnlockRequired)) + CurrentTSD->unlock(); + } + + TSD<Allocator> &operator*() { return *CurrentTSD; } + + TSD<Allocator> *operator->() { + CurrentTSD->assertLocked(/*BypassCheck=*/!UnlockRequired); + return CurrentTSD; + } + + private: + TSD<Allocator> *CurrentTSD; + bool UnlockRequired; + }; + void init(Allocator *Instance) REQUIRES(Mutex) { DCHECK(!Initialized); Instance->init(); @@ -57,29 +84,21 @@ template <class Allocator> struct TSDRegistryExT { Initialized = false; } + void drainCaches(Allocator *Instance) { + // We don't have a way to iterate all thread local `ThreadTSD`s. Simply + // drain the `ThreadTSD` of current thread and `FallbackTSD`. + Instance->drainCache(&ThreadTSD); + FallbackTSD.lock(); + Instance->drainCache(&FallbackTSD); + FallbackTSD.unlock(); + } + ALWAYS_INLINE void initThreadMaybe(Allocator *Instance, bool MinimalInit) { if (LIKELY(State.InitState != ThreadState::NotInitialized)) return; initThread(Instance, MinimalInit); } - // TODO(chiahungduan): Consider removing the argument `UnlockRequired` by - // embedding the logic into TSD or always locking the TSD. It will enable us - // to properly mark thread annotation here and adding proper runtime - // assertions in the member functions of TSD. For example, assert the lock is - // acquired before calling TSD::commitBack(). - ALWAYS_INLINE TSD<Allocator> * - getTSDAndLock(bool *UnlockRequired) NO_THREAD_SAFETY_ANALYSIS { - if (LIKELY(State.InitState == ThreadState::Initialized && - !atomic_load(&Disabled, memory_order_acquire))) { - *UnlockRequired = false; - return &ThreadTSD; - } - FallbackTSD.lock(); - *UnlockRequired = true; - return &FallbackTSD; - } - // To disable the exclusive TSD registry, we effectively lock the fallback TSD // and force all threads to attempt to use it instead of their local one. void disable() NO_THREAD_SAFETY_ANALYSIS { @@ -104,7 +123,26 @@ template <class Allocator> struct TSDRegistryExT { bool getDisableMemInit() { return State.DisableMemInit; } + void getStats(ScopedString *Str) { + // We don't have a way to iterate all thread local `ThreadTSD`s. Instead of + // printing only self `ThreadTSD` which may mislead the usage, we just skip + // it. 
+ Str->append("Exclusive TSD don't support iterating each TSD\n"); + } + private: + ALWAYS_INLINE TSD<Allocator> * + getTSDAndLock(bool *UnlockRequired) NO_THREAD_SAFETY_ANALYSIS { + if (LIKELY(State.InitState == ThreadState::Initialized && + !atomic_load(&Disabled, memory_order_acquire))) { + *UnlockRequired = false; + return &ThreadTSD; + } + FallbackTSD.lock(); + *UnlockRequired = true; + return &FallbackTSD; + } + // Using minimal initialization allows for global initialization while keeping // the thread specific structure untouched. The fallback structure will be // used instead. diff --git a/standalone/tsd_shared.h b/standalone/tsd_shared.h index 64b3bd844b0..dade16dad9f 100644 --- a/standalone/tsd_shared.h +++ b/standalone/tsd_shared.h @@ -11,6 +11,8 @@ #include "tsd.h" +#include "string_utils.h" + #if SCUDO_HAS_PLATFORM_TLS_SLOT // This is a platform-provided header that needs to be on the include path when // Scudo is compiled. It must declare a function with the prototype: @@ -24,6 +26,27 @@ namespace scudo { template <class Allocator, u32 TSDsArraySize, u32 DefaultTSDCount> struct TSDRegistrySharedT { + using ThisT = TSDRegistrySharedT<Allocator, TSDsArraySize, DefaultTSDCount>; + + struct ScopedTSD { + ALWAYS_INLINE ScopedTSD(ThisT &TSDRegistry) { + CurrentTSD = TSDRegistry.getTSDAndLock(); + DCHECK_NE(CurrentTSD, nullptr); + } + + ~ScopedTSD() { CurrentTSD->unlock(); } + + TSD<Allocator> &operator*() { return *CurrentTSD; } + + TSD<Allocator> *operator->() { + CurrentTSD->assertLocked(/*BypassCheck=*/false); + return CurrentTSD; + } + + private: + TSD<Allocator> *CurrentTSD; + }; + void init(Allocator *Instance) REQUIRES(Mutex) { DCHECK(!Initialized); Instance->init(); @@ -52,6 +75,15 @@ struct TSDRegistrySharedT { Initialized = false; } + void drainCaches(Allocator *Instance) { + ScopedLock L(MutexTSDs); + for (uptr I = 0; I < NumberOfTSDs; ++I) { + TSDs[I].lock(); + Instance->drainCache(&TSDs[I]); + TSDs[I].unlock(); + } + } + ALWAYS_INLINE void initThreadMaybe(Allocator *Instance, UNUSED bool MinimalInit) { if (LIKELY(getCurrentTSD())) @@ -59,26 +91,6 @@ struct TSDRegistrySharedT { initThread(Instance); } - // TSDs is an array of locks and which is not supported for marking - // thread-safety capability. - ALWAYS_INLINE TSD<Allocator> * - getTSDAndLock(bool *UnlockRequired) NO_THREAD_SAFETY_ANALYSIS { - TSD<Allocator> *TSD = getCurrentTSD(); - DCHECK(TSD); - *UnlockRequired = true; - // Try to lock the currently associated context. - if (TSD->tryLock()) - return TSD; - // If that fails, go down the slow path. - if (TSDsArraySize == 1U) { - // Only 1 TSD, not need to go any further. - // The compiler will optimize this one way or the other. - TSD->lock(); - return TSD; - } - return getTSDAndLockSlow(TSD); - } - void disable() NO_THREAD_SAFETY_ANALYSIS { Mutex.lock(); for (u32 I = 0; I < TSDsArraySize; I++) @@ -102,7 +114,41 @@ struct TSDRegistrySharedT { bool getDisableMemInit() const { return *getTlsPtr() & 1; } + void getStats(ScopedString *Str) EXCLUDES(MutexTSDs) { + ScopedLock L(MutexTSDs); + + Str->append("Stats: SharedTSDs: %u available; total %u\n", NumberOfTSDs, + TSDsArraySize); + for (uptr I = 0; I < NumberOfTSDs; ++I) { + TSDs[I].lock(); + // Theoretically, we want to mark TSD::lock()/TSD::unlock() with proper + // thread annotations. However, given the TSD is only locked on shared + // path, do the assertion in a separate path to avoid confusing the + // analyzer. 
+ TSDs[I].assertLocked(/*BypassCheck=*/true); + Str->append(" Shared TSD[%zu]:\n", I); + TSDs[I].getCache().getStats(Str); + TSDs[I].unlock(); + } + } + private: + ALWAYS_INLINE TSD<Allocator> *getTSDAndLock() NO_THREAD_SAFETY_ANALYSIS { + TSD<Allocator> *TSD = getCurrentTSD(); + DCHECK(TSD); + // Try to lock the currently associated context. + if (TSD->tryLock()) + return TSD; + // If that fails, go down the slow path. + if (TSDsArraySize == 1U) { + // Only 1 TSD, not need to go any further. + // The compiler will optimize this one way or the other. + TSD->lock(); + return TSD; + } + return getTSDAndLockSlow(TSD); + } + ALWAYS_INLINE uptr *getTlsPtr() const { #if SCUDO_HAS_PLATFORM_TLS_SLOT return reinterpret_cast<uptr *>(getPlatformAllocatorTlsSlot()); diff --git a/standalone/vector.h b/standalone/vector.h index 9f2c200958f..ca10cc281d7 100644 --- a/standalone/vector.h +++ b/standalone/vector.h @@ -9,26 +9,20 @@ #ifndef SCUDO_VECTOR_H_ #define SCUDO_VECTOR_H_ -#include "common.h" +#include "mem_map.h" #include <string.h> namespace scudo { -// A low-level vector based on map. May incur a significant memory overhead for -// small vectors. The current implementation supports only POD types. +// A low-level vector based on map. It stores the contents inline up to a fixed +// capacity, or in an external memory buffer if it grows bigger than that. May +// incur a significant memory overhead for small vectors. The current +// implementation supports only POD types. +// +// NOTE: This class is not meant to be used directly, use Vector<T> instead. template <typename T> class VectorNoCtor { public: - constexpr void init(uptr InitialCapacity = 0) { - Data = &LocalData[0]; - CapacityBytes = sizeof(LocalData); - if (InitialCapacity > capacity()) - reserve(InitialCapacity); - } - void destroy() { - if (Data != &LocalData[0]) - unmap(Data, CapacityBytes, 0, &MapData); - } T &operator[](uptr I) { DCHECK_LT(I, Size); return Data[I]; @@ -41,7 +35,9 @@ public: DCHECK_LE(Size, capacity()); if (Size == capacity()) { const uptr NewCapacity = roundUpPowerOfTwo(Size + 1); - reallocate(NewCapacity); + if (!reallocate(NewCapacity)) { + return; + } } memcpy(&Data[Size++], &Element, sizeof(T)); } @@ -57,14 +53,17 @@ public: const T *data() const { return Data; } T *data() { return Data; } constexpr uptr capacity() const { return CapacityBytes / sizeof(T); } - void reserve(uptr NewSize) { + bool reserve(uptr NewSize) { // Never downsize internal buffer. 
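    // Growth can fail now that the backing store is a MemMapT mapped with
    // MAP_ALLOWNOMEM: reserve() forwards reallocate()'s result, and callers
    // (resize() here, push_back() via reallocate() directly) bail out without
    // touching the existing contents when the mapping cannot be grown.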
if (NewSize > capacity()) - reallocate(NewSize); + return reallocate(NewSize); + return true; } void resize(uptr NewSize) { if (NewSize > Size) { - reserve(NewSize); + if (!reserve(NewSize)) { + return; + } memset(&Data[Size], 0, sizeof(T) * (NewSize - Size)); } Size = NewSize; @@ -78,24 +77,47 @@ public: const T *end() const { return data() + size(); } T *end() { return data() + size(); } +protected: + constexpr void init(uptr InitialCapacity = 0) { + Data = &LocalData[0]; + CapacityBytes = sizeof(LocalData); + if (InitialCapacity > capacity()) + reserve(InitialCapacity); + } + void destroy() { + if (Data != &LocalData[0]) + ExternalBuffer.unmap(ExternalBuffer.getBase(), + ExternalBuffer.getCapacity()); + } + private: - void reallocate(uptr NewCapacity) { + bool reallocate(uptr NewCapacity) { DCHECK_GT(NewCapacity, 0); DCHECK_LE(Size, NewCapacity); + + MemMapT NewExternalBuffer; NewCapacity = roundUp(NewCapacity * sizeof(T), getPageSizeCached()); - T *NewData = reinterpret_cast<T *>( - map(nullptr, NewCapacity, "scudo:vector", 0, &MapData)); - memcpy(NewData, Data, Size * sizeof(T)); + if (!NewExternalBuffer.map(/*Addr=*/0U, NewCapacity, "scudo:vector", + MAP_ALLOWNOMEM)) { + return false; + } + T *NewExternalData = reinterpret_cast<T *>(NewExternalBuffer.getBase()); + + memcpy(NewExternalData, Data, Size * sizeof(T)); destroy(); - Data = NewData; + + Data = NewExternalData; CapacityBytes = NewCapacity; + ExternalBuffer = NewExternalBuffer; + return true; } T *Data = nullptr; - T LocalData[256 / sizeof(T)] = {}; uptr CapacityBytes = 0; uptr Size = 0; - [[no_unique_address]] MapPlatformData MapData = {}; + + T LocalData[256 / sizeof(T)] = {}; + MemMapT ExternalBuffer; }; template <typename T> class Vector : public VectorNoCtor<T> { diff --git a/standalone/wrappers_c.cpp b/standalone/wrappers_c.cpp index b4d51be716c..60014a0f66b 100644 --- a/standalone/wrappers_c.cpp +++ b/standalone/wrappers_c.cpp @@ -12,6 +12,9 @@ #if !SCUDO_ANDROID || !_BIONIC #include "allocator_config.h" +#include "internal_defs.h" +#include "platform.h" +#include "scudo/interface.h" #include "wrappers_c.h" #include "wrappers_c_checks.h" diff --git a/standalone/wrappers_c.inc b/standalone/wrappers_c.inc index 37e336ee09d..59f3fb0962f 100644 --- a/standalone/wrappers_c.inc +++ b/standalone/wrappers_c.inc @@ -17,6 +17,35 @@ #define SCUDO_MALLOC_ALIGNMENT FIRST_32_SECOND_64(8U, 16U) #endif +static void reportAllocation(void *ptr, size_t size) { + if (SCUDO_ENABLE_HOOKS) + if (__scudo_allocate_hook && ptr) + __scudo_allocate_hook(ptr, size); +} +static void reportDeallocation(void *ptr) { + if (SCUDO_ENABLE_HOOKS) + if (__scudo_deallocate_hook) + __scudo_deallocate_hook(ptr); +} +static void reportReallocAllocation(void *old_ptr, void *new_ptr, size_t size) { + DCHECK_NE(new_ptr, nullptr); + + if (SCUDO_ENABLE_HOOKS) { + if (__scudo_realloc_allocate_hook) + __scudo_realloc_allocate_hook(old_ptr, new_ptr, size); + else if (__scudo_allocate_hook) + __scudo_allocate_hook(new_ptr, size); + } +} +static void reportReallocDeallocation(void *old_ptr) { + if (SCUDO_ENABLE_HOOKS) { + if (__scudo_realloc_deallocate_hook) + __scudo_realloc_deallocate_hook(old_ptr); + else if (__scudo_deallocate_hook) + __scudo_deallocate_hook(old_ptr); + } +} + extern "C" { INTERFACE WEAK void *SCUDO_PREFIX(calloc)(size_t nmemb, size_t size) { @@ -28,11 +57,14 @@ INTERFACE WEAK void *SCUDO_PREFIX(calloc)(size_t nmemb, size_t size) { } scudo::reportCallocOverflow(nmemb, size); } - return scudo::setErrnoOnNull(SCUDO_ALLOCATOR.allocate( - Product, 
scudo::Chunk::Origin::Malloc, SCUDO_MALLOC_ALIGNMENT, true)); + void *Ptr = SCUDO_ALLOCATOR.allocate(Product, scudo::Chunk::Origin::Malloc, + SCUDO_MALLOC_ALIGNMENT, true); + reportAllocation(Ptr, Product); + return scudo::setErrnoOnNull(Ptr); } INTERFACE WEAK void SCUDO_PREFIX(free)(void *ptr) { + reportDeallocation(ptr); SCUDO_ALLOCATOR.deallocate(ptr, scudo::Chunk::Origin::Malloc); } @@ -54,6 +86,8 @@ INTERFACE WEAK struct SCUDO_MALLINFO SCUDO_PREFIX(mallinfo)(void) { return Info; } +// On Android, mallinfo2 is an alias of mallinfo, so don't define both. +#if !SCUDO_ANDROID INTERFACE WEAK struct __scudo_mallinfo2 SCUDO_PREFIX(mallinfo2)(void) { struct __scudo_mallinfo2 Info = {}; scudo::StatCounters Stats; @@ -70,10 +104,13 @@ INTERFACE WEAK struct __scudo_mallinfo2 SCUDO_PREFIX(mallinfo2)(void) { Info.fordblks = Info.fsmblks; return Info; } +#endif INTERFACE WEAK void *SCUDO_PREFIX(malloc)(size_t size) { - return scudo::setErrnoOnNull(SCUDO_ALLOCATOR.allocate( - size, scudo::Chunk::Origin::Malloc, SCUDO_MALLOC_ALIGNMENT)); + void *Ptr = SCUDO_ALLOCATOR.allocate(size, scudo::Chunk::Origin::Malloc, + SCUDO_MALLOC_ALIGNMENT); + reportAllocation(Ptr, size); + return scudo::setErrnoOnNull(Ptr); } #if SCUDO_ANDROID @@ -102,8 +139,10 @@ INTERFACE WEAK void *SCUDO_PREFIX(memalign)(size_t alignment, size_t size) { scudo::reportAlignmentNotPowerOfTwo(alignment); } } - return SCUDO_ALLOCATOR.allocate(size, scudo::Chunk::Origin::Memalign, - alignment); + void *Ptr = + SCUDO_ALLOCATOR.allocate(size, scudo::Chunk::Origin::Memalign, alignment); + reportAllocation(Ptr, size); + return Ptr; } INTERFACE WEAK int SCUDO_PREFIX(posix_memalign)(void **memptr, size_t alignment, @@ -117,6 +156,8 @@ INTERFACE WEAK int SCUDO_PREFIX(posix_memalign)(void **memptr, size_t alignment, SCUDO_ALLOCATOR.allocate(size, scudo::Chunk::Origin::Memalign, alignment); if (UNLIKELY(!Ptr)) return ENOMEM; + reportAllocation(Ptr, size); + *memptr = Ptr; return 0; } @@ -131,26 +172,57 @@ INTERFACE WEAK void *SCUDO_PREFIX(pvalloc)(size_t size) { scudo::reportPvallocOverflow(size); } // pvalloc(0) should allocate one page. - return scudo::setErrnoOnNull( + void *Ptr = SCUDO_ALLOCATOR.allocate(size ? scudo::roundUp(size, PageSize) : PageSize, - scudo::Chunk::Origin::Memalign, PageSize)); + scudo::Chunk::Origin::Memalign, PageSize); + reportAllocation(Ptr, scudo::roundUp(size, PageSize)); + + return scudo::setErrnoOnNull(Ptr); } INTERFACE WEAK void *SCUDO_PREFIX(realloc)(void *ptr, size_t size) { - if (!ptr) - return scudo::setErrnoOnNull(SCUDO_ALLOCATOR.allocate( - size, scudo::Chunk::Origin::Malloc, SCUDO_MALLOC_ALIGNMENT)); + if (!ptr) { + void *Ptr = SCUDO_ALLOCATOR.allocate(size, scudo::Chunk::Origin::Malloc, + SCUDO_MALLOC_ALIGNMENT); + reportAllocation(Ptr, size); + return scudo::setErrnoOnNull(Ptr); + } if (size == 0) { + reportDeallocation(ptr); SCUDO_ALLOCATOR.deallocate(ptr, scudo::Chunk::Origin::Malloc); return nullptr; } - return scudo::setErrnoOnNull( - SCUDO_ALLOCATOR.reallocate(ptr, size, SCUDO_MALLOC_ALIGNMENT)); + + // Given that the reporting of deallocation and allocation are not atomic, we + // always pretend the old pointer will be released so that the user doesn't + // need to worry about the false double-use case from the view of hooks. + // + // For example, assume that `realloc` releases the old pointer and allocates a + // new pointer. Before the reporting of both operations has been done, another + // thread may get the old pointer from `malloc`. 
It may be misinterpreted as + // double-use if it's not handled properly on the hook side. + reportReallocDeallocation(ptr); + void *NewPtr = SCUDO_ALLOCATOR.reallocate(ptr, size, SCUDO_MALLOC_ALIGNMENT); + if (NewPtr != nullptr) { + // Note that even if NewPtr == ptr, the size has changed. We still need to + // report the new size. + reportReallocAllocation(/*OldPtr=*/ptr, NewPtr, size); + } else { + // If `realloc` fails, the old pointer is not released. Report the old + // pointer as allocated again. + reportReallocAllocation(/*OldPtr=*/ptr, /*NewPtr=*/ptr, + SCUDO_ALLOCATOR.getAllocSize(ptr)); + } + + return scudo::setErrnoOnNull(NewPtr); } INTERFACE WEAK void *SCUDO_PREFIX(valloc)(size_t size) { - return scudo::setErrnoOnNull(SCUDO_ALLOCATOR.allocate( - size, scudo::Chunk::Origin::Memalign, scudo::getPageSizeCached())); + void *Ptr = SCUDO_ALLOCATOR.allocate(size, scudo::Chunk::Origin::Memalign, + scudo::getPageSizeCached()); + reportAllocation(Ptr, size); + + return scudo::setErrnoOnNull(Ptr); } INTERFACE WEAK int SCUDO_PREFIX(malloc_iterate)( @@ -175,20 +247,30 @@ void SCUDO_PREFIX(malloc_postinit)() { INTERFACE WEAK int SCUDO_PREFIX(mallopt)(int param, int value) { if (param == M_DECAY_TIME) { if (SCUDO_ANDROID) { - if (value == 0) { - // Will set the release values to their minimum values. - value = INT32_MIN; - } else { - // Will set the release values to their maximum values. + // Before changing the interval, reset the memory usage status by doing a + // M_PURGE call so that we can minimize the impact of any unreleased pages + // introduced by interval transition. + SCUDO_ALLOCATOR.releaseToOS(scudo::ReleaseToOS::Force); + + // The values allowed on Android are {-1, 0, 1}. "1" means the longest + // interval. + CHECK(value >= -1 && value <= 1); + if (value == 1) value = INT32_MAX; - } } SCUDO_ALLOCATOR.setOption(scudo::Option::ReleaseInterval, static_cast<scudo::sptr>(value)); return 1; } else if (param == M_PURGE) { - SCUDO_ALLOCATOR.releaseToOS(); + SCUDO_ALLOCATOR.releaseToOS(scudo::ReleaseToOS::Force); + return 1; + } else if (param == M_PURGE_ALL) { + SCUDO_ALLOCATOR.releaseToOS(scudo::ReleaseToOS::ForceAll); + return 1; + } else if (param == M_LOG_STATS) { + SCUDO_ALLOCATOR.printStats(); + SCUDO_ALLOCATOR.printFragmentationInfo(); return 1; } else { scudo::Option option; @@ -224,8 +306,12 @@ INTERFACE WEAK void *SCUDO_PREFIX(aligned_alloc)(size_t alignment, } scudo::reportInvalidAlignedAllocAlignment(alignment, size); } - return scudo::setErrnoOnNull( - SCUDO_ALLOCATOR.allocate(size, scudo::Chunk::Origin::Malloc, alignment)); + + void *Ptr = + SCUDO_ALLOCATOR.allocate(size, scudo::Chunk::Origin::Malloc, alignment); + reportAllocation(Ptr, size); + + return scudo::setErrnoOnNull(Ptr); } INTERFACE WEAK int SCUDO_PREFIX(malloc_info)(UNUSED int options, FILE *stream) { diff --git a/standalone/wrappers_c_bionic.cpp b/standalone/wrappers_c_bionic.cpp index 18c3bf2c0ed..e9d8c1e8d3d 100644 --- a/standalone/wrappers_c_bionic.cpp +++ b/standalone/wrappers_c_bionic.cpp @@ -12,6 +12,9 @@ #if SCUDO_ANDROID && _BIONIC #include "allocator_config.h" +#include "internal_defs.h" +#include "platform.h" +#include "scudo/interface.h" #include "wrappers_c.h" #include "wrappers_c_checks.h" @@ -24,22 +27,7 @@ extern "C" void SCUDO_PREFIX(malloc_postinit)(); SCUDO_REQUIRE_CONSTANT_INITIALIZATION -static scudo::Allocator<scudo::AndroidConfig, SCUDO_PREFIX(malloc_postinit)> - SCUDO_ALLOCATOR; - -#include "wrappers_c.inc" - -#undef SCUDO_ALLOCATOR -#undef SCUDO_PREFIX - -// Svelte MallocDispatch 
definitions. -#define SCUDO_PREFIX(name) CONCATENATE(scudo_svelte_, name) -#define SCUDO_ALLOCATOR SvelteAllocator - -extern "C" void SCUDO_PREFIX(malloc_postinit)(); -SCUDO_REQUIRE_CONSTANT_INITIALIZATION -static scudo::Allocator<scudo::AndroidSvelteConfig, - SCUDO_PREFIX(malloc_postinit)> +static scudo::Allocator<scudo::Config, SCUDO_PREFIX(malloc_postinit)> SCUDO_ALLOCATOR; #include "wrappers_c.inc" @@ -50,15 +38,14 @@ static scudo::Allocator<scudo::AndroidSvelteConfig, // TODO(kostyak): support both allocators. INTERFACE void __scudo_print_stats(void) { Allocator.printStats(); } -INTERFACE void -__scudo_get_error_info(struct scudo_error_info *error_info, - uintptr_t fault_addr, const char *stack_depot, - const char *region_info, const char *ring_buffer, - const char *memory, const char *memory_tags, - uintptr_t memory_addr, size_t memory_size) { - Allocator.getErrorInfo(error_info, fault_addr, stack_depot, region_info, - ring_buffer, memory, memory_tags, memory_addr, - memory_size); +INTERFACE void __scudo_get_error_info( + struct scudo_error_info *error_info, uintptr_t fault_addr, + const char *stack_depot, size_t stack_depot_size, const char *region_info, + const char *ring_buffer, size_t ring_buffer_size, const char *memory, + const char *memory_tags, uintptr_t memory_addr, size_t memory_size) { + Allocator.getErrorInfo(error_info, fault_addr, stack_depot, stack_depot_size, + region_info, ring_buffer, ring_buffer_size, memory, + memory_tags, memory_addr, memory_size); } INTERFACE const char *__scudo_get_stack_depot_addr() { @@ -66,7 +53,7 @@ INTERFACE const char *__scudo_get_stack_depot_addr() { } INTERFACE size_t __scudo_get_stack_depot_size() { - return sizeof(scudo::StackDepot); + return Allocator.getStackDepotSize(); } INTERFACE const char *__scudo_get_region_info_addr() { diff --git a/standalone/wrappers_c_checks.h b/standalone/wrappers_c_checks.h index 9cd48e82792..d0288699cf1 100644 --- a/standalone/wrappers_c_checks.h +++ b/standalone/wrappers_c_checks.h @@ -31,15 +31,13 @@ inline void *setErrnoOnNull(void *Ptr) { // Checks aligned_alloc() parameters, verifies that the alignment is a power of // two and that the size is a multiple of alignment. inline bool checkAlignedAllocAlignmentAndSize(uptr Alignment, uptr Size) { - return Alignment == 0 || !isPowerOfTwo(Alignment) || - !isAligned(Size, Alignment); + return !isPowerOfTwo(Alignment) || !isAligned(Size, Alignment); } // Checks posix_memalign() parameters, verifies that alignment is a power of two // and a multiple of sizeof(void *). inline bool checkPosixMemalignAlignment(uptr Alignment) { - return Alignment == 0 || !isPowerOfTwo(Alignment) || - !isAligned(Alignment, sizeof(void *)); + return !isPowerOfTwo(Alignment) || !isAligned(Alignment, sizeof(void *)); } // Returns true if calloc(Size, N) overflows on Size*N calculation. 
diff --git a/standalone/wrappers_c_checks.h b/standalone/wrappers_c_checks.h
index 9cd48e82792..d0288699cf1 100644
--- a/standalone/wrappers_c_checks.h
+++ b/standalone/wrappers_c_checks.h
@@ -31,15 +31,13 @@ inline void *setErrnoOnNull(void *Ptr) {
 // Checks aligned_alloc() parameters, verifies that the alignment is a power of
 // two and that the size is a multiple of alignment.
 inline bool checkAlignedAllocAlignmentAndSize(uptr Alignment, uptr Size) {
-  return Alignment == 0 || !isPowerOfTwo(Alignment) ||
-         !isAligned(Size, Alignment);
+  return !isPowerOfTwo(Alignment) || !isAligned(Size, Alignment);
 }
 
 // Checks posix_memalign() parameters, verifies that alignment is a power of two
 // and a multiple of sizeof(void *).
 inline bool checkPosixMemalignAlignment(uptr Alignment) {
-  return Alignment == 0 || !isPowerOfTwo(Alignment) ||
-         !isAligned(Alignment, sizeof(void *));
+  return !isPowerOfTwo(Alignment) || !isAligned(Alignment, sizeof(void *));
 }
 
 // Returns true if calloc(Size, N) overflows on Size*N calculation. Use a
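The simplified checks drop the explicit Alignment == 0 test; that is only sound if isPowerOfTwo(0) itself returns false, which the stand-alone sketch below assumes (the helper definitions here are stand-ins, not Scudo's actual ones):

#include <cstdint>

using uptr = uintptr_t;

// Stand-in helpers, assuming isPowerOfTwo() rejects zero so the removed
// "Alignment == 0" test stays covered by the first clause.
static bool isPowerOfTwo(uptr X) { return X != 0 && (X & (X - 1)) == 0; }
static bool isAligned(uptr X, uptr Alignment) {
  return (X & (Alignment - 1)) == 0;
}

// Mirrors checkAlignedAllocAlignmentAndSize(): aligned_alloc(64, 128) passes,
// while alignments of 0 or 48 and a size of 100 with alignment 64 are rejected.
static bool alignedAllocArgsAreInvalid(uptr Alignment, uptr Size) {
  return !isPowerOfTwo(Alignment) || !isAligned(Size, Alignment);
}

// Mirrors checkPosixMemalignAlignment(): the alignment must additionally be a
// multiple of sizeof(void *), so posix_memalign(&p, 2, n) is rejected.
static bool posixMemalignAlignmentIsInvalid(uptr Alignment) {
  return !isPowerOfTwo(Alignment) || !isAligned(Alignment, sizeof(void *));
}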
diff --git a/standalone/wrappers_cpp.cpp b/standalone/wrappers_cpp.cpp
index 374e36d72b3..098d4f71acc 100644
--- a/standalone/wrappers_cpp.cpp
+++ b/standalone/wrappers_cpp.cpp
@@ -12,6 +12,9 @@
 #if !SCUDO_ANDROID || !_BIONIC
 
 #include "allocator_config.h"
+#include "internal_defs.h"
+#include "platform.h"
+#include "scudo/interface.h"
 #include "wrappers_c.h"
 
 #include <stdint.h>
@@ -21,86 +24,125 @@ struct nothrow_t {};
 enum class align_val_t : size_t {};
 } // namespace std
 
+static void reportAllocation(void *ptr, size_t size) {
+  if (SCUDO_ENABLE_HOOKS)
+    if (__scudo_allocate_hook && ptr)
+      __scudo_allocate_hook(ptr, size);
+}
+static void reportDeallocation(void *ptr) {
+  if (SCUDO_ENABLE_HOOKS)
+    if (__scudo_deallocate_hook)
+      __scudo_deallocate_hook(ptr);
+}
+
 INTERFACE WEAK void *operator new(size_t size) {
-  return Allocator.allocate(size, scudo::Chunk::Origin::New);
+  void *Ptr = Allocator.allocate(size, scudo::Chunk::Origin::New);
+  reportAllocation(Ptr, size);
+  return Ptr;
 }
 INTERFACE WEAK void *operator new[](size_t size) {
-  return Allocator.allocate(size, scudo::Chunk::Origin::NewArray);
+  void *Ptr = Allocator.allocate(size, scudo::Chunk::Origin::NewArray);
+  reportAllocation(Ptr, size);
+  return Ptr;
 }
 INTERFACE WEAK void *operator new(size_t size, std::nothrow_t const &) NOEXCEPT {
-  return Allocator.allocate(size, scudo::Chunk::Origin::New);
+  void *Ptr = Allocator.allocate(size, scudo::Chunk::Origin::New);
+  reportAllocation(Ptr, size);
+  return Ptr;
 }
 INTERFACE WEAK void *operator new[](size_t size, std::nothrow_t const &) NOEXCEPT {
-  return Allocator.allocate(size, scudo::Chunk::Origin::NewArray);
+  void *Ptr = Allocator.allocate(size, scudo::Chunk::Origin::NewArray);
+  reportAllocation(Ptr, size);
+  return Ptr;
 }
 INTERFACE WEAK void *operator new(size_t size, std::align_val_t align) {
-  return Allocator.allocate(size, scudo::Chunk::Origin::New,
-                            static_cast<scudo::uptr>(align));
+  void *Ptr = Allocator.allocate(size, scudo::Chunk::Origin::New,
+                                 static_cast<scudo::uptr>(align));
+  reportAllocation(Ptr, size);
+  return Ptr;
 }
 INTERFACE WEAK void *operator new[](size_t size, std::align_val_t align) {
-  return Allocator.allocate(size, scudo::Chunk::Origin::NewArray,
-                            static_cast<scudo::uptr>(align));
+  void *Ptr = Allocator.allocate(size, scudo::Chunk::Origin::NewArray,
+                                 static_cast<scudo::uptr>(align));
+  reportAllocation(Ptr, size);
+  return Ptr;
 }
 INTERFACE WEAK void *operator new(size_t size, std::align_val_t align,
                                   std::nothrow_t const &) NOEXCEPT {
-  return Allocator.allocate(size, scudo::Chunk::Origin::New,
-                            static_cast<scudo::uptr>(align));
+  void *Ptr = Allocator.allocate(size, scudo::Chunk::Origin::New,
+                                 static_cast<scudo::uptr>(align));
+  reportAllocation(Ptr, size);
+  return Ptr;
 }
 INTERFACE WEAK void *operator new[](size_t size, std::align_val_t align,
                                     std::nothrow_t const &) NOEXCEPT {
-  return Allocator.allocate(size, scudo::Chunk::Origin::NewArray,
-                            static_cast<scudo::uptr>(align));
+  void *Ptr = Allocator.allocate(size, scudo::Chunk::Origin::NewArray,
+                                 static_cast<scudo::uptr>(align));
+  reportAllocation(Ptr, size);
+  return Ptr;
 }
 
 INTERFACE WEAK void operator delete(void *ptr) NOEXCEPT {
+  reportDeallocation(ptr);
   Allocator.deallocate(ptr, scudo::Chunk::Origin::New);
 }
 INTERFACE WEAK void operator delete[](void *ptr) NOEXCEPT {
+  reportDeallocation(ptr);
   Allocator.deallocate(ptr, scudo::Chunk::Origin::NewArray);
 }
 INTERFACE WEAK void operator delete(void *ptr, std::nothrow_t const &) NOEXCEPT {
+  reportDeallocation(ptr);
   Allocator.deallocate(ptr, scudo::Chunk::Origin::New);
 }
 INTERFACE WEAK void operator delete[](void *ptr, std::nothrow_t const &) NOEXCEPT {
+  reportDeallocation(ptr);
   Allocator.deallocate(ptr, scudo::Chunk::Origin::NewArray);
 }
 INTERFACE WEAK void operator delete(void *ptr, size_t size) NOEXCEPT {
+  reportDeallocation(ptr);
   Allocator.deallocate(ptr, scudo::Chunk::Origin::New, size);
 }
 INTERFACE WEAK void operator delete[](void *ptr, size_t size) NOEXCEPT {
+  reportDeallocation(ptr);
   Allocator.deallocate(ptr, scudo::Chunk::Origin::NewArray, size);
 }
 INTERFACE WEAK void operator delete(void *ptr, std::align_val_t align) NOEXCEPT {
+  reportDeallocation(ptr);
   Allocator.deallocate(ptr, scudo::Chunk::Origin::New, 0,
                        static_cast<scudo::uptr>(align));
 }
 INTERFACE WEAK void operator delete[](void *ptr, std::align_val_t align) NOEXCEPT {
+  reportDeallocation(ptr);
   Allocator.deallocate(ptr, scudo::Chunk::Origin::NewArray, 0,
                        static_cast<scudo::uptr>(align));
 }
 INTERFACE WEAK void operator delete(void *ptr, std::align_val_t align,
                                     std::nothrow_t const &) NOEXCEPT {
+  reportDeallocation(ptr);
   Allocator.deallocate(ptr, scudo::Chunk::Origin::New, 0,
                        static_cast<scudo::uptr>(align));
 }
 INTERFACE WEAK void operator delete[](void *ptr, std::align_val_t align,
                                       std::nothrow_t const &) NOEXCEPT {
+  reportDeallocation(ptr);
   Allocator.deallocate(ptr, scudo::Chunk::Origin::NewArray, 0,
                        static_cast<scudo::uptr>(align));
 }
 INTERFACE WEAK void operator delete(void *ptr, size_t size,
                                     std::align_val_t align) NOEXCEPT {
+  reportDeallocation(ptr);
   Allocator.deallocate(ptr, scudo::Chunk::Origin::New, size,
                        static_cast<scudo::uptr>(align));
 }
 INTERFACE WEAK void operator delete[](void *ptr, size_t size,
                                       std::align_val_t align) NOEXCEPT {
+  reportDeallocation(ptr);
   Allocator.deallocate(ptr, scudo::Chunk::Origin::NewArray, size,
                        static_cast<scudo::uptr>(align));
 }
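To show what the new reporting plumbing is for, here is a hypothetical out-of-tree consumer (not part of this patch): when Scudo is built with SCUDO_ENABLE_HOOKS, defining __scudo_allocate_hook / __scudo_deallocate_hook in the final binary makes the weak references non-null, and the report*() helpers above then call them on every allocation and deallocation, including the operator new/delete paths changed here.

#include <atomic>
#include <cstddef>

// Running count of live allocations, as a trivial example payload.
static std::atomic<size_t> LiveAllocations{0};

// Definitions picked up through Scudo's weak hook declarations. They should
// not allocate themselves, since they run inside the allocation wrappers and
// would re-enter them.
extern "C" void __scudo_allocate_hook(void *Ptr, size_t Size) {
  (void)Ptr;
  (void)Size;
  LiveAllocations.fetch_add(1, std::memory_order_relaxed);
}

extern "C" void __scudo_deallocate_hook(void *Ptr) {
  // The deallocation hook can be invoked with a null pointer (e.g. delete of
  // nullptr), so only count real frees.
  if (Ptr != nullptr)
    LiveAllocations.fetch_sub(1, std::memory_order_relaxed);
}

Note that the allocation hook is only invoked for successful allocations (the ptr check in reportAllocation), while the deallocation hook sees every call, so a tracker like this stays balanced.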