diff --git a/.github/workflows/build-cachelib-centos-8-5.yml b/.github/workflows/build-cachelib-centos-8-5.yml index 5dade56439..fcb3129b22 100644 --- a/.github/workflows/build-cachelib-centos-8-5.yml +++ b/.github/workflows/build-cachelib-centos-8-5.yml @@ -13,11 +13,6 @@ # limitations under the License. name: build-cachelib-centos-8.5 on: - push: - tags: - - 'v*' - pull_request: - workflow_dispatch: schedule: - cron: '0 9 * * *' jobs: diff --git a/.github/workflows/build-cachelib-centos-long.yml b/.github/workflows/build-cachelib-centos-long.yml new file mode 100644 index 0000000000..92165f603b --- /dev/null +++ b/.github/workflows/build-cachelib-centos-long.yml @@ -0,0 +1,39 @@ +name: build-cachelib-centos-latest +on: + schedule: + - cron: '0 7 * * *' + +jobs: + build-cachelib-centos8-latest: + name: "CentOS/latest - Build CacheLib with all dependencies" + runs-on: ubuntu-latest + # Docker container image name + container: "centos:latest" + steps: + - name: "update packages" + run: dnf upgrade -y + - name: "install sudo,git" + run: dnf install -y sudo git cmake gcc + - name: "System Information" + run: | + echo === uname === + uname -a + echo === /etc/os-release === + cat /etc/os-release + echo === df -hl === + df -hl + echo === free -h === + free -h + echo === top === + top -b -n1 -1 -Eg || timeout 1 top -b -n1 + echo === env === + env + echo === gcc -v === + gcc -v + - name: "checkout sources" + uses: actions/checkout@v2 + - name: "build CacheLib using build script" + run: ./contrib/build.sh -j -v -T + - name: "run tests" + timeout-minutes: 60 + run: cd opt/cachelib/tests && ../../../run_tests.sh long diff --git a/.github/workflows/build-cachelib-debian.yml b/.github/workflows/build-cachelib-debian.yml new file mode 100644 index 0000000000..5bc3ad3c70 --- /dev/null +++ b/.github/workflows/build-cachelib-debian.yml @@ -0,0 +1,43 @@ +name: build-cachelib-debian-10 +on: + schedule: + - cron: '30 5 * * 0,3' + +jobs: + build-cachelib-debian-10: + name: "Debian/Buster - Build CacheLib with all dependencies" + runs-on: ubuntu-latest + # Docker container image name + container: "debian:buster-slim" + steps: + - name: "update packages" + run: apt-get update + - name: "upgrade packages" + run: apt-get -y upgrade + - name: "install sudo,git" + run: apt-get install -y sudo git procps + - name: "System Information" + run: | + echo === uname === + uname -a + echo === /etc/os-release === + cat /etc/os-release + echo === df -hl === + df -hl + echo === free -h === + free -h + echo === top === + top -b -n1 -1 -Eg || timeout 1 top -b -n1 ; true + echo === env === + env + echo === cc -v === + cc -v || true + echo === g++ -v === + g++ -v || true + - name: "checkout sources" + uses: actions/checkout@v2 + - name: "build CacheLib using build script" + run: ./contrib/build.sh -j -v -T + - name: "run tests" + timeout-minutes: 60 + run: cd opt/cachelib/tests && ../../../run_tests.sh diff --git a/.github/workflows/build-cachelib-docker.yml b/.github/workflows/build-cachelib-docker.yml new file mode 100644 index 0000000000..f00c028708 --- /dev/null +++ b/.github/workflows/build-cachelib-docker.yml @@ -0,0 +1,50 @@ +name: build-cachelib-docker +on: + push: + pull_request: + +jobs: + build-cachelib-docker: + name: "CentOS/latest - Build CacheLib with all dependencies" + runs-on: ubuntu-latest + env: + REPO: cachelib + GITHUB_REPO: intel/CacheLib + CONTAINER_REG: ghcr.io/pmem/cachelib + CONTAINER_REG_USER: ${{ secrets.GH_CR_USER }} + CONTAINER_REG_PASS: ${{ secrets.GH_CR_PAT }} + FORCE_IMAGE_ACTION: ${{
secrets.FORCE_IMAGE_ACTION }} + HOST_WORKDIR: ${{ github.workspace }} + WORKDIR: docker + IMG_VER: devel + strategy: + matrix: + CONFIG: ["OS=centos OS_VER=8streams PUSH_IMAGE=1"] + steps: + - name: "System Information" + run: | + echo === uname === + uname -a + echo === /etc/os-release === + cat /etc/os-release + echo === df -hl === + df -hl + echo === free -h === + free -h + echo === top === + top -b -n1 -1 -Eg || timeout 1 top -b -n1 + echo === env === + env + echo === gcc -v === + gcc -v + - name: "checkout sources" + uses: actions/checkout@v2 + with: + submodules: recursive + fetch-depth: 0 + + - name: Pull the image or rebuild and push it + run: cd $WORKDIR && ${{ matrix.CONFIG }} ./pull-or-rebuild-image.sh $FORCE_IMAGE_ACTION + + - name: Run the build + run: cd $WORKDIR && ${{ matrix.CONFIG }} ./build.sh diff --git a/MultiTierDataMovement.md b/MultiTierDataMovement.md new file mode 100644 index 0000000000..cccc14b947 --- /dev/null +++ b/MultiTierDataMovement.md @@ -0,0 +1,90 @@ +# Background Data Movement + +To reduce the number of online evictions and to support asynchronous +promotion, we have added two periodic workers to handle eviction and promotion. + +The diagram below shows a simplified version of how the background evictor +thread (green) is integrated into the CacheLib architecture. +
+*[Figure: BackgroundEvictor, showing how the background evictor thread integrates into the CacheLib architecture]* +
+ +## Background Evictors + +The background evictors scan each class to see if there are objects to move to the next (lower) +tier using a given strategy. Here we document the general parameters and the parameters for +the different strategies. + +- `backgroundEvictorIntervalMilSec`: The interval at which this thread runs. By default, +the background evictor threads will wake up every 10 ms to scan the AllocationClasses. Also, +the background evictor thread will be woken up every time there is a failed allocation (from +a request handling thread) and the current percentage of free memory for the +AllocationClass is lower than `lowEvictionAcWatermark`. This may render the interval parameter +less important when there are many allocations occurring from request handling threads. + +- `evictorThreads`: The number of background evictors to run; each thread is assigned +a set of AllocationClasses to scan and evict objects from. Currently, each thread gets +an equal number of classes to scan, but as the object size distribution may be unequal, future +versions will attempt to balance the classes among threads. The range is 1 to the number of AllocationClasses. +The default is 1. + +- `maxEvictionBatch`: The number of objects to remove in a given eviction call. The +default is 40. The lower bound is 10 and the upper bound is 1000. Too low and we might not +remove objects at a reasonable rate; too high and it might increase contention with user threads. + +- `minEvictionBatch`: Minimum number of items to evict at any time (if there are any +candidates). + +- `maxEvictionPromotionHotness`: Maximum number of candidates to consider for eviction. This is similar to `maxEvictionBatch` +but it specifies how many candidates will be taken into consideration, not the actual number of items to evict. +This option can be used to configure the duration of the critical section on the LRU lock. + + +### FreeThresholdStrategy (default) + +- `lowEvictionAcWatermark`: Triggers the background eviction thread to run +when this percentage of the AllocationClass is free. +The default is `2.0`; to avoid wasting capacity we don't set this above `10.0`. + +- `highEvictionAcWatermark`: Stop the evictions from an AllocationClass when this +percentage of the AllocationClass is free. The default is `5.0`; to avoid wasting capacity we +don't set this above `10.0`. + +
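The relationship between the two watermarks is easiest to see in code. The following self-contained sketch mirrors the parameter names above, but it is illustrative only and not part of the CacheLib API:

```cpp
#include <cstdio>

// Sketch of the FreeThresholdStrategy watermarks: eviction starts when free
// space drops to or below the low watermark, and stops once free space
// reaches the high watermark. Names here are illustrative only.
struct FreeThresholds {
  double lowEvictionAcWatermark{2.0};   // start evicting at <= 2% free
  double highEvictionAcWatermark{5.0};  // stop evicting at >= 5% free

  bool shouldStartEvicting(double freePct) const {
    return freePct <= lowEvictionAcWatermark;
  }
  bool shouldStopEvicting(double freePct) const {
    return freePct >= highEvictionAcWatermark;
  }
};

int main() {
  FreeThresholds t;
  std::printf("1%% free -> start? %d\n", t.shouldStartEvicting(1.0));  // 1
  std::printf("5%% free -> stop?  %d\n", t.shouldStopEvicting(5.0));   // 1
}
```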
+## Background Promoters + +The background promoters scan each class to see if there are objects to promote to the previous (upper) +tier using a given strategy. Here we document the general parameters and the parameters for +the different strategies. + +- `backgroundPromoterIntervalMilSec`: The interval at which this thread runs. By default, +the background promoter threads will wake up every 10 ms to scan the AllocationClasses for +objects to promote. + +- `promoterThreads`: The number of background promoters to run; each thread is assigned +a set of AllocationClasses to scan and promote objects from. Currently, each thread gets +an equal number of classes to scan, but as the object size distribution may be unequal, future +versions will attempt to balance the classes among threads. The range is `1` to the number of AllocationClasses. The default is `1`. + +- `maxPromotionBatch`: The number of objects to promote in a given promotion call. The +default is 40. The lower bound is 10 and the upper bound is 1000. Too low and we might not +promote objects at a reasonable rate; too high and it might increase contention with user threads. + +- `minPromotionBatch`: Minimum number of items to promote at any time (if there are any +candidates). + +- `numDuplicateElements`: This allows us to promote items that have existing read-only handles, since +the data will not be modified while a user holds them. Therefore, for a short time +the data could reside in both tiers until it is evicted from its current tier. The default is to +not allow this (0). Setting the value to 100 will enable duplicate elements in tiers. + +### Background Promotion Strategy (only one currently) + +- `promotionAcWatermark`: Promote items if at least this +percentage of the AllocationClass is free. The promotion thread will attempt to move up to `maxPromotionBatch` objects +to that tier. The objects are chosen from the head of the LRU. The default is `4.0`. +This value should correlate with `lowEvictionAcWatermark`, `highEvictionAcWatermark`, `minAcAllocationWatermark`, `maxAcAllocationWatermark`. +- `maxPromotionBatch`: The number of objects to promote in a batch during background promotion. Analogous to +`maxEvictionBatch`. Its value should be lower to decrease contention on hot items. +
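The promotion trigger combines `promotionAcWatermark` with the batch cap. Here is a self-contained sketch of that decision (illustrative names, not the CacheLib API):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

// Illustrative-only sketch of the promotion trigger described above: promote
// only while the target class has at least promotionAcWatermark percent free,
// and cap each pass at maxPromotionBatch items taken from the LRU head.
uint64_t promotionBatchSize(double freePct,
                            double promotionAcWatermark,  // default 4.0
                            uint64_t candidates,
                            uint64_t maxPromotionBatch) {  // default 40
  if (freePct < promotionAcWatermark) {
    return 0;  // not enough free space in the upper tier; skip this pass
  }
  return std::min(candidates, maxPromotionBatch);
}

int main() {
  // 6% free in the upper tier, 100 candidates at the LRU head -> move 40.
  std::printf("%llu\n",
              (unsigned long long)promotionBatchSize(6.0, 4.0, 100, 40));
}
```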
diff --git a/cachelib/CMakeLists.txt b/cachelib/CMakeLists.txt index 506ba66bcf..bb77d54dc6 100644 --- a/cachelib/CMakeLists.txt +++ b/cachelib/CMakeLists.txt @@ -43,6 +43,7 @@ set(PACKAGE_BUGREPORT "https://github.com/facebook/TBD") set(CMAKE_POSITION_INDEPENDENT_CODE ON) option(BUILD_TESTS "If enabled, compile the tests." ON) +option(BUILD_WITH_DTO "If enabled, build with DSA transparent offloading." OFF) set(BIN_INSTALL_DIR bin CACHE STRING @@ -85,6 +86,11 @@ set(CMAKE_MODULE_PATH set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED True) +if(COVERAGE_ENABLED) + # Add code coverage (e.g. configure with: cmake -DCOVERAGE_ENABLED=ON ..) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --coverage -fprofile-arcs -ftest-coverage") +endif() + # include(fb_cxx_flags) message(STATUS "Update CXXFLAGS: ${CMAKE_CXX_FLAGS}") diff --git a/cachelib/allocator/BackgroundMover.h b/cachelib/allocator/BackgroundMover.h index aee86a4e32..e8c1242283 100644 --- a/cachelib/allocator/BackgroundMover.h +++ b/cachelib/allocator/BackgroundMover.h @@ -18,7 +18,6 @@ #include "cachelib/allocator/BackgroundMoverStrategy.h" #include "cachelib/allocator/CacheStats.h" -#include "cachelib/common/AtomicCounter.h" #include "cachelib/common/PeriodicWorker.h" namespace facebook::cachelib { @@ -27,17 +26,19 @@ namespace facebook::cachelib { template <typename C> struct BackgroundMoverAPIWrapper { static size_t traverseAndEvictItems(C& cache, + unsigned int tid, unsigned int pid, unsigned int cid, size_t batch) { - return cache.traverseAndEvictItems(pid, cid, batch); + return cache.traverseAndEvictItems(tid, pid, cid, batch); } static size_t traverseAndPromoteItems(C& cache, + unsigned int tid, unsigned int pid, unsigned int cid, size_t batch) { - return cache.traverseAndPromoteItems(pid, cid, batch); + return cache.traverseAndPromoteItems(tid, pid, cid, batch); } }; @@ -49,6 +50,7 @@ enum class MoverDir { Evict = 0, Promote }; template <typename CacheT> class BackgroundMover : public PeriodicWorker { public: + using ClassBgStatsType = std::map<MemoryDescriptorType, uint64_t>; using Cache = CacheT; // @param cache the cache interface // @param strategy the strategy class that defines how objects are ~BackgroundMover() override; BackgroundMoverStats getStats() const noexcept; - std::map<PoolId, std::map<ClassId, uint64_t>> getClassStats() const noexcept; + ClassBgStatsType getClassStats() const noexcept { + return movesPerClass_; + } void setAssignedMemory(std::vector<MemoryDescriptorType>&& assignedMemory); // return id of the worker responsible for promoting/evicting from a particular // pool and allocation class (id is in range [0, numWorkers)) - static size_t workerId(PoolId pid, ClassId cid, size_t numWorkers); + static size_t workerId(TierId tid, PoolId pid, ClassId cid, size_t numWorkers); private: - std::map<PoolId, std::map<ClassId, uint64_t>> movesPerClass_; + ClassBgStatsType movesPerClass_; + + struct TraversalStats { + // record a traversal and its time taken + void recordTraversalTime(uint64_t nsTaken); + + uint64_t getAvgTraversalTimeNs(uint64_t numTraversals) const; + uint64_t getMinTraversalTimeNs() const { return minTraversalTimeNs_; } + uint64_t getMaxTraversalTimeNs() const { return maxTraversalTimeNs_; } + uint64_t getLastTraversalTimeNs() const { return lastTraversalTimeNs_; } + + private: + // time it took us the last time to traverse the cache. + uint64_t lastTraversalTimeNs_{0}; + uint64_t minTraversalTimeNs_{ + std::numeric_limits<uint64_t>::max()}; + uint64_t maxTraversalTimeNs_{0}; + uint64_t totalTraversalTimeNs_{0}; + }; + + TraversalStats traversalStats_; // cache allocator's interface for evicting using Item = typename Cache::Item; @@ -77,15 +101,18 @@ std::shared_ptr<BackgroundMoverStrategy> strategy_; MoverDir direction_; - std::function<size_t(Cache&, unsigned int, unsigned int, size_t)> moverFunc; + std::function<size_t(Cache&, unsigned int, unsigned int, unsigned int, size_t)> + moverFunc; // implements the actual logic of running the background evictor void work() override final; void checkAndRun(); - AtomicCounter numMovedItems_{0}; - AtomicCounter numTraversals_{0}; - AtomicCounter totalBytesMoved_{0}; + uint64_t numMovedItems{0}; + uint64_t numTraversals{0}; + uint64_t totalClasses{0}; + uint64_t totalBytesMoved{0}; std::vector<MemoryDescriptorType> assignedMemory_; folly::DistributedMutex mutex_; @@ -105,6 +132,20 @@ BackgroundMover<CacheT>::BackgroundMover( } } +template <typename CacheT> +void BackgroundMover<CacheT>::TraversalStats::recordTraversalTime(uint64_t nsTaken) { + lastTraversalTimeNs_ = nsTaken; + minTraversalTimeNs_ = std::min(minTraversalTimeNs_, nsTaken); + maxTraversalTimeNs_ = std::max(maxTraversalTimeNs_, nsTaken); + totalTraversalTimeNs_ += nsTaken; +} + +template <typename CacheT> +uint64_t BackgroundMover<CacheT>::TraversalStats::getAvgTraversalTimeNs( + uint64_t numTraversals) const { + return numTraversals ?
totalTraversalTimeNs_ / numTraversals : 0; +} + template <typename CacheT> BackgroundMover<CacheT>::~BackgroundMover() { stop(std::chrono::seconds(0)); @@ -123,8 +164,8 @@ template <typename CacheT> void BackgroundMover<CacheT>::setAssignedMemory( std::vector<MemoryDescriptorType>&& assignedMemory) { XLOG(INFO, "Class assigned to background worker:"); - for (auto [pid, cid] : assignedMemory) { - XLOGF(INFO, "Pid: {}, Cid: {}", pid, cid); + for (auto [tid, pid, cid] : assignedMemory) { + XLOGF(INFO, "Tid: {}, Pid: {}, Cid: {}", tid, pid, cid); } mutex_.lock_combine([this, &assignedMemory] { @@ -138,51 +179,64 @@ template <typename CacheT> void BackgroundMover<CacheT>::checkAndRun() { auto assignedMemory = mutex_.lock_combine([this] { return assignedMemory_; }); - unsigned int moves = 0; - auto batches = strategy_->calculateBatchSizes(cache_, assignedMemory); - - for (size_t i = 0; i < batches.size(); i++) { - const auto [pid, cid] = assignedMemory[i]; - const auto batch = batches[i]; - - if (batch == 0) { - continue; + while (true) { + unsigned int moves = 0; + std::set<ClassId> classes{}; + auto batches = strategy_->calculateBatchSizes(cache_, assignedMemory); + + const auto begin = util::getCurrentTimeNs(); + for (size_t i = 0; i < batches.size(); i++) { + const auto [tid, pid, cid] = assignedMemory[i]; + const auto batch = batches[i]; + if (!batch) { + continue; + } + + // try moving BATCH items from the class in order to reach free target + auto moved = moverFunc(cache_, tid, pid, cid, batch); + moves += moved; + movesPerClass_[assignedMemory[i]] += moved; + } + auto end = util::getCurrentTimeNs(); + if (moves > 0) { + traversalStats_.recordTraversalTime(end > begin ? end - begin : 0); + numMovedItems += moves; + numTraversals++; } - // try moving BATCH items from the class in order to reach free target - auto moved = moverFunc(cache_, pid, cid, batch); - moves += moved; - movesPerClass_[pid][cid] += moved; - totalBytesMoved_.add(moved * cache_.getPool(pid).getAllocSizes()[cid]); + // we didn't move any objects, so we are done with this run + if (moves == 0 || shouldStopWork()) { + break; + } } - - numTraversals_.inc(); - numMovedItems_.add(moves); } template <typename CacheT> BackgroundMoverStats BackgroundMover<CacheT>::getStats() const noexcept { BackgroundMoverStats stats; - stats.numMovedItems = numMovedItems_.get(); - stats.runCount = numTraversals_.get(); - stats.totalBytesMoved = totalBytesMoved_.get(); + stats.numMovedItems = numMovedItems; + stats.totalBytesMoved = totalBytesMoved; + stats.totalClasses = totalClasses; + auto runCount = getRunCount(); + stats.runCount = runCount; + stats.numTraversals = numTraversals; + stats.avgItemsMoved = (double)stats.numMovedItems / (double)runCount; + stats.lastTraversalTimeNs = traversalStats_.getLastTraversalTimeNs(); + stats.avgTraversalTimeNs = traversalStats_.getAvgTraversalTimeNs(numTraversals); + stats.minTraversalTimeNs = traversalStats_.getMinTraversalTimeNs(); + stats.maxTraversalTimeNs = traversalStats_.getMaxTraversalTimeNs(); return stats; } template <typename CacheT> -std::map<PoolId, std::map<ClassId, uint64_t>> -BackgroundMover<CacheT>::getClassStats() const noexcept { - return movesPerClass_; -} - -template <typename CacheT> -size_t BackgroundMover<CacheT>::workerId(PoolId pid, +size_t BackgroundMover<CacheT>::workerId(TierId tid, + PoolId pid, ClassId cid, size_t numWorkers) { XDCHECK(numWorkers); // TODO: come up with some better sharding (use hashing?)
- return (pid + cid) % numWorkers; + return (tid + pid + cid) % numWorkers; } } // namespace facebook::cachelib
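As a quick sanity check on the sharding above, this standalone snippet mirrors the `(tid + pid + cid) % numWorkers` formula (it does not call CacheLib) and shows why plain addition makes neighboring ids collide, which is the motivation for the hashing TODO:

```cpp
#include <cstdio>

// Each (tier, pool, class) triple is assigned to a background worker by
// (tid + pid + cid) % numWorkers. Note that (0,0,1) and (1,0,0) land on the
// same worker: addition folds distinct triples onto the same shard.
int main() {
  const size_t numWorkers = 2;
  struct { unsigned tid, pid, cid; } acs[] = {
      {0, 0, 0}, {0, 0, 1}, {1, 0, 0}, {1, 0, 1}};
  for (const auto& ac : acs) {
    std::printf("tid=%u pid=%u cid=%u -> worker %zu\n", ac.tid, ac.pid,
                ac.cid, (ac.tid + ac.pid + ac.cid) % numWorkers);
  }
}
```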
diff --git a/cachelib/allocator/BackgroundMoverStrategy.h b/cachelib/allocator/BackgroundMoverStrategy.h index abf37edd13..2f187636c6 100644 --- a/cachelib/allocator/BackgroundMoverStrategy.h +++ b/cachelib/allocator/BackgroundMoverStrategy.h @@ -21,12 +21,6 @@ namespace facebook { namespace cachelib { -struct MemoryDescriptorType { - MemoryDescriptorType(PoolId pid, ClassId cid) : pid_(pid), cid_(cid) {} - PoolId pid_; - ClassId cid_; -}; - // Base class for background eviction strategy. class BackgroundMoverStrategy { public: @@ -44,5 +38,34 @@ class BackgroundMoverStrategy { virtual ~BackgroundMoverStrategy() = default; }; +class DefaultBackgroundMoverStrategy : public BackgroundMoverStrategy { + public: + DefaultBackgroundMoverStrategy(uint64_t batchSize, double targetFree) + : batchSize_(batchSize), targetFree_((double)targetFree / 100.0) {} + ~DefaultBackgroundMoverStrategy() {} + + std::vector<size_t> calculateBatchSizes( + const CacheBase& cache, + std::vector<MemoryDescriptorType> acVec) { + std::vector<size_t> batches{}; + for (auto [tid, pid, cid] : acVec) { + double usage = cache.getPoolByTid(pid, tid).getApproxUsage(cid); + uint32_t perSlab = cache.getPoolByTid(pid, tid).getPerSlab(cid); + if (usage >= (1.0 - targetFree_)) { + uint32_t batch = batchSize_ > perSlab ? perSlab : batchSize_; + batches.push_back(batch); + } else { + // no work to be done since there is already + // at least targetFree remaining in the class + batches.push_back(0); + } + } + return batches; + } + private: + uint64_t batchSize_{100}; + double targetFree_{0.05}; +}; + } // namespace cachelib } // namespace facebook
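The batch-size computation above reduces to a per-class threshold test. The following self-contained rework traces it on two classes, with plain numeric inputs standing in for the `CacheBase` lookups (illustrative only):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Standalone restatement of DefaultBackgroundMoverStrategy::calculateBatchSizes:
// a class whose usage is above (1 - targetFree) gets a batch, capped at one
// slab's worth of objects; everything else gets zero.
std::vector<size_t> calculateBatchSizes(const std::vector<double>& usages,
                                        const std::vector<uint32_t>& perSlab,
                                        uint64_t batchSize, double targetFree) {
  std::vector<size_t> batches;
  for (size_t i = 0; i < usages.size(); i++) {
    if (usages[i] >= 1.0 - targetFree) {
      batches.push_back(std::min<uint64_t>(batchSize, perSlab[i]));
    } else {
      batches.push_back(0);  // already at least targetFree free; no work
    }
  }
  return batches;
}

int main() {
  // targetFree = 5%: a class at 97% usage gets a batch, one at 90% does not.
  auto b = calculateBatchSizes({0.97, 0.90}, {64, 64}, 100, 0.05);
  std::printf("%zu %zu\n", b[0], b[1]);  // 64 0
}
```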
diff --git a/cachelib/allocator/CMakeLists.txt b/cachelib/allocator/CMakeLists.txt index 6103cdc823..0f96a0cd7f 100644 --- a/cachelib/allocator/CMakeLists.txt +++ b/cachelib/allocator/CMakeLists.txt @@ -55,6 +55,7 @@ add_library (cachelib_allocator PoolOptimizeStrategy.cpp PoolRebalancer.cpp PoolResizer.cpp + PrivateMemoryManager.cpp RebalanceStrategy.cpp SlabReleaseStats.cpp TempShmMapping.cpp diff --git a/cachelib/allocator/Cache.cpp b/cachelib/allocator/Cache.cpp index 37bba99a67..8d958b3510 100644 --- a/cachelib/allocator/Cache.cpp +++ b/cachelib/allocator/Cache.cpp @@ -244,6 +244,7 @@ void CacheBase::updateGlobalCacheStats(const std::string& statPrefix) const { statPrefix + "cache.size.configured", memStats.configuredRamCacheSize + memStats.nvmCacheSize); + // TODO: add specific per-tier counters const auto stats = getGlobalCacheStats(); // Eviction Stats @@ -253,7 +254,8 @@ void CacheBase::updateGlobalCacheStats(const std::string& statPrefix) const { // from both ram and nvm, this is counted as a single eviction from cache. // Ram Evictions: item evicted from ram but it can be inserted into nvm const std::string ramEvictionKey = statPrefix + "ram.evictions"; - counters_.updateDelta(ramEvictionKey, stats.numEvictions); + counters_.updateDelta(ramEvictionKey, + std::accumulate(stats.numEvictions.begin(), stats.numEvictions.end(), 0ULL)); // Nvm Evictions: item evicted from nvm but it can be still in ram const std::string nvmEvictionKey = statPrefix + "nvm.evictions"; counters_.updateDelta(nvmEvictionKey, stats.numNvmEvictions); @@ -295,11 +297,11 @@ counters_.updateDelta(statPrefix + "cache.alloc_attempts", - stats.allocAttempts); + std::accumulate(stats.allocAttempts.begin(), stats.allocAttempts.end(), 0ULL)); counters_.updateDelta(statPrefix + "cache.eviction_attempts", - stats.evictionAttempts); + std::accumulate(stats.evictionAttempts.begin(), stats.evictionAttempts.end(), 0ULL)); counters_.updateDelta(statPrefix + "cache.alloc_failures", - stats.allocFailures); + std::accumulate(stats.allocFailures.begin(), stats.allocFailures.end(), 0ULL)); counters_.updateDelta(statPrefix + "cache.invalid_allocs", stats.invalidAllocs); @@ -475,6 +477,10 @@ visitEstimates(uploadStatsNanoToMicro, stats.allocateLatencyNs, statPrefix + "allocate.latency_us"); + visitEstimates(uploadStatsNanoToMicro, stats.bgEvictLatencyNs, + statPrefix + "background.eviction.latency_us"); + visitEstimates(uploadStatsNanoToMicro, stats.bgPromoteLatencyNs, + statPrefix + "background.promotion.latency_us"); visitEstimates(uploadStatsNanoToMicro, stats.moveChainedLatencyNs, statPrefix + "move.chained.latency_us"); visitEstimates(uploadStatsNanoToMicro, stats.moveRegularLatencyNs, diff --git a/cachelib/allocator/Cache.h b/cachelib/allocator/Cache.h index e225ba8a01..6f7ae20bc5 100644 --- a/cachelib/allocator/Cache.h +++ b/cachelib/allocator/Cache.h @@ -73,6 +73,22 @@ enum class DestructorContext { kRemovedFromNVM }; +struct MemoryDescriptorType { + MemoryDescriptorType(TierId tid, PoolId pid, ClassId cid) : + tid_(tid), pid_(pid), cid_(cid) {} + TierId tid_; + PoolId pid_; + ClassId cid_; + + bool operator<(const MemoryDescriptorType& rhs) const { + return std::make_tuple(tid_, pid_, cid_) < std::make_tuple(rhs.tid_, rhs.pid_, rhs.cid_); + } + + bool operator==(const MemoryDescriptorType& rhs) const { + return std::make_tuple(tid_, pid_, cid_) == std::make_tuple(rhs.tid_, rhs.pid_, rhs.cid_); + } +}; + // A base class of cache exposing members and status agnostic of template type. class CacheBase { public: @@ -85,6 +101,9 @@ class CacheBase { CacheBase(CacheBase&&) = default; CacheBase& operator=(CacheBase&&) = default; + // TODO: come up with some reasonable number + static constexpr unsigned kMaxTiers = 2; + // Get a string referring to the cache name for this cache virtual const std::string getCacheName() const = 0; @@ -96,12 +115,24 @@ // @param poolId The pool id to query virtual const MemoryPool& getPool(PoolId poolId) const = 0; + // Get the reference to a memory pool using a tier id, for stats purposes + // + // @param poolId The pool id to query + // @param tierId The tier of the pool id + virtual const MemoryPool& getPoolByTid(PoolId poolId, TierId tid) const = 0; + // Get Pool specific stats (regular pools). This includes stats from the // Memory Pool and also the cache. // // @param poolId the pool id virtual PoolStats getPoolStats(PoolId poolId) const = 0; + // Get Allocation Class specific stats. + // + // @param tid the tier id + // @param poolId the pool id + // @param classId the class id + virtual ACStats getACStats(TierId tid, PoolId poolId, ClassId classId) const = 0; + // @param poolId the pool id virtual AllSlabReleaseEvents getAllSlabReleaseEvents(PoolId poolId) const = 0;
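Since `MemoryDescriptorType` now keys the background movers' per-class bookkeeping, the tuple-based `operator<` above is what makes it usable in ordered containers. A minimal standalone illustration, with the id types simplified to plain integers:

```cpp
#include <cstdint>
#include <cstdio>
#include <map>
#include <tuple>

// Minimal stand-in for MemoryDescriptorType (ids simplified to unsigned);
// the tuple comparison mirrors the operator< defined in Cache.h above.
struct Descriptor {
  unsigned tid_, pid_, cid_;
  bool operator<(const Descriptor& rhs) const {
    return std::make_tuple(tid_, pid_, cid_) <
           std::make_tuple(rhs.tid_, rhs.pid_, rhs.cid_);
  }
};

int main() {
  // The same (pool, class) pair on different tiers forms two distinct keys,
  // which is how per-tier move counters stay separate.
  std::map<Descriptor, uint64_t> movesPerClass;
  movesPerClass[{0, 0, 3}] += 40;  // tier 0, pool 0, class 3
  movesPerClass[{1, 0, 3}] += 7;   // tier 1, same pool/class
  std::printf("%zu\n", movesPerClass.size());  // 2
}
```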
diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index 3b0d9eeaef..7422c1c61c 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -23,6 +23,8 @@ #include #include #include +#include +#include #include #include @@ -59,6 +61,7 @@ #include "cachelib/allocator/PoolOptimizer.h" #include "cachelib/allocator/PoolRebalancer.h" #include "cachelib/allocator/PoolResizer.h" +#include "cachelib/allocator/PrivateMemoryManager.h" #include "cachelib/allocator/ReadOnlySharedCacheView.h" #include "cachelib/allocator/Reaper.h" #include "cachelib/allocator/RebalanceStrategy.h" @@ -219,7 +222,7 @@ class CacheAllocator : public CacheBase { using PoolIds = std::set<PoolId>; using EventTracker = EventInterface<Key>; - + using ClassBgStatsType = std::map<MemoryDescriptorType, uint64_t>; // SampleItem is a wrapper for the CacheItem which is provided as the sample // for uploading to Scuba (see ItemStatsExporter). It is guaranteed that the // CacheItem is accessible as long as the SampleItem is around since the @@ -350,6 +353,38 @@ virtual bool isValid() const { return true; } }; using ChainedItemMovingSync = std::function<std::unique_ptr<SyncObj>(Key)>; + + // Eviction related data returned from + // function executed under mmContainer lock + struct EvictionData { + EvictionData() = delete; + EvictionData(Item *candidate_, + Item *toRecycle_, + Item *toRecycleParent_, + bool chainedItem_, + bool expired_, + typename NvmCacheT::PutToken token_, + WriteHandle candidateHandle_) : + candidate(candidate_), + toRecycle(toRecycle_), + toRecycleParent(toRecycleParent_), + expired(expired_), + chainedItem(chainedItem_), + token(std::move(token_)), + candidateHandle(std::move(candidateHandle_)) {} + + // item that is candidate for eviction + Item *candidate; + // actual alloc that will be recycled + // back up to allocator + Item *toRecycle; + // possible parent ref + Item *toRecycleParent; + bool expired; // is the item expired + bool chainedItem; // is it a chained item + typename NvmCacheT::PutToken token; // put token for NVM cache + WriteHandle candidateHandle; // handle in case we don't use the moving bit + }; using AccessContainer = typename Item::AccessContainer; using MMContainer = typename Item::MMContainer; @@ -709,10 +744,10 @@ uint32_t getUsableSize(const Item& item) const; // create memory assignment to bg workers - auto createBgWorkerMemoryAssignments(size_t numWorkers); + auto createBgWorkerMemoryAssignments(size_t numWorkers, TierId tid); // whether bg worker should be woken - bool shouldWakeupBgEvictor(PoolId pid, ClassId cid); + bool shouldWakeupBgEvictor(TierId tid, PoolId pid, ClassId cid); // Get a random item from memory // This is useful for profiling and sampling cachelib managed memory @@ -810,7 +845,7 @@ // @param config new config for the pool // // @throw std::invalid_argument if the poolId is invalid - void overridePoolConfig(PoolId pid, const MMConfig& config); + void overridePoolConfig(TierId tid, PoolId pid, const MMConfig& config); // update an existing pool's rebalance strategy // @@ -851,8 +886,9 @@ // @return true if the operation succeeded. false if the size of the pool is // smaller than _bytes_ // @throw std::invalid_argument if the poolId is invalid. + // TODO: should call shrinkPool for specific tier? bool shrinkPool(PoolId pid, size_t bytes) { - return allocator_->shrinkPool(pid, bytes); + return allocator_[currentTier()]->shrinkPool(pid, bytes); } // grow an existing pool by _bytes_. This will fail if there is no @@ -861,8 +897,9 @@ // @return true if the pool was grown. false if the necessary number of // bytes were not available. // @throw std::invalid_argument if the poolId is invalid. + // TODO: should call growPool for specific tier? bool growPool(PoolId pid, size_t bytes) { - return allocator_->growPool(pid, bytes); + return allocator_[currentTier()]->growPool(pid, bytes); } // move bytes from one pool to another. The source pool should be at least // correct size to do the transfer. // @throw std::invalid_argument if src or dest is invalid pool bool resizePools(PoolId src, PoolId dest, size_t bytes) override { - return allocator_->resizePools(src, dest, bytes); + return allocator_[currentTier()]->resizePools(src, dest, bytes); } // Add a new compact cache with given name and size @@ -1104,12 +1141,13 @@ // @throw std::invalid_argument if the memory does not belong to this // cache allocator AllocInfo getAllocInfo(const void* memory) const { - return allocator_->getAllocInfo(memory); + return allocator_[getTierId(memory)]->getAllocInfo(memory); } // return the ids for the set of existing pools in this cache. std::set<PoolId> getPoolIds() const override final { - return allocator_->getPoolIds(); + // all tiers have the same pool ids. TODO: deduplicate + return allocator_[0]->getPoolIds(); } // return a list of pool ids that are backing compact caches. This includes @@ -1121,18 +1159,22 @@ // return the pool with the specified id. const MemoryPool& getPool(PoolId pid) const override final { - return allocator_->getPool(pid); + return allocator_[currentTier()]->getPool(pid); + } + + const MemoryPool& getPoolByTid(PoolId pid, TierId tid) const override final { + return allocator_[tid]->getPool(pid); } // calculate the number of slabs to be advised/reclaimed in each pool PoolAdviseReclaimData calcNumSlabsToAdviseReclaim() override final { auto regularPoolIds = getRegularPoolIds(); - return allocator_->calcNumSlabsToAdviseReclaim(regularPoolIds); + return allocator_[currentTier()]->calcNumSlabsToAdviseReclaim(regularPoolIds); } // update number of slabs to advise in the cache void updateNumSlabsToAdvise(int32_t numSlabsToAdvise) override final { - allocator_->updateNumSlabsToAdvise(numSlabsToAdvise); + allocator_[currentTier()]->updateNumSlabsToAdvise(numSlabsToAdvise); } // returns a valid PoolId corresponding to the name or kInvalidPoolId if the @@ -1140,8 +1182,9 @@ PoolId getPoolId(folly::StringPiece name) const noexcept; // returns the pool's name by its poolId. - std::string getPoolName(PoolId poolId) const override { - return allocator_->getPoolName(poolId); + std::string getPoolName(PoolId poolId) const { + // all tiers have the same pool names. + return allocator_[0]->getPoolName(poolId); } // get stats related to all kinds of slab release events.
@@ -1174,6 +1217,43 @@ class CacheAllocator : public CacheBase { return stats; } + // returns the background mover stats per thread + std::vector getBackgroundMoverStats(MoverDir direction) const { + auto stats = std::vector(); + if (direction == MoverDir::Evict) { + for (auto& bg : backgroundEvictor_) + stats.push_back(bg->getStats()); + } else if (direction == MoverDir::Promote) { + for (auto& bg : backgroundPromoter_) + stats.push_back(bg->getStats()); + } + return stats; + } + + ClassBgStatsType + getBackgroundMoverClassStats(MoverDir direction) const { + ClassBgStatsType stats; + auto record = [&](auto &bg) { + //gives a unique descriptor + auto classStats = bg->getClassStats(); + for (const auto& [key,value] : classStats) { + stats[key] = value; + } + }; + + if (direction == MoverDir::Evict) { + for (auto& bg : backgroundEvictor_) { + record(bg); + } + } else if (direction == MoverDir::Promote) { + for (auto& bg : backgroundPromoter_) { + record(bg); + } + } + + return stats; + } + // returns the pool rebalancer stats RebalancerStats getRebalancerStats() const { auto stats = @@ -1199,6 +1279,8 @@ class CacheAllocator : public CacheBase { // pool stats by pool id PoolStats getPoolStats(PoolId pid) const override final; + // pool stats by tier id and pool id + PoolStats getPoolStats(TierId tid, PoolId pid) const; // This can be expensive so it is not part of PoolStats PoolEvictionAgeStats getPoolEvictionAgeStats( @@ -1213,6 +1295,9 @@ class CacheAllocator : public CacheBase { // return cache's memory usage stats CacheMemoryStats getCacheMemoryStats() const override final; + // return stats for Allocation Class + ACStats getACStats(TierId tid, PoolId pid, ClassId cid) const override final; + // return the nvm cache stats map util::StatsMap getNvmCacheStatsMap() const override final; @@ -1322,6 +1407,7 @@ class CacheAllocator : public CacheBase { sizeof(typename RefcountWithFlags::Value) + sizeof(uint32_t) + sizeof(uint32_t) + sizeof(KAllocation)) == sizeof(Item), "vtable overhead"); + // Check for CompressedPtr single/multi tier support static_assert(32 == sizeof(Item), "item overhead is 32 bytes"); // make sure there is no overhead in ChainedItem on top of a regular Item @@ -1416,11 +1502,14 @@ class CacheAllocator : public CacheBase { using MMContainerPtr = std::unique_ptr; using MMContainers = - std::array, - MemoryPoolManager::kMaxPools>; + std::vector, + MemoryPoolManager::kMaxPools>>; void createMMContainers(const PoolId pid, MMConfig config); + TierId getTierId(const Item& item) const; + TierId getTierId(const void* ptr) const; + // acquire the MMContainer corresponding to the the Item's class and pool. // // @return pointer to the MMContainer. @@ -1428,7 +1517,12 @@ class CacheAllocator : public CacheBase { // allocation from the memory allocator. MMContainer& getMMContainer(const Item& item) const noexcept; - MMContainer& getMMContainer(PoolId pid, ClassId cid) const noexcept; + MMContainer& getMMContainer(TierId tid, PoolId pid, ClassId cid) const noexcept; + + // Get stats of the specified pid and cid. + // If such mmcontainer is not valid (pool id or cid out of bound) + // or the mmcontainer is not initialized, return an empty stat. + MMContainerStat getMMContainerStat(TierId tid, PoolId pid, ClassId cid) const noexcept; // create a new cache allocation. The allocation can be initialized // appropriately and made accessible through insert or insertOrReplace. 
@@ -1459,8 +1553,49 @@ Key key, uint32_t size, uint32_t creationTime, - uint32_t expiryTime, - bool fromBgThread = false); + uint32_t expiryTime); + + // create a new cache allocation on a specific memory tier. + // For description see allocateInternal. + // + // @param tid id of a memory tier + // @param evict whether to evict an item from tier tid in case there + // is not enough memory + WriteHandle allocateInternalTier(TierId tid, + PoolId id, + Key key, + uint32_t size, + uint32_t creationTime, + uint32_t expiryTime, + bool evict); + + // create a new cache allocation on a specific memory tier, + // for a given class id. Used in moving between tiers since + // class ids are the same among the tiers. + // For description see allocateInternal. + // + // @param tid id of a memory tier + // @param pid a pool id + // @param cid a class id + // + void* allocateInternalTierByCid(TierId tid, + PoolId pid, + ClassId cid); + + // create a new cache allocation on a specific memory tier, + // for a given class id. Used in moving between tiers since + // class ids are the same among the tiers. + // For description see allocateInternal. + // + // @param tid id of a memory tier + // @param pid a pool id + // @param cid a class id + // @param batch the number of allocations to make + // + std::vector<void*> allocateInternalTierByCidBatch(TierId tid, + PoolId pid, + ClassId cid, + uint64_t batch); // Allocate a chained item // @@ -1478,14 +1613,34 @@ // if the item is invalid WriteHandle allocateChainedItemInternal(const Item& parent, uint32_t size); + // Allocate a chained item to a specific tier + // + // The resulting chained item does not have a parent item yet + // and if we fail to link to the chain for any reason + // the chained item will be freed once the handle is dropped. + // + // The parent item parameter here is mainly used to find the + // correct pool to allocate memory for this chained item + // + // @param parent parent item + // @param size the size for the chained allocation + // @param tid the tier to allocate on + // + // @return handle to the chained allocation + // @throw std::invalid_argument if the size requested is invalid or + // if the item is invalid + WriteHandle allocateChainedItemInternalTier(const Item& parent, + uint32_t size, + TierId tid); + // Given an existing item, allocate a new one for the // existing one to later be moved into. // - // @param oldItem the item we want to allocate a new item for + // @param item reference to the item we want to allocate a new item for // // @return handle to the newly allocated item // - WriteHandle allocateNewItemForOldItem(const Item& oldItem); + WriteHandle allocateNewItemForOldItem(const Item& item); // internal helper that grabs a refcounted handle to the item. This does // not record the access to reflect in the mmContainer. @@ -1544,12 +1699,14 @@ // callback is responsible for copying the contents and fixing the semantics // of chained item. // - // @param oldItem Reference to the item being moved + // @param oldItem item being moved // @param newItemHdl Reference to the handle of the new item being moved into - // + // @param skipAddInMMContainer so we can tell if we should add in mmContainer or wait + // to do it in a batch + // @param fromBgThread use memmove instead of memcopy (for DTO testing) // @return true if the move was completed, and the containers were updated // successfully.
- bool moveRegularItem(Item& oldItem, WriteHandle& newItemHdl); + bool moveRegularItem(Item& oldItem, WriteHandle& newItemHdl, bool skipAddInMMContainer, bool fromBgThread); // template class for viewAsChainedAllocs that takes either ReadHandle or // WriteHandle @@ -1582,9 +1739,8 @@ // will be unmarked as having chained allocations. Parent will not be null // after calling this API. // - // Parent and NewParent must be valid handles to items with same key and - // parent must have chained items and parent handle must be the only - // outstanding handle for parent. New parent must be without any chained item + // NewParent must be a valid handle to an item with the same key as Parent, and + // Parent must have chained items. New parent must be without any chained item // handles. // // Chained item lock for the parent's key needs to be held in exclusive mode. @@ -1711,15 +1867,18 @@ // Implementation to find a suitable eviction from the container. The // two parameters together identify a single container. // + // @param tid the id of the tier to look for evictions inside // @param pid the id of the pool to look for evictions inside // @param cid the id of the class to look for evictions inside // @return An evicted item or nullptr if there is no suitable candidate found // within the configured number of attempts. - Item* findEviction(PoolId pid, ClassId cid); + Item* findEviction(TierId tid, PoolId pid, ClassId cid); + std::vector<Item*> findEvictionBatch(TierId tid, PoolId pid, ClassId cid, unsigned int batch); // Get next eviction candidate from MMContainer, remove from AccessContainer, // MMContainer and insert into NVMCache if enabled. // + // @param tid the id of the tier to look for evictions inside // @param pid the id of the pool to look for evictions inside // @param cid the id of the class to look for evictions inside // @param searchTries number of search attempts so far. @@ -1727,11 +1886,68 @@ // @return pair of [candidate, toRecycle].
Pair of null if reached the end of // the eviction queue or no suitable candidate found // within the configured number of attempts - std::pair<Item*, Item*> getNextCandidate(PoolId pid, + std::pair<Item*, Item*> getNextCandidate(TierId tid, + PoolId pid, ClassId cid, unsigned int& searchTries); using EvictionIterator = typename MMContainer::LockedIterator; + // similar to the above method but returns a batch of evicted items + // as a pair of vectors + std::vector<EvictionData> getNextCandidates(TierId tid, + PoolId pid, + ClassId cid, + unsigned int batch, + bool markMoving, + bool fromBgThread); + + std::vector<Item*> getNextCandidatesPromotion(TierId tid, + PoolId pid, + ClassId cid, + unsigned int batch, + bool markMoving, + bool fromBgThread); + + // + // Common function in case a move among tiers fails during eviction + // @param candidate the item that failed to move + // @param the corresponding put token + // @param if we are on the last tier + // @param if candidate is expired + // @param if we are using the moving bit + // + // If insertOrReplace was called during the move, + // the candidate will not be accessible (the replace failed during tryEvict); + // this is why we failed to evict to the next tier, and insertOrReplace + // will remove the item from NVM cache. + // However, if the candidate is accessible, + // the allocation in the next tier failed, so we will continue to + // evict the item to NVM cache + bool handleFailedMove(Item* candidate, + typename NvmCacheT::PutToken& token, + bool isExpired, + bool markMoving); + + // Try to move the item down to the next memory tier + // + // @param tid current tier ID of the item + // @param pid the pool ID the item belongs to. + // @param item the item to evict + // + // @return valid handle to the item. This will be the last + // handle to the item. On failure an empty handle. + WriteHandle tryEvictToNextMemoryTier(TierId tid, PoolId pid, Item& item); + + // Try to move the item down to the next memory tier + // + // @param item the item to evict + // + // @return valid handle to the item. This will be the last + // handle to the item. On failure an empty handle. + WriteHandle tryEvictToNextMemoryTier(Item& item); + // Wakes up waiters if there are any // @@ -1758,7 +1974,7 @@ const typename Item::PtrCompressor& compressor); unsigned int reclaimSlabs(PoolId id, size_t numSlabs) final { - return allocator_->reclaimSlabsAndGrow(id, numSlabs); + return allocator_[currentTier()]->reclaimSlabsAndGrow(id, numSlabs); } FOLLY_ALWAYS_INLINE EventTracker* getEventTracker() const { @@ -1817,7 +2033,7 @@ const void* hint = nullptr) final; // @param releaseContext slab release context - void releaseSlabImpl(const SlabReleaseContext& releaseContext); + void releaseSlabImpl(TierId tid, const SlabReleaseContext& releaseContext); // @return true when successfully marked as moving, // false when this item has already been freed @@ -1860,24 +2076,37 @@ // primitives. So we consciously exempt ourselves here from TSAN data race // detection. folly::annotate_ignore_thread_sanitizer_guard g(__FILE__, __LINE__); - auto slabsSkipped = allocator_->forEachAllocation(std::forward<Fn>(f)); + auto slabsSkipped = allocator_[currentTier()]->forEachAllocation(std::forward<Fn>(f)); stats().numReaperSkippedSlabs.add(slabsSkipped); } // exposed for the background evictor to iterate through the memory and evict // in batch.
This should improve insertion path for tiered memory config - size_t traverseAndEvictItems(unsigned int /* pid */, - unsigned int /* cid */, - size_t /* batch */) { - throw std::runtime_error("Not supported yet!"); + size_t traverseAndEvictItems(unsigned int tid, + unsigned int pid, + unsigned int cid, + size_t batch) { + util::LatencyTracker tracker{stats().bgEvictLatency_, batch}; + auto& mmContainer = getMMContainer(tid, pid, cid); + uint32_t currItems = mmContainer.size(); + if (currItems < batch) { + batch = currItems; + if (batch == 0) { + return 0; + } + } + auto evictionData = getNextCandidates(tid,pid,cid,batch, + true,true); + size_t evictions = evictionData.size(); + (*stats_.regularItemEvictions)[tid][pid][cid].add(evictions); + return evictions; } - - // exposed for the background promoter to iterate through the memory and - // promote in batch. This should improve find latency - size_t traverseAndPromoteItems(unsigned int /* pid */, - unsigned int /* cid */, - size_t /* batch */) { - throw std::runtime_error("Not supported yet!"); + + size_t traverseAndPromoteItems(unsigned int tid, unsigned int pid, unsigned int cid, size_t batch) { + util::LatencyTracker tracker{stats().bgPromoteLatency_, batch}; + auto candidates = getNextCandidatesPromotion(tid,pid,cid,batch, + true,true); + return candidates.size(); } // returns true if nvmcache is enabled and we should write this item to @@ -1920,10 +2149,12 @@ class CacheAllocator : public CacheBase { std::unique_ptr& worker, std::chrono::seconds timeout = std::chrono::seconds{0}); - ShmSegmentOpts createShmCacheOpts(); - std::unique_ptr createNewMemoryAllocator(); - std::unique_ptr restoreMemoryAllocator(); - std::unique_ptr restoreCCacheManager(); + ShmSegmentOpts createShmCacheOpts(TierId tid); + PrivateSegmentOpts createPrivateSegmentOpts(TierId tid); + std::unique_ptr createPrivateAllocator(TierId tid); + std::unique_ptr createNewMemoryAllocator(TierId tid); + std::unique_ptr restoreMemoryAllocator(TierId tid); + std::unique_ptr restoreCCacheManager(TierId tid); PoolIds filterCompactCachePools(const PoolIds& poolIds) const; @@ -1943,7 +2174,7 @@ class CacheAllocator : public CacheBase { } typename Item::PtrCompressor createPtrCompressor() const { - return allocator_->createPtrCompressor(); + return typename Item::PtrCompressor(allocator_); } // helper utility to throttle and optionally log. 
@@ -1966,9 +2197,14 @@ class CacheAllocator : public CacheBase { // @param type the type of initialization // @return nullptr if the type is invalid - // @return pointer to memory allocator + // @return vector of pointers to memory allocator // @throw std::runtime_error if type is invalid - std::unique_ptr initAllocator(InitMemType type); + std::vector> initAllocator(InitMemType type); + + std::vector> createPrivateAllocators(); + std::vector> createAllocators(); + std::vector> restoreAllocators(); + // @param type the type of initialization // @return nullptr if the type is invalid // @return pointer to access container @@ -1980,18 +2216,14 @@ class CacheAllocator : public CacheBase { std::optional saveNvmCache(); void saveRamCache(); - static bool itemExclusivePredicate(const Item& item) { - return item.getRefCount() == 0; + static bool itemSlabMovePredicate(const Item& item) { + return item.isMoving() && item.getRefCount() == 0; } static bool itemExpiryPredicate(const Item& item) { return item.getRefCount() == 1 && item.isExpired(); } - static bool parentEvictForSlabReleasePredicate(const Item& item) { - return item.getRefCount() == 1 && !item.isMoving(); - } - std::unique_ptr createDeserializer(); // Execute func on each item. `func` can throw exception but must ensure @@ -2028,44 +2260,6 @@ class CacheAllocator : public CacheBase { : false; } - // returns the background mover stats - BackgroundMoverStats getBackgroundMoverStats(MoverDir direction) const { - auto stats = BackgroundMoverStats{}; - if (direction == MoverDir::Evict) { - for (auto& bg : backgroundEvictor_) - stats += bg->getStats(); - } else if (direction == MoverDir::Promote) { - for (auto& bg : backgroundPromoter_) - stats += bg->getStats(); - } - return stats; - } - - std::map> getBackgroundMoverClassStats( - MoverDir direction) const { - std::map> stats; - - if (direction == MoverDir::Evict) { - for (auto& bg : backgroundEvictor_) { - for (auto& pid : bg->getClassStats()) { - for (auto& cid : pid.second) { - stats[pid.first][cid.first] += cid.second; - } - } - } - } else if (direction == MoverDir::Promote) { - for (auto& bg : backgroundPromoter_) { - for (auto& pid : bg->getClassStats()) { - for (auto& cid : pid.second) { - stats[pid.first][cid.first] += cid.second; - } - } - } - } - - return stats; - } - bool tryGetHandleWithWaitContextForMovingItem(Item& item, WriteHandle& handle); @@ -2148,6 +2342,19 @@ class CacheAllocator : public CacheBase { // BEGIN private members + TierId currentTier() const { + // TODO: every function which calls this method should be refactored. + // We should go case by case and either make such function work on + // all tiers or expose separate parameter to describe the tier ID. + return 0; + } + + unsigned getNumTiers() const { + return config_.memoryTierConfigs.size(); + } + + size_t memoryTierSize(TierId tid) const; + // Whether the memory allocator for this cache allocator was created on shared // memory. The hash table, chained item hash table etc is also created on // shared memory except for temporary shared memory mode when they're created @@ -2160,6 +2367,8 @@ class CacheAllocator : public CacheBase { // is not persisted when cache process exits. std::unique_ptr tempShm_; + std::unique_ptr privMemManager_; + std::unique_ptr shmManager_; // Deserialize data to restore cache allocator. Used only while attaching to @@ -2173,9 +2382,10 @@ class CacheAllocator : public CacheBase { const MMConfig mmConfig_{}; // the memory allocator for allocating out of the available memory. 
- std::unique_ptr allocator_; + std::vector> allocator_; // compact cache allocator manager + // TODO: per tier? std::unique_ptr compactCacheManager_; // compact cache instances reside here when user "add" or "attach" compact @@ -2226,7 +2436,7 @@ class CacheAllocator : public CacheBase { // free memory monitor std::unique_ptr memMonitor_; - // background evictor + // background data movement std::vector>> backgroundEvictor_; std::vector>> backgroundPromoter_; @@ -2371,6 +2581,9 @@ CacheAllocator::CacheAllocator( tempShm_(type == InitMemType::kNone && isOnShm_ ? std::make_unique(config_.getCacheSize()) : nullptr), + privMemManager_(type == InitMemType::kNone && !isOnShm_ + ? std::make_unique() + : nullptr), shmManager_(type != InitMemType::kNone ? std::make_unique(config_.cacheDir, config_.isUsingPosixShm()) @@ -2382,12 +2595,12 @@ CacheAllocator::CacheAllocator( : serialization::CacheAllocatorMetadata{}}, allocator_(initAllocator(type)), compactCacheManager_(type != InitMemType::kMemAttach - ? std::make_unique(*allocator_) - : restoreCCacheManager()), + ? std::make_unique(*allocator_[0] /* TODO: per tier */) + : restoreCCacheManager(0/* TODO: per tier */)), compressor_(createPtrCompressor()), mmContainers_(type == InitMemType::kMemAttach ? deserializeMMContainers(*deserializer_, compressor_) - : MMContainers{}), + : MMContainers{getNumTiers()}), accessContainer_(initAccessContainer( type, detail::kShmHashTableName, config.accessConfig)), chainedItemAccessContainer_( @@ -2422,48 +2635,115 @@ CacheAllocator::~CacheAllocator() { } template -ShmSegmentOpts CacheAllocator::createShmCacheOpts() { +ShmSegmentOpts CacheAllocator::createShmCacheOpts(TierId tid) { ShmSegmentOpts opts; opts.alignment = sizeof(Slab); // TODO: we support single tier so far - if (config_.memoryTierConfigs.size() > 1) { - throw std::invalid_argument("CacheLib only supports a single memory tier"); + if (config_.memoryTierConfigs.size() > 2) { + throw std::invalid_argument("CacheLib only supports two memory tiers"); } - opts.memBindNumaNodes = config_.memoryTierConfigs[0].getMemBind(); + opts.memBindNumaNodes = config_.memoryTierConfigs[tid].getMemBind(); return opts; } +template +PrivateSegmentOpts CacheAllocator::createPrivateSegmentOpts(TierId tid) { + PrivateSegmentOpts opts; + opts.alignment = sizeof(Slab); + auto memoryTierConfigs = config_.getMemoryTierConfigs(); + opts.memBindNumaNodes = memoryTierConfigs[tid].getMemBind(); + + return opts; +} + +template +size_t CacheAllocator::memoryTierSize(TierId tid) const { + auto partitions = std::accumulate(config_.memoryTierConfigs.begin(), config_.memoryTierConfigs.end(), 0UL, + [](const size_t i, const MemoryTierCacheConfig& config){ + return i + config.getRatio(); + }); + + return config_.memoryTierConfigs[tid].calculateTierSize(config_.getCacheSize(), partitions); +} + template std::unique_ptr -CacheAllocator::createNewMemoryAllocator() { +CacheAllocator::createPrivateAllocator(TierId tid) { + if (isOnShm_) { + return std::make_unique( + getAllocatorConfig(config_), + tempShm_->getAddr(), + memoryTierSize(tid)); + } else { + return std::make_unique( + getAllocatorConfig(config_), + privMemManager_->createMapping(config_.size, createPrivateSegmentOpts(tid)), + memoryTierSize(tid)); + } +} + +template +std::unique_ptr +CacheAllocator::createNewMemoryAllocator(TierId tid) { + size_t tierSize = memoryTierSize(tid); return std::make_unique( getAllocatorConfig(config_), shmManager_ - ->createShm(detail::kShmCacheName, config_.getCacheSize(), - config_.slabMemoryBaseAddr, 
createShmCacheOpts()) + ->createShm(detail::kShmCacheName + std::to_string(tid), + tierSize, config_.slabMemoryBaseAddr, + createShmCacheOpts(tid)) .addr, - config_.getCacheSize()); + tierSize); } template std::unique_ptr -CacheAllocator::restoreMemoryAllocator() { +CacheAllocator::restoreMemoryAllocator(TierId tid) { return std::make_unique( deserializer_->deserialize(), shmManager_ - ->attachShm(detail::kShmCacheName, config_.slabMemoryBaseAddr, - createShmCacheOpts()) - .addr, - config_.getCacheSize(), + ->attachShm(detail::kShmCacheName + std::to_string(tid), + config_.slabMemoryBaseAddr, createShmCacheOpts(tid)).addr, + memoryTierSize(tid), config_.disableFullCoredump); } +template +std::vector> +CacheAllocator::createPrivateAllocators() { + std::vector> allocators; + for (int tid = 0; tid < getNumTiers(); tid++) { + allocators.emplace_back(createPrivateAllocator(tid)); + } + return allocators; +} + +template +std::vector> +CacheAllocator::createAllocators() { + std::vector> allocators; + for (int tid = 0; tid < getNumTiers(); tid++) { + allocators.emplace_back(createNewMemoryAllocator(tid)); + } + return allocators; +} + +template +std::vector> +CacheAllocator::restoreAllocators() { + std::vector> allocators; + for (int tid = 0; tid < getNumTiers(); tid++) { + allocators.emplace_back(restoreMemoryAllocator(tid)); + } + return allocators; +} + template std::unique_ptr -CacheAllocator::restoreCCacheManager() { +CacheAllocator::restoreCCacheManager(TierId tid) { return std::make_unique( deserializer_->deserialize(), - *allocator_); + *allocator_[tid]); } template @@ -2567,21 +2847,15 @@ void CacheAllocator::initWorkers() { } template -std::unique_ptr CacheAllocator::initAllocator( +std::vector> +CacheAllocator::initAllocator( InitMemType type) { if (type == InitMemType::kNone) { - if (isOnShm_ == true) { - return std::make_unique(getAllocatorConfig(config_), - tempShm_->getAddr(), - config_.getCacheSize()); - } else { - return std::make_unique(getAllocatorConfig(config_), - config_.getCacheSize()); - } + return createPrivateAllocators(); } else if (type == InitMemType::kMemNew) { - return createNewMemoryAllocator(); + return createAllocators(); } else if (type == InitMemType::kMemAttach) { - return restoreMemoryAllocator(); + return restoreAllocators(); } // Invalid type @@ -2649,42 +2923,83 @@ CacheAllocator::allocate(PoolId poolId, } template -bool CacheAllocator::shouldWakeupBgEvictor(PoolId /* pid */, - ClassId /* cid */) { +bool CacheAllocator::shouldWakeupBgEvictor(TierId tid, PoolId pid, ClassId cid) { + // TODO: should we also work on lower tiers? should we have separate set of params? 
+ if (tid == 1) return false; + double usage = getPoolByTid(pid, tid).getApproxUsage(cid); + if (((1 - usage) * 100) <= config_.lowEvictionAcWatermark) { + return true; + } return false; } template <typename CacheTrait> -typename CacheAllocator<CacheTrait>::WriteHandle -CacheAllocator<CacheTrait>::allocateInternal(PoolId pid, - typename Item::Key key, - uint32_t size, - uint32_t creationTime, - uint32_t expiryTime, - bool fromBgThread) { +std::vector<void*> CacheAllocator<CacheTrait>::allocateInternalTierByCidBatch(TierId tid, + PoolId pid, + ClassId cid, uint64_t batch) { util::LatencyTracker tracker{stats().allocateLatency_}; + SCOPE_FAIL { stats_.invalidAllocs.add(batch); }; + + util::RollingLatencyTracker rollTracker{ + (*stats_.classAllocLatency)[tid][pid][cid]}; + + (*stats_.allocAttempts)[tid][pid][cid].add(batch); + + auto memory = allocator_[tid]->allocateByCidBatch(pid, cid, batch); + + if (memory.size() < batch) { + uint64_t toEvict = batch - memory.size(); + auto evicted = findEvictionBatch(tid, pid, cid, toEvict); + if (evicted.size() < toEvict) { + (*stats_.allocFailures)[tid][pid][cid].add(toEvict - evicted.size()); + } + if (evicted.size() > 0) { + // case where we got some allocations from eviction; add them to + // the new allocations + memory.insert(memory.end(), evicted.begin(), evicted.end()); + return memory; + } + } + return memory; +} + +template <typename CacheTrait> +typename CacheAllocator<CacheTrait>::WriteHandle +CacheAllocator<CacheTrait>::allocateInternalTier(TierId tid, + PoolId pid, + typename Item::Key key, + uint32_t size, + uint32_t creationTime, + uint32_t expiryTime, + bool evict) { + util::LatencyTracker tracker{stats().allocateLatency_}; SCOPE_FAIL { stats_.invalidAllocs.inc(); }; // number of bytes required for this item const auto requiredSize = Item::getRequiredSize(key, size); // the allocation class in our memory allocator. - const auto cid = allocator_->getAllocationClassId(pid, requiredSize); + const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize); + util::RollingLatencyTracker rollTracker{ + (*stats_.classAllocLatency)[tid][pid][cid]}; - (*stats_.allocAttempts)[pid][cid].inc(); + (*stats_.allocAttempts)[tid][pid][cid].inc(); - void* memory = allocator_->allocate(pid, requiredSize); + void* memory = allocator_[tid]->allocate(pid, requiredSize); - if (backgroundEvictor_.size() && !fromBgThread && - (memory == nullptr || shouldWakeupBgEvictor(pid, cid))) { + if (backgroundEvictor_.size() && + (memory == nullptr || shouldWakeupBgEvictor(tid, pid, cid))) { backgroundEvictor_[BackgroundMover<CacheT>::workerId( - pid, cid, backgroundEvictor_.size())] + tid, pid, cid, backgroundEvictor_.size())] ->wakeUp(); } if (memory == nullptr) { - memory = findEviction(pid, cid); + if (!evict || config_.noOnlineEviction) { + return {}; + } + memory = findEviction(tid, pid, cid); } WriteHandle handle; @@ -2695,18 +3010,18 @@ // for example. SCOPE_FAIL { // free back the memory to the allocator since we failed. - allocator_->free(memory); + allocator_[tid]->free(memory); }; handle = acquire(new (memory) Item(key, size, creationTime, expiryTime)); if (handle) { handle.markNascent(); - (*stats_.fragmentationSize)[pid][cid].add( + (*stats_.fragmentationSize)[tid][pid][cid].add( util::getFragmentation(*this, *handle)); } } else { // failed to allocate memory.
- (*stats_.allocFailures)[pid][cid].inc(); + (*stats_.allocFailures)[tid][pid][cid].inc(); // wake up rebalancer if (!config_.poolRebalancerDisableForcedWakeUp && poolRebalancer_) { poolRebalancer_->wakeUp(); @@ -2723,6 +3038,23 @@ CacheAllocator::allocateInternal(PoolId pid, return handle; } +template +typename CacheAllocator::WriteHandle +CacheAllocator::allocateInternal(PoolId pid, + typename Item::Key key, + uint32_t size, + uint32_t creationTime, + uint32_t expiryTime) { + auto tid = 0; /* TODO: consult admission policy */ + for(TierId tid = 0; tid < getNumTiers(); ++tid) { + bool evict = (!config_.insertToFirstFreeTier || tid == getNumTiers() - 1); + auto handle = allocateInternalTier(tid, pid, key, size, creationTime, + expiryTime, evict); + if (handle) return handle; + } + return {}; +} + template typename CacheAllocator::WriteHandle CacheAllocator::allocateChainedItem(const ReadHandle& parent, @@ -2746,35 +3078,55 @@ template typename CacheAllocator::WriteHandle CacheAllocator::allocateChainedItemInternal(const Item& parent, uint32_t size) { + auto tid = 0; /* TODO: consult admission policy */ + for(TierId tid = 0; tid < getNumTiers(); ++tid) { + auto handle = allocateChainedItemInternalTier(parent, size, tid); + if (handle) return handle; + } + return {}; +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::allocateChainedItemInternalTier(const Item& parent, + uint32_t size, + TierId tid) { util::LatencyTracker tracker{stats().allocateLatency_}; SCOPE_FAIL { stats_.invalidAllocs.inc(); }; // number of bytes required for this item const auto requiredSize = ChainedItem::getRequiredSize(size); + + //this is okay because pools/classes are duplicated among the tiers + auto ptid = getTierId(parent); + const auto pid = allocator_[ptid]->getAllocInfo(parent.getMemory()).poolId; + const auto cid = allocator_[ptid]->getAllocationClassId(pid, requiredSize); - const auto pid = allocator_->getAllocInfo(parent.getMemory()).poolId; - const auto cid = allocator_->getAllocationClassId(pid, requiredSize); + // TODO: per-tier? 
Right now stats_ are not used in any public periodic + // worker + util::RollingLatencyTracker rollTracker{ + (*stats_.classAllocLatency)[tid][pid][cid]}; - (*stats_.allocAttempts)[pid][cid].inc(); + (*stats_.allocAttempts)[tid][pid][cid].inc(); - void* memory = allocator_->allocate(pid, requiredSize); + void* memory = allocator_[tid]->allocate(pid, requiredSize); if (memory == nullptr) { - memory = findEviction(pid, cid); + memory = findEviction(tid, pid, cid); } if (memory == nullptr) { - (*stats_.allocFailures)[pid][cid].inc(); + (*stats_.allocFailures)[tid][pid][cid].inc(); return WriteHandle{}; } - SCOPE_FAIL { allocator_->free(memory); }; + SCOPE_FAIL { allocator_[tid]->free(memory); }; auto child = acquire(new (memory) ChainedItem( compressor_.compress(&parent), size, util::getCurrentTimeSec())); if (child) { child.markNascent(); - (*stats_.fragmentationSize)[pid][cid].add( + (*stats_.fragmentationSize)[tid][pid][cid].add( util::getFragmentation(*this, *child)); } @@ -3101,8 +3453,8 @@ CacheAllocator::releaseBackToAllocator(Item& it, throw std::runtime_error( folly::sformat("cannot release this item: {}", it.toString())); } - - const auto allocInfo = allocator_->getAllocInfo(it.getMemory()); + const auto tid = getTierId(it); + const auto allocInfo = allocator_[tid]->getAllocInfo(it.getMemory()); if (ctx == RemoveContext::kEviction) { const auto timeNow = util::getCurrentTimeSec(); @@ -3113,21 +3465,23 @@ CacheAllocator::releaseBackToAllocator(Item& it, stats_.perPoolEvictionAgeSecs_[allocInfo.poolId].trackValue(refreshTime); } - (*stats_.fragmentationSize)[allocInfo.poolId][allocInfo.classId].sub( + (*stats_.fragmentationSize)[tid][allocInfo.poolId][allocInfo.classId].sub( util::getFragmentation(*this, it)); // Chained items can only end up in this place if the user has allocated // memory for a chained item but has decided not to insert the chained item // to a parent item and instead drop the chained item handle. In this case, // we free the chained item directly without calling remove callback. 
-  if (it.isChainedItem()) {
+  //
+  // Except if we are moving a chained item between tiers -
+  // then it == toRecycle and we will want the normal recycle path
+  if (it.isChainedItem() && &it != toRecycle) {
    if (toRecycle) {
      throw std::runtime_error(
          folly::sformat("Cannot recycle a chained item {}, toRecycle {}",
                         it.toString(), toRecycle->toString()));
    }
-
-    allocator_->free(&it);
+    allocator_[tid]->free(&it);
    return ReleaseRes::kReleased;
  }

@@ -3194,10 +3548,10 @@ CacheAllocator<CacheTrait>::releaseBackToAllocator(Item& it,
  while (head) {
    auto next = head->getNext(compressor_);
-
+    const auto tid = getTierId(head);
    const auto childInfo =
-        allocator_->getAllocInfo(static_cast<const void*>(head));
-    (*stats_.fragmentationSize)[childInfo.poolId][childInfo.classId].sub(
+        allocator_[tid]->getAllocInfo(static_cast<const void*>(head));
+    (*stats_.fragmentationSize)[tid][childInfo.poolId][childInfo.classId].sub(
        util::getFragmentation(*this, *head));

    removeFromMMContainer(*head);
@@ -3212,7 +3566,7 @@ CacheAllocator<CacheTrait>::releaseBackToAllocator(Item& it,
      XDCHECK(ReleaseRes::kReleased != res);
      res = ReleaseRes::kRecycled;
    } else {
-      allocator_->free(head);
+      allocator_[tid]->free(head);
    }

    stats_.numChainedChildItems.dec();
@@ -3226,7 +3580,7 @@ CacheAllocator<CacheTrait>::releaseBackToAllocator(Item& it,
    res = ReleaseRes::kRecycled;
  } else {
    XDCHECK(it.isDrained());
-    allocator_->free(&it);
+    allocator_[tid]->free(&it);
  }

  return res;
@@ -3515,14 +3869,9 @@ void CacheAllocator<CacheTrait>::wakeUpWaiters(folly::StringPiece key,
}

template <typename CacheTrait>
-bool CacheAllocator<CacheTrait>::moveRegularItem(Item& oldItem,
-                                                 WriteHandle& newItemHdl) {
-  XDCHECK(oldItem.isMoving());
-  // If an item is expired, proceed to eviction.
-  if (oldItem.isExpired()) {
-    return false;
-  }
-
+bool CacheAllocator<CacheTrait>::moveRegularItem(
+    Item& oldItem, WriteHandle& newItemHdl, bool skipAddInMMContainer, bool fromBgThread) {
+  XDCHECK(!oldItem.isExpired());
  util::LatencyTracker tracker{stats_.moveRegularLatency_};

  XDCHECK_EQ(newItemHdl->getSize(), oldItem.getSize());
@@ -3534,20 +3883,32 @@ bool CacheAllocator<CacheTrait>::moveRegularItem(Item& oldItem,
    newItemHdl->markNvmClean();
  }

-  // Execute the move callback. We cannot make any guarantees about the
-  // consistency of the old item beyond this point, because the callback can
-  // do more than a simple memcpy() e.g. update external references. If there
-  // are any remaining handles to the old item, it is the caller's
-  // responsibility to invalidate them. The move can only fail after this
-  // statement if the old item has been removed or replaced, in which case it
-  // should be fine for it to be left in an inconsistent state.
-  config_.moveCb(oldItem, *newItemHdl, nullptr);
+  if (config_.moveCb) {
+    // Execute the move callback. We cannot make any guarantees about the
+    // consistency of the old item beyond this point, because the callback can
+    // do more than a simple memcpy() e.g. update external references. If there
+    // are any remaining handles to the old item, it is the caller's
+    // responsibility to invalidate them. The move can only fail after this
+    // statement if the old item has been removed or replaced, in which case it
+    // should be fine for it to be left in an inconsistent state.
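+    // moveCb is user-supplied; when it is not set we fall back to the plain
+    // memcpy/memmove path below.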
+ config_.moveCb(oldItem, *newItemHdl, nullptr); + } else { + if (fromBgThread) { + std::memmove(newItemHdl->getMemory(), oldItem.getMemory(), + oldItem.getSize()); + } else { + std::memcpy(newItemHdl->getMemory(), oldItem.getMemory(), + oldItem.getSize()); + } + } - // Adding the item to mmContainer has to succeed since no one can remove the - // item auto& newContainer = getMMContainer(*newItemHdl); - auto mmContainerAdded = newContainer.add(*newItemHdl); - XDCHECK(mmContainerAdded); + if (!skipAddInMMContainer) { + // Adding the item to mmContainer has to succeed since no one can remove the + // item + auto mmContainerAdded = newContainer.add(*newItemHdl); + XDCHECK(mmContainerAdded); + } if (oldItem.hasChainedItem()) { XDCHECK(!newItemHdl->hasChainedItem()) << newItemHdl->toString(); @@ -3589,14 +3950,19 @@ bool CacheAllocator::moveChainedItem(ChainedItem& oldItem, auto parentPtr = &parentItem; - // Execute the move callback. We cannot make any guarantees about the - // consistency of the old item beyond this point, because the callback can - // do more than a simple memcpy() e.g. update external references. If there - // are any remaining handles to the old item, it is the caller's - // responsibility to invalidate them. The move can only fail after this - // statement if the old item has been removed or replaced, in which case it - // should be fine for it to be left in an inconsistent state. - config_.moveCb(oldItem, *newItemHdl, parentPtr); + if (config_.moveCb) { + // Execute the move callback. We cannot make any guarantees about the + // consistency of the old item beyond this point, because the callback can + // do more than a simple memcpy() e.g. update external references. If there + // are any remaining handles to the old item, it is the caller's + // responsibility to invalidate them. The move can only fail after this + // statement if the old item has been removed or replaced, in which case it + // should be fine for it to be left in an inconsistent state. + config_.moveCb(oldItem, *newItemHdl, parentPtr); + } else { + std::memcpy(newItemHdl->getMemory(), oldItem.getMemory(), + oldItem.getSize()); + } // Replace the new item in the position of the old one before both in the // parent's chain and the MMContainer. @@ -3631,23 +3997,496 @@ void CacheAllocator::unlinkItemForEviction(Item& it) { XDCHECK_EQ(0u, ref); } +template +std::vector::Item*> +CacheAllocator::findEvictionBatch(TierId tid, + PoolId pid, + ClassId cid, + unsigned int batch) { + + std::vector toRecycles; + toRecycles.reserve(batch); + auto evictionData = getNextCandidates(tid,pid,cid,batch,true,false); + for (int i = 0; i < evictionData.size(); i++) { + Item *candidate = evictionData[i].candidate; + Item *toRecycle = evictionData[i].toRecycle; + toRecycles.push_back(toRecycle); + // recycle the item. it's safe to do so, even if toReleaseHandle was + // NULL. If `ref` == 0 then it means that we are the last holder of + // that item. + if (candidate->hasChainedItem()) { + (*stats_.chainedItemEvictions)[tid][pid][cid].inc(); + } else { + (*stats_.regularItemEvictions)[tid][pid][cid].inc(); + } + + if (auto eventTracker = getEventTracker()) { + eventTracker->record(AllocatorApiEvent::DRAM_EVICT, candidate->getKey(), + AllocatorApiResult::EVICTED, candidate->getSize(), + candidate->getConfiguredTTL().count()); + } + + XDCHECK(!candidate->isChainedItem()); + // check if by releasing the item we intend to, we actually + // recycle the candidate. 
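+    // passing toRecycle asks releaseBackToAllocator to hand that allocation
+    // back to us (kRecycled) instead of freeing it to the allocator.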
+ auto ret = releaseBackToAllocator(*candidate, RemoveContext::kEviction, + /* isNascent */ false, toRecycle); + XDCHECK_EQ(ret,ReleaseRes::kRecycled); + } + return toRecycles; +} + +template +std::vector::Item*> +CacheAllocator::getNextCandidatesPromotion(TierId tid, + PoolId pid, + ClassId cid, + unsigned int batch, + bool markMoving, + bool fromBgThread) { + std::vector newAllocs; + std::vector blankAllocs; + std::vector newHandles; + std::vector candidateHandles; + std::vector candidates; + candidates.reserve(batch); + candidateHandles.reserve(batch); + newAllocs.reserve(batch); + newHandles.reserve(batch); + + auto& mmContainer = getMMContainer(tid, pid, cid); + unsigned int maxSearchTries = std::max(config_.evictionSearchTries, + batch*4); + + // first try and get allocations in the next tier + blankAllocs = allocateInternalTierByCidBatch(tid-1,pid,cid,batch); + if (blankAllocs.empty()) { + return candidates; + } else if (blankAllocs.size() < batch) { + batch = blankAllocs.size(); + } + XDCHECK_EQ(blankAllocs.size(),batch); + + auto iterateAndMark = [this, tid, pid, cid, batch, + markMoving, maxSearchTries, + &candidates, &candidateHandles, + &mmContainer](auto&& itr) { + + unsigned int searchTries = 0; + if (!itr) { + ++searchTries; + return; + } + + while ((config_.evictionSearchTries == 0 || + maxSearchTries > searchTries) && + itr && candidates.size() < batch) { + ++searchTries; + auto* toRecycle_ = itr.get(); + bool chainedItem_ = toRecycle_->isChainedItem(); + + if (chainedItem_) { + ++itr; + continue; + } + Item* candidate_; + WriteHandle candidateHandle_; + Item* syncItem_; + //sync on the parent item for chained items to move to next tier + candidate_ = toRecycle_; + syncItem_ = toRecycle_; + + bool marked = false; + if (markMoving) { + marked = syncItem_->markMoving(); + } else if (!markMoving) { + //we use item handle as sync point - for background eviction + auto hdl = acquire(candidate_); + if (hdl && hdl->getRefCount() == 1) { + marked = true; + candidateHandle_ = std::move(hdl); + } + } + if (!marked) { + ++itr; + continue; + } + XDCHECK(!chainedItem_); + mmContainer.remove(itr); + candidates.push_back(candidate_); + candidateHandles.push_back(std::move(candidateHandle_)); + } + }; + + mmContainer.withPromotionIterator(iterateAndMark); + + if (candidates.size() < batch) { + unsigned int toErase = batch - candidates.size(); + for (int i = 0; i < toErase; i++) { + allocator_[tid-1]->free(blankAllocs.back()); + blankAllocs.pop_back(); + } + if (candidates.size() == 0) { + return candidates; + } + } + + //1. get and item handle from a new allocation + for (int i = 0; i < candidates.size(); i++) { + Item *candidate = candidates[i]; + WriteHandle newItemHdl = acquire(new (blankAllocs[i]) + Item(candidate->getKey(), candidate->getSize(), + candidate->getCreationTime(), candidate->getExpiryTime())); + XDCHECK(newItemHdl); + if (newItemHdl) { + newItemHdl.markNascent(); + (*stats_.fragmentationSize)[tid][pid][cid].add( + util::getFragmentation(*this, *newItemHdl)); + newAllocs.push_back(newItemHdl.getInternal()); + newHandles.push_back(std::move(newItemHdl)); + } else { + //failed to get item handle + throw std::runtime_error( + folly::sformat("Was not to acquire new alloc, failed alloc {}", blankAllocs[i])); + } + } + //2. 
add in batch to mmContainer + auto& newMMContainer = getMMContainer(tid-1, pid, cid); + uint32_t added = newMMContainer.addBatch(newAllocs.begin(), newAllocs.end()); + XDCHECK_EQ(added,newAllocs.size()); + if (added != newAllocs.size()) { + throw std::runtime_error( + folly::sformat("Was not able to add all new items, failed item {} and handle {}", + newAllocs[added]->toString(),newHandles[added]->toString())); + } + //3. copy item data - don't need to add in mmContainer + for (int i = 0; i < candidates.size(); i++) { + Item *candidate = candidates[i]; + WriteHandle newHandle = std::move(newHandles[i]); + bool moved = moveRegularItem(*candidate,newHandle, true, true); + if (moved) { + XDCHECK(candidate->getKey() == newHandle->getKey()); + if (markMoving) { + auto ref = candidate->unmarkMoving(); + XDCHECK_EQ(ref,0); + wakeUpWaiters(candidate->getKey(), std::move(newHandle)); + const auto res = + releaseBackToAllocator(*candidate, RemoveContext::kNormal, false); + XDCHECK(res == ReleaseRes::kReleased); + } + } else { + typename NvmCacheT::PutToken token{}; + + removeFromMMContainer(*newAllocs[i]); + auto ret = handleFailedMove(candidate,token,false,markMoving); + XDCHECK(ret); + if (markMoving && candidate->getRefCountAndFlagsRaw() == 0) { + const auto res = + releaseBackToAllocator(*candidate, RemoveContext::kNormal, false); + XDCHECK(res == ReleaseRes::kReleased); + } + + } + } + return candidates; +} + +template +std::vector::EvictionData> +CacheAllocator::getNextCandidates(TierId tid, + PoolId pid, + ClassId cid, + unsigned int batch, + bool markMoving, + bool fromBgThread) { + + std::vector blankAllocs; + std::vector newAllocs; + std::vector newHandles; + std::vector evictionData; + evictionData.reserve(batch); + newAllocs.reserve(batch); + newHandles.reserve(batch); + + auto& mmContainer = getMMContainer(tid, pid, cid); + bool lastTier = tid+1 >= getNumTiers(); + unsigned int maxSearchTries = std::max(config_.evictionSearchTries, + batch*4); + if (!lastTier) { + blankAllocs = allocateInternalTierByCidBatch(tid+1,pid,cid,batch); + if (blankAllocs.empty()) { + return evictionData; + } else if (blankAllocs.size() != batch) { + batch = blankAllocs.size(); + } + XDCHECK_EQ(blankAllocs.size(),batch); + } + + auto iterateAndMark = [this, tid, pid, cid, batch, + markMoving, lastTier, maxSearchTries, + &evictionData, &mmContainer](auto&& itr) { + unsigned int searchTries = 0; + if (!itr) { + ++searchTries; + (*stats_.evictionAttempts)[tid][pid][cid].inc(); + return; + } + + while ((config_.evictionSearchTries == 0 || + maxSearchTries > searchTries) && + itr && evictionData.size() < batch) { + ++searchTries; + (*stats_.evictionAttempts)[tid][pid][cid].inc(); + + auto* toRecycle_ = itr.get(); + bool chainedItem_ = toRecycle_->isChainedItem(); + Item* toRecycleParent_ = chainedItem_ + ? &toRecycle_->asChainedItem().getParentItem(compressor_) + : nullptr; + if (toRecycle_->isExpired()) { + ++itr; + continue; + } + // in order to safely check if the expected parent (toRecycleParent_) matches + // the current parent on the chained item, we need to take the chained + // item lock so we are sure that nobody else will be editing the chain + auto l_ = chainedItem_ + ? 
chainedItemLocks_.tryLockExclusive(toRecycleParent_->getKey()) + : decltype(chainedItemLocks_.tryLockExclusive(toRecycle_->getKey()))(); + + if (chainedItem_ && + ( !l_ || &toRecycle_->asChainedItem().getParentItem(compressor_) + != toRecycleParent_) ) { + ++itr; + continue; + } + Item* candidate_; + WriteHandle candidateHandle_; + Item* syncItem_; + //sync on the parent item for chained items to move to next tier + if (!lastTier && chainedItem_) { + syncItem_ = toRecycleParent_; + candidate_ = toRecycle_; + } else if (lastTier && chainedItem_) { + candidate_ = toRecycleParent_; + syncItem_ = toRecycleParent_; + } else { + candidate_ = toRecycle_; + syncItem_ = toRecycle_; + } + // if it's last tier, the item will be evicted + // need to create put token before marking it exclusive + const bool evictToNvmCache = lastTier && shouldWriteToNvmCache(*candidate_); + + auto token_ = evictToNvmCache + ? nvmCache_->createPutToken(candidate_->getKey()) + : typename NvmCacheT::PutToken{}; + + if (evictToNvmCache && !token_.isValid()) { + stats_.evictFailConcurrentFill.inc(); + ++itr; + continue; + } + bool marked = false; + //case 1: mark the item for eviction + if ((lastTier || candidate_->isExpired()) && markMoving) { + marked = syncItem_->markForEviction(); + } else if (markMoving) { + marked = syncItem_->markMoving(); + } else if (!markMoving) { + //we use item handle as sync point - for background eviction + auto hdl = acquire(candidate_); + if (hdl && hdl->getRefCount() == 1) { + marked = true; + candidateHandle_ = std::move(hdl); + } + } + if (!marked) { + if (candidate_->hasChainedItem()) { + stats_.evictFailParentAC.inc(); + } else { + stats_.evictFailAC.inc(); + } + ++itr; + continue; + } + + if (chainedItem_) { + XDCHECK(l_); + XDCHECK_EQ(toRecycleParent_,&toRecycle_->asChainedItem().getParentItem(compressor_)); + } + mmContainer.remove(itr); + EvictionData ed(candidate_,toRecycle_,toRecycleParent_,chainedItem_, + candidate_->isExpired(), std::move(token_), std::move(candidateHandle_)); + evictionData.push_back(std::move(ed)); + } + }; + + mmContainer.withEvictionIterator(iterateAndMark); + + if (evictionData.size() < batch) { + if (!lastTier) { + unsigned int toErase = batch - evictionData.size(); + for (int i = 0; i < toErase; i++) { + allocator_[tid+1]->free(blankAllocs.back()); + blankAllocs.pop_back(); + } + } + if (evictionData.size() == 0) { + return evictionData; + } + } + + if (!lastTier) { + //1. get and item handle from a new allocation + for (int i = 0; i < evictionData.size(); i++) { + Item *candidate = evictionData[i].candidate; + WriteHandle newItemHdl = acquire(new (blankAllocs[i]) + Item(candidate->getKey(), candidate->getSize(), + candidate->getCreationTime(), candidate->getExpiryTime())); + XDCHECK(newItemHdl); + if (newItemHdl) { + newItemHdl.markNascent(); + (*stats_.fragmentationSize)[tid][pid][cid].add( + util::getFragmentation(*this, *newItemHdl)); + newAllocs.push_back(newItemHdl.getInternal()); + newHandles.push_back(std::move(newItemHdl)); + } else { + //failed to get item handle + throw std::runtime_error( + folly::sformat("Was not to acquire new alloc, failed alloc {}", blankAllocs[i])); + } + } + //2. 
add in batch to mmContainer
+    auto& newMMContainer = getMMContainer(tid + 1, pid, cid);
+    uint32_t added = newMMContainer.addBatch(newAllocs.begin(), newAllocs.end());
+    XDCHECK_EQ(added, newAllocs.size());
+    if (added != newAllocs.size()) {
+      throw std::runtime_error(
+          folly::sformat("Was not able to add all new items, failed item {} and handle {}",
+                         newAllocs[added]->toString(), newHandles[added]->toString()));
+    }
+    //3. copy item data - don't need to add in mmContainer
+    for (int i = 0; i < evictionData.size(); i++) {
+      Item* candidate = evictionData[i].candidate;
+      WriteHandle newHandle = std::move(newHandles[i]);
+      bool moved = moveRegularItem(*candidate, newHandle, true, true);
+      if (moved) {
+        (*stats_.numWritebacks)[tid][pid][cid].inc();
+        XDCHECK(candidate->getKey() == newHandle->getKey());
+        if (markMoving) {
+          auto ref = candidate->unmarkMoving();
+          XDCHECK_EQ(ref, 0);
+          wakeUpWaiters(candidate->getKey(), std::move(newHandle));
+          if (fromBgThread) {
+            const auto res =
+                releaseBackToAllocator(*candidate, RemoveContext::kNormal, false);
+            XDCHECK(res == ReleaseRes::kReleased);
+          }
+        }
+      } else {
+        typename NvmCacheT::PutToken token = std::move(evictionData[i].token);
+        removeFromMMContainer(*newAllocs[i]);
+        auto ret = handleFailedMove(candidate, token, evictionData[i].expired, markMoving);
+        XDCHECK(ret);
+        if (fromBgThread && markMoving) {
+          const auto res =
+              releaseBackToAllocator(*candidate, RemoveContext::kNormal, false);
+          XDCHECK(res == ReleaseRes::kReleased);
+        }
+      }
+    }
+  } else {
+    // we are the last tier - just remove
+    for (int i = 0; i < evictionData.size(); i++) {
+      Item* candidate = evictionData[i].candidate;
+      typename NvmCacheT::PutToken token = std::move(evictionData[i].token);
+      auto ret = handleFailedMove(candidate, token, evictionData[i].expired, markMoving);
+      if (fromBgThread && markMoving) {
+        const auto res =
+            releaseBackToAllocator(*candidate, RemoveContext::kNormal, false);
+        XDCHECK(res == ReleaseRes::kReleased);
+      }
+    }
+  }
+
+  return evictionData;
+}
+
+//
+// Common helper for when a move between tiers fails during eviction.
+//
+// If insertOrReplace was called during the move, the candidate will no
+// longer be accessible (the replace failed inside tryEvict); that is why
+// the move to the next tier failed, and insertOrReplace will remove the
+// item from the NVM cache. If the candidate is still accessible, the
+// allocation in the next tier failed instead, so we carry on and evict
+// the item to the NVM cache.
+template <typename CacheTrait>
+bool CacheAllocator<CacheTrait>::handleFailedMove(Item* candidate,
+                                                  typename NvmCacheT::PutToken& token,
+                                                  bool isExpired,
+                                                  bool markMoving) {
+  bool failedToReplace = !candidate->isAccessible();
+  if (!token.isValid() && !failedToReplace) {
+    token = createPutToken(*candidate);
+  }
+  // in case that we are on the last tier, we would have already marked the
+  // item as exclusive since we will not be moving it to the next tier but
+  // rather just evicting it altogether; no need to
+  // markForEvictionWhenMoving
+  if (markMoving) {
+    if (!candidate->isMarkedForEviction() &&
+        candidate->isMoving()) {
+      auto ret = (isExpired) ?
true : candidate->markForEvictionWhenMoving(); + XDCHECK(ret); + } + unlinkItemForEviction(*candidate); + } else if (candidate->isAccessible()) { + accessContainer_->remove(*candidate); + } + + if (token.isValid() && shouldWriteToNvmCacheExclusive(*candidate) + && !failedToReplace) { + nvmCache_->put(*candidate, std::move(token)); + } + // wake up any readers that wait for the move to complete + // it's safe to do now, as we have the item marked exclusive and + // no other reader can be added to the waiters list + if (markMoving) { + wakeUpWaiters(candidate->getKey(), {}); + } + return true; +} + template std::pair::Item*, typename CacheAllocator::Item*> -CacheAllocator::getNextCandidate(PoolId pid, +CacheAllocator::getNextCandidate(TierId tid, + PoolId pid, ClassId cid, unsigned int& searchTries) { typename NvmCacheT::PutToken token; Item* toRecycle = nullptr; + Item* toRecycleParent = nullptr; Item* candidate = nullptr; - auto& mmContainer = getMMContainer(pid, cid); - - mmContainer.withEvictionIterator([this, pid, cid, &candidate, &toRecycle, - &searchTries, &mmContainer, - &token](auto&& itr) { + bool isExpired = false; + bool chainedItem = false; + auto& mmContainer = getMMContainer(tid, pid, cid); + bool lastTier = tid+1 >= getNumTiers() || config_.noOnlineEviction; + + mmContainer.withEvictionIterator([this, tid, pid, cid, &candidate, + &toRecycle, &toRecycleParent, + &chainedItem, + &searchTries, &mmContainer, &lastTier, + &isExpired, &token](auto&& itr) { if (!itr) { ++searchTries; - (*stats_.evictionAttempts)[pid][cid].inc(); + (*stats_.evictionAttempts)[tid][pid][cid].inc(); return; } @@ -3655,26 +4494,57 @@ CacheAllocator::getNextCandidate(PoolId pid, config_.evictionSearchTries > searchTries) && itr) { ++searchTries; - (*stats_.evictionAttempts)[pid][cid].inc(); + (*stats_.evictionAttempts)[tid][pid][cid].inc(); auto* toRecycle_ = itr.get(); - auto* candidate_ = - toRecycle_->isChainedItem() + bool chainedItem_ = toRecycle_->isChainedItem(); + Item* toRecycleParent_ = chainedItem_ ? &toRecycle_->asChainedItem().getParentItem(compressor_) - : toRecycle_; - - const bool evictToNvmCache = shouldWriteToNvmCache(*candidate_); - auto putToken = evictToNvmCache - ? nvmCache_->createPutToken(candidate_->getKey()) - : typename NvmCacheT::PutToken{}; - - if (evictToNvmCache && !putToken.isValid()) { + : nullptr; + // in order to safely check if the expected parent (toRecycleParent_) matches + // the current parent on the chained item, we need to take the chained + // item lock so we are sure that nobody else will be editing the chain + auto l_ = chainedItem_ + ? chainedItemLocks_.tryLockExclusive(toRecycleParent_->getKey()) + : decltype(chainedItemLocks_.tryLockExclusive(toRecycle_->getKey()))(); + + if (chainedItem_ && + ( !l_ || &toRecycle_->asChainedItem().getParentItem(compressor_) + != toRecycleParent_) ) { + // Fail moving if we either couldn't acquire the chained item lock, + // or if the parent had already been replaced in the meanwhile. 
+ ++itr; + continue; + } + Item* candidate_; + Item* syncItem_; + //sync on the parent item for chained items to move to next tier + if (!lastTier && chainedItem_) { + syncItem_ = toRecycleParent_; + candidate_ = toRecycle_; + } else if (lastTier && chainedItem_) { + candidate_ = toRecycleParent_; + syncItem_ = toRecycleParent_; + } else { + candidate_ = toRecycle_; + syncItem_ = toRecycle_; + } + // if it's last tier, the item will be evicted + // need to create put token before marking it exclusive + const bool evictToNvmCache = lastTier && shouldWriteToNvmCache(*candidate_); + auto token_ = evictToNvmCache + ? nvmCache_->createPutToken(candidate_->getKey()) + : typename NvmCacheT::PutToken{}; + + if (evictToNvmCache && !token_.isValid()) { stats_.evictFailConcurrentFill.inc(); ++itr; continue; } - auto markedForEviction = candidate_->markForEviction(); + auto markedForEviction = (lastTier || candidate_->isExpired()) ? + syncItem_->markForEviction() : + syncItem_->markMoving(); if (!markedForEviction) { if (candidate_->hasChainedItem()) { stats_.evictFailParentAC.inc(); @@ -3685,20 +4555,21 @@ CacheAllocator::getNextCandidate(PoolId pid, continue; } + XDCHECK(syncItem_->isMoving() || syncItem_->isMarkedForEviction()); + toRecycleParent = toRecycleParent_; + chainedItem = chainedItem_; // markForEviction to make sure no other thead is evicting the item - // nor holding a handle to that item + // nor holding a handle to that item if this is last tier + // since we won't be moving the item to the next tier toRecycle = toRecycle_; candidate = candidate_; - token = std::move(putToken); - - // Check if parent changed for chained items - if yes, we cannot - // remove the child from the mmContainer as we will not be evicting - // it. We could abort right here, but we need to cleanup in case - // unmarkForEviction() returns 0 - so just go through normal path. - if (!toRecycle_->isChainedItem() || - &toRecycle->asChainedItem().getParentItem(compressor_) == candidate) { - mmContainer.remove(itr); + isExpired = candidate_->isExpired(); + token = std::move(token_); + if (chainedItem) { + XDCHECK(l_); + XDCHECK_EQ(toRecycleParent,&toRecycle_->asChainedItem().getParentItem(compressor_)); } + mmContainer.remove(itr); return; } }); @@ -3709,25 +4580,72 @@ CacheAllocator::getNextCandidate(PoolId pid, XDCHECK(toRecycle); XDCHECK(candidate); - XDCHECK(candidate->isMarkedForEviction()); - unlinkItemForEviction(*candidate); - - if (token.isValid() && shouldWriteToNvmCacheExclusive(*candidate)) { - nvmCache_->put(*candidate, std::move(token)); + auto evictedToNext = (lastTier || isExpired) ? 
nullptr
+          : tryEvictToNextMemoryTier(*candidate);
+  if (!evictedToNext) {
+    // failed to move a chained item - so evict the entire chain
+    if (candidate->isChainedItem()) {
+      // candidate should be parent now
+      XDCHECK(toRecycleParent->isMoving());
+      XDCHECK_EQ(candidate, toRecycle);
+      candidate = toRecycleParent; // but now we evict the chain and in
+                                   // doing so recycle the child
+    }
+    // clean up and evict the candidate since we failed
+    auto ret = handleFailedMove(candidate, token, isExpired, true);
+    XDCHECK(ret);
+  } else {
+    XDCHECK(!evictedToNext->isMarkedForEviction() && !evictedToNext->isMoving());
+    XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving());
+    XDCHECK(!candidate->isAccessible());
+    XDCHECK(candidate->getKey() == evictedToNext->getKey());
+
+    (*stats_.numWritebacks)[tid][pid][cid].inc();
+    if (chainedItem) {
+      XDCHECK(toRecycleParent->isMoving());
+      XDCHECK_EQ(evictedToNext->getRefCount(), 2u);
+      (*stats_.chainedItemEvictions)[tid][pid][cid].inc();
+      // check if by releasing the item we intend to, we actually
+      // recycle the candidate.
+      auto ret = releaseBackToAllocator(*candidate, RemoveContext::kEviction,
+                                        /* isNascent */ false, toRecycle);
+      XDCHECK_EQ(ret, ReleaseRes::kRecycled);
+      evictedToNext.reset(); // once we unmark moving, threads will try and
+                             // alloc; drop the handle now - the refcount
+                             // will drop to 1
+      auto ref = toRecycleParent->unmarkMoving();
+      if (UNLIKELY(ref == 0)) {
+        wakeUpWaiters(toRecycleParent->getKey(), {});
+        const auto res =
+            releaseBackToAllocator(*toRecycleParent, RemoveContext::kNormal, false);
+        XDCHECK(res == ReleaseRes::kReleased);
+      } else {
+        auto parentHandle = acquire(toRecycleParent);
+        if (parentHandle) {
+          wakeUpWaiters(toRecycleParent->getKey(), std::move(parentHandle));
+        } // in case the parent handle is null, some other thread will have
+          // called wakeUpWaiters with a null handle and released the
+          // parent back to the allocator
+      }
+    } else {
+      wakeUpWaiters(candidate->getKey(), std::move(evictedToNext));
+    }
  }
+
+  XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving());
+
  return {candidate, toRecycle};
}

template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::Item*
-CacheAllocator<CacheTrait>::findEviction(PoolId pid, ClassId cid) {
+CacheAllocator<CacheTrait>::findEviction(TierId tid, PoolId pid, ClassId cid) {
  // Keep searching for a candidate until we were able to evict it
  // or until the search limit has been exhausted
  unsigned int searchTries = 0;
  while (config_.evictionSearchTries == 0 ||
         config_.evictionSearchTries > searchTries) {
-    auto [candidate, toRecycle] = getNextCandidate(pid, cid, searchTries);
+    auto [candidate, toRecycle] = getNextCandidate(tid, pid, cid, searchTries);

    // Reached the end of the eviction queue but couldn't find a candidate,
    // start again.
@@ -3738,9 +4656,9 @@ CacheAllocator<CacheTrait>::findEviction(PoolId pid, ClassId cid) {
    // recycle the item. it's safe to do so, even if toReleaseHandle was
    // NULL. If `ref` == 0 then it means that we are the last holder of
    // that item.
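    // count the eviction against the tier/pool/class the candidate came from.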
if (candidate->hasChainedItem()) { - (*stats_.chainedItemEvictions)[pid][cid].inc(); + (*stats_.chainedItemEvictions)[tid][pid][cid].inc(); } else { - (*stats_.regularItemEvictions)[pid][cid].inc(); + (*stats_.regularItemEvictions)[tid][pid][cid].inc(); } if (auto eventTracker = getEventTracker()) { @@ -3808,6 +4726,70 @@ bool CacheAllocator::shouldWriteToNvmCacheExclusive( return true; } +template +typename CacheAllocator::WriteHandle +CacheAllocator::tryEvictToNextMemoryTier( + TierId tid, PoolId pid, Item& item) { + + TierId nextTier = tid; // TODO - calculate this based on some admission policy + while (++nextTier < getNumTiers()) { // try to evict down to the next memory tiers + // always evict item from the nextTier to make room for new item + bool evict = true; + // allocateInternal might trigger another eviction + WriteHandle newItemHdl{}; + Item* parentItem; + bool chainedItem = false; + if(item.isChainedItem()) { + chainedItem = true; + parentItem = &item.asChainedItem().getParentItem(compressor_); + XDCHECK(parentItem->isMoving()); + XDCHECK(item.isChainedItem() && item.getRefCount() == 1); + XDCHECK_EQ(0, parentItem->getRefCount()); + newItemHdl = allocateChainedItemInternalTier(*parentItem, + item.getSize(), + nextTier); + } else { + // this assert can fail if parent changed + XDCHECK(item.isMoving()); + XDCHECK(item.getRefCount() == 0); + newItemHdl = allocateInternalTier(nextTier, pid, + item.getKey(), + item.getSize(), + item.getCreationTime(), + item.getExpiryTime(), + evict); + } + + if (newItemHdl) { + bool moveSuccess = chainedItem + ? moveChainedItem(item.asChainedItem(), newItemHdl) + : moveRegularItem(item, newItemHdl, + /* skipAddInMMContainer */ false, /* fromBgThread*/ false); + if (!moveSuccess) { + return WriteHandle{}; + } + XDCHECK_EQ(newItemHdl->getSize(), item.getSize()); + if (!chainedItem) { // TODO: do we need it? 
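+        // regular items can drop the moving bit here since the move is
+        // complete; for chained items the parent stays marked until the
+        // caller finishes the chain handover.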
+ XDCHECK_EQ(newItemHdl->getKey(),item.getKey()); + item.unmarkMoving(); + } + return newItemHdl; + } else { + return WriteHandle{}; + } + } + + return {}; +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::tryEvictToNextMemoryTier(Item& item) { + auto tid = getTierId(item); + auto pid = allocator_[tid]->getAllocInfo(item.getMemory()).poolId; + return tryEvictToNextMemoryTier(tid, pid, item); +} + template typename CacheAllocator::RemoveRes CacheAllocator::remove(typename Item::Key key) { @@ -4008,21 +4990,57 @@ void CacheAllocator::invalidateNvm(Item& item) { } } +template +TierId +CacheAllocator::getTierId(const Item& item) const { + return getTierId(item.getMemory()); +} + +template +TierId +CacheAllocator::getTierId(const void* ptr) const { + for (TierId tid = 0; tid < getNumTiers(); tid++) { + if (allocator_[tid]->isMemoryInAllocator(ptr)) + return tid; + } + + throw std::invalid_argument("Item does not belong to any tier!"); +} + template typename CacheAllocator::MMContainer& CacheAllocator::getMMContainer(const Item& item) const noexcept { + const auto tid = getTierId(item); const auto allocInfo = - allocator_->getAllocInfo(static_cast(&item)); - return getMMContainer(allocInfo.poolId, allocInfo.classId); + allocator_[tid]->getAllocInfo(static_cast(&item)); + return getMMContainer(tid, allocInfo.poolId, allocInfo.classId); } template typename CacheAllocator::MMContainer& -CacheAllocator::getMMContainer(PoolId pid, +CacheAllocator::getMMContainer(TierId tid, + PoolId pid, ClassId cid) const noexcept { - XDCHECK_LT(static_cast(pid), mmContainers_.size()); - XDCHECK_LT(static_cast(cid), mmContainers_[pid].size()); - return *mmContainers_[pid][cid]; + XDCHECK_LT(static_cast(tid), mmContainers_.size()); + XDCHECK_LT(static_cast(pid), mmContainers_[tid].size()); + XDCHECK_LT(static_cast(cid), mmContainers_[tid][pid].size()); + return *mmContainers_[tid][pid][cid]; +} + +template +MMContainerStat CacheAllocator::getMMContainerStat( + TierId tid, PoolId pid, ClassId cid) const noexcept { + if(static_cast(tid) >= mmContainers_.size()) { + return MMContainerStat{}; + } + if (static_cast(pid) >= mmContainers_[tid].size()) { + return MMContainerStat{}; + } + if (static_cast(cid) >= mmContainers_[tid][pid].size()) { + return MMContainerStat{}; + } + return mmContainers_[tid][pid][cid] ? mmContainers_[tid][pid][cid]->getStats() + : MMContainerStat{}; } template @@ -4211,23 +5229,25 @@ void CacheAllocator::markUseful(const ReadHandle& handle, template bool CacheAllocator::recordAccessInMMContainer(Item& item, AccessMode mode) { + const auto tid = getTierId(item); const auto allocInfo = - allocator_->getAllocInfo(static_cast(&item)); - (*stats_.cacheHits)[allocInfo.poolId][allocInfo.classId].inc(); + allocator_[tid]->getAllocInfo(static_cast(&item)); + (*stats_.cacheHits)[tid][allocInfo.poolId][allocInfo.classId].inc(); // track recently accessed items if needed if (UNLIKELY(config_.trackRecentItemsForDump)) { ring_->trackItem(reinterpret_cast(&item), item.getSize()); } - auto& mmContainer = getMMContainer(allocInfo.poolId, allocInfo.classId); + auto& mmContainer = getMMContainer(tid, allocInfo.poolId, allocInfo.classId); return mmContainer.recordAccess(item, mode); } template uint32_t CacheAllocator::getUsableSize(const Item& item) const { + const auto tid = getTierId(item); const auto allocSize = - allocator_->getAllocInfo(static_cast(&item)).allocSize; + allocator_[tid]->getAllocInfo(static_cast(&item)).allocSize; return item.isChainedItem() ? 
allocSize - ChainedItem::getRequiredSize(0) : allocSize - Item::getRequiredSize(item.getKey(), 0); @@ -4236,8 +5256,9 @@ uint32_t CacheAllocator::getUsableSize(const Item& item) const { template typename CacheAllocator::SampleItem CacheAllocator::getSampleItem() { + auto tid = folly::Random::rand32() % getNumTiers(); size_t nvmCacheSize = nvmCache_ ? nvmCache_->getUsableSize() : 0; - size_t ramCacheSize = allocator_->getMemorySizeInclAdvised(); + size_t ramCacheSize = allocator_[tid]->getMemorySizeInclAdvised(); bool fromNvm = folly::Random::rand64(0, nvmCacheSize + ramCacheSize) >= ramCacheSize; @@ -4246,19 +5267,18 @@ CacheAllocator::getSampleItem() { } // Sampling from DRAM cache - auto item = reinterpret_cast(allocator_->getRandomAlloc()); + auto item = reinterpret_cast(allocator_[tid]->getRandomAlloc()); if (!item || UNLIKELY(item->isExpired())) { return SampleItem{false /* fromNvm */}; } // Check that item returned is the same that was sampled - auto sharedHdl = std::make_shared(findInternal(item->getKey())); if (sharedHdl->get() != item) { return SampleItem{false /* fromNvm */}; } - const auto allocInfo = allocator_->getAllocInfo(item->getMemory()); + const auto allocInfo = allocator_[tid]->getAllocInfo(item->getMemory()); // Convert the Item to IOBuf to make SampleItem auto iobuf = folly::IOBuf{ @@ -4282,21 +5302,27 @@ std::vector CacheAllocator::dumpEvictionIterator( return {}; } - if (static_cast(pid) >= mmContainers_.size() || - static_cast(cid) >= mmContainers_[pid].size()) { + // Always evict from the lowest layer. + int tid = getNumTiers() - 1; + if (static_cast(tid) >= mmContainers_.size() || + static_cast(pid) >= mmContainers_[tid].size() || + static_cast(cid) >= mmContainers_[tid][pid].size()) { throw std::invalid_argument( - folly::sformat("Invalid PoolId: {} and ClassId: {}.", pid, cid)); + folly::sformat("Invalid TierId: {} and PoolId: {} and ClassId: {}.", tid, pid, cid)); } std::vector content; - auto& mm = *mmContainers_[pid][cid]; - auto evictItr = mm.getEvictionIterator(); size_t i = 0; - while (evictItr && i < numItems) { - content.push_back(evictItr->toString()); - ++evictItr; - ++i; + while (i < numItems && tid >= 0) { + auto& mm = *mmContainers_[tid][pid][cid]; + mm.withEvictionIterator([&content, numItems](auto&& itr) { + while (itr && content.size() < numItems) { + content.push_back(itr->toString()); + ++itr; + } + }); + --tid; } return content; @@ -4474,14 +5500,34 @@ PoolId CacheAllocator::addPool( std::shared_ptr resizeStrategy, bool ensureProvisionable) { std::unique_lock w(poolsResizeAndRebalanceLock_); - auto pid = allocator_->addPool(name, size, allocSizes, ensureProvisionable); + + PoolId pid = 0; + size_t totalCacheSize = 0; + + for (TierId tid = 0; tid < getNumTiers(); tid++) { + totalCacheSize += allocator_[tid]->getMemorySize(); + } + + for (TierId tid = 0; tid < getNumTiers(); tid++) { + auto tierSizeRatio = + static_cast(allocator_[tid]->getMemorySize()) / totalCacheSize; + size_t tierPoolSize = static_cast(tierSizeRatio * size); + + // TODO: what if we manage to add pool only in one tier? 
+ // we should probably remove that on failure + auto res = allocator_[tid]->addPool( + name, tierPoolSize, allocSizes, ensureProvisionable); + XDCHECK(tid == 0 || res == pid); + pid = res; + } + createMMContainers(pid, std::move(config)); setRebalanceStrategy(pid, std::move(rebalanceStrategy)); setResizeStrategy(pid, std::move(resizeStrategy)); if (backgroundEvictor_.size()) { auto memoryAssignments = - createBgWorkerMemoryAssignments(backgroundEvictor_.size()); + createBgWorkerMemoryAssignments(backgroundEvictor_.size(), 0); for (size_t id = 0; id < backgroundEvictor_.size(); id++) backgroundEvictor_[id]->setAssignedMemory( std::move(memoryAssignments[id])); @@ -4489,7 +5535,7 @@ PoolId CacheAllocator::addPool( if (backgroundPromoter_.size()) { auto memoryAssignments = - createBgWorkerMemoryAssignments(backgroundPromoter_.size()); + createBgWorkerMemoryAssignments(backgroundPromoter_.size(), 1); for (size_t id = 0; id < backgroundPromoter_.size(); id++) backgroundPromoter_[id]->setAssignedMemory( std::move(memoryAssignments[id])); @@ -4501,9 +5547,9 @@ PoolId CacheAllocator::addPool( template void CacheAllocator::overridePoolRebalanceStrategy( PoolId pid, std::shared_ptr rebalanceStrategy) { - if (static_cast(pid) >= mmContainers_.size()) { + if (static_cast(pid) >= mmContainers_[0].size()) { throw std::invalid_argument(folly::sformat( - "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size())); + "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_[0].size())); } setRebalanceStrategy(pid, std::move(rebalanceStrategy)); } @@ -4511,9 +5557,9 @@ void CacheAllocator::overridePoolRebalanceStrategy( template void CacheAllocator::overridePoolResizeStrategy( PoolId pid, std::shared_ptr resizeStrategy) { - if (static_cast(pid) >= mmContainers_.size()) { + if (static_cast(pid) >= mmContainers_[0].size()) { throw std::invalid_argument(folly::sformat( - "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size())); + "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_[0].size())); } setResizeStrategy(pid, std::move(resizeStrategy)); } @@ -4525,14 +5571,14 @@ void CacheAllocator::overridePoolOptimizeStrategy( } template -void CacheAllocator::overridePoolConfig(PoolId pid, +void CacheAllocator::overridePoolConfig(TierId tid, PoolId pid, const MMConfig& config) { - if (static_cast(pid) >= mmContainers_.size()) { + if (static_cast(pid) >= mmContainers_[tid].size()) { throw std::invalid_argument(folly::sformat( - "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size())); + "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_[tid].size())); } - auto& pool = allocator_->getPool(pid); + auto& pool = allocator_[tid]->getPool(pid); for (unsigned int cid = 0; cid < pool.getNumClassId(); ++cid) { MMConfig mmConfig = config; mmConfig.addExtraConfig( @@ -4540,29 +5586,33 @@ void CacheAllocator::overridePoolConfig(PoolId pid, ? pool.getAllocationClass(static_cast(cid)) .getAllocsPerSlab() : 0); - DCHECK_NOTNULL(mmContainers_[pid][cid].get()); - mmContainers_[pid][cid]->setConfig(mmConfig); + DCHECK_NOTNULL(mmContainers_[tid][pid][cid].get()); + mmContainers_[tid][pid][cid]->setConfig(mmConfig); } } template void CacheAllocator::createMMContainers(const PoolId pid, MMConfig config) { - auto& pool = allocator_->getPool(pid); + // pools on each layer should have the same number of class id, etc. + auto& pool = allocator_[0]->getPool(pid); for (unsigned int cid = 0; cid < pool.getNumClassId(); ++cid) { config.addExtraConfig( config_.trackTailHits ? 
pool.getAllocationClass(static_cast(cid)) .getAllocsPerSlab() : 0); - mmContainers_[pid][cid].reset(new MMContainer(config, compressor_)); + for (TierId tid = 0; tid < getNumTiers(); tid++) { + mmContainers_[tid][pid][cid].reset(new MMContainer(config, compressor_)); + } } } template PoolId CacheAllocator::getPoolId( folly::StringPiece name) const noexcept { - return allocator_->getPoolId(name.str()); + // each tier has the same pools + return allocator_[0]->getPoolId(name.str()); } // The Function returns a consolidated vector of Release Slab @@ -4605,7 +5655,9 @@ std::set CacheAllocator::filterCompactCachePools( template std::set CacheAllocator::getRegularPoolIds() const { std::shared_lock r(poolsResizeAndRebalanceLock_); - return filterCompactCachePools(allocator_->getPoolIds()); + // TODO - get rid of the duplication - right now, each tier + // holds pool objects with mostly the same info + return filterCompactCachePools(allocator_[0]->getPoolIds()); } template @@ -4630,10 +5682,9 @@ std::set CacheAllocator::getRegularPoolIdsForResize() // getAdvisedMemorySize - then pools may be overLimit even when // all slabs are not allocated. Otherwise, pools may be overLimit // only after all slabs are allocated. - // - return (allocator_->allSlabsAllocated()) || - (allocator_->getAdvisedMemorySize() != 0) - ? filterCompactCachePools(allocator_->getPoolsOverLimit()) + return (allocator_[0]->allSlabsAllocated()) || + (allocator_[0]->getAdvisedMemorySize() != 0) + ? filterCompactCachePools(allocator_[0]->getPoolsOverLimit()) : std::set{}; } @@ -4642,9 +5693,19 @@ const std::string CacheAllocator::getCacheName() const { return config_.cacheName; } +template +size_t CacheAllocator::getPoolSize(PoolId poolId) const { + size_t poolSize = 0; + for (auto& allocator: allocator_) { + const auto& pool = allocator->getPool(poolId); + poolSize += pool.getPoolSize(); + } + return poolSize; +} + template PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { - const auto& pool = allocator_->getPool(poolId); + const auto& pool = allocator_[0]->getPool(poolId); const auto& allocSizes = pool.getAllocSizes(); auto mpStats = pool.getStats(); const auto& classIds = mpStats.classIds; @@ -4662,27 +5723,43 @@ PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { // TODO export evictions, numItems etc from compact cache directly. 
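  // Per-class stats below are summed across all tiers; use the
  // getPoolStats(tid, poolId) overload for a per-tier breakdown.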
  if (!isCompactCache) {
    for (const ClassId cid : classIds) {
-      uint64_t classHits = (*stats_.cacheHits)[poolId][cid].get();
-      XDCHECK(mmContainers_[poolId][cid],
-              folly::sformat("Pid {}, Cid {} not initialized.", poolId, cid));
+      uint64_t allocAttempts = 0, evictionAttempts = 0, allocFailures = 0,
+               fragmentationSize = 0, classHits = 0, chainedItemEvictions = 0,
+               regularItemEvictions = 0, numWritebacks = 0;
+      MMContainerStat mmContainerStats;
+      for (TierId tid = 0; tid < getNumTiers(); tid++) {
+        allocAttempts += (*stats_.allocAttempts)[tid][poolId][cid].get();
+        evictionAttempts += (*stats_.evictionAttempts)[tid][poolId][cid].get();
+        allocFailures += (*stats_.allocFailures)[tid][poolId][cid].get();
+        fragmentationSize += (*stats_.fragmentationSize)[tid][poolId][cid].get();
+        classHits += (*stats_.cacheHits)[tid][poolId][cid].get();
+        chainedItemEvictions += (*stats_.chainedItemEvictions)[tid][poolId][cid].get();
+        regularItemEvictions += (*stats_.regularItemEvictions)[tid][poolId][cid].get();
+        numWritebacks += (*stats_.numWritebacks)[tid][poolId][cid].get();
+        mmContainerStats += getMMContainerStat(tid, poolId, cid);
+        XDCHECK(mmContainers_[tid][poolId][cid],
+                folly::sformat("Tid {}, Pid {}, Cid {} not initialized.", tid, poolId, cid));
+      }
      cacheStats.insert(
          {cid,
-           {allocSizes[cid], (*stats_.allocAttempts)[poolId][cid].get(),
-            (*stats_.evictionAttempts)[poolId][cid].get(),
-            (*stats_.allocFailures)[poolId][cid].get(),
-            (*stats_.fragmentationSize)[poolId][cid].get(), classHits,
-            (*stats_.chainedItemEvictions)[poolId][cid].get(),
-            (*stats_.regularItemEvictions)[poolId][cid].get(),
-            mmContainers_[poolId][cid]->getStats()}
-
-          });
+           {allocSizes[cid],
+            allocAttempts,
+            evictionAttempts,
+            allocFailures,
+            fragmentationSize,
+            classHits,
+            chainedItemEvictions,
+            regularItemEvictions,
+            numWritebacks,
+            mmContainerStats}});
      totalHits += classHits;
    }
  }

  PoolStats ret;
  ret.isCompactCache = isCompactCache;
-  ret.poolName = allocator_->getPoolName(poolId);
+  // pool name is also shared among tiers
+  ret.poolName = allocator_[0]->getPoolName(poolId);
  ret.poolSize = pool.getPoolSize();
  ret.poolUsableSize = pool.getPoolUsableSize();
  ret.poolAdvisedSize = pool.getPoolAdvisedSize();
@@ -4694,17 +5771,84 @@ PoolStats CacheAllocator<CacheTrait>::getPoolStats(PoolId poolId) const {
  return ret;
}

+template <typename CacheTrait>
+PoolStats CacheAllocator<CacheTrait>::getPoolStats(TierId tid, PoolId poolId) const {
+  const auto& pool = allocator_[tid]->getPool(poolId);
+  const auto& allocSizes = pool.getAllocSizes();
+  auto mpStats = pool.getStats();
+  const auto& classIds = mpStats.classIds;
+
+  // check if this is a compact cache.
+  bool isCompactCache = false;
+  {
+    std::shared_lock lock(compactCachePoolsLock_);
+    isCompactCache = isCompactCachePool_[poolId];
+  }
+
+  folly::F14FastMap<ClassId, CacheStat> cacheStats;
+  uint64_t totalHits = 0;
+  // cacheStats is only meaningful for pools that are not compact caches.
+  // TODO export evictions, numItems etc from compact cache directly.
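+  // unlike getPoolStats(poolId), stats here are reported for a single tier
+  // rather than summed across tiers.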
+ if (!isCompactCache) { + for (const ClassId cid : classIds) { + uint64_t classHits = (*stats_.cacheHits)[tid][poolId][cid].get(); + XDCHECK(mmContainers_[tid][poolId][cid], + folly::sformat("Tid {}, Pid {}, Cid {} not initialized.", tid, poolId, cid)); + cacheStats.insert( + {cid, + {allocSizes[cid], + (*stats_.allocAttempts)[tid][poolId][cid].get(), + (*stats_.evictionAttempts)[tid][poolId][cid].get(), + (*stats_.allocFailures)[tid][poolId][cid].get(), + (*stats_.fragmentationSize)[tid][poolId][cid].get(), + classHits, + (*stats_.chainedItemEvictions)[tid][poolId][cid].get(), + (*stats_.regularItemEvictions)[tid][poolId][cid].get(), + (*stats_.numWritebacks)[tid][poolId][cid].get(), + getMMContainerStat(tid, poolId, cid)}}); + totalHits += classHits; + } + } + + PoolStats ret; + ret.isCompactCache = isCompactCache; + ret.poolName = allocator_[tid]->getPoolName(poolId); + ret.poolSize = pool.getPoolSize(); + ret.poolUsableSize = pool.getPoolUsableSize(); + ret.poolAdvisedSize = pool.getPoolAdvisedSize(); + ret.cacheStats = std::move(cacheStats); + ret.mpStats = std::move(mpStats); + ret.numPoolGetHits = totalHits; + ret.evictionAgeSecs = stats_.perPoolEvictionAgeSecs_[poolId].estimate(); + + return ret; +} + +template +ACStats CacheAllocator::getACStats(TierId tid, + PoolId poolId, + ClassId classId) const { + const auto& pool = allocator_[tid]->getPool(poolId); + const auto& ac = pool.getAllocationClass(classId); + auto stats = ac.getStats(); + stats.allocLatencyNs = (*stats_.classAllocLatency)[tid][poolId][classId]; + stats.evictionAttempts = (*stats_.evictionAttempts)[tid][poolId][classId].get(); + stats.evictions = (*stats_.regularItemEvictions)[tid][poolId][classId].get() + + (*stats_.chainedItemEvictions)[tid][poolId][classId].get(); + return stats; +} + template PoolEvictionAgeStats CacheAllocator::getPoolEvictionAgeStats( PoolId pid, unsigned int slabProjectionLength) const { PoolEvictionAgeStats stats; - const auto& pool = allocator_->getPool(pid); + const auto& pool = allocator_[0]->getPool(pid); const auto& allocSizes = pool.getAllocSizes(); for (ClassId cid = 0; cid < static_cast(allocSizes.size()); ++cid) { - auto& mmContainer = getMMContainer(pid, cid); + auto& mmContainer = getMMContainer(0, pid, cid); const auto numItemsPerSlab = - allocator_->getPool(pid).getAllocationClass(cid).getAllocsPerSlab(); + allocator_[0]->getPool(pid).getAllocationClass(cid).getAllocsPerSlab(); const auto projectionLength = numItemsPerSlab * slabProjectionLength; stats.classEvictionAgeStats[cid] = mmContainer.getEvictionAgeStat(projectionLength); @@ -4748,7 +5892,7 @@ void CacheAllocator::releaseSlab(PoolId pid, } try { - auto releaseContext = allocator_->startSlabRelease( + auto releaseContext = allocator_[0]->startSlabRelease( pid, victim, receiver, mode, hint, [this]() -> bool { return shutDownInProgress_; }); @@ -4757,15 +5901,15 @@ void CacheAllocator::releaseSlab(PoolId pid, return; } - releaseSlabImpl(releaseContext); - if (!allocator_->allAllocsFreed(releaseContext)) { + releaseSlabImpl(0, releaseContext); + if (!allocator_[0]->allAllocsFreed(releaseContext)) { throw std::runtime_error( folly::sformat("Was not able to free all allocs. 
PoolId: {}, AC: {}", releaseContext.getPoolId(), releaseContext.getClassId())); } - allocator_->completeSlabRelease(releaseContext); + allocator_[0]->completeSlabRelease(releaseContext); } catch (const exception::SlabReleaseAborted& e) { stats_.numAbortedSlabReleases.inc(); throw exception::SlabReleaseAborted(folly::sformat( @@ -4795,7 +5939,7 @@ SlabReleaseStats CacheAllocator::getSlabReleaseStats() } template -void CacheAllocator::releaseSlabImpl( +void CacheAllocator::releaseSlabImpl(TierId tid, const SlabReleaseContext& releaseContext) { auto startTime = std::chrono::milliseconds(util::getCurrentTimeMs()); bool releaseStuck = false; @@ -4838,7 +5982,7 @@ void CacheAllocator::releaseSlabImpl( // If moving fails, evict it evictForSlabRelease(item); } - XDCHECK(allocator_->isAllocFreed(releaseContext, alloc)); + XDCHECK(allocator_[tid]->isAllocFreed(releaseContext, alloc)); } } @@ -4890,7 +6034,7 @@ bool CacheAllocator::moveForSlabRelease(Item& oldItem) { // will send it back to the allocator bool isMoved = chainedItem ? moveChainedItem(oldItem.asChainedItem(), newItemHdl) - : moveRegularItem(oldItem, newItemHdl); + : moveRegularItem(oldItem, newItemHdl, false, false); if (!isMoved) { return false; } @@ -4899,7 +6043,8 @@ bool CacheAllocator::moveForSlabRelease(Item& oldItem) { return false; } - const auto allocInfo = allocator_->getAllocInfo(oldItem.getMemory()); + auto tid = getTierId(oldItem); + const auto allocInfo = allocator_[tid]->getAllocInfo(oldItem.getMemory()); if (chainedItem) { newItemHdl.reset(); auto parentKey = parentItem->getKey(); @@ -4927,9 +6072,9 @@ bool CacheAllocator::moveForSlabRelease(Item& oldItem) { auto ref = unmarkMovingAndWakeUpWaiters(oldItem, std::move(newItemHdl)); XDCHECK_EQ(0u, ref); } - allocator_->free(&oldItem); + allocator_[tid]->free(&oldItem); - (*stats_.fragmentationSize)[allocInfo.poolId][allocInfo.classId].sub( + (*stats_.fragmentationSize)[tid][allocInfo.poolId][allocInfo.classId].sub( util::getFragmentation(*this, oldItem)); stats_.numMoveSuccesses.inc(); return true; @@ -4951,22 +6096,25 @@ CacheAllocator::allocateNewItemForOldItem(const Item& oldItem) { XDCHECK_EQ(newItemHdl->getSize(), oldChainedItem.getSize()); XDCHECK_EQ(reinterpret_cast(&parentItem), reinterpret_cast( - &oldChainedItem.getParentItem(compressor_))); + &newItemHdl->asChainedItem().getParentItem(compressor_))); return newItemHdl; } + const auto tid = getTierId(oldItem); const auto allocInfo = - allocator_->getAllocInfo(static_cast(&oldItem)); + allocator_[tid]->getAllocInfo(static_cast(&oldItem)); + bool evict = !config_.insertToFirstFreeTier || tid == getNumTiers() - 1; // Set up the destination for the move. Since oldItem would have the moving // bit set, it won't be picked for eviction. 
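  // The replacement is allocated in the same tier as oldItem, so slab
  // release never migrates data across tiers; the allocation itself may
  // still evict within that tier when `evict` is set.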
- auto newItemHdl = allocateInternal(allocInfo.poolId, - oldItem.getKey(), - oldItem.getSize(), - oldItem.getCreationTime(), - oldItem.getExpiryTime(), - false); + auto newItemHdl = allocateInternalTier(tid, + allocInfo.poolId, + oldItem.getKey(), + oldItem.getSize(), + oldItem.getCreationTime(), + oldItem.getExpiryTime(), + evict); if (!newItemHdl) { return {}; } @@ -5002,12 +6150,13 @@ void CacheAllocator::evictForSlabRelease(Item& item) { nvmCache_->put(*evicted, std::move(token)); } + const auto tid = getTierId(*evicted); const auto allocInfo = - allocator_->getAllocInfo(static_cast(&item)); + allocator_[tid]->getAllocInfo(static_cast(evicted)); if (evicted->hasChainedItem()) { - (*stats_.chainedItemEvictions)[allocInfo.poolId][allocInfo.classId].inc(); + (*stats_.chainedItemEvictions)[tid][allocInfo.poolId][allocInfo.classId].inc(); } else { - (*stats_.regularItemEvictions)[allocInfo.poolId][allocInfo.classId].inc(); + (*stats_.regularItemEvictions)[tid][allocInfo.poolId][allocInfo.classId].inc(); } stats_.numEvictionSuccesses.inc(); @@ -5052,11 +6201,15 @@ bool CacheAllocator::markMovingForSlabRelease( // At first, we assume this item was already freed bool itemFreed = true; bool markedMoving = false; - const auto fn = [this, &markedMoving, &itemFreed](void* memory) { + TierId tid = getTierId(alloc); + const auto fn = [this, tid, &markedMoving, &itemFreed](void* memory) { // Since this callback is executed, the item is not yet freed itemFreed = false; Item* item = static_cast(memory); - auto& mmContainer = getMMContainer(*item); + auto allocInfo = allocator_[tid]->getAllocInfo(memory); + auto pid = allocInfo.poolId; + auto cid = allocInfo.classId; + auto& mmContainer = getMMContainer(tid, pid, cid); mmContainer.withContainerLock([this, &mmContainer, &item, &markedMoving]() { // we rely on the mmContainer lock to safely check that the item is // currently in the mmContainer (no other threads are currently @@ -5094,7 +6247,7 @@ bool CacheAllocator::markMovingForSlabRelease( auto startTime = util::getCurrentTimeSec(); while (true) { - allocator_->processAllocForRelease(ctx, alloc, fn); + allocator_[tid]->processAllocForRelease(ctx, alloc, fn); // If item is already freed we give up trying to mark the item moving // and return false, otherwise if marked as moving, we return true. @@ -5109,7 +6262,7 @@ bool CacheAllocator::markMovingForSlabRelease( itemFreed = true; if (shutDownInProgress_) { - allocator_->abortSlabRelease(ctx); + allocator_[tid]->abortSlabRelease(ctx); throw exception::SlabReleaseAborted( folly::sformat("Slab Release aborted while still trying to mark" " as moving for Item: {}. Pool: {}, Class: {}.", @@ -5133,12 +6286,15 @@ template CCacheT* CacheAllocator::addCompactCache(folly::StringPiece name, size_t size, Args&&... 
                                                     Args&&... args) {
+  if (getNumTiers() != 1)
+    throw std::runtime_error(
+        "TODO: compact cache for multi-tier Cache not supported.");
+
   if (!config_.isCompactCacheEnabled()) {
     throw std::logic_error("Compact cache is not enabled");
   }
 
   std::unique_lock lock(compactCachePoolsLock_);
-  auto poolId = allocator_->addPool(name, size, {Slab::kSize});
+  auto poolId = allocator_[0]->addPool(name, size, {Slab::kSize});
   isCompactCachePool_[poolId] = true;
 
   auto ptr = std::make_unique<CCacheT>(
@@ -5234,8 +6390,12 @@ folly::IOBufQueue CacheAllocator<CacheTrait>::saveStateToIOBuf() {
   for (PoolId pid : pools) {
     for (unsigned int cid = 0; cid < (*stats_.fragmentationSize)[pid].size();
          ++cid) {
+      uint64_t fragmentationSize = 0;
+      for (TierId tid = 0; tid < getNumTiers(); tid++) {
+        fragmentationSize += (*stats_.fragmentationSize)[tid][pid][cid].get();
+      }
       metadata_.fragmentationSize()[pid][static_cast<ClassId>(cid)] =
-          (*stats_.fragmentationSize)[pid][cid].get();
+          fragmentationSize;
     }
     if (isCompactCachePool_[pid]) {
       metadata_.compactCachePools()->push_back(pid);
@@ -5247,12 +6407,15 @@ folly::IOBufQueue CacheAllocator<CacheTrait>::saveStateToIOBuf() {
   *metadata_.numChainedChildItems() = stats_.numChainedChildItems.get();
   *metadata_.numAbortedSlabReleases() = stats_.numAbortedSlabReleases.get();
 
+  // TODO: implement serialization for multiple tiers
   auto serializeMMContainers = [](MMContainers& mmContainers) {
     MMSerializationTypeContainer state;
-    for (unsigned int i = 0; i < mmContainers.size(); ++i) {
+    for (unsigned int i = 0; i < 1 /* TODO: */; ++i) {
       for (unsigned int j = 0; j < mmContainers[i].size(); ++j) {
-        if (mmContainers[i][j]) {
-          state.pools_ref()[i][j] = mmContainers[i][j]->saveState();
+        for (unsigned int k = 0; k < mmContainers[i][j].size(); ++k) {
+          if (mmContainers[i][j][k]) {
+            state.pools_ref()[j][k] = mmContainers[i][j][k]->saveState();
+          }
         }
       }
     }
@@ -5262,7 +6425,8 @@ folly::IOBufQueue CacheAllocator<CacheTrait>::saveStateToIOBuf() {
   serializeMMContainers(mmContainers_);
 
   AccessSerializationType accessContainerState = accessContainer_->saveState();
-  MemoryAllocator::SerializationType allocatorState = allocator_->saveState();
+  // TODO: foreach allocator
+  MemoryAllocator::SerializationType allocatorState = allocator_[0]->saveState();
   CCacheManager::SerializationType ccState = compactCacheManager_->saveState();
 
   AccessSerializationType chainedItemAccessContainerState =
@@ -5326,6 +6490,8 @@ CacheAllocator<CacheTrait>::shutDown() {
       (shmShutDownStatus == ShmShutDownRes::kSuccess);
   shmManager_.reset();
 
+  // TODO: save per-tier state
+
   if (shmShutDownSucceeded) {
     if (!nvmShutDownStatusOpt || *nvmShutDownStatusOpt)
       return ShutDownStatus::kSuccess;
@@ -5389,22 +6555,26 @@ CacheAllocator<CacheTrait>::deserializeMMContainers(
   const auto container =
       deserializer.deserialize<MMSerializationTypeContainer>();
 
-  MMContainers mmContainers;
+  /* TODO: right now, we create empty containers because deserialization
+   * only works for a single (topmost) tier. */
+  MMContainers mmContainers{getNumTiers()};
 
   for (auto& kvPool : *container.pools_ref()) {
     auto i = static_cast<PoolId>(kvPool.first);
     auto& pool = getPool(i);
     for (auto& kv : kvPool.second) {
       auto j = static_cast<ClassId>(kv.first);
-      MMContainerPtr ptr =
-          std::make_unique<typename MMContainerPtr::element_type>(kv.second,
-                                                                  compressor);
-      auto config = ptr->getConfig();
-      config.addExtraConfig(config_.trackTailHits
-                                ? pool.getAllocationClass(j).getAllocsPerSlab()
-                                : 0);
-      ptr->setConfig(config);
-      mmContainers[i][j] = std::move(ptr);
+      for (TierId tid = 0; tid < getNumTiers(); tid++) {
+        MMContainerPtr ptr =
+            std::make_unique<typename MMContainerPtr::element_type>(kv.second,
+                                                                    compressor);
+        auto config = ptr->getConfig();
+        config.addExtraConfig(config_.trackTailHits
+                                  ? pool.getAllocationClass(j).getAllocsPerSlab()
+                                  : 0);
+        ptr->setConfig(config);
+        mmContainers[tid][i][j] = std::move(ptr);
+      }
     }
   }
   // We need to drop the unevictableMMContainer in the deserializer.
@@ -5471,8 +6641,18 @@ void CacheAllocator<CacheTrait>::initStats() {
   // deserialize the fragmentation size of each thread.
   for (const auto& pid : *metadata_.fragmentationSize()) {
     for (const auto& cid : pid.second) {
-      (*stats_.fragmentationSize)[pid.first][cid.first].set(
-          static_cast<uint64_t>(cid.second));
+      // In multi-tier mode the fragmentation size was serialized as the sum
+      // across tiers; the per-tier values cannot be recovered, so split the
+      // total evenly for now.
+      // TODO: proper multi-tier serialization
+      uint64_t total = static_cast<uint64_t>(cid.second);
+      uint64_t part = total / getNumTiers();
+      uint64_t sum = 0;
+      for (TierId tid = 1; tid < getNumTiers(); tid++) {
+        (*stats_.fragmentationSize)[tid][pid.first][cid.first].set(part);
+        sum += part;
+      }
+      uint64_t leftover = total - sum;
+      (*stats_.fragmentationSize)[0][pid.first][cid.first].set(leftover);
     }
   }
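A note on the arithmetic above: `saveStateToIOBuf` persists only the per-class sum across tiers, so `initStats` rehydrates it with an even split in which tiers 1..N-1 each receive `total / numTiers` and tier 0 absorbs the remainder, keeping the parts summing exactly to the serialized total. A standalone sketch of that split (the function name and shape are illustrative, not CacheLib API):

```cpp
#include <cstdint>
#include <vector>

// Mirrors the split in initStats() above: every tier except tier 0 gets an
// equal share, and tier 0 takes the leftover so the shares sum to the total.
std::vector<uint64_t> splitSerializedTotal(uint64_t total, size_t numTiers) {
  std::vector<uint64_t> perTier(numTiers, total / numTiers);
  const uint64_t assigned = (numTiers - 1) * (total / numTiers);
  perTier[0] = total - assigned; // e.g. total 101 over 2 tiers -> {51, 50}
  return perTier;
}
```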
@@ -5560,11 +6740,14 @@ GlobalCacheStats CacheAllocator<CacheTrait>::getGlobalCacheStats() const {
 
 template <typename CacheTrait>
 CacheMemoryStats CacheAllocator<CacheTrait>::getCacheMemoryStats() const {
-  const auto totalCacheSize = allocator_->getMemorySize();
-  const auto configuredTotalCacheSize = allocator_->getMemorySizeInclAdvised();
-
+  size_t totalCacheSize = 0;
+  size_t configuredTotalCacheSize = 0;
+  for (auto& allocator : allocator_) {
+    totalCacheSize += allocator->getMemorySize();
+    configuredTotalCacheSize += allocator->getMemorySizeInclAdvised();
+  }
   auto addSize = [this](size_t a, PoolId pid) {
-    return a + allocator_->getPool(pid).getPoolSize();
+    return a + allocator_[0]->getPool(pid).getPoolSize();
   };
   const auto regularPoolIds = getRegularPoolIds();
   const auto ccCachePoolIds = getCCachePoolIds();
@@ -5577,9 +6760,9 @@ CacheMemoryStats CacheAllocator<CacheTrait>::getCacheMemoryStats() const {
           configuredTotalCacheSize,
           configuredRegularCacheSize,
           configuredCompactCacheSize,
-          allocator_->getAdvisedMemorySize(),
+          allocator_[0]->getAdvisedMemorySize(),
           memMonitor_ ? memMonitor_->getMaxAdvisePct() : 0,
-          allocator_->getUnreservedMemorySize(),
+          allocator_[0]->getUnreservedMemorySize(),
           nvmCache_ ? nvmCache_->getSize() : 0,
           util::getMemAvailable(),
           util::getRSSBytes()};
@@ -5718,14 +6901,14 @@ bool CacheAllocator<CacheTrait>::startNewReaper(
 
 template <typename CacheTrait>
 auto CacheAllocator<CacheTrait>::createBgWorkerMemoryAssignments(
-    size_t numWorkers) {
+    size_t numWorkers, TierId tid) {
   std::vector<std::vector<MemoryDescriptorType>> assignedMemory(numWorkers);
-  auto pools = filterCompactCachePools(allocator_->getPoolIds());
+  auto pools = filterCompactCachePools(allocator_[tid]->getPoolIds());
   for (const auto pid : pools) {
-    const auto& mpStats = getPool(pid).getStats();
+    const auto& mpStats = getPoolByTid(pid, tid).getStats();
     for (const auto cid : mpStats.classIds) {
-      assignedMemory[BackgroundMover<CacheT>::workerId(pid, cid, numWorkers)]
-          .emplace_back(pid, cid);
+      assignedMemory[BackgroundMover<CacheT>::workerId(tid, pid, cid, numWorkers)]
+          .emplace_back(tid, pid, cid);
     }
   }
   return assignedMemory;
@@ -5740,7 +6923,7 @@ bool CacheAllocator<CacheTrait>::startNewBackgroundEvictor(
   backgroundEvictor_.resize(threads);
   bool result = true;
 
-  auto memoryAssignments = createBgWorkerMemoryAssignments(threads);
+  auto memoryAssignments = createBgWorkerMemoryAssignments(threads, 0);
   for (size_t i = 0; i < threads; i++) {
     auto ret = startNewWorker("BackgroundEvictor" + std::to_string(i),
                               backgroundEvictor_[i], interval, *this, strategy,
@@ -5763,7 +6946,7 @@ bool CacheAllocator<CacheTrait>::startNewBackgroundPromoter(
   backgroundPromoter_.resize(threads);
   bool result = true;
 
-  auto memoryAssignments = createBgWorkerMemoryAssignments(threads);
+  auto memoryAssignments = createBgWorkerMemoryAssignments(threads, 1);
   for (size_t i = 0; i < threads; i++) {
     auto ret = startNewWorker("BackgroundPromoter" + std::to_string(i),
                               backgroundPromoter_[i], interval, *this, strategy,
@@ -5866,7 +7049,8 @@ bool CacheAllocator<CacheTrait>::cleanupStrayShmSegments(
     // Any other concurrent process can not be attached to the segments or
     // even if it does, we want to mark it for destruction.
     ShmManager::removeByName(cacheDir, detail::kShmInfoName, posix);
-    ShmManager::removeByName(cacheDir, detail::kShmCacheName, posix);
+    ShmManager::removeByName(cacheDir, detail::kShmCacheName +
+                             std::to_string(0 /* TODO: per tier */), posix);
     ShmManager::removeByName(cacheDir, detail::kShmHashTableName, posix);
     ShmManager::removeByName(cacheDir, detail::kShmChainedItemHashTableName,
                              posix);
@@ -5881,13 +7065,14 @@ uint64_t CacheAllocator<CacheTrait>::getItemPtrAsOffset(const void* ptr) {
   // errors downstream.
 
   // if this succeeds, the address is valid within the cache.
-  allocator_->getAllocInfo(ptr);
+  auto tid = getTierId(ptr);
+  allocator_[tid]->getAllocInfo(ptr);
 
   if (!isOnShm_ || !shmManager_) {
     throw std::invalid_argument("Shared memory not used");
  }
 
-  const auto& shm = shmManager_->getShmByName(detail::kShmCacheName);
+  const auto& shm =
+      shmManager_->getShmByName(detail::kShmCacheName + std::to_string(tid));
 
   return reinterpret_cast<uint64_t>(ptr) -
          reinterpret_cast<uint64_t>(shm.getCurrentMapping().addr);
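As the `cleanupStrayShmSegments` and `getItemPtrAsOffset` hunks above show, each tier's cache segment is now addressed by appending the tier id to the base segment name. A minimal sketch of the naming scheme; the constant's value here is an assumption, since the diff never shows what `detail::kShmCacheName` expands to:

```cpp
#include <string>

// Hypothetical stand-in for detail::kShmCacheName; the real constant lives
// elsewhere in cachelib and its value is not part of this diff.
const std::string kShmCacheName = "shm_cache";

// Tier 0 maps to "shm_cache0", tier 1 to "shm_cache1", and so on, which is
// why both lookup and cleanup must qualify the segment name with a tier id.
std::string tierSegmentName(int tid) {
  return kShmCacheName + std::to_string(tid);
}
```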
diff --git a/cachelib/allocator/CacheAllocatorConfig.h b/cachelib/allocator/CacheAllocatorConfig.h
index 768b15c5eb..70be2e37cf 100644
--- a/cachelib/allocator/CacheAllocatorConfig.h
+++ b/cachelib/allocator/CacheAllocatorConfig.h
@@ -313,6 +313,11 @@ class CacheAllocatorConfig {
   // Library team if you find yourself customizing this.
   CacheAllocatorConfig& setThrottlerConfig(util::Throttler::Config config);
 
+  // Insert items to first free memory tier
+  CacheAllocatorConfig& enableInsertToFirstFreeTier();
+
+  CacheAllocatorConfig& enableNoOnlineEviction();
+
   // Passes in a callback to initialize an event tracker when the allocator
   // starts
   CacheAllocatorConfig& setEventTracker(EventTrackerSharedPtr&&);
@@ -539,6 +544,13 @@ class CacheAllocatorConfig {
   // ABOVE are the config for various cache workers
   //
 
+  // If turned off, new elements are always inserted into the topmost memory
+  // tier. If turned on, new elements are inserted into the first memory tier
+  // with free space, and memory is evicted from the bottom tier when the
+  // cache is full.
+  bool insertToFirstFreeTier = false;
+
+  bool noOnlineEviction = false;
+
   // the number of tries to search for an item to evict
   // 0 means it's infinite
   unsigned int evictionSearchTries{50};
@@ -639,6 +651,24 @@ class CacheAllocatorConfig {
   // CacheAllocator::startCacheWorkers()
   bool delayCacheWorkersStart{false};
 
+  // see MultiTierDataMovement.md
+  double promotionAcWatermark{4.0};
+  double lowEvictionAcWatermark{2.0};
+  double highEvictionAcWatermark{5.0};
+  double numDuplicateElements{0.0}; // inclusiveness of the cache
+  double syncPromotion{0.0}; // whether promotion is done synchronously in the
+                             // user thread
+
+  uint64_t evictorThreads{1};
+  uint64_t promoterThreads{1};
+
+  uint64_t maxEvictionBatch{40};
+  uint64_t maxPromotionBatch{10};
+
+  uint64_t minEvictionBatch{1};
+  uint64_t minPromotionBatch{1};
+
+  uint64_t maxEvictionPromotionHotness{60};
+
   friend CacheT;
 
  private:
@@ -655,6 +685,18 @@ class CacheAllocatorConfig {
       {MemoryTierCacheConfig::fromShm().setRatio(1)}};
 };
 
+template <typename T>
+CacheAllocatorConfig<T>& CacheAllocatorConfig<T>::enableInsertToFirstFreeTier() {
+  insertToFirstFreeTier = true;
+  return *this;
+}
+
+template <typename T>
+CacheAllocatorConfig<T>& CacheAllocatorConfig<T>::enableNoOnlineEviction() {
+  noOnlineEviction = true;
+  return *this;
+}
+
 template <typename T>
 CacheAllocatorConfig<T>& CacheAllocatorConfig<T>::setCacheName(
     const std::string& _cacheName) {
@@ -1236,6 +1278,8 @@ std::map<std::string, std::string> CacheAllocatorConfig<T>::serialize() const {
   configMap["nvmAdmissionMinTTL"] = std::to_string(nvmAdmissionMinTTL);
   configMap["delayCacheWorkersStart"] =
       delayCacheWorkersStart ? "true" : "false";
+  configMap["insertToFirstFreeTier"] = std::to_string(insertToFirstFreeTier);
+  configMap["noOnlineEviction"] = std::to_string(noOnlineEviction);
 
   mergeWithPrefix(configMap, throttleConfig.serialize(), "throttleConfig");
   mergeWithPrefix(configMap, chainedItemAccessConfig.serialize(),
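For context, a hedged sketch of how a caller might opt into the new behavior; `LruAllocator` is an illustrative choice, and the watermarks are set as plain member assignments because the diff adds those fields without fluent setters:

```cpp
#include "cachelib/allocator/CacheAllocator.h"

using Cache = facebook::cachelib::LruAllocator; // illustrative cache type

facebook::cachelib::CacheAllocatorConfig<Cache> makeTieredConfig() {
  facebook::cachelib::CacheAllocatorConfig<Cache> config;
  config.setCacheName("tiered-cache")
      .setCacheSize(4UL * 1024 * 1024 * 1024)
      // fill the first tier that has room instead of always using tier 0
      .enableInsertToFirstFreeTier()
      // defer eviction to the background workers instead of the alloc path
      .enableNoOnlineEviction();
  // thresholds consumed by the background movers; the defaults added by this
  // diff are 2.0 and 5.0, see MultiTierDataMovement.md
  config.lowEvictionAcWatermark = 2.0;
  config.highEvictionAcWatermark = 5.0;
  return config;
}
```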
diff --git a/cachelib/allocator/CacheItem.h b/cachelib/allocator/CacheItem.h
index fe60187e6e..17b80f5ba3 100644
--- a/cachelib/allocator/CacheItem.h
+++ b/cachelib/allocator/CacheItem.h
@@ -43,6 +43,9 @@ class BaseAllocatorTest;
 template <typename AllocatorT>
 class AllocatorHitStatsTest;
 
+template <typename AllocatorT>
+class AllocatorMemoryTiersTest;
+
 template <typename AllocatorT>
 class MapTest;
 
@@ -466,6 +469,8 @@ class CACHELIB_PACKED_ATTR CacheItem {
   FRIEND_TEST(ItemTest, NonStringKey);
   template <typename AllocatorT>
   friend class facebook::cachelib::tests::AllocatorHitStatsTest;
+  template <typename AllocatorT>
+  friend class facebook::cachelib::tests::AllocatorMemoryTiersTest;
 };
 
 // A chained item has a hook pointing to the next chained item. The hook is
diff --git a/cachelib/allocator/CacheStats.cpp b/cachelib/allocator/CacheStats.cpp
index 6b7a1c943b..f09fe4e0db 100644
--- a/cachelib/allocator/CacheStats.cpp
+++ b/cachelib/allocator/CacheStats.cpp
@@ -22,18 +22,21 @@ namespace facebook::cachelib {
 namespace detail {
 
 void Stats::init() {
-  cacheHits = std::make_unique<PerPoolClassTLCounters>();
-  allocAttempts = std::make_unique<PerPoolClassAtomicCounters>();
-  evictionAttempts = std::make_unique<PerPoolClassAtomicCounters>();
-  fragmentationSize = std::make_unique<PerPoolClassAtomicCounters>();
-  allocFailures = std::make_unique<PerPoolClassAtomicCounters>();
-  chainedItemEvictions = std::make_unique<PerPoolClassAtomicCounters>();
-  regularItemEvictions = std::make_unique<PerPoolClassAtomicCounters>();
+  cacheHits = std::make_unique<PerTierPerPoolClassTLCounters>();
+  allocAttempts = std::make_unique<PerTierPerPoolClassAtomicCounters>();
+  evictionAttempts = std::make_unique<PerTierPerPoolClassAtomicCounters>();
+  fragmentationSize = std::make_unique<PerTierPerPoolClassAtomicCounters>();
+  allocFailures = std::make_unique<PerTierPerPoolClassAtomicCounters>();
+  chainedItemEvictions = std::make_unique<PerTierPerPoolClassAtomicCounters>();
+  regularItemEvictions = std::make_unique<PerTierPerPoolClassAtomicCounters>();
+  numWritebacks = std::make_unique<PerTierPerPoolClassAtomicCounters>();
   auto initToZero = [](auto& a) {
-    for (auto& s : a) {
-      for (auto& c : s) {
+    for (auto& t : a) {
+      for (auto& p : t) {
+        for (auto& c : p) {
           c.set(0);
         }
+      }
     }
   };
 
@@ -43,6 +46,9 @@ void Stats::init() {
   initToZero(*fragmentationSize);
   initToZero(*chainedItemEvictions);
   initToZero(*regularItemEvictions);
+  initToZero(*numWritebacks);
+
+  classAllocLatency = std::make_unique<PerTierPoolClassRollingStats>();
 }
 
 template <int>
@@ -50,7 +56,7 @@ struct SizeVerify {};
 
 void Stats::populateGlobalCacheStats(GlobalCacheStats& ret) const {
 #ifndef SKIP_SIZE_VERIFY
-  SizeVerify<sizeof(Stats)> a = SizeVerify<16272>{};
+  SizeVerify<sizeof(Stats)> a = SizeVerify<16640>{};
   std::ignore = a;
 #endif
   ret.numCacheGets = numCacheGets.get();
@@ -99,6 +105,8 @@ void Stats::populateGlobalCacheStats(GlobalCacheStats& ret) const {
   ret.numNvmItemDestructorAllocErrors = numNvmItemDestructorAllocErrors.get();
 
   ret.allocateLatencyNs = this->allocateLatency_.estimate();
+  ret.bgEvictLatencyNs = this->bgEvictLatency_.estimate();
+  ret.bgPromoteLatencyNs = this->bgPromoteLatency_.estimate();
   ret.moveChainedLatencyNs = this->moveChainedLatency_.estimate();
   ret.moveRegularLatencyNs = this->moveRegularLatency_.estimate();
   ret.nvmLookupLatencyNs = this->nvmLookupLatency_.estimate();
@@ -113,20 +121,43 @@ void Stats::populateGlobalCacheStats(GlobalCacheStats& ret) const {
   ret.nvmEvictionSecondsToExpiry = this->nvmEvictionSecondsToExpiry_.estimate();
   ret.nvmPutSize = this->nvmPutSize_.estimate();
 
-  auto accum = [](const PerPoolClassAtomicCounters& c) {
-    uint64_t sum = 0;
-    for (const auto& x : c) {
-      for (const auto& v : x) {
-        sum += v.get();
-      }
+  auto accum = [](const PerTierPerPoolClassAtomicCounters& t) {
+    std::vector<uint64_t> stat;
+    for (const auto& c : t) {
+      uint64_t sum = 0;
+      for (const auto& x : c) {
+        for (const auto& v : x) {
+          sum += v.get();
+        }
+      }
+      stat.push_back(sum);
+    }
+    return stat;
+  };
+
+  auto accumTL = [](const PerTierPerPoolClassTLCounters& t) {
+    std::vector<uint64_t> stat;
+    for (const auto& c : t) {
+      uint64_t sum = 0;
+      for (const auto& x : c) {
+        for (const auto& v : x) {
+          sum += v.get();
+        }
+      }
+      stat.push_back(sum);
     }
-    return sum;
+    return stat;
   };
   ret.allocAttempts = accum(*allocAttempts);
   ret.evictionAttempts = accum(*evictionAttempts);
   ret.allocFailures = accum(*allocFailures);
-  ret.numEvictions = accum(*chainedItemEvictions);
-  ret.numEvictions += accum(*regularItemEvictions);
+  auto chainedEvictions = accum(*chainedItemEvictions);
+  auto regularEvictions = accum(*regularItemEvictions);
+  for (TierId tid = 0; tid < chainedEvictions.size(); tid++) {
+    ret.numEvictions.push_back(chainedEvictions[tid] + regularEvictions[tid]);
+  }
+  ret.numWritebacks = accum(*numWritebacks);
+
+  ret.numCacheHits = accumTL(*cacheHits);
 
   ret.invalidAllocs = invalidAllocs.get();
   ret.numRefcountOverflow = numRefcountOverflow.get();
@@ -144,6 +175,18 @@ void Stats::populateGlobalCacheStats(GlobalCacheStats& ret) const {
 
 } // namespace detail
 
+MMContainerStat& MMContainerStat::operator+=(const MMContainerStat& other) {
+  size += other.size;
+  oldestTimeSec = std::min(oldestTimeSec, other.oldestTimeSec);
+  lruRefreshTime = std::max(lruRefreshTime, other.lruRefreshTime);
+  numHotAccesses += other.numHotAccesses;
+  numColdAccesses += other.numColdAccesses;
+  numWarmAccesses += other.numWarmAccesses;
+  numTailAccesses += other.numTailAccesses;
+  return *this;
+}
+
 PoolStats& PoolStats::operator+=(const PoolStats& other) {
   auto verify = [](bool isCompatible) {
     if (!isCompatible) {
@@ -181,6 +224,7 @@ PoolStats& PoolStats::operator+=(const PoolStats& other) {
       d.allocFailures += s.allocFailures;
       d.fragmentationSize += s.fragmentationSize;
       d.numHits += s.numHits;
+      d.numWritebacks += s.numWritebacks;
       d.chainedItemEvictions += s.chainedItemEvictions;
       d.regularItemEvictions += s.regularItemEvictions;
     }
@@ -236,6 +280,14 @@ uint64_t PoolStats::numEvictions() const noexcept {
   return n;
 }
 
+uint64_t PoolStats::numWritebacks() const noexcept {
+  uint64_t n = 0;
+  for (const auto& s : cacheStats) {
+    n += s.second.numWritebacks;
+  }
+  return n;
+}
+
 uint64_t PoolStats::numItems() const noexcept {
   uint64_t n = 0;
   for (const auto& s : cacheStats) {
@@ -244,6 +296,14 @@ uint64_t PoolStats::numItems() const noexcept {
   return n;
 }
 
+uint64_t PoolStats::numHits() const noexcept {
+  uint64_t n = 0;
+  for (const auto& s : cacheStats) {
+    n += s.second.numHits;
+  }
+  return n;
+}
+
 uint64_t PoolStats::numAllocFailures() const {
   uint64_t n = 0;
   for (const auto& s : cacheStats) {
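Because counters such as `numEvictions` and `numCacheHits` are now per-tier vectors (see the `GlobalCacheStats` hunks below), consumers that previously read a single scalar need to index by tier or sum across tiers. A small illustrative helper, assuming nothing beyond the vector fields this diff adds:

```cpp
#include <cstdint>
#include <numeric>
#include <vector>

// Minimal stand-in for the vectorized counters added to GlobalCacheStats;
// the real struct carries many more fields.
struct TierStatsView {
  std::vector<uint64_t> numEvictions; // one entry per tier
  std::vector<uint64_t> numCacheHits; // one entry per tier
};

// Collapses the per-tier counters back to the old single-number view.
uint64_t totalEvictions(const TierStatsView& s) {
  return std::accumulate(s.numEvictions.begin(), s.numEvictions.end(),
                         uint64_t{0});
}
```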
diff --git a/cachelib/allocator/CacheStats.h b/cachelib/allocator/CacheStats.h
index 60f6f5e2c5..18e62dbfee 100644
--- a/cachelib/allocator/CacheStats.h
+++ b/cachelib/allocator/CacheStats.h
@@ -27,6 +27,7 @@
 #include "cachelib/allocator/memory/Slab.h"
 #include "cachelib/common/FastStats.h"
 #include "cachelib/common/PercentileStats.h"
+#include "cachelib/common/RollingStats.h"
 #include "cachelib/common/Time.h"
 
 namespace facebook {
 namespace cachelib {
@@ -79,22 +80,25 @@ struct PoolEvictionAgeStats {
 // Stats for MM container
 struct MMContainerStat {
   // number of elements in the container.
-  size_t size;
+  size_t size{0};
 
   // what is the unix timestamp in seconds of the oldest element existing in
   // the container.
-  uint64_t oldestTimeSec;
+  uint64_t oldestTimeSec{0};
 
   // refresh time for LRU
-  uint64_t lruRefreshTime;
+  uint64_t lruRefreshTime{0};
 
   // TODO: Make the MMContainerStat generic by moving the Lru/2Q specific
   // stats inside MMType and exporting them through a generic stats interface.
   // number of hits in each lru.
-  uint64_t numHotAccesses;
-  uint64_t numColdAccesses;
-  uint64_t numWarmAccesses;
-  uint64_t numTailAccesses;
+  uint64_t numHotAccesses{0};
+  uint64_t numColdAccesses{0};
+  uint64_t numWarmAccesses{0};
+  uint64_t numTailAccesses{0};
+
+  // aggregate stats together (across tiers)
+  MMContainerStat& operator+=(const MMContainerStat& other);
 };
 
 // cache related stats for a given allocation class.
@@ -115,13 +119,16 @@ struct CacheStat {
   uint64_t fragmentationSize{0};
 
   // number of hits for this container.
-  uint64_t numHits;
+  uint64_t numHits{0};
 
   // number of evictions from this class id that was of a chained item
-  uint64_t chainedItemEvictions;
+  uint64_t chainedItemEvictions{0};
 
   // number of regular items that were evicted from this classId
-  uint64_t regularItemEvictions;
+  uint64_t regularItemEvictions{0};
+
+  // number of items that are moved to the next tier
+  uint64_t numWritebacks{0};
 
   // the stats from the mm container
   MMContainerStat containerStat;
@@ -198,12 +205,18 @@ struct PoolStats {
   // number of evictions for this pool
   uint64_t numEvictions() const noexcept;
 
+  // number of writebacks for this pool
+  uint64_t numWritebacks() const noexcept;
+
   // number of all items in this pool
   uint64_t numItems() const noexcept;
 
   // total number of allocations currently in this pool
   uint64_t numActiveAllocs() const noexcept;
 
+  // total number of hits in this pool
+  uint64_t numHits() const noexcept;
+
   // number of hits for an alloc class in this pool
   uint64_t numHitsForClass(ClassId cid) const {
     return cacheStats.at(cid).numHits;
@@ -300,26 +313,43 @@ struct RebalancerStats {
   uint64_t lastPickTimeMs{0};
 
   uint64_t avgPickTimeMs{0};
+
+  // aggregate stats together (across tiers)
+  RebalancerStats& operator+=(const RebalancerStats& other);
 };
 
 // Mover Stats
 struct BackgroundMoverStats {
   // the number of items this worker moved by looking at pools/classes stats
   uint64_t numMovedItems{0};
-  // number of times we went executed the thread //TODO: is this def correct?
+
+  // number of times the thread was executed (by the periodic worker)
   uint64_t runCount{0};
-  // total number of classes
+
+  // average number of items moved per run
+  double avgItemsMoved{0.0};
+
+  // number of times we actually traversed the mmContainer
+  uint64_t numTraversals{0};
+
+  // number of classes traversed
   uint64_t totalClasses{0};
-  // eviction size
+
+  // total bytes moved
   uint64_t totalBytesMoved{0};
+
+  // time in ns for the last traversal
+  uint64_t lastTraversalTimeNs{0};
+
+  // minimum time in ns across all traversals
+  uint64_t minTraversalTimeNs{0};
+
+  // maximum time in ns across all traversals
+  uint64_t maxTraversalTimeNs{0};
+
+  // average time in ns across all traversals
+  uint64_t avgTraversalTimeNs{0};
-
-  BackgroundMoverStats& operator+=(const BackgroundMoverStats& rhs) {
-    numMovedItems += rhs.numMovedItems;
-    runCount += rhs.runCount;
-    totalClasses += rhs.totalClasses;
-    totalBytesMoved += rhs.totalBytesMoved;
-    return *this;
-  }
 };
 
 // CacheMetadata type to export
@@ -343,9 +373,9 @@ struct Stats;
 // the ones that are aggregated over all pools
 struct GlobalCacheStats {
   // background eviction stats
-  BackgroundMoverStats evictionStats;
-
-  BackgroundMoverStats promotionStats;
+  std::vector<BackgroundMoverStats> evictionStats;
+
+  std::vector<BackgroundMoverStats> promotionStats;
 
   // number of calls to CacheAllocator::find
   uint64_t numCacheGets{0};
@@ -453,16 +483,22 @@ struct GlobalCacheStats {
   uint64_t numNvmItemRemovedSetSize{0};
 
   // number of attempts to allocate an item
-  uint64_t allocAttempts{0};
+  std::vector<uint64_t> allocAttempts;
 
   // number of eviction attempts
-  uint64_t evictionAttempts{0};
+  std::vector<uint64_t> evictionAttempts;
 
   // number of failures to allocate an item due to internal error
-  uint64_t allocFailures{0};
+  std::vector<uint64_t> allocFailures;
 
   // number of evictions across all the pools in the cache.
-  uint64_t numEvictions{0};
+  std::vector<uint64_t> numEvictions;
+
+  // number of writebacks across all the pools in the cache.
+  std::vector<uint64_t> numWritebacks;
+
+  // number of hits per tier across all the pools in the cache.
+  std::vector<uint64_t> numCacheHits;
 
   // number of allocation attempts with invalid input params.
   uint64_t invalidAllocs{0};
@@ -493,6 +529,8 @@ struct GlobalCacheStats {
   // latency and percentile stats of various cachelib operations
   util::PercentileStats::Estimates allocateLatencyNs{};
+  util::PercentileStats::Estimates bgEvictLatencyNs{};
+  util::PercentileStats::Estimates bgPromoteLatencyNs{};
   util::PercentileStats::Estimates moveChainedLatencyNs{};
   util::PercentileStats::Estimates moveRegularLatencyNs{};
   util::PercentileStats::Estimates nvmLookupLatencyNs{};
diff --git a/cachelib/allocator/CacheStatsInternal.h b/cachelib/allocator/CacheStatsInternal.h
index b0934eb0c1..ece1f87a48 100644
--- a/cachelib/allocator/CacheStatsInternal.h
+++ b/cachelib/allocator/CacheStatsInternal.h
@@ -21,6 +21,7 @@
 #include "cachelib/allocator/Cache.h"
 #include "cachelib/allocator/memory/MemoryAllocator.h"
 #include "cachelib/common/AtomicCounter.h"
+#include "cachelib/common/RollingStats.h"
 
 namespace facebook {
 namespace cachelib {
@@ -188,6 +189,8 @@ struct Stats {
   // latency stats of various cachelib operations
   mutable util::PercentileStats allocateLatency_;
+  mutable util::PercentileStats bgEvictLatency_;
+  mutable util::PercentileStats bgPromoteLatency_;
   mutable util::PercentileStats moveChainedLatency_;
   mutable util::PercentileStats moveRegularLatency_;
   mutable util::PercentileStats nvmLookupLatency_;
@@ -211,23 +214,34 @@ struct Stats {
   // we're currently writing into flash.
   mutable util::PercentileStats nvmPutSize_;
 
-  using PerPoolClassAtomicCounters =
-      std::array<std::array<AtomicCounter, MemoryAllocator::kMaxClasses>,
-                 MemoryPoolManager::kMaxPools>;
+  using PerTierPerPoolClassAtomicCounters = std::array<
+      std::array<std::array<AtomicCounter, MemoryAllocator::kMaxClasses>,
+                 MemoryPoolManager::kMaxPools>,
+      CacheBase::kMaxTiers>;
 
   // count of a stat for a specific allocation class
-  using PerPoolClassTLCounters =
-      std::array<std::array<TLCounter, MemoryAllocator::kMaxClasses>,
-                 MemoryPoolManager::kMaxPools>;
+  using PerTierPerPoolClassTLCounters = std::array<
+      std::array<std::array<TLCounter, MemoryAllocator::kMaxClasses>,
+                 MemoryPoolManager::kMaxPools>,
+      CacheBase::kMaxTiers>;
 
   // hit count for every alloc class in every pool
-  std::unique_ptr<PerPoolClassTLCounters> cacheHits{};
-  std::unique_ptr<PerPoolClassAtomicCounters> allocAttempts{};
-  std::unique_ptr<PerPoolClassAtomicCounters> evictionAttempts{};
-  std::unique_ptr<PerPoolClassAtomicCounters> allocFailures{};
-  std::unique_ptr<PerPoolClassAtomicCounters> fragmentationSize{};
-  std::unique_ptr<PerPoolClassAtomicCounters> chainedItemEvictions{};
-  std::unique_ptr<PerPoolClassAtomicCounters> regularItemEvictions{};
+  std::unique_ptr<PerTierPerPoolClassTLCounters> cacheHits{};
+  std::unique_ptr<PerTierPerPoolClassAtomicCounters> allocAttempts{};
+  std::unique_ptr<PerTierPerPoolClassAtomicCounters> evictionAttempts{};
+  std::unique_ptr<PerTierPerPoolClassAtomicCounters> allocFailures{};
+  std::unique_ptr<PerTierPerPoolClassAtomicCounters> fragmentationSize{};
+  std::unique_ptr<PerTierPerPoolClassAtomicCounters> chainedItemEvictions{};
+  std::unique_ptr<PerTierPerPoolClassAtomicCounters> regularItemEvictions{};
+  std::unique_ptr<PerTierPerPoolClassAtomicCounters> numWritebacks{};
+
+  using PerTierPoolClassRollingStats = std::array<
+      std::array<std::array<util::RollingStats, MemoryAllocator::kMaxClasses>,
+                 MemoryPoolManager::kMaxPools>,
+      CacheBase::kMaxTiers>;
+
+  // rolling latency tracking for every alloc class in every pool
+  std::unique_ptr<PerTierPoolClassRollingStats> classAllocLatency{};
 
   // Eviction failures due to parent cannot be removed from access container
   AtomicCounter evictFailParentAC{0};
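The `FreeThresholdStrategy` diff that follows sizes eviction batches in two steps: each class gets a raw target proportional to how far its free space sits below `highEvictionAcWatermark`, and the targets are then rescaled so the neediest class moves at most `maxEvictionBatch` items while every non-zero target still moves at least `minEvictionBatch`. A standalone sketch of that rescaling step, with the batch limits as plain parameters:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Rescales raw per-class eviction targets the way calculateBatchSizes() below
// does: the largest target is capped at maxBatch, the rest shrink
// proportionally, and any non-zero target stays at least minBatch.
std::vector<uint64_t> capBatchSizes(std::vector<uint64_t> batches,
                                    uint64_t maxBatch, uint64_t minBatch) {
  if (batches.empty()) {
    return batches;
  }
  const uint64_t largest = *std::max_element(batches.begin(), batches.end());
  if (largest == 0) {
    return batches;
  }
  for (auto& b : batches) {
    if (b == 0) {
      continue; // classes already above the watermark evict nothing
    }
    b = std::max(minBatch, maxBatch * b / largest);
  }
  return batches; // e.g. {400, 100, 0} with max 40, min 1 -> {40, 10, 0}
}
```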
diff --git a/cachelib/allocator/FreeThresholdStrategy.cpp b/cachelib/allocator/FreeThresholdStrategy.cpp
index 1fafda2bc9..284248b1cf 100644
--- a/cachelib/allocator/FreeThresholdStrategy.cpp
+++ b/cachelib/allocator/FreeThresholdStrategy.cpp
@@ -30,9 +30,47 @@ FreeThresholdStrategy::FreeThresholdStrategy(double lowEvictionAcWatermark,
       minEvictionBatch(minEvictionBatch) {}
 
 std::vector<size_t> FreeThresholdStrategy::calculateBatchSizes(
-    const CacheBase& /* cache */,
-    std::vector<MemoryDescriptorType> /* acVec */) {
-  throw std::runtime_error("Not supported yet!");
+    const CacheBase& cache,
+    std::vector<MemoryDescriptorType> acVec) {
+  std::vector<size_t> batches{};
+  for (auto [tid, pid, cid] : acVec) {
+    const auto& pool = cache.getPoolByTid(pid, tid);
+    if (pool.getApproxFreeSlabs()) {
+      batches.push_back(0);
+      continue; // free slabs remain; nothing to evict for this class
+    }
+    double usage = pool.getApproxUsage(cid);
+    if ((1 - usage) * 100 < highEvictionAcWatermark && pool.allSlabsAllocated()) {
+      auto toFreeMemPercent = highEvictionAcWatermark - (1 - usage) * 100;
+      auto toFreeItems = static_cast<size_t>(
+          toFreeMemPercent * (pool.getApproxSlabs(cid) * pool.getPerSlab(cid)));
+      batches.push_back(toFreeItems);
+    } else {
+      batches.push_back(0);
+    }
+  }
+
+  if (batches.size() == 0) {
+    return batches;
+  }
+
+  auto maxBatch = *std::max_element(batches.begin(), batches.end());
+  if (maxBatch == 0)
+    return batches;
+
+  std::transform(
+      batches.begin(), batches.end(), batches.begin(), [&](auto numItems) {
+        if (numItems == 0) {
+          return 0UL;
+        }
+
+        auto cappedBatchSize = maxEvictionBatch * numItems / maxBatch;
+        if (cappedBatchSize < minEvictionBatch)
+          return minEvictionBatch;
+        else
+          return cappedBatchSize;
+      });
+
+  return batches;
 }
 
 } // namespace facebook::cachelib
diff --git a/cachelib/allocator/MM2Q.h b/cachelib/allocator/MM2Q.h
index 316229d3bb..710b5c597c 100644
--- a/cachelib/allocator/MM2Q.h
+++ b/cachelib/allocator/MM2Q.h
@@ -66,6 +66,7 @@ class MM2Q {
   enum LruType { Warm, WarmTail, Hot, Cold, ColdTail, NumTypes };
 
   // Config class for MM2Q
+  // TODO: implement support for useCombinedLockForIterators
   struct Config {
     // Create from serialized config
     explicit Config(SerializationConfigType configState)
@@ -460,6 +461,18 @@ class MM2Q {
     //            is unchanged.
     bool add(T& node) noexcept;
 
+    // helper function to add the node under the container lock
+    void addNodeLocked(T& node, const Time& currTime);
+
+    // adds the given nodes into the container and marks each as being present
+    // in the container. The nodes are added to the head of the lru.
+    //
+    // @param begin, end  iterator range of nodes to be added
+    // @return number of nodes added - it is up to the user to verify all
+    //         expected nodes have been added.
+    template <typename It>
+    uint32_t addBatch(It begin, It end) noexcept;
+
     // removes the node from the lru and sets it previous and next to nullptr.
     //
     // @param node  The node to be removed from the container.
@@ -499,6 +512,11 @@ class MM2Q {
     // Iterator passed as parameter.
     template <typename F>
     void withEvictionIterator(F&& f);
+
+    // Execute provided function under container lock. Function gets
+    // iterator passed as parameter.
+    template <typename F>
+    void withPromotionIterator(F&& f);
 
     // Execute provided function under container lock.
     template <typename F>
@@ -889,16 +907,41 @@ bool MM2Q::Container<T, HookPtr>::add(T& node) noexcept {
     if (node.isInMMContainer()) {
       return false;
     }
+    addNodeLocked(node, currTime);
+    return true;
+  });
+}
 
-    markHot(node);
-    unmarkCold(node);
-    unmarkTail(node);
-    lru_.getList(LruType::Hot).linkAtHead(node);
-    rebalance();
+// adds the node to the list, assuming it is not yet in the
+// container and that the container lock is held
+template <typename T, MM2Q::Hook<T> T::*HookPtr>
+void MM2Q::Container<T, HookPtr>::addNodeLocked(T& node, const Time& currTime) {
+  XDCHECK(!node.isInMMContainer());
+  markHot(node);
+  unmarkCold(node);
+  unmarkTail(node);
+  lru_.getList(LruType::Hot).linkAtHead(node);
+  rebalance();
 
-    node.markInMMContainer();
-    setUpdateTime(node, currTime);
-    return true;
+  node.markInMMContainer();
+  setUpdateTime(node, currTime);
+}
+
+template <typename T, MM2Q::Hook<T> T::*HookPtr>
+template <typename It>
+uint32_t MM2Q::Container<T, HookPtr>::addBatch(It begin, It end) noexcept {
+  const auto currTime = static_cast