From c0c90d243b28a3081ab1ad4bf21798b4c97ce9a9 Mon Sep 17 00:00:00 2001
From: "Chorazewicz, Igor"
Date: Tue, 2 Nov 2021 16:00:53 +0100
Subject: [PATCH 01/40] Run centos and debian workflows on push and PR

Run tests on CI
Run long tests (navy/bench) every day on CI
Run CI on prebuilt docker image
Run only centos build on CI
Update docker file used in CI
CentOS 8 is EOL
Disable failing clang-format-check
Add extra param to build-package.sh
Add scripts for rebuilding/pushing docker images
Taken from: https://github.com/pmem/dev-utils-kit/commit/30794c3e1bbc9273e87da3e8f3ce7e5a2792b19e
Extend CI to rebuild docker automatically
Update build-cachelib-docker.yml
Do not use shallow clone to make sure Docker rebuild logic works correctly.
Added required packages to install Intel ittapi
Update CI to use intel/CacheLib repo (#17)
Add multi-tier navy benchmark and run it on CI
- fix navy multi-tier config for NUMA bindings
Added code coverage support in CacheLib
Adding libdml to CentOS docker image (#53)
Only exclude the allocator-test-NavySetupTest and shm-test-test_page_size tests
Added perf and numactl to docker packages
---------------------------------------------
One large commit for all CI and code coverage;
see above for the change history.
---
 .../workflows/build-cachelib-centos-long.yml |  39 ++++++
 .github/workflows/build-cachelib-debian.yml  |  43 ++++++
 .github/workflows/build-cachelib-docker.yml  |  49 +++++++
 cachelib/CMakeLists.txt                      |   5 +
 .../consistency/navy-multi-tier.json         |  54 ++++++++
 .../test_configs/consistency/navy.json       |   4 +-
 contrib/build-package.sh                     |   8 +-
 docker/build.sh                              |  97 ++++++++++++++
 docker/images/build-image.sh                 |  38 ++++++
 docker/images/centos-8streams.Dockerfile     |  24 ++++
 docker/images/install-cachelib-deps.sh       |  14 ++
 docker/images/install-dsa-deps.sh            |  23 ++++
 docker/images/push-image.sh                  |  49 +++++++
 docker/pull-or-rebuild-image.sh              | 124 ++++++++++++++++++
 docker/run-build.sh                          |  17 +++
 docker/set-ci-vars.sh                        | 111 ++++++++++++++++
 run_code_coverage.sh                         |  20 +++
 run_tests.sh                                 |  14 ++
 18 files changed, 727 insertions(+), 6 deletions(-)
 create mode 100644 .github/workflows/build-cachelib-centos-long.yml
 create mode 100644 .github/workflows/build-cachelib-debian.yml
 create mode 100644 .github/workflows/build-cachelib-docker.yml
 create mode 100644 cachelib/cachebench/test_configs/consistency/navy-multi-tier.json
 create mode 100755 docker/build.sh
 create mode 100755 docker/images/build-image.sh
 create mode 100644 docker/images/centos-8streams.Dockerfile
 create mode 100755 docker/images/install-cachelib-deps.sh
 create mode 100755 docker/images/install-dsa-deps.sh
 create mode 100755 docker/images/push-image.sh
 create mode 100755 docker/pull-or-rebuild-image.sh
 create mode 100755 docker/run-build.sh
 create mode 100755 docker/set-ci-vars.sh
 create mode 100755 run_code_coverage.sh
 create mode 100755 run_tests.sh

diff --git a/.github/workflows/build-cachelib-centos-long.yml b/.github/workflows/build-cachelib-centos-long.yml
new file mode 100644
index 0000000000..92165f603b
--- /dev/null
+++ b/.github/workflows/build-cachelib-centos-long.yml
@@ -0,0 +1,39 @@
+name: build-cachelib-centos-latest
+on:
+  schedule:
+    - cron: '0 7 * * *'
+
+jobs:
+  build-cachelib-centos8-latest:
+    name: "CentOS/latest - Build CacheLib with all dependencies"
+    runs-on: ubuntu-latest
+    # Docker container image name
+    container: "centos:latest"
+    steps:
+      - name: "update packages"
+        run: dnf upgrade -y
+      - name: "install sudo,git"
+        run: dnf install -y sudo git cmake gcc
+      - name: "System Information"
+        run: |
+          echo === uname ===
+          uname -a
+          echo === /etc/os-release ===
+          cat /etc/os-release
+          echo === df -hl ===
+          df -hl
+          echo === free -h ===
+          free -h
+          echo === top ===
+          top -b -n1 -1 -Eg || timeout 1 top -b -n1
+          echo === env ===
+          env
+          echo === gcc -v ===
+          gcc -v
+      - name: "checkout sources"
+        uses: actions/checkout@v2
+      - name: "build CacheLib using build script"
+        run: ./contrib/build.sh -j -v -T
+      - name: "run tests"
+        timeout-minutes: 60
+        run: cd opt/cachelib/tests && ../../../run_tests.sh long
diff --git a/.github/workflows/build-cachelib-debian.yml b/.github/workflows/build-cachelib-debian.yml
new file mode 100644
index 0000000000..5bc3ad3c70
--- /dev/null
+++ b/.github/workflows/build-cachelib-debian.yml
@@ -0,0 +1,43 @@
+name: build-cachelib-debian-10
+on:
+  schedule:
+    - cron: '30 5 * * 0,3'
+
+jobs:
+  build-cachelib-debian-10:
+    name: "Debian/Buster - Build CacheLib with all dependencies"
+    runs-on: ubuntu-latest
+    # Docker container image name
+    container: "debian:buster-slim"
+    steps:
+      - name: "update packages"
+        run: apt-get update
+      - name: "upgrade packages"
+        run: apt-get -y upgrade
+      - name: "install sudo,git"
+        run: apt-get install -y sudo git procps
+      - name: "System Information"
+        run: |
+          echo === uname ===
+          uname -a
+          echo === /etc/os-release ===
+          cat /etc/os-release
+          echo === df -hl ===
+          df -hl
+          echo === free -h ===
+          free -h
+          echo === top ===
+          top -b -n1 -1 -Eg || timeout 1 top -b -n1 ; true
+          echo === env ===
+          env
+          echo === cc -v ===
+          cc -v || true
+          echo === g++ -v ===
+          g++ -v || true
+      - name: "checkout sources"
+        uses: actions/checkout@v2
+      - name: "build CacheLib using build script"
+        run: ./contrib/build.sh -j -v -T
+      - name: "run tests"
+        timeout-minutes: 60
+        run: cd opt/cachelib/tests && ../../../run_tests.sh
diff --git a/.github/workflows/build-cachelib-docker.yml b/.github/workflows/build-cachelib-docker.yml
new file mode 100644
index 0000000000..be28bc233c
--- /dev/null
+++ b/.github/workflows/build-cachelib-docker.yml
@@ -0,0 +1,49 @@
+name: build-cachelib-docker
+on:
+  push:
+  pull_request:
+
+jobs:
+  build-cachelib-docker:
+    name: "CentOS/latest - Build CacheLib with all dependencies"
+    runs-on: ubuntu-latest
+    env:
+      REPO: cachelib
+      GITHUB_REPO: intel/CacheLib
+      CONTAINER_REG: ghcr.io/pmem/cachelib
+      CONTAINER_REG_USER: ${{ secrets.GH_CR_USER }}
+      CONTAINER_REG_PASS: ${{ secrets.GH_CR_PAT }}
+      FORCE_IMAGE_ACTION: ${{ secrets.FORCE_IMAGE_ACTION }}
+      HOST_WORKDIR: ${{ github.workspace }}
+      WORKDIR: docker
+      IMG_VER: devel
+    strategy:
+      matrix:
+        CONFIG: ["OS=centos OS_VER=8streams PUSH_IMAGE=1"]
+    steps:
+      - name: "System Information"
+        run: |
+          echo === uname ===
+          uname -a
+          echo === /etc/os-release ===
+          cat /etc/os-release
+          echo === df -hl ===
+          df -hl
+          echo === free -h ===
+          free -h
+          echo === top ===
+          top -b -n1 -1 -Eg || timeout 1 top -b -n1
+          echo === env ===
+          env
+          echo === gcc -v ===
+          gcc -v
+      - name: "checkout sources"
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+
+      - name: Pull the image or rebuild and push it
+        run: cd $WORKDIR && ${{ matrix.CONFIG }} ./pull-or-rebuild-image.sh $FORCE_IMAGE_ACTION
+
+      - name: Run the build
+        run: cd $WORKDIR && ${{ matrix.CONFIG }} ./build.sh
diff --git a/cachelib/CMakeLists.txt b/cachelib/CMakeLists.txt
index 506ba66bcf..32b2859e44 100644
--- a/cachelib/CMakeLists.txt
+++ b/cachelib/CMakeLists.txt
@@ -85,6 +85,11 @@ set(CMAKE_MODULE_PATH
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED True)
 
+if(COVERAGE_ENABLED)
+  # Add code coverage
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} 
--coverage -fprofile-arcs -ftest-coverage") +endif() + # include(fb_cxx_flags) message(STATUS "Update CXXFLAGS: ${CMAKE_CXX_FLAGS}") diff --git a/cachelib/cachebench/test_configs/consistency/navy-multi-tier.json b/cachelib/cachebench/test_configs/consistency/navy-multi-tier.json new file mode 100644 index 0000000000..076550bc5c --- /dev/null +++ b/cachelib/cachebench/test_configs/consistency/navy-multi-tier.json @@ -0,0 +1,54 @@ +{ + "cache_config" : { + "cacheSizeMB" : 300, + "poolRebalanceIntervalSec" : 1, + "moveOnSlabRelease" : true, + + "cacheDir": "/tmp/mem-tier2", + "memoryTiers" : [ + { + "ratio": 1, + "memBindNodes": 0 + }, + { + "ratio": 1, + "memBindNodes": 0 + } + ], + + "numPools" : 2, + "poolSizes" : [0.5, 0.5], + "allocFactor" : 2.0, + "nvmCacheSizeMB" : 1024 + }, + "test_config" : + { + + "checkConsistency" : true, + + "numOps" : 60000, + "numThreads" : 20, + "numKeys" : 200000, + + + "keySizeRange" : [1, 8, 64], + "keySizeRangeProbability" : [0.5, 0.5], + + "valSizeRange" : [256, 1024, 4096, 8192], + "valSizeRangeProbability" : [0.2, 0.7, 0.1], + + "chainedItemLengthRange" : [1, 2, 4, 32], + "chainedItemLengthRangeProbability" : [0.8, 0.18, 0.02], + + "chainedItemValSizeRange" : [1, 128, 256, 1024, 4096, 20480], + "chainedItemValSizeRangeProbability" : [0.1, 0.1, 0.2, 0.3, 0.3], + + "getRatio" : 0.8, + "setRatio" : 0.1, + "delRatio" : 0.0, + "addChainedRatio" : 0.05, + "keyPoolDistribution": [0.5, 0.5], + "opPoolDistribution" : [0.5, 0.5] + } + +} diff --git a/cachelib/cachebench/test_configs/consistency/navy.json b/cachelib/cachebench/test_configs/consistency/navy.json index 73b016a50f..b95b056d31 100644 --- a/cachelib/cachebench/test_configs/consistency/navy.json +++ b/cachelib/cachebench/test_configs/consistency/navy.json @@ -14,8 +14,8 @@ "checkConsistency" : true, - "numOps" : 30000000, - "numThreads" : 40, + "numOps" : 600000, + "numThreads" : 20, "numKeys" : 200000, diff --git a/contrib/build-package.sh b/contrib/build-package.sh index 406031bd40..1b646049f7 100755 --- a/contrib/build-package.sh +++ b/contrib/build-package.sh @@ -78,9 +78,8 @@ build_tests= show_help= many_jobs= verbose= -PREFIX="$PWD/opt/cachelib/" - -while getopts :BSdhijtvp: param +install_path= +while getopts :BSdhijtvI: param do case $param in i) install=yes ;; @@ -91,7 +90,7 @@ do v) verbose=yes ;; j) many_jobs=yes ;; t) build_tests=yes ;; - p) PREFIX=$OPTARG ;; + I) install_path=${OPTARG} ; install=yes ;; ?) die "unknown option. See -h for help." esac done @@ -288,6 +287,7 @@ test -d cachelib || die "expected 'cachelib' directory not found in $PWD" # After ensuring we are in the correct directory, set the installation prefix" +PREFIX=${install_path:-"$PWD/opt/cachelib/"} CMAKE_PARAMS="$CMAKE_PARAMS -DCMAKE_INSTALL_PREFIX=$PREFIX" CMAKE_PREFIX_PATH="$PREFIX/lib/cmake:$PREFIX/lib64/cmake:$PREFIX/lib:$PREFIX/lib64:$PREFIX:${CMAKE_PREFIX_PATH:-}" export CMAKE_PREFIX_PATH diff --git a/docker/build.sh b/docker/build.sh new file mode 100755 index 0000000000..bb82f0142d --- /dev/null +++ b/docker/build.sh @@ -0,0 +1,97 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2022, Intel Corporation + +# +# build.sh - runs a Docker container from a Docker image with environment +# prepared for running CacheLib builds and tests. It uses Docker image +# tagged as described in ./images/build-image.sh. 
+# +# Notes: +# - set env var 'HOST_WORKDIR' to where the root of this project is on the host machine, +# - set env var 'OS' and 'OS_VER' properly to a system/Docker you want to build this +# repo on (for proper values take a look at the list of Dockerfiles at the +# utils/docker/images directory in this repo), e.g. OS=ubuntu, OS_VER=20.04, +# - set env var 'CONTAINER_REG' to container registry address +# [and possibly user/org name, and package name], e.g. "/pmem/CacheLib", +# - set env var 'DNS_SERVER' if you use one, +# - set env var 'COMMAND' to execute specific command within Docker container or +# env var 'TYPE' to pick command based on one of the predefined types of build (see below). +# + +set -e + +source $(dirname ${0})/set-ci-vars.sh +IMG_VER=${IMG_VER:-devel} +TAG="${OS}-${OS_VER}-${IMG_VER}" +IMAGE_NAME=${CONTAINER_REG}:${TAG} +CONTAINER_NAME=CacheLib-${OS}-${OS_VER} +WORKDIR=/CacheLib # working dir within Docker container +SCRIPTSDIR=${WORKDIR}/docker + +if [[ -z "${OS}" || -z "${OS_VER}" ]]; then + echo "ERROR: The variables OS and OS_VER have to be set " \ + "(e.g. OS=fedora, OS_VER=32)." + exit 1 +fi + +if [[ -z "${HOST_WORKDIR}" ]]; then + echo "ERROR: The variable HOST_WORKDIR has to contain a path to " \ + "the root of this project on the host machine." + exit 1 +fi + +if [[ -z "${CONTAINER_REG}" ]]; then + echo "ERROR: CONTAINER_REG environment variable is not set " \ + "(e.g. \"//\")." + exit 1 +fi + +# Set command to execute in the Docker container +COMMAND="./run-build.sh"; +echo "COMMAND to execute within Docker container: ${COMMAND}" + +if [ -n "${DNS_SERVER}" ]; then DOCKER_OPTS="${DOCKER_OPTS} --dns=${DNS_SERVER}"; fi + +# Check if we are running on a CI (Travis or GitHub Actions) +[ -n "${GITHUB_ACTIONS}" -o -n "${TRAVIS}" ] && CI_RUN="YES" || CI_RUN="NO" + +# Do not allocate a pseudo-TTY if we are running on GitHub Actions +[ ! 
"${GITHUB_ACTIONS}" ] && DOCKER_OPTS="${DOCKER_OPTS} --tty=true" + + +echo "Running build using Docker image: ${IMAGE_NAME}" + +# Run a container with +# - environment variables set (--env) +# - host directory containing source mounted (-v) +# - working directory set (-w) +docker run --privileged=true --name=${CONTAINER_NAME} -i \ + ${DOCKER_OPTS} \ + --env http_proxy=${http_proxy} \ + --env https_proxy=${https_proxy} \ + --env TERM=xterm-256color \ + --env WORKDIR=${WORKDIR} \ + --env SCRIPTSDIR=${SCRIPTSDIR} \ + --env GITHUB_REPO=${GITHUB_REPO} \ + --env CI_RUN=${CI_RUN} \ + --env TRAVIS=${TRAVIS} \ + --env GITHUB_ACTIONS=${GITHUB_ACTIONS} \ + --env CI_COMMIT=${CI_COMMIT} \ + --env CI_COMMIT_RANGE=${CI_COMMIT_RANGE} \ + --env CI_BRANCH=${CI_BRANCH} \ + --env CI_EVENT_TYPE=${CI_EVENT_TYPE} \ + --env CI_REPO_SLUG=${CI_REPO_SLUG} \ + --env DOC_UPDATE_GITHUB_TOKEN=${DOC_UPDATE_GITHUB_TOKEN} \ + --env DOC_UPDATE_BOT_NAME=${DOC_UPDATE_BOT_NAME} \ + --env DOC_REPO_OWNER=${DOC_REPO_OWNER} \ + --env COVERITY_SCAN_TOKEN=${COVERITY_SCAN_TOKEN} \ + --env COVERITY_SCAN_NOTIFICATION_EMAIL=${COVERITY_SCAN_NOTIFICATION_EMAIL} \ + --env TEST_TIMEOUT=${TEST_TIMEOUT} \ + --env TZ='Europe/Warsaw' \ + --shm-size=4G \ + -v ${HOST_WORKDIR}:${WORKDIR} \ + -v /etc/localtime:/etc/localtime \ + -w ${SCRIPTSDIR} \ + ${IMAGE_NAME} ${COMMAND} + diff --git a/docker/images/build-image.sh b/docker/images/build-image.sh new file mode 100755 index 0000000000..985a6e0ff1 --- /dev/null +++ b/docker/images/build-image.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2016-2021, Intel Corporation +# +# build-image.sh - prepares a Docker image with -based environment for +# testing (or dev) purpose, tagged with ${CONTAINER_REG}:${OS}-${OS_VER}-${IMG_VER}, +# according to the ${OS}-${OS_VER}.Dockerfile file located in the same directory. +# IMG_VER is a version of Docker image (it usually relates to project's release tag) +# and it defaults to "devel". +# + +set -e +IMG_VER=${IMG_VER:-devel} +TAG="${OS}-${OS_VER}-${IMG_VER}" + +if [[ -z "${OS}" || -z "${OS_VER}" ]]; then + echo "ERROR: The variables OS and OS_VER have to be set " \ + "(e.g. OS=fedora, OS_VER=34)." + exit 1 +fi + +if [[ -z "${CONTAINER_REG}" ]]; then + echo "ERROR: CONTAINER_REG environment variable is not set " \ + "(e.g. \"//\")." + exit 1 +fi + +echo "Check if the file ${OS}-${OS_VER}.Dockerfile exists" +if [[ ! -f "${OS}-${OS_VER}.Dockerfile" ]]; then + echo "Error: ${OS}-${OS_VER}.Dockerfile does not exist." + exit 1 +fi + +echo "Build a Docker image tagged with: ${CONTAINER_REG}:${TAG}" +docker build -t ${CONTAINER_REG}:${TAG} \ + --build-arg http_proxy=$http_proxy \ + --build-arg https_proxy=$https_proxy \ + -f ${OS}-${OS_VER}.Dockerfile . 
diff --git a/docker/images/centos-8streams.Dockerfile b/docker/images/centos-8streams.Dockerfile new file mode 100644 index 0000000000..29752c5d98 --- /dev/null +++ b/docker/images/centos-8streams.Dockerfile @@ -0,0 +1,24 @@ +FROM quay.io/centos/centos:stream8 + +RUN dnf install -y \ +cmake \ +sudo \ +git \ +tzdata \ +vim \ +gdb \ +clang \ +python36 \ +glibc-devel.i686 \ +xmlto \ +uuid \ +libuuid-devel \ +json-c-devel \ +perf \ +numactl + +COPY ./install-cachelib-deps.sh ./install-cachelib-deps.sh +RUN ./install-cachelib-deps.sh + +COPY ./install-dsa-deps.sh ./install-dsa-deps.sh +RUN ./install-dsa-deps.sh diff --git a/docker/images/install-cachelib-deps.sh b/docker/images/install-cachelib-deps.sh new file mode 100755 index 0000000000..6d8fbdef7b --- /dev/null +++ b/docker/images/install-cachelib-deps.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2022, Intel Corporation + +git clone -b develop https://github.com/intel/CacheLib CacheLib + +./CacheLib/contrib/prerequisites-centos8.sh + +for pkg in zstd googleflags googlelog googletest sparsemap fmt folly fizz wangle fbthrift ; +do + sudo ./CacheLib/contrib/build-package.sh -j -I /opt/ "$pkg" +done + +rm -rf CacheLib diff --git a/docker/images/install-dsa-deps.sh b/docker/images/install-dsa-deps.sh new file mode 100755 index 0000000000..b4c62ecc93 --- /dev/null +++ b/docker/images/install-dsa-deps.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +# Copyright 2023, Intel Corporation + +# Install idxd-config +git clone https://github.com/intel/idxd-config.git +cd idxd-config +./autogen.sh +./configure CFLAGS='-g -O2' --prefix=/usr --sysconfdir=/etc --libdir=/usr/lib64 +make +make check +sudo make install +cd ../ +rm -rf idxd-config + +# Install DML Library +git clone --recursive https://github.com/intel/DML.git +cd DML +mkdir build +cd build +cmake -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_BUILD_TYPE=RelWithDebInfo .. +cmake --build . --target install +cd ../../ +rm -rf DML diff --git a/docker/images/push-image.sh b/docker/images/push-image.sh new file mode 100755 index 0000000000..8f516b4205 --- /dev/null +++ b/docker/images/push-image.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2016-2021, Intel Corporation + +# +# push-image.sh - pushes the Docker image tagged as described in +# ./build-image.sh, to the ${CONTAINER_REG}. +# +# The script utilizes ${CONTAINER_REG_USER} and ${CONTAINER_REG_PASS} variables to +# log in to the ${CONTAINER_REG}. The variables can be set in the CI's configuration +# for automated builds. +# + +set -e +IMG_VER=${IMG_VER:-devel} +TAG="${OS}-${OS_VER}-${IMG_VER}" + +if [[ -z "${OS}" || -z "${OS_VER}" ]]; then + echo "ERROR: The variables OS and OS_VER have to be set " \ + "(e.g. OS=fedora, OS_VER=34)." + exit 1 +fi + +if [[ -z "${CONTAINER_REG}" ]]; then + echo "ERROR: CONTAINER_REG environment variable is not set " \ + "(e.g. \"//\")." + exit 1 +fi + +if [[ -z "${CONTAINER_REG_USER}" || -z "${CONTAINER_REG_PASS}" ]]; then + echo "ERROR: variables CONTAINER_REG_USER=\"${CONTAINER_REG_USER}\" and " \ + "CONTAINER_REG_PASS=\"${CONTAINER_REG_PASS}\"" \ + "have to be set properly to allow login to the Container Registry." + exit 1 +fi + +# Check if the image tagged with ${CONTAINER_REG}:${TAG} exists locally +if [[ ! $(docker images -a | awk -v pattern="^${CONTAINER_REG}:${TAG}\$" \ + '$1":"$2 ~ pattern') ]] +then + echo "ERROR: Docker image tagged ${CONTAINER_REG}:${TAG} does not exist locally." 
+	exit 1
+fi
+
+echo "Log in to the Container Registry: ${CONTAINER_REG}"
+echo "${CONTAINER_REG_PASS}" | docker login ghcr.io -u="${CONTAINER_REG_USER}" --password-stdin
+
+echo "Push the image to the Container Registry"
+docker push ${CONTAINER_REG}:${TAG}
diff --git a/docker/pull-or-rebuild-image.sh b/docker/pull-or-rebuild-image.sh
new file mode 100755
index 0000000000..dcdcb40e8c
--- /dev/null
+++ b/docker/pull-or-rebuild-image.sh
@@ -0,0 +1,124 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright 2016-2021, Intel Corporation

+#
+# pull-or-rebuild-image.sh - rebuilds the Docker image used in the
+# current build (if necessary) or pulls it from the Container Registry.
+# Docker image is tagged as described in docker/build-image.sh,
+# but IMG_VER defaults in this script to "latest" (just in case it's
+# used locally without building any images).
+#
+# If the Docker image was rebuilt and all requirements are fulfilled (more
+# details in the push_image function below), the image will be pushed to
+# the ${CONTAINER_REG}.
+#
+# The script rebuilds the Docker image if:
+# 1. the Dockerfile for the current OS version (${OS}-${OS_VER}.Dockerfile)
+#    or any .sh script in the Dockerfiles directory were modified and committed, or
+# 2. "rebuild" param was passed as a first argument to this script.
+#
+# The script pulls the Docker image if:
+# 1. it does not have to be rebuilt (based on committed changes), or
+# 2. "pull" param was passed as a first argument to this script.
+#

+set -e

+source $(dirname ${0})/set-ci-vars.sh
+IMG_VER=${IMG_VER:-latest}
+TAG="${OS}-${OS_VER}-${IMG_VER}"
+IMAGES_DIR_NAME=images
+BASE_DIR=docker/${IMAGES_DIR_NAME}

+if [[ -z "${OS}" || -z "${OS_VER}" ]]; then
+	echo "ERROR: The variables OS and OS_VER have to be set properly " \
+		"(e.g. OS=fedora, OS_VER=34)."
+	exit 1
+fi

+if [[ -z "${CONTAINER_REG}" ]]; then
+	echo "ERROR: CONTAINER_REG environment variable is not set " \
+		"(e.g. \"//\")."
+	exit 1
+fi

+function build_image() {
+	echo "Building the Docker image for the ${OS}-${OS_VER}.Dockerfile"
+	pushd ${IMAGES_DIR_NAME}
+	./build-image.sh
+	popd
+}

+function pull_image() {
+	echo "Pull the image '${CONTAINER_REG}:${TAG}' from the Container Registry."
+	docker pull ${CONTAINER_REG}:${TAG}
+}

+function push_image {
+	# Check if the image has to be pushed to the Container Registry:
+	# - only upstream (not forked) repository,
+	# - develop or main branch,
+	# - not a pull_request event,
+	# - and PUSH_IMAGE flag was set for current build.
+	if [[ "${CI_REPO_SLUG}" == "${GITHUB_REPO}" \
+		&& (${CI_BRANCH} == develop || ${CI_BRANCH} == main) \
+		&& ${CI_EVENT_TYPE} != "pull_request" \
+		&& ${PUSH_IMAGE} == "1" ]]
+	then
+		echo "The image will be pushed to the Container Registry: ${CONTAINER_REG}"
+		pushd ${IMAGES_DIR_NAME}
+		./push-image.sh
+		popd
+	else
+		echo "Skip pushing the image to the Container Registry."
+	fi
+}

+# If "rebuild" or "pull" are passed to the script as param, force rebuild/pull.
+if [[ "${1}" == "rebuild" ]]; then
+	build_image
+	push_image
+	exit 0
+elif [[ "${1}" == "pull" ]]; then
+	pull_image
+	exit 0
+fi

+# Determine if we need to rebuild the image or just pull it from
+# the Container Registry, based on committed changes.
+if [ -n "${CI_COMMIT_RANGE}" ]; then
+	commits=$(git rev-list ${CI_COMMIT_RANGE})
+else
+	commits=${CI_COMMIT}
+fi

+if [[ -z "${commits}" ]]; then
+	echo "'commits' variable is empty. Docker image will be pulled."
+fi + +echo "Commits in the commit range:" +for commit in ${commits}; do echo ${commit}; done + +echo "Files modified within the commit range:" +files=$(for commit in ${commits}; do git diff-tree --no-commit-id --name-only \ + -r ${commit}; done | sort -u) +for file in ${files}; do echo ${file}; done + +# Check if committed file modifications require the Docker image to be rebuilt +for file in ${files}; do + # Check if modified files are relevant to the current build + if [[ ${file} =~ ^(${BASE_DIR})\/(${OS})-(${OS_VER})\.Dockerfile$ ]] \ + || [[ ${file} =~ ^(${BASE_DIR})\/.*\.sh$ ]] + then + build_image + push_image + exit 0 + fi +done + +# Getting here means rebuilding the Docker image isn't required (based on changed files). +# Pull the image from the Container Registry or rebuild anyway, if pull fails. +if ! pull_image; then + build_image + push_image +fi diff --git a/docker/run-build.sh b/docker/run-build.sh new file mode 100755 index 0000000000..02c7caf731 --- /dev/null +++ b/docker/run-build.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2022, Intel Corporation + +set -e + +function sudo_password() { + echo ${USERPASS} | sudo -Sk $* +} + +cd .. +mkdir build +cd build +cmake ../cachelib -DBUILD_TESTS=ON -DCMAKE_INSTALL_PREFIX=/opt -DCMAKE_BUILD_TYPE=Debug +sudo_password make install -j$(nproc) + +cd /opt/tests && $WORKDIR/run_tests.sh diff --git a/docker/set-ci-vars.sh b/docker/set-ci-vars.sh new file mode 100755 index 0000000000..f6f52132c8 --- /dev/null +++ b/docker/set-ci-vars.sh @@ -0,0 +1,111 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2020-2021, Intel Corporation + +# +# set-ci-vars.sh -- set CI variables common for both: +# Travis and GitHub Actions CIs +# + +set -e + +function get_commit_range_from_last_merge { + # get commit id of the last merge + LAST_MERGE=$(git log --merges --pretty=%H -1) + LAST_COMMIT=$(git log --pretty=%H -1) + RANGE_END="HEAD" + if [ -n "${GITHUB_ACTIONS}" ] && [ "${GITHUB_EVENT_NAME}" == "pull_request" ] && [ "${LAST_MERGE}" == "${LAST_COMMIT}" ]; then + # GitHub Actions commits its own merge in case of pull requests + # so the first merge commit has to be skipped. + + LAST_COMMIT=$(git log --pretty=%H -2 | tail -n1) + LAST_MERGE=$(git log --merges --pretty=%H -2 | tail -n1) + # If still the last commit is a merge commit it means we're manually + # merging changes (probably back from stable branch). We have to use + # left parent of the merge and the current commit for COMMIT_RANGE. + if [ "${LAST_MERGE}" == "${LAST_COMMIT}" ]; then + LAST_MERGE=$(git log --merges --pretty=%P -2 | tail -n1 | cut -d" " -f1) + RANGE_END=${LAST_COMMIT} + fi + elif [ "${LAST_MERGE}" == "${LAST_COMMIT}" ] && + ([ "${TRAVIS_EVENT_TYPE}" == "push" ] || [ "${GITHUB_EVENT_NAME}" == "push" ]); then + # Other case in which last commit equals last merge, is when committing + # a manual merge. Push events don't set proper COMMIT_RANGE. + # It has to be then set: from merge's left parent to the current commit. + LAST_MERGE=$(git log --merges --pretty=%P -1 | cut -d" " -f1) + fi + if [ "${LAST_MERGE}" == "" ]; then + # possible in case of shallow clones + # or new repos with no merge commits yet + # - pick up the first commit + LAST_MERGE=$(git log --pretty=%H | tail -n1) + fi + COMMIT_RANGE="${LAST_MERGE}..${RANGE_END}" + # make sure it works now + if ! 
git rev-list ${COMMIT_RANGE} >/dev/null; then + COMMIT_RANGE="" + fi + echo ${COMMIT_RANGE} +} + +COMMIT_RANGE_FROM_LAST_MERGE=$(get_commit_range_from_last_merge) + +if [ -n "${TRAVIS}" ]; then + CI_COMMIT=${TRAVIS_COMMIT} + CI_COMMIT_RANGE="${TRAVIS_COMMIT_RANGE/.../..}" + CI_BRANCH=${TRAVIS_BRANCH} + CI_EVENT_TYPE=${TRAVIS_EVENT_TYPE} + CI_REPO_SLUG=${TRAVIS_REPO_SLUG} + + # CI_COMMIT_RANGE is usually invalid for force pushes - fix it when used + # with non-upstream repository + if [ -n "${CI_COMMIT_RANGE}" -a "${CI_REPO_SLUG}" != "${GITHUB_REPO}" ]; then + if ! git rev-list ${CI_COMMIT_RANGE}; then + CI_COMMIT_RANGE=${COMMIT_RANGE_FROM_LAST_MERGE} + fi + fi + + case "${TRAVIS_CPU_ARCH}" in + "amd64") + CI_CPU_ARCH="x86_64" + ;; + *) + CI_CPU_ARCH=${TRAVIS_CPU_ARCH} + ;; + esac + +elif [ -n "${GITHUB_ACTIONS}" ]; then + CI_COMMIT=${GITHUB_SHA} + CI_COMMIT_RANGE=${COMMIT_RANGE_FROM_LAST_MERGE} + CI_BRANCH=$(echo ${GITHUB_REF} | cut -d'/' -f3) + CI_REPO_SLUG=${GITHUB_REPOSITORY} + CI_CPU_ARCH="x86_64" # GitHub Actions supports only x86_64 + + case "${GITHUB_EVENT_NAME}" in + "schedule") + CI_EVENT_TYPE="cron" + ;; + *) + CI_EVENT_TYPE=${GITHUB_EVENT_NAME} + ;; + esac + +else + CI_COMMIT=$(git log --pretty=%H -1) + CI_COMMIT_RANGE=${COMMIT_RANGE_FROM_LAST_MERGE} + CI_CPU_ARCH="x86_64" +fi + +export CI_COMMIT=${CI_COMMIT} +export CI_COMMIT_RANGE=${CI_COMMIT_RANGE} +export CI_BRANCH=${CI_BRANCH} +export CI_EVENT_TYPE=${CI_EVENT_TYPE} +export CI_REPO_SLUG=${CI_REPO_SLUG} +export CI_CPU_ARCH=${CI_CPU_ARCH} + +echo CI_COMMIT=${CI_COMMIT} +echo CI_COMMIT_RANGE=${CI_COMMIT_RANGE} +echo CI_BRANCH=${CI_BRANCH} +echo CI_EVENT_TYPE=${CI_EVENT_TYPE} +echo CI_REPO_SLUG=${CI_REPO_SLUG} +echo CI_CPU_ARCH=${CI_CPU_ARCH} diff --git a/run_code_coverage.sh b/run_code_coverage.sh new file mode 100755 index 0000000000..7722e262bf --- /dev/null +++ b/run_code_coverage.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +#Build CacheLib with flag -DCOVERAGE_ENABLED=ON + +# Track coverage +lcov -c -i -b . -d . -o Coverage.baseline +./run_tests.sh +lcov -c -d . -b . -o Coverage.out +lcov -a Coverage.baseline -a Coverage.out -o Coverage.combined + +# Generate report +COVERAGE_DIR='coverage_report' +genhtml Coverage.combined -o ${COVERAGE_DIR} +COVERAGE_REPORT="${COVERAGE_DIR}.tgz" +tar -zcvf ${COVERAGE_REPORT} ${COVERAGE_DIR} +echo "Created coverage report ${COVERAGE_REPORT}" + +# Cleanup +rm Coverage.baseline Coverage.out Coverage.combined +rm -rf ${COVERAGE_DIR} diff --git a/run_tests.sh b/run_tests.sh new file mode 100755 index 0000000000..111e218333 --- /dev/null +++ b/run_tests.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# Newline separated list of tests to ignore +BLACKLIST="allocator-test-NavySetupTest +shm-test-test_page_size" + +if [ "$1" == "long" ]; then + find -type f -executable | grep -vF "$BLACKLIST" | xargs -n1 bash -c +else + find -type f \( -not -name "*bench*" -and -not -name "navy*" \) -executable | grep -vF "$BLACKLIST" | xargs -n1 bash -c +fi + +../bin/cachebench --json_test_config ../test_configs/consistency/navy.json +../bin/cachebench --json_test_config ../test_configs/consistency/navy-multi-tier.json From dbe3fda70f27621cf7c96e8694226f0ae07f28d3 Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Fri, 3 Feb 2023 16:02:50 -0800 Subject: [PATCH 02/40] Adds createPutToken and switches findEviction to utilize combined locking. 
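
With combined locking, callers no longer take an eviction iterator from the
MMContainer directly; they hand the container a callback that runs while the
container holds its own lock. A minimal sketch of the new calling convention,
following the withEvictionIterator() usage this patch introduces (the expiry
predicate is only an illustration):

    Item* candidate = nullptr;
    container.withEvictionIterator([&candidate](auto&& itr) {
      while (itr) {
        if (itr->isExpired()) { // any caller-chosen predicate
          candidate = itr.get();
          return;
        }
        ++itr;
      }
    });

The lock is released as soon as the callback returns, so the iterator can
never outlive the critical section that protects it.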
--- cachelib/allocator/CacheAllocator.h | 38 +++++++++----------- cachelib/allocator/MM2Q.h | 1 + cachelib/allocator/tests/BaseAllocatorTest.h | 30 ++++++++++------ 3 files changed, 36 insertions(+), 33 deletions(-) diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index 3b0d9eeaef..15ad98be7c 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -1481,11 +1481,11 @@ class CacheAllocator : public CacheBase { // Given an existing item, allocate a new one for the // existing one to later be moved into. // - // @param oldItem the item we want to allocate a new item for + // @param item reference to the item we want to allocate a new item for // // @return handle to the newly allocated item // - WriteHandle allocateNewItemForOldItem(const Item& oldItem); + WriteHandle allocateNewItemForOldItem(const Item& item); // internal helper that grabs a refcounted handle to the item. This does // not record the access to reflect in the mmContainer. @@ -1544,7 +1544,7 @@ class CacheAllocator : public CacheBase { // callback is responsible for copying the contents and fixing the semantics // of chained item. // - // @param oldItem Reference to the item being moved + // @param oldItem item being moved // @param newItemHdl Reference to the handle of the new item being moved into // // @return true If the move was completed, and the containers were updated @@ -1980,18 +1980,14 @@ class CacheAllocator : public CacheBase { std::optional saveNvmCache(); void saveRamCache(); - static bool itemExclusivePredicate(const Item& item) { - return item.getRefCount() == 0; + static bool itemSlabMovePredicate(const Item& item) { + return item.isMoving() && item.getRefCount() == 0; } static bool itemExpiryPredicate(const Item& item) { return item.getRefCount() == 1 && item.isExpired(); } - static bool parentEvictForSlabReleasePredicate(const Item& item) { - return item.getRefCount() == 1 && !item.isMoving(); - } - std::unique_ptr createDeserializer(); // Execute func on each item. `func` can throw exception but must ensure @@ -3663,12 +3659,9 @@ CacheAllocator::getNextCandidate(PoolId pid, ? &toRecycle_->asChainedItem().getParentItem(compressor_) : toRecycle_; - const bool evictToNvmCache = shouldWriteToNvmCache(*candidate_); - auto putToken = evictToNvmCache - ? 
nvmCache_->createPutToken(candidate_->getKey()) - : typename NvmCacheT::PutToken{}; + auto putToken = createPutToken(*candidate_); - if (evictToNvmCache && !putToken.isValid()) { + if (shouldWriteToNvmCache(*candidate_) && !putToken.isValid()) { stats_.evictFailConcurrentFill.inc(); ++itr; continue; @@ -4291,13 +4284,13 @@ std::vector CacheAllocator::dumpEvictionIterator( std::vector content; auto& mm = *mmContainers_[pid][cid]; - auto evictItr = mm.getEvictionIterator(); - size_t i = 0; - while (evictItr && i < numItems) { - content.push_back(evictItr->toString()); - ++evictItr; - ++i; - } + + mm.withEvictionIterator([&content, numItems](auto&& itr) { + while (itr && content.size() < numItems) { + content.push_back(itr->toString()); + ++itr; + } + }); return content; } @@ -4938,6 +4931,7 @@ bool CacheAllocator::moveForSlabRelease(Item& oldItem) { template typename CacheAllocator::WriteHandle CacheAllocator::allocateNewItemForOldItem(const Item& oldItem) { + XDCHECK(oldItem.isMoving()); if (oldItem.isChainedItem()) { const Item& parentItem = oldItem.asChainedItem().getParentItem(compressor_); @@ -4951,7 +4945,7 @@ CacheAllocator::allocateNewItemForOldItem(const Item& oldItem) { XDCHECK_EQ(newItemHdl->getSize(), oldChainedItem.getSize()); XDCHECK_EQ(reinterpret_cast(&parentItem), reinterpret_cast( - &oldChainedItem.getParentItem(compressor_))); + &newItemHdl->asChainedItem().getParentItem(compressor_))); return newItemHdl; } diff --git a/cachelib/allocator/MM2Q.h b/cachelib/allocator/MM2Q.h index 316229d3bb..f0a41b4851 100644 --- a/cachelib/allocator/MM2Q.h +++ b/cachelib/allocator/MM2Q.h @@ -66,6 +66,7 @@ class MM2Q { enum LruType { Warm, WarmTail, Hot, Cold, ColdTail, NumTypes }; // Config class for MM2Q + // TODO: implement support for useCombinedLockForIterators struct Config { // Create from serialized config explicit Config(SerializationConfigType configState) diff --git a/cachelib/allocator/tests/BaseAllocatorTest.h b/cachelib/allocator/tests/BaseAllocatorTest.h index e0c988832e..c8ee44ac0c 100644 --- a/cachelib/allocator/tests/BaseAllocatorTest.h +++ b/cachelib/allocator/tests/BaseAllocatorTest.h @@ -4182,15 +4182,16 @@ class BaseAllocatorTest : public AllocatorTest { // Check that item is in the expected container. 
bool findItem(AllocatorT& allocator, typename AllocatorT::Item* item) { auto& container = allocator.getMMContainer(*item); - auto itr = container.getEvictionIterator(); bool found = false; - while (itr) { - if (itr.get() == item) { - found = true; - break; + container.withEvictionIterator([&found, &item](auto&& itr) { + while (itr) { + if (itr.get() == item) { + found = true; + break; + } + ++itr; } - ++itr; - } + }); return found; } @@ -5482,8 +5483,12 @@ class BaseAllocatorTest : public AllocatorTest { ASSERT_TRUE(big->isInMMContainer()); auto& mmContainer = alloc.getMMContainer(*big); - auto itr = mmContainer.getEvictionIterator(); - ASSERT_EQ(big.get(), &(*itr)); + + typename AllocatorT::Item* evictionCandidate = nullptr; + mmContainer.withEvictionIterator( + [&evictionCandidate](auto&& itr) { evictionCandidate = itr.get(); }); + + ASSERT_EQ(big.get(), evictionCandidate); alloc.remove("hello"); } @@ -5497,8 +5502,11 @@ class BaseAllocatorTest : public AllocatorTest { ASSERT_TRUE(small2->isInMMContainer()); auto& mmContainer = alloc.getMMContainer(*small2); - auto itr = mmContainer.getEvictionIterator(); - ASSERT_EQ(small2.get(), &(*itr)); + + typename AllocatorT::Item* evictionCandidate = nullptr; + mmContainer.withEvictionIterator( + [&evictionCandidate](auto&& itr) { evictionCandidate = itr.get(); }); + ASSERT_EQ(small2.get(), evictionCandidate); alloc.remove("hello"); } From 9afcd64ff3168923e6036e57a1b22c83ddc6e762 Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Wed, 6 Jul 2022 10:15:17 +0000 Subject: [PATCH 03/40] Add memory usage statistics for allocation classes This includes printing: - allocSize - allocated memory size - memory usage fraction --- cachelib/allocator/Cache.h | 6 ++++++ cachelib/allocator/CacheAllocator.h | 11 +++++++++++ cachelib/allocator/memory/MemoryAllocatorStats.h | 11 +++++++++++ cachelib/allocator/tests/CacheBaseTest.cpp | 1 + cachelib/cachebench/cache/Cache.h | 4 ++++ cachelib/cachebench/cache/CacheStats.h | 14 ++++---------- 6 files changed, 37 insertions(+), 10 deletions(-) diff --git a/cachelib/allocator/Cache.h b/cachelib/allocator/Cache.h index e225ba8a01..082db65f7a 100644 --- a/cachelib/allocator/Cache.h +++ b/cachelib/allocator/Cache.h @@ -102,6 +102,12 @@ class CacheBase { // @param poolId the pool id virtual PoolStats getPoolStats(PoolId poolId) const = 0; + // Get Allocation Class specific stats. 
+  //
+  // @param poolId the pool id
+  // @param classId the class id
+  virtual ACStats getACStats(PoolId poolId, ClassId classId) const = 0;
+
   // @param poolId the pool id
   virtual AllSlabReleaseEvents getAllSlabReleaseEvents(PoolId poolId) const = 0;
 
diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h
index 15ad98be7c..36b789bcde 100644
--- a/cachelib/allocator/CacheAllocator.h
+++ b/cachelib/allocator/CacheAllocator.h
@@ -1213,6 +1213,9 @@ class CacheAllocator : public CacheBase {
   // return cache's memory usage stats
   CacheMemoryStats getCacheMemoryStats() const override final;
 
+  // return stats for Allocation Class
+  ACStats getACStats(PoolId pid, ClassId cid) const override final;
+
   // return the nvm cache stats map
   util::StatsMap getNvmCacheStatsMap() const override final;
 
@@ -4687,6 +4690,14 @@ PoolStats CacheAllocator<CacheTrait>::getPoolStats(PoolId poolId) const {
   return ret;
 }
 
+template <typename CacheTrait>
+ACStats CacheAllocator<CacheTrait>::getACStats(PoolId poolId,
+                                               ClassId classId) const {
+  const auto& pool = allocator_->getPool(poolId);
+  const auto& ac = pool.getAllocationClass(classId);
+  return ac.getStats();
+}
+
 template <typename CacheTrait>
 PoolEvictionAgeStats CacheAllocator<CacheTrait>::getPoolEvictionAgeStats(
     PoolId pid, unsigned int slabProjectionLength) const {
diff --git a/cachelib/allocator/memory/MemoryAllocatorStats.h b/cachelib/allocator/memory/MemoryAllocatorStats.h
index b019b254c5..7ee4ca9916 100644
--- a/cachelib/allocator/memory/MemoryAllocatorStats.h
+++ b/cachelib/allocator/memory/MemoryAllocatorStats.h
@@ -56,6 +56,17 @@ struct ACStats {
   constexpr size_t getTotalFreeMemory() const noexcept {
     return Slab::kSize * freeSlabs + freeAllocs * allocSize;
   }
+
+  constexpr double usageFraction() const noexcept {
+    if (usedSlabs == 0)
+      return 0.0;
+
+    return static_cast<double>(activeAllocs) / (usedSlabs * allocsPerSlab);
+  }
+
+  constexpr size_t totalAllocatedSize() const noexcept {
+    return activeAllocs * allocSize;
+  }
 };
 
 // structure to query stats corresponding to a MemoryPool
diff --git a/cachelib/allocator/tests/CacheBaseTest.cpp b/cachelib/allocator/tests/CacheBaseTest.cpp
index 928fcc0c67..f249786743 100644
--- a/cachelib/allocator/tests/CacheBaseTest.cpp
+++ b/cachelib/allocator/tests/CacheBaseTest.cpp
@@ -34,6 +34,7 @@ class CacheBaseTest : public CacheBase, public SlabAllocatorTestBase {
   bool isObjectCache() const override { return false; }
   const MemoryPool& getPool(PoolId) const override { return memoryPool_; }
   PoolStats getPoolStats(PoolId) const override { return PoolStats(); }
+  ACStats getACStats(PoolId, ClassId) const override { return ACStats(); }
   AllSlabReleaseEvents getAllSlabReleaseEvents(PoolId) const override {
     return AllSlabReleaseEvents{};
   }
diff --git a/cachelib/cachebench/cache/Cache.h b/cachelib/cachebench/cache/Cache.h
index fc9a13d704..b259e83f24 100644
--- a/cachelib/cachebench/cache/Cache.h
+++ b/cachelib/cachebench/cache/Cache.h
@@ -325,6 +325,10 @@ class Cache {
   // return the stats for the pool.
   PoolStats getPoolStats(PoolId pid) const { return cache_->getPoolStats(pid); }
 
+  ACStats getACStats(PoolId pid, ClassId cid) const {
+    return cache_->getACStats(pid, cid);
+  }
+
   // return the total number of inconsistent operations detected since start.
unsigned int getInconsistencyCount() const { return inconsistencyCount_.load(std::memory_order_relaxed); diff --git a/cachelib/cachebench/cache/CacheStats.h b/cachelib/cachebench/cache/CacheStats.h index a0bb1e4ddd..1b0330fb5f 100644 --- a/cachelib/cachebench/cache/CacheStats.h +++ b/cachelib/cachebench/cache/CacheStats.h @@ -194,7 +194,7 @@ struct Stats { foreachAC(allocationClassStats, [&](auto pid, auto cid, auto stats) { auto [allocSizeSuffix, allocSize] = formatMemory(stats.allocSize); auto [memorySizeSuffix, memorySize] = - formatMemory(stats.activeAllocs * stats.allocSize); + formatMemory(stats.totalAllocatedSize()); out << folly::sformat("pid{:2} cid{:4} {:8.2f}{} memorySize: {:8.2f}{}", pid, cid, allocSize, allocSizeSuffix, memorySize, memorySizeSuffix) @@ -206,15 +206,9 @@ struct Stats { // If the pool is not full, extrapolate usageFraction for AC assuming it // will grow at the same rate. This value will be the same for all ACs. - double acUsageFraction; - if (poolUsageFraction[pid] < 1.0) { - acUsageFraction = poolUsageFraction[pid]; - } else if (stats.usedSlabs == 0) { - acUsageFraction = 0.0; - } else { - acUsageFraction = - stats.activeAllocs / (stats.usedSlabs * stats.allocsPerSlab); - } + auto acUsageFraction = (poolUsageFraction[pid] < 1.0) + ? poolUsageFraction[pid] + : stats.usageFraction(); out << folly::sformat( "pid{:2} cid{:4} {:8.2f}{} usageFraction: {:4.2f}", pid, cid, From eca7d8ce51e476c03d9be3cd6b0db68f5d18d480 Mon Sep 17 00:00:00 2001 From: "Chorazewicz, Igor" Date: Tue, 28 Sep 2021 15:11:07 +0200 Subject: [PATCH 04/40] Initial multi-tier support implementation Part 1. ----------------------------------------- This includes the following: - Multi-tier allocator with TierId - allocateInternalTier - creating multi-tier allocator on shared memory Other patches can be combined/merged with this patch (such as multi-tier serialization support and improvements to eviction). We will name those compatible with Part 1 in later patches. 
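
Background movers are now sharded by (tier, pool, class) triples instead of
(pool, class) pairs. A minimal sketch of the sharding scheme, mirroring the
updated BackgroundMover::workerId() in this patch (the TierId alias is an
assumption here for illustration; the TODO in the code notes that a hash may
later replace the simple modulo):

    using TierId = unsigned int; // assumed alias
    size_t workerId(TierId tid, PoolId pid, ClassId cid, size_t numWorkers) {
      // every (tid, pid, cid) combination maps to exactly one worker
      return (tid + pid + cid) % numWorkers;
    }

Two tiers of the same pool/class can therefore land on different workers,
spreading eviction and promotion work across the configured threads.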
--- cachelib/allocator/BackgroundMover.h | 39 +- cachelib/allocator/BackgroundMoverStrategy.h | 4 +- cachelib/allocator/Cache.h | 8 +- cachelib/allocator/CacheAllocator.h | 616 ++++++++++++------ cachelib/allocator/PoolOptimizer.cpp | 2 + cachelib/allocator/memory/MemoryAllocator.h | 7 + cachelib/allocator/memory/SlabAllocator.h | 17 +- .../allocator/tests/AllocatorResizeTest.h | 8 +- cachelib/allocator/tests/BaseAllocatorTest.h | 8 +- cachelib/allocator/tests/CacheBaseTest.cpp | 4 +- cachelib/allocator/tests/TestBase.h | 4 +- cachelib/cachebench/cache/Cache.h | 11 +- cachelib/cachebench/cache/CacheStats.h | 40 +- 13 files changed, 507 insertions(+), 261 deletions(-) diff --git a/cachelib/allocator/BackgroundMover.h b/cachelib/allocator/BackgroundMover.h index aee86a4e32..e7bba4095a 100644 --- a/cachelib/allocator/BackgroundMover.h +++ b/cachelib/allocator/BackgroundMover.h @@ -27,17 +27,19 @@ namespace facebook::cachelib { template struct BackgroundMoverAPIWrapper { static size_t traverseAndEvictItems(C& cache, + unsigned int tid, unsigned int pid, unsigned int cid, size_t batch) { - return cache.traverseAndEvictItems(pid, cid, batch); + return cache.traverseAndEvictItems(tid, pid, cid, batch); } static size_t traverseAndPromoteItems(C& cache, + unsigned int tid, unsigned int pid, unsigned int cid, size_t batch) { - return cache.traverseAndPromoteItems(pid, cid, batch); + return cache.traverseAndPromoteItems(tid, pid, cid, batch); } }; @@ -60,16 +62,18 @@ class BackgroundMover : public PeriodicWorker { ~BackgroundMover() override; BackgroundMoverStats getStats() const noexcept; - std::map> getClassStats() const noexcept; + std::map>> + getClassStats() const noexcept; void setAssignedMemory(std::vector&& assignedMemory); // return id of the worker responsible for promoting/evicting from particlar // pool and allocation calss (id is in range [0, numWorkers)) - static size_t workerId(PoolId pid, ClassId cid, size_t numWorkers); + static size_t workerId(TierId tid, PoolId pid, ClassId cid, size_t numWorkers); private: - std::map> movesPerClass_; + std::map>> + movesPerClass_; // cache allocator's interface for evicting using Item = typename Cache::Item; @@ -77,7 +81,9 @@ class BackgroundMover : public PeriodicWorker { std::shared_ptr strategy_; MoverDir direction_; - std::function moverFunc; + std::function + moverFunc; // implements the actual logic of running the background evictor void work() override final; @@ -123,8 +129,8 @@ template void BackgroundMover::setAssignedMemory( std::vector&& assignedMemory) { XLOG(INFO, "Class assigned to background worker:"); - for (auto [pid, cid] : assignedMemory) { - XLOGF(INFO, "Pid: {}, Cid: {}", pid, cid); + for (auto [tid, pid, cid] : assignedMemory) { + XLOGF(INFO, "Tid: {}, Pid: {}, Cid: {}", tid, pid, cid); } mutex_.lock_combine([this, &assignedMemory] { @@ -142,18 +148,18 @@ void BackgroundMover::checkAndRun() { auto batches = strategy_->calculateBatchSizes(cache_, assignedMemory); for (size_t i = 0; i < batches.size(); i++) { - const auto [pid, cid] = assignedMemory[i]; + const auto [tid, pid, cid] = assignedMemory[i]; const auto batch = batches[i]; if (batch == 0) { continue; } - + const auto& mpStats = cache_.getPoolByTid(pid, tid).getStats(); // try moving BATCH items from the class in order to reach free target - auto moved = moverFunc(cache_, pid, cid, batch); + auto moved = moverFunc(cache_, tid, pid, cid, batch); moves += moved; - movesPerClass_[pid][cid] += moved; - totalBytesMoved_.add(moved * cache_.getPool(pid).getAllocSizes()[cid]); 
+ movesPerClass_[tid][pid][cid] += moved; + totalBytesMoved_.add(moved * mpStats.acStats.at(cid).allocSize ); } numTraversals_.inc(); @@ -171,18 +177,19 @@ BackgroundMoverStats BackgroundMover::getStats() const noexcept { } template -std::map> +std::map>> BackgroundMover::getClassStats() const noexcept { return movesPerClass_; } template -size_t BackgroundMover::workerId(PoolId pid, +size_t BackgroundMover::workerId(TierId tid, + PoolId pid, ClassId cid, size_t numWorkers) { XDCHECK(numWorkers); // TODO: came up with some better sharding (use hashing?) - return (pid + cid) % numWorkers; + return (tid + pid + cid) % numWorkers; } } // namespace facebook::cachelib diff --git a/cachelib/allocator/BackgroundMoverStrategy.h b/cachelib/allocator/BackgroundMoverStrategy.h index abf37edd13..14bde15908 100644 --- a/cachelib/allocator/BackgroundMoverStrategy.h +++ b/cachelib/allocator/BackgroundMoverStrategy.h @@ -22,7 +22,9 @@ namespace facebook { namespace cachelib { struct MemoryDescriptorType { - MemoryDescriptorType(PoolId pid, ClassId cid) : pid_(pid), cid_(cid) {} + MemoryDescriptorType(TierId tid, PoolId pid, ClassId cid) : + tid_(tid), pid_(pid), cid_(cid) {} + TierId tid_; PoolId pid_; ClassId cid_; }; diff --git a/cachelib/allocator/Cache.h b/cachelib/allocator/Cache.h index 082db65f7a..8dbe5fdc6e 100644 --- a/cachelib/allocator/Cache.h +++ b/cachelib/allocator/Cache.h @@ -96,6 +96,12 @@ class CacheBase { // @param poolId The pool id to query virtual const MemoryPool& getPool(PoolId poolId) const = 0; + // Get the reference to a memory pool using a tier id, for stats purposes + // + // @param poolId The pool id to query + // @param tierId The tier of the pool id + virtual const MemoryPool& getPoolByTid(PoolId poolId, TierId tid) const = 0; + // Get Pool specific stats (regular pools). This includes stats from the // Memory Pool and also the cache. // @@ -106,7 +112,7 @@ class CacheBase { // // @param poolId the pool id // @param classId the class id - virtual ACStats getACStats(PoolId poolId, ClassId classId) const = 0; + virtual ACStats getACStats(TierId tid,PoolId poolId, ClassId classId) const = 0; // @param poolId the pool id virtual AllSlabReleaseEvents getAllSlabReleaseEvents(PoolId poolId) const = 0; diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index 36b789bcde..a08fca177a 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -709,7 +709,7 @@ class CacheAllocator : public CacheBase { uint32_t getUsableSize(const Item& item) const; // create memory assignment to bg workers - auto createBgWorkerMemoryAssignments(size_t numWorkers); + auto createBgWorkerMemoryAssignments(size_t numWorkers, TierId tid); // whether bg worker should be woken bool shouldWakeupBgEvictor(PoolId pid, ClassId cid); @@ -810,7 +810,7 @@ class CacheAllocator : public CacheBase { // @param config new config for the pool // // @throw std::invalid_argument if the poolId is invalid - void overridePoolConfig(PoolId pid, const MMConfig& config); + void overridePoolConfig(TierId tid, PoolId pid, const MMConfig& config); // update an existing pool's rebalance strategy // @@ -851,8 +851,9 @@ class CacheAllocator : public CacheBase { // @return true if the operation succeeded. false if the size of the pool is // smaller than _bytes_ // @throw std::invalid_argument if the poolId is invalid. + // TODO: should call shrinkPool for specific tier? 
bool shrinkPool(PoolId pid, size_t bytes) { - return allocator_->shrinkPool(pid, bytes); + return allocator_[currentTier()]->shrinkPool(pid, bytes); } // grow an existing pool by _bytes_. This will fail if there is no @@ -861,8 +862,9 @@ class CacheAllocator : public CacheBase { // @return true if the pool was grown. false if the necessary number of // bytes were not available. // @throw std::invalid_argument if the poolId is invalid. + // TODO: should call growPool for specific tier? bool growPool(PoolId pid, size_t bytes) { - return allocator_->growPool(pid, bytes); + return allocator_[currentTier()]->growPool(pid, bytes); } // move bytes from one pool to another. The source pool should be at least @@ -875,7 +877,7 @@ class CacheAllocator : public CacheBase { // correct size to do the transfer. // @throw std::invalid_argument if src or dest is invalid pool bool resizePools(PoolId src, PoolId dest, size_t bytes) override { - return allocator_->resizePools(src, dest, bytes); + return allocator_[currentTier()]->resizePools(src, dest, bytes); } // Add a new compact cache with given name and size @@ -1104,12 +1106,13 @@ class CacheAllocator : public CacheBase { // @throw std::invalid_argument if the memory does not belong to this // cache allocator AllocInfo getAllocInfo(const void* memory) const { - return allocator_->getAllocInfo(memory); + return allocator_[getTierId(memory)]->getAllocInfo(memory); } // return the ids for the set of existing pools in this cache. std::set getPoolIds() const override final { - return allocator_->getPoolIds(); + // all tiers have the same pool ids. TODO: deduplicate + return allocator_[0]->getPoolIds(); } // return a list of pool ids that are backing compact caches. This includes @@ -1121,18 +1124,22 @@ class CacheAllocator : public CacheBase { // return the pool with speicified id. const MemoryPool& getPool(PoolId pid) const override final { - return allocator_->getPool(pid); + return allocator_[currentTier()]->getPool(pid); + } + + const MemoryPool& getPoolByTid(PoolId pid, TierId tid) const override final { + return allocator_[tid]->getPool(pid); } // calculate the number of slabs to be advised/reclaimed in each pool PoolAdviseReclaimData calcNumSlabsToAdviseReclaim() override final { auto regularPoolIds = getRegularPoolIds(); - return allocator_->calcNumSlabsToAdviseReclaim(regularPoolIds); + return allocator_[currentTier()]->calcNumSlabsToAdviseReclaim(regularPoolIds); } // update number of slabs to advise in the cache void updateNumSlabsToAdvise(int32_t numSlabsToAdvise) override final { - allocator_->updateNumSlabsToAdvise(numSlabsToAdvise); + allocator_[currentTier()]->updateNumSlabsToAdvise(numSlabsToAdvise); } // returns a valid PoolId corresponding to the name or kInvalidPoolId if the @@ -1140,8 +1147,9 @@ class CacheAllocator : public CacheBase { PoolId getPoolId(folly::StringPiece name) const noexcept; // returns the pool's name by its poolId. - std::string getPoolName(PoolId poolId) const override { - return allocator_->getPoolName(poolId); + std::string getPoolName(PoolId poolId) const { + // all tiers have the same pool names. + return allocator_[0]->getPoolName(poolId); } // get stats related to all kinds of slab release events. 
@@ -1214,7 +1222,7 @@ class CacheAllocator : public CacheBase { CacheMemoryStats getCacheMemoryStats() const override final; // return stats for Allocation Class - ACStats getACStats(PoolId pid, ClassId cid) const override final; + ACStats getACStats(TierId tid, PoolId pid, ClassId cid) const override final; // return the nvm cache stats map util::StatsMap getNvmCacheStatsMap() const override final; @@ -1419,11 +1427,14 @@ class CacheAllocator : public CacheBase { using MMContainerPtr = std::unique_ptr; using MMContainers = - std::array, - MemoryPoolManager::kMaxPools>; + std::vector, + MemoryPoolManager::kMaxPools>>; void createMMContainers(const PoolId pid, MMConfig config); + TierId getTierId(const Item& item) const; + TierId getTierId(const void* ptr) const; + // acquire the MMContainer corresponding to the the Item's class and pool. // // @return pointer to the MMContainer. @@ -1431,7 +1442,12 @@ class CacheAllocator : public CacheBase { // allocation from the memory allocator. MMContainer& getMMContainer(const Item& item) const noexcept; - MMContainer& getMMContainer(PoolId pid, ClassId cid) const noexcept; + MMContainer& getMMContainer(TierId tid, PoolId pid, ClassId cid) const noexcept; + + // Get stats of the specified pid and cid. + // If such mmcontainer is not valid (pool id or cid out of bound) + // or the mmcontainer is not initialized, return an empty stat. + MMContainerStat getMMContainerStat(TierId tid, PoolId pid, ClassId cid) const noexcept; // create a new cache allocation. The allocation can be initialized // appropriately and made accessible through insert or insertOrReplace. @@ -1465,6 +1481,18 @@ class CacheAllocator : public CacheBase { uint32_t expiryTime, bool fromBgThread = false); + // create a new cache allocation on specific memory tier. + // For description see allocateInternal. + // + // @param tid id a memory tier + WriteHandle allocateInternalTier(TierId tid, + PoolId id, + Key key, + uint32_t size, + uint32_t creationTime, + uint32_t expiryTime, + bool fromBgThread); + // Allocate a chained item // // The resulting chained item does not have a parent item and @@ -1542,6 +1570,15 @@ class CacheAllocator : public CacheBase { // not exist. FOLLY_ALWAYS_INLINE WriteHandle findFastImpl(Key key, AccessMode mode); + // Moves a regular item to a different memory tier. + // + // @param oldItem Reference to the item being moved + // @param newItemHdl Reference to the handle of the new item being moved into + // + // @return true If the move was completed, and the containers were updated + // successfully. + bool moveRegularItemOnEviction(Item& oldItem, WriteHandle& newItemHdl); + // Moves a regular item to a different slab. This should only be used during // slab release after the item's exclusive bit has been set. The user supplied // callback is responsible for copying the contents and fixing the semantics @@ -1714,15 +1751,17 @@ class CacheAllocator : public CacheBase { // Implementation to find a suitable eviction from the container. The // two parameters together identify a single container. // + // @param tid the id of the tier to look for evictions inside // @param pid the id of the pool to look for evictions inside // @param cid the id of the class to look for evictions inside // @return An evicted item or nullptr if there is no suitable candidate found // within the configured number of attempts. 
- Item* findEviction(PoolId pid, ClassId cid); + Item* findEviction(TierId tid, PoolId pid, ClassId cid); // Get next eviction candidate from MMContainer, remove from AccessContainer, // MMContainer and insert into NVMCache if enabled. // + // @param tid the id of the tier to look for evictions inside // @param pid the id of the pool to look for evictions inside // @param cid the id of the class to look for evictions inside // @param searchTries number of search attempts so far. @@ -1730,7 +1769,8 @@ class CacheAllocator : public CacheBase { // @return pair of [candidate, toRecycle]. Pair of null if reached the end of // the eviction queue or no suitable candidate found // within the configured number of attempts - std::pair getNextCandidate(PoolId pid, + std::pair getNextCandidate(TierId tid, + PoolId pid, ClassId cid, unsigned int& searchTries); @@ -1761,7 +1801,7 @@ class CacheAllocator : public CacheBase { const typename Item::PtrCompressor& compressor); unsigned int reclaimSlabs(PoolId id, size_t numSlabs) final { - return allocator_->reclaimSlabsAndGrow(id, numSlabs); + return allocator_[currentTier()]->reclaimSlabsAndGrow(id, numSlabs); } FOLLY_ALWAYS_INLINE EventTracker* getEventTracker() const { @@ -1820,7 +1860,7 @@ class CacheAllocator : public CacheBase { const void* hint = nullptr) final; // @param releaseContext slab release context - void releaseSlabImpl(const SlabReleaseContext& releaseContext); + void releaseSlabImpl(TierId tid, const SlabReleaseContext& releaseContext); // @return true when successfully marked as moving, // fasle when this item has already been freed @@ -1863,13 +1903,14 @@ class CacheAllocator : public CacheBase { // primitives. So we consciously exempt ourselves here from TSAN data race // detection. folly::annotate_ignore_thread_sanitizer_guard g(__FILE__, __LINE__); - auto slabsSkipped = allocator_->forEachAllocation(std::forward(f)); + auto slabsSkipped = allocator_[currentTier()]->forEachAllocation(std::forward(f)); stats().numReaperSkippedSlabs.add(slabsSkipped); } // exposed for the background evictor to iterate through the memory and evict // in batch. This should improve insertion path for tiered memory config - size_t traverseAndEvictItems(unsigned int /* pid */, + size_t traverseAndEvictItems(unsigned int /* tid */, + unsigned int /* pid */, unsigned int /* cid */, size_t /* batch */) { throw std::runtime_error("Not supported yet!"); @@ -1877,7 +1918,8 @@ class CacheAllocator : public CacheBase { // exposed for the background promoter to iterate through the memory and // promote in batch. 
This should improve find latency - size_t traverseAndPromoteItems(unsigned int /* pid */, + size_t traverseAndPromoteItems(unsigned int /* tid */, + unsigned int /* pid */, unsigned int /* cid */, size_t /* batch */) { throw std::runtime_error("Not supported yet!"); @@ -1923,10 +1965,10 @@ class CacheAllocator : public CacheBase { std::unique_ptr& worker, std::chrono::seconds timeout = std::chrono::seconds{0}); - ShmSegmentOpts createShmCacheOpts(); - std::unique_ptr createNewMemoryAllocator(); - std::unique_ptr restoreMemoryAllocator(); - std::unique_ptr restoreCCacheManager(); + ShmSegmentOpts createShmCacheOpts(TierId tid); + std::unique_ptr createNewMemoryAllocator(TierId tid); + std::unique_ptr restoreMemoryAllocator(TierId tid); + std::unique_ptr restoreCCacheManager(TierId tid); PoolIds filterCompactCachePools(const PoolIds& poolIds) const; @@ -1946,7 +1988,7 @@ class CacheAllocator : public CacheBase { } typename Item::PtrCompressor createPtrCompressor() const { - return allocator_->createPtrCompressor(); + return allocator_[0 /* TODO */]->createPtrCompressor(); } // helper utility to throttle and optionally log. @@ -1969,9 +2011,14 @@ class CacheAllocator : public CacheBase { // @param type the type of initialization // @return nullptr if the type is invalid - // @return pointer to memory allocator + // @return vector of pointers to memory allocator // @throw std::runtime_error if type is invalid - std::unique_ptr initAllocator(InitMemType type); + std::vector> initAllocator(InitMemType type); + + std::vector> createPrivateAllocator(); + std::vector> createAllocators(); + std::vector> restoreAllocators(); + // @param type the type of initialization // @return nullptr if the type is invalid // @return pointer to access container @@ -2040,23 +2087,28 @@ class CacheAllocator : public CacheBase { return stats; } - std::map> getBackgroundMoverClassStats( + std::map>> + getBackgroundMoverClassStats( MoverDir direction) const { - std::map> stats; + std::map>> stats; if (direction == MoverDir::Evict) { for (auto& bg : backgroundEvictor_) { - for (auto& pid : bg->getClassStats()) { - for (auto& cid : pid.second) { - stats[pid.first][cid.first] += cid.second; + for (auto &tid : bg->getClassStats()) { + for (auto& pid : tid.second) { + for (auto& cid : pid.second) { + stats[tid.first][pid.first][cid.first] += cid.second; + } } } } } else if (direction == MoverDir::Promote) { for (auto& bg : backgroundPromoter_) { - for (auto& pid : bg->getClassStats()) { - for (auto& cid : pid.second) { - stats[pid.first][cid.first] += cid.second; + for (auto &tid : bg->getClassStats()) { + for (auto& pid : tid.second) { + for (auto& cid : pid.second) { + stats[tid.first][pid.first][cid.first] += cid.second; + } } } } @@ -2147,6 +2199,17 @@ class CacheAllocator : public CacheBase { // BEGIN private members + TierId currentTier() const { + // TODO: every function which calls this method should be refactored. + // We should go case by case and either make such function work on + // all tiers or expose separate parameter to describe the tier ID. + return 0; + } + + unsigned getNumTiers() const { + return config_.memoryTierConfigs.size(); + } + // Whether the memory allocator for this cache allocator was created on shared // memory. 
The hash table, chained item hash table etc is also created on // shared memory except for temporary shared memory mode when they're created @@ -2172,9 +2235,10 @@ class CacheAllocator : public CacheBase { const MMConfig mmConfig_{}; // the memory allocator for allocating out of the available memory. - std::unique_ptr allocator_; + std::vector> allocator_; // compact cache allocator manager + // TODO: per tier? std::unique_ptr compactCacheManager_; // compact cache instances reside here when user "add" or "attach" compact @@ -2381,12 +2445,12 @@ CacheAllocator::CacheAllocator( : serialization::CacheAllocatorMetadata{}}, allocator_(initAllocator(type)), compactCacheManager_(type != InitMemType::kMemAttach - ? std::make_unique(*allocator_) - : restoreCCacheManager()), + ? std::make_unique(*allocator_[0] /* TODO: per tier */) + : restoreCCacheManager(0/* TODO: per tier */)), compressor_(createPtrCompressor()), mmContainers_(type == InitMemType::kMemAttach ? deserializeMMContainers(*deserializer_, compressor_) - : MMContainers{}), + : MMContainers{getNumTiers()}), accessContainer_(initAccessContainer( type, detail::kShmHashTableName, config.accessConfig)), chainedItemAccessContainer_( @@ -2421,48 +2485,87 @@ CacheAllocator::~CacheAllocator() { } template -ShmSegmentOpts CacheAllocator::createShmCacheOpts() { +ShmSegmentOpts CacheAllocator::createShmCacheOpts(TierId tid) { ShmSegmentOpts opts; opts.alignment = sizeof(Slab); // TODO: we support single tier so far - if (config_.memoryTierConfigs.size() > 1) { - throw std::invalid_argument("CacheLib only supports a single memory tier"); + if (config_.memoryTierConfigs.size() > 2) { + throw std::invalid_argument("CacheLib only supports two memory tiers"); } - opts.memBindNumaNodes = config_.memoryTierConfigs[0].getMemBind(); + opts.memBindNumaNodes = config_.memoryTierConfigs[tid].getMemBind(); return opts; } +template +std::vector> +CacheAllocator::createPrivateAllocator() { + std::vector> allocators; + + if (isOnShm_) { + allocators.emplace_back(std::make_unique( + getAllocatorConfig(config_), + tempShm_->getAddr(), + config_.getCacheSize())); + } else { + allocators.emplace_back(std::make_unique( + getAllocatorConfig(config_), + config_.getCacheSize())); + } + + return allocators; +} + template std::unique_ptr -CacheAllocator::createNewMemoryAllocator() { +CacheAllocator::createNewMemoryAllocator(TierId tid) { return std::make_unique( getAllocatorConfig(config_), shmManager_ - ->createShm(detail::kShmCacheName, config_.getCacheSize(), - config_.slabMemoryBaseAddr, createShmCacheOpts()) + ->createShm(detail::kShmCacheName + std::to_string(tid), + config_.getCacheSize(), config_.slabMemoryBaseAddr, + createShmCacheOpts(tid)) .addr, config_.getCacheSize()); } template std::unique_ptr -CacheAllocator::restoreMemoryAllocator() { +CacheAllocator::restoreMemoryAllocator(TierId tid) { return std::make_unique( deserializer_->deserialize(), shmManager_ - ->attachShm(detail::kShmCacheName, config_.slabMemoryBaseAddr, - createShmCacheOpts()) - .addr, + ->attachShm(detail::kShmCacheName + std::to_string(tid), + config_.slabMemoryBaseAddr, createShmCacheOpts(tid)).addr, config_.getCacheSize(), config_.disableFullCoredump); } +template +std::vector> +CacheAllocator::createAllocators() { + std::vector> allocators; + for (int tid = 0; tid < getNumTiers(); tid++) { + allocators.emplace_back(createNewMemoryAllocator(tid)); + } + return allocators; +} + +template +std::vector> +CacheAllocator::restoreAllocators() { + std::vector> allocators; + for (int tid = 0; tid 
< getNumTiers(); tid++) { + allocators.emplace_back(restoreMemoryAllocator(tid)); + } + return allocators; +} + template std::unique_ptr -CacheAllocator::restoreCCacheManager() { +CacheAllocator::restoreCCacheManager(TierId tid) { return std::make_unique( deserializer_->deserialize(), - *allocator_); + *allocator_[tid]); } template @@ -2566,21 +2669,15 @@ void CacheAllocator::initWorkers() { } template -std::unique_ptr CacheAllocator::initAllocator( +std::vector> +CacheAllocator::initAllocator( InitMemType type) { if (type == InitMemType::kNone) { - if (isOnShm_ == true) { - return std::make_unique(getAllocatorConfig(config_), - tempShm_->getAddr(), - config_.getCacheSize()); - } else { - return std::make_unique(getAllocatorConfig(config_), - config_.getCacheSize()); - } + return createPrivateAllocator(); } else if (type == InitMemType::kMemNew) { - return createNewMemoryAllocator(); + return createAllocators(); } else if (type == InitMemType::kMemAttach) { - return restoreMemoryAllocator(); + return restoreAllocators(); } // Invalid type @@ -2655,12 +2752,13 @@ bool CacheAllocator::shouldWakeupBgEvictor(PoolId /* pid */, template typename CacheAllocator::WriteHandle -CacheAllocator::allocateInternal(PoolId pid, - typename Item::Key key, - uint32_t size, - uint32_t creationTime, - uint32_t expiryTime, - bool fromBgThread) { +CacheAllocator::allocateInternalTier(TierId tid, + PoolId pid, + typename Item::Key key, + uint32_t size, + uint32_t creationTime, + uint32_t expiryTime, + bool fromBgThread) { util::LatencyTracker tracker{stats().allocateLatency_}; SCOPE_FAIL { stats_.invalidAllocs.inc(); }; @@ -2669,21 +2767,22 @@ CacheAllocator::allocateInternal(PoolId pid, const auto requiredSize = Item::getRequiredSize(key, size); // the allocation class in our memory allocator. - const auto cid = allocator_->getAllocationClassId(pid, requiredSize); + const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize); + // TODO: per-tier (*stats_.allocAttempts)[pid][cid].inc(); - void* memory = allocator_->allocate(pid, requiredSize); + void* memory = allocator_[tid]->allocate(pid, requiredSize); if (backgroundEvictor_.size() && !fromBgThread && (memory == nullptr || shouldWakeupBgEvictor(pid, cid))) { backgroundEvictor_[BackgroundMover::workerId( - pid, cid, backgroundEvictor_.size())] + tid, pid, cid, backgroundEvictor_.size())] ->wakeUp(); } if (memory == nullptr) { - memory = findEviction(pid, cid); + memory = findEviction(tid, pid, cid); } WriteHandle handle; @@ -2694,7 +2793,7 @@ CacheAllocator::allocateInternal(PoolId pid, // for example. SCOPE_FAIL { // free back the memory to the allocator since we failed. - allocator_->free(memory); + allocator_[tid]->free(memory); }; handle = acquire(new (memory) Item(key, size, creationTime, expiryTime)); @@ -2705,7 +2804,7 @@ CacheAllocator::allocateInternal(PoolId pid, } } else { // failed to allocate memory. 
- (*stats_.allocFailures)[pid][cid].inc();
+ (*stats_.allocFailures)[pid][cid].inc(); // TODO: per-tier
 // wake up rebalancer
 if (!config_.poolRebalancerDisableForcedWakeUp && poolRebalancer_) {
 poolRebalancer_->wakeUp();
@@ -2722,6 +2821,22 @@
 return handle;
}

+template
+typename CacheAllocator::WriteHandle
+CacheAllocator::allocateInternal(PoolId pid,
+ typename Item::Key key,
+ uint32_t size,
+ uint32_t creationTime,
+ uint32_t expiryTime,
+ bool fromBgThread) {
+ /* TODO: consult an admission policy to pick the starting tier */
+ for (TierId tid = 0; tid < getNumTiers(); ++tid) {
+ auto handle = allocateInternalTier(tid, pid, key, size, creationTime, expiryTime, fromBgThread);
+ if (handle) return handle;
+ }
+ return {};
+}
+
template
typename CacheAllocator::WriteHandle
CacheAllocator::allocateChainedItem(const ReadHandle& parent,
@@ -2751,22 +2866,30 @@ CacheAllocator::allocateChainedItemInternal(const Item& parent,
 // number of bytes required for this item
 const auto requiredSize = ChainedItem::getRequiredSize(size);
-
- const auto pid = allocator_->getAllocInfo(parent.getMemory()).poolId;
- const auto cid = allocator_->getAllocationClassId(pid, requiredSize);
-
+
+ // this is correct for now as we can
+ // assume the parent and chained item
+ // will reside in the same tier until
+ // they are moved
+ auto tid = getTierId(parent);
+
+ const auto pid = allocator_[tid]->getAllocInfo(parent.getMemory()).poolId;
+ const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize);
+
+ // TODO: per-tier? Right now stats_ are not used in any public periodic
+ // worker
 (*stats_.allocAttempts)[pid][cid].inc();

- void* memory = allocator_->allocate(pid, requiredSize);
+ void* memory = allocator_[tid]->allocate(pid, requiredSize);
 if (memory == nullptr) {
- memory = findEviction(pid, cid);
+ memory = findEviction(tid, pid, cid);
 }

 if (memory == nullptr) {
 (*stats_.allocFailures)[pid][cid].inc();
 return WriteHandle{};
 }

- SCOPE_FAIL { allocator_->free(memory); };
+ SCOPE_FAIL { allocator_[tid]->free(memory); };

 auto child = acquire(new (memory) ChainedItem(
 compressor_.compress(&parent), size, util::getCurrentTimeSec()));
@@ -3100,8 +3223,8 @@ CacheAllocator::releaseBackToAllocator(Item& it,
 throw std::runtime_error(
 folly::sformat("cannot release this item: {}", it.toString()));
 }
-
- const auto allocInfo = allocator_->getAllocInfo(it.getMemory());
+ const auto tid = getTierId(it);
+ const auto allocInfo = allocator_[tid]->getAllocInfo(it.getMemory());

 if (ctx == RemoveContext::kEviction) {
 const auto timeNow = util::getCurrentTimeSec();
@@ -3125,8 +3248,7 @@ CacheAllocator::releaseBackToAllocator(Item& it,
 folly::sformat("Can not recycle a chained item {}, toRecyle",
 it.toString(), toRecycle->toString()));
 }
-
- allocator_->free(&it);
+ allocator_[tid]->free(&it);
 return ReleaseRes::kReleased;
 }
@@ -3195,7 +3317,7 @@ CacheAllocator::releaseBackToAllocator(Item& it,
 auto next = head->getNext(compressor_);

 const auto childInfo =
- allocator_->getAllocInfo(static_cast(head));
+ allocator_[tid]->getAllocInfo(static_cast(head));
 (*stats_.fragmentationSize)[childInfo.poolId][childInfo.classId].sub(
 util::getFragmentation(*this, *head));

@@ -3211,7 +3333,7 @@ CacheAllocator::releaseBackToAllocator(Item& it,
 XDCHECK(ReleaseRes::kReleased != res);
 res = ReleaseRes::kRecycled;
 } else {
- allocator_->free(head);
+ allocator_[tid]->free(head);
 }

 stats_.numChainedChildItems.dec();
@@ -3225,7 +3347,7 @@ CacheAllocator::releaseBackToAllocator(Item& it,
 res =
ReleaseRes::kRecycled; } else { XDCHECK(it.isDrained()); - allocator_->free(&it); + allocator_[tid]->free(&it); } return res; @@ -3633,13 +3755,14 @@ void CacheAllocator::unlinkItemForEviction(Item& it) { template std::pair::Item*, typename CacheAllocator::Item*> -CacheAllocator::getNextCandidate(PoolId pid, +CacheAllocator::getNextCandidate(TierId tid, + PoolId pid, ClassId cid, unsigned int& searchTries) { typename NvmCacheT::PutToken token; Item* toRecycle = nullptr; Item* candidate = nullptr; - auto& mmContainer = getMMContainer(pid, cid); + auto& mmContainer = getMMContainer(tid, pid, cid); mmContainer.withEvictionIterator([this, pid, cid, &candidate, &toRecycle, &searchTries, &mmContainer, @@ -3717,13 +3840,13 @@ CacheAllocator::getNextCandidate(PoolId pid, template typename CacheAllocator::Item* -CacheAllocator::findEviction(PoolId pid, ClassId cid) { +CacheAllocator::findEviction(TierId tid, PoolId pid, ClassId cid) { // Keep searching for a candidate until we were able to evict it // or until the search limit has been exhausted unsigned int searchTries = 0; while (config_.evictionSearchTries == 0 || config_.evictionSearchTries > searchTries) { - auto [candidate, toRecycle] = getNextCandidate(pid, cid, searchTries); + auto [candidate, toRecycle] = getNextCandidate(tid, pid, cid, searchTries); // Reached the end of the eviction queue but doulen't find a candidate, // start again. @@ -4004,21 +4127,57 @@ void CacheAllocator::invalidateNvm(Item& item) { } } +template +TierId +CacheAllocator::getTierId(const Item& item) const { + return getTierId(item.getMemory()); +} + +template +TierId +CacheAllocator::getTierId(const void* ptr) const { + for (TierId tid = 0; tid < getNumTiers(); tid++) { + if (allocator_[tid]->isMemoryInAllocator(ptr)) + return tid; + } + + throw std::invalid_argument("Item does not belong to any tier!"); +} + template typename CacheAllocator::MMContainer& CacheAllocator::getMMContainer(const Item& item) const noexcept { + const auto tid = getTierId(item); const auto allocInfo = - allocator_->getAllocInfo(static_cast(&item)); - return getMMContainer(allocInfo.poolId, allocInfo.classId); + allocator_[tid]->getAllocInfo(static_cast(&item)); + return getMMContainer(tid, allocInfo.poolId, allocInfo.classId); } template typename CacheAllocator::MMContainer& -CacheAllocator::getMMContainer(PoolId pid, +CacheAllocator::getMMContainer(TierId tid, + PoolId pid, ClassId cid) const noexcept { - XDCHECK_LT(static_cast(pid), mmContainers_.size()); - XDCHECK_LT(static_cast(cid), mmContainers_[pid].size()); - return *mmContainers_[pid][cid]; + XDCHECK_LT(static_cast(tid), mmContainers_.size()); + XDCHECK_LT(static_cast(pid), mmContainers_[tid].size()); + XDCHECK_LT(static_cast(cid), mmContainers_[tid][pid].size()); + return *mmContainers_[tid][pid][cid]; +} + +template +MMContainerStat CacheAllocator::getMMContainerStat( + TierId tid, PoolId pid, ClassId cid) const noexcept { + if(static_cast(tid) >= mmContainers_.size()) { + return MMContainerStat{}; + } + if (static_cast(pid) >= mmContainers_[tid].size()) { + return MMContainerStat{}; + } + if (static_cast(cid) >= mmContainers_[tid][pid].size()) { + return MMContainerStat{}; + } + return mmContainers_[tid][pid][cid] ? 
mmContainers_[tid][pid][cid]->getStats() + : MMContainerStat{}; } template @@ -4207,8 +4366,9 @@ void CacheAllocator::markUseful(const ReadHandle& handle, template bool CacheAllocator::recordAccessInMMContainer(Item& item, AccessMode mode) { + const auto tid = getTierId(item); const auto allocInfo = - allocator_->getAllocInfo(static_cast(&item)); + allocator_[tid]->getAllocInfo(static_cast(&item)); (*stats_.cacheHits)[allocInfo.poolId][allocInfo.classId].inc(); // track recently accessed items if needed @@ -4216,14 +4376,15 @@ bool CacheAllocator::recordAccessInMMContainer(Item& item, ring_->trackItem(reinterpret_cast(&item), item.getSize()); } - auto& mmContainer = getMMContainer(allocInfo.poolId, allocInfo.classId); + auto& mmContainer = getMMContainer(tid, allocInfo.poolId, allocInfo.classId); return mmContainer.recordAccess(item, mode); } template uint32_t CacheAllocator::getUsableSize(const Item& item) const { + const auto tid = getTierId(item); const auto allocSize = - allocator_->getAllocInfo(static_cast(&item)).allocSize; + allocator_[tid]->getAllocInfo(static_cast(&item)).allocSize; return item.isChainedItem() ? allocSize - ChainedItem::getRequiredSize(0) : allocSize - Item::getRequiredSize(item.getKey(), 0); @@ -4232,8 +4393,9 @@ uint32_t CacheAllocator::getUsableSize(const Item& item) const { template typename CacheAllocator::SampleItem CacheAllocator::getSampleItem() { + auto tid = folly::Random::rand32() % getNumTiers(); size_t nvmCacheSize = nvmCache_ ? nvmCache_->getUsableSize() : 0; - size_t ramCacheSize = allocator_->getMemorySizeInclAdvised(); + size_t ramCacheSize = allocator_[tid]->getMemorySizeInclAdvised(); bool fromNvm = folly::Random::rand64(0, nvmCacheSize + ramCacheSize) >= ramCacheSize; @@ -4242,19 +4404,18 @@ CacheAllocator::getSampleItem() { } // Sampling from DRAM cache - auto item = reinterpret_cast(allocator_->getRandomAlloc()); + auto item = reinterpret_cast(allocator_[tid]->getRandomAlloc()); if (!item || UNLIKELY(item->isExpired())) { return SampleItem{false /* fromNvm */}; } // Check that item returned is the same that was sampled - auto sharedHdl = std::make_shared(findInternal(item->getKey())); if (sharedHdl->get() != item) { return SampleItem{false /* fromNvm */}; } - const auto allocInfo = allocator_->getAllocInfo(item->getMemory()); + const auto allocInfo = allocator_[tid]->getAllocInfo(item->getMemory()); // Convert the Item to IOBuf to make SampleItem auto iobuf = folly::IOBuf{ @@ -4278,22 +4439,28 @@ std::vector CacheAllocator::dumpEvictionIterator( return {}; } - if (static_cast(pid) >= mmContainers_.size() || - static_cast(cid) >= mmContainers_[pid].size()) { + // Always evict from the lowest layer. 
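  // (Assumed tier ordering: tid 0 is the topmost tier, so the lowest
  //  layer is getNumTiers() - 1; the loop below then walks back up toward
  //  tid 0 until enough items have been collected.)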
+ int tid = getNumTiers() - 1;
+ if (static_cast(tid) >= mmContainers_.size() ||
+ static_cast(pid) >= mmContainers_[tid].size() ||
+ static_cast(cid) >= mmContainers_[tid][pid].size()) {
 throw std::invalid_argument(
- folly::sformat("Invalid PoolId: {} and ClassId: {}.", pid, cid));
+ folly::sformat("Invalid TierId: {} and PoolId: {} and ClassId: {}.", tid, pid, cid));
 }

 std::vector content;

- auto& mm = *mmContainers_[pid][cid];
-
- mm.withEvictionIterator([&content, numItems](auto&& itr) {
- while (itr && content.size() < numItems) {
- content.push_back(itr->toString());
- ++itr;
- }
- });
+ while (content.size() < numItems && tid >= 0) {
+ auto& mm = *mmContainers_[tid][pid][cid];
+ mm.withEvictionIterator([&content, numItems](auto&& itr) {
+ while (itr && content.size() < numItems) {
+ content.push_back(itr->toString());
+ ++itr;
+ }
+ });
+ --tid;
+ }

 return content;
}
@@ -4470,14 +4637,34 @@ PoolId CacheAllocator::addPool(
 std::shared_ptr resizeStrategy,
 bool ensureProvisionable) {
 std::unique_lock w(poolsResizeAndRebalanceLock_);
- auto pid = allocator_->addPool(name, size, allocSizes, ensureProvisionable);
+
+ PoolId pid = 0;
+ size_t totalCacheSize = 0;
+
+ for (TierId tid = 0; tid < getNumTiers(); tid++) {
+ totalCacheSize += allocator_[tid]->getMemorySize();
+ }
+
+ for (TierId tid = 0; tid < getNumTiers(); tid++) {
+ auto tierSizeRatio =
+ static_cast(allocator_[tid]->getMemorySize()) / totalCacheSize;
+ size_t tierPoolSize = static_cast(tierSizeRatio * size);
+
+ // TODO: what if we manage to add pool only in one tier?
+ // we should probably remove that on failure
+ auto res = allocator_[tid]->addPool(
+ name, tierPoolSize, allocSizes, ensureProvisionable);
+ XDCHECK(tid == 0 || res == pid);
+ pid = res;
+ }
+
 createMMContainers(pid, std::move(config));
 setRebalanceStrategy(pid, std::move(rebalanceStrategy));
 setResizeStrategy(pid, std::move(resizeStrategy));

 if (backgroundEvictor_.size()) {
 auto memoryAssignments =
- createBgWorkerMemoryAssignments(backgroundEvictor_.size());
+ createBgWorkerMemoryAssignments(backgroundEvictor_.size(), 0);
 for (size_t id = 0; id < backgroundEvictor_.size(); id++)
 backgroundEvictor_[id]->setAssignedMemory(
 std::move(memoryAssignments[id]));
@@ -4485,7 +4672,7 @@ PoolId CacheAllocator::addPool(

 if (backgroundPromoter_.size()) {
 auto memoryAssignments =
- createBgWorkerMemoryAssignments(backgroundPromoter_.size());
+ createBgWorkerMemoryAssignments(backgroundPromoter_.size(), 1);
 for (size_t id = 0; id < backgroundPromoter_.size(); id++)
 backgroundPromoter_[id]->setAssignedMemory(
 std::move(memoryAssignments[id]));
@@ -4497,9 +4684,9 @@ PoolId CacheAllocator::addPool(
template
void CacheAllocator::overridePoolRebalanceStrategy(
 PoolId pid, std::shared_ptr rebalanceStrategy) {
- if (static_cast(pid) >= mmContainers_.size()) {
+ if (static_cast(pid) >= mmContainers_[0].size()) {
 throw std::invalid_argument(folly::sformat(
- "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size()));
+ "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_[0].size()));
 }
 setRebalanceStrategy(pid, std::move(rebalanceStrategy));
}
@@ -4507,9 +4694,9 @@ void CacheAllocator::overridePoolRebalanceStrategy(
template
void CacheAllocator::overridePoolResizeStrategy(
 PoolId pid, std::shared_ptr resizeStrategy) {
- if (static_cast(pid) >= mmContainers_.size()) {
+ if (static_cast(pid) >= mmContainers_[0].size()) {
 throw std::invalid_argument(folly::sformat(
- "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size()));
+ "Invalid PoolId: {}, size
of pools: {}", pid, mmContainers_[0].size())); } setResizeStrategy(pid, std::move(resizeStrategy)); } @@ -4521,14 +4708,14 @@ void CacheAllocator::overridePoolOptimizeStrategy( } template -void CacheAllocator::overridePoolConfig(PoolId pid, +void CacheAllocator::overridePoolConfig(TierId tid, PoolId pid, const MMConfig& config) { - if (static_cast(pid) >= mmContainers_.size()) { + if (static_cast(pid) >= mmContainers_[tid].size()) { throw std::invalid_argument(folly::sformat( - "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size())); + "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_[tid].size())); } - auto& pool = allocator_->getPool(pid); + auto& pool = allocator_[tid]->getPool(pid); for (unsigned int cid = 0; cid < pool.getNumClassId(); ++cid) { MMConfig mmConfig = config; mmConfig.addExtraConfig( @@ -4536,29 +4723,33 @@ void CacheAllocator::overridePoolConfig(PoolId pid, ? pool.getAllocationClass(static_cast(cid)) .getAllocsPerSlab() : 0); - DCHECK_NOTNULL(mmContainers_[pid][cid].get()); - mmContainers_[pid][cid]->setConfig(mmConfig); + DCHECK_NOTNULL(mmContainers_[tid][pid][cid].get()); + mmContainers_[tid][pid][cid]->setConfig(mmConfig); } } template void CacheAllocator::createMMContainers(const PoolId pid, MMConfig config) { - auto& pool = allocator_->getPool(pid); + // pools on each layer should have the same number of class id, etc. + auto& pool = allocator_[0]->getPool(pid); for (unsigned int cid = 0; cid < pool.getNumClassId(); ++cid) { config.addExtraConfig( config_.trackTailHits ? pool.getAllocationClass(static_cast(cid)) .getAllocsPerSlab() : 0); - mmContainers_[pid][cid].reset(new MMContainer(config, compressor_)); + for (TierId tid = 0; tid < getNumTiers(); tid++) { + mmContainers_[tid][pid][cid].reset(new MMContainer(config, compressor_)); + } } } template PoolId CacheAllocator::getPoolId( folly::StringPiece name) const noexcept { - return allocator_->getPoolId(name.str()); + // each tier has the same pools + return allocator_[0]->getPoolId(name.str()); } // The Function returns a consolidated vector of Release Slab @@ -4601,7 +4792,9 @@ std::set CacheAllocator::filterCompactCachePools( template std::set CacheAllocator::getRegularPoolIds() const { std::shared_lock r(poolsResizeAndRebalanceLock_); - return filterCompactCachePools(allocator_->getPoolIds()); + // TODO - get rid of the duplication - right now, each tier + // holds pool objects with mostly the same info + return filterCompactCachePools(allocator_[0]->getPoolIds()); } template @@ -4626,10 +4819,9 @@ std::set CacheAllocator::getRegularPoolIdsForResize() // getAdvisedMemorySize - then pools may be overLimit even when // all slabs are not allocated. Otherwise, pools may be overLimit // only after all slabs are allocated. - // - return (allocator_->allSlabsAllocated()) || - (allocator_->getAdvisedMemorySize() != 0) - ? filterCompactCachePools(allocator_->getPoolsOverLimit()) + return (allocator_[0]->allSlabsAllocated()) || + (allocator_[0]->getAdvisedMemorySize() != 0) + ? 
filterCompactCachePools(allocator_[0]->getPoolsOverLimit()) : std::set{}; } @@ -4640,7 +4832,7 @@ const std::string CacheAllocator::getCacheName() const { template PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { - const auto& pool = allocator_->getPool(poolId); + const auto& pool = allocator_[0]->getPool(poolId); const auto& allocSizes = pool.getAllocSizes(); auto mpStats = pool.getStats(); const auto& classIds = mpStats.classIds; @@ -4659,7 +4851,7 @@ PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { if (!isCompactCache) { for (const ClassId cid : classIds) { uint64_t classHits = (*stats_.cacheHits)[poolId][cid].get(); - XDCHECK(mmContainers_[poolId][cid], + XDCHECK(mmContainers_[0][poolId][cid], folly::sformat("Pid {}, Cid {} not initialized.", poolId, cid)); cacheStats.insert( {cid, @@ -4669,7 +4861,7 @@ PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { (*stats_.fragmentationSize)[poolId][cid].get(), classHits, (*stats_.chainedItemEvictions)[poolId][cid].get(), (*stats_.regularItemEvictions)[poolId][cid].get(), - mmContainers_[poolId][cid]->getStats()} + mmContainers_[0][poolId][cid]->getStats()} }); totalHits += classHits; @@ -4678,7 +4870,7 @@ PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { PoolStats ret; ret.isCompactCache = isCompactCache; - ret.poolName = allocator_->getPoolName(poolId); + ret.poolName = allocator_[0]->getPoolName(poolId); ret.poolSize = pool.getPoolSize(); ret.poolUsableSize = pool.getPoolUsableSize(); ret.poolAdvisedSize = pool.getPoolAdvisedSize(); @@ -4691,9 +4883,10 @@ PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { } template -ACStats CacheAllocator::getACStats(PoolId poolId, +ACStats CacheAllocator::getACStats(TierId tid, + PoolId poolId, ClassId classId) const { - const auto& pool = allocator_->getPool(poolId); + const auto& pool = allocator_[tid]->getPool(poolId); const auto& ac = pool.getAllocationClass(classId); return ac.getStats(); } @@ -4703,12 +4896,12 @@ PoolEvictionAgeStats CacheAllocator::getPoolEvictionAgeStats( PoolId pid, unsigned int slabProjectionLength) const { PoolEvictionAgeStats stats; - const auto& pool = allocator_->getPool(pid); + const auto& pool = allocator_[0]->getPool(pid); const auto& allocSizes = pool.getAllocSizes(); for (ClassId cid = 0; cid < static_cast(allocSizes.size()); ++cid) { - auto& mmContainer = getMMContainer(pid, cid); + auto& mmContainer = getMMContainer(0, pid, cid); const auto numItemsPerSlab = - allocator_->getPool(pid).getAllocationClass(cid).getAllocsPerSlab(); + allocator_[0]->getPool(pid).getAllocationClass(cid).getAllocsPerSlab(); const auto projectionLength = numItemsPerSlab * slabProjectionLength; stats.classEvictionAgeStats[cid] = mmContainer.getEvictionAgeStat(projectionLength); @@ -4752,7 +4945,7 @@ void CacheAllocator::releaseSlab(PoolId pid, } try { - auto releaseContext = allocator_->startSlabRelease( + auto releaseContext = allocator_[0]->startSlabRelease( pid, victim, receiver, mode, hint, [this]() -> bool { return shutDownInProgress_; }); @@ -4761,15 +4954,15 @@ void CacheAllocator::releaseSlab(PoolId pid, return; } - releaseSlabImpl(releaseContext); - if (!allocator_->allAllocsFreed(releaseContext)) { + releaseSlabImpl(0, releaseContext); + if (!allocator_[0]->allAllocsFreed(releaseContext)) { throw std::runtime_error( folly::sformat("Was not able to free all allocs. 
PoolId: {}, AC: {}", releaseContext.getPoolId(), releaseContext.getClassId())); } - allocator_->completeSlabRelease(releaseContext); + allocator_[0]->completeSlabRelease(releaseContext); } catch (const exception::SlabReleaseAborted& e) { stats_.numAbortedSlabReleases.inc(); throw exception::SlabReleaseAborted(folly::sformat( @@ -4799,7 +4992,7 @@ SlabReleaseStats CacheAllocator::getSlabReleaseStats() } template -void CacheAllocator::releaseSlabImpl( +void CacheAllocator::releaseSlabImpl(TierId tid, const SlabReleaseContext& releaseContext) { auto startTime = std::chrono::milliseconds(util::getCurrentTimeMs()); bool releaseStuck = false; @@ -4842,7 +5035,7 @@ void CacheAllocator::releaseSlabImpl( // If moving fails, evict it evictForSlabRelease(item); } - XDCHECK(allocator_->isAllocFreed(releaseContext, alloc)); + XDCHECK(allocator_[tid]->isAllocFreed(releaseContext, alloc)); } } @@ -4903,7 +5096,8 @@ bool CacheAllocator::moveForSlabRelease(Item& oldItem) { return false; } - const auto allocInfo = allocator_->getAllocInfo(oldItem.getMemory()); + auto tid = getTierId(oldItem); + const auto allocInfo = allocator_[tid]->getAllocInfo(oldItem.getMemory()); if (chainedItem) { newItemHdl.reset(); auto parentKey = parentItem->getKey(); @@ -4931,7 +5125,7 @@ bool CacheAllocator::moveForSlabRelease(Item& oldItem) { auto ref = unmarkMovingAndWakeUpWaiters(oldItem, std::move(newItemHdl)); XDCHECK_EQ(0u, ref); } - allocator_->free(&oldItem); + allocator_[tid]->free(&oldItem); (*stats_.fragmentationSize)[allocInfo.poolId][allocInfo.classId].sub( util::getFragmentation(*this, oldItem)); @@ -4942,7 +5136,6 @@ bool CacheAllocator::moveForSlabRelease(Item& oldItem) { template typename CacheAllocator::WriteHandle CacheAllocator::allocateNewItemForOldItem(const Item& oldItem) { - XDCHECK(oldItem.isMoving()); if (oldItem.isChainedItem()) { const Item& parentItem = oldItem.asChainedItem().getParentItem(compressor_); @@ -4961,17 +5154,19 @@ CacheAllocator::allocateNewItemForOldItem(const Item& oldItem) { return newItemHdl; } + const auto tid = getTierId(oldItem); const auto allocInfo = - allocator_->getAllocInfo(static_cast(&oldItem)); + allocator_[tid]->getAllocInfo(static_cast(&oldItem)); // Set up the destination for the move. Since oldItem would have the moving // bit set, it won't be picked for eviction. 
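  // (Why the tier-pinned variant below, under the assumption that slab
  //  release must not migrate data across tiers as a side effect:
  //    const auto tid = getTierId(oldItem);
  //    auto newItemHdl = allocateInternalTier(tid, ...);  // same tier
  //  A plain allocateInternal() would be free to place the copy in any
  //  tier.)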
- auto newItemHdl = allocateInternal(allocInfo.poolId, - oldItem.getKey(), - oldItem.getSize(), - oldItem.getCreationTime(), - oldItem.getExpiryTime(), - false); + auto newItemHdl = allocateInternalTier(tid, + allocInfo.poolId, + oldItem.getKey(), + oldItem.getSize(), + oldItem.getCreationTime(), + oldItem.getExpiryTime(), + false); if (!newItemHdl) { return {}; } @@ -5008,7 +5203,7 @@ void CacheAllocator::evictForSlabRelease(Item& item) { } const auto allocInfo = - allocator_->getAllocInfo(static_cast(&item)); + allocator_[getTierId(item)]->getAllocInfo(static_cast(&item)); if (evicted->hasChainedItem()) { (*stats_.chainedItemEvictions)[allocInfo.poolId][allocInfo.classId].inc(); } else { @@ -5057,11 +5252,15 @@ bool CacheAllocator::markMovingForSlabRelease( // At first, we assume this item was already freed bool itemFreed = true; bool markedMoving = false; - const auto fn = [this, &markedMoving, &itemFreed](void* memory) { + TierId tid = getTierId(alloc); + const auto fn = [this, tid, &markedMoving, &itemFreed](void* memory) { // Since this callback is executed, the item is not yet freed itemFreed = false; Item* item = static_cast(memory); - auto& mmContainer = getMMContainer(*item); + auto allocInfo = allocator_[tid]->getAllocInfo(memory); + auto pid = allocInfo.poolId; + auto cid = allocInfo.classId; + auto& mmContainer = getMMContainer(tid, pid, cid); mmContainer.withContainerLock([this, &mmContainer, &item, &markedMoving]() { // we rely on the mmContainer lock to safely check that the item is // currently in the mmContainer (no other threads are currently @@ -5099,7 +5298,7 @@ bool CacheAllocator::markMovingForSlabRelease( auto startTime = util::getCurrentTimeSec(); while (true) { - allocator_->processAllocForRelease(ctx, alloc, fn); + allocator_[tid]->processAllocForRelease(ctx, alloc, fn); // If item is already freed we give up trying to mark the item moving // and return false, otherwise if marked as moving, we return true. @@ -5114,7 +5313,7 @@ bool CacheAllocator::markMovingForSlabRelease( itemFreed = true; if (shutDownInProgress_) { - allocator_->abortSlabRelease(ctx); + allocator_[tid]->abortSlabRelease(ctx); throw exception::SlabReleaseAborted( folly::sformat("Slab Release aborted while still trying to mark" " as moving for Item: {}. Pool: {}, Class: {}.", @@ -5138,12 +5337,15 @@ template CCacheT* CacheAllocator::addCompactCache(folly::StringPiece name, size_t size, Args&&... 
args) { + if (getNumTiers() != 1) + throw std::runtime_error("TODO: compact cache for multi-tier Cache not supported."); + if (!config_.isCompactCacheEnabled()) { throw std::logic_error("Compact cache is not enabled"); } std::unique_lock lock(compactCachePoolsLock_); - auto poolId = allocator_->addPool(name, size, {Slab::kSize}); + auto poolId = allocator_[0]->addPool(name, size, {Slab::kSize}); isCompactCachePool_[poolId] = true; auto ptr = std::make_unique( @@ -5252,12 +5454,15 @@ folly::IOBufQueue CacheAllocator::saveStateToIOBuf() { *metadata_.numChainedChildItems() = stats_.numChainedChildItems.get(); *metadata_.numAbortedSlabReleases() = stats_.numAbortedSlabReleases.get(); + // TODO: implement serialization for multiple tiers auto serializeMMContainers = [](MMContainers& mmContainers) { MMSerializationTypeContainer state; - for (unsigned int i = 0; i < mmContainers.size(); ++i) { + for (unsigned int i = 0; i < 1 /* TODO: */ ; ++i) { for (unsigned int j = 0; j < mmContainers[i].size(); ++j) { - if (mmContainers[i][j]) { - state.pools_ref()[i][j] = mmContainers[i][j]->saveState(); + for (unsigned int k = 0; k < mmContainers[i][j].size(); ++k) { + if (mmContainers[i][j][k]) { + state.pools_ref()[j][k] = mmContainers[i][j][k]->saveState(); + } } } } @@ -5267,7 +5472,8 @@ folly::IOBufQueue CacheAllocator::saveStateToIOBuf() { serializeMMContainers(mmContainers_); AccessSerializationType accessContainerState = accessContainer_->saveState(); - MemoryAllocator::SerializationType allocatorState = allocator_->saveState(); + // TODO: foreach allocator + MemoryAllocator::SerializationType allocatorState = allocator_[0]->saveState(); CCacheManager::SerializationType ccState = compactCacheManager_->saveState(); AccessSerializationType chainedItemAccessContainerState = @@ -5331,6 +5537,8 @@ CacheAllocator::shutDown() { (shmShutDownStatus == ShmShutDownRes::kSuccess); shmManager_.reset(); + // TODO: save per-tier state + if (shmShutDownSucceeded) { if (!nvmShutDownStatusOpt || *nvmShutDownStatusOpt) return ShutDownStatus::kSuccess; @@ -5394,22 +5602,26 @@ CacheAllocator::deserializeMMContainers( const auto container = deserializer.deserialize(); - MMContainers mmContainers; + /* TODO: right now, we create empty containers because deserialization + * only works for a single (topmost) tier. */ + MMContainers mmContainers{getNumTiers()}; for (auto& kvPool : *container.pools_ref()) { auto i = static_cast(kvPool.first); auto& pool = getPool(i); for (auto& kv : kvPool.second) { auto j = static_cast(kv.first); - MMContainerPtr ptr = - std::make_unique(kv.second, - compressor); - auto config = ptr->getConfig(); - config.addExtraConfig(config_.trackTailHits - ? pool.getAllocationClass(j).getAllocsPerSlab() - : 0); - ptr->setConfig(config); - mmContainers[i][j] = std::move(ptr); + for (TierId tid = 0; tid < getNumTiers(); tid++) { + MMContainerPtr ptr = + std::make_unique(kv.second, + compressor); + auto config = ptr->getConfig(); + config.addExtraConfig(config_.trackTailHits + ? pool.getAllocationClass(j).getAllocsPerSlab() + : 0); + ptr->setConfig(config); + mmContainers[tid][i][j] = std::move(ptr); + } } } // We need to drop the unevictableMMContainer in the desierializer. 
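  //
  // A possible shape for the per-tier serialization flagged in the TODOs
  // above (hypothetical sketch only: the serialized thrift type would
  // also need to gain a tier dimension, which these patches do not add):
  //
  //   for (TierId tid = 0; tid < getNumTiers(); tid++) {
  //     for (unsigned int pid = 0; pid < mmContainers_[tid].size(); pid++) {
  //       for (unsigned int cid = 0; cid < mmContainers_[tid][pid].size();
  //            cid++) {
  //         if (mmContainers_[tid][pid][cid]) {
  //           // tieredPools_ref() is a hypothetical 3-level field
  //           state.tieredPools_ref()[tid][pid][cid] =
  //               mmContainers_[tid][pid][cid]->saveState();
  //         }
  //       }
  //     }
  //   }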
@@ -5565,11 +5777,11 @@ GlobalCacheStats CacheAllocator::getGlobalCacheStats() const { template CacheMemoryStats CacheAllocator::getCacheMemoryStats() const { - const auto totalCacheSize = allocator_->getMemorySize(); - const auto configuredTotalCacheSize = allocator_->getMemorySizeInclAdvised(); + const auto totalCacheSize = allocator_[0]->getMemorySize(); + const auto configuredTotalCacheSize = allocator_[0]->getMemorySizeInclAdvised(); auto addSize = [this](size_t a, PoolId pid) { - return a + allocator_->getPool(pid).getPoolSize(); + return a + allocator_[0]->getPool(pid).getPoolSize(); }; const auto regularPoolIds = getRegularPoolIds(); const auto ccCachePoolIds = getCCachePoolIds(); @@ -5582,9 +5794,9 @@ CacheMemoryStats CacheAllocator::getCacheMemoryStats() const { configuredTotalCacheSize, configuredRegularCacheSize, configuredCompactCacheSize, - allocator_->getAdvisedMemorySize(), + allocator_[0]->getAdvisedMemorySize(), memMonitor_ ? memMonitor_->getMaxAdvisePct() : 0, - allocator_->getUnreservedMemorySize(), + allocator_[0]->getUnreservedMemorySize(), nvmCache_ ? nvmCache_->getSize() : 0, util::getMemAvailable(), util::getRSSBytes()}; @@ -5723,14 +5935,14 @@ bool CacheAllocator::startNewReaper( template auto CacheAllocator::createBgWorkerMemoryAssignments( - size_t numWorkers) { + size_t numWorkers, TierId tid) { std::vector> asssignedMemory(numWorkers); - auto pools = filterCompactCachePools(allocator_->getPoolIds()); + auto pools = filterCompactCachePools(allocator_[tid]->getPoolIds()); for (const auto pid : pools) { - const auto& mpStats = getPool(pid).getStats(); + const auto& mpStats = getPoolByTid(pid, tid).getStats(); for (const auto cid : mpStats.classIds) { - asssignedMemory[BackgroundMover::workerId(pid, cid, numWorkers)] - .emplace_back(pid, cid); + asssignedMemory[BackgroundMover::workerId(tid, pid, cid, numWorkers)] + .emplace_back(tid, pid, cid); } } return asssignedMemory; @@ -5745,7 +5957,7 @@ bool CacheAllocator::startNewBackgroundEvictor( backgroundEvictor_.resize(threads); bool result = true; - auto memoryAssignments = createBgWorkerMemoryAssignments(threads); + auto memoryAssignments = createBgWorkerMemoryAssignments(threads, 0); for (size_t i = 0; i < threads; i++) { auto ret = startNewWorker("BackgroundEvictor" + std::to_string(i), backgroundEvictor_[i], interval, *this, strategy, @@ -5768,7 +5980,7 @@ bool CacheAllocator::startNewBackgroundPromoter( backgroundPromoter_.resize(threads); bool result = true; - auto memoryAssignments = createBgWorkerMemoryAssignments(threads); + auto memoryAssignments = createBgWorkerMemoryAssignments(threads, 1); for (size_t i = 0; i < threads; i++) { auto ret = startNewWorker("BackgroundPromoter" + std::to_string(i), backgroundPromoter_[i], interval, *this, strategy, @@ -5871,7 +6083,8 @@ bool CacheAllocator::cleanupStrayShmSegments( // Any other concurrent process can not be attached to the segments or // even if it does, we want to mark it for destruction. ShmManager::removeByName(cacheDir, detail::kShmInfoName, posix); - ShmManager::removeByName(cacheDir, detail::kShmCacheName, posix); + ShmManager::removeByName(cacheDir, detail::kShmCacheName + + std::to_string(0 /* TODO: per tier */), posix); ShmManager::removeByName(cacheDir, detail::kShmHashTableName, posix); ShmManager::removeByName(cacheDir, detail::kShmChainedItemHashTableName, posix); @@ -5886,13 +6099,14 @@ uint64_t CacheAllocator::getItemPtrAsOffset(const void* ptr) { // errors downstream. // if this succeeeds, the address is valid within the cache. 
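  // (getTierId(ptr) below resolves the owning tier by probing each
  //  allocator, per its definition earlier in this file:
  //    for (TierId tid = 0; tid < getNumTiers(); tid++)
  //      if (allocator_[tid]->isMemoryInAllocator(ptr)) return tid;
  //  and it throws std::invalid_argument when no tier owns ptr, so this
  //  validity probe keeps its throwing behavior.)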
- allocator_->getAllocInfo(ptr);
+ auto tid = getTierId(ptr);
+ allocator_[tid]->getAllocInfo(ptr);

 if (!isOnShm_ || !shmManager_) {
 throw std::invalid_argument("Shared memory not used");
 }

- const auto& shm = shmManager_->getShmByName(detail::kShmCacheName);
+ const auto& shm = shmManager_->getShmByName(detail::kShmCacheName + std::to_string(tid));

 return reinterpret_cast(ptr) -
 reinterpret_cast(shm.getCurrentMapping().addr);
diff --git a/cachelib/allocator/PoolOptimizer.cpp b/cachelib/allocator/PoolOptimizer.cpp
index 1902bfebf8..d23bb77b58 100644
--- a/cachelib/allocator/PoolOptimizer.cpp
+++ b/cachelib/allocator/PoolOptimizer.cpp
@@ -50,6 +50,8 @@ void PoolOptimizer::optimizeRegularPoolSizes() {

 void PoolOptimizer::optimizeCompactCacheSizes() {
 try {
+ // TODO: should optimizer look at each tier individually?
+ // If yes, then resizePools should be per-tier
 auto strategy = cache_.getPoolOptimizeStrategy();
 if (!strategy) {
 strategy = strategy_;
diff --git a/cachelib/allocator/memory/MemoryAllocator.h b/cachelib/allocator/memory/MemoryAllocator.h
index 1ce58857de..625171fd6f 100644
--- a/cachelib/allocator/memory/MemoryAllocator.h
+++ b/cachelib/allocator/memory/MemoryAllocator.h
@@ -646,6 +646,13 @@ class MemoryAllocator {
 memoryPoolManager_.updateNumSlabsToAdvise(numSlabs);
 }

+ // returns true if ptr points to memory which is managed by this
+ // allocator
+ bool isMemoryInAllocator(const void *ptr) {
+ return ptr && ptr >= slabAllocator_.getSlabMemoryBegin()
+ && ptr < slabAllocator_.getSlabMemoryEnd();
+ }
+
 private:
 // @param memory pointer to the memory.
 // @return the MemoryPool corresponding to the memory.
diff --git a/cachelib/allocator/memory/SlabAllocator.h b/cachelib/allocator/memory/SlabAllocator.h
index d82cf5b947..9fdb1e60b4 100644
--- a/cachelib/allocator/memory/SlabAllocator.h
+++ b/cachelib/allocator/memory/SlabAllocator.h
@@ -322,6 +322,17 @@ class SlabAllocator {
 return PtrCompressor(*this);
 }

+ // returns starting address of memory we own.
+ const Slab* getSlabMemoryBegin() const noexcept {
+ return reinterpret_cast(memoryStart_);
+ }
+
+ // returns first byte after the end of memory region we own.
+ const Slab* getSlabMemoryEnd() const noexcept {
+ return reinterpret_cast(reinterpret_cast(memoryStart_) +
+ memorySize_);
+ }
+
 private:
 // null Slab* presenttation. With 4M Slab size, a valid slab index would never
 // reach 2^16 - 1;
@@ -339,12 +350,6 @@ class SlabAllocator {
 // @throw std::invalid_argument if the state is invalid.
 void checkState() const;

- // returns first byte after the end of memory region we own.
- const Slab* getSlabMemoryEnd() const noexcept {
- return reinterpret_cast(reinterpret_cast(memoryStart_) +
- memorySize_);
- }
-
 // returns true if we have slabbed all the memory that is available to us.
 // false otherwise.
bool allMemorySlabbed() const noexcept { diff --git a/cachelib/allocator/tests/AllocatorResizeTest.h b/cachelib/allocator/tests/AllocatorResizeTest.h index d65205ac74..883dd9c056 100644 --- a/cachelib/allocator/tests/AllocatorResizeTest.h +++ b/cachelib/allocator/tests/AllocatorResizeTest.h @@ -966,23 +966,23 @@ class AllocatorResizeTest : public AllocatorTest { for (i = 1; i <= numItersToMaxAdviseAway + 1; i++) { alloc.memMonitor_->adviseAwaySlabs(); std::this_thread::sleep_for(std::chrono::seconds{2}); - ASSERT_EQ(alloc.allocator_->getAdvisedMemorySize(), i * perIterAdvSize); + ASSERT_EQ(alloc.allocator_[0 /* TODO - extend test */]->getAdvisedMemorySize(), i * perIterAdvSize); } i--; // This should fail alloc.memMonitor_->adviseAwaySlabs(); std::this_thread::sleep_for(std::chrono::seconds{2}); - auto totalAdvisedAwayMemory = alloc.allocator_->getAdvisedMemorySize(); + auto totalAdvisedAwayMemory = alloc.allocator_[0 /* TODO - extend test */]->getAdvisedMemorySize(); ASSERT_EQ(totalAdvisedAwayMemory, i * perIterAdvSize); // Try to reclaim back for (i = 1; i <= numItersToMaxAdviseAway + 1; i++) { alloc.memMonitor_->reclaimSlabs(); std::this_thread::sleep_for(std::chrono::seconds{2}); - ASSERT_EQ(alloc.allocator_->getAdvisedMemorySize(), + ASSERT_EQ(alloc.allocator_[0 /* TODO - extend test */]->getAdvisedMemorySize(), totalAdvisedAwayMemory - i * perIterAdvSize); } - totalAdvisedAwayMemory = alloc.allocator_->getAdvisedMemorySize(); + totalAdvisedAwayMemory = alloc.allocator_[0 /* TODO - extend test */]->getAdvisedMemorySize(); ASSERT_EQ(totalAdvisedAwayMemory, 0); } } diff --git a/cachelib/allocator/tests/BaseAllocatorTest.h b/cachelib/allocator/tests/BaseAllocatorTest.h index c8ee44ac0c..22c80e6734 100644 --- a/cachelib/allocator/tests/BaseAllocatorTest.h +++ b/cachelib/allocator/tests/BaseAllocatorTest.h @@ -4341,13 +4341,13 @@ class BaseAllocatorTest : public AllocatorTest { // Had a bug: D4799860 where we allocated the wrong size for chained item { const auto parentAllocInfo = - alloc.allocator_->getAllocInfo(itemHandle->getMemory()); + alloc.allocator_[0 /* TODO - extend test */]->getAllocInfo(itemHandle->getMemory()); const auto child1AllocInfo = - alloc.allocator_->getAllocInfo(chainedItemHandle->getMemory()); + alloc.allocator_[0 /* TODO - extend test */]->getAllocInfo(chainedItemHandle->getMemory()); const auto child2AllocInfo = - alloc.allocator_->getAllocInfo(chainedItemHandle2->getMemory()); + alloc.allocator_[0 /* TODO - extend test */]->getAllocInfo(chainedItemHandle2->getMemory()); const auto child3AllocInfo = - alloc.allocator_->getAllocInfo(chainedItemHandle3->getMemory()); + alloc.allocator_[0 /* TODO - extend test */]->getAllocInfo(chainedItemHandle3->getMemory()); const auto parentCid = parentAllocInfo.classId; const auto child1Cid = child1AllocInfo.classId; diff --git a/cachelib/allocator/tests/CacheBaseTest.cpp b/cachelib/allocator/tests/CacheBaseTest.cpp index f249786743..dae14c5335 100644 --- a/cachelib/allocator/tests/CacheBaseTest.cpp +++ b/cachelib/allocator/tests/CacheBaseTest.cpp @@ -33,8 +33,10 @@ class CacheBaseTest : public CacheBase, public SlabAllocatorTestBase { const std::string getCacheName() const override { return cacheName; } bool isObjectCache() const override { return false; } const MemoryPool& getPool(PoolId) const override { return memoryPool_; } + //TODO: support tiers + const MemoryPool& getPoolByTid(PoolId, TierId tid) const override { return memoryPool_; } PoolStats getPoolStats(PoolId) const override { return PoolStats(); } - ACStats 
getACStats(PoolId, ClassId) const { return ACStats(); }; + ACStats getACStats(TierId, PoolId, ClassId) const { return ACStats(); }; AllSlabReleaseEvents getAllSlabReleaseEvents(PoolId) const override { return AllSlabReleaseEvents{}; } diff --git a/cachelib/allocator/tests/TestBase.h b/cachelib/allocator/tests/TestBase.h index 086fa65d3f..81750b9b00 100644 --- a/cachelib/allocator/tests/TestBase.h +++ b/cachelib/allocator/tests/TestBase.h @@ -418,7 +418,7 @@ void AllocatorTest::testShmIsRemoved( ASSERT_FALSE(AllocatorT::ShmManager::segmentExists( config.getCacheDir(), detail::kShmHashTableName, config.usePosixShm)); ASSERT_FALSE(AllocatorT::ShmManager::segmentExists( - config.getCacheDir(), detail::kShmCacheName, config.usePosixShm)); + config.getCacheDir(), detail::kShmCacheName + std::to_string(0), config.usePosixShm)); ASSERT_FALSE(AllocatorT::ShmManager::segmentExists( config.getCacheDir(), detail::kShmChainedItemHashTableName, config.usePosixShm)); @@ -432,7 +432,7 @@ void AllocatorTest::testShmIsNotRemoved( ASSERT_TRUE(AllocatorT::ShmManager::segmentExists( config.getCacheDir(), detail::kShmHashTableName, config.usePosixShm)); ASSERT_TRUE(AllocatorT::ShmManager::segmentExists( - config.getCacheDir(), detail::kShmCacheName, config.usePosixShm)); + config.getCacheDir(), detail::kShmCacheName + std::to_string(0), config.usePosixShm)); ASSERT_TRUE(AllocatorT::ShmManager::segmentExists( config.getCacheDir(), detail::kShmChainedItemHashTableName, config.usePosixShm)); diff --git a/cachelib/cachebench/cache/Cache.h b/cachelib/cachebench/cache/Cache.h index b259e83f24..2953142eed 100644 --- a/cachelib/cachebench/cache/Cache.h +++ b/cachelib/cachebench/cache/Cache.h @@ -325,8 +325,8 @@ class Cache { // return the stats for the pool. PoolStats getPoolStats(PoolId pid) const { return cache_->getPoolStats(pid); } - ACStats getACStats(PoolId pid, ClassId cid) const { - return cache_->getACStats(pid, cid); + ACStats getACStats(TierId tid, PoolId pid, ClassId cid) const { + return cache_->getACStats(tid, pid, cid); } // return the total number of inconsistent operations detected since start. @@ -1128,14 +1128,15 @@ Stats Cache::getStats() const { aggregate += poolStats; } - std::map> allocationClassStats{}; + std::map>> allocationClassStats{}; for (size_t pid = 0; pid < pools_.size(); pid++) { PoolId poolId = static_cast(pid); auto poolStats = cache_->getPoolStats(poolId); auto cids = poolStats.getClassIds(); - for (auto [cid, stats] : poolStats.mpStats.acStats) { - allocationClassStats[poolId][cid] = stats; + for (TierId tid = 0; tid < cache_->getNumTiers(); tid++) { + for (auto cid : cids) + allocationClassStats[tid][pid][cid] = cache_->getACStats(tid, pid, cid); } } diff --git a/cachelib/cachebench/cache/CacheStats.h b/cachelib/cachebench/cache/CacheStats.h index 1b0330fb5f..a846ab3213 100644 --- a/cachelib/cachebench/cache/CacheStats.h +++ b/cachelib/cachebench/cache/CacheStats.h @@ -127,15 +127,15 @@ struct Stats { uint64_t invalidDestructorCount{0}; int64_t unDestructedItemCount{0}; - std::map> allocationClassStats; + std::map>> allocationClassStats; // populate the counters related to nvm usage. Cache implementation can decide // what to populate since not all of those are interesting when running // cachebench. std::unordered_map nvmCounters; - std::map> backgroundEvictionClasses; - std::map> backgroundPromotionClasses; + std::map>> backgroundEvictionClasses; + std::map>> backgroundPromotionClasses; // errors from the nvm engine. 
std::unordered_map nvmErrors;
@@ -157,9 +157,11 @@ struct Stats {
 out << folly::sformat("RAM Evictions : {:,}", numEvictions) << std::endl;

 auto foreachAC = [](const auto& map, auto cb) {
- for (auto& pidStat : map) {
- for (auto& cidStat : pidStat.second) {
- cb(pidStat.first, cidStat.first, cidStat.second);
+ for (auto &tidStat : map) {
+ for (auto& pidStat : tidStat.second) {
+ for (auto& cidStat : pidStat.second) {
+ cb(tidStat.first, pidStat.first, cidStat.first, cidStat.second);
+ }
 }
 }
 };
@@ -191,17 +193,17 @@ struct Stats {
 }
 };

- foreachAC(allocationClassStats, [&](auto pid, auto cid, auto stats) {
+ foreachAC(allocationClassStats, [&](auto tid, auto pid, auto cid, auto stats) {
 auto [allocSizeSuffix, allocSize] = formatMemory(stats.allocSize);
 auto [memorySizeSuffix, memorySize] =
 formatMemory(stats.totalAllocatedSize());
- out << folly::sformat("pid{:2} cid{:4} {:8.2f}{} memorySize: {:8.2f}{}",
- pid, cid, allocSize, allocSizeSuffix, memorySize,
+ out << folly::sformat("tid{:2} pid{:2} cid{:4} {:8.2f}{} memorySize: {:8.2f}{}",
+ tid, pid, cid, allocSize, allocSizeSuffix, memorySize,
 memorySizeSuffix)
 << std::endl;
 });

- foreachAC(allocationClassStats, [&](auto pid, auto cid, auto stats) {
+ foreachAC(allocationClassStats, [&](auto tid, auto pid, auto cid, auto stats) {
 auto [allocSizeSuffix, allocSize] = formatMemory(stats.allocSize);

 // If the pool is not full, extrapolate usageFraction for AC assuming it
@@ -211,8 +213,8 @@ struct Stats {
 : stats.usageFraction();

 out << folly::sformat(
- "pid{:2} cid{:4} {:8.2f}{} usageFraction: {:4.2f}", pid, cid,
- allocSize, allocSizeSuffix, acUsageFraction)
+ "tid{:2} pid{:2} cid{:4} {:8.2f}{} usageFraction: {:4.2f}",
+ tid, pid, cid, allocSize, allocSizeSuffix, acUsageFraction)
 << std::endl;
 });
 }
@@ -251,10 +253,9 @@ struct Stats {
 backgndEvicStats.nEvictedItems > 0) {
 out << "== Class Background Eviction Counters Map ==" << std::endl;
 foreachAC(backgroundEvictionClasses,
- [&](auto pid, auto cid, auto evicted) {
- out << folly::sformat("pid{:2} cid{:4} evicted: {:4}", pid,
- cid, evicted)
- << std::endl;
+ [&](auto tid, auto pid, auto cid, auto evicted) {
+ out << folly::sformat("tid{:2} pid{:2} cid{:4} evicted: {:4}",
+ tid, pid, cid, evicted) << std::endl;
 });

 out << folly::sformat("Background Evicted Items : {:,}",
@@ -269,10 +270,9 @@ struct Stats {
 backgndPromoStats.nPromotedItems > 0) {
 out << "== Class Background Promotion Counters Map ==" << std::endl;
 foreachAC(backgroundPromotionClasses,
- [&](auto pid, auto cid, auto promoted) {
- out << folly::sformat("pid{:2} cid{:4} promoted: {:4}", pid,
- cid, promoted)
- << std::endl;
+ [&](auto tid, auto pid, auto cid, auto promoted) {
+ out << folly::sformat("tid{:2} pid{:2} cid{:4} promoted: {:4}",
+ tid, pid, cid, promoted) << std::endl;
 });

 out << folly::sformat("Background Promoted Items : {:,}",

From 664da8d6fb618422c4b068290df9abff938cbb2b Mon Sep 17 00:00:00 2001
From: Daniel Byrne
Date: Tue, 17 Jan 2023 10:49:16 -0800
Subject: [PATCH 05/40] AC stats multi-tier

---
 cachelib/allocator/Cache.h | 2 +-
 cachelib/cachebench/cache/CacheStats.h | 14 ++++++++++++--
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/cachelib/allocator/Cache.h b/cachelib/allocator/Cache.h
index 8dbe5fdc6e..52fff0b254 100644
--- a/cachelib/allocator/Cache.h
+++ b/cachelib/allocator/Cache.h
@@ -112,7 +112,7 @@ class CacheBase {
 //
 // @param poolId the pool id
 // @param classId the class id
- virtual ACStats
getACStats(TierId tid, PoolId poolId, ClassId classId) const = 0; // @param poolId the pool id virtual AllSlabReleaseEvents getAllSlabReleaseEvents(PoolId poolId) const = 0; diff --git a/cachelib/cachebench/cache/CacheStats.h b/cachelib/cachebench/cache/CacheStats.h index a846ab3213..39bf498c29 100644 --- a/cachelib/cachebench/cache/CacheStats.h +++ b/cachelib/cachebench/cache/CacheStats.h @@ -193,7 +193,17 @@ struct Stats { } }; - foreachAC(allocationClassStats, [&](auto tid, auto pid, auto cid, auto stats) { + auto foreachAC = [&](auto cb) { + for (auto& tidStat : allocationClassStats) { + for (auto& pidStat : tidStat.second) { + for (auto& cidStat : pidStat.second) { + cb(tidStat.first, pidStat.first, cidStat.first, cidStat.second); + } + } + } + }; + + foreachAC([&](auto tid, auto pid, auto cid, auto stats) { auto [allocSizeSuffix, allocSize] = formatMemory(stats.allocSize); auto [memorySizeSuffix, memorySize] = formatMemory(stats.totalAllocatedSize()); @@ -203,7 +213,7 @@ struct Stats { << std::endl; }); - foreachAC(allocationClassStats, [&](auto tid, auto pid, auto cid, auto stats) { + foreachAC([&](auto tid, auto pid, auto cid, auto stats) { auto [allocSizeSuffix, allocSize] = formatMemory(stats.allocSize); // If the pool is not full, extrapolate usageFraction for AC assuming it From 3b7bb0c698053029c71331f06df29111b272aff3 Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Wed, 8 Feb 2023 08:30:48 -0800 Subject: [PATCH 06/40] Tests and fix tier sizing ------------------------- There are two parts to this commit and we can split them up. Part 1) This commit contains the additional memory tiers tests for different pool sizes. We also use getPoolSize(pid), to get total size from all pools across allocators. Part 2) This part can be merged with the initial multi-tier part 1. It fixes the tiering sizes (pulls changes from what was issue75 rebased commit that did not make it into upstream commits). --- cachelib/allocator/CacheAllocator.h | 38 ++++++-- .../tests/AllocatorMemoryTiersTest.cpp | 6 +- .../tests/AllocatorMemoryTiersTest.h | 40 ++++++++- cachelib/allocator/tests/MemoryTiersTest.cpp | 86 ++++++++++++++++++- 4 files changed, 156 insertions(+), 14 deletions(-) diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index a08fca177a..72a7063ee2 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -2210,6 +2210,8 @@ class CacheAllocator : public CacheBase { return config_.memoryTierConfigs.size(); } + size_t memoryTierSize(TierId tid) const; + // Whether the memory allocator for this cache allocator was created on shared // memory. 
The hash table, chained item hash table etc is also created on // shared memory except for temporary shared memory mode when they're created @@ -2496,6 +2498,16 @@ ShmSegmentOpts CacheAllocator::createShmCacheOpts(TierId tid) { return opts; } +template +size_t CacheAllocator::memoryTierSize(TierId tid) const { + auto partitions = std::accumulate(config_.memoryTierConfigs.begin(), config_.memoryTierConfigs.end(), 0UL, + [](const size_t i, const MemoryTierCacheConfig& config){ + return i + config.getRatio(); + }); + + return config_.memoryTierConfigs[tid].calculateTierSize(config_.getCacheSize(), partitions); +} + template std::vector> CacheAllocator::createPrivateAllocator() { @@ -2518,14 +2530,15 @@ CacheAllocator::createPrivateAllocator() { template std::unique_ptr CacheAllocator::createNewMemoryAllocator(TierId tid) { + size_t tierSize = memoryTierSize(tid); return std::make_unique( getAllocatorConfig(config_), shmManager_ ->createShm(detail::kShmCacheName + std::to_string(tid), - config_.getCacheSize(), config_.slabMemoryBaseAddr, + tierSize, config_.slabMemoryBaseAddr, createShmCacheOpts(tid)) .addr, - config_.getCacheSize()); + tierSize); } template @@ -2536,7 +2549,7 @@ CacheAllocator::restoreMemoryAllocator(TierId tid) { shmManager_ ->attachShm(detail::kShmCacheName + std::to_string(tid), config_.slabMemoryBaseAddr, createShmCacheOpts(tid)).addr, - config_.getCacheSize(), + memoryTierSize(tid), config_.disableFullCoredump); } @@ -4830,6 +4843,16 @@ const std::string CacheAllocator::getCacheName() const { return config_.cacheName; } +template +size_t CacheAllocator::getPoolSize(PoolId poolId) const { + size_t poolSize = 0; + for (auto& allocator: allocator_) { + const auto& pool = allocator->getPool(poolId); + poolSize += pool.getPoolSize(); + } + return poolSize; +} + template PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { const auto& pool = allocator_[0]->getPool(poolId); @@ -5777,9 +5800,12 @@ GlobalCacheStats CacheAllocator::getGlobalCacheStats() const { template CacheMemoryStats CacheAllocator::getCacheMemoryStats() const { - const auto totalCacheSize = allocator_[0]->getMemorySize(); - const auto configuredTotalCacheSize = allocator_[0]->getMemorySizeInclAdvised(); - + size_t totalCacheSize = 0; + size_t configuredTotalCacheSize = 0; + for(auto& allocator: allocator_) { + totalCacheSize += allocator->getMemorySize(); + configuredTotalCacheSize += allocator->getMemorySizeInclAdvised(); + } auto addSize = [this](size_t a, PoolId pid) { return a + allocator_[0]->getPool(pid).getPoolSize(); }; diff --git a/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp b/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp index 3e4847251f..c56f640847 100644 --- a/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp +++ b/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp @@ -23,9 +23,9 @@ namespace tests { using LruAllocatorMemoryTiersTest = AllocatorMemoryTiersTest; // TODO(MEMORY_TIER): add more tests with different eviction policies -TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValid1) { - this->testMultiTiersValid1(); -} +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersInvalid) { this->testMultiTiersInvalid(); } +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValid) { this->testMultiTiersValid(); } +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValidMixed) { this->testMultiTiersValidMixed(); } } // end of namespace tests } // end of namespace cachelib diff --git a/cachelib/allocator/tests/AllocatorMemoryTiersTest.h b/cachelib/allocator/tests/AllocatorMemoryTiersTest.h 
index a0d1513990..2ecb2c14ca 100644 --- a/cachelib/allocator/tests/AllocatorMemoryTiersTest.h +++ b/cachelib/allocator/tests/AllocatorMemoryTiersTest.h @@ -27,7 +27,7 @@ namespace tests { template class AllocatorMemoryTiersTest : public AllocatorTest { public: - void testMultiTiersValid1() { + void testMultiTiersInvalid() { typename AllocatorT::Config config; config.setCacheSize(100 * Slab::kSize); ASSERT_NO_THROW(config.configureMemoryTiers( @@ -36,6 +36,44 @@ class AllocatorMemoryTiersTest : public AllocatorTest { MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind( std::string("0"))})); } + + void testMultiTiersValid() { + typename AllocatorT::Config config; + config.setCacheSize(100 * Slab::kSize); + config.enableCachePersistence("/tmp"); + ASSERT_NO_THROW(config.configureMemoryTiers( + {MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind( + std::string("0")), + MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind( + std::string("0"))})); + + auto alloc = std::make_unique(AllocatorT::SharedMemNew, config); + ASSERT(alloc != nullptr); + + auto pool = alloc->addPool("default", alloc->getCacheMemoryStats().ramCacheSize); + auto handle = alloc->allocate(pool, "key", std::string("value").size()); + ASSERT(handle != nullptr); + ASSERT_NO_THROW(alloc->insertOrReplace(handle)); + } + + void testMultiTiersValidMixed() { + typename AllocatorT::Config config; + config.setCacheSize(100 * Slab::kSize); + config.enableCachePersistence("/tmp"); + ASSERT_NO_THROW(config.configureMemoryTiers( + {MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind( + std::string("0")), + MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind( + std::string("0"))})); + + auto alloc = std::make_unique(AllocatorT::SharedMemNew, config); + ASSERT(alloc != nullptr); + + auto pool = alloc->addPool("default", alloc->getCacheMemoryStats().ramCacheSize); + auto handle = alloc->allocate(pool, "key", std::string("value").size()); + ASSERT(handle != nullptr); + ASSERT_NO_THROW(alloc->insertOrReplace(handle)); + } }; } // namespace tests } // namespace cachelib diff --git a/cachelib/allocator/tests/MemoryTiersTest.cpp b/cachelib/allocator/tests/MemoryTiersTest.cpp index ed35115c0c..535cb14bbe 100644 --- a/cachelib/allocator/tests/MemoryTiersTest.cpp +++ b/cachelib/allocator/tests/MemoryTiersTest.cpp @@ -34,7 +34,7 @@ constexpr size_t MB = 1024ULL * 1024ULL; constexpr size_t GB = MB * 1024ULL; const size_t defaultTotalCacheSize{1 * GB}; -const std::string defaultCacheDir{"/var/metadataDir"}; +const std::string defaultCacheDir{"/tmp/metadataDir"}; template class MemoryTiersTest : public AllocatorTest { @@ -109,7 +109,7 @@ class MemoryTiersTest : public AllocatorTest { void validatePoolSize(PoolId poolId, std::unique_ptr& allocator, size_t expectedSize) { - size_t actualSize = allocator->getPool(poolId).getPoolSize(); + size_t actualSize = allocator->getPoolSize(poolId); EXPECT_EQ(actualSize, expectedSize); } @@ -119,9 +119,9 @@ class MemoryTiersTest : public AllocatorTest { size_t numTiers = 2) { if (isSizeValid) { auto pool = alloc->addPool("validPoolSize", poolSize); - EXPECT_LE(alloc->getPool(pool).getPoolSize(), poolSize); + EXPECT_LE(alloc->getPoolSize(pool), poolSize); if (poolSize >= numTiers * Slab::kSize) - EXPECT_GE(alloc->getPool(pool).getPoolSize(), + EXPECT_GE(alloc->getPoolSize(pool), poolSize - numTiers * Slab::kSize); } else { EXPECT_THROW(alloc->addPool("invalidPoolSize", poolSize), @@ -172,6 +172,84 @@ TEST_F(LruMemoryTiersTest, TestInvalid2TierConfigRatioNotSet) { TEST_F(LruMemoryTiersTest, 
TestInvalid2TierConfigSizesNeCacheSize) { EXPECT_THROW(createTestCacheConfig({0, 0}), std::invalid_argument); } + +TEST_F(LruMemoryTiersTest, TestPoolAllocations) { + std::vector totalCacheSizes = {8 * GB, 2 * GB}; + + static const size_t numExtraSizes = 4; + static const size_t numExtraSlabs = 20; + + for (size_t i = 0; i < numExtraSizes; i++) { + totalCacheSizes.push_back(totalCacheSizes.back() + + (folly::Random::rand64() % numExtraSlabs) * + Slab::kSize); + } + + size_t min_ratio = 1; + size_t max_ratio = 111; + + static const size_t numCombinations = 10; + + for (auto totalCacheSize : totalCacheSizes) { + for (size_t k = 0; k < numCombinations; k++) { + const size_t i = folly::Random::rand32() % max_ratio + min_ratio; + const size_t j = folly::Random::rand32() % max_ratio + min_ratio; + LruAllocatorConfig cfg = + createTestCacheConfig({i, j}, + /* usePosix */ true, totalCacheSize); + basicCheck(cfg, totalCacheSize); + + std::unique_ptr alloc = std::unique_ptr( + new LruAllocator(LruAllocator::SharedMemNew, cfg)); + + size_t size = (folly::Random::rand64() % + (alloc->getCacheMemoryStats().ramCacheSize - Slab::kSize)) + + Slab::kSize; + testAddPool(alloc, size, true); + } + } +} + +TEST_F(LruMemoryTiersTest, TestPoolInvalidAllocations) { + std::vector totalCacheSizes = {48 * MB, 51 * MB, 256 * MB, + 1 * GB, 5 * GB, 8 * GB}; + size_t min_ratio = 1; + size_t max_ratio = 111; + + static const size_t numCombinations = 10; + + for (auto totalCacheSize : totalCacheSizes) { + for (size_t k = 0; k < numCombinations; k++) { + const size_t i = folly::Random::rand32() % max_ratio + min_ratio; + const size_t j = folly::Random::rand32() % max_ratio + min_ratio; + LruAllocatorConfig cfg = + createTestCacheConfig({i, j}, + /* usePosix */ true, totalCacheSize); + + std::unique_ptr alloc = nullptr; + try { + alloc = std::unique_ptr( + new LruAllocator(LruAllocator::SharedMemNew, cfg)); + } catch (...) { + // an exception is expected only if the cache is too small + size_t sum_ratios = std::accumulate( + cfg.getMemoryTierConfigs().begin(), cfg.getMemoryTierConfigs().end(), 0UL, + [](const size_t i, const MemoryTierCacheConfig& config) { + return i + config.getRatio(); + }); + auto tier1slabs = cfg.getMemoryTierConfigs()[0].calculateTierSize(cfg.getCacheSize(), sum_ratios) / Slab::kSize; + auto tier2slabs = cfg.getMemoryTierConfigs()[1].calculateTierSize(cfg.getCacheSize(), sum_ratios) / Slab::kSize; + EXPECT_TRUE(tier1slabs <= 2 || tier2slabs <= 2); + + continue; + } + + size_t size = (folly::Random::rand64() % (100 * GB)) + + alloc->getCacheMemoryStats().ramCacheSize; + testAddPool(alloc, size, false); + } + } +} } // namespace tests } // namespace cachelib } // namespace facebook From 58e825b37aa7a0e9d784c428f68168a0ba420595 Mon Sep 17 00:00:00 2001 From: Sounak Gupta Date: Mon, 14 Nov 2022 02:07:57 -0800 Subject: [PATCH 07/40] This is the additional multi-tier support needed for the compressed ptr changes that were introduced upstream.
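For intuition: a compressed pointer already encodes a slab index and an alloc index; with multiple tiers it additionally has to record which tier's allocator can decode it. A self-contained toy of that encoding follows (the field widths and names here are hypothetical, not the real CompressedPtr layout):

#include <cassert>
#include <cstdint>

// Hypothetical bit split: [ tier:1 | slab:22 | alloc:9 ]. The real
// CompressedPtr spends bits on the tier id only when the cache is
// configured with more than one memory tier.
struct TinyCompressedPtr {
  uint32_t bits{0};

  static TinyCompressedPtr make(uint32_t tier, uint32_t slab, uint32_t alloc) {
    return TinyCompressedPtr{tier << 31 | slab << 9 | alloc};
  }
  uint32_t tier() const { return bits >> 31; }
  uint32_t slab() const { return (bits >> 9) & ((1u << 22) - 1); }
  uint32_t alloc() const { return bits & ((1u << 9) - 1); }
};

int main() {
  auto p = TinyCompressedPtr::make(1, 12345, 67);
  assert(p.tier() == 1 && p.slab() == 12345 && p.alloc() == 67);
  return 0;
}

Decompression can then select allocators_[tier] first and let that allocator decode the remaining bits, which is what the multi-tier PtrCompressor in this patch does.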
- Includes later cosmetic changes added by sounak 9cb5c29fa493499192900227169050773820d265 --- cachelib/allocator/CacheAllocator.h | 3 +- cachelib/allocator/memory/AllocationClass.cpp | 11 ++-- cachelib/allocator/memory/AllocationClass.h | 2 +- cachelib/allocator/memory/CompressedPtr.h | 65 +++++++++++++++++-- cachelib/allocator/memory/MemoryAllocator.h | 11 ++-- cachelib/allocator/memory/SlabAllocator.h | 4 +- run_tests.sh | 1 + 7 files changed, 77 insertions(+), 20 deletions(-) diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index 72a7063ee2..38037382ef 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -1333,6 +1333,7 @@ class CacheAllocator : public CacheBase { sizeof(typename RefcountWithFlags::Value) + sizeof(uint32_t) + sizeof(uint32_t) + sizeof(KAllocation)) == sizeof(Item), "vtable overhead"); + // Check for CompressedPtr single/multi tier support static_assert(32 == sizeof(Item), "item overhead is 32 bytes"); // make sure there is no overhead in ChainedItem on top of a regular Item @@ -1988,7 +1989,7 @@ class CacheAllocator : public CacheBase { } typename Item::PtrCompressor createPtrCompressor() const { - return allocator_[0 /* TODO */]->createPtrCompressor(); + return typename Item::PtrCompressor(allocator_); } // helper utility to throttle and optionally log. diff --git a/cachelib/allocator/memory/AllocationClass.cpp b/cachelib/allocator/memory/AllocationClass.cpp index 71089153e9..512df86bbe 100644 --- a/cachelib/allocator/memory/AllocationClass.cpp +++ b/cachelib/allocator/memory/AllocationClass.cpp @@ -50,7 +50,7 @@ AllocationClass::AllocationClass(ClassId classId, poolId_(poolId), allocationSize_(allocSize), slabAlloc_(s), - freedAllocations_{slabAlloc_.createPtrCompressor()} { + freedAllocations_{slabAlloc_.createSingleTierPtrCompressor()} { checkState(); } @@ -102,7 +102,7 @@ AllocationClass::AllocationClass( currSlab_(s.getSlabForIdx(*object.currSlabIdx())), slabAlloc_(s), freedAllocations_(*object.freedAllocationsObject(), - slabAlloc_.createPtrCompressor()), + slabAlloc_.createSingleTierPtrCompressor()), canAllocate_(*object.canAllocate()) { if (!slabAlloc_.isRestorable()) { throw std::logic_error("The allocation class cannot be restored."); @@ -356,9 +356,10 @@ std::pair> AllocationClass::pruneFreeAllocs( // allocated slab, release any freed allocations belonging to this slab. // Set the bit to true if the corresponding allocation is freed, false // otherwise. 
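// (A note on the rename below: these free lists live entirely within one
// slab allocator and never cross tiers, so the cheaper single-tier
// compressor is sufficient here.)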
- FreeList freeAllocs{slabAlloc_.createPtrCompressor()}; - FreeList notInSlab{slabAlloc_.createPtrCompressor()}; - FreeList inSlab{slabAlloc_.createPtrCompressor()}; + FreeList freeAllocs{slabAlloc_.createSingleTierPtrCompressor()}; + FreeList notInSlab{slabAlloc_.createSingleTierPtrCompressor()}; + FreeList inSlab{slabAlloc_.createSingleTierPtrCompressor()}; + lock_->lock_combine([&]() { // Take the allocation class free list offline diff --git a/cachelib/allocator/memory/AllocationClass.h b/cachelib/allocator/memory/AllocationClass.h index d45a45c6cd..269887f207 100644 --- a/cachelib/allocator/memory/AllocationClass.h +++ b/cachelib/allocator/memory/AllocationClass.h @@ -445,7 +445,7 @@ class AllocationClass { struct CACHELIB_PACKED_ATTR FreeAlloc { using CompressedPtr = facebook::cachelib::CompressedPtr; using PtrCompressor = - facebook::cachelib::PtrCompressor; + facebook::cachelib::SingleTierPtrCompressor; SListHook hook_{}; }; diff --git a/cachelib/allocator/memory/CompressedPtr.h b/cachelib/allocator/memory/CompressedPtr.h index 029abd91b9..d664063ea3 100644 --- a/cachelib/allocator/memory/CompressedPtr.h +++ b/cachelib/allocator/memory/CompressedPtr.h @@ -27,9 +27,12 @@ namespace cachelib { class SlabAllocator; +template +class PtrCompressor; + // This CompressedPtr makes decompression fast by staying away from division and -// modulo arithmetic and doing those during the compression time. We most often -// decompress a CompressedPtr than compress a pointer while creating one. This +// modulo arithmetic and doing those during the compression time. We most often +// decompress a CompressedPtr than compress a pointer while creating one. This // is used for pointer compression by the memory allocator. // We compress pointers by storing the tier index, slab index and alloc index of @@ -173,12 +176,14 @@ class CACHELIB_PACKED_ATTR CompressedPtr { } friend SlabAllocator; + template + friend class PtrCompressor; }; template -class PtrCompressor { +class SingleTierPtrCompressor { public: - explicit PtrCompressor(const AllocatorT& allocator) noexcept + explicit SingleTierPtrCompressor(const AllocatorT& allocator) noexcept : allocator_(allocator) {} const CompressedPtr compress(const PtrType* uncompressed) const { @@ -190,11 +195,11 @@ class PtrCompressor { allocator_.unCompress(compressed, false /* isMultiTiered */)); } - bool operator==(const PtrCompressor& rhs) const noexcept { + bool operator==(const SingleTierPtrCompressor& rhs) const noexcept { return &allocator_ == &rhs.allocator_; } - bool operator!=(const PtrCompressor& rhs) const noexcept { + bool operator!=(const SingleTierPtrCompressor& rhs) const noexcept { return !(*this == rhs); } @@ -202,5 +207,53 @@ class PtrCompressor { // memory allocator that does the pointer compression. 
const AllocatorT& allocator_; }; + +template +class PtrCompressor { + public: + explicit PtrCompressor(const AllocatorContainer& allocators) noexcept + : allocators_(allocators) {} + + const CompressedPtr compress(const PtrType* uncompressed) const { + if (uncompressed == nullptr) + return CompressedPtr{}; + + TierId tid; + for (tid = 0; tid < allocators_.size(); tid++) { + if (allocators_[tid]->isMemoryInAllocator( + static_cast(uncompressed))) + break; + } + + bool isMultiTiered = allocators_.size() > 1; + auto cptr = allocators_[tid]->compress(uncompressed, isMultiTiered); + if (isMultiTiered) { // config has multiple tiers + cptr.setTierId(tid); + } + return cptr; + } + + PtrType* unCompress(const CompressedPtr compressed) const { + if (compressed.isNull()) { + return nullptr; + } + bool isMultiTiered = allocators_.size() > 1; + auto& allocator = *allocators_[compressed.getTierId(isMultiTiered)]; + return static_cast( + allocator.unCompress(compressed, isMultiTiered)); + } + + bool operator==(const PtrCompressor& rhs) const noexcept { + return &allocators_ == &rhs.allocators_; + } + + bool operator!=(const PtrCompressor& rhs) const noexcept { + return !(*this == rhs); + } + + private: + // memory allocator that does the pointer compression. + const AllocatorContainer& allocators_; +}; } // namespace cachelib } // namespace facebook diff --git a/cachelib/allocator/memory/MemoryAllocator.h b/cachelib/allocator/memory/MemoryAllocator.h index 625171fd6f..a77d23494c 100644 --- a/cachelib/allocator/memory/MemoryAllocator.h +++ b/cachelib/allocator/memory/MemoryAllocator.h @@ -516,12 +516,13 @@ class MemoryAllocator { using CompressedPtr = facebook::cachelib::CompressedPtr; template using PtrCompressor = - facebook::cachelib::PtrCompressor; - + facebook::cachelib::PtrCompressor>>; + template - PtrCompressor createPtrCompressor() { - return slabAllocator_.createPtrCompressor(); - } + using SingleTierPtrCompressor = + facebook::cachelib::PtrCompressor; // compress a given pointer to a valid allocation made out of this allocator // through an allocate() or nullptr. Calling this otherwise with invalid diff --git a/cachelib/allocator/memory/SlabAllocator.h b/cachelib/allocator/memory/SlabAllocator.h index 9fdb1e60b4..a80a54672c 100644 --- a/cachelib/allocator/memory/SlabAllocator.h +++ b/cachelib/allocator/memory/SlabAllocator.h @@ -318,8 +318,8 @@ class SlabAllocator { } template - PtrCompressor createPtrCompressor() const { - return PtrCompressor(*this); + SingleTierPtrCompressor createSingleTierPtrCompressor() const { + return SingleTierPtrCompressor(*this); } // returns starting address of memory we own. diff --git a/run_tests.sh b/run_tests.sh index 111e218333..e575dbc62a 100755 --- a/run_tests.sh +++ b/run_tests.sh @@ -2,6 +2,7 @@ # Newline separated list of tests to ignore BLACKLIST="allocator-test-NavySetupTest +allocator-test-NvmCacheTests shm-test-test_page_size" if [ "$1" == "long" ]; then From 9fc705f990abd2d98864523903b9038c8092bd96 Mon Sep 17 00:00:00 2001 From: Sounak Gupta Date: Thu, 21 Jul 2022 02:01:04 -0700 Subject: [PATCH 08/40] Rolling average alloc latency Part 1. (single tier) ----------------------------- added per pool class rolling average latency (upstream PR version) fix for rolling stats (on multi-tier to be followed by multi-tier rolling stats implementation in the following commit) it should be noted - an attempt was made to use average alloc latency as a guide to control background mover batch size. 
While average alloc latency decreased, so did throughput because batch size became too big and put contention on locks. --- cachelib/allocator/CacheAllocator.h | 9 +- cachelib/allocator/CacheStats.cpp | 2 + cachelib/allocator/CacheStatsInternal.h | 8 ++ .../allocator/memory/MemoryAllocatorStats.h | 4 + cachelib/cachebench/cache/CacheStats.h | 6 +- cachelib/common/RollingStats.h | 90 +++++++++++++++++++ 6 files changed, 116 insertions(+), 3 deletions(-) create mode 100644 cachelib/common/RollingStats.h diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index 38037382ef..6660e9f788 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -2782,6 +2782,8 @@ CacheAllocator::allocateInternalTier(TierId tid, // the allocation class in our memory allocator. const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize); + util::RollingLatencyTracker rollTracker{ + (*stats_.classAllocLatency)[pid][cid]}; // TODO: per-tier (*stats_.allocAttempts)[pid][cid].inc(); @@ -2892,6 +2894,9 @@ CacheAllocator::allocateChainedItemInternal(const Item& parent, // TODO: per-tier? Right now stats_ are not used in any public periodic // worker + util::RollingLatencyTracker rollTracker{ + (*stats_.classAllocLatency)[pid][cid]}; + (*stats_.allocAttempts)[pid][cid].inc(); void* memory = allocator_[tid]->allocate(pid, requiredSize); @@ -4912,7 +4917,9 @@ ACStats CacheAllocator::getACStats(TierId tid, ClassId classId) const { const auto& pool = allocator_[tid]->getPool(poolId); const auto& ac = pool.getAllocationClass(classId); - return ac.getStats(); + auto stats = ac.getStats(); + stats.allocLatencyNs = (*stats_.classAllocLatency)[poolId][classId]; + return stats; } template diff --git a/cachelib/allocator/CacheStats.cpp b/cachelib/allocator/CacheStats.cpp index 6b7a1c943b..c708743036 100644 --- a/cachelib/allocator/CacheStats.cpp +++ b/cachelib/allocator/CacheStats.cpp @@ -43,6 +43,8 @@ void Stats::init() { initToZero(*fragmentationSize); initToZero(*chainedItemEvictions); initToZero(*regularItemEvictions); + + classAllocLatency = std::make_unique(); } template diff --git a/cachelib/allocator/CacheStatsInternal.h b/cachelib/allocator/CacheStatsInternal.h index b0934eb0c1..b205671e42 100644 --- a/cachelib/allocator/CacheStatsInternal.h +++ b/cachelib/allocator/CacheStatsInternal.h @@ -21,6 +21,7 @@ #include "cachelib/allocator/Cache.h" #include "cachelib/allocator/memory/MemoryAllocator.h" #include "cachelib/common/AtomicCounter.h" +#include "cachelib/common/RollingStats.h" namespace facebook { namespace cachelib { @@ -229,6 +230,13 @@ struct Stats { std::unique_ptr chainedItemEvictions{}; std::unique_ptr regularItemEvictions{}; + using PerPoolClassRollingStats = + std::array, + MemoryPoolManager::kMaxPools>; + + // rolling latency tracking for every alloc class in every pool + std::unique_ptr classAllocLatency{}; + // Eviction failures due to parent cannot be removed from access container AtomicCounter evictFailParentAC{0}; diff --git a/cachelib/allocator/memory/MemoryAllocatorStats.h b/cachelib/allocator/memory/MemoryAllocatorStats.h index 7ee4ca9916..7301145286 100644 --- a/cachelib/allocator/memory/MemoryAllocatorStats.h +++ b/cachelib/allocator/memory/MemoryAllocatorStats.h @@ -22,6 +22,7 @@ #include #include "cachelib/allocator/memory/Slab.h" +#include "cachelib/common/RollingStats.h" namespace facebook { namespace cachelib { @@ -49,6 +50,9 @@ struct ACStats { // true if the allocation class is full. 
bool full; + // Rolling allocation latency (in ns) + util::RollingStats allocLatencyNs; + constexpr unsigned long long totalSlabs() const noexcept { return freeSlabs + usedSlabs; } diff --git a/cachelib/cachebench/cache/CacheStats.h b/cachelib/cachebench/cache/CacheStats.h index 39bf498c29..72a0a815f2 100644 --- a/cachelib/cachebench/cache/CacheStats.h +++ b/cachelib/cachebench/cache/CacheStats.h @@ -223,8 +223,10 @@ struct Stats { : stats.usageFraction(); out << folly::sformat( - "tid{:2} pid{:2} cid{:4} {:8.2f}{} usageFraction: {:4.2f}", - tid, pid, cid, allocSize, allocSizeSuffix, acUsageFraction) + "tid{:2} pid{:2} cid{:4} {:8.2f}{} usageFraction: {:4.2f} " + "rollingAvgAllocLatency: {:8.2f}ns", + tid, pid, cid, allocSize, allocSizeSuffix, acUsageFraction, + stats.allocLatencyNs.estimate()) << std::endl; }); } diff --git a/cachelib/common/RollingStats.h b/cachelib/common/RollingStats.h new file mode 100644 index 0000000000..4d179681ad --- /dev/null +++ b/cachelib/common/RollingStats.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include "cachelib/common/Utils.h" + +namespace facebook { +namespace cachelib { +namespace util { + +class RollingStats { + public: + // track latency by taking the value of duration directly. + void trackValue(double value) { + // This is a highly unlikely scenario where + // cnt_ reaches numerical limits. Skip update + // of the rolling average anymore. + if (cnt_ == std::numeric_limits::max()) { + cnt_ = 0; + return; + } + auto ratio = static_cast(cnt_) / (cnt_ + 1); + avg_ *= ratio; + ++cnt_; + avg_ += value / cnt_; + } + + // Return the rolling average. 
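+ // trackValue() implements avg' = avg * cnt / (cnt + 1) + value / (cnt + 1),
+ // an incremental arithmetic mean that needs no sample buffer; e.g. feeding
+ // 100, 200 and 300 yields estimates 100, 150 and 200 in turn.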
+ double estimate() { return avg_; } + + private: + double avg_{0}; + uint64_t cnt_{0}; +}; + +class RollingLatencyTracker { + public: + explicit RollingLatencyTracker(RollingStats& stats) + : stats_(&stats), begin_(std::chrono::steady_clock::now()) {} + RollingLatencyTracker() {} + ~RollingLatencyTracker() { + if (stats_) { + auto tp = std::chrono::steady_clock::now(); + auto diffNanos = + std::chrono::duration_cast(tp - begin_) + .count(); + stats_->trackValue(static_cast(diffNanos)); + } + } + + RollingLatencyTracker(const RollingLatencyTracker&) = delete; + RollingLatencyTracker& operator=(const RollingLatencyTracker&) = delete; + + RollingLatencyTracker(RollingLatencyTracker&& rhs) noexcept + : stats_(rhs.stats_), begin_(rhs.begin_) { + rhs.stats_ = nullptr; + } + + RollingLatencyTracker& operator=(RollingLatencyTracker&& rhs) noexcept { + if (this != &rhs) { + this->~RollingLatencyTracker(); + new (this) RollingLatencyTracker(std::move(rhs)); + } + return *this; + } + + private: + RollingStats* stats_{nullptr}; + std::chrono::time_point begin_; +}; +} // namespace util +} // namespace cachelib +} // namespace facebook From ce0e38aa22d31ceac765a510ba1a04f28591bec7 Mon Sep 17 00:00:00 2001 From: Sounak Gupta Date: Thu, 21 Jul 2022 02:01:04 -0700 Subject: [PATCH 09/40] Rolling average class latency Part 2. (multi-tier support) -------------------------------------- There is also an introduction to kMaxTiers in Cache.h - this should probably be split from this commit. added per tier pool class rolling average latency (based on upstream PR) --- cachelib/allocator/Cache.h | 3 +++ cachelib/allocator/CacheAllocator.h | 6 +++--- cachelib/allocator/CacheStats.cpp | 2 +- cachelib/allocator/CacheStats.h | 1 + cachelib/allocator/CacheStatsInternal.h | 7 ++++--- cachelib/cachebench/cache/CacheStats.h | 11 +++-------- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/cachelib/allocator/Cache.h b/cachelib/allocator/Cache.h index 52fff0b254..515da3ac47 100644 --- a/cachelib/allocator/Cache.h +++ b/cachelib/allocator/Cache.h @@ -85,6 +85,9 @@ class CacheBase { CacheBase(CacheBase&&) = default; CacheBase& operator=(CacheBase&&) = default; + // TODO: come up with some reasonable number + static constexpr unsigned kMaxTiers = 2; + // Get a string referring to the cache name for this cache virtual const std::string getCacheName() const = 0; diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index 6660e9f788..4f306c48e6 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -2783,7 +2783,7 @@ CacheAllocator::allocateInternalTier(TierId tid, // the allocation class in our memory allocator. const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize); util::RollingLatencyTracker rollTracker{ - (*stats_.classAllocLatency)[pid][cid]}; + (*stats_.classAllocLatency)[tid][pid][cid]}; // TODO: per-tier (*stats_.allocAttempts)[pid][cid].inc(); @@ -2895,7 +2895,7 @@ CacheAllocator::allocateChainedItemInternal(const Item& parent, // TODO: per-tier? 
Right now stats_ are not used in any public periodic // worker util::RollingLatencyTracker rollTracker{ - (*stats_.classAllocLatency)[pid][cid]}; + (*stats_.classAllocLatency)[tid][pid][cid]}; (*stats_.allocAttempts)[pid][cid].inc(); @@ -4918,7 +4918,7 @@ ACStats CacheAllocator::getACStats(TierId tid, const auto& pool = allocator_[tid]->getPool(poolId); const auto& ac = pool.getAllocationClass(classId); auto stats = ac.getStats(); - stats.allocLatencyNs = (*stats_.classAllocLatency)[poolId][classId]; + stats.allocLatencyNs = (*stats_.classAllocLatency)[tid][poolId][classId]; return stats; } diff --git a/cachelib/allocator/CacheStats.cpp b/cachelib/allocator/CacheStats.cpp index c708743036..417e8fe246 100644 --- a/cachelib/allocator/CacheStats.cpp +++ b/cachelib/allocator/CacheStats.cpp @@ -44,7 +44,7 @@ void Stats::init() { initToZero(*chainedItemEvictions); initToZero(*regularItemEvictions); - classAllocLatency = std::make_unique(); + classAllocLatency = std::make_unique(); } template diff --git a/cachelib/allocator/CacheStats.h b/cachelib/allocator/CacheStats.h index 60f6f5e2c5..7a16595343 100644 --- a/cachelib/allocator/CacheStats.h +++ b/cachelib/allocator/CacheStats.h @@ -27,6 +27,7 @@ #include "cachelib/allocator/memory/Slab.h" #include "cachelib/common/FastStats.h" #include "cachelib/common/PercentileStats.h" +#include "cachelib/common/RollingStats.h" #include "cachelib/common/Time.h" namespace facebook { diff --git a/cachelib/allocator/CacheStatsInternal.h b/cachelib/allocator/CacheStatsInternal.h index b205671e42..4b437d9dbc 100644 --- a/cachelib/allocator/CacheStatsInternal.h +++ b/cachelib/allocator/CacheStatsInternal.h @@ -230,12 +230,13 @@ struct Stats { std::unique_ptr chainedItemEvictions{}; std::unique_ptr regularItemEvictions{}; - using PerPoolClassRollingStats = + using PerTierPoolClassRollingStats = std::array< std::array, - MemoryPoolManager::kMaxPools>; + MemoryPoolManager::kMaxPools>, + CacheBase::kMaxTiers>; // rolling latency tracking for every alloc class in every pool - std::unique_ptr classAllocLatency{}; + std::unique_ptr classAllocLatency{}; // Eviction failures due to parent cannot be removed from access container AtomicCounter evictFailParentAC{0}; diff --git a/cachelib/cachebench/cache/CacheStats.h b/cachelib/cachebench/cache/CacheStats.h index 72a0a815f2..e848b71e44 100644 --- a/cachelib/cachebench/cache/CacheStats.h +++ b/cachelib/cachebench/cache/CacheStats.h @@ -203,18 +203,11 @@ struct Stats { } }; + foreachAC([&](auto tid, auto pid, auto cid, auto stats) { auto [allocSizeSuffix, allocSize] = formatMemory(stats.allocSize); auto [memorySizeSuffix, memorySize] = formatMemory(stats.totalAllocatedSize()); - out << folly::sformat("tid{:2} pid{:2} cid{:4} {:8.2f}{} memorySize: {:8.2f}{}", - tid, pid, cid, allocSize, allocSizeSuffix, memorySize, - memorySizeSuffix) - << std::endl; - }); - - foreachAC([&](auto tid, auto pid, auto cid, auto stats) { - auto [allocSizeSuffix, allocSize] = formatMemory(stats.allocSize); // If the pool is not full, extrapolate usageFraction for AC assuming it // will grow at the same rate. This value will be the same for all ACs. 
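A note on the layout used by these per-tier counters: everything is addressed as [tid][pid][cid]. A minimal sketch of that shape (the bounds are placeholders, not CacheLib's real kMaxTiers/kMaxPools/class limits):

#include <array>
#include <cstddef>

// Placeholder bounds standing in for CacheBase::kMaxTiers,
// MemoryPoolManager::kMaxPools and the per-pool allocation-class limit.
constexpr std::size_t kTiers = 2, kPools = 64, kClasses = 128;

// Same nesting as PerTierPoolClassRollingStats above: tier, then pool,
// then allocation class.
template <typename T>
using PerTierPoolClass =
    std::array<std::array<std::array<T, kClasses>, kPools>, kTiers>;

int main() {
  PerTierPoolClass<unsigned long> hits{};  // zero-initialized
  ++hits[1][0][12];                        // tier 1, pool 0, class 12
  return hits[1][0][12] == 1 ? 0 : 1;
}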
@@ -224,8 +217,10 @@ struct Stats { out << folly::sformat( "tid{:2} pid{:2} cid{:4} {:8.2f}{} usageFraction: {:4.2f} " + "memorySize: {:8.2f}{} " "rollingAvgAllocLatency: {:8.2f}ns", tid, pid, cid, allocSize, allocSizeSuffix, acUsageFraction, + memorySize, memorySizeSuffix, stats.allocLatencyNs.estimate()) << std::endl; }); From e0a80066f62e94431f43ebba0071db3a5d85df0f Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Tue, 9 Aug 2022 10:45:26 -0400 Subject: [PATCH 10/40] MM2Q promotion iterator ----------------------- Hot queue iterator for 2Q. Will start at Hot queue and move to Warm queue if hot queue is exhausted. Useful for promotion semantics if using 2Q replacement. rebased on to develop and added some tests. --- cachelib/allocator/MM2Q.h | 14 +++++ cachelib/allocator/datastruct/DList.h | 4 ++ cachelib/allocator/datastruct/MultiDList.h | 72 +++++++++++++++++++--- cachelib/allocator/tests/MM2QTest.cpp | 33 ++++++++++ 4 files changed, 113 insertions(+), 10 deletions(-) diff --git a/cachelib/allocator/MM2Q.h b/cachelib/allocator/MM2Q.h index f0a41b4851..9c5ebce96b 100644 --- a/cachelib/allocator/MM2Q.h +++ b/cachelib/allocator/MM2Q.h @@ -500,6 +500,11 @@ class MM2Q { // Iterator passed as parameter. template void withEvictionIterator(F&& f); + + // Execute provided function under container lock. Function gets + // iterator passed as parameter. + template + void withPromotionIterator(F&& f); // Execute provided function under container lock. template @@ -921,6 +926,15 @@ void MM2Q::Container::withEvictionIterator(F&& fun) { } } +// returns the head of the hot queue for promotion +template T::*HookPtr> +template +void +MM2Q::Container::withPromotionIterator(F&& fun) { + lruMutex_->lock_combine([this, &fun]() { + fun(LockedIterator{LockHolder{}, lru_.begin(LruType::Hot)}); + }); +} template T::*HookPtr> template void MM2Q::Container::withContainerLock(F&& fun) { diff --git a/cachelib/allocator/datastruct/DList.h b/cachelib/allocator/datastruct/DList.h index 0708115385..56c9e21212 100644 --- a/cachelib/allocator/datastruct/DList.h +++ b/cachelib/allocator/datastruct/DList.h @@ -219,6 +219,10 @@ class DList { curr_ = dir_ == Direction::FROM_HEAD ? dlist_->head_ : dlist_->tail_; } + Direction getDirection() noexcept { + return dir_; + } + protected: void goForward() noexcept; void goBackward() noexcept; diff --git a/cachelib/allocator/datastruct/MultiDList.h b/cachelib/allocator/datastruct/MultiDList.h index 9470c9edae..d7a1351418 100644 --- a/cachelib/allocator/datastruct/MultiDList.h +++ b/cachelib/allocator/datastruct/MultiDList.h @@ -108,14 +108,18 @@ class MultiDList { } explicit Iterator(const MultiDList& mlist, - size_t listIdx) noexcept + size_t listIdx, bool head) noexcept : currIter_(mlist.lists_[mlist.lists_.size() - 1]->rbegin()), mlist_(mlist) { XDCHECK_LT(listIdx, mlist.lists_.size()); - initToValidRBeginFrom(listIdx); + if (head) { + initToValidBeginFrom(listIdx); + } else { + initToValidRBeginFrom(listIdx); + } // We should either point to an element or the end() iterator // which has an invalid index_. 
- XDCHECK(index_ == kInvalidIndex || currIter_.get() != nullptr); + XDCHECK(index_ == kInvalidIndex || index_ == mlist.lists_.size() || currIter_.get() != nullptr); } virtual ~Iterator() = default; @@ -167,6 +171,9 @@ class MultiDList { // reset iterator to the beginning of a specific queue void initToValidRBeginFrom(size_t listIdx) noexcept; + + // reset iterator to the head of a specific queue + void initToValidBeginFrom(size_t listIdx) noexcept; // Index of current list size_t index_{0}; @@ -182,6 +189,9 @@ class MultiDList { // provides an iterator starting from the tail of a specific list. Iterator rbegin(size_t idx) const; + + // provides an iterator starting from the head of a specific list. + Iterator begin(size_t idx) const; // Iterator to compare against for the end. Iterator rend() const noexcept; @@ -201,12 +211,26 @@ void MultiDList::Iterator::goForward() noexcept { } // Move iterator forward ++currIter_; - // If we land at the rend of this list, move to the previous list. - while (index_ != kInvalidIndex && - currIter_ == mlist_.lists_[index_]->rend()) { - --index_; - if (index_ != kInvalidIndex) { - currIter_ = mlist_.lists_[index_]->rbegin(); + + if (currIter_.getDirection() == DListIterator::Direction::FROM_HEAD) { + // If we land at the end of this list, move to the next list. + while (index_ != kInvalidIndex && index_ != mlist_.lists_.size() && + currIter_ == mlist_.lists_[index_]->end()) { + ++index_; + if (index_ != kInvalidIndex && index_ != mlist_.lists_.size()) { + currIter_ = mlist_.lists_[index_]->begin(); + } else { + return; + } + } + } else { + // If we land at the rend of this list, move to the previous list. + while (index_ != kInvalidIndex && + currIter_ == mlist_.lists_[index_]->rend()) { + --index_; + if (index_ != kInvalidIndex) { + currIter_ = mlist_.lists_[index_]->rbegin(); + } } } } @@ -247,6 +271,25 @@ void MultiDList::Iterator::initToValidRBeginFrom( : mlist_.lists_[index_]->rbegin(); } +template T::*HookPtr> +void MultiDList::Iterator::initToValidBeginFrom( + size_t listIdx) noexcept { + // Find the first non-empty list. + index_ = listIdx; + while (index_ != mlist_.lists_.size() && + mlist_.lists_[index_]->size() == 0) { + ++index_; + } + if (index_ == mlist_.lists_.size()) { + // we reached the end - we should get set to + // the invalid index + index_ = std::numeric_limits::max(); + } + currIter_ = index_ == std::numeric_limits::max() + ? 
mlist_.lists_[0]->begin() + : mlist_.lists_[index_]->begin(); +} + template T::*HookPtr> typename MultiDList::Iterator& MultiDList::Iterator::operator++() noexcept { @@ -273,7 +316,16 @@ typename MultiDList::Iterator MultiDList::rbegin( if (listIdx >= lists_.size()) { throw std::invalid_argument("Invalid list index for MultiDList iterator."); } - return MultiDList::Iterator(*this, listIdx); + return MultiDList::Iterator(*this, listIdx, false); +} + +template T::*HookPtr> +typename MultiDList::Iterator MultiDList::begin( + size_t listIdx) const { + if (listIdx >= lists_.size()) { + throw std::invalid_argument("Invalid list index for MultiDList iterator."); + } + return MultiDList::Iterator(*this, listIdx, true); } template T::*HookPtr> diff --git a/cachelib/allocator/tests/MM2QTest.cpp b/cachelib/allocator/tests/MM2QTest.cpp index e11dd95f5a..0e01ffa56f 100644 --- a/cachelib/allocator/tests/MM2QTest.cpp +++ b/cachelib/allocator/tests/MM2QTest.cpp @@ -223,6 +223,19 @@ void MMTypeTest::testIterate(std::vector>& nodes, } } +template +void MMTypeTest::testIterateHot(std::vector>& nodes, + Container& c) { + auto it = nodes.rbegin(); + c.withPromotionIterator([&it,&c](auto &&it2q) { + while (it2q && c.isHot(*it2q)) { + ASSERT_EQ(it2q->getId(), (*it)->getId()); + ++it2q; + ++it; + } + }); +} + template void MMTypeTest::testMatch(std::string expected, MMTypeTest::Container& c) { @@ -238,6 +251,23 @@ void MMTypeTest::testMatch(std::string expected, ASSERT_EQ(expected, actual); } +template +void MMTypeTest::testMatchHot(std::string expected, + MMTypeTest::Container& c) { + int index = -1; + std::string actual; + c.withPromotionIterator([&c,&actual,&index](auto &&it2q) { + while (it2q) { + ++index; + actual += folly::stringPrintf( + "%d:%s, ", it2q->getId(), + (c.isHot(*it2q) ? "H" : (c.isCold(*it2q) ? "C" : "W"))); + ++it2q; + } + }); + ASSERT_EQ(expected, actual); +} + TEST_F(MM2QTest, DetailedTest) { MM2Q::Config config; config.lruRefreshTime = 0; @@ -259,8 +289,11 @@ TEST_F(MM2QTest, DetailedTest) { } testIterate(nodes, c); + testIterateHot(nodes, c); testMatch("0:C, 1:C, 2:C, 3:C, 4:H, 5:H, ", c); + testMatchHot("5:H, 4:H, 3:C, 2:C, 1:C, 0:C, ", c); + // Move 3 to top of the hot cache c.recordAccess(*(nodes[4]), AccessMode::kRead); testMatch("0:C, 1:C, 2:C, 3:C, 5:H, 4:H, ", c); From bcb2ae288c931fd589ea5559ca38221970959a06 Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Mon, 6 Feb 2023 16:45:18 -0800 Subject: [PATCH 11/40] Multi-tier allocator patch Part 2. ---------------------------- This patch introduces tryEvictToNextMemoryTier and some additional multi-tier tests. We can consider merging tryEvictToNextMemoryTier with the initial implementation and separating the tests into a separate patch. Per tier pool stats (multi-tier patch part 3.) -------------------- This introduces per-tier stats; it can go with multi-tier patch part 2. Fix token creation and stats (#79) (multi-tier patch 4.) --------------------------------- This patch can go after we implement tryEvictToNextMemoryTier (or multi-tier part 2) and should be combined as such. * Fix issue with token creation * Do not increment evictFail* stats if evictFailConcurrentFill was incremented correct handling for expired items in eviction (#86) (multi-tier patch 5.) ----------------------------------------------------- This can be merged with patches that fix token creation and probably squashed into multi-tier patch 2.
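Taken together, the intended flow is: eviction from tier t first attempts a writeback into tier t+1 and only falls back to NVM (or dropping the item) when that is not possible; the expired-item caveat is spelled out below. A toy model of just this decision, illustrating the semantics rather than the CacheLib API:

#include <cstdio>

enum class Outcome { MovedToNextTier, WrittenToNvm, Dropped };

// Expired items are evicted outright rather than moved down; an item that
// cannot move (last tier, or the next-tier allocation failed) goes to NVM
// when admission allows, and is dropped otherwise.
Outcome evict(int tid, int numTiers, bool expired, bool nextTierAllocOk,
              bool nvmAdmit) {
  const bool lastTier = tid + 1 >= numTiers;
  if (!lastTier && !expired && nextTierAllocOk) {
    return Outcome::MovedToNextTier;  // counted via numWritebacks
  }
  return (nvmAdmit && !expired) ? Outcome::WrittenToNvm : Outcome::Dropped;
}

int main() {
  // Tier 0 of 2 with room below: the item moves down instead of leaving DRAM.
  std::printf("%d\n", evict(0, 2, false, true, true) == Outcome::MovedToNextTier);
  return 0;
}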
- we first check if an item is expired under mmContainer lock and if so mark it for eviction so it is recycled back up to allocateInternalTier. --- cachelib/allocator/Cache.cpp | 10 +- cachelib/allocator/CacheAllocator.h | 310 +++++++++++++++--- cachelib/allocator/CacheItem.h | 5 + cachelib/allocator/CacheStats.cpp | 94 ++++-- cachelib/allocator/CacheStats.h | 46 ++- cachelib/allocator/CacheStatsInternal.h | 25 +- cachelib/allocator/MMLru.h | 2 +- .../tests/AllocatorMemoryTiersTest.cpp | 6 +- .../tests/AllocatorMemoryTiersTest.h | 292 +++++++++++++++++ cachelib/allocator/tests/TestBase.h | 29 ++ cachelib/cachebench/cache/Cache.h | 32 +- cachelib/cachebench/cache/CacheStats.h | 98 ++++-- cachelib/cachebench/util/CacheConfig.h | 2 +- 13 files changed, 800 insertions(+), 151 deletions(-) diff --git a/cachelib/allocator/Cache.cpp b/cachelib/allocator/Cache.cpp index 37bba99a67..db7a281104 100644 --- a/cachelib/allocator/Cache.cpp +++ b/cachelib/allocator/Cache.cpp @@ -244,6 +244,7 @@ void CacheBase::updateGlobalCacheStats(const std::string& statPrefix) const { statPrefix + "cache.size.configured", memStats.configuredRamCacheSize + memStats.nvmCacheSize); + //TODO: add specific per-tier counters const auto stats = getGlobalCacheStats(); // Eviction Stats @@ -253,7 +254,8 @@ void CacheBase::updateGlobalCacheStats(const std::string& statPrefix) const { // from both ram and nvm, this is counted as a single eviction from cache. // Ram Evictions: item evicted from ram but it can be inserted into nvm const std::string ramEvictionKey = statPrefix + "ram.evictions"; - counters_.updateDelta(ramEvictionKey, stats.numEvictions); + counters_.updateDelta(ramEvictionKey, + std::accumulate(stats.numEvictions.begin(), stats.numEvictions.end(), 0)); // Nvm Evictions: item evicted from nvm but it can be still in ram const std::string nvmEvictionKey = statPrefix + "nvm.evictions"; counters_.updateDelta(nvmEvictionKey, stats.numNvmEvictions); @@ -295,11 +297,11 @@ void CacheBase::updateGlobalCacheStats(const std::string& statPrefix) const { } counters_.updateDelta(statPrefix + "cache.alloc_attempts", - stats.allocAttempts); + std::accumulate(stats.allocAttempts.begin(), stats.allocAttempts.end(),0)); counters_.updateDelta(statPrefix + "cache.eviction_attempts", - stats.evictionAttempts); + std::accumulate(stats.evictionAttempts.begin(),stats.evictionAttempts.end(),0)); counters_.updateDelta(statPrefix + "cache.alloc_failures", - stats.allocFailures); + std::accumulate(stats.allocFailures.begin(),stats.allocFailures.end(),0)); counters_.updateDelta(statPrefix + "cache.invalid_allocs", stats.invalidAllocs); diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index 4f306c48e6..29cb159b54 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -23,6 +23,8 @@ #include #include #include +#include +#include #include #include @@ -1207,6 +1209,8 @@ class CacheAllocator : public CacheBase { // pool stats by pool id PoolStats getPoolStats(PoolId pid) const override final; + // pool stats by tier id and pool id + PoolStats getPoolStats(TierId tid, PoolId pid) const; // This can be expensive so it is not part of PoolStats PoolEvictionAgeStats getPoolEvictionAgeStats( @@ -1571,15 +1575,6 @@ class CacheAllocator : public CacheBase { // not exist. FOLLY_ALWAYS_INLINE WriteHandle findFastImpl(Key key, AccessMode mode); - // Moves a regular item to a different memory tier. 
- // - // @param oldItem Reference to the item being moved - // @param newItemHdl Reference to the handle of the new item being moved into - // - // @return true If the move was completed, and the containers were updated - // successfully. - bool moveRegularItemOnEviction(Item& oldItem, WriteHandle& newItemHdl); - // Moves a regular item to a different slab. This should only be used during // slab release after the item's exclusive bit has been set. The user supplied // callback is responsible for copying the contents and fixing the semantics @@ -1777,6 +1772,27 @@ class CacheAllocator : public CacheBase { using EvictionIterator = typename MMContainer::LockedIterator; + // Try to move the item down to the next memory tier + // + // @param tid current tier ID of the item + // @param pid the pool ID the item belong to. + // @param item the item to evict + // @param fromBgThread whether this is called from BG thread + // + // @return valid handle to the item. This will be the last + // handle to the item. On failure an empty handle. + WriteHandle tryEvictToNextMemoryTier(TierId tid, PoolId pid, Item& item, + bool fromBgThread); + + // Try to move the item down to the next memory tier + // + // @param item the item to evict + // @param fromBgThread whether this is called from BG thread + // + // @return valid handle to the item. This will be the last + // handle to the item. On failure an empty handle. + WriteHandle tryEvictToNextMemoryTier(Item& item, bool fromBgThread); + // Wakes up waiters if there are any // // @param item wakes waiters that are waiting on that item @@ -2785,8 +2801,7 @@ CacheAllocator::allocateInternalTier(TierId tid, util::RollingLatencyTracker rollTracker{ (*stats_.classAllocLatency)[tid][pid][cid]}; - // TODO: per-tier - (*stats_.allocAttempts)[pid][cid].inc(); + (*stats_.allocAttempts)[tid][pid][cid].inc(); void* memory = allocator_[tid]->allocate(pid, requiredSize); @@ -2815,12 +2830,12 @@ CacheAllocator::allocateInternalTier(TierId tid, handle = acquire(new (memory) Item(key, size, creationTime, expiryTime)); if (handle) { handle.markNascent(); - (*stats_.fragmentationSize)[pid][cid].add( + (*stats_.fragmentationSize)[tid][pid][cid].add( util::getFragmentation(*this, *handle)); } } else { // failed to allocate memory. 
- (*stats_.allocFailures)[pid][cid].inc(); // TODO: per-tier + (*stats_.allocFailures)[tid][pid][cid].inc(); // wake up rebalancer if (!config_.poolRebalancerDisableForcedWakeUp && poolRebalancer_) { poolRebalancer_->wakeUp(); @@ -2897,14 +2912,14 @@ CacheAllocator::allocateChainedItemInternal(const Item& parent, util::RollingLatencyTracker rollTracker{ (*stats_.classAllocLatency)[tid][pid][cid]}; - (*stats_.allocAttempts)[pid][cid].inc(); + (*stats_.allocAttempts)[tid][pid][cid].inc(); void* memory = allocator_[tid]->allocate(pid, requiredSize); if (memory == nullptr) { memory = findEviction(tid, pid, cid); } if (memory == nullptr) { - (*stats_.allocFailures)[pid][cid].inc(); + (*stats_.allocFailures)[tid][pid][cid].inc(); return WriteHandle{}; } @@ -2915,7 +2930,7 @@ CacheAllocator::allocateChainedItemInternal(const Item& parent, if (child) { child.markNascent(); - (*stats_.fragmentationSize)[pid][cid].add( + (*stats_.fragmentationSize)[tid][pid][cid].add( util::getFragmentation(*this, *child)); } @@ -3254,7 +3269,7 @@ CacheAllocator::releaseBackToAllocator(Item& it, stats_.perPoolEvictionAgeSecs_[allocInfo.poolId].trackValue(refreshTime); } - (*stats_.fragmentationSize)[allocInfo.poolId][allocInfo.classId].sub( + (*stats_.fragmentationSize)[tid][allocInfo.poolId][allocInfo.classId].sub( util::getFragmentation(*this, it)); // Chained items can only end up in this place if the user has allocated @@ -3337,7 +3352,7 @@ CacheAllocator::releaseBackToAllocator(Item& it, const auto childInfo = allocator_[tid]->getAllocInfo(static_cast(head)); - (*stats_.fragmentationSize)[childInfo.poolId][childInfo.classId].sub( + (*stats_.fragmentationSize)[tid][childInfo.poolId][childInfo.classId].sub( util::getFragmentation(*this, *head)); removeFromMMContainer(*head); @@ -3781,14 +3796,16 @@ CacheAllocator::getNextCandidate(TierId tid, typename NvmCacheT::PutToken token; Item* toRecycle = nullptr; Item* candidate = nullptr; + bool isExpired = false; auto& mmContainer = getMMContainer(tid, pid, cid); + bool lastTier = tid+1 >= getNumTiers(); - mmContainer.withEvictionIterator([this, pid, cid, &candidate, &toRecycle, - &searchTries, &mmContainer, - &token](auto&& itr) { + mmContainer.withEvictionIterator([this, tid, pid, cid, &candidate, &toRecycle, + &searchTries, &mmContainer, &lastTier, + &isExpired, &token](auto&& itr) { if (!itr) { ++searchTries; - (*stats_.evictionAttempts)[pid][cid].inc(); + (*stats_.evictionAttempts)[tid][pid][cid].inc(); return; } @@ -3796,7 +3813,7 @@ CacheAllocator::getNextCandidate(TierId tid, config_.evictionSearchTries > searchTries) && itr) { ++searchTries; - (*stats_.evictionAttempts)[pid][cid].inc(); + (*stats_.evictionAttempts)[tid][pid][cid].inc(); auto* toRecycle_ = itr.get(); auto* candidate_ = @@ -3804,15 +3821,22 @@ CacheAllocator::getNextCandidate(TierId tid, ? &toRecycle_->asChainedItem().getParentItem(compressor_) : toRecycle_; - auto putToken = createPutToken(*candidate_); + // if it's last tier, the item will be evicted + // need to create put token before marking it exclusive + const bool evictToNvmCache = lastTier && shouldWriteToNvmCache(*candidate_); + auto token_ = evictToNvmCache + ? nvmCache_->createPutToken(candidate_->getKey()) + : typename NvmCacheT::PutToken{}; - if (shouldWriteToNvmCache(*candidate_) && !putToken.isValid()) { + if (evictToNvmCache && !token_.isValid()) { stats_.evictFailConcurrentFill.inc(); ++itr; continue; } - auto markedForEviction = candidate_->markForEviction(); + auto markedForEviction = (lastTier || candidate_->isExpired()) ? 
+ candidate_->markForEviction() : + candidate_->markMoving(); if (!markedForEviction) { if (candidate_->hasChainedItem()) { stats_.evictFailParentAC.inc(); @@ -3823,11 +3847,14 @@ CacheAllocator::getNextCandidate(TierId tid, continue; } + XDCHECK(candidate_->isMoving() || candidate_->isMarkedForEviction()); // markForEviction to make sure no other thread is evicting the item - // nor holding a handle to that item + // nor holding a handle to that item if this is the last tier + // since we won't be moving the item to the next tier toRecycle = toRecycle_; candidate = candidate_; - token = std::move(putToken); + isExpired = candidate_->isExpired(); + token = std::move(token_); // Check if parent changed for chained items - if yes, we cannot // remove the child from the mmContainer as we will not be evicting @@ -3847,13 +3874,61 @@ CacheAllocator::getNextCandidate(TierId tid, XDCHECK(toRecycle); XDCHECK(candidate); - XDCHECK(candidate->isMarkedForEviction()); + XDCHECK(candidate->isMoving() || candidate->isMarkedForEviction()); + + auto evictedToNext = (lastTier || isExpired) ? nullptr + : tryEvictToNextMemoryTier(*candidate, false); + if (!evictedToNext) { + // if insertOrReplace was called during the move, + // the candidate will not be accessible (failed replace during tryEvict) + // - that is why we failed to + // evict to the next tier, and insertOrReplace + // will remove the item from the NVM cache; + // however, if the candidate is accessible, + // that means the allocation in the next + // tier failed - so we will continue to + // evict the item to the NVM cache + bool failedToReplace = !candidate->isAccessible(); + if (!token.isValid() && !failedToReplace) { + token = createPutToken(*candidate); + } + // tryEvictToNextMemoryTier can fail if: + // a) allocation of the new item fails; in that case, + // it should still be possible to mark the item for eviction. + // b) another thread calls insertOrReplace and the item + // is no longer accessible + // + // in case we are on the last tier, we would have already marked the + // item as exclusive since we will not be moving it to the next tier + // but rather just evicting it altogether; no need to + // markForEvictionWhenMoving + auto ret = (lastTier || isExpired) ? true : candidate->markForEvictionWhenMoving(); + XDCHECK(ret); + + unlinkItemForEviction(*candidate); + + // wake up any readers that wait for the move to complete + // it's safe to do now, as we have the item marked exclusive and + // no other reader can be added to the waiters list + wakeUpWaiters(candidate->getKey(), {}); + + if (token.isValid() && shouldWriteToNvmCacheExclusive(*candidate) + && !failedToReplace) { + nvmCache_->put(*candidate, std::move(token)); + } - unlinkItemForEviction(*candidate); + } else { + XDCHECK(!evictedToNext->isMarkedForEviction() && !evictedToNext->isMoving()); + XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving()); + XDCHECK(!candidate->isAccessible()); + XDCHECK(candidate->getKey() == evictedToNext->getKey()); - if (token.isValid() && shouldWriteToNvmCacheExclusive(*candidate)) { - nvmCache_->put(*candidate, std::move(token)); + (*stats_.numWritebacks)[tid][pid][cid].inc(); + wakeUpWaiters(candidate->getKey(), std::move(evictedToNext)); } + + XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving()); + return {candidate, toRecycle}; } @@ -3876,9 +3951,9 @@ CacheAllocator::findEviction(TierId tid, PoolId pid, ClassId cid) { // NULL. If `ref` == 0 then it means that we are the last holder of // that item.
if (candidate->hasChainedItem()) { - (*stats_.chainedItemEvictions)[pid][cid].inc(); + (*stats_.chainedItemEvictions)[tid][pid][cid].inc(); } else { - (*stats_.regularItemEvictions)[pid][cid].inc(); + (*stats_.regularItemEvictions)[tid][pid][cid].inc(); } if (auto eventTracker = getEventTracker()) { @@ -3946,6 +4021,49 @@ bool CacheAllocator::shouldWriteToNvmCacheExclusive( return true; } +template +typename CacheAllocator::WriteHandle +CacheAllocator::tryEvictToNextMemoryTier( + TierId tid, PoolId pid, Item& item, bool fromBgThread) { + XDCHECK(item.isMoving()); + XDCHECK(item.getRefCount() == 0); + if(item.hasChainedItem()) return WriteHandle{}; // TODO: We do not support ChainedItem yet + + TierId nextTier = tid; // TODO - calculate this based on some admission policy + while (++nextTier < getNumTiers()) { // try to evict down to the next memory tiers + // allocateInternal might trigger another eviction + auto newItemHdl = allocateInternalTier(nextTier, pid, + item.getKey(), + item.getSize(), + item.getCreationTime(), + item.getExpiryTime(), + fromBgThread); + + if (newItemHdl) { + + bool moveSuccess = moveRegularItem(item, newItemHdl); + if (!moveSuccess) { + return WriteHandle{}; + } + XDCHECK_EQ(newItemHdl->getSize(), item.getSize()); + item.unmarkMoving(); + return newItemHdl; + } else { + return WriteHandle{}; + } + } + + return {}; +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::tryEvictToNextMemoryTier(Item& item, bool fromBgThread) { + auto tid = getTierId(item); + auto pid = allocator_[tid]->getAllocInfo(item.getMemory()).poolId; + return tryEvictToNextMemoryTier(tid, pid, item, fromBgThread); +} + template typename CacheAllocator::RemoveRes CacheAllocator::remove(typename Item::Key key) { @@ -4388,7 +4506,7 @@ bool CacheAllocator::recordAccessInMMContainer(Item& item, const auto tid = getTierId(item); const auto allocInfo = allocator_[tid]->getAllocInfo(static_cast(&item)); - (*stats_.cacheHits)[allocInfo.poolId][allocInfo.classId].inc(); + (*stats_.cacheHits)[tid][allocInfo.poolId][allocInfo.classId].inc(); // track recently accessed items if needed if (UNLIKELY(config_.trackRecentItemsForDump)) { @@ -4879,26 +4997,42 @@ PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { // TODO export evictions, numItems etc from compact cache directly. 
if (!isCompactCache) { for (const ClassId cid : classIds) { - uint64_t classHits = (*stats_.cacheHits)[poolId][cid].get(); - XDCHECK(mmContainers_[0][poolId][cid], - folly::sformat("Pid {}, Cid {} not initialized.", poolId, cid)); + uint64_t allocAttempts = 0, evictionAttempts = 0, allocFailures = 0, + fragmentationSize = 0, classHits = 0, chainedItemEvictions = 0, + regularItemEvictions = 0, numWritebacks = 0; + MMContainerStat mmContainerStats; + for (TierId tid = 0; tid < getNumTiers(); tid++) { + allocAttempts += (*stats_.allocAttempts)[tid][poolId][cid].get(); + evictionAttempts += (*stats_.evictionAttempts)[tid][poolId][cid].get(); + allocFailures += (*stats_.allocFailures)[tid][poolId][cid].get(); + fragmentationSize += (*stats_.fragmentationSize)[tid][poolId][cid].get(); + classHits += (*stats_.cacheHits)[tid][poolId][cid].get(); + chainedItemEvictions += (*stats_.chainedItemEvictions)[tid][poolId][cid].get(); + regularItemEvictions += (*stats_.regularItemEvictions)[tid][poolId][cid].get(); + numWritebacks += (*stats_.numWritebacks)[tid][poolId][cid].get(); + mmContainerStats += getMMContainerStat(tid, poolId, cid); + XDCHECK(mmContainers_[tid][poolId][cid], + folly::sformat("Tid {}, Pid {}, Cid {} not initialized.", tid, poolId, cid)); + } cacheStats.insert( {cid, - {allocSizes[cid], (*stats_.allocAttempts)[poolId][cid].get(), - (*stats_.evictionAttempts)[poolId][cid].get(), - (*stats_.allocFailures)[poolId][cid].get(), - (*stats_.fragmentationSize)[poolId][cid].get(), classHits, - (*stats_.chainedItemEvictions)[poolId][cid].get(), - (*stats_.regularItemEvictions)[poolId][cid].get(), - mmContainers_[0][poolId][cid]->getStats()} - - }); + {allocSizes[cid], + allocAttempts, + evictionAttempts, + allocFailures, + fragmentationSize, + classHits, + chainedItemEvictions, + regularItemEvictions, + numWritebacks, + mmContainerStats}}); totalHits += classHits; } } PoolStats ret; ret.isCompactCache = isCompactCache; + // pool name is also shared among tiers ret.poolName = allocator_[0]->getPoolName(poolId); ret.poolSize = pool.getPoolSize(); ret.poolUsableSize = pool.getPoolUsableSize(); @@ -4911,6 +5045,59 @@ PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { return ret; } +template +PoolStats CacheAllocator::getPoolStats(TierId tid, PoolId poolId) const { + const auto& pool = allocator_[tid]->getPool(poolId); + const auto& allocSizes = pool.getAllocSizes(); + auto mpStats = pool.getStats(); + const auto& classIds = mpStats.classIds; + + // check if this is a compact cache. + bool isCompactCache = false; + { + std::shared_lock lock(compactCachePoolsLock_); + isCompactCache = isCompactCachePool_[poolId]; + } + + folly::F14FastMap cacheStats; + uint64_t totalHits = 0; + // cacheStats is only meaningful for pools that are not compact caches. + // TODO export evictions, numItems etc from compact cache directly.
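+ // Unlike the PoolId-only overload above, the stats here are read from
+ // the single tier `tid` rather than summed across all tiers.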
+ if (!isCompactCache) { + for (const ClassId cid : classIds) { + uint64_t classHits = (*stats_.cacheHits)[tid][poolId][cid].get(); + XDCHECK(mmContainers_[tid][poolId][cid], + folly::sformat("Tid {}, Pid {}, Cid {} not initialized.", tid, poolId, cid)); + cacheStats.insert( + {cid, + {allocSizes[cid], + (*stats_.allocAttempts)[tid][poolId][cid].get(), + (*stats_.evictionAttempts)[tid][poolId][cid].get(), + (*stats_.allocFailures)[tid][poolId][cid].get(), + (*stats_.fragmentationSize)[tid][poolId][cid].get(), + classHits, + (*stats_.chainedItemEvictions)[tid][poolId][cid].get(), + (*stats_.regularItemEvictions)[tid][poolId][cid].get(), + (*stats_.numWritebacks)[tid][poolId][cid].get(), + getMMContainerStat(tid, poolId, cid)}}); + totalHits += classHits; + } + } + + PoolStats ret; + ret.isCompactCache = isCompactCache; + ret.poolName = allocator_[tid]->getPoolName(poolId); + ret.poolSize = pool.getPoolSize(); + ret.poolUsableSize = pool.getPoolUsableSize(); + ret.poolAdvisedSize = pool.getPoolAdvisedSize(); + ret.cacheStats = std::move(cacheStats); + ret.mpStats = std::move(mpStats); + ret.numPoolGetHits = totalHits; + ret.evictionAgeSecs = stats_.perPoolEvictionAgeSecs_[poolId].estimate(); + + return ret; +} + template ACStats CacheAllocator::getACStats(TierId tid, PoolId poolId, @@ -5158,7 +5345,7 @@ bool CacheAllocator::moveForSlabRelease(Item& oldItem) { } allocator_[tid]->free(&oldItem); - (*stats_.fragmentationSize)[allocInfo.poolId][allocInfo.classId].sub( + (*stats_.fragmentationSize)[tid][allocInfo.poolId][allocInfo.classId].sub( util::getFragmentation(*this, oldItem)); stats_.numMoveSuccesses.inc(); return true; @@ -5233,12 +5420,13 @@ void CacheAllocator::evictForSlabRelease(Item& item) { nvmCache_->put(*evicted, std::move(token)); } + const auto tid = getTierId(*evicted); const auto allocInfo = - allocator_[getTierId(item)]->getAllocInfo(static_cast(&item)); + allocator_[tid]->getAllocInfo(static_cast(evicted)); if (evicted->hasChainedItem()) { - (*stats_.chainedItemEvictions)[allocInfo.poolId][allocInfo.classId].inc(); + (*stats_.chainedItemEvictions)[tid][allocInfo.poolId][allocInfo.classId].inc(); } else { - (*stats_.regularItemEvictions)[allocInfo.poolId][allocInfo.classId].inc(); + (*stats_.regularItemEvictions)[tid][allocInfo.poolId][allocInfo.classId].inc(); } stats_.numEvictionSuccesses.inc(); @@ -5472,8 +5660,12 @@ folly::IOBufQueue CacheAllocator::saveStateToIOBuf() { for (PoolId pid : pools) { for (unsigned int cid = 0; cid < (*stats_.fragmentationSize)[pid].size(); ++cid) { + uint64_t fragmentationSize = 0; + for (TierId tid = 0; tid < getNumTiers(); tid++) { + fragmentationSize += (*stats_.fragmentationSize)[tid][pid][cid].get(); + } metadata_.fragmentationSize()[pid][static_cast(cid)] = - (*stats_.fragmentationSize)[pid][cid].get(); + fragmentationSize; } if (isCompactCachePool_[pid]) { metadata_.compactCachePools()->push_back(pid); @@ -5719,8 +5911,18 @@ void CacheAllocator::initStats() { // deserialize the fragmentation size of each thread. 
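// (e.g. with two tiers, a recovered total of 11 is split below as 5 for
// tier 1 plus the 6-byte leftover for tier 0)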
for (const auto& pid : *metadata_.fragmentationSize()) { for (const auto& cid : pid.second) { - (*stats_.fragmentationSize)[pid.first][cid.first].set( - static_cast(cid.second)); + //in multi-tier we serialized as the sum - no way + //to get back so just divide the two for now + //TODO: proper multi-tier serialization + uint64_t total = static_cast(cid.second); + uint64_t part = total / getNumTiers(); + uint64_t sum = 0; + for (TierId tid = 1; tid < getNumTiers(); tid++) { + (*stats_.fragmentationSize)[tid][pid.first][cid.first].set(part); + sum += part; + } + uint64_t leftover = total - sum; + (*stats_.fragmentationSize)[0][pid.first][cid.first].set(leftover); } } diff --git a/cachelib/allocator/CacheItem.h b/cachelib/allocator/CacheItem.h index fe60187e6e..17b80f5ba3 100644 --- a/cachelib/allocator/CacheItem.h +++ b/cachelib/allocator/CacheItem.h @@ -43,6 +43,9 @@ class BaseAllocatorTest; template class AllocatorHitStatsTest; +template +class AllocatorMemoryTiersTest; + template class MapTest; @@ -466,6 +469,8 @@ class CACHELIB_PACKED_ATTR CacheItem { FRIEND_TEST(ItemTest, NonStringKey); template friend class facebook::cachelib::tests::AllocatorHitStatsTest; + template + friend class facebook::cachelib::tests::AllocatorMemoryTiersTest; }; // A chained item has a hook pointing to the next chained item. The hook is diff --git a/cachelib/allocator/CacheStats.cpp b/cachelib/allocator/CacheStats.cpp index 417e8fe246..dcb81930b9 100644 --- a/cachelib/allocator/CacheStats.cpp +++ b/cachelib/allocator/CacheStats.cpp @@ -22,18 +22,21 @@ namespace facebook::cachelib { namespace detail { void Stats::init() { - cacheHits = std::make_unique(); - allocAttempts = std::make_unique(); - evictionAttempts = std::make_unique(); - fragmentationSize = std::make_unique(); - allocFailures = std::make_unique(); - chainedItemEvictions = std::make_unique(); - regularItemEvictions = std::make_unique(); + cacheHits = std::make_unique(); + allocAttempts = std::make_unique(); + evictionAttempts = std::make_unique(); + fragmentationSize = std::make_unique(); + allocFailures = std::make_unique(); + chainedItemEvictions = std::make_unique(); + regularItemEvictions = std::make_unique(); + numWritebacks = std::make_unique(); auto initToZero = [](auto& a) { - for (auto& s : a) { - for (auto& c : s) { + for (auto& t : a) { + for (auto& p : t) { + for (auto& c : p) { c.set(0); } + } } }; @@ -43,6 +46,7 @@ void Stats::init() { initToZero(*fragmentationSize); initToZero(*chainedItemEvictions); initToZero(*regularItemEvictions); + initToZero(*numWritebacks); classAllocLatency = std::make_unique(); } @@ -52,7 +56,7 @@ struct SizeVerify {}; void Stats::populateGlobalCacheStats(GlobalCacheStats& ret) const { #ifndef SKIP_SIZE_VERIFY - SizeVerify a = SizeVerify<16272>{}; + SizeVerify a = SizeVerify<16288>{}; std::ignore = a; #endif ret.numCacheGets = numCacheGets.get(); @@ -115,20 +119,43 @@ void Stats::populateGlobalCacheStats(GlobalCacheStats& ret) const { ret.nvmEvictionSecondsToExpiry = this->nvmEvictionSecondsToExpiry_.estimate(); ret.nvmPutSize = this->nvmPutSize_.estimate(); - auto accum = [](const PerPoolClassAtomicCounters& c) { - uint64_t sum = 0; - for (const auto& x : c) { - for (const auto& v : x) { - sum += v.get(); - } + auto accum = [](const PerTierPerPoolClassAtomicCounters& t) { + std::vector stat; + for (const auto& c : t) { + uint64_t sum = 0; + for (const auto& x : c) { + for (const auto& v : x) { + sum += v.get(); + } + } + stat.push_back(sum); + } + return stat; + }; + + auto accumTL = [](const 
PerTierPerPoolClassTLCounters& t) { + std::vector stat; + for (const auto& c : t) { + uint64_t sum = 0; + for (const auto& x : c) { + for (const auto& v : x) { + sum += v.get(); + } + } + stat.push_back(sum); } - return sum; + return stat; }; ret.allocAttempts = accum(*allocAttempts); ret.evictionAttempts = accum(*evictionAttempts); ret.allocFailures = accum(*allocFailures); - ret.numEvictions = accum(*chainedItemEvictions); - ret.numEvictions += accum(*regularItemEvictions); + auto chainedEvictions = accum(*chainedItemEvictions); + auto regularEvictions = accum(*regularItemEvictions); + for (TierId tid = 0; tid < chainedEvictions.size(); tid++) { + ret.numEvictions.push_back(chainedEvictions[tid] + regularEvictions[tid]); + } + ret.numWritebacks = accum(*numWritebacks); + ret.numCacheHits = accumTL(*cacheHits); ret.invalidAllocs = invalidAllocs.get(); ret.numRefcountOverflow = numRefcountOverflow.get(); @@ -146,6 +173,18 @@ void Stats::populateGlobalCacheStats(GlobalCacheStats& ret) const { } // namespace detail +MMContainerStat& MMContainerStat::operator+=(const MMContainerStat& other) { + + size += other.size; + oldestTimeSec = std::min(oldestTimeSec,other.oldestTimeSec); + lruRefreshTime = std::max(lruRefreshTime,other.lruRefreshTime); + numHotAccesses += other.numHotAccesses; + numColdAccesses += other.numColdAccesses; + numWarmAccesses += other.numWarmAccesses; + numTailAccesses += other.numTailAccesses; + return *this; +} + PoolStats& PoolStats::operator+=(const PoolStats& other) { auto verify = [](bool isCompatible) { if (!isCompatible) { @@ -183,6 +222,7 @@ PoolStats& PoolStats::operator+=(const PoolStats& other) { d.allocFailures += s.allocFailures; d.fragmentationSize += s.fragmentationSize; d.numHits += s.numHits; + d.numWritebacks += s.numWritebacks; d.chainedItemEvictions += s.chainedItemEvictions; d.regularItemEvictions += s.regularItemEvictions; } @@ -238,6 +278,14 @@ uint64_t PoolStats::numEvictions() const noexcept { return n; } +uint64_t PoolStats::numWritebacks() const noexcept { + uint64_t n = 0; + for (const auto& s : cacheStats) { + n += s.second.numWritebacks; + } + return n; +} + uint64_t PoolStats::numItems() const noexcept { uint64_t n = 0; for (const auto& s : cacheStats) { @@ -246,6 +294,14 @@ uint64_t PoolStats::numItems() const noexcept { return n; } +uint64_t PoolStats::numHits() const noexcept { + uint64_t n = 0; + for (const auto& s : cacheStats) { + n += s.second.numHits; + } + return n; +} + uint64_t PoolStats::numAllocFailures() const { uint64_t n = 0; for (const auto& s : cacheStats) { diff --git a/cachelib/allocator/CacheStats.h b/cachelib/allocator/CacheStats.h index 7a16595343..8c9b1c370c 100644 --- a/cachelib/allocator/CacheStats.h +++ b/cachelib/allocator/CacheStats.h @@ -80,22 +80,25 @@ struct PoolEvictionAgeStats { // Stats for MM container struct MMContainerStat { // number of elements in the container. - size_t size; + size_t size{0}; // what is the unix timestamp in seconds of the oldest element existing in // the container. - uint64_t oldestTimeSec; + uint64_t oldestTimeSec{0}; // refresh time for LRU - uint64_t lruRefreshTime; + uint64_t lruRefreshTime{0}; // TODO: Make the MMContainerStat generic by moving the Lru/2Q specific // stats inside MMType and exporting them through a generic stats interface. // number of hits in each lru. 
- uint64_t numHotAccesses; - uint64_t numColdAccesses; - uint64_t numWarmAccesses; - uint64_t numTailAccesses; + uint64_t numHotAccesses{0}; + uint64_t numColdAccesses{0}; + uint64_t numWarmAccesses{0}; + uint64_t numTailAccesses{0}; + + // aggregate stats together (accross tiers) + MMContainerStat& operator+=(const MMContainerStat& other); }; // cache related stats for a given allocation class. @@ -116,13 +119,16 @@ struct CacheStat { uint64_t fragmentationSize{0}; // number of hits for this container. - uint64_t numHits; + uint64_t numHits{0}; // number of evictions from this class id that was of a chained item - uint64_t chainedItemEvictions; + uint64_t chainedItemEvictions{0}; // number of regular items that were evicted from this classId - uint64_t regularItemEvictions; + uint64_t regularItemEvictions{0}; + + // number of items that are moved to next tier + uint64_t numWritebacks{0}; // the stats from the mm container MMContainerStat containerStat; @@ -199,12 +205,18 @@ struct PoolStats { // number of evictions for this pool uint64_t numEvictions() const noexcept; + // number of writebacks for this pool + uint64_t numWritebacks() const noexcept; + // number of all items in this pool uint64_t numItems() const noexcept; // total number of allocations currently in this pool uint64_t numActiveAllocs() const noexcept; + // number of hits for an alloc class in this pool + uint64_t numHits() const noexcept; + // number of hits for an alloc class in this pool uint64_t numHitsForClass(ClassId cid) const { return cacheStats.at(cid).numHits; @@ -454,16 +466,22 @@ struct GlobalCacheStats { uint64_t numNvmItemRemovedSetSize{0}; // number of attempts to allocate an item - uint64_t allocAttempts{0}; + std::vector allocAttempts; // number of eviction attempts - uint64_t evictionAttempts{0}; + std::vector evictionAttempts; // number of failures to allocate an item due to internal error - uint64_t allocFailures{0}; + std::vector allocFailures; // number of evictions across all the pools in the cache. - uint64_t numEvictions{0}; + std::vector numEvictions; + + // number of writebacks across all the pools in the cache. + std::vector numWritebacks; + + // number of hits per tier across all the pools in the cache. + std::vector numCacheHits; // number of allocation attempts with invalid input params. uint64_t invalidAllocs{0}; diff --git a/cachelib/allocator/CacheStatsInternal.h b/cachelib/allocator/CacheStatsInternal.h index 4b437d9dbc..9265f74251 100644 --- a/cachelib/allocator/CacheStatsInternal.h +++ b/cachelib/allocator/CacheStatsInternal.h @@ -212,23 +212,26 @@ struct Stats { // we're currently writing into flash. 
mutable util::PercentileStats nvmPutSize_; - using PerPoolClassAtomicCounters = + using PerTierPerPoolClassAtomicCounters = std::array< std::array, - MemoryPoolManager::kMaxPools>; + MemoryPoolManager::kMaxPools>, + CacheBase::kMaxTiers>; // count of a stat for a specific allocation class - using PerPoolClassTLCounters = + using PerTierPerPoolClassTLCounters = std::array< std::array, - MemoryPoolManager::kMaxPools>; + MemoryPoolManager::kMaxPools>, + CacheBase::kMaxTiers>; // hit count for every alloc class in every pool - std::unique_ptr cacheHits{}; - std::unique_ptr allocAttempts{}; - std::unique_ptr evictionAttempts{}; - std::unique_ptr allocFailures{}; - std::unique_ptr fragmentationSize{}; - std::unique_ptr chainedItemEvictions{}; - std::unique_ptr regularItemEvictions{}; + std::unique_ptr cacheHits{}; + std::unique_ptr allocAttempts{}; + std::unique_ptr evictionAttempts{}; + std::unique_ptr allocFailures{}; + std::unique_ptr fragmentationSize{}; + std::unique_ptr chainedItemEvictions{}; + std::unique_ptr regularItemEvictions{}; + std::unique_ptr numWritebacks{}; using PerTierPoolClassRollingStats = std::array< std::array, diff --git a/cachelib/allocator/MMLru.h b/cachelib/allocator/MMLru.h index d17be6b15b..a98c86d9a6 100644 --- a/cachelib/allocator/MMLru.h +++ b/cachelib/allocator/MMLru.h @@ -233,7 +233,7 @@ class MMLru { std::chrono::seconds mmReconfigureIntervalSecs{}; // Whether to use combined locking for withEvictionIterator. - bool useCombinedLockForIterators{false}; + bool useCombinedLockForIterators{true}; }; // The container object which can be used to keep track of objects of type diff --git a/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp b/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp index c56f640847..13388f8e8e 100644 --- a/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp +++ b/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp @@ -21,11 +21,15 @@ namespace cachelib { namespace tests { using LruAllocatorMemoryTiersTest = AllocatorMemoryTiersTest; - +//using LruTestAllocatorMemoryTiersTest = AllocatorMemoryTiersTest; // TODO(MEMORY_TIER): add more tests with different eviction policies TEST_F(LruAllocatorMemoryTiersTest, MultiTiersInvalid) { this->testMultiTiersInvalid(); } TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValid) { this->testMultiTiersValid(); } +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValidStats) { this->testMultiTiersValidStats(); } TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValidMixed) { this->testMultiTiersValidMixed(); } +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersRemoveDuringEviction) { this->testMultiTiersRemoveDuringEviction(); } +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersReplaceDuringEviction) { this->testMultiTiersReplaceDuringEviction(); } +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersReplaceDuringEvictionWithReader) { this->testMultiTiersReplaceDuringEvictionWithReader(); } } // end of namespace tests } // end of namespace cachelib diff --git a/cachelib/allocator/tests/AllocatorMemoryTiersTest.h b/cachelib/allocator/tests/AllocatorMemoryTiersTest.h index 2ecb2c14ca..27db22bac3 100644 --- a/cachelib/allocator/tests/AllocatorMemoryTiersTest.h +++ b/cachelib/allocator/tests/AllocatorMemoryTiersTest.h @@ -20,12 +20,46 @@ #include "cachelib/allocator/MemoryTierCacheConfig.h" #include "cachelib/allocator/tests/TestBase.h" +#include +#include +#include +#include +#include + namespace facebook { namespace cachelib { namespace tests { template class AllocatorMemoryTiersTest : public AllocatorTest { + private: + 
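+  // Shared driver for the *DuringEviction tests below: builds a two-tier
+  // cache with the given move callback installed, then allocates items in a
+  // loop until the callback sets `quit`.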
template + void testMultiTiersAsyncOpDuringMove(std::unique_ptr& alloc, + PoolId& pool, bool& quit, MvCallback&& moveCb) { + typename AllocatorT::Config config; + config.setCacheSize(4 * Slab::kSize); + config.enableCachePersistence("/tmp"); + config.configureMemoryTiers({ + MemoryTierCacheConfig::fromShm() + .setRatio(1).setMemBind(std::string("0")), + MemoryTierCacheConfig::fromShm() + .setRatio(1).setMemBind(std::string("0")) + }); + + config.enableMovingOnSlabRelease(moveCb, {} /* ChainedItemsMoveSync */, + -1 /* movingAttemptsLimit */); + + alloc = std::make_unique(AllocatorT::SharedMemNew, config); + ASSERT(alloc != nullptr); + pool = alloc->addPool("default", alloc->getCacheMemoryStats().ramCacheSize); + + int i = 0; + while(!quit) { + auto handle = alloc->allocate(pool, std::to_string(++i), std::string("value").size()); + ASSERT(handle != nullptr); + ASSERT_NO_THROW(alloc->insertOrReplace(handle)); + } + } public: void testMultiTiersInvalid() { typename AllocatorT::Config config; @@ -55,6 +89,70 @@ class AllocatorMemoryTiersTest : public AllocatorTest { ASSERT(handle != nullptr); ASSERT_NO_THROW(alloc->insertOrReplace(handle)); } + + void testMultiTiersValidStats() { + typename AllocatorT::Config config; + size_t nSlabs = 20; + config.setCacheSize(nSlabs * Slab::kSize); + config.enableCachePersistence("/tmp"); + auto moveCb = [&](typename AllocatorT::Item& oldItem, + typename AllocatorT::Item& newItem, + typename AllocatorT::Item* /* parentPtr */) { + std::memcpy(newItem.getMemory(), oldItem.getMemory(), + oldItem.getSize()); + }; + + config.enableMovingOnSlabRelease(moveCb, {}, 10); + ASSERT_NO_THROW(config.configureMemoryTiers( + {MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind( + std::string("0")), + MemoryTierCacheConfig::fromShm().setRatio(2).setMemBind( + std::string("0"))})); + + auto alloc = std::make_unique(AllocatorT::SharedMemNew, config); + ASSERT(alloc != nullptr); + size_t keyLen = 8; + auto pool = alloc->addPool("default", alloc->getCacheMemoryStats().ramCacheSize); + std::vector valsize = {1000}; + std::vector itemCount; + std::vector evictCount; + for (uint32_t tid = 0; tid < 2; tid++) { + this->fillUpPoolUntilEvictions(*alloc, tid, pool, valsize, keyLen); + auto stats = alloc->getPoolStats(tid, pool); + const auto& classIds = stats.mpStats.classIds; + uint32_t prev = 0; + ClassId cid = 0; + for (const ClassId c : classIds) { + uint32_t currSize = stats.cacheStats[c].allocSize; + if (prev <= valsize[0] && valsize[0] <= currSize) { + cid = c; + break; + } + prev = currSize; + } + + std::cout << "Tid: " << tid << " cid: " << static_cast(cid) + << " items: " << stats.cacheStats[cid].numItems() + << " evicts: " << stats.cacheStats[cid].numEvictions() + << std::endl; + ASSERT_GE(stats.cacheStats[cid].numItems(), 1); + ASSERT_EQ(stats.cacheStats[cid].numEvictions(), 1); + itemCount.push_back(stats.cacheStats[cid].numItems()); + evictCount.push_back(stats.cacheStats[cid].numEvictions()); + //first tier should have some writebacks to second tier + //second tier should not have any writebacks since it + //is last memory tier + if (tid == 0) { + ASSERT_EQ(stats.cacheStats[cid].numWritebacks, 1); + } else { + ASSERT_EQ(0, stats.cacheStats[cid].numWritebacks); + } + } + for (uint32_t tid = 1; tid < 2; tid++) { + ASSERT_NE(itemCount[tid],itemCount[tid-1]); + ASSERT_EQ(evictCount[tid],evictCount[tid-1]); + } + } void testMultiTiersValidMixed() { typename AllocatorT::Config config; @@ -74,6 +172,200 @@ class AllocatorMemoryTiersTest : public AllocatorTest { ASSERT(handle 
!= nullptr); ASSERT_NO_THROW(alloc->insertOrReplace(handle)); } + + void testMultiTiersRemoveDuringEviction() { + std::unique_ptr alloc; + PoolId pool; + std::unique_ptr t; + folly::Latch latch(1); + bool quit = false; + + auto moveCb = [&] (typename AllocatorT::Item& oldItem, + typename AllocatorT::Item& newItem, + typename AllocatorT::Item* /* parentPtr */) { + + auto key = oldItem.getKey(); + t = std::make_unique([&](){ + // remove() function is blocked by wait context + // till item is moved to next tier. So that, we should + // notify latch before calling remove() + latch.count_down(); + alloc->remove(key); + }); + // wait till async thread is running + latch.wait(); + memcpy(newItem.getMemory(), oldItem.getMemory(), oldItem.getSize()); + quit = true; + }; + + testMultiTiersAsyncOpDuringMove(alloc, pool, quit, moveCb); + + t->join(); + } + + void testMultiTiersReplaceDuringEviction() { + std::unique_ptr alloc; + PoolId pool; + std::unique_ptr t; + folly::Latch latch(1); + bool quit = false; + + auto moveCb = [&] (typename AllocatorT::Item& oldItem, + typename AllocatorT::Item& newItem, + typename AllocatorT::Item* /* parentPtr */) { + auto key = oldItem.getKey(); + if(!quit) { + // we need to replace only once because subsequent allocate calls + // will cause evictions recursevly + quit = true; + t = std::make_unique([&](){ + auto handle = alloc->allocate(pool, key, std::string("new value").size()); + // insertOrReplace() function is blocked by wait context + // till item is moved to next tier. So that, we should + // notify latch before calling insertOrReplace() + latch.count_down(); + ASSERT_NO_THROW(alloc->insertOrReplace(handle)); + }); + // wait till async thread is running + latch.wait(); + } + memcpy(newItem.getMemory(), oldItem.getMemory(), oldItem.getSize()); + }; + + testMultiTiersAsyncOpDuringMove(alloc, pool, quit, moveCb); + + t->join(); + + } + + + inline void gdb_sync1() { for (volatile int i = 0; i < 100; i++); } + inline void gdb_sync2() { for (volatile int i = 0; i < 100; i++); } + inline void gdb_sync3() { for (volatile int i = 0; i < 100; i++); } + using ReadHandle = typename AllocatorT::ReadHandle; + void testMultiTiersReplaceDuringEvictionWithReader() { + sem_unlink ("/gdb1_sem"); + sem_t *sem = sem_open ("/gdb1_sem", O_CREAT | O_EXCL, S_IRUSR | S_IWUSR, 0); + int gdbfd = open("/tmp/gdb1.gdb",O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR); + char gdbcmds[] = + "set attached=1\n" + "break gdb_sync1\n" + "break gdb_sync2\n" + "break gdb_sync3\n" + "break moveRegularItem\n" + "c\n" + "set scheduler-locking on\n" + "thread 1\n" + "c\n" + "thread 3\n" + "c\n" + "thread 4\n" + "break nativeFutexWaitImpl thread 4\n" + "c\n" + "thread 3\n" + "break nativeFutexWaitImpl thread 3\n" + "c\n" + "thread 1\n" + "break releaseBackToAllocator\n" + "c\n" + "c\n" + "thread 4\n" + "c\n" + "thread 3\n" + "c\n" + "thread 1\n" + "c\n" + "quit\n"; + int ret = write(gdbfd,gdbcmds,strlen(gdbcmds)); + int ppid = getpid(); //parent pid + int pid = fork(); + if (pid == 0) { + sem_wait(sem); + sem_close(sem); + sem_unlink("/gdb1_sem"); + char cmdpid[256]; + sprintf(cmdpid,"%d",ppid); + int f = execlp("gdb","gdb","--pid",cmdpid,"--batch-silent","--command=/tmp/gdb1.gdb",(char*) 0); + ASSERT(f != -1); + } + sem_post(sem); + //wait for gdb to run + volatile int attached = 0; + while (attached == 0); + + std::unique_ptr alloc; + PoolId pool; + bool quit = false; + + typename AllocatorT::Config config; + config.setCacheSize(4 * Slab::kSize); + config.enableCachePersistence("/tmp"); + auto moveCb = 
[&](typename AllocatorT::Item& oldItem, + typename AllocatorT::Item& newItem, + typename AllocatorT::Item* /* parentPtr */) { + std::memcpy(newItem.getMemory(), oldItem.getMemory(), + oldItem.getSize()); + }; + + config.enableMovingOnSlabRelease(moveCb, {}, 10); + // Disable slab rebalancing + config.enablePoolRebalancing(nullptr, std::chrono::seconds{0}); + config.configureMemoryTiers({ + MemoryTierCacheConfig::fromShm() + .setRatio(1).setMemBind(std::string("0")), + MemoryTierCacheConfig::fromShm() + .setRatio(1).setMemBind(std::string("0")) + }); + + alloc = std::make_unique(AllocatorT::SharedMemNew, config); + ASSERT(alloc != nullptr); + pool = alloc->addPool("default", alloc->getCacheMemoryStats().ramCacheSize); + + int i = 0; + typename AllocatorT::Item* evicted; + std::unique_ptr t; + std::unique_ptr r; + while(!quit) { + auto handle = alloc->allocate(pool, std::to_string(++i), std::string("value").size()); + ASSERT(handle != nullptr); + if (i == 1) { + evicted = static_cast(handle.get()); + folly::Latch latch_t(1); + t = std::make_unique([&](){ + auto handleNew = alloc->allocate(pool, std::to_string(1), std::string("new value").size()); + ASSERT(handleNew != nullptr); + latch_t.count_down(); + //first breakpoint will be this one because + //thread 1 still has more items to fill up the + //cache before an evict is evicted + gdb_sync1(); + ASSERT(evicted->isMoving()); + //need to suspend thread 1 - who is doing the eviction + //gdb will do this for us + folly::Latch latch(1); + r = std::make_unique([&](){ + ASSERT(evicted->isMoving()); + latch.count_down(); + auto handleEvict = alloc->find(std::to_string(1)); + //does find block until done moving?? yes + while (evicted->isMarkedForEviction()); //move will fail + XDCHECK(handleEvict == nullptr) << handleEvict->toString(); + ASSERT(handleEvict == nullptr); + }); + latch.wait(); + gdb_sync2(); + alloc->insertOrReplace(handleNew); + ASSERT(!evicted->isAccessible()); //move failed + quit = true; + }); + latch_t.wait(); + } + ASSERT_NO_THROW(alloc->insertOrReplace(handle)); + } + t->join(); + r->join(); + gdb_sync3(); + } }; } // namespace tests } // namespace cachelib diff --git a/cachelib/allocator/tests/TestBase.h b/cachelib/allocator/tests/TestBase.h index 81750b9b00..e35bc54d01 100644 --- a/cachelib/allocator/tests/TestBase.h +++ b/cachelib/allocator/tests/TestBase.h @@ -69,6 +69,11 @@ class AllocatorTest : public SlabAllocatorTestBase { PoolId pid, const std::vector& sizes, unsigned int keyLen); + void fillUpPoolUntilEvictions(AllocatorT& alloc, + TierId tid, + PoolId pid, + const std::vector& sizes, + unsigned int keyLen); void fillUpOneSlab(AllocatorT& alloc, PoolId poolId, const uint32_t size, @@ -204,6 +209,30 @@ void AllocatorTest::fillUpPoolUntilEvictions( } while (allocs != 0); } +template +void AllocatorTest::fillUpPoolUntilEvictions( + AllocatorT& alloc, + TierId tid, + PoolId poolId, + const std::vector& sizes, + unsigned int keyLen) { + unsigned int allocs = 0; + do { + allocs = 0; + for (const auto size : sizes) { + const auto key = getRandomNewKey(alloc, keyLen); + ASSERT_EQ(alloc.find(key), nullptr); + const size_t prev = alloc.getPoolByTid(poolId, tid).getCurrentAllocSize(); + auto handle = util::allocateAccessible(alloc, poolId, key, size); + if (handle && prev != alloc.getPoolByTid(poolId, tid).getCurrentAllocSize()) { + // this means we did not cause an eviction. 
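+      // (the allocation grew this tier's allocated size rather than
+      // recycling evicted memory, so keep filling)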
+ ASSERT_GE(handle->getSize(), size); + allocs++; + } + } + } while (allocs != 0); +} + template void AllocatorTest::testAllocWithoutEviction( AllocatorT& alloc, diff --git a/cachelib/cachebench/cache/Cache.h b/cachelib/cachebench/cache/Cache.h index 2953142eed..c0896cd137 100644 --- a/cachelib/cachebench/cache/Cache.h +++ b/cachelib/cachebench/cache/Cache.h @@ -1114,18 +1114,24 @@ double Cache::getNvmBytesWritten() const { template Stats Cache::getStats() const { - PoolStats aggregate = cache_->getPoolStats(pools_[0]); - auto usageFraction = - 1.0 - (static_cast(aggregate.freeMemoryBytes())) / - aggregate.poolUsableSize; Stats ret; - ret.poolUsageFraction.push_back(usageFraction); - for (size_t pid = 1; pid < pools_.size(); pid++) { - auto poolStats = cache_->getPoolStats(static_cast(pid)); - usageFraction = 1.0 - (static_cast(poolStats.freeMemoryBytes())) / - poolStats.poolUsableSize; - ret.poolUsageFraction.push_back(usageFraction); - aggregate += poolStats; + for (TierId tid = 0; tid < cache_->getNumTiers(); tid++) { + PoolStats aggregate = cache_->getPoolStats(tid,pools_[0]); + auto usageFraction = + 1.0 - (static_cast(aggregate.freeMemoryBytes())) / + aggregate.poolUsableSize; + ret.poolUsageFraction[tid].push_back(usageFraction); + for (size_t pid = 1; pid < pools_.size(); pid++) { + auto poolStats = cache_->getPoolStats(tid, static_cast(pid)); + usageFraction = 1.0 - (static_cast(poolStats.freeMemoryBytes())) / + poolStats.poolUsableSize; + ret.poolUsageFraction[tid].push_back(usageFraction); + aggregate += poolStats; + } + ret.numEvictions.push_back(aggregate.numEvictions()); + ret.numWritebacks.push_back(aggregate.numWritebacks()); + ret.numCacheHits.push_back(aggregate.numHits()); + ret.numItems.push_back(aggregate.numItems()); } std::map>> allocationClassStats{}; @@ -1145,8 +1151,6 @@ Stats Cache::getStats() const { const auto navyStats = cache_->getNvmCacheStatsMap().toMap(); ret.allocationClassStats = allocationClassStats; - ret.numEvictions = aggregate.numEvictions(); - ret.numItems = aggregate.numItems(); ret.evictAttempts = cacheStats.evictionAttempts; ret.allocAttempts = cacheStats.allocAttempts; ret.allocFailures = cacheStats.allocFailures; @@ -1155,7 +1159,7 @@ Stats Cache::getStats() const { ret.backgndEvicStats.nTraversals = cacheStats.evictionStats.runCount; ret.backgndEvicStats.nClasses = cacheStats.evictionStats.totalClasses; ret.backgndEvicStats.evictionSize = cacheStats.evictionStats.totalBytesMoved; - + ret.backgndPromoStats.nPromotedItems = cacheStats.promotionStats.numMovedItems; ret.backgndPromoStats.nTraversals = cacheStats.promotionStats.runCount; diff --git a/cachelib/cachebench/cache/CacheStats.h b/cachelib/cachebench/cache/CacheStats.h index e848b71e44..7d5e05522b 100644 --- a/cachelib/cachebench/cache/CacheStats.h +++ b/cachelib/cachebench/cache/CacheStats.h @@ -52,15 +52,18 @@ struct BackgroundPromotionStats { struct Stats { BackgroundEvictionStats backgndEvicStats; BackgroundPromotionStats backgndPromoStats; + ReaperStats reaperStats; - uint64_t numEvictions{0}; - uint64_t numItems{0}; + std::vector numEvictions; + std::vector numWritebacks; + std::vector numCacheHits; + std::vector numItems; - uint64_t evictAttempts{0}; - uint64_t allocAttempts{0}; - uint64_t allocFailures{0}; + std::vector evictAttempts{0}; + std::vector allocAttempts{0}; + std::vector allocFailures{0}; - std::vector poolUsageFraction; + std::map> poolUsageFraction; uint64_t numCacheGets{0}; uint64_t numCacheGetMiss{0}; @@ -143,33 +146,51 @@ struct Stats { void render(std::ostream& 
out) const { auto totalMisses = getTotalMisses(); const double overallHitRatio = invertPctFn(totalMisses, numCacheGets); - out << folly::sformat("Items in RAM : {:,}", numItems) << std::endl; - out << folly::sformat("Items in NVM : {:,}", numNvmItems) << std::endl; - - out << folly::sformat("Alloc Attempts: {:,} Success: {:.2f}%", - allocAttempts, - invertPctFn(allocFailures, allocAttempts)) - << std::endl; - out << folly::sformat("Evict Attempts: {:,} Success: {:.2f}%", - evictAttempts, - pctFn(numEvictions, evictAttempts)) - << std::endl; - out << folly::sformat("RAM Evictions : {:,}", numEvictions) << std::endl; - - auto foreachAC = [](const auto& map, auto cb) { + const auto nTiers = numItems.size(); + for (TierId tid = 0; tid < nTiers; tid++) { + out << folly::sformat("Items in Tier {} : {:,}", tid, numItems[tid]) << std::endl; + } + out << folly::sformat("Items in NVM : {:,}", numNvmItems) << std::endl; + for (TierId tid = 0; tid < nTiers; tid++) { + out << folly::sformat("Tier {} Alloc Attempts: {:,} Success: {:.2f}%", + tid, + allocAttempts[tid], + invertPctFn(allocFailures[tid], allocAttempts[tid])) + << std::endl; + } + for (TierId tid = 0; tid < nTiers; tid++) { + out << folly::sformat( + "Tier {} Evict Attempts: {:,} Success: {:.2f}%", + tid, + evictAttempts[tid], + pctFn(numEvictions[tid], evictAttempts[tid])) + << std::endl; + } + for (TierId tid = 0; tid < nTiers; tid++) { + out << folly::sformat("Tier {} Evictions : {:,} Writebacks: {:,} Success: {:.2f}%", + tid, numEvictions[tid], numWritebacks[tid], + invertPctFn(numEvictions[tid] - numWritebacks[tid], numEvictions[tid])) << std::endl; + } + auto foreachAC = [&](auto &map, auto cb) { for (auto &tidStat : map) { - for (auto& pidStat : tidStat.second) { - for (auto& cidStat : pidStat.second) { + for (auto &pidStat : tidStat.second) { + for (auto &cidStat : pidStat.second) { cb(tidStat.first, pidStat.first, cidStat.first, cidStat.second); } } } }; - for (auto pid = 0U; pid < poolUsageFraction.size(); pid++) { - out << folly::sformat("Fraction of pool {:,} used : {:.2f}", pid, - poolUsageFraction[pid]) - << std::endl; + for (auto entry : poolUsageFraction) { + auto tid = entry.first; + auto usageFraction = entry.second; + for (auto pid = 0U; pid < usageFraction.size(); pid++) { + out << folly::sformat("Tier {} fraction of pool {:,} used : {:.2f}", + tid, + pid, + usageFraction[pid]) + << std::endl; + } } if (FLAGS_report_ac_memory_usage_stats != "") { @@ -211,8 +232,8 @@ struct Stats { // If the pool is not full, extrapolate usageFraction for AC assuming it // will grow at the same rate. This value will be the same for all ACs. - auto acUsageFraction = (poolUsageFraction[pid] < 1.0) - ? poolUsageFraction[pid] + const auto acUsageFraction = (poolUsageFraction.at(tid)[pid] < 1.0) + ? 
poolUsageFraction.at(tid)[pid] : stats.usageFraction(); out << folly::sformat( @@ -230,7 +251,11 @@ struct Stats { out << folly::sformat("Cache Gets : {:,}", numCacheGets) << std::endl; out << folly::sformat("Hit Ratio : {:6.2f}%", overallHitRatio) << std::endl; - + for (TierId tid = 0; tid < numCacheHits.size(); tid++) { + double tierHitRatio = pctFn(numCacheHits[tid],numCacheGets); + out << folly::sformat("Tier {} Hit Ratio : {:6.2f}%", tid, tierHitRatio) + << std::endl; + } if (FLAGS_report_api_latency) { auto printLatencies = [&out](folly::StringPiece cat, @@ -290,6 +315,14 @@ struct Stats { << std::endl; } + if (reaperStats.numReapedItems > 0) { + + out << folly::sformat("Reaper reaped: {:,} visited: {:,} traversals: {:,} avg traversal time: {:,}", + reaperStats.numReapedItems,reaperStats.numVisitedItems, + reaperStats.numTraversals,reaperStats.avgTraversalTimeMs) + << std::endl; + } + if (numNvmGets > 0 || numNvmDeletes > 0 || numNvmPuts > 0) { const double ramHitRatio = invertPctFn(numCacheGetMiss, numCacheGets); const double nvmHitRatio = invertPctFn(numNvmGetMiss, numNvmGets); @@ -425,8 +458,8 @@ struct Stats { } if (numCacheEvictions > 0) { - out << folly::sformat("Total eviction executed {}", numCacheEvictions) - << std::endl; + out << folly::sformat("Total evictions executed {:,}", numCacheEvictions) + << std::endl; } } @@ -484,7 +517,8 @@ struct Stats { }; auto totalMisses = getTotalMisses(); - counters["num_items"] = numItems; + //TODO: per tier + counters["num_items"] = std::accumulate(numItems.begin(),numItems.end(),0); counters["num_nvm_items"] = numNvmItems; counters["hit_rate"] = calcInvertPctFn(totalMisses, numCacheGets); diff --git a/cachelib/cachebench/util/CacheConfig.h b/cachelib/cachebench/util/CacheConfig.h index 0a1569615d..028a18c596 100644 --- a/cachelib/cachebench/util/CacheConfig.h +++ b/cachelib/cachebench/util/CacheConfig.h @@ -92,7 +92,7 @@ struct CacheConfig : public JSONConfig { bool lruUpdateOnWrite{false}; bool lruUpdateOnRead{true}; bool tryLockUpdate{false}; - bool useCombinedLockForIterators{false}; + bool useCombinedLockForIterators{true}; // LRU param uint64_t lruIpSpec{0}; From d4cf1d4a460fa2c148e37586964c34f479f5b2ad Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Thu, 30 Dec 2021 17:18:29 -0500 Subject: [PATCH 12/40] basic multi-tier test based on numa bindings --- .../allocator/tests/AllocatorTypeTest.cpp | 1 + cachelib/allocator/tests/BaseAllocatorTest.h | 80 +++++++++++++++++++ 2 files changed, 81 insertions(+) diff --git a/cachelib/allocator/tests/AllocatorTypeTest.cpp b/cachelib/allocator/tests/AllocatorTypeTest.cpp index 97ff04efea..28c145b39d 100644 --- a/cachelib/allocator/tests/AllocatorTypeTest.cpp +++ b/cachelib/allocator/tests/AllocatorTypeTest.cpp @@ -410,6 +410,7 @@ TYPED_TEST(BaseAllocatorTest, RateMap) { this->testRateMap(); } TYPED_TEST(BaseAllocatorTest, StatSnapshotTest) { this->testStatSnapshotTest(); } +TYPED_TEST(BaseAllocatorTest, BasicMultiTier) {this->testBasicMultiTier(); } namespace { // the tests that cannot be done by TYPED_TEST. 
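As a reference for how such a test drives the allocator, the two-tier,
NUMA-bound setup used below can be sketched as follows (a minimal sketch
only; the pool name, cache size, and persistence path are illustrative):

    using namespace facebook::cachelib;

    LruAllocator::Config config;
    config.setCacheSize(100 * 1024 * 1024); // 100 MB, split across tiers
    // Multi-tier requires shared-memory tiers and a persistence directory.
    config.enableCachePersistence(folly::sformat("/tmp/multi-tier-test/{}", ::getpid()));
    config.configureMemoryTiers({
        MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind(std::string("0")),
        MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind(std::string("0")),
    });

    LruAllocator alloc(LruAllocator::SharedMemNew, config);
    auto pid = alloc.addPool("default", alloc.getCacheMemoryStats().ramCacheSize);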
diff --git a/cachelib/allocator/tests/BaseAllocatorTest.h b/cachelib/allocator/tests/BaseAllocatorTest.h index 22c80e6734..ac3d7bbccd 100644 --- a/cachelib/allocator/tests/BaseAllocatorTest.h +++ b/cachelib/allocator/tests/BaseAllocatorTest.h @@ -6304,6 +6304,86 @@ class BaseAllocatorTest : public AllocatorTest { }); EXPECT_EQ(intervalNameExists, 4); } + + void testSingleTierMemoryAllocatorSize() { + typename AllocatorT::Config config; + static constexpr size_t cacheSize = 100 * 1024 * 1024; /* 100 MB */ + config.setCacheSize(cacheSize); + config.enableCachePersistence(folly::sformat("/tmp/single-tier-test/{}", ::getpid())); + + AllocatorT alloc(AllocatorT::SharedMemNew, config); + + EXPECT_LE(alloc.allocator_[0]->getMemorySize(), cacheSize); + } + + void testSingleTierMemoryAllocatorSizeAnonymous() { + typename AllocatorT::Config config; + static constexpr size_t cacheSize = 100 * 1024 * 1024; /* 100 MB */ + config.setCacheSize(cacheSize); + + AllocatorT alloc(config); + + EXPECT_LE(alloc.allocator_[0]->getMemorySize(), cacheSize); + } + + void testBasicMultiTier() { + using Item = typename AllocatorT::Item; + const static std::string data = "data"; + + std::set movedKeys; + auto moveCb = [&](const Item& oldItem, Item& newItem, Item* /* parentPtr */) { + std::memcpy(newItem.getMemory(), oldItem.getMemory(), oldItem.getSize()); + movedKeys.insert(oldItem.getKey().str()); + }; + + typename AllocatorT::Config config; + static constexpr size_t cacheSize = 100 * 1024 * 1024; /* 100 MB */ + config.setCacheSize(100 * 1024 * 1024); /* 100 MB */ + config.enableCachePersistence(folly::sformat("/tmp/multi-tier-test/{}", ::getpid())); + config.configureMemoryTiers({ + MemoryTierCacheConfig::fromShm().setRatio(1) + .setMemBind(std::string("0")), + MemoryTierCacheConfig::fromShm().setRatio(1) + .setMemBind(std::string("0")), + }); + config.enableMovingOnSlabRelease(moveCb); + + AllocatorT alloc(AllocatorT::SharedMemNew, config); + + EXPECT_EQ(alloc.allocator_.size(), 2); + EXPECT_LE(alloc.allocator_[0]->getMemorySize(), cacheSize / 2); + EXPECT_LE(alloc.allocator_[1]->getMemorySize(), cacheSize / 2); + + const size_t numBytes = alloc.getCacheMemoryStats().ramCacheSize; + auto pid = alloc.addPool("default", numBytes); + + static constexpr size_t numOps = cacheSize / 1024; + for (int i = 0; i < numOps; i++) { + std::string key = std::to_string(i); + auto h = alloc.allocate(pid, key, 1024); + EXPECT_TRUE(h); + + std::memcpy(h->getMemory(), data.data(), data.size()); + + alloc.insertOrReplace(h); + } + + EXPECT_TRUE(movedKeys.size() > 0); + + size_t movedButStillInMemory = 0; + for (const auto &k : movedKeys) { + auto h = alloc.find(k); + + if (h) { + movedButStillInMemory++; + /* All moved elements should be in the second tier. 
*/ + EXPECT_TRUE(alloc.allocator_[1]->isMemoryInAllocator(h->getMemory())); + EXPECT_EQ(data, std::string((char*)h->getMemory(), data.size())); + } + } + + EXPECT_TRUE(movedButStillInMemory > 0); + } }; } // namespace tests } // namespace cachelib From 6d2fbeffd69aedd8dc8a9afeea0c539db8d02c36 Mon Sep 17 00:00:00 2001 From: Sergei Vinogradov Date: Thu, 27 Jan 2022 05:27:20 -0800 Subject: [PATCH 13/40] Aadding new configs to hit_ratio/graph_cache_leader_fobj -updated configs for numa bindings --- .../config-4GB-DRAM-4GB-PMEM.json | 42 +++++++++++++++++++ .../config-8GB-DRAM.json | 32 ++++++++++++++ .../config-8GB-PMEM.json | 38 +++++++++++++++++ .../test_configs/simple_tiers_test.json | 12 ++++-- 4 files changed, 120 insertions(+), 4 deletions(-) create mode 100644 cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-4GB-DRAM-4GB-PMEM.json create mode 100644 cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-8GB-DRAM.json create mode 100644 cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-8GB-PMEM.json diff --git a/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-4GB-DRAM-4GB-PMEM.json b/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-4GB-DRAM-4GB-PMEM.json new file mode 100644 index 0000000000..d9acdf7c6c --- /dev/null +++ b/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-4GB-DRAM-4GB-PMEM.json @@ -0,0 +1,42 @@ +{ + "cache_config": { + "cacheSizeMB": 8192, + "poolRebalanceIntervalSec": 0, + "cacheDir": "/tmp/mem-tiers", + "memoryTiers" : [ + { + "ratio": 1, + "memBindNodes": 0 + }, + { + "ratio": 1, + "memBindNodes": 0 + } + ] + }, + "test_config": + { + "addChainedRatio": 0.0, + "delRatio": 0.0, + "enableLookaside": true, + "getRatio": 0.7684563460126871, + "keySizeRange": [ + 1, + 8, + 64 + ], + "keySizeRangeProbability": [ + 0.3, + 0.7 + ], + "loneGetRatio": 0.2315436539873129, + "numKeys": 71605574, + "numOps": 5000000, + "numThreads": 24, + "popDistFile": "pop.json", + + "setRatio": 0.0, + "valSizeDistFile": "sizes.json" + } + +} diff --git a/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-8GB-DRAM.json b/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-8GB-DRAM.json new file mode 100644 index 0000000000..6d47e08b74 --- /dev/null +++ b/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-8GB-DRAM.json @@ -0,0 +1,32 @@ +{ + "cache_config": { + "cacheSizeMB": 8192, + "poolRebalanceIntervalSec": 0, + "cacheDir": "/tmp/mem-tier" + }, + "test_config": + { + "addChainedRatio": 0.0, + "delRatio": 0.0, + "enableLookaside": true, + "getRatio": 0.7684563460126871, + "keySizeRange": [ + 1, + 8, + 64 + ], + "keySizeRangeProbability": [ + 0.3, + 0.7 + ], + "loneGetRatio": 0.2315436539873129, + "numKeys": 71605574, + "numOps": 5000000, + "numThreads": 24, + "popDistFile": "pop.json", + + "setRatio": 0.0, + "valSizeDistFile": "sizes.json" + } + +} diff --git a/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-8GB-PMEM.json b/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-8GB-PMEM.json new file mode 100644 index 0000000000..4feab55154 --- /dev/null +++ b/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-8GB-PMEM.json @@ -0,0 +1,38 @@ +{ + "cache_config": { + "cacheSizeMB": 8192, + "poolRebalanceIntervalSec": 0, + "cacheDir": "/tmp/mem-tier", + "memoryTiers" : [ + { + "ratio": 1, + "memBindNodes": 0 + } 
+ ] + }, + "test_config": + { + "addChainedRatio": 0.0, + "delRatio": 0.0, + "enableLookaside": true, + "getRatio": 0.7684563460126871, + "keySizeRange": [ + 1, + 8, + 64 + ], + "keySizeRangeProbability": [ + 0.3, + 0.7 + ], + "loneGetRatio": 0.2315436539873129, + "numKeys": 71605574, + "numOps": 5000000, + "numThreads": 24, + "popDistFile": "pop.json", + + "setRatio": 0.0, + "valSizeDistFile": "sizes.json" + } + +} diff --git a/cachelib/cachebench/test_configs/simple_tiers_test.json b/cachelib/cachebench/test_configs/simple_tiers_test.json index 182bb514cb..58302b9f20 100644 --- a/cachelib/cachebench/test_configs/simple_tiers_test.json +++ b/cachelib/cachebench/test_configs/simple_tiers_test.json @@ -1,14 +1,18 @@ // @nolint instantiates a small cache and runs a quick run of basic operations. { "cache_config" : { - "cacheSizeMB" : 512, - "usePosixShm" : false, + "cacheSizeMB" : 1024, "cacheDir" : "/tmp/mem-tiers", "memoryTiers" : [ + { + "ratio": 1, + "memBindNodes": "0" + }, { "ratio": 1, "memBindNodes": "0" } + ], "poolRebalanceIntervalSec" : 1, "moveOnSlabRelease" : false, @@ -19,7 +23,7 @@ "test_config" : { "numOps" : 100000, "numThreads" : 32, - "numKeys" : 1000000, + "numKeys" : 2000000, "keySizeRange" : [1, 8, 64], "keySizeRangeProbability" : [0.3, 0.7], @@ -33,4 +37,4 @@ "keyPoolDistribution": [0.4, 0.6], "opPoolDistribution" : [0.5, 0.5] } - } \ No newline at end of file + } From 5bfa1ff515e5faf2fea8688fbc281514c3342b66 Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Fri, 21 Oct 2022 12:27:47 -0400 Subject: [PATCH 14/40] Background data movement for the tiers Part 1. -------------------------------------- This adds the following: 1. tryPromoteToNextTier. This could go with multi-tier part 2 2. Promotion iterators. This could go with MM2Q promotion iterators patch. It also enables background workers in the cache config. Future changes to the background workers can be merged with this patch. Background evictors multi-tier Part 2. -------------------------------- This should be rolled into background evictors part 1. improved bg stats structure and cachebench output adds the following: - approx usage stat - evictions / attempts per class Background evictors multi-tier Part 3. 
-------------------------------- use approximate usage fraction --- MultiTierDataMovement.md | 90 +++++ cachelib/allocator/BackgroundMover.h | 115 ++++-- cachelib/allocator/BackgroundMoverStrategy.h | 37 +- cachelib/allocator/Cache.h | 16 + cachelib/allocator/CacheAllocator.h | 328 ++++++++++++++---- cachelib/allocator/CacheAllocatorConfig.h | 18 + cachelib/allocator/CacheStats.h | 43 ++- cachelib/allocator/FreeThresholdStrategy.cpp | 44 ++- cachelib/allocator/MMLru.h | 17 + cachelib/allocator/MMTinyLFU.h | 12 + cachelib/allocator/PromotionStrategy.h | 38 +- cachelib/allocator/memory/AllocationClass.cpp | 24 ++ cachelib/allocator/memory/AllocationClass.h | 7 + .../allocator/memory/MemoryAllocatorStats.h | 12 + cachelib/allocator/memory/MemoryPool.cpp | 19 + cachelib/allocator/memory/MemoryPool.h | 8 + .../tests/AllocatorMemoryTiersTest.cpp | 1 + .../tests/AllocatorMemoryTiersTest.h | 77 ++++ cachelib/cachebench/cache/Cache.h | 36 +- cachelib/cachebench/cache/CacheStats.h | 283 +++++++-------- cachelib/cachebench/util/CacheConfig.cpp | 44 ++- cachelib/cachebench/util/CacheConfig.h | 27 ++ 22 files changed, 1028 insertions(+), 268 deletions(-) create mode 100644 MultiTierDataMovement.md diff --git a/MultiTierDataMovement.md b/MultiTierDataMovement.md new file mode 100644 index 0000000000..cccc14b947 --- /dev/null +++ b/MultiTierDataMovement.md @@ -0,0 +1,90 @@ +# Background Data Movement + +In order to reduce the number of online evictions and support asynchronous +promotion - we have added two periodic workers to handle eviction and promotion. + +The diagram below shows a simplified version of how the background evictor +thread (green) is integrated to the CacheLib architecture. + +
+[figure: BackgroundEvictor thread integration into the CacheLib architecture]
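+
+At a high level, each wake-up of a background worker follows the loop below
+(a simplified sketch of the control flow in `BackgroundMover`, not the exact
+implementation; stats recording and shutdown checks are omitted):
+
+```cpp
+// Ask the strategy for a batch size per assigned (tier, pool, class);
+// a batch of 0 means that class already has enough free space.
+auto batches = strategy_->calculateBatchSizes(cache_, assignedMemory_);
+for (size_t i = 0; i < batches.size(); i++) {
+  if (batches[i] == 0) {
+    continue;
+  }
+  const auto [tid, pid, cid] = assignedMemory_[i];
+  // Evict (or promote) up to batches[i] items from this class.
+  moves += moverFunc(cache_, tid, pid, cid, batches[i]);
+}
+```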
+
+## Background Evictors
+
+The background evictors scan each class to see if there are objects to move to the
+next (lower) tier using a given strategy. Here we document the parameters for the
+different strategies and general parameters.
+
+- `backgroundEvictorIntervalMilSec`: The interval that this thread runs for - by default
+the background evictor threads will wake up every 10 ms to scan the AllocationClasses. Also,
+the background evictor thread will be woken up every time there is a failed allocation (from
+a request handling thread) and the current percentage of free memory for the
+AllocationClass is lower than `lowEvictionAcWatermark`. This may render the interval parameter
+less important when there are many allocations occurring from request handling threads.
+
+- `evictorThreads`: The number of background evictors to run - each thread is assigned
+a set of AllocationClasses to scan and evict objects from. Currently, each thread gets
+an equal number of classes to scan - but as object size distribution may be unequal - future
+versions will attempt to balance the classes among threads. The range is 1 to the number
+of AllocationClasses. The default is 1.
+
+- `maxEvictionBatch`: The number of objects to remove in a given eviction call. The
+default is 40. The lower range is 10 and the upper range is 1000. Too low and we might not
+remove objects at a reasonable rate; too high and it might increase contention with user threads.
+
+- `minEvictionBatch`: Minimum number of items to evict at any time (if there are any
+candidates).
+
+- `maxEvictionPromotionHotness`: Maximum number of candidates to consider for eviction. This is
+similar to `maxEvictionBatch`, but it specifies how many candidates will be taken into
+consideration, not the actual number of items to evict. This option can be used to configure
+the duration of the critical section on the LRU lock.
+
+### FreeThresholdStrategy (default)
+
+- `lowEvictionAcWatermark`: Triggers the background eviction thread to run
+when this percentage of the AllocationClass is free.
+The default is `2.0`; to avoid wasting capacity we don't set this above `10.0`.
+
+- `highEvictionAcWatermark`: Stop evicting from an AllocationClass when this
+percentage of the AllocationClass is free. The default is `5.0`; to avoid wasting capacity we
+don't set this above `10`.
+
+## Background Promoters
+
+The background promoters scan each class to see if there are objects to move up to the
+faster tier using a given strategy. Here we document the parameters for the
+different strategies and general parameters.
+
+- `backgroundPromoterIntervalMilSec`: The interval that this thread runs for - by default
+the background promoter threads will wake up every 10 ms to scan the AllocationClasses for
+objects to promote.
+
+- `promoterThreads`: The number of background promoters to run - each thread is assigned
+a set of AllocationClasses to scan and promote objects from. Currently, each thread gets
+an equal number of classes to scan - but as object size distribution may be unequal - future
+versions will attempt to balance the classes among threads. The range is `1` to the number
+of AllocationClasses. The default is `1`.
+
+- `maxPromotionBatch`: The number of objects to promote in a given promotion call. The
+default is 40. The lower range is 10 and the upper range is 1000. Too low and we might not
+promote objects at a reasonable rate; too high and it might increase contention with user threads.
+ +- `minPromotionBatch`: Minimum number of items to promote at any time (if there are any +candidates) + +- `numDuplicateElements`: This allows us to promote items that have existing handles (read-only) since +we won't need to modify the data when a user is done with the data. Therefore, for a short time +the data could reside in both tiers until it is evicted from its current tier. The default is to +not allow this (0). Setting the value to 100 will enable duplicate elements in tiers. + +### Background Promotion Strategy (only one currently) + +- `promotionAcWatermark`: Promote items if there is at least this +percent of free AllocationClasses. Promotion thread will attempt to move `maxPromotionBatch` number of objects +to that tier. The objects are chosen from the head of the LRU. The default is `4.0`. +This value should correlate with `lowEvictionAcWatermark`, `highEvictionAcWatermark`, `minAcAllocationWatermark`, `maxAcAllocationWatermark`. +- `maxPromotionBatch`: The number of objects to promote in batch during BG promotion. Analogous to +`maxEvictionBatch`. It's value should be lower to decrease contention on hot items. + diff --git a/cachelib/allocator/BackgroundMover.h b/cachelib/allocator/BackgroundMover.h index e7bba4095a..e8c1242283 100644 --- a/cachelib/allocator/BackgroundMover.h +++ b/cachelib/allocator/BackgroundMover.h @@ -18,7 +18,6 @@ #include "cachelib/allocator/BackgroundMoverStrategy.h" #include "cachelib/allocator/CacheStats.h" -#include "cachelib/common/AtomicCounter.h" #include "cachelib/common/PeriodicWorker.h" namespace facebook::cachelib { @@ -51,6 +50,7 @@ enum class MoverDir { Evict = 0, Promote }; template class BackgroundMover : public PeriodicWorker { public: + using ClassBgStatsType = std::map; using Cache = CacheT; // @param cache the cache interface // @param strategy the stragey class that defines how objects are @@ -62,8 +62,9 @@ class BackgroundMover : public PeriodicWorker { ~BackgroundMover() override; BackgroundMoverStats getStats() const noexcept; - std::map>> - getClassStats() const noexcept; + ClassBgStatsType getClassStats() const noexcept { + return movesPerClass_; + } void setAssignedMemory(std::vector&& assignedMemory); @@ -72,8 +73,27 @@ class BackgroundMover : public PeriodicWorker { static size_t workerId(TierId tid, PoolId pid, ClassId cid, size_t numWorkers); private: - std::map>> - movesPerClass_; + ClassBgStatsType movesPerClass_; + + struct TraversalStats { + // record a traversal and its time taken + void recordTraversalTime(uint64_t nsTaken); + + uint64_t getAvgTraversalTimeNs(uint64_t numTraversals) const; + uint64_t getMinTraversalTimeNs() const { return minTraversalTimeNs_; } + uint64_t getMaxTraversalTimeNs() const { return maxTraversalTimeNs_; } + uint64_t getLastTraversalTimeNs() const { return lastTraversalTimeNs_; } + + private: + // time it took us the last time to traverse the cache. 
+ uint64_t lastTraversalTimeNs_{0}; + uint64_t minTraversalTimeNs_{ + std::numeric_limits::max()}; + uint64_t maxTraversalTimeNs_{0}; + uint64_t totalTraversalTimeNs_{0}; + }; + + TraversalStats traversalStats_; // cache allocator's interface for evicting using Item = typename Cache::Item; @@ -89,9 +109,10 @@ class BackgroundMover : public PeriodicWorker { void work() override final; void checkAndRun(); - AtomicCounter numMovedItems_{0}; - AtomicCounter numTraversals_{0}; - AtomicCounter totalBytesMoved_{0}; + uint64_t numMovedItems{0}; + uint64_t numTraversals{0}; + uint64_t totalClasses{0}; + uint64_t totalBytesMoved{0}; std::vector assignedMemory_; folly::DistributedMutex mutex_; @@ -111,6 +132,20 @@ BackgroundMover::BackgroundMover( } } +template +void BackgroundMover::TraversalStats::recordTraversalTime(uint64_t nsTaken) { + lastTraversalTimeNs_ = nsTaken; + minTraversalTimeNs_ = std::min(minTraversalTimeNs_, nsTaken); + maxTraversalTimeNs_ = std::max(maxTraversalTimeNs_, nsTaken); + totalTraversalTimeNs_ += nsTaken; +} + +template +uint64_t BackgroundMover::TraversalStats::getAvgTraversalTimeNs( + uint64_t numTraversals) const { + return numTraversals ? totalTraversalTimeNs_ / numTraversals : 0; +} + template BackgroundMover::~BackgroundMover() { stop(std::chrono::seconds(0)); @@ -144,44 +179,56 @@ template void BackgroundMover::checkAndRun() { auto assignedMemory = mutex_.lock_combine([this] { return assignedMemory_; }); - unsigned int moves = 0; - auto batches = strategy_->calculateBatchSizes(cache_, assignedMemory); - - for (size_t i = 0; i < batches.size(); i++) { - const auto [tid, pid, cid] = assignedMemory[i]; - const auto batch = batches[i]; + while (true) { + unsigned int moves = 0; + std::set classes{}; + auto batches = strategy_->calculateBatchSizes(cache_, assignedMemory); + + const auto begin = util::getCurrentTimeNs(); + for (size_t i = 0; i < batches.size(); i++) { + const auto [tid, pid, cid] = assignedMemory[i]; + const auto batch = batches[i]; + if (!batch) { + continue; + } + + // try moving BATCH items from the class in order to reach free target + auto moved = moverFunc(cache_, tid, pid, cid, batch); + moves += moved; + movesPerClass_[assignedMemory[i]] += moved; + } + auto end = util::getCurrentTimeNs(); + if (moves > 0) { + traversalStats_.recordTraversalTime(end > begin ? 
end - begin : 0); + numMovedItems += moves; + numTraversals++; + } - if (batch == 0) { - continue; + //we didn't move any objects done with this run + if (moves == 0 || shouldStopWork()) { + break; } - const auto& mpStats = cache_.getPoolByTid(pid, tid).getStats(); - // try moving BATCH items from the class in order to reach free target - auto moved = moverFunc(cache_, tid, pid, cid, batch); - moves += moved; - movesPerClass_[tid][pid][cid] += moved; - totalBytesMoved_.add(moved * mpStats.acStats.at(cid).allocSize ); } - - numTraversals_.inc(); - numMovedItems_.add(moves); } template BackgroundMoverStats BackgroundMover::getStats() const noexcept { BackgroundMoverStats stats; - stats.numMovedItems = numMovedItems_.get(); - stats.runCount = numTraversals_.get(); - stats.totalBytesMoved = totalBytesMoved_.get(); + stats.numMovedItems = numMovedItems; + stats.totalBytesMoved = totalBytesMoved; + stats.totalClasses = totalClasses; + auto runCount = getRunCount(); + stats.runCount = runCount; + stats.numTraversals = numTraversals; + stats.avgItemsMoved = (double) stats.numMovedItems / (double)runCount; + stats.lastTraversalTimeNs = traversalStats_.getLastTraversalTimeNs(); + stats.avgTraversalTimeNs = traversalStats_.getAvgTraversalTimeNs(numTraversals); + stats.minTraversalTimeNs = traversalStats_.getMinTraversalTimeNs(); + stats.maxTraversalTimeNs = traversalStats_.getMaxTraversalTimeNs(); return stats; } -template -std::map>> -BackgroundMover::getClassStats() const noexcept { - return movesPerClass_; -} - template size_t BackgroundMover::workerId(TierId tid, PoolId pid, diff --git a/cachelib/allocator/BackgroundMoverStrategy.h b/cachelib/allocator/BackgroundMoverStrategy.h index 14bde15908..2f187636c6 100644 --- a/cachelib/allocator/BackgroundMoverStrategy.h +++ b/cachelib/allocator/BackgroundMoverStrategy.h @@ -21,14 +21,6 @@ namespace facebook { namespace cachelib { -struct MemoryDescriptorType { - MemoryDescriptorType(TierId tid, PoolId pid, ClassId cid) : - tid_(tid), pid_(pid), cid_(cid) {} - TierId tid_; - PoolId pid_; - ClassId cid_; -}; - // Base class for background eviction strategy. class BackgroundMoverStrategy { public: @@ -46,5 +38,34 @@ class BackgroundMoverStrategy { virtual ~BackgroundMoverStrategy() = default; }; +class DefaultBackgroundMoverStrategy : public BackgroundMoverStrategy { + public: + DefaultBackgroundMoverStrategy(uint64_t batchSize, double targetFree) + : batchSize_(batchSize), targetFree_((double)targetFree/100.0) {} + ~DefaultBackgroundMoverStrategy() {} + + std::vector calculateBatchSizes( + const CacheBase& cache, + std::vector acVec) { + std::vector batches{}; + for (auto [tid, pid, cid] : acVec) { + double usage = cache.getPoolByTid(pid, tid).getApproxUsage(cid); + uint32_t perSlab = cache.getPoolByTid(pid, tid).getPerSlab(cid); + if (usage >= (1.0-targetFree_)) { + uint32_t batch = batchSize_ > perSlab ? 
perSlab : batchSize_; + batches.push_back(batch); + } else { + //no work to be done since there is already + //at least targetFree remaining in the class + batches.push_back(0); + } + } + return batches; + } + private: + uint64_t batchSize_{100}; + double targetFree_{0.05}; +}; + } // namespace cachelib } // namespace facebook diff --git a/cachelib/allocator/Cache.h b/cachelib/allocator/Cache.h index 515da3ac47..6f7ae20bc5 100644 --- a/cachelib/allocator/Cache.h +++ b/cachelib/allocator/Cache.h @@ -73,6 +73,22 @@ enum class DestructorContext { kRemovedFromNVM }; +struct MemoryDescriptorType { + MemoryDescriptorType(TierId tid, PoolId pid, ClassId cid) : + tid_(tid), pid_(pid), cid_(cid) {} + TierId tid_; + PoolId pid_; + ClassId cid_; + + bool operator<(const MemoryDescriptorType& rhs) const { + return std::make_tuple(tid_, pid_, cid_) < std::make_tuple(rhs.tid_, rhs.pid_, rhs.cid_); + } + + bool operator==(const MemoryDescriptorType& rhs) const { + return std::make_tuple(tid_, pid_, cid_) == std::make_tuple(rhs.tid_, rhs.pid_, rhs.cid_); + } +}; + // A base class of cache exposing members and status agnostic of template type. class CacheBase { public: diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index 29cb159b54..ddf482e875 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -221,7 +221,7 @@ class CacheAllocator : public CacheBase { using PoolIds = std::set; using EventTracker = EventInterface; - + using ClassBgStatsType = std::map; // SampleItem is a wrapper for the CacheItem which is provided as the sample // for uploading to Scuba (see ItemStatsExporter). It is guaranteed that the // CacheItem is accessible as long as the SampleItem is around since the @@ -714,7 +714,7 @@ class CacheAllocator : public CacheBase { auto createBgWorkerMemoryAssignments(size_t numWorkers, TierId tid); // whether bg worker should be woken - bool shouldWakeupBgEvictor(PoolId pid, ClassId cid); + bool shouldWakeupBgEvictor(TierId tid, PoolId pid, ClassId cid); // Get a random item from memory // This is useful for profiling and sampling cachelib managed memory @@ -1184,6 +1184,43 @@ class CacheAllocator : public CacheBase { return stats; } + // returns the background mover stats per thread + std::vector getBackgroundMoverStats(MoverDir direction) const { + auto stats = std::vector(); + if (direction == MoverDir::Evict) { + for (auto& bg : backgroundEvictor_) + stats.push_back(bg->getStats()); + } else if (direction == MoverDir::Promote) { + for (auto& bg : backgroundPromoter_) + stats.push_back(bg->getStats()); + } + return stats; + } + + ClassBgStatsType + getBackgroundMoverClassStats(MoverDir direction) const { + ClassBgStatsType stats; + auto record = [&](auto &bg) { + //gives a unique descriptor + auto classStats = bg->getClassStats(); + for (const auto& [key,value] : classStats) { + stats[key] = value; + } + }; + + if (direction == MoverDir::Evict) { + for (auto& bg : backgroundEvictor_) { + record(bg); + } + } else if (direction == MoverDir::Promote) { + for (auto& bg : backgroundPromoter_) { + record(bg); + } + } + + return stats; + } + // returns the pool rebalancer stats RebalancerStats getRebalancerStats() const { auto stats = @@ -1793,6 +1830,26 @@ class CacheAllocator : public CacheBase { // handle to the item. On failure an empty handle. 
WriteHandle tryEvictToNextMemoryTier(Item& item, bool fromBgThread); + // Try to move the item up to the next memory tier + // + // @param tid current tier ID of the item + // @param pid the pool ID the item belong to. + // @param item the item to promote + // @param fromBgThread whether this is called from BG thread + // + // @return valid handle to the item. This will be the last + // handle to the item. On failure an empty handle. + WriteHandle tryPromoteToNextMemoryTier(TierId tid, PoolId pid, Item& item, bool fromBgThread); + + // Try to move the item up to the next memory tier + // + // @param item the item to promote + // @param fromBgThread whether this is called from BG thread + // + // @return valid handle to the item. This will be the last + // handle to the item. On failure an empty handle. + WriteHandle tryPromoteToNextMemoryTier(Item& item, bool fromBgThread); + // Wakes up waiters if there are any // // @param item wakes waiters that are waiting on that item @@ -1926,20 +1983,165 @@ class CacheAllocator : public CacheBase { // exposed for the background evictor to iterate through the memory and evict // in batch. This should improve insertion path for tiered memory config - size_t traverseAndEvictItems(unsigned int /* tid */, - unsigned int /* pid */, - unsigned int /* cid */, - size_t /* batch */) { - throw std::runtime_error("Not supported yet!"); + size_t traverseAndEvictItems(unsigned int tid, + unsigned int pid, + unsigned int cid, + size_t batch) { + auto& mmContainer = getMMContainer(tid, pid, cid); + size_t evictions = 0; + size_t evictionCandidates = 0; + std::vector candidates; + candidates.reserve(batch); + + size_t tries = 0; + mmContainer.withEvictionIterator([&tries, &candidates, &batch, &mmContainer, this](auto &&itr) { + while (candidates.size() < batch && + (config_.maxEvictionPromotionHotness == 0 || tries < config_.maxEvictionPromotionHotness) && + itr) { + tries++; + Item* candidate = itr.get(); + XDCHECK(candidate); + + if (candidate->isChainedItem()) { + throw std::runtime_error("Not supported for chained items"); + } + + if (candidate->markMoving()) { + mmContainer.remove(itr); + candidates.push_back(candidate); + } else { + ++itr; + } + } + }); + + for (Item *candidate : candidates) { + auto evictedToNext = tryEvictToNextMemoryTier(*candidate, true /* from BgThread */); + if (!evictedToNext) { + auto token = createPutToken(*candidate); + + auto ret = candidate->markForEvictionWhenMoving(); + XDCHECK(ret); + + unlinkItemForEviction(*candidate); + // wake up any readers that wait for the move to complete + // it's safe to do now, as we have the item marked exclusive and + // no other reader can be added to the waiters list + wakeUpWaiters(candidate->getKey(), WriteHandle{}); + + if (token.isValid() && shouldWriteToNvmCacheExclusive(*candidate)) { + nvmCache_->put(*candidate, std::move(token)); + } + } else { + evictions++; + XDCHECK(!evictedToNext->isMarkedForEviction() && !evictedToNext->isMoving()); + XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving()); + XDCHECK(!candidate->isAccessible()); + XDCHECK(candidate->getKey() == evictedToNext->getKey()); + + wakeUpWaiters(candidate->getKey(), std::move(evictedToNext)); + } + XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving()); + + if (candidate->hasChainedItem()) { + (*stats_.chainedItemEvictions)[tid][pid][cid].inc(); + } else { + (*stats_.regularItemEvictions)[tid][pid][cid].inc(); + } + + // it's safe to recycle the item here as there are no more + // references and the 
item could not been marked as moving + // by other thread since it's detached from MMContainer. + auto res = releaseBackToAllocator(*candidate, RemoveContext::kEviction, + /* isNascent */ false); + XDCHECK(res == ReleaseRes::kReleased); + } + return evictions; } - // exposed for the background promoter to iterate through the memory and - // promote in batch. This should improve find latency - size_t traverseAndPromoteItems(unsigned int /* tid */, - unsigned int /* pid */, - unsigned int /* cid */, - size_t /* batch */) { - throw std::runtime_error("Not supported yet!"); + size_t traverseAndPromoteItems(unsigned int tid, + unsigned int pid, + unsigned int cid, + size_t batch) { + auto& mmContainer = getMMContainer(tid, pid, cid); + size_t promotions = 0; + std::vector candidates; + candidates.reserve(batch); + + size_t tries = 0; + + mmContainer.withPromotionIterator([&tries, &candidates, &batch, &mmContainer, this](auto &&itr){ + while (candidates.size() < batch && (config_.maxEvictionPromotionHotness == 0 || tries < config_.maxEvictionPromotionHotness) && itr) { + tries++; + Item* candidate = itr.get(); + XDCHECK(candidate); + + if (candidate->isChainedItem()) { + throw std::runtime_error("Not supported for chained items"); + } + + // TODO: only allow it for read-only items? + // or implement mvcc + if (candidate->markMoving()) { + // promotions should rarely fail since we already marked moving + mmContainer.remove(itr); + candidates.push_back(candidate); + } + + ++itr; + } + }); + + for (Item *candidate : candidates) { + auto promoted = tryPromoteToNextMemoryTier(*candidate, true); + if (promoted) { + promotions++; + XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving()); + // it's safe to recycle the item here as there are no more + // references and the item could not been marked as moving + // by other thread since it's detached from MMContainer. 
+ // + // but we need to wake up waiters before releasing + // since candidate's key can change after being sent + // back to allocator + wakeUpWaiters(candidate->getKey(), std::move(promoted)); + auto res = releaseBackToAllocator(*candidate, RemoveContext::kEviction, + /* isNascent */ false); + XDCHECK(res == ReleaseRes::kReleased); + } else { + // we failed to allocate a new item, this item is no longer moving + auto ref = candidate->unmarkMoving(); + if (UNLIKELY(ref == 0)) { + wakeUpWaiters(candidate->getKey(),{}); + const auto res = + releaseBackToAllocator(*candidate, + RemoveContext::kNormal, false); + XDCHECK(res == ReleaseRes::kReleased); + } else if (candidate->isAccessible()) { + //case where we failed to allocate in lower tier + //item is still present in accessContainer + //item is no longer moving - acquire and + //wake up waiters with this handle + auto hdl = acquire(candidate); + insertInMMContainer(*hdl); + wakeUpWaiters(candidate->getKey(), std::move(hdl)); + } else if (!candidate->isAccessible()) { + //case where we failed to replace in access + //container due to another thread calling insertOrReplace + //unmark moving and return null handle + wakeUpWaiters(candidate->getKey(), {}); + if (UNLIKELY(ref == 0)) { + const auto res = + releaseBackToAllocator(*candidate, RemoveContext::kNormal, + false); + XDCHECK(res == ReleaseRes::kReleased); + } + } else { + XDCHECK(false); + } + } + } + return promotions; } // returns true if nvmcache is enabled and we should write this item to @@ -2091,49 +2293,6 @@ class CacheAllocator : public CacheBase { : false; } - // returns the background mover stats - BackgroundMoverStats getBackgroundMoverStats(MoverDir direction) const { - auto stats = BackgroundMoverStats{}; - if (direction == MoverDir::Evict) { - for (auto& bg : backgroundEvictor_) - stats += bg->getStats(); - } else if (direction == MoverDir::Promote) { - for (auto& bg : backgroundPromoter_) - stats += bg->getStats(); - } - return stats; - } - - std::map>> - getBackgroundMoverClassStats( - MoverDir direction) const { - std::map>> stats; - - if (direction == MoverDir::Evict) { - for (auto& bg : backgroundEvictor_) { - for (auto &tid : bg->getClassStats()) { - for (auto& pid : tid.second) { - for (auto& cid : pid.second) { - stats[tid.first][pid.first][cid.first] += cid.second; - } - } - } - } - } else if (direction == MoverDir::Promote) { - for (auto& bg : backgroundPromoter_) { - for (auto &tid : bg->getClassStats()) { - for (auto& pid : tid.second) { - for (auto& cid : pid.second) { - stats[tid.first][pid.first][cid.first] += cid.second; - } - } - } - } - } - - return stats; - } - bool tryGetHandleWithWaitContextForMovingItem(Item& item, WriteHandle& handle); @@ -2775,8 +2934,13 @@ CacheAllocator::allocate(PoolId poolId, } template -bool CacheAllocator::shouldWakeupBgEvictor(PoolId /* pid */, - ClassId /* cid */) { +bool CacheAllocator::shouldWakeupBgEvictor(TierId tid, PoolId pid, ClassId cid) { + // TODO: should we also work on lower tiers? should we have separate set of params? 
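+  // Editorial sketch of the condition below: the free fraction of this
+  // allocation class is (1 - approxUsage), so (1 - usage) * 100 is its free
+  // percentage. The evictor is woken once that percentage falls to or below
+  // lowEvictionAcWatermark (2.0 by default), i.e. once the class is roughly
+  // 98% full. In the two-tier setups this patch targets, tid == 1 is the
+  // last tier, so it never wakes the evictor.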
+  if (tid == 1) return false;
+  double usage = getPoolByTid(pid, tid).getApproxUsage(cid);
+  if (((1 - usage) * 100) <= config_.lowEvictionAcWatermark) {
+    return true;
+  }
   return false;
 }
 
@@ -2806,7 +2970,7 @@ CacheAllocator::allocateInternalTier(TierId tid,
   void* memory = allocator_[tid]->allocate(pid, requiredSize);
 
   if (backgroundEvictor_.size() && !fromBgThread &&
-      (memory == nullptr || shouldWakeupBgEvictor(pid, cid))) {
+      (memory == nullptr || shouldWakeupBgEvictor(tid, pid, cid))) {
     backgroundEvictor_[BackgroundMover::workerId(
                            tid, pid, cid, backgroundEvictor_.size())]
         ->wakeUp();
@@ -4064,6 +4228,47 @@ CacheAllocator::tryEvictToNextMemoryTier(Item& item, bool fromBgThre
   return tryEvictToNextMemoryTier(tid, pid, item, fromBgThread);
 }
 
+template <typename CacheTrait>
+typename CacheAllocator<CacheTrait>::WriteHandle
+CacheAllocator<CacheTrait>::tryPromoteToNextMemoryTier(
+    TierId tid, PoolId pid, Item& item, bool fromBgThread) {
+  if (item.isExpired()) { return {}; }
+  TierId nextTier = tid;
+  while (nextTier > 0) { // try to promote to the next (upper) memory tier
+    auto toPromoteTier = nextTier - 1;
+    --nextTier;
+
+    // allocateInternalTier might trigger another eviction
+    auto newItemHdl = allocateInternalTier(toPromoteTier, pid,
+                     item.getKey(),
+                     item.getSize(),
+                     item.getCreationTime(),
+                     item.getExpiryTime(),
+                     fromBgThread);
+
+    if (newItemHdl) {
+      XDCHECK_EQ(newItemHdl->getSize(), item.getSize());
+      if (!moveRegularItem(item, newItemHdl)) {
+        return WriteHandle{};
+      }
+      item.unmarkMoving();
+      return newItemHdl;
+    } else {
+      return WriteHandle{};
+    }
+  }
+
+  return {};
+}
+
+template <typename CacheTrait>
+typename CacheAllocator<CacheTrait>::WriteHandle
+CacheAllocator<CacheTrait>::tryPromoteToNextMemoryTier(Item& item, bool fromBgThread) {
+  auto tid = getTierId(item);
+  auto pid = allocator_[tid]->getAllocInfo(item.getMemory()).poolId;
+  return tryPromoteToNextMemoryTier(tid, pid, item, fromBgThread);
+}
+
 template <typename CacheTrait>
 typename CacheAllocator<CacheTrait>::RemoveRes
 CacheAllocator<CacheTrait>::remove(typename Item::Key key) {
@@ -5106,6 +5311,9 @@ ACStats CacheAllocator::getACStats(TierId tid,
   const auto& ac = pool.getAllocationClass(classId);
 
   auto stats = ac.getStats();
   stats.allocLatencyNs = (*stats_.classAllocLatency)[tid][poolId][classId];
+  stats.evictionAttempts = (*stats_.evictionAttempts)[tid][poolId][classId].get();
+  stats.evictions = (*stats_.regularItemEvictions)[tid][poolId][classId].get() +
+                    (*stats_.chainedItemEvictions)[tid][poolId][classId].get();
 
   return stats;
 }
diff --git a/cachelib/allocator/CacheAllocatorConfig.h b/cachelib/allocator/CacheAllocatorConfig.h
index 768b15c5eb..227f2e5354 100644
--- a/cachelib/allocator/CacheAllocatorConfig.h
+++ b/cachelib/allocator/CacheAllocatorConfig.h
@@ -639,6 +639,24 @@ class CacheAllocatorConfig {
   // CacheAllocator::startCacheWorkers()
   bool delayCacheWorkersStart{false};
 
+  // see MultiTierDataMovement.md
+  double promotionAcWatermark{4.0};
+  double lowEvictionAcWatermark{2.0};
+  double highEvictionAcWatermark{5.0};
+  double numDuplicateElements{0.0}; // inclusiveness of the cache
+  double syncPromotion{0.0}; // whether promotion may run synchronously in the user thread
+
+  uint64_t evictorThreads{1};
+  uint64_t promoterThreads{1};
+
+  uint64_t maxEvictionBatch{40};
+  uint64_t maxPromotionBatch{10};
+
+  uint64_t minEvictionBatch{1};
+  uint64_t minPromotionBatch{1};
+
+  uint64_t maxEvictionPromotionHotness{60};
+
   friend CacheT;
 
  private:
diff --git a/cachelib/allocator/CacheStats.h b/cachelib/allocator/CacheStats.h
index 8c9b1c370c..aec24cb298 100644
--- a/cachelib/allocator/CacheStats.h
+++ b/cachelib/allocator/CacheStats.h
@@ -313,26 +313,43 @@ struct RebalancerStats {
   uint64_t lastPickTimeMs{0};
 
   uint64_t avgPickTimeMs{0};
+
+  // aggregates stats together (across tiers)
+  RebalancerStats& operator+=(const RebalancerStats& other);
 };
 
 // Mover Stats
 struct BackgroundMoverStats {
   // the number of items this worker moved by looking at pools/classes stats
   uint64_t numMovedItems{0};
-  // number of times we went executed the thread //TODO: is this def correct?
+
+  // number of times the periodic worker executed this mover
   uint64_t runCount{0};
-  // total number of classes
+
+  // average number of items moved per run
+  double avgItemsMoved{0.0};
+
+  // number of times we actually traversed the mmContainer
+  uint64_t numTraversals{0};
+
+  // number of classes traversed
   uint64_t totalClasses{0};
-  // eviction size
+
+  // total bytes moved
   uint64_t totalBytesMoved{0};
+
+  // time in ns taken by the last traversal
+  uint64_t lastTraversalTimeNs{0};
+
+  // minimum traversal time in ns over all traversals
+  uint64_t minTraversalTimeNs{0};
+
+  // maximum traversal time in ns over all traversals
+  uint64_t maxTraversalTimeNs{0};
+
+  // average traversal time in ns over all traversals
+  uint64_t avgTraversalTimeNs{0};
 
-  BackgroundMoverStats& operator+=(const BackgroundMoverStats& rhs) {
-    numMovedItems += rhs.numMovedItems;
-    runCount += rhs.runCount;
-    totalClasses += rhs.totalClasses;
-    totalBytesMoved += rhs.totalBytesMoved;
-    return *this;
-  }
 };
 
 // CacheMetadata type to export
@@ -356,9 +373,9 @@ struct Stats;
 // the ones that are aggregated over all pools
 struct GlobalCacheStats {
   // background eviction stats
-  BackgroundMoverStats evictionStats;
-
-  BackgroundMoverStats promotionStats;
+  std::vector<BackgroundMoverStats> evictionStats;
+
+  std::vector<BackgroundMoverStats> promotionStats;
 
   // number of calls to CacheAllocator::find
   uint64_t numCacheGets{0};
diff --git a/cachelib/allocator/FreeThresholdStrategy.cpp b/cachelib/allocator/FreeThresholdStrategy.cpp
index 1fafda2bc9..284248b1cf 100644
--- a/cachelib/allocator/FreeThresholdStrategy.cpp
+++ b/cachelib/allocator/FreeThresholdStrategy.cpp
@@ -30,9 +30,49 @@ FreeThresholdStrategy::FreeThresholdStrategy(double lowEvictionAcWatermark,
       minEvictionBatch(minEvictionBatch) {}
 
 std::vector<size_t> FreeThresholdStrategy::calculateBatchSizes(
-    const CacheBase& /* cache */,
-    std::vector<MemoryDescriptorType> /* acVec */) {
-  throw std::runtime_error("Not supported yet!");
+    const CacheBase& cache,
+    std::vector<MemoryDescriptorType> acVec) {
+  std::vector<size_t> batches{};
+  for (auto [tid, pid, cid] : acVec) {
+    const auto& pool = cache.getPoolByTid(pid, tid);
+    if (pool.getApproxFreeSlabs()) {
+      // free slabs remain, so no eviction is needed for this class
+      batches.push_back(0);
+      continue;
+    }
+    double usage = pool.getApproxUsage(cid);
+    if ((1 - usage) * 100 < highEvictionAcWatermark && pool.allSlabsAllocated()) {
+      auto toFreeMemPercent = highEvictionAcWatermark - (1 - usage) * 100;
+      auto toFreeItems = static_cast<size_t>(
+          toFreeMemPercent * (pool.getApproxSlabs(cid) * pool.getPerSlab(cid)));
+      batches.push_back(toFreeItems);
+    } else {
+      batches.push_back(0);
+    }
+  }
+
+  if (batches.size() == 0) {
+    return batches;
+  }
+
+  auto maxBatch = *std::max_element(batches.begin(), batches.end());
+  if (maxBatch == 0)
+    return batches;
+
+  std::transform(
+      batches.begin(), batches.end(), batches.begin(), [&](auto numItems) {
+        if (numItems == 0) {
+          return 0UL;
+        }
+
+        auto cappedBatchSize = maxEvictionBatch * numItems / maxBatch;
+        if (cappedBatchSize < minEvictionBatch)
+          return minEvictionBatch;
+        else
+          return cappedBatchSize;
+      });
+
+  return batches;
 }
 
 } // namespace facebook::cachelib
diff --git a/cachelib/allocator/MMLru.h b/cachelib/allocator/MMLru.h
index a98c86d9a6..4c0771a33f 100644
--- a/cachelib/allocator/MMLru.h +++ b/cachelib/allocator/MMLru.h @@ -378,6 +378,11 @@ class MMLru { template void withContainerLock(F&& f); + // Execute provided function under container lock. Function gets + // iterator passed as parameter. + template + void withPromotionIterator(F&& f); + // get copy of current config Config getConfig() const; @@ -716,6 +721,18 @@ void MMLru::Container::withEvictionIterator(F&& fun) { } } +template T::*HookPtr> +template +void +MMLru::Container::withPromotionIterator(F&& fun) { + if (config_.useCombinedLockForIterators) { + lruMutex_->lock_combine([this, &fun]() { fun(Iterator{lru_.begin()}); }); + } else { + LockHolder lck{*lruMutex_}; + fun(Iterator{lru_.begin()}); + } +} + template T::*HookPtr> template void MMLru::Container::withContainerLock(F&& fun) { diff --git a/cachelib/allocator/MMTinyLFU.h b/cachelib/allocator/MMTinyLFU.h index 71359c4782..2a6da11687 100644 --- a/cachelib/allocator/MMTinyLFU.h +++ b/cachelib/allocator/MMTinyLFU.h @@ -495,6 +495,11 @@ class MMTinyLFU { template void withEvictionIterator(F&& f); + // Execute provided function under container lock. Function gets + // iterator passed as parameter. + template + void withPromotionIterator(F&& f); + // Execute provided function under container lock. template void withContainerLock(F&& f); @@ -846,6 +851,13 @@ void MMTinyLFU::Container::withEvictionIterator(F&& fun) { fun(getEvictionIterator()); } +template T::*HookPtr> +template +void +MMTinyLFU::Container::withPromotionIterator(F&& fun) { + throw std::runtime_error("Not supported"); +} + template T::*HookPtr> template void MMTinyLFU::Container::withContainerLock(F&& fun) { diff --git a/cachelib/allocator/PromotionStrategy.h b/cachelib/allocator/PromotionStrategy.h index d3eb8686c5..233c03cc10 100644 --- a/cachelib/allocator/PromotionStrategy.h +++ b/cachelib/allocator/PromotionStrategy.h @@ -35,7 +35,43 @@ class PromotionStrategy : public BackgroundMoverStrategy { std::vector calculateBatchSizes( const CacheBase& cache, std::vector acVec) { - return {}; + std::vector batches{}; + for (auto [tid, pid, cid] : acVec) { + XDCHECK(tid > 0); + const auto& pool = cache.getPoolByTid(pid, tid-1); + double usage = pool.getApproxUsage(cid); + if ((1-usage)*100 <= promotionAcWatermark) + batches.push_back(0); + else { + auto maxPossibleItemsToPromote = static_cast( + ( (promotionAcWatermark - (1-usage*100) ) * + (pool.getApproxSlabs(cid) * pool.getPerSlab(cid)) ) ); + batches.push_back(maxPossibleItemsToPromote); + } + } + + if (batches.size() == 0) { + return batches; + } + + auto maxBatch = *std::max_element(batches.begin(), batches.end()); + if (maxBatch == 0) + return batches; + + std::transform( + batches.begin(), batches.end(), batches.begin(), [&](auto numItems) { + if (numItems == 0) { + return 0UL; + } + + auto cappedBatchSize = maxPromotionBatch * numItems / maxBatch; + if (cappedBatchSize < minPromotionBatch) + return minPromotionBatch; + else + return cappedBatchSize; + }); + + return batches; } private: diff --git a/cachelib/allocator/memory/AllocationClass.cpp b/cachelib/allocator/memory/AllocationClass.cpp index 512df86bbe..e43494441f 100644 --- a/cachelib/allocator/memory/AllocationClass.cpp +++ b/cachelib/allocator/memory/AllocationClass.cpp @@ -704,6 +704,30 @@ ACStats AllocationClass::getStats() const { }); } +uint32_t AllocationClass::getPerSlab() const { + return getAllocsPerSlab(); +} + +uint32_t AllocationClass::getApproxSlabs() const { + return allocatedSlabs_.size(); +} + +double AllocationClass::getApproxUsage() const 
{ + const unsigned long long nSlabsAllocated = allocatedSlabs_.size(); + if (nSlabsAllocated == 0) { + return 0.0; + } + const unsigned long long perSlab = getAllocsPerSlab(); + const auto freeAllocsInCurrSlab = + canAllocateFromCurrentSlabLocked() + ? (Slab::kSize - currOffset_) / allocationSize_ + : 0; + const unsigned long long nFreedAllocs = freedAllocations_.size(); + const unsigned long long nActiveAllocs = + nSlabsAllocated * perSlab - nFreedAllocs - freeAllocsInCurrSlab; + return (double) nActiveAllocs / (double) (nSlabsAllocated * perSlab); +} + void AllocationClass::createSlabReleaseAllocMapLocked(const Slab* slab) { // Initialize slab free state // Each bit represents whether or not an alloc has already been freed diff --git a/cachelib/allocator/memory/AllocationClass.h b/cachelib/allocator/memory/AllocationClass.h index 269887f207..6a9412db5e 100644 --- a/cachelib/allocator/memory/AllocationClass.h +++ b/cachelib/allocator/memory/AllocationClass.h @@ -97,6 +97,13 @@ class AllocationClass { // fetch stats about this allocation class. ACStats getStats() const; + // get approx usage as fraction of used allocs/total allocs in this class + double getApproxUsage() const; + // get approx slabs in this class + uint32_t getApproxSlabs() const; + // get items per slabs in this class + uint32_t getPerSlab() const; + // Whether the pool is full or free to allocate more in the current state. // This is only a hint and not a gurantee that subsequent allocate will // fail/succeed. diff --git a/cachelib/allocator/memory/MemoryAllocatorStats.h b/cachelib/allocator/memory/MemoryAllocatorStats.h index 7301145286..048fd84247 100644 --- a/cachelib/allocator/memory/MemoryAllocatorStats.h +++ b/cachelib/allocator/memory/MemoryAllocatorStats.h @@ -53,6 +53,9 @@ struct ACStats { // Rolling allocation latency (in ns) util::RollingStats allocLatencyNs; + uint64_t evictionAttempts; + uint64_t evictions; + constexpr unsigned long long totalSlabs() const noexcept { return freeSlabs + usedSlabs; } @@ -67,6 +70,15 @@ struct ACStats { return activeAllocs / (usedSlabs * allocsPerSlab); } + + constexpr double approxUsage() const noexcept { + const unsigned long long nSlabsAllocated = usedSlabs; + if (nSlabsAllocated == 0) { + return 0.0; + } + const unsigned long long perSlab = allocsPerSlab; + return (double) activeAllocs / (double) (nSlabsAllocated * perSlab); + } constexpr size_t totalAllocatedSize() const noexcept { return activeAllocs * allocSize; diff --git a/cachelib/allocator/memory/MemoryPool.cpp b/cachelib/allocator/memory/MemoryPool.cpp index 21c04841e5..6caa409d0e 100644 --- a/cachelib/allocator/memory/MemoryPool.cpp +++ b/cachelib/allocator/memory/MemoryPool.cpp @@ -523,3 +523,22 @@ MPStats MemoryPool::getStats() const { slabsUnAllocated, nSlabResize_, nSlabRebalance_, curSlabsAdvised_}; } + +double MemoryPool::getApproxUsage(ClassId cid) const { + const auto& ac = getAllocationClassFor(cid); + return ac.getApproxUsage(); +} + +uint32_t MemoryPool::getApproxFreeSlabs() const { + return freeSlabs_.size(); +} + +uint32_t MemoryPool::getApproxSlabs(ClassId cid) const { + const auto& ac = getAllocationClassFor(cid); + return ac.getApproxSlabs(); +} + +uint32_t MemoryPool::getPerSlab(ClassId cid) const { + const auto& ac = getAllocationClassFor(cid); + return ac.getPerSlab(); +} diff --git a/cachelib/allocator/memory/MemoryPool.h b/cachelib/allocator/memory/MemoryPool.h index 00c2c8c8b8..bd607fe06c 100644 --- a/cachelib/allocator/memory/MemoryPool.h +++ b/cachelib/allocator/memory/MemoryPool.h @@ -132,6 
+132,14 @@ class MemoryPool { } MPStats getStats() const; + // approx usage fraction per class + double getApproxUsage(ClassId cid) const; + // approx slabs assigned to a given class + uint32_t getApproxSlabs(ClassId cid) const; + + uint32_t getApproxFreeSlabs() const; + // items per slab for a class + uint32_t getPerSlab(ClassId cid) const; // allocates memory of at least _size_ bytes. // diff --git a/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp b/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp index 13388f8e8e..a08ee04e6d 100644 --- a/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp +++ b/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp @@ -27,6 +27,7 @@ TEST_F(LruAllocatorMemoryTiersTest, MultiTiersInvalid) { this->testMultiTiersInv TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValid) { this->testMultiTiersValid(); } TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValidStats) { this->testMultiTiersValidStats(); } TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValidMixed) { this->testMultiTiersValidMixed(); } +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersBackgroundMovers ) { this->testMultiTiersBackgroundMovers(); } TEST_F(LruAllocatorMemoryTiersTest, MultiTiersRemoveDuringEviction) { this->testMultiTiersRemoveDuringEviction(); } TEST_F(LruAllocatorMemoryTiersTest, MultiTiersReplaceDuringEviction) { this->testMultiTiersReplaceDuringEviction(); } TEST_F(LruAllocatorMemoryTiersTest, MultiTiersReplaceDuringEvictionWithReader) { this->testMultiTiersReplaceDuringEvictionWithReader(); } diff --git a/cachelib/allocator/tests/AllocatorMemoryTiersTest.h b/cachelib/allocator/tests/AllocatorMemoryTiersTest.h index 27db22bac3..5af34db94a 100644 --- a/cachelib/allocator/tests/AllocatorMemoryTiersTest.h +++ b/cachelib/allocator/tests/AllocatorMemoryTiersTest.h @@ -19,6 +19,8 @@ #include "cachelib/allocator/CacheAllocatorConfig.h" #include "cachelib/allocator/MemoryTierCacheConfig.h" #include "cachelib/allocator/tests/TestBase.h" +#include "cachelib/allocator/FreeThresholdStrategy.h" +#include "cachelib/allocator/PromotionStrategy.h" #include #include @@ -153,6 +155,81 @@ class AllocatorMemoryTiersTest : public AllocatorTest { ASSERT_EQ(evictCount[tid],evictCount[tid-1]); } } + + void testMultiTiersBackgroundMovers() { + typename AllocatorT::Config config; + config.setCacheSize(10 * Slab::kSize); + config.enableCachePersistence("/tmp"); + config.usePosixForShm(); + auto moveCb = [&](typename AllocatorT::Item& oldItem, + typename AllocatorT::Item& newItem, + typename AllocatorT::Item* /* parentPtr */) { + std::memcpy(newItem.getMemory(), oldItem.getMemory(), + oldItem.getSize()); + }; + + config.enableMovingOnSlabRelease(moveCb, {}, 10); + config.configureMemoryTiers({ + MemoryTierCacheConfig::fromShm() + .setRatio(1).setMemBind(std::string("0")), + MemoryTierCacheConfig::fromShm() + .setRatio(1).setMemBind(std::string("0")) + }); + config.enableBackgroundEvictor(std::make_shared(2, 10, 100, 40), + std::chrono::milliseconds(10),1); + config.enableBackgroundPromoter(std::make_shared(5, 4, 2), + std::chrono::milliseconds(10),1); + + auto allocator = std::make_unique(AllocatorT::SharedMemNew, config); + ASSERT(allocator != nullptr); + const size_t numBytes = allocator->getCacheMemoryStats().ramCacheSize; + + auto poolId = allocator->addPool("default", numBytes); + + const unsigned int keyLen = 100; + const unsigned int size = 100; + unsigned int allocs = 0; + + //we should work on pool stats because filluppooluntil evictions + //will finish once we evict an item from tier 0 to tier 1 
and + //there will be unallocated memory left. + while (allocs < 174760) { + const auto key = this->getRandomNewKey(*allocator, keyLen); + ASSERT_EQ(allocator->find(key), nullptr); + auto handle = util::allocateAccessible(*allocator, poolId, key, size); + allocs++; + } + + const auto key = this->getRandomNewKey(*allocator, keyLen); + auto handle = util::allocateAccessible(*allocator, poolId, key, size); + ASSERT_NE(nullptr, handle); + const uint8_t cid = allocator->getAllocInfo(handle->getMemory()).classId; + ASSERT_EQ(cid,5); + auto stats = allocator->getGlobalCacheStats(); + auto slabStats = allocator->getACStats(0,0,cid); + const auto& mpStats = allocator->getPoolByTid(poolId, 0).getStats(); + //cache is 10MB should move about 1MB to reach 10% free + uint32_t approxEvict = (1024*1024)/mpStats.acStats.at(cid).allocSize; + while (stats.evictionStats[0].numMovedItems < approxEvict*0.95 && (1-slabStats.usageFraction()) >= 0.095) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + stats = allocator->getGlobalCacheStats(); + slabStats = allocator->getACStats(0,0,cid); + } + ASSERT_GE(1-slabStats.usageFraction(),0.095); + + auto perclassEstats = allocator->getBackgroundMoverClassStats(MoverDir::Evict); + auto perclassPstats = allocator->getBackgroundMoverClassStats(MoverDir::Promote); + + ASSERT_GE(stats.evictionStats[0].numMovedItems,1); + ASSERT_GE(stats.evictionStats[0].runCount,1); + ASSERT_GE(stats.promotionStats[0].numMovedItems,1); + + MemoryDescriptorType tier0(0,0,cid); + MemoryDescriptorType tier1(1,0,cid); + ASSERT_GE(perclassEstats[tier0], 1); + ASSERT_GE(perclassPstats[tier1], 1); + + } void testMultiTiersValidMixed() { typename AllocatorT::Config config; diff --git a/cachelib/cachebench/cache/Cache.h b/cachelib/cachebench/cache/Cache.h index c0896cd137..14e47161fc 100644 --- a/cachelib/cachebench/cache/Cache.h +++ b/cachelib/cachebench/cache/Cache.h @@ -521,6 +521,15 @@ Cache::Cache(const CacheConfig& config, config_.getRebalanceStrategy(), std::chrono::seconds(config_.poolRebalanceIntervalSec)); + allocatorConfig_.enableBackgroundEvictor( + config_.getBackgroundEvictorStrategy(), + std::chrono::milliseconds(config_.backgroundEvictorIntervalMilSec), + config_.evictorThreads); + + allocatorConfig_.enableBackgroundPromoter( + config_.getBackgroundPromoterStrategy(), + std::chrono::milliseconds(config_.backgroundPromoterIntervalMilSec), + config_.promoterThreads); if (config_.moveOnSlabRelease && movingSync != nullptr) { allocatorConfig_.enableMovingOnSlabRelease( [](Item& oldItem, Item& newItem, Item* parentPtr) { @@ -575,6 +584,12 @@ Cache::Cache(const CacheConfig& config, } }); + allocatorConfig_.maxEvictionBatch = config_.maxEvictionBatch; + allocatorConfig_.maxPromotionBatch = config_.maxPromotionBatch; + allocatorConfig_.minEvictionBatch = config_.minEvictionBatch; + allocatorConfig_.minPromotionBatch = config_.minPromotionBatch; + allocatorConfig_.maxEvictionPromotionHotness = config_.maxEvictionPromotionHotness; + if (config_.enableItemDestructorCheck) { auto removeCB = [&](const typename Allocator::DestructorData& data) { if (!itemRecords_.validate(data)) { @@ -1134,15 +1149,17 @@ Stats Cache::getStats() const { ret.numItems.push_back(aggregate.numItems()); } - std::map>> allocationClassStats{}; + std::map allocationClassStats{}; for (size_t pid = 0; pid < pools_.size(); pid++) { PoolId poolId = static_cast(pid); auto poolStats = cache_->getPoolStats(poolId); auto cids = poolStats.getClassIds(); for (TierId tid = 0; tid < cache_->getNumTiers(); tid++) { - for (auto 
cid : cids) - allocationClassStats[tid][pid][cid] = cache_->getACStats(tid, pid, cid); + for (auto cid : cids) { + MemoryDescriptorType md(tid,pid,cid); + allocationClassStats[md] = cache_->getACStats(tid, pid, cid); + } } } @@ -1151,19 +1168,14 @@ Stats Cache::getStats() const { const auto navyStats = cache_->getNvmCacheStatsMap().toMap(); ret.allocationClassStats = allocationClassStats; + + ret.backgroundEvictorStats = cacheStats.evictionStats; + ret.backgroundPromoStats = cacheStats.promotionStats; + ret.evictAttempts = cacheStats.evictionAttempts; ret.allocAttempts = cacheStats.allocAttempts; ret.allocFailures = cacheStats.allocFailures; - ret.backgndEvicStats.nEvictedItems = cacheStats.evictionStats.numMovedItems; - ret.backgndEvicStats.nTraversals = cacheStats.evictionStats.runCount; - ret.backgndEvicStats.nClasses = cacheStats.evictionStats.totalClasses; - ret.backgndEvicStats.evictionSize = cacheStats.evictionStats.totalBytesMoved; - - ret.backgndPromoStats.nPromotedItems = - cacheStats.promotionStats.numMovedItems; - ret.backgndPromoStats.nTraversals = cacheStats.promotionStats.runCount; - ret.numCacheGets = cacheStats.numCacheGets; ret.numCacheGetMiss = cacheStats.numCacheGetMiss; ret.numCacheEvictions = cacheStats.numCacheEvictions; diff --git a/cachelib/cachebench/cache/CacheStats.h b/cachelib/cachebench/cache/CacheStats.h index 7d5e05522b..bf79b8aa65 100644 --- a/cachelib/cachebench/cache/CacheStats.h +++ b/cachelib/cachebench/cache/CacheStats.h @@ -27,31 +27,10 @@ namespace facebook { namespace cachelib { namespace cachebench { -struct BackgroundEvictionStats { - // the number of items this worker evicted by looking at pools/classes stats - uint64_t nEvictedItems{0}; - - // number of times we went executed the thread //TODO: is this def correct? - uint64_t nTraversals{0}; - - // number of classes - uint64_t nClasses{0}; - - // size of evicted items - uint64_t evictionSize{0}; -}; - -struct BackgroundPromotionStats { - // the number of items this worker evicted by looking at pools/classes stats - uint64_t nPromotedItems{0}; - - // number of times we went executed the thread //TODO: is this def correct? - uint64_t nTraversals{0}; -}; struct Stats { - BackgroundEvictionStats backgndEvicStats; - BackgroundPromotionStats backgndPromoStats; + std::vector backgroundEvictorStats; + std::vector backgroundPromoStats; ReaperStats reaperStats; std::vector numEvictions; @@ -130,15 +109,17 @@ struct Stats { uint64_t invalidDestructorCount{0}; int64_t unDestructedItemCount{0}; - std::map>> allocationClassStats; + std::map allocationClassStats; // populate the counters related to nvm usage. Cache implementation can decide // what to populate since not all of those are interesting when running // cachebench. std::unordered_map nvmCounters; + + using ClassBgStatsType = std::map; - std::map>> backgroundEvictionClasses; - std::map>> backgroundPromotionClasses; + ClassBgStatsType backgroundEvictionClasses; + ClassBgStatsType backgroundPromotionClasses; // errors from the nvm engine. 
std::unordered_map nvmErrors; @@ -152,32 +133,34 @@ struct Stats { } out << folly::sformat("Items in NVM : {:,}", numNvmItems) << std::endl; for (TierId tid = 0; tid < nTiers; tid++) { - out << folly::sformat("Tier {} Alloc Attempts: {:,} Success: {:.2f}%", - tid, - allocAttempts[tid], - invertPctFn(allocFailures[tid], allocAttempts[tid])) + out << folly::sformat("Tier {} Alloc Attempts: {:,}\n" + "Tier {} Alloc Success: {:.2f}%", + tid, allocAttempts[tid], + tid, invertPctFn(allocFailures[tid], allocAttempts[tid])) << std::endl; } for (TierId tid = 0; tid < nTiers; tid++) { out << folly::sformat( - "Tier {} Evict Attempts: {:,} Success: {:.2f}%", - tid, - evictAttempts[tid], - pctFn(numEvictions[tid], evictAttempts[tid])) + "Tier {} Evict Attempts: {:,}\n" + "Tier {} Success: {:.2f}%", + tid, evictAttempts[tid], + tid, invertPctFn(evictAttempts[tid] - numEvictions[tid], evictAttempts[tid])) << std::endl; } for (TierId tid = 0; tid < nTiers; tid++) { - out << folly::sformat("Tier {} Evictions : {:,} Writebacks: {:,} Success: {:.2f}%", - tid, numEvictions[tid], numWritebacks[tid], - invertPctFn(numEvictions[tid] - numWritebacks[tid], numEvictions[tid])) << std::endl; + out << folly::sformat("Tier {} Evictions: {:,}\n" + "Tier {} Writebacks: {:,}\n" + "Tier {} Success: {:.2f}%", + tid, numEvictions[tid], + tid, numWritebacks[tid], + tid, invertPctFn(numEvictions[tid] - numWritebacks[tid], numEvictions[tid])) + << std::endl; } + auto foreachAC = [&](auto &map, auto cb) { - for (auto &tidStat : map) { - for (auto &pidStat : tidStat.second) { - for (auto &cidStat : pidStat.second) { - cb(tidStat.first, pidStat.first, cidStat.first, cidStat.second); - } - } + for (const auto& [key, value] : map) { + auto [tid,pid,cid] = key; + cb(tid, pid, cid, value); } }; @@ -215,16 +198,12 @@ struct Stats { }; auto foreachAC = [&](auto cb) { - for (auto& tidStat : allocationClassStats) { - for (auto& pidStat : tidStat.second) { - for (auto& cidStat : pidStat.second) { - cb(tidStat.first, pidStat.first, cidStat.first, cidStat.second); - } - } + for (const auto& [key, value] : allocationClassStats) { + auto [tid,pid,cid] = key; + cb(tid, pid, cid, value); } }; - - + foreachAC([&](auto tid, auto pid, auto cid, auto stats) { auto [allocSizeSuffix, allocSize] = formatMemory(stats.allocSize); auto [memorySizeSuffix, memorySize] = @@ -232,21 +211,60 @@ struct Stats { // If the pool is not full, extrapolate usageFraction for AC assuming it // will grow at the same rate. This value will be the same for all ACs. - const auto acUsageFraction = (poolUsageFraction.at(tid)[pid] < 1.0) - ? 
poolUsageFraction.at(tid)[pid] - : stats.usageFraction(); - - out << folly::sformat( - "tid{:2} pid{:2} cid{:4} {:8.2f}{} usageFraction: {:4.2f} " - "memorySize: {:8.2f}{} " - "rollingAvgAllocLatency: {:8.2f}ns", + if (memorySize > 0) { + const auto acUsageFraction = stats.approxUsage(); + out << folly::sformat( + "tid{:2} pid{:2} cid{:4} {:8.2f}{} usage fraction: {:4.2f}\n" + "tid{:2} pid{:2} cid{:4} {:8.2f}{} memory size in {}: {:8.2f}\n" + "tid{:2} pid{:2} cid{:4} {:8.2f}{} eviction success: {:4.2f}\n" + "tid{:2} pid{:2} cid{:4} {:8.2f}{} rolling avg alloc latency in ns: {:8.2f}", tid, pid, cid, allocSize, allocSizeSuffix, acUsageFraction, - memorySize, memorySizeSuffix, - stats.allocLatencyNs.estimate()) + tid, pid, cid, allocSize, allocSizeSuffix, memorySizeSuffix, memorySize, + tid, pid, cid, allocSize, allocSizeSuffix, (double)(stats.evictions/(double)stats.evictionAttempts), + tid, pid, cid, allocSize, allocSizeSuffix, stats.allocLatencyNs.estimate()) << std::endl; + } }); } + int bgId = 1; + for (auto &bgWorkerStats : backgroundEvictorStats) { + if (bgWorkerStats.numMovedItems > 0) { + out << folly::sformat(" == Background Evictor Threads ==") << std::endl; + out << folly::sformat("Background Evictor Thread {} Evicted Items: {:,}\n" + "Background Evictor Thread {} Traversals: {:,}\n" + "Background Evictor Thread {} Run Count: {:,}\n" + "Background Evictor Thread {} Avg Time Per Traversal in ns: {:,}\n" + "Background Evictor Thread {} Avg Items Evicted: {:.2f}", + bgId, bgWorkerStats.numMovedItems, + bgId, bgWorkerStats.numTraversals, + bgId, bgWorkerStats.runCount, + bgId, bgWorkerStats.avgTraversalTimeNs, + bgId, (double)bgWorkerStats.numMovedItems/(double)bgWorkerStats.numTraversals) + << std::endl; + } + bgId++; + + } + bgId = 1; + for (auto &bgWorkerStats : backgroundPromoStats) { + if (bgWorkerStats.numMovedItems > 0) { + out << folly::sformat(" == Background Promoter Threads ==") << std::endl; + out << folly::sformat("Background Promoter Thread {} Promoted Items: {:,}\n" + "Background Promoter Thread {} Traversals: {:,}\n" + "Background Promoter Thread {} Run Count: {:,}\n" + "Background Promoter Thread {} Avg Time Per Traversal in ns: {:,}\n" + "Background Promoter Thread {} Avg Items Promoted: {:.2f}", + bgId, bgWorkerStats.numMovedItems, + bgId, bgWorkerStats.numTraversals, + bgId, bgWorkerStats.runCount, + bgId, bgWorkerStats.avgTraversalTimeNs, + bgId, (double)bgWorkerStats.numMovedItems/(double)bgWorkerStats.numTraversals) + << std::endl; + } + bgId++; + + } if (numCacheGets > 0) { out << folly::sformat("Cache Gets : {:,}", numCacheGets) << std::endl; out << folly::sformat("Hit Ratio : {:6.2f}%", overallHitRatio) @@ -262,8 +280,7 @@ struct Stats { const util::PercentileStats::Estimates& latency) { auto fmtLatency = [&out, &cat](folly::StringPiece pct, double val) { - out << folly::sformat("{:20} {:8} : {:>10.2f} ns\n", cat, pct, - val); + out << folly::sformat("{:20} {:8} in ns: {:>10.2f}\n", cat, pct, val); }; fmtLatency("p50", latency.p50); @@ -281,38 +298,32 @@ struct Stats { } } - if (!backgroundEvictionClasses.empty() && - backgndEvicStats.nEvictedItems > 0) { + uint64_t totalbgevicted = 0; + uint64_t totalpromoted = 0; + for (int i = 0; i < backgroundEvictorStats.size(); i++) { + totalbgevicted += backgroundEvictorStats[i].numMovedItems; + } + for (int i = 0; i < backgroundPromoStats.size(); i++) { + totalpromoted += backgroundPromoStats[i].numMovedItems; + } + if (!backgroundEvictionClasses.empty() && totalbgevicted > 0 ) { out << "== Class Background Eviction 
Counters Map ==" << std::endl; - foreachAC(backgroundEvictionClasses, - [&](auto tid, auto pid, auto cid, auto evicted) { - out << folly::sformat("tid{:2} pid{:2} cid{:4} evicted: {:4}", - tid, pid, cid, evicted) << std::endl; - }); - - out << folly::sformat("Background Evicted Items : {:,}", - backgndEvicStats.nEvictedItems) - << std::endl; - out << folly::sformat("Background Evictor Traversals : {:,}", - backgndEvicStats.nTraversals) - << std::endl; + foreachAC(backgroundEvictionClasses, [&](auto tid, auto pid, auto cid, auto evicted){ + if (evicted > 0) { + out << folly::sformat("tid{:2} pid{:2} cid{:4} evicted: {:4}", + tid, pid, cid, evicted) << std::endl; + } + }); } - - if (!backgroundPromotionClasses.empty() && - backgndPromoStats.nPromotedItems > 0) { + + if (!backgroundPromotionClasses.empty() && totalpromoted) { out << "== Class Background Promotion Counters Map ==" << std::endl; - foreachAC(backgroundPromotionClasses, - [&](auto tid, auto pid, auto cid, auto promoted) { - out << folly::sformat("tid{:2} pid{:2} cid{:4} promoted: {:4}", - pid, cid, promoted) << std::endl; - }); - - out << folly::sformat("Background Promoted Items : {:,}", - backgndPromoStats.nPromotedItems) - << std::endl; - out << folly::sformat("Background Promoter Traversals : {:,}", - backgndPromoStats.nTraversals) - << std::endl; + foreachAC(backgroundPromotionClasses, [&](auto tid, auto pid, auto cid, auto promoted){ + if (promoted > 0) { + out << folly::sformat("tid{:2} pid{:2} cid{:4} promoted: {:4}", + tid, pid, cid, promoted) << std::endl; + } + }); } if (reaperStats.numReapedItems > 0) { @@ -368,15 +379,15 @@ struct Stats { double devWriteAmp = pctFn(numNvmNandBytesWritten, numNvmBytesWritten) / 100.0; - out << folly::sformat("NVM bytes written (physical) : {:6.2f} GB\n", + out << folly::sformat("NVM bytes written (physical) in GB : {:6.2f}\n", numNvmBytesWritten / GB); - out << folly::sformat("NVM bytes written (logical) : {:6.2f} GB\n", + out << folly::sformat("NVM bytes written (logical) in GB : {:6.2f}\n", numNvmLogicalBytesWritten / GB); - out << folly::sformat("NVM bytes written (nand) : {:6.2f} GB\n", + out << folly::sformat("NVM bytes written (nand) in GB : {:6.2f}\n", numNvmNandBytesWritten / GB); - out << folly::sformat("NVM app write amplification : {:6.2f}\n", + out << folly::sformat("NVM app write amplification : {:6.2f}\n", appWriteAmp); - out << folly::sformat("NVM dev write amplification : {:6.2f}\n", + out << folly::sformat("NVM dev write amplification : {:6.2f}\n", devWriteAmp); } const double putSuccessPct = @@ -385,62 +396,57 @@ struct Stats { numNvmPuts); const double cleanEvictPct = pctFn(numNvmCleanEvict, numNvmEvictions); const double getCoalescedPct = pctFn(numNvmGetCoalesced, numNvmGets); - out << folly::sformat("{:14}: {:15,}, {:10}: {:6.2f}%", - "NVM Gets", - numNvmGets, - "Coalesced", - getCoalescedPct) + out << folly::sformat("{:30}: {:10,}\n" + "{:30}: {:10.2f}", + "NVM Gets", numNvmGets, + "NVM Coalesced in pct", getCoalescedPct) << std::endl; out << folly::sformat( - "{:14}: {:15,}, {:10}: {:6.2f}%, {:8}: {:6.2f}%, {:16}: " - "{:8,}, {:16}: {:8,}", - "NVM Puts", - numNvmPuts, - "Success", - putSuccessPct, - "Clean", - pctFn(numNvmPutFromClean, numNvmPuts), - "AbortsFromDel", - numNvmAbortedPutOnTombstone, - "AbortsFromGet", - numNvmAbortedPutOnInflightGet) + "{:30}: {:10,}\n" + "{:30}: {:10.2f}\n" + "{:30}: {:10.2f}\n" + "{:30}: {:10,}\n" + "{:30}: {:10,}", + "NVM Puts", numNvmPuts, + "NVM Puts Success in pct", putSuccessPct, + "NVM Puts from Clean in pct", 
pctFn(numNvmPutFromClean, numNvmPuts), + "NVM AbortsFromDel", numNvmAbortedPutOnTombstone, + "NVM AbortsFromGet", numNvmAbortedPutOnInflightGet) << std::endl; out << folly::sformat( - "{:14}: {:15,}, {:10}: {:6.2f}%, {:8}: {:7,}," - " {:16}: {:8,}", - "NVM Evicts", - numNvmEvictions, - "Clean", - cleanEvictPct, - "Unclean", - numNvmUncleanEvict, - "Double", - numNvmCleanDoubleEvict) + "{:30}: {:10,}\n" + "{:30}: {:10.2f}\n" + "{:30}: {:10,}\n" + "{:30}: {:10,}", + "NVM Evicts", numNvmEvictions, + "NVM Clean Evicts in pct", cleanEvictPct, + "NVM Unclean Evicts", numNvmUncleanEvict, + "NVM Clean Double Evicts", numNvmCleanDoubleEvict) << std::endl; const double skippedDeletesPct = pctFn(numNvmSkippedDeletes, numNvmDeletes); - out << folly::sformat("{:14}: {:15,} {:14}: {:6.2f}%", - "NVM Deletes", - numNvmDeletes, - "Skipped Deletes", - skippedDeletesPct) + out << folly::sformat("{:30}: {:10,}\n" + "{:30}: {:10.2f}", + "NVM Deletes", numNvmDeletes, + "NVM Skipped Deletes in pct", skippedDeletesPct) << std::endl; if (numNvmExceededMaxRetry > 0) { - out << folly::sformat("{}: {}", "NVM max read retry reached", + out << folly::sformat("{:30}: {:10,}", "NVM max read retry reached", numNvmExceededMaxRetry) << std::endl; } if (slabsReleased > 0) { out << folly::sformat( - "Released {:,} slabs\n" - " Moves : attempts: {:10,}, success: {:6.2f}%\n" - " Evictions : attempts: {:10,}, success: {:6.2f}%", + "Released slabs: {:,}\n" + "Slab Move attempts: {:10,}\n" + "Slab Move success in pct: {:6.2f}\n" + "Slab Eviction attempts: {:10,}\n" + "Slab Eviction success in pct: {:6.2f}", slabsReleased, moveAttemptsForSlabRelease, pctFn(moveSuccessesForSlabRelease, moveAttemptsForSlabRelease), evictionAttemptsForSlabRelease, - pctFn(evictionSuccessesForSlabRelease, - evictionAttemptsForSlabRelease)) + pctFn(evictionSuccessesForSlabRelease, evictionAttemptsForSlabRelease)) << std::endl; } @@ -458,8 +464,13 @@ struct Stats { } if (numCacheEvictions > 0) { - out << folly::sformat("Total evictions executed {:,}", numCacheEvictions) + out << folly::sformat("Total evictions executed : {:10,}", numCacheEvictions) << std::endl; + out << folly::sformat("Total background evictions: {:10,}", totalbgevicted) + << std::endl; + } + if (totalpromoted > 0) { + out << folly::sformat("Total promotions : {:10,}", totalpromoted) << std::endl; } } diff --git a/cachelib/cachebench/util/CacheConfig.cpp b/cachelib/cachebench/util/CacheConfig.cpp index 6d8f40874b..bcf5ea7e70 100644 --- a/cachelib/cachebench/util/CacheConfig.cpp +++ b/cachelib/cachebench/util/CacheConfig.cpp @@ -19,6 +19,8 @@ #include "cachelib/allocator/HitsPerSlabStrategy.h" #include "cachelib/allocator/LruTailAgeStrategy.h" #include "cachelib/allocator/RandomStrategy.h" +#include "cachelib/allocator/FreeThresholdStrategy.h" +#include "cachelib/allocator/PromotionStrategy.h" namespace facebook { namespace cachelib { @@ -28,6 +30,9 @@ CacheConfig::CacheConfig(const folly::dynamic& configJson) { JSONSetVal(configJson, cacheDir); JSONSetVal(configJson, cacheSizeMB); JSONSetVal(configJson, poolRebalanceIntervalSec); + JSONSetVal(configJson, backgroundEvictorIntervalMilSec); + JSONSetVal(configJson, backgroundPromoterIntervalMilSec); + JSONSetVal(configJson, backgroundEvictorStrategy); JSONSetVal(configJson, moveOnSlabRelease); JSONSetVal(configJson, rebalanceStrategy); JSONSetVal(configJson, rebalanceMinSlabs); @@ -109,10 +114,27 @@ CacheConfig::CacheConfig(const folly::dynamic& configJson) { JSONSetVal(configJson, nvmAdmissionRetentionTimeThreshold); 
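+  // Editorial note: with the background-mover fields registered below, a
+  // cachebench JSON config can set them directly. An illustrative sketch
+  // follows; the values are made up, only the key names come from this
+  // patch:
+  //
+  //   "cache_config": {
+  //     "backgroundEvictorIntervalMilSec": 10,
+  //     "backgroundPromoterIntervalMilSec": 10,
+  //     "backgroundEvictorStrategy": "threshold",
+  //     "evictorThreads": 2,
+  //     "promoterThreads": 1,
+  //     "lowEvictionAcWatermark": 2.0,
+  //     "highEvictionAcWatermark": 5.0,
+  //     "promotionAcWatermark": 4.0,
+  //     "maxEvictionBatch": 40,
+  //     "maxPromotionBatch": 10
+  //   }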
JSONSetVal(configJson, customConfigJson); + + //Background related configs + JSONSetVal(configJson, lowEvictionAcWatermark); + JSONSetVal(configJson, highEvictionAcWatermark); + JSONSetVal(configJson, minAcAllocationWatermark); + JSONSetVal(configJson, maxAcAllocationWatermark); + JSONSetVal(configJson, numDuplicateElements); + JSONSetVal(configJson, syncPromotion); + JSONSetVal(configJson, evictorThreads); + JSONSetVal(configJson, promoterThreads); + JSONSetVal(configJson, promotionAcWatermark); + JSONSetVal(configJson, maxEvictionBatch); + JSONSetVal(configJson, maxPromotionBatch); + JSONSetVal(configJson, minEvictionBatch); + JSONSetVal(configJson, minPromotionBatch); + JSONSetVal(configJson, maxEvictionPromotionHotness); + // if you added new fields to the configuration, update the JSONSetVal // to make them available for the json configs and increment the size // below - checkCorrectSize(); + checkCorrectSize(); if (numPools != poolSizes.size()) { throw std::invalid_argument(folly::sformat( @@ -148,6 +170,26 @@ MemoryTierConfig::MemoryTierConfig(const folly::dynamic& configJson) { checkCorrectSize(); } + +std::shared_ptr CacheConfig::getBackgroundEvictorStrategy() const { + if (backgroundEvictorIntervalMilSec == 0) { + return nullptr; + } + if (backgroundEvictorStrategy == "threshold") { + return std::make_shared(lowEvictionAcWatermark, highEvictionAcWatermark, maxEvictionBatch, minEvictionBatch); + } else if (backgroundEvictorStrategy == "fixed") { + return std::make_shared(maxEvictionBatch, highEvictionAcWatermark); + } else { + return std::make_shared(lowEvictionAcWatermark, highEvictionAcWatermark, maxEvictionBatch, minEvictionBatch); + } +} + +std::shared_ptr CacheConfig::getBackgroundPromoterStrategy() const { + if (backgroundPromoterIntervalMilSec == 0) { + return nullptr; + } + return std::make_shared(promotionAcWatermark, maxPromotionBatch, minPromotionBatch); +} } // namespace cachebench } // namespace cachelib } // namespace facebook diff --git a/cachelib/cachebench/util/CacheConfig.h b/cachelib/cachebench/util/CacheConfig.h index 028a18c596..b80eea3008 100644 --- a/cachelib/cachebench/util/CacheConfig.h +++ b/cachelib/cachebench/util/CacheConfig.h @@ -20,6 +20,7 @@ #include "cachelib/allocator/CacheAllocator.h" #include "cachelib/allocator/RebalanceStrategy.h" +#include "cachelib/allocator/BackgroundMoverStrategy.h" #include "cachelib/cachebench/util/JSONConfig.h" #include "cachelib/common/Ticker.h" #include "cachelib/navy/common/Device.h" @@ -71,7 +72,10 @@ struct CacheConfig : public JSONConfig { uint64_t cacheSizeMB{0}; uint64_t poolRebalanceIntervalSec{0}; + uint64_t backgroundEvictorIntervalMilSec{0}; + uint64_t backgroundPromoterIntervalMilSec{0}; std::string rebalanceStrategy; + std::string backgroundEvictorStrategy; uint64_t rebalanceMinSlabs{1}; double rebalanceDiffRatio{0.25}; bool moveOnSlabRelease{false}; @@ -271,6 +275,27 @@ struct CacheConfig : public JSONConfig { // eviction-age is more than this threshold. 
0 means no threshold uint32_t nvmAdmissionRetentionTimeThreshold{0}; + // See BackgroundMovers.md for complete description + double promotionAcWatermark{4.0}; + double lowEvictionAcWatermark{2.0}; + double highEvictionAcWatermark{5.0}; + double minAcAllocationWatermark{0.0}; + double maxAcAllocationWatermark{0.0}; + + double numDuplicateElements{0.0}; // inclusivness of the cache + double syncPromotion{0.0}; // can promotion be done synchronously in user thread + + uint64_t evictorThreads{1}; + uint64_t promoterThreads{1}; + + uint64_t maxEvictionBatch{40}; + uint64_t maxPromotionBatch{10}; + + uint64_t minEvictionBatch{5}; + uint64_t minPromotionBatch{5}; + + uint64_t maxEvictionPromotionHotness{60}; + // // Options below are not to be populated with JSON // @@ -306,6 +331,8 @@ struct CacheConfig : public JSONConfig { CacheConfig() {} std::shared_ptr getRebalanceStrategy() const; + std::shared_ptr getBackgroundEvictorStrategy() const; + std::shared_ptr getBackgroundPromoterStrategy() const; }; } // namespace cachebench } // namespace cachelib From 1593291e5692d7370c258441c734104818fe5ed0 Mon Sep 17 00:00:00 2001 From: Sounak Gupta Date: Tue, 28 Mar 2023 12:11:15 -0700 Subject: [PATCH 15/40] dummy change to trigger container image rebuild --- docker/images/install-dsa-deps.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/images/install-dsa-deps.sh b/docker/images/install-dsa-deps.sh index b4c62ecc93..265011dd70 100755 --- a/docker/images/install-dsa-deps.sh +++ b/docker/images/install-dsa-deps.sh @@ -15,6 +15,7 @@ rm -rf idxd-config # Install DML Library git clone --recursive https://github.com/intel/DML.git cd DML +git checkout e44443c24d53552b248b9869b1b16f89cd970f52 mkdir build cd build cmake -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_BUILD_TYPE=RelWithDebInfo .. From a171f389a5ce99f39411caf221b14b3af265e314 Mon Sep 17 00:00:00 2001 From: Sounak Gupta Date: Tue, 9 May 2023 07:16:17 -0700 Subject: [PATCH 16/40] Updated the docker gcc version to 12 (#83) updated the docker gcc version to 12 --------- Co-authored-by: Matt Rae --- docker/images/centos-8streams.Dockerfile | 4 ++++ docker/run-build.sh | 3 +++ 2 files changed, 7 insertions(+) diff --git a/docker/images/centos-8streams.Dockerfile b/docker/images/centos-8streams.Dockerfile index 29752c5d98..b916ab760c 100644 --- a/docker/images/centos-8streams.Dockerfile +++ b/docker/images/centos-8streams.Dockerfile @@ -17,6 +17,10 @@ json-c-devel \ perf \ numactl +RUN dnf -y install gcc-toolset-12 +RUN echo "source /opt/rh/gcc-toolset-12/enable" >> /etc/bashrc +SHELL ["/bin/bash", "--login", "-c"] + COPY ./install-cachelib-deps.sh ./install-cachelib-deps.sh RUN ./install-cachelib-deps.sh diff --git a/docker/run-build.sh b/docker/run-build.sh index 02c7caf731..bc04819f18 100755 --- a/docker/run-build.sh +++ b/docker/run-build.sh @@ -11,6 +11,9 @@ function sudo_password() { cd .. 
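 # Editorial note: the out-of-tree build below relies on the
 # "source /opt/rh/gcc-toolset-12/enable" line this patch adds before cmake,
 # so the whole build runs with GCC 12 from the container image.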
mkdir build cd build + +source /opt/rh/gcc-toolset-12/enable + cmake ../cachelib -DBUILD_TESTS=ON -DCMAKE_INSTALL_PREFIX=/opt -DCMAKE_BUILD_TYPE=Debug sudo_password make install -j$(nproc) From 35a17e4dd297b256caa731e4504978a18c7e8a57 Mon Sep 17 00:00:00 2001 From: Sergei Vinogradov Date: Wed, 17 May 2023 13:36:42 +0200 Subject: [PATCH 17/40] NUMA bindigs support for private memory (#82) --- cachelib/allocator/CMakeLists.txt | 1 + cachelib/allocator/CacheAllocator.h | 49 ++++++++++---- cachelib/allocator/MemoryTierCacheConfig.h | 9 ++- cachelib/allocator/PrivateMemoryManager.cpp | 50 ++++++++++++++ cachelib/allocator/PrivateMemoryManager.h | 44 +++++++++++++ cachelib/cachebench/util/CacheConfig.h | 2 +- cachelib/common/CMakeLists.txt | 1 + cachelib/common/Utils.cpp | 17 +++++ cachelib/common/Utils.h | 72 +++++++++++++++++++++ cachelib/shm/PosixShmSegment.cpp | 2 + cachelib/shm/ShmCommon.h | 57 +--------------- cachelib/shm/SysVShmSegment.cpp | 17 +---- examples/single_tier_cache/main.cpp | 2 +- 13 files changed, 236 insertions(+), 87 deletions(-) create mode 100644 cachelib/allocator/PrivateMemoryManager.cpp create mode 100644 cachelib/allocator/PrivateMemoryManager.h diff --git a/cachelib/allocator/CMakeLists.txt b/cachelib/allocator/CMakeLists.txt index 6103cdc823..0f96a0cd7f 100644 --- a/cachelib/allocator/CMakeLists.txt +++ b/cachelib/allocator/CMakeLists.txt @@ -55,6 +55,7 @@ add_library (cachelib_allocator PoolOptimizeStrategy.cpp PoolRebalancer.cpp PoolResizer.cpp + PrivateMemoryManager.cpp RebalanceStrategy.cpp SlabReleaseStats.cpp TempShmMapping.cpp diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index ddf482e875..eeabb81f86 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -61,6 +61,7 @@ #include "cachelib/allocator/PoolOptimizer.h" #include "cachelib/allocator/PoolRebalancer.h" #include "cachelib/allocator/PoolResizer.h" +#include "cachelib/allocator/PrivateMemoryManager.h" #include "cachelib/allocator/ReadOnlySharedCacheView.h" #include "cachelib/allocator/Reaper.h" #include "cachelib/allocator/RebalanceStrategy.h" @@ -2185,6 +2186,8 @@ class CacheAllocator : public CacheBase { std::chrono::seconds timeout = std::chrono::seconds{0}); ShmSegmentOpts createShmCacheOpts(TierId tid); + PrivateSegmentOpts createPrivateSegmentOpts(TierId tid); + std::unique_ptr createPrivateAllocator(TierId tid); std::unique_ptr createNewMemoryAllocator(TierId tid); std::unique_ptr restoreMemoryAllocator(TierId tid); std::unique_ptr restoreCCacheManager(TierId tid); @@ -2234,7 +2237,7 @@ class CacheAllocator : public CacheBase { // @throw std::runtime_error if type is invalid std::vector> initAllocator(InitMemType type); - std::vector> createPrivateAllocator(); + std::vector> createPrivateAllocators(); std::vector> createAllocators(); std::vector> restoreAllocators(); @@ -2400,6 +2403,8 @@ class CacheAllocator : public CacheBase { // is not persisted when cache process exits. std::unique_ptr tempShm_; + std::unique_ptr privMemManager_; + std::unique_ptr shmManager_; // Deserialize data to restore cache allocator. Used only while attaching to @@ -2612,6 +2617,9 @@ CacheAllocator::CacheAllocator( tempShm_(type == InitMemType::kNone && isOnShm_ ? std::make_unique(config_.getCacheSize()) : nullptr), + privMemManager_(type == InitMemType::kNone && !isOnShm_ + ? std::make_unique() + : nullptr), shmManager_(type != InitMemType::kNone ? 
std::make_unique(config_.cacheDir, config_.isUsingPosixShm()) @@ -2674,6 +2682,16 @@ ShmSegmentOpts CacheAllocator::createShmCacheOpts(TierId tid) { return opts; } +template +PrivateSegmentOpts CacheAllocator::createPrivateSegmentOpts(TierId tid) { + PrivateSegmentOpts opts; + opts.alignment = sizeof(Slab); + auto memoryTierConfigs = config_.getMemoryTierConfigs(); + opts.memBindNumaNodes = memoryTierConfigs[tid].getMemBind(); + + return opts; +} + template size_t CacheAllocator::memoryTierSize(TierId tid) const { auto partitions = std::accumulate(config_.memoryTierConfigs.begin(), config_.memoryTierConfigs.end(), 0UL, @@ -2685,22 +2703,19 @@ size_t CacheAllocator::memoryTierSize(TierId tid) const { } template -std::vector> -CacheAllocator::createPrivateAllocator() { - std::vector> allocators; - +std::unique_ptr +CacheAllocator::createPrivateAllocator(TierId tid) { if (isOnShm_) { - allocators.emplace_back(std::make_unique( + return std::make_unique( getAllocatorConfig(config_), tempShm_->getAddr(), - config_.getCacheSize())); + memoryTierSize(tid)); } else { - allocators.emplace_back(std::make_unique( + return std::make_unique( getAllocatorConfig(config_), - config_.getCacheSize())); + privMemManager_->createMapping(config_.size, createPrivateSegmentOpts(tid)), + memoryTierSize(tid)); } - - return allocators; } template @@ -2729,6 +2744,16 @@ CacheAllocator::restoreMemoryAllocator(TierId tid) { config_.disableFullCoredump); } +template +std::vector> +CacheAllocator::createPrivateAllocators() { + std::vector> allocators; + for (int tid = 0; tid < getNumTiers(); tid++) { + allocators.emplace_back(createPrivateAllocator(tid)); + } + return allocators; +} + template std::vector> CacheAllocator::createAllocators() { @@ -2862,7 +2887,7 @@ std::vector> CacheAllocator::initAllocator( InitMemType type) { if (type == InitMemType::kNone) { - return createPrivateAllocator(); + return createPrivateAllocators(); } else if (type == InitMemType::kMemNew) { return createAllocators(); } else if (type == InitMemType::kMemAttach) { diff --git a/cachelib/allocator/MemoryTierCacheConfig.h b/cachelib/allocator/MemoryTierCacheConfig.h index 1b9477c048..ee579a5386 100644 --- a/cachelib/allocator/MemoryTierCacheConfig.h +++ b/cachelib/allocator/MemoryTierCacheConfig.h @@ -16,11 +16,14 @@ #pragma once +#include "cachelib/common/Utils.h" #include "cachelib/shm/ShmCommon.h" namespace facebook { namespace cachelib { class MemoryTierCacheConfig { + using bitmask_type = util::NumaBitMask; + public: // Creates instance of MemoryTierCacheConfig for Posix/SysV Shared memory. 
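   // Editorial sketch (values illustrative, pattern taken from the tests in
   // this patch): a two-tier shared-memory cache bound to NUMA node 0 can be
   // configured as
   //
   //   config.configureMemoryTiers({
   //       MemoryTierCacheConfig::fromShm()
   //           .setRatio(1).setMemBind(util::NumaBitMask("0")),
   //       MemoryTierCacheConfig::fromShm()
   //           .setRatio(1).setMemBind(util::NumaBitMask("0"))});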
static MemoryTierCacheConfig fromShm() { return MemoryTierCacheConfig(); } @@ -39,12 +42,12 @@ class MemoryTierCacheConfig { size_t getRatio() const noexcept { return ratio; } // Allocate memory only from specified NUMA nodes - MemoryTierCacheConfig& setMemBind(const NumaBitMask& _numaNodes) { + MemoryTierCacheConfig& setMemBind(const bitmask_type& _numaNodes) { numaNodes = _numaNodes; return *this; } - const NumaBitMask& getMemBind() const noexcept { return numaNodes; } + const bitmask_type& getMemBind() const noexcept { return numaNodes; } size_t calculateTierSize(size_t totalCacheSize, size_t partitionNum) const { // TODO: Call this method when tiers are enabled in allocator @@ -71,7 +74,7 @@ class MemoryTierCacheConfig { size_t ratio{1}; // Numa node(s) to bind the tier - NumaBitMask numaNodes; + bitmask_type numaNodes; // TODO: introduce a container for tier settings when adding support for // file-mapped memory diff --git a/cachelib/allocator/PrivateMemoryManager.cpp b/cachelib/allocator/PrivateMemoryManager.cpp new file mode 100644 index 0000000000..afcf1b2202 --- /dev/null +++ b/cachelib/allocator/PrivateMemoryManager.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cachelib/allocator/PrivateMemoryManager.h" + +#include + +namespace facebook { +namespace cachelib { + +PrivateMemoryManager::~PrivateMemoryManager() { + for (auto& entry : mappings) { + util::munmapMemory(entry.first, entry.second); + } +} + +void* PrivateMemoryManager::createMapping(size_t size, + PrivateSegmentOpts opts) { + void* addr = util::mmapAlignedZeroedMemory(opts.alignment, size); + auto guard = folly::makeGuard([&]() { + util::munmapMemory(addr, size); + mappings.erase(addr); + }); + + XDCHECK_EQ(reinterpret_cast(addr) & (opts.alignment - 1), 0ULL); + + if (!opts.memBindNumaNodes.empty()) { + util::mbindMemory(addr, size, MPOL_BIND, opts.memBindNumaNodes, 0); + } + + mappings.emplace(addr, size); + + guard.dismiss(); + return addr; +} +} // namespace cachelib +} // namespace facebook \ No newline at end of file diff --git a/cachelib/allocator/PrivateMemoryManager.h b/cachelib/allocator/PrivateMemoryManager.h new file mode 100644 index 0000000000..7880ca928a --- /dev/null +++ b/cachelib/allocator/PrivateMemoryManager.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include "cachelib/common/Utils.h" + +namespace facebook { +namespace cachelib { + +struct PrivateSegmentOpts { + size_t alignment{1}; // alignment for mapping. + util::NumaBitMask memBindNumaNodes; +}; + +class PrivateMemoryManager { + public: + PrivateMemoryManager() {} + ~PrivateMemoryManager(); + + void* createMapping(size_t size, PrivateSegmentOpts opts); + + private: + std::unordered_map mappings; +}; + +} // namespace cachelib +} // namespace facebook \ No newline at end of file diff --git a/cachelib/cachebench/util/CacheConfig.h b/cachelib/cachebench/util/CacheConfig.h index b80eea3008..bb8943c134 100644 --- a/cachelib/cachebench/util/CacheConfig.h +++ b/cachelib/cachebench/util/CacheConfig.h @@ -52,7 +52,7 @@ struct MemoryTierConfig : public JSONConfig { MemoryTierCacheConfig getMemoryTierCacheConfig() { MemoryTierCacheConfig config = MemoryTierCacheConfig::fromShm(); config.setRatio(ratio); - config.setMemBind(NumaBitMask(memBindNodes)); + config.setMemBind(util::NumaBitMask(memBindNodes)); return config; } diff --git a/cachelib/common/CMakeLists.txt b/cachelib/common/CMakeLists.txt index 927f2fa3f7..2e3aaf0493 100644 --- a/cachelib/common/CMakeLists.txt +++ b/cachelib/common/CMakeLists.txt @@ -40,6 +40,7 @@ target_link_libraries(cachelib_common PUBLIC Folly::folly_exception_tracer Folly::folly_exception_tracer_base Folly::folly_exception_counter + numa ) install(TARGETS cachelib_common diff --git a/cachelib/common/Utils.cpp b/cachelib/common/Utils.cpp index 82ec0bf72e..9b051519dc 100644 --- a/cachelib/common/Utils.cpp +++ b/cachelib/common/Utils.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -181,6 +182,22 @@ void* mmapAlignedZeroedMemory(size_t alignment, throw std::system_error(errno, std::system_category(), "Cannot mmap"); } +void munmapMemory(void* addr, size_t size) { munmap(addr, size); } + +void mbindMemory(void* addr, + unsigned long len, + int mode, + const NumaBitMask& mask, + unsigned int flags) { + auto nodesMask = mask.getNativeBitmask(); + + long ret = mbind(addr, len, mode, nodesMask->maskp, nodesMask->size, flags); + if (ret != 0) { + util::throwSystemError( + errno, folly::sformat("mbind() failed: {}", std::strerror(errno))); + } +} + void setMaxLockMemory(uint64_t bytes) { struct rlimit rlim { bytes, bytes diff --git a/cachelib/common/Utils.h b/cachelib/common/Utils.h index 3d8acf3654..3a045c10ba 100644 --- a/cachelib/common/Utils.h +++ b/cachelib/common/Utils.h @@ -18,6 +18,8 @@ #include #include +#include +#include #include @@ -35,6 +37,57 @@ namespace facebook { namespace cachelib { namespace util { +class NumaBitMask { + public: + using native_bitmask_type = struct bitmask*; + + NumaBitMask() { nodesMask = numa_allocate_nodemask(); } + + NumaBitMask(const NumaBitMask& other) { + nodesMask = numa_allocate_nodemask(); + copy_bitmask_to_bitmask(other.nodesMask, nodesMask); + } + + NumaBitMask(NumaBitMask&& other) { + nodesMask = other.nodesMask; + other.nodesMask = nullptr; + } + + NumaBitMask(const std::string& str) { + nodesMask = numa_parse_nodestring_all(str.c_str()); + } + + ~NumaBitMask() { + if (nodesMask) { + numa_bitmask_free(nodesMask); + } + } + + constexpr NumaBitMask& operator=(const NumaBitMask& other) { + if (this != &other) { + if (!nodesMask) { + nodesMask = numa_allocate_nodemask(); + } + copy_bitmask_to_bitmask(other.nodesMask, nodesMask); + } + return *this; + } + + native_bitmask_type getNativeBitmask() const noexcept { return nodesMask; } + + NumaBitMask& 
setBit(unsigned int n) { + numa_bitmask_setbit(nodesMask, n); + return *this; + } + + bool empty() const noexcept { + return numa_bitmask_equal(numa_no_nodes_ptr, nodesMask) == 1; + } + + protected: + native_bitmask_type nodesMask = nullptr; +}; + // A wrapper class for functions to collect counters. // It can be initialized by either // 1. folly::StringPiece, double -> void, or @@ -295,6 +348,25 @@ void* mmapAlignedZeroedMemory(size_t alignment, size_t numBytes, bool noAccess = false); +// destroy the mapping created by mmapAlignedZeroedMemory +// +// @param addr the pointer to the memory to unmap +// @param size size of the memory region +void munmapMemory(void* addr, size_t size); + +// binds memory to the NUMA nodes specified by nmask. +// +// @param addr the pointer to the memory to bind. +// @param len length of the memory. +// @param mode mode supported by mmap call +// @param mask mask specifies node ids +// @param flags flags supported by mmap call +void mbindMemory(void* addr, + unsigned long len, + int mode, + const NumaBitMask& mask, + unsigned int flags); + // get the number of pages in the range which are resident in the process. // // @param mem memory start which is page aligned diff --git a/cachelib/shm/PosixShmSegment.cpp b/cachelib/shm/PosixShmSegment.cpp index a33a052688..bf197aa439 100644 --- a/cachelib/shm/PosixShmSegment.cpp +++ b/cachelib/shm/PosixShmSegment.cpp @@ -31,6 +31,8 @@ namespace facebook { namespace cachelib { +using NumaBitMask = util::NumaBitMask; + constexpr static mode_t kRWMode = 0666; typedef struct stat stat_t; diff --git a/cachelib/shm/ShmCommon.h b/cachelib/shm/ShmCommon.h index 8db8707515..bc451c46d1 100644 --- a/cachelib/shm/ShmCommon.h +++ b/cachelib/shm/ShmCommon.h @@ -15,8 +15,6 @@ */ #pragma once -#include -#include #include #include #include @@ -30,6 +28,8 @@ #include #pragma GCC diagnostic pop +#include "cachelib/common/Utils.h" + /* On Mac OS / FreeBSD, mmap(2) syscall does not support these flags */ #ifndef MAP_LOCKED #define MAP_LOCKED 0 @@ -72,62 +72,11 @@ enum PageSizeT { ONE_GB, }; -class NumaBitMask { - public: - using native_bitmask_type = struct bitmask*; - - NumaBitMask() { nodesMask = numa_allocate_nodemask(); } - - NumaBitMask(const NumaBitMask& other) { - nodesMask = numa_allocate_nodemask(); - copy_bitmask_to_bitmask(other.nodesMask, nodesMask); - } - - NumaBitMask(NumaBitMask&& other) { - nodesMask = other.nodesMask; - other.nodesMask = nullptr; - } - - NumaBitMask(const std::string& str) { - nodesMask = numa_parse_nodestring_all(str.c_str()); - } - - ~NumaBitMask() { - if (nodesMask) { - numa_bitmask_free(nodesMask); - } - } - - constexpr NumaBitMask& operator=(const NumaBitMask& other) { - if (this != &other) { - if (!nodesMask) { - nodesMask = numa_allocate_nodemask(); - } - copy_bitmask_to_bitmask(other.nodesMask, nodesMask); - } - return *this; - } - - native_bitmask_type getNativeBitmask() const noexcept { return nodesMask; } - - NumaBitMask& setBit(unsigned int n) { - numa_bitmask_setbit(nodesMask, n); - return *this; - } - - bool empty() const noexcept { - return numa_bitmask_equal(numa_no_nodes_ptr, nodesMask) == 1; - } - - protected: - native_bitmask_type nodesMask = nullptr; -}; - struct ShmSegmentOpts { PageSizeT pageSize{PageSizeT::NORMAL}; bool readOnly{false}; size_t alignment{1}; // alignment for mapping. 
- NumaBitMask memBindNumaNodes; + util::NumaBitMask memBindNumaNodes; explicit ShmSegmentOpts(PageSizeT p) : pageSize(p) {} explicit ShmSegmentOpts(PageSizeT p, bool ro) : pageSize(p), readOnly(ro) {} diff --git a/cachelib/shm/SysVShmSegment.cpp b/cachelib/shm/SysVShmSegment.cpp index 43c1755bbf..d70762ad98 100644 --- a/cachelib/shm/SysVShmSegment.cpp +++ b/cachelib/shm/SysVShmSegment.cpp @@ -191,21 +191,6 @@ void shmCtlImpl(int shmid, int cmd, shmid_ds* buf) { } } -void mbindImpl(void* addr, - unsigned long len, - int mode, - - const NumaBitMask& memBindNumaNodes, - unsigned int flags) { - auto nodesMask = memBindNumaNodes.getNativeBitmask(); - - long ret = mbind(addr, len, mode, nodesMask->maskp, nodesMask->size, flags); - if (ret != 0) { - util::throwSystemError( - errno, folly::sformat("mbind() failed: {}", std::strerror(errno))); - } -} - } // namespace detail void ensureSizeforHugePage(size_t size) { @@ -302,7 +287,7 @@ void SysVShmSegment::memBind(void* addr) const { if (opts_.memBindNumaNodes.empty()) { return; } - detail::mbindImpl(addr, getSize(), MPOL_BIND, opts_.memBindNumaNodes, 0); + util::mbindMemory(addr, getSize(), MPOL_BIND, opts_.memBindNumaNodes, 0); } void SysVShmSegment::markForRemoval() { diff --git a/examples/single_tier_cache/main.cpp b/examples/single_tier_cache/main.cpp index de6373622c..9c19dfeea9 100644 --- a/examples/single_tier_cache/main.cpp +++ b/examples/single_tier_cache/main.cpp @@ -25,7 +25,7 @@ using CacheConfig = typename Cache::Config; using CacheKey = typename Cache::Key; using CacheReadHandle = typename Cache::ReadHandle; using MemoryTierCacheConfig = typename cachelib::MemoryTierCacheConfig; -using NumaBitMask = typename cachelib::NumaBitMask; +using NumaBitMask = typename cachelib::util::NumaBitMask; // Global cache object and a default cache pool std::unique_ptr gCache_; From 46d168cb9b40ef2cf6b309becbcad35b4ffd035e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Chor=C4=85=C5=BCewicz?= Date: Tue, 6 Jun 2023 09:05:29 -0700 Subject: [PATCH 18/40] Do not run cachelib-centos-8-5 on PRs (#85) --- .github/workflows/build-cachelib-centos-8-5.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/build-cachelib-centos-8-5.yml b/.github/workflows/build-cachelib-centos-8-5.yml index 5dade56439..fcb3129b22 100644 --- a/.github/workflows/build-cachelib-centos-8-5.yml +++ b/.github/workflows/build-cachelib-centos-8-5.yml @@ -13,11 +13,6 @@ # limitations under the License. name: build-cachelib-centos-8.5 on: - push: - tags: - - 'v*' - pull_request: - workflow_dispatch: schedule: - cron: '0 9 * * *' jobs: From 7d065316ea6d9b11fa3430072a1c82cd438611cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Igor=20Chor=C4=85=C5=BCewicz?= Date: Thu, 8 Jun 2023 12:24:04 -0700 Subject: [PATCH 19/40] Add option to insert items to first free tier (#87) instead of always inserting to topmost tier --- cachelib/allocator/CacheAllocator.h | 32 ++++++++++++++++++----- cachelib/allocator/CacheAllocatorConfig.h | 15 +++++++++++ cachelib/cachebench/cache/Cache.h | 2 ++ cachelib/cachebench/util/CacheConfig.cpp | 2 ++ cachelib/cachebench/util/CacheConfig.h | 2 ++ 5 files changed, 47 insertions(+), 6 deletions(-) diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index eeabb81f86..698976cc89 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -1528,13 +1528,19 @@ class CacheAllocator : public CacheBase { // For description see allocateInternal. 
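+  // (allocateInternal tries each tier in order and calls this per-tier
+  //  helper until one of the tiers succeeds)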
   //
   // @param tid id a memory tier
+  // @param fromBgThread whether this function was called from a bg
+  //        thread - this is used to decide whether bg thread should
+  //        be woken in case there is no free memory
+  // @param evict whether to evict an item from tier tid in case there
+  //        is not enough memory
   WriteHandle allocateInternalTier(TierId tid,
                                    PoolId id,
                                    Key key,
                                    uint32_t size,
                                    uint32_t creationTime,
                                    uint32_t expiryTime,
-                                   bool fromBgThread);
+                                   bool fromBgThread,
+                                   bool evict);
 
   // Allocate a chained item
   //
@@ -2977,7 +2983,8 @@ CacheAllocator::allocateInternalTier(TierId tid,
                                      uint32_t size,
                                      uint32_t creationTime,
                                      uint32_t expiryTime,
-                                     bool fromBgThread) {
+                                     bool fromBgThread,
+                                     bool evict) {
   util::LatencyTracker tracker{stats().allocateLatency_};
 
   SCOPE_FAIL { stats_.invalidAllocs.inc(); };
@@ -3002,6 +3009,9 @@
   }
 
   if (memory == nullptr) {
+    if (!evict) {
+      return {};
+    }
     memory = findEviction(tid, pid, cid);
   }
 
@@ -3051,7 +3061,9 @@ CacheAllocator::allocateInternal(PoolId pid,
                                  bool fromBgThread) {
   auto tid = 0; /* TODO: consult admission policy */
   for(TierId tid = 0; tid < getNumTiers(); ++tid) {
-    auto handle = allocateInternalTier(tid, pid, key, size, creationTime, expiryTime, fromBgThread);
+    bool evict = !config_.insertToFirstFreeTier || tid == getNumTiers() - 1;
+    auto handle = allocateInternalTier(tid, pid, key, size, creationTime,
+                                       expiryTime, fromBgThread, evict);
     if (handle) return handle;
   }
   return {};
@@ -4220,13 +4232,16 @@ CacheAllocator::tryEvictToNextMemoryTier(
   TierId nextTier = tid; // TODO - calculate this based on some admission policy
   while (++nextTier < getNumTiers()) { // try to evict down to the next memory tiers
+    // always evict item from the nextTier to make room for new item
+    bool evict = true;
     // allocateInternal might trigger another eviction
     auto newItemHdl = allocateInternalTier(nextTier, pid,
                        item.getKey(),
                        item.getSize(),
                        item.getCreationTime(),
                        item.getExpiryTime(),
-                       fromBgThread);
+                       fromBgThread,
+                       evict);
 
     if (newItemHdl) {
@@ -4263,13 +4278,16 @@ CacheAllocator::tryPromoteToNextMemoryTier(
   auto toPromoteTier = nextTier - 1;
   --nextTier;
 
+  // always evict item from the toPromoteTier to make room for new item
+  bool evict = true;
   // allocateInternal might trigger another eviction
   auto newItemHdl = allocateInternalTier(toPromoteTier, pid,
                      item.getKey(),
                      item.getSize(),
                      item.getCreationTime(),
                      item.getExpiryTime(),
-                     fromBgThread);
+                     fromBgThread,
+                     true);
 
   if (newItemHdl) {
     XDCHECK_EQ(newItemHdl->getSize(), item.getSize());
@@ -5608,6 +5626,7 @@ CacheAllocator::allocateNewItemForOldItem(const Item& oldItem) {
   const auto tid = getTierId(oldItem);
   const auto allocInfo =
       allocator_[tid]->getAllocInfo(static_cast(&oldItem));
+  bool evict = !config_.insertToFirstFreeTier || tid == getNumTiers() - 1;
 
   // Set up the destination for the move. Since oldItem would have the moving
   // bit set, it won't be picked for eviction.
@@ -5617,7 +5636,8 @@ CacheAllocator::allocateNewItemForOldItem(const Item& oldItem) {
                        oldItem.getSize(),
                        oldItem.getCreationTime(),
                        oldItem.getExpiryTime(),
-                       false);
+                       false,
+                       evict);
   if (!newItemHdl) {
     return {};
   }
diff --git a/cachelib/allocator/CacheAllocatorConfig.h b/cachelib/allocator/CacheAllocatorConfig.h
index 227f2e5354..46d48a1feb 100644
--- a/cachelib/allocator/CacheAllocatorConfig.h
+++ b/cachelib/allocator/CacheAllocatorConfig.h
@@ -313,6 +313,9 @@ class CacheAllocatorConfig {
   // Library team if you find yourself customizing this.
CacheAllocatorConfig& setThrottlerConfig(util::Throttler::Config config); + // Insert items to first free memory tier + CacheAllocatorConfig& enableInsertToFirstFreeTier(); + // Passes in a callback to initialize an event tracker when the allocator // starts CacheAllocatorConfig& setEventTracker(EventTrackerSharedPtr&&); @@ -539,6 +542,11 @@ class CacheAllocatorConfig { // ABOVE are the config for various cache workers // + // if turned off, always insert new elements to topmost memory tier. + // if turned on, insert new element to first free memory tier or evict memory + // from the bottom one if memory cache is full + bool insertToFirstFreeTier = false; + // the number of tries to search for an item to evict // 0 means it's infinite unsigned int evictionSearchTries{50}; @@ -673,6 +681,12 @@ class CacheAllocatorConfig { {MemoryTierCacheConfig::fromShm().setRatio(1)}}; }; +template +CacheAllocatorConfig& CacheAllocatorConfig::enableInsertToFirstFreeTier() { + insertToFirstFreeTier = true; + return *this; +} + template CacheAllocatorConfig& CacheAllocatorConfig::setCacheName( const std::string& _cacheName) { @@ -1254,6 +1268,7 @@ std::map CacheAllocatorConfig::serialize() const { configMap["nvmAdmissionMinTTL"] = std::to_string(nvmAdmissionMinTTL); configMap["delayCacheWorkersStart"] = delayCacheWorkersStart ? "true" : "false"; + configMap["insertToFirstFreeTier"] = std::to_string(insertToFirstFreeTier); mergeWithPrefix(configMap, throttleConfig.serialize(), "throttleConfig"); mergeWithPrefix(configMap, chainedItemAccessConfig.serialize(), diff --git a/cachelib/cachebench/cache/Cache.h b/cachelib/cachebench/cache/Cache.h index 14e47161fc..cccf1014d2 100644 --- a/cachelib/cachebench/cache/Cache.h +++ b/cachelib/cachebench/cache/Cache.h @@ -578,6 +578,8 @@ Cache::Cache(const CacheConfig& config, allocatorConfig_.configureMemoryTiers(config_.memoryTierConfigs); } + allocatorConfig_.insertToFirstFreeTier = config_.insertToFirstFreeTier; + auto cleanupGuard = folly::makeGuard([&] { if (!nvmCacheFilePath_.empty()) { util::removePath(nvmCacheFilePath_); diff --git a/cachelib/cachebench/util/CacheConfig.cpp b/cachelib/cachebench/util/CacheConfig.cpp index bcf5ea7e70..506dc289be 100644 --- a/cachelib/cachebench/util/CacheConfig.cpp +++ b/cachelib/cachebench/util/CacheConfig.cpp @@ -49,6 +49,8 @@ CacheConfig::CacheConfig(const folly::dynamic& configJson) { JSONSetVal(configJson, tryLockUpdate); JSONSetVal(configJson, lruIpSpec); JSONSetVal(configJson, useCombinedLockForIterators); + + JSONSetVal(configJson, insertToFirstFreeTier); JSONSetVal(configJson, lru2qHotPct); JSONSetVal(configJson, lru2qColdPct); diff --git a/cachelib/cachebench/util/CacheConfig.h b/cachelib/cachebench/util/CacheConfig.h index bb8943c134..23b9df3ea1 100644 --- a/cachelib/cachebench/util/CacheConfig.h +++ b/cachelib/cachebench/util/CacheConfig.h @@ -97,6 +97,8 @@ struct CacheConfig : public JSONConfig { bool lruUpdateOnRead{true}; bool tryLockUpdate{false}; bool useCombinedLockForIterators{true}; + + bool insertToFirstFreeTier{false}; // LRU param uint64_t lruIpSpec{0}; From 1521efe3ae3b9b238a8d343e73be5d5858990428 Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Wed, 28 Jun 2023 13:12:32 -0400 Subject: [PATCH 20/40] Chained item movement between tiers - sync on the parent item (#84) * Chained item movement between tiers - currently sync on the parent item for moving. - updated tests accordingly, note that we can no longer swap parent item if chained item is being moved for slab release. 
* added some debug checks around chained item checks

* fix slab release behavior if no moveCb is set
---
 cachelib/allocator/CacheAllocator.h | 230 ++++++++++++----
 cachelib/allocator/tests/BaseAllocatorTest.h | 9 +-
 .../allocator/tests/RebalanceStrategyTest.cpp | 3 +
 cachelib/allocator/tests/RefCountTest.cpp | 10 -
 .../allocator/tests/SimpleRebalancingTest.h | 2 +-
 cachelib/cachebench/runner/CacheStressor.h | 6 +-
 .../test_configs/small_moving_bg.json | 35 +++
 run_tests.sh | 1 +
 8 files changed, 225 insertions(+), 71 deletions(-)
 create mode 100644 cachelib/cachebench/test_configs/small_moving_bg.json

diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h
index 698976cc89..bd60c91d29 100644
--- a/cachelib/allocator/CacheAllocator.h
+++ b/cachelib/allocator/CacheAllocator.h
@@ -1558,6 +1558,26 @@ class CacheAllocator : public CacheBase {
   // if the item is invalid
   WriteHandle allocateChainedItemInternal(const Item& parent, uint32_t size);
 
+  // Allocate a chained item to a specific tier
+  //
+  // The resulting chained item does not have a parent item yet
+  // and if we fail to link to the chain for any reason
+  // the chained item will be freed once the handle is dropped.
+  //
+  // The parent item parameter here is mainly used to find the
+  // correct pool to allocate memory for this chained item
+  //
+  // @param parent parent item
+  // @param size   the size for the chained allocation
+  // @param tid    the tier to allocate on
+  //
+  // @return handle to the chained allocation
+  // @throw std::invalid_argument if the size requested is invalid or
+  //        if the item is invalid
+  WriteHandle allocateChainedItemInternalTier(const Item& parent,
+                                              uint32_t size,
+                                              TierId tid);
+
   // Given an existing item, allocate a new one for the
   // existing one to later be moved into.
   //
@@ -1662,9 +1682,8 @@
   // will be unmarked as having chained allocations. Parent will not be null
   // after calling this API.
   //
-  // Parent and NewParent must be valid handles to items with same key and
-  // parent must have chained items and parent handle must be the only
-  // outstanding handle for parent. New parent must be without any chained item
+  // NewParent must be a valid handle to an item with the same key as Parent,
+  // and Parent must have chained items. New parent must be without any chained item
   // handles.
   //
   // Chained item lock for the parent's key needs to be held in exclusive mode.
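
For orientation, the caller-facing chained-item flow that these internal
helpers serve looks roughly like the sketch below. It is illustrative only:
"cache" and "pid" stand for an already-configured allocator instance and an
existing pool (assumed names), while allocate(), allocateChainedItem(),
addChainedItem() and insertOrReplace() are the public CacheAllocator entry
points; the per-tier selection added in this patch happens inside them.

    // The parent allocation goes through allocateInternalTier(); the
    // chained child goes through allocateChainedItemInternalTier().
    auto parent = cache->allocate(pid, "a-key", 1024);
    if (parent) {
      auto chained = cache->allocateChainedItem(parent, 512);
      if (chained) {
        // fill the child's payload before attaching it to the parent
        std::memset(chained->getMemory(), 0, 512);
        cache->addChainedItem(parent, std::move(chained));
      }
      // publish the parent together with its chain
      cache->insertOrReplace(parent);
    }
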
@@ -3092,6 +3111,19 @@ template typename CacheAllocator::WriteHandle CacheAllocator::allocateChainedItemInternal(const Item& parent, uint32_t size) { + auto tid = 0; /* TODO: consult admission policy */ + for(TierId tid = 0; tid < getNumTiers(); ++tid) { + auto handle = allocateChainedItemInternalTier(parent, size, tid); + if (handle) return handle; + } + return {}; +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::allocateChainedItemInternalTier(const Item& parent, + uint32_t size, + TierId tid) { util::LatencyTracker tracker{stats().allocateLatency_}; SCOPE_FAIL { stats_.invalidAllocs.inc(); }; @@ -3099,14 +3131,10 @@ CacheAllocator::allocateChainedItemInternal(const Item& parent, // number of bytes required for this item const auto requiredSize = ChainedItem::getRequiredSize(size); - // this is correct for now as we can - // assume the parent and chained item - // will reside in the same tier until - // they are moved - auto tid = getTierId(parent); - - const auto pid = allocator_[tid]->getAllocInfo(parent.getMemory()).poolId; - const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize); + //this is okay because pools/classes are duplicated among the tiers + auto ptid = getTierId(parent); + const auto pid = allocator_[ptid]->getAllocInfo(parent.getMemory()).poolId; + const auto cid = allocator_[ptid]->getAllocationClassId(pid, requiredSize); // TODO: per-tier? Right now stats_ are not used in any public periodic // worker @@ -3477,7 +3505,10 @@ CacheAllocator::releaseBackToAllocator(Item& it, // memory for a chained item but has decided not to insert the chained item // to a parent item and instead drop the chained item handle. In this case, // we free the chained item directly without calling remove callback. - if (it.isChainedItem()) { + // + // Except if we are moving a chained item between tiers - + // then it == toRecycle and we will want the normal recycle path + if (it.isChainedItem() && &it != toRecycle) { if (toRecycle) { throw std::runtime_error( folly::sformat("Can not recycle a chained item {}, toRecyle", @@ -3550,7 +3581,7 @@ CacheAllocator::releaseBackToAllocator(Item& it, while (head) { auto next = head->getNext(compressor_); - + const auto tid = getTierId(head); const auto childInfo = allocator_[tid]->getAllocInfo(static_cast(head)); (*stats_.fragmentationSize)[tid][childInfo.poolId][childInfo.classId].sub( @@ -3890,14 +3921,19 @@ bool CacheAllocator::moveRegularItem(Item& oldItem, newItemHdl->markNvmClean(); } - // Execute the move callback. We cannot make any guarantees about the - // consistency of the old item beyond this point, because the callback can - // do more than a simple memcpy() e.g. update external references. If there - // are any remaining handles to the old item, it is the caller's - // responsibility to invalidate them. The move can only fail after this - // statement if the old item has been removed or replaced, in which case it - // should be fine for it to be left in an inconsistent state. - config_.moveCb(oldItem, *newItemHdl, nullptr); + if (config_.moveCb) { + // Execute the move callback. We cannot make any guarantees about the + // consistency of the old item beyond this point, because the callback can + // do more than a simple memcpy() e.g. update external references. If there + // are any remaining handles to the old item, it is the caller's + // responsibility to invalidate them. 
The move can only fail after this + // statement if the old item has been removed or replaced, in which case it + // should be fine for it to be left in an inconsistent state. + config_.moveCb(oldItem, *newItemHdl, nullptr); + } else { + std::memcpy(newItemHdl->getMemory(), oldItem.getMemory(), + oldItem.getSize()); + } // Adding the item to mmContainer has to succeed since no one can remove the // item @@ -3945,14 +3981,19 @@ bool CacheAllocator::moveChainedItem(ChainedItem& oldItem, auto parentPtr = &parentItem; - // Execute the move callback. We cannot make any guarantees about the - // consistency of the old item beyond this point, because the callback can - // do more than a simple memcpy() e.g. update external references. If there - // are any remaining handles to the old item, it is the caller's - // responsibility to invalidate them. The move can only fail after this - // statement if the old item has been removed or replaced, in which case it - // should be fine for it to be left in an inconsistent state. - config_.moveCb(oldItem, *newItemHdl, parentPtr); + if (config_.moveCb) { + // Execute the move callback. We cannot make any guarantees about the + // consistency of the old item beyond this point, because the callback can + // do more than a simple memcpy() e.g. update external references. If there + // are any remaining handles to the old item, it is the caller's + // responsibility to invalidate them. The move can only fail after this + // statement if the old item has been removed or replaced, in which case it + // should be fine for it to be left in an inconsistent state. + config_.moveCb(oldItem, *newItemHdl, parentPtr); + } else { + std::memcpy(newItemHdl->getMemory(), oldItem.getMemory(), + oldItem.getSize()); + } // Replace the new item in the position of the old one before both in the // parent's chain and the MMContainer. @@ -3996,12 +4037,16 @@ CacheAllocator::getNextCandidate(TierId tid, unsigned int& searchTries) { typename NvmCacheT::PutToken token; Item* toRecycle = nullptr; + Item* toRecycleParent = nullptr; Item* candidate = nullptr; bool isExpired = false; + bool chainedItem = false; auto& mmContainer = getMMContainer(tid, pid, cid); bool lastTier = tid+1 >= getNumTiers(); - mmContainer.withEvictionIterator([this, tid, pid, cid, &candidate, &toRecycle, + mmContainer.withEvictionIterator([this, tid, pid, cid, &candidate, + &toRecycle, &toRecycleParent, + &chainedItem, &searchTries, &mmContainer, &lastTier, &isExpired, &token](auto&& itr) { if (!itr) { @@ -4017,11 +4062,38 @@ CacheAllocator::getNextCandidate(TierId tid, (*stats_.evictionAttempts)[tid][pid][cid].inc(); auto* toRecycle_ = itr.get(); - auto* candidate_ = - toRecycle_->isChainedItem() + bool chainedItem_ = toRecycle_->isChainedItem(); + Item* toRecycleParent_ = chainedItem_ ? &toRecycle_->asChainedItem().getParentItem(compressor_) - : toRecycle_; - + : nullptr; + // in order to safely check if the expected parent (toRecycleParent_) matches + // the current parent on the chained item, we need to take the chained + // item lock so we are sure that nobody else will be editing the chain + auto l_ = chainedItem_ + ? chainedItemLocks_.tryLockExclusive(toRecycleParent_->getKey()) + : decltype(chainedItemLocks_.tryLockExclusive(toRecycle_->getKey()))(); + + if (chainedItem_ && + ( !l_ || &toRecycle_->asChainedItem().getParentItem(compressor_) + != toRecycleParent_) ) { + // Fail moving if we either couldn't acquire the chained item lock, + // or if the parent had already been replaced in the meanwhile. 
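+          // In either case this candidate is no longer safe to migrate,
+          // so skip it and keep scanning.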
+          ++itr;
+          continue;
+        }
+        Item* candidate_;
+        Item* syncItem_;
+        //sync on the parent item for chained items to move to next tier
+        if (!lastTier && chainedItem_) {
+          syncItem_ = toRecycleParent_;
+          candidate_ = toRecycle_;
+        } else if (lastTier && chainedItem_) {
+          candidate_ = toRecycleParent_;
+          syncItem_ = toRecycleParent_;
+        } else {
+          candidate_ = toRecycle_;
+          syncItem_ = toRecycle_;
+        }
         // if it's last tier, the item will be evicted
         // need to create put token before marking it exclusive
         const bool evictToNvmCache = lastTier && shouldWriteToNvmCache(*candidate_);
@@ -4036,8 +4108,8 @@
         }
 
         auto markedForEviction = (lastTier || candidate_->isExpired()) ?
-                                 candidate_->markForEviction() :
-                                 candidate_->markMoving();
+                                 syncItem_->markForEviction() :
+                                 syncItem_->markMoving();
         if (!markedForEviction) {
           if (candidate_->hasChainedItem()) {
             stats_.evictFailParentAC.inc();
@@ -4048,7 +4120,9 @@
           continue;
         }
 
-        XDCHECK(candidate_->isMoving() || candidate_->isMarkedForEviction());
+        XDCHECK(syncItem_->isMoving() || syncItem_->isMarkedForEviction());
+        toRecycleParent = toRecycleParent_;
+        chainedItem = chainedItem_;
         // markForEviction to make sure no other thread is evicting the item
         // nor holding a handle to that item if this is last tier
         // since we won't be moving the item to the next tier
@@ -4056,15 +4130,11 @@
         candidate = candidate_;
         isExpired = candidate_->isExpired();
        token = std::move(token_);
-
-        // Check if parent changed for chained items - if yes, we cannot
-        // remove the child from the mmContainer as we will not be evicting
-        // it. We could abort right here, but we need to cleanup in case
-        // unmarkForEviction() returns 0 - so just go through normal path.
-        if (!toRecycle_->isChainedItem() ||
-            &toRecycle->asChainedItem().getParentItem(compressor_) == candidate) {
-          mmContainer.remove(itr);
+        if (chainedItem) {
+          XDCHECK(l_);
+          XDCHECK_EQ(toRecycleParent,
+                     &toRecycle_->asChainedItem().getParentItem(compressor_));
         }
+        mmContainer.remove(itr);
         return;
       }
     });
@@ -4075,11 +4145,18 @@
 
   XDCHECK(toRecycle);
   XDCHECK(candidate);
-  XDCHECK(candidate->isMoving() || candidate->isMarkedForEviction());
 
   auto evictedToNext = (lastTier || isExpired) ? nullptr
               : tryEvictToNextMemoryTier(*candidate, false);
   if (!evictedToNext) {
+    //failed to move a chained item - so evict the entire chain
+    if (candidate->isChainedItem()) {
+      //candidate should be the parent now
+      XDCHECK(toRecycleParent->isMoving());
+      XDCHECK_EQ(candidate, toRecycle);
+      candidate = toRecycleParent; //but now we evict the chain and in
+                                   //doing so recycle the child
+    }
     //if insertOrReplace was called during move
     //then candidate will not be accessible (failed replace during tryEvict)
    // - therefore this was why we failed to
@@ -4125,7 +4202,34 @@ CacheAllocator::getNextCandidate(TierId tid,
   XDCHECK(candidate->getKey() == evictedToNext->getKey());
 
   (*stats_.numWritebacks)[tid][pid][cid].inc();
-  wakeUpWaiters(candidate->getKey(), std::move(evictedToNext));
+  if (chainedItem) {
+    XDCHECK(toRecycleParent->isMoving());
+    XDCHECK_EQ(evictedToNext->getRefCount(), 2u);
+    (*stats_.chainedItemEvictions)[tid][pid][cid].inc();
+    // check if by releasing the item we intend to, we actually
+    // recycle the candidate.
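+    // (passing toRecycle asks releaseBackToAllocator to report whether that
+    //  exact allocation was freed for reuse; kRecycled confirms it)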
+ auto ret = releaseBackToAllocator(*candidate, RemoveContext::kEviction, + /* isNascent */ false, toRecycle); + XDCHECK_EQ(ret,ReleaseRes::kRecycled); + evictedToNext.reset(); //once we unmark moving threads will try and alloc, drop + //the handle now - and refcount will drop to 1 + auto ref = toRecycleParent->unmarkMoving(); + if (UNLIKELY(ref == 0)) { + wakeUpWaiters(toRecycleParent->getKey(),{}); + const auto res = + releaseBackToAllocator(*toRecycleParent, RemoveContext::kNormal, false); + XDCHECK(res == ReleaseRes::kReleased); + } else { + auto parentHandle = acquire(toRecycleParent); + if (parentHandle) { + wakeUpWaiters(toRecycleParent->getKey(),std::move(parentHandle)); + } //in case where parent handle is null that means some other thread + // would have called wakeUpWaiters with null handle and released + // parent back to allocator + } + } else { + wakeUpWaiters(candidate->getKey(), std::move(evictedToNext)); + } } XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving()); @@ -4226,31 +4330,49 @@ template typename CacheAllocator::WriteHandle CacheAllocator::tryEvictToNextMemoryTier( TierId tid, PoolId pid, Item& item, bool fromBgThread) { - XDCHECK(item.isMoving()); - XDCHECK(item.getRefCount() == 0); - if(item.hasChainedItem()) return WriteHandle{}; // TODO: We do not support ChainedItem yet TierId nextTier = tid; // TODO - calculate this based on some admission policy while (++nextTier < getNumTiers()) { // try to evict down to the next memory tiers // always evict item from the nextTier to make room for new item bool evict = true; // allocateInternal might trigger another eviction - auto newItemHdl = allocateInternalTier(nextTier, pid, + WriteHandle newItemHdl{}; + Item* parentItem; + bool chainedItem = false; + if(item.isChainedItem()) { + chainedItem = true; + parentItem = &item.asChainedItem().getParentItem(compressor_); + XDCHECK(parentItem->isMoving()); + XDCHECK(item.isChainedItem() && item.getRefCount() == 1); + XDCHECK_EQ(0, parentItem->getRefCount()); + newItemHdl = allocateChainedItemInternalTier(*parentItem, + item.getSize(), + nextTier); + } else { + // this assert can fail if parent changed + XDCHECK(item.isMoving()); + XDCHECK(item.getRefCount() == 0); + newItemHdl = allocateInternalTier(nextTier, pid, item.getKey(), item.getSize(), item.getCreationTime(), item.getExpiryTime(), fromBgThread, evict); + } if (newItemHdl) { - - bool moveSuccess = moveRegularItem(item, newItemHdl); + bool moveSuccess = chainedItem + ? moveChainedItem(item.asChainedItem(), newItemHdl) + : moveRegularItem(item, newItemHdl); if (!moveSuccess) { return WriteHandle{}; } XDCHECK_EQ(newItemHdl->getSize(), item.getSize()); - item.unmarkMoving(); + if (!chainedItem) { // TODO: do we need it? 
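+        // for chained items the parent, not the child, holds the moving bit,
+        // so only regular items are unmarked here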
+ XDCHECK_EQ(newItemHdl->getKey(),item.getKey()); + item.unmarkMoving(); + } return newItemHdl; } else { return WriteHandle{}; diff --git a/cachelib/allocator/tests/BaseAllocatorTest.h b/cachelib/allocator/tests/BaseAllocatorTest.h index ac3d7bbccd..16d3c03ccd 100644 --- a/cachelib/allocator/tests/BaseAllocatorTest.h +++ b/cachelib/allocator/tests/BaseAllocatorTest.h @@ -4916,7 +4916,7 @@ class BaseAllocatorTest : public AllocatorTest { std::memcpy(newItem.getMemory(), oldItem.getMemory(), oldItem.getSize()); ++numMoves; - }); + }, {}, 1000000 /* lots of moving tries */); AllocatorT alloc(config); const size_t numBytes = alloc.getCacheMemoryStats().ramCacheSize; @@ -4957,7 +4957,7 @@ class BaseAllocatorTest : public AllocatorTest { } /* sleep override */ - std::this_thread::sleep_for(std::chrono::milliseconds(100)); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); } }; @@ -4965,7 +4965,7 @@ class BaseAllocatorTest : public AllocatorTest { auto releaseFn = [&] { for (unsigned int i = 0; i < 5;) { /* sleep override */ - std::this_thread::sleep_for(std::chrono::milliseconds(100)); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); ClassId cid = static_cast(i); alloc.releaseSlab(pid, cid, SlabReleaseMode::kRebalance); @@ -5125,7 +5125,7 @@ class BaseAllocatorTest : public AllocatorTest { auto releaseFn = [&] { for (unsigned int i = 0; i < 5;) { /* sleep override */ - std::this_thread::sleep_for(std::chrono::milliseconds(100)); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); ClassId cid = static_cast(i); alloc.releaseSlab(pid, cid, SlabReleaseMode::kRebalance); @@ -5968,7 +5968,6 @@ class BaseAllocatorTest : public AllocatorTest { EXPECT_EQ(nullptr, util::allocateAccessible(alloc, poolId, "large", largeSize)); - std::this_thread::sleep_for(std::chrono::seconds{1}); // trigger the slab rebalance EXPECT_EQ(nullptr, util::allocateAccessible(alloc, poolId, "large", largeSize)); diff --git a/cachelib/allocator/tests/RebalanceStrategyTest.cpp b/cachelib/allocator/tests/RebalanceStrategyTest.cpp index a11ab234e3..2843cec883 100644 --- a/cachelib/allocator/tests/RebalanceStrategyTest.cpp +++ b/cachelib/allocator/tests/RebalanceStrategyTest.cpp @@ -214,6 +214,9 @@ class RebalanceStrategyTest : public testing::Test { config.poolRebalancerFreeAllocThreshold = 20; initAllocatorConfigForStrategy(config, LruTailAge); + //TODO: why does this fail with orig. value of 8? 
+ //on upstream this fails too, it always reports 4 instead + //of the original test value, which is 8 expected slabs doWork(config, true, 8); } diff --git a/cachelib/allocator/tests/RefCountTest.cpp b/cachelib/allocator/tests/RefCountTest.cpp index e8e16259f9..7131d0e11e 100644 --- a/cachelib/allocator/tests/RefCountTest.cpp +++ b/cachelib/allocator/tests/RefCountTest.cpp @@ -209,16 +209,6 @@ void RefCountTest::testMarkForEvictionAndMoving() { ASSERT_EQ(ret, 0); } - { - // cannot mark moving when ref count > 0 - RefcountWithFlags ref; - ref.markInMMContainer(); - - ref.incRef(); - - ASSERT_FALSE(ref.markMoving()); - } - { // cannot mark for eviction when ref count > 0 RefcountWithFlags ref; diff --git a/cachelib/allocator/tests/SimpleRebalancingTest.h b/cachelib/allocator/tests/SimpleRebalancingTest.h index 634882c730..3f1869ede3 100644 --- a/cachelib/allocator/tests/SimpleRebalancingTest.h +++ b/cachelib/allocator/tests/SimpleRebalancingTest.h @@ -104,7 +104,7 @@ class SimpleRebalanceTest : public testing::Test { // Sleep for 2 seconds to let the rebalancing work /* sleep override */ - std::this_thread::sleep_for(std::chrono::seconds(3)); + std::this_thread::sleep_for(std::chrono::seconds(10)); // Evicted keys shouldn't be in the allocator anymore ASSERT_FALSE(evictedKeys.empty()); diff --git a/cachelib/cachebench/runner/CacheStressor.h b/cachelib/cachebench/runner/CacheStressor.h index cbc8204b52..9b396cb1b7 100644 --- a/cachelib/cachebench/runner/CacheStressor.h +++ b/cachelib/cachebench/runner/CacheStressor.h @@ -77,7 +77,7 @@ class CacheStressor : public Stressor { std::unique_lock lock; CacheStressSyncObj(CacheStressor& s, std::string itemKey) - : lock{s.chainedItemAcquireUniqueLock(itemKey)} {} + : lock{s.chainedItemTryAcquireUniqueLock(itemKey)} {} }; movingSync = [this](typename CacheT::Item::Key key) { return std::make_unique(*this, key.str()); @@ -247,6 +247,10 @@ class CacheStressor : public Stressor { using Lock = std::unique_lock; return lockEnabled_ ? Lock{getLock(key)} : Lock{}; } + auto chainedItemTryAcquireUniqueLock(Key key) { + using Lock = std::unique_lock; + return lockEnabled_ ? Lock{getLock(key), std::try_to_lock} : Lock{}; + } // populate the input item handle according to the stress setup. void populateItem(WriteHandle& handle, const std::string& itemValue = "") { diff --git a/cachelib/cachebench/test_configs/small_moving_bg.json b/cachelib/cachebench/test_configs/small_moving_bg.json new file mode 100644 index 0000000000..c4838f42b5 --- /dev/null +++ b/cachelib/cachebench/test_configs/small_moving_bg.json @@ -0,0 +1,35 @@ +// @nolint like default.json, but moves items during slab release instead of evicting them. 
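+// Both memory tiers below are bound to NUMA node 0, so the config also runs
+// on single-socket machines.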
+{ + "cache_config" : { + "cacheSizeMB" : 2248, + "cacheDir": "/tmp/mem-tier5", + "memoryTiers" : [ + { + "ratio": 1, + "memBindNodes": 0 + }, { + "ratio": 1, + "memBindNodes": 0 + } + ], + "poolRebalanceIntervalSec" : 1, + "moveOnSlabRelease" : true, + "rebalanceMinSlabs" : 2, + "evictorThreads": 2, + "promoterThreads": 2 + }, + "test_config" : + { + "preallocateCache" : true, + "numOps" : 20000000, + "numThreads" : 32, + "numKeys" : 250000, + "generator": "online", + "keySizeRange" : [1, 8, 32, 64, 128, 256, 512], + "keySizeRangeProbability" : [0.1, 0.1, 0.2, 0.2, 0.3, 0.1], + "valSizeRange" : [1, 128, 512, 1024, 4096, 10240, 20480, 40960, 60000], + "valSizeRangeProbability" : [0.1, 0.1, 0.1, 0.2, 0.2, 0.1, 0.1, 0.1], + "getRatio" : 0.70, + "setRatio" : 0.30 + } + } diff --git a/run_tests.sh b/run_tests.sh index e575dbc62a..6ff2ac65ed 100755 --- a/run_tests.sh +++ b/run_tests.sh @@ -13,3 +13,4 @@ fi ../bin/cachebench --json_test_config ../test_configs/consistency/navy.json ../bin/cachebench --json_test_config ../test_configs/consistency/navy-multi-tier.json +../bin/cachebench --json_test_config ../test_configs/small_moving_bg.json From 3328e4e1d20dc998e8d7097cab6589882d19addc Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Mon, 24 Jul 2023 14:26:23 -0700 Subject: [PATCH 21/40] edit dockerfile --- docker/images/centos-8streams.Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker/images/centos-8streams.Dockerfile b/docker/images/centos-8streams.Dockerfile index b916ab760c..e0c31226a1 100644 --- a/docker/images/centos-8streams.Dockerfile +++ b/docker/images/centos-8streams.Dockerfile @@ -17,6 +17,8 @@ json-c-devel \ perf \ numactl +# updated to fix compile errors and better symbol +# resolving in VTune RUN dnf -y install gcc-toolset-12 RUN echo "source /opt/rh/gcc-toolset-12/enable" >> /etc/bashrc SHELL ["/bin/bash", "--login", "-c"] From 3c87c496042322500b3c123abf8a7aab708b0dbc Mon Sep 17 00:00:00 2001 From: Sounak Gupta Date: Fri, 28 Jul 2023 01:39:04 -0700 Subject: [PATCH 22/40] Track latency of per item eviction/promotion between memory tiers ---------------------------------------------------------------- this can go with background evictors multi-tier part 1 --- cachelib/allocator/Cache.cpp | 4 ++++ cachelib/allocator/CacheAllocator.h | 4 +++- cachelib/allocator/CacheStats.cpp | 4 +++- cachelib/allocator/CacheStats.h | 2 ++ cachelib/allocator/CacheStatsInternal.h | 2 ++ cachelib/cachebench/cache/Cache.h | 2 ++ cachelib/cachebench/cache/CacheStats.h | 6 ++++++ cachelib/common/PercentileStats.h | 11 ++++++----- 8 files changed, 28 insertions(+), 7 deletions(-) diff --git a/cachelib/allocator/Cache.cpp b/cachelib/allocator/Cache.cpp index db7a281104..8d958b3510 100644 --- a/cachelib/allocator/Cache.cpp +++ b/cachelib/allocator/Cache.cpp @@ -477,6 +477,10 @@ void CacheBase::updateGlobalCacheStats(const std::string& statPrefix) const { visitEstimates(uploadStatsNanoToMicro, stats.allocateLatencyNs, statPrefix + "allocate.latency_us"); + visitEstimates(uploadStatsNanoToMicro, stats.bgEvictLatencyNs, + statPrefix + "background.eviction.latency_us"); + visitEstimates(uploadStatsNanoToMicro, stats.bgPromoteLatencyNs, + statPrefix + "background.promotion.latency_us"); visitEstimates(uploadStatsNanoToMicro, stats.moveChainedLatencyNs, statPrefix + "move.chained.latency_us"); visitEstimates(uploadStatsNanoToMicro, stats.moveRegularLatencyNs, diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index bd60c91d29..af40a265dc 100644 --- 
a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -2013,6 +2013,7 @@ class CacheAllocator : public CacheBase { unsigned int pid, unsigned int cid, size_t batch) { + util::LatencyTracker tracker{stats().bgEvictLatency_, batch}; auto& mmContainer = getMMContainer(tid, pid, cid); size_t evictions = 0; size_t evictionCandidates = 0; @@ -2089,6 +2090,7 @@ class CacheAllocator : public CacheBase { unsigned int pid, unsigned int cid, size_t batch) { + util::LatencyTracker tracker{stats().bgPromoteLatency_, batch}; auto& mmContainer = getMMContainer(tid, pid, cid); size_t promotions = 0; std::vector candidates; @@ -3004,7 +3006,7 @@ CacheAllocator::allocateInternalTier(TierId tid, uint32_t expiryTime, bool fromBgThread, bool evict) { - util::LatencyTracker tracker{stats().allocateLatency_}; + util::LatencyTracker tracker{stats().allocateLatency_, static_cast(!fromBgThread)}; SCOPE_FAIL { stats_.invalidAllocs.inc(); }; diff --git a/cachelib/allocator/CacheStats.cpp b/cachelib/allocator/CacheStats.cpp index dcb81930b9..f09fe4e0db 100644 --- a/cachelib/allocator/CacheStats.cpp +++ b/cachelib/allocator/CacheStats.cpp @@ -56,7 +56,7 @@ struct SizeVerify {}; void Stats::populateGlobalCacheStats(GlobalCacheStats& ret) const { #ifndef SKIP_SIZE_VERIFY - SizeVerify a = SizeVerify<16288>{}; + SizeVerify a = SizeVerify<16640>{}; std::ignore = a; #endif ret.numCacheGets = numCacheGets.get(); @@ -105,6 +105,8 @@ void Stats::populateGlobalCacheStats(GlobalCacheStats& ret) const { ret.numNvmItemDestructorAllocErrors = numNvmItemDestructorAllocErrors.get(); ret.allocateLatencyNs = this->allocateLatency_.estimate(); + ret.bgEvictLatencyNs = this->bgEvictLatency_.estimate(); + ret.bgPromoteLatencyNs = this->bgPromoteLatency_.estimate(); ret.moveChainedLatencyNs = this->moveChainedLatency_.estimate(); ret.moveRegularLatencyNs = this->moveRegularLatency_.estimate(); ret.nvmLookupLatencyNs = this->nvmLookupLatency_.estimate(); diff --git a/cachelib/allocator/CacheStats.h b/cachelib/allocator/CacheStats.h index aec24cb298..18e62dbfee 100644 --- a/cachelib/allocator/CacheStats.h +++ b/cachelib/allocator/CacheStats.h @@ -529,6 +529,8 @@ struct GlobalCacheStats { // latency and percentile stats of various cachelib operations util::PercentileStats::Estimates allocateLatencyNs{}; + util::PercentileStats::Estimates bgEvictLatencyNs{}; + util::PercentileStats::Estimates bgPromoteLatencyNs{}; util::PercentileStats::Estimates moveChainedLatencyNs{}; util::PercentileStats::Estimates moveRegularLatencyNs{}; util::PercentileStats::Estimates nvmLookupLatencyNs{}; diff --git a/cachelib/allocator/CacheStatsInternal.h b/cachelib/allocator/CacheStatsInternal.h index 9265f74251..ece1f87a48 100644 --- a/cachelib/allocator/CacheStatsInternal.h +++ b/cachelib/allocator/CacheStatsInternal.h @@ -189,6 +189,8 @@ struct Stats { // latency stats of various cachelib operations mutable util::PercentileStats allocateLatency_; + mutable util::PercentileStats bgEvictLatency_; + mutable util::PercentileStats bgPromoteLatency_; mutable util::PercentileStats moveChainedLatency_; mutable util::PercentileStats moveRegularLatency_; mutable util::PercentileStats nvmLookupLatency_; diff --git a/cachelib/cachebench/cache/Cache.h b/cachelib/cachebench/cache/Cache.h index cccf1014d2..27107b5a64 100644 --- a/cachelib/cachebench/cache/Cache.h +++ b/cachelib/cachebench/cache/Cache.h @@ -1217,6 +1217,8 @@ Stats Cache::getStats() const { static_cast(itemRecords_.count()) - totalDestructor_; ret.cacheAllocateLatencyNs = 
cacheStats.allocateLatencyNs;
+  ret.cacheBgEvictLatencyNs = cacheStats.bgEvictLatencyNs;
+  ret.cacheBgPromoteLatencyNs = cacheStats.bgPromoteLatencyNs;
   ret.cacheFindLatencyNs = cacheFindLatency_.estimate();
 
   // Populate counters.
diff --git a/cachelib/cachebench/cache/CacheStats.h b/cachelib/cachebench/cache/CacheStats.h
index bf79b8aa65..1e2442d2e8 100644
--- a/cachelib/cachebench/cache/CacheStats.h
+++ b/cachelib/cachebench/cache/CacheStats.h
@@ -70,6 +70,8 @@ struct Stats {
   uint64_t numNvmItemRemovedSetSize{0};
 
   util::PercentileStats::Estimates cacheAllocateLatencyNs;
+  util::PercentileStats::Estimates cacheBgEvictLatencyNs;
+  util::PercentileStats::Estimates cacheBgPromoteLatencyNs;
   util::PercentileStats::Estimates cacheFindLatencyNs;
 
   double nvmReadLatencyMicrosP50{0};
@@ -295,6 +297,8 @@ struct Stats {
 
       printLatencies("Cache Find API latency", cacheFindLatencyNs);
       printLatencies("Cache Allocate API latency", cacheAllocateLatencyNs);
+      printLatencies("Cache Background Eviction API latency", cacheBgEvictLatencyNs);
+      printLatencies("Cache Background Promotion API latency", cacheBgPromoteLatencyNs);
     }
   }
@@ -535,6 +539,8 @@ struct Stats {
 
     counters["find_latency_p99"] = cacheFindLatencyNs.p99;
     counters["alloc_latency_p99"] = cacheAllocateLatencyNs.p99;
+    counters["bg_evict_latency_p99"] = cacheBgEvictLatencyNs.p99;
+    counters["bg_promote_latency_p99"] = cacheBgPromoteLatencyNs.p99;
 
     counters["ram_hit_rate"] = calcInvertPctFn(numCacheGetMiss, numCacheGets);
     counters["nvm_hit_rate"] = calcInvertPctFn(numCacheGetMiss, numCacheGets);
diff --git a/cachelib/common/PercentileStats.h b/cachelib/common/PercentileStats.h
index bdd3255eba..c308671ee9 100644
--- a/cachelib/common/PercentileStats.h
+++ b/cachelib/common/PercentileStats.h
@@ -107,16 +107,16 @@ class PercentileStats {
 
 class LatencyTracker {
  public:
-  explicit LatencyTracker(PercentileStats& stats)
-      : stats_(&stats), begin_(std::chrono::steady_clock::now()) {}
+  explicit LatencyTracker(PercentileStats& stats, size_t nSamples = 1)
+      : stats_(&stats), nSamples_(nSamples), begin_(std::chrono::steady_clock::now()) {}
   LatencyTracker() {}
   ~LatencyTracker() {
-    if (stats_) {
+    if (nSamples_ > 0 && stats_) {
       auto tp = std::chrono::steady_clock::now();
       auto diffNanos =
           std::chrono::duration_cast(tp - begin_)
              .count();
-      stats_->trackValue(static_cast(diffNanos), tp);
+      stats_->trackValue(static_cast(diffNanos / nSamples_), tp);
     }
   }
 
   LatencyTracker(const LatencyTracker&) = delete;
   LatencyTracker& operator=(const LatencyTracker&) = delete;
 
   LatencyTracker(LatencyTracker&& rhs) noexcept
-      : stats_(rhs.stats_), begin_(rhs.begin_) {
+      : stats_(rhs.stats_), nSamples_(rhs.nSamples_), begin_(rhs.begin_) {
     rhs.stats_ = nullptr;
   }
@@ -138,6 +138,7 @@ class LatencyTracker {
 
  private:
   PercentileStats* stats_{nullptr};
+  size_t nSamples_{1};
   std::chrono::time_point begin_;
 };
 } // namespace util

From 795f85bb708bed2650b8b041fe014d6e1fef3210 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Igor=20Chor=C4=85=C5=BCewicz?=
Date: Wed, 23 Aug 2023 10:21:21 -0700
Subject: [PATCH 23/40] Update dependencies (#95)

* Set dependencies to working versions and use dependencies from build
context, instead of downloading cachelib:develop during the build step.
This makes sure that dependencies are always built in proper versions.
* Fix CacheStats size --- .github/workflows/build-cachelib-docker.yml | 1 + contrib/build-package.sh | 4 ---- docker/images/build-image.sh | 2 +- docker/images/centos-8streams.Dockerfile | 9 +++++---- docker/images/install-cachelib-deps.sh | 8 +++----- docker/images/install-dsa-deps.sh | 2 +- 6 files changed, 11 insertions(+), 15 deletions(-) diff --git a/.github/workflows/build-cachelib-docker.yml b/.github/workflows/build-cachelib-docker.yml index be28bc233c..f00c028708 100644 --- a/.github/workflows/build-cachelib-docker.yml +++ b/.github/workflows/build-cachelib-docker.yml @@ -40,6 +40,7 @@ jobs: - name: "checkout sources" uses: actions/checkout@v2 with: + submodules: recursive fetch-depth: 0 - name: Pull the image or rebuild and push it diff --git a/contrib/build-package.sh b/contrib/build-package.sh index 1b646049f7..fbdf5c7347 100755 --- a/contrib/build-package.sh +++ b/contrib/build-package.sh @@ -197,7 +197,6 @@ case "$1" in folly) NAME=folly SRCDIR=cachelib/external/$NAME - update_submodules=yes cmake_custom_params="-DBUILD_SHARED_LIBS=ON" if test "$build_tests" = "yes" ; then cmake_custom_params="$cmake_custom_params -DBUILD_TESTS=ON" @@ -209,7 +208,6 @@ case "$1" in fizz) NAME=fizz SRCDIR=cachelib/external/$NAME/$NAME - update_submodules=yes cmake_custom_params="-DBUILD_SHARED_LIBS=ON" if test "$build_tests" = "yes" ; then cmake_custom_params="$cmake_custom_params -DBUILD_TESTS=ON" @@ -221,7 +219,6 @@ case "$1" in wangle) NAME=wangle SRCDIR=cachelib/external/$NAME/$NAME - update_submodules=yes cmake_custom_params="-DBUILD_SHARED_LIBS=ON" if test "$build_tests" = "yes" ; then cmake_custom_params="$cmake_custom_params -DBUILD_TESTS=ON" @@ -240,7 +237,6 @@ case "$1" in fbthrift) NAME=fbthrift SRCDIR=cachelib/external/$NAME - update_submodules=yes cmake_custom_params="-DBUILD_SHARED_LIBS=ON" ;; diff --git a/docker/images/build-image.sh b/docker/images/build-image.sh index 985a6e0ff1..1024c8e6d5 100755 --- a/docker/images/build-image.sh +++ b/docker/images/build-image.sh @@ -35,4 +35,4 @@ echo "Build a Docker image tagged with: ${CONTAINER_REG}:${TAG}" docker build -t ${CONTAINER_REG}:${TAG} \ --build-arg http_proxy=$http_proxy \ --build-arg https_proxy=$https_proxy \ - -f ${OS}-${OS_VER}.Dockerfile . + -f ${OS}-${OS_VER}.Dockerfile ../.. 
# need access to contrib and submodules
diff --git a/docker/images/centos-8streams.Dockerfile b/docker/images/centos-8streams.Dockerfile
index e0c31226a1..73168e3cb3 100644
--- a/docker/images/centos-8streams.Dockerfile
+++ b/docker/images/centos-8streams.Dockerfile
@@ -23,8 +23,9 @@ RUN dnf -y install gcc-toolset-12
 RUN echo "source /opt/rh/gcc-toolset-12/enable" >> /etc/bashrc
 SHELL ["/bin/bash", "--login", "-c"]
 
-COPY ./install-cachelib-deps.sh ./install-cachelib-deps.sh
-RUN ./install-cachelib-deps.sh
+COPY ./contrib ./contrib
+COPY ./docker ./docker
+COPY ./cachelib/external ./cachelib/external
 
-COPY ./install-dsa-deps.sh ./install-dsa-deps.sh
-RUN ./install-dsa-deps.sh
+RUN ./docker/images/install-cachelib-deps.sh
+RUN ./docker/images/install-dsa-deps.sh
diff --git a/docker/images/install-cachelib-deps.sh b/docker/images/install-cachelib-deps.sh
index 6d8fbdef7b..b1754a8db5 100755
--- a/docker/images/install-cachelib-deps.sh
+++ b/docker/images/install-cachelib-deps.sh
@@ -2,13 +2,11 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright 2022, Intel Corporation
 
-git clone -b develop https://github.com/intel/CacheLib CacheLib
-
-./CacheLib/contrib/prerequisites-centos8.sh
+echo 'Defaults env_keep += "HTTPS_PROXY https_proxy HTTP_PROXY http_proxy NO_PROXY no_proxy"' >> /etc/sudoers
+./contrib/prerequisites-centos8.sh
 
 for pkg in zstd googleflags googlelog googletest sparsemap fmt folly fizz wangle fbthrift ;
 do
-  sudo ./CacheLib/contrib/build-package.sh -j -I /opt/ "$pkg"
+  sudo ./contrib/build-package.sh -j -I /opt/ "$pkg"
 done
-rm -rf CacheLib
diff --git a/docker/images/install-dsa-deps.sh b/docker/images/install-dsa-deps.sh
index 265011dd70..f3484746b4 100755
--- a/docker/images/install-dsa-deps.sh
+++ b/docker/images/install-dsa-deps.sh
@@ -15,7 +15,7 @@ rm -rf idxd-config
 # Install DML Library
 git clone --recursive https://github.com/intel/DML.git
 cd DML
-git checkout e44443c24d53552b248b9869b1b16f89cd970f52
+git checkout v1.1.0
 mkdir build
 cd build
 cmake -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_BUILD_TYPE=RelWithDebInfo ..

From 96d948f4e883d5498289b71bae99656063306197 Mon Sep 17 00:00:00 2001
From: Daniel Byrne
Date: Wed, 28 Feb 2024 09:31:21 -0800
Subject: [PATCH 24/40] enable DTO build without memcpy changes to cachebench

---
 cachelib/CMakeLists.txt | 1 +
 cachelib/cachebench/CMakeLists.txt | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/cachelib/CMakeLists.txt b/cachelib/CMakeLists.txt
index 32b2859e44..bb77d54dc6 100644
--- a/cachelib/CMakeLists.txt
+++ b/cachelib/CMakeLists.txt
@@ -43,6 +43,7 @@ set(PACKAGE_BUGREPORT "https://github.com/facebook/TBD")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 
 option(BUILD_TESTS "If enabled, compile the tests." ON)
+option(BUILD_WITH_DTO "If enabled, build with DSA transparent offloading." OFF)
 
 set(BIN_INSTALL_DIR bin CACHE STRING
diff --git a/cachelib/cachebench/CMakeLists.txt b/cachelib/cachebench/CMakeLists.txt
index 35622ee666..1dcbf8d7d8 100644
--- a/cachelib/cachebench/CMakeLists.txt
+++ b/cachelib/cachebench/CMakeLists.txt
@@ -51,6 +51,10 @@ endif()
 add_executable (cachebench main.cpp)
 target_link_libraries(cachebench cachelib_cachebench)
 
+if (BUILD_WITH_DTO)
+  target_link_libraries(cachebench accel-config dto)
+endif ()
+
 install(
   TARGETS cachebench

From 47d503468420b7b10d6656cc29127d364b5be8ee Mon Sep 17 00:00:00 2001
From: Daniel Byrne
Date: Wed, 28 Feb 2024 10:51:26 -0800
Subject: [PATCH 25/40] Background eviction for multi-tier Part 4.
------------------------------- batch eviction / promotion - these changes are pretty significant so we should avoid squashing this commit into any prior background evictor patch --- cachelib/allocator/CacheAllocator.h | 937 ++++++++++++------ cachelib/allocator/MM2Q.h | 54 +- cachelib/allocator/MMLru.h | 57 +- cachelib/allocator/MMTinyLFU.h | 38 +- cachelib/allocator/memory/AllocationClass.cpp | 40 + cachelib/allocator/memory/AllocationClass.h | 2 + cachelib/allocator/memory/MemoryAllocator.cpp | 10 + cachelib/allocator/memory/MemoryAllocator.h | 2 + cachelib/allocator/memory/MemoryPool.cpp | 82 +- cachelib/allocator/memory/MemoryPool.h | 5 + 10 files changed, 914 insertions(+), 313 deletions(-) diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index af40a265dc..5a1054ee79 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -353,6 +353,38 @@ class CacheAllocator : public CacheBase { virtual bool isValid() const { return true; } }; using ChainedItemMovingSync = std::function<std::unique_lock<std::mutex>(Key)>; + + // Eviction related data returned from + // function executed under mmContainer lock + struct EvictionData { + EvictionData() = delete; + EvictionData(Item *candidate_, + Item *toRecycle_, + Item *toRecycleParent_, + bool chainedItem_, + bool expired_, + typename NvmCacheT::PutToken token_, + WriteHandle candidateHandle_) : + candidate(candidate_), + toRecycle(toRecycle_), + toRecycleParent(toRecycleParent_), + expired(expired_), + chainedItem(chainedItem_), + token(std::move(token_)), + candidateHandle(std::move(candidateHandle_)) {} + + // item that is candidate for eviction + Item *candidate; + // actual alloc that will be recycled + // back up to the allocator + Item *toRecycle; + // possible parent ref + Item *toRecycleParent; + bool expired; // is the item expired + bool chainedItem; // is it a chained item + typename NvmCacheT::PutToken token; // put token for NVM cache + WriteHandle candidateHandle; // handle in case we don't use the moving bit + }; using AccessContainer = typename Item::AccessContainer; using MMContainer = typename Item::MMContainer; @@ -1521,16 +1553,12 @@ class CacheAllocator : public CacheBase { Key key, uint32_t size, uint32_t creationTime, - uint32_t expiryTime, - bool fromBgThread = false); + uint32_t expiryTime); // create a new cache allocation on specific memory tier. // For description see allocateInternal. // // @param tid id a memory tier - // @param fromBgThread whether this function was called from a bg - // thread - this is used to decide whether bg thread should - // be waken in case there is no free memory // @param evict whether to evict an item from tier tid in case there // is not enough memory WriteHandle allocateInternalTier(TierId tid, @@ -1539,8 +1567,35 @@ class CacheAllocator : public CacheBase { uint32_t size, uint32_t creationTime, uint32_t expiryTime, - bool fromBgThread, bool evict); + + // create a new cache allocation on specific memory tier, + // for a given class id. Used when moving between tiers, since + // class ids are the same across tiers. + // For description see allocateInternal. + // + // @param tid id of a memory tier + // @param pid a pool id + // @param cid a class id + // + void* allocateInternalTierByCid(TierId tid, + PoolId pid, + ClassId cid); + + // create a new cache allocation on specific memory tier, + // for a given class id. Used when moving between tiers, since + // class ids are the same across tiers. + // For description see allocateInternal.
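+ // Note: unlike allocateInternalTier this returns raw allocations rather + // than handles; the callers below placement-new an Item into each slot + // and publish it to the MMContainer themselves.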
+ // + // @param tid id of a memory tier + // @param pid a pool id + // @param cid a class id + // @param batch the number of allocations to make + // + std::vector<void*> allocateInternalTierByCidBatch(TierId tid, + PoolId pid, + ClassId cid, + uint64_t batch); // Allocate a chained item // @@ -1646,10 +1701,12 @@ class CacheAllocator : public CacheBase { // // @param oldItem item being moved // @param newItemHdl Reference to the handle of the new item being moved into - // + // @param skipAddInMMContainer whether to skip adding to the mmContainer now + // so it can be done later in a batch + // @param fromBgThread use memmove instead of memcpy (for DTO testing) // @return true If the move was completed, and the containers were updated // successfully. - bool moveRegularItem(Item& oldItem, WriteHandle& newItemHdl); + bool moveRegularItem(Item& oldItem, WriteHandle& newItemHdl, bool skipAddInMMContainer, bool fromBgThread); // template class for viewAsChainedAllocs that takes either ReadHandle or // WriteHandle @@ -1816,6 +1873,7 @@ class CacheAllocator : public CacheBase { // @return An evicted item or nullptr if there is no suitable candidate found // within the configured number of attempts. Item* findEviction(TierId tid, PoolId pid, ClassId cid); + std::vector<Item*> findEvictionBatch(TierId tid, PoolId pid, ClassId cid, unsigned int batch); // Get next eviction candidate from MMContainer, remove from AccessContainer, // MMContainer and insert into NVMCache if enabled. @@ -1834,47 +1892,62 @@ class CacheAllocator : public CacheBase { unsigned int& searchTries); using EvictionIterator = typename MMContainer::LockedIterator; + // Similar to the above method, but returns a batch of eviction candidates + // as a vector of EvictionData + std::vector<EvictionData> getNextCandidates(TierId tid, + PoolId pid, + ClassId cid, + unsigned int batch, + bool markMoving, + bool fromBgThread); + + std::vector<Item*> getNextCandidatesPromotion(TierId tid, + PoolId pid, + ClassId cid, + unsigned int batch, + bool markMoving, + bool fromBgThread); + + // + // Common function in case move among tiers fails during eviction + // @param candidate the item that failed to move + // @param token the corresponding put token + // @param isExpired whether the candidate is expired + // @param markMoving whether we are using the moving bit for sync + // + // if insertOrReplace was called during move + // then candidate will not be accessible (failed replace during tryEvict) + // - therefore this was why we failed to + // evict to the next tier and insertOrReplace + // will remove from NVM cache + // however, if candidate is accessible + // that means the allocation in the next + // tier failed - so we will continue to + // evict the item to NVM cache + bool handleFailedMove(Item* candidate, + typename NvmCacheT::PutToken& token, + bool isExpired, + bool markMoving); // Try to move the item down to the next memory tier // // @param tid current tier ID of the item // @param pid the pool ID the item belong to. // @param item the item to evict - // @param fromBgThread whether this is called from BG thread // // @return valid handle to the item. This will be the last // handle to the item. On failure an empty handle. - WriteHandle tryEvictToNextMemoryTier(TierId tid, PoolId pid, Item& item, - bool fromBgThread); + WriteHandle tryEvictToNextMemoryTier(TierId tid, PoolId pid, Item& item); // Try to move the item down to the next memory tier // // @param item the item to evict - // @param fromBgThread whether this is called from BG thread // // @return valid handle to the item.
This will be the last // handle to the item. On failure an empty handle. - WriteHandle tryEvictToNextMemoryTier(Item& item, bool fromBgThread); + WriteHandle tryEvictToNextMemoryTier(Item& item); - // Try to move the item up to the next memory tier - // - // @param tid current tier ID of the item - // @param pid the pool ID the item belong to. - // @param item the item to promote - // @param fromBgThread whether this is called from BG thread - // - // @return valid handle to the item. This will be the last - // handle to the item. On failure an empty handle. - WriteHandle tryPromoteToNextMemoryTier(TierId tid, PoolId pid, Item& item, bool fromBgThread); - - // Try to move the item up to the next memory tier - // - // @param item the item to promote - // @param fromBgThread whether this is called from BG thread - // - // @return valid handle to the item. This will be the last - // handle to the item. On failure an empty handle. - WriteHandle tryPromoteToNextMemoryTier(Item& item, bool fromBgThread); // Wakes up waiters if there are any // @@ -2015,161 +2088,25 @@ class CacheAllocator : public CacheBase { size_t batch) { util::LatencyTracker tracker{stats().bgEvictLatency_, batch}; auto& mmContainer = getMMContainer(tid, pid, cid); - size_t evictions = 0; - size_t evictionCandidates = 0; - std::vector candidates; - candidates.reserve(batch); - - size_t tries = 0; - mmContainer.withEvictionIterator([&tries, &candidates, &batch, &mmContainer, this](auto &&itr) { - while (candidates.size() < batch && - (config_.maxEvictionPromotionHotness == 0 || tries < config_.maxEvictionPromotionHotness) && - itr) { - tries++; - Item* candidate = itr.get(); - XDCHECK(candidate); - - if (candidate->isChainedItem()) { - throw std::runtime_error("Not supported for chained items"); - } - - if (candidate->markMoving()) { - mmContainer.remove(itr); - candidates.push_back(candidate); - } else { - ++itr; - } - } - }); - - for (Item *candidate : candidates) { - auto evictedToNext = tryEvictToNextMemoryTier(*candidate, true /* from BgThread */); - if (!evictedToNext) { - auto token = createPutToken(*candidate); - - auto ret = candidate->markForEvictionWhenMoving(); - XDCHECK(ret); - - unlinkItemForEviction(*candidate); - // wake up any readers that wait for the move to complete - // it's safe to do now, as we have the item marked exclusive and - // no other reader can be added to the waiters list - wakeUpWaiters(candidate->getKey(), WriteHandle{}); - - if (token.isValid() && shouldWriteToNvmCacheExclusive(*candidate)) { - nvmCache_->put(*candidate, std::move(token)); - } - } else { - evictions++; - XDCHECK(!evictedToNext->isMarkedForEviction() && !evictedToNext->isMoving()); - XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving()); - XDCHECK(!candidate->isAccessible()); - XDCHECK(candidate->getKey() == evictedToNext->getKey()); - - wakeUpWaiters(candidate->getKey(), std::move(evictedToNext)); - } - XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving()); - - if (candidate->hasChainedItem()) { - (*stats_.chainedItemEvictions)[tid][pid][cid].inc(); - } else { - (*stats_.regularItemEvictions)[tid][pid][cid].inc(); + uint32_t currItems = mmContainer.size(); + if (currItems < batch) { + batch = currItems; + if (batch == 0) { + return 0; } - - // it's safe to recycle the item here as there are no more - // references and the item could not been marked as moving - // by other thread since it's detached from MMContainer. 
- auto res = releaseBackToAllocator(*candidate, RemoveContext::kEviction, - /* isNascent */ false); - XDCHECK(res == ReleaseRes::kReleased); } + auto evictionData = getNextCandidates(tid,pid,cid,batch, + true,true); + size_t evictions = evictionData.size(); + (*stats_.regularItemEvictions)[tid][pid][cid].add(evictions); return evictions; } - - size_t traverseAndPromoteItems(unsigned int tid, - unsigned int pid, - unsigned int cid, - size_t batch) { + + size_t traverseAndPromoteItems(unsigned int tid, unsigned int pid, unsigned int cid, size_t batch) { util::LatencyTracker tracker{stats().bgPromoteLatency_, batch}; - auto& mmContainer = getMMContainer(tid, pid, cid); - size_t promotions = 0; - std::vector candidates; - candidates.reserve(batch); - - size_t tries = 0; - - mmContainer.withPromotionIterator([&tries, &candidates, &batch, &mmContainer, this](auto &&itr){ - while (candidates.size() < batch && (config_.maxEvictionPromotionHotness == 0 || tries < config_.maxEvictionPromotionHotness) && itr) { - tries++; - Item* candidate = itr.get(); - XDCHECK(candidate); - - if (candidate->isChainedItem()) { - throw std::runtime_error("Not supported for chained items"); - } - - // TODO: only allow it for read-only items? - // or implement mvcc - if (candidate->markMoving()) { - // promotions should rarely fail since we already marked moving - mmContainer.remove(itr); - candidates.push_back(candidate); - } - - ++itr; - } - }); - - for (Item *candidate : candidates) { - auto promoted = tryPromoteToNextMemoryTier(*candidate, true); - if (promoted) { - promotions++; - XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving()); - // it's safe to recycle the item here as there are no more - // references and the item could not been marked as moving - // by other thread since it's detached from MMContainer. 
- // - // but we need to wake up waiters before releasing - // since candidate's key can change after being sent - // back to allocator - wakeUpWaiters(candidate->getKey(), std::move(promoted)); - auto res = releaseBackToAllocator(*candidate, RemoveContext::kEviction, - /* isNascent */ false); - XDCHECK(res == ReleaseRes::kReleased); - } else { - // we failed to allocate a new item, this item is no longer moving - auto ref = candidate->unmarkMoving(); - if (UNLIKELY(ref == 0)) { - wakeUpWaiters(candidate->getKey(),{}); - const auto res = - releaseBackToAllocator(*candidate, - RemoveContext::kNormal, false); - XDCHECK(res == ReleaseRes::kReleased); - } else if (candidate->isAccessible()) { - //case where we failed to allocate in lower tier - //item is still present in accessContainer - //item is no longer moving - acquire and - //wake up waiters with this handle - auto hdl = acquire(candidate); - insertInMMContainer(*hdl); - wakeUpWaiters(candidate->getKey(), std::move(hdl)); - } else if (!candidate->isAccessible()) { - //case where we failed to replace in access - //container due to another thread calling insertOrReplace - //unmark moving and return null handle - wakeUpWaiters(candidate->getKey(), {}); - if (UNLIKELY(ref == 0)) { - const auto res = - releaseBackToAllocator(*candidate, RemoveContext::kNormal, - false); - XDCHECK(res == ReleaseRes::kReleased); - } - } else { - XDCHECK(false); - } - } - } - return promotions; + auto candidates = getNextCandidatesPromotion(tid, pid, cid, batch, + /* markMoving */ true, /* fromBgThread */ true); + return candidates.size(); } // returns true if nvmcache is enabled and we should write this item to @@ -2499,7 +2436,7 @@ class CacheAllocator : public CacheBase { // free memory monitor std::unique_ptr<MemoryMonitor> memMonitor_; - // background evictor + // background data movement std::vector<std::unique_ptr<BackgroundMover<CacheT>>> backgroundEvictor_; std::vector<std::unique_ptr<BackgroundMover<CacheT>>> backgroundPromoter_; @@ -2996,6 +2933,37 @@ bool CacheAllocator<CacheTrait>::shouldWakeupBgEvictor(TierId tid, PoolId pid, C return false; } +template <typename CacheTrait> +std::vector<void*> CacheAllocator<CacheTrait>::allocateInternalTierByCidBatch(TierId tid, + PoolId pid, + ClassId cid, uint64_t batch) { + util::LatencyTracker tracker{stats().allocateLatency_}; + + SCOPE_FAIL { stats_.invalidAllocs.add(batch); }; + + util::RollingLatencyTracker rollTracker{ + (*stats_.classAllocLatency)[tid][pid][cid]}; + + (*stats_.allocAttempts)[tid][pid][cid].add(batch); + + auto memory = allocator_[tid]->allocateByCidBatch(pid, cid, batch); + + if (memory.size() < batch) { + uint64_t toEvict = batch - memory.size(); + auto evicted = findEvictionBatch(tid, pid, cid, toEvict); + if (evicted.size() < toEvict) { + (*stats_.allocFailures)[tid][pid][cid].add(toEvict - evicted.size()); + } + if (evicted.size() > 0) { + // case where we got some allocations from eviction - add them to + // the new allocations + memory.insert(memory.end(), evicted.begin(), evicted.end()); + return memory; + } + } + return memory; +} + template <typename CacheTrait> typename CacheAllocator<CacheTrait>::WriteHandle CacheAllocator<CacheTrait>::allocateInternalTier(TierId tid, @@ -3004,10 +2972,8 @@ CacheAllocator<CacheTrait>::allocateInternalTier(TierId tid, uint32_t size, uint32_t creationTime, uint32_t expiryTime, - bool fromBgThread, bool evict) { - util::LatencyTracker tracker{stats().allocateLatency_, static_cast(!fromBgThread)}; - + util::LatencyTracker tracker{stats().allocateLatency_}; SCOPE_FAIL { stats_.invalidAllocs.inc(); }; // number of bytes required for this item const auto requiredSize = Item::getRequiredSize(key, size); // the allocation class in our memory allocator. const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize); util::RollingLatencyTracker rollTracker{ (*stats_.classAllocLatency)[tid][pid][cid]}; (*stats_.allocAttempts)[tid][pid][cid].inc(); void* memory = allocator_[tid]->allocate(pid, requiredSize); - if
(backgroundEvictor_.size() && !fromBgThread && + if (backgroundEvictor_.size() && (memory == nullptr || shouldWakeupBgEvictor(tid, pid, cid))) { backgroundEvictor_[BackgroundMover<CacheT>::workerId( tid, pid, cid, backgroundEvictor_.size())] @@ -3078,13 +3044,12 @@ CacheAllocator<CacheTrait>::allocateInternal(PoolId pid, typename Item::Key key, uint32_t size, uint32_t creationTime, - uint32_t expiryTime, - bool fromBgThread) { + uint32_t expiryTime) { auto tid = 0; /* TODO: consult admission policy */ for(TierId tid = 0; tid < getNumTiers(); ++tid) { bool evict = !config_.insertToFirstFreeTier || tid == getNumTiers() - 1; auto handle = allocateInternalTier(tid, pid, key, size, creationTime, - expiryTime, fromBgThread, evict); + expiryTime, evict); if (handle) return handle; } return {}; @@ -3904,14 +3869,9 @@ void CacheAllocator<CacheTrait>::wakeUpWaiters(folly::StringPiece key, } template <typename CacheTrait> -bool CacheAllocator<CacheTrait>::moveRegularItem(Item& oldItem, - WriteHandle& newItemHdl) { - XDCHECK(oldItem.isMoving()); - // If an item is expired, proceed to eviction. - if (oldItem.isExpired()) { - return false; - } - +bool CacheAllocator<CacheTrait>::moveRegularItem( + Item& oldItem, WriteHandle& newItemHdl, bool skipAddInMMContainer, bool fromBgThread) { + XDCHECK(!oldItem.isExpired()); util::LatencyTracker tracker{stats_.moveRegularLatency_}; XDCHECK_EQ(newItemHdl->getSize(), oldItem.getSize()); @@ -3933,15 +3893,22 @@ bool CacheAllocator<CacheTrait>::moveRegularItem(Item& oldItem, // should be fine for it to be left in an inconsistent state. config_.moveCb(oldItem, *newItemHdl, nullptr); } else { - std::memcpy(newItemHdl->getMemory(), oldItem.getMemory(), + if (fromBgThread) { + std::memmove(newItemHdl->getMemory(), oldItem.getMemory(), oldItem.getSize()); + } else { + std::memcpy(newItemHdl->getMemory(), oldItem.getMemory(), + oldItem.getSize()); + } } - // Adding the item to mmContainer has to succeed since no one can remove the - // item auto& newContainer = getMMContainer(*newItemHdl); - auto mmContainerAdded = newContainer.add(*newItemHdl); - XDCHECK(mmContainerAdded); + if (!skipAddInMMContainer) { + // Adding the item to mmContainer has to succeed since no one can remove the + // item + auto mmContainerAdded = newContainer.add(*newItemHdl); + XDCHECK(mmContainerAdded); + } if (oldItem.hasChainedItem()) { XDCHECK(!newItemHdl->hasChainedItem()) << newItemHdl->toString(); @@ -4030,6 +3997,472 @@ void CacheAllocator<CacheTrait>::unlinkItemForEviction(Item& it) { XDCHECK_EQ(0u, ref); } +template <typename CacheTrait> +std::vector<typename CacheAllocator<CacheTrait>::Item*> +CacheAllocator<CacheTrait>::findEvictionBatch(TierId tid, + PoolId pid, + ClassId cid, + unsigned int batch) { + + std::vector<Item*> toRecycles; + toRecycles.reserve(batch); + auto evictionData = getNextCandidates(tid, pid, cid, batch, /* markMoving */ true, /* fromBgThread */ false); + for (int i = 0; i < evictionData.size(); i++) { + Item *candidate = evictionData[i].candidate; + Item *toRecycle = evictionData[i].toRecycle; + toRecycles.push_back(toRecycle); + // recycle the item. it's safe to do so since the candidate has been + // unlinked for eviction and we hold the last remaining reference to + // that item.
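+ // Note: candidate is the logical item being evicted while toRecycle is + // the specific allocation handed back to the caller; releaseBackToAllocator + // below is expected to report kRecycled for it (see the XDCHECK).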
+ if (candidate->hasChainedItem()) { + (*stats_.chainedItemEvictions)[tid][pid][cid].inc(); + } else { + (*stats_.regularItemEvictions)[tid][pid][cid].inc(); + } + + if (auto eventTracker = getEventTracker()) { + eventTracker->record(AllocatorApiEvent::DRAM_EVICT, candidate->getKey(), + AllocatorApiResult::EVICTED, candidate->getSize(), + candidate->getConfiguredTTL().count()); + } + + XDCHECK(!candidate->isChainedItem()); + // check if by releasing the item we intend to, we actually + // recycle the candidate. + auto ret = releaseBackToAllocator(*candidate, RemoveContext::kEviction, + /* isNascent */ false, toRecycle); + XDCHECK_EQ(ret, ReleaseRes::kRecycled); + } + return toRecycles; +} + +template <typename CacheTrait> +std::vector<typename CacheAllocator<CacheTrait>::Item*> +CacheAllocator<CacheTrait>::getNextCandidatesPromotion(TierId tid, + PoolId pid, + ClassId cid, + unsigned int batch, + bool markMoving, + bool fromBgThread) { + std::vector<Item*> newAllocs; + std::vector<void*> blankAllocs; + std::vector<WriteHandle> newHandles; + std::vector<WriteHandle> candidateHandles; + std::vector<Item*> candidates; + candidates.reserve(batch); + candidateHandles.reserve(batch); + newAllocs.reserve(batch); + newHandles.reserve(batch); + + auto& mmContainer = getMMContainer(tid, pid, cid); + unsigned int maxSearchTries = std::max(config_.evictionSearchTries, + batch*4); + + // first try and get allocations in the next tier + blankAllocs = allocateInternalTierByCidBatch(tid-1, pid, cid, batch); + if (blankAllocs.empty()) { + return candidates; + } else if (blankAllocs.size() < batch) { + batch = blankAllocs.size(); + } + XDCHECK_EQ(blankAllocs.size(), batch); + + auto iterateAndMark = [this, tid, pid, cid, batch, + markMoving, maxSearchTries, + &candidates, &candidateHandles, + &mmContainer](auto&& itr) { + + unsigned int searchTries = 0; + if (!itr) { + ++searchTries; + return; + } + + while ((config_.evictionSearchTries == 0 || + maxSearchTries > searchTries) && + itr && candidates.size() < batch) { + ++searchTries; + auto* toRecycle_ = itr.get(); + bool chainedItem_ = toRecycle_->isChainedItem(); + + if (chainedItem_) { + ++itr; + continue; + } + Item* candidate_; + WriteHandle candidateHandle_; + Item* syncItem_; + // promotion skips chained items, so we sync on the item itself + candidate_ = toRecycle_; + syncItem_ = toRecycle_; + + bool marked = false; + if (markMoving) { + marked = syncItem_->markMoving(); + } else if (!markMoving) { + // we use the item handle as the sync point when not using the moving bit + auto hdl = acquire(candidate_); + if (hdl && hdl->getRefCount() == 1) { + marked = true; + candidateHandle_ = std::move(hdl); + } + } + if (!marked) { + ++itr; + continue; + } + XDCHECK(!chainedItem_); + mmContainer.remove(itr); + candidates.push_back(candidate_); + candidateHandles.push_back(std::move(candidateHandle_)); + } + }; + + mmContainer.withPromotionIterator(iterateAndMark); + + if (candidates.size() < batch) { + unsigned int toErase = batch - candidates.size(); + for (int i = 0; i < toErase; i++) { + allocator_[tid-1]->free(blankAllocs.back()); + blankAllocs.pop_back(); + } + if (candidates.size() == 0) { + return candidates; + } + } + + //1.
get an item handle from a new allocation + for (int i = 0; i < candidates.size(); i++) { + Item *candidate = candidates[i]; + WriteHandle newItemHdl = acquire(new (blankAllocs[i]) + Item(candidate->getKey(), candidate->getSize(), + candidate->getCreationTime(), candidate->getExpiryTime())); + XDCHECK(newItemHdl); + if (newItemHdl) { + newItemHdl.markNascent(); + (*stats_.fragmentationSize)[tid][pid][cid].add( + util::getFragmentation(*this, *newItemHdl)); + newAllocs.push_back(newItemHdl.getInternal()); + newHandles.push_back(std::move(newItemHdl)); + } else { + // failed to get item handle + throw std::runtime_error( + folly::sformat("Was not able to acquire new alloc, failed alloc {}", blankAllocs[i])); + } + } + //2. add in batch to mmContainer + auto& newMMContainer = getMMContainer(tid-1, pid, cid); + uint32_t added = newMMContainer.addBatch(newAllocs.begin(), newAllocs.end()); + XDCHECK_EQ(added, newAllocs.size()); + if (added != newAllocs.size()) { + throw std::runtime_error( + folly::sformat("Was not able to add all new items, failed item {} and handle {}", + newAllocs[added]->toString(), newHandles[added]->toString())); + } + //3. copy item data - don't need to add in mmContainer + for (int i = 0; i < candidates.size(); i++) { + Item *candidate = candidates[i]; + WriteHandle newHandle = std::move(newHandles[i]); + bool moved = moveRegularItem(*candidate, newHandle, + /* skipAddInMMContainer */ true, /* fromBgThread */ true); + if (moved) { + XDCHECK(candidate->getKey() == newHandle->getKey()); + if (markMoving) { + auto ref = candidate->unmarkMoving(); + XDCHECK_EQ(ref, 0); + wakeUpWaiters(candidate->getKey(), std::move(newHandle)); + const auto res = + releaseBackToAllocator(*candidate, RemoveContext::kNormal, false); + XDCHECK(res == ReleaseRes::kReleased); + } + } else { + typename NvmCacheT::PutToken token{}; + + removeFromMMContainer(*newAllocs[i]); + auto ret = handleFailedMove(candidate, token, false, markMoving); + XDCHECK(ret); + if (markMoving && candidate->getRefCountAndFlagsRaw() == 0) { + const auto res = + releaseBackToAllocator(*candidate, RemoveContext::kNormal, false); + XDCHECK(res == ReleaseRes::kReleased); + } + + } + } + return candidates; +} + +template <typename CacheTrait> +std::vector<typename CacheAllocator<CacheTrait>::EvictionData> +CacheAllocator<CacheTrait>::getNextCandidates(TierId tid, + PoolId pid, + ClassId cid, + unsigned int batch, + bool markMoving, + bool fromBgThread) { + + std::vector<void*> blankAllocs; + std::vector<Item*> newAllocs; + std::vector<WriteHandle> newHandles; + std::vector<EvictionData> evictionData; + evictionData.reserve(batch); + newAllocs.reserve(batch); + newHandles.reserve(batch); + + auto& mmContainer = getMMContainer(tid, pid, cid); + bool lastTier = tid+1 >= getNumTiers(); + unsigned int maxSearchTries = std::max(config_.evictionSearchTries, + batch*4); + if (!lastTier) { + blankAllocs = allocateInternalTierByCidBatch(tid+1, pid, cid, batch); + if (blankAllocs.empty()) { + return evictionData; + } else if (blankAllocs.size() != batch) { + batch = blankAllocs.size(); + } + XDCHECK_EQ(blankAllocs.size(), batch); + } + + auto iterateAndMark = [this, tid, pid, cid, batch, + markMoving, lastTier, maxSearchTries, + &evictionData, &mmContainer](auto&& itr) { + unsigned int searchTries = 0; + if (!itr) { + ++searchTries; + (*stats_.evictionAttempts)[tid][pid][cid].inc(); + return; + } + + while ((config_.evictionSearchTries == 0 || + maxSearchTries > searchTries) && + itr && evictionData.size() < batch) { + ++searchTries; + (*stats_.evictionAttempts)[tid][pid][cid].inc(); + + auto* toRecycle_ = itr.get(); + bool chainedItem_ = toRecycle_->isChainedItem(); + Item* toRecycleParent_ = chainedItem_ +
? &toRecycle_->asChainedItem().getParentItem(compressor_) + : nullptr; + if (toRecycle_->isExpired()) { + ++itr; + continue; + } + // in order to safely check if the expected parent (toRecycleParent_) matches + // the current parent on the chained item, we need to take the chained + // item lock so we are sure that nobody else will be editing the chain + auto l_ = chainedItem_ + ? chainedItemLocks_.tryLockExclusive(toRecycleParent_->getKey()) + : decltype(chainedItemLocks_.tryLockExclusive(toRecycle_->getKey()))(); + + if (chainedItem_ && + ( !l_ || &toRecycle_->asChainedItem().getParentItem(compressor_) + != toRecycleParent_) ) { + ++itr; + continue; + } + Item* candidate_; + WriteHandle candidateHandle_; + Item* syncItem_; + //sync on the parent item for chained items to move to next tier + if (!lastTier && chainedItem_) { + syncItem_ = toRecycleParent_; + candidate_ = toRecycle_; + } else if (lastTier && chainedItem_) { + candidate_ = toRecycleParent_; + syncItem_ = toRecycleParent_; + } else { + candidate_ = toRecycle_; + syncItem_ = toRecycle_; + } + // if it's last tier, the item will be evicted + // need to create put token before marking it exclusive + const bool evictToNvmCache = lastTier && shouldWriteToNvmCache(*candidate_); + + auto token_ = evictToNvmCache + ? nvmCache_->createPutToken(candidate_->getKey()) + : typename NvmCacheT::PutToken{}; + + if (evictToNvmCache && !token_.isValid()) { + stats_.evictFailConcurrentFill.inc(); + ++itr; + continue; + } + bool marked = false; + //case 1: mark the item for eviction + if ((lastTier || candidate_->isExpired()) && markMoving) { + marked = syncItem_->markForEviction(); + } else if (markMoving) { + marked = syncItem_->markMoving(); + } else if (!markMoving) { + //we use item handle as sync point - for background eviction + auto hdl = acquire(candidate_); + if (hdl && hdl->getRefCount() == 1) { + marked = true; + candidateHandle_ = std::move(hdl); + } + } + if (!marked) { + if (candidate_->hasChainedItem()) { + stats_.evictFailParentAC.inc(); + } else { + stats_.evictFailAC.inc(); + } + ++itr; + continue; + } + + if (chainedItem_) { + XDCHECK(l_); + XDCHECK_EQ(toRecycleParent_,&toRecycle_->asChainedItem().getParentItem(compressor_)); + } + mmContainer.remove(itr); + EvictionData ed(candidate_,toRecycle_,toRecycleParent_,chainedItem_, + candidate_->isExpired(), std::move(token_), std::move(candidateHandle_)); + evictionData.push_back(std::move(ed)); + } + }; + + mmContainer.withEvictionIterator(iterateAndMark); + + if (evictionData.size() < batch) { + if (!lastTier) { + unsigned int toErase = batch - evictionData.size(); + for (int i = 0; i < toErase; i++) { + allocator_[tid+1]->free(blankAllocs.back()); + blankAllocs.pop_back(); + } + } + if (evictionData.size() == 0) { + return evictionData; + } + } + + if (!lastTier) { + //1. 
get an item handle from a new allocation + for (int i = 0; i < evictionData.size(); i++) { + Item *candidate = evictionData[i].candidate; + WriteHandle newItemHdl = acquire(new (blankAllocs[i]) + Item(candidate->getKey(), candidate->getSize(), + candidate->getCreationTime(), candidate->getExpiryTime())); + XDCHECK(newItemHdl); + if (newItemHdl) { + newItemHdl.markNascent(); + (*stats_.fragmentationSize)[tid][pid][cid].add( + util::getFragmentation(*this, *newItemHdl)); + newAllocs.push_back(newItemHdl.getInternal()); + newHandles.push_back(std::move(newItemHdl)); + } else { + // failed to get item handle + throw std::runtime_error( + folly::sformat("Was not able to acquire new alloc, failed alloc {}", blankAllocs[i])); + } + } + //2. add in batch to mmContainer + auto& newMMContainer = getMMContainer(tid+1, pid, cid); + uint32_t added = newMMContainer.addBatch(newAllocs.begin(), newAllocs.end()); + XDCHECK_EQ(added, newAllocs.size()); + if (added != newAllocs.size()) { + throw std::runtime_error( + folly::sformat("Was not able to add all new items, failed item {} and handle {}", + newAllocs[added]->toString(), newHandles[added]->toString())); + } + //3. copy item data - don't need to add in mmContainer + for (int i = 0; i < evictionData.size(); i++) { + Item *candidate = evictionData[i].candidate; + WriteHandle newHandle = std::move(newHandles[i]); + bool moved = moveRegularItem(*candidate, newHandle, + /* skipAddInMMContainer */ true, /* fromBgThread */ true); + if (moved) { + (*stats_.numWritebacks)[tid][pid][cid].inc(); + XDCHECK(candidate->getKey() == newHandle->getKey()); + if (markMoving) { + auto ref = candidate->unmarkMoving(); + XDCHECK_EQ(ref, 0); + wakeUpWaiters(candidate->getKey(), std::move(newHandle)); + if (fromBgThread) { + const auto res = + releaseBackToAllocator(*candidate, RemoveContext::kNormal, false); + XDCHECK(res == ReleaseRes::kReleased); + } + } + } else { + typename NvmCacheT::PutToken token = std::move(evictionData[i].token); + removeFromMMContainer(*newAllocs[i]); + auto ret = handleFailedMove(candidate, token, evictionData[i].expired, markMoving); + XDCHECK(ret); + if (fromBgThread && markMoving) { + const auto res = + releaseBackToAllocator(*candidate, RemoveContext::kNormal, false); + XDCHECK(res == ReleaseRes::kReleased); + } + + } + } + } else { + // we are the last tier - just remove + for (int i = 0; i < evictionData.size(); i++) { + Item *candidate = evictionData[i].candidate; + typename NvmCacheT::PutToken token = std::move(evictionData[i].token); + auto ret = handleFailedMove(candidate, token, evictionData[i].expired, markMoving); + if (fromBgThread && markMoving) { + const auto res = + releaseBackToAllocator(*candidate, RemoveContext::kNormal, false); + XDCHECK(res == ReleaseRes::kReleased); + } + } + } + + return evictionData; +} + +// +// Common function in case move among tiers fails during eviction +// +// if insertOrReplace was called during move +// then candidate will not be accessible (failed replace during tryEvict) +// - therefore this was why we failed to +// evict to the next tier and insertOrReplace +// will remove from NVM cache +// however, if candidate is accessible +// that means the allocation in the next +// tier failed - so we will continue to +// evict the item to NVM cache +template <typename CacheTrait> +bool CacheAllocator<CacheTrait>::handleFailedMove(Item* candidate, + typename NvmCacheT::PutToken& token, + bool isExpired, + bool markMoving) { + bool failedToReplace = !candidate->isAccessible(); + if (!token.isValid() && !failedToReplace) { + token = createPutToken(*candidate); + } + // in case that we are on the last
tier, we should have already marked + // as exclusive since we will not be moving the item to the next tier + // but rather just evicting altogether, no need to + // markForEvictionWhenMoving + if (markMoving) { + if (!candidate->isMarkedForEviction() && + candidate->isMoving()) { + auto ret = (isExpired) ? true : candidate->markForEvictionWhenMoving(); + XDCHECK(ret); + } + unlinkItemForEviction(*candidate); + } else if (candidate->isAccessible()) { + accessContainer_->remove(*candidate); + } + + if (token.isValid() && shouldWriteToNvmCacheExclusive(*candidate) + && !failedToReplace) { + nvmCache_->put(*candidate, std::move(token)); + } + // wake up any readers that wait for the move to complete + // it's safe to do now, as we have the item marked exclusive and + // no other reader can be added to the waiters list + if (markMoving) { + wakeUpWaiters(candidate->getKey(), {}); + } + return true; +} + template <typename CacheTrait> std::pair<typename CacheAllocator<CacheTrait>::Item*, typename CacheAllocator<CacheTrait>::Item*> @@ -4149,7 +4582,7 @@ CacheAllocator<CacheTrait>::getNextCandidate(TierId tid, XDCHECK(candidate); auto evictedToNext = (lastTier || isExpired) ? nullptr - : tryEvictToNextMemoryTier(*candidate, false); + : tryEvictToNextMemoryTier(*candidate); if (!evictedToNext) { //failed to move a chained item - so evict the entire chain if (candidate->isChainedItem()) { @@ -4159,44 +4592,9 @@ CacheAllocator<CacheTrait>::getNextCandidate(TierId tid, candidate = toRecycleParent; //but now we evict the chain and in //doing so recycle the child } - //if insertOrReplace was called during move - //then candidate will not be accessible (failed replace during tryEvict) - // - therefore this was why we failed to - // evict to the next tier and insertOrReplace - // will remove from NVM cache - //however, if candidate is accessible - //that means the allocation in the next - //tier failed - so we will continue to - //evict the item to NVM cache - bool failedToReplace = !candidate->isAccessible(); - if (!token.isValid() && !failedToReplace) { - token = createPutToken(*candidate); - } - // tryEvictToNextMemoryTier can fail if: - // a) allocation of the new item fails in that case, - // it should be still possible to mark item for eviction. - // b) another thread calls insertOrReplace and the item - // is no longer accessible - // - // in case that we are on the last tier, we whould have already marked - // as exclusive since we will not be moving the item to the next tier - // but rather just evicting all together, no need to - // markForEvictionWhenMoving - auto ret = (lastTier || isExpired) ?
true : candidate->markForEvictionWhenMoving(); + // clean up and evict the candidate since we failed + auto ret = handleFailedMove(candidate, token, isExpired, true); XDCHECK(ret); - - unlinkItemForEviction(*candidate); - - // wake up any readers that wait for the move to complete - // it's safe to do now, as we have the item marked exclusive and - // no other reader can be added to the waiters list - wakeUpWaiters(candidate->getKey(), {}); - - if (token.isValid() && shouldWriteToNvmCacheExclusive(*candidate) - && !failedToReplace) { - nvmCache_->put(*candidate, std::move(token)); - } - } else { XDCHECK(!evictedToNext->isMarkedForEviction() && !evictedToNext->isMoving()); XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving()); @@ -4331,7 +4729,7 @@ bool CacheAllocator<CacheTrait>::shouldWriteToNvmCacheExclusive( template <typename CacheTrait> typename CacheAllocator<CacheTrait>::WriteHandle CacheAllocator<CacheTrait>::tryEvictToNextMemoryTier( - TierId tid, PoolId pid, Item& item, bool fromBgThread) { + TierId tid, PoolId pid, Item& item) { TierId nextTier = tid; // TODO - calculate this based on some admission policy while (++nextTier < getNumTiers()) { // try to evict down to the next memory tiers @@ -4359,14 +4757,14 @@ CacheAllocator<CacheTrait>::tryEvictToNextMemoryTier( item.getSize(), item.getCreationTime(), item.getExpiryTime(), - fromBgThread, evict); } if (newItemHdl) { bool moveSuccess = chainedItem ? moveChainedItem(item.asChainedItem(), newItemHdl) - : moveRegularItem(item, newItemHdl); + : moveRegularItem(item, newItemHdl, + /* skipAddInMMContainer */ false, /* fromBgThread */ false); if (!moveSuccess) { return WriteHandle{}; } @@ -4386,54 +4784,10 @@ CacheAllocator<CacheTrait>::tryEvictToNextMemoryTier( template <typename CacheTrait> typename CacheAllocator<CacheTrait>::WriteHandle -CacheAllocator<CacheTrait>::tryEvictToNextMemoryTier(Item& item, bool fromBgThread) { +CacheAllocator<CacheTrait>::tryEvictToNextMemoryTier(Item& item) { auto tid = getTierId(item); auto pid = allocator_[tid]->getAllocInfo(item.getMemory()).poolId; - return tryEvictToNextMemoryTier(tid, pid, item, fromBgThread); -} - -template <typename CacheTrait> -typename CacheAllocator<CacheTrait>::WriteHandle -CacheAllocator<CacheTrait>::tryPromoteToNextMemoryTier( - TierId tid, PoolId pid, Item& item, bool fromBgThread) { - if(item.isExpired()) { return {}; } - TierId nextTier = tid; - while (nextTier > 0) { // try to evict down to the next memory tiers - auto toPromoteTier = nextTier - 1; - --nextTier; - - // always evict item from the toPromoteTier to make room for new item - bool evict = true; - // allocateInternal might trigger another eviction - auto newItemHdl = allocateInternalTier(toPromoteTier, pid, - item.getKey(), - item.getSize(), - item.getCreationTime(), - item.getExpiryTime(), - fromBgThread, - true); - - if (newItemHdl) { - XDCHECK_EQ(newItemHdl->getSize(), item.getSize()); - if (!moveRegularItem(item, newItemHdl)) { - return WriteHandle{}; - } - item.unmarkMoving(); - return newItemHdl; - } else { - return WriteHandle{}; - } - } - - return {}; -} - -template <typename CacheTrait> -typename CacheAllocator<CacheTrait>::WriteHandle -CacheAllocator<CacheTrait>::tryPromoteToNextMemoryTier(Item& item, bool fromBgThread) { - auto tid = getTierId(item); - auto pid = allocator_[tid]->getAllocInfo(item.getMemory()).poolId; - return tryPromoteToNextMemoryTier(tid, pid, item, fromBgThread); + return tryEvictToNextMemoryTier(tid, pid, item); } template <typename CacheTrait> @@ -5680,7 +6034,7 @@ bool CacheAllocator<CacheTrait>::moveForSlabRelease(Item& oldItem) { // will send it back to the allocator bool isMoved = chainedItem ?
moveChainedItem(oldItem.asChainedItem(), newItemHdl) - : moveRegularItem(oldItem, newItemHdl); + : moveRegularItem(oldItem, newItemHdl, false, false); if (!isMoved) { return false; } @@ -5760,7 +6114,6 @@ CacheAllocator<CacheTrait>::allocateNewItemForOldItem(const Item& oldItem) { oldItem.getSize(), oldItem.getCreationTime(), oldItem.getExpiryTime(), - false, evict); if (!newItemHdl) { return {}; diff --git a/cachelib/allocator/MM2Q.h b/cachelib/allocator/MM2Q.h index 9c5ebce96b..710b5c597c 100644 --- a/cachelib/allocator/MM2Q.h +++ b/cachelib/allocator/MM2Q.h @@ -461,6 +461,18 @@ class MM2Q { // is unchanged. bool add(T& node) noexcept; + // helper function to add the node under the container lock + void addNodeLocked(T& node, const Time& currTime); + + // adds the given nodes into the container and marks each as being present in + // the container. The nodes are added to the head of the lru. + // + // @param begin,end the range of nodes to be added to the container. + // @return number of nodes added - it is up to the caller to verify that + // all expected nodes have been added. + template <typename It> + uint32_t addBatch(It begin, It end) noexcept; + // removes the node from the lru and sets it previous and next to nullptr. // // @param node The node to be removed from the container. @@ -895,16 +907,41 @@ bool MM2Q::Container<T, HookPtr>::add(T& node) noexcept { if (node.isInMMContainer()) { return false; } + addNodeLocked(node, currTime); + return true; + }); +} - markHot(node); - unmarkCold(node); - unmarkTail(node); - lru_.getList(LruType::Hot).linkAtHead(node); - rebalance(); +// adds the node to the list, assuming it is not already in the +// container and that the container lock is held +template <typename T, MM2Q::Hook<T> T::*HookPtr> +void MM2Q::Container<T, HookPtr>::addNodeLocked(T& node, const Time& currTime) { + XDCHECK(!node.isInMMContainer()); + markHot(node); + unmarkCold(node); + unmarkTail(node); + lru_.getList(LruType::Hot).linkAtHead(node); + rebalance(); + + node.markInMMContainer(); + setUpdateTime(node, currTime); +} - node.markInMMContainer(); - setUpdateTime(node, currTime); - return true; +template <typename T, MM2Q::Hook<T> T::*HookPtr> +template <typename It> +uint32_t MM2Q::Container<T, HookPtr>::addBatch(It begin, It end) noexcept { + const auto currTime = static_cast