diff --git a/.cirrus.yml b/.cirrus.yml
index e63ef51cf1a..bf97471d393 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -78,8 +78,8 @@ task:
     HOME: /root
     CIRRUS_WORKING_DIR: /home/runc
     GO_VERSION: "1.23"
-    BATS_VERSION: "v1.9.0"
-    RPMS: gcc git iptables jq glibc-static libseccomp-devel make criu fuse-sshfs container-selinux
+    BATS_VERSION: "v1.11.0"
+    RPMS: cpio gcc git iptables jq qemu-kvm glibc-static libseccomp-devel make criu fuse-sshfs container-selinux
     # yamllint disable rule:key-duplicates
     matrix:
       DISTRO: almalinux-8
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 754fd87f155..8d7703d91c9 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -105,7 +105,7 @@ jobs:
     - name: install deps
       run: |
         sudo apt update
-        sudo apt -y install libseccomp-dev sshfs uidmap
+        sudo apt -y install cpio libseccomp-dev qemu-kvm sshfs uidmap
 
     - name: install CRIU
       if: ${{ matrix.criu == '' }}
@@ -140,7 +140,7 @@ jobs:
     - name: Setup Bats and bats libs
       uses: bats-core/bats-action@3.0.0
       with:
-        bats-version: 1.9.0
+        bats-version: 1.11.0
         support-install: false
         assert-install: false
         detik-install: false
diff --git a/Dockerfile b/Dockerfile
index f51f5956c85..bd21290ce82 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,5 @@
 ARG GO_VERSION=1.23
-ARG BATS_VERSION=v1.9.0
+ARG BATS_VERSION=v1.11.0
 ARG LIBSECCOMP_VERSION=2.5.5
 
 FROM golang:${GO_VERSION}-bookworm
@@ -16,6 +16,7 @@ RUN KEYFILE=/usr/share/keyrings/criu-repo-keyring.gpg; \
         criu \
         gcc \
         gcc-multilib \
+        cpio \
         curl \
         gawk \
         gperf \
@@ -24,6 +25,7 @@ RUN KEYFILE=/usr/share/keyrings/criu-repo-keyring.gpg; \
         kmod \
         pkg-config \
         python3-minimal \
+        qemu-kvm \
         sshfs \
         sudo \
         uidmap \
diff --git a/Makefile b/Makefile
index 0a15fd908ea..730b853ca1a 100644
--- a/Makefile
+++ b/Makefile
@@ -164,6 +164,7 @@ localunittest: all
 integration: runcimage
 	$(CONTAINER_ENGINE) run $(CONTAINER_ENGINE_RUN_FLAGS) \
 		-t --privileged --rm \
+		-v /boot:/boot:ro \
 		-v /lib/modules:/lib/modules:ro \
 		-v $(CURDIR):/go/src/$(PROJECT) \
 		$(RUNC_IMAGE) make localintegration TESTPATH="$(TESTPATH)"
diff --git a/Vagrantfile.fedora b/Vagrantfile.fedora
index 4863808ee51..018a10c879a 100644
--- a/Vagrantfile.fedora
+++ b/Vagrantfile.fedora
@@ -24,7 +24,7 @@ Vagrant.configure("2") do |config|
       cat << EOF | dnf -y --exclude=kernel,kernel-core shell && break
 config install_weak_deps false
 update
-install iptables gcc golang-go make glibc-static libseccomp-devel bats jq git-core criu fuse-sshfs container-selinux
+install cpio iptables gcc golang-go make qemu-kvm glibc-static libseccomp-devel bats jq git-core criu fuse-sshfs container-selinux
 ts run
 EOF
     done
diff --git a/create.go b/create.go
index 8ed59b2e75c..59827b34248 100644
--- a/create.go
+++ b/create.go
@@ -44,8 +44,9 @@ command(s) that get executed on start, edit the args parameter of the spec. See
 			Usage: "specify the file to write the process id to",
 		},
 		cli.BoolFlag{
-			Name:  "no-pivot",
-			Usage: "do not use pivot root to jail process inside rootfs.  This should be used whenever the rootfs is on top of a ramdisk",
+			Name:   "no-pivot",
+			Usage:  "(deprecated; do not use)",
+			Hidden: true,
 		},
 		cli.BoolFlag{
 			Name:  "no-new-keyring",
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
index f7cd95dd189..8a26726b7a7 100644
--- a/libcontainer/rootfs_linux.go
+++ b/libcontainer/rootfs_linux.go
@@ -202,10 +202,19 @@ func prepareRootfs(pipe *syncSocket, iConfig *initConfig) (err error) {
 		return err
 	}
 
-	if config.NoPivotRoot {
-		err = msMoveRoot(config.Rootfs)
-	} else if config.Namespaces.Contains(configs.NEWNS) {
+	if config.Namespaces.Contains(configs.NEWNS) {
 		err = pivotRoot(config.Rootfs)
+		if config.NoPivotRoot {
+			logrus.Warnf("--no-pivot is deprecated and may be removed or silently ignored in a future version of runc -- see <https://github.com/opencontainers/runc/issues/4435> for more details")
+			if err != nil {
+				// Always try to do pivot_root(2) because it's safe, and only fallback
+				// to the unsafe MS_MOVE+chroot(2) dance if pivot_root(2) fails.
+				logrus.Warnf("your container failed to start with pivot_root(2) (%v) -- please open a bug report to let us know about your usecase", err)
+				err = msMoveRoot(config.Rootfs)
+			} else {
+				logrus.Warnf("despite setting --no-pivot, this container successfully started using pivot_root(2) -- consider removing the --no-pivot flag")
+			}
+		}
 	} else {
 		err = chroot()
 	}
@@ -1068,19 +1077,58 @@ func pivotRoot(rootfs string) error {
 	}
 	defer unix.Close(oldroot) //nolint: errcheck
 
-	newroot, err := unix.Open(rootfs, unix.O_DIRECTORY|unix.O_RDONLY, 0)
-	if err != nil {
-		return &os.PathError{Op: "open", Path: rootfs, Err: err}
-	}
-	defer unix.Close(newroot) //nolint: errcheck
-
 	// Change to the new root so that the pivot_root actually acts on it.
-	if err := unix.Fchdir(newroot); err != nil {
-		return &os.PathError{Op: "fchdir", Path: "fd " + strconv.Itoa(newroot), Err: err}
+	if err := os.Chdir(rootfs); err != nil {
+		return err
 	}
 
-	if err := unix.PivotRoot(".", "."); err != nil {
-		return &os.PathError{Op: "pivot_root", Path: ".", Err: err}
+	pivotErr := unix.PivotRoot(".", ".")
+	if errors.Is(pivotErr, unix.EINVAL) {
+		// If pivot_root(2) failed with -EINVAL, one of the possible reasons is
+		// that we are in early boot and trying pivot_root on top of the
+		// initramfs (which isn't allowed because initramfs/rootfs doesn't have
+		// a parent mount).
+		//
+		// Traditionally, users were told to pass --no-pivot (which used chroot
+		// instead) but this is very insecure (even with the hardenings we've
+		// put into our chroot() wrapper).
+		//
+		// A much better solution is to create a bind-mount clone of / (which
+		// would have a parent) and then chroot into that clone so that we are
+		// properly rooted within a mount that has a parent mount. Then we can
+		// retry the pivot_root().
+
+		// Clone / on top of . to create a version of / that has a parent and
+		// so can be pivot-rooted.
+		if err := unix.Mount("/", ".", "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
+			err := &os.PathError{Op: "make clone of / mount", Path: rootfs, Err: err}
+			return fmt.Errorf("error during fallback for failed pivot_root (%w): %w", pivotErr, err)
+		}
+		// Switch to the cloned mount. We have to use the full path here
+		// because we need to get the kernel to move us into the new mount
+		// (chdir(".") will keep us in the old non-cloned / mount).
+		if err := os.Chdir(rootfs); err != nil {
+			return fmt.Errorf("error during fallback for failed pivot_root (%w): switch to cloned mount: %w", pivotErr, err)
+		}
+		// Move the cloned mount to /.
+		if err := unix.Mount(".", "/", "", unix.MS_MOVE, ""); err != nil {
+			err := &os.PathError{Op: "move / clone mount to /", Path: rootfs, Err: err}
+			return fmt.Errorf("error during fallback for failed pivot_root (%w): %w", pivotErr, err)
+		}
+		// Update current->fs->root to be the cloned / (to be pivot_root'd).
+		if err := unix.Chroot("."); err != nil {
+			err := &os.PathError{Op: "chroot into cloned /", Path: rootfs, Err: err}
+			return fmt.Errorf("error during fallback for failed pivot_root (%w): %w", pivotErr, err)
+		}
+
+		// Go back to the container rootfs and retry pivot_root.
+		if err := os.Chdir(rootfs); err != nil {
+			return fmt.Errorf("error during fallback for failed pivot_root (%w): %w", pivotErr, err)
+		}
+		pivotErr = unix.PivotRoot(".", ".")
+	}
+	if pivotErr != nil {
+		return &os.PathError{Op: "pivot_root", Path: rootfs, Err: pivotErr}
 	}
 
 	// Currently our "." is oldroot (according to the current kernel code).
diff --git a/run.go b/run.go
index b03b8129bd9..6348ad00cd6 100644
--- a/run.go
+++ b/run.go
@@ -57,8 +57,9 @@ command(s) that get executed on start, edit the args parameter of the spec. See
 			Usage: "disable the use of the subreaper used to reap reparented processes",
 		},
 		cli.BoolFlag{
-			Name:  "no-pivot",
-			Usage: "do not use pivot root to jail process inside rootfs.  This should be used whenever the rootfs is on top of a ramdisk",
+			Name:   "no-pivot",
+			Usage:  "(deprecated; do not use)",
+			Hidden: true,
 		},
 		cli.BoolFlag{
 			Name:  "no-new-keyring",
diff --git a/tests/integration/helpers.bash b/tests/integration/helpers.bash
index 85f1fb0a558..726f61faf28 100755
--- a/tests/integration/helpers.bash
+++ b/tests/integration/helpers.bash
@@ -39,18 +39,54 @@ ARCH=$(uname -m)
 # Seccomp agent socket.
 SECCCOMP_AGENT_SOCKET="$BATS_TMPDIR/seccomp-agent.sock"
 
-# Wrapper for runc.
-function runc() {
-	run __runc "$@"
+function sane_run() { # --pipe --timeout=<timeout>
+	local getopt
+	getopt="$(getopt -o + --long pipe,timeout: -- "$@")"
+	eval set -- "$getopt"
+
+	pipe=
+	timeout=
+	while true; do
+		case "$1" in
+		--pipe)
+			pipe=1
+			shift
+			;;
+		--timeout)
+			timeout="$2"
+			shift 2
+			;;
+		--)
+			shift
+			break
+			;;
+		*)
+			fail "unknown argument $1 ${2:-} to sane_run"
+			;;
+		esac
+	done
+	cmd_prefix=()
+	[ -n "$pipe" ] && cmd_prefix+=("bats_pipe")
+	[ -n "$timeout" ] && cmd_prefix+=("timeout" "--foreground" "--signal=KILL" "$timeout")
+
+	local cmd="$1"
+	shift
+
+	run "${cmd_prefix[@]}" "$cmd" "$@"
 
 	# Some debug information to make life easier. bats will only print it if the
 	# test failed, in which case the output is useful.
 	# shellcheck disable=SC2154
-	echo "$(basename "$RUNC") $* (status=$status):" >&2
+	echo "$cmd $* (status=$status):" >&2
 	# shellcheck disable=SC2154
 	echo "$output" >&2
 }
 
+# Wrapper for runc.
+function runc() {
+	sane_run __runc "$@"
+}
+
 # Raw wrapper for runc.
 function __runc() {
 	"$RUNC" ${RUNC_USE_SYSTEMD+--systemd-cgroup} \
diff --git a/tests/integration/initramfs.bats b/tests/integration/initramfs.bats
new file mode 100644
index 00000000000..39332f541b1
--- /dev/null
+++ b/tests/integration/initramfs.bats
@@ -0,0 +1,167 @@
+#!/usr/bin/env bats
+
+load helpers
+
+# Rather than building our own kernel for use with qemu, just reuse the host's
+# kernel since we just need some kernel that supports containers that we can
+# use to run our custom initramfs.
+function find_vmlinuz() {
+	shopt -s nullglob
+	local candidate candidates=(
+		/boot/vmlinuz
+		/boot/vmlinuz-"$(uname -r)"*
+		/usr/lib*/modules/"$(uname -r)"/vmlinuz*
+	)
+	shopt -u nullglob
+
+	for candidate in "${candidates[@]}"; do
+		[ -e "$candidate" ] || continue
+		export HOST_KERNEL="$candidate"
+		return 0
+	done
+
+	# Actuated doesn't provide a copy of the boot kernel, so we have to skip
+	# the test in that case. It also seems they don't allow aarch64 guests
+	# either (see <https://docs.actuated.com/examples/kvm-guest/>).
+	skip "could not find host vmlinuz kernel"
+}
+
+function setup() {
+	INITRAMFS_ROOT="$(mktemp -d "$BATS_RUN_TMPDIR/runc-initramfs.XXXXXX")"
+	find_vmlinuz
+}
+
+function teardown() {
+	[ -v INITRAMFS_ROOT ] && rm -rf "$INITRAMFS_ROOT"
+}
+
+function qemu_native() {
+	# Different distributions put qemu-kvm in different locations and with
+	# different names. Debian and Ubuntu have a "kvm" binary, while AlmaLinux
+	# has /usr/libexec/qemu-kvm.
+	local qemu_binary="" qemu_candidates=("kvm" "qemu-kvm" "/usr/libexec/qemu-kvm")
+	local candidate
+	for candidate in "${qemu_candidates[@]}"; do
+		"$candidate" -help &>/dev/null || continue
+		qemu_binary="$candidate"
+		break
+	done
+	# TODO: Maybe we should also try to call qemu-system-FOO for the current
+	# architecture if qemu-kvm is missing?
+	[ -n "$qemu_binary" ] || skip "could not find qemu-kvm binary"
+
+	local machine=
+	case "$(go env GOARCH)" in
+	386 | amd64)
+		# Try to use a slightly newer PC CPU.
+		machine="pc"
+		;;
+	arm | arm64)
+		# ARM doesn't provide a "default" machine value (because its use is so
+		# varied) so we have to specify the machine manually.
+		machine="virt"
+		;;
+	*)
+		echo "could not figure out -machine argument for qemu -- using default" >&2
+		;;
+	esac
+	# We use -cpu max to ensure that the glibc we built runc with doesn't rely
+	# on CPU features that the default QEMU CPU doesn't support (such as on
+	# AlmaLinux 9).
+	local machine_args=("-cpu" "max")
+	[ -n "$machine" ] && machine_args+=("-machine" "$machine")
+
+	sane_run --timeout=3m \
+		"$qemu_binary" "${machine_args[@]}" "$@"
+	if [ "$status" -ne 0 ]; then
+		# To help with debugging, output the set of valid machine values.
+		"$qemu_binary" -machine help >&2
+	fi
+}
+
+@test "runc run [initramfs + pivot_root]" {
+	requires root
+
+	# Configure our minimal initrd.
+	mkdir -p "$INITRAMFS_ROOT/initrd"
+	pushd "$INITRAMFS_ROOT/initrd"
+
+	# Use busybox as a base for our initrd.
+	tar --exclude './dev/*' -xf "$BUSYBOX_IMAGE"
+	# Make sure that "sh" and "poweroff" are installed, otherwise qemu will
+	# boot loop when init stops.
+	[ -x ./bin/sh ] || skip "busybox image is missing /bin/sh"
+	[ -x ./bin/poweroff ] || skip "busybox image is missing /bin/poweroff"
+
+	# Copy the runc binary into the container. In theory we would prefer to
+	# copy a static binary, but some distros (like openSUSE) don't ship
+	# libseccomp-static so requiring a static build for any integration test
+	# run wouldn't work. Instead, we copy all of the library dependencies into
+	# the rootfs (note that we also have to copy ld-linux-*.so because runc was
+	# probably built with a newer glibc than the one in our busybox image.
+	cp "$RUNC" ./bin/runc
+	readarray -t runclibs \
+		<<<"$(ldd "$RUNC" | grep -Eo '/[^ ]*lib[^ ]*.so.[^ ]*')"
+	cp -vt ./lib64/ "${runclibs[@]}"
+	# busybox has /lib64 -> /lib so we can just fill in one path.
+
+	# Create a container bundle using the same busybox image.
+	mkdir -p ./run/bundle
+	pushd ./run/bundle
+	mkdir -p rootfs
+	tar --exclude './dev/*' -C rootfs -xf "$BUSYBOX_IMAGE"
+	runc spec
+	update_config '.process.args = ["/bin/echo", "hello from inside the container"]'
+	popd
+
+	# Build a custom /init script.
+	cat >./init <<-EOF
+		#!/bin/sh
+
+		set -x
+		echo "==START INIT SCRIPT=="
+
+		mkdir -p /proc
+		mount -t proc proc /proc
+		mkdir -p /sys
+		mount -t sysfs sysfs /sys
+
+		mkdir -p /sys/fs/cgroup
+		mount -t cgroup2 cgroup2 /sys/fs/cgroup
+
+		mkdir -p /tmp
+		mount -t tmpfs tmpfs /tmp
+
+		mkdir -p /dev
+		mount -t devtmpfs devtmpfs /dev
+		mkdir -p /dev/pts
+		mount -t devpts -o newinstance devpts /dev/pts
+		mkdir -p /dev/shm
+		mount --bind /tmp /dev/shm
+
+		# Wait for as little as possible if we panic so we can output the error
+		# log as part of the test failure before the test times out.
+		echo 1 >/proc/sys/kernel/panic
+
+		runc run -b /run/bundle ctr
+
+		echo "==END INIT SCRIPT=="
+		poweroff -f
+	EOF
+	chmod +x ./init
+
+	find . | cpio -o -H newc >"$INITRAMFS_ROOT/initrd.cpio"
+	popd
+
+	# Now we can just run the image (use qemu-kvm so that we run on the same
+	# architecture as the host system). We can just reuse the host kernel.
+	qemu_native \
+		-initrd "$INITRAMFS_ROOT/initrd.cpio" \
+		-kernel "$HOST_KERNEL" \
+		-m 512M \
+		-nographic -append console=ttyS0 -no-reboot
+	[ "$status" -eq 0 ]
+	[[ "$output" = *"==START INIT SCRIPT=="* ]]
+	[[ "$output" = *"hello from inside the container"* ]]
+	[[ "$output" = *"==END INIT SCRIPT=="* ]]
+}