diff --git a/.cirrus.yml b/.cirrus.yml index e63ef51cf1a..bf97471d393 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -78,8 +78,8 @@ task: HOME: /root CIRRUS_WORKING_DIR: /home/runc GO_VERSION: "1.23" - BATS_VERSION: "v1.9.0" - RPMS: gcc git iptables jq glibc-static libseccomp-devel make criu fuse-sshfs container-selinux + BATS_VERSION: "v1.11.0" + RPMS: cpio gcc git iptables jq qemu-kvm glibc-static libseccomp-devel make criu fuse-sshfs container-selinux # yamllint disable rule:key-duplicates matrix: DISTRO: almalinux-8 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 754fd87f155..8d7703d91c9 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -105,7 +105,7 @@ jobs: - name: install deps run: | sudo apt update - sudo apt -y install libseccomp-dev sshfs uidmap + sudo apt -y install cpio libseccomp-dev qemu-kvm sshfs uidmap - name: install CRIU if: ${{ matrix.criu == '' }} @@ -140,7 +140,7 @@ jobs: - name: Setup Bats and bats libs uses: bats-core/bats-action@3.0.0 with: - bats-version: 1.9.0 + bats-version: 1.11.0 support-install: false assert-install: false detik-install: false diff --git a/Dockerfile b/Dockerfile index f51f5956c85..bd21290ce82 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ ARG GO_VERSION=1.23 -ARG BATS_VERSION=v1.9.0 +ARG BATS_VERSION=v1.11.0 ARG LIBSECCOMP_VERSION=2.5.5 FROM golang:${GO_VERSION}-bookworm @@ -16,6 +16,7 @@ RUN KEYFILE=/usr/share/keyrings/criu-repo-keyring.gpg; \ criu \ gcc \ gcc-multilib \ + cpio \ curl \ gawk \ gperf \ @@ -24,6 +25,7 @@ RUN KEYFILE=/usr/share/keyrings/criu-repo-keyring.gpg; \ kmod \ pkg-config \ python3-minimal \ + qemu-kvm \ sshfs \ sudo \ uidmap \ diff --git a/Makefile b/Makefile index 0a15fd908ea..730b853ca1a 100644 --- a/Makefile +++ b/Makefile @@ -164,6 +164,7 @@ localunittest: all integration: runcimage $(CONTAINER_ENGINE) run $(CONTAINER_ENGINE_RUN_FLAGS) \ -t --privileged --rm \ + -v /boot:/boot:ro \ -v /lib/modules:/lib/modules:ro \ -v $(CURDIR):/go/src/$(PROJECT) \ $(RUNC_IMAGE) make localintegration TESTPATH="$(TESTPATH)" diff --git a/Vagrantfile.fedora b/Vagrantfile.fedora index 4863808ee51..018a10c879a 100644 --- a/Vagrantfile.fedora +++ b/Vagrantfile.fedora @@ -24,7 +24,7 @@ Vagrant.configure("2") do |config| cat << EOF | dnf -y --exclude=kernel,kernel-core shell && break config install_weak_deps false update -install iptables gcc golang-go make glibc-static libseccomp-devel bats jq git-core criu fuse-sshfs container-selinux +install cpio iptables gcc golang-go make qemu-kvm glibc-static libseccomp-devel bats jq git-core criu fuse-sshfs container-selinux ts run EOF done diff --git a/create.go b/create.go index 8ed59b2e75c..59827b34248 100644 --- a/create.go +++ b/create.go @@ -44,8 +44,9 @@ command(s) that get executed on start, edit the args parameter of the spec. See Usage: "specify the file to write the process id to", }, cli.BoolFlag{ - Name: "no-pivot", - Usage: "do not use pivot root to jail process inside rootfs. This should be used whenever the rootfs is on top of a ramdisk", + Name: "no-pivot", + Usage: "(deprecated; do not use)", + Hidden: true, }, cli.BoolFlag{ Name: "no-new-keyring", diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index f7cd95dd189..8a26726b7a7 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -202,10 +202,19 @@ func prepareRootfs(pipe *syncSocket, iConfig *initConfig) (err error) { return err } - if config.NoPivotRoot { - err = msMoveRoot(config.Rootfs) - } else if config.Namespaces.Contains(configs.NEWNS) { + if config.Namespaces.Contains(configs.NEWNS) { err = pivotRoot(config.Rootfs) + if config.NoPivotRoot { + logrus.Warnf("--no-pivot is deprecated and may be removed or silently ignored in a future version of runc -- see for more details") + if err != nil { + // Always try to do pivot_root(2) because it's safe, and only fallback + // to the unsafe MS_MOVE+chroot(2) dance if pivot_root(2) fails. + logrus.Warnf("your container failed to start with pivot_root(2) (%v) -- please open a bug report to let us know about your usecase", err) + err = msMoveRoot(config.Rootfs) + } else { + logrus.Warnf("despite setting --no-pivot, this container successfully started using pivot_root(2) -- consider removing the --no-pivot flag") + } + } } else { err = chroot() } @@ -1068,19 +1077,58 @@ func pivotRoot(rootfs string) error { } defer unix.Close(oldroot) //nolint: errcheck - newroot, err := unix.Open(rootfs, unix.O_DIRECTORY|unix.O_RDONLY, 0) - if err != nil { - return &os.PathError{Op: "open", Path: rootfs, Err: err} - } - defer unix.Close(newroot) //nolint: errcheck - // Change to the new root so that the pivot_root actually acts on it. - if err := unix.Fchdir(newroot); err != nil { - return &os.PathError{Op: "fchdir", Path: "fd " + strconv.Itoa(newroot), Err: err} + if err := os.Chdir(rootfs); err != nil { + return err } - if err := unix.PivotRoot(".", "."); err != nil { - return &os.PathError{Op: "pivot_root", Path: ".", Err: err} + pivotErr := unix.PivotRoot(".", ".") + if errors.Is(pivotErr, unix.EINVAL) { + // If pivot_root(2) failed with -EINVAL, one of the possible reasons is + // that we are in early boot and trying pivot_root on top of the + // initramfs (which isn't allowed because initramfs/rootfs doesn't have + // a parent mount). + // + // Traditionally, users were told to pass --no-pivot (which used chroot + // instead) but this is very insecure (even with the hardenings we've + // put into our chroot() wrapper). + // + // A much better solution is to create a bind-mount clone of / (which + // would have a parent) and then chroot into that clone so that we are + // properly rooted within a mount that has a parent mount. Then we can + // retry the pivot_root(). + + // Clone / on top of . to create a version of / that has a parent and + // so can be pivot-rooted. + if err := unix.Mount("/", ".", "", unix.MS_BIND|unix.MS_REC, ""); err != nil { + err := &os.PathError{Op: "make clone of / mount", Path: rootfs, Err: err} + return fmt.Errorf("error during fallback for failed pivot_root (%w): %w", pivotErr, err) + } + // Switch to the cloned mount. We have to use the full path here + // because we need to get the kernel to move us into the new mount + // (chdir(".") will keep us in the old non-cloned / mount). + if err := os.Chdir(rootfs); err != nil { + return fmt.Errorf("error during fallback for failed pivot_root (%w): switch to cloned mount: %w", pivotErr, err) + } + // Move the cloned mount to /. + if err := unix.Mount(".", "/", "", unix.MS_MOVE, ""); err != nil { + err := &os.PathError{Op: "move / clone mount to /", Path: rootfs, Err: err} + return fmt.Errorf("error during fallback for failed pivot_root (%w): %w", pivotErr, err) + } + // Update current->fs->root to be the cloned / (to be pivot_root'd). + if err := unix.Chroot("."); err != nil { + err := &os.PathError{Op: "chroot into cloned /", Path: rootfs, Err: err} + return fmt.Errorf("error during fallback for failed pivot_root (%w): %w", pivotErr, err) + } + + // Go back to the container rootfs and retry pivot_root. + if err := os.Chdir(rootfs); err != nil { + return fmt.Errorf("error during fallback for failed pivot_root (%w): %w", pivotErr, err) + } + pivotErr = unix.PivotRoot(".", ".") + } + if pivotErr != nil { + return &os.PathError{Op: "pivot_root", Path: rootfs, Err: pivotErr} } // Currently our "." is oldroot (according to the current kernel code). diff --git a/run.go b/run.go index b03b8129bd9..6348ad00cd6 100644 --- a/run.go +++ b/run.go @@ -57,8 +57,9 @@ command(s) that get executed on start, edit the args parameter of the spec. See Usage: "disable the use of the subreaper used to reap reparented processes", }, cli.BoolFlag{ - Name: "no-pivot", - Usage: "do not use pivot root to jail process inside rootfs. This should be used whenever the rootfs is on top of a ramdisk", + Name: "no-pivot", + Usage: "(deprecated; do not use)", + Hidden: true, }, cli.BoolFlag{ Name: "no-new-keyring", diff --git a/tests/integration/helpers.bash b/tests/integration/helpers.bash index 85f1fb0a558..726f61faf28 100755 --- a/tests/integration/helpers.bash +++ b/tests/integration/helpers.bash @@ -39,18 +39,54 @@ ARCH=$(uname -m) # Seccomp agent socket. SECCCOMP_AGENT_SOCKET="$BATS_TMPDIR/seccomp-agent.sock" -# Wrapper for runc. -function runc() { - run __runc "$@" +function sane_run() { # --pipe --timeout= + local getopt + getopt="$(getopt -o + --long pipe,timeout: -- "$@")" + eval set -- "$getopt" + + pipe= + timeout= + while true; do + case "$1" in + --pipe) + pipe=1 + shift + ;; + --timeout) + timeout="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + fail "unknown argument $1 ${2:-} to sane_run" + ;; + esac + done + cmd_prefix=() + [ -n "$pipe" ] && cmd_prefix+=("bats_pipe") + [ -n "$timeout" ] && cmd_prefix+=("timeout" "--foreground" "--signal=KILL" "$timeout") + + local cmd="$1" + shift + + run "${cmd_prefix[@]}" "$cmd" "$@" # Some debug information to make life easier. bats will only print it if the # test failed, in which case the output is useful. # shellcheck disable=SC2154 - echo "$(basename "$RUNC") $* (status=$status):" >&2 + echo "$cmd $* (status=$status):" >&2 # shellcheck disable=SC2154 echo "$output" >&2 } +# Wrapper for runc. +function runc() { + sane_run __runc "$@" +} + # Raw wrapper for runc. function __runc() { "$RUNC" ${RUNC_USE_SYSTEMD+--systemd-cgroup} \ diff --git a/tests/integration/initramfs.bats b/tests/integration/initramfs.bats new file mode 100644 index 00000000000..39332f541b1 --- /dev/null +++ b/tests/integration/initramfs.bats @@ -0,0 +1,167 @@ +#!/usr/bin/env bats + +load helpers + +# Rather than building our own kernel for use with qemu, just reuse the host's +# kernel since we just need some kernel that supports containers that we can +# use to run our custom initramfs. +function find_vmlinuz() { + shopt -s nullglob + local candidate candidates=( + /boot/vmlinuz + /boot/vmlinuz-"$(uname -r)"* + /usr/lib*/modules/"$(uname -r)"/vmlinuz* + ) + shopt -u nullglob + + for candidate in "${candidates[@]}"; do + [ -e "$candidate" ] || continue + export HOST_KERNEL="$candidate" + return 0 + done + + # Actuated doesn't provide a copy of the boot kernel, so we have to skip + # the test in that case. It also seems they don't allow aarch64 guests + # either (see ). + skip "could not find host vmlinuz kernel" +} + +function setup() { + INITRAMFS_ROOT="$(mktemp -d "$BATS_RUN_TMPDIR/runc-initramfs.XXXXXX")" + find_vmlinuz +} + +function teardown() { + [ -v INITRAMFS_ROOT ] && rm -rf "$INITRAMFS_ROOT" +} + +function qemu_native() { + # Different distributions put qemu-kvm in different locations and with + # different names. Debian and Ubuntu have a "kvm" binary, while AlmaLinux + # has /usr/libexec/qemu-kvm. + local qemu_binary="" qemu_candidates=("kvm" "qemu-kvm" "/usr/libexec/qemu-kvm") + local candidate + for candidate in "${qemu_candidates[@]}"; do + "$candidate" -help &>/dev/null || continue + qemu_binary="$candidate" + break + done + # TODO: Maybe we should also try to call qemu-system-FOO for the current + # architecture if qemu-kvm is missing? + [ -n "$qemu_binary" ] || skip "could not find qemu-kvm binary" + + local machine= + case "$(go env GOARCH)" in + 386 | amd64) + # Try to use a slightly newer PC CPU. + machine="pc" + ;; + arm | arm64) + # ARM doesn't provide a "default" machine value (because its use is so + # varied) so we have to specify the machine manually. + machine="virt" + ;; + *) + echo "could not figure out -machine argument for qemu -- using default" >&2 + ;; + esac + # We use -cpu max to ensure that the glibc we built runc with doesn't rely + # on CPU features that the default QEMU CPU doesn't support (such as on + # AlmaLinux 9). + local machine_args=("-cpu" "max") + [ -n "$machine" ] && machine_args+=("-machine" "$machine") + + sane_run --timeout=3m \ + "$qemu_binary" "${machine_args[@]}" "$@" + if [ "$status" -ne 0 ]; then + # To help with debugging, output the set of valid machine values. + "$qemu_binary" -machine help >&2 + fi +} + +@test "runc run [initramfs + pivot_root]" { + requires root + + # Configure our minimal initrd. + mkdir -p "$INITRAMFS_ROOT/initrd" + pushd "$INITRAMFS_ROOT/initrd" + + # Use busybox as a base for our initrd. + tar --exclude './dev/*' -xf "$BUSYBOX_IMAGE" + # Make sure that "sh" and "poweroff" are installed, otherwise qemu will + # boot loop when init stops. + [ -x ./bin/sh ] || skip "busybox image is missing /bin/sh" + [ -x ./bin/poweroff ] || skip "busybox image is missing /bin/poweroff" + + # Copy the runc binary into the container. In theory we would prefer to + # copy a static binary, but some distros (like openSUSE) don't ship + # libseccomp-static so requiring a static build for any integration test + # run wouldn't work. Instead, we copy all of the library dependencies into + # the rootfs (note that we also have to copy ld-linux-*.so because runc was + # probably built with a newer glibc than the one in our busybox image. + cp "$RUNC" ./bin/runc + readarray -t runclibs \ + <<<"$(ldd "$RUNC" | grep -Eo '/[^ ]*lib[^ ]*.so.[^ ]*')" + cp -vt ./lib64/ "${runclibs[@]}" + # busybox has /lib64 -> /lib so we can just fill in one path. + + # Create a container bundle using the same busybox image. + mkdir -p ./run/bundle + pushd ./run/bundle + mkdir -p rootfs + tar --exclude './dev/*' -C rootfs -xf "$BUSYBOX_IMAGE" + runc spec + update_config '.process.args = ["/bin/echo", "hello from inside the container"]' + popd + + # Build a custom /init script. + cat >./init <<-EOF + #!/bin/sh + + set -x + echo "==START INIT SCRIPT==" + + mkdir -p /proc + mount -t proc proc /proc + mkdir -p /sys + mount -t sysfs sysfs /sys + + mkdir -p /sys/fs/cgroup + mount -t cgroup2 cgroup2 /sys/fs/cgroup + + mkdir -p /tmp + mount -t tmpfs tmpfs /tmp + + mkdir -p /dev + mount -t devtmpfs devtmpfs /dev + mkdir -p /dev/pts + mount -t devpts -o newinstance devpts /dev/pts + mkdir -p /dev/shm + mount --bind /tmp /dev/shm + + # Wait for as little as possible if we panic so we can output the error + # log as part of the test failure before the test times out. + echo 1 >/proc/sys/kernel/panic + + runc run -b /run/bundle ctr + + echo "==END INIT SCRIPT==" + poweroff -f + EOF + chmod +x ./init + + find . | cpio -o -H newc >"$INITRAMFS_ROOT/initrd.cpio" + popd + + # Now we can just run the image (use qemu-kvm so that we run on the same + # architecture as the host system). We can just reuse the host kernel. + qemu_native \ + -initrd "$INITRAMFS_ROOT/initrd.cpio" \ + -kernel "$HOST_KERNEL" \ + -m 512M \ + -nographic -append console=ttyS0 -no-reboot + [ "$status" -eq 0 ] + [[ "$output" = *"==START INIT SCRIPT=="* ]] + [[ "$output" = *"hello from inside the container"* ]] + [[ "$output" = *"==END INIT SCRIPT=="* ]] +}