Skip to content

Commit 0432e3a

Browse files
committed
Use a minimal initrd to switch to the full initrd stored in /usr
The growth of binaries over time and the inclusion of new features filled the available boot partition space, so that the kernel+initrd almost couldn't fit twice anymore as required for updates. We employed workarounds such as wrapper scripts for ignition, afterburn and other binaries so that they are loaded from /usr. However, this was still not enough and we would have to do the same for (network) kernel modules and firmware. To avoid making this ever more complex, we can use a dedicated initrd focused on loading the full initrd from /usr, and then this full initrd can use dracut as before and even drop all the workarounds we accumulated. Introduce a busybox init script that prepares a minimal environment, has debug toggles and an emergency shell, and only loads the real initrd from /usr to switch over to it. Because mdev is not a proper udev replacement, some additional scripting is needed. Busybox's modprobe doesn't handle dependencies well and we need the real kmod for that (which is also good because it guarantees that the same modprobe options are set). Also, some other busybox commands are often lacking things such as loading a kernel module automatically, and this has to be done explicitly. We still set up dm-verity for /usr so that we have the same security properties (the code comes from the bootengine systemd generators we have and also covers the PXE boot with a squashfs /usr passed from an additional cpio). The real initrd then reuses the mount point for /usr, and loads any kernel modules and firmware that weren't loaded already. We also have to make the dependencies for parse-ip-for-networkd.service a bit more explicit because the removal of the /sysusr mount in the full initrd exposed a race condition. Signed-off-by: Kai Lueke <[email protected]>
1 parent daf43bf commit 0432e3a

File tree

5 files changed

+209
-1
lines changed

5 files changed

+209
-1
lines changed

dracut/03flatcar-network/parse-ip-for-networkd.service

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
Description=Write systemd-networkd units from cmdline
33
DefaultDependencies=false
44

5-
After=afterburn-network-kargs.service
5+
After=afterburn-network-kargs.service dracut-cmdline.service
66
PartOf=systemd-networkd.service
77
Before=systemd-networkd.service initrd-switch-root.target
88
# Switching the root filesystem terminates all running services with binaries from the initramfs, we need to finish before that happens

dracut/10diskless-generator/diskless-generator

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22
# -*- mode: shell-script; indent-tabs-mode: nil; sh-basic-offset: 4; -*-
33
# ex: ts=8 sw=4 sts=4 et filetype=sh
44

5+
# NOTE: The /usr.squashfs mounting for /sysusr is done in /minimal-init
6+
# (making the mount unit here a no-op), but the /sysroot mounting is not done there
7+
# and must still be done here; the same goes for the rootfs RAM setup
8+
59
set -e
610

711
UNIT_DIR="${1:-/tmp}"

dracut/10usr-generator/usr-generator

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010
# by systemd-fstab-generator. This module is only needed for old
1111
# bootloaders that pass usr=.
1212

13+
# NOTE: Now done in /minimal-init but since the "mount.usr" generator also runs,
14+
# it seems ok to also keep the "usr" generator even though the mount units are
15+
# a no-op
1316
set -e
1417

1518
UNIT_DIR="${1:-/tmp}"

dracut/10verity-generator/verity-generator

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@
44

55
# This script generates a service that manages a dm-verity device for the chosen USR partition
66

7+
# NOTE: The verity setup is now done in /minimal-init and this logic is not used:
8+
exit 0
9+
# (We could also delete this file but once most users have a large /boot partition we can also
10+
# come back to a state where we have one initrd)
11+
712
set -e
813

914
UNIT_DIR="${1:-/tmp}"

minimal-init

Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
#!/bin/sh
# Early setup of the minimal initrd: mount the kernel pseudo filesystems and
# make the busybox applets available before anything else runs.
set -u
# The applets are invoked through the "busybox" multi-call binary here; the
# per-applet symlinks are only installed by "busybox --install -s" below.
busybox mount -n -t proc proc /proc
busybox mount -n -t devtmpfs devtmpfs /dev
busybox mount -n -t sysfs sysfs /sys
busybox --install -s
# Create the mount points inside the fresh devtmpfs unless already present
[ -x "/dev/pts" ] || mkdir /dev/pts
[ -x "/dev/shm" ] || mkdir /dev/shm
busybox mount -n -t devpts devpts /dev/pts -o gid=5,mode=620,ptmxmode=000
10+
11+
# Print the value of a kernel command line argument.
# Arguments: $1 - argument name (the part of a word before its first "=")
#            $2 - optional fallback that is printed when no word matches
# Outputs:   the text after the first "=" of the last matching word; a bare
#            flag without "=" yields the flag name itself, so callers can
#            test for presence by comparing against the empty string
cmdline_arg() {
  local name="$1"
  local value="${2-}"
  local arg=
  # Feed the words one per line through a here-document instead of iterating
  # over an unquoted $(cat ...): that form glob-expands words such as "usr=*"
  # against the current directory. The here-document (rather than a pipe)
  # keeps the loop in the current shell so that "value" survives it.
  while IFS= read -r arg; do
    if [ "${arg%%=*}" = "${name}" ]; then
      value="${arg#*=}"
    fi
  done <<EOF
$(cat /proc/cmdline | tr -s ' \t\n' '\n\n\n')
EOF
  echo "${value}"
}
21+
22+
# Failure handler, registered below as the shell's ERR trap.
# Prints a hint about the debug toggles and then waits up to 60 seconds for
# Enter: without input the machine is rebooted, otherwise an interactive
# busybox shell is started and /init is re-executed once that shell exits.
emergency() {
  echo "ERROR: The early initrd has failed. To activate debug shell breakpoints, boot with rd.earlyshell in the kernel cmdline, and to activate tracing, boot with rd.earlytrace" >&2
  if ! read -s -p "Press Enter for emergency shell or wait 60 seconds for reboot." -t 60; then
    # Nobody answered in time
    echo >&2
    echo "INFO: Rebooting" >&2
    exec reboot -f
  else
    echo >&2
    echo "Entering emergency mode. Exit the shell to retry /init (you might need to clean up mounts first) or reboot with 'reboot -f'." >&2
    # The exit status of the interactive shell is irrelevant for the retry
    busybox sh || true
    exec /init
  fi
}
trap 'emergency' ERR
34+
35+
# Custom debug breakpoint
# Starts an interactive busybox shell when rd.earlyshell is given on the
# kernel command line and does nothing otherwise.
# Arguments: $* - free-form label of the breakpoint, shown in the message
debug_sh() {
  if [ -z "$(cmdline_arg rd.earlyshell)" ]; then
    return 0
  fi
  echo "INFO: Entering debug shell breakpoint ($*), exit to continue booting (reboot with 'reboot -f')">&2
  # Booting continues no matter how the shell was left
  busybox sh || true
}
42+
debug_sh 1/4: before mdev
# Command tracing is opt-in via rd.earlytrace and starts after the first
# breakpoint
if [ -n "$(cmdline_arg rd.earlytrace)" ]; then
  set -x
fi

mdev -d
mdev -s
# Coldplugging but with using /sbin/modprobe (which is kmod) instead of busybox's modprobe
# because busybox doesn't properly support the globs in modules.alias
# Failures are ignored on purpose: not every alias has a module in the initrd
find /sys/ -name modalias -print0 \
  | xargs -0 sort -u \
  | tr '\n' '\0' \
  | xargs -0 /sbin/modprobe -abq \
  || true
# Required to access disks, but not autoloaded:
modprobe sd_mod

debug_sh 2/4: before verity
56+
57+
# Resolve a device specification to a block device path, waiting for the
# device to show up if necessary.
# Arguments: $1 - LABEL=, UUID=, PARTUUID= or PARTLABEL= specification;
#                 anything else (a plain path or the empty string) is
#                 printed back unchanged without waiting
# Outputs:   "/dev/<name>" on stdout; progress and errors go to stderr
# Returns:   0 on success, 1 if the device did not appear within the number
#            of seconds given by rd.earlytimeout (default 300)
find_drive() {
  local search="$1"
  local ueventline=
  local blkidmatch=
  local drive=
  local waitingmsg=
  local starttime=
  local timeoutsecs=
  local now=
  case "${search}" in
    LABEL=*)
      blkidmatch="${search#LABEL=}"
      # Needs " around the value
      blkidmatch="LABEL=\"${blkidmatch}\""
      ;;
    UUID=*)
      blkidmatch="${search#UUID=}"
      # Needs " around the value
      blkidmatch="UUID=\"$(echo "${blkidmatch}" | tr "[:upper:]" "[:lower:]")\""
      ;;
    PARTUUID=*)
      ueventline="${search#PARTUUID=}"
      ueventline="PARTUUID=$(echo "${ueventline}" | tr "[:upper:]" "[:lower:]")"
      ;;
    PARTLABEL=*)
      # The kernel exposes the GPT partition label as PARTNAME in uevent
      ueventline="PARTNAME=${search#PARTLABEL=}"
      ;;
    *)
      echo "${search}"
      return
      ;;
  esac
  starttime=$(date +%s)
  # Default to 5 minutes
  timeoutsecs=$(cmdline_arg rd.earlytimeout 300)
  # A bare "rd.earlytimeout" or a non-numeric value would make the -gt test
  # below fail with an error and thereby disable the timeout altogether
  case "${timeoutsecs}" in
    "" | *[!0-9]*) timeoutsecs=300 ;;
  esac
  while [ "${drive}" = "" ]; do
    now=$(date +%s)
    # Timeout of 5 minutes for finding the device
    # NOTE: Only mdev -d runs at this point and the kernel also can spawn modprobe to load modules.
    # If problems arise, first make sure that required modules and their deps are actually in the initrd,
    # but if that's not enough we might even have to trigger the find /sys ... xargs coldplugging
    # here again every now and then? (Last resort would be to run proper udev, possibly without systemd.)
    if [ $((now - starttime)) -gt "${timeoutsecs}" ]; then
      echo "ERROR: Timeout waiting for drive: ${ueventline}${blkidmatch}" >&2
      return 1 # Throw error
    fi
    # No "sleep 0.1", so this is rather busy polling
    if [ "${ueventline}" != "" ]; then
      # -x -F: compare whole lines literally, so that PARTNAME=OEM does not
      # also hit PARTNAME=OEM-CONFIG and regex characters in labels are inert.
      # head: several devices may carry the same value, take the first only.
      drive="$({ grep -s -l -m 1 -r -x -F -e "${ueventline}" /sys/class/block/*/uevent || true; } | head -n 1 | cut -d / -f 5)"
    else
      # The leading space anchors the match to a whole KEY="value" field, so
      # that UUID="..." is not found inside PARTUUID="..." (same for LABEL)
      drive="$(blkid | { grep -m 1 -F -e " ${blkidmatch}" || true ; } | cut -d : -f 1 | cut -d / -f 3-)"
    fi
    if [ "${drive}" = "" ] && [ "${waitingmsg}" = "" ]; then
      # Announce the wait only once
      echo "Waiting for drive..." >&2
      waitingmsg=1
    fi
  done
  drive="/dev/${drive}"
  echo "${drive}"
}
117+
118+
# Ported code from the generators
# Opens the dm-verity device "usr" when both verity.usr and verity.usrhash
# are given on the kernel command line.
verityusr=$(cmdline_arg verity.usr)
usrhash=$(cmdline_arg verity.usrhash)

# Turn a LABEL=/UUID=/PARTUUID=/PARTLABEL= value into a device path
verityusr=$(find_drive "${verityusr}")

# Only proceed if the source is a path and we have sufficient parameters.
if [ -n "${usrhash}" ] && [ "${verityusr#/}" != "${verityusr}" ]; then
  # Hardcoded expected value from the image GPT layout
  # (data and hash device are the same partition, the hash tree starts at
  # the given offset)
  veritysetup --panic-on-corruption --hash-offset=1065345024 \
    open "${verityusr}" usr "${verityusr}" "${usrhash}"
  # If there's a hash mismatch during table initialization,
  # veritysetup reports it on stderr but still exits 0.
  # Manually check the target status and fail if invalid.
  status=$(dmsetup status usr | cut -d " " -f 4)
  if [ "${status}" != V ]; then
    echo "Verity setup failed" >&2
    false # Throw error
  fi
fi
137+
138+
# Work out what to mount as /usr. The mount.usr* arguments win over the
# usr* ones, which in turn win over the built-in defaults. The inner
# substitutions are quoted so that their values are passed on verbatim
# instead of being word-split and glob-expanded.
usr=$(cmdline_arg mount.usr "$(cmdline_arg usr)")
usrfstype=$(cmdline_arg mount.usrfstype "$(cmdline_arg usrfstype auto)")
usrflags=$(cmdline_arg mount.usrflags "$(cmdline_arg usrflags ro)")

usr=$(find_drive "${usr}")

if [ "${usr}" = "" ] && [ -f /usr.squashfs ]; then
  # No device given but a squashfs image is present in the initrd
  usr=/usr.squashfs
  usrfstype=squashfs
elif [ "${usrfstype}" = btrfs ] || [ "${usrfstype}" = auto ]; then
  case ",${usrflags}," in
    *,ro,*) usrflags="${usrflags},rescue=nologreplay" ;;
    *) : ;; # Don't set "norecovery" when mounting rw
  esac
fi
# Only proceed if the source is a path.
case "${usr}" in
  /*) : ;;
  *)
    echo "No mountable /usr partition given (usr='${usr}')" >&2
    false # Throw error
    ;;
esac
161+
162+
debug_sh 3/4: before /sysusr mount

echo "Mounting /usr from ${usr}" >&2
# mount -t auto only works if btrfs is already loaded
modprobe btrfs
mkdir -p /sysusr/usr
mount -t "${usrfstype}" -o "${usrflags}" "${usr}" /sysusr/usr

# Busybox doesn't load this for us
modprobe loop
# Attach the full initrd image from /usr read-only to a free loop device
LOOP=$(losetup -f)
losetup -r "${LOOP}" /sysusr/usr/lib/flatcar/bootengine.img
# -p: after a failed attempt the directories are still there when /init is
# retried from the emergency shell, and a plain mkdir would fail on them
mkdir -p /underlay /work
mount -t tmpfs tmpfs /work
mkdir -p /work/realinit /work/work
mount -t squashfs "${LOOP}" /underlay
mkdir -p /realinit
# Writable root for the full initrd: squashfs as lower layer, tmpfs on top
mount -t overlay -o rw,lowerdir=/underlay,upperdir=/work/realinit,workdir=/work/work overlay /realinit
# Hand the already mounted /usr over to the full initrd
mkdir -p /realinit/sysusr/usr
mount -o move /sysusr/usr /realinit/sysusr/usr
if [ "${usr}" = /usr.squashfs ]; then
  # /usr came from files in this initrd: keep /oem and the image itself
  # reachable after the switch by bind-mounting them into the new root
  mkdir -p /oem
  mkdir -p /realinit/oem
  mount -o bind /oem /realinit/oem
  # A file bind mount needs an existing file as its target
  touch /realinit/usr.squashfs
  mount -o bind /usr.squashfs /realinit/usr.squashfs
fi
debug_sh 4/4: before switch_root to /realinit
killall mdev || true
umount /proc
umount /sys
umount /dev/pts
# Lazy unmount because /dev/console is held by the current process
umount -l /dev
exec switch_root /realinit /init

0 commit comments

Comments
 (0)