Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 44 additions & 11 deletions base/self-update/default.nix
Original file line number Diff line number Diff line change
@@ -1,8 +1,28 @@
{ config, pkgs, lib, ... }:
let
cfg = config.playos.selfUpdate;

# Propagate status.ini updates to legacy systems.
#
# Registered as RAUC's `post-install` handler in the [handlers] section of
# the RAUC system configuration. It mirrors /var/lib/rauc/status.ini to
# /boot/status.ini so that a slot which still reads its state from /boot
# sees the result of the installation.
postInstallHandler = pkgs.writeShellApplication {
  name = "post-install";
  runtimeInputs = with pkgs; [ coreutils ];
  text = ''
    tmpfile=$(mktemp)
    # Single quotes defer expansion until the trap fires; the inner double
    # quotes keep the path a single word even if TMPDIR contains spaces.
    trap 'rm -f -- "$tmpfile"' EXIT

    cp -av /var/lib/rauc/status.ini "$tmpfile"
    # Ensure /boot/status.ini is always older than /var/lib/rauc/status.ini,
    # so that statusfile-recovery.service never treats the /boot copy as the
    # newer one and overwrites /var/lib/rauc/status.ini with it. The
    # timestamp is reset on the temporary copy because `cp -a` preserves it.
    touch -a -m --date=@0 "$tmpfile"
    cp -av "$tmpfile" /boot/status.ini
    sync
  '';
};
in
{
imports = [ ../volatile-root.nix ];

options = {
playos.selfUpdate = with lib; {
enable = mkEnableOption "Online self update";
Expand All @@ -21,13 +41,20 @@ in

services.dbus.packages = with pkgs; [ rauc ];

playos.storage.persistentFolders."/var/lib/rauc" = {
mode = "0700";
user = "root";
group = "root";
};

# systemd unit that runs the RAUC daemon (`rauc service`) as root and is
# pulled in by multi-user.target.
systemd.services.rauc = {
description = "RAUC Update Service";
serviceConfig = {
# D-Bus service type; the daemon is expected to own the bus name below.
Type = "dbus";
BusName= "de.pengutronix.rauc";
ExecStart = "${pkgs.rauc}/bin/rauc service";
User = "root";
# NOTE(review): presumably makes systemd provide /var/lib/rauc, the
# directory that holds RAUC's statusfile - confirm against systemd.exec(5).
StateDirectory = "rauc";
};
wantedBy = [ "multi-user.target" ];
};
Expand All @@ -38,7 +65,7 @@ in
compatible=dividat-play-computer
bootloader=grub
grubenv=/boot/grub/grubenv
statusfile=/boot/status.ini
statusfile=/var/lib/rauc/status.ini

[keyring]
path=cert.pem
Expand All @@ -52,30 +79,36 @@ in
device=/dev/disk/by-label/system.b
type=ext4
bootname=b

[handlers]
post-install=${lib.getExe postInstallHandler}
'';
};

environment.etc."rauc/cert.pem" = {
source = cfg.updateCert;
};

# This service adjusts for a known weakness of the update mechanism that is due to the
# use of the `/boot` partition for storing RAUC's statusfile. The `/boot` partition
# was chosen to use FAT32 in order to use it as EFI system partition. FAT32 has no
# journaling and so the atomicity guarantees RAUC tries to give for statusfile updates
# are diminished. This service looks for leftovers from interrupted statusfile updates
# and tries to recover.
# Note that as previous installations will keep their boot partition unchanged even
# after system updates, this or a similar recovery mechanism would be required even if
# we change partition layout for new systems going forward.

# When one of the RAUC slots is legacy (meaning, RAUC state is persisted in
# /boot/status.ini), we need to copy it over to /var/lib/rauc in case it is
# newer than /var/lib/rauc/status.ini or if /var/lib/rauc/status.ini is
# missing
#
# Before comparing the modification times, we deal with the lack of
# journalling/atomic writes on FAT32, by attempting to recover a partially
# written /boot/status.ini.
systemd.services.statusfile-recovery = {
description = "status.ini recovery";
serviceConfig.ExecStart = "${pkgs.bash}/bin/bash ${./recover-from-tmpfile} /boot/status.ini";
# TODO: use pkgs.writeShellApplication
serviceConfig.ExecStart = "${pkgs.bash}/bin/bash ${./recover-from-tmpfile} /boot/status.ini /var/lib/rauc/status.ini";
serviceConfig.Type = "oneshot";
serviceConfig.User = "root";
serviceConfig.StandardOutput = "syslog";
serviceConfig.SyslogIdentifier = "statusfile-recovery";
serviceConfig.RemainAfterExit = true;
after = [ "local-fs.target" ];
before = [ "rauc.service" ];
wantedBy = [ "multi-user.target" ];
};

Expand Down
71 changes: 61 additions & 10 deletions base/self-update/recover-from-tmpfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#! /usr/bin/env bash
# TODO: rename this to "status-file-sync-and-recovery" and update description

# This script is designed to recover files written with glib's
# g_file_set_contents function. On journaling filesystems this function
Expand All @@ -9,38 +10,88 @@
# temporary file. This is the situation this script is designed to
# detect and recover from, by repeating the moving from temp file to
# target.
#
# TODO: it might also be possible to recover from /boot/FSCKXXXX.rec
# produced after systemd-fsck runs

FILE="$1"
SOURCE_FILE="$1" # e.g. /boot/status.ini

TARGET_FILE="$2" # e.g. /var/lib/rauc/status.ini

# Step 1: Check if there's anything that can be used to recover $SOURCE_FILE,
# set $SOURCE_FILE_FINAL_CANDIDATE to either original SOURCE_FILE or recoverable
# version if they are viable. If not, leave empty.

# Get absolute path for given file name
TARGET="$(realpath --no-symlinks "$FILE")"
SOURCE_FILE_CURRENT="$(realpath --no-symlinks "$SOURCE_FILE")"

if ! [ -s "$TARGET" ]; then
SOURCE_FILE_FINAL_CANDIDATE=

if ! [ -s "$SOURCE_FILE_CURRENT" ]; then
# We expect a random alnum suffix of "up to" 7 characters
# (https://docs.gtk.org/glib/func.file_set_contents_full.html).
# The ones actually observed were 6 characters long, and we want to
# ignore files that don't seem likely to be tempfile copies.
TMP_SUFFIX="\.\w{5,7}"

PARENT="$(dirname "$TARGET")"
PARENT="$(dirname "$SOURCE_FILE_CURRENT")"
# List temp files based off of the target's name, with newer files first
CANDIDATES=($(ls -t --time=birth -d "$PARENT/"* | grep -E "$TARGET$TMP_SUFFIX"))
CANDIDATES=($(ls -t --time=birth -d "$PARENT/"* | grep -E "$SOURCE_FILE_CURRENT$TMP_SUFFIX"))
GREP_EXIT="$?"

if [ "$GREP_EXIT" -eq 0 ] && [ "${#CANDIDATES[@]}" -ge 1 ]; then
# Use the first, i.e. newest alternative as replacement
REPLACEMENT="${CANDIDATES[0]}"
if [ -s "$REPLACEMENT" ]; then
mv "$REPLACEMENT" "$FILE"
echo "Detected missing or empty '$FILE' and replaced it with '$REPLACEMENT'."
echo "Candidate $REPLACEMENT is not empty and can be used for recovery"
SOURCE_FILE_FINAL_CANDIDATE=$REPLACEMENT
else
# If the newest alternative is empty, we do not know what to do.
# Do not touch any evidence and abort.
echo "Both '$FILE' and recovery candidate '$REPLACEMENT' are empty. Aborting."
echo "Both '$SOURCE_FILE' and recovery candidate '$REPLACEMENT' are empty. No candidate."
fi
else
echo "The file '$FILE' seems empty, but no alternatives were found. Aborting."
echo "The file '$SOURCE_FILE' seems empty, but no alternatives were found. No candidate."
fi
else
echo "The file '$FILE' seems OK. Nothing to do."
echo "The file '$SOURCE_FILE' seems OK."
SOURCE_FILE_FINAL_CANDIDATE="$SOURCE_FILE"
fi

# Step 2: Sync-up $SOURCE_FILE and $TARGET_FILE
#
# Inputs:
#   SOURCE_FILE                  - first script argument (e.g. /boot/status.ini)
#   TARGET_FILE                  - second script argument (e.g. /var/lib/rauc/status.ini)
#   SOURCE_FILE_FINAL_CANDIDATE  - result of step 1: a usable version of
#                                  SOURCE_FILE (either SOURCE_FILE itself or a
#                                  leftover temp file), or empty if none exists
#
# Outcome: whichever of the two files is newer wins and is copied over the
# other. Whenever SOURCE_FILE is written from TARGET_FILE, its timestamps are
# reset to the epoch so that it never compares as newer than TARGET_FILE on
# the next run.
#
# All expansions are quoted so that paths containing whitespace or glob
# characters are handled as a single argument.

if [[ -z "$SOURCE_FILE_FINAL_CANDIDATE" ]] && ! [[ -s "$TARGET_FILE" ]]; then
    echo "No SOURCE_FILE_FINAL_CANDIDATE and empty TARGET_FILE, aborting."
elif [[ -z "$SOURCE_FILE_FINAL_CANDIDATE" ]]; then
    # The first branch did not match, so TARGET_FILE is known to be non-empty.
    echo "No SOURCE_FILE_FINAL_CANDIDATE, but $TARGET_FILE is non-empty. Setting $SOURCE_FILE to $TARGET_FILE"
    cp -av -- "$TARGET_FILE" "$SOURCE_FILE"
    touch -a -m --date=@0 -- "$SOURCE_FILE"
else
    # An existing-but-empty TARGET_FILE must never win the comparison: `-nt`
    # only looks at timestamps, so without this check a freshly truncated
    # target would be copied over a valid source.
    if [[ -s "$TARGET_FILE" ]]; then
        TARGET_FILE_USABLE=1
        echo "Final SOURCE_FILE_FINAL_CANDIDATE is $SOURCE_FILE_FINAL_CANDIDATE, $TARGET_FILE is non-empty."
    else
        TARGET_FILE_USABLE=
        echo "Final SOURCE_FILE_FINAL_CANDIDATE is $SOURCE_FILE_FINAL_CANDIDATE, $TARGET_FILE is missing or empty."
    fi

    echo "Checking which file is latest"
    if [[ -z "$TARGET_FILE_USABLE" ]] || [[ "$SOURCE_FILE_FINAL_CANDIDATE" -nt "$TARGET_FILE" ]]; then
        echo "$SOURCE_FILE_FINAL_CANDIDATE is newer than $TARGET_FILE"

        # The candidate is a leftover temp file: finish the interrupted
        # rename before propagating it.
        if [[ "$SOURCE_FILE" != "$SOURCE_FILE_FINAL_CANDIDATE" ]]; then
            echo "Recovering $SOURCE_FILE using $SOURCE_FILE_FINAL_CANDIDATE"
            mv -v -- "$SOURCE_FILE_FINAL_CANDIDATE" "$SOURCE_FILE"
        fi

        echo "Updating $TARGET_FILE using $SOURCE_FILE"
        cp -av -- "$SOURCE_FILE" "$TARGET_FILE"

    else
        echo "$TARGET_FILE is newer than $SOURCE_FILE_FINAL_CANDIDATE."

        echo "Updating $SOURCE_FILE using $TARGET_FILE"
        cp -av -- "$TARGET_FILE" "$SOURCE_FILE"
        touch -a -m --date=@0 -- "$SOURCE_FILE"

        # not strictly necessary, but good to cleanup leftovers
        if [[ "$SOURCE_FILE" != "$SOURCE_FILE_FINAL_CANDIDATE" ]]; then
            echo "Removing outdated $SOURCE_FILE_FINAL_CANDIDATE"
            rm -v -- "$SOURCE_FILE_FINAL_CANDIDATE"
        fi
    fi
fi
14 changes: 14 additions & 0 deletions base/volatile-root.nix
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,22 @@ with lib;

# === Wipe persistent data if magicWipeFile is present
if [ -f "$tmpBootMountPoint/${magicWipeFile}" ]; then

  # TODO: also update bootloader/rescue/, share code
  # Backup rauc state so that it survives the re-format below. The backup is
  # only taken when the data partition can be mounted and actually contains
  # a rauc directory; otherwise there is nothing to preserve and the wipe
  # proceeds regardless.
  mkdir -p /tmp/data
  if mount -t ext4 ${cfgPart.device} /tmp/data; then
    if [ -d /tmp/data/rauc ]; then
      cp -av /tmp/data/rauc /tmp/rauc
    else
      echo "No rauc state found on data partition, nothing to back up."
    fi
    umount /tmp/data
  else
    echo "Could not mount data partition, skipping rauc state backup."
  fi

  # fstype and label hard-coded, same as in install and rescue scripts
  ${pkgs.e2fsprogs}/bin/mkfs.ext4 -L data ${cfgPart.device}

  # Restore rauc state, if a backup was taken above
  if [ -d /tmp/rauc ]; then
    mount -t ext4 ${cfgPart.device} /tmp/data
    cp -av /tmp/rauc /tmp/data/rauc
    umount /tmp/data
  fi

  rm -f "$tmpBootMountPoint/${magicWipeFile}"
fi

Expand Down
2 changes: 2 additions & 0 deletions testing/integration/rauc-statusfile-recovery.nix
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ pkgs.testers.runNixOSTest {
config = {
# We need a /boot partition for our test
virtualisation.useBootLoader = true;
# TODO: copy over the setup from integration/factory-reset.nix and
# rework/expand this test

playos.selfUpdate = {
enable = true;
Expand Down
55 changes: 44 additions & 11 deletions testing/release-validation.nix
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,13 @@ in
# This override is not needed if application.version is "already" newer
# than base
versionOverride = nextSystemVersion;
# TODO: set users.root.initialHashedPassword = "" to allow debugging post-update
}).components.unsignedRaucBundle,
}:
let
overlayPath = "/tmp/release-validation-disk.img";

versionWithFixedStatusIniRecovery = "2025.3.2"; # TODO: specify next version
in
with pkgs.lib;
pkgs.testers.runNixOSTest {
Expand Down Expand Up @@ -297,27 +300,46 @@ with TestCase("controller starts downloading the bundle") as t:

wait_until_passes(t_check, retries=30, sleep=1)

with TestCase("controller has downloaded and installed the bundle") as t:
def t_check():
playos.send_key("ctrl-r")
time.sleep(2)
navigate_to_system_status()
screen_text = screenshot_and_ocr(playos)
print(f"Current sreen text: {screen_text}")
# return early if there is an error

def check_for_text_in_status_page(text, ignore_errors=False):
    """Open the system status page and assert that `text` is shown on it.

    Sends ctrl-r to the VM, waits two seconds, navigates to the system
    status page and runs OCR on a screenshot of it.

    Unless `ignore_errors` is set, the OCR text is returned as soon as it
    contains one of the known update error states, without asserting
    anything, so that a retrying caller stops early and can report the
    failure. Otherwise the function asserts that `text` is present and
    returns None.

    The assertion goes through the module-level `t`, i.e. whichever
    `with TestCase(...) as t` block is active when this function is called.
    """
    playos.send_key("ctrl-r")
    time.sleep(2)
    navigate_to_system_status()
    ocr_text = screenshot_and_ocr(playos)
    print(f"Current sreen text: {ocr_text}")

    # return early if there is an error
    if not ignore_errors:
        for error_state in ("ErrorDownloading", "ErrorInstalling", "UpdateError"):
            if error_state in ocr_text:
                return ocr_text

    t.assertIn(text, ocr_text)


with TestCase("controller has downloaded and installed the bundle") as t:
# controller takes at least 2 minutes for the download
# (1.2GB @ 10 MB/s), so allow up to 5 minutes for the download+install
screen_text = wait_until_passes(t_check, retries=30, sleep=10)
screen_text = wait_until_passes(
lambda: check_for_text_in_status_page("RebootRequired"),
retries=30, sleep=10)
if screen_text is not None:
t.fail(f"Update process failed with an error, last screen text: {screen_text}")


# Reboot to new system

def version_key(version):
    """Turn a dotted version string into a tuple of ints for comparison.

    Only the leading dotted-numeric part is considered, so "2025.10.0" maps
    to (2025, 10, 0) and any non-numeric suffix is ignored. A plain string
    comparison would order "2025.10.0" before "2025.3.2". Strings without a
    numeric prefix map to the empty tuple, which sorts before every real
    version and is therefore treated as legacy.
    """
    import re

    numeric_prefix = re.match(r"\d+(?:\.\d+)*", version)
    if numeric_prefix is None:
        return ()
    return tuple(int(part) for part in numeric_prefix.group(0).split("."))


# For legacy systems, try to ensure fsync of /boot/status.ini before reboot
if version_key("${baseSystemVersion}") < version_key("${versionWithFixedStatusIniRecovery}"):
    print("Attempting to trigger fsync of /boot/status.ini via partial shut-down")
    time.sleep(30) # opportunistically wait for a sync

    # simulate a Power key long-press to initiate a clean shutdown
    long_press_duration_seconds = 5.5 # empirically determined :-)
    playos.send_monitor_command(f"sendkey power {round(long_press_duration_seconds*1000)}")
    # let the VM partially shut down, but not all the way, otherwise
    # system_reset will not work
    time.sleep(long_press_duration_seconds + 0.1)

# Hard reset of the VM; performed for legacy and non-legacy systems alike.
playos.send_monitor_command("system_reset")

with TestCase("kiosk is open with kiosk URL after reboot") as t:
Expand All @@ -334,5 +356,16 @@ with TestCase("controller GUI with new version is visible") as t:
wait_until_passes(
lambda: t.assertIn("${nextSystemVersion}", screenshot_and_ocr(playos))
)

with TestCase("The new booted version reaches a Good state") as t:
wait_until_passes(
# UpdateError possible initially, because DHCP has not completed
lambda: check_for_text_in_status_page("Good", ignore_errors=True),
retries=10, sleep=10)

with TestCase("Update state is UpToDate") as t:
wait_until_passes(
lambda: check_for_text_in_status_page("UpToDate", ignore_errors=True),
retries=3, sleep=10)
'';
}
Loading