nodepool/tools/functional-test-check.sh
Clark Boylan 5702331087 Move nodepool functests to podman
Now that nodepool images are on quay.io we don't get speculative
container image testing with docker. The reason for this is that
docker only knows how to look up mirrors for images hosted on
docker.io, which speculative container image testing relies on. Since
the images are hosted on quay.io instead of docker.io, we lose this
functionality.

Address this by switching to podman and podman-compose, which do
understand how to fetch images via mirrors from any location.

Depends-On: https://review.opendev.org/c/zuul/zuul/+/687135
Change-Id: I1a510a9b68a2f01098f3c099a129d6d268b422d9
2023-05-30 15:35:47 -07:00
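
For context, podman's mirror behaviour is driven by
containers-registries.conf rather than by the registry hostname. A
minimal sketch of pointing quay.io at a mirror might look like the
following; the mirror hostname and port are illustrative only, not the
actual buildset registry values used in CI:

sudo tee -a /etc/containers/registries.conf <<'EOF'
[[registry]]
prefix = "quay.io"
location = "quay.io"

[[registry.mirror]]
location = "buildset-registry.example.org:5000"
EOF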


#!/bin/bash -ex
LOGDIR=/home/zuul/zuul-output/logs
# Set to indicate an error return
RETURN=0
FAILURE_REASON=""
if [[ ${NODEPOOL_FUNCTIONAL_CHECK:-} == "installed" ]]; then
    NODEPOOL_INSTALL=${NODEPOOL_INSTALL:-~/.venv}
    NODEPOOL_CONFIG=${NODEPOOL_CONFIG:-/etc/nodepool/nodepool.yaml}
    NODEPOOL="$NODEPOOL_INSTALL/bin/nodepool -c $NODEPOOL_CONFIG"
elif [[ ${NODEPOOL_FUNCTIONAL_CHECK:-} == "containers" ]]; then
    NODEPOOL="sudo podman exec nodepool_nodepool-launcher_1 nodepool"
else
    echo "Running in unknown environment!"
    exit 1
fi
cat > /tmp/ssh_wrapper <<EOF
#!/bin/bash -ex
sudo -H -u zuul ssh -o StrictHostKeyChecking=no -i $HOME/.ssh/id_nodepool root@\$@
EOF
sudo chmod 0755 /tmp/ssh_wrapper
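# Example usage (node address is illustrative):
#   /tmp/ssh_wrapper 203.0.113.10 -- uname -a
# runs "uname -a" on the node as root, authenticating with the zuul
# user's nodepool key via the wrapper above.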
function sshintonode {
    name=$1
    state='ready'
    node=`$NODEPOOL list | grep $name | grep $state | cut -d '|' -f6 | tr -d ' '`
    /tmp/ssh_wrapper $node ls /
    # Check that the root partition grew on boot; it should be a 5GiB
    # partition minus some space for the boot partition. However
    # empirical evidence suggests there is some modulo maths going on,
    # (possibly with alignment?) that means we can vary up to even
    # 64MiB. Thus we choose an expected value that gives us enough
    # slop to avoid false matches, but still indicates we resized up.
    root_size=$(/tmp/ssh_wrapper $node -- lsblk -rbno SIZE /dev/vda1)
    expected_root_size=$(( 5000000000 ))
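    # For reference: a nominal 5GiB disk is 5368709120 bytes, so the
    # 5000000000 threshold leaves roughly 350MiB of slop below that for
    # the boot partition and alignment rounding, while still failing an
    # image whose root partition never grew.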
    if [[ $root_size -lt $expected_root_size ]]; then
        echo "*** Root device does not appear to have grown: $root_size"
        FAILURE_REASON="Root partition of $name does not appear to have grown: $root_size < $expected_root_size"
        RETURN=1
    fi
    # Check we saw metadata deployed to the config-drive. The
    # config-drive shows up as an ISO9660 filesystem on /dev/sr0, so
    # grep the raw device for the expected name rather than mounting it.
    # Run the command as the "if" condition so that a failure is
    # recorded instead of aborting the script under errexit.
    if ! /tmp/ssh_wrapper $node \
        "dd status=none if=/dev/sr0 | tr -cd '[:print:]' | grep -q nodepool_devstack"; then
        echo "*** Failed to find metadata in config-drive"
        FAILURE_REASON="Failed to find meta-data in config-drive for $node"
        RETURN=1
    fi
    # Debugging that we're seeing the right node
    /tmp/ssh_wrapper $node -- "cat /etc/os-release; blkid"
    # This ensures DIB set up the bootloader kernel arguments correctly
    # by looking for the default console setup it does. In the past
    # we have seen issues with bootloader installation for all sorts
    # of reasons (our errors and upstream changes); but generally what
    # has happened is that grub has silently fallen back to
    # "guessing" from the running build-system what kernel flags to
    # use.
    #
    # DIB images should be booting from a root device labeled
    # "cloudimg-rootfs". In the gate, where you're *using* a
    # DIB image to build a DIB image, its /proc/cmdline contains
    # "root=LABEL=cloudimg-rootfs" and a misconfigured grub can
    # actually guess the correct root device. However, when this
    # builds an image in production on a totally different host you
    # get a non-booting, wrong image. Ergo, although this is mostly
    # what we're interested in validating, it is not a reliable
    # thing to test directly.
    #
    # So below, we probe for something else: the console setup that
    # DIB will put in. If this is missing, it's an indication that
    # the bootloader is not setting up the kernel arguments correctly.
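    # A healthy DIB image's command line looks something like the
    # following (values are illustrative, not a literal expected string):
    #   BOOT_IMAGE=/boot/vmlinuz-... root=LABEL=cloudimg-rootfs ro console=tty0 console=ttyS0,115200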
    kernel_cmd_line=$(/tmp/ssh_wrapper $node -- cat /proc/cmdline)
    echo "Kernel command line: ${kernel_cmd_line}"
    if [[ ! $kernel_cmd_line =~ 'console=tty0 console=ttyS0,115200' ]]; then
        echo "*** Failed to find correct kernel boot flags"
        FAILURE_REASON="Failed to find correct kernel boot flags $node"
        RETURN=1
    fi
    # Ensure glean services have loaded
    # The output is like:
    # ---
    # glean-early.service loaded active exited Early glean execution
    # glean@ens3.service loaded active exited Glean for interface ens3 with NetworkManager
    # glean@lo.service loaded active exited Glean for interface lo with NetworkManager
    # ---
    # So if we see anything other than 'loaded active exited' we have a problem.
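    # Note the "|| true" below: in the healthy case grep -v filters out
    # every line and exits non-zero, which would otherwise abort the
    # script because it runs with errexit (-e).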
    glean_status=$(/tmp/ssh_wrapper $node -- "systemctl | egrep 'glean[@|-]' | { grep -v 'loaded active exited' || true; }")
    if [[ ${glean_status} != '' ]]; then
        echo "*** Glean not loaded correctly"
        echo "*** saw: ${glean_status}"
        FAILURE_REASON="Failed to start glean correctly"
        RETURN=1
    fi
}
function checknm {
    name=$1
    state='ready'
    node=`$NODEPOOL list | grep $name | grep $state | cut -d '|' -f6 | tr -d ' '`
    nm_output=$(/tmp/ssh_wrapper $node -- nmcli c)
    # virtio device is eth0 on older, ens3 on newer
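    # Typical "nmcli c" output for a healthy node looks roughly like
    # (names are illustrative only):
    #   System ens3  <uuid>  ethernet  ens3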
    if [[ ! ${nm_output} =~ (eth0|ens3) ]]; then
        echo "*** Failed to find interface in NetworkManager connections"
        /tmp/ssh_wrapper $node -- nmcli c
        /tmp/ssh_wrapper $node -- nmcli device
        FAILURE_REASON="Failed to find interface in NetworkManager connections"
        RETURN=1
    fi
}
function waitforimage {
    local name=$1
    local state='ready'
    local builds
    while ! $NODEPOOL image-list | grep $name | grep $state; do
        $NODEPOOL image-list > ${LOGDIR}/nodepool-image-list.txt
        $NODEPOOL list --detail > ${LOGDIR}/nodepool-list.txt
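        # Each build attempt leaves its own log under
        # /var/log/nodepool/builds/, so counting the logs for this image
        # is a rough proxy for how many times the build has been retried.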
        builds=$(ls -l /var/log/nodepool/builds/ | grep $name | wc -l)
        if [[ ${builds} -ge 4 ]]; then
            echo "*** Build of $name failed at least 3 times, aborting"
            exit 1
        fi
        sleep 10
    done
}
function waitfornode {
    name=$1
    state='ready'
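    # A node only counts once it is both ready and unlocked, i.e. the
    # launcher has finished with it and no request is holding it.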
    while ! $NODEPOOL list | grep $name | grep $state | grep "unlocked"; do
        $NODEPOOL image-list > ${LOGDIR}/nodepool-image-list.txt
        $NODEPOOL list --detail > ${LOGDIR}/nodepool-list.txt
        sleep 10
    done
}
# check that image built
waitforimage test-image
# check image was bootable
waitfornode test-image
# check ssh for root user
sshintonode test-image
# networkmanager check
# TODO(jeblair): This should not run in all cases; in fact, most of
# this checking should move into the dib repo
#checknm test-image
# userdata check
set -o errexit
# Show the built nodes
$NODEPOOL list
# Try to delete the nodes that were just built
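# (Node ids are zero-padded sequence numbers starting at 0000000000, so
# this targets the first node launched above.)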
$NODEPOOL delete --now 0000000000
# show the deleted nodes (and their replacements may be building)
$NODEPOOL list
if [[ -n "${FAILURE_REASON}" ]]; then
echo "${FAILURE_REASON}"
fi
exit $RETURN