Now that nodepool images are on quay.io we don't get speculative container image testing with docker. The reason for this is that docker only knows how to look up images hosted by docker.io in mirrors, which speculative container image testing relies on. Since the images are hosted on quay.io instead of docker.io we lose this functionality. Address this by switching to podman and podman-compose, which understand how to fetch images via mirrors from any location.

Depends-On: https://review.opendev.org/c/zuul/zuul/+/687135
Change-Id: I1a510a9b68a2f01098f3c099a129d6d268b422d9
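For context, podman resolves registry mirrors through /etc/containers/registries.conf, so a mirror can be declared for any registry, including quay.io. A minimal sketch of such a mirror entry, not part of this change and using a hypothetical mirror host and port:

# Sketch only: append a mirror entry so quay.io images are first tried
# against a local mirror registry (mirror.example.com:5000 is hypothetical).
sudo tee -a /etc/containers/registries.conf <<'EOF'
[[registry]]
prefix = "quay.io"
location = "quay.io"

[[registry.mirror]]
location = "mirror.example.com:5000"
insecure = true
EOF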
179 lines
6.3 KiB
Bash
Executable File
#!/bin/bash -ex

LOGDIR=/home/zuul/zuul-output/logs

# Set to indicate an error return
RETURN=0
FAILURE_REASON=""

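# NODEPOOL_FUNCTIONAL_CHECK selects how we drive nodepool: either from a
# local virtualenv install, or from inside the podman-compose managed
# launcher container.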
if [[ ${NODEPOOL_FUNCTIONAL_CHECK:-} == "installed" ]]; then
|
|
NODEPOOL_INSTALL=${NODEPOOL_INSTALL:-~/.venv}
|
|
NODEPOOL_CONFIG=${NODEPOOL_CONFIG:-/etc/nodepool/nodepool.yaml}
|
|
NODEPOOL="$NODEPOOL_INSTALL/bin/nodepool -c $NODEPOOL_CONFIG"
|
|
elif [[ ${NODEPOOL_FUNCTIONAL_CHECK:-} == "containers" ]]; then
|
|
NODEPOOL="sudo podman exec nodepool_nodepool-launcher_1 nodepool"
|
|
else
|
|
echo "Running in unknown environment!"
|
|
exit 1
|
|
fi
|
|
|
|
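# Small wrapper so the checks below can ssh into a booted node as root,
# using the id_nodepool key and running as the zuul user.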
cat > /tmp/ssh_wrapper <<EOF
#!/bin/bash -ex
sudo -H -u zuul ssh -o StrictHostKeyChecking=no -i $HOME/.ssh/id_nodepool root@\$@

EOF
sudo chmod 0755 /tmp/ssh_wrapper

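# Log into the ready node built from image $1 and sanity-check the booted
# image: root partition growth, config-drive metadata, bootloader kernel
# arguments and glean services.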
function sshintonode {
    name=$1
    state='ready'

    node=`$NODEPOOL list | grep $name | grep $state | cut -d '|' -f6 | tr -d ' '`

    /tmp/ssh_wrapper $node ls /

    # Check that the root partition grew on boot; it should be a 5GiB
    # partition minus some space for the boot partition. However,
    # empirical evidence suggests there is some modulo maths going on
    # (possibly with alignment?) that means we can vary by up to
    # 64MiB. Thus we choose an expected value that gives us enough
    # slop to avoid false matches, but still indicates we resized up.
    root_size=$(/tmp/ssh_wrapper $node -- lsblk -rbno SIZE /dev/vda1)
    expected_root_size=$(( 5000000000 ))
    if [[ $root_size -lt $expected_root_size ]]; then
        echo "*** Root device does not appear to have grown: $root_size"
        FAILURE_REASON="Root partition of $name does not appear to have grown: $root_size < $expected_root_size"
        RETURN=1
    fi

    # Check we saw metadata deployed to the config-drive
    /tmp/ssh_wrapper $node \
        "dd status=none if=/dev/sr0 | tr -cd '[:print:]' | grep -q nodepool_devstack"
    if [[ $? -ne 0 ]]; then
        echo "*** Failed to find metadata in config-drive"
        FAILURE_REASON="Failed to find meta-data in config-drive for $node"
        RETURN=1
    fi

    # Debugging that we're seeing the right node
    /tmp/ssh_wrapper $node -- "cat /etc/os-release; blkid"

    # This ensures DIB set up the bootloader kernel arguments correctly
    # by looking for the default console setup it does. In the past
    # we have seen issues with bootloader installation for all sorts
    # of reasons (our errors and upstream changes); but generally what
    # has happened is that grub has silently fallen back to
    # "guessing" the kernel flags to use from the running build system.
    #
    # DIB images should be booting from a root device labeled
    # "cloudimg-rootfs". In the gate, where you're *using* a
    # DIB image to build a DIB image, its /proc/cmdline contains
    # "root=LABEL=cloudimg-rootfs" and a misconfigured grub can
    # actually guess the correct root device. However, when this
    # builds an image in production on a totally different host you
    # get a non-booting, wrong image. Ergo, although this is mostly
    # what we're interested in validating, it is not a reliable
    # thing to test directly.
    #
    # So below, we probe for something else: the console setup that
    # DIB will put in. If this is missing, it's an indication that
    # the bootloader is not setting up the kernel arguments correctly.
    kernel_cmd_line=$(/tmp/ssh_wrapper $node -- cat /proc/cmdline)
    echo "Kernel command line: ${kernel_cmd_line}"
    if [[ ! $kernel_cmd_line =~ 'console=tty0 console=ttyS0,115200' ]]; then
        echo "*** Failed to find correct kernel boot flags"
        FAILURE_REASON="Failed to find correct kernel boot flags on $node"
        RETURN=1
    fi

    # Ensure glean services have loaded
    # The output is like:
    # ---
    #  glean-early.service  loaded active exited  Early glean execution
    #  glean@ens3.service   loaded active exited  Glean for interface ens3 with NetworkManager
    #  glean@lo.service     loaded active exited  Glean for interface lo with NetworkManager
    # ---
    # So if we see anything other than 'loaded active exited' we have a problem.
    glean_status=$(/tmp/ssh_wrapper $node -- "systemctl | egrep 'glean[@|-]' | { grep -v 'loaded active exited' || true; }")
    if [[ ${glean_status} != '' ]]; then
        echo "*** Glean not loaded correctly"
        echo "*** saw: ${glean_status}"
        FAILURE_REASON="Failed to start glean correctly"
        RETURN=1
    fi
}

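# Verify NetworkManager brought up a connection for the node's primary
# interface.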
function checknm {
    name=$1
    state='ready'

    node=`$NODEPOOL list | grep $name | grep $state | cut -d '|' -f6 | tr -d ' '`
    nm_output=$(/tmp/ssh_wrapper $node -- nmcli c)

    # virtio device is eth0 on older, ens3 on newer
    if [[ ! ${nm_output} =~ (eth0|ens3) ]]; then
        echo "*** Failed to find interface in NetworkManager connections"
        /tmp/ssh_wrapper $node -- nmcli c
        /tmp/ssh_wrapper $node -- nmcli device
        FAILURE_REASON="Failed to find interface in NetworkManager connections"
        RETURN=1
    fi
}

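# Block until an image named $1 is 'ready', dumping current nodepool state to
# the log directory while we wait and aborting if the build keeps failing.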
function waitforimage {
    local name=$1
    local state='ready'
    local builds

    while ! $NODEPOOL image-list | grep $name | grep $state; do
        $NODEPOOL image-list > ${LOGDIR}/nodepool-image-list.txt
        $NODEPOOL list --detail > ${LOGDIR}/nodepool-list.txt

        builds=$(ls -l /var/log/nodepool/builds/ | grep $name | wc -l)
        if [[ ${builds} -ge 4 ]]; then
            echo "*** Build of $name failed at least 3 times, aborting"
            exit 1
        fi
        sleep 10
    done
}

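# Block until a node booted from image $1 is 'ready' and unlocked.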
function waitfornode {
    name=$1
    state='ready'

    while ! $NODEPOOL list | grep $name | grep $state | grep "unlocked"; do
        $NODEPOOL image-list > ${LOGDIR}/nodepool-image-list.txt
        $NODEPOOL list --detail > ${LOGDIR}/nodepool-list.txt
        sleep 10
    done
}

# check that the image built
waitforimage test-image
# check the image was bootable
waitfornode test-image
# check ssh for the root user
sshintonode test-image
# NetworkManager check
# TODO(jeblair): This should not run in all cases; in fact, most of
# this checking should move into the dib repo
#checknm test-image
# userdata check

set -o errexit
# Show the built nodes
$NODEPOOL list

# Try to delete the nodes that were just built
$NODEPOOL delete --now 0000000000

# Show the deleted nodes (their replacements may be building)
$NODEPOOL list

if [[ -n "${FAILURE_REASON}" ]]; then
    echo "${FAILURE_REASON}"
fi
exit $RETURN