From b77f9e2910f81e3716a7011baa5581239d523c74 Mon Sep 17 00:00:00 2001 From: Ian Howell Date: Mon, 13 Sep 2021 16:21:11 -0500 Subject: [PATCH] AIAP: Cleanup completion statuses This adds the `status-checker` container, which aggregates the statuses of the task containers. This is useful for quickly checking whether AIAP is in a failed or successful state, allowing for early exit during gating or testing. This also prevents the containers from stopping for any reason, allowing for easy debugging. Change-Id: I1571d006fb3c856e4d2bedee0befdccae6082a66 --- tools/airship-in-a-pod/Makefile | 2 +- tools/airship-in-a-pod/README.md | 30 ++++++++ .../artifact-setup/assets/entrypoint.sh | 18 +++-- tools/airship-in-a-pod/base/Dockerfile | 2 +- tools/airship-in-a-pod/base/signal_complete | 9 --- tools/airship-in-a-pod/base/signal_status | 9 +++ tools/airship-in-a-pod/base/wait_for | 28 ++++++-- .../examples/airshipctl/replacements.yaml | 8 +-- .../examples/base/airship-in-a-pod.yaml | 71 +++++-------------- .../infra-builder/assets/entrypoint.sh | 20 +++--- .../runner/assets/entrypoint.sh | 19 +++-- tools/airship-in-a-pod/scripts/aiap-in-aks.sh | 9 ++- .../status-checker/Dockerfile | 8 +++ .../status-checker/assets/entrypoint.sh | 41 +++++++++++ 14 files changed, 160 insertions(+), 114 deletions(-) delete mode 100755 tools/airship-in-a-pod/base/signal_complete create mode 100755 tools/airship-in-a-pod/base/signal_status create mode 100644 tools/airship-in-a-pod/status-checker/Dockerfile create mode 100755 tools/airship-in-a-pod/status-checker/assets/entrypoint.sh diff --git a/tools/airship-in-a-pod/Makefile b/tools/airship-in-a-pod/Makefile index 2b3139321..71d917cf0 100644 --- a/tools/airship-in-a-pod/Makefile +++ b/tools/airship-in-a-pod/Makefile @@ -1,7 +1,7 @@ DOCKER_REGISTRY ?= quay.io DOCKER_IMAGE_PREFIX ?= airshipit DOCKER_IMAGE_TAG ?= latest -IMAGES ?= infra-builder runner artifact-setup +IMAGES ?= infra-builder runner artifact-setup status-checker PUBLISH ?= false 
.PHONY: help base libvirt artifact-setup $(IMAGES) images test diff --git a/tools/airship-in-a-pod/README.md b/tools/airship-in-a-pod/README.md index 7f910e8e2..5785e452e 100644 --- a/tools/airship-in-a-pod/README.md +++ b/tools/airship-in-a-pod/README.md @@ -19,6 +19,8 @@ The pod also contains the following "Support" containers: * `sushy-tools`: This is used for its BMC emulator * `docker-in-docker`: This is used for nesting containers * `nginx`: This is used for image hosting +* `status-checker`: This container is used to track the completion status of + the task containers. ## Deployment Options @@ -100,6 +102,34 @@ Once you've created the desired configuration, the kustomized pod can be deploye kustomize build ${PATH_TO_KUSTOMIZATION} | kubectl apply -f - ``` +## Finishing a Deployment + +A deployment of Airship-in-a-pod is denoted by one of two states: + +1. The runner container reaches the end of its execution successfully +2. An error occurs in any of the containers + +The statuses for the task containers are aggregated in the `status-checker` +container, which provides a status report every 5 seconds. The status report +has the following structure: + +``` +artifact-setup: <$STATUS> infra-builder: <$STATUS> runner: <$STATUS> +``` + +In the above, `$STATUS` can be any of `RUNNING`, `SUCCESS`, `FAILED`, or +`UNKNOWN`. The last line of the `status-checker`'s logs will always contain the +most recent status report. This status report can be used to determine the +overall health of the deployment, as in the following: + +``` +# Check if AIAP has finished successfully +test $(kubectl logs airship-in-a-pod -c status-checker --tail 1 | grep -o "SUCCESS" | wc -l) = 3 + +# Check if AIAP has failed +kubectl logs airship-in-a-pod -c status-checker --tail 1 | grep -q "FAILED" +``` + ## Interacting with the Pod For a quick rundown of what a particular container is doing, simply check the logs for that container. 
diff --git a/tools/airship-in-a-pod/artifact-setup/assets/entrypoint.sh b/tools/airship-in-a-pod/artifact-setup/assets/entrypoint.sh index d27a3ff9d..a0f5f4263 100755 --- a/tools/airship-in-a-pod/artifact-setup/assets/entrypoint.sh +++ b/tools/airship-in-a-pod/artifact-setup/assets/entrypoint.sh @@ -14,17 +14,18 @@ set -ex -# Create the "canary" file, indicating that the container is healthy -mkdir -p /tmp/healthy -touch /tmp/healthy/artifact-setup - +/signal_status "artifact-setup" "RUNNING" success=false -function cleanup() { +function reportStatus() { if [[ "$success" == "false" ]]; then - rm /tmp/healthy/artifact-setup + /signal_status "artifact-setup" "FAILED" + else + /signal_status "artifact-setup" "SUCCESS" fi + # Keep the container running for debugging/monitoring purposes + sleep infinity } -trap cleanup EXIT +trap reportStatus EXIT function cloneRepo() { repo_dir=$1 @@ -79,6 +80,3 @@ fi success=true /signal_complete artifact-setup - -# Keep the container running for debugging/monitoring purposes -sleep infinity diff --git a/tools/airship-in-a-pod/base/Dockerfile b/tools/airship-in-a-pod/base/Dockerfile index 87a6e9ddf..29b1a0aba 100644 --- a/tools/airship-in-a-pod/base/Dockerfile +++ b/tools/airship-in-a-pod/base/Dockerfile @@ -41,4 +41,4 @@ RUN apt-get update ;\ mkdir -p "$CACHE_DIR" COPY wait_for . -COPY signal_complete . +COPY signal_status . diff --git a/tools/airship-in-a-pod/base/signal_complete b/tools/airship-in-a-pod/base/signal_complete deleted file mode 100755 index d07b51112..000000000 --- a/tools/airship-in-a-pod/base/signal_complete +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -# signal_complete takes a container name and creates a file in the "completed" -# directory, denoting that the named container has finished its tasks. This can be -# leveraged by dependent containers via the `wait_for` command. 
- -mkdir -p "/tmp/completed" -touch "/tmp/completed/$1" -printf "Marked %s as complete.\n" "$1" diff --git a/tools/airship-in-a-pod/base/signal_status b/tools/airship-in-a-pod/base/signal_status new file mode 100755 index 000000000..503ff1ca3 --- /dev/null +++ b/tools/airship-in-a-pod/base/signal_status @@ -0,0 +1,9 @@ +#!/bin/bash + +# signal_status takes a container name and a status and creates a file in the +# "status" directory whose contents are the provided status. This can be +# leveraged by dependent containers via the `wait_for` command. + +mkdir -p "/tmp/status" +echo "$2" > "/tmp/status/$1" +printf "Marked %s as %s.\n" "$1" "$2" diff --git a/tools/airship-in-a-pod/base/wait_for b/tools/airship-in-a-pod/base/wait_for index 409baec46..a33413744 100755 --- a/tools/airship-in-a-pod/base/wait_for +++ b/tools/airship-in-a-pod/base/wait_for @@ -1,16 +1,26 @@ #!/bin/bash -# wait_for takes a list of container names and runs until all of those container names -# appear in the "/tmp/completed" directory. It can be used to prevent a -# container from executing until pre-requisite containers have indicated completion. +# wait_for takes a container name and runs until the named container has +# reported a "SUCCESS" or "FAILED" status in the "/tmp/status" directory. +# When the status becomes "SUCCESS" or "FAILED", the `wait_for` script exits +# with a corresponding exit code. It can be used to prevent a container from +# executing until pre-requisite containers have indicated successful +# completion. + +container="$1" +mkdir -p "/tmp/status" +status_file="/tmp/status/$container" +if [[ ! -e "$status_file" ]]; then + # Create the status file to prevent errors when checking its contents + touch "$status_file" +fi -mkdir -p "/tmp/completed" while true; do # Assume we're finished, prove otherwise finished=true for container in "$@"; do - if [[ ! -e "/tmp/completed/$container" ]]; then - printf "Waiting on '%s'...\n" "$container" + if (! 
grep -q -e "SUCCESS" -e "FAILED" "$status_file"); then + printf "Waiting on status from '%s'...\n" "$container" finished=false sleep 10 break @@ -20,3 +30,9 @@ while true; do break fi done + +if (grep -q "SUCCESS" "$status_file"); then + exit 0 +else + exit 1 +fi diff --git a/tools/airship-in-a-pod/examples/airshipctl/replacements.yaml b/tools/airship-in-a-pod/examples/airshipctl/replacements.yaml index 018f04733..648c7f6ff 100644 --- a/tools/airship-in-a-pod/examples/airshipctl/replacements.yaml +++ b/tools/airship-in-a-pod/examples/airshipctl/replacements.yaml @@ -15,7 +15,7 @@ path: "/spec/containers/4/env/3/value" value: https://opendev.org/airship/airshipctl -# This is the ref to checkout for the airshipctl binary repo +# This is the branch to checkout for the airshipctl binary repo - op: replace path: "/spec/containers/4/env/4/value" value: master @@ -86,9 +86,3 @@ - op: replace path: "/spec/containers/6/env/2/value" value: "false" - -# Uncomment the following to keep the runner container alive. This can be -# useful for debugging purposes. -# - op: replace -# path: "/spec/containers/6/command" -# value: ["bash", "-cex", "/entrypoint || sleep infinity"] diff --git a/tools/airship-in-a-pod/examples/base/airship-in-a-pod.yaml b/tools/airship-in-a-pod/examples/base/airship-in-a-pod.yaml index 1d8cb23e2..936da1f90 100644 --- a/tools/airship-in-a-pod/examples/base/airship-in-a-pod.yaml +++ b/tools/airship-in-a-pod/examples/base/airship-in-a-pod.yaml @@ -116,7 +116,7 @@ spec: -extensions 'req_ext' # Wait for infrastructure to come up - while [[ ! -e /tmp/completed/infra-builder ]]; do printf "Waiting on infra-builder...\n"; sleep 1; done + while (! 
grep -sq "SUCCESS" "/tmp/status/infra-builder"); do printf "Waiting on infra-builder...\n"; sleep 1; done sushy-emulator \ --debug \ @@ -127,8 +127,8 @@ spec: volumeMounts: - name: var-run-libvirt mountPath: /var/run/libvirt - - name: completed - mountPath: /tmp/completed + - name: status + mountPath: /tmp/status - name: nginx image: nginx:latest @@ -204,20 +204,6 @@ spec: - bash - -cex - /entrypoint.sh - readinessProbe: - exec: - command: - - test - - -e - - /tmp/completed/artifact-setup - livenessProbe: - exec: - command: - - test - - -e - - /tmp/healthy/artifact-setup - initialDelaySeconds: 5 - periodSeconds: 5 env: - name: CACHE_DIR value: /opt/aiap-cache @@ -244,8 +230,8 @@ spec: mountPath: /opt/aiap-cache - name: artifacts mountPath: /opt/aiap-artifacts - - name: completed - mountPath: /tmp/completed + - name: status + mountPath: /tmp/status - name: var-run-docker mountPath: /var/run - name: var-run-libvirt @@ -260,20 +246,6 @@ spec: - bash - -cex - /entrypoint.sh - readinessProbe: - exec: - command: - - test - - -e - - /tmp/completed/infra-builder - livenessProbe: - exec: - command: - - test - - -e - - /tmp/healthy/infra-builder - initialDelaySeconds: 5 - periodSeconds: 5 env: - name: CACHE_DIR value: /opt/aiap-cache @@ -292,8 +264,8 @@ spec: mountPath: /opt/aiap-cache - name: artifacts mountPath: /opt/aiap-artifacts - - name: completed - mountPath: /tmp/completed + - name: status + mountPath: /tmp/status - name: tmp mountPath: /tmp - name: var-run-aiap @@ -316,22 +288,6 @@ spec: - bash - -cex - /entrypoint.sh - readinessProbe: - exec: - command: - - test - - -e - - /tmp/completed/runner - initialDelaySeconds: 600 - periodSeconds: 30 - livenessProbe: - exec: - command: - - test - - -e - - /tmp/healthy/runner - initialDelaySeconds: 5 - periodSeconds: 5 env: - name: CACHE_DIR value: /opt/aiap-cache @@ -361,14 +317,13 @@ spec: value: "" - name: AIRSHIP_CONFIG_MANIFEST_REPO_AUTH_SSH_PASSWORD value: "" - volumeMounts: - name: cache mountPath: /opt/aiap-cache - 
name: artifacts mountPath: /opt/aiap-artifacts - - name: completed - mountPath: /tmp/completed + - name: status + mountPath: /tmp/status - name: tmp mountPath: /tmp - name: var-run-aiap @@ -386,6 +341,12 @@ spec: - name: airship-config mountPath: /root/.airship + - name: status-checker + image: quay.io/airshipit/aiap-status-checker:latest + volumeMounts: + - name: status + mountPath: /tmp/status + volumes: - name: cache hostPath: @@ -393,7 +354,7 @@ spec: - name: artifacts hostPath: path: /opt/aiap-artifacts - - name: completed + - name: status emptyDir: {} - name: dev hostPath: diff --git a/tools/airship-in-a-pod/infra-builder/assets/entrypoint.sh b/tools/airship-in-a-pod/infra-builder/assets/entrypoint.sh index 53ff8325c..fdc15ac50 100755 --- a/tools/airship-in-a-pod/infra-builder/assets/entrypoint.sh +++ b/tools/airship-in-a-pod/infra-builder/assets/entrypoint.sh @@ -14,16 +14,18 @@ set -ex -# Create the "canary" file, indicating that the container is healthy -mkdir -p /tmp/healthy -touch /tmp/healthy/infra-builder - +/signal_status "infra-builder" "RUNNING" success=false -function cleanup() { +function reportStatus() { if [[ "$success" == "false" ]]; then - rm /tmp/healthy/infra-builder + /signal_status "infra-builder" "FAILED" + else + /signal_status "infra-builder" "SUCCESS" fi + # Keep the container running for debugging/monitoring purposes + sleep infinity } +trap reportStatus EXIT function check_libvirt_readiness() { timeout=300 @@ -47,15 +49,9 @@ function check_libvirt_readiness() { done } -trap cleanup EXIT - check_libvirt_readiness ansible-playbook -v /opt/ansible/playbooks/build-infra.yaml \ -e local_src_dir="$(pwd)" success=true -/signal_complete infra-builder - -# Keep the container running for debugging/monitoring purposes -sleep infinity diff --git a/tools/airship-in-a-pod/runner/assets/entrypoint.sh b/tools/airship-in-a-pod/runner/assets/entrypoint.sh index 6b7b3ee67..198ec65b8 100755 --- a/tools/airship-in-a-pod/runner/assets/entrypoint.sh +++ 
b/tools/airship-in-a-pod/runner/assets/entrypoint.sh @@ -14,17 +14,18 @@ set -ex -# Create the "canary" file, indicating that the container is healthy -mkdir -p /tmp/healthy -touch /tmp/healthy/runner - +/signal_status "runner" "RUNNING" success=false -function cleanup() { +function reportStatus() { if [[ "$success" == "false" ]]; then - rm /tmp/healthy/runner + /signal_status "runner" "FAILED" + else + /signal_status "runner" "SUCCESS" fi + # Keep the container running for debugging/monitoring purposes + sleep infinity } -trap cleanup EXIT +trap reportStatus EXIT # Wait until artifact-setup and libvirt infrastructure has been built /wait_for artifact-setup @@ -80,7 +81,3 @@ fi ./tools/deployment/25_deploy_gating.sh success=true -/signal_complete runner - -# Keep the container running for debugging/monitoring purposes -sleep infinity diff --git a/tools/airship-in-a-pod/scripts/aiap-in-aks.sh b/tools/airship-in-a-pod/scripts/aiap-in-aks.sh index a0fb4eab6..0a199dde0 100755 --- a/tools/airship-in-a-pod/scripts/aiap-in-aks.sh +++ b/tools/airship-in-a-pod/scripts/aiap-in-aks.sh @@ -36,11 +36,16 @@ kubectl apply -k ${AIAP_POD} set +x echo "waiting up to $TIMEOUT seconds for airship-in-a-pod to complete..." end=$(($(date +%s) + $TIMEOUT)) +echo "* waiting up to 10m for containers to become ready..." +kubectl wait pod airship-in-a-pod --for condition=ContainersReady --timeout 10m while true; do - if (kubectl get pod airship-in-a-pod -o jsonpath="{.status.conditions[?(@.type=='ContainersReady')].status}" | grep -q True) ; then + last_status=$(kubectl logs airship-in-a-pod -c status-checker --tail 1) + if [ $(grep -o "SUCCESS" <<<$last_status | wc -l) = 3 ] ; then echo -e "\nairship-in-a-pod completed successfully." break - #TODO There's no way today to detect that an error has occurred, besides timing out. We should resolve that & watch for condition. 
+ elif grep -q "FAILED" <<<$last_status ; then + echo -e "\nAirship-in-a-pod completed with FAILURE: $last_status" + break else now=$(date +%s) if [ $now -gt $end ]; then diff --git a/tools/airship-in-a-pod/status-checker/Dockerfile b/tools/airship-in-a-pod/status-checker/Dockerfile new file mode 100644 index 000000000..e45ab96fa --- /dev/null +++ b/tools/airship-in-a-pod/status-checker/Dockerfile @@ -0,0 +1,8 @@ +ARG BASE_IMAGE=alpine +FROM ${BASE_IMAGE} + +COPY assets /opt/assets/ +RUN cp -ravf /opt/assets/* / ;\ + rm -rf /opt/assets + +ENTRYPOINT /entrypoint.sh diff --git a/tools/airship-in-a-pod/status-checker/assets/entrypoint.sh b/tools/airship-in-a-pod/status-checker/assets/entrypoint.sh new file mode 100755 index 000000000..fa8bb1894 --- /dev/null +++ b/tools/airship-in-a-pod/status-checker/assets/entrypoint.sh @@ -0,0 +1,41 @@ +#!/bin/sh + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -e + +status_dir="/tmp/status" +mkdir -p "$status_dir" + +while true; do + artifact_setup_status="UNKNOWN" + infra_builder_status="UNKNOWN" + runner_status="UNKNOWN" + if [ -f "$status_dir/artifact-setup" ]; then + artifact_setup_status="$(cat $status_dir/artifact-setup)" + fi + if [ -f "$status_dir/infra-builder" ]; then + infra_builder_status="$(cat $status_dir/infra-builder)" + fi + if [ -f "$status_dir/runner" ]; then + runner_status="$(cat $status_dir/runner)" + fi + + # Print all statuses on a single line + printf "artifact-setup: <%s> " "$artifact_setup_status" + printf "infra-builder: <%s> " "$infra_builder_status" + printf "runner: <%s> " "$runner_status" + printf "\n" + + sleep 5 +done