From 74983421a13329451c8a9ae6b545650e900a0187 Mon Sep 17 00:00:00 2001
From: Chandan Kumar
Date: Tue, 19 Feb 2019 14:06:26 +0530
Subject: [PATCH] Raise an error if a service or container is failed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sometimes container or service does not start, and this doesn't
make the CI fail.

Until now, the failed containers are listed in the /var/log/extras/
tree, but it's not checked on a regular basis.

This patch intends to make a hard failure in case either a service
or a container doesn't start as expected.

Co-Authored-By: Cédric Jeanneret
Related-Bug: #1816523
Change-Id: I001e2f27d2b562bb0be87c8eaadcf3622e530498
---
Review notes (below the "---" marker, so git-am drops them from the
commit message):
- Line breaks and indentation were rebuilt from a whitespace-collapsed
  copy of this patch; context-line indentation is a best effort, so use
  "git apply --ignore-whitespace" if a hunk does not match.
- containers.yaml: the docker query and the final loop now tolerate
  skipped tasks (no docker binary and/or no podman binary); a skipped
  registered result carries neither "status" nor "stdout_lines", so the
  role errored out instead of validating on single-engine hosts.
- containers.yaml: the failure message now points at /var/log/extra/,
  the directory collect.yml writes failed_containers.log to.
- Hunk sizes are unchanged; the containers.yaml blob id on its "index"
  line predates these edits.

 playbooks/multinode-standalone.yml            |  1 +
 playbooks/multinode-validate.yml              |  1 +
 playbooks/quickstart-extras-validate.yml      | 12 +++++
 .../validate_services-ea3d14b982f4f9e8.yaml   |  5 +++
 roles/collect-logs/tasks/collect.yml          |  3 ++
 roles/extras-common/defaults/main.yml         |  3 ++
 roles/validate-services/tasks/containers.yaml | 44 +++++++++++++++++++
 roles/validate-services/tasks/main.yaml       |  3 ++
 roles/validate-services/tasks/systemd.yaml    | 10 +++++
 9 files changed, 82 insertions(+)
 create mode 100644 releasenotes/notes/validate_services-ea3d14b982f4f9e8.yaml
 create mode 100644 roles/validate-services/tasks/containers.yaml
 create mode 100644 roles/validate-services/tasks/main.yaml
 create mode 100644 roles/validate-services/tasks/systemd.yaml

diff --git a/playbooks/multinode-standalone.yml b/playbooks/multinode-standalone.yml
index ee4d9e89e..665948283 100644
--- a/playbooks/multinode-standalone.yml
+++ b/playbooks/multinode-standalone.yml
@@ -42,6 +42,7 @@
   gather_facts: false
   roles:
     - {role: validate-tempest, when: run_tempest|bool}
+    - {role: validate-services, when: validate_services|bool}
   tags:
     - standalone
 
diff --git a/playbooks/multinode-validate.yml b/playbooks/multinode-validate.yml
index 9eee46db4..d90418fce 100644
--- a/playbooks/multinode-validate.yml
+++ b/playbooks/multinode-validate.yml
@@ -18,3 +18,4 @@
   roles:
     - {role: validate-simple, when: test_ping|bool}
     - {role: validate-tempest, when: run_tempest|bool}
+    - {role: validate-services, when: validate_services|bool}
diff --git a/playbooks/quickstart-extras-validate.yml b/playbooks/quickstart-extras-validate.yml
index 70d5dee83..d6461c64b 100644
--- a/playbooks/quickstart-extras-validate.yml
+++ b/playbooks/quickstart-extras-validate.yml
@@ -34,3 +34,15 @@
   gather_facts: true
   roles:
     - {role: validate-ui, when: validate_ui_simple|bool and undercloud_enable_ui|default(true)|bool}
+
+# Ensure services and containers are OK
+- name: Execute simple service and container validations
+  hosts:
+    - undercloud
+    - overcloud
+  tags:
+    - overcloud-validate
+    - undercloud-validate
+  gather_facts: false
+  roles:
+    - {role: validate-services, when: validate_services|bool}
diff --git a/releasenotes/notes/validate_services-ea3d14b982f4f9e8.yaml b/releasenotes/notes/validate_services-ea3d14b982f4f9e8.yaml
new file mode 100644
index 000000000..7e28555d4
--- /dev/null
+++ b/releasenotes/notes/validate_services-ea3d14b982f4f9e8.yaml
@@ -0,0 +1,5 @@
+---
+features:
+  - New "validate_services" boolean flag in order to check the service state
+  - New "validate-services" role in charge of container and service state
+    validation
diff --git a/roles/collect-logs/tasks/collect.yml b/roles/collect-logs/tasks/collect.yml
index c9503c722..de7063c5f 100644
--- a/roles/collect-logs/tasks/collect.yml
+++ b/roles/collect-logs/tasks/collect.yml
@@ -268,6 +268,9 @@
             echo "" >> $ALL_FILE;
         done;
 
+        # Get only failed containers, in a dedicated file
+        ${engine} ps -a | grep -vE ' (IMAGE|Exited \(0\)|Up) ' &>> /var/log/extra/failed_containers.log;
+
         for cont in $(${engine} ps | awk {'print $NF'} | grep -v NAMES); do
             INFO_DIR=$BASE_CONTAINER_EXTRA/containers/${cont};
             mkdir -p $INFO_DIR;
diff --git a/roles/extras-common/defaults/main.yml b/roles/extras-common/defaults/main.yml
index d95884f6f..d1f52e6e9 100644
--- a/roles/extras-common/defaults/main.yml
+++ b/roles/extras-common/defaults/main.yml
@@ -61,3 +61,6 @@ test_ping: true
 
 # install and execute dstat across nodes
 validate_performance: true
+
+# If `validate_services` is `true`, run the validate-services role
+validate_services: false
diff --git a/roles/validate-services/tasks/containers.yaml b/roles/validate-services/tasks/containers.yaml
new file mode 100644
index 000000000..32e7825c2
--- /dev/null
+++ b/roles/validate-services/tasks/containers.yaml
@@ -0,0 +1,44 @@
+---
+- name: Do we have docker
+  stat:
+    path: /usr/bin/docker
+  register: docker_stat
+
+- name: Does docker run
+  systemd:
+    name: docker
+  register: docker_svc
+  when: docker_stat.stat.exists
+
+- name: Do we have podman
+  stat:
+    path: /usr/bin/podman
+  register: podman_stat
+
+# Check both engines; a skipped check registers a result without `status` or
+# `stdout_lines`, so the docker query and the final loop guard against that.
+# This catches failed containers in mixed (docker/podman migration) envs too.
+- name: Get failed containers for podman
+  when: podman_stat.stat.exists
+  shell: >
+    {% raw %}
+    podman ps -a --filter 'status=exited' --format '{{.Names}} {{.Status}}'
+    {% endraw %}
+  register: failed_podman
+
+- name: Get failed containers from docker
+  when: docker_svc.status is defined and docker_svc.status['SubState'] == 'running'
+  shell: >
+    {% raw %}
+    docker ps -a --filter 'status=exited' --format '{{.Names}} {{.Status}}'
+    {% endraw %}
+  register: failed_docker
+
+- name: Fail if we detect failed container
+  fail:
+    msg: |
+      Failed container detected. Please check the following locations
+      /var/log/extra/failed_containers.log
+      /var/log/extra/
+  when: item is not match(".* Exited \(0\) .* ago")
+  loop: "{{ failed_podman.stdout_lines|default([]) + failed_docker.stdout_lines|default([]) }}"
diff --git a/roles/validate-services/tasks/main.yaml b/roles/validate-services/tasks/main.yaml
new file mode 100644
index 000000000..8e232512a
--- /dev/null
+++ b/roles/validate-services/tasks/main.yaml
@@ -0,0 +1,3 @@
+---
+- include_tasks: containers.yaml
+- include_tasks: systemd.yaml
diff --git a/roles/validate-services/tasks/systemd.yaml b/roles/validate-services/tasks/systemd.yaml
new file mode 100644
index 000000000..95b4ca0e8
--- /dev/null
+++ b/roles/validate-services/tasks/systemd.yaml
@@ -0,0 +1,10 @@
+---
+- name: Get failed services from Systemd
+  shell: >
+    systemctl list-units --failed --plain --no-legend --no-pager
+  register: systemd_state
+
+- name: Fails if we find failed systemd units
+  assert:
+    that:
+      - systemd_state.stdout_lines|length == 0