Raise an error if a service or container has failed

Sometimes a container or service does not start, and this
doesn't make the CI fail. Until now, failed containers have
been listed in the /var/log/extra/ tree, but that location is
not checked on a regular basis.

This patch makes the run fail hard whenever either a service
or a container doesn't start as expected.

Co-Authored-By: Cédric Jeanneret <cjeanner@redhat.com>
Related-Bug: #1816523
Change-Id: I001e2f27d2b562bb0be87c8eaadcf3622e530498
This commit is contained in:
Chandan Kumar 2019-02-19 14:06:26 +05:30 committed by Cédric Jeanneret
parent 37b2a2643c
commit 74983421a1
9 changed files with 82 additions and 0 deletions

View File

@ -42,6 +42,7 @@
gather_facts: false gather_facts: false
roles: roles:
- {role: validate-tempest, when: run_tempest|bool} - {role: validate-tempest, when: run_tempest|bool}
- {role: validate-services, when: validate_services|bool}
tags: tags:
- standalone - standalone

View File

@ -18,3 +18,4 @@
roles: roles:
- {role: validate-simple, when: test_ping|bool} - {role: validate-simple, when: test_ping|bool}
- {role: validate-tempest, when: run_tempest|bool} - {role: validate-tempest, when: run_tempest|bool}
- {role: validate-services, when: validate_services|bool}

View File

@ -34,3 +34,15 @@
gather_facts: true gather_facts: true
roles: roles:
- {role: validate-ui, when: validate_ui_simple|bool and undercloud_enable_ui|default(true)|bool} - {role: validate-ui, when: validate_ui_simple|bool and undercloud_enable_ui|default(true)|bool}
# Final validation play: make sure services and containers are healthy.
# The role only runs when `validate_services` evaluates to true.
- name: Execute simple service and container validations
  hosts:
    - undercloud
    - overcloud
  gather_facts: false
  tags:
    - overcloud-validate
    - undercloud-validate
  roles:
    - role: validate-services
      when: validate_services|bool

View File

@ -0,0 +1,5 @@
---
features:
- New "validate_services" boolean flag (disabled by default) to check the service state
- New "validate-services" role in charge of container and service state
validation

View File

@ -268,6 +268,9 @@
echo "" >> $ALL_FILE; echo "" >> $ALL_FILE;
done; done;
# Get only failed containers, in a dedicated file
${engine} ps -a | grep -vE ' (IMAGE|Exited \(0\)|Up) ' &>> /var/log/extra/failed_containers.log;
for cont in $(${engine} ps | awk {'print $NF'} | grep -v NAMES); do for cont in $(${engine} ps | awk {'print $NF'} | grep -v NAMES); do
INFO_DIR=$BASE_CONTAINER_EXTRA/containers/${cont}; INFO_DIR=$BASE_CONTAINER_EXTRA/containers/${cont};
mkdir -p $INFO_DIR; mkdir -p $INFO_DIR;

View File

@ -61,3 +61,6 @@ test_ping: true
# install and execute dstat across nodes # install and execute dstat across nodes
validate_performance: true validate_performance: true
# If `validate_services` is `true`, run the validate-services role
validate_services: false

View File

@ -0,0 +1,44 @@
---
# Detect failed containers, whichever container engine is present.
# A container counts as failed when it is in the "exited" state with a
# status other than "Exited (0)".

- name: Do we have docker
  stat:
    path: /usr/bin/docker
  register: docker_stat

# No "state" is requested: the module is only used to register the unit
# status, and only when the docker binary is present.
- name: Does docker run
  systemd:
    name: docker
  register: docker_svc
  when: docker_stat.stat.exists

- name: Do we have podman
  stat:
    path: /usr/bin/podman
  register: podman_stat

# Check both docker and podman, even if we don't have any container.
# This will help catching failed container in mixed env, for example while
# migrating from docker to podman or the reverse.
- name: Get failed containers for podman
  when: podman_stat.stat.exists
  shell: >
    {% raw %}
    podman ps -a --filter 'status=exited' --format '{{.Names}} {{.Status}}'
    {% endraw %}
  register: failed_podman
  # Read-only query: never report a change.
  changed_when: false

- name: Get failed containers from docker
  # docker_svc carries no "status" key when "Does docker run" was skipped,
  # so the binary check must come first; the conditions are evaluated in
  # order and the second one is not reached on hosts without docker.
  when:
    - docker_stat.stat.exists
    - docker_svc.status['SubState'] == 'running'
  shell: >
    {% raw %}
    docker ps -a --filter 'status=exited' --format '{{.Names}} {{.Status}}'
    {% endraw %}
  register: failed_docker
  # Read-only query: never report a change.
  changed_when: false

- name: Fail if we detect failed container
  fail:
    msg: |
      Failed container detected. Please check the following locations
      /var/log/extra/failed_containers.log
      /var/log/extra/
  when: item is not match(".* Exited \(0\) .* ago")
  # A skipped query registers no stdout_lines; default to an empty list so
  # hosts running a single engine (or none) do not error out here.
  loop: "{{ (failed_podman.stdout_lines | default([])) + (failed_docker.stdout_lines | default([])) }}"

View File

@ -0,0 +1,3 @@
---
# Run the container checks first, then the systemd unit checks.
- name: Include container validation tasks
  include_tasks: containers.yaml

- name: Include systemd validation tasks
  include_tasks: systemd.yaml

View File

@ -0,0 +1,10 @@
---
# Fail the run when systemd lists any unit in the "failed" state.

- name: Get failed services from Systemd
  # No shell features are needed, so the command module is sufficient.
  command: systemctl list-units --failed --plain --no-legend --no-pager
  register: systemd_state
  # Read-only query: never report a change.
  changed_when: false

- name: Fails if we find failed systemd units
  assert:
    that:
      - systemd_state.stdout_lines|length == 0
    # Show the offending units instead of a bare assertion failure.
    msg: |
      Failed systemd units detected:
      {{ systemd_state.stdout }}