Raise an error if a service or container has failed

Sometimes a container or service does not start, and this
doesn't make the CI fail. Until now, failed containers have
been listed in the /var/log/extras/ tree, but that tree is
not checked on a regular basis.

This patch introduces a hard failure when either
a service or a container doesn't start as expected.

Co-Authored-By: Cédric Jeanneret <cjeanner@redhat.com>
Related-Bug: #1816523
Change-Id: I001e2f27d2b562bb0be87c8eaadcf3622e530498
changes/29/637729/12
Chandan Kumar 4 years ago committed by Cédric Jeanneret
parent 37b2a2643c
commit 74983421a1
  1. 1
      playbooks/multinode-standalone.yml
  2. 1
      playbooks/multinode-validate.yml
  3. 12
      playbooks/quickstart-extras-validate.yml
  4. 5
      releasenotes/notes/validate_services-ea3d14b982f4f9e8.yaml
  5. 3
      roles/collect-logs/tasks/collect.yml
  6. 3
      roles/extras-common/defaults/main.yml
  7. 44
      roles/validate-services/tasks/containers.yaml
  8. 3
      roles/validate-services/tasks/main.yaml
  9. 10
      roles/validate-services/tasks/systemd.yaml

@ -42,6 +42,7 @@
gather_facts: false
roles:
- {role: validate-tempest, when: run_tempest|bool}
- {role: validate-services, when: validate_services|bool}
tags:
- standalone

@ -18,3 +18,4 @@
roles:
- {role: validate-simple, when: test_ping|bool}
- {role: validate-tempest, when: run_tempest|bool}
- {role: validate-services, when: validate_services|bool}

@ -34,3 +34,15 @@
gather_facts: true
roles:
- {role: validate-ui, when: validate_ui_simple|bool and undercloud_enable_ui|default(true)|bool}
# Final sanity pass: verify that services and containers are healthy on
# both the undercloud and the overcloud nodes.
- name: Execute simple service and container validations
  hosts:
    - undercloud
    - overcloud
  gather_facts: false
  tags:
    - overcloud-validate
    - undercloud-validate
  roles:
    # Only runs when the validate_services flag evaluates to true.
    - role: validate-services
      when: validate_services|bool

@ -0,0 +1,5 @@
---
# Release note entries for the service/container validation feature.
features:
  - >-
    New "validate_services" boolean flag in order to check the service state
  - >-
    New "validate-services" role in charge of container and service state
    validation

@ -268,6 +268,9 @@
echo "" >> $ALL_FILE;
done;
# Get only failed containers, in a dedicated file
${engine} ps -a | grep -vE ' (IMAGE|Exited \(0\)|Up) ' &>> /var/log/extra/failed_containers.log;
for cont in $(${engine} ps | awk {'print $NF'} | grep -v NAMES); do
INFO_DIR=$BASE_CONTAINER_EXTRA/containers/${cont};
mkdir -p $INFO_DIR;

@ -61,3 +61,6 @@ test_ping: true
# install and execute dstat across nodes
validate_performance: true
# If `validate_services` is `true`, run the validate-services role, which
# checks for failed containers and for failed systemd units.
# Disabled by default.
validate_services: false

@ -0,0 +1,44 @@
---
# Container validation: detect which container engines are present on the
# host, list their exited containers, and fail if any of them exited with a
# non-zero status.

- name: Do we have docker
  stat:
    path: /usr/bin/docker
  register: docker_stat

# Only query the unit when the docker binary exists. When this task is
# skipped, docker_svc is still registered but carries no "status" key.
- name: Does docker run
  systemd:
    name: docker
  register: docker_svc
  when: docker_stat.stat.exists

- name: Do we have podman
  stat:
    path: /usr/bin/podman
  register: podman_stat

# Check both docker and podman, even if we don't have any container.
# This will help catching failed container in mixed env, for example while
# migrating from docker to podman or the reverse.
- name: Get failed containers for podman
  when: podman_stat.stat.exists
  shell: >
    {% raw %}
    podman ps -a --filter 'status=exited' --format '{{.Names}} {{.Status}}'
    {% endraw %}
  register: failed_podman
  # Read-only query: never report this task as "changed".
  changed_when: false

# Guard on "status is defined" first: on hosts without docker the previous
# systemd task was skipped, and dereferencing docker_svc.status['SubState']
# would abort the play with an undefined-variable error.
- name: Get failed containers from docker
  when: docker_svc.status is defined and docker_svc.status['SubState'] == 'running'
  shell: >
    {% raw %}
    docker ps -a --filter 'status=exited' --format '{{.Names}} {{.Status}}'
    {% endraw %}
  register: failed_docker
  # Read-only query: never report this task as "changed".
  changed_when: false

# Every listed container has exited; only those whose status is not
# "Exited (0)" are treated as failures. Either listing task may have been
# skipped (engine absent or not running), in which case its registered
# result has no stdout_lines, hence the default([]) on both operands.
# The paths below match the directory the collect-logs role writes
# failed_containers.log into.
- name: Fail if we detect failed container
  fail:
    msg: |
      Failed container detected. Please check the following locations
      /var/log/extra/failed_containers.log
      /var/log/extra/
  when: item is not match(".* Exited \(0\) .* ago")
  loop: "{{ (failed_podman.stdout_lines | default([])) + (failed_docker.stdout_lines | default([])) }}"

@ -0,0 +1,3 @@
---
# Entry point of the validate-services role: run the container checks
# first, then the systemd unit checks.
- include_tasks: containers.yaml
- include_tasks: systemd.yaml

@ -0,0 +1,10 @@
---
# Systemd validation: fail the run when systemd reports any unit in the
# "failed" state.

# --plain/--no-legend/--no-pager keep the output to one unit per line with
# no header, footer or pager, so an empty stdout means "no failed unit".
# No shell features are needed, so the command module is used.
- name: Get failed services from Systemd
  command: systemctl list-units --failed --plain --no-legend --no-pager
  register: systemd_state
  # Read-only query: never report this task as "changed".
  changed_when: false

# Name the offending units in the failure message so the cause is visible
# without re-running with extra verbosity.
- name: Fails if we find failed systemd units
  assert:
    that:
      - systemd_state.stdout_lines|length == 0
    msg: "Failed systemd units detected: {{ systemd_state.stdout_lines | join(', ') }}"
Loading…
Cancel
Save