Raise an error if a service or container has failed
Sometimes a container or service does not start, and this doesn't make the CI fail. Until now, failed containers have been listed in the /var/log/extras/ tree, but that tree is not checked on a regular basis. This patch introduces a hard failure whenever a service or a container doesn't start as expected. Co-Authored-By: Cédric Jeanneret <cjeanner@redhat.com> Related-Bug: #1816523 Change-Id: I001e2f27d2b562bb0be87c8eaadcf3622e530498
This commit is contained in:
parent
37b2a2643c
commit
74983421a1
@ -42,6 +42,7 @@
|
||||
gather_facts: false
|
||||
roles:
|
||||
- {role: validate-tempest, when: run_tempest|bool}
|
||||
- {role: validate-services, when: validate_services|bool}
|
||||
tags:
|
||||
- standalone
|
||||
|
||||
|
@ -18,3 +18,4 @@
|
||||
roles:
|
||||
- {role: validate-simple, when: test_ping|bool}
|
||||
- {role: validate-tempest, when: run_tempest|bool}
|
||||
- {role: validate-services, when: validate_services|bool}
|
||||
|
@ -34,3 +34,15 @@
|
||||
gather_facts: true
|
||||
roles:
|
||||
- {role: validate-ui, when: validate_ui_simple|bool and undercloud_enable_ui|default(true)|bool}
|
||||
|
||||
# Final validation play: run the validate-services role against both the
# undercloud and the overcloud hosts. The role only runs when the
# `validate_services` flag evaluates to true.
- name: Execute simple service and container validations
  gather_facts: false
  hosts:
    - undercloud
    - overcloud
  tags:
    - overcloud-validate
    - undercloud-validate
  roles:
    - role: validate-services
      when: validate_services|bool
|
||||
|
@ -0,0 +1,5 @@
|
||||
---
# Release note: announces the new flag and role for service/container checks.
features:
  - >-
    New "validate_services" boolean flag in order to check the service state
  - >-
    New "validate-services" role in charge of container and service state
    validation
|
@ -268,6 +268,9 @@
|
||||
echo "" >> $ALL_FILE;
|
||||
done;
|
||||
|
||||
# Get only failed containers, in a dedicated file
|
||||
${engine} ps -a | grep -vE ' (IMAGE|Exited \(0\)|Up) ' &>> /var/log/extra/failed_containers.log;
|
||||
|
||||
for cont in $(${engine} ps | awk {'print $NF'} | grep -v NAMES); do
|
||||
INFO_DIR=$BASE_CONTAINER_EXTRA/containers/${cont};
|
||||
mkdir -p $INFO_DIR;
|
||||
|
@ -61,3 +61,6 @@ test_ping: true
|
||||
|
||||
# install and execute dstat across nodes
|
||||
validate_performance: true
|
||||
|
||||
# If `validate_services` is `true`, run the validate-services role
|
||||
validate_services: false
|
||||
|
44
roles/validate-services/tasks/containers.yaml
Normal file
44
roles/validate-services/tasks/containers.yaml
Normal file
@ -0,0 +1,44 @@
|
||||
---
# Container validation: fail the play when a docker or podman container has
# exited with a non-zero status.
#
# Either engine may be absent. Every task that depends on an engine is
# guarded, and results of skipped tasks are never dereferenced without a
# fallback.

- name: Do we have docker
  stat:
    path: /usr/bin/docker
  register: docker_stat

# No `state` is requested, so this only reads the unit status.
- name: Does docker run
  systemd:
    name: docker
  register: docker_svc
  when: docker_stat.stat.exists

- name: Do we have podman
  stat:
    path: /usr/bin/podman
  register: podman_stat

# Check both docker and podman, even if we don't have any container.
# This will help catching failed container in mixed env, for example while
# migrating from docker to podman or the reverse.
- name: Get failed containers for podman
  when: podman_stat.stat.exists
  shell: >
    {% raw %}
    podman ps -a --filter 'status=exited' --format '{{.Names}} {{.Status}}'
    {% endraw %}
  register: failed_podman
  changed_when: false

# `docker_svc` only carries a `status` key when the "Does docker run" task
# actually ran, so the binary check must come first: conditions in a list are
# evaluated in order and evaluation stops at the first false one.
- name: Get failed containers from docker
  when:
    - docker_stat.stat.exists
    - docker_svc.status['SubState'] == 'running'
  shell: >
    {% raw %}
    docker ps -a --filter 'status=exited' --format '{{.Names}} {{.Status}}'
    {% endraw %}
  register: failed_docker
  changed_when: false

# A skipped query has no `stdout_lines`; default each side to an empty list so
# hosts with a single engine (or none) are still validated.
# The log location matches the path written by the log collection script.
- name: Fail if we detect failed container
  fail:
    msg: |
      Failed container detected. Please check the following locations
      /var/log/extra/failed_containers.log
      /var/log/extra/
  when: item is not match(".* Exited \(0\) .* ago")
  loop: >-
    {{ (failed_podman.stdout_lines | default([]))
       + (failed_docker.stdout_lines | default([])) }}
|
3
roles/validate-services/tasks/main.yaml
Normal file
3
roles/validate-services/tasks/main.yaml
Normal file
@ -0,0 +1,3 @@
|
||||
---
# Role entry point: run the container checks first, then the systemd checks.
- name: Validate container state
  include_tasks: containers.yaml

- name: Validate systemd unit state
  include_tasks: systemd.yaml
|
10
roles/validate-services/tasks/systemd.yaml
Normal file
10
roles/validate-services/tasks/systemd.yaml
Normal file
@ -0,0 +1,10 @@
|
||||
---
# Systemd validation: fail the play when any unit is in the failed state.

# Read-only query, so it never reports a change. No shell feature is needed,
# hence `command` rather than `shell`. The legend and pager are disabled so
# the output holds only unit lines.
- name: Get failed services from Systemd
  command: systemctl list-units --failed --plain --no-legend --no-pager
  register: systemd_state
  changed_when: false

# Any output line means at least one failed unit; name the units in the
# failure message so the log is actionable on its own.
- name: Fails if we find failed systemd units
  assert:
    that:
      - systemd_state.stdout_lines|length == 0
    msg: >-
      Failed systemd units detected:
      {{ systemd_state.stdout_lines | join(', ') }}
|
Loading…
Reference in New Issue
Block a user