Raise an error if a service or container has failed
Sometimes a container or service does not start, and this doesn't make the CI fail. Until now, failed containers have been listed in the /var/log/extra/ tree, but that list is not checked on a regular basis. This patch makes the run fail hard if either a service or a container doesn't start as expected. Co-Authored-By: Cédric Jeanneret <cjeanner@redhat.com> Related-Bug: #1816523 Change-Id: I001e2f27d2b562bb0be87c8eaadcf3622e530498
This commit is contained in:
parent
37b2a2643c
commit
74983421a1
@ -42,6 +42,7 @@
|
|||||||
gather_facts: false
|
gather_facts: false
|
||||||
roles:
|
roles:
|
||||||
- {role: validate-tempest, when: run_tempest|bool}
|
- {role: validate-tempest, when: run_tempest|bool}
|
||||||
|
- {role: validate-services, when: validate_services|bool}
|
||||||
tags:
|
tags:
|
||||||
- standalone
|
- standalone
|
||||||
|
|
||||||
|
@ -18,3 +18,4 @@
|
|||||||
roles:
|
roles:
|
||||||
- {role: validate-simple, when: test_ping|bool}
|
- {role: validate-simple, when: test_ping|bool}
|
||||||
- {role: validate-tempest, when: run_tempest|bool}
|
- {role: validate-tempest, when: run_tempest|bool}
|
||||||
|
- {role: validate-services, when: validate_services|bool}
|
||||||
|
@ -34,3 +34,15 @@
|
|||||||
gather_facts: true
|
gather_facts: true
|
||||||
roles:
|
roles:
|
||||||
- {role: validate-ui, when: validate_ui_simple|bool and undercloud_enable_ui|default(true)|bool}
|
- {role: validate-ui, when: validate_ui_simple|bool and undercloud_enable_ui|default(true)|bool}
|
||||||
|
|
||||||
|
# Final validation play: check that services and containers are healthy.
# The validate-services role only runs when `validate_services` is truthy.
- name: Execute simple service and container validations
  hosts:
    - undercloud
    - overcloud
  gather_facts: false
  tags:
    - overcloud-validate
    - undercloud-validate
  roles:
    - role: validate-services
      when: validate_services|bool
|
||||||
|
@ -0,0 +1,5 @@
|
|||||||
|
---
features:
  - >-
    New "validate_services" boolean flag in order to check the service state
  - >-
    New "validate-services" role in charge of container and service state
    validation
|
@ -268,6 +268,9 @@
|
|||||||
echo "" >> $ALL_FILE;
|
echo "" >> $ALL_FILE;
|
||||||
done;
|
done;
|
||||||
|
|
||||||
|
# Get only failed containers, in a dedicated file
|
||||||
|
${engine} ps -a | grep -vE ' (IMAGE|Exited \(0\)|Up) ' &>> /var/log/extra/failed_containers.log;
|
||||||
|
|
||||||
for cont in $(${engine} ps | awk {'print $NF'} | grep -v NAMES); do
|
for cont in $(${engine} ps | awk {'print $NF'} | grep -v NAMES); do
|
||||||
INFO_DIR=$BASE_CONTAINER_EXTRA/containers/${cont};
|
INFO_DIR=$BASE_CONTAINER_EXTRA/containers/${cont};
|
||||||
mkdir -p $INFO_DIR;
|
mkdir -p $INFO_DIR;
|
||||||
|
@ -61,3 +61,6 @@ test_ping: true
|
|||||||
|
|
||||||
# install and execute dstat across nodes
|
# install and execute dstat across nodes
|
||||||
validate_performance: true
|
validate_performance: true
|
||||||
|
|
||||||
|
# If `validate_services` is `true`, run the validate-services role
|
||||||
|
validate_services: false
|
||||||
|
44
roles/validate-services/tasks/containers.yaml
Normal file
44
roles/validate-services/tasks/containers.yaml
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
---
# Container validation tasks.
#
# Detects which container engines are present (docker and/or podman), lists
# their exited containers, and fails if any of them exited with a status
# other than "Exited (0)".

- name: Do we have docker
  stat:
    path: /usr/bin/docker
  register: docker_stat

# Only query the docker unit when the binary exists; otherwise this task is
# skipped and `docker_svc` carries no `status` key.
- name: Does docker run
  systemd:
    name: docker
  register: docker_svc
  when: docker_stat.stat.exists

- name: Do we have podman
  stat:
    path: /usr/bin/podman
  register: podman_stat

# Check both docker and podman, even if we don't have any container.
# This will help catching failed container in mixed env, for example while
# migrating from docker to podman or the reverse.
- name: Get failed containers for podman
  when: podman_stat.stat.exists
  shell: >
    {% raw %}
    podman ps -a --filter 'status=exited' --format '{{.Names}} {{.Status}}'
    {% endraw %}
  register: failed_podman
  # Read-only query: never report the host as changed.
  changed_when: false

- name: Get failed containers from docker
  # Conditions are evaluated in order, so `docker_svc.status` is only
  # dereferenced when the "Does docker run" task actually ran.
  when:
    - docker_stat.stat.exists
    - docker_svc.status['SubState'] == 'running'
  shell: >
    {% raw %}
    docker ps -a --filter 'status=exited' --format '{{.Names}} {{.Status}}'
    {% endraw %}
  register: failed_docker
  # Read-only query: never report the host as changed.
  changed_when: false

- name: Fail if we detect failed container
  fail:
    msg: |
      Failed container detected: {{ item }}
      Please check the following locations
      /var/log/extra/failed_containers.log
      /var/log/extra/
  # A clean exit ("Exited (0)") is not a failure; anything else is.
  when: item is not match(".* Exited \(0\) .* ago")
  # Either listing task may have been skipped (engine absent or not
  # running), in which case its result has no `stdout_lines`.
  loop: >-
    {{ (failed_podman.stdout_lines | default([]))
       + (failed_docker.stdout_lines | default([])) }}
|
3
roles/validate-services/tasks/main.yaml
Normal file
3
roles/validate-services/tasks/main.yaml
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
---
# Entry point of the validate-services role: run the container checks first,
# then the systemd unit checks.
- name: Validate container state
  include_tasks: containers.yaml

- name: Validate systemd unit state
  include_tasks: systemd.yaml
|
10
roles/validate-services/tasks/systemd.yaml
Normal file
10
roles/validate-services/tasks/systemd.yaml
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
---
# Systemd validation tasks: list units in the "failed" state and fail the
# play when at least one is reported.

- name: Get failed services from Systemd
  # No shell features are needed, so run the binary directly.
  command: systemctl list-units --failed --plain --no-legend --no-pager
  register: systemd_state
  # Read-only query: never report the host as changed.
  changed_when: false

- name: Fails if we find failed systemd units
  assert:
    that:
      - systemd_state.stdout_lines|length == 0
    # Name the offending units so the failure is actionable from the log.
    msg: >-
      Failed systemd units detected:
      {{ systemd_state.stdout_lines | join('; ') }}
|
Loading…
Reference in New Issue
Block a user