Improve ping test coverage during update.

The ping test starts at the beginning of the "update run" phase and
stops after it finishes, i.e. after all roles have been updated.

With this patch we stop, test and restart the ping in-between each
role update.

This means:

 1. we detect errors earlier;
 2. we detect errors related to new flows being created during the
    update run;

Point 2 was discovered to be an important test, as OVN can have
existing flows still working while new flows are breaking. With this
new behavior for the ping test we catch such errors.

The downside is that we are even more sensitive to any
percentage-based testing: the same number of errors will yield a
higher percentage, since we spend less time in the test for each run.
This could be seen as another improvement.

We're splitting the ping test into two stages so that if the ping
fails to start (as it would be for this particular issue) we would
detect it immediately instead of waiting for the end of the run.

When we are doing a batch update (all roles in parallel) we deactivate
that mechanism and fall back to the previous one, as there is no
in-between-role step in that case.

We also prevent the stop-ping script from searching through all home
subdirectories, as I had an issue in local testing where one
subdirectory had unreadable files (after a local podman run). This
shouldn't happen in CI, but it is good to have for local testing.

Change-Id: I7f30f5361773b96de13325f5038c89477b575e65
This commit is contained in:
Sofer Athlan-Guyot 2020-11-12 18:10:35 +01:00
parent f3278cb056
commit a87fcb7ef4
8 changed files with 50 additions and 23 deletions

View File

@ -132,6 +132,7 @@ node_reboot_timeout: 300
# enable l3 agent connectivity check during upgrade
l3_agent_connectivity_check: false
l3_agent_connectivity_check_start_script: "{{ working_dir }}/l3_agent_start_ping.sh"
l3_agent_connectivity_check_wait_script: "{{ working_dir }}/l3_agent_wait_ping.sh"
l3_agent_connectivity_check_stop_script: "{{ working_dir }}/l3_agent_stop_ping.sh"
l3_agent_failover_check: false

View File

@ -6,13 +6,19 @@
state: latest
become: true
become_user: root
- name: create start l3 agent connectivity check scripts
- name: create start l3 agent connectivity check script
template:
src: "l3_agent_start_ping.sh.j2"
dest: "{{ l3_agent_connectivity_check_start_script }}"
mode: 0775
- name: create stop l3 agent connectivity check scripts
- name: create start l3 agent connectivity wait script
template:
src: "l3_agent_wait_ping.sh.j2"
dest: "{{ l3_agent_connectivity_check_wait_script }}"
mode: 0775
- name: create stop l3 agent connectivity check script
template:
src: "l3_agent_stop_ping.sh.j2"
dest: "{{ l3_agent_connectivity_check_stop_script }}"

View File

@ -1,4 +1,10 @@
---
- name: l3 agent connectivity wait until vm is ready
shell: |
source {{ overcloud_rc }}
{{ l3_agent_connectivity_check_wait_script }}
when: l3_agent_connectivity_check
- name: start l3 agent connectivity check
shell: |
source {{ overcloud_rc }}

View File

@ -1,7 +1,4 @@
---
- name: import tasks from l3_agent_connectivity_check_start_script
import_tasks: ../common/l3_agent_connectivity_check_start_script.yml
- name: Are we running in parallel or serially ?
debug:
msg: "{{ (overcloud_batch_update|bool) | ternary('Running in parallel', 'Running serially') }}"
@ -15,9 +12,3 @@
oc_current_role: "{{ item }}"
include_tasks: overcloud_update_run_role.yml
loop: "{{ oc_roles|default(['all'])|batch((overcloud_batch_update|bool) | ternary(100, 1))|list }}"
- name: import tasks from l3_agent_connectivity_check_stop_script
import_tasks: ../common/l3_agent_connectivity_check_stop_script.yml
vars:
current_stage_error: "{{ update_loss_threshold }}"

View File

@ -1,4 +1,7 @@
---
- name: import tasks from l3_agent_connectivity_check_start_script
import_tasks: ../common/l3_agent_connectivity_check_start_script.yml
- name: run overcloud minor update in each of the roles/hostgroups
async: 25200
poll: 0
@ -24,3 +27,8 @@
register: async_poll_results
until: async_poll_results.finished
retries: 25200
- name: import tasks from l3_agent_connectivity_check_stop_script
import_tasks: ../common/l3_agent_connectivity_check_stop_script.yml
vars:
current_stage_error: "{{ update_loss_threshold }}"

View File

@ -14,17 +14,6 @@ else
source {{ working_dir }}/vm_ip.sh
fi
# Block 1 minute for the fip to be ready.
cpt=1
while ! ping -c 1 -w 1 "${VM_IP}"; do
echo "Waiting for fip to be ready ... for $cpt seconds"
if [ $cpt -gt 60 ]; then
echo "The fip ${VM_IP} took more than 1 minute to be available, aborting"
exit 1
fi
cpt=$((cpt+1))
done
# NOTE: the &>> is necessary as if we don't redirect both
# stdout and stderr we will make any script using this one to
# hang until ping finishes. Meaning if some script crashes

View File

@ -22,7 +22,7 @@ function get_cut_time()
kill -s INT $(/usr/sbin/pidof ping)
# print the ping results
PING_RESULT_LOG=$(find "${BASE_DIR}" -iname 'ping_results*' | sort | tail -1)
PING_RESULT_LOG=$(find "${BASE_DIR}" -maxdepth 1 -iname 'ping_results*' | sort | tail -1)
tail -2 $PING_RESULT_LOG
# check results

View File

@ -0,0 +1,26 @@
#!/bin/bash
#
# Wait until the workload VM answers ping so the connectivity check
# can start immediately afterwards.  Sources (or, on first use,
# generates) vm_ip.sh to obtain VM_IP, then blocks until the floating
# IP responds to ping or a one-minute timeout is reached.
#
# Exits 0 once the VM answers; exits 1 on timeout.
if [ -f {{ working_dir }}/vm_ip.sh ]; then
    source {{ working_dir }}/vm_ip.sh
else
{% if workload_sriov | bool %}
    echo export VM_IP=$(openstack server list -f json | jq -r -c '.[0]["Networks"]' | cut -d"=" -f2) > {{ working_dir }}/vm_ip.sh
{% else %}
    echo export VM_IP=$(openstack floating ip list -f json | jq -r -c '.[] | select(.Port) | .["Floating IP Address"]' | head -1) > {{ working_dir }}/vm_ip.sh
{% endif %}
    source {{ working_dir }}/vm_ip.sh
fi
# Block for up to 1 minute for the fip to be ready.  We measure real
# elapsed time with bash's SECONDS builtin rather than counting loop
# iterations: each failed probe costs up to 2s (1s `ping -w 1` timeout
# plus the 1s sleep), so an iteration counter would both misreport the
# wait time and roughly double the advertised one-minute limit.
wait_start=$SECONDS
while ! ping -c 1 -w 1 "${VM_IP}"; do
    elapsed=$((SECONDS - wait_start))
    echo "Waiting for fip to be ready ... for ${elapsed} seconds"
    if [ "$elapsed" -gt 60 ]; then
        echo "The fip ${VM_IP} took more than 1 minute to be available, aborting"
        exit 1
    fi
    sleep 1
done