Improve ping test coverage during update.

The ping test starts at the beginning of the "update run" phase and
stops after it finishes, i.e. after all roles have been updated.

With this patch we stop, test and restart the ping in-between each
role update.

This means:

 1. we detect errors earlier;
 2. we detect errors related to new flows being created during the
    update run;

Point 2 was discovered to be an important test, as OVN can have
existing flows still working while new flows are breaking. With this
new behavior for the ping test we catch such errors.

The downside is that we are even more sensitive to any
percentage-based testing: the same number of errors will yield a
higher percentage, since we spend less time in the test for each run.
This could be seen as another improvement.

We're splitting the ping test into two stages so that if the ping
fails to start (as it would be for this particular issue) we would
detect it immediately instead of waiting for the end of the run.

When we are doing a batch update (all roles in parallel) we deactivate
that mechanism and fall back to the previous one, as there is no
in-between-role step in that case.

We also prevent the stop-ping script from searching through all home
subdirectories, as I had an issue in local testing where one
subdirectory had unreadable files (after a local podman run). This
shouldn't happen in CI, but it is good to have for local testing.

Change-Id: I7f30f5361773b96de13325f5038c89477b575e65
This commit is contained in:
Sofer Athlan-Guyot 2020-11-12 18:10:35 +01:00
parent f3278cb056
commit a87fcb7ef4
8 changed files with 50 additions and 23 deletions

View File

@ -132,6 +132,7 @@ node_reboot_timeout: 300
# enable l3 agent connectivity check during upgrade
l3_agent_connectivity_check: false
l3_agent_connectivity_check_start_script: "{{ working_dir }}/l3_agent_start_ping.sh"
l3_agent_connectivity_check_wait_script: "{{ working_dir }}/l3_agent_wait_ping.sh"
l3_agent_connectivity_check_stop_script: "{{ working_dir }}/l3_agent_stop_ping.sh"
l3_agent_failover_check: false

View File

@ -6,13 +6,19 @@
state: latest
become: true
become_user: root
- name: create start l3 agent connectivity check scripts
- name: create start l3 agent connectivity check script
template:
src: "l3_agent_start_ping.sh.j2"
dest: "{{ l3_agent_connectivity_check_start_script }}"
mode: 0775
- name: create stop l3 agent connectivity check scripts
- name: create start l3 agent connectivity wait script
template:
src: "l3_agent_wait_ping.sh.j2"
dest: "{{ l3_agent_connectivity_check_wait_script }}"
mode: 0775
- name: create stop l3 agent connectivity check script
template:
src: "l3_agent_stop_ping.sh.j2"
dest: "{{ l3_agent_connectivity_check_stop_script }}"

View File

@ -1,4 +1,10 @@
---
- name: l3 agent connectivity wait until vm is ready
shell: |
source {{ overcloud_rc }}
{{ l3_agent_connectivity_check_wait_script }}
when: l3_agent_connectivity_check
- name: start l3 agent connectivity check
shell: |
source {{ overcloud_rc }}

View File

@ -1,7 +1,4 @@
---
- name: import tasks from l3_agent_connectivity_check_start_script
import_tasks: ../common/l3_agent_connectivity_check_start_script.yml
- name: Are we running in parallel or serially ?
debug:
msg: "{{ (overcloud_batch_update|bool) | ternary('Running in parallel', 'Running serially') }}"
@ -15,9 +12,3 @@
oc_current_role: "{{ item }}"
include_tasks: overcloud_update_run_role.yml
loop: "{{ oc_roles|default(['all'])|batch((overcloud_batch_update|bool) | ternary(100, 1))|list }}"
- name: import tasks from l3_agent_connectivity_check_stop_script
import_tasks: ../common/l3_agent_connectivity_check_stop_script.yml
vars:
current_stage_error: "{{ update_loss_threshold }}"

View File

@ -1,4 +1,7 @@
---
- name: import tasks from l3_agent_connectivity_check_start_script
import_tasks: ../common/l3_agent_connectivity_check_start_script.yml
- name: run overcloud minor update in each of the roles/hostgroups
async: 25200
poll: 0
@ -24,3 +27,8 @@
register: async_poll_results
until: async_poll_results.finished
retries: 25200
- name: import tasks from l3_agent_connectivity_check_stop_script
import_tasks: ../common/l3_agent_connectivity_check_stop_script.yml
vars:
current_stage_error: "{{ update_loss_threshold }}"

View File

@ -14,17 +14,6 @@ else
source {{ working_dir }}/vm_ip.sh
fi
# Block 1 minute for the fip to be ready.
cpt=1
while ! ping -c 1 -w 1 "${VM_IP}"; do
echo "Waiting for fip to be ready ... for $cpt seconds"
if [ $cpt -gt 60 ]; then
echo "The fip ${VM_IP} took more than 1 minute to be available, aborting"
exit 1
fi
cpt=$((cpt+1))
done
# NOTE: the &>> is necessary as if we don't redirect both
# stdout and stderr we will make any script using this one to
# hang until ping finishes. Meaning if some script crashes

View File

@ -22,7 +22,7 @@ function get_cut_time()
kill -s INT $(/usr/sbin/pidof ping)
# print the ping results
PING_RESULT_LOG=$(find "${BASE_DIR}" -iname 'ping_results*' | sort | tail -1)
PING_RESULT_LOG=$(find "${BASE_DIR}" -maxdepth 1 -iname 'ping_results*' | sort | tail -1)
tail -2 $PING_RESULT_LOG
# check results

View File

@ -0,0 +1,26 @@
#!/bin/bash
#
# Wait until the workload VM answers ping so the connectivity check
# can start immediately afterwards.  Sources (or, on first use,
# generates) vm_ip.sh to obtain VM_IP, then blocks until the floating
# IP responds to ping or a one-minute timeout is reached.
#
# Exits 0 once the VM answers; exits 1 on timeout.
if [ -f {{ working_dir }}/vm_ip.sh ]; then
    source {{ working_dir }}/vm_ip.sh
else
{% if workload_sriov | bool %}
    echo export VM_IP=$(openstack server list -f json | jq -r -c '.[0]["Networks"]' | cut -d"=" -f2) > {{ working_dir }}/vm_ip.sh
{% else %}
    echo export VM_IP=$(openstack floating ip list -f json | jq -r -c '.[] | select(.Port) | .["Floating IP Address"]' | head -1) > {{ working_dir }}/vm_ip.sh
{% endif %}
    source {{ working_dir }}/vm_ip.sh
fi
# Block for up to 1 minute for the fip to be ready.  We measure real
# elapsed time with bash's SECONDS builtin rather than counting loop
# iterations: each failed probe costs up to 2s (1s `ping -w 1` timeout
# plus the 1s sleep), so an iteration counter would both misreport the
# wait time and roughly double the advertised one-minute limit.
wait_start=$SECONDS
while ! ping -c 1 -w 1 "${VM_IP}"; do
    elapsed=$((SECONDS - wait_start))
    echo "Waiting for fip to be ready ... for ${elapsed} seconds"
    if [ "$elapsed" -gt 60 ]; then
        echo "The fip ${VM_IP} took more than 1 minute to be available, aborting"
        exit 1
    fi
    sleep 1
done