nova-live-migration: Wait for n-cpu services to come up after configuring Ceph

Previously the ceph.sh script used during the nova-live-migration job would only grep for a `compute` process when checking if the services had been restarted. This check was bogus and would always return 0, because the `ps aux | grep compute` pipeline always matched its own processes. For example:

    2020-03-13 21:06:47.682073 | primary | 2020-03-13 21:06:47.681 | root 29529 0.0 0.0 4500 736 pts/0 S+ 21:06 0:00 /bin/sh -c ps aux | grep compute
    2020-03-13 21:06:47.683964 | primary | 2020-03-13 21:06:47.683 | root 29531 0.0 0.0 14616 944 pts/0 S+ 21:06 0:00 grep compute

Failures of this job were seen on the stable/pike branch, where slower CI nodes appeared to struggle to allow Libvirt to report to n-cpu in time before Tempest was started. This in turn caused instance build failures and the overall failure of the job.

This change resolves this issue by switching to pgrep and ensuring n-cpu services are reported as fully up after a cold restart before starting the Tempest test run.

Closes-Bug: 1867380
Change-Id: Icd7ab2ca4ddbed92c7e883a63a23245920d961e7
(cherry picked from commit e23c3c2c8d)
2020-03-13 16:51:01 +00:00 · 2020-03-13 16:51:01 +00:00 · 70447bca2f
parent c48d621843
commit 70447bca2f
1 changed files with 42 additions and 5 deletions
--- a/gate/live_migration/hooks/ceph.sh
+++ b/gate/live_migration/hooks/ceph.sh
@@ -98,19 +98,56 @@ function _ceph_configure_nova {
    fi
 }

+function _wait_for_nova_compute_service_state {
+    source $BASE/new/devstack/openrc admin admin
+    local status=$1
+    local attempt=1
+    local max_attempts=24
+    local attempt_sleep=5
+    local computes_count=$(openstack compute service list | grep -c nova-compute)
+    local computes_ready=$(openstack compute service list | grep nova-compute | grep $status | wc -l)
+
+    echo "Waiting for $computes_count computes to report as $status"
+    while [ "$computes_ready" -ne "$computes_count" ]; do
+        if [ "$attempt" -eq "$max_attempts" ]; then
+            echo "Failed waiting for computes to report as ${status}, ${computes_ready}/${computes_count} ${status} after ${max_attempts} attempts"
+            exit 4
+        fi
+        echo "Waiting ${attempt_sleep} seconds for ${computes_count} computes to report as ${status}, ${computes_ready}/${computes_count} ${status} after ${attempt}/${max_attempts} attempts"
+        sleep $attempt_sleep
+        attempt=$((attempt+1))
+        computes_ready=$(openstack compute service list | grep nova-compute | grep $status | wc -l)
+    done
+    echo "All computes are now reporting as ${status} after ${attempt} attempts"
+}
+
 function configure_and_start_nova {
+
+    echo "Checking all n-cpu services"
+    $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "pgrep -u stack -a nova-compute"
+
+    # stop nova-compute
+    echo "Stopping all n-cpu services"
+    $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "systemctl stop devstack@n-cpu"
+
+    # Wait for the service to be marked as down
+    _wait_for_nova_compute_service_state "down"
+
    _ceph_configure_nova
+
    #import secret to libvirt
    _populate_libvirt_secret
-    echo 'check compute processes before restart'
-    $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "ps aux | grep compute"

-    # restart nova-compute
-    $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "systemctl restart devstack@n-cpu"
+    # start nova-compute
+    echo "Starting all n-cpu services"
+    $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "systemctl start devstack@n-cpu"

+    echo "Checking all n-cpu services"
    # test that they are all running again
-    $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "ps aux | grep compute"
+    $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "pgrep -u stack -a nova-compute"

+    # Wait for the service to be marked as up
+    _wait_for_nova_compute_service_state "up"
 }

 function _ceph_configure_cinder {