nova-live-migration: Wait for n-cpu services to come up after configuring Ceph

Previously the ceph.sh script used during the nova-live-migration job
would only grep for a `compute` process when checking if the services
had been restarted. This check was bogus and would always return 0, as
the grep would always match its own command line. For example:

2020-03-13 21:06:47.682073 | primary | 2020-03-13 21:06:47.681 | root
29529  0.0  0.0   4500   736 pts/0    S+   21:06   0:00 /bin/sh -c ps
       aux | grep compute
2020-03-13 21:06:47.683964 | primary | 2020-03-13 21:06:47.683 | root
29531  0.0  0.0  14616   944 pts/0    S+   21:06   0:00 grep compute

Failures of this job were seen on the stable/pike branch, where slower CI
nodes appeared to struggle to allow Libvirt to report to n-cpu in time
before Tempest was started. This in turn caused instance build failures
and the overall failure of the job.

This change resolves this issue by switching to pgrep and ensuring
n-cpu services are reported as fully up after a cold restart before
starting the Tempest test run.

Closes-Bug: 1867380
Change-Id: Icd7ab2ca4ddbed92c7e883a63a23245920d961e7
This commit is contained in:
Lee Yarwood 2020-03-13 16:51:01 +00:00
parent e483ca1cd9
commit e23c3c2c8d

View File

@ -99,19 +99,56 @@ function _ceph_configure_nova {
fi
}
# Poll the compute API until every nova-compute service reports the requested
# state, aborting the job if that never happens.
#
# Arguments: $1 - state to wait for, as printed by
#                 `openstack compute service list` (callers pass "up"/"down")
# Globals:   BASE (read) - root directory under which devstack's openrc lives
# Outputs:   progress and result messages on stdout
# Exits:     4 when no nova-compute service is listed at all, or when the
#            services have not all reached the state after max_attempts polls
function _wait_for_nova_compute_service_state {
    source "$BASE/new/devstack/openrc" admin admin
    local status=$1
    local attempt=1
    local max_attempts=24
    local attempt_sleep=5
    local computes_count
    local computes_ready

    # grep -c still prints "0" when nothing matches but exits non-zero; the
    # "|| true" keeps that from aborting a caller running with errexit.
    computes_count=$(openstack compute service list | grep -c nova-compute) || true

    # With nothing listed (no computes, or the CLI call itself failed) the
    # ready/count comparison below would be 0 == 0 and report success without
    # any compute ever having been seen in the requested state.
    if [ "${computes_count:-0}" -eq 0 ]; then
        echo "Failed waiting for computes to report as ${status}, no nova-compute services were listed"
        exit 4
    fi

    echo "Waiting for $computes_count computes to report as $status"
    while true; do
        # -w matches the state as a whole word so that it is not satisfied by
        # a substring elsewhere in the row (e.g. "up" inside a host name).
        computes_ready=$(openstack compute service list | grep nova-compute | grep -cw -- "$status") || true
        if [ "$computes_ready" -eq "$computes_count" ]; then
            break
        fi
        if [ "$attempt" -eq "$max_attempts" ]; then
            echo "Failed waiting for computes to report as ${status}, ${computes_ready}/${computes_count} ${status} after ${max_attempts} attempts"
            exit 4
        fi
        echo "Waiting ${attempt_sleep} seconds for ${computes_count} computes to report as ${status}, ${computes_ready}/${computes_count} ${status} after ${attempt}/${max_attempts} attempts"
        sleep "$attempt_sleep"
        attempt=$((attempt+1))
    done
    echo "All computes are now reporting as ${status} after ${attempt} attempts"
}
# Reconfigure Nova for Ceph on every host in the inventory and bring the
# nova-compute (n-cpu) services back.
#
# The services are stopped and confirmed as down, Nova is configured for Ceph
# and the libvirt secret is populated, then the services are started again
# and confirmed as up before this function returns.
#
# Process checks use pgrep rather than "ps aux | grep compute": the latter
# always matches its own command line and so can never fail.
#
# Globals:   ANSIBLE (read)   - ansible command used to run a shell on hosts
#            WORKSPACE (read) - directory containing the ansible inventory
function configure_and_start_nova {
    echo "Checking all n-cpu services"
    $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "pgrep -u stack -a nova-compute"

    # stop nova-compute
    echo "Stopping all n-cpu services"
    $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "systemctl stop devstack@n-cpu"

    # Wait for the service to be marked as down
    _wait_for_nova_compute_service_state "down"

    _ceph_configure_nova

    # import secret to libvirt
    _populate_libvirt_secret

    # start nova-compute; the units were stopped above, so a plain start is
    # sufficient and no separate restart is needed
    echo "Starting all n-cpu services"
    $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "systemctl start devstack@n-cpu"

    # test that they are all running again
    echo "Checking all n-cpu services"
    $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "pgrep -u stack -a nova-compute"

    # Wait for the service to be marked as up
    _wait_for_nova_compute_service_state "up"
}
function _ceph_configure_cinder {