nova/gate/live_migration/hooks/ceph.sh
Lee Yarwood 70447bca2f nova-live-migration: Wait for n-cpu services to come up after configuring Ceph
Previously the ceph.sh script used during the nova-live-migration job
would only grep for a `compute` process when checking if the services
had been restarted. This check was bogus and would always return 0 as it
would always match itself. For example:

2020-03-13 21:06:47.682073 | primary | 2020-03-13 21:06:47.681 | root
29529  0.0  0.0   4500   736 pts/0    S+   21:06   0:00 /bin/sh -c ps
       aux | grep compute
2020-03-13 21:06:47.683964 | primary | 2020-03-13 21:06:47.683 | root
29531  0.0  0.0  14616   944 pts/0    S+   21:06   0:00 grep compute

Failures of this job were seen on the stable/pike branch where slower CI
nodes appeared to struggle to allow Libvirt to report to n-cpu in time
before Tempest was started. This in-turn caused instance build failures
and the overall failure of the job.

This change resolves this issue by switching to pgrep and ensuring
n-cpu services are reported as fully up after a cold restart before
starting the Tempest test run.

Closes-Bug: 1867380
Change-Id: Icd7ab2ca4ddbed92c7e883a63a23245920d961e7
(cherry picked from commit e23c3c2c8d)
2020-03-19 11:03:45 +00:00

208 lines
12 KiB
Bash
Executable File

#!/bin/bash
function prepare_ceph {
git clone https://opendev.org/openstack/devstack-plugin-ceph /tmp/devstack-plugin-ceph
source /tmp/devstack-plugin-ceph/devstack/settings
source /tmp/devstack-plugin-ceph/devstack/lib/ceph
install_ceph
configure_ceph
#install ceph-common package on compute nodes
$ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m raw -a "executable=/bin/bash
source $BASE/new/devstack/functions
source $BASE/new/devstack/functions-common
git clone https://opendev.org/openstack/devstack-plugin-ceph /tmp/devstack-plugin-ceph
source /tmp/devstack-plugin-ceph/devstack/lib/ceph
install_ceph_remote
"
#copy ceph admin keyring to compute nodes
sudo cp /etc/ceph/ceph.client.admin.keyring /tmp/ceph.client.admin.keyring
sudo chown ${STACK_USER}:${STACK_USER} /tmp/ceph.client.admin.keyring
sudo chmod 644 /tmp/ceph.client.admin.keyring
$ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m copy -a "src=/tmp/ceph.client.admin.keyring dest=/etc/ceph/ceph.client.admin.keyring owner=ceph group=ceph"
sudo rm -f /tmp/ceph.client.admin.keyring
#copy ceph.conf to compute nodes
$ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m copy -a "src=/etc/ceph/ceph.conf dest=/etc/ceph/ceph.conf owner=root group=root"
start_ceph
}
function _ceph_configure_glance {
GLANCE_API_CONF=${GLANCE_API_CONF:-/etc/glance/glance-api.conf}
sudo ceph -c ${CEPH_CONF_FILE} osd pool create ${GLANCE_CEPH_POOL} ${GLANCE_CEPH_POOL_PG} ${GLANCE_CEPH_POOL_PGP}
sudo ceph -c ${CEPH_CONF_FILE} auth get-or-create client.${GLANCE_CEPH_USER} \
mon "allow r" \
osd "allow class-read object_prefix rbd_children, allow rwx pool=${GLANCE_CEPH_POOL}" | \
sudo tee ${CEPH_CONF_DIR}/ceph.client.${GLANCE_CEPH_USER}.keyring
sudo chown ${STACK_USER}:$(id -g -n $whoami) ${CEPH_CONF_DIR}/ceph.client.${GLANCE_CEPH_USER}.keyring
$ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${GLANCE_API_CONF} section=DEFAULT option=show_image_direct_url value=True"
$ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${GLANCE_API_CONF} section=glance_store option=default_store value=rbd"
$ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${GLANCE_API_CONF} section=glance_store option=stores value='file, http, rbd'"
$ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${GLANCE_API_CONF} section=glance_store option=rbd_store_ceph_conf value=$CEPH_CONF_FILE"
$ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${GLANCE_API_CONF} section=glance_store option=rbd_store_user value=$GLANCE_CEPH_USER"
$ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${GLANCE_API_CONF} section=glance_store option=rbd_store_pool value=$GLANCE_CEPH_POOL"
sudo ceph -c ${CEPH_CONF_FILE} osd pool set ${GLANCE_CEPH_POOL} size ${CEPH_REPLICAS}
if [[ $CEPH_REPLICAS -ne 1 ]]; then
sudo ceph -c ${CEPH_CONF_FILE} osd pool set ${GLANCE_CEPH_POOL} crush_ruleset ${RULE_ID}
fi
#copy glance keyring to compute only node
sudo cp /etc/ceph/ceph.client.glance.keyring /tmp/ceph.client.glance.keyring
sudo chown $STACK_USER:$STACK_USER /tmp/ceph.client.glance.keyring
$ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m copy -a "src=/tmp/ceph.client.glance.keyring dest=/etc/ceph/ceph.client.glance.keyring"
sudo rm -f /tmp/ceph.client.glance.keyring
}
function configure_and_start_glance {
_ceph_configure_glance
echo 'check processes before glance-api stop'
$ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "ps aux | grep glance-api"
# restart glance
$ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "systemctl restart devstack@g-api"
echo 'check processes after glance-api stop'
$ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "ps aux | grep glance-api"
}
function _ceph_configure_nova {
#setup ceph for nova, we don't reuse configure_ceph_nova - as we need to emulate case where cinder is not configured for ceph
sudo ceph -c ${CEPH_CONF_FILE} osd pool create ${NOVA_CEPH_POOL} ${NOVA_CEPH_POOL_PG} ${NOVA_CEPH_POOL_PGP}
NOVA_CONF=${NOVA_CPU_CONF:-/etc/nova/nova.conf}
$ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=rbd_user value=${CINDER_CEPH_USER}"
$ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=rbd_secret_uuid value=${CINDER_CEPH_UUID}"
$ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=inject_key value=false"
$ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=inject_partition value=-2"
$ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=disk_cachemodes value='network=writeback'"
$ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=images_type value=rbd"
$ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=images_rbd_pool value=${NOVA_CEPH_POOL}"
$ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=images_rbd_ceph_conf value=${CEPH_CONF_FILE}"
sudo ceph -c ${CEPH_CONF_FILE} auth get-or-create client.${CINDER_CEPH_USER} \
mon "allow r" \
osd "allow class-read object_prefix rbd_children, allow rwx pool=${CINDER_CEPH_POOL}, allow rwx pool=${NOVA_CEPH_POOL},allow rwx pool=${GLANCE_CEPH_POOL}" | \
sudo tee ${CEPH_CONF_DIR}/ceph.client.${CINDER_CEPH_USER}.keyring > /dev/null
sudo chown ${STACK_USER}:$(id -g -n $whoami) ${CEPH_CONF_DIR}/ceph.client.${CINDER_CEPH_USER}.keyring
#copy cinder keyring to compute only node
sudo cp /etc/ceph/ceph.client.cinder.keyring /tmp/ceph.client.cinder.keyring
sudo chown stack:stack /tmp/ceph.client.cinder.keyring
$ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m copy -a "src=/tmp/ceph.client.cinder.keyring dest=/etc/ceph/ceph.client.cinder.keyring"
sudo rm -f /tmp/ceph.client.cinder.keyring
sudo ceph -c ${CEPH_CONF_FILE} osd pool set ${NOVA_CEPH_POOL} size ${CEPH_REPLICAS}
if [[ $CEPH_REPLICAS -ne 1 ]]; then
sudo ceph -c ${CEPH_CONF_FILE} osd pool set ${NOVA_CEPH_POOL} crush_ruleset ${RULE_ID}
fi
}
function _wait_for_nova_compute_service_state {
source $BASE/new/devstack/openrc admin admin
local status=$1
local attempt=1
local max_attempts=24
local attempt_sleep=5
local computes_count=$(openstack compute service list | grep -c nova-compute)
local computes_ready=$(openstack compute service list | grep nova-compute | grep $status | wc -l)
echo "Waiting for $computes_count computes to report as $status"
while [ "$computes_ready" -ne "$computes_count" ]; do
if [ "$attempt" -eq "$max_attempts" ]; then
echo "Failed waiting for computes to report as ${status}, ${computes_ready}/${computes_count} ${status} after ${max_attempts} attempts"
exit 4
fi
echo "Waiting ${attempt_sleep} seconds for ${computes_count} computes to report as ${status}, ${computes_ready}/${computes_count} ${status} after ${attempt}/${max_attempts} attempts"
sleep $attempt_sleep
attempt=$((attempt+1))
computes_ready=$(openstack compute service list | grep nova-compute | grep $status | wc -l)
done
echo "All computes are now reporting as ${status} after ${attempt} attempts"
}
function configure_and_start_nova {
echo "Checking all n-cpu services"
$ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "pgrep -u stack -a nova-compute"
# stop nova-compute
echo "Stopping all n-cpu services"
$ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "systemctl stop devstack@n-cpu"
# Wait for the service to be marked as down
_wait_for_nova_compute_service_state "down"
_ceph_configure_nova
#import secret to libvirt
_populate_libvirt_secret
# start nova-compute
echo "Starting all n-cpu services"
$ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "systemctl start devstack@n-cpu"
echo "Checking all n-cpu services"
# test that they are all running again
$ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "pgrep -u stack -a nova-compute"
# Wait for the service to be marked as up
_wait_for_nova_compute_service_state "up"
}
function _ceph_configure_cinder {
sudo ceph -c ${CEPH_CONF_FILE} osd pool create ${CINDER_CEPH_POOL} ${CINDER_CEPH_POOL_PG} ${CINDER_CEPH_POOL_PGP}
sudo ceph -c ${CEPH_CONF_FILE} osd pool set ${CINDER_CEPH_POOL} size ${CEPH_REPLICAS}
if [[ $CEPH_REPLICAS -ne 1 ]]; then
sudo ceph -c ${CEPH_CONF_FILE} osd pool set ${CINDER_CEPH_POOL} crush_ruleset ${RULE_ID}
fi
CINDER_CONF=${CINDER_CONF:-/etc/cinder/cinder.conf}
$ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=volume_backend_name value=ceph"
$ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=volume_driver value=cinder.volume.drivers.rbd.RBDDriver"
$ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=rbd_ceph_conf value=$CEPH_CONF_FILE"
$ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=rbd_pool value=$CINDER_CEPH_POOL"
$ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=rbd_user value=$CINDER_CEPH_USER"
$ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=rbd_uuid value=$CINDER_CEPH_UUID"
$ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=rbd_flatten_volume_from_snapshot value=False"
$ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=rbd_max_clone_depth value=5"
$ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=DEFAULT option=default_volume_type value=ceph"
$ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=DEFAULT option=enabled_backends value=ceph"
}
function configure_and_start_cinder {
_ceph_configure_cinder
# restart cinder
$ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "systemctl restart devstack@c-vol"
source $BASE/new/devstack/openrc
export OS_USERNAME=admin
export OS_PROJECT_NAME=admin
lvm_type=$(cinder type-list | awk -F "|" 'NR==4{ print $2}')
cinder type-delete $lvm_type
openstack volume type create --os-volume-api-version 1 --property volume_backend_name="ceph" ceph
}
function _populate_libvirt_secret {
cat > /tmp/secret.xml <<EOF
<secret ephemeral='no' private='no'>
<uuid>${CINDER_CEPH_UUID}</uuid>
<usage type='ceph'>
<name>client.${CINDER_CEPH_USER} secret</name>
</usage>
</secret>
EOF
$ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m copy -a "src=/tmp/secret.xml dest=/tmp/secret.xml"
$ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "virsh secret-define --file /tmp/secret.xml"
local secret=$(sudo ceph -c ${CEPH_CONF_FILE} auth get-key client.${CINDER_CEPH_USER})
# TODO(tdurakov): remove this escaping as https://github.com/ansible/ansible/issues/13862 fixed
secret=${secret//=/'\='}
$ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "virsh secret-set-value --secret ${CINDER_CEPH_UUID} --base64 $secret"
$ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m file -a "path=/tmp/secret.xml state=absent"
}