Add more tests for graceful shutdown

Adding more tests for graceful shutdown:
- shut down the destination compute and see how live and cold migrations
progress
- start building an instance and, once the compute has started building it,
shut down the compute service and see whether the build finishes or not.
- revert resize server

Partial implement blueprint nova-services-graceful-shutdown-part1

Change-Id: I57132fb7b7fa614dfc138508581ff5a67aaed906
Signed-off-by: Ghanshyam Maan <gmaan.os14@gmail.com>
This commit is contained in:
Ghanshyam Maan
2026-02-20 04:31:18 +00:00
parent 996c4ff9e8
commit b47d217ca7
5 changed files with 397 additions and 1 deletions

View File

@@ -0,0 +1,43 @@
#!/bin/bash
# Boot the test server "server-build" on the subnode compute and return as
# soon as its vm_state is "building", so that the caller can send SIGTERM to
# the compute service while the build is still in flight.
#
# Required environment:
#   SUBNODE_HOSTNAME - compute host the server is pinned to via --host
#
# Exit codes:
#   0 - server reached the "building" vm_state
#   1 - a prerequisite (env var, image, flavor, network) is missing
#   2 - server became active, went to error, or the wait timed out before
#       SIGTERM could be sent
#   Any other status comes from a failing command (set -e).
source /opt/stack/devstack/openrc admin

set -x
set -e

# Maximum number of one-second polls while waiting for "building".
timeout=60

# Fail fast with a status other than 2. An empty host would shift the CLI
# arguments of "server create", and the resulting usage error could be
# mistaken by the caller for this script's own "exit 2".
if [[ -z "${SUBNODE_HOSTNAME}" ]]; then
  echo "SUBNODE_HOSTNAME must be set" >&2
  exit 1
fi

# Use the first image, flavor and non-shared network that are listed.
image_id=$(openstack image list -f value -c ID | awk 'NR==1{print $1}')
flavor_id=$(openstack flavor list -f value -c ID | awk 'NR==1{print $1}')
network_id=$(openstack network list --no-share -f value -c ID | awk 'NR==1{print $1}')

for required in image_id flavor_id network_id; do
  if [[ -z "${!required}" ]]; then
    echo "Could not determine ${required} for the test server" >&2
    exit 1
  fi
done

echo "Creating test server on subnode"
openstack --os-compute-api-version 2.74 server create \
  --image "${image_id}" --flavor "${flavor_id}" \
  --nic "net-id=${network_id}" --host "${SUBNODE_HOSTNAME}" server-build

# Wait for the server vm_state to reach BUILDING so that we know that compute has
# started the build request.
count=0
while true; do
  vm_state=$(openstack server show server-build -f value -c OS-EXT-STS:vm_state)
  case "${vm_state}" in
    building)
      echo "Server is in Building"
      break
      ;;
    active)
      echo "Server became active before SIGTERM was sent"
      exit 2
      ;;
    error)
      echo "Server went to error vm_state"
      exit 2
      ;;
  esac
  sleep 1
  count=$((count + 1))
  if [[ "${count}" -ge "${timeout}" ]]; then
    echo "Timed out waiting for server to reach BUILDING vm_state"
    exit 2
  fi
done

View File

@@ -0,0 +1,88 @@
#!/bin/bash
# Prepare the "revert resize during graceful shutdown" scenario:
#   1. boot "server-rr" on the subnode compute,
#   2. cold migrate it to the controller and wait for VERIFY_RESIZE,
#   3. start the revert and return once task_state is "resize_reverting".
#
# Required environment:
#   SUBNODE_HOSTNAME    - host the server is initially created on
#   CONTROLLER_HOSTNAME - host the server is cold migrated to
#
# Exit codes:
#   0 - revert resize was observed in progress
#   1 - a prerequisite (env var, image, flavor, network) is missing
#   2 - the migration failed or timed out, the revert never showed up, or the
#       revert finished before it could be observed
#   Any other status comes from a failing command (set -e).
source /opt/stack/devstack/openrc admin

set -x
set -e

# Maximum number of polls while waiting for the revert to start.
timeout=196
# Cold migration wait: migrate_max_polls polls, migrate_poll_interval seconds apart.
migrate_poll_interval=5
migrate_max_polls=20

# Fail fast with a status other than 2. An empty host name would shift the
# CLI arguments, and the resulting usage error could be mistaken by the
# caller for this script's own "exit 2".
for required in SUBNODE_HOSTNAME CONTROLLER_HOSTNAME; do
  if [[ -z "${!required}" ]]; then
    echo "${required} must be set" >&2
    exit 1
  fi
done

# Use the first image, flavor and non-shared network that are listed.
image_id=$(openstack image list -f value -c ID | awk 'NR==1{print $1}')
flavor_id=$(openstack flavor list -f value -c ID | awk 'NR==1{print $1}')
network_id=$(openstack network list --no-share -f value -c ID | awk 'NR==1{print $1}')

for required in image_id flavor_id network_id; do
  if [[ -z "${!required}" ]]; then
    echo "Could not determine ${required} for the test server" >&2
    exit 1
  fi
done

echo "Creating test server on subnode for graceful shutdown revert resize test"
openstack --os-compute-api-version 2.74 server create \
  --image "${image_id}" --flavor "${flavor_id}" \
  --nic "net-id=${network_id}" --host "${SUBNODE_HOSTNAME}" --wait server-rr

echo "Migrate server-rr to ${CONTROLLER_HOSTNAME}"
openstack --os-compute-api-version 2.56 server migrate \
  --host "${CONTROLLER_HOSTNAME}" server-rr

# Wait for the migrate to complete
count=0
while true; do
  status=$(openstack server show server-rr -f value -c status)
  case "${status}" in
    VERIFY_RESIZE)
      echo "Migration completed, server is in VERIFY_RESIZE state"
      break
      ;;
    ERROR)
      echo "Server went to ERROR status during cold migration"
      exit 2
      ;;
  esac
  sleep "${migrate_poll_interval}"
  count=$((count + 1))
  if [[ "${count}" -ge "${migrate_max_polls}" ]]; then
    echo "Timed out waiting for server-rr to reach VERIFY_RESIZE"
    exit 2
  fi
done

# The loop above only falls through once VERIFY_RESIZE was observed, so the
# revert can be requested directly.
echo "Starting revert resize of server-rr"
openstack server resize revert server-rr

# Wait for the revert resize to be in progress.
# NOTE(review): there is no sleep between polls; presumably this is meant to
# catch the short-lived resize_reverting window - confirm.
count=0
while true; do
  task_state=$(openstack server show server-rr -f value -c OS-EXT-STS:task_state)
  if [[ "${task_state}" == "resize_reverting" ]]; then
    echo "Revert resize is in progress"
    # task_state is set by the API before it sends the revert_resize RPC call
    # to compute. Sleep for 2 seconds so that compute has a chance to start
    # the revert_resize before the shutdown is initiated. This is best
    # effort; there is no guarantee for that timing.
    sleep 2
    break
  fi

  status=$(openstack server show server-rr -f value -c status)
  if [[ "${status}" == "ACTIVE" ]]; then
    echo "Revert resize appears to have already completed"
    echo "Revert resize completed before SIGTERM was sent"
    exit 2
  fi

  count=$((count + 1))
  if [[ "${count}" -ge "${timeout}" ]]; then
    echo "Timed out waiting for revert resize to start"
    exit 2
  fi
done

View File

@@ -0,0 +1,25 @@
#!/bin/bash
# Wait for "server-build" to finish building and become ACTIVE, which confirms
# that the build completed during graceful shutdown.
#
# Optional environment:
#   VERIFY_TIMEOUT - maximum number of seconds to wait (default: 300)
#
# Exit codes:
#   0 - server is ACTIVE
#   6 - server went to ERROR
#   7 - server reached neither ACTIVE nor ERROR within the timeout
#   Any other status comes from a failing command (set -e).
source /opt/stack/devstack/openrc admin

set -x
set -e

# Bound the wait so that a server stuck in an intermediate status fails the
# run instead of polling forever.
timeout=${VERIFY_TIMEOUT:-300}
poll_interval=5

build_start=$(date +%s)
while true; do
  status=$(openstack server show server-build -f value -c status)
  build_duration=$(( $(date +%s) - build_start ))
  case "${status}" in
    ACTIVE)
      echo "Build completed in ${build_duration} seconds."
      break
      ;;
    ERROR)
      echo "Server went to ERROR status."
      exit 6
      ;;
  esac
  if [[ "${build_duration}" -ge "${timeout}" ]]; then
    echo "Timed out after ${build_duration} seconds waiting for server-build to become ACTIVE (last status: ${status})."
    exit 7
  fi
  sleep "${poll_interval}"
done

View File

@@ -0,0 +1,35 @@
#!/bin/bash
# Wait for the revert resize of "server-rr" to finish, then check that the
# server is back on the subnode compute.
#
# Required environment:
#   SUBNODE_HOSTNAME - host the server is expected to be on after the revert
# Optional environment:
#   VERIFY_TIMEOUT   - maximum number of seconds to wait (default: 300)
#
# Exit codes:
#   0 - server is ACTIVE with no task_state and is on SUBNODE_HOSTNAME
#   1 - SUBNODE_HOSTNAME is not set
#   3 - server went to ERROR
#   4 - server ended up on an unexpected host
#   5 - the revert did not finish within the timeout
#   Any other status comes from a failing command (set -e).
source /opt/stack/devstack/openrc admin

set -x
set -e

# Without the expected host the final comparison can never succeed; report
# the real cause instead of an "Unexpected host" failure.
if [[ -z "${SUBNODE_HOSTNAME}" ]]; then
  echo "SUBNODE_HOSTNAME must be set" >&2
  exit 1
fi

# Bound the wait so that a server stuck in an intermediate status fails the
# run instead of polling forever.
timeout=${VERIFY_TIMEOUT:-300}
poll_interval=5

# Wait for the server to finish reverting resize
revert_start=$(date +%s)
while true; do
  status=$(openstack server show server-rr -f value -c status)
  task_state=$(openstack server show server-rr -f value -c OS-EXT-STS:task_state)
  revert_duration=$(( $(date +%s) - revert_start ))

  # The revert is only done once the status is ACTIVE and no task is pending;
  # the task_state is accepted both as the string "None" and as empty output.
  if [[ "${status}" == "ACTIVE" && ( "${task_state}" == "None" || -z "${task_state}" ) ]]; then
    echo "Revert resize completed in ${revert_duration} seconds."
    break
  fi
  if [[ "${status}" == "ERROR" ]]; then
    echo "Server went to ERROR status during revert resize"
    exit 3
  fi
  if [[ "${revert_duration}" -ge "${timeout}" ]]; then
    echo "Timed out after ${revert_duration} seconds waiting for revert resize (status: ${status}, task_state: ${task_state})."
    exit 5
  fi
  sleep "${poll_interval}"
done

# Make sure the server moved back to the subnode.
host=$(openstack server show server-rr -f value -c OS-EXT-SRV-ATTR:host)
if [[ "${host}" != "${SUBNODE_HOSTNAME}" ]]; then
  echo "Unexpected host ${host} for server after revert resize during graceful shutdown."
  exit 4
fi

echo "Revert resize during graceful shutdown completed successfully"
echo "Server server-rr is ACTIVE on ${host}"

View File

@@ -100,7 +100,212 @@
script: "cleanup_test_servers.sh server-cm1"
ignore_errors: true
# Scenario: start a live migration of server-lm2, then send SIGTERM to the
# nova-compute service on the controller host (called the "dest compute" in
# the task names below) while the migration is running.
# NOTE(review): start_live_migration.sh, verify_live_migration.sh and
# start_and_verify_compute_service.sh are not shown here; their behavior is
# assumed from the task names - confirm against the scripts.
- name: Graceful shutdown dest compute live migration
block:
- name: Start live migrations of test servers
become: true
become_user: stack
script: "start_live_migration.sh server-lm2"
environment:
SUBNODE_HOSTNAME: "{{ hostvars['compute1']['ansible_hostname'] }}"
CONTROLLER_HOSTNAME: "{{ hostvars['controller']['ansible_hostname'] }}"
register: start_live_migrations_result_dest
# Only rc 0 and rc 2 are accepted; any other return code fails the task.
failed_when: start_live_migrations_result_dest.rc not in [0, 2]
# rc 2 is recorded as "completed or timed out", which skips the nested block
# below.
- name: Set fact if migrations completed or timed out before SIGTERM to dest compute
set_fact:
live_migrations_completed_or_timeout_dest: "{{ start_live_migrations_result_dest.rc == 2 }}"
- name: Run graceful shutdown tests
when: not live_migrations_completed_or_timeout_dest
block:
# Signal 15 (SIGTERM) goes to the MainPID of the devstack@n-cpu systemd unit
# on the controller.
- name: Send SIGTERM to dest compute to start the dest compute graceful shutdown
delegate_to: controller
become: true
shell: "kill -15 $(systemctl show devstack@n-cpu -p MainPID --value)"
- name: Verify live migration is completed during graceful shutdown
become: true
become_user: stack
script: "verify_live_migration.sh server-lm2"
environment:
CONTROLLER_HOSTNAME: "{{ hostvars['controller']['ansible_hostname'] }}"
# Sleep for 180 sec: default graceful_shutdown_timeout
- name: Sleep for 180 seconds to allow dest compute graceful shutdown to complete
pause:
seconds: 180
# The trailing "inactive" argument presumably makes the script only check for
# a stopped service instead of starting it - TODO confirm.
- name: Verify dest compute service is stopped after graceful shutdown
become: true
become_user: stack
script: "start_and_verify_compute_service.sh {{ hostvars['controller']['ansible_hostname'] }} inactive"
- name: Start and verify dest compute service is running
become: true
become_user: stack
script: "start_and_verify_compute_service.sh {{ hostvars['controller']['ansible_hostname'] }}"
# Cleanup is best effort: errors from the cleanup script are ignored.
- name: Cleanup test servers
become: true
become_user: stack
script: "cleanup_test_servers.sh server-lm2"
ignore_errors: true
# Scenario: start a cold migration of server-cm2, then send SIGTERM to the
# nova-compute service on the controller host (called the "dest compute" in
# the task names below) while the migration is running.
# NOTE(review): start_cold_migration.sh, verify_cold_migration.sh and
# start_and_verify_compute_service.sh are not shown here; their behavior is
# assumed from the task names - confirm against the scripts.
- name: Graceful shutdown dest compute cold migration
block:
- name: Start cold migrations of test servers
become: true
become_user: stack
script: "start_cold_migration.sh server-cm2"
environment:
SUBNODE_HOSTNAME: "{{ hostvars['compute1']['ansible_hostname'] }}"
CONTROLLER_HOSTNAME: "{{ hostvars['controller']['ansible_hostname'] }}"
register: start_cold_migrations_result_dest
# Only rc 0 and rc 2 are accepted; any other return code fails the task.
failed_when: start_cold_migrations_result_dest.rc not in [0, 2]
# rc 2 is recorded as "completed or timed out", which skips the nested block
# below.
- name: Set fact if migrations completed or timed out before SIGTERM to dest compute
set_fact:
cold_migrations_completed_or_timeout_dest: "{{ start_cold_migrations_result_dest.rc == 2 }}"
- name: Run graceful shutdown tests
when: not cold_migrations_completed_or_timeout_dest
block:
# Signal 15 (SIGTERM) goes to the MainPID of the devstack@n-cpu systemd unit
# on the controller.
- name: Send SIGTERM to dest compute to start the dest compute graceful shutdown
delegate_to: controller
become: true
shell: "kill -15 $(systemctl show devstack@n-cpu -p MainPID --value)"
- name: Verify cold migration is completed during graceful shutdown
become: true
become_user: stack
script: "verify_cold_migration.sh server-cm2"
# Sleep for 180 sec: default graceful_shutdown_timeout
- name: Sleep for 180 seconds to allow dest compute graceful shutdown to complete
pause:
seconds: 180
# The trailing "inactive" argument presumably makes the script only check for
# a stopped service instead of starting it - TODO confirm.
- name: Verify dest compute service is stopped after graceful shutdown
become: true
become_user: stack
script: "start_and_verify_compute_service.sh {{ hostvars['controller']['ansible_hostname'] }} inactive"
- name: Start and verify dest compute service is running
become: true
become_user: stack
script: "start_and_verify_compute_service.sh {{ hostvars['controller']['ansible_hostname'] }}"
# Cleanup is best effort: errors from the cleanup script are ignored.
- name: Cleanup test servers
become: true
become_user: stack
script: "cleanup_test_servers.sh server-cm2"
ignore_errors: true
# Scenario: start building a server on the subnode (compute1), then send
# SIGTERM to the nova-compute service on compute1 while the build is running.
# NOTE(review): start_and_verify_compute_service.sh and
# cleanup_test_servers.sh are not shown here; their behavior is assumed from
# the task names - confirm against the scripts.
- name: Graceful shutdown while building instance
block:
- name: Build instance on subnode
become: true
become_user: stack
script: "build_instance.sh"
environment:
SUBNODE_HOSTNAME: "{{ hostvars['compute1']['ansible_hostname'] }}"
register: build_instance_result
# Only rc 0 and rc 2 are accepted; any other return code fails the task.
failed_when: build_instance_result.rc not in [0, 2]
# rc 2 is recorded as "completed or error", which skips the nested block
# below.
- name: Set fact if build completed before SIGTERM
set_fact:
build_completed_or_error: "{{ build_instance_result.rc == 2 }}"
- name: Run graceful shutdown tests
when: not build_completed_or_error
block:
# Signal 15 (SIGTERM) goes to the MainPID of the devstack@n-cpu systemd unit
# on compute1.
- name: Send SIGTERM to subnode compute service
delegate_to: compute1
become: true
shell: "kill -15 $(systemctl show devstack@n-cpu -p MainPID --value)"
- name: Verify build instance is completed and it is in active state
become: true
become_user: stack
script: "verify_build_instance.sh"
# Sleep for 180 sec: default graceful_shutdown_timeout
- name: Sleep for 180 seconds to allow graceful shutdown to complete
pause:
seconds: 180
# The trailing "inactive" argument presumably makes the script only check for
# a stopped service instead of starting it - TODO confirm.
- name: Verify subnode compute service is stopped after graceful shutdown
become: true
become_user: stack
script: "start_and_verify_compute_service.sh {{ hostvars['compute1']['ansible_hostname'] }} inactive"
# NOTE(review): the sibling scenarios name this step "Start and verify ...";
# the same script is invoked here, so it presumably also starts the service -
# confirm.
- name: Verify subnode compute service is running
become: true
become_user: stack
script: "start_and_verify_compute_service.sh {{ hostvars['compute1']['ansible_hostname'] }}"
# Cleanup is best effort: errors from the cleanup script are ignored.
- name: Cleanup test servers
become: true
become_user: stack
script: "cleanup_test_servers.sh server-build"
ignore_errors: true
# Scenario: start a revert resize of server-rr, then send SIGTERM to the
# nova-compute service on the controller while the revert is running.
# NOTE(review): start_and_verify_compute_service.sh and
# cleanup_test_servers.sh are not shown here; their behavior is assumed from
# the task names - confirm against the scripts.
- name: Graceful shutdown revert resize
block:
- name: Start revert resize of test server
become: true
become_user: stack
script: "start_revert_resize.sh"
environment:
SUBNODE_HOSTNAME: "{{ hostvars['compute1']['ansible_hostname'] }}"
CONTROLLER_HOSTNAME: "{{ hostvars['controller']['ansible_hostname'] }}"
register: start_revert_resize_result
# Only rc 0 and rc 2 are accepted; any other return code fails the task.
failed_when: start_revert_resize_result.rc not in [0, 2]
# rc 2 sets revert_resize_not_done to true, which skips the nested block
# below.
- name: Set fact if revert resize completed before SIGTERM
set_fact:
revert_resize_not_done: "{{ start_revert_resize_result.rc == 2 }}"
- name: Run graceful shutdown tests
when: not revert_resize_not_done
block:
# Signal 15 (SIGTERM) goes to the MainPID of the devstack@n-cpu systemd unit
# on the controller.
- name: Send SIGTERM to controller during revert resize
delegate_to: controller
become: true
shell: "kill -15 $(systemctl show devstack@n-cpu -p MainPID --value)"
- name: Verify revert resize is completed during graceful shutdown
become: true
become_user: stack
script: "verify_revert_resize.sh"
environment:
SUBNODE_HOSTNAME: "{{ hostvars['compute1']['ansible_hostname'] }}"
# Sleep for 180 sec: default graceful_shutdown_timeout
- name: Sleep for 180 seconds to allow graceful shutdown to complete
pause:
seconds: 180
# NOTE(review): the next two task names call the controller "dest compute"
# and "source compute" respectively, although both target the same host -
# confirm which wording is intended.
# The trailing "inactive" argument presumably makes the script only check for
# a stopped service instead of starting it - TODO confirm.
- name: Verify dest compute service is stopped after graceful shutdown
become: true
become_user: stack
script: "start_and_verify_compute_service.sh {{ hostvars['controller']['ansible_hostname'] }} inactive"
- name: Start and verify source compute service is running
become: true
become_user: stack
script: "start_and_verify_compute_service.sh {{ hostvars['controller']['ansible_hostname'] }}"
# Cleanup is best effort: errors from the cleanup script are ignored.
- name: Cleanup test servers
become: true
become_user: stack
script: "cleanup_test_servers.sh server-rr"
ignore_errors: true
# Fail the run when any scenario was skipped, i.e. when any of the
# "completed/timed out/not done" facts is true. A task may carry only one
# `when` key, so the condition covering all six facts is the one kept.
- name: Fail if any test is skipped
  fail:
    msg: "One or more test is skipped due to operation is either completed or timed out before SIGTERM signal."
  when: live_migrations_completed_or_timeout or cold_migrations_completed_or_timeout or
        live_migrations_completed_or_timeout_dest or cold_migrations_completed_or_timeout_dest or
        build_completed_or_error or revert_resize_not_done