diff --git a/roles/run-graceful-shutdown-tests/files/build_instance.sh b/roles/run-graceful-shutdown-tests/files/build_instance.sh
new file mode 100755
index 000000000000..8e56e5e9055b
--- /dev/null
+++ b/roles/run-graceful-shutdown-tests/files/build_instance.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+# Boot "server-build" on the subnode and wait until nova reports it as
+# building, so that the caller can send SIGTERM to the compute service while
+# the build is still in flight.
+#
+# Required environment:
+#   SUBNODE_HOSTNAME - compute host the server is pinned to.
+#
+# Exit codes:
+#   0 - the server reached the "building" vm_state.
+#   1 - precondition failure (missing variable or resource).
+#   2 - the shutdown window was missed: the server went active or to error
+#       first, or the wait timed out.
+source /opt/stack/devstack/openrc admin
+set -x
+set -e
+
+if [ -z "${SUBNODE_HOSTNAME:-}" ]; then
+    echo "SUBNODE_HOSTNAME must be set" >&2
+    exit 1
+fi
+
+timeout=60
+
+image_id=$(openstack image list -f value -c ID | awk 'NR==1{print $1}')
+flavor_id=$(openstack flavor list -f value -c ID | awk 'NR==1{print $1}')
+network_id=$(openstack network list --no-share -f value -c ID | awk 'NR==1{print $1}')
+
+# Without pipefail a failed "openstack ... list" just leaves the id empty, so
+# check here rather than letting "server create" choke on a missing argument.
+if [ -z "${image_id}" ] || [ -z "${flavor_id}" ] || [ -z "${network_id}" ]; then
+    echo "Could not find an image, flavor and network to boot the server" >&2
+    exit 1
+fi
+
+echo "Creating test server on subnode"
+openstack --os-compute-api-version 2.74 server create \
+    --image "${image_id}" --flavor "${flavor_id}" \
+    --nic net-id="${network_id}" --host "${SUBNODE_HOSTNAME}" server-build
+
+# Wait for the server vm_state to reach BUILDING so that we know that compute has
+# started the build request.
+count=0
+while true; do
+    vm_state=$(openstack server show server-build -f value -c OS-EXT-STS:vm_state)
+
+    if [ "${vm_state}" == "building" ]; then
+        echo "Server is in Building"
+        break
+    fi
+
+    if [ "${vm_state}" == "active" ]; then
+        echo "Server became active before SIGTERM was sent"
+        exit 2
+    fi
+
+    if [ "${vm_state}" == "error" ]; then
+        echo "Server went to error vm_state"
+        exit 2
+    fi
+
+    sleep 1
+    count=$((count+1))
+    if [ "${count}" -ge "${timeout}" ]; then
+        echo "Timed out waiting for server to reach BUILDING vm_state"
+        exit 2
+    fi
+done
diff --git a/roles/run-graceful-shutdown-tests/files/start_revert_resize.sh b/roles/run-graceful-shutdown-tests/files/start_revert_resize.sh
new file mode 100755
index 000000000000..d1474a9a4587
--- /dev/null
+++ b/roles/run-graceful-shutdown-tests/files/start_revert_resize.sh
@@ -0,0 +1,109 @@
+#!/bin/bash
+# Boot "server-rr" on the subnode, cold migrate it to the controller and start
+# a revert resize, returning as soon as the revert is seen in progress so that
+# the caller can send SIGTERM to the compute service while it is running.
+#
+# Required environment:
+#   SUBNODE_HOSTNAME    - compute host the server is booted on.
+#   CONTROLLER_HOSTNAME - compute host the server is cold migrated to.
+#
+# Exit codes:
+#   0 - the revert resize was seen in progress (task_state resize_reverting).
+#   1 - precondition failure (missing variable or resource).
+#   2 - the shutdown window was missed: the migration or the revert errored,
+#       finished early or timed out.
+source /opt/stack/devstack/openrc admin
+set -x
+set -e
+
+if [ -z "${SUBNODE_HOSTNAME:-}" ] || [ -z "${CONTROLLER_HOSTNAME:-}" ]; then
+    echo "SUBNODE_HOSTNAME and CONTROLLER_HOSTNAME must be set" >&2
+    exit 1
+fi
+
+# Maximum number of polls while waiting for the revert resize to start. That
+# loop does not sleep between polls, so this is a poll count and not seconds.
+timeout=196
+# Maximum number of 5 second polls while waiting for the cold migration.
+migrate_polls=20
+
+image_id=$(openstack image list -f value -c ID | awk 'NR==1{print $1}')
+flavor_id=$(openstack flavor list -f value -c ID | awk 'NR==1{print $1}')
+network_id=$(openstack network list --no-share -f value -c ID | awk 'NR==1{print $1}')
+
+# Without pipefail a failed "openstack ... list" just leaves the id empty, so
+# check here rather than letting "server create" choke on a missing argument.
+if [ -z "${image_id}" ] || [ -z "${flavor_id}" ] || [ -z "${network_id}" ]; then
+    echo "Could not find an image, flavor and network to boot the server" >&2
+    exit 1
+fi
+
+echo "Creating test server on subnode for graceful shutdown revert resize test"
+openstack --os-compute-api-version 2.74 server create \
+    --image "${image_id}" --flavor "${flavor_id}" \
+    --nic net-id="${network_id}" --host "${SUBNODE_HOSTNAME}" --wait server-rr
+
+echo "Migrate server-rr to ${CONTROLLER_HOSTNAME}"
+openstack --os-compute-api-version 2.56 server migrate \
+    --host "${CONTROLLER_HOSTNAME}" server-rr
+
+# Wait for the migrate to complete
+count=0
+while true; do
+    status=$(openstack server show server-rr -f value -c status)
+    if [ "${status}" == "VERIFY_RESIZE" ]; then
+        echo "Migration completed, server is in VERIFY_RESIZE state"
+        break
+    fi
+    if [ "${status}" == "ERROR" ]; then
+        echo "Server went to ERROR status during cold migration"
+        exit 2
+    fi
+    sleep 5
+    count=$((count+1))
+    if [ "${count}" -ge "${migrate_polls}" ]; then
+        echo "Timed out waiting for server-rr to reach VERIFY_RESIZE"
+        exit 2
+    fi
+done
+
+# Start the revert resize. The loop above only exits on VERIFY_RESIZE, so this
+# re-check is just a guard against the status changing in between.
+status=$(openstack server show server-rr -f value -c status)
+if [ "${status}" == "VERIFY_RESIZE" ]; then
+    echo "Starting revert resize of server-rr"
+    openstack server resize revert server-rr
+else
+    echo "Revert resize skipped"
+    exit 2
+fi
+
+# Wait for the revert resize to be in progress.
+count=0
+while true; do
+    task_state=$(openstack server show server-rr -f value -c OS-EXT-STS:task_state)
+    status=$(openstack server show server-rr -f value -c status)
+
+    if [ "${task_state}" == "resize_reverting" ]; then
+        echo "Revert resize is in progress"
+        # task_state is set by the API before it sends the revert_resize RPC
+        # call to compute. Sleep for 2 seconds to give compute a chance to
+        # start the revert_resize, so that the shutdown can be initiated
+        # before it finishes. This is best effort; the timing is not
+        # guaranteed.
+        sleep 2
+        break
+    fi
+
+    if [ "${status}" == "ACTIVE" ]; then
+        echo "Revert resize appears to have already completed"
+        echo "Revert resize completed before SIGTERM was sent"
+        exit 2
+    fi
+
+    count=$((count+1))
+    if [ "${count}" -ge "${timeout}" ]; then
+        echo "Timed out waiting for revert resize to start"
+        exit 2
+    fi
+done
diff --git a/roles/run-graceful-shutdown-tests/files/verify_build_instance.sh b/roles/run-graceful-shutdown-tests/files/verify_build_instance.sh
new file mode 100755
index 000000000000..8dde0aae73d4
--- /dev/null
+++ b/roles/run-graceful-shutdown-tests/files/verify_build_instance.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+# Wait for "server-build" to become ACTIVE after SIGTERM was sent to the
+# compute service that is building it.
+#
+# Optional environment:
+#   VERIFY_TIMEOUT - seconds to wait for the server to become ACTIVE
+#                    (default 300).
+#
+# Exit codes:
+#   0 - the server became ACTIVE.
+#   6 - the server went to ERROR.
+#   7 - the server was still neither ACTIVE nor in ERROR after the timeout.
+source /opt/stack/devstack/openrc admin
+set -x
+set -e
+
+timeout=${VERIFY_TIMEOUT:-300}
+
+# Wait for the server to finish building and become active which confirms that
+# the build completed during graceful shutdown.
+build_start=$(date +%s)
+while true; do
+    status=$(openstack server show server-build -f value -c status)
+
+    if [ "${status}" == "ACTIVE" ]; then
+        build_end=$(date +%s)
+        build_duration=$((build_end - build_start))
+        echo "Build completed in ${build_duration} seconds."
+        break
+    fi
+
+    if [ "${status}" == "ERROR" ]; then
+        echo "Server went to ERROR status."
+        exit 6
+    fi
+
+    # Bound the wait so that a server stuck building fails this check with a
+    # clear message instead of polling forever.
+    elapsed=$(( $(date +%s) - build_start ))
+    if [ "${elapsed}" -ge "${timeout}" ]; then
+        echo "Timed out after ${elapsed} seconds waiting for server-build to become ACTIVE (status: ${status})"
+        exit 7
+    fi
+
+    sleep 5
+done
diff --git a/roles/run-graceful-shutdown-tests/files/verify_revert_resize.sh b/roles/run-graceful-shutdown-tests/files/verify_revert_resize.sh
new file mode 100755
index 000000000000..7b50779feb44
--- /dev/null
+++ b/roles/run-graceful-shutdown-tests/files/verify_revert_resize.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+# Wait for the revert resize of "server-rr" to finish after SIGTERM was sent
+# to the compute service, then check that the server is back on the subnode.
+#
+# Required environment:
+#   SUBNODE_HOSTNAME - compute host the server must end up on.
+# Optional environment:
+#   VERIFY_TIMEOUT   - seconds to wait for the revert to finish (default 300).
+#
+# Exit codes:
+#   0 - the server is ACTIVE on SUBNODE_HOSTNAME with no task in progress.
+#   1 - SUBNODE_HOSTNAME is not set.
+#   3 - the server went to ERROR.
+#   4 - the server ended up on an unexpected host.
+#   5 - the revert resize did not finish before the timeout.
+source /opt/stack/devstack/openrc admin
+set -x
+set -e
+
+if [ -z "${SUBNODE_HOSTNAME:-}" ]; then
+    echo "SUBNODE_HOSTNAME must be set" >&2
+    exit 1
+fi
+
+timeout=${VERIFY_TIMEOUT:-300}
+
+# Wait for the server to finish reverting resize
+revert_start=$(date +%s)
+while true; do
+    status=$(openstack server show server-rr -f value -c status)
+    task_state=$(openstack server show server-rr -f value -c OS-EXT-STS:task_state)
+
+    if [ "${status}" == "ACTIVE" ] && { [ "${task_state}" == "None" ] || [ -z "${task_state}" ]; }; then
+        revert_end=$(date +%s)
+        revert_duration=$((revert_end - revert_start))
+        echo "Revert resize completed in ${revert_duration} seconds."
+        break
+    fi
+
+    if [ "${status}" == "ERROR" ]; then
+        echo "Server went to ERROR status during revert resize"
+        exit 3
+    fi
+
+    # Bound the wait so that a revert that never finishes fails this check
+    # with a clear message instead of polling forever.
+    elapsed=$(( $(date +%s) - revert_start ))
+    if [ "${elapsed}" -ge "${timeout}" ]; then
+        echo "Timed out after ${elapsed} seconds waiting for revert resize of server-rr (status: ${status}, task_state: ${task_state})"
+        exit 5
+    fi
+
+    sleep 5
+done
+
+# Make sure the server moved back to the subnode.
+host=$(openstack server show server-rr -f value -c OS-EXT-SRV-ATTR:host)
+if [ "${host}" != "${SUBNODE_HOSTNAME}" ]; then
+    echo "Unexpected host ${host} for server after revert resize during graceful shutdown."
+    exit 4
+fi
+
+echo "Revert resize during graceful shutdown completed successfully"
+echo "Server server-rr is ACTIVE on ${host}"
diff --git a/roles/run-graceful-shutdown-tests/tasks/main.yaml b/roles/run-graceful-shutdown-tests/tasks/main.yaml
index c725283990ab..9441cee8c86f 100644
--- a/roles/run-graceful-shutdown-tests/tasks/main.yaml
+++ b/roles/run-graceful-shutdown-tests/tasks/main.yaml
@@ -100,7 +100,212 @@
           script: "cleanup_test_servers.sh server-cm1"
           ignore_errors: true
 
+- name: Graceful shutdown dest compute live migration
+  block:
+    - name: Start live migrations of test servers
+      become: true
+      become_user: stack
+      script: "start_live_migration.sh server-lm2"
+      environment:
+        SUBNODE_HOSTNAME: "{{ hostvars['compute1']['ansible_hostname'] }}"
+        CONTROLLER_HOSTNAME: "{{ hostvars['controller']['ansible_hostname'] }}"
+      register: start_live_migrations_result_dest
+      failed_when: start_live_migrations_result_dest.rc not in [0, 2]
+
+    - name: Set fact if migrations completed or timed out before SIGTERM to dest compute
+      set_fact:
+        live_migrations_completed_or_timeout_dest: "{{ start_live_migrations_result_dest.rc == 2 }}"
+
+    - name: Run graceful shutdown tests
+      when: not live_migrations_completed_or_timeout_dest
+      block:
+        - name: Send SIGTERM to dest compute to start the dest compute graceful shutdown
+          delegate_to: controller
+          become: true
+          shell: "kill -15 $(systemctl show devstack@n-cpu -p MainPID --value)"
+
+        - name: Verify live migration is completed during graceful shutdown
+          become: true
+          become_user: stack
+          script: "verify_live_migration.sh server-lm2"
+          environment:
+            CONTROLLER_HOSTNAME: "{{ hostvars['controller']['ansible_hostname'] }}"
+
+        # Sleep for 180 sec: default graceful_shutdown_timeout
+        - name: Sleep for 180 seconds to allow dest compute graceful shutdown to complete
+          pause:
+            seconds: 180
+
+        - name: Verify dest compute service is stopped after graceful shutdown
+          become: true
+          become_user: stack
+          script: "start_and_verify_compute_service.sh {{ hostvars['controller']['ansible_hostname'] }} inactive"
+
+        - name: Start and verify dest compute service is running
+          become: true
+          become_user: stack
+          script: "start_and_verify_compute_service.sh {{ hostvars['controller']['ansible_hostname'] }}"
+
+        - name: Cleanup test servers
+          become: true
+          become_user: stack
+          script: "cleanup_test_servers.sh server-lm2"
+          ignore_errors: true
+
+- name: Graceful shutdown dest compute cold migration
+  block:
+    - name: Start cold migrations of test servers
+      become: true
+      become_user: stack
+      script: "start_cold_migration.sh server-cm2"
+      environment:
+        SUBNODE_HOSTNAME: "{{ hostvars['compute1']['ansible_hostname'] }}"
+        CONTROLLER_HOSTNAME: "{{ hostvars['controller']['ansible_hostname'] }}"
+      register: start_cold_migrations_result_dest
+      failed_when: start_cold_migrations_result_dest.rc not in [0, 2]
+
+    - name: Set fact if migrations completed or timed out before SIGTERM to dest compute
+      set_fact:
+        cold_migrations_completed_or_timeout_dest: "{{ start_cold_migrations_result_dest.rc == 2 }}"
+
+    - name: Run graceful shutdown tests
+      when: not cold_migrations_completed_or_timeout_dest
+      block:
+        - name: Send SIGTERM to dest compute to start the dest compute graceful shutdown
+          delegate_to: controller
+          become: true
+          shell: "kill -15 $(systemctl show devstack@n-cpu -p MainPID --value)"
+
+        - name: Verify cold migration is completed during graceful shutdown
+          become: true
+          become_user: stack
+          script: "verify_cold_migration.sh server-cm2"
+
+        # Sleep for 180 sec: default graceful_shutdown_timeout
+        - name: Sleep for 180 seconds to allow dest compute graceful shutdown to complete
+          pause:
+            seconds: 180
+
+        - name: Verify dest compute service is stopped after graceful shutdown
+          become: true
+          become_user: stack
+          script: "start_and_verify_compute_service.sh {{ hostvars['controller']['ansible_hostname'] }} inactive"
+
+        - name: Start and verify dest compute service is running
+          become: true
+          become_user: stack
+          script: "start_and_verify_compute_service.sh {{ hostvars['controller']['ansible_hostname'] }}"
+
+        - name: Cleanup test servers
+          become: true
+          become_user: stack
+          script: "cleanup_test_servers.sh server-cm2"
+          ignore_errors: true
+
+- name: Graceful shutdown while building instance
+  block:
+    - name: Build instance on subnode
+      become: true
+      become_user: stack
+      script: "build_instance.sh"
+      environment:
+        SUBNODE_HOSTNAME: "{{ hostvars['compute1']['ansible_hostname'] }}"
+      register: build_instance_result
+      failed_when: build_instance_result.rc not in [0, 2]
+
+    - name: Set fact if build completed before SIGTERM
+      set_fact:
+        build_completed_or_error: "{{ build_instance_result.rc == 2 }}"
+
+    - name: Run graceful shutdown tests
+      when: not build_completed_or_error
+      block:
+        - name: Send SIGTERM to subnode compute service
+          delegate_to: compute1
+          become: true
+          shell: "kill -15 $(systemctl show devstack@n-cpu -p MainPID --value)"
+
+        - name: Verify build instance is completed and it is in active state
+          become: true
+          become_user: stack
+          script: "verify_build_instance.sh"
+
+        # Sleep for 180 sec: default graceful_shutdown_timeout
+        - name: Sleep for 180 seconds to allow graceful shutdown to complete
+          pause:
+            seconds: 180
+
+        - name: Verify subnode compute service is stopped after graceful shutdown
+          become: true
+          become_user: stack
+          script: "start_and_verify_compute_service.sh {{ hostvars['compute1']['ansible_hostname'] }} inactive"
+
+        - name: Verify subnode compute service is running
+          become: true
+          become_user: stack
+          script: "start_and_verify_compute_service.sh {{ hostvars['compute1']['ansible_hostname'] }}"
+
+        - name: Cleanup test servers
+          become: true
+          become_user: stack
+          script: "cleanup_test_servers.sh server-build"
+          ignore_errors: true
+
+- name: Graceful shutdown revert resize
+  block:
+    - name: Start revert resize of test server
+      become: true
+      become_user: stack
+      script: "start_revert_resize.sh"
+      environment:
+        SUBNODE_HOSTNAME: "{{ hostvars['compute1']['ansible_hostname'] }}"
+        CONTROLLER_HOSTNAME: "{{ hostvars['controller']['ansible_hostname'] }}"
+      register: start_revert_resize_result
+      failed_when: start_revert_resize_result.rc not in [0, 2]
+
+    - name: Set fact if revert resize completed before SIGTERM
+      set_fact:
+        revert_resize_not_done: "{{ start_revert_resize_result.rc == 2 }}"
+
+    - name: Run graceful shutdown tests
+      when: not revert_resize_not_done
+      block:
+        - name: Send SIGTERM to controller during revert resize
+          delegate_to: controller
+          become: true
+          shell: "kill -15 $(systemctl show devstack@n-cpu -p MainPID --value)"
+
+        - name: Verify revert resize is completed during graceful shutdown
+          become: true
+          become_user: stack
+          script: "verify_revert_resize.sh"
+          environment:
+            SUBNODE_HOSTNAME: "{{ hostvars['compute1']['ansible_hostname'] }}"
+
+        # Sleep for 180 sec: default graceful_shutdown_timeout
+        - name: Sleep for 180 seconds to allow graceful shutdown to complete
+          pause:
+            seconds: 180
+
+        - name: Verify dest compute service is stopped after graceful shutdown
+          become: true
+          become_user: stack
+          script: "start_and_verify_compute_service.sh {{ hostvars['controller']['ansible_hostname'] }} inactive"
+
+        - name: Start and verify source compute service is running
+          become: true
+          become_user: stack
+          script: "start_and_verify_compute_service.sh {{ hostvars['controller']['ansible_hostname'] }}"
+
+        - name: Cleanup test servers
+          become: true
+          become_user: stack
+          script: "cleanup_test_servers.sh server-rr"
+          ignore_errors: true
+
 - name: Fail if any test is skipped
   fail:
     msg: "One or more test is skipped due to operation is either completed or timed out before SIGTERM signal."
-  when: live_migrations_completed_or_timeout or cold_migrations_completed_or_timeout
+  when: live_migrations_completed_or_timeout or cold_migrations_completed_or_timeout or
+        live_migrations_completed_or_timeout_dest or cold_migrations_completed_or_timeout_dest or
+        build_completed_or_error or revert_resize_not_done