diff --git a/nova/compute/rpcapi.py b/nova/compute/rpcapi.py index 436af8e102ea..7bad00fd103e 100644 --- a/nova/compute/rpcapi.py +++ b/nova/compute/rpcapi.py @@ -672,7 +672,14 @@ class ComputeAPI(object): if not client.can_send_version('6.0'): # We always pass the instance until the 5.0 version msg_args['instance'] = instance - cctxt = client.prepare( + # NOTE(gmaan): This is called by the destination compute's + # revert_resize() on source compute. Destination compute checks + # with source compute if instance storage is shared or not so + # that it can decide if disks need to be destroyed. Make this + # RPC request to 'compute-alt' topic so that the shutdown request + # will wait for the compute to finish the revert resize. + cctxt = self.prepare_for_alt_rpcserver( + client, server=_compute_host(host, instance), version=version) return cctxt.call(ctxt, 'check_instance_shared_storage', **msg_args) @@ -754,7 +761,11 @@ class ComputeAPI(object): msg_args.pop('request_spec') version = '5.0' - cctxt = client.prepare( + # NOTE(gmaan): This is the final step of resize/migration. Make this + # RPC request to 'compute-alt' topic so that the shutdown request + # will wait for the compute to finish the in-progress migration. + cctxt = self.prepare_for_alt_rpcserver( + client, server=host, version=version) cctxt.cast(ctxt, 'finish_resize', **msg_args) @@ -773,8 +784,14 @@ class ComputeAPI(object): msg_args.pop('request_spec') version = '5.0' - cctxt = client.prepare( - server=host, version=version) + # NOTE(gmaan): This is called by the destination compute's + # revert_resize() on source compute. Destination compute has deleted + # the new instance on destination and asked source compute to power + # on the old instance on source. Make this RPC request to 'compute-alt' + # topic so that the shutdown request will wait for + # the compute to finish the revert resize. 
+ cctxt = self.prepare_for_alt_rpcserver( + client, server=host, version=version) cctxt.cast(ctxt, 'finish_revert_resize', **msg_args) def finish_snapshot_based_resize_at_dest( @@ -813,7 +830,12 @@ class ComputeAPI(object): msg_args['request_spec'] = request_spec if not client.can_send_version(version): raise exception.MigrationError(reason=_('Compute too old')) - cctxt = client.prepare( + # NOTE(gmaan): This is the cross-cell resize case to finish the + # snapshot-based resize on the destination compute. Make this RPC + # request to 'compute-alt' topic so that the shutdown request will + # wait for the compute to finish the in-progress cross-cell resize. + cctxt = self.prepare_for_alt_rpcserver( + client, server=migration.dest_compute, version=version, call_monitor_timeout=CONF.rpc_response_timeout, timeout=CONF.long_rpc_timeout) @@ -847,10 +869,17 @@ class ComputeAPI(object): client = self.router.client(ctxt) if not client.can_send_version(version): raise exception.MigrationError(reason=_('Compute too old')) - cctxt = client.prepare(server=migration.source_compute, - version=version, - call_monitor_timeout=CONF.rpc_response_timeout, - timeout=CONF.long_rpc_timeout) + # NOTE(gmaan): This is called after the + # revert_snapshot_based_resize_at_dest so revert resize is completed on + # destination side. We should complete it on source compute also. Make + # this RPC request to 'compute-alt' topic so that the shutdown request + # will wait for the compute to finish the cross-cell revert resize. 
+ cctxt = self.prepare_for_alt_rpcserver( + client, + server=migration.source_compute, + version=version, + call_monitor_timeout=CONF.rpc_response_timeout, + timeout=CONF.long_rpc_timeout) return cctxt.call( ctxt, 'finish_revert_snapshot_based_resize_at_source', instance=instance, migration=migration) @@ -1046,6 +1075,13 @@ class ComputeAPI(object): version = '5.0' msg_args['request_spec'] = ( request_spec.to_legacy_request_spec_dict()) + # NOTE(gmaan): This is called by the conductor on the destination + # compute to check and start the resize/cold migration on source. + # This method can be called again by the conductor if the destination + # compute asks the conductor to reschedule the migration to another + # host. In both cases, resize is not yet started, so this RPC request + # uses 'compute' topic. If a shutdown is initiated, then not taking + # the resize request at this stage is acceptable. cctxt = client.prepare(server=host, version=version) cctxt.cast(ctxt, 'prep_resize', **msg_args) @@ -1094,6 +1130,10 @@ class ComputeAPI(object): msg_args['request_spec'] = request_spec if not client.can_send_version(version): raise exception.MigrationPreCheckError(reason=_('Compute too old')) + # NOTE(gmaan): This is the cross-cell resize case, and resize is not + # yet started, so this RPC request uses 'compute' topic. If a shutdown + # is initiated, then not taking the resize request at this stage is + # acceptable. 
cctxt = client.prepare(server=destination, version=version, call_monitor_timeout=CONF.rpc_response_timeout, timeout=CONF.long_rpc_timeout) @@ -1130,10 +1170,17 @@ class ComputeAPI(object): client = self.router.client(ctxt) if not client.can_send_version(version): raise exception.MigrationError(reason=_('Compute too old')) - cctxt = client.prepare(server=_compute_host(None, instance), - version=version, - call_monitor_timeout=CONF.rpc_response_timeout, - timeout=CONF.long_rpc_timeout) + # NOTE(gmaan): This is the cross-cell resize case, and called after + # resize is prepared on destination compute. At this point, resize + # is started so make this RPC request to 'compute-alt' topic so that + # the shutdown request will wait for the compute to finish the + # in-progress cross-cell resize. + cctxt = self.prepare_for_alt_rpcserver( + client, + server=_compute_host(None, instance), + version=version, + call_monitor_timeout=CONF.rpc_response_timeout, + timeout=CONF.long_rpc_timeout) return cctxt.call( ctxt, 'prep_snapshot_based_resize_at_source', instance=instance, migration=migration, snapshot_id=snapshot_id) @@ -1261,7 +1308,13 @@ class ComputeAPI(object): msg_args.pop('request_spec') version = '5.0' - cctxt = client.prepare(server=_compute_host(None, instance), + # NOTE(gmaan): This is called by destination compute's prep_resize() + # to start the migration on source compute. Make this RPC request to + # 'compute-alt' topic so that the shutdown request will wait for + # the compute to finish the in-progress migration. + cctxt = self.prepare_for_alt_rpcserver( + client, + server=_compute_host(None, instance), version=version) cctxt.cast(ctxt, 'resize_instance', **msg_args) @@ -1286,6 +1339,11 @@ class ComputeAPI(object): msg_args.pop('request_spec') version = '5.0' + # NOTE(gmaan): This revert resize is initiated by API on the + # destination compute, and the revert resize has not yet started. + # So this RPC request uses the 'compute' topic. 
If a shutdown is + # initiated, then not taking the revert resize request at this stage + # is acceptable. cctxt = client.prepare( server=_compute_host(host, instance), version=version) cctxt.cast(ctxt, 'revert_resize', **msg_args) @@ -1313,6 +1371,11 @@ class ComputeAPI(object): client = self.router.client(ctxt) if not client.can_send_version(version): raise exception.MigrationError(reason=_('Compute too old')) + # NOTE(gmaan): This revert resize for cross-cell resize case. It is + # initiated by the conductor and the revert resize has not yet started. + # So this RPC request uses the 'compute' topic. If a shutdown is + # initiated, then not taking the revert resize request at this stage + # is acceptable. cctxt = client.prepare(server=migration.dest_compute, version=version, call_monitor_timeout=CONF.rpc_response_timeout, diff --git a/nova/tests/unit/compute/test_rpcapi.py b/nova/tests/unit/compute/test_rpcapi.py index 2502291457ec..49543fab07f1 100644 --- a/nova/tests/unit/compute/test_rpcapi.py +++ b/nova/tests/unit/compute/test_rpcapi.py @@ -229,7 +229,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase): self._test_compute_api('check_instance_shared_storage', 'call', expected_args, instance=self.fake_instance_obj, data='foo', - version='6.0') + version='6.0', + topic_alt=compute_rpcapi.RPC_TOPIC_ALT) def test_check_instance_shared_storage_old_compute(self): ctxt = context.RequestContext('fake_user', 'fake_project') @@ -246,7 +247,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase): mock_client.can_send_version.assert_has_calls([mock.call('6.0'), mock.call('6.0')]) mock_client.prepare.assert_called_with( - server=self.fake_instance_obj.host, version='5.0') + server=self.fake_instance_obj.host, version='5.0', + topic=compute_rpcapi.RPC_TOPIC) mock_cctx.call.assert_called_with( ctxt, 'check_instance_shared_storage', instance=self.fake_instance_obj, data='foo') @@ -275,7 +277,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase): 
self._test_compute_api('finish_resize', 'cast', instance=self.fake_instance_obj, migration={'id': 'foo'}, image='image', disk_info='disk_info', host='host', - request_spec=self.fake_request_spec_obj, version='6.0') + request_spec=self.fake_request_spec_obj, version='6.0', + topic_alt=compute_rpcapi.RPC_TOPIC_ALT) def test_finish_resize_old_compute(self): ctxt = context.RequestContext('fake_user', 'fake_project') @@ -297,7 +300,7 @@ class ComputeRpcAPITestCase(test.NoDBTestCase): mock_client.can_send_version.assert_has_calls([mock.call('6.0'), mock.call('5.2')]) mock_client.prepare.assert_called_with( - server='host', version='5.0') + server='host', version='5.0', topic=compute_rpcapi.RPC_TOPIC) mock_cctx.cast.assert_called_with( ctxt, 'finish_resize', instance=self.fake_instance_obj, migration=mock.sentinel.migration, image='image', @@ -307,7 +310,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase): self._test_compute_api('finish_revert_resize', 'cast', instance=self.fake_instance_obj, migration={'id': 'fake_id'}, host='host', request_spec=self.fake_request_spec_obj, - version='6.0') + version='6.0', + topic_alt=compute_rpcapi.RPC_TOPIC_ALT) def test_finish_revert_resize_old_compute(self): ctxt = context.RequestContext('fake_user', 'fake_project') @@ -328,7 +332,7 @@ class ComputeRpcAPITestCase(test.NoDBTestCase): mock_client.can_send_version.assert_has_calls([mock.call('6.0'), mock.call('5.2')]) mock_client.prepare.assert_called_with( - server='host', version='5.0') + server='host', version='5.0', topic=compute_rpcapi.RPC_TOPIC) mock_cctx.cast.assert_called_with( ctxt, 'finish_revert_resize', instance=self.fake_instance_obj, migration=mock.sentinel.migration) @@ -660,7 +664,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase): migration=migration_obj.Migration(), snapshot_id=uuids.snapshot_id, # client.prepare kwargs - version='6.0', call_monitor_timeout=60, timeout=1234) + version='6.0', call_monitor_timeout=60, timeout=1234, + 
topic_alt=compute_rpcapi.RPC_TOPIC_ALT) @mock.patch('nova.rpc.ClientRouter.client') def test_prep_snapshot_based_resize_at_source_old_compute( self, client): @@ -694,7 +699,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase): request_spec=objects.RequestSpec(), # client.prepare kwargs version='6.0', prepare_server='dest', - call_monitor_timeout=60, timeout=1234) + call_monitor_timeout=60, timeout=1234, + topic_alt=compute_rpcapi.RPC_TOPIC_ALT) @mock.patch('nova.rpc.ClientRouter.client') def test_finish_snapshot_based_resize_at_dest_old_compute(self, client): @@ -722,7 +728,7 @@ class ComputeRpcAPITestCase(test.NoDBTestCase): rpcapi.router.client = mock.Mock() mock_client = mock.MagicMock() rpcapi.router.client.return_value = mock_client - mock_client.can_send_version.side_effect = [False, False, True] + mock_client.can_send_version.side_effect = [False, False, True, False] mock_cctx = mock.MagicMock() mock_client.prepare.return_value = mock_cctx expected_args = {'instance': self.fake_instance_obj, @@ -736,10 +742,12 @@ class ComputeRpcAPITestCase(test.NoDBTestCase): mock_client.can_send_version.assert_has_calls([mock.call('6.0'), mock.call('6.0'), - mock.call('5.7')]) + mock.call('5.7'), + mock.call('6.5')]) mock_client.prepare.assert_called_with( server='dest', version='5.7', - call_monitor_timeout=60, timeout=1234) + call_monitor_timeout=60, timeout=1234, + topic=compute_rpcapi.RPC_TOPIC) mock_cctx.call.assert_called_with( self.context, 'finish_snapshot_based_resize_at_dest', **expected_args) @@ -809,7 +817,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase): migration=migration_obj.Migration(source_compute='source'), # client.prepare kwargs version='6.0', prepare_server='source', - call_monitor_timeout=60, timeout=1234) + call_monitor_timeout=60, timeout=1234, + topic_alt=compute_rpcapi.RPC_TOPIC_ALT) @mock.patch('nova.rpc.ClientRouter.client') def test_finish_revert_snapshot_based_resize_at_source_old_compute( @@ -995,7 +1004,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase): 
instance=self.fake_instance_obj, migration={'id': 'fake_id'}, image='image', flavor=self.fake_flavor_obj, clean_shutdown=True, request_spec=self.fake_request_spec_obj, - version='6.0') + version='6.0', + topic_alt=compute_rpcapi.RPC_TOPIC_ALT) def test_resize_instance_old_compute(self): ctxt = context.RequestContext('fake_user', 'fake_project') @@ -1017,7 +1027,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase): mock_client.can_send_version.assert_has_calls([mock.call('6.0'), mock.call('5.2')]) mock_client.prepare.assert_called_with( - server=self.fake_instance_obj.host, version='5.0') + server=self.fake_instance_obj.host, version='5.0', + topic=compute_rpcapi.RPC_TOPIC) mock_cctx.cast.assert_called_with( ctxt, 'resize_instance', instance=self.fake_instance_obj, migration=mock.sentinel.migration, image='image', diff --git a/releasenotes/notes/nova-services-graceful-shutdown-564a321e2769152d.yaml b/releasenotes/notes/nova-services-graceful-shutdown-564a321e2769152d.yaml index 744037fc381f..128d8ec76681 100644 --- a/releasenotes/notes/nova-services-graceful-shutdown-564a321e2769152d.yaml +++ b/releasenotes/notes/nova-services-graceful-shutdown-564a321e2769152d.yaml @@ -12,6 +12,9 @@ features: Currently below operations are using second RPC server: * Live migration + * Cold migration + * Resize + * Revert resize * Server external Event * Get Console output diff --git a/roles/run-graceful-shutdown-tests/files/start_cold_migration.sh b/roles/run-graceful-shutdown-tests/files/start_cold_migration.sh new file mode 100755 index 000000000000..a615fc523c5d --- /dev/null +++ b/roles/run-graceful-shutdown-tests/files/start_cold_migration.sh @@ -0,0 +1,46 @@ +#!/bin/bash +source /opt/stack/devstack/openrc admin +set -x +set -e + +timeout=196 + +server_cm=$1 + +image_id=$(openstack image list -f value -c ID | awk 'NR==1{print $1}') +flavor_id=$(openstack flavor list -f value -c ID | awk 'NR==1{print $1}') +network_id=$(openstack network list --no-share -f value -c ID | awk 
'NR==1{print $1}') + +echo "Creating test server on subnode for graceful shutdown cold migration test" +openstack --os-compute-api-version 2.74 server create --image ${image_id} --flavor ${flavor_id} \ +--nic net-id=${network_id} --host ${SUBNODE_HOSTNAME} --wait ${server_cm} + +echo "Starting cold migration of ${server_cm} to ${CONTROLLER_HOSTNAME}" +openstack --os-compute-api-version 2.56 server migrate \ + --host ${CONTROLLER_HOSTNAME} ${server_cm} + +# Wait for the migrations to be in progress before returning so that the +# SIGTERM can be sent while the migrations are in progress. +count=0 +while true; do + cold_migration_status=$(openstack server migration list ${server_cm} -f value -c Status 2>/dev/null | head -1) + server_task_state=$(openstack server show ${server_cm} -f value -c OS-EXT-STS:task_state 2>/dev/null) + server_status=$(openstack server show ${server_cm} -f value -c status 2>/dev/null) + if [ "${cold_migration_status}" == "migrating" ] || \ + [ "${cold_migration_status}" == "post-migrating" ] || \ + [ "${server_task_state}" == "resize_migrating" ] || \ + [ "${server_task_state}" == "resize_migrated" ] || \ + [ "${server_task_state}" == "resize_finish" ]; then + echo "Cold migration is in progress" + break + elif [ "${cold_migration_status}" == "finished" ] || [ "${server_status}" == "VERIFY_RESIZE" ]; then + echo "Cold migration appears to have already completed" + exit 2 + fi + + count=$((count+1)) + if [ ${count} -eq ${timeout} ]; then + echo "Timed out waiting for migrations to start" + exit 2 + fi +done diff --git a/roles/run-graceful-shutdown-tests/files/verify_cold_migration.sh b/roles/run-graceful-shutdown-tests/files/verify_cold_migration.sh new file mode 100755 index 000000000000..a3d2ec1d30d0 --- /dev/null +++ b/roles/run-graceful-shutdown-tests/files/verify_cold_migration.sh @@ -0,0 +1,35 @@ +#!/bin/bash +source /opt/stack/devstack/openrc admin +set -x +set -e + +server=$1 + +# Wait for the server to finish cold migration and reach 
VERIFY_RESIZE state, +# which indicates the migration has completed and is awaiting confirmation. +timeout=360 +count=0 +migration_start=$(date +%s) +while true; do + status=$(openstack server show ${server} -f value -c status) + task_state=$(openstack server show ${server} -f value -c OS-EXT-STS:task_state) + + if [ "${status}" == "VERIFY_RESIZE" ] && { [ "${task_state}" == "None" ] || [ -z "${task_state}" ]; }; then + migration_end=$(date +%s) + migration_duration=$((migration_end - migration_start)) + echo "Cold migration completed in ${migration_duration} seconds." + break + fi + + if [ "${status}" == "ERROR" ]; then + echo "Server went to ERROR status during cold migration" + exit 6 + fi + + sleep 5 + count=$((count+1)) + if [ ${count} -eq ${timeout} ]; then + echo "Timed out waiting for cold migration to complete" + exit 5 + fi +done diff --git a/roles/run-graceful-shutdown-tests/tasks/main.yaml b/roles/run-graceful-shutdown-tests/tasks/main.yaml index 87b41cafd91a..c725283990ab 100644 --- a/roles/run-graceful-shutdown-tests/tasks/main.yaml +++ b/roles/run-graceful-shutdown-tests/tasks/main.yaml @@ -50,7 +50,57 @@ script: "cleanup_test_servers.sh server-lm1" ignore_errors: true +- name: Graceful shutdown source compute cold migration + block: + - name: Start cold migrations of test servers + become: true + become_user: stack + script: "start_cold_migration.sh server-cm1" + environment: + SUBNODE_HOSTNAME: "{{ hostvars['compute1']['ansible_hostname'] }}" + CONTROLLER_HOSTNAME: "{{ hostvars['controller']['ansible_hostname'] }}" + register: start_cold_migrations_result + failed_when: start_cold_migrations_result.rc not in [0, 2] + + - name: Set fact if migration is completed or timed out before SIGTERM to source compute + set_fact: + cold_migrations_completed_or_timeout: "{{ start_cold_migrations_result.rc == 2 }}" + + - name: Run graceful shutdown tests + when: not cold_migrations_completed_or_timeout + block: + - name: Send SIGTERM to source compute to start 
the source compute graceful shutdown + delegate_to: compute1 + become: true + shell: "kill -15 $(systemctl show devstack@n-cpu -p MainPID --value)" + + - name: Verify cold migration is completed during graceful shutdown + become: true + become_user: stack + script: "verify_cold_migration.sh server-cm1" + + # Sleep for 180 sec: default graceful_shutdown_timeout + - name: Sleep for 180 seconds to allow source compute graceful shutdown to complete + pause: + seconds: 180 + + - name: Verify compute service is stopped after graceful shutdown + become: true + become_user: stack + script: "start_and_verify_compute_service.sh {{ hostvars['compute1']['ansible_hostname'] }} inactive" + + - name: Start and verify subnode compute service is running + become: true + become_user: stack + script: "start_and_verify_compute_service.sh {{ hostvars['compute1']['ansible_hostname'] }}" + + - name: Cleanup test servers + become: true + become_user: stack + script: "cleanup_test_servers.sh server-cm1" + ignore_errors: true + - name: Fail if any test is skipped fail: msg: "One or more test is skipped due to operation is either completed or timed out before SIGTERM signal." - when: live_migrations_completed_or_timeout + when: live_migrations_completed_or_timeout or cold_migrations_completed_or_timeout