Merge "Prepare resize/cold migration for graceful shutdown"

This commit is contained in:
Zuul
2026-02-26 17:45:45 +00:00
committed by Gerrit Code Review
6 changed files with 237 additions and 29 deletions

View File

@@ -672,7 +672,14 @@ class ComputeAPI(object):
if not client.can_send_version('6.0'):
# We always pass the instance until the 5.0 version
msg_args['instance'] = instance
cctxt = client.prepare(
# NOTE(gmaan): This is called by the destination compute's
# revert_resize() on source compute. Destination compute checks
# with source compute if instance storage is shared or not so
# that it can decide if disks need to be destroyed. Make this
# RPC request to 'compute-alt' topic so that the shutdown request
# will wait for the compute to finish the revert resize.
cctxt = self.prepare_for_alt_rpcserver(
client,
server=_compute_host(host, instance), version=version)
return cctxt.call(ctxt, 'check_instance_shared_storage', **msg_args)
@@ -754,7 +761,11 @@ class ComputeAPI(object):
msg_args.pop('request_spec')
version = '5.0'
cctxt = client.prepare(
# NOTE(gmaan): This is final step of resize/migration. Make this
# RPC request to 'compute-alt' topic so that the shutdown request
# will wait for the compute to finish the in-progress migration.
cctxt = self.prepare_for_alt_rpcserver(
client,
server=host, version=version)
cctxt.cast(ctxt, 'finish_resize', **msg_args)
@@ -773,8 +784,14 @@ class ComputeAPI(object):
msg_args.pop('request_spec')
version = '5.0'
cctxt = client.prepare(
server=host, version=version)
# NOTE(gmaan): This is called by the destination compute's
# revert_resize() on source compute. Destination compute has deleted
# the new instance on destination and asked source compute to power
# on the old instance on source. Make this RPC request to 'compute-alt'
# topic so that the shutdown request will wait for
# the compute to finish the revert resize.
cctxt = self.prepare_for_alt_rpcserver(
client, server=host, version=version)
cctxt.cast(ctxt, 'finish_revert_resize', **msg_args)
def finish_snapshot_based_resize_at_dest(
@@ -813,7 +830,12 @@ class ComputeAPI(object):
msg_args['request_spec'] = request_spec
if not client.can_send_version(version):
raise exception.MigrationError(reason=_('Compute too old'))
cctxt = client.prepare(
# NOTE(gmaan): This is the cross-cell resize case to finish the
# snapshot-based resize on the destination compute. Make this RPC
# request to 'compute-alt' topic so that the shutdown request will
# wait for the compute to finish the in-progress cross-cell resize.
cctxt = self.prepare_for_alt_rpcserver(
client,
server=migration.dest_compute, version=version,
call_monitor_timeout=CONF.rpc_response_timeout,
timeout=CONF.long_rpc_timeout)
@@ -847,10 +869,17 @@ class ComputeAPI(object):
client = self.router.client(ctxt)
if not client.can_send_version(version):
raise exception.MigrationError(reason=_('Compute too old'))
cctxt = client.prepare(server=migration.source_compute,
version=version,
call_monitor_timeout=CONF.rpc_response_timeout,
timeout=CONF.long_rpc_timeout)
# NOTE(gmaan): This is called after the
# revert_snapshot_based_resize_at_dest so revert resize is completed on
# destination side. We should complete it on source compute also. Make
# this RPC request to 'compute-alt' topic so that the shutdown request
# will wait for the compute to finish the cross-cell revert resize.
cctxt = self.prepare_for_alt_rpcserver(
client,
server=migration.source_compute,
version=version,
call_monitor_timeout=CONF.rpc_response_timeout,
timeout=CONF.long_rpc_timeout)
return cctxt.call(
ctxt, 'finish_revert_snapshot_based_resize_at_source',
instance=instance, migration=migration)
@@ -1046,6 +1075,13 @@ class ComputeAPI(object):
version = '5.0'
msg_args['request_spec'] = (
request_spec.to_legacy_request_spec_dict())
# NOTE(gmaan): This is called by the conductor on the destination
# compute to check and start the resize/cold migration on source.
# This method can be called again by the conductor if the destination
# compute asks the conductor to reschedule the migration to another
# host. In both cases, resize is not yet started, so this RPC request
# uses 'compute' topic. If a shutdown is initiated, then not taking
# the resize request at this stage is acceptable.
cctxt = client.prepare(server=host, version=version)
cctxt.cast(ctxt, 'prep_resize', **msg_args)
@@ -1094,6 +1130,10 @@ class ComputeAPI(object):
msg_args['request_spec'] = request_spec
if not client.can_send_version(version):
raise exception.MigrationPreCheckError(reason=_('Compute too old'))
# NOTE(gmaan): This is the cross-cell resize case, and resize is not
# yet started, so this RPC request uses 'compute' topic. If a shutdown
# is initiated, then not taking the resize request at this stage is
# acceptable.
cctxt = client.prepare(server=destination, version=version,
call_monitor_timeout=CONF.rpc_response_timeout,
timeout=CONF.long_rpc_timeout)
@@ -1130,10 +1170,17 @@ class ComputeAPI(object):
client = self.router.client(ctxt)
if not client.can_send_version(version):
raise exception.MigrationError(reason=_('Compute too old'))
cctxt = client.prepare(server=_compute_host(None, instance),
version=version,
call_monitor_timeout=CONF.rpc_response_timeout,
timeout=CONF.long_rpc_timeout)
# NOTE(gmaan): This is the cross-cell resize case, and called after
# resize is prepared on destination compute. At this point, resize
# is started so make this RPC request to 'compute-alt' topic so that
# the shutdown request will wait for the compute to finish the
# in-progress cross-cell resize.
cctxt = self.prepare_for_alt_rpcserver(
client,
server=_compute_host(None, instance),
version=version,
call_monitor_timeout=CONF.rpc_response_timeout,
timeout=CONF.long_rpc_timeout)
return cctxt.call(
ctxt, 'prep_snapshot_based_resize_at_source',
instance=instance, migration=migration, snapshot_id=snapshot_id)
@@ -1261,7 +1308,13 @@ class ComputeAPI(object):
msg_args.pop('request_spec')
version = '5.0'
cctxt = client.prepare(server=_compute_host(None, instance),
# NOTE(gmaan): This is called by destination compute's prep_resize()
# to start the migration on source compute. Make this RPC request to
# 'compute-alt' topic so that the shutdown request will wait for
# the compute to finish the in-progress migration.
cctxt = self.prepare_for_alt_rpcserver(
client,
server=_compute_host(None, instance),
version=version)
cctxt.cast(ctxt, 'resize_instance', **msg_args)
@@ -1286,6 +1339,11 @@ class ComputeAPI(object):
msg_args.pop('request_spec')
version = '5.0'
# NOTE(gmaan): This revert resize is initiated by API on the
# destination compute, and the revert resize has not yet started.
# So this RPC request uses the 'compute' topic. If a shutdown is
# initiated, then not taking the revert resize request at this stage
# is acceptable.
cctxt = client.prepare(
server=_compute_host(host, instance), version=version)
cctxt.cast(ctxt, 'revert_resize', **msg_args)
@@ -1313,6 +1371,11 @@ class ComputeAPI(object):
client = self.router.client(ctxt)
if not client.can_send_version(version):
raise exception.MigrationError(reason=_('Compute too old'))
# NOTE(gmaan): This is the revert resize for the cross-cell resize case. It is
# initiated by the conductor and the revert resize has not yet started.
# So this RPC request uses the 'compute' topic. If a shutdown is
# initiated, then not taking the revert resize request at this stage
# is acceptable.
cctxt = client.prepare(server=migration.dest_compute,
version=version,
call_monitor_timeout=CONF.rpc_response_timeout,

View File

@@ -229,7 +229,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
self._test_compute_api('check_instance_shared_storage', 'call',
expected_args,
instance=self.fake_instance_obj, data='foo',
version='6.0')
version='6.0',
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
def test_check_instance_shared_storage_old_compute(self):
ctxt = context.RequestContext('fake_user', 'fake_project')
@@ -246,7 +247,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
mock_client.can_send_version.assert_has_calls([mock.call('6.0'),
mock.call('6.0')])
mock_client.prepare.assert_called_with(
server=self.fake_instance_obj.host, version='5.0')
server=self.fake_instance_obj.host, version='5.0',
topic=compute_rpcapi.RPC_TOPIC)
mock_cctx.call.assert_called_with(
ctxt, 'check_instance_shared_storage',
instance=self.fake_instance_obj, data='foo')
@@ -275,7 +277,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
self._test_compute_api('finish_resize', 'cast',
instance=self.fake_instance_obj, migration={'id': 'foo'},
image='image', disk_info='disk_info', host='host',
request_spec=self.fake_request_spec_obj, version='6.0')
request_spec=self.fake_request_spec_obj, version='6.0',
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
def test_finish_resize_old_compute(self):
ctxt = context.RequestContext('fake_user', 'fake_project')
@@ -297,7 +300,7 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
mock_client.can_send_version.assert_has_calls([mock.call('6.0'),
mock.call('5.2')])
mock_client.prepare.assert_called_with(
server='host', version='5.0')
server='host', version='5.0', topic=compute_rpcapi.RPC_TOPIC)
mock_cctx.cast.assert_called_with(
ctxt, 'finish_resize', instance=self.fake_instance_obj,
migration=mock.sentinel.migration, image='image',
@@ -307,7 +310,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
self._test_compute_api('finish_revert_resize', 'cast',
instance=self.fake_instance_obj, migration={'id': 'fake_id'},
host='host', request_spec=self.fake_request_spec_obj,
version='6.0')
version='6.0',
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
def test_finish_revert_resize_old_compute(self):
ctxt = context.RequestContext('fake_user', 'fake_project')
@@ -328,7 +332,7 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
mock_client.can_send_version.assert_has_calls([mock.call('6.0'),
mock.call('5.2')])
mock_client.prepare.assert_called_with(
server='host', version='5.0')
server='host', version='5.0', topic=compute_rpcapi.RPC_TOPIC)
mock_cctx.cast.assert_called_with(
ctxt, 'finish_revert_resize', instance=self.fake_instance_obj,
migration=mock.sentinel.migration)
@@ -660,7 +664,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
migration=migration_obj.Migration(),
snapshot_id=uuids.snapshot_id,
# client.prepare kwargs
version='6.0', call_monitor_timeout=60, timeout=1234)
version='6.0', call_monitor_timeout=60, timeout=1234,
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
@mock.patch('nova.rpc.ClientRouter.client')
def test_prep_snapshot_based_resize_at_source_old_compute(
@@ -694,7 +699,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
request_spec=objects.RequestSpec(),
# client.prepare kwargs
version='6.0', prepare_server='dest',
call_monitor_timeout=60, timeout=1234)
call_monitor_timeout=60, timeout=1234,
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
@mock.patch('nova.rpc.ClientRouter.client')
def test_finish_snapshot_based_resize_at_dest_old_compute(self, client):
@@ -722,7 +728,7 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
rpcapi.router.client = mock.Mock()
mock_client = mock.MagicMock()
rpcapi.router.client.return_value = mock_client
mock_client.can_send_version.side_effect = [False, False, True]
mock_client.can_send_version.side_effect = [False, False, True, False]
mock_cctx = mock.MagicMock()
mock_client.prepare.return_value = mock_cctx
expected_args = {'instance': self.fake_instance_obj,
@@ -736,10 +742,12 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
mock_client.can_send_version.assert_has_calls([mock.call('6.0'),
mock.call('6.0'),
mock.call('5.7')])
mock.call('5.7'),
mock.call('6.5')])
mock_client.prepare.assert_called_with(
server='dest', version='5.7',
call_monitor_timeout=60, timeout=1234)
call_monitor_timeout=60, timeout=1234,
topic=compute_rpcapi.RPC_TOPIC)
mock_cctx.call.assert_called_with(
self.context, 'finish_snapshot_based_resize_at_dest',
**expected_args)
@@ -809,7 +817,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
migration=migration_obj.Migration(source_compute='source'),
# client.prepare kwargs
version='6.0', prepare_server='source',
call_monitor_timeout=60, timeout=1234)
call_monitor_timeout=60, timeout=1234,
topic_alt='compute-alt')
@mock.patch('nova.rpc.ClientRouter.client')
def test_finish_revert_snapshot_based_resize_at_source_old_compute(
@@ -995,7 +1004,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
instance=self.fake_instance_obj, migration={'id': 'fake_id'},
image='image', flavor=self.fake_flavor_obj,
clean_shutdown=True, request_spec=self.fake_request_spec_obj,
version='6.0')
version='6.0',
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
def test_resize_instance_old_compute(self):
ctxt = context.RequestContext('fake_user', 'fake_project')
@@ -1017,7 +1027,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
mock_client.can_send_version.assert_has_calls([mock.call('6.0'),
mock.call('5.2')])
mock_client.prepare.assert_called_with(
server=self.fake_instance_obj.host, version='5.0')
server=self.fake_instance_obj.host, version='5.0',
topic=compute_rpcapi.RPC_TOPIC)
mock_cctx.cast.assert_called_with(
ctxt, 'resize_instance', instance=self.fake_instance_obj,
migration=mock.sentinel.migration, image='image',

View File

@@ -12,6 +12,9 @@ features:
Currently below operations are using second RPC server:
* Live migration
* Cold migration
* Resize
* Revert resize
* Server external Event
* Get Console output

View File

@@ -0,0 +1,46 @@
#!/bin/bash
# Create a test server on the subnode and cold-migrate it to the controller,
# then wait until the migration is actually in progress so the caller can
# send SIGTERM to the source compute while the migration is running.
# Exit 0: migration in progress; exit 2: already finished or timed out.
source /opt/stack/devstack/openrc admin
set -x
set -e
# Maximum number of polling iterations (one-second pause per iteration below,
# so this is roughly a 196-second time budget).
timeout=196
server_cm=$1
image_id=$(openstack image list -f value -c ID | awk 'NR==1{print $1}')
flavor_id=$(openstack flavor list -f value -c ID | awk 'NR==1{print $1}')
network_id=$(openstack network list --no-share -f value -c ID | awk 'NR==1{print $1}')
echo "Creating test server on subnode for graceful shutdown cold migration test"
openstack --os-compute-api-version 2.74 server create --image ${image_id} --flavor ${flavor_id} \
--nic net-id=${network_id} --host ${SUBNODE_HOSTNAME} --wait ${server_cm}
echo "Starting cold migration of ${server_cm} to ${CONTROLLER_HOSTNAME}"
openstack --os-compute-api-version 2.56 server migrate \
--host ${CONTROLLER_HOSTNAME} ${server_cm}
# Wait for the migrations to be in progress before returning so that the
# SIGTERM can be sent while the migrations are in progress.
count=0
while true; do
    # Best-effort polling: stderr is suppressed, and '|| true' keeps a
    # transient CLI/API failure from aborting the script under 'set -e'
    # (the first pipeline already exits 0 via 'head').
    cold_migration_status=$(openstack server migration list ${server_cm} -f value -c Status 2>/dev/null | head -1)
    server_task_state=$(openstack server show ${server_cm} -f value -c OS-EXT-STS:task_state 2>/dev/null || true)
    server_status=$(openstack server show ${server_cm} -f value -c status 2>/dev/null || true)
    if [ "${cold_migration_status}" == "migrating" ] || \
       [ "${cold_migration_status}" == "post-migrating" ] || \
       [ "${server_task_state}" == "resize_migrating" ] || \
       [ "${server_task_state}" == "resize_migrated" ] || \
       [ "${server_task_state}" == "resize_finish" ]; then
        echo "Cold migration is in progress"
        break
    elif [ "${cold_migration_status}" == "finished" ] || [ "${server_status}" == "VERIFY_RESIZE" ]; then
        echo "Cold migration appears to have already completed"
        exit 2
    fi
    count=$((count+1))
    if [ ${count} -eq ${timeout} ]; then
        echo "Timed out waiting for migrations to start"
        exit 2
    fi
    # BUG FIX: the original loop had no sleep, so it busy-polled the API and
    # exhausted the 'timeout' iteration budget almost immediately. Pause
    # between polls so the timeout is a real time budget.
    sleep 1
done

View File

@@ -0,0 +1,35 @@
#!/bin/bash
# Poll the given server until its cold migration completes, i.e. it reaches
# VERIFY_RESIZE with no task state (awaiting resize confirmation).
# Exit 0 on success, 6 if the server goes to ERROR, 5 on timeout.
source /opt/stack/devstack/openrc admin
set -x
set -e
server=$1
# Polling budget: up to 'timeout' iterations with a 5-second pause each.
timeout=360
attempt=0
migration_start=$(date +%s)
while :; do
    status=$(openstack server show ${server} -f value -c status)
    task_state=$(openstack server show ${server} -f value -c OS-EXT-STS:task_state)
    if [[ "${status}" == "VERIFY_RESIZE" ]]; then
        # Migration is done only once the task state has cleared as well.
        if [[ "${task_state}" == "None" || -z "${task_state}" ]]; then
            migration_end=$(date +%s)
            migration_duration=$((migration_end - migration_start))
            echo "Cold migration completed in ${migration_duration} seconds."
            break
        fi
    fi
    if [[ "${status}" == "ERROR" ]]; then
        echo "Server went to ERROR status during cold migration"
        exit 6
    fi
    sleep 5
    attempt=$((attempt + 1))
    if [[ ${attempt} -eq ${timeout} ]]; then
        echo "Timed out waiting for cold migration to complete"
        exit 5
    fi
done

View File

@@ -50,7 +50,57 @@
script: "cleanup_test_servers.sh server-lm1"
ignore_errors: true
- name: Graceful shutdown source compute cold migration
block:
- name: Start cold migrations of test servers
become: true
become_user: stack
script: "start_cold_migration.sh server-cm1"
environment:
SUBNODE_HOSTNAME: "{{ hostvars['compute1']['ansible_hostname'] }}"
CONTROLLER_HOSTNAME: "{{ hostvars['controller']['ansible_hostname'] }}"
register: start_cold_migrations_result
failed_when: start_cold_migrations_result.rc not in [0, 2]
- name: Set fact if migration is completed or timed out before SIGTERM to source compute
set_fact:
cold_migrations_completed_or_timeout: "{{ start_cold_migrations_result.rc == 2 }}"
- name: Run graceful shutdown tests
when: not cold_migrations_completed_or_timeout
block:
- name: Send SIGTERM to source compute to start the source compute graceful shutdown
delegate_to: compute1
become: true
shell: "kill -15 $(systemctl show devstack@n-cpu -p MainPID --value)"
- name: Verify cold migration is completed during graceful shutdown
become: true
become_user: stack
script: "verify_cold_migration.sh server-cm1"
# Sleep for 180 sec: default graceful_shutdown_timeout
- name: Sleep for 180 seconds to allow source compute graceful shutdown to complete
pause:
seconds: 180
- name: Verify compute service is stopped after graceful shutdown
become: true
become_user: stack
script: "start_and_verify_compute_service.sh {{ hostvars['compute1']['ansible_hostname'] }} inactive"
- name: Start and verify subnode compute service is running
become: true
become_user: stack
script: "start_and_verify_compute_service.sh {{ hostvars['compute1']['ansible_hostname'] }}"
- name: Cleanup test servers
become: true
become_user: stack
script: "cleanup_test_servers.sh server-cm1"
ignore_errors: true
- name: Fail if any test is skipped
fail:
msg: "One or more tests were skipped because the operation either completed or timed out before the SIGTERM signal."
when: live_migrations_completed_or_timeout
when: live_migrations_completed_or_timeout or cold_migrations_completed_or_timeout