Merge "Prepare resize/cold migration for graceful shutdown"

This commit is contained in:
Zuul
2026-02-26 17:45:45 +00:00
committed by Gerrit Code Review
6 changed files with 237 additions and 29 deletions

View File

@@ -672,7 +672,14 @@ class ComputeAPI(object):
if not client.can_send_version('6.0'):
# We always pass the instance until the 5.0 version
msg_args['instance'] = instance
cctxt = client.prepare(
# NOTE(gmaan): This is called by the destination compute's
# revert_resize() on source compute. Destination compute checks
# with source compute if instance storage is shared or not so
# that it can decide if disks need to be destroyed. Make this
# RPC request to 'compute-alt' topic so that the shutdown request
# will wait for the compute to finish the revert resize.
cctxt = self.prepare_for_alt_rpcserver(
client,
server=_compute_host(host, instance), version=version)
return cctxt.call(ctxt, 'check_instance_shared_storage', **msg_args)
@@ -754,7 +761,11 @@ class ComputeAPI(object):
msg_args.pop('request_spec')
version = '5.0'
cctxt = client.prepare(
# NOTE(gmaan): This is final step of resize/migration. Make this
# RPC request to 'compute-alt' topic so that the shutdown request
# will wait for the compute to finish the in-progress migration.
cctxt = self.prepare_for_alt_rpcserver(
client,
server=host, version=version)
cctxt.cast(ctxt, 'finish_resize', **msg_args)
@@ -773,8 +784,14 @@ class ComputeAPI(object):
msg_args.pop('request_spec')
version = '5.0'
cctxt = client.prepare(
server=host, version=version)
# NOTE(gmaan): This is called by the destination compute's
# revert_resize() on source compute. Destination compute has deleted
# the new instance on destination and asked source compute to power
# on the old instance on source. Make this RPC request to 'compute-alt'
# topic so that the shutdown request will wait for
# the compute to finish the revert resize.
cctxt = self.prepare_for_alt_rpcserver(
client, server=host, version=version)
cctxt.cast(ctxt, 'finish_revert_resize', **msg_args)
def finish_snapshot_based_resize_at_dest(
@@ -813,7 +830,12 @@ class ComputeAPI(object):
msg_args['request_spec'] = request_spec
if not client.can_send_version(version):
raise exception.MigrationError(reason=_('Compute too old'))
cctxt = client.prepare(
# NOTE(gmaan): This is the cross-cell resize case to finish the
# snapshot-based resize on the destination compute. Make this RPC
# request to 'compute-alt' topic so that the shutdown request will
# wait for the compute to finish the in-progress cross-cell resize.
cctxt = self.prepare_for_alt_rpcserver(
client,
server=migration.dest_compute, version=version,
call_monitor_timeout=CONF.rpc_response_timeout,
timeout=CONF.long_rpc_timeout)
@@ -847,10 +869,17 @@ class ComputeAPI(object):
client = self.router.client(ctxt)
if not client.can_send_version(version):
raise exception.MigrationError(reason=_('Compute too old'))
cctxt = client.prepare(server=migration.source_compute,
version=version,
call_monitor_timeout=CONF.rpc_response_timeout,
timeout=CONF.long_rpc_timeout)
# NOTE(gmaan): This is called after the
# revert_snapshot_based_resize_at_dest so revert resize is completed on
# destination side. We should complete it on source compute also. Make
# this RPC request to 'compute-alt' topic so that the shutdown request
# will wait for the compute to finish the cross-cell revert resize.
cctxt = self.prepare_for_alt_rpcserver(
client,
server=migration.source_compute,
version=version,
call_monitor_timeout=CONF.rpc_response_timeout,
timeout=CONF.long_rpc_timeout)
return cctxt.call(
ctxt, 'finish_revert_snapshot_based_resize_at_source',
instance=instance, migration=migration)
@@ -1046,6 +1075,13 @@ class ComputeAPI(object):
version = '5.0'
msg_args['request_spec'] = (
request_spec.to_legacy_request_spec_dict())
# NOTE(gmaan): This is called by the conductor on the destination
# compute to check and start the resize/cold migration on source.
# This method can be called again by the conductor if the destination
# compute asks the conductor to reschedule the migration to another
# host. In both cases, resize is not yet started, so this RPC request
# uses 'compute' topic. If a shutdown is initiated, then not taking
# the resize request at this stage is acceptable.
cctxt = client.prepare(server=host, version=version)
cctxt.cast(ctxt, 'prep_resize', **msg_args)
@@ -1094,6 +1130,10 @@ class ComputeAPI(object):
msg_args['request_spec'] = request_spec
if not client.can_send_version(version):
raise exception.MigrationPreCheckError(reason=_('Compute too old'))
# NOTE(gmaan): This is the cross-cell resize case, and resize is not
# yet started, so this RPC request uses 'compute' topic. If a shutdown
# is initiated, then not taking the resize request at this stage is
# acceptable.
cctxt = client.prepare(server=destination, version=version,
call_monitor_timeout=CONF.rpc_response_timeout,
timeout=CONF.long_rpc_timeout)
@@ -1130,10 +1170,17 @@ class ComputeAPI(object):
client = self.router.client(ctxt)
if not client.can_send_version(version):
raise exception.MigrationError(reason=_('Compute too old'))
cctxt = client.prepare(server=_compute_host(None, instance),
version=version,
call_monitor_timeout=CONF.rpc_response_timeout,
timeout=CONF.long_rpc_timeout)
# NOTE(gmaan): This is the cross-cell resize case, and called after
# resize is prepared on destination compute. At this point, resize
# is started so make this RPC request to 'compute-alt' topic so that
# the shutdown request will wait for the compute to finish the
# in-progress cross-cell resize.
cctxt = self.prepare_for_alt_rpcserver(
client,
server=_compute_host(None, instance),
version=version,
call_monitor_timeout=CONF.rpc_response_timeout,
timeout=CONF.long_rpc_timeout)
return cctxt.call(
ctxt, 'prep_snapshot_based_resize_at_source',
instance=instance, migration=migration, snapshot_id=snapshot_id)
@@ -1261,7 +1308,13 @@ class ComputeAPI(object):
msg_args.pop('request_spec')
version = '5.0'
cctxt = client.prepare(server=_compute_host(None, instance),
# NOTE(gmaan): This is called by destination compute's prep_resize()
# to start the migration on source compute. Make this RPC request to
# 'compute-alt' topic so that the shutdown request will wait for
# the compute to finish the in-progress migration.
cctxt = self.prepare_for_alt_rpcserver(
client,
server=_compute_host(None, instance),
version=version)
cctxt.cast(ctxt, 'resize_instance', **msg_args)
@@ -1286,6 +1339,11 @@ class ComputeAPI(object):
msg_args.pop('request_spec')
version = '5.0'
# NOTE(gmaan): This revert resize is initiated by API on the
# destination compute, and the revert resize has not yet started.
# So this RPC request uses the 'compute' topic. If a shutdown is
# initiated, then not taking the revert resize request at this stage
# is acceptable.
cctxt = client.prepare(
server=_compute_host(host, instance), version=version)
cctxt.cast(ctxt, 'revert_resize', **msg_args)
@@ -1313,6 +1371,11 @@ class ComputeAPI(object):
client = self.router.client(ctxt)
if not client.can_send_version(version):
raise exception.MigrationError(reason=_('Compute too old'))
# NOTE(gmaan): This is the revert resize for the cross-cell resize case. It is
# initiated by the conductor and the revert resize has not yet started.
# So this RPC request uses the 'compute' topic. If a shutdown is
# initiated, then not taking the revert resize request at this stage
# is acceptable.
cctxt = client.prepare(server=migration.dest_compute,
version=version,
call_monitor_timeout=CONF.rpc_response_timeout,

View File

@@ -229,7 +229,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
self._test_compute_api('check_instance_shared_storage', 'call',
expected_args,
instance=self.fake_instance_obj, data='foo',
version='6.0')
version='6.0',
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
def test_check_instance_shared_storage_old_compute(self):
ctxt = context.RequestContext('fake_user', 'fake_project')
@@ -246,7 +247,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
mock_client.can_send_version.assert_has_calls([mock.call('6.0'),
mock.call('6.0')])
mock_client.prepare.assert_called_with(
server=self.fake_instance_obj.host, version='5.0')
server=self.fake_instance_obj.host, version='5.0',
topic=compute_rpcapi.RPC_TOPIC)
mock_cctx.call.assert_called_with(
ctxt, 'check_instance_shared_storage',
instance=self.fake_instance_obj, data='foo')
@@ -275,7 +277,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
self._test_compute_api('finish_resize', 'cast',
instance=self.fake_instance_obj, migration={'id': 'foo'},
image='image', disk_info='disk_info', host='host',
request_spec=self.fake_request_spec_obj, version='6.0')
request_spec=self.fake_request_spec_obj, version='6.0',
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
def test_finish_resize_old_compute(self):
ctxt = context.RequestContext('fake_user', 'fake_project')
@@ -297,7 +300,7 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
mock_client.can_send_version.assert_has_calls([mock.call('6.0'),
mock.call('5.2')])
mock_client.prepare.assert_called_with(
server='host', version='5.0')
server='host', version='5.0', topic=compute_rpcapi.RPC_TOPIC)
mock_cctx.cast.assert_called_with(
ctxt, 'finish_resize', instance=self.fake_instance_obj,
migration=mock.sentinel.migration, image='image',
@@ -307,7 +310,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
self._test_compute_api('finish_revert_resize', 'cast',
instance=self.fake_instance_obj, migration={'id': 'fake_id'},
host='host', request_spec=self.fake_request_spec_obj,
version='6.0')
version='6.0',
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
def test_finish_revert_resize_old_compute(self):
ctxt = context.RequestContext('fake_user', 'fake_project')
@@ -328,7 +332,7 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
mock_client.can_send_version.assert_has_calls([mock.call('6.0'),
mock.call('5.2')])
mock_client.prepare.assert_called_with(
server='host', version='5.0')
server='host', version='5.0', topic=compute_rpcapi.RPC_TOPIC)
mock_cctx.cast.assert_called_with(
ctxt, 'finish_revert_resize', instance=self.fake_instance_obj,
migration=mock.sentinel.migration)
@@ -660,7 +664,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
migration=migration_obj.Migration(),
snapshot_id=uuids.snapshot_id,
# client.prepare kwargs
version='6.0', call_monitor_timeout=60, timeout=1234)
version='6.0', call_monitor_timeout=60, timeout=1234,
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
@mock.patch('nova.rpc.ClientRouter.client')
def test_prep_snapshot_based_resize_at_source_old_compute(
@@ -694,7 +699,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
request_spec=objects.RequestSpec(),
# client.prepare kwargs
version='6.0', prepare_server='dest',
call_monitor_timeout=60, timeout=1234)
call_monitor_timeout=60, timeout=1234,
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
@mock.patch('nova.rpc.ClientRouter.client')
def test_finish_snapshot_based_resize_at_dest_old_compute(self, client):
@@ -722,7 +728,7 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
rpcapi.router.client = mock.Mock()
mock_client = mock.MagicMock()
rpcapi.router.client.return_value = mock_client
mock_client.can_send_version.side_effect = [False, False, True]
mock_client.can_send_version.side_effect = [False, False, True, False]
mock_cctx = mock.MagicMock()
mock_client.prepare.return_value = mock_cctx
expected_args = {'instance': self.fake_instance_obj,
@@ -736,10 +742,12 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
mock_client.can_send_version.assert_has_calls([mock.call('6.0'),
mock.call('6.0'),
mock.call('5.7')])
mock.call('5.7'),
mock.call('6.5')])
mock_client.prepare.assert_called_with(
server='dest', version='5.7',
call_monitor_timeout=60, timeout=1234)
call_monitor_timeout=60, timeout=1234,
topic=compute_rpcapi.RPC_TOPIC)
mock_cctx.call.assert_called_with(
self.context, 'finish_snapshot_based_resize_at_dest',
**expected_args)
@@ -809,7 +817,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
migration=migration_obj.Migration(source_compute='source'),
# client.prepare kwargs
version='6.0', prepare_server='source',
call_monitor_timeout=60, timeout=1234)
call_monitor_timeout=60, timeout=1234,
topic_alt='compute-alt')
@mock.patch('nova.rpc.ClientRouter.client')
def test_finish_revert_snapshot_based_resize_at_source_old_compute(
@@ -995,7 +1004,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
instance=self.fake_instance_obj, migration={'id': 'fake_id'},
image='image', flavor=self.fake_flavor_obj,
clean_shutdown=True, request_spec=self.fake_request_spec_obj,
version='6.0')
version='6.0',
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
def test_resize_instance_old_compute(self):
ctxt = context.RequestContext('fake_user', 'fake_project')
@@ -1017,7 +1027,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
mock_client.can_send_version.assert_has_calls([mock.call('6.0'),
mock.call('5.2')])
mock_client.prepare.assert_called_with(
server=self.fake_instance_obj.host, version='5.0')
server=self.fake_instance_obj.host, version='5.0',
topic=compute_rpcapi.RPC_TOPIC)
mock_cctx.cast.assert_called_with(
ctxt, 'resize_instance', instance=self.fake_instance_obj,
migration=mock.sentinel.migration, image='image',

View File

@@ -12,6 +12,9 @@ features:
Currently below operations are using second RPC server:
* Live migration
* Cold migration
* Resize
* Revert resize
* Server external Event
* Get Console output

View File

@@ -0,0 +1,46 @@
#!/bin/bash
# Create a test server on the subnode and cold-migrate it to the controller,
# then wait until the migration is actually in progress so the caller can
# send SIGTERM to the source compute while the migration is running.
# Exit 0: migration in progress; exit 2: already finished or timed out.
source /opt/stack/devstack/openrc admin
set -x
set -e
# Maximum number of polling iterations (one-second pause per iteration below,
# so this is roughly a 196-second time budget).
timeout=196
server_cm=$1
image_id=$(openstack image list -f value -c ID | awk 'NR==1{print $1}')
flavor_id=$(openstack flavor list -f value -c ID | awk 'NR==1{print $1}')
network_id=$(openstack network list --no-share -f value -c ID | awk 'NR==1{print $1}')
echo "Creating test server on subnode for graceful shutdown cold migration test"
openstack --os-compute-api-version 2.74 server create --image ${image_id} --flavor ${flavor_id} \
--nic net-id=${network_id} --host ${SUBNODE_HOSTNAME} --wait ${server_cm}
echo "Starting cold migration of ${server_cm} to ${CONTROLLER_HOSTNAME}"
openstack --os-compute-api-version 2.56 server migrate \
--host ${CONTROLLER_HOSTNAME} ${server_cm}
# Wait for the migrations to be in progress before returning so that the
# SIGTERM can be sent while the migrations are in progress.
count=0
while true; do
    # Best-effort polling: stderr is suppressed, and '|| true' keeps a
    # transient CLI/API failure from aborting the script under 'set -e'
    # (the first pipeline already exits 0 via 'head').
    cold_migration_status=$(openstack server migration list ${server_cm} -f value -c Status 2>/dev/null | head -1)
    server_task_state=$(openstack server show ${server_cm} -f value -c OS-EXT-STS:task_state 2>/dev/null || true)
    server_status=$(openstack server show ${server_cm} -f value -c status 2>/dev/null || true)
    if [ "${cold_migration_status}" == "migrating" ] || \
       [ "${cold_migration_status}" == "post-migrating" ] || \
       [ "${server_task_state}" == "resize_migrating" ] || \
       [ "${server_task_state}" == "resize_migrated" ] || \
       [ "${server_task_state}" == "resize_finish" ]; then
        echo "Cold migration is in progress"
        break
    elif [ "${cold_migration_status}" == "finished" ] || [ "${server_status}" == "VERIFY_RESIZE" ]; then
        echo "Cold migration appears to have already completed"
        exit 2
    fi
    count=$((count+1))
    if [ ${count} -eq ${timeout} ]; then
        echo "Timed out waiting for migrations to start"
        exit 2
    fi
    # BUG FIX: the original loop had no sleep, so it busy-polled the API and
    # exhausted the 'timeout' iteration budget almost immediately. Pause
    # between polls so the timeout is a real time budget.
    sleep 1
done

View File

@@ -0,0 +1,35 @@
#!/bin/bash
# Poll the given server until its cold migration completes, i.e. it reaches
# VERIFY_RESIZE with no task state (awaiting resize confirmation).
# Exit 0 on success, 6 if the server goes to ERROR, 5 on timeout.
source /opt/stack/devstack/openrc admin
set -x
set -e
server=$1
# Polling budget: up to 'timeout' iterations with a 5-second pause each.
timeout=360
attempt=0
migration_start=$(date +%s)
while :; do
    status=$(openstack server show ${server} -f value -c status)
    task_state=$(openstack server show ${server} -f value -c OS-EXT-STS:task_state)
    if [[ "${status}" == "VERIFY_RESIZE" ]]; then
        # Migration is done only once the task state has cleared as well.
        if [[ "${task_state}" == "None" || -z "${task_state}" ]]; then
            migration_end=$(date +%s)
            migration_duration=$((migration_end - migration_start))
            echo "Cold migration completed in ${migration_duration} seconds."
            break
        fi
    fi
    if [[ "${status}" == "ERROR" ]]; then
        echo "Server went to ERROR status during cold migration"
        exit 6
    fi
    sleep 5
    attempt=$((attempt + 1))
    if [[ ${attempt} -eq ${timeout} ]]; then
        echo "Timed out waiting for cold migration to complete"
        exit 5
    fi
done

View File

@@ -50,7 +50,57 @@
script: "cleanup_test_servers.sh server-lm1"
ignore_errors: true
- name: Graceful shutdown source compute cold migration
block:
- name: Start cold migrations of test servers
become: true
become_user: stack
script: "start_cold_migration.sh server-cm1"
environment:
SUBNODE_HOSTNAME: "{{ hostvars['compute1']['ansible_hostname'] }}"
CONTROLLER_HOSTNAME: "{{ hostvars['controller']['ansible_hostname'] }}"
register: start_cold_migrations_result
failed_when: start_cold_migrations_result.rc not in [0, 2]
- name: Set fact if migration is completed or timed out before SIGTERM to source compute
set_fact:
cold_migrations_completed_or_timeout: "{{ start_cold_migrations_result.rc == 2 }}"
- name: Run graceful shutdown tests
when: not cold_migrations_completed_or_timeout
block:
- name: Send SIGTERM to source compute to start the source compute graceful shutdown
delegate_to: compute1
become: true
shell: "kill -15 $(systemctl show devstack@n-cpu -p MainPID --value)"
- name: Verify cold migration is completed during graceful shutdown
become: true
become_user: stack
script: "verify_cold_migration.sh server-cm1"
# Sleep for 180 sec: default graceful_shutdown_timeout
- name: Sleep for 180 seconds to allow source compute graceful shutdown to complete
pause:
seconds: 180
- name: Verify compute service is stopped after graceful shutdown
become: true
become_user: stack
script: "start_and_verify_compute_service.sh {{ hostvars['compute1']['ansible_hostname'] }} inactive"
- name: Start and verify subnode compute service is running
become: true
become_user: stack
script: "start_and_verify_compute_service.sh {{ hostvars['compute1']['ansible_hostname'] }}"
- name: Cleanup test servers
become: true
become_user: stack
script: "cleanup_test_servers.sh server-cm1"
ignore_errors: true
- name: Fail if any test is skipped
fail:
msg: "One or more tests were skipped because the operation either completed or timed out before the SIGTERM signal."
when: live_migrations_completed_or_timeout
when: live_migrations_completed_or_timeout or cold_migrations_completed_or_timeout