TPM: support live migration of host secret security

This enables live migration for vTPM instances that use the ``host``
secret security mode. The ``host`` security mode uses key manager service
secrets owned by the instance owner. The secret is persisted in
libvirt and is sent over RPC to the destination host during a live
migration.

The service version will be bumped in a separate patch.

Related to blueprint vtpm-live-migration

Change-Id: I97e9dd454c793abcb1a20579b1ceaec627be4813
Signed-off-by: melanie witt <melwittt@gmail.com>
This commit is contained in:
Artom Lifshitz
2025-02-12 11:44:12 -05:00
committed by melanie witt
parent 2bdf12535c
commit 3eae9477d2
10 changed files with 313 additions and 27 deletions

View File

@@ -106,7 +106,8 @@ class MigrateServerController(wsgi.Controller):
# 'LiveMigrationTask._check_instance_has_no_numa' check in the
# conductor
instance = common.get_instance(self.compute_api, context, id,
expected_attrs=['numa_topology'])
expected_attrs=['numa_topology',
'system_metadata'])
host = body["os-migrateLive"]["host"]
if host:

View File

@@ -9919,7 +9919,9 @@ class ComputeManager(manager.Manager):
# storage
# vpmem must be cleaned
do_cleanup = (not migrate_data.is_shared_instance_path or
has_vpmem or has_mdevs or power_management_possible)
has_vpmem or has_mdevs or
power_management_possible or
migrate_data.has_vtpm)
destroy_disks = not (
migrate_data.is_shared_block_storage or
migrate_data.is_shared_instance_path)

View File

@@ -2676,3 +2676,8 @@ class VTPMOldCompute(Invalid):
msg_fmt = _('vTPM live migration is not supported by old nova-compute '
'services. Upgrade your nova-compute services to '
'Gazpacho (33.0.0) or later.')
class VTPMSecretNotFound(NovaException):
    """The TPM encryption secret of an instance could not be found.

    The message template carries an ``instance_uuid`` placeholder.
    """
    msg_fmt = _(
        'TPM encryption secret for instance %(instance_uuid)s was not found.')

View File

@@ -363,6 +363,18 @@ class LibvirtLiveMigrateData(LiveMigrateData):
def is_on_shared_storage(self):
return self.is_shared_block_storage or self.is_shared_instance_path
@property
def has_vtpm(self):
    """Whether the live migration involves vTPM.

    Both ``vtpm_secret_uuid`` and ``vtpm_secret_value`` must have been set
    on this object. Only whether the fields are set is checked here, not
    what they contain.
    """
    if not self.obj_attr_is_set('vtpm_secret_uuid'):
        return False
    return self.obj_attr_is_set('vtpm_secret_value')
@property
def has_vtpm_secret_data(self):
    """Whether vTPM secret data has been populated.

    True only when both vTPM fields are set on this object and both hold
    a truthy value.

    :returns: a ``bool``. The result is coerced explicitly because a bare
        ``and`` chain evaluates to its last operand, which here would be
        the secret value itself (or ``None``) rather than a boolean.
    """
    return bool(
        self.has_vtpm and self.vtpm_secret_uuid and self.vtpm_secret_value)
# TODO(gmann): HyperV virt driver has been removed in Nova 29.0.0 (OpenStack
# 2024.1) release but we kept this object for a couple of cycle. This can be

View File

@@ -642,12 +642,12 @@ class InstanceHelperMixin:
def _live_migrate(
self, server, migration_expected_state='completed',
server_expected_state='ACTIVE', api=None,
server_expected_state='ACTIVE', api=None, host=None,
):
api = api or self.api
api.post_server_action(
server['id'],
{'os-migrateLive': {'host': None, 'block_migration': 'auto'}})
{'os-migrateLive': {'host': host, 'block_migration': 'auto'}})
self._wait_for_migration_status(server, [migration_expected_state])
return self._wait_for_state_change(server, server_expected_state)

View File

@@ -143,15 +143,17 @@ class FakeKeyManager(key_manager.KeyManager):
@ddt.ddt
class VTPMServersTest(base.ServersTestBase):
class VTPMServersTest(base.LibvirtMigrationMixin, base.ServersTestBase):
# NOTE: ADMIN_API is intentionally not set to True in order to catch key
# manager service secret ownership issues.
# Reflect reality more for async API requests like migration
CAST_AS_CALL = False
# Enables block_migration='auto' required by the _live_migrate() helper.
microversion = '2.25'
# Microversion 2.25 enables block_migration='auto' required by the
# _live_migrate() helper.
# Microversion 2.34 enables asynchronous pre-live-migration checks.
microversion = '2.34'
def setUp(self):
# enable vTPM and use our own fake key service
@@ -485,6 +487,140 @@ class VTPMServersTest(base.ServersTestBase):
'services. Upgrade your nova-compute services to '
'Gazpacho (33.0.0) or later.', str(ex))
@mock.patch('nova.compute.api.MIN_COMPUTE_VTPM_LIVE_MIGRATION', 5)
@mock.patch('nova.objects.service.Service.get_minimum_version',
            new=mock.Mock(return_value=5))
def test_live_migrate_server_secret_security_host(self):
    """Live migrate a server that uses the 'host' secret security mode.

    Both computes are configured to support 'host' secret security, so
    the migration is expected to complete. Afterwards the libvirt secret
    must exist on the destination only, and the key manager service secret
    must still be there.
    """
    self.flags(supported_tpm_secret_security=['host'], group='libvirt')

    # The server is created while 'src' is the only compute that has been
    # started; 'dest' is brought up afterwards as the migration target.
    self.start_compute(hostname='src')
    self.src = self.computes['src']
    self.server = self._create_server_with_vtpm(secret_security='host')
    self.start_compute(hostname='dest')
    self.dest = self.computes['dest']
    server_id = self.server['id']

    # Before the migration: the key manager service has a secret and the
    # libvirt secret exists on the source host but not on the destination.
    self.assertInstanceHasSecret(self.server)
    self._assert_libvirt_has_secret(self.src, server_id)
    self._assert_libvirt_secret_missing(self.dest, server_id)

    self._live_migrate(self.server, api=self.admin_api)

    # After the migration: the key manager service secret is still there,
    # the libvirt secret is gone from the source host and present on the
    # destination host.
    self.assertInstanceHasSecret(self.server)
    self._assert_libvirt_secret_missing(self.src, server_id)
    self._assert_libvirt_has_secret(self.dest, server_id)
@mock.patch('nova.compute.api.MIN_COMPUTE_VTPM_LIVE_MIGRATION', 5)
@mock.patch('nova.objects.service.Service.get_minimum_version',
new=mock.Mock(return_value=5))
def test_live_migrate_server_secret_security_host_missing(self):
"""Test behavior when the instance's libvirt secret is missing.

This should not be able to happen, but in case it does, fail gracefully:
the live migration attempt ends in 'error' while the server stays ACTIVE,
and a hard reboot followed by a second live migration succeeds.
"""
self.flags(supported_tpm_secret_security=['host'], group='libvirt')
self.start_compute(hostname='src')
self.src = self.computes['src']
self.server = self._create_server_with_vtpm(secret_security='host')
# Sanity check: the libvirt secret exists on the source before we delete
# it below.
self._assert_libvirt_has_secret(self.src, self.server['id'])
self.start_compute(hostname='dest')
self.dest = self.computes['dest']
# Delete the libvirt secret ourselves to fake the missing secret.
self.src.driver._host.delete_secret('vtpm', self.server['id'])
self._assert_libvirt_secret_missing(self.src, self.server['id'])
# The missing secret error will make the migration precheck fail and we
# will get NoValidHost and the instance will remain ACTIVE.
self._live_migrate(
self.server, migration_expected_state='error',
server_expected_state='ACTIVE', api=self.admin_api)
# Live migration attempt should have failed with VTPMSecretNotFound.
# Need microversion 2.84 to get events.details field.
with utils.temporary_mutation(self.admin_api, microversion='2.84'):
event = self._wait_for_instance_action_event(
self.server, 'live-migration',
'compute_check_can_live_migrate_source', 'Error')
# Only a substring of the error message is matched (assertIn).
msg = ('TPM secret was not found. Try hard-rebooting the '
'instance to recover')
self.assertIn(msg, event['details'])
# Try to recover the instance by hard-rebooting it.
self._reboot_server(self.server, hard=True)
# This time the live migration should work because the libvirt secret
# should have been re-created by the hard reboot.
self._live_migrate(self.server, migration_expected_state='completed',
api=self.admin_api)
@mock.patch('nova.compute.api.MIN_COMPUTE_VTPM_LIVE_MIGRATION', 5)
@mock.patch('nova.objects.service.Service.get_minimum_version',
new=mock.Mock(return_value=5))
def test_live_migrate_server_secret_security_host_rollback(self):
"""Test a failed live migration of a server with 'host' security.

Simulate a failure and verify that secrets are correctly handled during
the rollback process: the key manager service secret and the source
libvirt secret remain, and no libvirt secret is left on the destination.
"""
# Stand-in for Domain.migrateToURI3: creates the guest on the destination
# from params['destination_xml'] and then calls fail_job() on the source
# domain.
def _migrate_stub(domain, destination, params, flags):
self.dest.driver._host.get_connection().createXML(
params['destination_xml'],
'fake-createXML-doesnt-care-about-flags')
conn = self.src.driver._host.get_connection()
dom = conn.lookupByUUIDString(self.server['id'])
dom.fail_job()
self.flags(supported_tpm_secret_security=['host'], group='libvirt')
self.start_compute(hostname='src')
self.src = self.computes['src']
self.server = self._create_server_with_vtpm(secret_security='host')
self.start_compute(hostname='dest')
self.dest = self.computes['dest']
# We should have a secret in the key manager service.
self.assertInstanceHasSecret(self.server)
# We should also have a libvirt secret on the source host.
self._assert_libvirt_has_secret(self.src, self.server['id'])
# And no libvirt secret on the destination host.
self._assert_libvirt_secret_missing(self.dest, self.server['id'])
with mock.patch('nova.tests.fixtures.libvirt.Domain.migrateToURI3',
_migrate_stub):
self._live_migrate(self.server, migration_expected_state='failed',
api=self.admin_api)
# Waiting for the migration status isn't enough -- part of the
# rollback process is an async RPC call, so if we don't wait for
# the end of the rollback, the secret cleanup may not be completed
# yet when we want to verify it below.
self.notifier.wait_for_versioned_notifications(
'instance.live_migration_rollback_dest.end')
# After the live migration fails, we should still have a secret in the
# key manager service.
self.assertInstanceHasSecret(self.server)
# We should have a libvirt secret on the source host.
self._assert_libvirt_has_secret(self.src, self.server['id'])
# And no libvirt secret on the destination host.
self._assert_libvirt_secret_missing(self.dest, self.server['id'])
def test_suspend_resume_server(self):
self.start_compute()

View File

@@ -52,7 +52,7 @@ class CommonMixin(object):
expected_attrs = None
if action == '_migrate_live':
expected_attrs = ['numa_topology']
expected_attrs = ['numa_topology', 'system_metadata']
elif action == '_migrate':
expected_attrs = ['flavor', 'services']
@@ -75,7 +75,7 @@ class CommonMixin(object):
expected_attrs = None
if action == '_migrate_live':
expected_attrs = ['numa_topology']
expected_attrs = ['numa_topology', 'system_metadata']
elif action == '_migrate':
expected_attrs = ['flavor', 'services']
@@ -103,7 +103,7 @@ class CommonMixin(object):
expected_attrs = None
if action == '_migrate_live':
expected_attrs = ['numa_topology']
expected_attrs = ['numa_topology', 'system_metadata']
if method is None:
method = action.replace('_', '')
@@ -133,7 +133,7 @@ class CommonMixin(object):
expected_attrs = None
if action == '_migrate_live':
expected_attrs = ['numa_topology']
expected_attrs = ['numa_topology', 'system_metadata']
elif action == '_migrate':
expected_attrs = ['flavor', 'services']
@@ -174,7 +174,7 @@ class CommonMixin(object):
expected_attrs = None
if action == '_migrate_live':
expected_attrs = ['numa_topology']
expected_attrs = ['numa_topology', 'system_metadata']
elif action == '_migrate':
expected_attrs = ['flavor', 'services']
@@ -207,7 +207,7 @@ class CommonMixin(object):
expected_attrs = None
if action == '_migrate_live':
expected_attrs = ['numa_topology']
expected_attrs = ['numa_topology', 'system_metadata']
if method is None:
method = action.replace('_', '')

View File

@@ -156,9 +156,10 @@ class MigrateServerTests(admin_only_action_common.CommonTests):
self.context, instance, False, self.disk_over_commit,
'hostname', self.force, self.async_)
self.mock_get.assert_called_once_with(self.context, instance.uuid,
expected_attrs=['numa_topology'],
cell_down_support=False)
self.mock_get.assert_called_once_with(
self.context, instance.uuid,
expected_attrs=['numa_topology', 'system_metadata'],
cell_down_support=False)
def test_migrate_live_enabled(self):
param = self._get_params(host='hostname')
@@ -234,9 +235,10 @@ class MigrateServerTests(admin_only_action_common.CommonTests):
mock_live_migrate.assert_called_once_with(
self.context, instance, False, self.disk_over_commit,
'hostname', self.force, self.async_)
self.mock_get.assert_called_once_with(self.context, instance.uuid,
expected_attrs=['numa_topology'],
cell_down_support=False)
self.mock_get.assert_called_once_with(
self.context, instance.uuid,
expected_attrs=['numa_topology', 'system_metadata'],
cell_down_support=False)
def test_migrate_live_compute_service_unavailable(self):
self._test_migrate_live_failed_with_exception(
@@ -482,9 +484,10 @@ class MigrateServerTestsV234(MigrateServerTestsV230):
mock_live_migrate.assert_called_once_with(
self.context, instance, None, self.disk_over_commit,
'hostname', self.force, self.async_)
self.mock_get.assert_called_once_with(self.context, instance.uuid,
expected_attrs=['numa_topology'],
cell_down_support=False)
self.mock_get.assert_called_once_with(
self.context, instance.uuid,
expected_attrs=['numa_topology', 'system_metadata'],
cell_down_support=False)
def test_migrate_live_unexpected_error(self):
body = {'os-migrateLive':
@@ -501,9 +504,10 @@ class MigrateServerTestsV234(MigrateServerTestsV230):
mock_live_migrate.assert_called_once_with(
self.context, instance, None, self.disk_over_commit,
'hostname', self.force, self.async_)
self.mock_get.assert_called_once_with(self.context, instance.uuid,
expected_attrs=['numa_topology'],
cell_down_support=False)
self.mock_get.assert_called_once_with(
self.context, instance.uuid,
expected_attrs=['numa_topology', 'system_metadata'],
cell_down_support=False)
class MigrateServerTestsV256(MigrateServerTestsV234):

View File

@@ -13282,6 +13282,82 @@ class LibvirtConnTestCase(test.NoDBTestCase,
self.assertFalse(mock_disk_check.called)
@mock.patch('nova.virt.libvirt.driver.LibvirtDriver.'
            '_is_shared_block_storage', new=mock.Mock())
@mock.patch('nova.virt.libvirt.driver.LibvirtDriver.'
            '_check_shared_storage_test_file', new=mock.Mock())
@mock.patch('nova.virt.libvirt.host.Host.find_secret')
@ddt.data('user', 'host', 'deployment')
def test_check_can_live_migrate_source_vtpm(self, security, mock_find):
    """Verify the vTPM fields of the migrate data per security mode.

    With 'host' security the fields must carry the UUID and the decoded
    value of the libvirt secret. With 'user' and 'deployment' security
    the secret must not be looked up and both fields must be None.
    """
    # What the (mocked) libvirt secret lookup hands back.
    found_secret = mock_find.return_value
    found_secret.UUIDString.return_value = uuids.secret
    found_secret.value.return_value.decode.return_value = 'foo'

    instance = objects.Instance(**self.test_instance)
    instance.flavor.extra_specs = {
        'hw:tpm_version': '1.2',
        'hw:tpm_secret_security': security,
    }
    migrate_data = objects.LibvirtLiveMigrateData(filename='file')

    libvirt_drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
    libvirt_drvr.check_can_live_migrate_source(
        self.context, instance, migrate_data)

    if security != 'host':
        mock_find.assert_not_called()
        self.assertIsNone(migrate_data.vtpm_secret_uuid)
        self.assertIsNone(migrate_data.vtpm_secret_value)
        return

    mock_find.assert_called_once_with('vtpm', instance.uuid)
    self.assertEqual(uuids.secret, migrate_data.vtpm_secret_uuid)
    self.assertEqual('foo', migrate_data.vtpm_secret_value)
@mock.patch('nova.virt.libvirt.driver.LibvirtDriver.'
            '_is_shared_block_storage', new=mock.Mock())
@mock.patch('nova.virt.libvirt.driver.LibvirtDriver.'
            '_check_shared_storage_test_file', new=mock.Mock())
@mock.patch('nova.virt.libvirt.host.Host.find_secret')
def test_check_can_live_migrate_source_vtpm_legacy(self, mock_find):
    """Verify legacy vTPM instances get None for the vTPM migrate data.

    A legacy instance has a TPM version in its flavor but no secret
    security extra spec. vTPM live migration is blocked at the API for
    such instances, so this path should not be reachable; it is covered
    anyway.
    """
    legacy_instance = objects.Instance(**self.test_instance)
    legacy_instance.flavor.extra_specs = {'hw:tpm_version': '1.2'}
    migrate_data = objects.LibvirtLiveMigrateData(filename='file')

    libvirt_drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
    libvirt_drvr.check_can_live_migrate_source(
        self.context, legacy_instance, migrate_data)

    # No secret lookup, and both fields explicitly None.
    mock_find.assert_not_called()
    for field in ('vtpm_secret_uuid', 'vtpm_secret_value'):
        self.assertIsNone(getattr(migrate_data, field))
@mock.patch('nova.virt.libvirt.driver.LibvirtDriver.'
            '_is_shared_block_storage', new=mock.Mock())
@mock.patch('nova.virt.libvirt.driver.LibvirtDriver.'
            '_check_shared_storage_test_file', new=mock.Mock())
@mock.patch('nova.virt.libvirt.host.Host.find_secret')
def test_check_can_live_migrate_source_no_vtpm(self, mock_find):
    """Verify non-vTPM instances leave the vTPM migrate data unset."""
    migrate_data = objects.LibvirtLiveMigrateData(filename='file')
    plain_instance = objects.Instance(**self.test_instance)

    libvirt_drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
    libvirt_drvr.check_can_live_migrate_source(
        self.context, plain_instance, migrate_data)

    # No secret lookup, and neither field is set at all (as opposed to
    # being set to None).
    mock_find.assert_not_called()
    for field in ('vtpm_secret_uuid', 'vtpm_secret_value'):
        self.assertFalse(migrate_data.obj_attr_is_set(field))
def _is_shared_block_storage_test_create_mocks(self, disks):
# Test data
instance_xml = ("<domain type='kvm'><name>instance-0000000a</name>"
@@ -16627,6 +16703,7 @@ class LibvirtConnTestCase(test.NoDBTestCase,
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
inst_ref = {'id': 'foo'}
mig_data = objects.LibvirtLiveMigrateData()
cntx = context.get_admin_context()
# Set up the mock expectations
@@ -16634,7 +16711,7 @@ class LibvirtConnTestCase(test.NoDBTestCase,
return_value=bdi['block_device_mapping'])
@mock.patch.object(drvr, '_disconnect_volume')
def _test(_disconnect_volume, block_device_info_get_mapping):
drvr.post_live_migration(cntx, inst_ref, bdi)
drvr.post_live_migration(cntx, inst_ref, bdi, mig_data)
block_device_info_get_mapping.assert_called_once_with(bdi)
_disconnect_volume.assert_has_calls([
@@ -16651,13 +16728,14 @@ class LibvirtConnTestCase(test.NoDBTestCase,
vol_2_conn_info = {'data': {'volume_id': uuids.vol_2_id}}
mock_get_bdm.return_value = [{'connection_info': vol_1_conn_info},
{'connection_info': vol_2_conn_info}]
mig_data = objects.LibvirtLiveMigrateData()
# Raise an exception with the first call to disconnect_volume
mock_disconnect_volume.side_effect = [test.TestingException, None]
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
drvr.post_live_migration(mock.sentinel.ctxt, mock.sentinel.instance,
mock.sentinel.bdi)
mock.sentinel.bdi, mig_data)
# Assert disconnect_volume is called twice despite the exception
mock_disconnect_volume.assert_has_calls([

View File

@@ -10857,8 +10857,43 @@ class LibvirtDriver(driver.ComputeDriver):
mdev_types = self._get_mdev_types_from_uuids(instance_mdevs.keys())
dest_check_data.source_mdev_types = mdev_types
self._add_vtpm_secret_to_live_migrate_data(instance, dest_check_data)
return dest_check_data
def _add_vtpm_secret_to_live_migrate_data(self, instance, dest_check_data):
    """Populate the vTPM secret fields of the live migrate data.

    Instances without a vTPM leave ``dest_check_data`` untouched. For a
    vTPM instance with the 'host' secret security mode, the libvirt secret
    of the instance is looked up on this host and its UUID and decoded
    value are stored in ``dest_check_data``. For any other security mode
    both fields are set to None.

    :param instance: the instance being live migrated
    :param dest_check_data: the migrate data object to update in place
    :raises: VTPMSecretNotFound if the 'host' security mode is used and
        no libvirt secret is found for the instance
    """
    vtpm_config = hardware.get_vtpm_constraint(
        instance.flavor, instance.image_meta)
    if vtpm_config is None:
        return

    if vtpm.get_instance_tpm_secret_security(instance.flavor) != 'host':
        # The instance has a vTPM, so set the relevant fields to None in
        # order to convey that we are actively choosing not to pass any
        # vTPM data for the 'deployment' or 'user' security policies. (The
        # 'user' security policy should not be able to reach this code as
        # live migration is rejected at the API, but we set the fields
        # anyway for completeness.)
        dest_check_data.vtpm_secret_uuid = None
        dest_check_data.vtpm_secret_value = None
        return

    libvirt_secret = self._host.find_secret('vtpm', instance.uuid)
    if libvirt_secret is None:
        # If the libvirt secret is not found on this host, a hard reboot
        # will cause the secret to be re-created and the user will be
        # able to try the live migration again.
        msg = _('TPM secret was not found. Try hard-rebooting the '
                'instance to recover.')
        LOG.error(msg, instance=instance)
        raise exception.VTPMSecretNotFound(msg)

    dest_check_data.vtpm_secret_uuid = libvirt_secret.UUIDString()
    # The secret value has to be decoded from bytes to conform to the
    # object's SensitiveStringField type.
    dest_check_data.vtpm_secret_value = libvirt_secret.value().decode()
def _host_can_support_mdev_live_migration(self):
return self._host.has_min_version(
lv_ver=MIN_MDEV_LIVEMIG_LIBVIRT_VERSION,
@@ -11764,6 +11799,8 @@ class LibvirtDriver(driver.ComputeDriver):
try:
self.destroy(context, instance, network_info, block_device_info,
destroy_disks)
if migrate_data and migrate_data.has_vtpm:
self._host.delete_secret('vtpm', instance.uuid)
finally:
# NOTE(gcb): Failed block live migration may leave instance
# directory at destination node, ensure it is always deleted.
@@ -11952,6 +11989,15 @@ class LibvirtDriver(driver.ComputeDriver):
LOG.debug('No dst_numa_info in migrate_data, '
'no cores to power up in pre_live_migration.')
if migrate_data.has_vtpm_secret_data:
self._host.create_secret(
'vtpm', instance.uuid,
# Convert the SensitiveStringField back to bytes when creating
# the libvirt secret.
password=migrate_data.vtpm_secret_value.encode(),
uuid=migrate_data.vtpm_secret_uuid, ephemeral=False,
private=False)
return migrate_data
def _try_fetch_image_cache(self, image, fetch_func, context, filename,
@@ -12089,6 +12135,8 @@ class LibvirtDriver(driver.ComputeDriver):
def post_live_migration(self, context, instance, block_device_info,
migrate_data=None):
if migrate_data and migrate_data.has_vtpm:
self._host.delete_secret('vtpm', instance.uuid)
# NOTE(mdbooth): The block_device_info we were passed was initialized
# with BDMs from the source host before they were updated to point to
# the destination. We can safely use this to disconnect the source