diff --git a/nova/api/openstack/compute/servers.py b/nova/api/openstack/compute/servers.py index 3738ea00f9a1..304d3985ddf0 100644 --- a/nova/api/openstack/compute/servers.py +++ b/nova/api/openstack/compute/servers.py @@ -980,6 +980,13 @@ class ServersController(wsgi.Controller): exception.AutoDiskConfigDisabledByImage, exception.CertificateValidationFailed) as error: raise exc.HTTPBadRequest(explanation=error.format_message()) + # NOTE(sean-k-mooney): due to the lack of the flavor and image + # properties validation I06fad233006c7bab14749a51ffa226c3801f951b in + # stable/rocky we explicitly catch the NUMA rebuild conflict instead of + # extending the INVALID_FLAVOR_IMAGE_EXCEPTIONS tuple which was + # introduced in the stein release. + except exception.ImageNUMATopologyRebuildConflict as error: + raise exc.HTTPBadRequest(explanation=error.format_message()) instance = self._get_server(context, req, id, is_detail=True) diff --git a/nova/compute/api.py b/nova/compute/api.py index 4a202683cf2f..c4d022c33583 100644 --- a/nova/compute/api.py +++ b/nova/compute/api.py @@ -3260,6 +3260,14 @@ class API(base.Base): self._checks_for_create_and_rebuild(context, image_id, image, flavor, metadata, files_to_inject, root_bdm) + # NOTE(sean-k-mooney): When we rebuild with a new image we need to + # validate that the NUMA topology does not change as we do a NOOP claim + # in resource tracker. As such we cannot allow the resource usage or + # assignment to change as a result of a new image altering the + # numa constraints. + if orig_image_ref != image_href: + self._validate_numa_rebuild(instance, image, flavor) + kernel_id, ramdisk_id = self._handle_kernel_and_ramdisk( context, None, None, image) @@ -3359,6 +3367,48 @@ class API(base.Base): request_spec=request_spec, kwargs=kwargs) + @staticmethod + def _validate_numa_rebuild(instance, image, flavor): + """validates that the NUMA constraints do not change on rebuild. + + :param instance: nova.objects.instance.Instance object + :param image: the new image the instance will be rebuilt with. + :param flavor: the flavor of the current instance. + + :returns: True or raises on failure. + :raises: nova.exception.ImageNUMATopologyRebuildConflict + """ + + # NOTE(sean-k-mooney): currently it is not possible to express + # a PCI NUMA affinity policy via flavor or image but that will + # change in the future. we pull out the image metadata into + # separate variable to make future testing of this easier. + old_image_meta = instance.image_meta + new_image_meta = objects.ImageMeta.from_dict(image) + old_constraints = hardware.numa_get_constraints(flavor, old_image_meta) + new_constraints = hardware.numa_get_constraints(flavor, new_image_meta) + + # early out for non NUMA instances + if old_constraints is None and new_constraints is None: + # return true for easy unit testing + return True + + # if only one of the constrains are non-None (or 'set') then the + # constraints changed so raise an exception. + if old_constraints is None or new_constraints is None: + raise exception.ImageNUMATopologyRebuildConflict() + + # otherwise since both the old a new constrains are non none compare + # them as dictionaries. + old = old_constraints.obj_to_primitive() + new = new_constraints.obj_to_primitive() + if old != new: + raise exception.ImageNUMATopologyRebuildConflict() + # TODO(sean-k-mooney): add PCI NUMA affinity policy check. + + # return true for easy unit testing + return True + @staticmethod def _check_quota_for_upsize(context, instance, current_flavor, new_flavor): project_id, user_id = quotas_obj.ids_from_instance(context, diff --git a/nova/exception.py b/nova/exception.py index 5ed4f31ed4c6..7c4c547d911d 100644 --- a/nova/exception.py +++ b/nova/exception.py @@ -1911,6 +1911,12 @@ class ImageNUMATopologyForbidden(Forbidden): "NUMA configuration set against the flavor") +class ImageNUMATopologyRebuildConflict(Invalid): + msg_fmt = _( + "An instance's NUMA typology cannot be changed as part of a rebuild. " + "The image provided is invalid for this instance.") + + class ImageNUMATopologyAsymmetric(Invalid): msg_fmt = _("Instance CPUs and/or memory cannot be evenly distributed " "across instance NUMA nodes. Explicit assignment of CPUs " diff --git a/nova/tests/functional/integrated_helpers.py b/nova/tests/functional/integrated_helpers.py index eb82c44cb774..6376b2203562 100644 --- a/nova/tests/functional/integrated_helpers.py +++ b/nova/tests/functional/integrated_helpers.py @@ -81,7 +81,8 @@ class _IntegratedTestBase(test.TestCase): # TODO(mriedem): Fix the functional tests to work with Neutron. self.flags(use_neutron=self.USE_NEUTRON) - nova.tests.unit.image.fake.stub_out_image_service(self) + self.fake_image_service =\ + nova.tests.unit.image.fake.stub_out_image_service(self) self.useFixture(cast_as_call.CastAsCall(self)) self.useFixture(nova_fixtures.Database(database='placement')) diff --git a/nova/tests/functional/libvirt/test_numa_servers.py b/nova/tests/functional/libvirt/test_numa_servers.py index b8d32ef437c9..220573b22c7f 100644 --- a/nova/tests/functional/libvirt/test_numa_servers.py +++ b/nova/tests/functional/libvirt/test_numa_servers.py @@ -13,10 +13,10 @@ # License for the specific language governing permissions and limitations # under the License. -import six - import fixtures import mock +import six + from oslo_config import cfg from oslo_log import log as logging @@ -27,11 +27,10 @@ from nova import test from nova.tests import fixtures as nova_fixtures from nova.tests.functional.api import client from nova.tests.functional.test_servers import ServersTestBase +from nova.tests.unit import fake_notifier from nova.tests.unit.virt.libvirt import fake_imagebackend from nova.tests.unit.virt.libvirt import fake_libvirt_utils from nova.tests.unit.virt.libvirt import fakelibvirt - - CONF = cfg.CONF LOG = logging.getLogger(__name__) @@ -568,3 +567,136 @@ class NUMAServersWithNetworksTest(NUMAServersTestBase): network_metadata = args[1].network_metadata self.assertIsNotNone(network_metadata) self.assertEqual(set(['foo']), network_metadata.physnets) + + +class NUMAServersRebuildTests(NUMAServersTestBase): + + def setUp(self): + super(NUMAServersRebuildTests, self).setUp() + images = self.api.get_images() + # save references to first two image for server create and rebuild + self.image_ref_0 = images[0]['id'] + self.image_ref_1 = images[1]['id'] + + fake_notifier.stub_notifier(self) + self.addCleanup(fake_notifier.reset) + + # NOTE(sean-k-mooney): I3b4c1153bebdeab2eb8bc2108aa177732f5c6a97 + # moved mocking of the libvirt connection to the setup of + # NUMAServersTestBase but that is not backported so we just add back + # the mocking to enable backporting of + # I0322d872bdff68936033a6f5a54e8296a6fb3434 + _p = mock.patch('nova.virt.libvirt.host.Host.get_connection') + self.mock_conn = _p.start() + self.addCleanup(_p.stop) + + def _create_active_server(self, server_args=None): + basic_server = { + 'flavorRef': 1, + 'name': 'numa_server', + 'networks': [{ + 'uuid': nova_fixtures.NeutronFixture.network_1['id'] + }], + 'imageRef': self.image_ref_0 + } + if server_args: + basic_server.update(server_args) + server = self.api.post_server({'server': basic_server}) + return self._wait_for_state_change(server, 'BUILD') + + def _rebuild_server(self, active_server, image_ref): + args = {"rebuild": {"imageRef": image_ref}} + self.api.api_post( + 'servers/%s/action' % active_server['id'], args) + fake_notifier.wait_for_versioned_notifications('instance.rebuild.end') + return self._wait_for_state_change(active_server, 'REBUILD') + + def test_rebuild_server_with_numa(self): + """Create a NUMA instance and ensure it can be rebuilt. + """ + + # Create a flavor consuming 2 pinned cpus with an implicit + # numa topology of 1 virtual numa node. + extra_spec = {'hw:cpu_policy': 'dedicated'} + flavor_id = self._create_flavor(extra_spec=extra_spec) + + # Create a host with 4 physical cpus to allow rebuild leveraging + # the free space to ensure the numa topology filter does not + # eliminate the host. + host_info = fakelibvirt.NUMAHostInfo(cpu_nodes=1, cpu_sockets=1, + cpu_cores=4, kB_mem=15740000) + fake_connection = self._get_connection(host_info=host_info) + self.mock_conn.return_value = fake_connection + self.compute = self.start_service('compute', host='compute1') + + server = self._create_active_server( + server_args={"flavorRef": flavor_id}) + + # this should succeed as the NUMA topology has not changed + # and we have enough resources on the host. We rebuild with + # a different image to force the rebuild to query the scheduler + # to validate the host. + self._rebuild_server(server, self.image_ref_1) + + def test_rebuild_server_with_numa_inplace_fails(self): + """Create a NUMA instance and ensure in place rebuild fails. + """ + + # Create a flavor consuming 2 pinned cpus with an implicit + # numa topology of 1 virtual numa node. + extra_spec = {'hw:cpu_policy': 'dedicated'} + flavor_id = self._create_flavor(extra_spec=extra_spec) + + # cpu_cores is set to 2 to ensure that + # we have enough space to boot the vm but not enough space to rebuild + # by doubling the resource use during scheduling. + host_info = fakelibvirt.NUMAHostInfo( + cpu_nodes=1, cpu_sockets=1, cpu_cores=2, kB_mem=15740000) + fake_connection = self._get_connection(host_info=host_info) + self.mock_conn.return_value = fake_connection + self.compute = self.start_service('compute', host='compute1') + + server = self._create_active_server( + server_args={"flavorRef": flavor_id}) + + # TODO(sean-k-mooney): this should pass but i currently expect it to + # fail because the NUMA topology filter does not support in place + # rebuild and we have used all the resources on the compute node. + self.assertRaises( + client.OpenStackApiException, self._rebuild_server, + server, self.image_ref_1) + + def test_rebuild_server_with_different_numa_topology_fails(self): + """Create a NUMA instance and ensure inplace rebuild fails. + """ + + # Create a flavor consuming 2 pinned cpus with an implicit + # numa topology of 1 virtual numa node. + extra_spec = {'hw:cpu_policy': 'dedicated'} + flavor_id = self._create_flavor(extra_spec=extra_spec) + + host_info = fakelibvirt.NUMAHostInfo( + cpu_nodes=2, cpu_sockets=1, cpu_cores=4, kB_mem=15740000) + fake_connection = self._get_connection(host_info=host_info) + self.mock_conn.return_value = fake_connection + self.compute = self.start_service('compute', host='compute1') + + server = self._create_active_server( + server_args={"flavorRef": flavor_id}) + + # the original vm had an implicit numa topology of 1 virtual numa nodes + # so we alter the requested numa topology in image_ref_1 to request + # 2 virtual numa nodes. + ctx = nova_context.get_admin_context() + image_meta = {'properties': {'hw_numa_nodes': 2}} + self.fake_image_service.update(ctx, self.image_ref_1, image_meta) + + # NOTE(sean-k-mooney): this should fail because rebuild uses noop + # claims therefor it is not allowed for the numa topology or resource + # usage to change during a rebuild. + ex = self.assertRaises( + client.OpenStackApiException, self._rebuild_server, + server, self.image_ref_1) + self.assertEqual(400, ex.response.status_code) + self.assertIn("An instance's NUMA typology cannot be changed", + six.text_type(ex)) diff --git a/nova/tests/unit/api/openstack/compute/test_server_actions.py b/nova/tests/unit/api/openstack/compute/test_server_actions.py index c74999288110..723585d64e73 100644 --- a/nova/tests/unit/api/openstack/compute/test_server_actions.py +++ b/nova/tests/unit/api/openstack/compute/test_server_actions.py @@ -540,11 +540,22 @@ class ServerActionsControllerTestV21(test.TestCase): def test_rebuild_when_kernel_not_exists(self): def return_image_meta(*args, **kwargs): + # NOTE(sean-k-mooney): To enable + # I0322d872bdff68936033a6f5a54e8296a6fb3434 to be backported + # without I34ffaf285718059b55f90e812b57f1e11d566c6f we update the + # fake image data to use valid UUIDs image_meta_table = { - '2': {'id': 2, 'status': 'active', 'container_format': 'ari'}, - '155d900f-4e14-4e4c-a73d-069cbf4541e6': - {'id': 3, 'status': 'active', 'container_format': 'raw', - 'properties': {'kernel_id': 1, 'ramdisk_id': 2}}, + uuids.ramdisk_image_id: { + 'id': uuids.ramdisk_image_id, 'status': 'active', + 'container_format': 'ari'}, + '155d900f-4e14-4e4c-a73d-069cbf4541e6': { + 'id': '155d900f-4e14-4e4c-a73d-069cbf4541e6', + 'status': 'active', 'container_format': 'raw', + 'properties': { + 'kernel_id': uuids.missing_image_id, + 'ramdisk_id': uuids.ramdisk_image_id, + }, + }, } image_id = args[2] try: @@ -582,12 +593,25 @@ class ServerActionsControllerTestV21(test.TestCase): instance_meta[key] = instance[key] def return_image_meta(*args, **kwargs): + # NOTE(sean-k-mooney): To enable + # I0322d872bdff68936033a6f5a54e8296a6fb3434 to be backported + # without I34ffaf285718059b55f90e812b57f1e11d566c6f we update the + # fake image data to use valid UUIDs image_meta_table = { - '1': {'id': 1, 'status': 'active', 'container_format': 'aki'}, - '2': {'id': 2, 'status': 'active', 'container_format': 'ari'}, - '155d900f-4e14-4e4c-a73d-069cbf4541e6': - {'id': 3, 'status': 'active', 'container_format': 'raw', - 'properties': {'kernel_id': 1, 'ramdisk_id': 2}}, + uuids.kernel_image_id: { + 'id': uuids.kernel_image_id, 'status': 'active', + 'container_format': 'aki'}, + uuids.ramdisk_image_id: { + 'id': uuids.ramdisk_image_id, 'status': 'active', + 'container_format': 'ari'}, + '155d900f-4e14-4e4c-a73d-069cbf4541e6': { + 'id': '155d900f-4e14-4e4c-a73d-069cbf4541e6', + 'status': 'active', 'container_format': 'raw', + 'properties': { + 'kernel_id': uuids.kernel_image_id, + 'ramdisk_id': uuids.ramdisk_image_id, + }, + }, } image_id = args[2] try: @@ -607,8 +631,8 @@ class ServerActionsControllerTestV21(test.TestCase): }, } self.controller._action_rebuild(self.req, FAKE_UUID, body=body).obj - self.assertEqual(instance_meta['kernel_id'], '1') - self.assertEqual(instance_meta['ramdisk_id'], '2') + self.assertEqual(instance_meta['kernel_id'], uuids.kernel_image_id) + self.assertEqual(instance_meta['ramdisk_id'], uuids.ramdisk_image_id) @mock.patch.object(compute_api.API, 'rebuild') def test_rebuild_instance_raise_auto_disk_config_exc(self, mock_rebuild): diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py index 55461a31a95a..6039ae9c9699 100644 --- a/nova/tests/unit/compute/test_compute.py +++ b/nova/tests/unit/compute/test_compute.py @@ -9037,7 +9037,12 @@ class ComputeAPITestCase(BaseTestCase): with mock.patch.object(self.compute_api.compute_task_api, 'rebuild_instance', fake_rpc_rebuild): - image_ref = instance["image_ref"] + '-new_image_ref' + # NOTE(sean-k-mooney): to enable + # I0322d872bdff68936033a6f5a54e8296a6fb3434 to be backported + # without I34ffaf285718059b55f90e812b57f1e11d566c6f we update the + # image ref to use uuids.new_image_ref to pass the UUIDField + # validation requirements + image_ref = uuids.new_image_ref password = "new_password" instance.vm_state = vm_state diff --git a/nova/tests/unit/compute/test_compute_api.py b/nova/tests/unit/compute/test_compute_api.py index 4696327530d2..02bef32bff4a 100644 --- a/nova/tests/unit/compute/test_compute_api.py +++ b/nova/tests/unit/compute/test_compute_api.py @@ -6198,6 +6198,123 @@ class ComputeAPIUnitTestCase(_ComputeAPIUnitTestMixIn, test.NoDBTestCase): self.assertEqual(['context-for-%s' % c for c in compute_api.CELLS], cells) + def test__validate_numa_rebuild_non_numa(self): + """This test asserts that a rebuild of an instance without a NUMA + topology passes validation. + """ + flavor = objects.Flavor( + id=42, vcpus=1, memory_mb=512, root_gb=1, extra_specs={}) + instance = self._create_instance_obj(flavor=flavor) + # we use a dict instead of image metadata object as + # _validate_numa_rebuild constructs the object internally + image = { + 'id': uuids.image_id, 'status': 'foo', + 'properties': {}} + self.compute_api._validate_numa_rebuild(instance, image, flavor) + + def test__validate_numa_rebuild_no_conflict(self): + """This test asserts that a rebuild of an instance without a change + in NUMA topology passes validation. + """ + flavor = objects.Flavor( + id=42, vcpus=1, memory_mb=512, root_gb=1, + extra_specs={"hw:numa_nodes": 1}) + instance = self._create_instance_obj(flavor=flavor) + # we use a dict instead of image metadata object as + # _validate_numa_rebuild constructs the object internally + image = { + 'id': uuids.image_id, 'status': 'foo', + 'properties': {}} + # The flavor creates a NUMA topology but the default image and the + # rebuild image do not have any image properties so there will + # be no conflict. + self.compute_api._validate_numa_rebuild(instance, image, flavor) + + def test__validate_numa_rebuild_add_numa_toplogy(self): + """This test asserts that a rebuild of an instance with a new image + that requests a NUMA topology when the original instance did not + have a NUMA topology is invalid. + """ + + flavor = objects.Flavor( + id=42, vcpus=1, memory_mb=512, root_gb=1, + extra_specs={}) + instance = self._create_instance_obj(flavor=flavor) + # we use a dict instead of image metadata object as + # _validate_numa_rebuild constructs the object internally + image = { + 'id': uuids.image_id, 'status': 'foo', + 'properties': {"hw_numa_nodes": 1}} + # The flavor and default image have no NUMA topology defined. The image + # used to rebuild requests a NUMA topology which is not allowed as it + # would alter the NUMA constrains. + self.assertRaises( + exception.ImageNUMATopologyRebuildConflict, + self.compute_api._validate_numa_rebuild, instance, image, flavor) + + def test__validate_numa_rebuild_remove_numa_toplogy(self): + """This test asserts that a rebuild of an instance with a new image + that does not request a NUMA topology when the original image did + is invalid if it would alter the instances topology as a result. + """ + + flavor = objects.Flavor( + id=42, vcpus=1, memory_mb=512, root_gb=1, + extra_specs={}) + instance = self._create_instance_obj(flavor=flavor) + # we use a dict instead of image metadata object as + # _validate_numa_rebuild constructs the object internally + old_image = { + 'id': uuidutils.generate_uuid(), 'status': 'foo', + 'properties': {"hw_numa_nodes": 1}} + old_image_meta = objects.ImageMeta.from_dict(old_image) + image = { + 'id': uuidutils.generate_uuid(), 'status': 'foo', + 'properties': {}} + with mock.patch( + 'nova.objects.instance.Instance.image_meta', + new_callable=mock.PropertyMock(return_value=old_image_meta)): + # The old image has a NUMA topology defined but the new image + # used to rebuild does not. This would alter the NUMA constrains + # and therefor should raise. + self.assertRaises( + exception.ImageNUMATopologyRebuildConflict, + self.compute_api._validate_numa_rebuild, instance, + image, flavor) + + def test__validate_numa_rebuild_alter_numa_toplogy(self): + """This test asserts that a rebuild of an instance with a new image + that requests a different NUMA topology than the original image + is invalid. + """ + + # NOTE(sean-k-mooney): we need to use 2 vcpus here or we will fail + # with a different exception ImageNUMATopologyAsymmetric when we + # construct the NUMA constrains as the rebuild image would result + # in an invalid topology. + flavor = objects.Flavor( + id=42, vcpus=2, memory_mb=512, root_gb=1, + extra_specs={}) + instance = self._create_instance_obj(flavor=flavor) + # we use a dict instead of image metadata object as + # _validate_numa_rebuild constructs the object internally + old_image = { + 'id': uuidutils.generate_uuid(), 'status': 'foo', + 'properties': {"hw_numa_nodes": 1}} + old_image_meta = objects.ImageMeta.from_dict(old_image) + image = { + 'id': uuidutils.generate_uuid(), 'status': 'foo', + 'properties': {"hw_numa_nodes": 2}} + with mock.patch( + 'nova.objects.instance.Instance.image_meta', + new_callable=mock.PropertyMock(return_value=old_image_meta)): + # the original image requested 1 NUMA node and the image used + # for rebuild requests 2 so assert an error is raised. + self.assertRaises( + exception.ImageNUMATopologyRebuildConflict, + self.compute_api._validate_numa_rebuild, instance, + image, flavor) + def test_validate_and_build_base_options_translate_neutron_secgroup(self): """Tests that _check_requested_secgroups will return a uuid for a requested Neutron security group and that will be returned from diff --git a/releasenotes/notes/numa-rebuild-b75f9a1966f576ea.yaml b/releasenotes/notes/numa-rebuild-b75f9a1966f576ea.yaml new file mode 100644 index 000000000000..99777e4df6bf --- /dev/null +++ b/releasenotes/notes/numa-rebuild-b75f9a1966f576ea.yaml @@ -0,0 +1,16 @@ +--- +fixes: + - | + An instance can be rebuilt in-place with the original image or a new + image. Instance resource usage cannot be altered during a rebuild. + Previously Nova would have ignored the NUMA topology of the new image + continuing to use the NUMA topology of the existing instance until a move + operation was performed. As Nova did not explicitly guard against + inadvertent changes in resource request contained in a new image, + it was possible to rebuild with an image that would violate this requirement + `bug #1763766`_. This resulted in an inconsistent state as the instance that + was running did not match the instance that was requested. Nova now explicitly + checks if a rebuild would alter the requested NUMA topology of an instance + and rejects the rebuild. + + .. _`bug #1763766`: https://bugs.launchpad.net/nova/+bug/1763766