From 9f57d16f38f0290718e3ba78393d064746f7e527 Mon Sep 17 00:00:00 2001 From: Sean Mooney Date: Thu, 10 Oct 2019 18:00:07 +0100 Subject: [PATCH] Block rebuild when NUMA topology changed If the image changes during a rebuild it's possible for the requested NUMA topology to change. As a rebuild uses a noop claim in the resource tracker the NUMA topology will not be updated as part of a rebuild. If the NUMA constraints do not change, a rebuild will continue as normal. If the new constraints conflict with the existing NUMA constraints of the instance the rebuild will be rejected without altering the status of the instance. This change introduces an API check to block rebuild when the NUMA requirements for the new image do not match the existing NUMA constraints. This is in line with the previous check introduced to prevent the rebuild of volume-backed instances which similarly are not supported. This change adds functional tests to assert the expected behaviour of rebuilding NUMA instances with new images. This change also asserts that in place rebuilds of numa instances are currently not supported. Conflicts: nova/api/openstack/compute/servers.py nova/tests/functional/libvirt/test_numa_servers.py NOTE(sean-k-mooney): due to the lack of Ifcda7336d56c9b623720ee018ec5697740986273 the Fake HostInfo objects created in the functional tests were updated to use the NUMAHostInfo class. Prior to Ifcda7336d56c9b623720ee018ec5697740986273 the Fake HostInfo class did not construct a numa topology from the kwargs and instead only set a numa topology if it was passed in during construction. In older releases the initialization of the numa topology from kwargs was a feature of NUMAHostInfo. The servers.py conflicts are due to a lack of I5576fa2a67d2771614266022428b4a95487ab6d5 in Stein. 
Closes-Bug: #1763766 Partial-implements: blueprint inplace-rebuild-of-numa-instances Change-Id: I0322d872bdff68936033a6f5a54e8296a6fb3434 (cherry picked from commit 6f5358ac1992b17b7f3f99d9a32290e0d4740dae) (cherry picked from commit 745de99063bf77704a7f0610fe9e3647257eaa50) --- nova/api/openstack/compute/servers.py | 1 + nova/compute/api.py | 50 +++++++ nova/exception.py | 6 + nova/tests/functional/integrated_helpers.py | 3 +- .../functional/libvirt/test_numa_servers.py | 131 +++++++++++++++++- nova/tests/unit/compute/test_compute_api.py | 117 ++++++++++++++++ .../notes/numa-rebuild-b75f9a1966f576ea.yaml | 16 +++ 7 files changed, 320 insertions(+), 4 deletions(-) create mode 100644 releasenotes/notes/numa-rebuild-b75f9a1966f576ea.yaml diff --git a/nova/api/openstack/compute/servers.py b/nova/api/openstack/compute/servers.py index de23f4b0e5cd..c6c0d1066e20 100644 --- a/nova/api/openstack/compute/servers.py +++ b/nova/api/openstack/compute/servers.py @@ -68,6 +68,7 @@ INVALID_FLAVOR_IMAGE_EXCEPTIONS = ( exception.ImageNUMATopologyForbidden, exception.ImageNUMATopologyIncomplete, exception.ImageNUMATopologyMemoryOutOfRange, + exception.ImageNUMATopologyRebuildConflict, exception.ImageSerialPortNumberExceedFlavorValue, exception.ImageSerialPortNumberInvalid, exception.ImageVCPULimitsRangeExceeded, diff --git a/nova/compute/api.py b/nova/compute/api.py index f63a66a0f083..77b2ef85cecc 100644 --- a/nova/compute/api.py +++ b/nova/compute/api.py @@ -3398,6 +3398,14 @@ class API(base.Base): self._checks_for_create_and_rebuild(context, image_id, image, flavor, metadata, files_to_inject, root_bdm) + # NOTE(sean-k-mooney): When we rebuild with a new image we need to + # validate that the NUMA topology does not change as we do a NOOP claim + # in resource tracker. As such we cannot allow the resource usage or + # assignment to change as a result of a new image altering the + # numa constraints. 
+ if orig_image_ref != image_href: + self._validate_numa_rebuild(instance, image, flavor) + kernel_id, ramdisk_id = self._handle_kernel_and_ramdisk( context, None, None, image) @@ -3490,6 +3498,48 @@ class API(base.Base): request_spec=request_spec, kwargs=kwargs) + @staticmethod + def _validate_numa_rebuild(instance, image, flavor): + """validates that the NUMA constraints do not change on rebuild. + + :param instance: nova.objects.instance.Instance object + :param image: the new image the instance will be rebuilt with. + :param flavor: the flavor of the current instance. + + :returns: True or raises on failure. + :raises: nova.exception.ImageNUMATopologyRebuildConflict + """ + + # NOTE(sean-k-mooney): currently it is not possible to express + # a PCI NUMA affinity policy via flavor or image but that will + # change in the future. we pull out the image metadata into + # separate variable to make future testing of this easier. + old_image_meta = instance.image_meta + new_image_meta = objects.ImageMeta.from_dict(image) + old_constraints = hardware.numa_get_constraints(flavor, old_image_meta) + new_constraints = hardware.numa_get_constraints(flavor, new_image_meta) + + # early out for non NUMA instances + if old_constraints is None and new_constraints is None: + # return true for easy unit testing + return True + + # if only one of the constraints is non-None (or 'set') then the + # constraints changed so raise an exception. + if old_constraints is None or new_constraints is None: + raise exception.ImageNUMATopologyRebuildConflict() + + # otherwise since both the old and new constraints are non-None compare + # them as dictionaries. + old = old_constraints.obj_to_primitive() + new = new_constraints.obj_to_primitive() + if old != new: + raise exception.ImageNUMATopologyRebuildConflict() + # TODO(sean-k-mooney): add PCI NUMA affinity policy check. 
+ + # return true for easy unit testing + return True + @staticmethod def _check_quota_for_upsize(context, instance, current_flavor, new_flavor): project_id, user_id = quotas_obj.ids_from_instance(context, diff --git a/nova/exception.py b/nova/exception.py index 4b6522d2323b..279630311715 100644 --- a/nova/exception.py +++ b/nova/exception.py @@ -1936,6 +1936,12 @@ class ImageNUMATopologyForbidden(Forbidden): "NUMA configuration set against the flavor") +class ImageNUMATopologyRebuildConflict(Invalid): + msg_fmt = _( + "An instance's NUMA typology cannot be changed as part of a rebuild. " + "The image provided is invalid for this instance.") + + class ImageNUMATopologyAsymmetric(Invalid): msg_fmt = _("Instance CPUs and/or memory cannot be evenly distributed " "across instance NUMA nodes. Explicit assignment of CPUs " diff --git a/nova/tests/functional/integrated_helpers.py b/nova/tests/functional/integrated_helpers.py index af00f99a5b75..87f861467ee1 100644 --- a/nova/tests/functional/integrated_helpers.py +++ b/nova/tests/functional/integrated_helpers.py @@ -92,7 +92,8 @@ class _IntegratedTestBase(test.TestCase): return None self.stub_out('nova.privsep.linux_net.bind_ip', fake_noop) - nova.tests.unit.image.fake.stub_out_image_service(self) + self.fake_image_service =\ + nova.tests.unit.image.fake.stub_out_image_service(self) self.useFixture(cast_as_call.CastAsCall(self)) placement = self.useFixture(func_fixtures.PlacementFixture()) diff --git a/nova/tests/functional/libvirt/test_numa_servers.py b/nova/tests/functional/libvirt/test_numa_servers.py index 69290b6763fd..8aeee3da8fe1 100644 --- a/nova/tests/functional/libvirt/test_numa_servers.py +++ b/nova/tests/functional/libvirt/test_numa_servers.py @@ -13,20 +13,21 @@ # License for the specific language governing permissions and limitations # under the License. 
+import mock import six -import mock from oslo_config import cfg from oslo_log import log as logging from nova.conf import neutron as neutron_conf from nova import context as nova_context from nova import objects + +from nova.tests import fixtures as nova_fixtures from nova.tests.functional.api import client from nova.tests.functional.libvirt import base +from nova.tests.unit import fake_notifier from nova.tests.unit.virt.libvirt import fakelibvirt - - CONF = cfg.CONF LOG = logging.getLogger(__name__) @@ -362,3 +363,127 @@ class NUMAServersWithNetworksTest(NUMAServersTestBase): network_metadata = args[1].network_metadata self.assertIsNotNone(network_metadata) self.assertEqual(set(['foo']), network_metadata.physnets) + + +class NUMAServersRebuildTests(NUMAServersTestBase): + + def setUp(self): + super(NUMAServersRebuildTests, self).setUp() + images = self.api.get_images() + # save references to first two image for server create and rebuild + self.image_ref_0 = images[0]['id'] + self.image_ref_1 = images[1]['id'] + + fake_notifier.stub_notifier(self) + self.addCleanup(fake_notifier.reset) + + def _create_active_server(self, server_args=None): + basic_server = { + 'flavorRef': 1, + 'name': 'numa_server', + 'networks': [{ + 'uuid': nova_fixtures.NeutronFixture.network_1['id'] + }], + 'imageRef': self.image_ref_0 + } + if server_args: + basic_server.update(server_args) + server = self.api.post_server({'server': basic_server}) + return self._wait_for_state_change(server, 'BUILD') + + def _rebuild_server(self, active_server, image_ref): + args = {"rebuild": {"imageRef": image_ref}} + self.api.api_post( + 'servers/%s/action' % active_server['id'], args) + fake_notifier.wait_for_versioned_notifications('instance.rebuild.end') + return self._wait_for_state_change(active_server, 'REBUILD') + + def test_rebuild_server_with_numa(self): + """Create a NUMA instance and ensure it can be rebuilt. 
+ """ + + # Create a flavor consuming 2 pinned cpus with an implicit + # numa topology of 1 virtual numa node. + extra_spec = {'hw:cpu_policy': 'dedicated'} + flavor_id = self._create_flavor(extra_spec=extra_spec) + + # Create a host with 4 physical cpus to allow rebuild leveraging + # the free space to ensure the numa topology filter does not + # eliminate the host. + host_info = fakelibvirt.NUMAHostInfo(cpu_nodes=1, cpu_sockets=1, + cpu_cores=4, kB_mem=15740000) + fake_connection = self._get_connection(host_info=host_info) + self.mock_conn.return_value = fake_connection + self.compute = self.start_service('compute', host='compute1') + + server = self._create_active_server( + server_args={"flavorRef": flavor_id}) + + # this should succeed as the NUMA topology has not changed + # and we have enough resources on the host. We rebuild with + # a different image to force the rebuild to query the scheduler + # to validate the host. + self._rebuild_server(server, self.image_ref_1) + + def test_rebuild_server_with_numa_inplace_fails(self): + """Create a NUMA instance and ensure in place rebuild fails. + """ + + # Create a flavor consuming 2 pinned cpus with an implicit + # numa topology of 1 virtual numa node. + extra_spec = {'hw:cpu_policy': 'dedicated'} + flavor_id = self._create_flavor(extra_spec=extra_spec) + + # cpu_cores is set to 2 to ensure that + # we have enough space to boot the vm but not enough space to rebuild + # by doubling the resource use during scheduling. 
+ host_info = fakelibvirt.NUMAHostInfo( + cpu_nodes=1, cpu_sockets=1, cpu_cores=2, kB_mem=15740000) + fake_connection = self._get_connection(host_info=host_info) + self.mock_conn.return_value = fake_connection + self.compute = self.start_service('compute', host='compute1') + + server = self._create_active_server( + server_args={"flavorRef": flavor_id}) + + # TODO(sean-k-mooney): this should pass but I currently expect it to + # fail because the NUMA topology filter does not support in place + # rebuild and we have used all the resources on the compute node. + self.assertRaises( + client.OpenStackApiException, self._rebuild_server, + server, self.image_ref_1) + + def test_rebuild_server_with_different_numa_topology_fails(self): + """Create a NUMA instance and ensure a rebuild with a different + NUMA topology fails. + """ + + # Create a flavor consuming 2 pinned cpus with an implicit + # numa topology of 1 virtual numa node. + extra_spec = {'hw:cpu_policy': 'dedicated'} + flavor_id = self._create_flavor(extra_spec=extra_spec) + + host_info = fakelibvirt.NUMAHostInfo( + cpu_nodes=2, cpu_sockets=1, cpu_cores=4, kB_mem=15740000) + fake_connection = self._get_connection(host_info=host_info) + self.mock_conn.return_value = fake_connection + self.compute = self.start_service('compute', host='compute1') + + server = self._create_active_server( + server_args={"flavorRef": flavor_id}) + + # the original vm had an implicit numa topology of 1 virtual numa node + # so we alter the requested numa topology in image_ref_1 to request + # 2 virtual numa nodes. + ctx = nova_context.get_admin_context() + image_meta = {'properties': {'hw_numa_nodes': 2}} + self.fake_image_service.update(ctx, self.image_ref_1, image_meta) + + # NOTE(sean-k-mooney): this should fail because rebuild uses noop + # claims therefore it is not allowed for the numa topology or resource + # usage to change during a rebuild. 
+ ex = self.assertRaises( + client.OpenStackApiException, self._rebuild_server, + server, self.image_ref_1) + self.assertEqual(400, ex.response.status_code) + self.assertIn("An instance's NUMA typology cannot be changed", + six.text_type(ex)) diff --git a/nova/tests/unit/compute/test_compute_api.py b/nova/tests/unit/compute/test_compute_api.py index b4635609ae7e..99425956108b 100644 --- a/nova/tests/unit/compute/test_compute_api.py +++ b/nova/tests/unit/compute/test_compute_api.py @@ -6539,6 +6539,123 @@ class ComputeAPIUnitTestCase(_ComputeAPIUnitTestMixIn, test.NoDBTestCase): self.assertEqual(['context-for-%s' % c for c in compute_api.CELLS], cells) + def test__validate_numa_rebuild_non_numa(self): + """This test asserts that a rebuild of an instance without a NUMA + topology passes validation. + """ + flavor = objects.Flavor( + id=42, vcpus=1, memory_mb=512, root_gb=1, extra_specs={}) + instance = self._create_instance_obj(flavor=flavor) + # we use a dict instead of image metadata object as + # _validate_numa_rebuild constructs the object internally + image = { + 'id': uuids.image_id, 'status': 'foo', + 'properties': {}} + self.compute_api._validate_numa_rebuild(instance, image, flavor) + + def test__validate_numa_rebuild_no_conflict(self): + """This test asserts that a rebuild of an instance without a change + in NUMA topology passes validation. + """ + flavor = objects.Flavor( + id=42, vcpus=1, memory_mb=512, root_gb=1, + extra_specs={"hw:numa_nodes": 1}) + instance = self._create_instance_obj(flavor=flavor) + # we use a dict instead of image metadata object as + # _validate_numa_rebuild constructs the object internally + image = { + 'id': uuids.image_id, 'status': 'foo', + 'properties': {}} + # The flavor creates a NUMA topology but the default image and the + # rebuild image do not have any image properties so there will + # be no conflict. 
+ self.compute_api._validate_numa_rebuild(instance, image, flavor) + + def test__validate_numa_rebuild_add_numa_toplogy(self): + """This test asserts that a rebuild of an instance with a new image + that requests a NUMA topology when the original instance did not + have a NUMA topology is invalid. + """ + + flavor = objects.Flavor( + id=42, vcpus=1, memory_mb=512, root_gb=1, + extra_specs={}) + instance = self._create_instance_obj(flavor=flavor) + # we use a dict instead of image metadata object as + # _validate_numa_rebuild constructs the object internally + image = { + 'id': uuids.image_id, 'status': 'foo', + 'properties': {"hw_numa_nodes": 1}} + # The flavor and default image have no NUMA topology defined. The image + # used to rebuild requests a NUMA topology which is not allowed as it + # would alter the NUMA constrains. + self.assertRaises( + exception.ImageNUMATopologyRebuildConflict, + self.compute_api._validate_numa_rebuild, instance, image, flavor) + + def test__validate_numa_rebuild_remove_numa_toplogy(self): + """This test asserts that a rebuild of an instance with a new image + that does not request a NUMA topology when the original image did + is invalid if it would alter the instances topology as a result. + """ + + flavor = objects.Flavor( + id=42, vcpus=1, memory_mb=512, root_gb=1, + extra_specs={}) + instance = self._create_instance_obj(flavor=flavor) + # we use a dict instead of image metadata object as + # _validate_numa_rebuild constructs the object internally + old_image = { + 'id': uuidutils.generate_uuid(), 'status': 'foo', + 'properties': {"hw_numa_nodes": 1}} + old_image_meta = objects.ImageMeta.from_dict(old_image) + image = { + 'id': uuidutils.generate_uuid(), 'status': 'foo', + 'properties': {}} + with mock.patch( + 'nova.objects.instance.Instance.image_meta', + new_callable=mock.PropertyMock(return_value=old_image_meta)): + # The old image has a NUMA topology defined but the new image + # used to rebuild does not. 
This would alter the NUMA constraints + and therefore should raise. + self.assertRaises( + exception.ImageNUMATopologyRebuildConflict, + self.compute_api._validate_numa_rebuild, instance, + image, flavor) + + def test__validate_numa_rebuild_alter_numa_toplogy(self): + """This test asserts that a rebuild of an instance with a new image + that requests a different NUMA topology than the original image + is invalid. + """ + + # NOTE(sean-k-mooney): we need to use 2 vcpus here or we will fail + # with a different exception ImageNUMATopologyAsymmetric when we + # construct the NUMA constraints as the rebuild image would result + # in an invalid topology. + flavor = objects.Flavor( + id=42, vcpus=2, memory_mb=512, root_gb=1, + extra_specs={}) + instance = self._create_instance_obj(flavor=flavor) + # we use a dict instead of image metadata object as + # _validate_numa_rebuild constructs the object internally + old_image = { + 'id': uuidutils.generate_uuid(), 'status': 'foo', + 'properties': {"hw_numa_nodes": 1}} + old_image_meta = objects.ImageMeta.from_dict(old_image) + image = { + 'id': uuidutils.generate_uuid(), 'status': 'foo', + 'properties': {"hw_numa_nodes": 2}} + with mock.patch( + 'nova.objects.instance.Instance.image_meta', + new_callable=mock.PropertyMock(return_value=old_image_meta)): + # the original image requested 1 NUMA node and the image used + # for rebuild requests 2 so assert an error is raised. 
+ self.assertRaises( + exception.ImageNUMATopologyRebuildConflict, + self.compute_api._validate_numa_rebuild, instance, + image, flavor) + @mock.patch('nova.pci.request.get_pci_requests_from_flavor') def test_pci_validated(self, mock_request): """Tests that calling _validate_flavor_image_nostatus() with diff --git a/releasenotes/notes/numa-rebuild-b75f9a1966f576ea.yaml b/releasenotes/notes/numa-rebuild-b75f9a1966f576ea.yaml new file mode 100644 index 000000000000..99777e4df6bf --- /dev/null +++ b/releasenotes/notes/numa-rebuild-b75f9a1966f576ea.yaml @@ -0,0 +1,16 @@ +--- +fixes: + - | + An instance can be rebuilt in-place with the original image or a new + image. Instance resource usage cannot be altered during a rebuild. + Previously Nova would have ignored the NUMA topology of the new image + continuing to use the NUMA topology of the existing instance until a move + operation was performed. As Nova did not explicitly guard against + inadvertent changes in resource request contained in a new image, + it was possible to rebuild with an image that would violate this requirement + `bug #1763766`_. This resulted in an inconsistent state as the instance that + was running did not match the instance that was requested. Nova now explicitly + checks if a rebuild would alter the requested NUMA topology of an instance + and rejects the rebuild. + + .. _`bug #1763766`: https://bugs.launchpad.net/nova/+bug/1763766