Merge "Block rebuild when NUMA topology changed" into stable/rocky

2020-08-18 02:28:16 +00:00 · 2020-08-18 02:28:16 +00:00 · a6e1bcce5f
parent 43f698a6b4 643405b3a7
commit a6e1bcce5f
9 changed files with 375 additions and 17 deletions
--- a/nova/api/openstack/compute/servers.py
+++ b/nova/api/openstack/compute/servers.py
@ -980,6 +980,13 @@ class ServersController(wsgi.Controller):
                exception.AutoDiskConfigDisabledByImage,
                exception.CertificateValidationFailed) as error:
            raise exc.HTTPBadRequest(explanation=error.format_message())
+        # NOTE(sean-k-mooney): due to the lack of the flavor and image
+        # properties validation I06fad233006c7bab14749a51ffa226c3801f951b in
+        # stable/rocky we explicitly catch the NUMA rebuild conflict instead of
+        # extending the INVALID_FLAVOR_IMAGE_EXCEPTIONS tuple which was
+        # introduced in the stein release.
+        except exception.ImageNUMATopologyRebuildConflict as error:
+            raise exc.HTTPBadRequest(explanation=error.format_message())

        instance = self._get_server(context, req, id, is_detail=True)

--- a/nova/compute/api.py
+++ b/nova/compute/api.py
@ -3260,6 +3260,14 @@ class API(base.Base):
        self._checks_for_create_and_rebuild(context, image_id, image,
                flavor, metadata, files_to_inject, root_bdm)

+        # NOTE(sean-k-mooney): When we rebuild with a new image we need to
+        # validate that the NUMA topology does not change as we do a NOOP claim
+        # in resource tracker. As such we cannot allow the resource usage or
+        # assignment to change as a result of a new image altering the
+        # numa constraints.
+        if orig_image_ref != image_href:
+            self._validate_numa_rebuild(instance, image, flavor)
+
        kernel_id, ramdisk_id = self._handle_kernel_and_ramdisk(
                context, None, None, image)

@ -3359,6 +3367,48 @@ class API(base.Base):
                request_spec=request_spec,
                kwargs=kwargs)

+    @staticmethod
+    def _validate_numa_rebuild(instance, image, flavor):
+        """validates that the NUMA constraints do not change on rebuild.
+
+        :param instance: nova.objects.instance.Instance object
+        :param image: the new image the instance will be rebuilt with.
+        :param flavor: the flavor of the current instance.
+
+        :returns: True or raises on failure.
+        :raises: nova.exception.ImageNUMATopologyRebuildConflict
+        """
+
+        # NOTE(sean-k-mooney): currently it is not possible to express
+        # a PCI NUMA affinity policy via flavor or image but that will
+        # change in the future. we pull out the image metadata into
+        # separate variable to make future testing of this easier.
+        old_image_meta = instance.image_meta
+        new_image_meta = objects.ImageMeta.from_dict(image)
+        old_constraints = hardware.numa_get_constraints(flavor, old_image_meta)
+        new_constraints = hardware.numa_get_constraints(flavor, new_image_meta)
+
+        # early out for non NUMA instances
+        if old_constraints is None and new_constraints is None:
+            # return true for easy unit testing
+            return True
+
+        # if only one of the constrains are non-None (or 'set') then the
+        # constraints changed so raise an exception.
+        if old_constraints is None or new_constraints is None:
+            raise exception.ImageNUMATopologyRebuildConflict()
+
+        # otherwise since both the old a new constrains are non none compare
+        # them as dictionaries.
+        old = old_constraints.obj_to_primitive()
+        new = new_constraints.obj_to_primitive()
+        if old != new:
+            raise exception.ImageNUMATopologyRebuildConflict()
+        # TODO(sean-k-mooney): add PCI NUMA affinity policy check.
+
+        # return true for easy unit testing
+        return True
+
    @staticmethod
    def _check_quota_for_upsize(context, instance, current_flavor, new_flavor):
        project_id, user_id = quotas_obj.ids_from_instance(context,
--- a/nova/exception.py
+++ b/nova/exception.py
@ -1911,6 +1911,12 @@ class ImageNUMATopologyForbidden(Forbidden):
                "NUMA configuration set against the flavor")


+class ImageNUMATopologyRebuildConflict(Invalid):
+    msg_fmt = _(
+        "An instance's NUMA typology cannot be changed as part of a rebuild. "
+        "The image provided is invalid for this instance.")
+
+
 class ImageNUMATopologyAsymmetric(Invalid):
    msg_fmt = _("Instance CPUs and/or memory cannot be evenly distributed "
                "across instance NUMA nodes. Explicit assignment of CPUs "
--- a/nova/tests/functional/integrated_helpers.py
+++ b/nova/tests/functional/integrated_helpers.py
@ -81,7 +81,8 @@ class _IntegratedTestBase(test.TestCase):
        # TODO(mriedem): Fix the functional tests to work with Neutron.
        self.flags(use_neutron=self.USE_NEUTRON)

-        nova.tests.unit.image.fake.stub_out_image_service(self)
+        self.fake_image_service =\
+            nova.tests.unit.image.fake.stub_out_image_service(self)

        self.useFixture(cast_as_call.CastAsCall(self))
        self.useFixture(nova_fixtures.Database(database='placement'))
--- a/nova/tests/functional/libvirt/test_numa_servers.py
+++ b/nova/tests/functional/libvirt/test_numa_servers.py
@ -13,10 +13,10 @@
 #    License for the specific language governing permissions and limitations
 #    under the License.

-import six
-
 import fixtures
 import mock
+import six
+
 from oslo_config import cfg
 from oslo_log import log as logging

@ -27,11 +27,10 @@ from nova import test
 from nova.tests import fixtures as nova_fixtures
 from nova.tests.functional.api import client
 from nova.tests.functional.test_servers import ServersTestBase
+from nova.tests.unit import fake_notifier
 from nova.tests.unit.virt.libvirt import fake_imagebackend
 from nova.tests.unit.virt.libvirt import fake_libvirt_utils
 from nova.tests.unit.virt.libvirt import fakelibvirt
-
-
 CONF = cfg.CONF
 LOG = logging.getLogger(__name__)

@ -568,3 +567,136 @@ class NUMAServersWithNetworksTest(NUMAServersTestBase):
        network_metadata = args[1].network_metadata
        self.assertIsNotNone(network_metadata)
        self.assertEqual(set(['foo']), network_metadata.physnets)
+
+
+class NUMAServersRebuildTests(NUMAServersTestBase):
+
+    def setUp(self):
+        super(NUMAServersRebuildTests, self).setUp()
+        images = self.api.get_images()
+        # save references to first two image for server create and rebuild
+        self.image_ref_0 = images[0]['id']
+        self.image_ref_1 = images[1]['id']
+
+        fake_notifier.stub_notifier(self)
+        self.addCleanup(fake_notifier.reset)
+
+        # NOTE(sean-k-mooney): I3b4c1153bebdeab2eb8bc2108aa177732f5c6a97
+        # moved mocking of the libvirt connection to the setup of
+        # NUMAServersTestBase but that is not backported so we just add back
+        # the mocking to enable backporting of
+        # I0322d872bdff68936033a6f5a54e8296a6fb3434
+        _p = mock.patch('nova.virt.libvirt.host.Host.get_connection')
+        self.mock_conn = _p.start()
+        self.addCleanup(_p.stop)
+
+    def _create_active_server(self, server_args=None):
+        basic_server = {
+            'flavorRef': 1,
+            'name': 'numa_server',
+            'networks': [{
+                'uuid': nova_fixtures.NeutronFixture.network_1['id']
+            }],
+            'imageRef': self.image_ref_0
+        }
+        if server_args:
+            basic_server.update(server_args)
+        server = self.api.post_server({'server': basic_server})
+        return self._wait_for_state_change(server, 'BUILD')
+
+    def _rebuild_server(self, active_server, image_ref):
+        args = {"rebuild": {"imageRef": image_ref}}
+        self.api.api_post(
+            'servers/%s/action' % active_server['id'], args)
+        fake_notifier.wait_for_versioned_notifications('instance.rebuild.end')
+        return self._wait_for_state_change(active_server, 'REBUILD')
+
+    def test_rebuild_server_with_numa(self):
+        """Create a NUMA instance and ensure it can be rebuilt.
+        """
+
+        # Create a flavor consuming 2 pinned cpus with an implicit
+        # numa topology of 1 virtual numa node.
+        extra_spec = {'hw:cpu_policy': 'dedicated'}
+        flavor_id = self._create_flavor(extra_spec=extra_spec)
+
+        # Create a host with 4 physical cpus to allow rebuild leveraging
+        # the free space to ensure the numa topology filter does not
+        # eliminate the host.
+        host_info = fakelibvirt.NUMAHostInfo(cpu_nodes=1, cpu_sockets=1,
+                                         cpu_cores=4, kB_mem=15740000)
+        fake_connection = self._get_connection(host_info=host_info)
+        self.mock_conn.return_value = fake_connection
+        self.compute = self.start_service('compute', host='compute1')
+
+        server = self._create_active_server(
+            server_args={"flavorRef": flavor_id})
+
+        # this should succeed as the NUMA topology has not changed
+        # and we have enough resources on the host. We rebuild with
+        # a different image to force the rebuild to query the scheduler
+        # to validate the host.
+        self._rebuild_server(server, self.image_ref_1)
+
+    def test_rebuild_server_with_numa_inplace_fails(self):
+        """Create a NUMA instance and ensure in place rebuild fails.
+        """
+
+        # Create a flavor consuming 2 pinned cpus with an implicit
+        # numa topology of 1 virtual numa node.
+        extra_spec = {'hw:cpu_policy': 'dedicated'}
+        flavor_id = self._create_flavor(extra_spec=extra_spec)
+
+        # cpu_cores is set to 2 to ensure that
+        # we have enough space to boot the vm but not enough space to rebuild
+        # by doubling  the resource use during scheduling.
+        host_info = fakelibvirt.NUMAHostInfo(
+            cpu_nodes=1, cpu_sockets=1, cpu_cores=2, kB_mem=15740000)
+        fake_connection = self._get_connection(host_info=host_info)
+        self.mock_conn.return_value = fake_connection
+        self.compute = self.start_service('compute', host='compute1')
+
+        server = self._create_active_server(
+            server_args={"flavorRef": flavor_id})
+
+        # TODO(sean-k-mooney): this should pass but i currently expect it to
+        # fail because the NUMA topology filter does not support in place
+        # rebuild and we have used all the resources on the compute node.
+        self.assertRaises(
+            client.OpenStackApiException, self._rebuild_server,
+            server, self.image_ref_1)
+
+    def test_rebuild_server_with_different_numa_topology_fails(self):
+        """Create a NUMA instance and ensure inplace rebuild fails.
+        """
+
+        # Create a flavor consuming 2 pinned cpus with an implicit
+        # numa topology of 1 virtual numa node.
+        extra_spec = {'hw:cpu_policy': 'dedicated'}
+        flavor_id = self._create_flavor(extra_spec=extra_spec)
+
+        host_info = fakelibvirt.NUMAHostInfo(
+            cpu_nodes=2, cpu_sockets=1, cpu_cores=4, kB_mem=15740000)
+        fake_connection = self._get_connection(host_info=host_info)
+        self.mock_conn.return_value = fake_connection
+        self.compute = self.start_service('compute', host='compute1')
+
+        server = self._create_active_server(
+            server_args={"flavorRef": flavor_id})
+
+        # the original vm had an implicit numa topology of 1 virtual numa nodes
+        # so we alter the requested numa topology in image_ref_1 to request
+        # 2 virtual numa nodes.
+        ctx = nova_context.get_admin_context()
+        image_meta = {'properties': {'hw_numa_nodes': 2}}
+        self.fake_image_service.update(ctx, self.image_ref_1, image_meta)
+
+        # NOTE(sean-k-mooney): this should fail because rebuild uses noop
+        # claims therefor it is not allowed for the numa topology or resource
+        # usage to change during a rebuild.
+        ex = self.assertRaises(
+            client.OpenStackApiException, self._rebuild_server,
+            server, self.image_ref_1)
+        self.assertEqual(400, ex.response.status_code)
+        self.assertIn("An instance's NUMA typology cannot be changed",
+                      six.text_type(ex))
--- a/nova/tests/unit/api/openstack/compute/test_server_actions.py
+++ b/nova/tests/unit/api/openstack/compute/test_server_actions.py
@ -540,11 +540,22 @@ class ServerActionsControllerTestV21(test.TestCase):
    def test_rebuild_when_kernel_not_exists(self):

        def return_image_meta(*args, **kwargs):
+            # NOTE(sean-k-mooney): To enable
+            # I0322d872bdff68936033a6f5a54e8296a6fb3434 to be backported
+            # without I34ffaf285718059b55f90e812b57f1e11d566c6f we update the
+            # fake image data to use valid UUIDs
            image_meta_table = {
-                '2': {'id': 2, 'status': 'active', 'container_format': 'ari'},
-                '155d900f-4e14-4e4c-a73d-069cbf4541e6':
-                     {'id': 3, 'status': 'active', 'container_format': 'raw',
-                      'properties': {'kernel_id': 1, 'ramdisk_id': 2}},
+                uuids.ramdisk_image_id: {
+                    'id': uuids.ramdisk_image_id, 'status': 'active',
+                    'container_format': 'ari'},
+                '155d900f-4e14-4e4c-a73d-069cbf4541e6': {
+                    'id': '155d900f-4e14-4e4c-a73d-069cbf4541e6',
+                    'status': 'active', 'container_format': 'raw',
+                    'properties': {
+                        'kernel_id': uuids.missing_image_id,
+                        'ramdisk_id': uuids.ramdisk_image_id,
+                    },
+                },
            }
            image_id = args[2]
            try:
@ -582,12 +593,25 @@ class ServerActionsControllerTestV21(test.TestCase):
                    instance_meta[key] = instance[key]

        def return_image_meta(*args, **kwargs):
+            # NOTE(sean-k-mooney): To enable
+            # I0322d872bdff68936033a6f5a54e8296a6fb3434 to be backported
+            # without I34ffaf285718059b55f90e812b57f1e11d566c6f we update the
+            # fake image data to use valid UUIDs
            image_meta_table = {
-                '1': {'id': 1, 'status': 'active', 'container_format': 'aki'},
-                '2': {'id': 2, 'status': 'active', 'container_format': 'ari'},
-                '155d900f-4e14-4e4c-a73d-069cbf4541e6':
-                     {'id': 3, 'status': 'active', 'container_format': 'raw',
-                      'properties': {'kernel_id': 1, 'ramdisk_id': 2}},
+                uuids.kernel_image_id: {
+                    'id': uuids.kernel_image_id, 'status': 'active',
+                    'container_format': 'aki'},
+                uuids.ramdisk_image_id: {
+                    'id': uuids.ramdisk_image_id, 'status': 'active',
+                    'container_format': 'ari'},
+                '155d900f-4e14-4e4c-a73d-069cbf4541e6': {
+                    'id': '155d900f-4e14-4e4c-a73d-069cbf4541e6',
+                    'status': 'active', 'container_format': 'raw',
+                    'properties': {
+                        'kernel_id': uuids.kernel_image_id,
+                        'ramdisk_id': uuids.ramdisk_image_id,
+                    },
+                },
            }
            image_id = args[2]
            try:
@ -607,8 +631,8 @@ class ServerActionsControllerTestV21(test.TestCase):
            },
        }
        self.controller._action_rebuild(self.req, FAKE_UUID, body=body).obj
-        self.assertEqual(instance_meta['kernel_id'], '1')
-        self.assertEqual(instance_meta['ramdisk_id'], '2')
+        self.assertEqual(instance_meta['kernel_id'], uuids.kernel_image_id)
+        self.assertEqual(instance_meta['ramdisk_id'], uuids.ramdisk_image_id)

    @mock.patch.object(compute_api.API, 'rebuild')
    def test_rebuild_instance_raise_auto_disk_config_exc(self, mock_rebuild):
--- a/nova/tests/unit/compute/test_compute.py
+++ b/nova/tests/unit/compute/test_compute.py
@ -9037,7 +9037,12 @@ class ComputeAPITestCase(BaseTestCase):

        with mock.patch.object(self.compute_api.compute_task_api,
                               'rebuild_instance', fake_rpc_rebuild):
-            image_ref = instance["image_ref"] + '-new_image_ref'
+            # NOTE(sean-k-mooney): to enable
+            # I0322d872bdff68936033a6f5a54e8296a6fb3434 to be backported
+            # without I34ffaf285718059b55f90e812b57f1e11d566c6f we update the
+            # image ref to use uuids.new_image_ref to pass the UUIDField
+            # validation requirements
+            image_ref = uuids.new_image_ref
            password = "new_password"

            instance.vm_state = vm_state
--- a/nova/tests/unit/compute/test_compute_api.py
+++ b/nova/tests/unit/compute/test_compute_api.py
@ -6198,6 +6198,123 @@ class ComputeAPIUnitTestCase(_ComputeAPIUnitTestMixIn, test.NoDBTestCase):
            self.assertEqual(['context-for-%s' % c for c in compute_api.CELLS],
                             cells)

+    def test__validate_numa_rebuild_non_numa(self):
+        """This test asserts that a rebuild of an instance without a NUMA
+        topology passes validation.
+        """
+        flavor = objects.Flavor(
+            id=42, vcpus=1, memory_mb=512, root_gb=1, extra_specs={})
+        instance = self._create_instance_obj(flavor=flavor)
+        # we use a dict instead of image metadata object as
+        # _validate_numa_rebuild constructs the object internally
+        image = {
+            'id': uuids.image_id, 'status': 'foo',
+            'properties': {}}
+        self.compute_api._validate_numa_rebuild(instance, image, flavor)
+
+    def test__validate_numa_rebuild_no_conflict(self):
+        """This test asserts that a rebuild of an instance without a change
+        in NUMA topology passes validation.
+        """
+        flavor = objects.Flavor(
+            id=42, vcpus=1, memory_mb=512, root_gb=1,
+            extra_specs={"hw:numa_nodes": 1})
+        instance = self._create_instance_obj(flavor=flavor)
+        # we use a dict instead of image metadata object as
+        # _validate_numa_rebuild constructs the object internally
+        image = {
+            'id': uuids.image_id, 'status': 'foo',
+            'properties': {}}
+        # The flavor creates a NUMA topology but the default image and the
+        # rebuild image do not have any image properties so there will
+        # be no conflict.
+        self.compute_api._validate_numa_rebuild(instance, image, flavor)
+
+    def test__validate_numa_rebuild_add_numa_toplogy(self):
+        """This test asserts that a rebuild of an instance with a new image
+        that requests a NUMA topology when the original instance did not
+        have a NUMA topology is invalid.
+        """
+
+        flavor = objects.Flavor(
+            id=42, vcpus=1, memory_mb=512, root_gb=1,
+            extra_specs={})
+        instance = self._create_instance_obj(flavor=flavor)
+        # we use a dict instead of image metadata object as
+        # _validate_numa_rebuild constructs the object internally
+        image = {
+            'id': uuids.image_id, 'status': 'foo',
+            'properties': {"hw_numa_nodes": 1}}
+        # The flavor and default image have no NUMA topology defined. The image
+        # used to rebuild requests a NUMA topology which is not allowed as it
+        # would alter the NUMA constrains.
+        self.assertRaises(
+            exception.ImageNUMATopologyRebuildConflict,
+            self.compute_api._validate_numa_rebuild, instance, image, flavor)
+
+    def test__validate_numa_rebuild_remove_numa_toplogy(self):
+        """This test asserts that a rebuild of an instance with a new image
+        that does not request a NUMA topology when the original image did
+        is invalid if it would alter the instances topology as a result.
+        """
+
+        flavor = objects.Flavor(
+            id=42, vcpus=1, memory_mb=512, root_gb=1,
+            extra_specs={})
+        instance = self._create_instance_obj(flavor=flavor)
+        # we use a dict instead of image metadata object as
+        # _validate_numa_rebuild constructs the object internally
+        old_image = {
+            'id': uuidutils.generate_uuid(), 'status': 'foo',
+            'properties': {"hw_numa_nodes": 1}}
+        old_image_meta = objects.ImageMeta.from_dict(old_image)
+        image = {
+            'id': uuidutils.generate_uuid(), 'status': 'foo',
+            'properties': {}}
+        with mock.patch(
+                'nova.objects.instance.Instance.image_meta',
+                new_callable=mock.PropertyMock(return_value=old_image_meta)):
+            # The old image has a NUMA topology defined but the new image
+            # used to rebuild does not. This would alter the NUMA constrains
+            # and therefor should raise.
+            self.assertRaises(
+                exception.ImageNUMATopologyRebuildConflict,
+                self.compute_api._validate_numa_rebuild, instance,
+                image, flavor)
+
+    def test__validate_numa_rebuild_alter_numa_toplogy(self):
+        """This test asserts that a rebuild of an instance with a new image
+        that requests a different NUMA topology than the original image
+        is invalid.
+        """
+
+        # NOTE(sean-k-mooney): we need to use 2 vcpus here or we will fail
+        # with a different exception ImageNUMATopologyAsymmetric when we
+        # construct the NUMA constrains as the rebuild image would result
+        # in an invalid topology.
+        flavor = objects.Flavor(
+            id=42, vcpus=2, memory_mb=512, root_gb=1,
+            extra_specs={})
+        instance = self._create_instance_obj(flavor=flavor)
+        # we use a dict instead of image metadata object as
+        # _validate_numa_rebuild constructs the object internally
+        old_image = {
+            'id': uuidutils.generate_uuid(), 'status': 'foo',
+            'properties': {"hw_numa_nodes": 1}}
+        old_image_meta = objects.ImageMeta.from_dict(old_image)
+        image = {
+            'id': uuidutils.generate_uuid(), 'status': 'foo',
+            'properties': {"hw_numa_nodes": 2}}
+        with mock.patch(
+                'nova.objects.instance.Instance.image_meta',
+                new_callable=mock.PropertyMock(return_value=old_image_meta)):
+            # the original image requested 1 NUMA node and the image used
+            # for rebuild requests 2 so assert an error is raised.
+            self.assertRaises(
+                exception.ImageNUMATopologyRebuildConflict,
+                self.compute_api._validate_numa_rebuild, instance,
+                image, flavor)
+
    def test_validate_and_build_base_options_translate_neutron_secgroup(self):
        """Tests that _check_requested_secgroups will return a uuid for a
        requested Neutron security group and that will be returned from
--- a/releasenotes/notes/numa-rebuild-b75f9a1966f576ea.yaml
+++ b/releasenotes/notes/numa-rebuild-b75f9a1966f576ea.yaml
@ -0,0 +1,16 @@
+---
+fixes:
+  - |
+    An instance can be rebuilt in-place with the original image or a new
+    image. Instance resource usage cannot be altered during a rebuild.
+    Previously Nova would have ignored the NUMA topology of the new image
+    continuing to use the NUMA topology of the existing instance until a move
+    operation was performed. As Nova did not explicitly guard against
+    inadvertent changes in resource request contained in a new image,
+    it was possible to rebuild with an image that would violate this requirement
+    `bug #1763766`_. This resulted in an inconsistent state as the instance that
+    was running did not match the instance that was requested. Nova now explicitly
+    checks if a rebuild would alter the requested NUMA topology of an instance
+    and rejects the rebuild.
+
+    ..  _`bug #1763766`: https://bugs.launchpad.net/nova/+bug/1763766