ignore deleted server groups in validation

This change simply catches the exception raised when we lookup a servergroup via a hint and the validation upcall is enabled. Change-Id: I858b4da35382a9f4dcf88f4b6db340e1f34eb82d Closes-Bug: #1890244
2022-06-21 12:23:45 +01:00 · 2022-06-21 12:23:45 +01:00 · cd2c2f359b
commit cd2c2f359b
parent 84a84f7f2f
6 changed files with 155 additions and 13 deletions
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@ -1717,27 +1717,32 @@ class ComputeManager(manager.Manager):
        # hosts.  This is a validation step to make sure that starting the
        # instance here doesn't violate the policy.
        if scheduler_hints is not None:
-            # only go through here if scheduler_hints is provided, even if it
-            # is empty.
+            # only go through here if scheduler_hints is provided,
+            # even if it is empty.
            group_hint = scheduler_hints.get('group')
            if not group_hint:
                return
            else:
-                # The RequestSpec stores scheduler_hints as key=list pairs so
-                # we need to check the type on the value and pull the single
-                # entry out. The API request schema validates that
+                # The RequestSpec stores scheduler_hints as key=list pairs
+                # so we need to check the type on the value and pull the
+                # single entry out. The API request schema validates that
                # the 'group' hint is a single value.
                if isinstance(group_hint, list):
                    group_hint = group_hint[0]
-
-                group = objects.InstanceGroup.get_by_hint(context, group_hint)
+                try:
+                    group = objects.InstanceGroup.get_by_hint(
+                        context, group_hint
+                    )
+                except exception.InstanceGroupNotFound:
+                    return
        else:
            # TODO(ganso): a call to DB can be saved by adding request_spec
            # to rpcapi payload of live_migration, pre_live_migration and
            # check_can_live_migrate_destination
            try:
                group = objects.InstanceGroup.get_by_instance_uuid(
-                    context, instance.uuid)
+                    context, instance.uuid
+                )
            except exception.InstanceGroupNotFound:
                return

--- a/nova/objects/request_spec.py
+++ b/nova/objects/request_spec.py
@ -645,6 +645,7 @@ class RequestSpec(base.NovaObject):
            except exception.InstanceGroupNotFound:
                # NOTE(danms): Instance group may have been deleted
                spec.instance_group = None
+                spec.scheduler_hints.pop('group', None)

        if data_migrated:
            spec.save()
--- a/nova/tests/functional/regressions/test_bug_1890244.py
+++ b/nova/tests/functional/regressions/test_bug_1890244.py
@ -0,0 +1,96 @@
+# Copyright 2017 Ericsson
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nova import context
+from nova import objects
+from nova import test
+from nova.tests import fixtures as nova_fixtures
+from nova.tests.functional import fixtures as func_fixtures
+from nova.tests.functional import integrated_helpers
+
+
+class IgnoreDeletedServerGroupsTest(
+    test.TestCase, integrated_helpers.InstanceHelperMixin,
+):
+    """Regression test for bug 1890244
+
+    If instance are created as member of server groups it
+    should be possibel to evacuate them if the server groups are
+    deleted prior to the host failure.
+    """
+
+    def setUp(self):
+        super().setUp()
+        # Stub out external dependencies.
+        self.useFixture(nova_fixtures.NeutronFixture(self))
+        self.useFixture(nova_fixtures.GlanceFixture(self))
+        self.useFixture(func_fixtures.PlacementFixture())
+        # Start nova controller services.
+        api_fixture = self.useFixture(nova_fixtures.OSAPIFixture(
+            api_version='v2.1'))
+        self.api = api_fixture.admin_api
+        self.start_service('conductor')
+        # Use a custom weigher to make sure that we have a predictable
+        # scheduling sort order.
+        self.useFixture(nova_fixtures.HostNameWeigherFixture())
+        self.start_service('scheduler')
+        # Start two computes, one where the server will be created and another
+        # where we'll evacuate it to.
+        self.src = self._start_compute('host1')
+        self.dest = self._start_compute('host2')
+        self.notifier = self.useFixture(
+            nova_fixtures.NotificationFixture(self)
+        )
+
+    def test_evacuate_after_group_delete(self):
+        # Create an anti-affinity group for the server.
+        body = {
+            'server_group': {
+                'name': 'test-group',
+                'policies': ['anti-affinity']
+            }
+        }
+        group_id = self.api.api_post(
+            '/os-server-groups', body).body['server_group']['id']
+
+        # Create a server in the group which should land on host1 due to our
+        # custom weigher.
+        body = {'server': self._build_server()}
+        body['os:scheduler_hints'] = {'group': group_id}
+        server = self.api.post_server(body)
+        server = self._wait_for_state_change(server, 'ACTIVE')
+        self.assertEqual('host1', server['OS-EXT-SRV-ATTR:host'])
+
+        # Down the source compute to enable the evacuation
+        self.api.microversion = '2.11'     # Cap for the force-down call.
+        self.api.force_down_service('host1', 'nova-compute', True)
+        self.api.microversion = 'latest'
+        self.src.stop()
+
+        # assert the server currently has a server group
+        reqspec = objects.RequestSpec.get_by_instance_uuid(
+            context.get_admin_context(), server['id'])
+        self.assertIsNotNone(reqspec.instance_group)
+        self.assertIn('group', reqspec.scheduler_hints)
+        # then delete it so that we need to clean it up on evac
+        self.api.api_delete(f'/os-server-groups/{group_id}')
+
+        # Initiate evacuation
+        server = self._evacuate_server(
+            server, expected_host='host2', expected_migration_status='done'
+        )
+        reqspec = objects.RequestSpec.get_by_instance_uuid(
+            context.get_admin_context(), server['id'])
+        self.assertIsNone(reqspec.instance_group)
+        self.assertNotIn('group', reqspec.scheduler_hints)
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@ -7601,11 +7601,14 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
        mock_get.side_effect = exception.InstanceGroupNotFound(
            group_uuid=uuids.group_hint
        )
-        # FIXME(sean-k-mooney): this should not leak the exception
-        self.assertRaises(
-            exception.InstanceGroupNotFound,
-            self.compute._validate_instance_group_policy, self.context,
-            instance, hints)
+        # This implicitly asserts that no exception is raised since
+        # uncaught exceptions would be treated as a test failure.
+        self.compute._validate_instance_group_policy(
+            self.context, instance, hints
+        )
+        # and this just assert that we did in fact invoke the method
+        # that raises to ensure that if we refactor in the future this
+        # this test will fail if the function we mock is no longer called.
        mock_get.assert_called_once_with(self.context, uuids.group_hint)

    @mock.patch('nova.objects.InstanceGroup.get_by_uuid')
--- a/nova/tests/unit/objects/test_request_spec.py
+++ b/nova/tests/unit/objects/test_request_spec.py
@ -621,6 +621,30 @@ class _TestRequestSpecObject(object):
        self.assertIsInstance(req_obj.instance_group, objects.InstanceGroup)
        self.assertEqual('fresh', req_obj.instance_group.name)

+    @mock.patch.object(
+        request_spec.RequestSpec, '_get_by_instance_uuid_from_db'
+    )
+    @mock.patch('nova.objects.InstanceGroup.get_by_uuid')
+    def test_get_by_instance_uuid_deleted_group(
+            self, mock_get_ig, get_by_uuid
+    ):
+        fake_spec_obj = fake_request_spec.fake_spec_obj()
+        fake_spec_obj.scheduler_hints['group'] = ['fresh']
+        fake_spec = fake_request_spec.fake_db_spec(fake_spec_obj)
+        get_by_uuid.return_value = fake_spec
+        mock_get_ig.side_effect = exception.InstanceGroupNotFound(
+            group_uuid=uuids.instgroup
+        )
+
+        req_obj = request_spec.RequestSpec.get_by_instance_uuid(
+            self.context, fake_spec['instance_uuid']
+        )
+        # assert that both the instance_group object and scheduler hint
+        # are cleared if the instance_group was deleted since the request
+        # spec was last saved to the db.
+        self.assertIsNone(req_obj.instance_group, objects.InstanceGroup)
+        self.assertEqual({'hint': ['over-there']}, req_obj.scheduler_hints)
+
    @mock.patch('nova.objects.request_spec.RequestSpec.save')
    @mock.patch.object(
        request_spec.RequestSpec, '_get_by_instance_uuid_from_db')
--- a/releasenotes/notes/fix-group-policy-validation-with-deleted-groups-4f685fd1d6b84192.yaml
+++ b/releasenotes/notes/fix-group-policy-validation-with-deleted-groups-4f685fd1d6b84192.yaml
@ -0,0 +1,13 @@
+---
+fixes:
+  - |
+    When the server group policy validation upcall is enabled
+    nova will assert that the policy is not violated on move operations
+    and initial instance creation. As noted in `bug 1890244`_, if a
+    server was created in a server group and that group was later deleted
+    the validation upcall would fail due to an uncaught excpetion if the
+    server group was deleted. This prevented evacuate and other move
+    operations form functioning. This has now been fixed and nova will
+    ignore deleted server groups.
+
+    .. _bug 1890244: https://bugs.launchpad.net/nova/+bug/1890244