Merge "Prevent deletion of a compute node belonging to another host" into stable/wallaby

Zuul 2022-06-04 08:19:57 +00:00 committed by Gerrit Code Review
commit 68cad8fb98
9 changed files with 127 additions and 27 deletions

View File

@@ -10014,17 +10014,30 @@ class ComputeManager(manager.Manager):
                          "nodes are %(nodes)s",
                          {'id': cn.id, 'hh': cn.hypervisor_hostname,
                           'nodes': nodenames})
-                cn.destroy()
-                self.rt.remove_node(cn.hypervisor_hostname)
-                # Delete the corresponding resource provider in placement,
-                # along with any associated allocations.
                 try:
-                    self.reportclient.delete_resource_provider(context, cn,
-                                                               cascade=True)
-                except keystone_exception.ClientException as e:
-                    LOG.error(
-                        "Failed to delete compute node resource provider "
-                        "for compute node %s: %s", cn.uuid, str(e))
+                    cn.destroy()
+                except exception.ObjectActionError:
+                    # NOTE(mgoddard): it's possible that another compute
+                    # service took ownership of this compute node since we
+                    # queried it due to a rebalance, and this will cause the
+                    # deletion to fail. Ignore the error in that case.
+                    LOG.info("Ignoring failure to delete orphan compute node "
+                             "%(id)s on hypervisor host %(hh)s due to "
+                             "possible node rebalance",
+                             {'id': cn.id, 'hh': cn.hypervisor_hostname})
+                    self.rt.remove_node(cn.hypervisor_hostname)
+                    self.reportclient.invalidate_resource_provider(cn.uuid)
+                else:
+                    self.rt.remove_node(cn.hypervisor_hostname)
+                    # Delete the corresponding resource provider in placement,
+                    # along with any associated allocations.
+                    try:
+                        self.reportclient.delete_resource_provider(
+                            context, cn, cascade=True)
+                    except keystone_exception.ClientException as e:
+                        LOG.error(
+                            "Failed to delete compute node resource provider "
+                            "for compute node %s: %s", cn.uuid, str(e))

         for nodename in nodenames:
             self._update_available_resource_for_node(context, nodename,
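
Read together, the new block wraps cn.destroy() so that placement cleanup only runs when the delete actually happened. The try/except/else shape carries that logic: the else branch executes only if no exception was raised. A minimal, self-contained sketch of the same pattern (the names below are illustrative, not Nova code):

    class HostChanged(Exception):
        """Stand-in for nova.exception.ObjectActionError."""
        pass

    def delete_orphan(node, still_ours):
        try:
            if not still_ours:
                # Simulates the DB constraint failing because another
                # nova-compute took ownership after the node list was read.
                raise HostChanged()
            print('deleted compute node %s' % node)
        except HostChanged:
            # Rebalanced node: leave the record alone, only drop local caches.
            print('ignoring %s, it now belongs to another host' % node)
        else:
            # Runs only when the destroy above succeeded.
            print('removing placement resource provider for %s' % node)

    delete_orphan('node1', still_ours=True)
    delete_orphan('node1', still_ours=False)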

View File

@@ -761,14 +761,24 @@ def compute_node_update(context, compute_id, values):
 @pick_context_manager_writer
-def compute_node_delete(context, compute_id):
+def compute_node_delete(context, compute_id, constraint=None):
     """Delete a ComputeNode record."""
-    result = model_query(context, models.ComputeNode).\
-        filter_by(id=compute_id).\
-        soft_delete(synchronize_session=False)
+    query = model_query(context, models.ComputeNode).filter_by(id=compute_id)
+    if constraint is not None:
+        query = constraint.apply(models.ComputeNode, query)
+    result = query.soft_delete(synchronize_session=False)

     if not result:
-        raise exception.ComputeHostNotFound(host=compute_id)
+        # The soft_delete could fail for one of two reasons:
+        # 1) The compute node no longer exists
+        # 2) The constraint, if specified, was not met
+        # Try to read the compute node and let it raise ComputeHostNotFound if
+        # 1) happened.
+        compute_node_get(context, compute_id)
+        # Else, raise ConstraintNotMet if 2) happened.
+        raise exception.ConstraintNotMet()


 @pick_context_manager_reader
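
The new constraint parameter makes the soft-delete conditional on the row still matching an expected value. A usage sketch based on the calls visible in this change (the admin context and the compute node id 42 are placeholders):

    from nova import context as nova_context
    from nova.db import api as db
    from nova.db.sqlalchemy import api as sa_api
    from nova import exception

    ctxt = nova_context.get_admin_context()
    # Delete compute node 42 only if it is still owned by 'host-a'.
    constraint = db.constraint(host=db.equal_any('host-a'))
    try:
        sa_api.compute_node_delete(ctxt, 42, constraint=constraint)
    except exception.ComputeHostNotFound:
        # Reason 1): the compute node no longer exists.
        pass
    except exception.ConstraintNotMet:
        # Reason 2): another host took ownership between read and delete.
        pass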

View File

@@ -360,7 +360,20 @@ class ComputeNode(base.NovaPersistentObject, base.NovaObject):
     @base.remotable
     def destroy(self):
-        db.compute_node_delete(self._context, self.id)
+        if self.obj_attr_is_set('host') and self.host:
+            # NOTE(melwitt): If our host is set, avoid a race between
+            # nova-computes during ironic driver node rebalances which can
+            # change node ownership.
+            constraint = db.constraint(host=db.equal_any(self.host))
+        else:
+            constraint = None
+
+        try:
+            sa_api.compute_node_delete(
+                self._context, self.id, constraint=constraint)
+        except exception.ConstraintNotMet:
+            raise exception.ObjectActionError(action='destroy',
+                                              reason='host changed')

     def update_from_virt_driver(self, resources):
         # NOTE(pmurray): the virt driver provides a dict of values that
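
At the object layer the same race now surfaces as ObjectActionError rather than a silent delete, which is what the ComputeManager change above catches. A short usage sketch (the host and node names are placeholders):

    from nova import context as nova_context
    from nova import exception
    from nova import objects

    ctxt = nova_context.get_admin_context()
    cn = objects.ComputeNode.get_by_host_and_nodename(ctxt, 'host-a', 'node1')
    try:
        cn.destroy()
    except exception.ObjectActionError:
        # cn.host changed in the database after this object was loaded,
        # most likely because an ironic node rebalance moved the node to
        # another nova-compute service.
        pass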

View File

@@ -133,24 +133,35 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase(
         # Mock out the compute node query to simulate a race condition where
         # the list includes an orphan compute node that is newly owned by
         # host_b by the time host_a attempts to delete it.
-        # FIXME(mgoddard): Ideally host_a would not delete a node that does not
-        # belong to it. See https://bugs.launchpad.net/nova/+bug/1853009.
         with mock.patch(
             'nova.compute.manager.ComputeManager._get_compute_nodes_in_db'
         ) as mock_get:
             mock_get.return_value = [cn]
             host_a.manager.update_available_resource(self.ctxt)

-        # Verify that the node was deleted.
+        # Verify that the node was almost deleted, but was saved by the host
+        # check
         self.assertIn(
             'Deleting orphan compute node %s hypervisor host '
             'is fake-node, nodes are' % cn.id,
             self.stdlog.logger.output)
-        hypervisors = self.api.api_get(
-            '/os-hypervisors/detail').body['hypervisors']
-        self.assertEqual(0, len(hypervisors), hypervisors)
+        self.assertIn(
+            'Ignoring failure to delete orphan compute node %s on '
+            'hypervisor host fake-node' % cn.id,
+            self.stdlog.logger.output)
+        self._assert_hypervisor_api(self.nodename, 'host_b')
         rps = self._get_all_providers()
-        self.assertEqual(0, len(rps), rps)
+        self.assertEqual(1, len(rps), rps)
+        self.assertEqual(self.nodename, rps[0]['name'])
+
+        # Simulate deletion of an orphan by host_a. It shouldn't happen
+        # anymore, but let's assume it already did.
+        cn = objects.ComputeNode.get_by_host_and_nodename(
+            self.ctxt, 'host_b', self.nodename)
+        cn.destroy()
+        host_a.manager.rt.remove_node(cn.hypervisor_hostname)
+        host_a.manager.reportclient.delete_resource_provider(
+            self.ctxt, cn, cascade=True)

         # host_b[3]: Should recreate compute node and resource provider.
         # FIXME(mgoddard): Resource provider not recreated here, due to

View File

@@ -200,13 +200,14 @@ class BaseTestCase(test.TestCase):
                         context, objects.ComputeNode(), cn)
                     for cn in fake_compute_nodes]

-        def fake_compute_node_delete(context, compute_node_id):
+        def fake_compute_node_delete(context, compute_node_id,
+                                     constraint=None):
             self.assertEqual(2, compute_node_id)

         self.stub_out(
             'nova.compute.manager.ComputeManager._get_compute_nodes_in_db',
             fake_get_compute_nodes_in_db)
-        self.stub_out('nova.db.api.compute_node_delete',
+        self.stub_out('nova.db.sqlalchemy.api.compute_node_delete',
                       fake_compute_node_delete)

         self.compute.update_available_resource(

View File

@@ -409,6 +409,33 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
         update_mock.assert_not_called()
         del_rp_mock.assert_not_called()

+    @mock.patch.object(manager.ComputeManager,
+                       '_update_available_resource_for_node')
+    @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
+    @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
+    def test_update_available_resource_destroy_rebalance(
+            self, get_db_nodes, get_avail_nodes, update_mock):
+        mock_rt = self._mock_rt()
+        rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
+            self.compute, 'reportclient')).mock
+        db_nodes = [self._make_compute_node('node1', 1)]
+        get_db_nodes.return_value = db_nodes
+        # Destroy can fail if nodes were rebalanced between getting the node
+        # list and calling destroy.
+        db_nodes[0].destroy.side_effect = exception.ObjectActionError(
+            action='destroy', reason='host changed')
+        get_avail_nodes.return_value = set()
+        self.compute.update_available_resource(self.context)
+        get_db_nodes.assert_called_once_with(self.context, set(),
+                                             use_slave=True, startup=False)
+        self.assertEqual(0, update_mock.call_count)
+
+        db_nodes[0].destroy.assert_called_once_with()
+        self.assertEqual(0, rc_mock.delete_resource_provider.call_count)
+        mock_rt.remove_node.assert_called_once_with('node1')
+        rc_mock.invalidate_resource_provider.assert_called_once_with(
+            db_nodes[0].uuid)
+
     @mock.patch('nova.context.get_admin_context')
     def test_pre_start_hook(self, get_admin_context):
         """Very simple test just to make sure update_available_resource is

View File

@@ -5545,6 +5545,13 @@ class ComputeNodeTestCase(test.TestCase, ModelsObjectComparatorMixin):
         nodes = db.compute_node_get_all(self.ctxt)
         self.assertEqual(len(nodes), 0)

+    def test_compute_node_delete_different_host(self):
+        compute_node_id = self.item['id']
+        constraint = db.constraint(host=db.equal_any('different-host'))
+        self.assertRaises(exception.ConstraintNotMet,
+                          sqlalchemy_api.compute_node_delete,
+                          self.ctxt, compute_node_id, constraint=constraint)
+
     def test_compute_node_search_by_hypervisor(self):
         nodes_created = []
         new_service = copy.copy(self.service_dict)

View File

@@ -24,6 +24,7 @@ from oslo_versionedobjects import exception as ovo_exc

 from nova import conf
 from nova.db import api as db
+from nova.db.sqlalchemy import api as sa_api
 from nova import exception
 from nova import objects
 from nova.objects import base
@@ -410,12 +411,22 @@ class _TestComputeNodeObject(object):
         self.assertRaises(ovo_exc.ReadOnlyFieldError, setattr,
                           compute, 'id', 124)

-    @mock.patch.object(db, 'compute_node_delete')
+    @mock.patch.object(sa_api, 'compute_node_delete')
     def test_destroy(self, mock_delete):
         compute = compute_node.ComputeNode(context=self.context)
         compute.id = 123
         compute.destroy()
-        mock_delete.assert_called_once_with(self.context, 123)
+        mock_delete.assert_called_once_with(self.context, 123, constraint=None)
+
+    def test_destroy_host_constraint(self):
+        # Create compute node with host='fake'
+        compute = fake_compute_with_resources.obj_clone()
+        compute._context = self.context
+        compute.host = 'fake'
+        compute.create()
+        # Simulate a compute node ownership change due to a node rebalance
+        compute.host = 'different'
+        self.assertRaises(exception.ObjectActionError, compute.destroy)

     @mock.patch.object(db, 'compute_node_get_all')
     def test_get_all(self, mock_get_all):

View File

@@ -0,0 +1,7 @@
+---
+fixes:
+  - |
+    Fixes an issue with multiple ``nova-compute`` services used with Ironic,
+    where a rebalance operation could result in a compute node being deleted
+    from the database and not recreated. See `bug 1853009
+    <https://bugs.launchpad.net/nova/+bug/1853009>`__ for details.