From a8492e88783b40f6dc61888fada232f0d00d6acf Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Mon, 18 Nov 2019 12:06:47 +0000 Subject: [PATCH] Prevent deletion of a compute node belonging to another host There is a race condition in nova-compute with the ironic virt driver as nodes get rebalanced. It can lead to compute nodes being removed in the DB and not repopulated. Ultimately this prevents these nodes from being scheduled to. The main race condition involved is in update_available_resources in the compute manager. When the list of compute nodes is queried, there is a compute node belonging to the host that it does not expect to be managing, i.e. it is an orphan. Between that time and deleting the orphan, the real owner of the compute node takes ownership of it (in the resource tracker). However, the node is still deleted as the first host is unaware of the ownership change. This change prevents this from occurring by filtering on the host when deleting a compute node. If another compute host has taken ownership of a node, it will have updated the host field and this will prevent deletion from occurring. The first host sees this has happened via the ComputeHostNotFound exception, and avoids deleting its resource provider. 
Co-Authored-By: melanie witt Closes-Bug: #1853009 Related-Bug: #1841481 Change-Id: I260c1fded79a85d4899e94df4d9036a1ee437f02 --- nova/compute/manager.py | 33 +++++++++++++------ nova/db/main/api.py | 23 ++++++++++--- nova/objects/compute_node.py | 15 ++++++++- .../regressions/test_bug_1853009.py | 25 ++++++++++---- nova/tests/unit/compute/test_compute.py | 3 +- nova/tests/unit/compute/test_compute_mgr.py | 27 +++++++++++++++ nova/tests/unit/db/main/test_api.py | 7 ++++ nova/tests/unit/objects/test_compute_node.py | 12 ++++++- .../notes/bug-1853009-99414e14d1491b5f.yaml | 7 ++++ 9 files changed, 127 insertions(+), 25 deletions(-) create mode 100644 releasenotes/notes/bug-1853009-99414e14d1491b5f.yaml diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 24417258dcab..bcbe34b6e683 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -10046,17 +10046,30 @@ class ComputeManager(manager.Manager): "nodes are %(nodes)s", {'id': cn.id, 'hh': cn.hypervisor_hostname, 'nodes': nodenames}) - cn.destroy() - self.rt.remove_node(cn.hypervisor_hostname) - # Delete the corresponding resource provider in placement, - # along with any associated allocations. try: - self.reportclient.delete_resource_provider(context, cn, - cascade=True) - except keystone_exception.ClientException as e: - LOG.error( - "Failed to delete compute node resource provider " - "for compute node %s: %s", cn.uuid, str(e)) + cn.destroy() + except exception.ObjectActionError: + # NOTE(mgoddard): it's possible that another compute + # service took ownership of this compute node since we + # queried it due to a rebalance, and this will cause the + # deletion to fail. Ignore the error in that case. 
+ LOG.info("Ignoring failure to delete orphan compute node " + "%(id)s on hypervisor host %(hh)s due to " + "possible node rebalance", + {'id': cn.id, 'hh': cn.hypervisor_hostname}) + self.rt.remove_node(cn.hypervisor_hostname) + self.reportclient.invalidate_resource_provider(cn.uuid) + else: + self.rt.remove_node(cn.hypervisor_hostname) + # Delete the corresponding resource provider in placement, + # along with any associated allocations. + try: + self.reportclient.delete_resource_provider( + context, cn, cascade=True) + except keystone_exception.ClientException as e: + LOG.error( + "Failed to delete compute node resource provider " + "for compute node %s: %s", cn.uuid, str(e)) for nodename in nodenames: self._update_available_resource_for_node(context, nodename, diff --git a/nova/db/main/api.py b/nova/db/main/api.py index d5949463feeb..57570ddc412f 100644 --- a/nova/db/main/api.py +++ b/nova/db/main/api.py @@ -855,20 +855,33 @@ def compute_node_update(context, compute_id, values): @pick_context_manager_writer -def compute_node_delete(context, compute_id): +def compute_node_delete(context, compute_id, constraint=None): """Delete a compute node from the database. :param context: The security context :param compute_id: ID of the compute node + :param constraint: a constraint object + :raises: ComputeHostNotFound if compute node with the given ID doesn't exist. + :raises: ConstraintNotMet if a constraint was specified and it was not met. 
""" - result = model_query(context, models.ComputeNode).\ - filter_by(id=compute_id).\ - soft_delete(synchronize_session=False) + query = model_query(context, models.ComputeNode).filter_by(id=compute_id) + + if constraint is not None: + query = constraint.apply(models.ComputeNode, query) + + result = query.soft_delete(synchronize_session=False) if not result: - raise exception.ComputeHostNotFound(host=compute_id) + # The soft_delete could fail for one of two reasons: + # 1) The compute node no longer exists + # 2) The constraint, if specified, was not met + # Try to read the compute node and let it raise ComputeHostNotFound if + # 1) happened. + compute_node_get(context, compute_id) + # Else, raise ConstraintNotMet if 2) happened. + raise exception.ConstraintNotMet() @pick_context_manager_reader diff --git a/nova/objects/compute_node.py b/nova/objects/compute_node.py index 41e0c87a564b..60c2be71cd0b 100644 --- a/nova/objects/compute_node.py +++ b/nova/objects/compute_node.py @@ -358,7 +358,20 @@ class ComputeNode(base.NovaPersistentObject, base.NovaObject): @base.remotable def destroy(self): - db.compute_node_delete(self._context, self.id) + if self.obj_attr_is_set('host') and self.host: + # NOTE(melwitt): If our host is set, avoid a race between + # nova-computes during ironic driver node rebalances which can + # change node ownership. 
+ constraint = db.constraint(host=db.equal_any(self.host)) + else: + constraint = None + + try: + db.compute_node_delete( + self._context, self.id, constraint=constraint) + except exception.ConstraintNotMet: + raise exception.ObjectActionError(action='destroy', + reason='host changed') def update_from_virt_driver(self, resources): # NOTE(pmurray): the virt driver provides a dict of values that diff --git a/nova/tests/functional/regressions/test_bug_1853009.py b/nova/tests/functional/regressions/test_bug_1853009.py index dddc79606f01..a3ccd69b1b60 100644 --- a/nova/tests/functional/regressions/test_bug_1853009.py +++ b/nova/tests/functional/regressions/test_bug_1853009.py @@ -133,24 +133,35 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase( # Mock out the compute node query to simulate a race condition where # the list includes an orphan compute node that is newly owned by # host_b by the time host_a attempts to delete it. - # FIXME(mgoddard): Ideally host_a would not delete a node that does not - # belong to it. See https://bugs.launchpad.net/nova/+bug/1853009. with mock.patch( 'nova.compute.manager.ComputeManager._get_compute_nodes_in_db' ) as mock_get: mock_get.return_value = [cn] host_a.manager.update_available_resource(self.ctxt) - # Verify that the node was deleted. 
+ # Verify that the node was almost deleted, but was saved by the host + # check self.assertIn( 'Deleting orphan compute node %s hypervisor host ' 'is fake-node, nodes are' % cn.id, self.stdlog.logger.output) - hypervisors = self.api.api_get( - '/os-hypervisors/detail').body['hypervisors'] - self.assertEqual(0, len(hypervisors), hypervisors) + self.assertIn( + 'Ignoring failure to delete orphan compute node %s on ' + 'hypervisor host fake-node' % cn.id, + self.stdlog.logger.output) + self._assert_hypervisor_api(self.nodename, 'host_b') rps = self._get_all_providers() - self.assertEqual(0, len(rps), rps) + self.assertEqual(1, len(rps), rps) + self.assertEqual(self.nodename, rps[0]['name']) + + # Simulate deletion of an orphan by host_a. It shouldn't happen + # anymore, but let's assume it already did. + cn = objects.ComputeNode.get_by_host_and_nodename( + self.ctxt, 'host_b', self.nodename) + cn.destroy() + host_a.manager.rt.remove_node(cn.hypervisor_hostname) + host_a.manager.reportclient.delete_resource_provider( + self.ctxt, cn, cascade=True) # host_b[3]: Should recreate compute node and resource provider. 
# FIXME(mgoddard): Resource provider not recreated here, due to diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py index 078616599704..08d4d3632c6a 100644 --- a/nova/tests/unit/compute/test_compute.py +++ b/nova/tests/unit/compute/test_compute.py @@ -199,7 +199,8 @@ class BaseTestCase(test.TestCase): context, objects.ComputeNode(), cn) for cn in fake_compute_nodes] - def fake_compute_node_delete(context, compute_node_id): + def fake_compute_node_delete(context, compute_node_id, + constraint=None): self.assertEqual(2, compute_node_id) self.stub_out( diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index f0a69a4fdeaf..ee04f71b7322 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -408,6 +408,33 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, update_mock.assert_not_called() del_rp_mock.assert_not_called() + @mock.patch.object(manager.ComputeManager, + '_update_available_resource_for_node') + @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes') + @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db') + def test_update_available_resource_destroy_rebalance( + self, get_db_nodes, get_avail_nodes, update_mock): + mock_rt = self._mock_rt() + rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject( + self.compute, 'reportclient')).mock + db_nodes = [self._make_compute_node('node1', 1)] + get_db_nodes.return_value = db_nodes + # Destroy can fail if nodes were rebalanced between getting the node + # list and calling destroy. 
+ db_nodes[0].destroy.side_effect = exception.ObjectActionError( + action='destroy', reason='host changed') + get_avail_nodes.return_value = set() + self.compute.update_available_resource(self.context) + get_db_nodes.assert_called_once_with(self.context, set(), + use_slave=True, startup=False) + self.assertEqual(0, update_mock.call_count) + + db_nodes[0].destroy.assert_called_once_with() + self.assertEqual(0, rc_mock.delete_resource_provider.call_count) + mock_rt.remove_node.assert_called_once_with('node1') + rc_mock.invalidate_resource_provider.assert_called_once_with( + db_nodes[0].uuid) + @mock.patch('nova.context.get_admin_context') def test_pre_start_hook(self, get_admin_context): """Very simple test just to make sure update_available_resource is diff --git a/nova/tests/unit/db/main/test_api.py b/nova/tests/unit/db/main/test_api.py index de92f5feb7c8..1fd82066d350 100644 --- a/nova/tests/unit/db/main/test_api.py +++ b/nova/tests/unit/db/main/test_api.py @@ -5471,6 +5471,13 @@ class ComputeNodeTestCase(test.TestCase, ModelsObjectComparatorMixin): nodes = db.compute_node_get_all(self.ctxt) self.assertEqual(len(nodes), 0) + def test_compute_node_delete_different_host(self): + compute_node_id = self.item['id'] + constraint = db.constraint(host=db.equal_any('different-host')) + self.assertRaises(exception.ConstraintNotMet, + db.compute_node_delete, + self.ctxt, compute_node_id, constraint=constraint) + def test_compute_node_search_by_hypervisor(self): nodes_created = [] new_service = copy.copy(self.service_dict) diff --git a/nova/tests/unit/objects/test_compute_node.py b/nova/tests/unit/objects/test_compute_node.py index 5b6a456f853d..297edfbd55a1 100644 --- a/nova/tests/unit/objects/test_compute_node.py +++ b/nova/tests/unit/objects/test_compute_node.py @@ -419,7 +419,17 @@ class _TestComputeNodeObject(object): compute = compute_node.ComputeNode(context=self.context) compute.id = 123 compute.destroy() - mock_delete.assert_called_once_with(self.context, 123) + 
mock_delete.assert_called_once_with(self.context, 123, constraint=None) + + def test_destroy_host_constraint(self): + # Create compute node with host='fake' + compute = fake_compute_with_resources.obj_clone() + compute._context = self.context + compute.host = 'fake' + compute.create() + # Simulate a compute node ownership change due to a node rebalance + compute.host = 'different' + self.assertRaises(exception.ObjectActionError, compute.destroy) @mock.patch.object(db, 'compute_node_get_all') def test_get_all(self, mock_get_all): diff --git a/releasenotes/notes/bug-1853009-99414e14d1491b5f.yaml b/releasenotes/notes/bug-1853009-99414e14d1491b5f.yaml new file mode 100644 index 000000000000..d6e71c422241 --- /dev/null +++ b/releasenotes/notes/bug-1853009-99414e14d1491b5f.yaml @@ -0,0 +1,7 @@ +--- +fixes: + - | + Fixes an issue with multiple ``nova-compute`` services used with Ironic, + where a rebalance operation could result in a compute node being deleted + from the database and not recreated. See `bug 1853009 + `__ for details.