diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 24417258dcab..bcbe34b6e683 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -10046,17 +10046,30 @@ class ComputeManager(manager.Manager): "nodes are %(nodes)s", {'id': cn.id, 'hh': cn.hypervisor_hostname, 'nodes': nodenames}) - cn.destroy() - self.rt.remove_node(cn.hypervisor_hostname) - # Delete the corresponding resource provider in placement, - # along with any associated allocations. try: - self.reportclient.delete_resource_provider(context, cn, - cascade=True) - except keystone_exception.ClientException as e: - LOG.error( - "Failed to delete compute node resource provider " - "for compute node %s: %s", cn.uuid, str(e)) + cn.destroy() + except exception.ObjectActionError: + # NOTE(mgoddard): it's possible that another compute + # service took ownership of this compute node since we + # queried it due to a rebalance, and this will cause the + # deletion to fail. Ignore the error in that case. + LOG.info("Ignoring failure to delete orphan compute node " + "%(id)s on hypervisor host %(hh)s due to " + "possible node rebalance", + {'id': cn.id, 'hh': cn.hypervisor_hostname}) + self.rt.remove_node(cn.hypervisor_hostname) + self.reportclient.invalidate_resource_provider(cn.uuid) + else: + self.rt.remove_node(cn.hypervisor_hostname) + # Delete the corresponding resource provider in placement, + # along with any associated allocations. + try: + self.reportclient.delete_resource_provider( + context, cn, cascade=True) + except keystone_exception.ClientException as e: + LOG.error( + "Failed to delete compute node resource provider " + "for compute node %s: %s", cn.uuid, str(e)) for nodename in nodenames: self._update_available_resource_for_node(context, nodename, diff --git a/nova/db/main/api.py b/nova/db/main/api.py index d5949463feeb..57570ddc412f 100644 --- a/nova/db/main/api.py +++ b/nova/db/main/api.py @@ -855,20 +855,33 @@ def compute_node_update(context, compute_id, values): @pick_context_manager_writer -def compute_node_delete(context, compute_id): +def compute_node_delete(context, compute_id, constraint=None): """Delete a compute node from the database. :param context: The security context :param compute_id: ID of the compute node + :param constraint: a constraint object + :raises: ComputeHostNotFound if compute node with the given ID doesn't exist. + :raises: ConstraintNotMet if a constraint was specified and it was not met. """ - result = model_query(context, models.ComputeNode).\ - filter_by(id=compute_id).\ - soft_delete(synchronize_session=False) + query = model_query(context, models.ComputeNode).filter_by(id=compute_id) + + if constraint is not None: + query = constraint.apply(models.ComputeNode, query) + + result = query.soft_delete(synchronize_session=False) if not result: - raise exception.ComputeHostNotFound(host=compute_id) + # The soft_delete could fail for one of two reasons: + # 1) The compute node no longer exists + # 2) The constraint, if specified, was not met + # Try to read the compute node and let it raise ComputeHostNotFound if + # 1) happened. + compute_node_get(context, compute_id) + # Else, raise ConstraintNotMet if 2) happened. + raise exception.ConstraintNotMet() @pick_context_manager_reader diff --git a/nova/objects/compute_node.py b/nova/objects/compute_node.py index 41e0c87a564b..60c2be71cd0b 100644 --- a/nova/objects/compute_node.py +++ b/nova/objects/compute_node.py @@ -358,7 +358,20 @@ class ComputeNode(base.NovaPersistentObject, base.NovaObject): @base.remotable def destroy(self): - db.compute_node_delete(self._context, self.id) + if self.obj_attr_is_set('host') and self.host: + # NOTE(melwitt): If our host is set, avoid a race between + # nova-computes during ironic driver node rebalances which can + # change node ownership. + constraint = db.constraint(host=db.equal_any(self.host)) + else: + constraint = None + + try: + db.compute_node_delete( + self._context, self.id, constraint=constraint) + except exception.ConstraintNotMet: + raise exception.ObjectActionError(action='destroy', + reason='host changed') def update_from_virt_driver(self, resources): # NOTE(pmurray): the virt driver provides a dict of values that diff --git a/nova/tests/functional/regressions/test_bug_1853009.py b/nova/tests/functional/regressions/test_bug_1853009.py index dddc79606f01..a3ccd69b1b60 100644 --- a/nova/tests/functional/regressions/test_bug_1853009.py +++ b/nova/tests/functional/regressions/test_bug_1853009.py @@ -133,24 +133,35 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase( # Mock out the compute node query to simulate a race condition where # the list includes an orphan compute node that is newly owned by # host_b by the time host_a attempts to delete it. - # FIXME(mgoddard): Ideally host_a would not delete a node that does not - # belong to it. See https://bugs.launchpad.net/nova/+bug/1853009. with mock.patch( 'nova.compute.manager.ComputeManager._get_compute_nodes_in_db' ) as mock_get: mock_get.return_value = [cn] host_a.manager.update_available_resource(self.ctxt) - # Verify that the node was deleted. + # Verify that the node was almost deleted, but was saved by the host + # check self.assertIn( 'Deleting orphan compute node %s hypervisor host ' 'is fake-node, nodes are' % cn.id, self.stdlog.logger.output) - hypervisors = self.api.api_get( - '/os-hypervisors/detail').body['hypervisors'] - self.assertEqual(0, len(hypervisors), hypervisors) + self.assertIn( + 'Ignoring failure to delete orphan compute node %s on ' + 'hypervisor host fake-node' % cn.id, + self.stdlog.logger.output) + self._assert_hypervisor_api(self.nodename, 'host_b') rps = self._get_all_providers() - self.assertEqual(0, len(rps), rps) + self.assertEqual(1, len(rps), rps) + self.assertEqual(self.nodename, rps[0]['name']) + + # Simulate deletion of an orphan by host_a. It shouldn't happen + # anymore, but let's assume it already did. + cn = objects.ComputeNode.get_by_host_and_nodename( + self.ctxt, 'host_b', self.nodename) + cn.destroy() + host_a.manager.rt.remove_node(cn.hypervisor_hostname) + host_a.manager.reportclient.delete_resource_provider( + self.ctxt, cn, cascade=True) # host_b[3]: Should recreate compute node and resource provider. # FIXME(mgoddard): Resource provider not recreated here, due to diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py index 078616599704..08d4d3632c6a 100644 --- a/nova/tests/unit/compute/test_compute.py +++ b/nova/tests/unit/compute/test_compute.py @@ -199,7 +199,8 @@ class BaseTestCase(test.TestCase): context, objects.ComputeNode(), cn) for cn in fake_compute_nodes] - def fake_compute_node_delete(context, compute_node_id): + def fake_compute_node_delete(context, compute_node_id, + constraint=None): self.assertEqual(2, compute_node_id) self.stub_out( diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index f0a69a4fdeaf..ee04f71b7322 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -408,6 +408,33 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, update_mock.assert_not_called() del_rp_mock.assert_not_called() + @mock.patch.object(manager.ComputeManager, + '_update_available_resource_for_node') + @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes') + @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db') + def test_update_available_resource_destroy_rebalance( + self, get_db_nodes, get_avail_nodes, update_mock): + mock_rt = self._mock_rt() + rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject( + self.compute, 'reportclient')).mock + db_nodes = [self._make_compute_node('node1', 1)] + get_db_nodes.return_value = db_nodes + # Destroy can fail if nodes were rebalanced between getting the node + # list and calling destroy. + db_nodes[0].destroy.side_effect = exception.ObjectActionError( + action='destroy', reason='host changed') + get_avail_nodes.return_value = set() + self.compute.update_available_resource(self.context) + get_db_nodes.assert_called_once_with(self.context, set(), + use_slave=True, startup=False) + self.assertEqual(0, update_mock.call_count) + + db_nodes[0].destroy.assert_called_once_with() + self.assertEqual(0, rc_mock.delete_resource_provider.call_count) + mock_rt.remove_node.assert_called_once_with('node1') + rc_mock.invalidate_resource_provider.assert_called_once_with( + db_nodes[0].uuid) + @mock.patch('nova.context.get_admin_context') def test_pre_start_hook(self, get_admin_context): """Very simple test just to make sure update_available_resource is diff --git a/nova/tests/unit/db/main/test_api.py b/nova/tests/unit/db/main/test_api.py index de92f5feb7c8..1fd82066d350 100644 --- a/nova/tests/unit/db/main/test_api.py +++ b/nova/tests/unit/db/main/test_api.py @@ -5471,6 +5471,13 @@ class ComputeNodeTestCase(test.TestCase, ModelsObjectComparatorMixin): nodes = db.compute_node_get_all(self.ctxt) self.assertEqual(len(nodes), 0) + def test_compute_node_delete_different_host(self): + compute_node_id = self.item['id'] + constraint = db.constraint(host=db.equal_any('different-host')) + self.assertRaises(exception.ConstraintNotMet, + db.compute_node_delete, + self.ctxt, compute_node_id, constraint=constraint) + def test_compute_node_search_by_hypervisor(self): nodes_created = [] new_service = copy.copy(self.service_dict) diff --git a/nova/tests/unit/objects/test_compute_node.py b/nova/tests/unit/objects/test_compute_node.py index 5b6a456f853d..297edfbd55a1 100644 --- a/nova/tests/unit/objects/test_compute_node.py +++ b/nova/tests/unit/objects/test_compute_node.py @@ -419,7 +419,17 @@ class _TestComputeNodeObject(object): compute = compute_node.ComputeNode(context=self.context) compute.id = 123 compute.destroy() - mock_delete.assert_called_once_with(self.context, 123) + mock_delete.assert_called_once_with(self.context, 123, constraint=None) + + def test_destroy_host_constraint(self): + # Create compute node with host='fake' + compute = fake_compute_with_resources.obj_clone() + compute._context = self.context + compute.host = 'fake' + compute.create() + # Simulate a compute node ownership change due to a node rebalance + compute.host = 'different' + self.assertRaises(exception.ObjectActionError, compute.destroy) @mock.patch.object(db, 'compute_node_get_all') def test_get_all(self, mock_get_all): diff --git a/releasenotes/notes/bug-1853009-99414e14d1491b5f.yaml b/releasenotes/notes/bug-1853009-99414e14d1491b5f.yaml new file mode 100644 index 000000000000..d6e71c422241 --- /dev/null +++ b/releasenotes/notes/bug-1853009-99414e14d1491b5f.yaml @@ -0,0 +1,7 @@ +--- +fixes: + - | + Fixes an issue with multiple ``nova-compute`` services used with Ironic, + where a rebalance operation could result in a compute node being deleted + from the database and not recreated. See `bug 1853009 + `__ for details.