Merge "Prevent deletion of a compute node belonging to another host" into stable/wallaby

Zuul 2022-06-04 08:19:57 +00:00 committed by Gerrit Code Review
commit 68cad8fb98
9 changed files with 127 additions and 27 deletions

View File

@@ -10014,17 +10014,30 @@ class ComputeManager(manager.Manager):
                          "nodes are %(nodes)s",
                          {'id': cn.id, 'hh': cn.hypervisor_hostname,
                           'nodes': nodenames})
-                cn.destroy()
-                self.rt.remove_node(cn.hypervisor_hostname)
-                # Delete the corresponding resource provider in placement,
-                # along with any associated allocations.
                 try:
-                    self.reportclient.delete_resource_provider(context, cn,
-                                                               cascade=True)
-                except keystone_exception.ClientException as e:
-                    LOG.error(
-                        "Failed to delete compute node resource provider "
-                        "for compute node %s: %s", cn.uuid, str(e))
+                    cn.destroy()
+                except exception.ObjectActionError:
+                    # NOTE(mgoddard): it's possible that another compute
+                    # service took ownership of this compute node since we
+                    # queried it due to a rebalance, and this will cause the
+                    # deletion to fail. Ignore the error in that case.
+                    LOG.info("Ignoring failure to delete orphan compute node "
+                             "%(id)s on hypervisor host %(hh)s due to "
+                             "possible node rebalance",
+                             {'id': cn.id, 'hh': cn.hypervisor_hostname})
+                    self.rt.remove_node(cn.hypervisor_hostname)
+                    self.reportclient.invalidate_resource_provider(cn.uuid)
+                else:
+                    self.rt.remove_node(cn.hypervisor_hostname)
+                    # Delete the corresponding resource provider in placement,
+                    # along with any associated allocations.
+                    try:
+                        self.reportclient.delete_resource_provider(
+                            context, cn, cascade=True)
+                    except keystone_exception.ClientException as e:
+                        LOG.error(
+                            "Failed to delete compute node resource provider "
+                            "for compute node %s: %s", cn.uuid, str(e))

         for nodename in nodenames:
             self._update_available_resource_for_node(context, nodename,
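
Read together, the new block wraps cn.destroy() so that placement cleanup only runs when the delete actually happened. The try/except/else shape carries that logic: the else branch executes only if no exception was raised. A minimal, self-contained sketch of the same pattern (the names below are illustrative, not Nova code):

    class HostChanged(Exception):
        """Stand-in for nova.exception.ObjectActionError."""
        pass

    def delete_orphan(node, still_ours):
        try:
            if not still_ours:
                # Simulates the DB constraint failing because another
                # nova-compute took ownership after the node list was read.
                raise HostChanged()
            print('deleted compute node %s' % node)
        except HostChanged:
            # Rebalanced node: leave the record alone, only drop local caches.
            print('ignoring %s, it now belongs to another host' % node)
        else:
            # Runs only when the destroy above succeeded.
            print('removing placement resource provider for %s' % node)

    delete_orphan('node1', still_ours=True)
    delete_orphan('node1', still_ours=False)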

View File

@@ -761,14 +761,24 @@ def compute_node_update(context, compute_id, values):
 @pick_context_manager_writer
-def compute_node_delete(context, compute_id):
+def compute_node_delete(context, compute_id, constraint=None):
     """Delete a ComputeNode record."""
-    result = model_query(context, models.ComputeNode).\
-        filter_by(id=compute_id).\
-        soft_delete(synchronize_session=False)
+    query = model_query(context, models.ComputeNode).filter_by(id=compute_id)
+    if constraint is not None:
+        query = constraint.apply(models.ComputeNode, query)
+    result = query.soft_delete(synchronize_session=False)

     if not result:
-        raise exception.ComputeHostNotFound(host=compute_id)
+        # The soft_delete could fail for one of two reasons:
+        # 1) The compute node no longer exists
+        # 2) The constraint, if specified, was not met
+        # Try to read the compute node and let it raise ComputeHostNotFound if
+        # 1) happened.
+        compute_node_get(context, compute_id)
+        # Else, raise ConstraintNotMet if 2) happened.
+        raise exception.ConstraintNotMet()


 @pick_context_manager_reader
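
The new constraint parameter makes the soft-delete conditional on the row still matching an expected value. A usage sketch based on the calls visible in this change (the admin context and the compute node id 42 are placeholders):

    from nova import context as nova_context
    from nova.db import api as db
    from nova.db.sqlalchemy import api as sa_api
    from nova import exception

    ctxt = nova_context.get_admin_context()
    # Delete compute node 42 only if it is still owned by 'host-a'.
    constraint = db.constraint(host=db.equal_any('host-a'))
    try:
        sa_api.compute_node_delete(ctxt, 42, constraint=constraint)
    except exception.ComputeHostNotFound:
        # Reason 1): the compute node no longer exists.
        pass
    except exception.ConstraintNotMet:
        # Reason 2): another host took ownership between read and delete.
        pass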

View File

@@ -360,7 +360,20 @@ class ComputeNode(base.NovaPersistentObject, base.NovaObject):
     @base.remotable
     def destroy(self):
-        db.compute_node_delete(self._context, self.id)
+        if self.obj_attr_is_set('host') and self.host:
+            # NOTE(melwitt): If our host is set, avoid a race between
+            # nova-computes during ironic driver node rebalances which can
+            # change node ownership.
+            constraint = db.constraint(host=db.equal_any(self.host))
+        else:
+            constraint = None
+
+        try:
+            sa_api.compute_node_delete(
+                self._context, self.id, constraint=constraint)
+        except exception.ConstraintNotMet:
+            raise exception.ObjectActionError(action='destroy',
+                                              reason='host changed')

     def update_from_virt_driver(self, resources):
         # NOTE(pmurray): the virt driver provides a dict of values that
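
At the object layer the same race now surfaces as ObjectActionError rather than a silent delete, which is what the ComputeManager change above catches. A short usage sketch (the host and node names are placeholders):

    from nova import context as nova_context
    from nova import exception
    from nova import objects

    ctxt = nova_context.get_admin_context()
    cn = objects.ComputeNode.get_by_host_and_nodename(ctxt, 'host-a', 'node1')
    try:
        cn.destroy()
    except exception.ObjectActionError:
        # cn.host changed in the database after this object was loaded,
        # most likely because an ironic node rebalance moved the node to
        # another nova-compute service.
        pass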

View File

@@ -133,24 +133,35 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase(
         # Mock out the compute node query to simulate a race condition where
         # the list includes an orphan compute node that is newly owned by
         # host_b by the time host_a attempts to delete it.
-        # FIXME(mgoddard): Ideally host_a would not delete a node that does not
-        # belong to it. See https://bugs.launchpad.net/nova/+bug/1853009.
         with mock.patch(
             'nova.compute.manager.ComputeManager._get_compute_nodes_in_db'
         ) as mock_get:
             mock_get.return_value = [cn]
             host_a.manager.update_available_resource(self.ctxt)

-        # Verify that the node was deleted.
+        # Verify that the node was almost deleted, but was saved by the host
+        # check
         self.assertIn(
             'Deleting orphan compute node %s hypervisor host '
             'is fake-node, nodes are' % cn.id,
             self.stdlog.logger.output)
-        hypervisors = self.api.api_get(
-            '/os-hypervisors/detail').body['hypervisors']
-        self.assertEqual(0, len(hypervisors), hypervisors)
+        self.assertIn(
+            'Ignoring failure to delete orphan compute node %s on '
+            'hypervisor host fake-node' % cn.id,
+            self.stdlog.logger.output)
+        self._assert_hypervisor_api(self.nodename, 'host_b')
         rps = self._get_all_providers()
-        self.assertEqual(0, len(rps), rps)
+        self.assertEqual(1, len(rps), rps)
+        self.assertEqual(self.nodename, rps[0]['name'])
+
+        # Simulate deletion of an orphan by host_a. It shouldn't happen
+        # anymore, but let's assume it already did.
+        cn = objects.ComputeNode.get_by_host_and_nodename(
+            self.ctxt, 'host_b', self.nodename)
+        cn.destroy()
+        host_a.manager.rt.remove_node(cn.hypervisor_hostname)
+        host_a.manager.reportclient.delete_resource_provider(
+            self.ctxt, cn, cascade=True)

         # host_b[3]: Should recreate compute node and resource provider.
         # FIXME(mgoddard): Resource provider not recreated here, due to

View File

@@ -200,13 +200,14 @@ class BaseTestCase(test.TestCase):
                         context, objects.ComputeNode(), cn)
                     for cn in fake_compute_nodes]

-        def fake_compute_node_delete(context, compute_node_id):
+        def fake_compute_node_delete(context, compute_node_id,
+                                     constraint=None):
             self.assertEqual(2, compute_node_id)

         self.stub_out(
             'nova.compute.manager.ComputeManager._get_compute_nodes_in_db',
             fake_get_compute_nodes_in_db)
-        self.stub_out('nova.db.api.compute_node_delete',
+        self.stub_out('nova.db.sqlalchemy.api.compute_node_delete',
                       fake_compute_node_delete)

         self.compute.update_available_resource(

View File

@@ -409,6 +409,33 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
         update_mock.assert_not_called()
         del_rp_mock.assert_not_called()

+    @mock.patch.object(manager.ComputeManager,
+                       '_update_available_resource_for_node')
+    @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
+    @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
+    def test_update_available_resource_destroy_rebalance(
+            self, get_db_nodes, get_avail_nodes, update_mock):
+        mock_rt = self._mock_rt()
+        rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
+            self.compute, 'reportclient')).mock
+        db_nodes = [self._make_compute_node('node1', 1)]
+        get_db_nodes.return_value = db_nodes
+        # Destroy can fail if nodes were rebalanced between getting the node
+        # list and calling destroy.
+        db_nodes[0].destroy.side_effect = exception.ObjectActionError(
+            action='destroy', reason='host changed')
+        get_avail_nodes.return_value = set()
+        self.compute.update_available_resource(self.context)
+        get_db_nodes.assert_called_once_with(self.context, set(),
+                                             use_slave=True, startup=False)
+        self.assertEqual(0, update_mock.call_count)
+
+        db_nodes[0].destroy.assert_called_once_with()
+        self.assertEqual(0, rc_mock.delete_resource_provider.call_count)
+        mock_rt.remove_node.assert_called_once_with('node1')
+        rc_mock.invalidate_resource_provider.assert_called_once_with(
+            db_nodes[0].uuid)
+
     @mock.patch('nova.context.get_admin_context')
     def test_pre_start_hook(self, get_admin_context):
         """Very simple test just to make sure update_available_resource is

View File

@@ -5545,6 +5545,13 @@ class ComputeNodeTestCase(test.TestCase, ModelsObjectComparatorMixin):
         nodes = db.compute_node_get_all(self.ctxt)
         self.assertEqual(len(nodes), 0)

+    def test_compute_node_delete_different_host(self):
+        compute_node_id = self.item['id']
+        constraint = db.constraint(host=db.equal_any('different-host'))
+        self.assertRaises(exception.ConstraintNotMet,
+                          sqlalchemy_api.compute_node_delete,
+                          self.ctxt, compute_node_id, constraint=constraint)
+
     def test_compute_node_search_by_hypervisor(self):
         nodes_created = []
         new_service = copy.copy(self.service_dict)

View File

@@ -24,6 +24,7 @@ from oslo_versionedobjects import exception as ovo_exc

 from nova import conf
 from nova.db import api as db
+from nova.db.sqlalchemy import api as sa_api
 from nova import exception
 from nova import objects
 from nova.objects import base
@@ -410,12 +411,22 @@ class _TestComputeNodeObject(object):
         self.assertRaises(ovo_exc.ReadOnlyFieldError, setattr,
                           compute, 'id', 124)

-    @mock.patch.object(db, 'compute_node_delete')
+    @mock.patch.object(sa_api, 'compute_node_delete')
     def test_destroy(self, mock_delete):
         compute = compute_node.ComputeNode(context=self.context)
         compute.id = 123
         compute.destroy()
-        mock_delete.assert_called_once_with(self.context, 123)
+        mock_delete.assert_called_once_with(self.context, 123, constraint=None)
+
+    def test_destroy_host_constraint(self):
+        # Create compute node with host='fake'
+        compute = fake_compute_with_resources.obj_clone()
+        compute._context = self.context
+        compute.host = 'fake'
+        compute.create()
+        # Simulate a compute node ownership change due to a node rebalance
+        compute.host = 'different'
+        self.assertRaises(exception.ObjectActionError, compute.destroy)

     @mock.patch.object(db, 'compute_node_get_all')
     def test_get_all(self, mock_get_all):

View File

@@ -0,0 +1,7 @@
+---
+fixes:
+  - |
+    Fixes an issue with multiple ``nova-compute`` services used with Ironic,
+    where a rebalance operation could result in a compute node being deleted
+    from the database and not recreated. See `bug 1853009
+    <https://bugs.launchpad.net/nova/+bug/1853009>`__ for details.