Merge "Prevent deletion of a compute node belonging to another host" into stable/wallaby
This commit is contained in:
commit
68cad8fb98
|
@ -10014,17 +10014,30 @@ class ComputeManager(manager.Manager):
|
|||
"nodes are %(nodes)s",
|
||||
{'id': cn.id, 'hh': cn.hypervisor_hostname,
|
||||
'nodes': nodenames})
|
||||
cn.destroy()
|
||||
self.rt.remove_node(cn.hypervisor_hostname)
|
||||
# Delete the corresponding resource provider in placement,
|
||||
# along with any associated allocations.
|
||||
try:
|
||||
self.reportclient.delete_resource_provider(context, cn,
|
||||
cascade=True)
|
||||
except keystone_exception.ClientException as e:
|
||||
LOG.error(
|
||||
"Failed to delete compute node resource provider "
|
||||
"for compute node %s: %s", cn.uuid, str(e))
|
||||
cn.destroy()
|
||||
except exception.ObjectActionError:
|
||||
# NOTE(mgoddard): it's possible that another compute
|
||||
# service took ownership of this compute node since we
|
||||
# queried it due to a rebalance, and this will cause the
|
||||
# deletion to fail. Ignore the error in that case.
|
||||
LOG.info("Ignoring failure to delete orphan compute node "
|
||||
"%(id)s on hypervisor host %(hh)s due to "
|
||||
"possible node rebalance",
|
||||
{'id': cn.id, 'hh': cn.hypervisor_hostname})
|
||||
self.rt.remove_node(cn.hypervisor_hostname)
|
||||
self.reportclient.invalidate_resource_provider(cn.uuid)
|
||||
else:
|
||||
self.rt.remove_node(cn.hypervisor_hostname)
|
||||
# Delete the corresponding resource provider in placement,
|
||||
# along with any associated allocations.
|
||||
try:
|
||||
self.reportclient.delete_resource_provider(
|
||||
context, cn, cascade=True)
|
||||
except keystone_exception.ClientException as e:
|
||||
LOG.error(
|
||||
"Failed to delete compute node resource provider "
|
||||
"for compute node %s: %s", cn.uuid, str(e))
|
||||
|
||||
for nodename in nodenames:
|
||||
self._update_available_resource_for_node(context, nodename,
|
||||
|
|
|
@ -761,14 +761,24 @@ def compute_node_update(context, compute_id, values):
|
|||
|
||||
|
||||
@pick_context_manager_writer
|
||||
def compute_node_delete(context, compute_id):
|
||||
def compute_node_delete(context, compute_id, constraint=None):
|
||||
"""Delete a ComputeNode record."""
|
||||
result = model_query(context, models.ComputeNode).\
|
||||
filter_by(id=compute_id).\
|
||||
soft_delete(synchronize_session=False)
|
||||
query = model_query(context, models.ComputeNode).filter_by(id=compute_id)
|
||||
|
||||
if constraint is not None:
|
||||
query = constraint.apply(models.ComputeNode, query)
|
||||
|
||||
result = query.soft_delete(synchronize_session=False)
|
||||
|
||||
if not result:
|
||||
raise exception.ComputeHostNotFound(host=compute_id)
|
||||
# The soft_delete could fail for one of two reasons:
|
||||
# 1) The compute node no longer exists
|
||||
# 2) The constraint, if specified, was not met
|
||||
# Try to read the compute node and let it raise ComputeHostNotFound if
|
||||
# 1) happened.
|
||||
compute_node_get(context, compute_id)
|
||||
# Else, raise ConstraintNotMet if 2) happened.
|
||||
raise exception.ConstraintNotMet()
|
||||
|
||||
|
||||
@pick_context_manager_reader
|
||||
|
|
|
@ -360,7 +360,20 @@ class ComputeNode(base.NovaPersistentObject, base.NovaObject):
|
|||
|
||||
@base.remotable
|
||||
def destroy(self):
|
||||
db.compute_node_delete(self._context, self.id)
|
||||
if self.obj_attr_is_set('host') and self.host:
|
||||
# NOTE(melwitt): If our host is set, avoid a race between
|
||||
# nova-computes during ironic driver node rebalances which can
|
||||
# change node ownership.
|
||||
constraint = db.constraint(host=db.equal_any(self.host))
|
||||
else:
|
||||
constraint = None
|
||||
|
||||
try:
|
||||
sa_api.compute_node_delete(
|
||||
self._context, self.id, constraint=constraint)
|
||||
except exception.ConstraintNotMet:
|
||||
raise exception.ObjectActionError(action='destroy',
|
||||
reason='host changed')
|
||||
|
||||
def update_from_virt_driver(self, resources):
|
||||
# NOTE(pmurray): the virt driver provides a dict of values that
|
||||
|
|
|
@ -133,24 +133,35 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase(
|
|||
# Mock out the compute node query to simulate a race condition where
|
||||
# the list includes an orphan compute node that is newly owned by
|
||||
# host_b by the time host_a attempts to delete it.
|
||||
# FIXME(mgoddard): Ideally host_a would not delete a node that does not
|
||||
# belong to it. See https://bugs.launchpad.net/nova/+bug/1853009.
|
||||
with mock.patch(
|
||||
'nova.compute.manager.ComputeManager._get_compute_nodes_in_db'
|
||||
) as mock_get:
|
||||
mock_get.return_value = [cn]
|
||||
host_a.manager.update_available_resource(self.ctxt)
|
||||
|
||||
# Verify that the node was deleted.
|
||||
# Verify that the node was almost deleted, but was saved by the host
|
||||
# check
|
||||
self.assertIn(
|
||||
'Deleting orphan compute node %s hypervisor host '
|
||||
'is fake-node, nodes are' % cn.id,
|
||||
self.stdlog.logger.output)
|
||||
hypervisors = self.api.api_get(
|
||||
'/os-hypervisors/detail').body['hypervisors']
|
||||
self.assertEqual(0, len(hypervisors), hypervisors)
|
||||
self.assertIn(
|
||||
'Ignoring failure to delete orphan compute node %s on '
|
||||
'hypervisor host fake-node' % cn.id,
|
||||
self.stdlog.logger.output)
|
||||
self._assert_hypervisor_api(self.nodename, 'host_b')
|
||||
rps = self._get_all_providers()
|
||||
self.assertEqual(0, len(rps), rps)
|
||||
self.assertEqual(1, len(rps), rps)
|
||||
self.assertEqual(self.nodename, rps[0]['name'])
|
||||
|
||||
# Simulate deletion of an orphan by host_a. It shouldn't happen
|
||||
# anymore, but let's assume it already did.
|
||||
cn = objects.ComputeNode.get_by_host_and_nodename(
|
||||
self.ctxt, 'host_b', self.nodename)
|
||||
cn.destroy()
|
||||
host_a.manager.rt.remove_node(cn.hypervisor_hostname)
|
||||
host_a.manager.reportclient.delete_resource_provider(
|
||||
self.ctxt, cn, cascade=True)
|
||||
|
||||
# host_b[3]: Should recreate compute node and resource provider.
|
||||
# FIXME(mgoddard): Resource provider not recreated here, due to
|
||||
|
|
|
@ -200,13 +200,14 @@ class BaseTestCase(test.TestCase):
|
|||
context, objects.ComputeNode(), cn)
|
||||
for cn in fake_compute_nodes]
|
||||
|
||||
def fake_compute_node_delete(context, compute_node_id):
|
||||
def fake_compute_node_delete(context, compute_node_id,
|
||||
constraint=None):
|
||||
self.assertEqual(2, compute_node_id)
|
||||
|
||||
self.stub_out(
|
||||
'nova.compute.manager.ComputeManager._get_compute_nodes_in_db',
|
||||
fake_get_compute_nodes_in_db)
|
||||
self.stub_out('nova.db.api.compute_node_delete',
|
||||
self.stub_out('nova.db.sqlalchemy.api.compute_node_delete',
|
||||
fake_compute_node_delete)
|
||||
|
||||
self.compute.update_available_resource(
|
||||
|
|
|
@ -409,6 +409,33 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
|
|||
update_mock.assert_not_called()
|
||||
del_rp_mock.assert_not_called()
|
||||
|
||||
@mock.patch.object(manager.ComputeManager,
|
||||
'_update_available_resource_for_node')
|
||||
@mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
|
||||
@mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
|
||||
def test_update_available_resource_destroy_rebalance(
|
||||
self, get_db_nodes, get_avail_nodes, update_mock):
|
||||
mock_rt = self._mock_rt()
|
||||
rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
|
||||
self.compute, 'reportclient')).mock
|
||||
db_nodes = [self._make_compute_node('node1', 1)]
|
||||
get_db_nodes.return_value = db_nodes
|
||||
# Destroy can fail if nodes were rebalanced between getting the node
|
||||
# list and calling destroy.
|
||||
db_nodes[0].destroy.side_effect = exception.ObjectActionError(
|
||||
action='destroy', reason='host changed')
|
||||
get_avail_nodes.return_value = set()
|
||||
self.compute.update_available_resource(self.context)
|
||||
get_db_nodes.assert_called_once_with(self.context, set(),
|
||||
use_slave=True, startup=False)
|
||||
self.assertEqual(0, update_mock.call_count)
|
||||
|
||||
db_nodes[0].destroy.assert_called_once_with()
|
||||
self.assertEqual(0, rc_mock.delete_resource_provider.call_count)
|
||||
mock_rt.remove_node.assert_called_once_with('node1')
|
||||
rc_mock.invalidate_resource_provider.assert_called_once_with(
|
||||
db_nodes[0].uuid)
|
||||
|
||||
@mock.patch('nova.context.get_admin_context')
|
||||
def test_pre_start_hook(self, get_admin_context):
|
||||
"""Very simple test just to make sure update_available_resource is
|
||||
|
|
|
@ -5545,6 +5545,13 @@ class ComputeNodeTestCase(test.TestCase, ModelsObjectComparatorMixin):
|
|||
nodes = db.compute_node_get_all(self.ctxt)
|
||||
self.assertEqual(len(nodes), 0)
|
||||
|
||||
def test_compute_node_delete_different_host(self):
|
||||
compute_node_id = self.item['id']
|
||||
constraint = db.constraint(host=db.equal_any('different-host'))
|
||||
self.assertRaises(exception.ConstraintNotMet,
|
||||
sqlalchemy_api.compute_node_delete,
|
||||
self.ctxt, compute_node_id, constraint=constraint)
|
||||
|
||||
def test_compute_node_search_by_hypervisor(self):
|
||||
nodes_created = []
|
||||
new_service = copy.copy(self.service_dict)
|
||||
|
|
|
@ -24,6 +24,7 @@ from oslo_versionedobjects import exception as ovo_exc
|
|||
|
||||
from nova import conf
|
||||
from nova.db import api as db
|
||||
from nova.db.sqlalchemy import api as sa_api
|
||||
from nova import exception
|
||||
from nova import objects
|
||||
from nova.objects import base
|
||||
|
@ -410,12 +411,22 @@ class _TestComputeNodeObject(object):
|
|||
self.assertRaises(ovo_exc.ReadOnlyFieldError, setattr,
|
||||
compute, 'id', 124)
|
||||
|
||||
@mock.patch.object(db, 'compute_node_delete')
|
||||
@mock.patch.object(sa_api, 'compute_node_delete')
|
||||
def test_destroy(self, mock_delete):
|
||||
compute = compute_node.ComputeNode(context=self.context)
|
||||
compute.id = 123
|
||||
compute.destroy()
|
||||
mock_delete.assert_called_once_with(self.context, 123)
|
||||
mock_delete.assert_called_once_with(self.context, 123, constraint=None)
|
||||
|
||||
def test_destroy_host_constraint(self):
|
||||
# Create compute node with host='fake'
|
||||
compute = fake_compute_with_resources.obj_clone()
|
||||
compute._context = self.context
|
||||
compute.host = 'fake'
|
||||
compute.create()
|
||||
# Simulate a compute node ownership change due to a node rebalance
|
||||
compute.host = 'different'
|
||||
self.assertRaises(exception.ObjectActionError, compute.destroy)
|
||||
|
||||
@mock.patch.object(db, 'compute_node_get_all')
|
||||
def test_get_all(self, mock_get_all):
|
||||
|
|
|
@ -0,0 +1,7 @@
|
|||
---
|
||||
fixes:
|
||||
- |
|
||||
Fixes an issue with multiple ``nova-compute`` services used with Ironic,
|
||||
where a rebalance operation could result in a compute node being deleted
|
||||
from the database and not recreated. See `bug 1853009
|
||||
<https://bugs.launchpad.net/nova/+bug/1853009>`__ for details.
|
Loading…
Reference in New Issue