Merge "Prevent deletion of a compute node belonging to another host" into stable/wallaby
commit 68cad8fb98
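Summary of the change: when multiple nova-compute services manage one set of Ironic nodes, a rebalance can reassign a compute node record to a new host while the old host still holds a stale node list and tries to delete the "orphan". The patch makes the deletion conditional on the host column being unchanged, and teaches the compute manager to ignore the resulting failure. Below is a minimal, self-contained sketch of the underlying compare-and-delete idea in plain SQLAlchemy; the table and helper names are illustrative, not Nova's actual schema or API.

# Minimal sketch of the compare-and-delete pattern this patch introduces.
# Table and helper names are illustrative, not Nova's actual schema or API.
import datetime

from sqlalchemy import Column, DateTime, Integer, String, create_engine
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()


class ComputeNode(Base):
    __tablename__ = 'compute_nodes'
    id = Column(Integer, primary_key=True)
    host = Column(String)          # owning nova-compute service
    deleted_at = Column(DateTime)  # soft-delete marker


def soft_delete_if_owned(session, node_id, expected_host):
    """Soft-delete a node only while it is still owned by expected_host.

    Returns True on success, False if the row vanished or its host
    changed (e.g. due to a rebalance) since it was last read.
    """
    count = (session.query(ComputeNode)
             .filter_by(id=node_id, host=expected_host, deleted_at=None)
             .update({'deleted_at': datetime.datetime.utcnow()},
                     synchronize_session=False))
    return bool(count)


engine = create_engine('sqlite://')
Base.metadata.create_all(engine)
with Session(engine) as session:
    session.add(ComputeNode(id=1, host='host_a'))
    session.commit()
    # Simulate a rebalance: host_b takes ownership before host_a deletes.
    session.query(ComputeNode).filter_by(id=1).update({'host': 'host_b'})
    assert not soft_delete_if_owned(session, 1, 'host_a')  # refused
    assert soft_delete_if_owned(session, 1, 'host_b')      # succeeds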
@@ -10014,13 +10014,26 @@ class ComputeManager(manager.Manager):
                          "nodes are %(nodes)s",
                          {'id': cn.id, 'hh': cn.hypervisor_hostname,
                           'nodes': nodenames})
-                cn.destroy()
-                self.rt.remove_node(cn.hypervisor_hostname)
-                # Delete the corresponding resource provider in placement,
-                # along with any associated allocations.
-                try:
-                    self.reportclient.delete_resource_provider(context, cn,
-                                                               cascade=True)
-                except keystone_exception.ClientException as e:
-                    LOG.error(
-                        "Failed to delete compute node resource provider "
+                try:
+                    cn.destroy()
+                except exception.ObjectActionError:
+                    # NOTE(mgoddard): it's possible that another compute
+                    # service took ownership of this compute node since we
+                    # queried it due to a rebalance, and this will cause the
+                    # deletion to fail. Ignore the error in that case.
+                    LOG.info("Ignoring failure to delete orphan compute node "
+                             "%(id)s on hypervisor host %(hh)s due to "
+                             "possible node rebalance",
+                             {'id': cn.id, 'hh': cn.hypervisor_hostname})
+                    self.rt.remove_node(cn.hypervisor_hostname)
+                    self.reportclient.invalidate_resource_provider(cn.uuid)
+                else:
+                    self.rt.remove_node(cn.hypervisor_hostname)
+                    # Delete the corresponding resource provider in placement,
+                    # along with any associated allocations.
+                    try:
+                        self.reportclient.delete_resource_provider(
+                            context, cn, cascade=True)
+                    except keystone_exception.ClientException as e:
+                        LOG.error(
+                            "Failed to delete compute node resource provider "
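With this in place, a raced deletion degrades into a log line rather than data loss: the losing service logs "Ignoring failure to delete orphan compute node ... due to possible node rebalance" at INFO, drops the node from its local resource tracker, and invalidates its cached view of the resource provider instead of deleting the provider from placement.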
@@ -761,14 +761,24 @@ def compute_node_update(context, compute_id, values):


 @pick_context_manager_writer
-def compute_node_delete(context, compute_id):
+def compute_node_delete(context, compute_id, constraint=None):
     """Delete a ComputeNode record."""
-    result = model_query(context, models.ComputeNode).\
-        filter_by(id=compute_id).\
-        soft_delete(synchronize_session=False)
+    query = model_query(context, models.ComputeNode).filter_by(id=compute_id)
+
+    if constraint is not None:
+        query = constraint.apply(models.ComputeNode, query)
+
+    result = query.soft_delete(synchronize_session=False)

     if not result:
-        raise exception.ComputeHostNotFound(host=compute_id)
+        # The soft_delete could fail for one of two reasons:
+        # 1) The compute node no longer exists
+        # 2) The constraint, if specified, was not met
+        # Try to read the compute node and let it raise ComputeHostNotFound if
+        # 1) happened.
+        compute_node_get(context, compute_id)
+        # Else, raise ConstraintNotMet if 2) happened.
+        raise exception.ConstraintNotMet()


 @pick_context_manager_reader
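The constraint and equal_any helpers used by the new code path come from Nova's DB layer: a constraint bundles per-column conditions, and apply() folds them into a query as extra WHERE clauses, so the soft delete only matches rows that still satisfy them. A simplified sketch of that mechanism (not Nova's exact implementation) follows:

# Simplified sketch of how constraint(host=equal_any(...)).apply() can
# add WHERE clauses to a query; not Nova's exact implementation.
class EqualityCondition:
    def __init__(self, values):
        self.values = values

    def clauses(self, field):
        # field.in_(...) renders as "field IN (...)" in SQL
        return [field.in_(self.values)]


class Constraint:
    def __init__(self, conditions):
        self.conditions = conditions

    def apply(self, model, query):
        # Fold each condition into the query as an extra filter.
        for key, condition in self.conditions.items():
            for clause in condition.clauses(getattr(model, key)):
                query = query.filter(clause)
        return query


def constraint(**conditions):
    return Constraint(conditions)


def equal_any(*values):
    return EqualityCondition(values)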
@@ -360,7 +360,20 @@ class ComputeNode(base.NovaPersistentObject, base.NovaObject):

     @base.remotable
     def destroy(self):
-        db.compute_node_delete(self._context, self.id)
+        if self.obj_attr_is_set('host') and self.host:
+            # NOTE(melwitt): If our host is set, avoid a race between
+            # nova-computes during ironic driver node rebalances which can
+            # change node ownership.
+            constraint = db.constraint(host=db.equal_any(self.host))
+        else:
+            constraint = None
+
+        try:
+            sa_api.compute_node_delete(
+                self._context, self.id, constraint=constraint)
+        except exception.ConstraintNotMet:
+            raise exception.ObjectActionError(action='destroy',
+                                              reason='host changed')

     def update_from_virt_driver(self, resources):
         # NOTE(pmurray): the virt driver provides a dict of values that
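Note how each layer translates the failure into its own vocabulary: the DB API raises ConstraintNotMet, ComputeNode.destroy() converts that into ObjectActionError(action='destroy', reason='host changed'), and the compute manager (first hunk above) catches ObjectActionError and only cleans up local state. A condensed, hypothetical sketch of that three-layer flow, with stand-ins for the nova.exception classes:

# Condensed sketch of the exception translation across layers; the
# exception classes here are stand-ins for nova.exception.*.
class ConstraintNotMet(Exception):       # DB layer
    pass


class ObjectActionError(Exception):      # object layer
    def __init__(self, action, reason):
        super().__init__("cannot %s: %s" % (action, reason))


def db_compute_node_delete(node, expected_host):
    # Refuse the delete if ownership changed since the node was read.
    if node['host'] != expected_host:
        raise ConstraintNotMet()
    node['deleted'] = True


def object_destroy(node):
    try:
        db_compute_node_delete(node, expected_host=node['host_at_read'])
    except ConstraintNotMet:
        raise ObjectActionError(action='destroy', reason='host changed')


def manager_cleanup(node):
    try:
        object_destroy(node)
    except ObjectActionError:
        # Another service took ownership; skip deletion, drop local state.
        return 'skipped'
    return 'deleted'


node = {'host': 'host_b', 'host_at_read': 'host_a', 'deleted': False}
assert manager_cleanup(node) == 'skipped'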
@@ -133,24 +133,35 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase(
         # Mock out the compute node query to simulate a race condition where
         # the list includes an orphan compute node that is newly owned by
         # host_b by the time host_a attempts to delete it.
-        # FIXME(mgoddard): Ideally host_a would not delete a node that does not
-        # belong to it. See https://bugs.launchpad.net/nova/+bug/1853009.
         with mock.patch(
             'nova.compute.manager.ComputeManager._get_compute_nodes_in_db'
         ) as mock_get:
             mock_get.return_value = [cn]
             host_a.manager.update_available_resource(self.ctxt)

-        # Verify that the node was deleted.
+        # Verify that the node was almost deleted, but was saved by the host
+        # check
         self.assertIn(
             'Deleting orphan compute node %s hypervisor host '
             'is fake-node, nodes are' % cn.id,
             self.stdlog.logger.output)
-        hypervisors = self.api.api_get(
-            '/os-hypervisors/detail').body['hypervisors']
-        self.assertEqual(0, len(hypervisors), hypervisors)
+        self.assertIn(
+            'Ignoring failure to delete orphan compute node %s on '
+            'hypervisor host fake-node' % cn.id,
+            self.stdlog.logger.output)
+        self._assert_hypervisor_api(self.nodename, 'host_b')
         rps = self._get_all_providers()
-        self.assertEqual(0, len(rps), rps)
+        self.assertEqual(1, len(rps), rps)
+        self.assertEqual(self.nodename, rps[0]['name'])
+
+        # Simulate deletion of an orphan by host_a. It shouldn't happen
+        # anymore, but let's assume it already did.
+        cn = objects.ComputeNode.get_by_host_and_nodename(
+            self.ctxt, 'host_b', self.nodename)
+        cn.destroy()
+        host_a.manager.rt.remove_node(cn.hypervisor_hostname)
+        host_a.manager.reportclient.delete_resource_provider(
+            self.ctxt, cn, cascade=True)

         # host_b[3]: Should recreate compute node and resource provider.
         # FIXME(mgoddard): Resource provider not recreated here, due to
@@ -200,13 +200,14 @@ class BaseTestCase(test.TestCase):
                 context, objects.ComputeNode(), cn)
             for cn in fake_compute_nodes]

-        def fake_compute_node_delete(context, compute_node_id):
+        def fake_compute_node_delete(context, compute_node_id,
+                                     constraint=None):
             self.assertEqual(2, compute_node_id)

         self.stub_out(
             'nova.compute.manager.ComputeManager._get_compute_nodes_in_db',
             fake_get_compute_nodes_in_db)
-        self.stub_out('nova.db.api.compute_node_delete',
+        self.stub_out('nova.db.sqlalchemy.api.compute_node_delete',
                       fake_compute_node_delete)

         self.compute.update_available_resource(
@@ -409,6 +409,33 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
         update_mock.assert_not_called()
         del_rp_mock.assert_not_called()

+    @mock.patch.object(manager.ComputeManager,
+                       '_update_available_resource_for_node')
+    @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
+    @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
+    def test_update_available_resource_destroy_rebalance(
+            self, get_db_nodes, get_avail_nodes, update_mock):
+        mock_rt = self._mock_rt()
+        rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
+            self.compute, 'reportclient')).mock
+        db_nodes = [self._make_compute_node('node1', 1)]
+        get_db_nodes.return_value = db_nodes
+        # Destroy can fail if nodes were rebalanced between getting the node
+        # list and calling destroy.
+        db_nodes[0].destroy.side_effect = exception.ObjectActionError(
+            action='destroy', reason='host changed')
+        get_avail_nodes.return_value = set()
+        self.compute.update_available_resource(self.context)
+        get_db_nodes.assert_called_once_with(self.context, set(),
+                                             use_slave=True, startup=False)
+        self.assertEqual(0, update_mock.call_count)
+
+        db_nodes[0].destroy.assert_called_once_with()
+        self.assertEqual(0, rc_mock.delete_resource_provider.call_count)
+        mock_rt.remove_node.assert_called_once_with('node1')
+        rc_mock.invalidate_resource_provider.assert_called_once_with(
+            db_nodes[0].uuid)
+
     @mock.patch('nova.context.get_admin_context')
     def test_pre_start_hook(self, get_admin_context):
         """Very simple test just to make sure update_available_resource is
@@ -5545,6 +5545,13 @@ class ComputeNodeTestCase(test.TestCase, ModelsObjectComparatorMixin):
         nodes = db.compute_node_get_all(self.ctxt)
         self.assertEqual(len(nodes), 0)

+    def test_compute_node_delete_different_host(self):
+        compute_node_id = self.item['id']
+        constraint = db.constraint(host=db.equal_any('different-host'))
+        self.assertRaises(exception.ConstraintNotMet,
+                          sqlalchemy_api.compute_node_delete,
+                          self.ctxt, compute_node_id, constraint=constraint)
+
     def test_compute_node_search_by_hypervisor(self):
         nodes_created = []
         new_service = copy.copy(self.service_dict)
@@ -24,6 +24,7 @@ from oslo_versionedobjects import exception as ovo_exc

 from nova import conf
 from nova.db import api as db
+from nova.db.sqlalchemy import api as sa_api
 from nova import exception
 from nova import objects
 from nova.objects import base
@@ -410,12 +411,22 @@ class _TestComputeNodeObject(object):
         self.assertRaises(ovo_exc.ReadOnlyFieldError, setattr,
                           compute, 'id', 124)

-    @mock.patch.object(db, 'compute_node_delete')
+    @mock.patch.object(sa_api, 'compute_node_delete')
     def test_destroy(self, mock_delete):
         compute = compute_node.ComputeNode(context=self.context)
         compute.id = 123
         compute.destroy()
-        mock_delete.assert_called_once_with(self.context, 123)
+        mock_delete.assert_called_once_with(self.context, 123, constraint=None)
+
+    def test_destroy_host_constraint(self):
+        # Create compute node with host='fake'
+        compute = fake_compute_with_resources.obj_clone()
+        compute._context = self.context
+        compute.host = 'fake'
+        compute.create()
+        # Simulate a compute node ownership change due to a node rebalance
+        compute.host = 'different'
+        self.assertRaises(exception.ObjectActionError, compute.destroy)

     @mock.patch.object(db, 'compute_node_get_all')
     def test_get_all(self, mock_get_all):
releasenotes/notes/bug-1853009-99414e14d1491b5f.yaml (new file)
@@ -0,0 +1,7 @@
+---
+fixes:
+  - |
+    Fixes an issue with multiple ``nova-compute`` services used with Ironic,
+    where a rebalance operation could result in a compute node being deleted
+    from the database and not recreated. See `bug 1853009
+    <https://bugs.launchpad.net/nova/+bug/1853009>`__ for details.