Ironic nodes with instance reserved in placement

Currently, when you delete an ironic instance, we trigger
and undeploy in ironic and we release our allocation in placement.
We do this well before the ironic node is actually available.

We have attempted to fix this my marking unavailable nodes
as reserved in placement. This works great until you try
and re-image lots of nodes.

It turns out, ironic nodes that are waiting for their automatic
clean to finish, are returned as a valid allocation candidates
for quite some time. Eventually we mark then as reserved.

This patch takes a strange approach, if we mark all nodes as
reserved as soon as the instance lands, we close the race.
That is, when the allocation is removed the node is still
unavailable until the next update of placement is done and
notices that the node has become available. That may or may
not have been after automatic cleaning. The trade off is
that when you don't have automatic cleaning, we wait a bit
longer to notice the node is available again.

Note, this is also useful when a broken Ironic node is
marked as in-maintainance while it is in-use by a nova
instance. In a similar way, we mark the Nova as reserved
immmeidately, rather than first waiting for the instance to be
deleted before reserving the resources in Placement.

Closes-Bug: #1974070
Change-Id: Iab92124b5776a799c7f90d07281d28fcf191c8fe
(cherry picked from commit 3c022e9683)
This commit is contained in:
John Garbutt 2022-11-16 17:12:40 +00:00 committed by Ruby Loo
parent d92d093418
commit c9de185ea1
4 changed files with 89 additions and 11 deletions

View File

@ -416,6 +416,21 @@ compatibility on the destination host during live migration.
help="""
When this is enabled, it will skip version-checking of hypervisors
during live migration.
"""),
cfg.BoolOpt(
'skip_reserve_in_use_ironic_nodes',
default=False,
help="""
This may be useful if you use the Ironic driver, but don't have
automatic cleaning enabled in Ironic. Nova, by default, will mark
Ironic nodes as reserved as soon as they are in use. When you free
the Ironic node (by deleting the nova instance) it takes a while
for Nova to un-reserve that Ironic node in placement. Usually this
is a good idea, because it avoids placement providing an Ironic
as a valid candidate when it is still being cleaned.
Howerver, if you don't use automatic cleaning, it can cause an
extra delay before and Ironic node is available for building a
new Nova instance.
"""),
]

View File

@ -932,6 +932,48 @@ class IronicDriverTestCase(test.NoDBTestCase):
self.driver.update_provider_tree(self.ptree, mock.sentinel.nodename)
expected = {
'CUSTOM_IRON_NFV': {
'total': 1,
'reserved': 1,
'min_unit': 1,
'max_unit': 1,
'step_size': 1,
'allocation_ratio': 1.0,
},
}
mock_nfc.assert_called_once_with(mock.sentinel.nodename)
mock_nr.assert_called_once_with(mock_nfc.return_value)
mock_res_used.assert_called_once_with(mock_nfc.return_value)
mock_res_unavail.assert_called_once_with(mock_nfc.return_value)
result = self.ptree.data(mock.sentinel.nodename).inventory
self.assertEqual(expected, result)
@mock.patch.object(ironic_driver.IronicDriver,
'_node_resources_used', return_value=True)
@mock.patch.object(ironic_driver.IronicDriver,
'_node_resources_unavailable', return_value=False)
@mock.patch.object(ironic_driver.IronicDriver, '_node_resource')
@mock.patch.object(ironic_driver.IronicDriver, '_node_from_cache')
def test_update_provider_tree_with_rc_occupied_workaround(self,
mock_nfc, mock_nr, mock_res_unavail, mock_res_used):
"""Ensure that when a node is used, we report the inventory matching
the consumed resources.
"""
self.flags(skip_reserve_in_use_ironic_nodes=True,
group="workarounds")
mock_nr.return_value = {
'vcpus': 24,
'vcpus_used': 24,
'memory_mb': 1024,
'memory_mb_used': 1024,
'local_gb': 100,
'local_gb_used': 100,
'resource_class': 'iron-nfv',
}
self.driver.update_provider_tree(self.ptree, mock.sentinel.nodename)
expected = {
'CUSTOM_IRON_NFV': {
'total': 1,
@ -945,7 +987,7 @@ class IronicDriverTestCase(test.NoDBTestCase):
mock_nfc.assert_called_once_with(mock.sentinel.nodename)
mock_nr.assert_called_once_with(mock_nfc.return_value)
mock_res_used.assert_called_once_with(mock_nfc.return_value)
self.assertFalse(mock_res_unavail.called)
mock_res_unavail.assert_called_once_with(mock_nfc.return_value)
result = self.ptree.data(mock.sentinel.nodename).inventory
self.assertEqual(expected, result)
@ -1016,7 +1058,7 @@ class IronicDriverTestCase(test.NoDBTestCase):
mock_nfc.assert_called_once_with(mock.sentinel.nodename)
mock_nr.assert_called_once_with(mock_nfc.return_value)
mock_res_used.assert_called_once_with(mock_nfc.return_value)
self.assertFalse(mock_res_unavail.called)
mock_res_unavail.assert_called_once_with(mock_nfc.return_value)
result = self.ptree.data(mock.sentinel.nodename).traits
self.assertEqual(set(), result)
@ -1048,7 +1090,7 @@ class IronicDriverTestCase(test.NoDBTestCase):
mock_nfc.assert_called_once_with(mock.sentinel.nodename)
mock_nr.assert_called_once_with(mock_nfc.return_value)
mock_res_used.assert_called_once_with(mock_nfc.return_value)
self.assertFalse(mock_res_unavail.called)
mock_res_unavail.assert_called_once_with(mock_nfc.return_value)
result = self.ptree.data(mock.sentinel.nodename).traits
self.assertEqual(set(traits), result)

View File

@ -874,15 +874,25 @@ class IronicDriver(virt_driver.ComputeDriver):
"""
# nodename is the ironic node's UUID.
node = self._node_from_cache(nodename)
reserved = False
if (not self._node_resources_used(node) and
self._node_resources_unavailable(node)):
LOG.debug('Node %(node)s is not ready for a deployment, '
'reporting resources as reserved for it. Node\'s '
'provision state is %(prov)s, power state is '
'%(power)s and maintenance is %(maint)s.',
{'node': node.uuid, 'prov': node.provision_state,
'power': node.power_state, 'maint': node.maintenance})
if self._node_resources_unavailable(node):
# Operators might mark a node as in maintainance,
# even when an instance is on the node,
# either way lets mark this as reserved
reserved = True
if (self._node_resources_used(node) and
not CONF.workarounds.skip_reserve_in_use_ironic_nodes):
# Make resources as reserved once we have
# and instance here.
# When the allocation is deleted, most likely
# automatic clean will start, so we keep the node
# reserved until it becomes available again.
# In the case without automatic clean, once
# the allocation is removed in placement it
# also stays as reserved until we notice on
# the next periodic its actually available.
reserved = True
info = self._node_resource(node)

View File

@ -0,0 +1,11 @@
---
fixes:
- |
Fixed when placement returns ironic nodes that have just started automatic
cleaning as possible valid candidates. This is done by marking all ironic
nodes with an instance on them as reserved, such that nova only makes them
available once we have double checked Ironic reports the node as available.
If you don't have automatic cleaning on, this might mean it takes longer
than normal for Ironic nodes to become available for new instances.
If you want the old behaviour use the following workaround config:
`[workarounds]skip_reserve_in_use_ironic_nodes=true`