Merge "[ironic] Minimize window for a resource provider to be lost" into stable/wallaby

This commit is contained in:
Zuul 2021-07-16 08:32:27 +00:00 committed by Gerrit Code Review
commit b0099aa8a2
3 changed files with 40 additions and 1 deletions

View File

@ -3055,6 +3055,9 @@ class NodeCacheTestCase(test.NoDBTestCase):
mock_instances.return_value = instances
mock_nodes.return_value = nodes
mock_hosts.side_effect = hosts
parent_mock = mock.MagicMock()
parent_mock.attach_mock(mock_nodes, 'get_node_list')
parent_mock.attach_mock(mock_instances, 'get_uuids_by_host')
if not can_send_146:
mock_can_send.side_effect = (
exception.IronicAPIVersionNotAvailable(version='1.46'))
@ -3067,6 +3070,15 @@ class NodeCacheTestCase(test.NoDBTestCase):
self.driver._refresh_cache()
# assert if get_node_list() is called before get_uuids_by_host()
parent_mock.assert_has_calls(
[
mock.call.get_node_list(fields=ironic_driver._NODE_FIELDS,
**kwargs),
mock.call.get_uuids_by_host(mock.ANY, self.host)
]
)
mock_hash_ring.assert_called_once_with(mock.ANY)
mock_instances.assert_called_once_with(mock.ANY, self.host)
mock_nodes.assert_called_once_with(fields=ironic_driver._NODE_FIELDS,

View File

@ -733,10 +733,15 @@ class IronicDriver(virt_driver.ComputeDriver):
def _refresh_cache(self):
ctxt = nova_context.get_admin_context()
self._refresh_hash_ring(ctxt)
instances = objects.InstanceList.get_uuids_by_host(ctxt, CONF.host)
node_cache = {}
def _get_node_list(**kwargs):
# NOTE(TheJulia): This call can take a substantial amount
# of time as it may be attempting to retrieve thousands of
# baremetal nodes. Depending on the version of Ironic,
# this can be as long as 2-10 seconds per every thousand
# nodes, and this call may retrieve all nodes in a deployment,
# depending on whether any filter parameters are applied.
return self._get_node_list(fields=_NODE_FIELDS, **kwargs)
# NOTE(jroll) if partition_key is set, we need to limit nodes that
@ -760,6 +765,15 @@ class IronicDriver(virt_driver.ComputeDriver):
else:
nodes = _get_node_list()
# NOTE(saga): As _get_node_list() will take a long
# time to return in large clusters we need to call it before
# get_uuids_by_host() method. Otherwise the instances list we get from
# get_uuids_by_host() method will become stale.
# A stale instances list can cause a node that is managed by this
# compute host to be excluded in error and cause the compute node
# to be orphaned and associated resource provider to be deleted.
instances = objects.InstanceList.get_uuids_by_host(ctxt, CONF.host)
for node in nodes:
# NOTE(jroll): we always manage the nodes for instances we manage
if node.instance_uuid in instances:

View File

@ -0,0 +1,13 @@
---
fixes:
- |
Minimizes a race condition window when using the ``ironic`` virt driver
where the data generated for the Resource Tracker may attempt to compare
potentially stale instance information with the latest known baremetal
node information. While this doesn't completely prevent or resolve the
underlying race condition identified in
`bug 1841481 <https://bugs.launchpad.net/nova/+bug/1841481>`_,
this change allows Nova to have the latest state information, as opposed
to state information that may be out of date due to the time it may
take to retrieve the status from Ironic. This issue was most observable
on baremetal clusters with several thousand physical nodes.