Merge "[ironic] Minimize window for a resource provider to be lost" into stable/wallaby
This commit is contained in:
commit
b0099aa8a2
|
@ -3055,6 +3055,9 @@ class NodeCacheTestCase(test.NoDBTestCase):
|
|||
mock_instances.return_value = instances
|
||||
mock_nodes.return_value = nodes
|
||||
mock_hosts.side_effect = hosts
|
||||
parent_mock = mock.MagicMock()
|
||||
parent_mock.attach_mock(mock_nodes, 'get_node_list')
|
||||
parent_mock.attach_mock(mock_instances, 'get_uuids_by_host')
|
||||
if not can_send_146:
|
||||
mock_can_send.side_effect = (
|
||||
exception.IronicAPIVersionNotAvailable(version='1.46'))
|
||||
|
@ -3067,6 +3070,15 @@ class NodeCacheTestCase(test.NoDBTestCase):
|
|||
|
||||
self.driver._refresh_cache()
|
||||
|
||||
# assert that get_node_list() is called before get_uuids_by_host()
|
||||
parent_mock.assert_has_calls(
|
||||
[
|
||||
mock.call.get_node_list(fields=ironic_driver._NODE_FIELDS,
|
||||
**kwargs),
|
||||
mock.call.get_uuids_by_host(mock.ANY, self.host)
|
||||
]
|
||||
)
|
||||
|
||||
mock_hash_ring.assert_called_once_with(mock.ANY)
|
||||
mock_instances.assert_called_once_with(mock.ANY, self.host)
|
||||
mock_nodes.assert_called_once_with(fields=ironic_driver._NODE_FIELDS,
|
||||
|
|
|
@ -733,10 +733,15 @@ class IronicDriver(virt_driver.ComputeDriver):
|
|||
def _refresh_cache(self):
|
||||
ctxt = nova_context.get_admin_context()
|
||||
self._refresh_hash_ring(ctxt)
|
||||
instances = objects.InstanceList.get_uuids_by_host(ctxt, CONF.host)
|
||||
node_cache = {}
|
||||
|
||||
def _get_node_list(**kwargs):
|
||||
# NOTE(TheJulia): This call can take a substantial amount
|
||||
# of time as it may be attempting to retrieve thousands of
|
||||
# baremetal nodes. Depending on the version of Ironic,
|
||||
# this can be as long as 2-10 seconds per every thousand
|
||||
# nodes, and this call may retrieve all nodes in a deployment,
|
||||
# depending on if any filter parameters are applied.
|
||||
return self._get_node_list(fields=_NODE_FIELDS, **kwargs)
|
||||
|
||||
# NOTE(jroll) if partition_key is set, we need to limit nodes that
|
||||
|
@ -760,6 +765,15 @@ class IronicDriver(virt_driver.ComputeDriver):
|
|||
else:
|
||||
nodes = _get_node_list()
|
||||
|
||||
# NOTE(saga): As _get_node_list() will take a long
|
||||
# time to return in large clusters, we need to call it before
|
||||
# get_uuids_by_host() method. Otherwise the instances list we get from
|
||||
# get_uuids_by_host() method will become stale.
|
||||
# A stale instances list can cause a node that is managed by this
|
||||
# compute host to be excluded in error and cause the compute node
|
||||
# to be orphaned and associated resource provider to be deleted.
|
||||
instances = objects.InstanceList.get_uuids_by_host(ctxt, CONF.host)
|
||||
|
||||
for node in nodes:
|
||||
# NOTE(jroll): we always manage the nodes for instances we manage
|
||||
if node.instance_uuid in instances:
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
---
|
||||
fixes:
|
||||
- |
|
||||
Minimizes a race condition window when using the ``ironic`` virt driver
|
||||
where the data generated for the Resource Tracker may attempt to compare
|
||||
potentially stale instance information with the latest known baremetal
|
||||
node information. While this neither completely prevents nor resolves the
|
||||
underlying race condition identified in
|
||||
`bug 1841481 <https://bugs.launchpad.net/nova/+bug/1841481>`_,
|
||||
this change allows Nova to have the latest state information, as opposed
|
||||
to state information which may be out of date due to the time which it may
|
||||
take to retrieve the status from Ironic. This issue was most observable
|
||||
on baremetal clusters with several thousand physical nodes.
|
Loading…
Reference in New Issue