Merge "[ironic] Minimize window for a resource provider to be lost" into stable/wallaby

This commit is contained in:
Zuul 2021-07-16 08:32:27 +00:00 committed by Gerrit Code Review
commit b0099aa8a2
3 changed files with 40 additions and 1 deletions

View File

@ -3055,6 +3055,9 @@ class NodeCacheTestCase(test.NoDBTestCase):
mock_instances.return_value = instances
mock_nodes.return_value = nodes
mock_hosts.side_effect = hosts
parent_mock = mock.MagicMock()
parent_mock.attach_mock(mock_nodes, 'get_node_list')
parent_mock.attach_mock(mock_instances, 'get_uuids_by_host')
if not can_send_146:
mock_can_send.side_effect = (
exception.IronicAPIVersionNotAvailable(version='1.46'))
@ -3067,6 +3070,15 @@ class NodeCacheTestCase(test.NoDBTestCase):
self.driver._refresh_cache()
# assert if get_node_list() is called before get_uuids_by_host()
parent_mock.assert_has_calls(
[
mock.call.get_node_list(fields=ironic_driver._NODE_FIELDS,
**kwargs),
mock.call.get_uuids_by_host(mock.ANY, self.host)
]
)
mock_hash_ring.assert_called_once_with(mock.ANY)
mock_instances.assert_called_once_with(mock.ANY, self.host)
mock_nodes.assert_called_once_with(fields=ironic_driver._NODE_FIELDS,

View File

@ -733,10 +733,15 @@ class IronicDriver(virt_driver.ComputeDriver):
def _refresh_cache(self):
ctxt = nova_context.get_admin_context()
self._refresh_hash_ring(ctxt)
instances = objects.InstanceList.get_uuids_by_host(ctxt, CONF.host)
node_cache = {}
def _get_node_list(**kwargs):
# NOTE(TheJulia): This call can take a substantial amount
# of time as it may be attempting to retrieve thousands of
# baremetal nodes. Depending on the version of Ironic,
# this can be as long as 2-10 seconds per every thousand
# nodes, and this call may retrieve all nodes in a deployment,
# depending on whether any filter parameters are applied.
return self._get_node_list(fields=_NODE_FIELDS, **kwargs)
# NOTE(jroll) if partition_key is set, we need to limit nodes that
@ -760,6 +765,15 @@ class IronicDriver(virt_driver.ComputeDriver):
else:
nodes = _get_node_list()
# NOTE(saga): As _get_node_list() will take a long
# time to return in large clusters we need to call it before
# get_uuids_by_host() method. Otherwise the instances list we get from
# get_uuids_by_host() method will become stale.
# A stale instances list can cause a node that is managed by this
# compute host to be excluded in error and cause the compute node
# to be orphaned and associated resource provider to be deleted.
instances = objects.InstanceList.get_uuids_by_host(ctxt, CONF.host)
for node in nodes:
# NOTE(jroll): we always manage the nodes for instances we manage
if node.instance_uuid in instances:

View File

@ -0,0 +1,13 @@
---
fixes:
- |
Minimizes a race condition window when using the ``ironic`` virt driver
where the data generated for the Resource Tracker may attempt to compare
potentially stale instance information with the latest known baremetal
node information. While this doesn't completely prevent or resolve the
underlying race condition identified in
`bug 1841481 <https://bugs.launchpad.net/nova/+bug/1841481>`_,
this change allows Nova to have the latest state information, as opposed
to state information that may be out of date due to the time it may
take to retrieve the status from Ironic. This issue was most observable
on baremetal clusters with several thousand physical nodes.