Consider startup scenario in _get_compute_nodes_in_db
Before this change, on the first start of the nova-compute service on any host, this method logged the "No compute node record for host" error, which is confusing for people debugging issues. For example, if the compute service starts before the Placement service is running, the compute checks in but fails to connect to the placement endpoint, which might take the compute node out of consideration for scheduling. When debugging that kind of issue, people can get hung up on this error message even though it is actually an expected case on the first start of nova-compute on a new host.

This change simply plumbs a boolean through to tell the method whether we are starting up. That does not distinguish a first start of the service from a restart, but the thinking is: on a restart we should not hit the NotFound error at all, and if we do, then we really do want the error-level message.

Change-Id: Id7a05b579ead6ac5445ca5b1eeab6d223d545a6c
parent d48aeb5783
commit 50d402821b
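The decision the patch makes in _get_compute_nodes_in_db boils down to picking the log level from the new startup flag. Below is a rough standalone sketch of that behavior only, not nova's actual code: it uses the plain logging module instead of nova's LOG/_LW/_LE helpers, and a generic lookup callable plus LookupError stand in for ComputeNodeList.get_all_by_host and exception.NotFound.

import logging

LOG = logging.getLogger(__name__)


def get_compute_nodes_in_db(lookup, host, startup=False):
    """Illustrative stand-in for ComputeManager._get_compute_nodes_in_db.

    ``lookup`` plays the role of ComputeNodeList.get_all_by_host and is
    expected to raise LookupError when no record exists for ``host``.
    """
    try:
        return lookup(host)
    except LookupError:
        if startup:
            # Expected on the very first start of the service on this host.
            LOG.warning("No compute node record found for host %s. If this "
                        "is the first time this service is starting on this "
                        "host, then you can ignore this warning.", host)
        else:
            # Once the service has been running, a missing record is a real
            # problem, so keep the error-level message.
            LOG.error("No compute node record for host %s", host)
        return []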
@@ -1165,7 +1165,8 @@ class ComputeManager(manager.Manager):
         the service up by listening on RPC queues, make sure to update
         our available resources (and indirectly our available nodes).
         """
-        self.update_available_resource(nova.context.get_admin_context())
+        self.update_available_resource(nova.context.get_admin_context(),
+                                       startup=True)

     def _get_power_state(self, context, instance):
         """Retrieve the power state for the given instance."""
@@ -6559,17 +6560,20 @@ class ComputeManager(manager.Manager):
                           "%(node)s."), {'node': nodename})

     @periodic_task.periodic_task(spacing=CONF.update_resources_interval)
-    def update_available_resource(self, context):
+    def update_available_resource(self, context, startup=False):
         """See driver.get_available_resource()

         Periodic process that keeps that the compute host's understanding of
         resource availability and usage in sync with the underlying hypervisor.

         :param context: security context
+        :param startup: True if this is being called when the nova-compute
+                        service is starting, False otherwise.
         """

         compute_nodes_in_db = self._get_compute_nodes_in_db(context,
-                                                            use_slave=True)
+                                                            use_slave=True,
+                                                            startup=startup)
         nodenames = set(self.driver.get_available_nodes())
         for nodename in nodenames:
             self.update_available_resource_for_node(context, nodename)
@@ -6589,12 +6593,19 @@ class ComputeManager(manager.Manager):
                 self.scheduler_client.reportclient.delete_resource_provider(
                     context, cn, cascade=True)

-    def _get_compute_nodes_in_db(self, context, use_slave=False):
+    def _get_compute_nodes_in_db(self, context, use_slave=False,
+                                 startup=False):
         try:
             return objects.ComputeNodeList.get_all_by_host(context, self.host,
                                                            use_slave=use_slave)
         except exception.NotFound:
-            LOG.error(_LE("No compute node record for host %s"), self.host)
+            if startup:
+                LOG.warning(
+                    _LW("No compute node record found for host %s. If this is "
+                        "the first time this service is starting on this "
+                        "host, then you can ignore this warning."), self.host)
+            else:
+                LOG.error(_LE("No compute node record for host %s"), self.host)
             return []

     @periodic_task.periodic_task(
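Taken together with the pre_start_hook and update_available_resource hunks above, the flag only changes the log level for the NotFound case. As a quick usage example run against the standalone sketch shown after the commit metadata (the _no_record helper here is made up for illustration):

def _no_record(host):
    # Stand-in for a lookup that finds no compute node record for the host.
    raise LookupError(host)


# First start of nova-compute on a new host: expected case, warning only.
get_compute_nodes_in_db(_no_record, 'compute1', startup=True)

# Periodic task path, where startup defaults to False: still an error.
get_compute_nodes_in_db(_no_record, 'compute1')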
@@ -160,7 +160,7 @@ class BaseTestCase(test.TestCase):
                                                             self.compute.driver)
         self.compute._resource_tracker = fake_rt

-        def fake_get_compute_nodes_in_db(self, context, use_slave=False):
+        def fake_get_compute_nodes_in_db(self, context, **kwargs):
             fake_compute_nodes = [{'local_gb': 259,
                                    'uuid': uuids.fake_compute_node,
                                    'vcpus_used': 0,
@@ -226,7 +226,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         get_db_nodes.return_value = db_nodes
         get_avail_nodes.return_value = avail_nodes
         self.compute.update_available_resource(self.context)
-        get_db_nodes.assert_called_once_with(self.context, use_slave=True)
+        get_db_nodes.assert_called_once_with(self.context, use_slave=True,
+                                             startup=False)
         update_mock.has_calls(
             [mock.call(self.context, node) for node in avail_nodes_l]
         )
@@ -240,6 +241,32 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         else:
             self.assertFalse(db_node.destroy.called)

+    @mock.patch('nova.context.get_admin_context')
+    def test_pre_start_hook(self, get_admin_context):
+        """Very simple test just to make sure update_available_resource is
+        called as expected.
+        """
+        with mock.patch.object(
+                self.compute, 'update_available_resource') as update_res:
+            self.compute.pre_start_hook()
+        update_res.assert_called_once_with(
+            get_admin_context.return_value, startup=True)
+
+    @mock.patch.object(objects.ComputeNodeList, 'get_all_by_host',
+                       side_effect=exception.NotFound)
+    @mock.patch('nova.compute.manager.LOG')
+    def test_get_compute_nodes_in_db_on_startup(self, mock_log,
+                                                get_all_by_host):
+        """Tests to make sure we only log a warning when we do not find a
+        compute node on startup since this may be expected.
+        """
+        self.assertEqual([], self.compute._get_compute_nodes_in_db(
+            self.context, startup=True))
+        get_all_by_host.assert_called_once_with(
+            self.context, self.compute.host, use_slave=False)
+        self.assertTrue(mock_log.warning.called)
+        self.assertFalse(mock_log.error.called)
+
     @mock.patch('nova.compute.utils.notify_about_instance_action')
     def test_delete_instance_without_info_cache(self, mock_notify):
         instance = fake_instance.fake_instance_obj(
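Not part of this change, but a natural companion check for the default (non-startup) path could look roughly like the following; it mirrors test_get_compute_nodes_in_db_on_startup above, and the method name test_get_compute_nodes_in_db_not_started is hypothetical:

    @mock.patch.object(objects.ComputeNodeList, 'get_all_by_host',
                       side_effect=exception.NotFound)
    @mock.patch('nova.compute.manager.LOG')
    def test_get_compute_nodes_in_db_not_started(self, mock_log,
                                                 get_all_by_host):
        """Hypothetical companion test: with the default startup=False, the
        NotFound case should still be logged at error level.
        """
        self.assertEqual([], self.compute._get_compute_nodes_in_db(
            self.context))
        get_all_by_host.assert_called_once_with(
            self.context, self.compute.host, use_slave=False)
        self.assertTrue(mock_log.error.called)
        self.assertFalse(mock_log.warning.called)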