From f93f675a8563c7c37fdcb0c685a8b491a7311361 Mon Sep 17 00:00:00 2001 From: Dan Smith Date: Tue, 9 May 2017 09:11:27 -0700 Subject: [PATCH] Make compute auto-disable itself if builds are failing This implements an auto-disable feature in nova-compute, where we automatically set our service record to disabled if we consecutively fail to build a certain number of instances. While this is a very useful thing to do in general, disabling a failing compute becomes more important in the future where scheduler retries due to unknown failures may become either impossible or scoped to a single cell. Since a compute that is consistently failing will look very attractive to the scheduler, it may become a build magnet, that in the absence of retries, would effectively kill all builds in a cloud until fixed. Change-Id: I02b7cd87d399d487dd1d650540f503a70bc27749 --- nova/compute/manager.py | 39 +++++++- nova/conf/compute.py | 32 ++++++- nova/tests/unit/compute/test_compute_mgr.py | 89 +++++++++++++++++++ ...te-node-auto-disable-303eb9b0fdb4f3f1.yaml | 17 ++++ 4 files changed, 175 insertions(+), 2 deletions(-) create mode 100644 releasenotes/notes/compute-node-auto-disable-303eb9b0fdb4f3f1.yaml diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 2c027c12265e..59759517dfae 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -528,6 +528,7 @@ class ComputeManager(manager.Manager): CONF.max_concurrent_live_migrations) else: self._live_migration_semaphore = compute_utils.UnlimitedSemaphore() + self._failed_builds = 0 super(ComputeManager, self).__init__(service_name="compute", *args, **kwargs) @@ -1688,6 +1689,31 @@ class ComputeManager(manager.Manager): return block_device_info + def _build_failed(self): + self._failed_builds += 1 + limit = CONF.compute.consecutive_build_service_disable_threshold + if limit and self._failed_builds >= limit: + # NOTE(danms): If we're doing a bunch of parallel builds, + # it is possible (although not likely) that we have already + # failed N-1 builds before this and we race with a successful + # build and disable ourselves here when we might've otherwise + # not. + LOG.error('Disabling service due to %(fails)i ' + 'consecutive build failures', + {'fails': self._failed_builds}) + ctx = nova.context.get_admin_context() + service = objects.Service.get_by_compute_host(ctx, CONF.host) + service.disabled = True + service.disabled_reason = ( + 'Auto-disabled due to %i build failures' % self._failed_builds) + service.save() + # NOTE(danms): Reset our counter now so that when the admin + # re-enables us we can start fresh + self._failed_builds = 0 + elif self._failed_builds > 1: + LOG.warning('%(fails)i consecutive build failures', + {'fails': self._failed_builds}) + @wrap_exception() @reverts_task_state @wrap_instance_fault @@ -1704,7 +1730,18 @@ class ComputeManager(manager.Manager): # for a while and we want to make sure that nothing else tries # to do anything with this instance while we wait. with self._build_semaphore: - self._do_build_and_run_instance(*args, **kwargs) + try: + result = self._do_build_and_run_instance(*args, **kwargs) + except Exception: + result = build_results.FAILED + raise + finally: + fails = (build_results.FAILED, + build_results.RESCHEDULED) + if result in fails: + self._build_failed() + else: + self._failed_builds = 0 # NOTE(danms): We spawn here to return the RPC worker thread back to # the pool. Since what follows could take a really long time, we don't diff --git a/nova/conf/compute.py b/nova/conf/compute.py index 355416c2cf8b..67aa712ec434 100644 --- a/nova/conf/compute.py +++ b/nova/conf/compute.py @@ -23,6 +23,12 @@ from oslo_config import types from nova.conf import paths +compute_group = cfg.OptGroup( + 'compute', + title='Compute Manager Options', + help=""" +A collection of options specific to the nova-compute service. +""") compute_opts = [ cfg.StrOpt('compute_driver', help=""" @@ -637,6 +643,27 @@ Possible values: """) ] +compute_group_opts = [ + cfg.IntOpt('consecutive_build_service_disable_threshold', + default=10, + help=""" +Number of consecutive failed builds that result in disabling a compute service. + +This option will cause nova-compute to set itself to a disabled state +if a certain number of consecutive build failures occur. This will +prevent the scheduler from continuing to send builds to a compute node that is +consistently failing. Note that all failures qualify and count towards this +score, including reschedules that may have been due to racy scheduler behavior. +Since the failures must be consecutive, it is unlikely that occasional expected +reschedules will actually disable a compute node. + +Possible values: + +* Any positive integer representing a build failure count. +* Zero to never auto-disable. +"""), +] + interval_opts = [ cfg.IntOpt('image_cache_manager_interval', default=2400, @@ -1139,7 +1166,10 @@ ALL_OPTS = (compute_opts + def register_opts(conf): conf.register_opts(ALL_OPTS) + conf.register_group(compute_group) + conf.register_opts(compute_group_opts, group=compute_group) def list_opts(): - return {'DEFAULT': ALL_OPTS} + return {'DEFAULT': ALL_OPTS, + 'compute': compute_opts} diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index fa88fade675f..55f25b9f1802 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -4040,6 +4040,95 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): set_error=True, cleanup_volumes=True, nil_out_host_and_node=True) + @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance') + @mock.patch('nova.objects.Service.get_by_compute_host') + def test_build_failures_disable_service(self, mock_service, mock_dbari): + mock_dbari.return_value = build_results.FAILED + instance = objects.Instance(uuid=uuids.instance) + for i in range(0, 10): + self.compute.build_and_run_instance(None, instance, None, + None, None) + service = mock_service.return_value + self.assertTrue(service.disabled) + self.assertEqual('Auto-disabled due to 10 build failures', + service.disabled_reason) + service.save.assert_called_once_with() + self.assertEqual(0, self.compute._failed_builds) + + @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance') + @mock.patch('nova.objects.Service.get_by_compute_host') + def test_build_failures_not_disable_service(self, mock_service, + mock_dbari): + self.flags(consecutive_build_service_disable_threshold=0, + group='compute') + mock_dbari.return_value = build_results.FAILED + instance = objects.Instance(uuid=uuids.instance) + for i in range(0, 10): + self.compute.build_and_run_instance(None, instance, None, + None, None) + service = mock_service.return_value + self.assertFalse(service.save.called) + self.assertEqual(10, self.compute._failed_builds) + + @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance') + @mock.patch('nova.objects.Service.get_by_compute_host') + def test_transient_build_failures_no_disable_service(self, mock_service, + mock_dbari): + results = [build_results.FAILED, + build_results.ACTIVE, + build_results.RESCHEDULED] + + def _fake_build(*a, **k): + if results: + return results.pop(0) + else: + return build_results.ACTIVE + + mock_dbari.side_effect = _fake_build + instance = objects.Instance(uuid=uuids.instance) + for i in range(0, 10): + self.compute.build_and_run_instance(None, instance, None, + None, None) + service = mock_service.return_value + self.assertFalse(service.save.called) + self.assertEqual(0, self.compute._failed_builds) + + @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance') + @mock.patch('nova.objects.Service.get_by_compute_host') + def test_build_reschedules_disable_service(self, mock_service, mock_dbari): + mock_dbari.return_value = build_results.RESCHEDULED + instance = objects.Instance(uuid=uuids.instance) + for i in range(0, 10): + self.compute.build_and_run_instance(None, instance, None, + None, None) + service = mock_service.return_value + self.assertTrue(service.disabled) + self.assertEqual('Auto-disabled due to 10 build failures', + service.disabled_reason) + service.save.assert_called_once_with() + self.assertEqual(0, self.compute._failed_builds) + + @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance') + @mock.patch('nova.objects.Service.get_by_compute_host') + @mock.patch('nova.exception_wrapper._emit_exception_notification') + @mock.patch('nova.compute.utils.add_instance_fault_from_exc') + def test_build_exceptions_disable_service(self, mock_if, mock_notify, + mock_service, mock_dbari): + mock_dbari.side_effect = test.TestingException() + instance = objects.Instance(uuid=uuids.instance, + task_state=None) + for i in range(0, 10): + self.assertRaises(test.TestingException, + self.compute.build_and_run_instance, + None, instance, None, + None, None) + service = mock_service.return_value + self.assertTrue(service.disabled) + self.assertEqual('Auto-disabled due to 10 build failures', + service.disabled_reason) + service.save.assert_called_once_with() + self.assertEqual(0, self.compute._failed_builds) + @mock.patch.object(manager.ComputeManager, '_shutdown_instance') @mock.patch.object(manager.ComputeManager, '_build_networks_for_instance') @mock.patch.object(fake_driver.FakeDriver, 'spawn') diff --git a/releasenotes/notes/compute-node-auto-disable-303eb9b0fdb4f3f1.yaml b/releasenotes/notes/compute-node-auto-disable-303eb9b0fdb4f3f1.yaml new file mode 100644 index 000000000000..a045b3bb206d --- /dev/null +++ b/releasenotes/notes/compute-node-auto-disable-303eb9b0fdb4f3f1.yaml @@ -0,0 +1,17 @@ +--- +features: + - | + The `nova-compute` worker can automatically disable itself in the + service database if consecutive build failures exceed a set threshold. The + ``[compute]/consecutive_build_service_disable_threshold`` configuration option + allows setting the threshold for this behavior, or disabling it entirely if + desired. + The intent is that an admin will examine the issue before manually + re-enabling the service, which will avoid that compute node becoming a + black hole build magnet. +upgrade: + - | + The new configuration option + ``[compute]/consecutive_build_service_disable_threshold`` + defaults to a nonzero value, which means multiple failed builds will + result in a compute node auto-disabling itself. \ No newline at end of file