Merge "Change consecutive build failure limit to a weigher" into stable/pike
This commit is contained in:
commit
8a08cf332e
@ -860,6 +860,11 @@ Hosts and cells are weighted based on the following options in the
|
|||||||
- Multiplier used for weighing hosts for group soft-anti-affinity. Only a
|
- Multiplier used for weighing hosts for group soft-anti-affinity. Only a
|
||||||
positive value is meaningful. Negative means that the behavior will
|
positive value is meaningful. Negative means that the behavior will
|
||||||
change to the opposite, which is soft-affinity.
|
change to the opposite, which is soft-affinity.
|
||||||
|
* - [filter_scheduler]
|
||||||
|
- ``build_failure_weight_multiplier``
|
||||||
|
- Multiplier used for weighing hosts which have recent build failures. A
|
||||||
|
positive value increases the significance of build failures reported by
|
||||||
|
the host recently, making them less likely to be chosen.
|
||||||
* - [metrics]
|
* - [metrics]
|
||||||
- ``weight_multiplier``
|
- ``weight_multiplier``
|
||||||
- Multiplier for weighting meters. Use a floating-point value.
|
- Multiplier for weighting meters. Use a floating-point value.
|
||||||
|
@ -532,7 +532,6 @@ class ComputeManager(manager.Manager):
|
|||||||
CONF.max_concurrent_live_migrations)
|
CONF.max_concurrent_live_migrations)
|
||||||
else:
|
else:
|
||||||
self._live_migration_semaphore = compute_utils.UnlimitedSemaphore()
|
self._live_migration_semaphore = compute_utils.UnlimitedSemaphore()
|
||||||
self._failed_builds = 0
|
|
||||||
|
|
||||||
super(ComputeManager, self).__init__(service_name="compute",
|
super(ComputeManager, self).__init__(service_name="compute",
|
||||||
*args, **kwargs)
|
*args, **kwargs)
|
||||||
@ -1710,29 +1709,15 @@ class ComputeManager(manager.Manager):
|
|||||||
return block_device_info
|
return block_device_info
|
||||||
|
|
||||||
def _build_failed(self):
|
def _build_failed(self):
|
||||||
self._failed_builds += 1
|
if CONF.compute.consecutive_build_service_disable_threshold:
|
||||||
limit = CONF.compute.consecutive_build_service_disable_threshold
|
rt = self._get_resource_tracker()
|
||||||
if limit and self._failed_builds >= limit:
|
# NOTE(danms): Update our counter, but wait for the next
|
||||||
# NOTE(danms): If we're doing a bunch of parallel builds,
|
# update_available_resource() periodic to flush it to the DB
|
||||||
# it is possible (although not likely) that we have already
|
rt.stats.build_failed()
|
||||||
# failed N-1 builds before this and we race with a successful
|
|
||||||
# build and disable ourselves here when we might've otherwise
|
def _build_succeeded(self):
|
||||||
# not.
|
rt = self._get_resource_tracker()
|
||||||
LOG.error('Disabling service due to %(fails)i '
|
rt.stats.build_succeeded()
|
||||||
'consecutive build failures',
|
|
||||||
{'fails': self._failed_builds})
|
|
||||||
ctx = nova.context.get_admin_context()
|
|
||||||
service = objects.Service.get_by_compute_host(ctx, CONF.host)
|
|
||||||
service.disabled = True
|
|
||||||
service.disabled_reason = (
|
|
||||||
'Auto-disabled due to %i build failures' % self._failed_builds)
|
|
||||||
service.save()
|
|
||||||
# NOTE(danms): Reset our counter now so that when the admin
|
|
||||||
# re-enables us we can start fresh
|
|
||||||
self._failed_builds = 0
|
|
||||||
elif self._failed_builds > 1:
|
|
||||||
LOG.warning('%(fails)i consecutive build failures',
|
|
||||||
{'fails': self._failed_builds})
|
|
||||||
|
|
||||||
@wrap_exception()
|
@wrap_exception()
|
||||||
@reverts_task_state
|
@reverts_task_state
|
||||||
@ -1783,7 +1768,7 @@ class ComputeManager(manager.Manager):
|
|||||||
|
|
||||||
self._build_failed()
|
self._build_failed()
|
||||||
else:
|
else:
|
||||||
self._failed_builds = 0
|
self._build_succeeded()
|
||||||
|
|
||||||
# NOTE(danms): We spawn here to return the RPC worker thread back to
|
# NOTE(danms): We spawn here to return the RPC worker thread back to
|
||||||
# the pool. Since what follows could take a really long time, we don't
|
# the pool. Since what follows could take a really long time, we don't
|
||||||
|
@ -628,7 +628,13 @@ class ResourceTracker(object):
|
|||||||
def _copy_resources(self, compute_node, resources):
|
def _copy_resources(self, compute_node, resources):
|
||||||
"""Copy resource values to supplied compute_node."""
|
"""Copy resource values to supplied compute_node."""
|
||||||
# purge old stats and init with anything passed in by the driver
|
# purge old stats and init with anything passed in by the driver
|
||||||
|
# NOTE(danms): Preserve 'failed_builds' across the stats clearing,
|
||||||
|
# as that is not part of resources
|
||||||
|
# TODO(danms): Stop doing this when we get a column to store this
|
||||||
|
# directly
|
||||||
|
prev_failed_builds = self.stats.get('failed_builds', 0)
|
||||||
self.stats.clear()
|
self.stats.clear()
|
||||||
|
self.stats['failed_builds'] = prev_failed_builds
|
||||||
self.stats.digest_stats(resources.get('stats'))
|
self.stats.digest_stats(resources.get('stats'))
|
||||||
compute_node.stats = copy.deepcopy(self.stats)
|
compute_node.stats = copy.deepcopy(self.stats)
|
||||||
|
|
||||||
|
@ -138,3 +138,11 @@ class Stats(dict):
|
|||||||
os_type=os_type, project_id=project_id)
|
os_type=os_type, project_id=project_id)
|
||||||
|
|
||||||
return (vm_state, task_state, os_type, project_id)
|
return (vm_state, task_state, os_type, project_id)
|
||||||
|
|
||||||
|
def build_failed(self):
|
||||||
|
self['failed_builds'] = self.get('failed_builds', 0) + 1
|
||||||
|
|
||||||
|
def build_succeeded(self):
|
||||||
|
# FIXME(danms): Make this more graceful, either by time-based aging or
|
||||||
|
# a fixed decline upon success
|
||||||
|
self['failed_builds'] = 0
|
||||||
|
@ -638,20 +638,20 @@ compute_group_opts = [
|
|||||||
cfg.IntOpt('consecutive_build_service_disable_threshold',
|
cfg.IntOpt('consecutive_build_service_disable_threshold',
|
||||||
default=10,
|
default=10,
|
||||||
help="""
|
help="""
|
||||||
Number of consecutive failed builds that result in disabling a compute service.
|
Enables reporting of build failures to the scheduler.
|
||||||
|
|
||||||
This option will cause nova-compute to set itself to a disabled state
|
Any nonzero value will enable sending build failure statistics to the
|
||||||
if a certain number of consecutive build failures occur. This will
|
scheduler for use by the BuildFailureWeigher.
|
||||||
prevent the scheduler from continuing to send builds to a compute node that is
|
|
||||||
consistently failing. Note that all failures qualify and count towards this
|
|
||||||
score, including reschedules that may have been due to racy scheduler behavior.
|
|
||||||
Since the failures must be consecutive, it is unlikely that occasional expected
|
|
||||||
reschedules will actually disable a compute node.
|
|
||||||
|
|
||||||
Possible values:
|
Possible values:
|
||||||
|
|
||||||
* Any positive integer representing a build failure count.
|
* Any positive integer enables reporting build failures.
|
||||||
* Zero to never auto-disable.
|
* Zero to disable reporting build failures.
|
||||||
|
|
||||||
|
Related options:
|
||||||
|
|
||||||
|
* [filter_scheduler]/build_failure_weight_multiplier
|
||||||
|
|
||||||
"""),
|
"""),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -477,6 +477,34 @@ Possible values:
|
|||||||
for hosts with group soft anti-affinity. Only a positive value are
|
for hosts with group soft anti-affinity. Only a positive value are
|
||||||
meaningful, as negative values would make this behave as a soft affinity
|
meaningful, as negative values would make this behave as a soft affinity
|
||||||
weigher.
|
weigher.
|
||||||
|
"""),
|
||||||
|
cfg.FloatOpt(
|
||||||
|
"build_failure_weight_multiplier",
|
||||||
|
default=1000000.0,
|
||||||
|
help="""
|
||||||
|
Multiplier used for weighing hosts that have had recent build failures.
|
||||||
|
|
||||||
|
This option determines how much weight is placed on a compute node with
|
||||||
|
recent build failures. Build failures may indicate a failing, misconfigured,
|
||||||
|
or otherwise ailing compute node, and avoiding it during scheduling may be
|
||||||
|
beneficial. The weight is inversely proportional to the number of recent
|
||||||
|
build failures the compute node has experienced. This value should be
|
||||||
|
set to some high value to offset weight given by other enabled weighers
|
||||||
|
due to available resources. To disable weighing compute hosts by the
|
||||||
|
number of recent failures, set this to zero.
|
||||||
|
|
||||||
|
This option is only used by the FilterScheduler and its subclasses; if you use
|
||||||
|
a different scheduler, this option has no effect.
|
||||||
|
|
||||||
|
Possible values:
|
||||||
|
|
||||||
|
* An integer or float value, where the value corresponds to the multiplier
|
||||||
|
ratio for this weigher.
|
||||||
|
|
||||||
|
Related options:
|
||||||
|
|
||||||
|
* [compute]/consecutive_build_service_disable_threshold - Must be nonzero
|
||||||
|
for a compute to report data considered by this weigher.
|
||||||
"""),
|
"""),
|
||||||
# TODO(mikal): replace this option with something involving host aggregates
|
# TODO(mikal): replace this option with something involving host aggregates
|
||||||
cfg.ListOpt("isolated_images",
|
cfg.ListOpt("isolated_images",
|
||||||
|
@ -257,6 +257,9 @@ class HostState(object):
|
|||||||
self.ram_allocation_ratio = compute.ram_allocation_ratio
|
self.ram_allocation_ratio = compute.ram_allocation_ratio
|
||||||
self.disk_allocation_ratio = compute.disk_allocation_ratio
|
self.disk_allocation_ratio = compute.disk_allocation_ratio
|
||||||
|
|
||||||
|
# update failed_builds counter reported by the compute
|
||||||
|
self.failed_builds = int(self.stats.get('failed_builds', 0))
|
||||||
|
|
||||||
def consume_from_request(self, spec_obj):
|
def consume_from_request(self, spec_obj):
|
||||||
"""Incrementally update host state from a RequestSpec object."""
|
"""Incrementally update host state from a RequestSpec object."""
|
||||||
|
|
||||||
|
@ -76,6 +76,9 @@ class IronicNodeState(host_manager.HostState):
|
|||||||
self.ram_allocation_ratio = compute.ram_allocation_ratio
|
self.ram_allocation_ratio = compute.ram_allocation_ratio
|
||||||
self.disk_allocation_ratio = compute.disk_allocation_ratio
|
self.disk_allocation_ratio = compute.disk_allocation_ratio
|
||||||
|
|
||||||
|
# update failed_builds counter reported by the compute
|
||||||
|
self.failed_builds = int(self.stats.get('failed_builds', 0))
|
||||||
|
|
||||||
self.updated = compute.updated_at
|
self.updated = compute.updated_at
|
||||||
|
|
||||||
def _locked_consume_from_request(self, spec_obj):
|
def _locked_consume_from_request(self, spec_obj):
|
||||||
|
33
nova/scheduler/weights/compute.py
Normal file
33
nova/scheduler/weights/compute.py
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
# not use this file except in compliance with the License. You may obtain
|
||||||
|
# a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||||
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||||
|
# License for the specific language governing permissions and limitations
|
||||||
|
# under the License.
|
||||||
|
"""
|
||||||
|
BuildFailure Weigher. Weigh hosts by the number of recent failed boot attempts.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
import nova.conf
|
||||||
|
from nova.scheduler import weights
|
||||||
|
|
||||||
|
CONF = nova.conf.CONF
|
||||||
|
|
||||||
|
|
||||||
|
class BuildFailureWeigher(weights.BaseHostWeigher):
|
||||||
|
def weight_multiplier(self):
|
||||||
|
"""Override the weight multiplier. Note this is negated."""
|
||||||
|
return -1 * CONF.filter_scheduler.build_failure_weight_multiplier
|
||||||
|
|
||||||
|
def _weigh_object(self, host_state, weight_properties):
|
||||||
|
"""Higher weights win. Our multiplier is negative, so reduce our
|
||||||
|
weight by number of failed builds.
|
||||||
|
"""
|
||||||
|
return host_state.failed_builds
|
@ -324,6 +324,11 @@ class TestCase(testtools.TestCase):
|
|||||||
|
|
||||||
self.useFixture(nova_fixtures.ForbidNewLegacyNotificationFixture())
|
self.useFixture(nova_fixtures.ForbidNewLegacyNotificationFixture())
|
||||||
|
|
||||||
|
# FIXME(danms): Disable this for all tests by default to avoid breaking
|
||||||
|
# any that depend on default/previous ordering
|
||||||
|
self.flags(build_failure_weight_multiplier=0.0,
|
||||||
|
group='filter_scheduler')
|
||||||
|
|
||||||
def _setup_cells(self):
|
def _setup_cells(self):
|
||||||
"""Setup a normal cellsv2 environment.
|
"""Setup a normal cellsv2 environment.
|
||||||
|
|
||||||
|
@ -58,6 +58,7 @@ class ServersTestBase(integrated_helpers._IntegratedTestBase):
|
|||||||
_min_count_parameter = 'min_count'
|
_min_count_parameter = 'min_count'
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
self.computes = {}
|
||||||
super(ServersTestBase, self).setUp()
|
super(ServersTestBase, self).setUp()
|
||||||
# The network service is called as part of server creates but no
|
# The network service is called as part of server creates but no
|
||||||
# networks have been populated in the db, so stub the methods.
|
# networks have been populated in the db, so stub the methods.
|
||||||
@ -118,6 +119,30 @@ class ServersTest(ServersTestBase):
|
|||||||
for server in servers:
|
for server in servers:
|
||||||
LOG.debug("server: %s", server)
|
LOG.debug("server: %s", server)
|
||||||
|
|
||||||
|
def _get_node_build_failures(self):
|
||||||
|
ctxt = context.get_admin_context()
|
||||||
|
computes = objects.ComputeNodeList.get_all(ctxt)
|
||||||
|
return {
|
||||||
|
node.hypervisor_hostname: int(node.stats.get('failed_builds', 0))
|
||||||
|
for node in computes}
|
||||||
|
|
||||||
|
def _run_periodics(self):
|
||||||
|
"""Run the update_available_resource task on every compute manager
|
||||||
|
|
||||||
|
This runs periodics on the computes in an undefined order; some child
|
||||||
|
classes redefine this function to force a specific order.
|
||||||
|
"""
|
||||||
|
|
||||||
|
if self.compute.host not in self.computes:
|
||||||
|
self.computes[self.compute.host] = self.compute
|
||||||
|
|
||||||
|
ctx = context.get_admin_context()
|
||||||
|
for compute in self.computes.values():
|
||||||
|
LOG.info('Running periodic for compute (%s)',
|
||||||
|
compute.manager.host)
|
||||||
|
compute.manager.update_available_resource(ctx)
|
||||||
|
LOG.info('Finished with periodics')
|
||||||
|
|
||||||
def test_create_server_with_error(self):
|
def test_create_server_with_error(self):
|
||||||
# Create a server which will enter error state.
|
# Create a server which will enter error state.
|
||||||
|
|
||||||
@ -139,6 +164,12 @@ class ServersTest(ServersTestBase):
|
|||||||
self.assertEqual('ERROR', found_server['status'])
|
self.assertEqual('ERROR', found_server['status'])
|
||||||
self._delete_server(created_server_id)
|
self._delete_server(created_server_id)
|
||||||
|
|
||||||
|
# We should have no (persisted) build failures until we update
|
||||||
|
# resources, after which we should have one
|
||||||
|
self.assertEqual([0], list(self._get_node_build_failures().values()))
|
||||||
|
self._run_periodics()
|
||||||
|
self.assertEqual([1], list(self._get_node_build_failures().values()))
|
||||||
|
|
||||||
def test_create_and_delete_server(self):
|
def test_create_and_delete_server(self):
|
||||||
# Creates and deletes a server.
|
# Creates and deletes a server.
|
||||||
|
|
||||||
|
@ -4761,24 +4761,19 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
|
|||||||
nil_out_host_and_node=True)
|
nil_out_host_and_node=True)
|
||||||
|
|
||||||
@mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
|
@mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
|
||||||
@mock.patch('nova.objects.Service.get_by_compute_host')
|
@mock.patch('nova.compute.stats.Stats.build_failed')
|
||||||
def test_build_failures_disable_service(self, mock_service, mock_dbari):
|
def test_build_failures_reported(self, mock_failed, mock_dbari):
|
||||||
mock_dbari.return_value = build_results.FAILED
|
mock_dbari.return_value = build_results.FAILED
|
||||||
instance = objects.Instance(uuid=uuids.instance)
|
instance = objects.Instance(uuid=uuids.instance)
|
||||||
for i in range(0, 10):
|
for i in range(0, 10):
|
||||||
self.compute.build_and_run_instance(None, instance, None,
|
self.compute.build_and_run_instance(None, instance, None,
|
||||||
None, None)
|
None, None)
|
||||||
service = mock_service.return_value
|
|
||||||
self.assertTrue(service.disabled)
|
self.assertEqual(10, mock_failed.call_count)
|
||||||
self.assertEqual('Auto-disabled due to 10 build failures',
|
|
||||||
service.disabled_reason)
|
|
||||||
service.save.assert_called_once_with()
|
|
||||||
self.assertEqual(0, self.compute._failed_builds)
|
|
||||||
|
|
||||||
@mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
|
@mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
|
||||||
@mock.patch('nova.objects.Service.get_by_compute_host')
|
@mock.patch('nova.compute.stats.Stats.build_failed')
|
||||||
def test_build_failures_not_disable_service(self, mock_service,
|
def test_build_failures_not_reported(self, mock_failed, mock_dbari):
|
||||||
mock_dbari):
|
|
||||||
self.flags(consecutive_build_service_disable_threshold=0,
|
self.flags(consecutive_build_service_disable_threshold=0,
|
||||||
group='compute')
|
group='compute')
|
||||||
mock_dbari.return_value = build_results.FAILED
|
mock_dbari.return_value = build_results.FAILED
|
||||||
@ -4786,14 +4781,15 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
|
|||||||
for i in range(0, 10):
|
for i in range(0, 10):
|
||||||
self.compute.build_and_run_instance(None, instance, None,
|
self.compute.build_and_run_instance(None, instance, None,
|
||||||
None, None)
|
None, None)
|
||||||
service = mock_service.return_value
|
|
||||||
self.assertFalse(service.save.called)
|
mock_failed.assert_not_called()
|
||||||
self.assertEqual(10, self.compute._failed_builds)
|
|
||||||
|
|
||||||
@mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
|
@mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
|
||||||
@mock.patch('nova.objects.Service.get_by_compute_host')
|
@mock.patch.object(manager.ComputeManager, '_build_failed')
|
||||||
def test_transient_build_failures_no_disable_service(self, mock_service,
|
@mock.patch.object(manager.ComputeManager, '_build_succeeded')
|
||||||
mock_dbari):
|
def test_transient_build_failures_no_report(self, mock_succeeded,
|
||||||
|
mock_failed,
|
||||||
|
mock_dbari):
|
||||||
results = [build_results.FAILED,
|
results = [build_results.FAILED,
|
||||||
build_results.ACTIVE,
|
build_results.ACTIVE,
|
||||||
build_results.RESCHEDULED]
|
build_results.RESCHEDULED]
|
||||||
@ -4809,31 +4805,34 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
|
|||||||
for i in range(0, 10):
|
for i in range(0, 10):
|
||||||
self.compute.build_and_run_instance(None, instance, None,
|
self.compute.build_and_run_instance(None, instance, None,
|
||||||
None, None)
|
None, None)
|
||||||
service = mock_service.return_value
|
|
||||||
self.assertFalse(service.save.called)
|
self.assertEqual(2, mock_failed.call_count)
|
||||||
self.assertEqual(0, self.compute._failed_builds)
|
self.assertEqual(8, mock_succeeded.call_count)
|
||||||
|
|
||||||
@mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
|
@mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
|
||||||
@mock.patch('nova.objects.Service.get_by_compute_host')
|
@mock.patch.object(manager.ComputeManager, '_build_failed')
|
||||||
def test_build_reschedules_disable_service(self, mock_service, mock_dbari):
|
@mock.patch.object(manager.ComputeManager, '_build_succeeded')
|
||||||
|
def test_build_reschedules_reported(self, mock_succeeded,
|
||||||
|
mock_failed,
|
||||||
|
mock_dbari):
|
||||||
mock_dbari.return_value = build_results.RESCHEDULED
|
mock_dbari.return_value = build_results.RESCHEDULED
|
||||||
instance = objects.Instance(uuid=uuids.instance)
|
instance = objects.Instance(uuid=uuids.instance)
|
||||||
for i in range(0, 10):
|
for i in range(0, 10):
|
||||||
self.compute.build_and_run_instance(None, instance, None,
|
self.compute.build_and_run_instance(None, instance, None,
|
||||||
None, None)
|
None, None)
|
||||||
service = mock_service.return_value
|
|
||||||
self.assertTrue(service.disabled)
|
self.assertEqual(10, mock_failed.call_count)
|
||||||
self.assertEqual('Auto-disabled due to 10 build failures',
|
mock_succeeded.assert_not_called()
|
||||||
service.disabled_reason)
|
|
||||||
service.save.assert_called_once_with()
|
|
||||||
self.assertEqual(0, self.compute._failed_builds)
|
|
||||||
|
|
||||||
@mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
|
@mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
|
||||||
@mock.patch('nova.objects.Service.get_by_compute_host')
|
|
||||||
@mock.patch('nova.exception_wrapper._emit_exception_notification')
|
@mock.patch('nova.exception_wrapper._emit_exception_notification')
|
||||||
@mock.patch('nova.compute.utils.add_instance_fault_from_exc')
|
@mock.patch('nova.compute.utils.add_instance_fault_from_exc')
|
||||||
def test_build_exceptions_disable_service(self, mock_if, mock_notify,
|
@mock.patch.object(manager.ComputeManager, '_build_failed')
|
||||||
mock_service, mock_dbari):
|
@mock.patch.object(manager.ComputeManager, '_build_succeeded')
|
||||||
|
def test_build_exceptions_reported(self, mock_succeeded,
|
||||||
|
mock_failed,
|
||||||
|
mock_if, mock_notify,
|
||||||
|
mock_dbari):
|
||||||
mock_dbari.side_effect = test.TestingException()
|
mock_dbari.side_effect = test.TestingException()
|
||||||
instance = objects.Instance(uuid=uuids.instance,
|
instance = objects.Instance(uuid=uuids.instance,
|
||||||
task_state=None)
|
task_state=None)
|
||||||
@ -4842,12 +4841,9 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
|
|||||||
self.compute.build_and_run_instance,
|
self.compute.build_and_run_instance,
|
||||||
None, instance, None,
|
None, instance, None,
|
||||||
None, None)
|
None, None)
|
||||||
service = mock_service.return_value
|
|
||||||
self.assertTrue(service.disabled)
|
self.assertEqual(10, mock_failed.call_count)
|
||||||
self.assertEqual('Auto-disabled due to 10 build failures',
|
mock_succeeded.assert_not_called()
|
||||||
service.disabled_reason)
|
|
||||||
service.save.assert_called_once_with()
|
|
||||||
self.assertEqual(0, self.compute._failed_builds)
|
|
||||||
|
|
||||||
@mock.patch.object(manager.ComputeManager, '_shutdown_instance')
|
@mock.patch.object(manager.ComputeManager, '_shutdown_instance')
|
||||||
@mock.patch.object(manager.ComputeManager, '_build_networks_for_instance')
|
@mock.patch.object(manager.ComputeManager, '_build_networks_for_instance')
|
||||||
|
@ -1148,7 +1148,7 @@ class TestInitComputeNode(BaseTestCase):
|
|||||||
ram_allocation_ratio=1.0,
|
ram_allocation_ratio=1.0,
|
||||||
cpu_allocation_ratio=1.0,
|
cpu_allocation_ratio=1.0,
|
||||||
disk_allocation_ratio=1.0,
|
disk_allocation_ratio=1.0,
|
||||||
stats={},
|
stats={'failed_builds': 0},
|
||||||
pci_device_pools=objects.PciDevicePoolList(objects=[]),
|
pci_device_pools=objects.PciDevicePoolList(objects=[]),
|
||||||
uuid=uuids.compute_node_uuid
|
uuid=uuids.compute_node_uuid
|
||||||
)
|
)
|
||||||
|
@ -238,3 +238,19 @@ class StatsTestCase(test.NoDBTestCase):
|
|||||||
|
|
||||||
self.assertEqual(0, len(self.stats))
|
self.assertEqual(0, len(self.stats))
|
||||||
self.assertEqual(0, len(self.stats.states))
|
self.assertEqual(0, len(self.stats.states))
|
||||||
|
|
||||||
|
def test_build_failed_succeded(self):
|
||||||
|
self.assertEqual('not-set', self.stats.get('failed_builds', 'not-set'))
|
||||||
|
self.stats.build_failed()
|
||||||
|
self.assertEqual(1, self.stats['failed_builds'])
|
||||||
|
self.stats.build_failed()
|
||||||
|
self.assertEqual(2, self.stats['failed_builds'])
|
||||||
|
self.stats.build_succeeded()
|
||||||
|
self.assertEqual(0, self.stats['failed_builds'])
|
||||||
|
self.stats.build_succeeded()
|
||||||
|
self.assertEqual(0, self.stats['failed_builds'])
|
||||||
|
|
||||||
|
def test_build_succeeded_first(self):
|
||||||
|
self.assertEqual('not-set', self.stats.get('failed_builds', 'not-set'))
|
||||||
|
self.stats.build_succeeded()
|
||||||
|
self.assertEqual(0, self.stats['failed_builds'])
|
||||||
|
@ -163,6 +163,7 @@ class CachingSchedulerTestCase(test_scheduler.SchedulerTestCase):
|
|||||||
host_state.ram_allocation_ratio = 1.5
|
host_state.ram_allocation_ratio = 1.5
|
||||||
host_state.disk_allocation_ratio = 1.0
|
host_state.disk_allocation_ratio = 1.0
|
||||||
host_state.metrics = objects.MonitorMetricList(objects=[])
|
host_state.metrics = objects.MonitorMetricList(objects=[])
|
||||||
|
host_state.failed_builds = 0
|
||||||
return host_state
|
return host_state
|
||||||
|
|
||||||
@mock.patch('nova.db.instance_extra_get_by_instance_uuid',
|
@mock.patch('nova.db.instance_extra_get_by_instance_uuid',
|
||||||
|
57
nova/tests/unit/scheduler/weights/test_weights_compute.py
Normal file
57
nova/tests/unit/scheduler/weights/test_weights_compute.py
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
# not use this file except in compliance with the License. You may obtain
|
||||||
|
# a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||||
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||||
|
# License for the specific language governing permissions and limitations
|
||||||
|
# under the License.
|
||||||
|
"""
|
||||||
|
Tests For Scheduler build failure weights.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from nova.scheduler import weights
|
||||||
|
from nova.scheduler.weights import compute
|
||||||
|
from nova import test
|
||||||
|
from nova.tests.unit.scheduler import fakes
|
||||||
|
|
||||||
|
|
||||||
|
class BuildFailureWeigherTestCase(test.NoDBTestCase):
|
||||||
|
def setUp(self):
|
||||||
|
super(BuildFailureWeigherTestCase, self).setUp()
|
||||||
|
self.weight_handler = weights.HostWeightHandler()
|
||||||
|
self.weighers = [compute.BuildFailureWeigher()]
|
||||||
|
|
||||||
|
def _get_weighed_host(self, hosts):
|
||||||
|
return self.weight_handler.get_weighed_objects(self.weighers,
|
||||||
|
hosts, {})
|
||||||
|
|
||||||
|
def _get_all_hosts(self):
|
||||||
|
host_values = [
|
||||||
|
('host1', 'node1', {'failed_builds': 0}),
|
||||||
|
('host2', 'node2', {'failed_builds': 1}),
|
||||||
|
('host3', 'node3', {'failed_builds': 10}),
|
||||||
|
('host4', 'node4', {'failed_builds': 100})
|
||||||
|
]
|
||||||
|
return [fakes.FakeHostState(host, node, values)
|
||||||
|
for host, node, values in host_values]
|
||||||
|
|
||||||
|
def test_build_failure_weigher_disabled(self):
|
||||||
|
self.flags(build_failure_weight_multiplier=0.0,
|
||||||
|
group='filter_scheduler')
|
||||||
|
hosts = self._get_all_hosts()
|
||||||
|
weighed_hosts = self._get_weighed_host(hosts)
|
||||||
|
self.assertTrue(all([wh.weight == 0.0
|
||||||
|
for wh in weighed_hosts]))
|
||||||
|
|
||||||
|
def test_build_failure_weigher_scaled(self):
|
||||||
|
self.flags(build_failure_weight_multiplier=1000.0,
|
||||||
|
group='filter_scheduler')
|
||||||
|
hosts = self._get_all_hosts()
|
||||||
|
weighed_hosts = self._get_weighed_host(hosts)
|
||||||
|
self.assertEqual([0, -10, -100, -1000],
|
||||||
|
[wh.weight for wh in weighed_hosts])
|
@ -0,0 +1,23 @@
|
|||||||
|
---
|
||||||
|
security:
|
||||||
|
- |
|
||||||
|
To mitigate potential issues with compute nodes disabling
|
||||||
|
themselves in response to failures that were either non-fatal or
|
||||||
|
user-generated, the consecutive build failure counter
|
||||||
|
functionality in the compute service has been changed to advise
|
||||||
|
the scheduler of the count instead of self-disabling the service
|
||||||
|
upon exceeding the threshold. The
|
||||||
|
``[compute]/consecutive_build_service_disable_threshold``
|
||||||
|
configuration option still controls whether the count is tracked,
|
||||||
|
but the action taken on this value has been changed to a scheduler
|
||||||
|
weigher. This allows the scheduler to be configured to weigh hosts
|
||||||
|
with consecutive failures lower than other hosts, configured by the
|
||||||
|
``[filter_scheduler]/build_failure_weight_multiplier`` option. If
|
||||||
|
the compute threshold option is nonzero, computes will report their
|
||||||
|
failure count for the scheduler to consider. If the threshold
|
||||||
|
value is zero, then computes will not report this value
|
||||||
|
and the scheduler will assume the number of failures for
|
||||||
|
non-reporting compute nodes to be zero. By default, the scheduler
|
||||||
|
weigher is enabled and configured with a very large multiplier to
|
||||||
|
ensure that hosts with consecutive failures are scored low by
|
||||||
|
default.
|
Loading…
Reference in New Issue
Block a user