Merge "Cleanup when hitting MaxRetriesExceeded from no host_available" into stable/rocky
This commit is contained in:
commit
b79689fcbc
|
@ -534,6 +534,23 @@ class ComputeTaskManager(base.Base):
|
||||||
bdm.attachment_id = attachment['id']
|
bdm.attachment_id = attachment['id']
|
||||||
bdm.save()
|
bdm.save()
|
||||||
|
|
||||||
|
def _cleanup_when_reschedule_fails(
|
||||||
|
self, context, instance, exception, request_spec,
|
||||||
|
requested_networks):
|
||||||
|
"""Set the instance state and clean up.
|
||||||
|
|
||||||
|
It is only used in case build_instances fails while rescheduling the
|
||||||
|
instance.
|
||||||
|
"""
|
||||||
|
|
||||||
|
updates = {'vm_state': vm_states.ERROR,
|
||||||
|
'task_state': None}
|
||||||
|
self._set_vm_state_and_notify(
|
||||||
|
context, instance.uuid, 'build_instances', updates, exception,
|
||||||
|
request_spec)
|
||||||
|
self._cleanup_allocated_networks(
|
||||||
|
context, instance, requested_networks)
|
||||||
|
|
||||||
# NOTE(danms): This is never cell-targeted because it is only used for
|
# NOTE(danms): This is never cell-targeted because it is only used for
|
||||||
# cellsv1 (which does not target cells directly) and n-cpu reschedules
|
# cellsv1 (which does not target cells directly) and n-cpu reschedules
|
||||||
# (which go to the cell conductor and thus are always cell-specific).
|
# (which go to the cell conductor and thus are always cell-specific).
|
||||||
|
@ -614,11 +631,7 @@ class ComputeTaskManager(base.Base):
|
||||||
# disabled in those cases.
|
# disabled in those cases.
|
||||||
num_attempts = filter_properties.get(
|
num_attempts = filter_properties.get(
|
||||||
'retry', {}).get('num_attempts', 1)
|
'retry', {}).get('num_attempts', 1)
|
||||||
updates = {'vm_state': vm_states.ERROR, 'task_state': None}
|
|
||||||
for instance in instances:
|
for instance in instances:
|
||||||
self._set_vm_state_and_notify(
|
|
||||||
context, instance.uuid, 'build_instances', updates,
|
|
||||||
exc, request_spec)
|
|
||||||
# If num_attempts > 1, we're in a reschedule and probably
|
# If num_attempts > 1, we're in a reschedule and probably
|
||||||
# either hit NoValidHost or MaxRetriesExceeded. Either way,
|
# either hit NoValidHost or MaxRetriesExceeded. Either way,
|
||||||
# the build request should already be gone and we probably
|
# the build request should already be gone and we probably
|
||||||
|
@ -631,8 +644,9 @@ class ComputeTaskManager(base.Base):
|
||||||
self._destroy_build_request(context, instance)
|
self._destroy_build_request(context, instance)
|
||||||
except exception.BuildRequestNotFound:
|
except exception.BuildRequestNotFound:
|
||||||
pass
|
pass
|
||||||
self._cleanup_allocated_networks(
|
self._cleanup_when_reschedule_fails(
|
||||||
context, instance, requested_networks)
|
context, instance, exc, request_spec,
|
||||||
|
requested_networks)
|
||||||
return
|
return
|
||||||
|
|
||||||
elevated = context.elevated()
|
elevated = context.elevated()
|
||||||
|
@ -673,7 +687,11 @@ class ComputeTaskManager(base.Base):
|
||||||
msg = ("Exhausted all hosts available for retrying build "
|
msg = ("Exhausted all hosts available for retrying build "
|
||||||
"failures for instance %(instance_uuid)s." %
|
"failures for instance %(instance_uuid)s." %
|
||||||
{"instance_uuid": instance.uuid})
|
{"instance_uuid": instance.uuid})
|
||||||
raise exception.MaxRetriesExceeded(reason=msg)
|
exc = exception.MaxRetriesExceeded(reason=msg)
|
||||||
|
self._cleanup_when_reschedule_fails(
|
||||||
|
context, instance, exc, request_spec,
|
||||||
|
requested_networks)
|
||||||
|
return
|
||||||
instance.availability_zone = (
|
instance.availability_zone = (
|
||||||
availability_zones.get_host_availability_zone(context,
|
availability_zones.get_host_availability_zone(context,
|
||||||
host.service_host))
|
host.service_host))
|
||||||
|
|
|
@ -29,6 +29,11 @@ class BuildRescheduleClaimFailsTestCase(
|
||||||
"""
|
"""
|
||||||
compute_driver = 'fake.SmallFakeDriver'
|
compute_driver = 'fake.SmallFakeDriver'
|
||||||
|
|
||||||
|
def setUp(self):
|
||||||
|
super(BuildRescheduleClaimFailsTestCase, self).setUp()
|
||||||
|
fake_notifier.stub_notifier(self)
|
||||||
|
self.addCleanup(fake_notifier.reset)
|
||||||
|
|
||||||
def _wait_for_unversioned_notification(self, event_type):
|
def _wait_for_unversioned_notification(self, event_type):
|
||||||
for x in range(20): # wait up to 10 seconds
|
for x in range(20): # wait up to 10 seconds
|
||||||
for notification in fake_notifier.NOTIFICATIONS:
|
for notification in fake_notifier.NOTIFICATIONS:
|
||||||
|
@ -85,31 +90,15 @@ class BuildRescheduleClaimFailsTestCase(
|
||||||
image_uuid=fake_image.get_valid_image_id(),
|
image_uuid=fake_image.get_valid_image_id(),
|
||||||
networks=[{'port': nova_fixtures.NeutronFixture.port_1['id']}])
|
networks=[{'port': nova_fixtures.NeutronFixture.port_1['id']}])
|
||||||
server = self.api.post_server({'server': server})
|
server = self.api.post_server({'server': server})
|
||||||
# FIXME(mriedem): This is bug 1837955 where the status is stuck in
|
server = self._wait_for_state_change(self.api, server, 'ERROR')
|
||||||
# BUILD rather than the vm_state being set to error and the task_state
|
|
||||||
# being set to None. Uncomment this when the bug is fixed.
|
|
||||||
# server = self._wait_for_state_change(self.api, server, 'ERROR')
|
|
||||||
|
|
||||||
# Wait for the MaxRetriesExceeded fault to be recorded.
|
# Wait for the MaxRetriesExceeded fault to be recorded.
|
||||||
# set_vm_state_and_notify sets the vm_state to ERROR before the fault
|
# set_vm_state_and_notify sets the vm_state to ERROR before the fault
|
||||||
# is recorded but after the notification is sent. So wait for the
|
# is recorded but after the notification is sent. So wait for the
|
||||||
# unversioned notification to show up and then get the fault.
|
# unversioned notification to show up and then get the fault.
|
||||||
# FIXME(mriedem): Uncomment this when bug 1837955 is fixed.
|
self._wait_for_unversioned_notification(
|
||||||
# self._wait_for_unversioned_notification(
|
'compute_task.build_instances')
|
||||||
# 'compute_task.build_instances')
|
server = self.api.get_server(server['id'])
|
||||||
# server = self.api.get_server(server['id'])
|
self.assertIn('fault', server)
|
||||||
# self.assertIn('fault', server)
|
self.assertIn('Exceeded maximum number of retries',
|
||||||
# self.assertIn('Exceeded maximum number of retries',
|
server['fault']['message'])
|
||||||
# server['fault']['message'])
|
|
||||||
|
|
||||||
# TODO(mriedem): Remove this when the bug is fixed. We need to assert
|
|
||||||
# something before the bug is fixed to show the failure so check the
|
|
||||||
# logs.
|
|
||||||
for x in range(20):
|
|
||||||
logs = self.stdlog.logger.output
|
|
||||||
if 'MaxRetriesExceeded' in logs:
|
|
||||||
break
|
|
||||||
time.sleep(.5)
|
|
||||||
else:
|
|
||||||
self.fail('Timed out waiting for MaxRetriesExceeded to show up '
|
|
||||||
'in the logs.')
|
|
||||||
|
|
|
@ -702,28 +702,38 @@ class _BaseTaskTestCase(object):
|
||||||
mock.call(self.context, instances[1].uuid)])
|
mock.call(self.context, instances[1].uuid)])
|
||||||
self.assertFalse(mock_get_by_host.called)
|
self.assertFalse(mock_get_by_host.called)
|
||||||
|
|
||||||
@mock.patch("nova.scheduler.utils.claim_resources", return_value=False)
|
@mock.patch('nova.conductor.manager.ComputeTaskManager.'
|
||||||
|
'_set_vm_state_and_notify')
|
||||||
@mock.patch.object(objects.Instance, 'save')
|
@mock.patch.object(objects.Instance, 'save')
|
||||||
def test_build_instances_exhaust_host_list(self, _mock_save, mock_claim):
|
def test_build_instances_exhaust_host_list(self, _mock_save, mock_notify):
|
||||||
# A list of three alternate hosts for one instance
|
# A list of three alternate hosts for one instance
|
||||||
host_lists = copy.deepcopy(fake_host_lists_alt)
|
host_lists = copy.deepcopy(fake_host_lists_alt)
|
||||||
instance = fake_instance.fake_instance_obj(self.context)
|
instance = fake_instance.fake_instance_obj(self.context)
|
||||||
image = {'fake-data': 'should_pass_silently'}
|
image = {'fake-data': 'should_pass_silently'}
|
||||||
expected_claim_count = len(host_lists[0])
|
|
||||||
|
|
||||||
# build_instances() is a cast, we need to wait for it to complete
|
# build_instances() is a cast, we need to wait for it to complete
|
||||||
self.useFixture(cast_as_call.CastAsCall(self))
|
self.useFixture(cast_as_call.CastAsCall(self))
|
||||||
|
|
||||||
|
self.conductor.build_instances(
|
||||||
|
context=self.context,
|
||||||
|
instances=[instance], image=image,
|
||||||
|
filter_properties={},
|
||||||
|
admin_password='admin_password',
|
||||||
|
injected_files='injected_files',
|
||||||
|
requested_networks=None,
|
||||||
|
security_groups='security_groups',
|
||||||
|
block_device_mapping=None,
|
||||||
|
legacy_bdm=None,
|
||||||
|
host_lists=host_lists
|
||||||
|
)
|
||||||
|
|
||||||
# Since claim_resources() is mocked to always return False, we will run
|
# Since claim_resources() is mocked to always return False, we will run
|
||||||
# out of alternate hosts, and MaxRetriesExceeded should be raised.
|
# out of alternate hosts, and complain about MaxRetriesExceeded.
|
||||||
self.assertRaises(exc.MaxRetriesExceeded,
|
mock_notify.assert_called_once_with(
|
||||||
self.conductor.build_instances, context=self.context,
|
self.context, instance.uuid, 'build_instances',
|
||||||
instances=[instance], image=image, filter_properties={},
|
test.MatchType(dict), # updates
|
||||||
admin_password='admin_password',
|
test.MatchType(exc.MaxRetriesExceeded),
|
||||||
injected_files='injected_files', requested_networks=None,
|
test.MatchType(dict)) # request_spec
|
||||||
security_groups='security_groups',
|
|
||||||
block_device_mapping=None, legacy_bdm=None,
|
|
||||||
host_lists=host_lists)
|
|
||||||
self.assertEqual(expected_claim_count, mock_claim.call_count)
|
|
||||||
|
|
||||||
@mock.patch.object(conductor_manager.ComputeTaskManager,
|
@mock.patch.object(conductor_manager.ComputeTaskManager,
|
||||||
'_destroy_build_request')
|
'_destroy_build_request')
|
||||||
|
|
Loading…
Reference in New Issue