Cleanup when hitting MaxRetriesExceeded from no host_available
Prior to this patch there was a condition when no
host_available was true and an exception would get
raised without first cleaning up the instance.
This causes instances to get indefinitely stuck in
a scheduling state.
This patch fixes this by calling the clean up function
and then exits build_instances using a return statement.
The related functional regression recreate test is updated
to show this fixes the bug.
Change-Id: I6a2c63a4c33e783100208fd3f45eb52aad49e3d6
Closes-bug: #1837955
(cherry picked from commit b98d4ba6d5
)
This commit is contained in:
parent
0d2e27fd01
commit
fcc2b9e33e
|
@ -720,7 +720,11 @@ class ComputeTaskManager(base.Base):
|
|||
msg = ("Exhausted all hosts available for retrying build "
|
||||
"failures for instance %(instance_uuid)s." %
|
||||
{"instance_uuid": instance.uuid})
|
||||
raise exception.MaxRetriesExceeded(reason=msg)
|
||||
exc = exception.MaxRetriesExceeded(reason=msg)
|
||||
self._cleanup_when_reschedule_fails(
|
||||
context, instance, exc, legacy_request_spec,
|
||||
requested_networks)
|
||||
return
|
||||
instance.availability_zone = (
|
||||
availability_zones.get_host_availability_zone(context,
|
||||
host.service_host))
|
||||
|
|
|
@ -84,31 +84,15 @@ class BuildRescheduleClaimFailsTestCase(
|
|||
image_uuid=fake_image.get_valid_image_id(),
|
||||
networks=[{'port': self.neutron.port_1['id']}])
|
||||
server = self.api.post_server({'server': server})
|
||||
# FIXME(mriedem): This is bug 1837955 where the status is stuck in
|
||||
# BUILD rather than the vm_state being set to error and the task_state
|
||||
# being set to None. Uncomment this when the bug is fixed.
|
||||
# server = self._wait_for_state_change(self.api, server, 'ERROR')
|
||||
server = self._wait_for_state_change(self.api, server, 'ERROR')
|
||||
|
||||
# Wait for the MaxRetriesExceeded fault to be recorded.
|
||||
# set_vm_state_and_notify sets the vm_state to ERROR before the fault
|
||||
# is recorded but after the notification is sent. So wait for the
|
||||
# unversioned notification to show up and then get the fault.
|
||||
# FIXME(mriedem): Uncomment this when bug 1837955 is fixed.
|
||||
# self._wait_for_unversioned_notification(
|
||||
# 'compute_task.build_instances')
|
||||
# server = self.api.get_server(server['id'])
|
||||
# self.assertIn('fault', server)
|
||||
# self.assertIn('Exceeded maximum number of retries',
|
||||
# server['fault']['message'])
|
||||
|
||||
# TODO(mriedem): Remove this when the bug is fixed. We need to assert
|
||||
# something before the bug is fixed to show the failure so check the
|
||||
# logs.
|
||||
for x in range(20):
|
||||
logs = self.stdlog.logger.output
|
||||
if 'MaxRetriesExceeded' in logs:
|
||||
break
|
||||
time.sleep(.5)
|
||||
else:
|
||||
self.fail('Timed out waiting for MaxRetriesExceeded to show up '
|
||||
'in the logs.')
|
||||
self._wait_for_unversioned_notification(
|
||||
'compute_task.build_instances')
|
||||
server = self.api.get_server(server['id'])
|
||||
self.assertIn('fault', server)
|
||||
self.assertIn('Exceeded maximum number of retries',
|
||||
server['fault']['message'])
|
||||
|
|
|
@ -693,28 +693,36 @@ class _BaseTaskTestCase(object):
|
|||
mock.call(self.context, instances[1].uuid)])
|
||||
self.assertFalse(mock_get_by_host.called)
|
||||
|
||||
@mock.patch("nova.scheduler.utils.claim_resources", return_value=False)
|
||||
@mock.patch('nova.compute.utils.notify_about_compute_task_error')
|
||||
@mock.patch.object(objects.Instance, 'save')
|
||||
def test_build_instances_exhaust_host_list(self, _mock_save, mock_claim):
|
||||
def test_build_instances_exhaust_host_list(self, _mock_save, mock_notify):
|
||||
# A list of three alternate hosts for one instance
|
||||
host_lists = copy.deepcopy(fake_host_lists_alt)
|
||||
instance = fake_instance.fake_instance_obj(self.context)
|
||||
image = {'fake-data': 'should_pass_silently'}
|
||||
expected_claim_count = len(host_lists[0])
|
||||
|
||||
# build_instances() is a cast, we need to wait for it to complete
|
||||
self.useFixture(cast_as_call.CastAsCall(self))
|
||||
|
||||
self.conductor.build_instances(
|
||||
context=self.context,
|
||||
instances=[instance], image=image,
|
||||
filter_properties={},
|
||||
admin_password='admin_password',
|
||||
injected_files='injected_files',
|
||||
requested_networks=None,
|
||||
security_groups='security_groups',
|
||||
block_device_mapping=None,
|
||||
legacy_bdm=None,
|
||||
host_lists=host_lists
|
||||
)
|
||||
|
||||
# Since claim_resources() is mocked to always return False, we will run
|
||||
# out of alternate hosts, and MaxRetriesExceeded should be raised.
|
||||
self.assertRaises(exc.MaxRetriesExceeded,
|
||||
self.conductor.build_instances, context=self.context,
|
||||
instances=[instance], image=image, filter_properties={},
|
||||
admin_password='admin_password',
|
||||
injected_files='injected_files', requested_networks=None,
|
||||
security_groups='security_groups',
|
||||
block_device_mapping=None, legacy_bdm=None,
|
||||
host_lists=host_lists)
|
||||
self.assertEqual(expected_claim_count, mock_claim.call_count)
|
||||
# out of alternate hosts, and complain about MaxRetriesExceeded.
|
||||
mock_notify.assert_called_once_with(
|
||||
self.context, 'build_instances',
|
||||
instance.uuid, test.MatchType(dict), 'error',
|
||||
test.MatchType(exc.MaxRetriesExceeded), test.MatchType(str))
|
||||
|
||||
@mock.patch.object(conductor_manager.ComputeTaskManager,
|
||||
'_destroy_build_request')
|
||||
|
|
Loading…
Reference in New Issue