Rollback instance.image_ref on failed rebuild
When rebuilding and changing the image, we run the new image through the scheduler to see if it's valid for the instance on its current compute host. The API saves off the new image ref on the instance before casting to conductor to run through the scheduler. If the scheduler fails, the instance.image_ref was not being rolled back, which meant a user could attempt the rebuild with the same invalid image a second time and the API, seeing the instance.image_ref hasn't changed (even though it's not the actual backing image for the server), will bypass the scheduler and rebuild the instance with that invalid image. This fixes the issue by using the original image ref, passed from API to conductor during rebuild, to reset the instance.image_ref in the case of a failure. Note that there are other things changed on the instance in the API which this patch does not attempt to recover as that's a bigger work item which likely involves substantial refactoring of the code. Closes-Bug: #1746032 Conflicts: nova/conductor/manager.py nova/tests/functional/test_servers.py NOTE(mriedem): The conflicts in manager.py are due to not having I06d78c744fa75ae5f34c5cfa76bc3c9460767b84 in Ocata. The functional test conflict is due to tests that existed in Pike which don't exist in Ocata. Change-Id: I3399a66fe9b1297cd6b0dca440145393ceaef41f (cherry picked from commit4a2c9a4887
) (cherry picked from commit834adeae9a
)
This commit is contained in:
parent
83fd8ac0bf
commit
2efe3f6b88
|
@ -733,6 +733,11 @@ class ComputeTaskManager(base.Base):
|
||||||
host_dict['nodename'],
|
host_dict['nodename'],
|
||||||
host_dict['limits'])
|
host_dict['limits'])
|
||||||
except exception.NoValidHost as ex:
|
except exception.NoValidHost as ex:
|
||||||
|
# Rollback the image_ref if a new one was provided (this
|
||||||
|
# only happens in the rebuild case, not evacuate).
|
||||||
|
if orig_image_ref and orig_image_ref != image_ref:
|
||||||
|
instance.image_ref = orig_image_ref
|
||||||
|
instance.save()
|
||||||
with excutils.save_and_reraise_exception():
|
with excutils.save_and_reraise_exception():
|
||||||
self._set_vm_state_and_notify(context, instance.uuid,
|
self._set_vm_state_and_notify(context, instance.uuid,
|
||||||
'rebuild_server',
|
'rebuild_server',
|
||||||
|
@ -743,6 +748,11 @@ class ComputeTaskManager(base.Base):
|
||||||
compute_utils.add_instance_fault_from_exc(context,
|
compute_utils.add_instance_fault_from_exc(context,
|
||||||
instance, ex, sys.exc_info())
|
instance, ex, sys.exc_info())
|
||||||
except exception.UnsupportedPolicyException as ex:
|
except exception.UnsupportedPolicyException as ex:
|
||||||
|
# Rollback the image_ref if a new one was provided (this
|
||||||
|
# only happens in the rebuild case, not evacuate).
|
||||||
|
if orig_image_ref and orig_image_ref != image_ref:
|
||||||
|
instance.image_ref = orig_image_ref
|
||||||
|
instance.save()
|
||||||
with excutils.save_and_reraise_exception():
|
with excutils.save_and_reraise_exception():
|
||||||
self._set_vm_state_and_notify(context, instance.uuid,
|
self._set_vm_state_and_notify(context, instance.uuid,
|
||||||
'rebuild_server',
|
'rebuild_server',
|
||||||
|
|
|
@ -21,6 +21,7 @@ import mock
|
||||||
from oslo_log import log as logging
|
from oslo_log import log as logging
|
||||||
from oslo_serialization import base64
|
from oslo_serialization import base64
|
||||||
from oslo_utils import timeutils
|
from oslo_utils import timeutils
|
||||||
|
import six
|
||||||
|
|
||||||
from nova.compute import api as compute_api
|
from nova.compute import api as compute_api
|
||||||
from nova.compute import instance_actions
|
from nova.compute import instance_actions
|
||||||
|
@ -953,11 +954,12 @@ class ServerRebuildTestCase(integrated_helpers._IntegratedTestBase,
|
||||||
self.flags(host='host2')
|
self.flags(host='host2')
|
||||||
self.compute2 = self.start_service('compute', host='host2')
|
self.compute2 = self.start_service('compute', host='host2')
|
||||||
|
|
||||||
|
# We hard-code from a fake image since we can't get images
|
||||||
|
# via the compute /images proxy API with microversion > 2.35.
|
||||||
|
original_image_ref = '155d900f-4e14-4e4c-a73d-069cbf4541e6'
|
||||||
server_req_body = {
|
server_req_body = {
|
||||||
'server': {
|
'server': {
|
||||||
# We hard-code from a fake image since we can't get images
|
'imageRef': original_image_ref,
|
||||||
# via the compute /images proxy API with microversion > 2.35.
|
|
||||||
'imageRef': '155d900f-4e14-4e4c-a73d-069cbf4541e6',
|
|
||||||
'flavorRef': '1', # m1.tiny from DefaultFlavorsFixture,
|
'flavorRef': '1', # m1.tiny from DefaultFlavorsFixture,
|
||||||
'name': 'test_rebuild_with_image_novalidhost',
|
'name': 'test_rebuild_with_image_novalidhost',
|
||||||
# We don't care about networking for this test. This requires
|
# We don't care about networking for this test. This requires
|
||||||
|
@ -1000,11 +1002,19 @@ class ServerRebuildTestCase(integrated_helpers._IntegratedTestBase,
|
||||||
# Before microversion 2.51 events are only returned for instance
|
# Before microversion 2.51 events are only returned for instance
|
||||||
# actions if you're an admin.
|
# actions if you're an admin.
|
||||||
self.api_fixture.admin_api)
|
self.api_fixture.admin_api)
|
||||||
# Unfortunately the server's image_ref is updated to be the new image
|
# Assert the server image_ref was rolled back on failure.
|
||||||
# even though the rebuild should not work.
|
|
||||||
server = self.api.get_server(server['id'])
|
server = self.api.get_server(server['id'])
|
||||||
self.assertEqual(rebuild_image_ref, server['image']['id'])
|
self.assertEqual(original_image_ref, server['image']['id'])
|
||||||
|
|
||||||
# The server should be in ERROR state
|
# The server should be in ERROR state
|
||||||
self.assertEqual('ERROR', server['status'])
|
self.assertEqual('ERROR', server['status'])
|
||||||
self.assertIn('No valid host', server['fault']['message'])
|
self.assertIn('No valid host', server['fault']['message'])
|
||||||
|
|
||||||
|
# Rebuild it again with the same bad image to make sure it's rejected
|
||||||
|
# again. Since we're using CastAsCall here, there is no 202 from the
|
||||||
|
# API, and the exception from conductor gets passed back through the
|
||||||
|
# API.
|
||||||
|
ex = self.assertRaises(
|
||||||
|
client.OpenStackApiException, self.api.api_post,
|
||||||
|
'/servers/%s/action' % server['id'], rebuild_req_body)
|
||||||
|
self.assertIn('NoValidHost', six.text_type(ex))
|
||||||
|
|
Loading…
Reference in New Issue