Browse Source

Rollback instance.image_ref on failed rebuild

When rebuilding and changing the image, we run the new image
through the scheduler to see if it's valid for the instance
on its current compute host. The API saves off the new image
ref on the instance before casting to conductor to run through
the scheduler. If the scheduler fails, the instance.image_ref was
not being rolled back, which meant a user could attempt the rebuild
with the same invalid image a second time and the API, seeing the
instance.image_ref hasn't changed (even though it's not the actual
backing image for the server), will bypass the scheduler and rebuild
the instance with that invalid image.

This fixes the issue by using the original image ref, passed from
API to conductor during rebuild, to reset the instance.image_ref
in the case of a failure.

Note that there are other things changed on the instance in the API
which this patch does not attempt to recover as that's a bigger
work item which likely involves substantial refactoring of the code.

Closes-Bug: #1746032

Conflicts:
      nova/conductor/manager.py

NOTE(mriedem): The conflict is due to not having change
Ibc44e3b2261b314bb92062a88ca9ee6b81298dc3 in Pike. Also, six
had to be imported in the functional test.

Change-Id: I3399a66fe9b1297cd6b0dca440145393ceaef41f
(cherry picked from commit 4a2c9a4887)
changes/03/539003/1
Matt Riedemann 3 years ago
parent
commit
834adeae9a
2 changed files with 26 additions and 6 deletions
  1. +10
    -0
      nova/conductor/manager.py
  2. +16
    -6
      nova/tests/functional/test_servers.py

+ 10
- 0
nova/conductor/manager.py View File

@ -878,6 +878,11 @@ class ComputeTaskManager(base.Base):
if migration:
migration.status = 'error'
migration.save()
# Rollback the image_ref if a new one was provided (this
# only happens in the rebuild case, not evacuate).
if orig_image_ref and orig_image_ref != image_ref:
instance.image_ref = orig_image_ref
instance.save()
request_spec = request_spec.to_legacy_request_spec_dict()
with excutils.save_and_reraise_exception():
self._set_vm_state_and_notify(context, instance.uuid,
@ -892,6 +897,11 @@ class ComputeTaskManager(base.Base):
if migration:
migration.status = 'error'
migration.save()
# Rollback the image_ref if a new one was provided (this
# only happens in the rebuild case, not evacuate).
if orig_image_ref and orig_image_ref != image_ref:
instance.image_ref = orig_image_ref
instance.save()
request_spec = request_spec.to_legacy_request_spec_dict()
with excutils.save_and_reraise_exception():
self._set_vm_state_and_notify(context, instance.uuid,


+ 16
- 6
nova/tests/functional/test_servers.py View File

@ -21,6 +21,7 @@ import mock
from oslo_log import log as logging
from oslo_serialization import base64
from oslo_utils import timeutils
import six
from nova.compute import api as compute_api
from nova.compute import instance_actions
@ -1093,11 +1094,12 @@ class ServerRebuildTestCase(integrated_helpers._IntegratedTestBase,
self.flags(host='host2')
self.compute2 = self.start_service('compute', host='host2')
# We hard-code from a fake image since we can't get images
# via the compute /images proxy API with microversion > 2.35.
original_image_ref = '155d900f-4e14-4e4c-a73d-069cbf4541e6'
server_req_body = {
'server': {
# We hard-code from a fake image since we can't get images
# via the compute /images proxy API with microversion > 2.35.
'imageRef': '155d900f-4e14-4e4c-a73d-069cbf4541e6',
'imageRef': original_image_ref,
'flavorRef': '1', # m1.tiny from DefaultFlavorsFixture,
'name': 'test_rebuild_with_image_novalidhost',
# We don't care about networking for this test. This requires
@ -1140,15 +1142,23 @@ class ServerRebuildTestCase(integrated_helpers._IntegratedTestBase,
# Before microversion 2.51 events are only returned for instance
# actions if you're an admin.
self.api_fixture.admin_api)
# Unfortunately the server's image_ref is updated to be the new image
# even though the rebuild should not work.
# Assert the server image_ref was rolled back on failure.
server = self.api.get_server(server['id'])
self.assertEqual(rebuild_image_ref, server['image']['id'])
self.assertEqual(original_image_ref, server['image']['id'])
# The server should be in ERROR state
self.assertEqual('ERROR', server['status'])
self.assertIn('No valid host', server['fault']['message'])
# Rebuild it again with the same bad image to make sure it's rejected
# again. Since we're using CastAsCall here, there is no 202 from the
# API, and the exception from conductor gets passed back through the
# API.
ex = self.assertRaises(
client.OpenStackApiException, self.api.api_post,
'/servers/%s/action' % server['id'], rebuild_req_body)
self.assertIn('NoValidHost', six.text_type(ex))
def test_rebuild_with_new_image(self):
"""Rebuilds a server with a different image which will run it through
the scheduler to validate the image is still OK with the compute host


Loading…
Cancel
Save