From d03a890a34f632adc9a19a33c8a5aebbccec50e4 Mon Sep 17 00:00:00 2001 From: int32bit Date: Mon, 22 Jan 2018 17:05:53 +0800 Subject: [PATCH] Set server status to ERROR if rebuild failed Currently there is no indication that the rebuild was refused, and worse, we may have a wrong imageref for the instance. This patch set the instance to ERROR status if rebuild failed in the scheduling stage. The user can rebuild the instance with valid image to get it out of ERROR state and reset with right instance metadata and properties. Closes-Bug: 1744325 Change-Id: Ibb7bee15a3d4ee6f0ef53ba12e8b41f65a1fe999 --- nova/conductor/manager.py | 9 +++++++-- .../functional/regressions/test_bug_1713783.py | 2 +- nova/tests/functional/test_server_group.py | 8 ++++---- nova/tests/functional/test_servers.py | 4 ++++ nova/tests/unit/conductor/test_conductor.py | 14 +++++++++++--- ...4325-rebuild-error-status-9e2da03f3f81fd6e.yaml | 7 +++++++ 6 files changed, 34 insertions(+), 10 deletions(-) create mode 100644 releasenotes/notes/bug-1744325-rebuild-error-status-9e2da03f3f81fd6e.yaml diff --git a/nova/conductor/manager.py b/nova/conductor/manager.py index b5236a974634..bfd9769497ce 100644 --- a/nova/conductor/manager.py +++ b/nova/conductor/manager.py @@ -17,6 +17,7 @@ import contextlib import copy import functools +import sys from oslo_config import cfg from oslo_log import log as logging @@ -956,10 +957,12 @@ class ComputeTaskManager(base.Base): with excutils.save_and_reraise_exception(): self._set_vm_state_and_notify(context, instance.uuid, 'rebuild_server', - {'vm_state': instance.vm_state, + {'vm_state': vm_states.ERROR, 'task_state': None}, ex, request_spec) LOG.warning("No valid host found for rebuild", instance=instance) + compute_utils.add_instance_fault_from_exc(context, + instance, ex, sys.exc_info()) except exception.UnsupportedPolicyException as ex: if migration: migration.status = 'error' @@ -968,10 +971,12 @@ class ComputeTaskManager(base.Base): with excutils.save_and_reraise_exception(): self._set_vm_state_and_notify(context, instance.uuid, 'rebuild_server', - {'vm_state': instance.vm_state, + {'vm_state': vm_states.ERROR, 'task_state': None}, ex, request_spec) LOG.warning("Server with unsupported policy " "cannot be rebuilt", instance=instance) + compute_utils.add_instance_fault_from_exc(context, + instance, ex, sys.exc_info()) compute_utils.notify_about_instance_usage( self.notifier, context, instance, "rebuild.scheduled") diff --git a/nova/tests/functional/regressions/test_bug_1713783.py b/nova/tests/functional/regressions/test_bug_1713783.py index 4233511aaf67..bc023e779c6b 100644 --- a/nova/tests/functional/regressions/test_bug_1713783.py +++ b/nova/tests/functional/regressions/test_bug_1713783.py @@ -109,7 +109,7 @@ class FailedEvacuateStateTests(test.TestCase, self._wait_for_notification_event_type('compute_task.rebuild_server') - server = self._wait_for_state_change(self.api, server, 'ACTIVE') + server = self._wait_for_state_change(self.api, server, 'ERROR') self.assertEqual(self.hostname, server['OS-EXT-SRV-ATTR:host']) # Check migrations diff --git a/nova/tests/functional/test_server_group.py b/nova/tests/functional/test_server_group.py index 61c68854c79a..9e8ea76e713d 100644 --- a/nova/tests/functional/test_server_group.py +++ b/nova/tests/functional/test_server_group.py @@ -455,7 +455,7 @@ class ServerGroupTestV21(ServerGroupTestBase): self.admin_api.post_server_action(servers[1]['id'], post) self._wait_for_migration_status(servers[1], 'error') server_after_failed_evac = self._wait_for_state_change( - self.admin_api, servers[1], 'ACTIVE') + self.admin_api, servers[1], 'ERROR') # assert that after a failed evac the server active on the same host # as before @@ -479,7 +479,7 @@ class ServerGroupTestV21(ServerGroupTestBase): self.admin_api.post_server_action(servers[1]['id'], post) self._wait_for_migration_status(servers[1], 'error') server_after_failed_evac = self._wait_for_state_change( - self.admin_api, servers[1], 'ACTIVE') + self.admin_api, servers[1], 'ERROR') # assert that after a failed evac the server active on the same host # as before @@ -653,7 +653,7 @@ class ServerGroupTestV215(ServerGroupTestV21): self.admin_api.post_server_action(servers[1]['id'], post) self._wait_for_migration_status(servers[1], 'error') server_after_failed_evac = self._wait_for_state_change( - self.admin_api, servers[1], 'ACTIVE') + self.admin_api, servers[1], 'ERROR') # assert that after a failed evac the server active on the same host # as before @@ -677,7 +677,7 @@ class ServerGroupTestV215(ServerGroupTestV21): self.admin_api.post_server_action(servers[1]['id'], post) self._wait_for_migration_status(servers[1], 'error') server_after_failed_evac = self._wait_for_state_change( - self.admin_api, servers[1], 'ACTIVE') + self.admin_api, servers[1], 'ERROR') # assert that after a failed evac the server active on the same host # as before diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py index a22db98aba41..ed485f0215b7 100644 --- a/nova/tests/functional/test_servers.py +++ b/nova/tests/functional/test_servers.py @@ -1205,6 +1205,10 @@ class ServerRebuildTestCase(integrated_helpers._IntegratedTestBase, server = self.api.get_server(server['id']) self.assertEqual(rebuild_image_ref, server['image']['id']) + # The server should be in ERROR state + self.assertEqual('ERROR', server['status']) + self.assertIn('No valid host', server['fault']['message']) + def test_rebuild_with_new_image(self): """Rebuilds a server with a different image which will run it through the scheduler to validate the image is still OK with the compute host diff --git a/nova/tests/unit/conductor/test_conductor.py b/nova/tests/unit/conductor/test_conductor.py index 4c9d35eaa70a..7f9b756ebdeb 100644 --- a/nova/tests/unit/conductor/test_conductor.py +++ b/nova/tests/unit/conductor/test_conductor.py @@ -1338,8 +1338,10 @@ class _BaseTaskTestCase(object): 'select_destinations', side_effect=exc.NoValidHost(reason='')), mock.patch('nova.scheduler.utils.build_request_spec', - return_value=request_spec) - ) as (rebuild_mock, sig_mock, fp_mock, select_dest_mock, bs_mock): + return_value=request_spec), + mock.patch.object(scheduler_utils, 'set_vm_state_and_notify') + ) as (rebuild_mock, sig_mock, fp_mock, + select_dest_mock, bs_mock, set_vm_state_and_notify_mock): self.assertRaises(exc.NoValidHost, self.conductor_manager.rebuild_instance, context=self.context, instance=inst_obj, @@ -1349,7 +1351,11 @@ class _BaseTaskTestCase(object): select_dest_mock.assert_called_once_with(self.context, fake_spec, [inst_obj.uuid], return_objects=True, return_alternates=False) + self.assertEqual( + set_vm_state_and_notify_mock.call_args[0][4]['vm_state'], + vm_states.ERROR) self.assertFalse(rebuild_mock.called) + self.assertIn('No valid host', inst_obj.fault.message) @mock.patch.object(conductor_manager.compute_rpcapi.ComputeAPI, 'rebuild_instance') @@ -1390,12 +1396,14 @@ class _BaseTaskTestCase(object): self.context, inst_obj, **rebuild_args) - updates = {'vm_state': vm_states.ACTIVE, 'task_state': None} + updates = {'vm_state': vm_states.ERROR, 'task_state': None} state_mock.assert_called_once_with(self.context, inst_obj.uuid, 'rebuild_server', updates, exception, mock.ANY) self.assertFalse(select_dest_mock.called) self.assertFalse(rebuild_mock.called) + self.assertIn('ServerGroup policy is not supported', + inst_obj.fault.message) # Assert the migration status was updated. migration = objects.Migration.get_by_id(self.context, migration.id) diff --git a/releasenotes/notes/bug-1744325-rebuild-error-status-9e2da03f3f81fd6e.yaml b/releasenotes/notes/bug-1744325-rebuild-error-status-9e2da03f3f81fd6e.yaml new file mode 100644 index 000000000000..086d4094fb4d --- /dev/null +++ b/releasenotes/notes/bug-1744325-rebuild-error-status-9e2da03f3f81fd6e.yaml @@ -0,0 +1,7 @@ +--- +fixes: + - | + If scheduling fails during rebuild the server instance will go to ERROR + state and a fault will be recorded. `Bug 1744325`_ + + .. _Bug 1744325: https://bugs.launchpad.net/nova/+bug/1744325