VM state management and error states

This implements the blueprint nova-vm-state-management.
It adds the following functionality:
- Filters compute API calls according to the state of the VM
(defined in compute/state_checker); see the sketch below.
- Sets the instance to the ERROR state if the scheduler cannot
allocate the VM on any host.
- Handles create/delete concurrency in the compute manager.
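
For illustration, a self-contained sketch of the filtering mechanism, with stand-in names and states (the real implementation in the diff below lives in nova/compute/api.py and nova/compute/state_checker.py and is guarded by the api_check_vm_states flag):

    # Minimal stand-alone model of the check_vm_state decorator.
    class InstanceInvalidState(Exception):
        pass

    # Stand-in for state_checker's per-method block lists.
    BLOCKED_TASK_STATES = {'reboot': ['scheduling', 'spawning']}

    def is_blocked(method_name, instance):
        return instance.get('task_state') in BLOCKED_TASK_STATES.get(
            method_name, ())

    def check_vm_state(method_name):
        def decorator(f):
            def wrapper(api, context, instance, *args, **kw):
                # Reject the call up front if the instance is in a state
                # where this operation makes no sense.
                if is_blocked(method_name, instance):
                    raise InstanceInvalidState(
                        '%s blocked for %s' % (method_name, instance['uuid']))
                return f(api, context, instance, *args, **kw)
            return wrapper
        return decorator

    @check_vm_state('reboot')
    def reboot(api, context, instance):
        return 'rebooting %s' % instance['uuid']

    print(reboot(None, None, {'uuid': 'abc', 'task_state': None}))  # allowed
    # reboot(None, None, {'uuid': 'abc', 'task_state': 'spawning'}) would raise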

Change-Id: Ie6d016b7d4781f70bb5967f204fa88a6412bd727
David Subiros
2011-11-16 17:31:29 +00:00
committed by Vishvananda Ishaya
parent d3b75b75aa
commit ff753cd608
11 changed files with 533 additions and 100 deletions

View File

@@ -35,6 +35,7 @@ from nova import utils
from nova import volume
from nova.compute import instance_types
from nova.compute import power_state
from nova.compute import state_checker
from nova.compute import task_states
from nova.compute import vm_states
from nova.scheduler import api as scheduler_api
@@ -49,25 +50,8 @@ flags.DECLARE('enable_zone_routing', 'nova.scheduler.api')
flags.DECLARE('vncproxy_topic', 'nova.vnc')
flags.DEFINE_integer('find_host_timeout', 30,
'Timeout after NN seconds when looking for a host.')
def _is_able_to_shutdown(instance):
vm_state = instance["vm_state"]
instance_uuid = instance["uuid"]
valid_shutdown_states = [
vm_states.ACTIVE,
vm_states.REBUILDING,
vm_states.BUILDING,
vm_states.ERROR,
]
if vm_state not in valid_shutdown_states:
LOG.warn(_("Instance %(instance_uuid)s cannot be shutdown from "
"its current state: %(vm_state)s.") % locals())
return False
return True
flags.DEFINE_boolean('api_check_vm_states', True,
'Filter calls by vm state')
def _is_queued_delete(instance):
@@ -83,6 +67,27 @@ def _is_queued_delete(instance):
return True
class check_vm_state(object):
"""Class to wrap API functions that are sensitive to the VM state.
If the instance is in the wrong state, the wrapper will raise an exception.
It uses state_checker to decide if the call is allowed or not.
"""
def __init__(self, method_name):
self.method_name = method_name
def __call__(self, f):
def _state_checker_wrap(api, context, instance, *args, **kw):
if FLAGS.api_check_vm_states and \
state_checker.is_blocked(self.method_name, context, instance):
raise exception.InstanceInvalidState(
instance_uuid=instance['uuid'], state=instance['vm_state'],
method=self.method_name)
else:
return f(api, context, instance, *args, **kw)
return _state_checker_wrap
class API(base.Base):
"""API for interacting with the compute manager."""
@@ -766,15 +771,13 @@ class API(base.Base):
rv = self.db.instance_update(context, instance["id"], kwargs)
return dict(rv.iteritems())
@scheduler_api.reroute_compute("soft_delete")
@check_vm_state(state_checker.SOFT_DELETE)
@scheduler_api.reroute_compute(state_checker.SOFT_DELETE)
def soft_delete(self, context, instance):
"""Terminate an instance."""
instance_uuid = instance["uuid"]
LOG.debug(_("Going to try to soft delete %s"), instance_uuid)
if not _is_able_to_shutdown(instance):
return
# NOTE(jerdfelt): The compute daemon handles reclaiming instances
# that are in soft delete. If there is no host assigned, there is
# no daemon to reclaim, so delete it immediately.
@@ -806,20 +809,18 @@ class API(base.Base):
else:
self.db.instance_destroy(context, instance['id'])
@scheduler_api.reroute_compute("delete")
@check_vm_state(state_checker.DELETE)
@scheduler_api.reroute_compute(state_checker.DELETE)
def delete(self, context, instance):
"""Terminate an instance."""
LOG.debug(_("Going to try to terminate %s"), instance["uuid"])
if not _is_able_to_shutdown(instance):
return
self._delete(context, instance)
@scheduler_api.reroute_compute("restore")
@check_vm_state(state_checker.RESTORE)
@scheduler_api.reroute_compute(state_checker.RESTORE)
def restore(self, context, instance):
"""Restore a previously deleted (but not reclaimed) instance."""
if not _is_queued_delete(instance):
return
@@ -837,7 +838,8 @@ class API(base.Base):
self._cast_compute_message('power_on_instance', context,
instance['uuid'], host)
@scheduler_api.reroute_compute("force_delete")
@check_vm_state(state_checker.FORCE_DELETE)
@scheduler_api.reroute_compute(state_checker.FORCE_DELETE)
def force_delete(self, context, instance):
"""Force delete a previously deleted (but not reclaimed) instance."""
@@ -846,15 +848,13 @@ class API(base.Base):
self._delete(context, instance)
@scheduler_api.reroute_compute("stop")
@check_vm_state(state_checker.STOP)
@scheduler_api.reroute_compute(state_checker.STOP)
def stop(self, context, instance):
"""Stop an instance."""
instance_uuid = instance["uuid"]
LOG.debug(_("Going to try to stop %s"), instance_uuid)
if not _is_able_to_shutdown(instance):
return
self.update(context,
instance,
vm_state=vm_states.ACTIVE,
@@ -867,6 +867,7 @@ class API(base.Base):
self._cast_compute_message('stop_instance', context,
instance_uuid, host)
@check_vm_state(state_checker.START)
def start(self, context, instance):
"""Start an instance."""
vm_state = instance["vm_state"]
@@ -1078,7 +1079,8 @@ class API(base.Base):
raise exception.Error(_("Unable to find host for Instance %s")
% instance_uuid)
@scheduler_api.reroute_compute("backup")
@check_vm_state(state_checker.BACKUP)
@scheduler_api.reroute_compute(state_checker.BACKUP)
def backup(self, context, instance, name, backup_type, rotation,
extra_properties=None):
"""Backup the given instance
@@ -1095,7 +1097,8 @@ class API(base.Base):
extra_properties=extra_properties)
return recv_meta
@scheduler_api.reroute_compute("snapshot")
@check_vm_state(state_checker.SNAPSHOT)
@scheduler_api.reroute_compute(state_checker.SNAPSHOT)
def snapshot(self, context, instance, name, extra_properties=None):
"""Snapshot the given instance.
@@ -1125,12 +1128,6 @@ class API(base.Base):
task_state = instance["task_state"]
instance_uuid = instance['uuid']
if task_state == task_states.IMAGE_BACKUP:
raise exception.InstanceBackingUp(instance_uuid=instance_uuid)
if task_state == task_states.IMAGE_SNAPSHOT:
raise exception.InstanceSnapshotting(instance_uuid=instance_uuid)
properties = {
'instance_uuid': instance_uuid,
'user_id': str(context.user_id),
@@ -1150,7 +1147,8 @@ class API(base.Base):
params=params)
return recv_meta
@scheduler_api.reroute_compute("reboot")
@check_vm_state(state_checker.REBOOT)
@scheduler_api.reroute_compute(state_checker.REBOOT)
def reboot(self, context, instance, reboot_type):
"""Reboot the given instance."""
state = {'SOFT': task_states.REBOOTING,
@@ -1164,16 +1162,13 @@ class API(base.Base):
instance['uuid'],
params={'reboot_type': reboot_type})
@scheduler_api.reroute_compute("rebuild")
@check_vm_state(state_checker.REBUILD)
@scheduler_api.reroute_compute(state_checker.REBUILD)
def rebuild(self, context, instance, image_href, admin_password,
name=None, metadata=None, files_to_inject=None):
"""Rebuild the given instance with the provided metadata."""
name = name or instance["display_name"]
if instance["vm_state"] != vm_states.ACTIVE:
msg = _("Instance must be active to rebuild.")
raise exception.RebuildRequiresActiveInstance(msg)
files_to_inject = files_to_inject or []
metadata = metadata or {}
@@ -1199,7 +1194,8 @@ class API(base.Base):
instance["uuid"],
params=rebuild_params)
@scheduler_api.reroute_compute("revert_resize")
@check_vm_state(state_checker.REVERT_RESIZE)
@scheduler_api.reroute_compute(state_checker.REVERT_RESIZE)
def revert_resize(self, context, instance):
"""Reverts a resize, deleting the 'new' instance in the process."""
context = context.elevated()
@@ -1223,7 +1219,8 @@ class API(base.Base):
self.db.migration_update(context, migration_ref['id'],
{'status': 'reverted'})
@scheduler_api.reroute_compute("confirm_resize")
@check_vm_state(state_checker.CONFIRM_RESIZE)
@scheduler_api.reroute_compute(state_checker.CONFIRM_RESIZE)
def confirm_resize(self, context, instance):
"""Confirms a migration/resize and deletes the 'old' instance."""
context = context.elevated()
@@ -1249,7 +1246,8 @@ class API(base.Base):
self.db.instance_update(context, instance['uuid'],
{'host': migration_ref['dest_compute'], })
@scheduler_api.reroute_compute("resize")
@check_vm_state(state_checker.RESIZE)
@scheduler_api.reroute_compute(state_checker.RESIZE)
def resize(self, context, instance, flavor_id=None):
"""Resize (ie, migrate) a running instance.
@@ -1330,7 +1328,8 @@ class API(base.Base):
# didn't raise so this is the correct zone
self.network_api.add_network_to_project(context, project_id)
@scheduler_api.reroute_compute("pause")
@check_vm_state(state_checker.PAUSE)
@scheduler_api.reroute_compute(state_checker.PAUSE)
def pause(self, context, instance):
"""Pause the given instance."""
instance_uuid = instance["uuid"]
@@ -1340,7 +1339,8 @@ class API(base.Base):
task_state=task_states.PAUSING)
self._cast_compute_message('pause_instance', context, instance_uuid)
@scheduler_api.reroute_compute("unpause")
@check_vm_state(state_checker.UNPAUSE)
@scheduler_api.reroute_compute(state_checker.UNPAUSE)
def unpause(self, context, instance):
"""Unpause the given instance."""
instance_uuid = instance["uuid"]
@@ -1377,7 +1377,8 @@ class API(base.Base):
"""Retrieve actions for the given instance."""
return self.db.instance_get_actions(context, instance['id'])
@scheduler_api.reroute_compute("suspend")
@check_vm_state(state_checker.SUSPEND)
@scheduler_api.reroute_compute(state_checker.SUSPEND)
def suspend(self, context, instance):
"""Suspend the given instance."""
instance_uuid = instance["uuid"]
@@ -1387,7 +1388,8 @@ class API(base.Base):
task_state=task_states.SUSPENDING)
self._cast_compute_message('suspend_instance', context, instance_uuid)
@scheduler_api.reroute_compute("resume")
@check_vm_state(state_checker.RESUME)
@scheduler_api.reroute_compute(state_checker.RESUME)
def resume(self, context, instance):
"""Resume the given instance."""
instance_uuid = instance["uuid"]
@@ -1397,7 +1399,8 @@ class API(base.Base):
task_state=task_states.RESUMING)
self._cast_compute_message('resume_instance', context, instance_uuid)
@scheduler_api.reroute_compute("rescue")
@check_vm_state(state_checker.RESCUE)
@scheduler_api.reroute_compute(state_checker.RESCUE)
def rescue(self, context, instance, rescue_password=None):
"""Rescue the given instance."""
self.update(context,
@@ -1412,7 +1415,8 @@ class API(base.Base):
instance['uuid'],
params=rescue_params)
@scheduler_api.reroute_compute("unrescue")
@check_vm_state(state_checker.UNRESCUE)
@scheduler_api.reroute_compute(state_checker.UNRESCUE)
def unrescue(self, context, instance):
"""Unrescue the given instance."""
self.update(context,
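
One behavioral note on the hunks above: where the removed _is_able_to_shutdown check logged a warning and silently returned None, a blocked call now raises InstanceInvalidState, which the API layers can map to a client-visible error. A hedged, self-contained sketch of the caller-side difference (FakeComputeAPI is a stand-in, not part of the diff):

    class InstanceInvalidState(Exception):   # stand-in for nova.exception's
        pass

    class FakeComputeAPI(object):            # stand-in for compute.api.API
        def stop(self, context, instance):
            if instance['vm_state'] == 'building':
                raise InstanceInvalidState(instance['uuid'])
            return 'stopping %s' % instance['uuid']

    api = FakeComputeAPI()
    try:
        api.stop(None, {'uuid': 'abc', 'vm_state': 'building'})
    except InstanceInvalidState:
        print('blocked: instance is still building')  # old code returned None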

View File

@@ -297,6 +297,34 @@ class ComputeManager(manager.SchedulerDependentManager):
return (swap, ephemerals, block_device_mapping)
def _is_instance_terminated(self, instance_uuid):
"""Instance in DELETING task state or not found in DB"""
context = nova.context.get_admin_context()
try:
instance = self.db.instance_get_by_uuid(context, instance_uuid)
if instance['task_state'] == task_states.DELETING:
return True
return False
except exception.InstanceNotFound:
return True
def _shutdown_instance_even_if_deleted(self, context, instance_uuid):
"""Call terminate_instance even for already deleted instances"""
LOG.info(_("Going to force the deletion of the vm %(instance_uuid)s, "
"even if it is deleted") % locals())
try:
try:
self.terminate_instance(context, instance_uuid)
except exception.InstanceNotFound:
LOG.info(_("Instance %(instance_uuid)s did not exist in the "
"DB, but I will shut it down anyway using a special "
"context") % locals())
ctxt = nova.context.get_admin_context(True)
self.terminate_instance(ctxt, instance_uuid)
except Exception as ex:
LOG.info(_("exception %(ex)s terminating the instance "
"%(instance_uuid)s") % locals())
def _run_instance(self, context, instance_uuid,
requested_networks=None,
injected_files=[],
@@ -320,9 +348,14 @@ class ComputeManager(manager.SchedulerDependentManager):
with utils.save_and_reraise_exception():
self._deallocate_network(context, instance)
self._notify_about_instance_usage(instance)
if self._is_instance_terminated(instance_uuid):
raise exception.InstanceNotFound
except exception.InstanceNotFound:
LOG.exception(_("Instance %s not found.") % instance_uuid)
return # assuming the instance was already deleted
# assuming the instance was already deleted, run "delete" again
# just in case
self._shutdown_instance_even_if_deleted(context, instance_uuid)
return
except Exception as e:
with utils.save_and_reraise_exception():
self._instance_update(context, instance_uuid,
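
The hunk above closes the create/delete race: if a concurrent delete removes the instance while _run_instance is still building it, the build failure is converted into InstanceNotFound and terminate is re-run (with an admin context that can see deleted rows) so nothing half-built leaks. A self-contained sketch of the guard, with a plain dict standing in for the DB:

    DELETING = 'deleting'

    def is_instance_terminated(db, instance_uuid):
        """True if the instance is being deleted or is already gone."""
        try:
            instance = db[instance_uuid]  # stand-in for instance_get_by_uuid
        except KeyError:                  # stand-in for InstanceNotFound
            return True
        return instance['task_state'] == DELETING

    db = {'abc': {'task_state': DELETING}}
    assert is_instance_terminated(db, 'abc')      # concurrent delete in flight
    assert is_instance_terminated(db, 'missing')  # row already gone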

View File

@@ -0,0 +1,137 @@
# vim: tabstop=4 shiftwidth=4 softtabstop=4
# Copyright 2010 United States Government as represented by the
# Administrator of the National Aeronautics and Space Administration.
# Copyright 2011 Justin Santa Barbara
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from nova import db
from nova.compute import task_states as ts
from nova.compute import vm_states as vm
from nova import context as ctxt
# Names of the API functions that run the state check before executing:
REBOOT = 'reboot'
START = 'start'
REBUILD = 'rebuild'
STOP = 'stop'
PAUSE = 'pause'
BACKUP = 'backup'
UNPAUSE = 'unpause'
SUSPEND = 'suspend'
RESUME = 'resume'
RESCUE = 'rescue'
UNRESCUE = 'unrescue'
SNAPSHOT = 'snapshot'
RESIZE = 'resize'
CONFIRM_RESIZE = 'confirm_resize'
REVERT_RESIZE = 'revert_resize'
DELETE = 'delete'
SOFT_DELETE = 'soft_delete'
FORCE_DELETE = 'force_delete'
RESTORE = 'restore'
# Auxiliary lists, precomputed to save CPU time; used by the blocker
# dictionaries below
all_ts_but_resize_verify = list(set(ts.get_list()) - set([ts.RESIZE_VERIFY]))
all_vm_but_act_resc = list(set(vm.get_list()) - set([vm.ACTIVE, vm.RESCUED]))
all_vm_but_active = list(set(vm.get_list()) - set([vm.ACTIVE]))
# Call blocked if the vm task_state is found in the corresponding list
block_for_task_state = {
REBOOT: all_ts_but_resize_verify,
START: all_ts_but_resize_verify,
REBUILD: all_ts_but_resize_verify,
PAUSE: all_ts_but_resize_verify,
STOP: all_ts_but_resize_verify,
UNPAUSE: all_ts_but_resize_verify,
SUSPEND: all_ts_but_resize_verify,
RESUME: all_ts_but_resize_verify,
RESCUE: all_ts_but_resize_verify,
UNRESCUE: all_ts_but_resize_verify,
SNAPSHOT: all_ts_but_resize_verify,
BACKUP: all_ts_but_resize_verify,
RESIZE: all_ts_but_resize_verify,
CONFIRM_RESIZE: all_ts_but_resize_verify,
REVERT_RESIZE: all_ts_but_resize_verify}
# Call blocked if the vm vm_state is found in the corresponding list
block_for_vm_state = {
REBOOT: all_vm_but_act_resc,
START: list(set(vm.get_list()) - set([vm.STOPPED])),
REBUILD: all_vm_but_active,
PAUSE: all_vm_but_act_resc,
STOP: all_vm_but_act_resc,
UNPAUSE: list(set(vm.get_list()) - set([vm.PAUSED])),
SUSPEND: all_vm_but_act_resc,
RESUME: list(set(vm.get_list()) - set([vm.SUSPENDED])),
RESCUE: list(set(vm.get_list()) - set([vm.ACTIVE, vm.STOPPED])),
UNRESCUE: list(set(vm.get_list()) - set([vm.ACTIVE, vm.RESCUED])),
SNAPSHOT: all_vm_but_active,
BACKUP: all_vm_but_active,
RESIZE: all_vm_but_active,
CONFIRM_RESIZE: all_vm_but_active,
REVERT_RESIZE: all_vm_but_active}
# Call blocked if the combination of vm_state, power_state and task_state is
# found in the corresponding list
block_for_combination = {
CONFIRM_RESIZE: [{'vm_state': vm.ACTIVE, 'task_state': None}],
REVERT_RESIZE: [{'vm_state': vm.ACTIVE, 'task_state': None}]}
def is_blocked(method_name, context, instance_ref):
"""
Is the method blocked for the VM's current state?
This method returns True if the state of the VM is found
in the blocked dictionaries for the method.
"""
if instance_ref['task_state'] in block_for_task_state.get(method_name, ()):
return True
if instance_ref['vm_state'] in block_for_vm_state.get(method_name, ()):
return True
if method_name in block_for_combination:
return _is_combination_blocked(method_name, instance_ref)
# Allow the method if not found in any list
return False
def _is_combination_blocked(method_name, instance_ref):
"""
Is the method blocked according to the block_for_combination dictionary?
To be blocked, every element of one of the method's dictionaries
needs to match the instance's states. If a key is not present in a
dictionary, the dictionary applies to any value of that particular
element.
"""
for blocked_element in block_for_combination[method_name]:
# Check power state
if 'power_state' in blocked_element and instance_ref['power_state']\
!= blocked_element['power_state']:
continue
# Check vm state
if 'vm_state' in blocked_element and instance_ref['vm_state']\
!= blocked_element['vm_state']:
continue
# Check task state
if 'task_state' in blocked_element and instance_ref['task_state']\
!= blocked_element['task_state']:
continue
return True
# After analyzing all the dictionaries for the method, none tells us to
# block the function
return False
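
A worked example of the lookup rules above, re-implemented with literal state strings so it runs standalone (the real is_blocked also takes a context argument and derives its lists from task_states.get_list() and vm_states.get_list()):

    block_for_task_state = {'reboot': ['scheduling', 'spawning']}
    block_for_vm_state = {'reboot': ['building', 'error']}

    def is_blocked(method_name, instance):
        if instance['task_state'] in block_for_task_state.get(method_name, ()):
            return True
        if instance['vm_state'] in block_for_vm_state.get(method_name, ()):
            return True
        return False

    # A reboot is refused while the instance is still scheduling ...
    assert is_blocked('reboot',
                      {'vm_state': 'active', 'task_state': 'scheduling'})
    # ... but allowed once the instance is active with no task in flight.
    assert not is_blocked('reboot',
                          {'vm_state': 'active', 'task_state': None})

The combination table serves the resize pair: confirm_resize and revert_resize are additionally blocked for an ACTIVE instance with no task in flight, since they only make sense while the instance sits in RESIZE_VERIFY.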

View File

@@ -60,3 +60,14 @@ UNRESCUING = 'unrescuing'
DELETING = 'deleting'
STOPPING = 'stopping'
STARTING = 'starting'
def get_list():
"""Returns a list of all the possible task_states"""
return [SCHEDULING, BLOCK_DEVICE_MAPPING, NETWORKING, SPAWNING,
IMAGE_SNAPSHOT, IMAGE_BACKUP, UPDATING_PASSWORD, RESIZE_PREP,
RESIZE_MIGRATING, RESIZE_MIGRATED, RESIZE_FINISH, RESIZE_REVERTING,
RESIZE_CONFIRMING, RESIZE_VERIFY, REBUILDING, REBOOTING,
REBOOTING_HARD, PAUSING, UNPAUSING, SUSPENDING, RESUMING,
POWERING_OFF, POWERING_ON, RESCUING, UNRESCUING, DELETING,
STOPPING, STARTING]

View File

@@ -38,3 +38,9 @@ MIGRATING = 'migrating'
RESIZING = 'resizing'
ERROR = 'error'
def get_list():
"""Returns a list of all the possible vm_states"""
return [ACTIVE, BUILDING, REBUILDING, PAUSED, SUSPENDED, RESCUED,
DELETED, STOPPED, SOFT_DELETE, MIGRATING, RESIZING, ERROR]

View File

@@ -250,6 +250,11 @@ class InvalidParameterValue(Invalid):
message = _("%(err)s")
class InstanceInvalidState(Invalid):
message = _("Instance %(instance_uuid)s in state %(state)s. Cannot "
"%(method)s while the instance is in this state.")
class InstanceNotRunning(Invalid):
message = _("Instance %(instance_id)s is not running.")
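
For reference, the new InstanceInvalidState template interpolates three keys, so each raise site must supply instance_uuid, state and method (illustrative values; see the decorator in compute/api.py above):

    message = ("Instance %(instance_uuid)s in state %(state)s. Cannot "
               "%(method)s while the instance is in this state.")
    print(message % {'instance_uuid': 'abc-123',
                     'state': 'building',
                     'method': 'reboot'})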

View File

@@ -25,12 +25,13 @@ import functools
from nova.compute import vm_states
from nova import db
from nova import exception
from nova import flags
from nova import log as logging
from nova import manager
from nova import rpc
from nova import utils
from nova.scheduler import zone_manager
from nova import utils
LOG = logging.getLogger('nova.scheduler.manager')
FLAGS = flags.FLAGS
@@ -101,17 +102,24 @@ class SchedulerManager(manager.Manager):
# Scheduler methods are responsible for casting.
try:
return real_meth(*args, **kwargs)
except Exception as e:
# If this affects a particular instance, move that
# instance to the ERROR state
if 'instance_id' in kwargs:
instance_id = kwargs['instance_id']
LOG.warning(_("Failed to %(driver_method)s: %(e)s. "
"Putting instance %(instance_id)s into "
except exception.NoValidHost as ex:
self._set_instance_error(method, context, ex, *args, **kwargs)
except Exception as ex:
with utils.save_and_reraise_exception():
self._set_instance_error(method, context, ex, *args, **kwargs)
# NOTE (David Subiros) : If the exception is raised during the run_instance
# method, the DB record probably does not exist yet.
def _set_instance_error(self, method, context, ex, *args, **kwargs):
"""Sets VM to Error state"""
LOG.warning(_("Failed to schedule_%(method)s: %(ex)s") % locals())
if method == "start_instance" or method == "run_instance":
instance_id = kwargs['instance_id']
if instance_id:
LOG.warning(_("Setting instance %(instance_id)s to "
"ERROR state.") % locals())
db.instance_update(context, kwargs['instance_id'],
dict(vm_state=vm_states.ERROR))
raise
db.instance_update(context, instance_id,
{'vm_state': vm_states.ERROR})
# NOTE (masumotok) : This method should be moved to nova.api.ec2.admin.
# Based on bexar design summit discussion,
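
In outline, the new scheduler error path treats NoValidHost as a handled outcome (mark the instance as ERROR and swallow the exception), while any other exception is re-raised after marking. A condensed, self-contained sketch with stand-ins for exception.NoValidHost, utils.save_and_reraise_exception and db.instance_update:

    ERROR = 'error'

    class NoValidHost(Exception):
        pass

    def set_instance_error(method, kwargs, db):
        # Only start/run carry an instance_id worth flagging; for
        # run_instance the DB row may not even exist yet, hence the guard.
        if method in ('start_instance', 'run_instance'):
            instance_id = kwargs.get('instance_id')
            if instance_id:
                db[instance_id] = {'vm_state': ERROR}

    def schedule(method, real_meth, db, **kwargs):
        try:
            return real_meth()
        except NoValidHost:
            set_instance_error(method, kwargs, db)  # swallowed
        except Exception:
            set_instance_error(method, kwargs, db)
            raise                                   # everything else propagates

    def no_host():
        raise NoValidHost('no host found')

    db = {}
    schedule('run_instance', no_host, db, instance_id=42)
    assert db[42]['vm_state'] == ERROR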

View File

@@ -1418,9 +1418,10 @@ class CloudTestCase(test.TestCase):
'max_count': 1, }
instance_id = self._run_instance(**kwargs)
# a running instance can't be started. It is just ignored.
result = self.cloud.start_instances(self.context, [instance_id])
self.assertTrue(result)
# a running instance can't be started.
self.assertRaises(exception.InstanceInvalidState,
self.cloud.start_instances,
self.context, [instance_id])
result = self.cloud.stop_instances(self.context, [instance_id])
self.assertTrue(result)
@@ -1469,9 +1470,10 @@ class CloudTestCase(test.TestCase):
'max_count': 1, }
instance_id = self._run_instance(**kwargs)
# a running instance can't be started. It is just ignored.
result = self.cloud.start_instances(self.context, [instance_id])
self.assertTrue(result)
# a running instance can't be started.
self.assertRaises(exception.InstanceInvalidState,
self.cloud.start_instances,
self.context, [instance_id])
result = self.cloud.terminate_instances(self.context, [instance_id])
self.assertTrue(result)
@@ -1483,9 +1485,10 @@ class CloudTestCase(test.TestCase):
'max_count': 1, }
instance_id = self._run_instance(**kwargs)
# a running instance can't be started. It is just ignored.
result = self.cloud.start_instances(self.context, [instance_id])
self.assertTrue(result)
# a running instance can't be started.
self.assertRaises(exception.InstanceInvalidState,
self.cloud.start_instances,
self.context, [instance_id])
result = self.cloud.reboot_instances(self.context, [instance_id])
self.assertTrue(result)

View File

@@ -1182,6 +1182,11 @@ class ServersControllerTest(test.TestCase):
body = dict(reboot=dict(type="HARD"))
req = fakes.HTTPRequest.blank(
'/v2/fake/servers/%s/action' % FAKE_UUID)
# Assume the instance is in ACTIVE state before calling reboot
self.stubs.Set(nova.db, 'instance_get',
return_server_with_state(vm_states.ACTIVE))
self.stubs.Set(nova.db, 'instance_get_by_uuid',
return_server_with_state(vm_states.ACTIVE))
self.controller.action(req, FAKE_UUID, body)
self.test_server_actions()

View File

@@ -21,6 +21,7 @@ Tests For Scheduler
import datetime
import mox
import stubout
from novaclient import v1_1 as novaclient
from novaclient import exceptions as novaclient_exceptions
@@ -38,9 +39,9 @@ from nova.scheduler import driver
from nova.scheduler import manager
from nova.scheduler.simple import SimpleScheduler
from nova.compute import power_state
from nova.compute import task_states
from nova.compute import vm_states
FLAGS = flags.FLAGS
flags.DECLARE('max_cores', 'nova.scheduler.simple')
flags.DECLARE('stub_network', 'nova.compute.manager')
@@ -143,6 +144,10 @@ class SchedulerTestCase(test.TestCase):
driver = 'nova.tests.scheduler.test_scheduler.TestDriver'
self.flags(scheduler_driver=driver)
def tearDown(self):
self.stubs.UnsetAll()
super(SchedulerTestCase, self).tearDown()
def _create_compute_service(self):
"""Create compute-manager(ComputeNode and Service record)."""
ctxt = context.get_admin_context()
@@ -205,6 +210,41 @@ class SchedulerTestCase(test.TestCase):
return False
return True
def _assert_state(self, state_dict):
"""assert the instance is in the state defined by state_dict"""
instances = db.instance_get_all(context.get_admin_context())
self.assertEqual(len(instances), 1)
if 'vm_state' in state_dict:
self.assertEqual(state_dict['vm_state'], instances[0]['vm_state'])
if 'task_state' in state_dict:
self.assertEqual(state_dict['task_state'],
instances[0]['task_state'])
if 'power_state' in state_dict:
self.assertEqual(state_dict['power_state'],
instances[0]['power_state'])
def test_no_valid_host_exception_on_start(self):
"""check the vm goes to ERROR state if the scheduler fails.
If the scheduler driver cannot allocate a host for the VM during
start_instance, it will raise a NoValidHost exception. In this
scenario, we have to make sure that the VM state is set to ERROR.
"""
def NoValidHost_raiser(context, topic, *args, **kwargs):
raise exception.NoValidHost(_("Test NoValidHost exception"))
scheduler = manager.SchedulerManager()
ins_ref = _create_instance(task_state=task_states.STARTING,
vm_state=vm_states.STOPPED)
self.stubs = stubout.StubOutForTesting()
self.stubs.Set(TestDriver, 'schedule', NoValidHost_raiser)
self.mox.StubOutWithMock(rpc, 'cast', use_mock_anything=True)
ctxt = context.get_admin_context()
scheduler.start_instance(ctxt, 'topic', instance_id=ins_ref['id'])
# assert that the instance goes to ERROR state
self._assert_state({'vm_state': vm_states.ERROR,
'task_state': task_states.STARTING})
def test_show_host_resources_no_project(self):
"""No instance are running on the given host."""
@@ -247,21 +287,6 @@ class SchedulerTestCase(test.TestCase):
db.instance_destroy(ctxt, i_ref1['id'])
db.instance_destroy(ctxt, i_ref2['id'])
def test_exception_puts_instance_in_error_state(self):
"""Test that an exception from the scheduler puts an instance
in the ERROR state."""
scheduler = manager.SchedulerManager()
ctxt = context.get_admin_context()
inst = _create_instance()
self.assertRaises(Exception, scheduler._schedule,
'failing_method', ctxt, 'scheduler',
instance_id=inst['uuid'])
# Refresh the instance
inst = db.instance_get(ctxt, inst['id'])
self.assertEqual(inst['vm_state'], vm_states.ERROR)
class SimpleDriverTestCase(test.TestCase):
"""Test case for simple driver"""

View File

@@ -19,7 +19,6 @@
"""
Tests For Compute
"""
from copy import copy
from webob import exc
@@ -30,6 +29,7 @@ from nova import compute
from nova.compute import instance_types
from nova.compute import manager as compute_manager
from nova.compute import power_state
from nova.compute import state_checker
from nova.compute import task_states
from nova.compute import vm_states
from nova import context
@@ -111,6 +111,8 @@ class BaseTestCase(test.TestCase):
self.project_id = 'fake'
self.context = context.RequestContext(self.user_id, self.project_id)
test_notifier.NOTIFICATIONS = []
self.mox = mox.Mox()
self.total_waits = 0
def fake_show(meh, context, id):
return {'id': 1, 'min_disk': None, 'min_ram': None,
@@ -120,7 +122,14 @@ class BaseTestCase(test.TestCase):
self.stubs.Set(rpc, 'call', rpc_call_wrapper)
self.stubs.Set(rpc, 'cast', rpc_cast_wrapper)
def _create_fake_instance(self, params=None):
def tearDown(self):
self.mox.UnsetStubs()
instances = db.instance_get_all(self.context.elevated())
for instance in instances:
db.instance_destroy(self.context.elevated(), instance['id'])
super(BaseTestCase, self).tearDown()
def _create_fake_instance(self, params=None, type_name='m1.tiny'):
"""Create a test instance"""
if not params:
params = {}
@@ -131,12 +140,16 @@ class BaseTestCase(test.TestCase):
inst['launch_time'] = '10'
inst['user_id'] = self.user_id
inst['project_id'] = self.project_id
type_id = instance_types.get_instance_type_by_name('m1.tiny')['id']
type_id = instance_types.get_instance_type_by_name(type_name)['id']
inst['instance_type_id'] = type_id
inst['ami_launch_index'] = 0
inst.update(params)
return db.instance_create(self.context, inst)
def _create_instance(self, params=None, type_name='m1.tiny'):
"""Create a test instance. Returns uuid"""
return self._create_fake_instance(params, type_name=type_name)['uuid']
def _create_instance_type(self, params=None):
"""Create a test instance type"""
if not params:
@@ -195,6 +208,77 @@ class ComputeTestCase(BaseTestCase):
finally:
db.instance_destroy(self.context, instance['id'])
def _assert_state(self, state_dict):
"""assert the instance is in the state defined by state_dict"""
instances = db.instance_get_all(context.get_admin_context())
self.assertEqual(len(instances), 1)
if 'vm_state' in state_dict:
self.assertEqual(state_dict['vm_state'], instances[0]['vm_state'])
if 'task_state' in state_dict:
self.assertEqual(state_dict['task_state'],
instances[0]['task_state'])
if 'power_state' in state_dict:
self.assertEqual(state_dict['power_state'],
instances[0]['power_state'])
def test_fail_to_schedule_persists(self):
"""check the persistence of the ERROR(scheduling) state"""
self._create_instance(params={'vm_state': vm_states.ERROR,
'task_state': task_states.SCHEDULING})
#check state is failed even after the periodic poll
error_list = self.compute.periodic_tasks(context.get_admin_context())
self._assert_state({'vm_state': vm_states.ERROR,
'task_state': task_states.SCHEDULING})
def test_run_instance_setup_block_device_mapping_fail(self):
""" block device mapping failure test.
Make sure that when there is a block device mapping problem,
the instance goes to ERROR state, keeping the task state
"""
def fake(*args, **kwargs):
raise Exception("Failed to block device mapping")
self.stubs.Set(nova.compute.manager.ComputeManager,
'_setup_block_device_mapping', fake)
instance_uuid = self._create_instance()
self.assertRaises(Exception, self.compute.run_instance,
self.context, instance_uuid)
#check state is failed even after the periodic poll
self._assert_state({'vm_state': vm_states.ERROR,
'task_state': task_states.BLOCK_DEVICE_MAPPING})
error_list = self.compute.periodic_tasks(context.get_admin_context())
self._assert_state({'vm_state': vm_states.ERROR,
'task_state': task_states.BLOCK_DEVICE_MAPPING})
def test_run_instance_spawn_fail(self):
""" spawn failure test.
Make sure that when there is a spawning problem,
the instance goes to ERROR state, keeping the task state"""
def fake(*args, **kwargs):
raise Exception("Failed to spawn")
self.stubs.Set(self.compute.driver, 'spawn', fake)
instance_uuid = self._create_instance()
self.assertRaises(Exception, self.compute.run_instance,
self.context, instance_uuid)
#check state is failed even after the periodic poll
self._assert_state({'vm_state': vm_states.ERROR,
'task_state': task_states.SPAWNING})
error_list = self.compute.periodic_tasks(context.get_admin_context())
self._assert_state({'vm_state': vm_states.ERROR,
'task_state': task_states.SPAWNING})
def test_can_terminate_on_error_state(self):
"""Make sure that the instance can be terminated in ERROR state"""
elevated = context.get_admin_context()
#check failed to schedule --> terminate
instance_uuid = self._create_instance(params={'vm_state':
vm_states.ERROR})
self.compute.terminate_instance(self.context, instance_uuid)
self.assertRaises(exception.InstanceNotFound, db.instance_get_by_uuid,
elevated, instance_uuid)
def test_run_terminate(self):
"""Make sure it is possible to run and terminate instance"""
instance = self._create_fake_instance()
@@ -1157,6 +1241,108 @@ class ComputeAPITestCase(BaseTestCase):
'properties': {'kernel_id': 1, 'ramdisk_id': 1},
}
def test_check_vm_state_filtered_function(self):
"""Test the check_vm_state mechanism for filtered functions.
Checks that filtered_function is blocked in the right states, and
only when the api_check_vm_states flag is set to True.
Note that filtered_function takes the same number of arguments
as the real functions that are decorated in the compute API.
"""
@compute.api.check_vm_state('filtered_function')
def filtered_function(api, context, instance_ref):
LOG.debug("filtered_function executed")
return True
def filtered_assume_right_state(instance_ref):
self.flags(api_check_vm_states=True)
self.assertTrue(filtered_function(self.compute_api,
self.context, instance_ref))
def filtered_assume_wrong_state(instance_ref):
self.flags(api_check_vm_states=True)
self.assertRaises(exception.InstanceInvalidState,
filtered_function, self.compute_api,
self.context, instance_ref)
self.flags(api_check_vm_states=False)
self.assertTrue(filtered_function(self.compute_api,
self.context, instance_ref))
# check that the filtered_function is correctly filtered
self._execute_allowed_and_blocked('filtered_function',
filtered_assume_right_state,
filtered_assume_wrong_state)
def test_check_vm_state_non_filtered_function(self):
"""Test the check_vm_state mechanism for non filtered functions.
Checks that a function decorated with check_vm_state but not
listed in any blocked dictionary is always executed.
"""
@compute.api.check_vm_state('non_filtered_function')
def non_filtered_function(api, context, instance_ref):
LOG.debug("non_filtered_function executed")
return True
def non_filtered_assume_executed(instance_ref):
self.flags(api_check_vm_states=True)
self.assertTrue(non_filtered_function(self.compute_api,
self.context, instance_ref))
# check that the non_filtered_function is never filtered
self._execute_allowed_and_blocked('non_filtered_function',
non_filtered_assume_executed,
non_filtered_assume_executed)
def _execute_allowed_and_blocked(self, func_name, f_allowed, f_blocked):
"""Execute f_allowed and f_blocked functions for all the scenarios.
Builds an allowed and a blocked vm_state, task_state and state
combination for the function named by func_name, then executes
f_allowed or f_blocked accordingly, passing a freshly created
instance as parameter. These functions must run the func_name
function and assert the expected result.
"""
# define blocked and allowed states
blocked_tsk = task_states.SCHEDULING
ok_task = task_states.NETWORKING
blocked_vm = vm_states.BUILDING
ok_vm = vm_states.RESCUED
blocked_comb = {'power_state': power_state.RUNNING,
'vm_state': vm_states.ACTIVE, 'task_state': None}
ok_comb = {'power_state': power_state.RUNNING,
'vm_state': vm_states.PAUSED, 'task_state': None}
# To guarantee 100% test coverage, we create fake lists.
fake_block_for_task_state = {'filtered_function': [blocked_tsk]}
fake_block_for_vm_state = {'filtered_function': [blocked_vm]}
fake_block_for_combination = {'filtered_function': [blocked_comb]}
self.stubs.Set(nova.compute.state_checker, 'block_for_task_state',
fake_block_for_task_state)
self.stubs.Set(nova.compute.state_checker, 'block_for_vm_state',
fake_block_for_vm_state)
self.stubs.Set(nova.compute.state_checker, 'block_for_combination',
fake_block_for_combination)
i_ref = self._create_fake_instance(params={'task_state': blocked_tsk})
f_blocked(i_ref)
i_ref = self._create_fake_instance(params={'task_state': ok_task})
f_allowed(i_ref)
i_ref = self._create_fake_instance(params={'vm_state': blocked_vm})
f_blocked(i_ref)
i_ref = self._create_fake_instance(params={'vm_state': ok_vm})
f_allowed(i_ref)
i_ref = self._create_fake_instance(params=blocked_comb)
f_blocked(i_ref)
i_ref = self._create_fake_instance(params=ok_comb)
f_allowed(i_ref)
def test_create_with_too_little_ram(self):
"""Test an instance type with too little memory"""
@@ -1416,10 +1602,14 @@ class ComputeAPITestCase(BaseTestCase):
db.instance_destroy(self.context, instance_uuid)
def test_resume(self):
"""Ensure instance can be resumed"""
"""Ensure instance can be resumed (if suspended)"""
instance = self._create_fake_instance()
instance_uuid = instance['uuid']
instance_id = instance['id']
self.compute.run_instance(self.context, instance_uuid)
db.instance_update(self.context, instance_id,
{'vm_state': vm_states.SUSPENDED})
instance = db.instance_get(self.context, instance_id)
self.assertEqual(instance['task_state'], None)
@@ -1578,6 +1768,7 @@ class ComputeAPITestCase(BaseTestCase):
params = {'vm_state': vm_states.RESCUED, 'task_state': None}
db.instance_update(self.context, instance_uuid, params)
instance = db.instance_get_by_uuid(self.context, instance_uuid)
self.compute_api.unrescue(self.context, instance)
instance = db.instance_get_by_uuid(self.context, instance_uuid)
@@ -1625,7 +1816,7 @@ class ComputeAPITestCase(BaseTestCase):
db.instance_update(self.context, instance_uuid, instance_values)
instance = self.compute_api.get(self.context, instance_uuid)
self.assertRaises(exception.InstanceBackingUp,
self.assertRaises(exception.InstanceInvalidState,
self.compute_api.backup,
self.context,
instance,
@@ -1643,7 +1834,7 @@ class ComputeAPITestCase(BaseTestCase):
db.instance_update(self.context, instance_uuid, instance_values)
instance = self.compute_api.get(self.context, instance_uuid)
self.assertRaises(exception.InstanceSnapshotting,
self.assertRaises(exception.InstanceInvalidState,
self.compute_api.snapshot,
self.context,
instance,
@@ -1663,6 +1854,11 @@ class ComputeAPITestCase(BaseTestCase):
migration_ref = db.migration_create(context,
{'instance_uuid': instance['uuid'],
'status': 'finished'})
# set the state that the instance gets when resize finishes
db.instance_update(self.context, instance['uuid'],
{'task_state': task_states.RESIZE_VERIFY,
'vm_state': vm_states.ACTIVE})
instance = db.instance_get_by_uuid(context, instance['uuid'])
self.compute_api.confirm_resize(context, instance)
self.compute.terminate_instance(context, instance['uuid'])