Add command to delete BUILD instances and clusters

Sometimes an instance/cluster can be stuck in BUILD state forever.
Attempting to delete the instance in this state is currently not
allowed. Add force-delete and reset-status commands. Reset-status
will reset the status of an instance to ERROR and of a cluster to NONE.

The reset-status command can only be used if the instance/cluster
is in BUILD or ERROR state. Resetting the status of an instance in
ERROR state can be useful as an instance might go ACTIVE after the
specified timeout. Once the status has been reset it is possible
for an instance to go ACTIVE if it receives a heartbeat from the
guestagent.

Force-delete will combine the functionality of reset-status and delete.

Change-Id: I83f6cdcdd884e51d002295b0d1f07341990e512c
Depends-On: I957b4be5030e493e0eb8c6b6855d41b942b2823c
Partial-Bug: #1579141
This commit is contained in:
Ali Adil 2016-07-18 21:23:48 +00:00 committed by Ali Asgar Adil
parent 1ef945d6fa
commit cee1f8e6c7
13 changed files with 229 additions and 12 deletions

View File

@ -0,0 +1,6 @@
features:
- The reset-status command will set the task and status
of an instance to ERROR after which it can be deleted.
- The force-delete command will allow the deletion of
an instance even if the instance is stuck in BUILD
state.

View File

@ -21,7 +21,8 @@ from trove.cluster.tasks import ClusterTasks
from trove.common import cfg
from trove.common import exception
from trove.common.i18n import _
from trove.common.notification import DBaaSClusterGrow, DBaaSClusterShrink
from trove.common.notification import (DBaaSClusterGrow, DBaaSClusterShrink,
DBaaSClusterResetStatus)
from trove.common.notification import StartNotification
from trove.common import remote
from trove.common import server_group as srv_grp
@ -136,6 +137,16 @@ class Cluster(object):
LOG.info(_("Setting task to NONE on cluster %s") % self.id)
self.update_db(task_status=ClusterTasks.NONE)
def reset_status(self):
    """Reset the cluster task to NONE and reset each member instance.

    ``validate_cluster_available`` is called first with
    ``BUILDING_INITIAL`` as the only listed task (presumably the task
    states in which a reset is permitted -- confirm against that
    helper). The cluster task is then reset and ``reset_status()`` is
    invoked on every non-deleted instance record of this cluster.
    """
    self.validate_cluster_available([ClusterTasks.BUILDING_INITIAL])
    LOG.info(_("Resetting status to NONE on cluster %s") % self.id)
    self.reset_task()
    # Fetch the member records up front, then reset them one by one.
    member_records = inst_models.DBInstance.find_all(
        cluster_id=self.id, deleted=False).all()
    for db_inst in member_records:
        member = inst_models.load_any_instance(self.context, db_inst.id)
        member.reset_status()
@property
def id(self):
    """Identifier read from this object's ``db_info`` record."""
    db_record = self.db_info
    return db_record.id
@ -291,6 +302,12 @@ class Cluster(object):
with StartNotification(context, cluster_id=self.id):
instance_ids = [instance['id'] for instance in param]
return self.shrink(instance_ids)
elif action == "reset-status":
context.notification = DBaaSClusterResetStatus(context,
request=req)
with StartNotification(context, cluster_id=self.id):
return self.reset_status()
else:
raise exception.BadRequest(_("Action %s not supported") % action)

View File

@ -506,6 +506,16 @@ class DBaaSInstanceDelete(DBaaSAPINotification):
return ['instance_id']
class DBaaSInstanceResetStatus(DBaaSAPINotification):
    """API notification for an instance reset-status request."""

    # Decorated the same way as required_start_traits below and as both
    # methods of DBaaSClusterResetStatus, so the notification classes
    # follow one convention.
    @abc.abstractmethod
    def event_type(self):
        """Return the event type name, 'instance_reset_status'."""
        return 'instance_reset_status'

    @abc.abstractmethod
    def required_start_traits(self):
        """Return the required start traits: ['instance_id']."""
        return ['instance_id']
class DBaaSInstanceDetach(DBaaSAPINotification):
@abc.abstractmethod
@ -565,6 +575,17 @@ class DBaaSClusterDelete(DBaaSAPINotification):
return ['cluster_id']
class DBaaSClusterResetStatus(DBaaSAPINotification):
    """API notification for a cluster reset-status request."""

    @abc.abstractmethod
    def event_type(self):
        """Return the event type name, 'cluster_reset_status'."""
        return 'cluster_reset_status'

    @abc.abstractmethod
    def required_start_traits(self):
        """Return the required start traits: ['cluster_id']."""
        return ['cluster_id']
class DBaaSClusterAddShard(DBaaSAPINotification):
@abc.abstractmethod

View File

@ -246,6 +246,10 @@ class SimpleInstance(object):
def is_building(self):
return self.status in [InstanceStatus.BUILD]
@property
def is_error(self):
    """True when ``status`` is ``InstanceStatus.ERROR``."""
    current = self.status
    return current in (InstanceStatus.ERROR,)
@property
def is_datastore_running(self):
"""True if the service status indicates datastore is up and running."""
@ -292,6 +296,10 @@ class SimpleInstance(object):
if self.db_info.task_status.is_error:
return InstanceStatus.ERROR
# If we've reset the status, show it as an error
if tr_instance.ServiceStatuses.UNKNOWN == self.datastore_status.status:
return InstanceStatus.ERROR
# Check for taskmanager status.
action = self.db_info.task_status.action
if 'BUILDING' == action:
@ -597,8 +605,9 @@ class BaseInstance(SimpleInstance):
def delete(self):
def _delete_resources():
if self.is_building:
raise exception.UnprocessableEntity("Instance %s is not ready."
% self.id)
raise exception.UnprocessableEntity(
"Instance %s is not ready. (Status is %s)." %
(self.id, self.status))
LOG.debug("Deleting instance with compute id = %s.",
self.db_info.compute_instance_id)
@ -718,6 +727,20 @@ class BaseInstance(SimpleInstance):
return files
def reset_status(self):
    """Reset the status of an instance that is building or in error.

    The task status is reset and the instance's stored service status
    is saved as ``ServiceStatuses.UNKNOWN``.

    :raises UnprocessableEntity: when the instance is neither building
        nor in error.
    """
    resettable = self.is_building or self.is_error
    if not resettable:
        raise exception.UnprocessableEntity(
            "Instance %s status can only be reset in BUILD or ERROR "
            "state." % self.id)

    LOG.info(_LI("Resetting the status to ERROR on instance %s."),
             self.id)
    self.reset_task_status()

    service_status = InstanceServiceStatus.find_by(instance_id=self.id)
    service_status.set_status(tr_instance.ServiceStatuses.UNKNOWN)
    service_status.save()
class FreshInstance(BaseInstance):
@classmethod
@ -727,8 +750,8 @@ class FreshInstance(BaseInstance):
class BuiltInstance(BaseInstance):
@classmethod
def load(cls, context, id):
return load_instance(cls, context, id, needs_server=True)
def load(cls, context, id, needs_server=True):
return load_instance(cls, context, id, needs_server=needs_server)
class Instance(BuiltInstance):

View File

@ -78,7 +78,6 @@ class InstanceController(wsgi.Controller):
if not body:
raise exception.BadRequest(_("Invalid request body."))
context = req.environ[wsgi.CONTEXT_KEY]
instance = models.Instance.load(context, id)
_actions = {
'restart': self._action_restart,
'resize': self._action_resize,
@ -86,6 +85,7 @@ class InstanceController(wsgi.Controller):
'promote_to_replica_source':
self._action_promote_to_replica_source,
'eject_replica_source': self._action_eject_replica_source,
'reset_status': self._action_reset_status,
}
selected_action = None
action_name = None
@ -97,6 +97,10 @@ class InstanceController(wsgi.Controller):
"instance %(instance_id)s for tenant '%(tenant_id)s'"),
{'action_name': action_name, 'instance_id': id,
'tenant_id': tenant_id})
needs_server = True
if action_name in ['reset_status']:
needs_server = False
instance = models.Instance.load(context, id, needs_server=needs_server)
return selected_action(context, req, instance, body)
def _action_restart(self, context, req, instance, body):
@ -163,6 +167,17 @@ class InstanceController(wsgi.Controller):
instance.eject_replica_source()
return wsgi.Result(None, 202)
def _action_reset_status(self, context, req, instance, body):
    """Handle the ``reset_status`` action for an instance.

    Attaches a ``DBaaSInstanceResetStatus`` notification to the context,
    resets the instance status and fails the instance's backups.

    :param context: request context; its ``notification`` is replaced.
    :param req: the incoming request, passed to the notification.
    :param instance: the loaded instance to reset.
    :param body: request body (not read by this handler).
    :returns: an empty ``wsgi.Result`` with HTTP code 202.
    """
    context.notification = notification.DBaaSInstanceResetStatus(
        context, request=req)
    with StartNotification(context, instance_id=instance.id):
        instance.reset_status()

        # Pass the id as a logging argument so the message is only
        # formatted when debug logging is enabled.
        LOG.debug("Failing backups for instance %s.", instance.id)
        backup_model.fail_for_instance(instance.id)

    return wsgi.Result(None, 202)
def index(self, req, tenant_id):
"""Return all instances."""
LOG.info(_LI("Listing database instances for tenant '%s'"), tenant_id)

View File

@ -619,7 +619,9 @@ class FreshInstanceTasks(FreshInstance, NotifyMixin, ConfigurationMixin):
status == rd_instance.ServiceStatuses.INSTANCE_READY):
return True
elif status not in [rd_instance.ServiceStatuses.NEW,
rd_instance.ServiceStatuses.BUILDING]:
rd_instance.ServiceStatuses.BUILDING,
rd_instance.ServiceStatuses.UNKNOWN,
rd_instance.ServiceStatuses.DELETED]:
raise TroveError(_("Service not active, status: %s") % status)
c_id = self.db_info.compute_instance_id

View File

@ -42,6 +42,7 @@ from trove.tests.scenario.groups import instance_actions_group
from trove.tests.scenario.groups import instance_create_group
from trove.tests.scenario.groups import instance_delete_group
from trove.tests.scenario.groups import instance_error_create_group
from trove.tests.scenario.groups import instance_force_delete_group
from trove.tests.scenario.groups import instance_upgrade_group
from trove.tests.scenario.groups import module_group
from trove.tests.scenario.groups import negative_cluster_actions_group
@ -150,6 +151,9 @@ instance_error_create_groups.extend([instance_error_create_group.GROUP])
instance_upgrade_groups = list(instance_create_groups)
instance_upgrade_groups.extend([instance_upgrade_group.GROUP])
instance_force_delete_groups = list(base_groups)
instance_force_delete_groups.extend([instance_force_delete_group.GROUP])
backup_groups = list(instance_create_groups)
backup_groups.extend([groups.BACKUP,
groups.BACKUP_INST])
@ -195,12 +199,13 @@ user_actions_groups.extend([user_actions_group.GROUP])
# groups common to all datastores
common_groups = list(instance_actions_groups)
common_groups.extend([guest_log_groups, instance_error_create_groups,
module_groups])
instance_force_delete_groups, module_groups])
# Register: Component based groups
register(["backup"], backup_groups)
register(["backup_incremental"], backup_incremental_groups)
register(["cluster"], cluster_actions_groups)
register(["common"], common_groups)
register(["configuration"], configuration_groups)
register(["configuration_create"], configuration_create_groups)
register(["database"], database_actions_groups)
@ -209,6 +214,7 @@ register(["instance", "instance_actions"], instance_actions_groups)
register(["instance_create"], instance_create_groups)
register(["instance_error_create"], instance_error_create_groups)
register(["instance_upgrade"], instance_upgrade_groups)
register(["instance_force_delete"], instance_force_delete_groups)
register(["module"], module_groups)
register(["module_create"], module_create_groups)
register(["replication"], replication_groups)

View File

@ -87,6 +87,11 @@ INST_ERROR_DELETE = "scenario.inst_error_delete_grp"
INST_ERROR_DELETE_WAIT = "scenario.inst_error_delete_wait_grp"
# Instance Force Delete Group
INST_FORCE_DELETE = "scenario.inst_force_delete_grp"
INST_FORCE_DELETE_WAIT = "scenario.inst_force_delete_wait_grp"
# Module Group
MODULE_CREATE = "scenario.module_create_grp"
MODULE_DELETE = "scenario.module_delete_grp"

View File

@ -52,8 +52,8 @@ class InstanceErrorCreateGroup(TestGroup):
@test(depends_on_groups=[groups.INST_ERROR_CREATE],
groups=[GROUP, groups.INST_ERROR_CREATE_WAIT],
runs_after_groups=[groups.MODULE_CREATE, groups.CFGGRP_CREATE])
runs_after_groups=[groups.MODULE_CREATE, groups.CFGGRP_CREATE],
groups=[GROUP, groups.INST_ERROR_CREATE_WAIT])
class InstanceErrorCreateWaitGroup(TestGroup):
"""Test that Instance Error Create Completes."""
@ -94,6 +94,7 @@ class InstanceErrorDeleteGroup(TestGroup):
@test(depends_on_groups=[groups.INST_ERROR_DELETE],
runs_after_groups=[groups.MODULE_INST_CREATE],
groups=[GROUP, groups.INST_ERROR_DELETE_WAIT])
class InstanceErrorDeleteWaitGroup(TestGroup):
"""Test that Instance Error Delete Completes."""

View File

@ -0,0 +1,67 @@
# Copyright 2016 Tesora Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from proboscis import test
from trove.tests import PRE_INSTANCES
from trove.tests.scenario import groups
from trove.tests.scenario.groups.test_group import TestGroup
from trove.tests.scenario.runners import test_runners
GROUP = "scenario.instance_force_delete_group"
class InstanceForceDeleteRunnerFactory(test_runners.RunnerFactory):
    """Runner factory configured for the force-delete scenario runner."""

    # Runner class name and the namespace it is looked up in.
    _runner_cls = 'InstanceForceDeleteRunner'
    _runner_ns = 'instance_force_delete_runners'
@test(depends_on_groups=["services.initialize"],
      runs_after_groups=[PRE_INSTANCES, groups.INST_ERROR_CREATE],
      groups=[GROUP, groups.INST_FORCE_DELETE])
class InstanceForceDeleteGroup(TestGroup):
    """Test Instance Force Delete functionality."""

    def __init__(self):
        super(InstanceForceDeleteGroup, self).__init__(
            InstanceForceDeleteRunnerFactory.instance())

    @test
    def create_build_instance(self):
        """Create an instance in BUILD state."""
        self.test_runner.run_create_build_instance()

    # depends_on expects the test function itself (group names belong in
    # depends_on_groups); a bare method-name string is not a documented
    # dependency target, so reference the function object directly.
    @test(depends_on=[create_build_instance])
    def delete_build_instance(self):
        """Make sure the instance in BUILD state deletes."""
        self.test_runner.run_delete_build_instance()
@test(groups=[GROUP, groups.INST_FORCE_DELETE_WAIT],
      depends_on_groups=[groups.INST_FORCE_DELETE],
      runs_after_groups=[groups.MODULE_INST_CREATE])
class InstanceForceDeleteWaitGroup(TestGroup):
    """Make sure the Force Delete instance goes away."""

    def __init__(self):
        # Obtain the runner from the same factory the force-delete group
        # uses.
        runner = InstanceForceDeleteRunnerFactory.instance()
        super(InstanceForceDeleteWaitGroup, self).__init__(runner)

    @test
    def wait_for_force_delete(self):
        """Wait for the Force Delete instance to be gone."""
        self.test_runner.run_wait_for_force_delete()

View File

@ -286,7 +286,7 @@ class ModuleCreateGroup(TestGroup):
@test(depends_on_groups=[groups.INST_CREATE_WAIT, groups.MODULE_CREATE],
runs_after_groups=[groups.INST_ERROR_DELETE],
runs_after_groups=[groups.INST_ERROR_DELETE, groups.INST_FORCE_DELETE],
groups=[GROUP, groups.MODULE_INST, groups.MODULE_INST_CREATE])
class ModuleInstCreateGroup(TestGroup):
"""Test Module Instance Create functionality."""

View File

@ -0,0 +1,54 @@
# Copyright 2016 Tesora Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from proboscis import SkipTest
from trove.tests.scenario.runners.test_runners import TestRunner
class InstanceForceDeleteRunner(TestRunner):
    """Scenario runner that creates an instance and force-deletes it."""

    def __init__(self):
        super(InstanceForceDeleteRunner, self).__init__(sleep_time=1)

        # Id of the instance created by run_create_build_instance. It
        # stays None until creation succeeds; the later steps do nothing
        # while it is unset.
        self.build_inst_id = None

    def run_create_build_instance(self, expected_states=None,
                                  expected_http_code=200):
        """Create an instance and record its id for the later steps.

        :param expected_states: states handed to assert_instance_action;
            defaults to ['NEW', 'BUILD'] when not supplied.
        :param expected_http_code: HTTP code handed to
            assert_instance_action.
        :raises SkipTest: when the run is using an existing instance.
        """
        # Build the default per call rather than sharing one list object
        # across every invocation.
        if expected_states is None:
            expected_states = ['NEW', 'BUILD']

        if self.is_using_existing_instance:
            raise SkipTest("Using an existing instance.")

        name = self.instance_info.name + '_build'
        flavor = self.get_instance_flavor()

        inst = self.auth_client.instances.create(
            name,
            self.get_flavor_href(flavor),
            self.instance_info.volume,
            nics=self.instance_info.nics,
            datastore=self.instance_info.dbaas_datastore,
            datastore_version=self.instance_info.dbaas_datastore_version)
        self.assert_instance_action([inst.id], expected_states,
                                    expected_http_code)
        self.build_inst_id = inst.id

    def run_delete_build_instance(self, expected_http_code=202):
        """Force-delete the recorded instance and check the client code.

        Does nothing when no instance id was recorded.
        """
        if self.build_inst_id:
            self.auth_client.instances.force_delete(self.build_inst_id)
            self.assert_client_code(expected_http_code)

    def run_wait_for_force_delete(self):
        """Assert the recorded instance is gone, if one was recorded."""
        if self.build_inst_id:
            self.assert_all_gone([self.build_inst_id], ['SHUTDOWN'])

View File

@ -139,7 +139,7 @@ class RunnerFactory(object):
# such as a missing override class. Anything else
# shouldn't be suppressed.
l_msg = ie.message.lower()
if load_type not in l_msg or (
if (load_type and load_type not in l_msg) or (
'no module named' not in l_msg and
'cannot be found' not in l_msg):
raise