Graceful stop for task-based deployment

Introduce a new node status, stopped, which is used for nodes
on which the orchestrator has successfully and gracefully stopped
a task deployment. Nodes in this status can be redeployed.

Also introduce a new node error_type: stop_deployment.
It is used when a graceful stop fails on a node, for example
because of a timeout. It is applied automatically by the
receiver to nodes in the deploying status if the orchestrator
returns an error for the stop deployment task.

Change-Id: I30d606d7d11d670d1a68ee90b01b932c1543fccc
Implements: blueprint graceful-stop-restart-deployment
This commit is contained in:
Vladimir Sharshov (warpc) 2016-02-20 14:19:08 +03:00
parent b0cba9a677
commit a7cab8077d
6 changed files with 146 additions and 9 deletions

View File

@ -126,6 +126,7 @@ NODE_STATUSES = Enum(
'deploying',
'error',
'removing',
'stopped',
)
NODE_ERRORS = Enum(
@ -133,6 +134,7 @@ NODE_ERRORS = Enum(
'provision',
'deletion',
'discover',
'stop_deployment'
)
NODE_GROUPS = Enum(

View File

@ -42,7 +42,7 @@ cluster_statuses_old = (
'error',
'remove',
'update',
'update_error'
'update_error',
)
cluster_statuses_new = (
'new',
@ -53,6 +53,38 @@ cluster_statuses_new = (
'remove',
'partially_deployed'
)
node_statuses_old = (
'ready',
'discover',
'provisioning',
'provisioned',
'deploying',
'error',
'removing',
)
node_statuses_new = (
'ready',
'discover',
'provisioning',
'provisioned',
'deploying',
'error',
'removing',
'stopped',
)
node_errors_old = (
'deploy',
'provision',
'deletion',
'discover',
)
node_errors_new = (
'deploy',
'provision',
'deletion',
'discover',
'stop_deployment',
)
def upgrade():
@ -64,9 +96,13 @@ def upgrade():
upgrade_node_attributes()
upgrade_remove_wizard_metadata_from_releases()
drop_legacy_patching()
upgrade_node_status_attributes()
upgrade_node_stop_deployment_error_type()
def downgrade():
downgrade_node_stop_deployment_error_type()
downgrade_node_status_attributes()
restore_legacy_patching()
downgrade_remove_wizard_metadata_from_releases()
downgrade_node_attributes()
@ -752,3 +788,43 @@ def restore_legacy_patching():
cluster_statuses_new, # new options
cluster_statuses_old, # old options
)
def upgrade_node_status_attributes():
    """Move the ``node_status`` ENUM of ``nodes.status`` to the new options.

    ``node_statuses_old`` is passed as the current option set and
    ``node_statuses_new`` (which additionally contains ``'stopped'``)
    as the target option set.
    """
    table, column, enum_name = "nodes", "status", "node_status"
    upgrade_enum(
        table, column, enum_name,
        node_statuses_old,  # options before the migration
        node_statuses_new   # options after the migration
    )
def downgrade_node_status_attributes():
    """Revert the ``node_status`` ENUM of ``nodes.status`` to the old options.

    Mirror of the upgrade step: ``node_statuses_new`` is passed as the
    current option set and ``node_statuses_old`` as the target one.

    NOTE(review): rows that still hold ``'stopped'`` presumably have to be
    converted before that option disappears -- confirm how ``upgrade_enum``
    treats values missing from the target set.
    """
    table, column, enum_name = "nodes", "status", "node_status"
    upgrade_enum(
        table, column, enum_name,
        node_statuses_new,  # options before the downgrade
        node_statuses_old   # options after the downgrade
    )
def upgrade_node_stop_deployment_error_type():
    """Move the ``node_error_type`` ENUM of ``nodes.error_type`` forward.

    The target option set, ``node_errors_new``, is ``node_errors_old``
    plus ``'stop_deployment'``.
    """
    upgrade_enum(
        "nodes", "error_type", "node_error_type",
        node_errors_old,  # options before the migration
        node_errors_new   # options after the migration
    )
def downgrade_node_stop_deployment_error_type():
    """Revert the ``node_error_type`` ENUM of ``nodes.error_type``.

    Mirror of the upgrade step: ``node_errors_new`` is passed as the
    current option set and ``node_errors_old`` as the target one.

    NOTE(review): rows that still hold ``'stop_deployment'`` presumably
    have to be converted before that option disappears -- confirm how
    ``upgrade_enum`` treats values missing from the target set.
    """
    upgrade_enum(
        "nodes", "error_type", "node_error_type",
        node_errors_new,  # options before the downgrade
        node_errors_old   # options after the downgrade
    )

View File

@ -163,8 +163,11 @@ class Node(Base):
@property
def needs_redeploy(self):
return (
self.status in ['error', 'provisioned'] or
len(self.pending_roles)) and not self.pending_deletion
self.status in [
consts.NODE_STATUSES.error,
consts.NODE_STATUSES.provisioned,
consts.NODE_STATUSES.stopped
] or len(self.pending_roles)) and not self.pending_deletion
@property
def needs_redeletion(self):

View File

@ -649,7 +649,7 @@ class NailgunReceiver(object):
task.cluster.status = consts.CLUSTER_STATUSES.stopped
if stop_tasks:
map(db().delete, stop_tasks)
objects.Task.bulk_delete(x.id for x in stop_tasks)
node_uids = [n['uid'] for n in itertools.chain(nodes, ia_nodes)]
q_nodes = objects.NodeCollection.filter_by_id_list(None, node_uids)
@ -658,7 +658,6 @@ class NailgunReceiver(object):
cluster_id=task.cluster_id
)
q_nodes = objects.NodeCollection.order_by(q_nodes, 'id')
q_nodes = objects.NodeCollection.lock_for_update(q_nodes)
# locking Nodes for update
update_nodes = objects.NodeCollection.lock_for_update(
@ -677,8 +676,6 @@ class NailgunReceiver(object):
message = (
u"Deployment of environment '{0}' was successfully stopped. "
u"Please make changes and reset the environment "
u"if you want to redeploy it."
.format(task.cluster.name or task.cluster_id)
)
@ -687,6 +684,43 @@ class NailgunReceiver(object):
message,
task.cluster_id
)
elif status == consts.TASK_STATUSES.error:
task.cluster.status = consts.CLUSTER_STATUSES.error
if stop_tasks:
objects.Task.bulk_delete(x.id for x in stop_tasks)
q_nodes = objects.NodeCollection.filter_by(
None,
cluster_id=task.cluster_id
)
q_nodes = objects.NodeCollection.filter_by(
q_nodes,
status=consts.NODE_STATUSES.deploying
)
q_nodes = objects.NodeCollection.order_by(q_nodes, 'id')
update_nodes = objects.NodeCollection.lock_for_update(
q_nodes
).all()
for node_db in update_nodes:
node_db.status = consts.NODE_STATUSES.error
node_db.progress = 100
node_db.error_type = consts.NODE_ERRORS.stop_deployment
db().flush()
message = (
u"Deployment of environment '{0}' was failed to stop: {1}. "
u"Please check logs for details."
.format(task.cluster.name or task.cluster_id, message)
)
notifier.notify(
"error",
message,
task.cluster_id
)
data = {'status': status, 'progress': progress, 'message': message}
objects.Task.update(task, data)

View File

@ -75,8 +75,7 @@ class TestStopDeployment(BaseIntegrationTest):
self.assertRegexpMatches(
notification.message,
'Please make changes and reset the environment '
'if you want to redeploy it.')
'was successfully stopped')
# FIXME(aroma): remove when stop action will be reworked for ha
# cluster. To get more details, please, refer to [1]

View File

@ -119,6 +119,29 @@ class TestTaskHelpers(BaseTestCase):
computes = self.filter_by_role(nodes, 'compute')
self.assertEqual(len(computes), 2)
def test_redeploy_with_stopped_nodes(self):
    """All six stopped or errored nodes are returned for deployment."""
    node_specs = [
        {'roles': ['controller'], 'status': 'error'},
        {'roles': ['controller'], 'status': 'stopped'},
        {'roles': ['controller'], 'status': 'stopped'},
        {'roles': ['compute', 'cinder'], 'status': 'stopped'},
        {'roles': ['compute'], 'status': 'error',
         'error_type': 'stop_deployment'},
        {'roles': ['cinder'], 'status': 'error',
         'error_type': 'deploy'},
    ]
    cluster = self.create_env(node_specs)

    to_deploy = TaskHelper.nodes_to_deploy(cluster)
    self.assertEqual(len(to_deploy), 6)

    # Expected node count per role; the compute+cinder node is counted
    # under both of its roles.
    expected_counts = (('controller', 3), ('cinder', 2), ('compute', 2))
    for role, expected in expected_counts:
        matched = self.filter_by_role(to_deploy, role)
        self.assertEqual(len(matched), expected)
# TODO(aroma): move it to utils testing code
def test_recalculate_deployment_task_progress(self):
cluster = self.create_env([