Eliminate client race condition in convergence delete
Previously when doing a delete in convergence, we spawned a new thread to
start the delete. This was to ensure the request returned without waiting
for potentially slow operations like deleting snapshots and stopping
existing workers (which could have caused RPC timeouts).
The result, however, was that the stack was not guaranteed to be
DELETE_IN_PROGRESS by the time the request returned. In the case where a
previous delete had failed, a client request to show the stack issued soon
after the delete had returned would likely show the stack status as
DELETE_FAILED still. Only a careful examination of the updated_at timestamp
would reveal that this corresponded to the previous delete and not the one
just issued. In the case of a nested stack, this could leave the parent
stack effectively undeletable. (Since the updated_at time is not modified
on delete in the legacy path, we never checked it when deleting a nested
stack.)
To prevent this, change the order of operations so that the stack is first
put into the DELETE_IN_PROGRESS state before the delete_stack call returns.
Only after the state is stored, spawn a thread to complete the operation.
Since there is no stack lock in convergence, this gives us the flexibility
to cancel other in-progress workers after we've already written to the
Stack itself to start a new traversal.
The previous patch in the series means that snapshots are now also deleted
after the stack is marked as DELETE_IN_PROGRESS. This is consistent with
the legacy path.
Change-Id: Ib767ce8b39293c2279bf570d8399c49799cbaa70
Story: #1669608
Task: 23174
(cherry picked from commit e63778efc9
)
This commit is contained in:
parent
990acd8301
commit
a13476e787
|
@ -1350,14 +1350,19 @@ class EngineService(service.ServiceBase):
|
|||
self.resource_enforcer.enforce_stack(stack, is_registered_policy=True)
|
||||
|
||||
if stack.convergence and cfg.CONF.convergence_engine:
|
||||
def convergence_delete():
|
||||
stack.thread_group_mgr = self.thread_group_mgr
|
||||
self.worker_service.stop_all_workers(stack)
|
||||
template = templatem.Template.create_empty_template(
|
||||
from_template=stack.t)
|
||||
stack.converge_stack(template=template, action=stack.DELETE)
|
||||
stack.thread_group_mgr = self.thread_group_mgr
|
||||
template = templatem.Template.create_empty_template(
|
||||
from_template=stack.t)
|
||||
|
||||
self.thread_group_mgr.start(stack.id, convergence_delete)
|
||||
# stop existing traversal; mark stack as FAILED
|
||||
if stack.status == stack.IN_PROGRESS:
|
||||
self.worker_service.stop_traversal(stack)
|
||||
|
||||
def stop_workers():
|
||||
self.worker_service.stop_all_workers(stack)
|
||||
|
||||
stack.converge_stack(template=template, action=stack.DELETE,
|
||||
pre_converge=stop_workers)
|
||||
return
|
||||
|
||||
lock = stack_lock.StackLock(cnxt, stack.id, self.engine_id)
|
||||
|
|
|
@ -1293,7 +1293,8 @@ class Stack(collections.Mapping):
|
|||
updater()
|
||||
|
||||
@profiler.trace('Stack.converge_stack', hide_args=False)
|
||||
def converge_stack(self, template, action=UPDATE, new_stack=None):
|
||||
def converge_stack(self, template, action=UPDATE, new_stack=None,
|
||||
pre_converge=None):
|
||||
"""Update the stack template and trigger convergence for resources."""
|
||||
if action not in [self.CREATE, self.ADOPT]:
|
||||
# no back-up template for create action
|
||||
|
@ -1347,9 +1348,10 @@ class Stack(collections.Mapping):
|
|||
|
||||
# TODO(later): lifecycle_plugin_utils.do_pre_ops
|
||||
|
||||
self.thread_group_mgr.start(self.id, self._converge_create_or_update)
|
||||
self.thread_group_mgr.start(self.id, self._converge_create_or_update,
|
||||
pre_converge=pre_converge)
|
||||
|
||||
def _converge_create_or_update(self):
|
||||
def _converge_create_or_update(self, pre_converge=None):
|
||||
current_resources = self._update_or_store_resources()
|
||||
self._compute_convg_dependencies(self.ext_rsrcs_db, self.dependencies,
|
||||
current_resources)
|
||||
|
@ -1366,6 +1368,8 @@ class Stack(collections.Mapping):
|
|||
'action': self.action})
|
||||
return
|
||||
|
||||
if callable(pre_converge):
|
||||
pre_converge()
|
||||
if self.action == self.DELETE:
|
||||
try:
|
||||
self.delete_all_snapshots()
|
||||
|
|
|
@ -125,11 +125,11 @@ class WorkerService(object):
|
|||
_stop_traversal(child)
|
||||
|
||||
def stop_all_workers(self, stack):
|
||||
# stop the traversal
|
||||
if stack.status == stack.IN_PROGRESS:
|
||||
self.stop_traversal(stack)
|
||||
"""Cancel all existing worker threads for the stack.
|
||||
|
||||
# cancel existing workers
|
||||
Threads will stop running at their next yield point, whether or not the
|
||||
resource operations are complete.
|
||||
"""
|
||||
cancelled = _cancel_workers(stack, self.thread_group_mgr,
|
||||
self.engine_id, self._rpc_client)
|
||||
if not cancelled:
|
||||
|
|
|
@ -37,5 +37,8 @@ class Worker(message_processor.MessageProcessor):
|
|||
adopt_stack_data,
|
||||
converge)
|
||||
|
||||
def stop_traversal(self, current_stack):
|
||||
pass
|
||||
|
||||
def stop_all_workers(self, current_stack):
|
||||
pass
|
||||
|
|
|
@ -209,7 +209,7 @@ class WorkerServiceTest(common.HeatTestCase):
|
|||
stack.id = 'stack_id'
|
||||
stack.rollback = mock.MagicMock()
|
||||
_worker.stop_all_workers(stack)
|
||||
mock_st.assert_called_once_with(stack)
|
||||
mock_st.assert_not_called()
|
||||
mock_cw.assert_called_once_with(stack, mock_tgm, 'engine-001',
|
||||
_worker._rpc_client)
|
||||
self.assertFalse(stack.rollback.called)
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
---
|
||||
fixes:
|
||||
- |
|
||||
Previously, when deleting a convergence stack, the API call would return
|
||||
immediately, so that it was possible for a client immediately querying the
|
||||
status of the stack to see the state of the previous operation in progress
|
||||
or having failed, and confuse that with a current status. (This included
|
||||
Heat itself when acting as a client for a nested stack.) Convergence stacks
|
||||
are now guaranteed to have moved to the ``DELETE_IN_PROGRESS`` state before
|
||||
the delete API call returns, so any subsequent polling will reflect
|
||||
up-to-date information.
|
Loading…
Reference in New Issue