Implement stack-locking for multi-engine support

Implements a distributed stack lock using the database to avoid race
conditions when multiple engines are deployed.

Blueprint multiple-engines

Change-Id: If3c47dafcc5bc1b2188b7737205291bbab8bc231
This commit is contained in:
Jason Dunsmore 2013-10-02 15:22:12 -05:00
parent 23fe65d714
commit d7ae961fca
7 changed files with 312 additions and 1 deletions

View File

@ -42,6 +42,10 @@
# unlimited events per stack. (integer value)
#max_events_per_stack=1000
# RPC timeout for the engine liveness check that is used for
# stack locking. (integer value)
#engine_life_check_timeout=2
# Name of the engine node. This can be an opaque identifier. It
# is not necessarily a hostname, FQDN, or IP address. (string
# value)

View File

@ -249,6 +249,16 @@ class HeatAPINotImplementedError(HeatAPIException):
err_type = "Server"
class HeatActionInProgressError(HeatAPIException):
    """API error: action cannot be performed on the stack in its current state.

    Carries code 400 and the title 'InvalidAction'.
    """
    title = 'InvalidAction'
    code = 400
    explanation = ("Cannot perform action on stack while other actions "
                   "are in progress")
def map_remote_error(ex):
"""
Map rpc_common.RemoteError exceptions returned by the engine
@ -273,6 +283,7 @@ def map_remote_error(ex):
)
# Remote exception class names, grouped by the API error they map to.
# Each group must be a real tuple so that `ex_type in <group>` is an exact
# name match.
denied_errors = ('Forbidden', 'NotAuthorized')
# The trailing comma matters: ('StackExists') is just the string
# 'StackExists', and `in` on a string is a substring test, so names such as
# 'Stack' would wrongly be treated as "already exists" errors.
already_exists_errors = ('StackExists',)
invalid_action_errors = ('ActionInProgress',)
ex_type = ex.__class__.__name__
@ -285,6 +296,8 @@ def map_remote_error(ex):
return HeatAccessDeniedError(detail=str(ex))
elif ex_type in already_exists_errors:
return AlreadyExistsError(detail=str(ex))
elif ex_type in invalid_action_errors:
return HeatActionInProgressError(detail=str(ex))
else:
# Map everything else to internal server error for now
return HeatInternalFailureError(detail=str(ex))

View File

@ -58,6 +58,7 @@ class FaultWrapper(wsgi.Middleware):
error_map = {
'AttributeError': webob.exc.HTTPBadRequest,
'ActionInProgress': webob.exc.HTTPConflict,
'ValueError': webob.exc.HTTPBadRequest,
'StackNotFound': webob.exc.HTTPNotFound,
'ResourceNotFound': webob.exc.HTTPNotFound,

View File

@ -106,7 +106,12 @@ engine_opts = [
default=1000,
help=_('Maximum events that will be available per stack. Older'
' events will be deleted when this is reached. Set to 0'
' for unlimited events per stack.'))]
' for unlimited events per stack.')),
cfg.IntOpt('engine_life_check_timeout',
default=2,
help=_('RPC timeout for the engine liveness check that is used'
' for stack locking.'))]
rpc_opts = [
cfg.StrOpt('host',
default=socket.gethostname(),

View File

@ -334,3 +334,8 @@ class RequestLimitExceeded(HeatException):
class StackResourceLimitExceeded(HeatException):
    """Error whose message reports the per-stack resource maximum exceeded."""
    msg_fmt = _("Maximum resources per stack exceeded.")
class ActionInProgress(HeatException):
    """Error reporting that a stack already has an action in progress.

    The message template is filled from the ``stack_name`` and ``action``
    keys.
    """
    msg_fmt = _("Stack %(stack_name)s already has an action "
                "(%(action)s) in progress.")

102
heat/engine/stack_lock.py Normal file
View File

@ -0,0 +1,102 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import uuid
from oslo.config import cfg
cfg.CONF.import_opt('engine_life_check_timeout', 'heat.common.config')
from heat.common import exception
from heat.db import api as db_api
from heat.openstack.common import log as logging
from heat.openstack.common.gettextutils import _
from heat.openstack.common.rpc import proxy
from heat.openstack.common.rpc.common import Timeout
logger = logging.getLogger(__name__)
engine_id = str(uuid.uuid4())
class StackLock(object):
    """Database-backed lock on a single stack, taken on behalf of this engine.

    The lock owner is recorded as the module-level ``engine_id``. A lock
    held by another engine is only taken over if that engine does not answer
    an RPC liveness probe.
    """

    def __init__(self, context, stack):
        """Remember the request context and the stack to be locked."""
        self.context = context
        self.stack = stack
        self.listener = None

    def _engine_alive(self, engine_id):
        """Probe the engine ``engine_id`` with a "listening" RPC call.

        The engine id doubles as the RPC topic. Returns the reply of the
        remote call, or False when the call times out.
        """
        probe = proxy.RpcProxy(engine_id, "1.0")
        request = probe.make_msg("listening")
        try:
            reply = probe.call(self.context, request, topic=engine_id,
                               timeout=cfg.CONF.engine_life_check_timeout)
        except Timeout:
            return False
        return reply

    def acquire(self, retry=True):
        """Acquire a lock on the stack.

        Returns None once this engine owns the lock. Raises
        exception.ActionInProgress when the lock is held by this engine or
        by another live engine, or when a stale lock could not be stolen.
        When the lock is released while being stolen, the whole acquisition
        is attempted once more if ``retry`` is true.
        """
        stack_id = self.stack.id

        # stack_lock_create yields None on success, otherwise the id of the
        # engine currently holding the lock.
        holder = db_api.stack_lock_create(stack_id, engine_id)
        if holder is None:
            logger.debug(_("Engine %(engine)s acquired lock on stack "
                           "%(stack)s") % {'engine': engine_id,
                                           'stack': stack_id})
            return

        if holder == engine_id or self._engine_alive(holder):
            logger.debug(_("Lock on stack %(stack)s is owned by engine "
                           "%(engine)s") % {'stack': stack_id,
                                            'engine': holder})
            raise exception.ActionInProgress(stack_name=self.stack.name,
                                             action=self.stack.action)

        # The holder did not answer the liveness probe: treat the lock as
        # stale and try to take it over.
        logger.info(_("Stale lock detected on stack %(stack)s. Engine "
                      "%(engine)s will attempt to steal the lock")
                    % {'stack': stack_id, 'engine': engine_id})
        outcome = db_api.stack_lock_steal(stack_id, holder, engine_id)

        if outcome is None:
            logger.info(_("Engine %(engine)s successfully stole the lock "
                          "on stack %(stack)s") % {'engine': engine_id,
                                                   'stack': stack_id})
            return

        if outcome is True:
            # The lock vanished while we were stealing it.
            if retry:
                logger.info(_("The lock on stack %(stack)s was released "
                              "while engine %(engine)s was stealing it. "
                              "Trying again") % {'stack': stack_id,
                                                 'engine': engine_id})
                return self.acquire(retry=False)
        else:
            # Any other value is the id of the engine that won the race.
            logger.info(_("Failed to steal lock on stack %(stack)s. "
                          "Engine %(engine)s stole the lock first")
                        % {'stack': stack_id, 'engine': outcome})

        # Reached both when the single retry is used up and when another
        # engine stole the lock first.
        raise exception.ActionInProgress(
            stack_name=self.stack.name, action=self.stack.action)

    def release(self):
        """Release a stack lock."""
        # Only the engine that owns the lock will be releasing it.
        already_released = db_api.stack_lock_release(self.stack.id, engine_id)
        if already_released is not True:
            logger.debug(_("Engine %(engine)s released lock on stack "
                           "%(stack)s") % {'engine': engine_id,
                                           'stack': self.stack.id})
            return
        logger.warning(_("Lock was already released on stack %s!")
                       % self.stack.id)

View File

@ -0,0 +1,181 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from heat.common import exception
from heat.db import api as db_api
from heat.engine import stack_lock
from heat.openstack.common.rpc import proxy
from heat.openstack.common.rpc.common import Timeout
from heat.tests.common import HeatTestCase
from heat.tests import utils
class StackLockTest(HeatTestCase):
    """Tests for StackLock.acquire driven by recorded DB and RPC calls."""

    def setUp(self):
        super(StackLockTest, self).setUp()
        utils.setup_dummy_db()
        self.context = utils.dummy_context()
        fake_stack = self.m.CreateMockAnything()
        fake_stack.id = "aae01f2d-52ae-47ac-8a0d-3fde3d220fea"
        fake_stack.name = "test_stack"
        fake_stack.action = "CREATE"
        self.stack = fake_stack

    # -- recording helpers (the relevant attribute must be stubbed first) --

    def _record_create(self, holder):
        """Expect a lock-create call that reports ``holder`` as the owner."""
        db_api.stack_lock_create(
            self.stack.id, stack_lock.engine_id).AndReturn(holder)

    def _record_liveness_call(self, holder):
        """Expect the "listening" RPC call to ``holder``; return the record.

        The caller chains AndReturn/AndRaise on the returned record.
        """
        rpc = proxy.RpcProxy(self.stack.id, "1.0")
        return rpc.call(self.context, rpc.make_msg("listening"), timeout=2,
                        topic=holder)

    def _record_steal(self, holder, outcome):
        """Expect a lock-steal call from ``holder`` yielding ``outcome``."""
        db_api.stack_lock_steal(
            self.stack.id, holder, stack_lock.engine_id).AndReturn(outcome)

    def _acquire_and_verify(self):
        """Acquire the lock (must not raise) and verify all expectations."""
        stack_lock.StackLock(self.context, self.stack).acquire()
        self.m.VerifyAll()

    def _acquire_fails_and_verify(self):
        """Check that acquire raises ActionInProgress, then verify mocks."""
        slock = stack_lock.StackLock(self.context, self.stack)
        self.assertRaises(exception.ActionInProgress, slock.acquire)
        self.m.VerifyAll()

    # -- test cases --

    def test_successful_acquire_new_lock(self):
        self.m.StubOutWithMock(db_api, "stack_lock_create")
        self._record_create(None)

        self.m.ReplayAll()
        self._acquire_and_verify()

    def test_failed_acquire_existing_lock_current_engine(self):
        self.m.StubOutWithMock(db_api, "stack_lock_create")
        self._record_create(stack_lock.engine_id)

        self.m.ReplayAll()
        self._acquire_fails_and_verify()

    def test_successful_acquire_existing_lock_engine_dead(self):
        self.m.StubOutWithMock(db_api, "stack_lock_create")
        self._record_create("fake-engine-id")

        self.m.StubOutWithMock(proxy.RpcProxy, "call")
        self._record_liveness_call("fake-engine-id").AndRaise(Timeout)

        self.m.StubOutWithMock(db_api, "stack_lock_steal")
        self._record_steal("fake-engine-id", None)

        self.m.ReplayAll()
        self._acquire_and_verify()

    def test_failed_acquire_existing_lock_engine_alive(self):
        self.m.StubOutWithMock(db_api, "stack_lock_create")
        self._record_create("fake-engine-id")

        self.m.StubOutWithMock(proxy.RpcProxy, "call")
        self._record_liveness_call("fake-engine-id").AndReturn(True)

        self.m.ReplayAll()
        self._acquire_fails_and_verify()

    def test_failed_acquire_existing_lock_engine_dead(self):
        self.m.StubOutWithMock(db_api, "stack_lock_create")
        self._record_create("fake-engine-id")

        self.m.StubOutWithMock(proxy.RpcProxy, "call")
        self._record_liveness_call("fake-engine-id").AndRaise(Timeout)

        # A third engine wins the race for the stale lock.
        self.m.StubOutWithMock(db_api, "stack_lock_steal")
        self._record_steal("fake-engine-id", "fake-engine-id2")

        self.m.ReplayAll()
        self._acquire_fails_and_verify()

    def test_successful_acquire_with_retry(self):
        # First pass: the steal reports the lock was released meanwhile.
        self.m.StubOutWithMock(db_api, "stack_lock_create")
        self._record_create("fake-engine-id")

        self.m.StubOutWithMock(proxy.RpcProxy, "call")
        self._record_liveness_call("fake-engine-id").AndRaise(Timeout)

        self.m.StubOutWithMock(db_api, "stack_lock_steal")
        self._record_steal("fake-engine-id", True)

        # Second pass (the retry): the steal succeeds.
        self._record_create("fake-engine-id")
        self._record_liveness_call("fake-engine-id").AndRaise(Timeout)
        self._record_steal("fake-engine-id", None)

        self.m.ReplayAll()
        self._acquire_and_verify()

    def test_failed_acquire_one_retry_only(self):
        # First pass: the steal reports the lock was released meanwhile.
        self.m.StubOutWithMock(db_api, "stack_lock_create")
        self._record_create("fake-engine-id")

        self.m.StubOutWithMock(proxy.RpcProxy, "call")
        self._record_liveness_call("fake-engine-id").AndRaise(Timeout)

        self.m.StubOutWithMock(db_api, "stack_lock_steal")
        self._record_steal("fake-engine-id", True)

        # Second pass: same outcome again; no further retry is allowed.
        self._record_create("fake-engine-id")
        self._record_liveness_call("fake-engine-id").AndRaise(Timeout)
        self._record_steal("fake-engine-id", True)

        self.m.ReplayAll()
        self._acquire_fails_and_verify()