deb-heat/heat/engine/stack_lock.py
Ethan Lynn 6bc753582b Set stack status to FAILED when engine is down
When a stack is in status IN_PROGRESS and the engine service goes down,
the status of the stack will remain IN_PROGRESS forever. This patch
adds a db api to get engine_id from stacklock and tries to reset the
stack status to FAILED when the engine is back.

Closes-Bug: #1382320
Change-Id: Ica856bb0d56c23a4423fb9476c1986aaacf24108
2015-04-02 11:54:05 +08:00

166 lines
6.4 KiB
Python

#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import contextlib
import uuid
from oslo_config import cfg
from oslo_log import log as logging
import oslo_messaging as messaging
from oslo_utils import excutils
from heat.common import exception
from heat.common.i18n import _LI
from heat.common.i18n import _LW
from heat.common import messaging as rpc_messaging
from heat.objects import stack_lock as stack_lock_object
from heat.rpc import api as rpc_api
# Make the 'engine_life_check_timeout' option, declared in
# heat.common.config, available on cfg.CONF; StackLock.engine_alive uses it
# as the RPC timeout when checking whether another engine is still alive.
cfg.CONF.import_opt('engine_life_check_timeout', 'heat.common.config')
# Module-level logger named after this module.
LOG = logging.getLogger(__name__)
class StackLock(object):
    """Lock on a single stack, held on behalf of one engine.

    The lock itself is kept by ``stack_lock_object.StackLock``; this class
    wraps its create/steal/release operations and adds stale-lock detection
    by asking the owning engine whether it is still listening.
    """

    def __init__(self, context, stack, engine_id):
        """Remember the request context, the stack and the acting engine.

        :param context: request context passed along on RPC calls.
        :param stack: stack to lock; its ``id``, ``name`` and ``action``
                      attributes are read by this class.
        :param engine_id: identifier of the engine taking the lock.
        """
        self.context = context
        self.stack = stack
        self.engine_id = engine_id
        # Not used by any method of this class.
        self.listener = None

    @staticmethod
    def engine_alive(context, engine_id):
        """Ask the engine ``engine_id`` whether it is listening.

        Performs a 'listening' RPC call addressed to that engine on the
        listener topic, waiting at most ``engine_life_check_timeout``.

        :returns: the result of the RPC call, or False when the call times
                  out.
        """
        client = rpc_messaging.get_rpc_client(
            version='1.0', topic=rpc_api.LISTENER_TOPIC,
            server=engine_id)
        client_context = client.prepare(
            timeout=cfg.CONF.engine_life_check_timeout)
        try:
            return client_context.call(context, 'listening')
        except messaging.MessagingTimeout:
            return False

    @staticmethod
    def generate_engine_id():
        """Return a new random (UUID4) engine identifier as a string."""
        return str(uuid.uuid4())

    def get_engine_id(self):
        """Return the engine id recorded on this stack's lock."""
        return stack_lock_object.StackLock.get_engine_id(self.stack.id)

    def try_acquire(self):
        """Try to acquire the stack lock without raising or stealing.

        Unlike :meth:`acquire`, this never raises ActionInProgress and never
        tries to steal the lock.

        :returns: the result of the lock creation; callers in this class
                  treat ``None`` as "lock acquired" and any other value as
                  the id of the engine currently holding the lock.
        """
        return stack_lock_object.StackLock.create(self.stack.id,
                                                  self.engine_id)

    def acquire(self, retry=True):
        """Acquire a lock on the stack, stealing it if its owner is dead.

        :param retry: When True, retry if lock was released while stealing.
        :type retry: boolean
        :raises ActionInProgress: when the lock is held by this engine or by
            another live engine, when another engine stole the stale lock
            first, or when the single retry did not succeed either.
        """
        lock_engine_id = stack_lock_object.StackLock.create(self.stack.id,
                                                            self.engine_id)
        if lock_engine_id is None:
            LOG.debug("Engine %(engine)s acquired lock on stack "
                      "%(stack)s", {'engine': self.engine_id,
                                    'stack': self.stack.id})
            return

        if (lock_engine_id == self.engine_id or
                self.engine_alive(self.context, lock_engine_id)):
            LOG.debug("Lock on stack %(stack)s is owned by engine "
                      "%(engine)s", {'stack': self.stack.id,
                                     'engine': lock_engine_id})
            raise exception.ActionInProgress(stack_name=self.stack.name,
                                             action=self.stack.action)
        else:
            # The owning engine did not answer: treat the lock as stale.
            LOG.info(_LI("Stale lock detected on stack %(stack)s.  Engine "
                         "%(engine)s will attempt to steal the lock"),
                     {'stack': self.stack.id, 'engine': self.engine_id})

            result = stack_lock_object.StackLock.steal(self.stack.id,
                                                       lock_engine_id,
                                                       self.engine_id)

            if result is None:
                LOG.info(_LI("Engine %(engine)s successfully stole the lock "
                             "on stack %(stack)s"),
                         {'engine': self.engine_id,
                          'stack': self.stack.id})
                return
            elif result is True:
                # The lock vanished between the liveness check and the
                # steal; try a plain acquire once more, but only once.
                if retry:
                    LOG.info(_LI("The lock on stack %(stack)s was released "
                                 "while engine %(engine)s was stealing it. "
                                 "Trying again"), {'stack': self.stack.id,
                                                   'engine': self.engine_id})
                    return self.acquire(retry=False)
            else:
                # Any other value is the id of the engine that got there
                # first.
                new_lock_engine_id = result
                LOG.info(_LI("Failed to steal lock on stack %(stack)s. "
                             "Engine %(engine)s stole the lock first"),
                         {'stack': self.stack.id,
                          'engine': new_lock_engine_id})

            # Reached when the steal lost the race or no retry is left.
            raise exception.ActionInProgress(
                stack_name=self.stack.name, action=self.stack.action)

    def release(self, stack_id):
        """Release a stack lock.

        :param stack_id: id of the stack whose lock is released.
        """
        # Only the engine that owns the lock will be releasing it.
        result = stack_lock_object.StackLock.release(stack_id,
                                                     self.engine_id)
        if result is True:
            LOG.warning(_LW("Lock was already released on stack %s!"),
                        stack_id)
        else:
            LOG.debug("Engine %(engine)s released lock on stack "
                      "%(stack)s", {'engine': self.engine_id,
                                    'stack': stack_id})

    @contextlib.contextmanager
    def thread_lock(self, stack_id):
        """Acquire a lock and release it only if there is an exception.

        The release method still needs to be scheduled to be run at the
        end of the thread using the Thread.link method.

        :param stack_id: id of the stack whose lock is released on error.
        :raises ActionInProgress: when the lock cannot be acquired.
        """
        # Acquire outside the guarded region: if acquisition itself fails,
        # this call holds no lock and must not release one.
        self.acquire()
        try:
            yield
        except exception.ActionInProgress:
            raise
        except:  # noqa
            with excutils.save_and_reraise_exception():
                self.release(stack_id)

    @contextlib.contextmanager
    def try_thread_lock(self, stack_id):
        """Like thread_lock, but acquire the lock using try_acquire.

        The lock is released only when the managed body raises after a
        successful acquisition. The value yielded is the result of
        try_acquire (``None`` when the lock was acquired).

        :param stack_id: id of the stack whose lock is released on error.
        """
        # Acquire outside the guarded region so that a failure of
        # try_acquire() is not mistaken for "acquired" (result is None).
        result = self.try_acquire()
        try:
            yield result
        except:  # noqa
            if result is None:  # Lock was successfully acquired
                with excutils.save_and_reraise_exception():
                    self.release(stack_id)
            raise