# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

"""Health Manager.
|
|
|
|
Health Manager is responsible for monitoring the health of the clusters and
|
|
trigger corresponding actions to recover the clusters based on the pre-defined
|
|
health policies.
|
|
"""
|
|
|
|
import time

from oslo_config import cfg
from oslo_log import log as logging
import oslo_messaging as messaging
from oslo_service import service
from oslo_service import threadgroup
from oslo_utils import timeutils
import six

from senlin.common import consts
from senlin.common import context
from senlin.common import messaging as rpc
from senlin import objects
from senlin.rpc import client as rpc_client

LOG = logging.getLogger(__name__)


def _chase_up(start_time, interval):
    """Utility function to check if there are missed intervals.

    :param start_time: A time object representing the starting time.
    :param interval: An integer specifying the time interval in seconds.
    :returns: Number of seconds to sleep before the next round.
    """
    end_time = timeutils.utcnow(True)
    elapsed = timeutils.delta_seconds(start_time, end_time)
    # Check how many intervals have been missed.
    missed = int((elapsed - 0.0000001) / interval)
    if missed >= 1:
        LOG.warning("Poller missed %s intervals for checking", missed)
    return (missed + 1) * interval - elapsed
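
# Illustrative arithmetic for _chase_up, with hypothetical numbers: for a
# 60-second interval, a round that took 130 seconds has missed two intervals,
# so the function returns (2 + 1) * 60 - 130 = 50, i.e. the caller sleeps 50
# seconds and the next round realigns with the original schedule.
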
class NovaNotificationEndpoint(object):
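    """Endpoint for handling Nova compute notifications for a cluster.

    Failure events listed in VM_FAILURE_EVENTS that match the configured
    project and cluster are translated into 'node_recover' RPC requests.
    """
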
    VM_FAILURE_EVENTS = {
        'compute.instance.pause.end': 'PAUSE',
        'compute.instance.power_off.end': 'POWER_OFF',
        'compute.instance.rebuild.error': 'REBUILD',
        'compute.instance.shutdown.end': 'SHUTDOWN',
        'compute.instance.soft_delete.end': 'SOFT_DELETE',
    }

    def __init__(self, project_id, cluster_id, recover_action):
        self.filter_rule = messaging.NotificationFilter(
            publisher_id='^compute.*',
            event_type=r'^compute\.instance\..*',
            context={'project_id': '^%s$' % project_id})
        self.project_id = project_id
        self.cluster_id = cluster_id
        self.rpc = rpc_client.EngineClient()
        self.recover_action = recover_action

    def info(self, ctxt, publisher_id, event_type, payload, metadata):
        meta = payload['metadata']
        if meta.get('cluster_id') == self.cluster_id:
            if event_type not in self.VM_FAILURE_EVENTS:
                return
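            # Illustrative content of the parameters assembled below: a
            # 'compute.instance.power_off.end' notification maps to
            # 'event': 'POWER_OFF', and 'operation' carries the recover
            # action name from the health policy.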
            params = {
                'event': self.VM_FAILURE_EVENTS[event_type],
                'state': payload.get('state', 'Unknown'),
                'instance_id': payload.get('instance_id', 'Unknown'),
                'timestamp': metadata['timestamp'],
                'publisher': publisher_id,
                'operation': self.recover_action['operation'],
            }
            node_id = meta.get('cluster_node_id')
            if node_id:
                LOG.info("Requesting node recovery: %s", node_id)
                ctx = context.get_service_context(project_id=self.project_id,
                                                  user_id=payload['user_id'])
                req = objects.NodeRecoverRequest(identity=node_id,
                                                 params=params)
                self.rpc.call(ctx, 'node_recover', req)

    def warn(self, ctxt, publisher_id, event_type, payload, metadata):
        meta = payload.get('metadata', {})
        if meta.get('cluster_id') == self.cluster_id:
            LOG.warning("publisher=%s", publisher_id)
            LOG.warning("event_type=%s", event_type)

    def debug(self, ctxt, publisher_id, event_type, payload, metadata):
        meta = payload.get('metadata', {})
        if meta.get('cluster_id') == self.cluster_id:
            LOG.debug("publisher=%s", publisher_id)
            LOG.debug("event_type=%s", event_type)

class HeatNotificationEndpoint(object):
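    """Endpoint for handling Heat orchestration notifications for a cluster.

    Stack deletion events whose tags identify a node of the configured
    cluster are translated into 'node_recover' RPC requests.
    """
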
    STACK_FAILURE_EVENTS = {
        'orchestration.stack.delete.end': 'DELETE',
    }

    def __init__(self, project_id, cluster_id, recover_action):
        self.filter_rule = messaging.NotificationFilter(
            publisher_id='^orchestration.*',
            event_type=r'^orchestration\.stack\..*',
            context={'project_id': '^%s$' % project_id})
        self.project_id = project_id
        self.cluster_id = cluster_id
        self.rpc = rpc_client.EngineClient()
        self.recover_action = recover_action

    def info(self, ctxt, publisher_id, event_type, payload, metadata):
        if event_type not in self.STACK_FAILURE_EVENTS:
            return

        tags = payload['tags']
        if tags is None or tags == []:
            return
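
        # The tag parsing below assumes Heat stack tags of the form
        # 'cluster_id=<cluster UUID>' and 'cluster_node_id=<node UUID>',
        # where everything after the '=' is compared or extracted directly.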
        cluster_id = None
        node_id = None
        for tag in tags:
            if cluster_id is None:
                start = tag.find('cluster_id')
                if start == 0 and tag[11:] == self.cluster_id:
                    cluster_id = tag[11:]
            if node_id is None:
                start = tag.find('cluster_node_id')
                if start == 0:
                    node_id = tag[16:]

        if cluster_id is None or node_id is None:
            return

        params = {
            'event': self.STACK_FAILURE_EVENTS[event_type],
            'state': payload.get('state', 'Unknown'),
            'stack_id': payload.get('stack_identity', 'Unknown'),
            'timestamp': metadata['timestamp'],
            'publisher': publisher_id,
            'operation': self.recover_action['operation'],
        }
        LOG.info("Requesting stack recovery: %s", node_id)
        ctx = context.get_service_context(project_id=self.project_id,
                                          user_id=payload['user_identity'])
        req = objects.NodeRecoverRequest(identity=node_id, params=params)
        self.rpc.call(ctx, 'node_recover', req)

def ListenerProc(exchange, project_id, cluster_id, recover_action):
    """Thread procedure for running an event listener.

    :param exchange: The control exchange for a target service.
    :param project_id: The ID of the project to filter.
    :param cluster_id: The ID of the cluster to filter.
    :param recover_action: The health policy action name.
    """
    transport = messaging.get_notification_transport(cfg.CONF)

    if exchange == cfg.CONF.health_manager.nova_control_exchange:
        targets = [
            messaging.Target(topic='versioned_notifications',
                             exchange=exchange),
        ]
        endpoints = [
            NovaNotificationEndpoint(project_id, cluster_id, recover_action),
        ]
    else:  # heat notification
        targets = [
            messaging.Target(topic='notifications', exchange=exchange),
        ]
        endpoints = [
            HeatNotificationEndpoint(project_id, cluster_id, recover_action),
        ]

    listener = messaging.get_notification_listener(
        transport, targets, endpoints, executor='threading',
        pool="senlin-listeners")

    listener.start()
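
# ListenerProc is intended to run inside a thread spawned by the health
# manager; HealthManager._add_listener() below starts it with
#   self.TG.add_thread(ListenerProc, exchange, project, cluster_id,
#                      recover_action)
# so that each monitored cluster gets its own notification listener.
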
class HealthManager(service.Service):
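    """RPC service that monitors the health of registered clusters.

    Depending on the registry entry, it either polls cluster status
    periodically or listens for service notifications, and it triggers node
    recovery when failures are detected.
    """
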
    def __init__(self, engine_service, topic, version):
        super(HealthManager, self).__init__()

        self.TG = threadgroup.ThreadGroup()
        self.engine_id = engine_service.engine_id
        self.topic = topic
        self.version = version
        self.ctx = context.get_admin_context()
        self.rpc_client = rpc_client.EngineClient()
        self.rt = {
            'registries': [],
        }

    def _dummy_task(self):
        """A dummy task that is queued on the health manager thread group.

        The task is here so that the service always has something to wait()
        on, or else the process will exit.
        """
        self._load_runtime_registry()

    def _wait_for_action(self, ctx, action_id, timeout):
        done = False
        req = objects.ActionGetRequest(identity=action_id)
        with timeutils.StopWatch(timeout) as timeout_watch:
            while timeout > 0:
                action = self.rpc_client.call(ctx, 'action_get', req)
                if action['status'] in [consts.ACTION_SUCCEEDED,
                                        consts.ACTION_FAILED,
                                        consts.ACTION_CANCELLED]:
                    if action['status'] == consts.ACTION_SUCCEEDED:
                        done = True
                    break
                time.sleep(2)
                timeout = timeout_watch.leftover(True)

        if done:
            return True, ""
        elif timeout <= 0:
            return False, "Timeout while polling cluster status"
        else:
            return False, "Cluster check action failed or cancelled"

    def _poll_cluster(self, cluster_id, timeout, recover_action):
        """Routine to be executed for polling cluster status.

        :param cluster_id: The UUID of the cluster to be checked.
        :param timeout: The maximum number of seconds to wait.
        :param recover_action: The health policy action name.
        :returns: Number of seconds to sleep before the next polling round.
        """
        start_time = timeutils.utcnow(True)
        cluster = objects.Cluster.get(self.ctx, cluster_id,
                                      project_safe=False)
        if not cluster:
            LOG.warning("Cluster (%s) is not found.", cluster_id)
            return _chase_up(start_time, timeout)

        ctx = context.get_service_context(user_id=cluster.user,
                                          project_id=cluster.project)
        params = {'delete_check_action': True}
        try:
            req = objects.ClusterCheckRequest(identity=cluster_id,
                                              params=params)
            action = self.rpc_client.call(ctx, 'cluster_check', req)
        except Exception as ex:
            LOG.warning("Failed in triggering 'cluster_check' RPC for "
                        "'%(c)s': %(r)s",
                        {'c': cluster_id, 'r': six.text_type(ex)})
            return _chase_up(start_time, timeout)

        # wait for action to complete
        res, reason = self._wait_for_action(ctx, action['action'], timeout)
        if not res:
            LOG.warning("%s", reason)
            return _chase_up(start_time, timeout)

        # loop through nodes to trigger recovery
        nodes = objects.Node.get_all_by_cluster(ctx, cluster_id)
        for node in nodes:
            if node.status != consts.NS_ACTIVE:
                LOG.info("Requesting node recovery: %s", node.id)
                req = objects.NodeRecoverRequest(identity=node.id,
                                                 params=recover_action)
                self.rpc_client.call(ctx, 'node_recover', req)

        return _chase_up(start_time, timeout)

    def _add_listener(self, cluster_id, recover_action):
        """Routine to be executed for adding a cluster listener.

        :param cluster_id: The UUID of the cluster to be filtered.
        :param recover_action: The health policy action name.
        :returns: The listener thread that was added, or None if the cluster
                  is not found or its profile type is not supported.
        """
        cluster = objects.Cluster.get(self.ctx, cluster_id,
                                      project_safe=False)
        if not cluster:
            LOG.warning("Cluster (%s) is not found.", cluster_id)
            return
        profile = objects.Profile.get(self.ctx, cluster.profile_id,
                                      project_safe=False)
        profile_type = profile.type.split('-')[0]
        if profile_type == 'os.nova.server':
            exchange = cfg.CONF.health_manager.nova_control_exchange
        elif profile_type == 'os.heat.stack':
            exchange = cfg.CONF.health_manager.heat_control_exchange
        else:
            return None

        project = cluster.project
        return self.TG.add_thread(ListenerProc, exchange, project, cluster_id,
                                  recover_action)

    def _start_check(self, entry):
        """Routine for starting the health check for a cluster.

        :param entry: A dict containing the data associated with the cluster.
        :returns: An updated registry entry record, or None if the check
                  could not be started.
        """
        cid = entry['cluster_id']
        ctype = entry['check_type']
        # Get the recover action parameter from the entry params
        params = entry['params']
        recover_action = {}
        if 'recover_action' in params:
            rac = params['recover_action']
            for operation in rac:
                recover_action['operation'] = operation.get('name')
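
        # Illustrative shape of the policy data handled above (hypothetical
        # values): params['recover_action'] = [{'name': 'RECREATE'}] results
        # in recover_action = {'operation': 'RECREATE'}.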

        if ctype == consts.NODE_STATUS_POLLING:
            interval = min(entry['interval'], cfg.CONF.check_interval_max)
            timer = self.TG.add_dynamic_timer(self._poll_cluster,
                                              None,  # initial_delay
                                              None,  # periodic_interval_max
                                              cid, interval, recover_action)
            entry['timer'] = timer
        elif ctype == consts.LIFECYCLE_EVENTS:
            LOG.info("Start listening events for cluster (%s).", cid)
            listener = self._add_listener(cid, recover_action)
            if listener:
                entry['listener'] = listener
            else:
                LOG.warning("Error creating listener for cluster %s", cid)
                return None
        else:
            LOG.warning("Cluster %(id)s check type %(type)s is invalid.",
                        {'id': cid, 'type': ctype})
            return None

        return entry

    def _stop_check(self, entry):
        """Routine for stopping the health check for a cluster.

        :param entry: A dict containing the data associated with the cluster.
        :returns: ``None``.
        """
        timer = entry.get('timer', None)
        if timer:
            timer.stop()
            self.TG.timer_done(timer)
            return

        listener = entry.get('listener', None)
        if listener:
            self.TG.thread_done(listener)
            listener.stop()
            return

    def _load_runtime_registry(self):
        """Load the initial runtime registry with a DB scan."""
        db_registries = objects.HealthRegistry.claim(self.ctx, self.engine_id)

        for r in db_registries:
            # Claiming a health registry whose engine is dead updates its
            # engine_id to the current engine. Note that the check is only
            # (re)started for entries that are enabled.
            entry = {
                'cluster_id': r.cluster_id,
                'check_type': r.check_type,
                'interval': r.interval,
                'params': r.params,
                'enabled': r.enabled,
            }

            LOG.info("Loading cluster %(c)s enabled=%(e)s for "
                     "health monitoring",
                     {'c': r.cluster_id, 'e': r.enabled})

            if r.enabled:
                entry = self._start_check(entry)
            if entry:
                self.rt['registries'].append(entry)

    def start(self):
        """Start the health manager RPC server.

        Note that the health manager server uses a JSON serializer for
        parameter passing, so be careful when changing this interface.
        """
        super(HealthManager, self).start()
        self.target = messaging.Target(server=self.engine_id,
                                       topic=self.topic,
                                       version=self.version)
        server = rpc.get_rpc_server(self.target, self)
        server.start()
        self.TG.add_timer(cfg.CONF.periodic_interval, self._dummy_task)

    def stop(self):
        self.TG.stop_timers()
        super(HealthManager, self).stop()

    @property
    def registries(self):
        return self.rt['registries']

    def listening(self, ctx):
        """Respond to confirm that the RPC service is still alive."""
        return True

    def register_cluster(self, ctx, cluster_id, check_type, interval=None,
                         params=None, enabled=True):
        """Register a cluster for health checking.

        :param ctx: The context of the notify request.
        :param cluster_id: The ID of the cluster to be checked.
        :param check_type: A string indicating the type of checks.
        :param interval: An optional integer indicating the length of the
                         checking periods in seconds.
        :param dict params: Other parameters for the health check.
        :return: None
        """
        params = params or {}

        registry = objects.HealthRegistry.create(ctx, cluster_id, check_type,
                                                 interval, params,
                                                 self.engine_id,
                                                 enabled=enabled)

        entry = {
            'cluster_id': registry.cluster_id,
            'check_type': registry.check_type,
            'interval': registry.interval,
            'params': registry.params,
            'enabled': registry.enabled
        }

        if registry.enabled:
            self._start_check(entry)

        self.rt['registries'].append(entry)

    def unregister_cluster(self, ctx, cluster_id):
        """Unregister a cluster from health checking.

        :param ctx: The context of the notify request.
        :param cluster_id: The ID of the cluster to be unregistered.
        :return: None
        """
        for i in range(len(self.rt['registries']) - 1, -1, -1):
            entry = self.rt['registries'][i]
            if entry.get('cluster_id') == cluster_id:
                self._stop_check(entry)
                self.rt['registries'].pop(i)
        objects.HealthRegistry.delete(ctx, cluster_id)

    def enable_cluster(self, ctx, cluster_id, params=None):
        for c in self.rt['registries']:
            if c['cluster_id'] == cluster_id and not c['enabled']:
                c['enabled'] = True
                objects.HealthRegistry.update(ctx, cluster_id,
                                              {'enabled': True})
                self._start_check(c)

    def disable_cluster(self, ctx, cluster_id, params=None):
        for c in self.rt['registries']:
            if c['cluster_id'] == cluster_id and c['enabled']:
                c['enabled'] = False
                objects.HealthRegistry.update(ctx, cluster_id,
                                              {'enabled': False})
                self._stop_check(c)


def notify(engine_id, method, **kwargs):
    """Send a notification to the health manager service.

    Note that the health manager only handles JSON type of parameter passing.

    :param engine_id: dispatcher to notify; broadcast if value is None
    :param method: remote method to call
    """
    timeout = cfg.CONF.engine_life_check_timeout
    client = rpc.get_rpc_client(consts.HEALTH_MANAGER_TOPIC, None)

    if engine_id:
        # Notify specific dispatcher identified by engine_id
        call_context = client.prepare(timeout=timeout, server=engine_id)
    else:
        # Broadcast to all dispatchers
        call_context = client.prepare(timeout=timeout)

    ctx = context.get_admin_context()

    try:
        call_context.call(ctx, method, **kwargs)
        return True
    except messaging.MessagingTimeout:
        return False


def register(cluster_id, engine_id=None, **kwargs):
    params = kwargs.pop('params', {})
    interval = kwargs.pop('interval', cfg.CONF.periodic_interval)
    check_type = kwargs.pop('check_type', consts.NODE_STATUS_POLLING)
    enabled = kwargs.pop('enabled', True)
    return notify(engine_id, 'register_cluster',
                  cluster_id=cluster_id,
                  interval=interval,
                  check_type=check_type,
                  params=params,
                  enabled=enabled)
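
# A minimal usage sketch with hypothetical values: a caller such as a health
# policy can register a cluster for periodic status polling with
#   register(cluster_id, engine_id=None,
#            check_type=consts.NODE_STATUS_POLLING,
#            interval=60, params={}, enabled=True)
# and, because engine_id is None, the request is broadcast to all health
# manager dispatchers.
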

def unregister(cluster_id):
    engine_id = get_manager_engine(cluster_id)
    if engine_id:
        return notify(engine_id, 'unregister_cluster', cluster_id=cluster_id)
    return True


def enable(cluster_id, **kwargs):
    engine_id = get_manager_engine(cluster_id)
    if engine_id:
        return notify(engine_id, 'enable_cluster', cluster_id=cluster_id,
                      params=kwargs)
    return False


def disable(cluster_id, **kwargs):
    engine_id = get_manager_engine(cluster_id)
    if engine_id:
        return notify(engine_id, 'disable_cluster', cluster_id=cluster_id,
                      params=kwargs)
    return False


def get_manager_engine(cluster_id):
    ctx = context.get_admin_context()

    registry = objects.HealthRegistry.get(ctx, cluster_id)
    if not registry:
        return None

    return registry.engine_id