# Copyright (c) 2015 Intel Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import functools

from oslo_log import log
from oslo_log.versionutils import deprecated
from oslo_service import loopingcall
from oslo_service import periodic_task

from pycadf import cadftaxonomy as taxonomy

from magnum.common import context
from magnum.common import profiler
from magnum.common import rpc
from magnum.conductor import monitors
from magnum.conductor import utils as conductor_utils
import magnum.conf
from magnum.drivers.common import driver
from magnum import objects


CONF = magnum.conf.CONF
LOG = log.getLogger(__name__)

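# Replace the context supplied by the periodic task runner with an
# all-tenants admin context for the duration of the call, then clear the
# thread-local context afterwards.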
def set_context(func):
    @functools.wraps(func)
    def handler(self, ctx):
        ctx = context.make_admin_context(all_tenants=True)
        context.set_ctx(ctx)
        func(self, ctx)
        context.set_ctx(None)
    return handler


class ClusterUpdateJob(object):
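    """Syncs a single cluster's status with its orchestration driver."""

    # Map terminal cluster statuses to the CADF action reported in the
    # corresponding notification.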
    status_to_event = {
        objects.fields.ClusterStatus.DELETE_COMPLETE: taxonomy.ACTION_DELETE,
        objects.fields.ClusterStatus.CREATE_COMPLETE: taxonomy.ACTION_CREATE,
        objects.fields.ClusterStatus.UPDATE_COMPLETE: taxonomy.ACTION_UPDATE,
        objects.fields.ClusterStatus.ROLLBACK_COMPLETE:
            taxonomy.ACTION_UPDATE,
        objects.fields.ClusterStatus.CREATE_FAILED: taxonomy.ACTION_CREATE,
        objects.fields.ClusterStatus.DELETE_FAILED: taxonomy.ACTION_DELETE,
        objects.fields.ClusterStatus.UPDATE_FAILED: taxonomy.ACTION_UPDATE,
        objects.fields.ClusterStatus.ROLLBACK_FAILED: taxonomy.ACTION_UPDATE
    }

    def __init__(self, ctx, cluster):
        self.ctx = ctx
        self.cluster = cluster

    def update_status(self):
        LOG.debug("Updating status for cluster %s", self.cluster.id)
        # get the driver for the cluster
        cdriver = driver.Driver.get_driver_for_cluster(self.ctx, self.cluster)
        # ask the driver to sync status
        cdriver.update_cluster_status(self.ctx, self.cluster)
        LOG.debug("Status for cluster %s updated to %s (%s)",
                  self.cluster.id, self.cluster.status,
                  self.cluster.status_reason)
        # status update notifications
        if self.cluster.status.endswith("_COMPLETE"):
            conductor_utils.notify_about_cluster_operation(
                self.ctx, self.status_to_event[self.cluster.status],
                taxonomy.OUTCOME_SUCCESS, self.cluster)
        if self.cluster.status.endswith("_FAILED"):
            conductor_utils.notify_about_cluster_operation(
                self.ctx, self.status_to_event[self.cluster.status],
                taxonomy.OUTCOME_FAILURE, self.cluster)
        # if we're done with it, delete it
        if (self.cluster.status ==
                objects.fields.ClusterStatus.DELETE_COMPLETE):
            # delete all the nodegroups that belong to this cluster
            for ng in objects.NodeGroup.list(self.ctx, self.cluster.uuid):
                ng.destroy()
            self.cluster.destroy()
        # end the "loop"
        raise loopingcall.LoopingCallDone()


class ClusterHealthUpdateJob(object):
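    """Polls a cluster's health monitor and persists the result."""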
    def __init__(self, ctx, cluster):
        self.ctx = ctx
        self.cluster = cluster

    def _update_health_status(self):
        monitor = monitors.create_monitor(self.ctx, self.cluster)
        if monitor is None:
            return

        try:
            monitor.poll_health_status()
        except Exception as e:
            LOG.warning(
                "Skip pulling data from cluster %(cluster)s due to "
                "error: %(e)s",
                {'e': e, 'cluster': self.cluster.uuid}, exc_info=True)
            # TODO(flwang): Should we mark this cluster's health status as
            # UNKNOWN if Magnum failed to pull data from the cluster? Because
            # that basically means the k8s API doesn't work at that moment.
            return

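        # Only persist a result when the monitor actually reported a
        # health status; otherwise keep the last known value.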
        if monitor.data.get('health_status'):
            self.cluster.health_status = monitor.data.get('health_status')
            self.cluster.health_status_reason = monitor.data.get(
                'health_status_reason')
            self.cluster.save()

    def update_health_status(self):
        LOG.debug("Updating health status for cluster %s", self.cluster.id)
        self._update_health_status()
        LOG.debug("Health status for cluster %s updated to %s (%s)",
                  self.cluster.id, self.cluster.health_status,
                  self.cluster.health_status_reason)
        # TODO(flwang): Health status update notifications?
        # end the "loop"
        raise loopingcall.LoopingCallDone()


@profiler.trace_cls("rpc")
class MagnumPeriodicTasks(periodic_task.PeriodicTasks):
    """Magnum periodic task class.

    Any periodic task job needs to be added into this class.

    NOTE(suro-patz):
    - oslo_service.periodic_task runs tasks protected within a try/catch
      block, with raise_on_error defaulting to 'False', in
      run_periodic_tasks(), which ensures the process does not die even if
      a task encounters an Exception.
    - The periodic tasks here therefore do not necessarily need another
      try/catch block; the try/catch blocks present here only add
      magnum-periodic-task-specific log/error messages.
    """

    def __init__(self, conf):
        super(MagnumPeriodicTasks, self).__init__(conf)
        self.notifier = rpc.get_notifier()

    @periodic_task.periodic_task(spacing=10, run_immediately=True)
    @set_context
    def sync_cluster_status(self, ctx):
        try:
            LOG.debug('Starting to sync up cluster status')

            # get all the clusters that are IN_PROGRESS
            status = [objects.fields.ClusterStatus.CREATE_IN_PROGRESS,
                      objects.fields.ClusterStatus.UPDATE_IN_PROGRESS,
                      objects.fields.ClusterStatus.DELETE_IN_PROGRESS,
                      objects.fields.ClusterStatus.ROLLBACK_IN_PROGRESS]
            filters = {'status': status}
            clusters = objects.Cluster.list(ctx, filters=filters)
            if not clusters:
                return

            # synchronize with underlying orchestration
            for cluster in clusters:
                job = ClusterUpdateJob(ctx, cluster)
                # though this call isn't really looping, we use this
                # abstraction anyway to avoid dealing directly with eventlet
                # hooey
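                # update_status raises LoopingCallDone after one pass, so
                # each job runs exactly once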
                lc = loopingcall.FixedIntervalLoopingCall(f=job.update_status)
                lc.start(1, stop_on_exception=True)

        except Exception as e:
            LOG.warning(
                "Ignore error [%s] when syncing up cluster status.",
                e, exc_info=True)

    @periodic_task.periodic_task(spacing=10, run_immediately=True)
    @set_context
    def sync_cluster_health_status(self, ctx):
        try:
            LOG.debug('Starting to sync up cluster health status')

            status = [objects.fields.ClusterStatus.CREATE_COMPLETE,
                      objects.fields.ClusterStatus.UPDATE_COMPLETE,
                      objects.fields.ClusterStatus.UPDATE_IN_PROGRESS,
                      objects.fields.ClusterStatus.ROLLBACK_IN_PROGRESS]
            filters = {'status': status}
            clusters = objects.Cluster.list(ctx, filters=filters)
            if not clusters:
                return

            # synchronize using native COE API
            for cluster in clusters:
                job = ClusterHealthUpdateJob(ctx, cluster)
                # though this call isn't really looping, we use this
                # abstraction anyway to avoid dealing directly with eventlet
                # hooey
                lc = loopingcall.FixedIntervalLoopingCall(
                    f=job.update_health_status)
                lc.start(1, stop_on_exception=True)

        except Exception as e:
            LOG.warning(
                "Ignore error [%s] when syncing up cluster health status.",
                e, exc_info=True)

    @periodic_task.periodic_task(run_immediately=True)
    @set_context
    @deprecated(as_of=deprecated.ROCKY)
    def _send_cluster_metrics(self, ctx):
        if not CONF.drivers.send_cluster_metrics:
            LOG.debug('Skip sending cluster metrics')
            return

        LOG.debug('Starting to send cluster metrics')
        for cluster in objects.Cluster.list(ctx):
            if cluster.status not in (
                    objects.fields.ClusterStatus.CREATE_COMPLETE,
                    objects.fields.ClusterStatus.UPDATE_COMPLETE):
                continue

            monitor = monitors.create_monitor(ctx, cluster)
            if monitor is None:
                continue

            try:
                monitor.pull_data()
            except Exception as e:
                LOG.warning(
                    "Skip pulling data from cluster %(cluster)s due to "
                    "error: %(e)s",
                    {'e': e, 'cluster': cluster.uuid}, exc_info=True)
                continue

            metrics = list()
            for name in monitor.get_metric_names():
                try:
                    metric = {
                        'name': name,
                        'value': monitor.compute_metric_value(name),
                        'unit': monitor.get_metric_unit(name),
                    }
                    metrics.append(metric)
                except Exception as e:
                    LOG.warning("Skip adding metric %(name)s due to "
                                "error: %(e)s",
                                {'e': e, 'name': name}, exc_info=True)

            message = dict(metrics=metrics,
                           user_id=cluster.user_id,
                           project_id=cluster.project_id,
                           resource_id=cluster.uuid)
            LOG.debug("About to send notification: '%s'", message)
            self.notifier.info(ctx, "magnum.cluster.metrics.update",
                               message)


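# Register the periodic task runner on the service's thread group.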
def setup(conf, tg):
    pt = MagnumPeriodicTasks(conf)
    tg.add_dynamic_timer(
        pt.run_periodic_tasks,
        periodic_interval_max=conf.periodic_interval_max,
        context=None)