# Copyright (c) 2015 Intel Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import functools

from oslo_log import log
from oslo_log.versionutils import deprecated
from oslo_service import loopingcall
from oslo_service import periodic_task
from pycadf import cadftaxonomy as taxonomy

from magnum.common import context
from magnum.common import profiler
from magnum.common import rpc
from magnum.conductor import monitors
from magnum.conductor import utils as conductor_utils
import magnum.conf
from magnum.drivers.common import driver
from magnum import objects

CONF = magnum.conf.CONF
LOG = log.getLogger(__name__)


def set_context(func):
    @functools.wraps(func)
    def handler(self, ctx):
        ctx = context.make_admin_context(all_tenants=True)
        context.set_ctx(ctx)
        func(self, ctx)
        context.set_ctx(None)
    return handler


class ClusterUpdateJob(object):

    status_to_event = {
        objects.fields.ClusterStatus.DELETE_COMPLETE: taxonomy.ACTION_DELETE,
        objects.fields.ClusterStatus.CREATE_COMPLETE: taxonomy.ACTION_CREATE,
        objects.fields.ClusterStatus.UPDATE_COMPLETE: taxonomy.ACTION_UPDATE,
        objects.fields.ClusterStatus.ROLLBACK_COMPLETE:
            taxonomy.ACTION_UPDATE,
        objects.fields.ClusterStatus.CREATE_FAILED: taxonomy.ACTION_CREATE,
        objects.fields.ClusterStatus.DELETE_FAILED: taxonomy.ACTION_DELETE,
        objects.fields.ClusterStatus.UPDATE_FAILED: taxonomy.ACTION_UPDATE,
        objects.fields.ClusterStatus.ROLLBACK_FAILED: taxonomy.ACTION_UPDATE,
    }

    def __init__(self, ctx, cluster):
        self.ctx = ctx
        self.cluster = cluster

    def update_status(self):
        LOG.debug("Updating status for cluster %s", self.cluster.id)
        # get the driver for the cluster
        cdriver = driver.Driver.get_driver_for_cluster(self.ctx, self.cluster)
        # ask the driver to sync status
        cdriver.update_cluster_status(self.ctx, self.cluster)
        LOG.debug("Status for cluster %s updated to %s (%s)",
                  self.cluster.id, self.cluster.status,
                  self.cluster.status_reason)
        # status update notifications
        if self.cluster.status.endswith("_COMPLETE"):
            conductor_utils.notify_about_cluster_operation(
                self.ctx, self.status_to_event[self.cluster.status],
                taxonomy.OUTCOME_SUCCESS, self.cluster)
        if self.cluster.status.endswith("_FAILED"):
            conductor_utils.notify_about_cluster_operation(
                self.ctx, self.status_to_event[self.cluster.status],
                taxonomy.OUTCOME_FAILURE, self.cluster)
        # if we're done with it, delete it
        if (self.cluster.status ==
                objects.fields.ClusterStatus.DELETE_COMPLETE):
            # delete all the nodegroups that belong to this cluster
            for ng in objects.NodeGroup.list(self.ctx, self.cluster.uuid):
                ng.destroy()
            self.cluster.destroy()
        # end the "loop"
        raise loopingcall.LoopingCallDone()
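
# NOTE: update_status() is not meant to be called directly; it is driven
# through oslo_service.loopingcall (see sync_cluster_status below), and
# raising LoopingCallDone is how a loopingcall callback tells the loop to
# stop. A minimal sketch of that pattern, with a hypothetical `ctx` and
# `cluster` in hand:
#
#     job = ClusterUpdateJob(ctx, cluster)
#     lc = loopingcall.FixedIntervalLoopingCall(f=job.update_status)
#     lc.start(1, stop_on_exception=True)
#     lc.wait()  # returns once update_status() raises LoopingCallDone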


class ClusterHealthUpdateJob(object):

    def __init__(self, ctx, cluster):
        self.ctx = ctx
        self.cluster = cluster

    def _update_health_status(self):
        monitor = monitors.create_monitor(self.ctx, self.cluster)
        if monitor is None:
            return

        try:
            monitor.poll_health_status()
        except Exception as e:
            LOG.warning(
                "Skip pulling data from cluster %(cluster)s due to "
                "error: %(e)s",
                {'e': e, 'cluster': self.cluster.uuid}, exc_info=True)
            # TODO(flwang): Should we mark this cluster's health status as
            # UNKNOWN if Magnum failed to pull data from the cluster? Because
            # that basically means the k8s API doesn't work at that moment.
            return

        if monitor.data.get('health_status'):
            self.cluster.health_status = monitor.data.get('health_status')
            self.cluster.health_status_reason = monitor.data.get(
                'health_status_reason')
            self.cluster.save()

    def update_health_status(self):
        LOG.debug("Updating health status for cluster %s", self.cluster.id)
        self._update_health_status()
        LOG.debug("Status for cluster %s updated to %s (%s)",
                  self.cluster.id, self.cluster.health_status,
                  self.cluster.health_status_reason)
        # TODO(flwang): Health status update notifications?
        # end the "loop"
        raise loopingcall.LoopingCallDone()


@profiler.trace_cls("rpc")
class MagnumPeriodicTasks(periodic_task.PeriodicTasks):
    """Magnum periodic task class.

    Any periodic task job needs to be added to this class.

    NOTE(suro-patz):
    - oslo_service.periodic_task runs tasks inside a try/catch block in
      run_periodic_tasks(), with raise_on_error defaulting to 'False',
      which ensures the process does not die even if a task encounters an
      Exception.
    - The periodic tasks here therefore do not strictly need their own
      try/catch blocks; the ones present exist to emit
      magnum-periodic-task-specific log/error messages.
    """

    def __init__(self, conf):
        super(MagnumPeriodicTasks, self).__init__(conf)
        self.notifier = rpc.get_notifier()

    @periodic_task.periodic_task(spacing=10, run_immediately=True)
    @set_context
    def sync_cluster_status(self, ctx):
        try:
            LOG.debug('Starting to sync up cluster status')

            # get all the clusters that are IN_PROGRESS
            status = [objects.fields.ClusterStatus.CREATE_IN_PROGRESS,
                      objects.fields.ClusterStatus.UPDATE_IN_PROGRESS,
                      objects.fields.ClusterStatus.DELETE_IN_PROGRESS,
                      objects.fields.ClusterStatus.ROLLBACK_IN_PROGRESS]
            filters = {'status': status}
            clusters = objects.Cluster.list(ctx, filters=filters)
            if not clusters:
                return

            # synchronize with underlying orchestration
            for cluster in clusters:
                job = ClusterUpdateJob(ctx, cluster)
                # though this call isn't really looping, we use this
                # abstraction anyway to avoid dealing directly with
                # eventlet hooey
                lc = loopingcall.FixedIntervalLoopingCall(
                    f=job.update_status)
                lc.start(1, stop_on_exception=True)

        except Exception as e:
            LOG.warning(
                "Ignore error [%s] when syncing up cluster status.", e,
                exc_info=True)

    @periodic_task.periodic_task(spacing=10, run_immediately=True)
    @set_context
    def sync_cluster_health_status(self, ctx):
        try:
            LOG.debug('Starting to sync up cluster health status')

            status = [objects.fields.ClusterStatus.CREATE_COMPLETE,
                      objects.fields.ClusterStatus.UPDATE_COMPLETE,
                      objects.fields.ClusterStatus.UPDATE_IN_PROGRESS,
                      objects.fields.ClusterStatus.ROLLBACK_IN_PROGRESS]
            filters = {'status': status}
            clusters = objects.Cluster.list(ctx, filters=filters)
            if not clusters:
                return

            # synchronize using native COE API
            for cluster in clusters:
                job = ClusterHealthUpdateJob(ctx, cluster)
                # though this call isn't really looping, we use this
                # abstraction anyway to avoid dealing directly with
                # eventlet hooey
                lc = loopingcall.FixedIntervalLoopingCall(
                    f=job.update_health_status)
                lc.start(1, stop_on_exception=True)

        except Exception as e:
            LOG.warning(
                "Ignore error [%s] when syncing up cluster health status.",
                e, exc_info=True)
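
    # NOTE: new periodic jobs are registered simply by adding another
    # decorated method to this class; the periodic_task framework collects
    # them automatically. A hedged sketch, where the task name, spacing and
    # body are purely hypothetical:
    #
    #     @periodic_task.periodic_task(spacing=60, run_immediately=False)
    #     @set_context
    #     def _my_task(self, ctx):
    #         LOG.debug('Doing periodic work')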

    @periodic_task.periodic_task(run_immediately=True)
    @set_context
    @deprecated(as_of=deprecated.ROCKY)
    def _send_cluster_metrics(self, ctx):
        if not CONF.drivers.send_cluster_metrics:
            LOG.debug('Skip sending cluster metrics')
            return

        LOG.debug('Starting to send cluster metrics')
        for cluster in objects.Cluster.list(ctx):
            if cluster.status not in (
                    objects.fields.ClusterStatus.CREATE_COMPLETE,
                    objects.fields.ClusterStatus.UPDATE_COMPLETE):
                continue

            monitor = monitors.create_monitor(ctx, cluster)
            if monitor is None:
                continue

            try:
                monitor.pull_data()
            except Exception as e:
                LOG.warning(
                    "Skip pulling data from cluster %(cluster)s due to "
                    "error: %(e)s",
                    {'e': e, 'cluster': cluster.uuid}, exc_info=True)
                continue

            metrics = list()
            for name in monitor.get_metric_names():
                try:
                    metric = {
                        'name': name,
                        'value': monitor.compute_metric_value(name),
                        'unit': monitor.get_metric_unit(name),
                    }
                    metrics.append(metric)
                except Exception as e:
                    LOG.warning("Skip adding metric %(name)s due to "
                                "error: %(e)s",
                                {'e': e, 'name': name}, exc_info=True)

            message = dict(metrics=metrics,
                           user_id=cluster.user_id,
                           project_id=cluster.project_id,
                           resource_id=cluster.uuid)
            LOG.debug("About to send notification: '%s'", message)
            self.notifier.info(ctx, "magnum.cluster.metrics.update",
                               message)


def setup(conf, tg):
    pt = MagnumPeriodicTasks(conf)
    tg.add_dynamic_timer(
        pt.run_periodic_tasks,
        periodic_interval_max=conf.periodic_interval_max,
        context=None)
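
# NOTE: setup() is expected to be wired in from service bootstrap code with
# an oslo_service ThreadGroup; add_dynamic_timer() then re-invokes
# run_periodic_tasks(), sleeping up to conf.periodic_interval_max seconds
# between runs. A rough, hypothetical wiring sketch:
#
#     from oslo_service import threadgroup
#
#     tg = threadgroup.ThreadGroup()
#     setup(CONF, tg)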