
Showing a backtrace for an exception that is already handled and logged as a warning seems aggressive; just log it as an error. Also, stop setting Ceph and Swift to the same endpoint. Change-Id: I33be2d433c86cd67add5fac92fed1b66219d48b6 Closes-Bug: #1573241
#
# Copyright 2013 Julien Danjou
# Copyright 2014 Red Hat, Inc
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import collections
import itertools
import random

from concurrent import futures
from futurist import periodics
from keystoneauth1 import exceptions as ka_exceptions
from oslo_config import cfg
from oslo_log import log
import oslo_messaging
from oslo_utils import fnmatch
from oslo_utils import timeutils
from six import moves
from six.moves.urllib import parse as urlparse
from stevedore import extension

from ceilometer.agent import plugin_base
from ceilometer import coordination
from ceilometer.i18n import _LE, _LI, _LW
from ceilometer import keystone_client
from ceilometer import messaging
from ceilometer import pipeline
from ceilometer.publisher import utils as publisher_utils
from ceilometer import service_base
from ceilometer import utils

LOG = log.getLogger(__name__)

OPTS = [
    cfg.BoolOpt('batch_polled_samples',
                default=True,
                help='To reduce polling agent load, samples are sent to the '
                     'notification agent in a batch. To gain higher '
                     'throughput at the cost of load, set this to False.'),
    cfg.IntOpt('shuffle_time_before_polling_task',
               default=0,
               help='To reduce the number of large simultaneous requests '
                    'to Nova or other components from different compute '
                    'agents, shuffle the start time of the polling task.'),
]

POLLING_OPTS = [
    cfg.StrOpt('partitioning_group_prefix',
               deprecated_group='central',
               help='Work-load partitioning group prefix. Use only if you '
                    'want to run multiple polling agents with different '
                    'config files. For each sub-group of the agent '
                    'pool with the same partitioning_group_prefix a disjoint '
                    'subset of pollsters should be loaded.'),
]


class PollsterListForbidden(Exception):
    def __init__(self):
        msg = ('The pollster-list option of the polling agent cannot be '
               'used when coordination between multiple agents is enabled. '
               'Please use either multiple coordinated agents or the '
               'pollster-list option with a single polling agent.')
        super(PollsterListForbidden, self).__init__(msg)


class EmptyPollstersList(Exception):
    def __init__(self):
        msg = ('No valid pollsters can be loaded with the startup parameters'
               ' polling-namespaces and pollster-list.')
        super(EmptyPollstersList, self).__init__(msg)


class Resources(object):
    def __init__(self, agent_manager):
        self.agent_manager = agent_manager
        self._resources = []
        self._discovery = []
        self.blacklist = []

    def setup(self, source):
        self._resources = source.resources
        self._discovery = source.discovery

    def get(self, discovery_cache=None):
        source_discovery = (self.agent_manager.discover(self._discovery,
                                                        discovery_cache)
                            if self._discovery else [])
        static_resources = []
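        # Statically configured resources are partitioned across
        # coordinated agents just like discovered ones: each agent polls
        # only its own disjoint subset of the configured list.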
        if self._resources:
            static_resources_group = self.agent_manager.construct_group_id(
                utils.hash_of_set(self._resources))
            p_coord = self.agent_manager.partition_coordinator
            static_resources = p_coord.extract_my_subset(
                static_resources_group, self._resources)
        return static_resources + source_discovery

    @staticmethod
    def key(source_name, pollster):
        return '%s-%s' % (source_name, pollster.name)


class PollingTask(object):
    """Polling task for polling samples and notifying.

    A polling task can be invoked periodically or only once.
    """

    def __init__(self, agent_manager):
        self.manager = agent_manager

        # elements of the Cartesian product of sources X pollsters
        # with a common interval
        self.pollster_matches = collections.defaultdict(set)

        # we relate the static resources and per-source discovery to
        # each combination of pollster and matching source
        resource_factory = lambda: Resources(agent_manager)
        self.resources = collections.defaultdict(resource_factory)

        self._batch = self.manager.conf.batch_polled_samples
        self._telemetry_secret = self.manager.conf.publisher.telemetry_secret

    def add(self, pollster, source):
        self.pollster_matches[source.name].add(pollster)
        key = Resources.key(source.name, pollster)
        self.resources[key].setup(source)

    def poll_and_notify(self):
        """Poll samples and send notifications."""
        cache = {}
        discovery_cache = {}
        poll_history = {}
        for source_name in self.pollster_matches:
            for pollster in self.pollster_matches[source_name]:
                key = Resources.key(source_name, pollster)
                candidate_res = list(
                    self.resources[key].get(discovery_cache))
                if not candidate_res and pollster.obj.default_discovery:
                    candidate_res = self.manager.discover(
                        [pollster.obj.default_discovery], discovery_cache)

                # Remove duplicated and blacklisted resources. Using
                # set() would require a well-defined __hash__ for each
                # resource. Since __eq__ is defined, 'not in' is safe here.
                polling_resources = []
                black_res = self.resources[key].blacklist
                history = poll_history.get(pollster.name, [])
                for x in candidate_res:
                    if x not in history:
                        history.append(x)
                        if x not in black_res:
                            polling_resources.append(x)
                poll_history[pollster.name] = history

                # If there are no resources, skip this pollster
                if not polling_resources:
                    p_context = 'new ' if history else ''
                    LOG.info(_LI("Skip pollster %(name)s, no %(p_context)s"
                                 "resources found this cycle"),
                             {'name': pollster.name, 'p_context': p_context})
                    continue

                LOG.info(_LI("Polling pollster %(poll)s in the context of "
                             "%(src)s"),
                         dict(poll=pollster.name, src=source_name))
                try:
                    polling_timestamp = timeutils.utcnow().isoformat()
                    samples = pollster.obj.get_samples(
                        manager=self.manager,
                        cache=cache,
                        resources=polling_resources
                    )
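                    # Depending on batch_polled_samples, samples are either
                    # accumulated and sent as one notification per pollster
                    # run, or sent individually as they are produced.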
                    sample_batch = []

                    for sample in samples:
                        # Note(yuywz): Unify the timestamp of polled samples
                        sample.set_timestamp(polling_timestamp)
                        sample_dict = (
                            publisher_utils.meter_message_from_counter(
                                sample, self._telemetry_secret
                            ))
                        if self._batch:
                            sample_batch.append(sample_dict)
                        else:
                            self._send_notification([sample_dict])

                    if sample_batch:
                        self._send_notification(sample_batch)

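                # Resources reported as permanently failing are added to
                # the blacklist, so they are skipped on subsequent cycles
                # (see the blacklist filtering above).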
                except plugin_base.PollsterPermanentError as err:
                    LOG.error(_LE(
                        'Prevent pollster %(name)s from '
                        'polling %(res_list)s on source %(source)s anymore!')
                        % ({'name': pollster.name, 'source': source_name,
                            'res_list': err.fail_res_list}))
                    self.resources[key].blacklist.extend(err.fail_res_list)
                except Exception as err:
                    LOG.error(_LE(
                        'Continue after error from %(name)s: %(error)s')
                        % ({'name': pollster.name, 'error': err}),
                        exc_info=True)

    def _send_notification(self, samples):
        self.manager.notifier.sample(
            {},
            'telemetry.polling',
            {'samples': samples}
        )


class AgentManager(service_base.PipelineBasedService):

    def __init__(self, worker_id, conf, namespaces=None, pollster_list=None):

        namespaces = namespaces or ['compute', 'central']
        pollster_list = pollster_list or []
        group_prefix = conf.polling.partitioning_group_prefix

        # The coordination and pollster-list features are mutually
        # exclusive: using both at once could lead to duplicated or
        # lost samples.
        if pollster_list and conf.coordination.backend_url:
            raise PollsterListForbidden()

        super(AgentManager, self).__init__(worker_id, conf)

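        # pollster_list entries are shell-style glob patterns; for example
        # (hypothetically) ['cpu', 'disk.*'] would select the cpu pollster
        # and every pollster whose name starts with 'disk.'.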
        def _match(pollster):
            """Find out if the pollster name matches one of the patterns."""
            return any(fnmatch.fnmatch(pollster.name, pattern) for
                       pattern in pollster_list)

        if not isinstance(namespaces, list):
            namespaces = [namespaces]

        # namespaces defaults to ['compute', 'central'] if none were
        # passed in
        extensions = (self._extensions('poll', namespace,
                                       self.conf).extensions
                      for namespace in namespaces)
        # get the extensions from the pollster builder
        extensions_fb = (self._extensions_from_builder('poll', namespace)
                         for namespace in namespaces)
        if pollster_list:
            extensions = (moves.filter(_match, exts)
                          for exts in extensions)
            extensions_fb = (moves.filter(_match, exts)
                             for exts in extensions_fb)

        self.extensions = list(itertools.chain(*list(extensions))) + list(
            itertools.chain(*list(extensions_fb)))

        if not self.extensions:
            raise EmptyPollstersList()

        discoveries = (self._extensions('discover', namespace,
                                        self.conf).extensions
                       for namespace in namespaces)
        self.discoveries = list(itertools.chain(*list(discoveries)))
        self.polling_periodics = None

        self.partition_coordinator = coordination.PartitionCoordinator(
            self.conf)
        self.heartbeat_timer = utils.create_periodic(
            target=self.partition_coordinator.heartbeat,
            spacing=self.conf.coordination.heartbeat,
            run_immediately=True)

        # Compose the coordination group prefix.
        # We use the namespaces as the basis for this partitioning.
        namespace_prefix = '-'.join(sorted(namespaces))
        self.group_prefix = ('%s-%s' % (namespace_prefix, group_prefix)
                             if group_prefix else namespace_prefix)

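        # Polled samples are handed off to the notification agent through
        # this notifier, on the 'telemetry.polling' event type (see
        # PollingTask._send_notification above).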
        self.notifier = oslo_messaging.Notifier(
            messaging.get_transport(self.conf),
            driver=self.conf.publisher_notifier.telemetry_driver,
            publisher_id="ceilometer.polling")

        self._keystone = None
        self._keystone_last_exception = None

    @staticmethod
    def _get_ext_mgr(namespace, *args, **kwargs):
        def _catch_extension_load_error(mgr, ep, exc):
            # Extensions raising ExtensionLoadError can be ignored,
            # and anything we cannot import is skipped as a safety measure.
            if isinstance(exc, plugin_base.ExtensionLoadError):
                LOG.exception(_LE("Skip loading extension for %s"), ep.name)
                return
            if isinstance(exc, ImportError):
                LOG.error(_LE("Failed to import extension for %(name)s: "
                              "%(error)s"),
                          {'name': ep.name, 'error': exc})
                return
            raise exc

        return extension.ExtensionManager(
            namespace=namespace,
            invoke_on_load=True,
            invoke_args=args,
            invoke_kwds=kwargs,
            on_load_failure_callback=_catch_extension_load_error,
        )

    def _extensions(self, category, agent_ns=None, *args, **kwargs):
        namespace = ('ceilometer.%s.%s' % (category, agent_ns) if agent_ns
                     else 'ceilometer.%s' % category)
        return self._get_ext_mgr(namespace, *args, **kwargs)

    def _extensions_from_builder(self, category, agent_ns=None):
        ns = ('ceilometer.builder.%s.%s' % (category, agent_ns) if agent_ns
              else 'ceilometer.builder.%s' % category)
        mgr = self._get_ext_mgr(ns, self.conf)

        def _build(ext):
            return ext.plugin.get_pollsters_extensions(self.conf)

        # NOTE: this seems to be a stevedore bug: if no extensions are
        # found, map() raises a RuntimeError, which is not documented.
        if mgr.names():
            return list(itertools.chain(*mgr.map(_build)))
        else:
            return []

    def join_partitioning_groups(self):
        self.groups = set([self.construct_group_id(d.obj.group_id)
                           for d in self.discoveries])
        # let each set of statically-defined resources have its own group
        static_resource_groups = set([
            self.construct_group_id(utils.hash_of_set(p.resources))
            for p in self.polling_manager.sources
            if p.resources
        ])
        self.groups.update(static_resource_groups)

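        # Coordination (and its heartbeat) is kept running only while
        # there is at least one group to participate in; it is shut down
        # when no groups remain.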
        if not self.groups and self.partition_coordinator.is_active():
            self.partition_coordinator.stop()
            self.heartbeat_timer.stop()

        if self.groups and not self.partition_coordinator.is_active():
            self.partition_coordinator.start()
            utils.spawn_thread(self.heartbeat_timer.start)

        for group in self.groups:
            self.partition_coordinator.join_group(group)

    def create_polling_task(self):
        """Create an initially empty polling task."""
        return PollingTask(self)

    def setup_polling_tasks(self):
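        # Tasks are keyed by polling interval: every source/pollster pair
        # that shares an interval is grouped into a single PollingTask.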
        polling_tasks = {}
        for source in self.polling_manager.sources:
            polling_task = None
            for pollster in self.extensions:
                if source.support_meter(pollster.name):
                    polling_task = polling_tasks.get(source.get_interval())
                    if not polling_task:
                        polling_task = self.create_polling_task()
                        polling_tasks[source.get_interval()] = polling_task
                    polling_task.add(pollster, source)
        return polling_tasks

    def construct_group_id(self, discovery_group_id):
        return ('%s-%s' % (self.group_prefix,
                           discovery_group_id)
                if discovery_group_id else None)

    def start_polling_tasks(self):
        # allow time for coordination if necessary
        delay_start = self.partition_coordinator.is_active()

        # set the shuffle time before the polling task if necessary
        delay_polling_time = random.randint(
            0, self.conf.shuffle_time_before_polling_task)

        data = self.setup_polling_tasks()

        # Don't start useless threads if no task will run
        if not data:
            return

        # One thread per polling task is enough
        self.polling_periodics = periodics.PeriodicWorker.create(
            [], executor_factory=lambda:
            futures.ThreadPoolExecutor(max_workers=len(data)))

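        # When coordination is active, the first run of each task is
        # additionally delayed by one full interval so agents have time to
        # join their partitioning groups before polling starts.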
        for interval, polling_task in data.items():
            delay_time = (interval + delay_polling_time if delay_start
                          else delay_polling_time)

            @periodics.periodic(spacing=interval, run_immediately=False)
            def task(running_task):
                self.interval_task(running_task)

            utils.spawn_thread(utils.delayed, delay_time,
                               self.polling_periodics.add, task, polling_task)

        utils.spawn_thread(self.polling_periodics.start, allow_empty=True)

    def run(self):
        super(AgentManager, self).run()
        self.polling_manager = pipeline.setup_polling(self.conf)
        self.join_partitioning_groups()
        self.start_polling_tasks()
        self.init_pipeline_refresh()

    def terminate(self):
        self.stop_pollsters_tasks()
        self.heartbeat_timer.stop()
        self.partition_coordinator.stop()
        super(AgentManager, self).terminate()

    def interval_task(self, task):
        # NOTE(sileht): remove the previous keystone client
        # and exception to get a new one in this polling cycle.
        self._keystone = None
        self._keystone_last_exception = None

        task.poll_and_notify()

    @property
    def keystone(self):
        # FIXME(sileht): This lazy loading of the keystone client doesn't
        # look concurrency-safe. We never see issues because once we have
        # connected to keystone everything is fine, and because all
        # pollsters are delayed during startup. But each polling task
        # creates a new client and overrides the one created by other
        # polling tasks. During this short window bad things can occur.
        #
        # I think we must not reset the keystone client before
        # running a polling task, but refresh it periodically instead.

        # NOTE(sileht): we do lazy loading of the keystone client
        # for multiple reasons:
        # * don't use it if no plugin needs it
        # * use only one client for all plugins per polling cycle
        if self._keystone is None and self._keystone_last_exception is None:
            try:
                self._keystone = keystone_client.get_client(self.conf)
                self._keystone_last_exception = None
            except ka_exceptions.ClientException as e:
                self._keystone = None
                self._keystone_last_exception = e
        if self._keystone is not None:
            return self._keystone
        else:
            raise self._keystone_last_exception

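    # A discovery entry is either a bare plugin name or a URL whose scheme
    # selects the plugin; for example (hypothetically) 'local_instances'
    # parses to ('local_instances', None), while 'snmp://10.0.0.1' parses
    # to ('snmp', '10.0.0.1').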
    @staticmethod
    def _parse_discoverer(url):
        s = urlparse.urlparse(url)
        return (s.scheme or s.path), (s.netloc + s.path if s.scheme else None)

    def _discoverer(self, name):
        for d in self.discoveries:
            if d.name == name:
                return d.obj
        return None

    def discover(self, discovery=None, discovery_cache=None):
        resources = []
        discovery = discovery or []
        for url in discovery:
            if discovery_cache is not None and url in discovery_cache:
                resources.extend(discovery_cache[url])
                continue
            name, param = self._parse_discoverer(url)
            discoverer = self._discoverer(name)
            if discoverer:
                try:
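                    # If the discoverer declares a Keystone service
                    # dependency, skip it when that service type has no
                    # endpoint registered in the service catalog.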
                    if discoverer.KEYSTONE_REQUIRED_FOR_SERVICE:
                        service_type = getattr(
                            self.conf.service_types,
                            discoverer.KEYSTONE_REQUIRED_FOR_SERVICE)
                        if not keystone_client.get_service_catalog(
                                self.keystone).get_endpoints(
                                    service_type=service_type):
                            LOG.warning(_LW(
                                'Skipping %(name)s, %(service_type)s service '
                                'is not registered in keystone'),
                                {'name': name, 'service_type': service_type})
                            continue

                    discovered = discoverer.discover(self, param)
                    partitioned = self.partition_coordinator.extract_my_subset(
                        self.construct_group_id(discoverer.group_id),
                        discovered)
                    resources.extend(partitioned)
                    if discovery_cache is not None:
                        discovery_cache[url] = partitioned
                except ka_exceptions.ClientException as e:
                    LOG.error(_LE('Skipping %(name)s, keystone issue: '
                                  '%(exc)s'), {'name': name, 'exc': e})
                except Exception as err:
                    LOG.exception(_LE('Unable to discover resources: %s'),
                                  err)
            else:
                LOG.warning(_LW('Unknown discovery extension: %s'), name)
        return resources

    def stop_pollsters_tasks(self):
        if self.polling_periodics:
            self.polling_periodics.stop()
            self.polling_periodics.wait()
        self.polling_periodics = None

    def reload_pipeline(self):
        if self.pipeline_validated:
            LOG.info(_LI("Reconfiguring polling tasks."))

            # stop existing pollsters and leave partitioning groups
            self.stop_pollsters_tasks()
            for group in self.groups:
                self.partition_coordinator.leave_group(group)

            # re-create partitioning groups according to the pipeline
            # and configure polling tasks with the latest pipeline conf
            self.join_partitioning_groups()
            self.start_polling_tasks()