Implementation of dynamically reloadable pipeline

Adds the ability to poll the file-based pipeline
configuration and use it to activate/deactivate
collection targets on-the-fly.

Change-Id: I93fa33a167db81bb8a891d668c0714e627214d11
Partially-Implements: blueprint reload-file-based-pipeline-configuration
This commit is contained in:
Rohit Jaiswal
2015-06-11 22:10:51 +00:00
parent 34c6eb03ac
commit bd8cdbafa6
6 changed files with 402 additions and 44 deletions

View File

@@ -26,7 +26,6 @@ import random
from oslo_config import cfg
from oslo_context import context
from oslo_log import log
from oslo_service import service as os_service
import six
from six import moves
from six.moves.urllib import parse as urlparse
@@ -34,8 +33,9 @@ from stevedore import extension
from ceilometer.agent import plugin_base
from ceilometer import coordination
from ceilometer.i18n import _
from ceilometer.i18n import _, _LI
from ceilometer import pipeline as publish_pipeline
from ceilometer import service_base
from ceilometer import utils
LOG = log.getLogger(__name__)
@@ -191,7 +191,7 @@ class PollingTask(object):
exc_info=True)
class AgentManager(os_service.Service):
class AgentManager(service_base.BaseService):
def __init__(self, namespaces, pollster_list, group_prefix=None):
# features of using coordination and pollster-list are exclusive, and
@@ -256,16 +256,16 @@ class AgentManager(os_service.Service):
)
def join_partitioning_groups(self):
groups = set([self.construct_group_id(d.obj.group_id)
for d in self.discovery_manager])
self.groups = set([self.construct_group_id(d.obj.group_id)
for d in self.discovery_manager])
# let each set of statically-defined resources have its own group
static_resource_groups = set([
self.construct_group_id(utils.hash_of_set(p.resources))
for p in self.pipeline_manager.pipelines
if p.resources
])
groups.update(static_resource_groups)
for group in groups:
self.groups.update(static_resource_groups)
for group in self.groups:
self.partition_coordinator.join_group(group)
def create_polling_task(self):
@@ -290,12 +290,7 @@ class AgentManager(os_service.Service):
discovery_group_id)
if discovery_group_id else None)
def start(self):
self.pipeline_manager = publish_pipeline.setup_pipeline()
self.partition_coordinator.start()
self.join_partitioning_groups()
def configure_polling_tasks(self):
# allow time for coordination if necessary
delay_start = self.partition_coordinator.is_active()
@@ -303,16 +298,29 @@ class AgentManager(os_service.Service):
delay_polling_time = random.randint(
0, cfg.CONF.shuffle_time_before_polling_task)
pollster_timers = []
for interval, task in six.iteritems(self.setup_polling_tasks()):
delay_time = (interval + delay_polling_time if delay_start
else delay_polling_time)
self.tg.add_timer(interval,
self.interval_task,
initial_delay=delay_time,
task=task)
pollster_timers.append(self.tg.add_timer(interval,
self.interval_task,
initial_delay=delay_time,
task=task))
self.tg.add_timer(cfg.CONF.coordination.heartbeat,
self.partition_coordinator.heartbeat)
return pollster_timers
def start(self):
    """Load the pipeline, join coordination groups and begin polling.

    The sequence matters: the pipeline is loaded first, the partition
    coordinator is started before any group is joined, and only then are
    the polling timers and the optional pipeline-refresh poller set up.
    """
    self.pipeline_manager = publish_pipeline.setup_pipeline()

    coordinator = self.partition_coordinator
    coordinator.start()
    self.join_partitioning_groups()

    # Keep the timer handles: stop_pollsters() iterates over them when the
    # pipeline is reloaded.
    self.pollster_timers = self.configure_polling_tasks()
    self.init_pipeline_refresh()
def stop(self):
if self.partition_coordinator:
self.partition_coordinator.stop()
@@ -356,3 +364,25 @@ class AgentManager(os_service.Service):
else:
LOG.warning(_('Unknown discovery extension: %s') % name)
return resources
def stop_pollsters(self):
    """Stop every tracked pollster timer and clear the tracking list.

    Each timer is stopped and then handed to the thread group's
    ``timer_done``. A failure on one timer is logged and does not keep
    the remaining timers from being processed; the list of tracked timers
    is emptied afterwards in any case.
    """
    for timer in self.pollster_timers:
        try:
            timer.stop()
            self.tg.timer_done(timer)
        except Exception:
            LOG.error(_('Error stopping pollster.'), exc_info=True)
    self.pollster_timers = []
def reload_pipeline(self):
    """Re-create partitioning groups and polling tasks.

    Stops the running pollster timers, leaves every group recorded in
    ``self.groups``, then joins the groups again and configures fresh
    polling tasks.
    """
    LOG.info(_LI("Reconfiguring polling tasks."))

    # Tear down: stop existing pollsters and leave partitioning groups.
    self.stop_pollsters()
    coordinator = self.partition_coordinator
    for group_id in self.groups:
        coordinator.leave_group(group_id)

    # Rebuild: re-create partitioning groups according to the pipeline and
    # configure polling tasks with the latest pipeline conf.
    self.join_partitioning_groups()
    # NOTE(review): configure_polling_tasks() also registers a coordination
    # heartbeat timer that is not part of the list it returns, so every
    # reload appears to add one more heartbeat timer that is never
    # stopped -- confirm and track/stop it if so.
    self.pollster_timers = self.configure_polling_tasks()

View File

@@ -17,15 +17,15 @@ from oslo_config import cfg
from oslo_context import context
from oslo_log import log
import oslo_messaging
from oslo_service import service as os_service
from stevedore import extension
from ceilometer.agent import plugin_base as base
from ceilometer import coordination
from ceilometer.event import endpoint as event_endpoint
from ceilometer.i18n import _, _LW
from ceilometer.i18n import _, _LI, _LW
from ceilometer import messaging
from ceilometer import pipeline
from ceilometer import service_base
from ceilometer import utils
@@ -66,7 +66,7 @@ cfg.CONF.import_opt('telemetry_driver', 'ceilometer.publisher.messaging',
group='publisher_notifier')
class NotificationService(os_service.Service):
class NotificationService(service_base.BaseService):
"""Notification service.
When running multiple agents, additional queuing sequence is required for
@@ -100,30 +100,50 @@ class NotificationService(os_service.Service):
publisher_id='ceilometer.notification',
topic='%s-%s' % (self.NOTIFICATION_IPC, pipe.name))
def start(self):
super(NotificationService, self).start()
self.pipeline_manager = pipeline.setup_pipeline()
if cfg.CONF.notification.store_events:
self.event_pipeline_manager = pipeline.setup_event_pipeline()
def _get_pipe_manager(self, transport, pipeline_manager):
transport = messaging.get_transport()
self.partition_coordinator = coordination.PartitionCoordinator()
self.partition_coordinator.start()
event_pipe_manager = None
if cfg.CONF.notification.workload_partitioning:
pipe_manager = pipeline.SamplePipelineTransportManager()
for pipe in self.pipeline_manager.pipelines:
for pipe in pipeline_manager.pipelines:
pipe_manager.add_transporter(
(pipe.source.support_meter,
self._get_notifier(transport, pipe)))
if cfg.CONF.notification.store_events:
else:
pipe_manager = pipeline_manager
return pipe_manager
def _get_event_pipeline_manager(self, transport):
if cfg.CONF.notification.store_events:
self.event_pipeline_manager = pipeline.setup_event_pipeline()
if cfg.CONF.notification.workload_partitioning:
event_pipe_manager = pipeline.EventPipelineTransportManager()
for pipe in self.event_pipeline_manager.pipelines:
event_pipe_manager.add_transporter(
(pipe.source.support_event,
self._get_notifier(transport, pipe)))
else:
event_pipe_manager = self.event_pipeline_manager
return event_pipe_manager
def start(self):
super(NotificationService, self).start()
self.pipeline_manager = pipeline.setup_pipeline()
self.transport = messaging.get_transport()
self.pipe_manager = self._get_pipe_manager(self.transport,
self.pipeline_manager)
self.event_pipe_manager = self._get_event_pipeline_manager(
self.transport)
self.partition_coordinator = coordination.PartitionCoordinator()
self.partition_coordinator.start()
if cfg.CONF.notification.workload_partitioning:
self.ctxt = context.get_admin_context()
self.group_id = self.NOTIFICATION_NAMESPACE
else:
@@ -133,14 +153,12 @@ class NotificationService(os_service.Service):
# the notification_topics in another way
# we must create a transport to ensure the options have
# been registered by oslo_messaging
messaging.get_notifier(transport, '')
pipe_manager = self.pipeline_manager
if cfg.CONF.notification.store_events:
event_pipe_manager = self.event_pipeline_manager
messaging.get_notifier(self.transport, '')
self.group_id = None
self.listeners, self.pipeline_listeners = [], []
self._configure_main_queue_listeners(pipe_manager, event_pipe_manager)
self._configure_main_queue_listeners(self.pipe_manager,
self.event_pipe_manager)
if cfg.CONF.notification.workload_partitioning:
self.partition_coordinator.join_group(self.group_id)
@@ -160,6 +178,8 @@ class NotificationService(os_service.Service):
# Add a dummy thread to have wait() working
self.tg.add_timer(604800, lambda: None)
self.init_pipeline_refresh()
def _configure_main_queue_listeners(self, pipe_manager,
event_pipe_manager):
notification_manager = self._get_notifications_manager(pipe_manager)
@@ -231,3 +251,19 @@ class NotificationService(os_service.Service):
self.partition_coordinator.stop()
utils.kill_listeners(self.listeners + self.pipeline_listeners)
super(NotificationService, self).stop()
def reload_pipeline(self):
    """Rebuild the sample pipe manager and restart the listeners.

    Invoked by ``refresh_pipeline()`` of the base service after
    ``self.pipeline_manager`` has been replaced with a freshly loaded
    pipeline. Only the sample pipe manager is rebuilt here; the existing
    ``self.event_pipe_manager`` is reused as-is.
    """
    LOG.info(_LI("Reloading notification agent and listeners."))

    self.pipe_manager = self._get_pipe_manager(
        self.transport, self.pipeline_manager)

    # re-start the main queue listeners.
    utils.kill_listeners(self.listeners)
    # Forget the listeners that were just killed. start() also begins from
    # an empty list before configuring the main queue listeners, and stop()
    # kills everything still referenced by self.listeners.
    # NOTE(review): assumes _configure_main_queue_listeners() adds to
    # self.listeners rather than replacing it -- confirm. Resetting the
    # list is harmless either way.
    self.listeners = []
    self._configure_main_queue_listeners(
        self.pipe_manager, self.event_pipe_manager)

    # re-start the pipeline listeners if workload partitioning
    # is enabled.
    if cfg.CONF.notification.workload_partitioning:
        self._refresh_agent(None)

View File

@@ -19,6 +19,7 @@
import abc
import fnmatch
import hashlib
import os
from oslo_config import cfg
@@ -45,6 +46,15 @@ OPTS = [
default="event_pipeline.yaml",
help="Configuration file for event pipeline definition."
),
cfg.BoolOpt('refresh_pipeline_cfg',
default=False,
help="Refresh Pipeline configuration on-the-fly."
),
cfg.IntOpt('pipeline_polling_interval',
default=20,
help="Polling interval for pipeline file configuration"
" in seconds."
),
]
cfg.CONF.register_opts(OPTS)
@@ -723,3 +733,32 @@ def setup_pipeline(transformer_manager=None):
"""Setup pipeline manager according to yaml config file."""
cfg_file = cfg.CONF.pipeline_cfg_file
return _setup_pipeline_manager(cfg_file, transformer_manager)
def _get_pipeline_cfg_file(p_type=SAMPLE_TYPE):
    """Return the location of the pipeline definition file for *p_type*.

    The event pipeline option is used when *p_type* equals ``EVENT_TYPE``;
    any other value selects the sample pipeline option. If the configured
    value does not exist as given, whatever ``cfg.CONF.find_file`` returns
    for it is handed back instead.
    """
    conf = cfg.CONF
    configured = (conf.event_pipeline_cfg_file if p_type == EVENT_TYPE
                  else conf.pipeline_cfg_file)
    if os.path.exists(configured):
        return configured
    return conf.find_file(configured)
def get_pipeline_mtime(p_type=SAMPLE_TYPE):
    """Return the modification time of the pipeline file for *p_type*.

    The value is whatever ``os.path.getmtime`` reports for the file
    resolved by ``_get_pipeline_cfg_file``.
    """
    return os.path.getmtime(_get_pipeline_cfg_file(p_type))
def get_pipeline_hash(p_type=SAMPLE_TYPE):
    """Return the hex MD5 digest of the pipeline file for *p_type*.

    The digest is only used to detect that the file content changed; it
    is not a security measure.

    The file is read in binary mode so that the digest is computed over
    the bytes on disk. Reading it as text and re-encoding it would make
    the result depend on the locale's default encoding and could fail on
    bytes that cannot be decoded.
    """
    cfg_file = _get_pipeline_cfg_file(p_type)
    with open(cfg_file, 'rb') as fap:
        return hashlib.md5(fap.read()).hexdigest()

View File

@@ -0,0 +1,70 @@
#
# Copyright 2015 Hewlett Packard
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import abc
from oslo_config import cfg
from oslo_log import log
from oslo_service import service as os_service
import six
from ceilometer.i18n import _, _LE, _LI
from ceilometer import pipeline
LOG = log.getLogger(__name__)
@six.add_metaclass(abc.ABCMeta)
class BaseService(os_service.Service):
    """Service base class that can reload the pipeline definition file.

    Subclasses implement ``reload_pipeline()`` and call
    ``init_pipeline_refresh()`` once they are started; when the
    ``refresh_pipeline_cfg`` option is enabled the pipeline file is then
    checked every ``pipeline_polling_interval`` seconds.
    """

    def init_pipeline_refresh(self):
        """Record the current pipeline file state and start polling it.

        Does nothing unless ``refresh_pipeline_cfg`` is enabled.
        """
        if cfg.CONF.refresh_pipeline_cfg:
            # Baseline against which refresh_pipeline() detects changes.
            self.pipeline_mtime = pipeline.get_pipeline_mtime()
            self.pipeline_hash = pipeline.get_pipeline_hash()

            self.tg.add_timer(cfg.CONF.pipeline_polling_interval,
                              self.refresh_pipeline)

    @abc.abstractmethod
    def reload_pipeline(self):
        """Reload pipeline in the agents."""

    def refresh_pipeline(self):
        """Reload the pipeline if its definition file changed on disk.

        The modification time is used as a cheap first check; the file is
        only hashed when the timestamp differs from the recorded one, and
        the pipeline is only rebuilt when the hash differs as well. If the
        new definition cannot be loaded, the failure is logged and
        ``reload_pipeline()`` is not called; the recorded hash is left
        unchanged in that case.
        """
        mtime = pipeline.get_pipeline_mtime()
        # Compare for inequality rather than "newer than": a file that is
        # replaced by one carrying an older timestamp (restored backup,
        # moved-in file) has changed just the same.
        if mtime == self.pipeline_mtime:
            return

        LOG.info(_LI('Pipeline configuration file has been updated.'))
        self.pipeline_mtime = mtime

        _hash = pipeline.get_pipeline_hash()
        if _hash == self.pipeline_hash:
            # Timestamp changed but the content did not.
            return

        LOG.info(_LI("Detected change in pipeline configuration."))
        try:
            self.pipeline_manager = pipeline.setup_pipeline()
            LOG.debug(_("Pipeline has been refreshed. "
                        "old hash: %(old)s, new hash: %(new)s"),
                      {'old': self.pipeline_hash,
                       'new': _hash})
        except Exception as err:
            # The recorded hash is deliberately not updated here. The
            # timestamp was already recorded above, so the broken file is
            # not re-parsed until it is modified again.
            LOG.debug(_("Active pipeline config's hash is %s"),
                      self.pipeline_hash)
            LOG.exception(_LE('Unable to load changed pipeline: %s'), err)
            return

        self.pipeline_hash = _hash
        self.reload_pipeline()

View File

@@ -24,14 +24,20 @@
import abc
import copy
import datetime
import shutil
import eventlet
import mock
from oslo_config import fixture as fixture_config
from oslo_service import service as os_service
from oslo_utils import timeutils
from oslotest import mockpatch
import six
from stevedore import extension
import yaml
from ceilometer.agent import plugin_base
from ceilometer.openstack.common import fileutils
from ceilometer import pipeline
from ceilometer import publisher
from ceilometer.publisher import test as test_publisher
@@ -293,6 +299,102 @@ class BaseAgentManagerTestCase(base.BaseTestCase):
timer_call = mock.call(1.0, self.mgr.partition_coordinator.heartbeat)
self.assertEqual([timer_call], self.mgr.tg.add_timer.call_args_list)
@mock.patch('ceilometer.pipeline.setup_pipeline')
def test_start_with_pipeline_poller(self, setup_pipeline):
    """start() registers the pipeline poller after the heartbeat timer."""
    self.mgr.join_partitioning_groups = mock.MagicMock()
    self.mgr.setup_polling_tasks = mock.MagicMock()
    for opt_name, value, group in (
            ('heartbeat', 1.0, 'coordination'),
            ('refresh_pipeline_cfg', True, None),
            ('pipeline_polling_interval', 5, None)):
        if group is None:
            self.CONF.set_override(opt_name, value)
        else:
            self.CONF.set_override(opt_name, value, group=group)

    self.mgr.start()

    setup_pipeline.assert_called_once_with()
    self.mgr.partition_coordinator.start.assert_called_once_with()
    self.mgr.join_partitioning_groups.assert_called_once_with()
    self.mgr.setup_polling_tasks.assert_called_once_with()
    # With setup_polling_tasks mocked out no pollster timer is added, so
    # only the heartbeat and the pipeline poller are expected, in order.
    expected_timers = [
        mock.call(1.0, self.mgr.partition_coordinator.heartbeat),
        mock.call(5, self.mgr.refresh_pipeline),
    ]
    self.assertEqual(expected_timers,
                     self.mgr.tg.add_timer.call_args_list)
def test_start_with_reloadable_pipeline(self):
    """Rewriting the pipeline file re-targets the running pollsters."""

    def write_pipeline_file(meter_names):
        # Dump a one-source/one-sink pipeline to a temporary file and
        # return the path of that file.
        definition = yaml.dump({
            'sources': [{
                'name': 'test_pipeline',
                'interval': 1,
                'meters': meter_names,
                'resources': ['test://'] if self.source_resources else [],
                'sinks': ['test_sink']}],
            'sinks': [{
                'name': 'test_sink',
                'transformers': [],
                'publishers': ["test"]}]
        })
        if six.PY3:
            definition = definition.encode('utf-8')
        return fileutils.write_to_tempfile(content=definition,
                                           prefix="pipeline",
                                           suffix="yaml")

    def wait_for_samples(pub):
        # Yield to the polling greenthreads until the publisher holds the
        # expected number of samples; gives up after 600 seconds.
        started = timeutils.utcnow()
        while timeutils.delta_seconds(started, timeutils.utcnow()) < 600:
            if len(pub.samples) >= self.expected_samples:
                break
            eventlet.sleep(0)

    self.CONF.set_override('heartbeat', 1.0, group='coordination')
    self.CONF.set_override('refresh_pipeline_cfg', True)
    self.CONF.set_override('pipeline_polling_interval', 2)

    pipeline_cfg_file = write_pipeline_file(['test'])
    self.CONF.set_override("pipeline_cfg_file", pipeline_cfg_file)
    self.mgr.tg = os_service.threadgroup.ThreadGroup(1000)
    self.mgr.start()

    pub = self.mgr.pipeline_manager.pipelines[0].publishers[0]
    self.expected_samples = 1
    wait_for_samples(pub)
    del pub.samples[0].resource_metadata['resources']
    self.assertEqual(self.Pollster.test_data, pub.samples[0])

    # Flush publisher samples to test reloading
    pub.samples = []

    # Modify the collection targets, then move the updated pipeline file
    # over the original path recorded in oslo config.
    updated_pipeline_cfg_file = write_pipeline_file(['testanother'])
    shutil.move(updated_pipeline_cfg_file, pipeline_cfg_file)

    # Fixed sleep, longer than the 2 second polling interval set above,
    # to let the pipeline poller complete the reloading.
    eventlet.sleep(3)

    # The pipeline manager was replaced by the reload; fetch the publisher
    # again instead of reusing the earlier reference.
    pub = self.mgr.pipeline_manager.pipelines[0].publishers[0]
    self.expected_samples = 1
    wait_for_samples(pub)
    del pub.samples[0].resource_metadata['resources']
    self.assertEqual(self.PollsterAnother.test_data, pub.samples[0])
def test_join_partitioning_groups(self):
self.mgr.discovery_manager = self.create_discovery_manager()
self.mgr.join_partitioning_groups()

View File

@@ -14,6 +14,8 @@
# under the License.
"""Tests for Ceilometer notify daemon."""
import shutil
import eventlet
import mock
from oslo_config import fixture as fixture_config
@@ -171,16 +173,12 @@ class TestNotification(tests_base.BaseTestCase):
class BaseRealNotification(tests_base.BaseTestCase):
def setUp(self):
super(BaseRealNotification, self).setUp()
self.CONF = self.useFixture(fixture_config.Config()).conf
self.setup_messaging(self.CONF, 'nova')
def setup_pipeline(self, counter_names):
pipeline = yaml.dump({
'sources': [{
'name': 'test_pipeline',
'interval': 5,
'meters': ['instance', 'memory'],
'meters': counter_names,
'sinks': ['test_sink']
}],
'sinks': [{
@@ -191,12 +189,22 @@ class BaseRealNotification(tests_base.BaseTestCase):
})
if six.PY3:
pipeline = pipeline.encode('utf-8')
self.expected_samples = 2
pipeline_cfg_file = fileutils.write_to_tempfile(content=pipeline,
prefix="pipeline",
suffix="yaml")
return pipeline_cfg_file
def setUp(self):
super(BaseRealNotification, self).setUp()
self.CONF = self.useFixture(fixture_config.Config()).conf
self.setup_messaging(self.CONF, 'nova')
pipeline_cfg_file = self.setup_pipeline(['instance', 'memory'])
self.CONF.set_override("pipeline_cfg_file", pipeline_cfg_file)
self.expected_samples = 2
self.CONF.set_override("store_events", True, group="notification")
self.CONF.set_override("disable_non_metric_meters", False,
group="notification")
@@ -245,6 +253,79 @@ class BaseRealNotification(tests_base.BaseTestCase):
self.assertEqual(["9f9d01b9-4a58-4271-9e27-398b21ab20d1"], resources)
class TestRealNotificationReloadablePipeline(BaseRealNotification):
    """Notification agent tests with pipeline refreshing switched on."""

    def setUp(self):
        super(TestRealNotificationReloadablePipeline, self).setUp()
        self.CONF.set_override('refresh_pipeline_cfg', True)
        self.CONF.set_override('pipeline_polling_interval', 1)
        self.srv = notification.NotificationService()

    def _notify_and_wait(self):
        # Emit one instance-create notification, then yield until the
        # expected numbers of samples and events have been published
        # (gives up after 600 seconds).
        notifier = messaging.get_notifier(self.transport,
                                          "compute.vagrant-precise")
        notifier.info(context.RequestContext(), 'compute.instance.create.end',
                      TEST_NOTICE_PAYLOAD)
        start = timeutils.utcnow()
        while timeutils.delta_seconds(start, timeutils.utcnow()) < 600:
            if (len(self.publisher.samples) >= self.expected_samples and
                    len(self.publisher.events) >= self.expected_events):
                break
            eventlet.sleep(0)

    @mock.patch('ceilometer.publisher.test.TestPublisher')
    def test_notification_pipeline_poller(self, fake_publisher_cls):
        """start() registers the pipeline poller with the thread group."""
        fake_publisher_cls.return_value = self.publisher
        self.srv.tg = mock.MagicMock()
        self.srv.start()

        pipeline_poller_call = mock.call(1, self.srv.refresh_pipeline)
        self.assertIn(pipeline_poller_call,
                      self.srv.tg.add_timer.call_args_list)

    @mock.patch('ceilometer.publisher.test.TestPublisher')
    def test_notification_reloaded_pipeline(self, fake_publisher_cls):
        """Samples follow the meters of a rewritten pipeline file."""
        fake_publisher_cls.return_value = self.publisher

        pipeline_cfg_file = self.setup_pipeline(['instance'])
        self.CONF.set_override("pipeline_cfg_file", pipeline_cfg_file)

        self.expected_samples = 1
        self.srv.start()

        self._notify_and_wait()
        self.assertEqual(self.expected_samples, len(self.publisher.samples))

        # Flush publisher samples to test reloading
        self.publisher.samples = []
        # Modify the collection targets
        updated_pipeline_cfg_file = self.setup_pipeline(['vcpus',
                                                         'disk.root.size'])
        # Move/re-name the updated pipeline file to the original pipeline
        # file path as recorded in oslo config
        shutil.move(updated_pipeline_cfg_file, pipeline_cfg_file)

        self.expected_samples = 2
        # Fixed sleep, longer than the 1 second polling interval set in
        # setUp, to let the pipeline poller complete the reloading
        eventlet.sleep(3)

        # Send message again to verify the reload works
        self._notify_and_wait()
        self.assertEqual(self.expected_samples, len(self.publisher.samples))

        # Use a real loop: a bare generator expression is never consumed,
        # so assertions placed inside one would silently never run.
        for sample in self.publisher.samples:
            self.assertIn(sample.name, ['disk.root.size', 'vcpus'])
class TestRealNotification(BaseRealNotification):
def setUp(self):