Implementation of dynamically reloadable pipeline

Adds the ability to poll the file-based pipeline configuration and, when it changes, to reload it so that collection targets are activated/deactivated on the fly.

Change-Id: I93fa33a167db81bb8a891d668c0714e627214d11
Partially-Implements: blueprint reload-file-based-pipeline-configuration
2015-06-11 22:10:51 +00:00
parent 34c6eb03ac
commit bd8cdbafa6
6 changed files with 402 additions and 44 deletions
--- a/ceilometer/agent/base.py
+++ b/ceilometer/agent/base.py
@@ -26,7 +26,6 @@ import random
 from oslo_config import cfg
 from oslo_context import context
 from oslo_log import log
-from oslo_service import service as os_service
 import six
 from six import moves
 from six.moves.urllib import parse as urlparse
@@ -34,8 +33,9 @@ from stevedore import extension

 from ceilometer.agent import plugin_base
 from ceilometer import coordination
-from ceilometer.i18n import _
+from ceilometer.i18n import _, _LI
 from ceilometer import pipeline as publish_pipeline
+from ceilometer import service_base
 from ceilometer import utils

 LOG = log.getLogger(__name__)
@@ -191,7 +191,7 @@ class PollingTask(object):
                            exc_info=True)


-class AgentManager(os_service.Service):
+class AgentManager(service_base.BaseService):

    def __init__(self, namespaces, pollster_list, group_prefix=None):
        # features of using coordination and pollster-list are exclusive, and
@@ -256,16 +256,16 @@ class AgentManager(os_service.Service):
        )

    def join_partitioning_groups(self):
-        groups = set([self.construct_group_id(d.obj.group_id)
-                      for d in self.discovery_manager])
+        self.groups = set([self.construct_group_id(d.obj.group_id)
+                          for d in self.discovery_manager])
        # let each set of statically-defined resources have its own group
        static_resource_groups = set([
            self.construct_group_id(utils.hash_of_set(p.resources))
            for p in self.pipeline_manager.pipelines
            if p.resources
        ])
-        groups.update(static_resource_groups)
-        for group in groups:
+        self.groups.update(static_resource_groups)
+        for group in self.groups:
            self.partition_coordinator.join_group(group)

    def create_polling_task(self):
@@ -290,12 +290,7 @@ class AgentManager(os_service.Service):
                           discovery_group_id)
                if discovery_group_id else None)

-    def start(self):
-        self.pipeline_manager = publish_pipeline.setup_pipeline()
-
-        self.partition_coordinator.start()
-        self.join_partitioning_groups()
-
+    def configure_polling_tasks(self):
        # allow time for coordination if necessary
        delay_start = self.partition_coordinator.is_active()

@@ -303,16 +298,29 @@ class AgentManager(os_service.Service):
        delay_polling_time = random.randint(
            0, cfg.CONF.shuffle_time_before_polling_task)

+        pollster_timers = []
        for interval, task in six.iteritems(self.setup_polling_tasks()):
            delay_time = (interval + delay_polling_time if delay_start
                          else delay_polling_time)
-            self.tg.add_timer(interval,
-                              self.interval_task,
-                              initial_delay=delay_time,
-                              task=task)
+            pollster_timers.append(self.tg.add_timer(interval,
+                                   self.interval_task,
+                                   initial_delay=delay_time,
+                                   task=task))
        self.tg.add_timer(cfg.CONF.coordination.heartbeat,
                          self.partition_coordinator.heartbeat)

+        return pollster_timers
+
+    def start(self):
+        self.pipeline_manager = publish_pipeline.setup_pipeline()
+
+        self.partition_coordinator.start()
+        self.join_partitioning_groups()
+
+        self.pollster_timers = self.configure_polling_tasks()
+
+        self.init_pipeline_refresh()
+
    def stop(self):
        if self.partition_coordinator:
            self.partition_coordinator.stop()
@@ -356,3 +364,25 @@ class AgentManager(os_service.Service):
            else:
                LOG.warning(_('Unknown discovery extension: %s') % name)
        return resources
+
+    def stop_pollsters(self):
+        for x in self.pollster_timers:
+            try:
+                x.stop()
+                self.tg.timer_done(x)
+            except Exception:
+                LOG.error(_('Error stopping pollster.'), exc_info=True)
+        self.pollster_timers = []
+
+    def reload_pipeline(self):
+        LOG.info(_LI("Reconfiguring polling tasks."))
+
+        # stop existing pollsters and leave partitioning groups
+        self.stop_pollsters()
+        for group in self.groups:
+            self.partition_coordinator.leave_group(group)
+
+        # re-create partitioning groups according to pipeline
+        # and configure polling tasks with latest pipeline conf
+        self.join_partitioning_groups()
+        self.pollster_timers = self.configure_polling_tasks()
--- a/ceilometer/notification.py
+++ b/ceilometer/notification.py
@@ -17,15 +17,15 @@ from oslo_config import cfg
 from oslo_context import context
 from oslo_log import log
 import oslo_messaging
-from oslo_service import service as os_service
 from stevedore import extension

 from ceilometer.agent import plugin_base as base
 from ceilometer import coordination
 from ceilometer.event import endpoint as event_endpoint
-from ceilometer.i18n import _, _LW
+from ceilometer.i18n import _, _LI, _LW
 from ceilometer import messaging
 from ceilometer import pipeline
+from ceilometer import service_base
 from ceilometer import utils


@@ -66,7 +66,7 @@ cfg.CONF.import_opt('telemetry_driver', 'ceilometer.publisher.messaging',
                    group='publisher_notifier')


-class NotificationService(os_service.Service):
+class NotificationService(service_base.BaseService):
    """Notification service.

    When running multiple agents, additional queuing sequence is required for
@@ -100,30 +100,50 @@ class NotificationService(os_service.Service):
            publisher_id='ceilometer.notification',
            topic='%s-%s' % (self.NOTIFICATION_IPC, pipe.name))

-    def start(self):
-        super(NotificationService, self).start()
-        self.pipeline_manager = pipeline.setup_pipeline()
-        if cfg.CONF.notification.store_events:
-            self.event_pipeline_manager = pipeline.setup_event_pipeline()
+    def _get_pipe_manager(self, transport, pipeline_manager):

-        transport = messaging.get_transport()
-        self.partition_coordinator = coordination.PartitionCoordinator()
-        self.partition_coordinator.start()
-
-        event_pipe_manager = None
        if cfg.CONF.notification.workload_partitioning:
            pipe_manager = pipeline.SamplePipelineTransportManager()
-            for pipe in self.pipeline_manager.pipelines:
+            for pipe in pipeline_manager.pipelines:
                pipe_manager.add_transporter(
                    (pipe.source.support_meter,
                     self._get_notifier(transport, pipe)))
-            if cfg.CONF.notification.store_events:
+        else:
+            pipe_manager = pipeline_manager
+
+        return pipe_manager
+
+    def _get_event_pipeline_manager(self, transport):
+
+        if cfg.CONF.notification.store_events:
+            self.event_pipeline_manager = pipeline.setup_event_pipeline()
+
+            if cfg.CONF.notification.workload_partitioning:
                event_pipe_manager = pipeline.EventPipelineTransportManager()
                for pipe in self.event_pipeline_manager.pipelines:
                    event_pipe_manager.add_transporter(
                        (pipe.source.support_event,
                         self._get_notifier(transport, pipe)))
+            else:
+                event_pipe_manager = self.event_pipeline_manager

+            return event_pipe_manager
+
+    def start(self):
+        super(NotificationService, self).start()
+
+        self.pipeline_manager = pipeline.setup_pipeline()
+        self.transport = messaging.get_transport()
+
+        self.pipe_manager = self._get_pipe_manager(self.transport,
+                                                   self.pipeline_manager)
+        self.event_pipe_manager = self._get_event_pipeline_manager(
+            self.transport)
+
+        self.partition_coordinator = coordination.PartitionCoordinator()
+        self.partition_coordinator.start()
+
+        if cfg.CONF.notification.workload_partitioning:
            self.ctxt = context.get_admin_context()
            self.group_id = self.NOTIFICATION_NAMESPACE
        else:
@@ -133,14 +153,12 @@ class NotificationService(os_service.Service):
            # the notification_topics in an other way
            # we must create a transport to ensure the option have
            # beeen registered by oslo_messaging
-            messaging.get_notifier(transport, '')
-            pipe_manager = self.pipeline_manager
-            if cfg.CONF.notification.store_events:
-                event_pipe_manager = self.event_pipeline_manager
+            messaging.get_notifier(self.transport, '')
            self.group_id = None

        self.listeners, self.pipeline_listeners = [], []
-        self._configure_main_queue_listeners(pipe_manager, event_pipe_manager)
+        self._configure_main_queue_listeners(self.pipe_manager,
+                                             self.event_pipe_manager)

        if cfg.CONF.notification.workload_partitioning:
            self.partition_coordinator.join_group(self.group_id)
@@ -160,6 +178,8 @@ class NotificationService(os_service.Service):
        # Add a dummy thread to have wait() working
        self.tg.add_timer(604800, lambda: None)

+        self.init_pipeline_refresh()
+
    def _configure_main_queue_listeners(self, pipe_manager,
                                        event_pipe_manager):
        notification_manager = self._get_notifications_manager(pipe_manager)
@@ -231,3 +251,19 @@ class NotificationService(os_service.Service):
            self.partition_coordinator.stop()
        utils.kill_listeners(self.listeners + self.pipeline_listeners)
        super(NotificationService, self).stop()
+
+    def reload_pipeline(self):
+        LOG.info(_LI("Reloading notification agent and listeners."))
+
+        self.pipe_manager = self._get_pipe_manager(
+            self.transport, self.pipeline_manager)
+
+        # re-start the main queue listeners.
+        utils.kill_listeners(self.listeners)
+        self._configure_main_queue_listeners(
+            self.pipe_manager, self.event_pipe_manager)
+
+        # re-start the pipeline listeners if workload partitioning
+        # is enabled.
+        if cfg.CONF.notification.workload_partitioning:
+            self._refresh_agent(None)
--- a/ceilometer/pipeline.py
+++ b/ceilometer/pipeline.py
@@ -19,6 +19,7 @@

 import abc
 import fnmatch
+import hashlib
 import os

 from oslo_config import cfg
@@ -45,6 +46,15 @@ OPTS = [
               default="event_pipeline.yaml",
               help="Configuration file for event pipeline definition."
               ),
+    cfg.BoolOpt('refresh_pipeline_cfg',
+                default=False,
+                help="Refresh Pipeline configuration on-the-fly."
+                ),
+    cfg.IntOpt('pipeline_polling_interval',
+               default=20,
+               help="Polling interval for pipeline file configuration"
+                    " in seconds."
+               ),
 ]

 cfg.CONF.register_opts(OPTS)
@@ -723,3 +733,32 @@ def setup_pipeline(transformer_manager=None):
    """Setup pipeline manager according to yaml config file."""
    cfg_file = cfg.CONF.pipeline_cfg_file
    return _setup_pipeline_manager(cfg_file, transformer_manager)
+
+
+def _get_pipeline_cfg_file(p_type=SAMPLE_TYPE):
+    if p_type == EVENT_TYPE:
+        cfg_file = cfg.CONF.event_pipeline_cfg_file
+    else:
+        cfg_file = cfg.CONF.pipeline_cfg_file
+
+    if not os.path.exists(cfg_file):
+        cfg_file = cfg.CONF.find_file(cfg_file)
+
+    return cfg_file
+
+
+def get_pipeline_mtime(p_type=SAMPLE_TYPE):
+    cfg_file = _get_pipeline_cfg_file(p_type)
+    return os.path.getmtime(cfg_file)
+
+
+def get_pipeline_hash(p_type=SAMPLE_TYPE):
+    """Return the MD5 hex digest of a pipeline configuration file.
+
+    :param p_type: pipeline type; EVENT_TYPE selects the event pipeline
+                   file, anything else the sample pipeline file.
+    """
+    cfg_file = _get_pipeline_cfg_file(p_type)
+    # Hash raw bytes so no locale-dependent decode/re-encode is involved.
+    with open(cfg_file, 'rb') as fap:
+        return hashlib.md5(fap.read()).hexdigest()
--- a/ceilometer/service_base.py
+++ b/ceilometer/service_base.py
@@ -0,0 +1,70 @@
+#
+# Copyright 2015 Hewlett Packard
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import abc
+
+from oslo_config import cfg
+from oslo_log import log
+from oslo_service import service as os_service
+import six
+
+from ceilometer.i18n import _, _LE, _LI
+from ceilometer import pipeline
+
+LOG = log.getLogger(__name__)
+
+
+@six.add_metaclass(abc.ABCMeta)
+class BaseService(os_service.Service):
+    """Service base class that can poll and reload the pipeline file."""
+
+    def init_pipeline_refresh(self):
+        """Start polling the pipeline file if refresh_pipeline_cfg is set."""
+        if cfg.CONF.refresh_pipeline_cfg:
+            self.pipeline_mtime = pipeline.get_pipeline_mtime()
+            self.pipeline_hash = pipeline.get_pipeline_hash()
+            self.tg.add_timer(cfg.CONF.pipeline_polling_interval,
+                              self.refresh_pipeline)
+
+    @abc.abstractmethod
+    def reload_pipeline(self):
+        """Reload pipeline in the agents."""
+
+    def refresh_pipeline(self):
+        """Reload the pipeline if the configuration file content changed."""
+        mtime = pipeline.get_pipeline_mtime()
+        # Use != (not >): a file restored or moved into place may be older.
+        if mtime == self.pipeline_mtime:
+            return
+        LOG.info(_LI('Pipeline configuration file has been updated.'))
+        self.pipeline_mtime = mtime
+        # A new timestamp alone proves nothing; compare content hashes.
+        _hash = pipeline.get_pipeline_hash()
+        if _hash == self.pipeline_hash:
+            return
+        LOG.info(_LI("Detected change in pipeline configuration."))
+
+        try:
+            self.pipeline_manager = pipeline.setup_pipeline()
+        except Exception as err:
+            LOG.debug(_("Active pipeline config's hash is %s") %
+                      self.pipeline_hash)
+            LOG.exception(_LE('Unable to load changed pipeline: %s') % err)
+            return
+        LOG.debug(_("Pipeline has been refreshed. "
+                    "old hash: %(old)s, new hash: %(new)s") %
+                  ({'old': self.pipeline_hash, 'new': _hash}))
+        self.pipeline_hash = _hash
+        self.reload_pipeline()
--- a/ceilometer/tests/agent/agentbase.py
+++ b/ceilometer/tests/agent/agentbase.py
@@ -24,14 +24,20 @@
 import abc
 import copy
 import datetime
+import shutil

+import eventlet
 import mock
 from oslo_config import fixture as fixture_config
+from oslo_service import service as os_service
+from oslo_utils import timeutils
 from oslotest import mockpatch
 import six
 from stevedore import extension
+import yaml

 from ceilometer.agent import plugin_base
+from ceilometer.openstack.common import fileutils
 from ceilometer import pipeline
 from ceilometer import publisher
 from ceilometer.publisher import test as test_publisher
@@ -293,6 +299,102 @@ class BaseAgentManagerTestCase(base.BaseTestCase):
        timer_call = mock.call(1.0, self.mgr.partition_coordinator.heartbeat)
        self.assertEqual([timer_call], self.mgr.tg.add_timer.call_args_list)

+    @mock.patch('ceilometer.pipeline.setup_pipeline')
+    def test_start_with_pipeline_poller(self, setup_pipeline):
+        self.mgr.join_partitioning_groups = mock.MagicMock()
+        self.mgr.setup_polling_tasks = mock.MagicMock()
+
+        self.CONF.set_override('heartbeat', 1.0, group='coordination')
+        self.CONF.set_override('refresh_pipeline_cfg', True)
+        self.CONF.set_override('pipeline_polling_interval', 5)
+        self.mgr.start()
+        setup_pipeline.assert_called_once_with()
+        self.mgr.partition_coordinator.start.assert_called_once_with()
+        self.mgr.join_partitioning_groups.assert_called_once_with()
+        self.mgr.setup_polling_tasks.assert_called_once_with()
+        timer_call = mock.call(1.0, self.mgr.partition_coordinator.heartbeat)
+        pipeline_poller_call = mock.call(5, self.mgr.refresh_pipeline)
+        self.assertEqual([timer_call, pipeline_poller_call],
+                         self.mgr.tg.add_timer.call_args_list)
+
+    def test_start_with_reloadable_pipeline(self):
+
+        def setup_pipeline_file(pipeline):
+            if six.PY3:
+                pipeline = pipeline.encode('utf-8')
+
+            pipeline_cfg_file = fileutils.write_to_tempfile(content=pipeline,
+                                                            prefix="pipeline",
+                                                            suffix="yaml")
+            return pipeline_cfg_file
+
+        self.CONF.set_override('heartbeat', 1.0, group='coordination')
+        self.CONF.set_override('refresh_pipeline_cfg', True)
+        self.CONF.set_override('pipeline_polling_interval', 2)
+
+        pipeline = yaml.dump({
+            'sources': [{
+                'name': 'test_pipeline',
+                'interval': 1,
+                'meters': ['test'],
+                'resources': ['test://'] if self.source_resources else [],
+                'sinks': ['test_sink']}],
+            'sinks': [{
+                'name': 'test_sink',
+                'transformers': [],
+                'publishers': ["test"]}]
+        })
+
+        pipeline_cfg_file = setup_pipeline_file(pipeline)
+
+        self.CONF.set_override("pipeline_cfg_file", pipeline_cfg_file)
+        self.mgr.tg = os_service.threadgroup.ThreadGroup(1000)
+        self.mgr.start()
+        pub = self.mgr.pipeline_manager.pipelines[0].publishers[0]
+        self.expected_samples = 1
+        start = timeutils.utcnow()
+        while timeutils.delta_seconds(start, timeutils.utcnow()) < 600:
+            if len(pub.samples) >= self.expected_samples:
+                break
+            eventlet.sleep(0)
+
+        del pub.samples[0].resource_metadata['resources']
+        self.assertEqual(self.Pollster.test_data, pub.samples[0])
+
+        # Flush publisher samples to test reloading
+        pub.samples = []
+        # Modify the collection targets
+        pipeline = yaml.dump({
+            'sources': [{
+                'name': 'test_pipeline',
+                'interval': 1,
+                'meters': ['testanother'],
+                'resources': ['test://'] if self.source_resources else [],
+                'sinks': ['test_sink']}],
+            'sinks': [{
+                'name': 'test_sink',
+                'transformers': [],
+                'publishers': ["test"]}]
+        })
+
+        updated_pipeline_cfg_file = setup_pipeline_file(pipeline)
+        # Move/re-name the updated pipeline file to the original pipeline
+        # file path as recorded in oslo config
+        shutil.move(updated_pipeline_cfg_file, pipeline_cfg_file)
+        # Random sleep to let the pipeline poller complete the reloading
+        eventlet.sleep(3)
+
+        pub = self.mgr.pipeline_manager.pipelines[0].publishers[0]
+        self.expected_samples = 1
+        start = timeutils.utcnow()
+        while timeutils.delta_seconds(start, timeutils.utcnow()) < 600:
+            if len(pub.samples) >= self.expected_samples:
+                break
+            eventlet.sleep(0)
+
+        del pub.samples[0].resource_metadata['resources']
+        self.assertEqual(self.PollsterAnother.test_data, pub.samples[0])
+
    def test_join_partitioning_groups(self):
        self.mgr.discovery_manager = self.create_discovery_manager()
        self.mgr.join_partitioning_groups()
--- a/ceilometer/tests/test_notification.py
+++ b/ceilometer/tests/test_notification.py
@@ -14,6 +14,8 @@
 # under the License.
 """Tests for Ceilometer notify daemon."""

+import shutil
+
 import eventlet
 import mock
 from oslo_config import fixture as fixture_config
@@ -171,16 +173,12 @@ class TestNotification(tests_base.BaseTestCase):


 class BaseRealNotification(tests_base.BaseTestCase):
-    def setUp(self):
-        super(BaseRealNotification, self).setUp()
-        self.CONF = self.useFixture(fixture_config.Config()).conf
-        self.setup_messaging(self.CONF, 'nova')
-
+    def setup_pipeline(self, counter_names):
        pipeline = yaml.dump({
            'sources': [{
                'name': 'test_pipeline',
                'interval': 5,
-                'meters': ['instance', 'memory'],
+                'meters': counter_names,
                'sinks': ['test_sink']
            }],
            'sinks': [{
@@ -191,12 +189,22 @@ class BaseRealNotification(tests_base.BaseTestCase):
        })
        if six.PY3:
            pipeline = pipeline.encode('utf-8')
-        self.expected_samples = 2
+
        pipeline_cfg_file = fileutils.write_to_tempfile(content=pipeline,
                                                        prefix="pipeline",
                                                        suffix="yaml")
+        return pipeline_cfg_file
+
+    def setUp(self):
+        super(BaseRealNotification, self).setUp()
+        self.CONF = self.useFixture(fixture_config.Config()).conf
+        self.setup_messaging(self.CONF, 'nova')
+
+        pipeline_cfg_file = self.setup_pipeline(['instance', 'memory'])
        self.CONF.set_override("pipeline_cfg_file", pipeline_cfg_file)

+        self.expected_samples = 2
+
        self.CONF.set_override("store_events", True, group="notification")
        self.CONF.set_override("disable_non_metric_meters", False,
                               group="notification")
@@ -245,6 +253,79 @@ class BaseRealNotification(tests_base.BaseTestCase):
        self.assertEqual(["9f9d01b9-4a58-4271-9e27-398b21ab20d1"], resources)


+class TestRealNotificationReloadablePipeline(BaseRealNotification):
+
+    def setUp(self):
+        super(TestRealNotificationReloadablePipeline, self).setUp()
+        self.CONF.set_override('refresh_pipeline_cfg', True)
+        self.CONF.set_override('pipeline_polling_interval', 1)
+        self.srv = notification.NotificationService()
+
+    @mock.patch('ceilometer.publisher.test.TestPublisher')
+    def test_notification_pipeline_poller(self, fake_publisher_cls):
+        fake_publisher_cls.return_value = self.publisher
+        self.srv.tg = mock.MagicMock()
+        self.srv.start()
+
+        pipeline_poller_call = mock.call(1, self.srv.refresh_pipeline)
+        self.assertIn(pipeline_poller_call,
+                      self.srv.tg.add_timer.call_args_list)
+
+    @mock.patch('ceilometer.publisher.test.TestPublisher')
+    def test_notification_reloaded_pipeline(self, fake_publisher_cls):
+        fake_publisher_cls.return_value = self.publisher
+
+        pipeline_cfg_file = self.setup_pipeline(['instance'])
+        self.CONF.set_override("pipeline_cfg_file", pipeline_cfg_file)
+
+        self.expected_samples = 1
+        self.srv.start()
+
+        notifier = messaging.get_notifier(self.transport,
+                                          "compute.vagrant-precise")
+        notifier.info(context.RequestContext(), 'compute.instance.create.end',
+                      TEST_NOTICE_PAYLOAD)
+
+        start = timeutils.utcnow()
+        while timeutils.delta_seconds(start, timeutils.utcnow()) < 600:
+            if (len(self.publisher.samples) >= self.expected_samples and
+                    len(self.publisher.events) >= self.expected_events):
+                break
+            eventlet.sleep(0)
+
+        self.assertEqual(self.expected_samples, len(self.publisher.samples))
+
+        # Flush publisher samples to test reloading
+        self.publisher.samples = []
+        # Modify the collection targets
+        updated_pipeline_cfg_file = self.setup_pipeline(['vcpus',
+                                                         'disk.root.size'])
+        # Move/re-name the updated pipeline file to the original pipeline
+        # file path as recorded in oslo config
+        shutil.move(updated_pipeline_cfg_file, pipeline_cfg_file)
+
+        self.expected_samples = 2
+        # Sleep for a few poll intervals so the poller can finish reloading
+        eventlet.sleep(3)
+        # Send message again to verify the reload works
+        notifier = messaging.get_notifier(self.transport,
+                                          "compute.vagrant-precise")
+        notifier.info(context.RequestContext(), 'compute.instance.create.end',
+                      TEST_NOTICE_PAYLOAD)
+
+        start = timeutils.utcnow()
+        while timeutils.delta_seconds(start, timeutils.utcnow()) < 600:
+            if (len(self.publisher.samples) >= self.expected_samples and
+                    len(self.publisher.events) >= self.expected_events):
+                break
+            eventlet.sleep(0)
+
+        self.assertEqual(self.expected_samples, len(self.publisher.samples))
+
+        for sample in self.publisher.samples:
+            self.assertIn(sample.name, ['disk.root.size', 'vcpus'])
+
+
 class TestRealNotification(BaseRealNotification):

    def setUp(self):