diff --git a/doc/source/admin/metrics.rst b/doc/source/admin/metrics.rst index f435a50c57..733c6569b3 100644 --- a/doc/source/admin/metrics.rst +++ b/doc/source/admin/metrics.rst @@ -17,8 +17,11 @@ These performance measurements, herein referred to as "metrics", can be emitted from the Bare Metal service, including ironic-api, ironic-conductor, and ironic-python-agent. By default, none of the services will emit metrics. -Configuring the Bare Metal Service to Enable Metrics -==================================================== +It is important to stress that not only statsd is supported for metrics +collection and transmission. This is covered later on in our documentation. + +Configuring the Bare Metal Service to Enable Metrics with Statsd +================================================================ Enabling metrics in ironic-api and ironic-conductor --------------------------------------------------- @@ -62,6 +65,30 @@ in the ironic configuration file as well:: agent_statsd_host = 198.51.100.2 agent_statsd_port = 8125 +.. Note:: + Use of a different metrics backend with the agent is not presently + supported. + +Transmission to the Message Bus Notifier +======================================== + +Regardless of whether you're using Ceilometer, +`ironic-prometheus-exporter `_, +or some scripting you wrote to consume the message bus notifications, +metrics data can be sent to the message bus notifier from the timer methods +*and* additional gauge counters by utilizing the ``[metrics]backend`` +configuration option and setting it to ``collector``. When this is the case, +information is cached locally and periodically sent along with the general sensor +data update to the messaging notifier, which can be consumed off of the message bus, +or via notifier plugin (such as is done with ironic-prometheus-exporter). + +.. NOTE:: + Transmission of timer data only works for the Conductor or ``single-process`` + Ironic service model.
A separate webserver process presently does not have + the capability of triggering the call to retrieve and transmit the data. + +.. NOTE:: + This functionality requires ironic-lib version 5.4.0 to be installed. Types of Metrics Emitted ======================== @@ -79,6 +106,9 @@ additional load before enabling metrics. To see which metrics have changed names or have been removed between releases, refer to the `ironic release notes `_. +Additional conductor metrics in the form of counts will also be generated in +limited locations where pertinent to the activity of the conductor. + .. note:: With the default statsd configuration, each timing metric may create additional metrics due to how statsd handles timing metrics. For more diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index a5817cf2e8..886e3c4fe5 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -98,6 +98,8 @@ class ConductorManager(base_manager.BaseConductorManager): def __init__(self, host, topic): super(ConductorManager, self).__init__(host, topic) + # NOTE(TheJulia): This is less a metric-able count, but a means to + # sort out nodes and prioritise a subset (of non-responding nodes). self.power_state_sync_count = collections.defaultdict(int) @METRICS.timer('ConductorManager._clean_up_caches') @@ -1433,6 +1435,11 @@ class ConductorManager(base_manager.BaseConductorManager): finally: waiters.wait_for_all(futures) + # report a count of the nodes + METRICS.send_gauge( + 'ConductorManager.PowerSyncNodesCount', + len(nodes)) + def _sync_power_state_nodes_task(self, context, nodes): """Invokes power state sync on nodes from synchronized queue. @@ -1451,6 +1458,7 @@ class ConductorManager(base_manager.BaseConductorManager): can do here to avoid failing a brand new deploy to a node that we've locked here, though.
""" + # FIXME(comstud): Since our initial state checks are outside # of the lock (to try to avoid the lock), some checks are # repeated after grabbing the lock so we can unlock quickly. @@ -1497,6 +1505,12 @@ class ConductorManager(base_manager.BaseConductorManager): LOG.info("During sync_power_state, node %(node)s was not " "found and presumed deleted by another process.", {'node': node_uuid}) + # TODO(TheJulia): The chance exists that we orphan a node + # in power_state_sync_count, albeit it is not much data, + # it could eventually cause the memory footprint to grow + # on an exceptionally large ironic deployment. We should + # make sure we clean it up at some point, but overall given + # minimal impact, it is definite low hanging fruit. except exception.NodeLocked: LOG.info("During sync_power_state, node %(node)s was " "already locked by another process. Skip.", @@ -1513,6 +1527,7 @@ class ConductorManager(base_manager.BaseConductorManager): # regular power state checking, maintenance is still a required # condition. filters={'maintenance': True, 'fault': faults.POWER_FAILURE}, + node_count_metric_name='ConductorManager.PowerSyncRecoveryNodeCount', ) def _power_failure_recovery(self, task, context): """Periodic task to check power states for nodes in maintenance. @@ -1859,6 +1874,7 @@ class ConductorManager(base_manager.BaseConductorManager): predicate=lambda n, m: n.conductor_affinity != m.conductor.id, limit=lambda: CONF.conductor.periodic_max_workers, shared_task=False, + node_count_metric_name='ConductorManager.SyncLocalStateNodeCount', ) def _sync_local_state(self, task, context): """Perform any actions necessary to sync local state. 
@@ -2644,14 +2660,63 @@ class ConductorManager(base_manager.BaseConductorManager): # Yield on every iteration eventlet.sleep(0) + def _sensors_conductor(self, context): + """Called to collect and send metrics "sensors" for the conductor.""" + # populate the message which will be sent to ceilometer + # or other data consumer + message = {'message_id': uuidutils.generate_uuid(), + 'timestamp': datetime.datetime.utcnow(), + 'hostname': self.host} + + try: + ev_type = 'ironic.metrics' + message['event_type'] = ev_type + '.update' + sensors_data = METRICS.get_metrics_data() + except AttributeError: + # TODO(TheJulia): Remove this at some point, but right now + # don't inherently break on version mismatches when people + # disregard requriements. + LOG.warning( + 'get_sensors_data has been configured to collect ' + 'conductor metrics, however the installed ironic-lib ' + 'library lacks the functionality. Please update ' + 'ironic-lib to a minimum of version 5.4.0.') + except Exception as e: + LOG.exception( + "An unknown error occured while attempting to collect " + "sensor data from within the conductor. Error: %(error)s", + {'error': e}) + else: + message['payload'] = ( + self._filter_out_unsupported_types(sensors_data)) + if message['payload']: + self.sensors_notifier.info( + context, ev_type, message) + @METRICS.timer('ConductorManager._send_sensor_data') - @periodics.periodic(spacing=CONF.conductor.send_sensor_data_interval, - enabled=CONF.conductor.send_sensor_data) + @periodics.periodic(spacing=CONF.sensor_data.interval, + enabled=CONF.sensor_data.send_sensor_data) def _send_sensor_data(self, context): """Periodically collects and transmits sensor data notifications.""" + if CONF.sensor_data.enable_for_conductor: + if CONF.sensor_data.workers == 1: + # Directly call the sensors_conductor when only one + # worker is permitted, so we collect data serially + # instead. 
+ self._sensors_conductor(context) + else: + # Also, do not apply the general threshold limit to + # the self collection of "sensor" data from the conductor, + # as were not launching external processes, we're just reading + # from an internal data structure, if we can. + self._spawn_worker(self._sensors_conductor, context) + if not CONF.sensor_data.enable_for_nodes: + # NOTE(TheJulia): If node sensor data is not required, then + # skip the rest of this method. + return filters = {} - if not CONF.conductor.send_sensor_data_for_undeployed_nodes: + if not CONF.sensor_data.enable_for_undeployed_nodes: filters['provision_state'] = states.ACTIVE nodes = queue.Queue() @@ -2659,7 +2724,7 @@ class ConductorManager(base_manager.BaseConductorManager): filters=filters): nodes.put_nowait(node_info) - number_of_threads = min(CONF.conductor.send_sensor_data_workers, + number_of_threads = min(CONF.sensor_data.workers, nodes.qsize()) futures = [] for thread_number in range(number_of_threads): @@ -2675,7 +2740,7 @@ class ConductorManager(base_manager.BaseConductorManager): break done, not_done = waiters.wait_for_all( - futures, timeout=CONF.conductor.send_sensor_data_wait_timeout) + futures, timeout=CONF.sensor_data.wait_timeout) if not_done: LOG.warning("%d workers for send sensors data did not complete", len(not_done)) @@ -2684,13 +2749,14 @@ class ConductorManager(base_manager.BaseConductorManager): """Filters out sensor data types that aren't specified in the config. Removes sensor data types that aren't specified in - CONF.conductor.send_sensor_data_types. + CONF.sensor_data.data_types. 
:param sensors_data: dict containing sensor types and the associated data :returns: dict with unsupported sensor types removed """ - allowed = set(x.lower() for x in CONF.conductor.send_sensor_data_types) + allowed = set(x.lower() for x in + CONF.sensor_data.data_types) if 'all' in allowed: return sensors_data diff --git a/ironic/conductor/periodics.py b/ironic/conductor/periodics.py index 70bc7bc939..b9c8f88441 100644 --- a/ironic/conductor/periodics.py +++ b/ironic/conductor/periodics.py @@ -18,6 +18,7 @@ import inspect import eventlet from futurist import periodics +from ironic_lib import metrics_utils from oslo_log import log from ironic.common import exception @@ -29,6 +30,9 @@ from ironic.drivers import base as driver_base LOG = log.getLogger(__name__) +METRICS = metrics_utils.get_metrics_logger(__name__) + + def periodic(spacing, enabled=True, **kwargs): """A decorator to define a periodic task. @@ -46,7 +50,7 @@ class Stop(Exception): def node_periodic(purpose, spacing, enabled=True, filters=None, predicate=None, predicate_extra_fields=(), limit=None, - shared_task=True): + shared_task=True, node_count_metric_name=None): """A decorator to define a periodic task to act on nodes. Defines a periodic task that fetches the list of nodes mapped to the @@ -84,6 +88,9 @@ def node_periodic(purpose, spacing, enabled=True, filters=None, iteration to determine the limit. :param shared_task: if ``True``, the task will have a shared lock. It is recommended to start with a shared lock and upgrade it only if needed. + :param node_count_metric_name: A string value to identify a metric + representing the count of matching nodes to be recorded upon the + completion of the periodic. 
""" node_type = collections.namedtuple( 'Node', @@ -116,10 +123,11 @@ def node_periodic(purpose, spacing, enabled=True, filters=None, else: local_limit = limit assert local_limit is None or local_limit > 0 - + node_count = 0 nodes = manager.iter_nodes(filters=filters, fields=predicate_extra_fields) for (node_uuid, *other) in nodes: + node_count += 1 if predicate is not None: node = node_type(node_uuid, *other) if accepts_manager: @@ -158,6 +166,11 @@ def node_periodic(purpose, spacing, enabled=True, filters=None, local_limit -= 1 if not local_limit: return + if node_count_metric_name: + # Send post-run metrics. + METRICS.send_gauge( + node_count_metric_name, + node_count) return wrapper diff --git a/ironic/conf/__init__.py b/ironic/conf/__init__.py index c1a8931813..41201346f0 100644 --- a/ironic/conf/__init__.py +++ b/ironic/conf/__init__.py @@ -44,6 +44,7 @@ from ironic.conf import neutron from ironic.conf import nova from ironic.conf import pxe from ironic.conf import redfish +from ironic.conf import sensor_data from ironic.conf import service_catalog from ironic.conf import snmp from ironic.conf import swift @@ -80,6 +81,7 @@ neutron.register_opts(CONF) nova.register_opts(CONF) pxe.register_opts(CONF) redfish.register_opts(CONF) +sensor_data.register_opts(CONF) service_catalog.register_opts(CONF) snmp.register_opts(CONF) swift.register_opts(CONF) diff --git a/ironic/conf/conductor.py b/ironic/conf/conductor.py index 2161b94346..653e30f56d 100644 --- a/ironic/conf/conductor.py +++ b/ironic/conf/conductor.py @@ -97,41 +97,6 @@ opts = [ cfg.IntOpt('node_locked_retry_interval', default=1, help=_('Seconds to sleep between node lock attempts.')), - cfg.BoolOpt('send_sensor_data', - default=False, - help=_('Enable sending sensor data message via the ' - 'notification bus')), - cfg.IntOpt('send_sensor_data_interval', - default=600, - min=1, - help=_('Seconds between conductor sending sensor data message ' - 'to ceilometer via the notification bus.')), - 
cfg.IntOpt('send_sensor_data_workers', - default=4, min=1, - help=_('The maximum number of workers that can be started ' - 'simultaneously for send data from sensors periodic ' - 'task.')), - cfg.IntOpt('send_sensor_data_wait_timeout', - default=300, - help=_('The time in seconds to wait for send sensors data ' - 'periodic task to be finished before allowing periodic ' - 'call to happen again. Should be less than ' - 'send_sensor_data_interval value.')), - cfg.ListOpt('send_sensor_data_types', - default=['ALL'], - help=_('List of comma separated meter types which need to be' - ' sent to Ceilometer. The default value, "ALL", is a ' - 'special value meaning send all the sensor data.')), - cfg.BoolOpt('send_sensor_data_for_undeployed_nodes', - default=False, - help=_('The default for sensor data collection is to only ' - 'collect data for machines that are deployed, however ' - 'operators may desire to know if there are failures ' - 'in hardware that is not presently in use. ' - 'When set to true, the conductor will collect sensor ' - 'information from all nodes when sensor data ' - 'collection is enabled via the send_sensor_data ' - 'setting.')), cfg.IntOpt('sync_local_state_interval', default=180, help=_('When conductors join or leave the cluster, existing ' diff --git a/ironic/conf/opts.py b/ironic/conf/opts.py index fd2e515345..faac800986 100644 --- a/ironic/conf/opts.py +++ b/ironic/conf/opts.py @@ -43,6 +43,7 @@ _opts = [ ('nova', ironic.conf.nova.list_opts()), ('pxe', ironic.conf.pxe.opts), ('redfish', ironic.conf.redfish.opts), + ('sensor_data', ironic.conf.sensor_data.opts), ('service_catalog', ironic.conf.service_catalog.list_opts()), ('snmp', ironic.conf.snmp.opts), ('swift', ironic.conf.swift.list_opts()), diff --git a/ironic/conf/sensor_data.py b/ironic/conf/sensor_data.py new file mode 100644 index 0000000000..8527113a67 --- /dev/null +++ b/ironic/conf/sensor_data.py @@ -0,0 +1,89 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); 
you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +from oslo_config import cfg + +from ironic.common.i18n import _ + +opts = [ + cfg.BoolOpt('send_sensor_data', + default=False, + deprecated_group='conductor', + deprecated_name='send_sensor_data', + help=_('Enable sending sensor data message via the ' + 'notification bus.')), + cfg.IntOpt('interval', + default=600, + min=1, + deprecated_group='conductor', + deprecated_name='send_sensor_data_interval', + help=_('Seconds between conductor sending sensor data message ' + 'via the notification bus. This was originally for ' + 'consumption via ceilometer, but the data may also ' + 'be consumed via a plugin like ' + 'ironic-prometheus-exporter or any other message bus ' + 'data collector.')), + cfg.IntOpt('workers', + default=4, min=1, + deprecated_group='conductor', + deprecated_name='send_sensor_data_workers', + help=_('The maximum number of workers that can be started ' + 'simultaneously for send data from sensors periodic ' + 'task.')), + cfg.IntOpt('wait_timeout', + default=300, + deprecated_group='conductor', + deprecated_name='send_sensor_data_wait_timeout', + help=_('The time in seconds to wait for send sensors data ' + 'periodic task to be finished before allowing periodic ' + 'call to happen again. Should be less than ' + 'send_sensor_data_interval value.')), + cfg.ListOpt('data_types', + default=['ALL'], + deprecated_group='conductor', + deprecated_name='send_sensor_data_types', + help=_('List of comma separated meter types which need to be ' + 'sent to Ceilometer. 
The default value, "ALL", is a ' + 'special value meaning send all the sensor data. ' + 'This setting only applies to baremetal sensor data ' + 'being processed through the conductor.')), + cfg.BoolOpt('enable_for_undeployed_nodes', + default=False, + deprecated_group='conductor', + deprecated_name='send_sensor_data_for_undeployed_nodes', + help=_('The default for sensor data collection is to only ' + 'collect data for machines that are deployed, however ' + 'operators may desire to know if there are failures ' + 'in hardware that is not presently in use. ' + 'When set to true, the conductor will collect sensor ' + 'information from all nodes when sensor data ' + 'collection is enabled via the send_sensor_data ' + 'setting.')), + cfg.BoolOpt('enable_for_conductor', + default=True, + help=_('If to include sensor metric data for the Conductor ' + 'process itself in the message payload for sensor ' + 'data which allows operators to gather instance ' + 'counts of actions and states to better manage ' + 'the deployment.')), + cfg.BoolOpt('enable_for_nodes', + default=True, + help=_('If to transmit any sensor data for any nodes under ' + 'this conductor\'s management. 
This option superceeds ' + 'the ``send_sensor_data_for_undeployed_nodes`` ' + 'setting.')), +] + + +def register_opts(conf): + conf.register_opts(opts, group='sensor_data') diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py index b63907e53d..bda278328e 100644 --- a/ironic/tests/unit/conductor/test_manager.py +++ b/ironic/tests/unit/conductor/test_manager.py @@ -26,6 +26,7 @@ from unittest import mock import eventlet from futurist import waiters +from ironic_lib import metrics as ironic_metrics from oslo_config import cfg import oslo_messaging as messaging from oslo_utils import uuidutils @@ -4273,7 +4274,8 @@ class SensorsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): def test__filter_out_unsupported_types_all(self): self._start_service() - CONF.set_override('send_sensor_data_types', ['All'], group='conductor') + CONF.set_override('data_types', ['All'], + group='sensor_data') fake_sensors_data = {"t1": {'f1': 'v1'}, "t2": {'f1': 'v1'}} actual_result = ( self.service._filter_out_unsupported_types(fake_sensors_data)) @@ -4282,7 +4284,8 @@ class SensorsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): def test__filter_out_unsupported_types_part(self): self._start_service() - CONF.set_override('send_sensor_data_types', ['t1'], group='conductor') + CONF.set_override('data_types', ['t1'], + group='sensor_data') fake_sensors_data = {"t1": {'f1': 'v1'}, "t2": {'f1': 'v1'}} actual_result = ( self.service._filter_out_unsupported_types(fake_sensors_data)) @@ -4291,7 +4294,8 @@ class SensorsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): def test__filter_out_unsupported_types_non(self): self._start_service() - CONF.set_override('send_sensor_data_types', ['t3'], group='conductor') + CONF.set_override('data_types', ['t3'], + group='sensor_data') fake_sensors_data = {"t1": {'f1': 'v1'}, "t2": {'f1': 'v1'}} actual_result = ( self.service._filter_out_unsupported_types(fake_sensors_data)) @@ -4305,7 
+4309,8 @@ class SensorsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): for i in range(5): nodes.put_nowait(('fake_uuid-%d' % i, 'fake-hardware', '', None)) self._start_service() - CONF.set_override('send_sensor_data', True, group='conductor') + CONF.set_override('send_sensor_data', True, + group='sensor_data') task = acquire_mock.return_value.__enter__.return_value task.node.maintenance = False @@ -4334,7 +4339,8 @@ class SensorsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): nodes.put_nowait(('fake_uuid', 'fake-hardware', '', None)) self._start_service() self.service._shutdown = True - CONF.set_override('send_sensor_data', True, group='conductor') + CONF.set_override('send_sensor_data', True, + group='sensor_data') self.service._sensors_nodes_task(self.context, nodes) acquire_mock.return_value.__enter__.assert_not_called() @@ -4343,7 +4349,8 @@ class SensorsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): nodes = queue.Queue() nodes.put_nowait(('fake_uuid', 'fake-hardware', '', None)) - CONF.set_override('send_sensor_data', True, group='conductor') + CONF.set_override('send_sensor_data', True, + group='sensor_data') self._start_service() @@ -4361,7 +4368,7 @@ class SensorsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): nodes = queue.Queue() nodes.put_nowait(('fake_uuid', 'fake-hardware', '', None)) self._start_service() - CONF.set_override('send_sensor_data', True, group='conductor') + CONF.set_override('send_sensor_data', True, group='sensor_data') task = acquire_mock.return_value.__enter__.return_value task.node.maintenance = True @@ -4384,10 +4391,10 @@ class SensorsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): mock_spawn): self._start_service() - CONF.set_override('send_sensor_data', True, group='conductor') + CONF.set_override('send_sensor_data', True, group='sensor_data') # NOTE(galyna): do not wait for threads to be finished in unittests - CONF.set_override('send_sensor_data_wait_timeout', 0, - 
group='conductor') + CONF.set_override('wait_timeout', 0, + group='sensor_data') _mapped_to_this_conductor_mock.return_value = True get_nodeinfo_list_mock.return_value = [('fake_uuid', 'fake', None)] self.service._send_sensor_data(self.context) @@ -4395,6 +4402,37 @@ class SensorsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): self.service._sensors_nodes_task, self.context, mock.ANY) + @mock.patch.object(queue, 'Queue', autospec=True) + @mock.patch.object(manager.ConductorManager, '_sensors_conductor', + autospec=True) + @mock.patch.object(manager.ConductorManager, '_spawn_worker', + autospec=True) + @mock.patch.object(manager.ConductorManager, '_mapped_to_this_conductor', + autospec=True) + @mock.patch.object(dbapi.IMPL, 'get_nodeinfo_list', autospec=True) + def test___send_sensor_data_disabled( + self, get_nodeinfo_list_mock, + _mapped_to_this_conductor_mock, + mock_spawn, mock_sensors_conductor, + mock_queue): + self._start_service() + + CONF.set_override('send_sensor_data', True, group='sensor_data') + CONF.set_override('enable_for_nodes', False, + group='sensor_data') + CONF.set_override('enable_for_conductor', False, + group='sensor_data') + # NOTE(galyna): do not wait for threads to be finished in unittests + CONF.set_override('wait_timeout', 0, + group='sensor_data') + _mapped_to_this_conductor_mock.return_value = True + get_nodeinfo_list_mock.return_value = [('fake_uuid', 'fake', None)] + self.service._send_sensor_data(self.context) + mock_sensors_conductor.assert_not_called() + # NOTE(TheJulia): Can't use the spawn worker since it records other, + # unrelated calls. So, queue works well here. 
+ mock_queue.assert_not_called() + @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker', autospec=True) @mock.patch.object(manager.ConductorManager, '_mapped_to_this_conductor', @@ -4407,12 +4445,42 @@ class SensorsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): mock_spawn.reset_mock() number_of_workers = 8 - CONF.set_override('send_sensor_data', True, group='conductor') - CONF.set_override('send_sensor_data_workers', number_of_workers, - group='conductor') + CONF.set_override('send_sensor_data', True, group='sensor_data') + CONF.set_override('workers', number_of_workers, + group='sensor_data') # NOTE(galyna): do not wait for threads to be finished in unittests - CONF.set_override('send_sensor_data_wait_timeout', 0, - group='conductor') + CONF.set_override('wait_timeout', 0, + group='sensor_data') + + _mapped_to_this_conductor_mock.return_value = True + get_nodeinfo_list_mock.return_value = [('fake_uuid', 'fake', + None)] * 20 + self.service._send_sensor_data(self.context) + self.assertEqual(number_of_workers + 1, + mock_spawn.call_count) + + # TODO(TheJulia): At some point, we should add a test to validate that + # a modified filter to return all nodes actually works, although + # the way the sensor tests are written, the list is all mocked. 
+ + @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker', + autospec=True) + @mock.patch.object(manager.ConductorManager, '_mapped_to_this_conductor', + autospec=True) + @mock.patch.object(dbapi.IMPL, 'get_nodeinfo_list', autospec=True) + def test___send_sensor_data_one_worker( + self, get_nodeinfo_list_mock, _mapped_to_this_conductor_mock, + mock_spawn): + self._start_service() + mock_spawn.reset_mock() + + number_of_workers = 1 + CONF.set_override('send_sensor_data', True, group='sensor_data') + CONF.set_override('workers', number_of_workers, + group='sensor_data') + # NOTE(galyna): do not wait for threads to be finished in unittests + CONF.set_override('wait_timeout', 0, + group='sensor_data') _mapped_to_this_conductor_mock.return_value = True get_nodeinfo_list_mock.return_value = [('fake_uuid', 'fake', @@ -4421,9 +4489,21 @@ class SensorsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): self.assertEqual(number_of_workers, mock_spawn.call_count) - # TODO(TheJulia): At some point, we should add a test to validate that - # a modified filter to return all nodes actually works, although - # the way the sensor tests are written, the list is all mocked. 
+ @mock.patch.object(messaging.Notifier, 'info', autospec=True) + @mock.patch.object(ironic_metrics.MetricLogger, + 'get_metrics_data', autospec=True) + def test__sensors_conductor(self, mock_get_metrics, mock_notifier): + metric = {'metric': 'data'} + mock_get_metrics.return_value = metric + self._start_service() + self.service._sensors_conductor(self.context) + self.assertEqual(mock_notifier.call_count, 1) + self.assertEqual('ironic.metrics', mock_notifier.call_args.args[2]) + metrics_dict = mock_notifier.call_args.args[3] + self.assertEqual(metrics_dict.get('event_type'), + 'ironic.metrics.update') + self.assertDictEqual(metrics_dict.get('payload'), + metric) @mgr_utils.mock_record_keepalive diff --git a/releasenotes/notes/conductor-metric-collector-support-1b8b8c71f9f59da4.yaml b/releasenotes/notes/conductor-metric-collector-support-1b8b8c71f9f59da4.yaml new file mode 100644 index 0000000000..dfa3b0f89d --- /dev/null +++ b/releasenotes/notes/conductor-metric-collector-support-1b8b8c71f9f59da4.yaml @@ -0,0 +1,39 @@ +--- +features: + - | + Adds the ability for Ironic to send conductor process metrics + for monitoring. This requires the use of a new ``[metrics]backend`` + option value of ``collector``. This data was previously only available + through the use of statsd. This requires ``ironic-lib`` version ``5.4.0`` + or newer. This capability can be disabled using the + ``[sensor_data]enable_for_conductor`` option if set to False. + - | + Adds a ``[sensor_data]enable_for_nodes`` configuration option + to allow operators to disable sending node metric data via the + message bus notifier. + - | + Adds a new gauge metric ``ConductorManager.PowerSyncNodesCount`` + which tracks the nodes considered for power state synchronization. + - Adds a new gauge metric ``ConductorManager.PowerSyncRecoveryNodeCount`` + which represents the number of nodes which are being evaluated for power + state recovery checking.
+ - Adds a new gauge metric ``ConductorManager.SyncLocalStateNodeCount`` + which represents the number of nodes being tracked locally by the + conductor. +issues: + - Sensor data notifications to the message bus, such as using the + ``[metrics]backend`` configuration option of ``collector`` on a dedicated + API service process or instance, are not presently supported. This + functionality requires a periodic task to trigger the transmission + of metrics messages to the message bus notifier. +deprecations: + - The setting values starting with ``send_sensor`` in the ``[conductor]`` + configuration group have been deprecated and moved to a ``[sensor_data]`` + configuration group. The names have been updated to shorter, operator + friendly names. +upgrades: + - Settings starting with ``send_sensor`` in the ``[conductor]`` + configuration group have been moved to a ``[sensor_data]`` configuration + group and have been renamed to have shorter value names. If configuration + values are not updated, the ``oslo.config`` library will emit a warning + in the logs. diff --git a/requirements.txt b/requirements.txt index 0c73e632e8..2f4813baae 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,7 +14,7 @@ WebOb>=1.7.1 # MIT python-cinderclient!=4.0.0,>=3.3.0 # Apache-2.0 python-glanceclient>=2.8.0 # Apache-2.0 keystoneauth1>=4.2.0 # Apache-2.0 -ironic-lib>=4.6.1 # Apache-2.0 +ironic-lib>=5.4.0 # Apache-2.0 python-swiftclient>=3.2.0 # Apache-2.0 pytz>=2013.6 # MIT stevedore>=1.29.0 # Apache-2.0