Add metrics support to IPA
This utilizes the new metrics support in ironic-lib to allow the agent to report timing metrics for agent API methods as configured in ironic-lib. Additionally, this adds developer docs on how to use metrics in IPA, including some caveats specific to ironic-lib.metrics use in IPA. Co-Authored-By: Jay Faulkner <jay@jvf.cc> Co-Authored-By: Alex Weeks <alex.weeks@gmail.com> Change-Id: Ic08d4ff78b6fb614b474b956a32eac352a14262a Partial-bug: #1526219
This commit is contained in:
		
				
					committed by
					
						
						Jay Faulkner
					
				
			
			
				
	
			
			
			
						parent
						
							ad60806f93
						
					
				
				
					commit
					fd874652e3
				
			@@ -18,6 +18,7 @@ Index
 | 
			
		||||
.. toctree::
 | 
			
		||||
 | 
			
		||||
    troubleshooting
 | 
			
		||||
    metrics
 | 
			
		||||
 | 
			
		||||
How it works
 | 
			
		||||
============
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										53
									
								
								doc/source/metrics.rst
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										53
									
								
								doc/source/metrics.rst
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,53 @@
 | 
			
		||||
.. _metrics:
 | 
			
		||||
 | 
			
		||||
===============================================
 | 
			
		||||
Emitting metrics from Ironic-Python-Agent (IPA)
 | 
			
		||||
===============================================
 | 
			
		||||
 | 
			
		||||
This document describes how to emit metrics from IPA, from adding timers and
 | 
			
		||||
counters in code to directly emitting hardware metrics from a custom
 | 
			
		||||
HardwareManager.
 | 
			
		||||
 | 
			
		||||
Overview
 | 
			
		||||
========
 | 
			
		||||
IPA uses the metrics implementation from ironic-lib, with a few caveats due
 | 
			
		||||
to the dynamic configuration done at lookup time. You cannot cache the metrics
 | 
			
		||||
instance as the MetricsLogger returned will change after lookup if configs
 | 
			
		||||
different from the default settings have been used. This also means that the
 | 
			
		||||
method decorator supported by ironic-lib cannot be used in IPA.
 | 
			
		||||
 | 
			
		||||
Using a context manager
 | 
			
		||||
=======================
 | 
			
		||||
Using the context manager is the recommended way for sending metrics that time
 | 
			
		||||
or count sections of code. However, given that you cannot cache the
 | 
			
		||||
MetricsLogger, you have to explicitly call get_metrics_logger() from
 | 
			
		||||
ironic-lib every time. For example::
 | 
			
		||||
 | 
			
		||||
  from ironic_lib import metrics_utils
 | 
			
		||||
 | 
			
		||||
  def my_method():
 | 
			
		||||
    with metrics_utils.get_metrics_logger(__name__).timer('my_method'):
 | 
			
		||||
      return _do_work()
 | 
			
		||||
 | 
			
		||||
As a note, these metric collectors do work for custom HardwareManagers as
 | 
			
		||||
well. However, you may want to measure the portions of a method that determine
 | 
			
		||||
compatibility separately from portions of a method that actually do work, in
 | 
			
		||||
order to ensure the metrics are relevant and useful on all hardware.
 | 
			
		||||
 | 
			
		||||
Explicitly sending metrics
 | 
			
		||||
==========================
 | 
			
		||||
A feature that may be particularly helpful for deployers writing custom
 | 
			
		||||
HardwareManagers is the ability to explicitly send metrics. As an example,
 | 
			
		||||
you could add a cleaning step which would retrieve metrics about a device and
 | 
			
		||||
ship them using the provided metrics library. For example::
 | 
			
		||||
 | 
			
		||||
  from ironic_lib import metrics_utils
 | 
			
		||||
 | 
			
		||||
  def my_cleaning_step():
 | 
			
		||||
    for name, value in _get_smart_data():
 | 
			
		||||
      metrics_utils.get_metrics_logger(__name__).send_gauge(name, value)
 | 
			
		||||
 | 
			
		||||
References
 | 
			
		||||
==========
 | 
			
		||||
For more information, please read the source of the metrics module in
 | 
			
		||||
`ironic-lib <http://git.openstack.org/cgit/openstack/ironic-lib/tree/ironic_lib>`_.
 | 
			
		||||
@@ -20,6 +20,7 @@ import threading
 | 
			
		||||
import time
 | 
			
		||||
 | 
			
		||||
from oslo_concurrency import processutils
 | 
			
		||||
from oslo_config import cfg
 | 
			
		||||
from oslo_log import log
 | 
			
		||||
import pkg_resources
 | 
			
		||||
from six.moves.urllib import parse as urlparse
 | 
			
		||||
@@ -35,7 +36,6 @@ from ironic_python_agent import inspector
 | 
			
		||||
from ironic_python_agent import ironic_api_client
 | 
			
		||||
from ironic_python_agent import utils
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
LOG = log.getLogger(__name__)
 | 
			
		||||
 | 
			
		||||
# Time(in seconds) to wait for any of the interfaces to be up
 | 
			
		||||
@@ -45,6 +45,9 @@ NETWORK_WAIT_TIMEOUT = 60
 | 
			
		||||
# Time(in seconds) to wait before reattempt
 | 
			
		||||
NETWORK_WAIT_RETRY = 5
 | 
			
		||||
 | 
			
		||||
cfg.CONF.import_group('metrics', 'ironic_lib.metrics_utils')
 | 
			
		||||
cfg.CONF.import_group('metrics_statsd', 'ironic_lib.metrics_statsd')
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _time():
 | 
			
		||||
    """Wraps time.time() for simpler testing."""
 | 
			
		||||
@@ -340,6 +343,15 @@ class IronicPythonAgent(base.ExecuteCommandMixin):
 | 
			
		||||
            hardware.cache_node(self.node)
 | 
			
		||||
            self.heartbeat_timeout = content['heartbeat_timeout']
 | 
			
		||||
 | 
			
		||||
            # Update config with values from Ironic
 | 
			
		||||
            config = content.get('config', {})
 | 
			
		||||
            if config.get('metrics'):
 | 
			
		||||
                for opt, val in config.items():
 | 
			
		||||
                    setattr(cfg.CONF.metrics, opt, val)
 | 
			
		||||
            if config.get('metrics_statsd'):
 | 
			
		||||
                for opt, val in config.items():
 | 
			
		||||
                    setattr(cfg.CONF.metrics_statsd, opt, val)
 | 
			
		||||
 | 
			
		||||
        wsgi = simple_server.make_server(
 | 
			
		||||
            self.listen_address[0],
 | 
			
		||||
            self.listen_address[1],
 | 
			
		||||
 
 | 
			
		||||
@@ -12,9 +12,9 @@
 | 
			
		||||
# License for the specific language governing permissions and limitations
 | 
			
		||||
# under the License.
 | 
			
		||||
 | 
			
		||||
from ironic_lib import metrics_utils
 | 
			
		||||
import pecan
 | 
			
		||||
from pecan import rest
 | 
			
		||||
 | 
			
		||||
from wsme import types as wtypes
 | 
			
		||||
import wsmeext.pecan as wsme_pecan
 | 
			
		||||
 | 
			
		||||
@@ -81,6 +81,7 @@ class RootController(rest.RestController):
 | 
			
		||||
        # NOTE: The reason why convert() is being called for every
 | 
			
		||||
        #       request is because we need to get the host url from
 | 
			
		||||
        #       the request object to make the links.
 | 
			
		||||
        with metrics_utils.get_metrics_logger(__name__).timer('get'):
 | 
			
		||||
            return Root.convert()
 | 
			
		||||
 | 
			
		||||
    @pecan.expose()
 | 
			
		||||
 
 | 
			
		||||
@@ -13,6 +13,7 @@
 | 
			
		||||
#    License for the specific language governing permissions and limitations
 | 
			
		||||
#    under the License.
 | 
			
		||||
 | 
			
		||||
from ironic_lib import metrics_utils
 | 
			
		||||
import pecan
 | 
			
		||||
from pecan import rest
 | 
			
		||||
from wsme import types
 | 
			
		||||
@@ -78,6 +79,7 @@ class CommandController(rest.RestController):
 | 
			
		||||
    @wsme_pecan.wsexpose(CommandResultList)
 | 
			
		||||
    def get_all(self):
 | 
			
		||||
        """Get all command results."""
 | 
			
		||||
        with metrics_utils.get_metrics_logger(__name__).timer('get_all'):
 | 
			
		||||
            agent = pecan.request.agent
 | 
			
		||||
            results = agent.list_command_results()
 | 
			
		||||
            return CommandResultList.from_results(results)
 | 
			
		||||
@@ -91,6 +93,7 @@ class CommandController(rest.RestController):
 | 
			
		||||
        :returns: a :class:`ironic_python_agent.api.controller.v1.command.
 | 
			
		||||
                  CommandResult` object.
 | 
			
		||||
        """
 | 
			
		||||
        with metrics_utils.get_metrics_logger(__name__).timer('get_one'):
 | 
			
		||||
            agent = pecan.request.agent
 | 
			
		||||
            result = agent.get_command_result(result_id)
 | 
			
		||||
 | 
			
		||||
@@ -109,6 +112,7 @@ class CommandController(rest.RestController):
 | 
			
		||||
        :returns: a :class:`ironic_python_agent.api.controller.v1.command.
 | 
			
		||||
                  CommandResult` object.
 | 
			
		||||
        """
 | 
			
		||||
        with metrics_utils.get_metrics_logger(__name__).timer('post'):
 | 
			
		||||
            # the POST body is always the last arg,
 | 
			
		||||
            # so command must be a kwarg here
 | 
			
		||||
            if command is None:
 | 
			
		||||
 
 | 
			
		||||
@@ -13,6 +13,7 @@
 | 
			
		||||
#    License for the specific language governing permissions and limitations
 | 
			
		||||
#    under the License.
 | 
			
		||||
 | 
			
		||||
from ironic_lib import metrics_utils
 | 
			
		||||
import pecan
 | 
			
		||||
from pecan import rest
 | 
			
		||||
from wsme import types
 | 
			
		||||
@@ -48,6 +49,7 @@ class StatusController(rest.RestController):
 | 
			
		||||
    @wsme_pecan.wsexpose(AgentStatus)
 | 
			
		||||
    def get_all(self):
 | 
			
		||||
        """Get current status of the running agent."""
 | 
			
		||||
        with metrics_utils.get_metrics_logger(__name__).timer('get_all'):
 | 
			
		||||
            agent = pecan.request.agent
 | 
			
		||||
            status = agent.get_status()
 | 
			
		||||
            return AgentStatus.from_agent_status(status)
 | 
			
		||||
 
 | 
			
		||||
@@ -533,7 +533,6 @@ class StandbyExtension(base.BaseAgentExtension):
 | 
			
		||||
        stream_raw_images = image_info.get('stream_raw_images', False)
 | 
			
		||||
        # don't write image again if already cached
 | 
			
		||||
        if self.cached_image_id != image_info['id']:
 | 
			
		||||
 | 
			
		||||
            if self.cached_image_id is not None:
 | 
			
		||||
                LOG.debug('Already had %s cached, overwriting',
 | 
			
		||||
                          self.cached_image_id)
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user