Remove duplicate exception log messages
When the central agent polls hardware-related metrics while Keystone (Apache) or Nova is down, it currently prints many identical exception logs such as

    ConnectionError: ('Connection aborted.', error(111, 'ECONNREFUSED'))

each followed by a very long stack trace, all carrying the same information. The reason is that we discover resources before polling each metric: a successful discovery result is cached, a failed one is not. So in this situation every metric fails again at the discovery step and logs the exception each time. On top of that, instance_get_all is already wrapped with a log decorator, which means every metric actually logs the exception twice.

This patch removes the duplicate exception messages by adding a try...except block to hardware discovery: on any exception we simply return an empty list, which the agent records in its cache. As a result the log file contains only one full stack trace for all the hardware metrics.

The disadvantage of this change is that when Keystone and Nova suddenly recover, we lose some data we could have collected. From another perspective, though, having some hardware metrics report data within a single interval while others do not is inconsistent for the end user, so dropping part of the data in this case (especially during startup or while the services are unstable) is acceptable.

Change-Id: I0045a556cde274be8e4ba6c155d3f59b4e0d5b2c
Closes-Bug: #1493057
commit 9b82f68667 (parent f94a72ae2b), ceilometer
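For context, the "log decorator" mentioned in the commit message wraps the nova_client calls so that any failure is logged with its full traceback before being re-raised. A minimal sketch of that pattern, assuming a functools-based wrapper (the real ceilometer.nova_client code may differ in detail):

    import functools
    import logging

    LOG = logging.getLogger(__name__)


    def logged(func):
        """Log any exception raised by the wrapped call, then re-raise it."""
        @functools.wraps(func)
        def with_logging(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                # The decorator already emits the full stack trace here; if the
                # caller logs the same exception again, the traceback shows up
                # twice in the agent log.
                LOG.exception(e)
                raise
        return with_logging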
@@ -37,8 +37,15 @@ class InstanceDiscovery(plugin_base.DiscoveryBase):
 
     def discover(self, manager, param=None):
         """Discover resources to monitor."""
-        instances = self.nova_cli.instance_get_all_by_host(
-            cfg.CONF.host, self.last_run)
+        try:
+            instances = self.nova_cli.instance_get_all_by_host(
+                cfg.CONF.host, self.last_run)
+        except Exception:
+            # NOTE(zqfan): instance_get_all_by_host is wrapped and will log
+            # exception when there is any error. There is no need to raise it
+            # again and print one more time.
+            return []
+
         for instance in instances:
             if getattr(instance, 'OS-EXT-STS:vm_state', None) in ['deleted',
                                                                   'error']:
@@ -55,8 +55,14 @@ class NodesDiscoveryTripleO(plugin_base.DiscoveryBase):
         instance_get_all will return all instances if last_run is None,
         and will return only the instances changed since the last_run time.
         """
-        instances = self.nova_cli.instance_get_all(self.last_run)
+        try:
+            instances = self.nova_cli.instance_get_all(self.last_run)
+        except Exception:
+            # NOTE(zqfan): instance_get_all is wrapped and will log exception
+            # when there is any error. There is no need to raise it again and
+            # print one more time.
+            return []
 
         for instance in instances:
             if getattr(instance, 'OS-EXT-STS:vm_state', None) in ['deleted',
                                                                   'error']:
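Returning an empty list matters because the agent caches discovery results per polling cycle. A simplified sketch of that caching behaviour, under the assumption of a per-cycle dict cache (the actual logic lives in ceilometer.agent.base and differs in detail):

    def discover_resources(name, discoverers, discovery_cache):
        """Return resources for one pollster, reusing results within a cycle."""
        if name in discovery_cache:
            # Another pollster in the same polling cycle already ran this
            # discoverer, so no extra call is made and no extra stack trace
            # is logged.
            return discovery_cache[name]

        # With this patch a failed discovery returns [] instead of raising,
        # so even the empty result gets cached for the rest of the cycle.
        resources = discoverers[name].discover(manager=None)
        discovery_cache[name] = resources
        return resources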
@@ -19,11 +19,13 @@ import shutil
 
 import eventlet
 import mock
+from novaclient import client as novaclient
 from oslo_service import service as os_service
 from oslo_utils import fileutils
 from oslo_utils import timeutils
 from oslotest import base
 from oslotest import mockpatch
+import requests
 import six
 from stevedore import extension
 import yaml
@@ -31,6 +33,7 @@ import yaml
 
 from ceilometer.agent import base as agent_base
 from ceilometer.agent import manager
 from ceilometer.agent import plugin_base
+from ceilometer.hardware import discovery
 from ceilometer import pipeline
 from ceilometer.tests.unit.agent import agentbase
@@ -300,6 +303,54 @@ class TestRunTasks(agentbase.BaseAgentManagerTestCase):
         self.assertFalse(self.PollsterKeystone.samples)
         self.assertFalse(self.notified_samples)
 
+    @mock.patch('ceilometer.agent.base.LOG')
+    @mock.patch('ceilometer.nova_client.LOG')
+    def test_hardware_discover_fail_minimize_logs(self, novalog, baselog):
+        self.useFixture(mockpatch.PatchObject(
+            novaclient.HTTPClient,
+            'authenticate',
+            side_effect=requests.ConnectionError))
+
+        class PollsterHardware(agentbase.TestPollster):
+            discovery = 'tripleo_overcloud_nodes'
+
+        class PollsterHardwareAnother(agentbase.TestPollster):
+            discovery = 'tripleo_overcloud_nodes'
+
+        self.mgr.extensions.extend([
+            extension.Extension('testhardware',
+                                None,
+                                None,
+                                PollsterHardware(), ),
+            extension.Extension('testhardware2',
+                                None,
+                                None,
+                                PollsterHardwareAnother(), )
+        ])
+        ext = extension.Extension('tripleo_overcloud_nodes',
+                                  None,
+                                  None,
+                                  discovery.NodesDiscoveryTripleO())
+        self.mgr.discovery_manager = (extension.ExtensionManager
+                                      .make_test_instance([ext]))
+
+        self.pipeline_cfg = {
+            'sources': [{
+                'name': "test_hardware",
+                'interval': 10,
+                'meters': ['testhardware', 'testhardware2'],
+                'sinks': ['test_sink']}],
+            'sinks': [{
+                'name': 'test_sink',
+                'transformers': [],
+                'publishers': ["test"]}]
+        }
+        self.mgr.polling_manager = pipeline.PollingManager(self.pipeline_cfg)
+        polling_tasks = self.mgr.setup_polling_tasks()
+        self.mgr.interval_task(list(polling_tasks.values())[0])
+        self.assertEqual(1, novalog.exception.call_count)
+        self.assertFalse(baselog.exception.called)
+
     @mock.patch('ceilometer.agent.base.LOG')
     def test_polling_exception(self, LOG):
         source_name = 'test_pollingexception'