Add logging of agent heartbeats

When troubleshooting problems with cluster it would be
very convenient to have information about agent heartbeats
logged with some searchable identifier which could create
1-to-1 mapping between events in agent's logs and server's logs.

Currently agent's heartbeats are not logged at all on server side.
Since on a large cluster that could create too much logging
(even for troubleshooting cases), it might make sense to make
this configurable both on neutron-server side and on agent-side.

DocImpact

Change-Id: I0a127ef274a84bba5de47395d47b62f48bd4be16
Closes-Bug: #1452582
This commit is contained in:
Eugene Nikanorov 2015-05-05 18:18:28 +04:00
parent ab971d78cc
commit e50e1a2369
12 changed files with 62 additions and 10 deletions
etc
neutron
agent
db
plugins/ml2/drivers/openvswitch/agent
tests/unit/db

@ -90,3 +90,7 @@
# Timeout for ovs-vsctl commands.
# If the timeout expires, ovs commands will fail with ALARMCLOCK error.
# ovs_vsctl_timeout = 10
[AGENT]
# Log agent heartbeats from this DHCP agent
# log_agent_heartbeats = False

@ -122,3 +122,7 @@
# The advertisement interval in seconds
# ha_vrrp_advert_int = 2
[AGENT]
# Log agent heartbeats from this L3 agent
# log_agent_heartbeats = False

@ -66,3 +66,7 @@ admin_password = %SERVICE_PASSWORD%
# Otherwise default_ttl specifies time in seconds a cache entry is valid for.
# No cache is used in case no value is passed.
# cache_url = memory://?default_ttl=5
[AGENT]
# Log agent heartbeats from this Metadata agent
# log_agent_heartbeats = False

@ -58,6 +58,9 @@
# of_interface = ovs-ofctl
[agent]
# Log agent heartbeats from this OVS agent
# log_agent_heartbeats = False
# Agent's polling interval in seconds
# polling_interval = 2

@ -44,6 +44,8 @@ AGENT_STATE_OPTS = [
help=_('Seconds between nodes reporting state to server; '
'should be less than agent_down_time, best if it '
'is half or less than agent_down_time.')),
cfg.BoolOpt('log_agent_heartbeats', default=False,
help=_('Log agent heartbeats')),
]
INTERFACE_DRIVER_OPTS = [

@ -548,7 +548,8 @@ class DhcpAgentWithStateReport(DhcpAgent):
'configurations': {
'dhcp_driver': cfg.CONF.dhcp_driver,
'use_namespaces': cfg.CONF.use_namespaces,
'dhcp_lease_duration': cfg.CONF.dhcp_lease_duration},
'dhcp_lease_duration': cfg.CONF.dhcp_lease_duration,
'log_agent_heartbeats': cfg.CONF.AGENT.log_agent_heartbeats},
'start_flag': True,
'agent_type': constants.AGENT_TYPE_DHCP}
report_interval = cfg.CONF.AGENT.report_interval

@ -595,7 +595,8 @@ class L3NATAgentWithStateReport(L3NATAgent):
'external_network_bridge': self.conf.external_network_bridge,
'gateway_external_network_id':
self.conf.gateway_external_network_id,
'interface_driver': self.conf.interface_driver},
'interface_driver': self.conf.interface_driver,
'log_agent_heartbeats': self.conf.AGENT.log_agent_heartbeats},
'start_flag': True,
'agent_type': l3_constants.AGENT_TYPE_L3}
report_interval = self.conf.AGENT.report_interval

@ -289,6 +289,7 @@ class UnixDomainMetadataProxy(object):
'metadata_proxy_socket': cfg.CONF.metadata_proxy_socket,
'nova_metadata_ip': cfg.CONF.nova_metadata_ip,
'nova_metadata_port': cfg.CONF.nova_metadata_port,
'log_agent_heartbeats': cfg.CONF.AGENT.log_agent_heartbeats,
},
'start_flag': True,
'agent_type': n_const.AGENT_TYPE_METADATA}

@ -18,6 +18,7 @@ import itertools
from oslo_log import log as logging
import oslo_messaging
from oslo_utils import timeutils
from oslo_utils import uuidutils
from neutron.common import constants
from neutron.common import rpc as n_rpc
@ -72,6 +73,11 @@ class PluginReportStateAPI(object):
def report_state(self, context, agent_state, use_call=False):
cctxt = self.client.prepare()
# add unique identifier to a report
# that can be logged on server side.
# This create visible correspondence between events on
# the agent and on the server
agent_state['uuid'] = uuidutils.generate_uuid()
kwargs = {
'agent_state': {'agent_state': agent_state},
'time': timeutils.strtime(),

@ -29,7 +29,7 @@ from neutron.common import constants
from neutron.db import model_base
from neutron.db import models_v2
from neutron.extensions import agent as ext_agent
from neutron.i18n import _LE, _LW
from neutron.i18n import _LE, _LI, _LW
from neutron import manager
LOG = logging.getLogger(__name__)
@ -203,22 +203,33 @@ class AgentDbMixin(ext_agent.AgentPluginBase):
agent = self._get_agent(context, id)
return self._make_agent_dict(agent, fields)
def _create_or_update_agent(self, context, agent):
def _log_heartbeat(self, state, agent_db, agent_conf):
if agent_conf.get('log_agent_heartbeats'):
delta = timeutils.utcnow() - agent_db.heartbeat_timestamp
LOG.info(_LI("Heartbeat received from %(type)s agent on "
"host %(host)s, uuid %(uuid)s after %(delta)s"),
{'type': agent_db.agent_type,
'host': agent_db.host,
'uuid': state.get('uuid'),
'delta': delta})
def _create_or_update_agent(self, context, agent_state):
with context.session.begin(subtransactions=True):
res_keys = ['agent_type', 'binary', 'host', 'topic']
res = dict((k, agent[k]) for k in res_keys)
res = dict((k, agent_state[k]) for k in res_keys)
configurations_dict = agent.get('configurations', {})
configurations_dict = agent_state.get('configurations', {})
res['configurations'] = jsonutils.dumps(configurations_dict)
res['load'] = self._get_agent_load(agent)
res['load'] = self._get_agent_load(agent_state)
current_time = timeutils.utcnow()
try:
agent_db = self._get_agent_by_type_and_host(
context, agent['agent_type'], agent['host'])
context, agent_state['agent_type'], agent_state['host'])
res['heartbeat_timestamp'] = current_time
if agent.get('start_flag'):
if agent_state.get('start_flag'):
res['started_at'] = current_time
greenthread.sleep(0)
self._log_heartbeat(agent_state, agent_db, configurations_dict)
agent_db.update(res)
except ext_agent.AgentNotFoundByTypeHost:
greenthread.sleep(0)
@ -229,6 +240,7 @@ class AgentDbMixin(ext_agent.AgentPluginBase):
agent_db = Agent(**res)
greenthread.sleep(0)
context.session.add(agent_db)
self._log_heartbeat(agent_state, agent_db, configurations_dict)
greenthread.sleep(0)
def create_or_update_agent(self, context, agent):

@ -200,7 +200,9 @@ class OVSNeutronAgent(sg_rpc.SecurityGroupAgentRpcCallbackMixin,
'arp_responder_enabled':
self.arp_responder_enabled,
'enable_distributed_routing':
self.enable_distributed_routing},
self.enable_distributed_routing,
'log_agent_heartbeats':
self.conf.AGENT.log_agent_heartbeats},
'agent_type': q_const.AGENT_TYPE_OVS,
'start_flag': True}

@ -127,6 +127,18 @@ class TestAgentsDbMixin(TestAgentsDbBase):
agent = agents[0]
self._assert_ref_fields_are_equal(self.agent_status, agent)
def test_create_or_update_agent_logs_heartbeat(self):
status = self.agent_status.copy()
status['configurations'] = {'log_agent_heartbeats': True}
with mock.patch.object(agents_db.LOG, 'info') as info:
self.plugin.create_or_update_agent(self.context, status)
self.assertTrue(info.called)
status['configurations'] = {'log_agent_heartbeats': False}
info.reset_mock()
self.plugin.create_or_update_agent(self.context, status)
self.assertFalse(info.called)
def test_create_or_update_agent_concurrent_insert(self):
# NOTE(rpodolyaka): emulate violation of the unique constraint caused
# by a concurrent insert. Ensure we make another