Add logging of agent heartbeats
When troubleshooting problems with cluster it would be very convenient to have information about agent heartbeats logged with some searchable identifier which could create 1-to-1 mapping between events in agent's logs and server's logs. Currently agent's heartbeats are not logged at all on server side. Since on a large cluster that could create too much logging (even for troubleshooting cases), it might make sense to make this configurable both on neutron-server side and on agent-side. DocImpact Change-Id: I0a127ef274a84bba5de47395d47b62f48bd4be16 Closes-Bug: #1452582
This commit is contained in:
parent
ab971d78cc
commit
e50e1a2369
etc
neutron
agent
db
plugins/ml2/drivers/openvswitch/agent
tests/unit/db
@ -90,3 +90,7 @@
|
||||
# Timeout for ovs-vsctl commands.
|
||||
# If the timeout expires, ovs commands will fail with ALARMCLOCK error.
|
||||
# ovs_vsctl_timeout = 10
|
||||
|
||||
[AGENT]
|
||||
# Log agent heartbeats from this DHCP agent
|
||||
# log_agent_heartbeats = False
|
||||
|
@ -122,3 +122,7 @@
|
||||
|
||||
# The advertisement interval in seconds
|
||||
# ha_vrrp_advert_int = 2
|
||||
|
||||
[AGENT]
|
||||
# Log agent heartbeats from this L3 agent
|
||||
# log_agent_heartbeats = False
|
||||
|
@ -66,3 +66,7 @@ admin_password = %SERVICE_PASSWORD%
|
||||
# Otherwise default_ttl specifies time in seconds a cache entry is valid for.
|
||||
# No cache is used in case no value is passed.
|
||||
# cache_url = memory://?default_ttl=5
|
||||
|
||||
[AGENT]
|
||||
# Log agent heartbeats from this Metadata agent
|
||||
# log_agent_heartbeats = False
|
||||
|
@ -58,6 +58,9 @@
|
||||
# of_interface = ovs-ofctl
|
||||
|
||||
[agent]
|
||||
# Log agent heartbeats from this OVS agent
|
||||
# log_agent_heartbeats = False
|
||||
|
||||
# Agent's polling interval in seconds
|
||||
# polling_interval = 2
|
||||
|
||||
|
@ -44,6 +44,8 @@ AGENT_STATE_OPTS = [
|
||||
help=_('Seconds between nodes reporting state to server; '
|
||||
'should be less than agent_down_time, best if it '
|
||||
'is half or less than agent_down_time.')),
|
||||
cfg.BoolOpt('log_agent_heartbeats', default=False,
|
||||
help=_('Log agent heartbeats')),
|
||||
]
|
||||
|
||||
INTERFACE_DRIVER_OPTS = [
|
||||
|
@ -548,7 +548,8 @@ class DhcpAgentWithStateReport(DhcpAgent):
|
||||
'configurations': {
|
||||
'dhcp_driver': cfg.CONF.dhcp_driver,
|
||||
'use_namespaces': cfg.CONF.use_namespaces,
|
||||
'dhcp_lease_duration': cfg.CONF.dhcp_lease_duration},
|
||||
'dhcp_lease_duration': cfg.CONF.dhcp_lease_duration,
|
||||
'log_agent_heartbeats': cfg.CONF.AGENT.log_agent_heartbeats},
|
||||
'start_flag': True,
|
||||
'agent_type': constants.AGENT_TYPE_DHCP}
|
||||
report_interval = cfg.CONF.AGENT.report_interval
|
||||
|
@ -595,7 +595,8 @@ class L3NATAgentWithStateReport(L3NATAgent):
|
||||
'external_network_bridge': self.conf.external_network_bridge,
|
||||
'gateway_external_network_id':
|
||||
self.conf.gateway_external_network_id,
|
||||
'interface_driver': self.conf.interface_driver},
|
||||
'interface_driver': self.conf.interface_driver,
|
||||
'log_agent_heartbeats': self.conf.AGENT.log_agent_heartbeats},
|
||||
'start_flag': True,
|
||||
'agent_type': l3_constants.AGENT_TYPE_L3}
|
||||
report_interval = self.conf.AGENT.report_interval
|
||||
|
@ -289,6 +289,7 @@ class UnixDomainMetadataProxy(object):
|
||||
'metadata_proxy_socket': cfg.CONF.metadata_proxy_socket,
|
||||
'nova_metadata_ip': cfg.CONF.nova_metadata_ip,
|
||||
'nova_metadata_port': cfg.CONF.nova_metadata_port,
|
||||
'log_agent_heartbeats': cfg.CONF.AGENT.log_agent_heartbeats,
|
||||
},
|
||||
'start_flag': True,
|
||||
'agent_type': n_const.AGENT_TYPE_METADATA}
|
||||
|
@ -18,6 +18,7 @@ import itertools
|
||||
from oslo_log import log as logging
|
||||
import oslo_messaging
|
||||
from oslo_utils import timeutils
|
||||
from oslo_utils import uuidutils
|
||||
|
||||
from neutron.common import constants
|
||||
from neutron.common import rpc as n_rpc
|
||||
@ -72,6 +73,11 @@ class PluginReportStateAPI(object):
|
||||
|
||||
def report_state(self, context, agent_state, use_call=False):
|
||||
cctxt = self.client.prepare()
|
||||
# add unique identifier to a report
|
||||
# that can be logged on server side.
|
||||
# This create visible correspondence between events on
|
||||
# the agent and on the server
|
||||
agent_state['uuid'] = uuidutils.generate_uuid()
|
||||
kwargs = {
|
||||
'agent_state': {'agent_state': agent_state},
|
||||
'time': timeutils.strtime(),
|
||||
|
@ -29,7 +29,7 @@ from neutron.common import constants
|
||||
from neutron.db import model_base
|
||||
from neutron.db import models_v2
|
||||
from neutron.extensions import agent as ext_agent
|
||||
from neutron.i18n import _LE, _LW
|
||||
from neutron.i18n import _LE, _LI, _LW
|
||||
from neutron import manager
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
@ -203,22 +203,33 @@ class AgentDbMixin(ext_agent.AgentPluginBase):
|
||||
agent = self._get_agent(context, id)
|
||||
return self._make_agent_dict(agent, fields)
|
||||
|
||||
def _create_or_update_agent(self, context, agent):
|
||||
def _log_heartbeat(self, state, agent_db, agent_conf):
|
||||
if agent_conf.get('log_agent_heartbeats'):
|
||||
delta = timeutils.utcnow() - agent_db.heartbeat_timestamp
|
||||
LOG.info(_LI("Heartbeat received from %(type)s agent on "
|
||||
"host %(host)s, uuid %(uuid)s after %(delta)s"),
|
||||
{'type': agent_db.agent_type,
|
||||
'host': agent_db.host,
|
||||
'uuid': state.get('uuid'),
|
||||
'delta': delta})
|
||||
|
||||
def _create_or_update_agent(self, context, agent_state):
|
||||
with context.session.begin(subtransactions=True):
|
||||
res_keys = ['agent_type', 'binary', 'host', 'topic']
|
||||
res = dict((k, agent[k]) for k in res_keys)
|
||||
res = dict((k, agent_state[k]) for k in res_keys)
|
||||
|
||||
configurations_dict = agent.get('configurations', {})
|
||||
configurations_dict = agent_state.get('configurations', {})
|
||||
res['configurations'] = jsonutils.dumps(configurations_dict)
|
||||
res['load'] = self._get_agent_load(agent)
|
||||
res['load'] = self._get_agent_load(agent_state)
|
||||
current_time = timeutils.utcnow()
|
||||
try:
|
||||
agent_db = self._get_agent_by_type_and_host(
|
||||
context, agent['agent_type'], agent['host'])
|
||||
context, agent_state['agent_type'], agent_state['host'])
|
||||
res['heartbeat_timestamp'] = current_time
|
||||
if agent.get('start_flag'):
|
||||
if agent_state.get('start_flag'):
|
||||
res['started_at'] = current_time
|
||||
greenthread.sleep(0)
|
||||
self._log_heartbeat(agent_state, agent_db, configurations_dict)
|
||||
agent_db.update(res)
|
||||
except ext_agent.AgentNotFoundByTypeHost:
|
||||
greenthread.sleep(0)
|
||||
@ -229,6 +240,7 @@ class AgentDbMixin(ext_agent.AgentPluginBase):
|
||||
agent_db = Agent(**res)
|
||||
greenthread.sleep(0)
|
||||
context.session.add(agent_db)
|
||||
self._log_heartbeat(agent_state, agent_db, configurations_dict)
|
||||
greenthread.sleep(0)
|
||||
|
||||
def create_or_update_agent(self, context, agent):
|
||||
|
@ -200,7 +200,9 @@ class OVSNeutronAgent(sg_rpc.SecurityGroupAgentRpcCallbackMixin,
|
||||
'arp_responder_enabled':
|
||||
self.arp_responder_enabled,
|
||||
'enable_distributed_routing':
|
||||
self.enable_distributed_routing},
|
||||
self.enable_distributed_routing,
|
||||
'log_agent_heartbeats':
|
||||
self.conf.AGENT.log_agent_heartbeats},
|
||||
'agent_type': q_const.AGENT_TYPE_OVS,
|
||||
'start_flag': True}
|
||||
|
||||
|
@ -127,6 +127,18 @@ class TestAgentsDbMixin(TestAgentsDbBase):
|
||||
agent = agents[0]
|
||||
self._assert_ref_fields_are_equal(self.agent_status, agent)
|
||||
|
||||
def test_create_or_update_agent_logs_heartbeat(self):
|
||||
status = self.agent_status.copy()
|
||||
status['configurations'] = {'log_agent_heartbeats': True}
|
||||
|
||||
with mock.patch.object(agents_db.LOG, 'info') as info:
|
||||
self.plugin.create_or_update_agent(self.context, status)
|
||||
self.assertTrue(info.called)
|
||||
status['configurations'] = {'log_agent_heartbeats': False}
|
||||
info.reset_mock()
|
||||
self.plugin.create_or_update_agent(self.context, status)
|
||||
self.assertFalse(info.called)
|
||||
|
||||
def test_create_or_update_agent_concurrent_insert(self):
|
||||
# NOTE(rpodolyaka): emulate violation of the unique constraint caused
|
||||
# by a concurrent insert. Ensure we make another
|
||||
|
Loading…
x
Reference in New Issue
Block a user