Add periodic agents health check.

In addition to the periodic checks of L3 and DHCP agents,
add a periodic check of the overall health of registered agents.
Log the total count of agents at debug level so it can be
seen in the neutron-server logs.
If some agents are found dead, log detailed info about them:
type of agent, last heartbeat, host.

Change-Id: I5db81dad4e9e8325ad3fa3a3e6d5d2d0deb297dd
Closes-Bug: #1453320
This commit is contained in:
Eugene Nikanorov 2015-05-07 01:06:09 +04:00 committed by enikanorov
parent 6fac4b8331
commit 8ee51f253c
6 changed files with 57 additions and 7 deletions

View File

@ -26,6 +26,7 @@ from sqlalchemy import sql
from neutron.api.v2 import attributes
from neutron.common import constants
from neutron import context
from neutron.db import model_base
from neutron.db import models_v2
from neutron.extensions import agent as ext_agent
@ -191,6 +192,26 @@ class AgentDbMixin(ext_agent.AgentPluginBase):
agents = [agent for agent in agents if agent['alive'] == alive]
return agents
def agent_health_check(self):
    """Report the liveness of administratively enabled agents in the log.

    Agents with ``admin_state_up`` set are fetched using an admin context.
    When at least one of them is not alive, a warning is logged carrying
    the dead/total counts and a table with one row (type, last heartbeat,
    host) per dead agent. Otherwise only the number of agents is logged
    at debug level.
    """
    enabled_agents = self.get_agents(context.get_admin_context(),
                                     filters={'admin_state_up': [True]})
    down = [entry for entry in enabled_agents if not entry['alive']]
    if not down:
        LOG.debug("Agent healthcheck: found %s active agents",
                  len(enabled_agents))
        return

    # Header line, then one row per dead agent; the first two columns are
    # padded to a width of 20 characters.
    header = '%20s %20s %s\n' % ('Type', 'Last heartbeat', "host")
    rows = []
    for entry in down:
        rows.append('%20s %20s %s' % (entry['agent_type'],
                                      entry['heartbeat_timestamp'],
                                      entry['host']))
    report = header + '\n'.join(rows)
    LOG.warn(_LW("Agent healthcheck: found %(count)s dead agents "
                 "out of %(total)s:\n%(data)s"),
             {'count': len(down),
              'total': len(enabled_agents),
              'data': report})
def _get_agent_by_type_and_host(self, context, agent_type, host):
query = self._model_query(context, Agent)
try:

View File

@ -118,16 +118,19 @@ class AgentSchedulerDbMixin(agents_db.AgentDbMixin):
original_agent['host'])
return result
# NOTE(review): this span reads like a diff rendering whose +/- markers were
# stripped: two versions of the same periodic-check helper are interleaved.
# Presumably setup_agent_status_check (one self.periodic_agent_loop) is the
# pre-change version and add_agent_status_check (a list of loops) is its
# replacement -- confirm against the actual tree before relying on this.
def setup_agent_status_check(self, function):
self.periodic_agent_loop = loopingcall.FixedIntervalLoopingCall(
function)
# Starts `function` as a periodic call and remembers the loop object on the
# instance, so several checks can be registered one after another.
def add_agent_status_check(self, function):
loop = loopingcall.FixedIntervalLoopingCall(function)
# TODO(enikanorov): make interval configurable rather than computed
# Half of agent_down_time (integer division), but never less than 1.
interval = max(cfg.CONF.agent_down_time // 2, 1)
# add random initial delay to allow agents to check in after the
# neutron server first starts. random to offset multiple servers
initial_delay = random.randint(interval, interval * 2)
# NOTE(review): the next two lines use the single-loop attribute and are
# presumably the removed (pre-change) start call -- confirm.
self.periodic_agent_loop.start(interval=interval,
initial_delay=initial_delay)
loop.start(interval=interval, initial_delay=initial_delay)
# The list of started loops is created on first registration, so no
# attribute has to be set up beforehand.
if hasattr(self, 'periodic_agent_loops'):
self.periodic_agent_loops.append(loop)
else:
self.periodic_agent_loops = [loop]
def agent_dead_limit_seconds(self):
return cfg.CONF.agent_down_time * 2
@ -166,7 +169,7 @@ class DhcpAgentSchedulerDbMixin(dhcpagentscheduler
"automatic network rescheduling is disabled."))
return
self.setup_agent_status_check(self.remove_networks_from_down_agents)
self.add_agent_status_check(self.remove_networks_from_down_agents)
def is_eligible_agent(self, context, active, agent):
# eligible agent is active or starting up

View File

@ -82,7 +82,7 @@ class L3AgentSchedulerDbMixin(l3agentscheduler.L3AgentSchedulerPluginBase,
"automatic router rescheduling is disabled."))
return
self.setup_agent_status_check(
self.add_agent_status_check(
self.reschedule_routers_from_down_agents)
def reschedule_routers_from_down_agents(self):

View File

@ -148,6 +148,7 @@ class Ml2Plugin(db_base_plugin_v2.NeutronDbPluginV2,
self.mechanism_manager.initialize()
self._setup_dhcp()
self._start_rpc_notifiers()
self.add_agent_status_check(self.agent_health_check)
LOG.info(_LI("Modular L2 Plugin initialization complete"))
def _setup_rpc(self):

View File

@ -409,6 +409,10 @@ class PluginFixture(fixtures.Fixture):
'neutron.db.agentschedulers_db.DhcpAgentSchedulerDbMixin.'
'start_periodic_dhcp_agent_status_check')
self.patched_dhcp_periodic = self.dhcp_periodic_p.start()
self.agent_health_check_p = mock.patch(
'neutron.db.agentschedulers_db.DhcpAgentSchedulerDbMixin.'
'add_agent_status_check')
self.agent_health_check = self.agent_health_check_p.start()
# Plugin cleanup should be triggered last so that
# test-specific cleanup has a chance to release references.
self.addCleanup(self.cleanup_core_plugin)

View File

@ -161,6 +161,27 @@ class TestAgentsDbMixin(TestAgentsDbBase):
agent = self.plugin.get_agents(self.context)[0]
self.assertFalse(agent['admin_state_up'])
def test_agent_health_check(self):
    """Check that agent_health_check logs debug or warn as appropriate.

    With the single agent alive only ``LOG.debug`` must be called; once
    the agent is marked dead, ``LOG.warn`` must be called exactly once
    with the counts and the formatted table of dead agents.
    """
    agents = [{'agent_type': "DHCP Agent",
               'heartbeat_timestamp': '2015-05-06 22:40:40.432295',
               'host': 'some.node',
               'alive': True}]
    # The table is built with '%20s %20s %s', so the first two columns are
    # right-aligned to a width of 20; the timestamp is longer than 20
    # characters and is therefore not padded.
    expected_data = (
        "                Type       Last heartbeat host\n"
        "          DHCP Agent 2015-05-06 22:40:40.432295 some.node")
    with mock.patch.object(self.plugin, 'get_agents',
                           return_value=agents),\
            mock.patch.object(agents_db.LOG, 'warn') as warn,\
            mock.patch.object(agents_db.LOG, 'debug') as debug:
        # All agents alive: only the debug summary is expected.
        self.plugin.agent_health_check()
        self.assertTrue(debug.called)
        self.assertFalse(warn.called)
        # The patched get_agents returns this same list object, so
        # mutating it in place changes what the next call sees.
        agents[0]['alive'] = False
        self.plugin.agent_health_check()
        warn.assert_called_once_with(
            mock.ANY,
            {'count': 1, 'total': 1, 'data': expected_data}
        )
class TestAgentsDbGetAgents(TestAgentsDbBase):
scenarios = [