Add periodic agents health check.
In addition to the periodic checks of L3 and DHCP agents, add a periodic check of the overall health of registered agents. Log the total count of agents at debug level so that it can be seen in the neutron-server logs. If some agents are found dead, log detailed info about them: agent type, last heartbeat, and host. Change-Id: I5db81dad4e9e8325ad3fa3a3e6d5d2d0deb297dd Closes-Bug: #1453320
This commit is contained in:
parent
6fac4b8331
commit
8ee51f253c
@ -26,6 +26,7 @@ from sqlalchemy import sql
|
||||
|
||||
from neutron.api.v2 import attributes
|
||||
from neutron.common import constants
|
||||
from neutron import context
|
||||
from neutron.db import model_base
|
||||
from neutron.db import models_v2
|
||||
from neutron.extensions import agent as ext_agent
|
||||
@ -191,6 +192,26 @@ class AgentDbMixin(ext_agent.AgentPluginBase):
|
||||
agents = [agent for agent in agents if agent['alive'] == alive]
|
||||
return agents
|
||||
|
||||
def agent_health_check(self):
    """Log the liveness of all administratively-enabled agents.

    Fetches agents filtered on ``admin_state_up`` being True using an
    admin context. When every returned agent reports ``alive``, a single
    debug line with the agent count is emitted. Otherwise a warning is
    logged containing a small table (type, last heartbeat, host) with one
    row per dead agent.
    """
    admin_ctx = context.get_admin_context()
    agents = self.get_agents(admin_ctx,
                             filters={'admin_state_up': [True]})
    dead_agents = [entry for entry in agents if not entry['alive']]

    # Nothing is dead: report the total and stop.
    if not dead_agents:
        LOG.debug("Agent healthcheck: found %s active agents",
                  len(agents))
        return

    # Header line (already newline-terminated) followed by one
    # right-aligned row per dead agent.
    header = '%20s %20s %s\n' % ('Type', 'Last heartbeat', "host")
    rows = []
    for entry in dead_agents:
        rows.append('%20s %20s %s' % (entry['agent_type'],
                                      entry['heartbeat_timestamp'],
                                      entry['host']))
    data = header + '\n'.join(rows)

    LOG.warn(_LW("Agent healthcheck: found %(count)s dead agents "
                 "out of %(total)s:\n%(data)s"),
             {'count': len(dead_agents),
              'total': len(agents),
              'data': data})
|
||||
|
||||
def _get_agent_by_type_and_host(self, context, agent_type, host):
|
||||
query = self._model_query(context, Agent)
|
||||
try:
|
||||
|
@ -118,16 +118,19 @@ class AgentSchedulerDbMixin(agents_db.AgentDbMixin):
|
||||
original_agent['host'])
|
||||
return result
|
||||
|
||||
def setup_agent_status_check(self, function):
    """Backward-compatible entry point for registering a status check.

    Delegates to ``add_agent_status_check`` and additionally exposes the
    loop it started as ``self.periodic_agent_loop``, the attribute this
    method set before multiple checks were supported.

    :param function: callable to run periodically.
    """
    self.add_agent_status_check(function)
    self.periodic_agent_loop = self.periodic_agent_loops[-1]

def add_agent_status_check(self, function):
    """Start a periodic looping call for ``function`` and remember it.

    Each call creates and starts its own ``FixedIntervalLoopingCall`` and
    appends it to ``self.periodic_agent_loops`` (created on first use), so
    several independent checks can be registered on one object.

    :param function: callable to run periodically.
    """
    loop = loopingcall.FixedIntervalLoopingCall(function)
    # TODO(enikanorov): make interval configurable rather than computed
    interval = max(cfg.CONF.agent_down_time // 2, 1)
    # add random initial delay to allow agents to check in after the
    # neutron server first starts. random to offset multiple servers
    initial_delay = random.randint(interval, interval * 2)

    # Start the loop created above; there is no shared single loop
    # attribute to start here any more.
    loop.start(interval=interval, initial_delay=initial_delay)

    if hasattr(self, 'periodic_agent_loops'):
        self.periodic_agent_loops.append(loop)
    else:
        self.periodic_agent_loops = [loop]
|
||||
|
||||
def agent_dead_limit_seconds(self):
    """Return twice the configured ``agent_down_time``."""
    down_time = cfg.CONF.agent_down_time
    return down_time * 2
|
||||
@ -166,7 +169,7 @@ class DhcpAgentSchedulerDbMixin(dhcpagentscheduler
|
||||
"automatic network rescheduling is disabled."))
|
||||
return
|
||||
|
||||
self.setup_agent_status_check(self.remove_networks_from_down_agents)
|
||||
self.add_agent_status_check(self.remove_networks_from_down_agents)
|
||||
|
||||
def is_eligible_agent(self, context, active, agent):
|
||||
# eligible agent is active or starting up
|
||||
|
@ -82,7 +82,7 @@ class L3AgentSchedulerDbMixin(l3agentscheduler.L3AgentSchedulerPluginBase,
|
||||
"automatic router rescheduling is disabled."))
|
||||
return
|
||||
|
||||
self.setup_agent_status_check(
|
||||
self.add_agent_status_check(
|
||||
self.reschedule_routers_from_down_agents)
|
||||
|
||||
def reschedule_routers_from_down_agents(self):
|
||||
|
@ -148,6 +148,7 @@ class Ml2Plugin(db_base_plugin_v2.NeutronDbPluginV2,
|
||||
self.mechanism_manager.initialize()
|
||||
self._setup_dhcp()
|
||||
self._start_rpc_notifiers()
|
||||
self.add_agent_status_check(self.agent_health_check)
|
||||
LOG.info(_LI("Modular L2 Plugin initialization complete"))
|
||||
|
||||
def _setup_rpc(self):
|
||||
|
@ -409,6 +409,10 @@ class PluginFixture(fixtures.Fixture):
|
||||
'neutron.db.agentschedulers_db.DhcpAgentSchedulerDbMixin.'
|
||||
'start_periodic_dhcp_agent_status_check')
|
||||
self.patched_dhcp_periodic = self.dhcp_periodic_p.start()
|
||||
self.agent_health_check_p = mock.patch(
|
||||
'neutron.db.agentschedulers_db.DhcpAgentSchedulerDbMixin.'
|
||||
'add_agent_status_check')
|
||||
self.agent_health_check = self.agent_health_check_p.start()
|
||||
# Plugin cleanup should be triggered last so that
|
||||
# test-specific cleanup has a chance to release references.
|
||||
self.addCleanup(self.cleanup_core_plugin)
|
||||
|
@ -161,6 +161,27 @@ class TestAgentsDbMixin(TestAgentsDbBase):
|
||||
agent = self.plugin.get_agents(self.context)[0]
|
||||
self.assertFalse(agent['admin_state_up'])
|
||||
|
||||
def test_agent_health_check(self):
    """Check the logging behaviour of ``agent_health_check``.

    With one alive agent only ``LOG.debug`` must be used. After the same
    agent is flipped to dead, ``LOG.warn`` must be called exactly once
    with the dead/total counts and the formatted agent table.
    """
    agents = [{'agent_type': "DHCP Agent",
               'heartbeat_timestamp': '2015-05-06 22:40:40.432295',
               'host': 'some.node',
               'alive': True}]
    # The report right-aligns its first two columns to 20 characters
    # ('%20s %20s %s'). Spell the padding out with rjust() so the
    # expectation does not depend on runs of literal spaces surviving
    # editing or rendering. The timestamp is longer than 20 characters,
    # so it is not padded.
    expected_data = (
        "%s %s host\n" % ('Type'.rjust(20), 'Last heartbeat'.rjust(20)) +
        "%s 2015-05-06 22:40:40.432295 some.node" % 'DHCP Agent'.rjust(20))
    with mock.patch.object(self.plugin, 'get_agents',
                           return_value=agents),\
            mock.patch.object(agents_db.LOG, 'warn') as warn,\
            mock.patch.object(agents_db.LOG, 'debug') as debug:
        # All agents alive: debug only, no warning.
        self.plugin.agent_health_check()
        self.assertTrue(debug.called)
        self.assertFalse(warn.called)

        # Same list object is returned by the patched get_agents, so
        # mutating it makes the next check see a dead agent.
        agents[0]['alive'] = False
        self.plugin.agent_health_check()
        warn.assert_called_once_with(
            mock.ANY,
            {'count': 1, 'total': 1,
             'data': expected_data}
        )
|
||||
|
||||
|
||||
class TestAgentsDbGetAgents(TestAgentsDbBase):
|
||||
scenarios = [
|
||||
|
Loading…
Reference in New Issue
Block a user