Merge "Add periodic agents health check."

This commit is contained in:
Jenkins 2015-10-05 20:18:21 +00:00 committed by Gerrit Code Review
commit 254651ae84
6 changed files with 57 additions and 7 deletions

View File

@ -27,6 +27,7 @@ from sqlalchemy import sql
from neutron.api.v2 import attributes
from neutron.common import constants
from neutron import context
from neutron.db import model_base
from neutron.db import models_v2
from neutron.extensions import agent as ext_agent
@ -248,6 +249,26 @@ class AgentDbMixin(ext_agent.AgentPluginBase, AgentAvailabilityZoneMixin):
agents = [agent for agent in agents if agent['alive'] == alive]
return agents
def agent_health_check(self):
    """Report on agent liveness through the module logger.

    Fetches, with an admin context, every agent whose ``admin_state_up``
    is True. When at least one of them is not alive, a warning is logged
    with a table (type, last heartbeat, host) of the dead agents;
    otherwise a debug line with the number of agents is logged.
    """
    admin_ctx = context.get_admin_context()
    enabled = self.get_agents(admin_ctx,
                              filters={'admin_state_up': [True]})
    down = [entry for entry in enabled if not entry['alive']]

    if not down:
        LOG.debug("Agent healthcheck: found %s active agents",
                  len(enabled))
        return

    # Header and rows share one layout so the columns line up.
    row_layout = '%20s %20s %s'
    table = [row_layout % ('Type', 'Last heartbeat', "host")]
    for entry in down:
        table.append(row_layout % (entry['agent_type'],
                                   entry['heartbeat_timestamp'],
                                   entry['host']))

    LOG.warn(_LW("Agent healthcheck: found %(count)s dead agents "
                 "out of %(total)s:\n%(data)s"),
             {'count': len(down),
              'total': len(enabled),
              'data': '\n'.join(table)})
def _get_agent_by_type_and_host(self, context, agent_type, host):
query = self._model_query(context, Agent)
try:

View File

@ -118,16 +118,19 @@ class AgentSchedulerDbMixin(agents_db.AgentDbMixin):
original_agent['host'])
return result
# NOTE(review): this span looks like a rendered diff hunk in which the lines
# of the removed setup_agent_status_check() and of the added
# add_agent_status_check() are interleaved without +/- markers -- TODO confirm
# against the real file which lines actually survive.
def setup_agent_status_check(self, function):
self.periodic_agent_loop = loopingcall.FixedIntervalLoopingCall(
function)
def add_agent_status_check(self, function):
# Each registered callable gets its own fixed-interval looping call.
loop = loopingcall.FixedIntervalLoopingCall(function)
# TODO(enikanorov): make interval configurable rather than computed
# Half of the configured agent_down_time (integer division), floored at 1.
interval = max(cfg.CONF.agent_down_time // 2, 1)
# add random initial delay to allow agents to check in after the
# neutron server first starts. random to offset multiple servers
initial_delay = random.randint(interval, interval * 2)
self.periodic_agent_loop.start(interval=interval,
initial_delay=initial_delay)
loop.start(interval=interval, initial_delay=initial_delay)
# Keep a reference to every started loop on the instance; the list is
# created lazily the first time a check is added.
if hasattr(self, 'periodic_agent_loops'):
self.periodic_agent_loops.append(loop)
else:
self.periodic_agent_loops = [loop]
def agent_dead_limit_seconds(self):
return cfg.CONF.agent_down_time * 2
@ -166,7 +169,7 @@ class DhcpAgentSchedulerDbMixin(dhcpagentscheduler
"automatic network rescheduling is disabled."))
return
self.setup_agent_status_check(self.remove_networks_from_down_agents)
self.add_agent_status_check(self.remove_networks_from_down_agents)
def is_eligible_agent(self, context, active, agent):
# eligible agent is active or starting up

View File

@ -82,7 +82,7 @@ class L3AgentSchedulerDbMixin(l3agentscheduler.L3AgentSchedulerPluginBase,
"automatic router rescheduling is disabled."))
return
self.setup_agent_status_check(
self.add_agent_status_check(
self.reschedule_routers_from_down_agents)
def reschedule_routers_from_down_agents(self):

View File

@ -149,6 +149,7 @@ class Ml2Plugin(db_base_plugin_v2.NeutronDbPluginV2,
self.mechanism_manager.initialize()
self._setup_dhcp()
self._start_rpc_notifiers()
self.add_agent_status_check(self.agent_health_check)
LOG.info(_LI("Modular L2 Plugin initialization complete"))
def _setup_rpc(self):

View File

@ -409,6 +409,10 @@ class PluginFixture(fixtures.Fixture):
'neutron.db.agentschedulers_db.DhcpAgentSchedulerDbMixin.'
'start_periodic_dhcp_agent_status_check')
self.patched_dhcp_periodic = self.dhcp_periodic_p.start()
self.agent_health_check_p = mock.patch(
'neutron.db.agentschedulers_db.DhcpAgentSchedulerDbMixin.'
'add_agent_status_check')
self.agent_health_check = self.agent_health_check_p.start()
# Plugin cleanup should be triggered last so that
# test-specific cleanup has a chance to release references.
self.addCleanup(self.cleanup_core_plugin)

View File

@ -161,6 +161,27 @@ class TestAgentsDbMixin(TestAgentsDbBase):
agent = self.plugin.get_agents(self.context)[0]
self.assertFalse(agent['admin_state_up'])
def test_agent_health_check(self):
    """Check what agent_health_check() logs for alive and dead agents.

    With a single alive agent only LOG.debug must be used. Once that
    agent is flagged as not alive, LOG.warn must be called exactly once
    with the dead/total counts and the formatted table of dead agents.
    """
    agents = [{'agent_type': "DHCP Agent",
               'heartbeat_timestamp': '2015-05-06 22:40:40.432295',
               'host': 'some.node',
               'alive': True}]
    with mock.patch.object(self.plugin, 'get_agents',
                           return_value=agents),\
            mock.patch.object(agents_db.LOG, 'warn') as warn,\
            mock.patch.object(agents_db.LOG, 'debug') as debug:
        # All agents alive: only the debug summary is expected.
        self.plugin.agent_health_check()
        self.assertTrue(debug.called)
        self.assertFalse(warn.called)

        # Flip the same agent to dead; get_agents returns the same list
        # object, so the health check sees the change.
        agents[0]['alive'] = False
        self.plugin.agent_health_check()
        # The table is rendered with '%20s %20s %s', so the first two
        # columns are right-aligned in 20-character fields; the expected
        # text must carry that padding to match. The timestamp is longer
        # than 20 characters and therefore is not padded.
        warn.assert_called_once_with(
            mock.ANY,
            {'count': 1, 'total': 1,
             'data': "                Type       Last heartbeat host\n"
             "          DHCP Agent 2015-05-06 22:40:40.432295 some.node"}
        )
class TestAgentsDbGetAgents(TestAgentsDbBase):
scenarios = [