Merge "Add periodic agents health check."
This commit is contained in:
commit
254651ae84
@ -27,6 +27,7 @@ from sqlalchemy import sql
|
||||
|
||||
from neutron.api.v2 import attributes
|
||||
from neutron.common import constants
|
||||
from neutron import context
|
||||
from neutron.db import model_base
|
||||
from neutron.db import models_v2
|
||||
from neutron.extensions import agent as ext_agent
|
||||
@ -248,6 +249,26 @@ class AgentDbMixin(ext_agent.AgentPluginBase, AgentAvailabilityZoneMixin):
|
||||
agents = [agent for agent in agents if agent['alive'] == alive]
|
||||
return agents
|
||||
|
||||
def agent_health_check(self):
    """Report agents that are administratively up but not alive.

    Fetches, with an admin context, every agent whose ``admin_state_up``
    is True. When at least one of them has a false ``alive`` value, a
    single warning is logged containing the dead/total counts and a table
    (type, last heartbeat, host) with one row per dead agent. Otherwise
    the number of agents found is logged at debug level.
    """
    admin_ctx = context.get_admin_context()
    enabled_agents = self.get_agents(admin_ctx,
                                     filters={'admin_state_up': [True]})
    dead = [entry for entry in enabled_agents if not entry['alive']]

    if not dead:
        LOG.debug("Agent healthcheck: found %s active agents",
                  len(enabled_agents))
        return

    # Header line first, then one line per dead agent in the same
    # column layout; rows are newline-separated with no trailing newline.
    header = '%20s %20s %s\n' % ('Type', 'Last heartbeat', "host")
    rows = []
    for entry in dead:
        rows.append('%20s %20s %s' % (entry['agent_type'],
                                      entry['heartbeat_timestamp'],
                                      entry['host']))
    report = header + '\n'.join(rows)

    LOG.warn(_LW("Agent healthcheck: found %(count)s dead agents "
                 "out of %(total)s:\n%(data)s"),
             {'count': len(dead),
              'total': len(enabled_agents),
              'data': report})
|
||||
|
||||
def _get_agent_by_type_and_host(self, context, agent_type, host):
|
||||
query = self._model_query(context, Agent)
|
||||
try:
|
||||
|
@ -118,16 +118,19 @@ class AgentSchedulerDbMixin(agents_db.AgentDbMixin):
|
||||
original_agent['host'])
|
||||
return result
|
||||
|
||||
def setup_agent_status_check(self, function):
|
||||
self.periodic_agent_loop = loopingcall.FixedIntervalLoopingCall(
|
||||
function)
|
||||
def add_agent_status_check(self, function):
|
||||
loop = loopingcall.FixedIntervalLoopingCall(function)
|
||||
# TODO(enikanorov): make interval configurable rather than computed
|
||||
interval = max(cfg.CONF.agent_down_time // 2, 1)
|
||||
# add random initial delay to allow agents to check in after the
|
||||
# neutron server first starts. random to offset multiple servers
|
||||
initial_delay = random.randint(interval, interval * 2)
|
||||
self.periodic_agent_loop.start(interval=interval,
|
||||
initial_delay=initial_delay)
|
||||
loop.start(interval=interval, initial_delay=initial_delay)
|
||||
|
||||
if hasattr(self, 'periodic_agent_loops'):
|
||||
self.periodic_agent_loops.append(loop)
|
||||
else:
|
||||
self.periodic_agent_loops = [loop]
|
||||
|
||||
def agent_dead_limit_seconds(self):
|
||||
return cfg.CONF.agent_down_time * 2
|
||||
@ -166,7 +169,7 @@ class DhcpAgentSchedulerDbMixin(dhcpagentscheduler
|
||||
"automatic network rescheduling is disabled."))
|
||||
return
|
||||
|
||||
self.setup_agent_status_check(self.remove_networks_from_down_agents)
|
||||
self.add_agent_status_check(self.remove_networks_from_down_agents)
|
||||
|
||||
def is_eligible_agent(self, context, active, agent):
|
||||
# eligible agent is active or starting up
|
||||
|
@ -82,7 +82,7 @@ class L3AgentSchedulerDbMixin(l3agentscheduler.L3AgentSchedulerPluginBase,
|
||||
"automatic router rescheduling is disabled."))
|
||||
return
|
||||
|
||||
self.setup_agent_status_check(
|
||||
self.add_agent_status_check(
|
||||
self.reschedule_routers_from_down_agents)
|
||||
|
||||
def reschedule_routers_from_down_agents(self):
|
||||
|
@ -149,6 +149,7 @@ class Ml2Plugin(db_base_plugin_v2.NeutronDbPluginV2,
|
||||
self.mechanism_manager.initialize()
|
||||
self._setup_dhcp()
|
||||
self._start_rpc_notifiers()
|
||||
self.add_agent_status_check(self.agent_health_check)
|
||||
LOG.info(_LI("Modular L2 Plugin initialization complete"))
|
||||
|
||||
def _setup_rpc(self):
|
||||
|
@ -409,6 +409,10 @@ class PluginFixture(fixtures.Fixture):
|
||||
'neutron.db.agentschedulers_db.DhcpAgentSchedulerDbMixin.'
|
||||
'start_periodic_dhcp_agent_status_check')
|
||||
self.patched_dhcp_periodic = self.dhcp_periodic_p.start()
|
||||
self.agent_health_check_p = mock.patch(
|
||||
'neutron.db.agentschedulers_db.DhcpAgentSchedulerDbMixin.'
|
||||
'add_agent_status_check')
|
||||
self.agent_health_check = self.agent_health_check_p.start()
|
||||
# Plugin cleanup should be triggered last so that
|
||||
# test-specific cleanup has a chance to release references.
|
||||
self.addCleanup(self.cleanup_core_plugin)
|
||||
|
@ -161,6 +161,27 @@ class TestAgentsDbMixin(TestAgentsDbBase):
|
||||
agent = self.plugin.get_agents(self.context)[0]
|
||||
self.assertFalse(agent['admin_state_up'])
|
||||
|
||||
def test_agent_health_check(self):
    """Check which log level agent_health_check uses for live/dead agents.

    ``get_agents`` is patched to return a single agent. While that agent
    is alive only ``LOG.debug`` may be called; once it is flipped to dead,
    ``LOG.warn`` must be called exactly once with the counts and the
    formatted report table.
    """
    agents = [{'agent_type': "DHCP Agent",
               'heartbeat_timestamp': '2015-05-06 22:40:40.432295',
               'host': 'some.node',
               'alive': True}]
    with mock.patch.object(self.plugin, 'get_agents',
                           return_value=agents),\
            mock.patch.object(agents_db.LOG, 'warn') as warn,\
            mock.patch.object(agents_db.LOG, 'debug') as debug:
        # Healthy agent: debug message only, no warning.
        self.plugin.agent_health_check()
        self.assertTrue(debug.called)
        self.assertFalse(warn.called)

        # Same dict object is returned by the patched get_agents, so
        # mutating it makes the next check see a dead agent.
        agents[0]['alive'] = False
        self.plugin.agent_health_check()
        # The report uses '%20s %20s %s': the first two columns are
        # right-justified to width 20 (the 26-character timestamp
        # exceeds that width and is therefore not padded).
        warn.assert_called_once_with(
            mock.ANY,
            {'count': 1, 'total': 1,
             'data': "                Type       Last heartbeat host\n"
             "          DHCP Agent 2015-05-06 22:40:40.432295 some.node"}
        )
|
||||
|
||||
|
||||
class TestAgentsDbGetAgents(TestAgentsDbBase):
|
||||
scenarios = [
|
||||
|
Loading…
Reference in New Issue
Block a user