diff --git a/neutron/db/agents_db.py b/neutron/db/agents_db.py index 9417d5e3c37..453a858feed 100644 --- a/neutron/db/agents_db.py +++ b/neutron/db/agents_db.py @@ -26,6 +26,7 @@ from sqlalchemy import sql from neutron.api.v2 import attributes from neutron.common import constants +from neutron import context from neutron.db import model_base from neutron.db import models_v2 from neutron.extensions import agent as ext_agent @@ -191,6 +192,26 @@ class AgentDbMixin(ext_agent.AgentPluginBase): agents = [agent for agent in agents if agent['alive'] == alive] return agents + def agent_health_check(self): + """Scan agents and log if some are considered dead.""" + agents = self.get_agents(context.get_admin_context(), + filters={'admin_state_up': [True]}) + dead_agents = [agent for agent in agents if not agent['alive']] + if dead_agents: + data = '%20s %20s %s\n' % ('Type', 'Last heartbeat', "host") + data += '\n'.join(['%20s %20s %s' % + (agent['agent_type'], + agent['heartbeat_timestamp'], + agent['host']) for agent in dead_agents]) + LOG.warn(_LW("Agent healthcheck: found %(count)s dead agents " + "out of %(total)s:\n%(data)s"), + {'count': len(dead_agents), + 'total': len(agents), + 'data': data}) + else: + LOG.debug("Agent healthcheck: found %s active agents", + len(agents)) + def _get_agent_by_type_and_host(self, context, agent_type, host): query = self._model_query(context, Agent) try: diff --git a/neutron/db/agentschedulers_db.py b/neutron/db/agentschedulers_db.py index 591db0d2972..924cdb41699 100644 --- a/neutron/db/agentschedulers_db.py +++ b/neutron/db/agentschedulers_db.py @@ -118,16 +118,19 @@ class AgentSchedulerDbMixin(agents_db.AgentDbMixin): original_agent['host']) return result - def setup_agent_status_check(self, function): - self.periodic_agent_loop = loopingcall.FixedIntervalLoopingCall( - function) + def add_agent_status_check(self, function): + loop = loopingcall.FixedIntervalLoopingCall(function) # TODO(enikanorov): make interval configurable rather than computed interval = max(cfg.CONF.agent_down_time // 2, 1) # add random initial delay to allow agents to check in after the # neutron server first starts. random to offset multiple servers initial_delay = random.randint(interval, interval * 2) - self.periodic_agent_loop.start(interval=interval, - initial_delay=initial_delay) + loop.start(interval=interval, initial_delay=initial_delay) + + if hasattr(self, 'periodic_agent_loops'): + self.periodic_agent_loops.append(loop) + else: + self.periodic_agent_loops = [loop] def agent_dead_limit_seconds(self): return cfg.CONF.agent_down_time * 2 @@ -166,7 +169,7 @@ class DhcpAgentSchedulerDbMixin(dhcpagentscheduler "automatic network rescheduling is disabled.")) return - self.setup_agent_status_check(self.remove_networks_from_down_agents) + self.add_agent_status_check(self.remove_networks_from_down_agents) def is_eligible_agent(self, context, active, agent): # eligible agent is active or starting up diff --git a/neutron/db/l3_agentschedulers_db.py b/neutron/db/l3_agentschedulers_db.py index 0accdd7db7f..4ccde0bdaf5 100644 --- a/neutron/db/l3_agentschedulers_db.py +++ b/neutron/db/l3_agentschedulers_db.py @@ -82,7 +82,7 @@ class L3AgentSchedulerDbMixin(l3agentscheduler.L3AgentSchedulerPluginBase, "automatic router rescheduling is disabled.")) return - self.setup_agent_status_check( + self.add_agent_status_check( self.reschedule_routers_from_down_agents) def reschedule_routers_from_down_agents(self): diff --git a/neutron/plugins/ml2/plugin.py b/neutron/plugins/ml2/plugin.py index a8a406b05d3..3a1e64fe544 100644 --- a/neutron/plugins/ml2/plugin.py +++ b/neutron/plugins/ml2/plugin.py @@ -148,6 +148,7 @@ class Ml2Plugin(db_base_plugin_v2.NeutronDbPluginV2, self.mechanism_manager.initialize() self._setup_dhcp() self._start_rpc_notifiers() + self.add_agent_status_check(self.agent_health_check) LOG.info(_LI("Modular L2 Plugin initialization complete")) def _setup_rpc(self): diff --git a/neutron/tests/base.py b/neutron/tests/base.py index cd79f3eebbf..d7dd976b6db 100644 --- a/neutron/tests/base.py +++ b/neutron/tests/base.py @@ -409,6 +409,10 @@ class PluginFixture(fixtures.Fixture): 'neutron.db.agentschedulers_db.DhcpAgentSchedulerDbMixin.' 'start_periodic_dhcp_agent_status_check') self.patched_dhcp_periodic = self.dhcp_periodic_p.start() + self.agent_health_check_p = mock.patch( + 'neutron.db.agentschedulers_db.DhcpAgentSchedulerDbMixin.' + 'add_agent_status_check') + self.agent_health_check = self.agent_health_check_p.start() # Plugin cleanup should be triggered last so that # test-specific cleanup has a chance to release references. self.addCleanup(self.cleanup_core_plugin) diff --git a/neutron/tests/unit/db/test_agents_db.py b/neutron/tests/unit/db/test_agents_db.py index 3aeea2b3ab4..cabae43159e 100644 --- a/neutron/tests/unit/db/test_agents_db.py +++ b/neutron/tests/unit/db/test_agents_db.py @@ -161,6 +161,27 @@ class TestAgentsDbMixin(TestAgentsDbBase): agent = self.plugin.get_agents(self.context)[0] self.assertFalse(agent['admin_state_up']) + def test_agent_health_check(self): + agents = [{'agent_type': "DHCP Agent", + 'heartbeat_timestamp': '2015-05-06 22:40:40.432295', + 'host': 'some.node', + 'alive': True}] + with mock.patch.object(self.plugin, 'get_agents', + return_value=agents),\ + mock.patch.object(agents_db.LOG, 'warn') as warn,\ + mock.patch.object(agents_db.LOG, 'debug') as debug: + self.plugin.agent_health_check() + self.assertTrue(debug.called) + self.assertFalse(warn.called) + agents[0]['alive'] = False + self.plugin.agent_health_check() + warn.assert_called_once_with( + mock.ANY, + {'count': 1, 'total': 1, + 'data': " Type Last heartbeat host\n" + " DHCP Agent 2015-05-06 22:40:40.432295 some.node"} + ) + class TestAgentsDbGetAgents(TestAgentsDbBase): scenarios = [