From ae8c1c5f80fd4fb7b4ab116677f4cff988c67cf1 Mon Sep 17 00:00:00 2001 From: Eugene Nikanorov Date: Tue, 26 May 2015 20:17:20 +0400 Subject: [PATCH] Catch broad exception in methods used in FixedIntervalLoopingCall Unlike other places where it might make sense to catch specific exceptions, methods that are used to check L3 and DHCP agents liveness via FixedIntervalLoopingCall should never allow exceptions to leak to calling method and interrupt the loop. Further improvement of FixedIntervalLoopingCall might be needed, but for the sake of easy backporting it makes sense to fix the issue in neutron before pushing refactoring to 3rd-party library. Change-Id: I6a61e99a6f4e445e26ea4a9923b47e35559e5703 Closes-Bug: #1458119 --- neutron/db/agentschedulers_db.py | 71 ++++++++++--------- neutron/db/l3_agentschedulers_db.py | 6 +- .../openvswitch/test_agent_scheduler.py | 11 ++- .../scheduler/test_dhcp_agent_scheduler.py | 9 +++ 4 files changed, 55 insertions(+), 42 deletions(-) diff --git a/neutron/db/agentschedulers_db.py b/neutron/db/agentschedulers_db.py index bac33a78f6b..61eff9b07cb 100644 --- a/neutron/db/agentschedulers_db.py +++ b/neutron/db/agentschedulers_db.py @@ -270,40 +270,47 @@ class DhcpAgentSchedulerDbMixin(dhcpagentscheduler agents_db.Agent.admin_state_up)) dhcp_notifier = self.agent_notifiers.get(constants.AGENT_TYPE_DHCP) - for binding in self._filter_bindings(context, down_bindings): - LOG.warn(_LW("Removing network %(network)s from agent %(agent)s " - "because the agent did not report to the server in " - "the last %(dead_time)s seconds."), - {'network': binding.network_id, - 'agent': binding.dhcp_agent_id, - 'dead_time': agent_dead_limit}) - # save binding object to avoid ObjectDeletedError - # in case binding is concurrently deleted from the DB - saved_binding = {'net': binding.network_id, - 'agent': binding.dhcp_agent_id} - try: - # do not notify agent if it considered dead - # so when it is restarted it won't see network delete - # notifications on its queue - self.remove_network_from_dhcp_agent(context, - binding.dhcp_agent_id, - binding.network_id, - notify=False) - except dhcpagentscheduler.NetworkNotHostedByDhcpAgent: - # measures against concurrent operation - LOG.debug("Network %(net)s already removed from DHCP agent " - "%(agent)s", - saved_binding) - # still continue and allow concurrent scheduling attempt - except Exception: - LOG.exception(_LE("Unexpected exception occurred while " - "removing network %(net)s from agent " - "%(agent)s"), + try: + for binding in self._filter_bindings(context, down_bindings): + LOG.warn(_LW("Removing network %(network)s from agent " + "%(agent)s because the agent did not report " + "to the server in the last %(dead_time)s " + "seconds."), + {'network': binding.network_id, + 'agent': binding.dhcp_agent_id, + 'dead_time': agent_dead_limit}) + # save binding object to avoid ObjectDeletedError + # in case binding is concurrently deleted from the DB + saved_binding = {'net': binding.network_id, + 'agent': binding.dhcp_agent_id} + try: + # do not notify agent if it considered dead + # so when it is restarted it won't see network delete + # notifications on its queue + self.remove_network_from_dhcp_agent(context, + binding.dhcp_agent_id, + binding.network_id, + notify=False) + except dhcpagentscheduler.NetworkNotHostedByDhcpAgent: + # measures against concurrent operation + LOG.debug("Network %(net)s already removed from DHCP " + "agent %(agent)s", saved_binding) + # still continue and allow concurrent scheduling attempt + except Exception: + LOG.exception(_LE("Unexpected exception occurred while " + "removing network %(net)s from agent " + "%(agent)s"), + saved_binding) - if cfg.CONF.network_auto_schedule: - self._schedule_network( - context, saved_binding['net'], dhcp_notifier) + if cfg.CONF.network_auto_schedule: + self._schedule_network( + context, saved_binding['net'], dhcp_notifier) + except Exception: + # we want to be thorough and catch whatever is raised + # to avoid loop abortion + LOG.exception(_LE("Exception encountered during network " + "rescheduling")) def get_dhcp_agents_hosting_networks( self, context, network_ids, active=None, admin_state_up=None): diff --git a/neutron/db/l3_agentschedulers_db.py b/neutron/db/l3_agentschedulers_db.py index f661dcc6221..59c39771852 100644 --- a/neutron/db/l3_agentschedulers_db.py +++ b/neutron/db/l3_agentschedulers_db.py @@ -116,9 +116,9 @@ class L3AgentSchedulerDbMixin(l3agentscheduler.L3AgentSchedulerPluginBase, # so one broken one doesn't stop the iteration. LOG.exception(_LE("Failed to reschedule router %s"), binding.router_id) - except db_exc.DBError: - # Catch DB errors here so a transient DB connectivity issue - # doesn't stop the loopingcall. + except Exception: + # we want to be thorough and catch whatever is raised + # to avoid loop abortion LOG.exception(_LE("Exception encountered during router " "rescheduling.")) diff --git a/neutron/tests/unit/plugins/openvswitch/test_agent_scheduler.py b/neutron/tests/unit/plugins/openvswitch/test_agent_scheduler.py index 2682014d0af..5f14037fd0e 100644 --- a/neutron/tests/unit/plugins/openvswitch/test_agent_scheduler.py +++ b/neutron/tests/unit/plugins/openvswitch/test_agent_scheduler.py @@ -674,17 +674,14 @@ class OvsAgentSchedulerTestCase(OvsAgentSchedulerTestCaseBase): db_exc.DBError(), oslo_messaging.RemoteError(), l3agentscheduler.RouterReschedulingFailed(router_id='f', agent_id='f'), - ValueError('this raises') + ValueError('this raises'), + Exception() ]).start() - # these first three should not raise any errors self._take_down_agent_and_run_reschedule(L3_HOSTA) # DBError self._take_down_agent_and_run_reschedule(L3_HOSTA) # RemoteError self._take_down_agent_and_run_reschedule(L3_HOSTA) # schedule err - - # ValueError is not caught so it should raise - self.assertRaises(ValueError, - self._take_down_agent_and_run_reschedule, - L3_HOSTA) + self._take_down_agent_and_run_reschedule(L3_HOSTA) # Value error + self._take_down_agent_and_run_reschedule(L3_HOSTA) # Exception def test_router_rescheduler_iterates_after_reschedule_failure(self): plugin = manager.NeutronManager.get_service_plugins().get( diff --git a/neutron/tests/unit/scheduler/test_dhcp_agent_scheduler.py b/neutron/tests/unit/scheduler/test_dhcp_agent_scheduler.py index 9faa8ab5bfb..5ee1adb16cd 100644 --- a/neutron/tests/unit/scheduler/test_dhcp_agent_scheduler.py +++ b/neutron/tests/unit/scheduler/test_dhcp_agent_scheduler.py @@ -248,6 +248,15 @@ class TestNetworksFailover(TestDhcpSchedulerBaseTestCase, self.assertIn('foo3', res_ids) self.assertIn('foo4', res_ids) + def test_reschedule_network_from_down_agent_failed_on_unexpected(self): + agents = self._create_and_set_agents_down(['host-a'], 1) + self._test_schedule_bind_network([agents[0]], self.network_id) + with mock.patch.object( + self, '_filter_bindings', + side_effect=Exception()): + # just make sure that no exception is raised + self.remove_networks_from_down_agents() + class DHCPAgentWeightSchedulerTestCase(TestDhcpSchedulerBaseTestCase): """Unit test scenarios for WeightScheduler.schedule."""