Catch broad exception in methods used in FixedIntervalLoopingCall

Unlike other places, where it might make sense to catch specific
exceptions, methods used to check the liveness of L3 and DHCP
agents via FixedIntervalLoopingCall should never let an exception
leak to the calling method and interrupt the loop.

Further improvement of FixedIntervalLoopingCall itself may be needed,
but for the sake of easy backporting it makes sense to fix the issue
in neutron first, before pushing a refactoring to the third-party
library.

Change-Id: I6a61e99a6f4e445e26ea4a9923b47e35559e5703
Closes-Bug: #1458119
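
For readers new to the failure mode: FixedIntervalLoopingCall runs its
callback repeatedly in a single green thread, and an exception escaping the
callback kills that thread, so the periodic check silently stops until the
server restarts. A minimal standalone sketch of the mechanism and of the
catch-all pattern this change applies (plain Python, not the oslo
implementation; all names are illustrative):

    import logging
    import time

    logging.basicConfig()
    LOG = logging.getLogger(__name__)


    def flaky_task():
        # hypothetical periodic job standing in for the rescheduling methods
        raise RuntimeError("transient failure, e.g. lost DB connection")


    def safe_task():
        try:
            flaky_task()
        except Exception:
            # catch whatever is raised: one bad iteration must not end the loop
            LOG.exception("Periodic task failed; will retry next interval")


    def fixed_interval_loop(task, interval, iterations=3):
        # simplified stand-in for FixedIntervalLoopingCall.start(): if task()
        # raises, the exception propagates and the task is never called again
        for _ in range(iterations):
            task()
            time.sleep(interval)


    fixed_interval_loop(safe_task, 0.01)     # logs tracebacks, keeps going
    # fixed_interval_loop(flaky_task, 0.01)  # would die on the first call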
Author: Eugene Nikanorov
Date:   2015-05-26 20:17:20 +04:00
Commit: ae8c1c5f80 (parent dee9ffc696)

4 changed files with 55 additions and 42 deletions


@@ -270,40 +270,47 @@ class DhcpAgentSchedulerDbMixin(dhcpagentscheduler
                          agents_db.Agent.admin_state_up))
         dhcp_notifier = self.agent_notifiers.get(constants.AGENT_TYPE_DHCP)
-        for binding in self._filter_bindings(context, down_bindings):
-            LOG.warn(_LW("Removing network %(network)s from agent %(agent)s "
-                         "because the agent did not report to the server in "
-                         "the last %(dead_time)s seconds."),
-                     {'network': binding.network_id,
-                      'agent': binding.dhcp_agent_id,
-                      'dead_time': agent_dead_limit})
-            # save binding object to avoid ObjectDeletedError
-            # in case binding is concurrently deleted from the DB
-            saved_binding = {'net': binding.network_id,
-                             'agent': binding.dhcp_agent_id}
-            try:
-                # do not notify agent if it considered dead
-                # so when it is restarted it won't see network delete
-                # notifications on its queue
-                self.remove_network_from_dhcp_agent(context,
-                                                    binding.dhcp_agent_id,
-                                                    binding.network_id,
-                                                    notify=False)
-            except dhcpagentscheduler.NetworkNotHostedByDhcpAgent:
-                # measures against concurrent operation
-                LOG.debug("Network %(net)s already removed from DHCP agent "
-                          "%(agent)s",
-                          saved_binding)
-                # still continue and allow concurrent scheduling attempt
-            except Exception:
-                LOG.exception(_LE("Unexpected exception occurred while "
-                                  "removing network %(net)s from agent "
-                                  "%(agent)s"),
-                              saved_binding)
-
-            if cfg.CONF.network_auto_schedule:
-                self._schedule_network(
-                    context, saved_binding['net'], dhcp_notifier)
+        try:
+            for binding in self._filter_bindings(context, down_bindings):
+                LOG.warn(_LW("Removing network %(network)s from agent "
+                             "%(agent)s because the agent did not report "
+                             "to the server in the last %(dead_time)s "
+                             "seconds."),
+                         {'network': binding.network_id,
+                          'agent': binding.dhcp_agent_id,
+                          'dead_time': agent_dead_limit})
+                # save binding object to avoid ObjectDeletedError
+                # in case binding is concurrently deleted from the DB
+                saved_binding = {'net': binding.network_id,
+                                 'agent': binding.dhcp_agent_id}
+                try:
+                    # do not notify agent if it considered dead
+                    # so when it is restarted it won't see network delete
+                    # notifications on its queue
+                    self.remove_network_from_dhcp_agent(context,
+                                                        binding.dhcp_agent_id,
+                                                        binding.network_id,
+                                                        notify=False)
+                except dhcpagentscheduler.NetworkNotHostedByDhcpAgent:
+                    # measures against concurrent operation
+                    LOG.debug("Network %(net)s already removed from DHCP "
+                              "agent %(agent)s",
+                              saved_binding)
+                    # still continue and allow concurrent scheduling attempt
+                except Exception:
+                    LOG.exception(_LE("Unexpected exception occurred while "
+                                      "removing network %(net)s from agent "
+                                      "%(agent)s"),
+                                  saved_binding)
+
+                if cfg.CONF.network_auto_schedule:
+                    self._schedule_network(
+                        context, saved_binding['net'], dhcp_notifier)
+        except Exception:
+            # we want to be thorough and catch whatever is raised
+            # to avoid loop abortion
+            LOG.exception(_LE("Exception encountered during network "
+                              "rescheduling"))
 
     def get_dhcp_agents_hosting_networks(
             self, context, network_ids, active=None, admin_state_up=None):
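
A subtlety worth noting in the new shape: because the try now encloses the
for statement itself, an exception raised while producing the bindings
(i.e. inside self._filter_bindings) is caught as well, not just one raised
while handling a single binding. A self-contained illustration of that
Python behavior:

    def bindings():
        # stand-in for _filter_bindings(...) blowing up mid-iteration
        yield 'binding-1'
        raise RuntimeError('DB went away')


    try:
        for b in bindings():
            print('processing', b)
    except Exception:
        print('caught; the periodic loop survives')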


@@ -116,9 +116,9 @@ class L3AgentSchedulerDbMixin(l3agentscheduler.L3AgentSchedulerPluginBase,
                     # so one broken one doesn't stop the iteration.
                     LOG.exception(_LE("Failed to reschedule router %s"),
                                   binding.router_id)
-        except db_exc.DBError:
-            # Catch DB errors here so a transient DB connectivity issue
-            # doesn't stop the loopingcall.
+        except Exception:
+            # we want to be thorough and catch whatever is raised
+            # to avoid loop abortion
             LOG.exception(_LE("Exception encountered during router "
                               "rescheduling."))


@@ -674,17 +674,14 @@ class OvsAgentSchedulerTestCase(OvsAgentSchedulerTestCaseBase):
             db_exc.DBError(), oslo_messaging.RemoteError(),
             l3agentscheduler.RouterReschedulingFailed(router_id='f',
                                                       agent_id='f'),
-            ValueError('this raises')
+            ValueError('this raises'),
+            Exception()
         ]).start()
-        # these first three should not raise any errors
         self._take_down_agent_and_run_reschedule(L3_HOSTA)  # DBError
         self._take_down_agent_and_run_reschedule(L3_HOSTA)  # RemoteError
         self._take_down_agent_and_run_reschedule(L3_HOSTA)  # schedule err
-        # ValueError is not caught so it should raise
-        self.assertRaises(ValueError,
-                          self._take_down_agent_and_run_reschedule,
-                          L3_HOSTA)
+        self._take_down_agent_and_run_reschedule(L3_HOSTA)  # Value error
+        self._take_down_agent_and_run_reschedule(L3_HOSTA)  # Exception
 
     def test_router_rescheduler_iterates_after_reschedule_failure(self):
         plugin = manager.NeutronManager.get_service_plugins().get(

@@ -248,6 +248,15 @@ class TestNetworksFailover(TestDhcpSchedulerBaseTestCase,
         self.assertIn('foo3', res_ids)
         self.assertIn('foo4', res_ids)
 
+    def test_reschedule_network_from_down_agent_failed_on_unexpected(self):
+        agents = self._create_and_set_agents_down(['host-a'], 1)
+        self._test_schedule_bind_network([agents[0]], self.network_id)
+        with mock.patch.object(
+                self, '_filter_bindings',
+                side_effect=Exception()):
+            # just make sure that no exception is raised
+            self.remove_networks_from_down_agents()
+
 
 class DHCPAgentWeightSchedulerTestCase(TestDhcpSchedulerBaseTestCase):
     """Unit test scenarios for WeightScheduler.schedule."""