Move db query to fetch down bindings under try/except

In case of intermittent DB failures, the router and network auto-rescheduling
tasks may fail due to an error while fetching down bindings from the DB.
These queries need to be put under try/except to prevent an unexpected exit.

Closes-Bug: #1546110
Change-Id: Id48e899a5b3d906c6d1da4d03923bdda2681cd92
This commit is contained in:
Oleg Bondarev 2016-02-16 18:03:52 +03:00 committed by Brian Haley
parent ce29313415
commit b6ec40cbf7
4 changed files with 37 additions and 17 deletions

View File

@ -268,14 +268,13 @@ class DhcpAgentSchedulerDbMixin(dhcpagentscheduler
cutoff = self.get_cutoff_time(agent_dead_limit)
context = ncontext.get_admin_context()
down_bindings = (
context.session.query(NetworkDhcpAgentBinding).
join(agents_db.Agent).
filter(agents_db.Agent.heartbeat_timestamp < cutoff,
agents_db.Agent.admin_state_up))
dhcp_notifier = self.agent_notifiers.get(constants.AGENT_TYPE_DHCP)
try:
down_bindings = (
context.session.query(NetworkDhcpAgentBinding).
join(agents_db.Agent).
filter(agents_db.Agent.heartbeat_timestamp < cutoff,
agents_db.Agent.admin_state_up))
dhcp_notifier = self.agent_notifiers.get(constants.AGENT_TYPE_DHCP)
dead_bindings = [b for b in
self._filter_bindings(context, down_bindings)]
agents = self.get_agents_db(

View File

@ -97,17 +97,20 @@ class L3AgentSchedulerDbMixin(l3agentscheduler.L3AgentSchedulerPluginBase,
cutoff = self.get_cutoff_time(agent_dead_limit)
context = n_ctx.get_admin_context()
down_bindings = (
context.session.query(RouterL3AgentBinding).
join(agents_db.Agent).
filter(agents_db.Agent.heartbeat_timestamp < cutoff,
agents_db.Agent.admin_state_up).
outerjoin(l3_attrs_db.RouterExtraAttributes,
l3_attrs_db.RouterExtraAttributes.router_id ==
RouterL3AgentBinding.router_id).
filter(sa.or_(l3_attrs_db.RouterExtraAttributes.ha == sql.false(),
l3_attrs_db.RouterExtraAttributes.ha == sql.null())))
try:
down_bindings = (
context.session.query(RouterL3AgentBinding).
join(agents_db.Agent).
filter(agents_db.Agent.heartbeat_timestamp < cutoff,
agents_db.Agent.admin_state_up).
outerjoin(l3_attrs_db.RouterExtraAttributes,
l3_attrs_db.RouterExtraAttributes.router_id ==
RouterL3AgentBinding.router_id).
filter(sa.or_(l3_attrs_db.RouterExtraAttributes.ha ==
sql.false(),
l3_attrs_db.RouterExtraAttributes.ha ==
sql.null())))
agents_back_online = set()
for binding in down_bindings:
if binding.l3_agent_id in agents_back_online:

View File

@ -687,6 +687,16 @@ class OvsAgentSchedulerTestCase(OvsAgentSchedulerTestCaseBase):
self._take_down_agent_and_run_reschedule(L3_HOSTA) # Value error
self._take_down_agent_and_run_reschedule(L3_HOSTA) # Exception
def test_router_rescheduler_catches_exceptions_on_fetching_bindings(self):
    """Router rescheduling must not propagate a DB error from the query.

    The admin context is replaced by a mock whose ``session.query`` raises
    ``DBError``; the test passes as long as the reschedule call returns
    without raising.
    """
    failing_ctx = mock.Mock()
    failing_ctx.session.query.side_effect = db_exc.DBError()
    with mock.patch('neutron.context.get_admin_context',
                    return_value=failing_ctx):
        service_plugins = manager.NeutronManager.get_service_plugins()
        l3_plugin = service_plugins.get(service_constants.L3_ROUTER_NAT)
        # check that no exception is raised
        l3_plugin.reschedule_routers_from_down_agents()
def test_router_rescheduler_iterates_after_reschedule_failure(self):
plugin = manager.NeutronManager.get_service_plugins().get(
service_constants.L3_ROUTER_NAT)

View File

@ -366,6 +366,14 @@ class TestNetworksFailover(TestDhcpSchedulerBaseTestCase,
# just make sure that no exception is raised
self.remove_networks_from_down_agents()
def test_reschedule_network_catches_exceptions_on_fetching_bindings(self):
    """Network rescheduling must not propagate an error from the query.

    The admin context is replaced by a mock whose ``session.query`` raises
    a generic ``Exception``; the test passes as long as
    ``remove_networks_from_down_agents`` returns without raising.
    """
    failing_ctx = mock.Mock()
    failing_ctx.session.query.side_effect = Exception()
    with mock.patch('neutron.context.get_admin_context',
                    return_value=failing_ctx):
        # just make sure that no exception is raised
        self.remove_networks_from_down_agents()
def test_reschedule_doesnt_occur_if_no_agents(self):
agents = self._create_and_set_agents_down(['host-a', 'host-b'], 2)
self._test_schedule_bind_network([agents[0]], self.network_id)