From 5ee377952badd94d08425aab41853916092acd07 Mon Sep 17 00:00:00 2001 From: LIU Yulong Date: Sat, 11 Apr 2020 08:41:28 +0800 Subject: [PATCH] Not remove the running router when MQ is unreachable When the L3 agent get a router update notification, it will try to retrieve the router info from neutron server. But at this time, if the message queue is down/unreachable. It will get exceptions related message queue. The resync actions will be run then. Sometimes, rabbitMQ cluster is not so much easy to recover. Then Long time MQ recover time will cause the router info sync RPC never get successful until it meets the max retry time. Then the bad thing happens, L3 agent is trying to remove the router now. It basically shutdown all the existing L3 traffic of this router. This patch directly removes the final router removal action, let the router run as it is. Closes-Bug: #1871850 Change-Id: I9062638366b45a7a930f31185cd6e23901a43957 (cherry picked from commit 12b9149e20665d80c11f1ef3d2283e1fa6f3b693) --- neutron/agent/l3/agent.py | 3 -- neutron/tests/unit/agent/l3/test_agent.py | 42 ++++++++++++++++++----- 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/neutron/agent/l3/agent.py b/neutron/agent/l3/agent.py index 0805c5d7672..1c426e52e05 100644 --- a/neutron/agent/l3/agent.py +++ b/neutron/agent/l3/agent.py @@ -585,9 +585,6 @@ class L3NATAgent(ha.AgentMixin, if router_update.hit_retry_limit(): LOG.warning("Hit retry limit with router update for %s, action %s", router_update.id, router_update.action) - if router_update.action != DELETE_ROUTER: - LOG.debug("Deleting router %s", router_update.id) - self._safe_router_removed(router_update.id) return router_update.timestamp = timeutils.utcnow() router_update.priority = priority diff --git a/neutron/tests/unit/agent/l3/test_agent.py b/neutron/tests/unit/agent/l3/test_agent.py index 0410fc19dfb..a373fae2026 100644 --- a/neutron/tests/unit/agent/l3/test_agent.py +++ b/neutron/tests/unit/agent/l3/test_agent.py @@ -2696,35 +2696,59 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework): self._test_process_routers_update_rpc_timeout(ext_net_call=True, ext_net_call_failed=True) - @mock.patch.object(pd, 'remove_router') - def _test_process_routers_update_router_deleted(self, remove_router, - error=False): + def test_process_routers_update_router_update(self): agent = l3_agent.L3NATAgent(HOSTNAME, self.conf) agent._queue = mock.Mock() update = mock.Mock() update.resource = None - update.action = 1 # ROUTER_DELETED + update.action = l3_agent.ADD_UPDATE_ROUTER router_info = mock.MagicMock() agent.router_info[update.id] = router_info router_processor = mock.Mock() agent._queue.each_update_to_next_resource.side_effect = [ [(router_processor, update)]] agent._resync_router = mock.Mock() + agent._safe_router_removed = mock.Mock() + agent.plugin_rpc = mock.MagicMock() + agent.plugin_rpc.get_routers.side_effect = ( + Exception("Failed to get router info")) + # start test + agent._process_router_update() + router_info.delete.assert_not_called() + self.assertFalse(router_info.delete.called) + self.assertTrue(agent.router_info) + self.assertTrue(agent._resync_router.called) + self.assertFalse(agent._safe_router_removed.called) + + def _test_process_routers_update_router_deleted(self, + error=False): + agent = l3_agent.L3NATAgent(HOSTNAME, self.conf) + agent._queue = mock.Mock() + update = mock.Mock() + update.resource = None + update.action = l3_agent.DELETE_ROUTER + router_info = mock.MagicMock() + agent.router_info[update.id] = router_info + router_processor = mock.Mock() + agent._queue.each_update_to_next_resource.side_effect = [ + [(router_processor, update)]] + agent._resync_router = mock.Mock() + agent._safe_router_removed = mock.Mock() if error: - agent._safe_router_removed = mock.Mock() agent._safe_router_removed.return_value = False agent._process_router_update() if error: self.assertFalse(router_processor.fetched_and_processed.called) agent._resync_router.assert_called_with(update) - self.assertFalse(remove_router.called) + self.assertTrue(agent._safe_router_removed.called) else: - router_info.delete.assert_called_once_with() - self.assertFalse(agent.router_info) + router_info.delete.assert_not_called() + self.assertFalse(router_info.delete.called) + self.assertTrue(agent.router_info) self.assertFalse(agent._resync_router.called) router_processor.fetched_and_processed.assert_called_once_with( update.timestamp) - self.assertTrue(remove_router.called) + self.assertTrue(agent._safe_router_removed.called) def test_process_routers_update_router_deleted_success(self): self._test_process_routers_update_router_deleted()