[L3] Use processing queue for network update events

RouterInfo's _process_internal_ports() method is the one that
manipulates the router_info.internal_ports cache, and the L3 agent's
network_update() method relies on that same cache to check whether the
updated network is connected to the router or not.
The two therefore shouldn't run at the same time, as that may cause
race conditions and unexpected issues, such as the one described in the
related bug.

Until now, the network_update event was the only one which was processed
without using the queue of events. Because of that, race conditions
such as the one described above were possible.
To fix that, this patch changes the network_update method so that it
now adds an update event to the queue for each router hosted by the
agent. When each of those per-router events is processed, the agent
checks whether the network is actually connected to the router and,
if so, schedules a router update to be processed.

Conflicts:
    neutron/agent/l3/agent.py

Closes-Bug: #1933234
Change-Id: I2efe66a7415f7a18fb85bd2536a1901e751d6203
(cherry picked from commit 6ce48c30bd)
This commit is contained in:
Slawek Kaplonski 2021-07-07 12:00:14 +02:00
parent 25f4864d12
commit 604b055c71
3 changed files with 123 additions and 71 deletions

View File

@ -78,6 +78,7 @@ DELETE_RELATED_ROUTER = 2
ADD_UPDATE_ROUTER = 3 ADD_UPDATE_ROUTER = 3
ADD_UPDATE_RELATED_ROUTER = 4 ADD_UPDATE_RELATED_ROUTER = 4
PD_UPDATE = 5 PD_UPDATE = 5
UPDATE_NETWORK = 6
RELATED_ACTION_MAP = {DELETE_ROUTER: DELETE_RELATED_ROUTER, RELATED_ACTION_MAP = {DELETE_ROUTER: DELETE_RELATED_ROUTER,
ADD_UPDATE_ROUTER: ADD_UPDATE_RELATED_ROUTER} ADD_UPDATE_ROUTER: ADD_UPDATE_RELATED_ROUTER}
@ -595,6 +596,16 @@ class L3NATAgent(ha.AgentMixin,
network_id = kwargs['network']['id'] network_id = kwargs['network']['id']
LOG.debug("Got network %s update", network_id) LOG.debug("Got network %s update", network_id)
for ri in self.router_info.values(): for ri in self.router_info.values():
update = queue.ResourceUpdate(ri.router_id,
PRIORITY_RPC,
action=UPDATE_NETWORK,
resource=network_id)
self._queue.add(update)
def _process_network_update(self, router_id, network_id):
ri = self.router_info.get(router_id)
if not ri:
return
LOG.debug("Checking if router %s is plugged to the network %s", LOG.debug("Checking if router %s is plugged to the network %s",
ri, network_id) ri, network_id)
ports = list(ri.internal_ports) ports = list(ri.internal_ports)
@ -690,11 +701,24 @@ class L3NATAgent(ha.AgentMixin,
router_update.resource = None # Force the agent to resync the router router_update.resource = None # Force the agent to resync the router
self._queue.add(router_update) self._queue.add(router_update)
def _process_router_update(self): def _process_update(self):
if self._exiting: if self._exiting:
return return
for rp, update in self._queue.each_update_to_next_resource(): for rp, update in self._queue.each_update_to_next_resource():
LOG.info("Starting processing update %s, action %s, priority %s, "
"update_id %s. Wait time elapsed: %.3f",
update.id, update.action, update.priority,
update.update_id,
update.time_elapsed_since_create)
if update.action == UPDATE_NETWORK:
self._process_network_update(
router_id=update.id,
network_id=update.resource)
else:
self._process_router_update(rp, update)
def _process_router_update(self, rp, update):
LOG.info("Starting router update for %s, action %s, priority %s, " LOG.info("Starting router update for %s, action %s, priority %s, "
"update_id %s. Wait time elapsed: %.3f", "update_id %s. Wait time elapsed: %.3f",
update.id, update.action, update.priority, update.id, update.action, update.priority,
@ -706,7 +730,7 @@ class L3NATAgent(ha.AgentMixin,
"update_id. %s. Time elapsed: %.3f", "update_id. %s. Time elapsed: %.3f",
update.id, update.update_id, update.id, update.update_id,
update.time_elapsed_since_start) update.time_elapsed_since_start)
continue return
routers = [update.resource] if update.resource else [] routers = [update.resource] if update.resource else []
@ -723,7 +747,7 @@ class L3NATAgent(ha.AgentMixin,
msg = "Failed to fetch router information for '%s'" msg = "Failed to fetch router information for '%s'"
LOG.exception(msg, update.id) LOG.exception(msg, update.id)
self._resync_router(update) self._resync_router(update)
continue return
# For a related action, verify the router is still hosted here, # For a related action, verify the router is still hosted here,
# since it could have just been deleted and we don't want to # since it could have just been deleted and we don't want to
@ -741,15 +765,15 @@ class L3NATAgent(ha.AgentMixin,
# processing queue (like events from fullsync) in order to # processing queue (like events from fullsync) in order to
# prevent deleted router re-creation # prevent deleted router re-creation
rp.fetched_and_processed(update.timestamp) rp.fetched_and_processed(update.timestamp)
LOG.info("Finished a router update for %s, update_id %s. " LOG.info("Finished a router delete for %s, update_id %s. "
"Time elapsed: %.3f", "Time elapsed: %.3f",
update.id, update.update_id, update.id, update.update_id,
update.time_elapsed_since_start) update.time_elapsed_since_start)
continue return
if not self._process_routers_if_compatible(routers, update): if not self._process_routers_if_compatible(routers, update):
self._resync_router(update) self._resync_router(update)
continue return
rp.fetched_and_processed(update.timestamp) rp.fetched_and_processed(update.timestamp)
LOG.info("Finished a router update for %s, update_id %s. " LOG.info("Finished a router update for %s, update_id %s. "
@ -804,7 +828,7 @@ class L3NATAgent(ha.AgentMixin,
def _process_routers_loop(self): def _process_routers_loop(self):
LOG.debug("Starting _process_routers_loop") LOG.debug("Starting _process_routers_loop")
while not self._exiting: while not self._exiting:
self._pool.spawn_n(self._process_router_update) self._pool.spawn_n(self._process_update)
# NOTE(kevinbenton): this is set to 1 second because the actual interval # NOTE(kevinbenton): this is set to 1 second because the actual interval
# is controlled by a FixedIntervalLoopingCall in neutron/service.py that # is controlled by a FixedIntervalLoopingCall in neutron/service.py that

View File

@ -226,7 +226,7 @@ class L3AgentTestCase(framework.L3AgentTestFramework):
# make sure all events are processed # make sure all events are processed
while not self.agent._queue._queue.empty(): while not self.agent._queue._queue.empty():
self.agent._process_router_update() self.agent._process_update()
for r in routers_to_keep: for r in routers_to_keep:
self.assertIn(r['id'], self.agent.router_info) self.assertIn(r['id'], self.agent.router_info)

View File

@ -2266,11 +2266,11 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
agent._create_router = mock.Mock(return_value=ri) agent._create_router = mock.Mock(return_value=ri)
agent._fetch_external_net_id = mock.Mock( agent._fetch_external_net_id = mock.Mock(
return_value=router['external_gateway_info']['network_id']) return_value=router['external_gateway_info']['network_id'])
agent._process_router_update() agent._process_update()
log_exception.assert_has_calls(calls) log_exception.assert_has_calls(calls)
ri.initialize.side_effect = None ri.initialize.side_effect = None
agent._process_router_update() agent._process_update()
self.assertTrue(ri.delete.called) self.assertTrue(ri.delete.called)
self.assertEqual(2, ri.initialize.call_count) self.assertEqual(2, ri.initialize.call_count)
self.assertEqual(2, agent._create_router.call_count) self.assertEqual(2, agent._create_router.call_count)
@ -2577,6 +2577,17 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
self.assertFalse(agent._queue.add.called) self.assertFalse(agent._queue.add.called)
def test_network_update(self): def test_network_update(self):
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
agent.router_info = {
_uuid(): mock.Mock(),
_uuid(): mock.Mock()}
network_id = _uuid()
agent._queue = mock.Mock()
network = {'id': network_id}
agent.network_update(None, network=network)
self.assertEqual(2, agent._queue.add.call_count)
def test__process_network_update(self):
router = l3_test_common.prepare_router_data(num_internal_ports=2) router = l3_test_common.prepare_router_data(num_internal_ports=2)
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf) agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
agent._process_added_router(router) agent._process_added_router(router)
@ -2585,10 +2596,27 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
internal_ports = ri.router.get(lib_constants.INTERFACE_KEY, []) internal_ports = ri.router.get(lib_constants.INTERFACE_KEY, [])
network_id = internal_ports[0]['network_id'] network_id = internal_ports[0]['network_id']
agent._queue = mock.Mock() agent._queue = mock.Mock()
network = {'id': network_id} agent._process_network_update(ri.router_id, network_id)
agent.network_update(None, network=network)
self.assertEqual(1, agent._queue.add.call_count) self.assertEqual(1, agent._queue.add.call_count)
def test__process_network_update_no_router_info_found(self):
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
network_id = _uuid()
agent._queue = mock.Mock()
agent._process_network_update(_uuid(), network_id)
agent._queue.add.assert_not_called()
def test__process_network_update_not_connected_to_router(self):
router = l3_test_common.prepare_router_data(num_internal_ports=2)
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
agent._process_added_router(router)
ri = l3router.RouterInfo(agent, router['id'],
router, **self.ri_kwargs)
network_id = _uuid()
agent._queue = mock.Mock()
agent._process_network_update(ri.router_id, network_id)
agent._queue.add.assert_not_called()
def test_create_router_namespace(self): def test_create_router_namespace(self):
self.mock_ip.ensure_namespace.return_value = self.mock_ip self.mock_ip.ensure_namespace.return_value = self.mock_ip
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf) agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
@ -2724,7 +2752,7 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
update.resource = None update.resource = None
agent._queue.each_update_to_next_resource.side_effect = [ agent._queue.each_update_to_next_resource.side_effect = [
[(None, update)]] [(None, update)]]
agent._process_router_update() agent._process_update()
self.assertFalse(agent.fullsync) self.assertFalse(agent.fullsync)
self.assertEqual(ext_net_call, self.assertEqual(ext_net_call,
agent._process_router_if_compatible.called) agent._process_router_if_compatible.called)
@ -2751,13 +2779,13 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
resource=router, resource=router,
timestamp=timeutils.utcnow()) timestamp=timeutils.utcnow())
agent._queue.add(update) agent._queue.add(update)
agent._process_router_update() agent._process_update()
# The update contained the router object, get_routers won't be called # The update contained the router object, get_routers won't be called
self.assertFalse(agent.plugin_rpc.get_routers.called) self.assertFalse(agent.plugin_rpc.get_routers.called)
# The update failed, assert that get_routers was called # The update failed, assert that get_routers was called
agent._process_router_update() agent._process_update()
self.assertTrue(agent.plugin_rpc.get_routers.called) self.assertTrue(agent.plugin_rpc.get_routers.called)
def test_process_routers_update_rpc_timeout_on_get_ext_net(self): def test_process_routers_update_rpc_timeout_on_get_ext_net(self):
@ -2781,7 +2809,7 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
agent.plugin_rpc.get_routers.side_effect = ( agent.plugin_rpc.get_routers.side_effect = (
Exception("Failed to get router info")) Exception("Failed to get router info"))
# start test # start test
agent._process_router_update() agent._process_update()
router_info.delete.assert_not_called() router_info.delete.assert_not_called()
self.assertFalse(router_info.delete.called) self.assertFalse(router_info.delete.called)
self.assertTrue(agent.router_info) self.assertTrue(agent.router_info)
@ -2804,7 +2832,7 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
agent._safe_router_removed = mock.Mock() agent._safe_router_removed = mock.Mock()
if error: if error:
agent._safe_router_removed.return_value = False agent._safe_router_removed.return_value = False
agent._process_router_update() agent._process_update()
if error: if error:
self.assertFalse(router_processor.fetched_and_processed.called) self.assertFalse(router_processor.fetched_and_processed.called)
agent._resync_router.assert_called_with(update) agent._resync_router.assert_called_with(update)