From 90496824c0253d2534f299ebcf5dc00774f70fe7 Mon Sep 17 00:00:00 2001 From: LIU Yulong Date: Wed, 30 Jan 2019 09:54:52 +0800 Subject: [PATCH] Dynamically increase l3 router process queue green pool size There is a race condition between nova-compute booting an instance and the l3-agent processing the DVR (local) router on the compute node. This issue can be seen when a large number of instances are booted on the same host and the instances belong to different DVR routers, so the l3-agent has to process all these DVR routers on that host concurrently. For now we have a green pool for the router ResourceProcessingQueue with 8 greenlets, but some of these routers can still be left waiting; even worse, there are time-consuming actions during the router processing procedure, for instance installing ARP entries, iptables rules, route rules, etc. So when the VM is up, it will try to get metadata via the local proxy hosted by the DVR router, but the router is not ready yet on that host. As a result those instances will not be able to set up some configuration in the guest OS. This patch adds a new measure based on the router quantity to determine the L3 router process queue green pool size. The pool size will be limited to between 8 (the original value) and 32, because we do not want the L3 agent to consume too many host resources processing routers on the compute node. 
Related-Bug: #1813787 Change-Id: I62393864a103d666d5d9d379073f5fc23ac7d114 (cherry picked from commit 837c9283abd4ccb56d5b4ad0eb1ca435cd2fdf3b) --- neutron/agent/l3/agent.py | 22 +++++++++++++-- .../functional/agent/l3/test_legacy_router.py | 27 +++++++++++++++++++ 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/neutron/agent/l3/agent.py b/neutron/agent/l3/agent.py index 4533d910ad1..b53e347fc86 100644 --- a/neutron/agent/l3/agent.py +++ b/neutron/agent/l3/agent.py @@ -21,6 +21,7 @@ from neutron_lib.callbacks import registry from neutron_lib.callbacks import resources from neutron_lib import constants as lib_const from neutron_lib import context as n_context +from oslo_concurrency import lockutils from oslo_config import cfg from oslo_context import context as common_context from oslo_log import log as logging @@ -65,6 +66,9 @@ LOG = logging.getLogger(__name__) SYNC_ROUTERS_MAX_CHUNK_SIZE = 256 SYNC_ROUTERS_MIN_CHUNK_SIZE = 32 +ROUTER_PROCESS_GREENLET_MAX = 32 +ROUTER_PROCESS_GREENLET_MIN = 8 + def log_verbose_exc(message, router_payload): LOG.exception(message) @@ -239,6 +243,8 @@ class L3NATAgent(ha.AgentMixin, self.driver, self.metadata_driver) + # L3 agent router processing green pool + self._pool = eventlet.GreenPool(size=ROUTER_PROCESS_GREENLET_MIN) self._queue = queue.ResourceProcessingQueue() super(L3NATAgent, self).__init__(host=self.conf.host) @@ -356,6 +362,15 @@ class L3NATAgent(ha.AgentMixin, return legacy_router.LegacyRouter(*args, **kwargs) + @lockutils.synchronized('resize_greenpool') + def _resize_process_pool(self): + self._pool_size = max([ROUTER_PROCESS_GREENLET_MIN, + min([ROUTER_PROCESS_GREENLET_MAX, + len(self.router_info)])]) + LOG.info("Resizing router processing queue green pool size to: %d", + self._pool_size) + self._pool.resize(self._pool_size) + def _router_added(self, router_id, router): ri = self._create_router(router_id, router) registry.notify(resources.ROUTER, events.BEFORE_CREATE, @@ -378,6 +393,8 @@ class 
L3NATAgent(ha.AgentMixin, LOG.exception('Error while deleting router %s', router_id) + self._resize_process_pool() + def _safe_router_removed(self, router_id): """Try to delete a router and return True if successful.""" @@ -419,6 +436,8 @@ class L3NATAgent(ha.AgentMixin, registry.notify(resources.ROUTER, events.AFTER_DELETE, self, router=ri) + self._resize_process_pool() + def init_extension_manager(self, connection): l3_ext_manager.register_opts(self.conf) self.agent_api = l3_ext_api.L3AgentExtensionAPI(self.router_info) @@ -655,9 +674,8 @@ class L3NATAgent(ha.AgentMixin, def _process_routers_loop(self): LOG.debug("Starting _process_routers_loop") - pool = eventlet.GreenPool(size=8) while True: - pool.spawn_n(self._process_router_update) + self._pool.spawn_n(self._process_router_update) # NOTE(kevinbenton): this is set to 1 second because the actual interval # is controlled by a FixedIntervalLoopingCall in neutron/service.py that diff --git a/neutron/tests/functional/agent/l3/test_legacy_router.py b/neutron/tests/functional/agent/l3/test_legacy_router.py index c97aec51a20..c3f132c98ab 100644 --- a/neutron/tests/functional/agent/l3/test_legacy_router.py +++ b/neutron/tests/functional/agent/l3/test_legacy_router.py @@ -21,6 +21,7 @@ from neutron_lib.callbacks import registry from neutron_lib.callbacks import resources from neutron_lib import constants as lib_constants +from neutron.agent.l3 import agent as l3_agent from neutron.agent.l3 import namespace_manager from neutron.agent.l3 import namespaces from neutron.agent.linux import ip_lib @@ -107,6 +108,32 @@ class L3AgentTestCase(framework.L3AgentTestFramework): self.assertIsNone(device.route.get_gateway()) + def test_router_processing_pool_size(self): + router_info_1 = self.generate_router_info(False) + r1 = self.manage_router(self.agent, router_info_1) + self.assertEqual(l3_agent.ROUTER_PROCESS_GREENLET_MIN, + self.agent._pool.size) + + router_info_2 = self.generate_router_info(False) + r2 = 
self.manage_router(self.agent, router_info_2) + self.assertEqual(l3_agent.ROUTER_PROCESS_GREENLET_MIN, + self.agent._pool.size) + + router_info_list = [r1, r2] + for _i in range(l3_agent.ROUTER_PROCESS_GREENLET_MAX + 1): + ri = self.generate_router_info(False) + rtr = self.manage_router(self.agent, ri) + router_info_list.append(rtr) + + self.assertEqual(l3_agent.ROUTER_PROCESS_GREENLET_MAX, + self.agent._pool.size) + + for router in router_info_list: + self.agent._router_removed(router.router_id) + + self.assertEqual(l3_agent.ROUTER_PROCESS_GREENLET_MIN, + self.agent._pool.size) + def _make_bridge(self): bridge = framework.get_ovs_bridge(utils.get_rand_name()) bridge.create()