Dynamically increase l3 router process queue green pool size

There is a race condition between nova-compute booting an instance and
the l3-agent processing the DVR (local) router on the compute node. This
issue can be seen when a large number of instances are booted on the
same host and the instances are under different DVR routers. In that
case the l3-agent has to process all of these DVR routers on this host
concurrently.
For now we have a green pool for the router ResourceProcessingQueue
with 8 greenlets, but some of these routers can still be left waiting.
Even worse, there are time-consuming actions during the router
processing procedure, for instance installing ARP entries, iptables
rules, route rules, etc.
So when the VM is up, it will try to get metadata via the local proxy
hosted by the DVR router, but the router is not ready yet on that
host. As a result, those instances will not be able to set up some
configuration in the guest OS.

This patch adds a new measurement based on the router quantity to
determine the L3 router process queue green pool size. The pool size
is limited to the range from 8 (the original value) to 32, because we
do not want the L3 agent to consume too many host resources when
processing routers on the compute node.

Related-Bug: #1813787
Change-Id: I62393864a103d666d5d9d379073f5fc23ac7d114
This commit is contained in:
LIU Yulong 2019-01-30 09:54:52 +08:00 committed by LIU Yulong
parent 2f3cc51784
commit 837c9283ab
2 changed files with 47 additions and 2 deletions

View File

@ -24,6 +24,7 @@ from neutron_lib import constants as lib_const
from neutron_lib import context as n_context from neutron_lib import context as n_context
from neutron_lib.exceptions import l3 as l3_exc from neutron_lib.exceptions import l3 as l3_exc
from neutron_lib import rpc as n_rpc from neutron_lib import rpc as n_rpc
from oslo_concurrency import lockutils
from oslo_config import cfg from oslo_config import cfg
from oslo_context import context as common_context from oslo_context import context as common_context
from oslo_log import log as logging from oslo_log import log as logging
@ -81,6 +82,9 @@ PD_UPDATE = 5
RELATED_ACTION_MAP = {DELETE_ROUTER: DELETE_RELATED_ROUTER, RELATED_ACTION_MAP = {DELETE_ROUTER: DELETE_RELATED_ROUTER,
ADD_UPDATE_ROUTER: ADD_UPDATE_RELATED_ROUTER} ADD_UPDATE_ROUTER: ADD_UPDATE_RELATED_ROUTER}
ROUTER_PROCESS_GREENLET_MAX = 32
ROUTER_PROCESS_GREENLET_MIN = 8
def log_verbose_exc(message, router_payload): def log_verbose_exc(message, router_payload):
LOG.exception(message) LOG.exception(message)
@ -255,6 +259,8 @@ class L3NATAgent(ha.AgentMixin,
self.driver, self.driver,
self.metadata_driver) self.metadata_driver)
# L3 agent router processing green pool
self._pool = eventlet.GreenPool(size=ROUTER_PROCESS_GREENLET_MIN)
self._queue = queue.ResourceProcessingQueue() self._queue = queue.ResourceProcessingQueue()
super(L3NATAgent, self).__init__(host=self.conf.host) super(L3NATAgent, self).__init__(host=self.conf.host)
@ -368,6 +374,15 @@ class L3NATAgent(ha.AgentMixin,
return legacy_router.LegacyRouter(*args, **kwargs) return legacy_router.LegacyRouter(*args, **kwargs)
@lockutils.synchronized('resize_greenpool')
def _resize_process_pool(self):
    """Resize the router processing green pool to fit the router count.

    The target size is ``len(self.router_info)`` clamped to the range
    [ROUTER_PROCESS_GREENLET_MIN, ROUTER_PROCESS_GREENLET_MAX]. The
    computed value is stored in ``self._pool_size``. The pool is only
    resized (and the resize only logged) when the target differs from
    the pool's current size, so adding or removing a router that does
    not move the clamped value does not emit a log line.
    """
    pool_size = max(ROUTER_PROCESS_GREENLET_MIN,
                    min(ROUTER_PROCESS_GREENLET_MAX,
                        len(self.router_info)))
    self._pool_size = pool_size
    # This method runs on every router add/remove; skip the no-op resize
    # and the INFO message when the pool is already at the target size.
    if pool_size == self._pool.size:
        return
    LOG.info("Resizing router processing queue green pool size to: %d",
             pool_size)
    self._pool.resize(pool_size)
def _router_added(self, router_id, router): def _router_added(self, router_id, router):
ri = self._create_router(router_id, router) ri = self._create_router(router_id, router)
registry.notify(resources.ROUTER, events.BEFORE_CREATE, registry.notify(resources.ROUTER, events.BEFORE_CREATE,
@ -390,6 +405,8 @@ class L3NATAgent(ha.AgentMixin,
LOG.exception('Error while deleting router %s', LOG.exception('Error while deleting router %s',
router_id) router_id)
self._resize_process_pool()
def _safe_router_removed(self, router_id): def _safe_router_removed(self, router_id):
"""Try to delete a router and return True if successful.""" """Try to delete a router and return True if successful."""
# The l3_ext_manager API expects a router dict, look it up # The l3_ext_manager API expects a router dict, look it up
@ -422,6 +439,8 @@ class L3NATAgent(ha.AgentMixin,
registry.notify(resources.ROUTER, events.AFTER_DELETE, self, router=ri) registry.notify(resources.ROUTER, events.AFTER_DELETE, self, router=ri)
self._resize_process_pool()
def init_extension_manager(self, connection): def init_extension_manager(self, connection):
l3_ext_manager.register_opts(self.conf) l3_ext_manager.register_opts(self.conf)
self.agent_api = l3_ext_api.L3AgentExtensionAPI(self.router_info) self.agent_api = l3_ext_api.L3AgentExtensionAPI(self.router_info)
@ -639,9 +658,8 @@ class L3NATAgent(ha.AgentMixin,
def _process_routers_loop(self): def _process_routers_loop(self):
LOG.debug("Starting _process_routers_loop") LOG.debug("Starting _process_routers_loop")
pool = eventlet.GreenPool(size=8)
while True: while True:
pool.spawn_n(self._process_router_update) self._pool.spawn_n(self._process_router_update)
# NOTE(kevinbenton): this is set to 1 second because the actual interval # NOTE(kevinbenton): this is set to 1 second because the actual interval
# is controlled by a FixedIntervalLoopingCall in neutron/service.py that # is controlled by a FixedIntervalLoopingCall in neutron/service.py that

View File

@ -21,6 +21,7 @@ from neutron_lib.callbacks import registry
from neutron_lib.callbacks import resources from neutron_lib.callbacks import resources
from neutron_lib import constants as lib_constants from neutron_lib import constants as lib_constants
from neutron.agent.l3 import agent as l3_agent
from neutron.agent.l3 import namespace_manager from neutron.agent.l3 import namespace_manager
from neutron.agent.l3 import namespaces from neutron.agent.l3 import namespaces
from neutron.agent.linux import ip_lib from neutron.agent.linux import ip_lib
@ -107,6 +108,32 @@ class L3AgentTestCase(framework.L3AgentTestFramework):
self.assertIsNone(device.route.get_gateway()) self.assertIsNone(device.route.get_gateway())
def test_router_processing_pool_size(self):
    # The processing pool must stay at the minimum size while only a few
    # routers are managed, be capped at the maximum once more routers
    # than that are managed, and shrink back after they are removed.
    min_size = l3_agent.ROUTER_PROCESS_GREENLET_MIN
    max_size = l3_agent.ROUTER_PROCESS_GREENLET_MAX

    managed_routers = []

    # First and second router: still below the minimum, no growth.
    for _ in range(2):
        info = self.generate_router_info(False)
        managed_routers.append(self.manage_router(self.agent, info))
        self.assertEqual(min_size, self.agent._pool.size)

    # Push the router count past the maximum; the pool must be capped.
    for _ in range(max_size + 1):
        info = self.generate_router_info(False)
        managed_routers.append(self.manage_router(self.agent, info))

    self.assertEqual(max_size, self.agent._pool.size)

    # Remove every router again; the pool must fall back to the minimum.
    for managed in managed_routers:
        self.agent._router_removed(managed, managed.router_id)

    self.assertEqual(min_size, self.agent._pool.size)
def _make_bridge(self): def _make_bridge(self):
bridge = framework.get_ovs_bridge(utils.get_rand_name()) bridge = framework.get_ovs_bridge(utils.get_rand_name())
bridge.create() bridge.create()