Modify api and rpc default number of workers

- Limit the number of API workers so that they use roughly half of system RAM. Spawning a bunch, only to have the OOM killer nuke them regularly, is not useful. - Bump the rpc_workers default to half of the api_workers value. A default of 1 falls behind on any reasonably sized node. Change-Id: I8b84a359f83133014b3d4414aafc10e6b7c6a876 Closes-bug: #1815629
2019-02-12 08:47:19 -07:00
parent 418e3f398b
commit 7e09b25b96
9 changed files with 156 additions and 27 deletions
--- a/doc/source/admin/config-wsgi.rst
+++ b/doc/source/admin/config-wsgi.rst
@@ -127,3 +127,25 @@ serve this job:
    # /usr/bin/neutron-rpc-server --config-file /etc/neutron/neutron.conf --config-file /etc/neutron/plugins/ml2/ml2_conf.ini

 .. end
+
+Neutron Worker Processes
+------------------------
+
+Neutron will attempt to spawn a number of child processes for handling API
+and RPC requests. By default, the number of API workers is set to the number
+of CPU cores, further limited by available memory, and the number of RPC
+workers is set to half that number.
+
+It is strongly recommended that all deployers set these values themselves,
+via the api_workers and rpc_workers configuration parameters.
+
+For a cloud with a high load on a relatively small number of objects,
+a smaller value for api_workers (somewhere around 4-8) will provide
+better performance than a large one. For a cloud with a high load on
+lots of different objects, the more workers the better. Budget for
+neutron-server using about 2GB of RAM in steady-state.
+
+For rpc_workers, there need to be enough workers to keep up with incoming
+events from the various neutron agents. Signs that there are too few
+include agent heartbeats arriving late, nova vif bindings timing out
+on the hypervisors, or rpc message timeout exceptions in agent logs.
--- a/neutron/cmd/status.py
+++ b/neutron/cmd/status.py
@@ -18,6 +18,7 @@ from oslo_log import log as logging
 from oslo_upgradecheck import upgradecheck

 from neutron.conf import common as neutron_conf_base
+from neutron.conf import service as neutron_conf_service

 CHECKS_ENTRYPOINTS = 'neutron.status.upgrade.checks'
 LOG = logging.getLogger(__name__)
@@ -50,6 +51,8 @@ def setup_conf(conf=cfg.CONF):
    """

    neutron_conf_base.register_core_common_config_opts(conf)
+    neutron_conf_service.register_service_opts(
+        neutron_conf_service.service_opts, cfg.CONF)
    return conf


--- a/neutron/cmd/upgrade_checks/checks.py
+++ b/neutron/cmd/upgrade_checks/checks.py
@@ -12,6 +12,7 @@
 # License for the specific language governing permissions and limitations
 # under the License.

+from oslo_config import cfg
 from oslo_upgradecheck import upgradecheck

 from neutron._i18n import _
@@ -22,12 +23,19 @@ class CoreChecks(base.BaseChecks):

    def get_checks(self):
        return [
-            (_("Check nothing"), self.noop_check)
+            (_("Worker counts configured"), self.worker_count_check)
        ]

    @staticmethod
-    def noop_check(checker):
-        # NOTE(slaweq) This is only example Noop check, it can be removed when
-        # some real check methods will be added
-        return upgradecheck.Result(
-            upgradecheck.Code.SUCCESS, _("Always succeed (placeholder)"))
+    def worker_count_check(checker):
+
+        if cfg.CONF.api_workers and cfg.CONF.rpc_workers:
+            return upgradecheck.Result(
+                upgradecheck.Code.SUCCESS, _("Number of workers already "
+                "defined in config"))
+        else:
+            return upgradecheck.Result(
+                upgradecheck.Code.WARNING, _("The default number of workers "
+                "has changed. Please see release notes for the new values, "
+                "but it is strongly encouraged for deployers to manually set "
+                "the values for api_workers and rpc_workers."))
--- a/neutron/conf/service.py
+++ b/neutron/conf/service.py
@@ -26,10 +26,12 @@ service_opts = [
    cfg.IntOpt('api_workers',
               help=_('Number of separate API worker processes for service. '
                      'If not specified, the default is equal to the number '
-                      'of CPUs available for best performance.')),
+                      'of CPUs available for best performance, capped by '
+                      'potential RAM usage.')),
    cfg.IntOpt('rpc_workers',
-               default=1,
-               help=_('Number of RPC worker processes for service.')),
+               help=_('Number of RPC worker processes for service. '
+                      'If not specified, the default is equal to half the '
+                      'number of API workers.')),
    cfg.IntOpt('rpc_state_report_workers',
               default=1,
               help=_('Number of RPC worker processes dedicated to state '
--- a/neutron/service.py
+++ b/neutron/service.py
@@ -33,6 +33,7 @@ from oslo_service import loopingcall
 from oslo_service import service as common_service
 from oslo_utils import excutils
 from oslo_utils import importutils
+import psutil

 from neutron.common import config
 from neutron.common import profiler
@@ -148,28 +149,51 @@ class RpcReportsWorker(RpcWorker):
    start_listeners_method = 'start_rpc_state_reports_listener'


-def _get_rpc_workers():
-    plugin = directory.get_plugin()
+def _get_worker_count():
+    # Start with the number of CPUs
+    num_workers = processutils.get_worker_count()
+
+    # Now don't use more than half the system memory, assuming
+    # a steady-state bloat of around 2GB.
+    mem = psutil.virtual_memory()
+    mem_workers = int(mem.total / (2 * 1024 * 1024 * 1024))
+    if mem_workers < num_workers:
+        num_workers = mem_workers
+
+    # And just in case, always at least one.
+    if num_workers <= 0:
+        num_workers = 1
+
+    return num_workers
+
+
+def _get_rpc_workers(plugin=None):
+    if plugin is None:
+        plugin = directory.get_plugin()
    service_plugins = directory.get_plugins().values()

-    if cfg.CONF.rpc_workers < 1:
-        cfg.CONF.set_override('rpc_workers', 1)
+    workers = cfg.CONF.rpc_workers
+    if workers is None:
+        # By default, half as many rpc workers as api workers
+        workers = int(_get_worker_count() / 2)
+    if workers < 1:
+        workers = 1

-    # If 0 < rpc_workers then start_rpc_listeners would be called in a
+    # If workers > 0 then start_rpc_listeners would be called in a
    # subprocess and we cannot simply catch the NotImplementedError.  It is
    # simpler to check this up front by testing whether the plugin supports
    # multiple RPC workers.
    if not plugin.rpc_workers_supported():
        LOG.debug("Active plugin doesn't implement start_rpc_listeners")
-        if 0 < cfg.CONF.rpc_workers:
+        if workers > 0:
            LOG.error("'rpc_workers = %d' ignored because "
                      "start_rpc_listeners is not implemented.",
-                      cfg.CONF.rpc_workers)
+                      workers)
        raise NotImplementedError()

    # passing service plugins only, because core plugin is among them
    rpc_workers = [RpcWorker(service_plugins,
-                             worker_process_count=cfg.CONF.rpc_workers)]
+                             worker_process_count=workers)]

    if (cfg.CONF.rpc_state_report_workers > 0 and
            plugin.rpc_state_report_workers_supported()):
@@ -283,7 +307,7 @@ def start_plugins_workers():
 def _get_api_workers():
    workers = cfg.CONF.api_workers
    if workers is None:
-        workers = processutils.get_worker_count()
+        workers = _get_worker_count()
    return workers


--- a/neutron/tests/functional/test_service.py
+++ b/neutron/tests/functional/test_service.py
@@ -24,8 +24,9 @@ from neutron.tests.functional import test_server
 class TestService(base.BaseLoggingTestCase):

    def test_api_workers_default(self):
-        self.assertEqual(processutils.get_worker_count(),
-                         neutron_service._get_api_workers())
+        # This value may end up being scaled downward based on available RAM.
+        self.assertGreaterEqual(processutils.get_worker_count(),
+                                neutron_service._get_api_workers())

    def test_api_workers_from_config(self):
        cfg.CONF.set_override('api_workers', 1234)
--- a/neutron/tests/unit/cmd/upgrade_checks/test_checks.py
+++ b/neutron/tests/unit/cmd/upgrade_checks/test_checks.py
@@ -13,6 +13,7 @@
 # under the License.

 import mock
+from oslo_config import cfg
 from oslo_upgradecheck.upgradecheck import Code

 from neutron.cmd.upgrade_checks import checks
@@ -28,6 +29,26 @@ class TestChecks(base.BaseTestCase):
    def test_get_checks_list(self):
        self.assertIsInstance(self.checks.get_checks(), list)

-    def test_noop_check(self):
-        check_result = checks.CoreChecks.noop_check(mock.Mock())
-        self.assertEqual(Code.SUCCESS, check_result.code)
+    def test_worker_check_good(self):
+        cfg.CONF.set_override("api_workers", 2)
+        cfg.CONF.set_override("rpc_workers", 2)
+        result = checks.CoreChecks.worker_count_check(mock.Mock())
+        self.assertEqual(Code.SUCCESS, result.code)
+
+    def test_worker_check_api_missing(self):
+        cfg.CONF.set_override("api_workers", None)
+        cfg.CONF.set_override("rpc_workers", 2)
+        result = checks.CoreChecks.worker_count_check(mock.Mock())
+        self.assertEqual(Code.WARNING, result.code)
+
+    def test_worker_check_rpc_missing(self):
+        cfg.CONF.set_override("api_workers", 2)
+        cfg.CONF.set_override("rpc_workers", None)
+        result = checks.CoreChecks.worker_count_check(mock.Mock())
+        self.assertEqual(Code.WARNING, result.code)
+
+    def test_worker_check_both_missing(self):
+        cfg.CONF.set_override("api_workers", None)
+        cfg.CONF.set_override("rpc_workers", None)
+        result = checks.CoreChecks.worker_count_check(mock.Mock())
+        self.assertEqual(Code.WARNING, result.code)
--- a/neutron/tests/unit/test_service.py
+++ b/neutron/tests/unit/test_service.py
@@ -18,6 +18,7 @@ import mock
 from neutron_lib.callbacks import events
 from neutron_lib.callbacks import registry
 from neutron_lib.callbacks import resources
+from oslo_concurrency import processutils
 from oslo_config import cfg

 from neutron import service
@@ -25,6 +26,14 @@ from neutron.tests import base
 from neutron.tests.unit import test_wsgi


+class TestServiceHelpers(base.BaseTestCase):
+
+    def test_get_workers(self):
+        num_workers = service._get_worker_count()
+        self.assertGreaterEqual(num_workers, 1)
+        self.assertLessEqual(num_workers, processutils.get_worker_count())
+
+
 class TestRpcWorker(test_wsgi.TestServiceBase):

    def test_reset(self):
@@ -33,12 +42,36 @@ class TestRpcWorker(test_wsgi.TestServiceBase):
        self._test_reset(rpc_worker)


+class TestRunRpcWorkers(base.BaseTestCase):
+    def setUp(self):
+        super(TestRunRpcWorkers, self).setUp()
+        self.worker_count = service._get_worker_count()
+
+    def _test_rpc_workers(self, config_value, expected_passed_value):
+        if config_value is not None:
+            cfg.CONF.set_override('rpc_workers', config_value)
+        with mock.patch('neutron.service.RpcWorker') as mock_rpc_worker:
+            with mock.patch('neutron.service.RpcReportsWorker'):
+                service._get_rpc_workers(plugin=mock.Mock())
+        init_call = mock_rpc_worker.call_args
+        expected_call = mock.call(
+            mock.ANY, worker_process_count=expected_passed_value)
+        self.assertEqual(expected_call, init_call)
+
+    def test_rpc_workers_zero(self):
+        self._test_rpc_workers(0, 1)
+
+    def test_rpc_workers_default(self):
+        self._test_rpc_workers(None, int(self.worker_count / 2))
+
+    def test_rpc_workers_defined(self):
+        self._test_rpc_workers(42, 42)
+
+
 class TestRunWsgiApp(base.BaseTestCase):
    def setUp(self):
        super(TestRunWsgiApp, self).setUp()
-        self.processor_count = mock.patch(
-            'oslo_concurrency.processutils.get_worker_count'
-        ).start().return_value
+        self.worker_count = service._get_worker_count()

    def _test_api_workers(self, config_value, expected_passed_value):
        if config_value is not None:
@@ -54,7 +87,7 @@ class TestRunWsgiApp(base.BaseTestCase):
        self._test_api_workers(0, 0)

    def test_api_workers_default(self):
-        self._test_api_workers(None, self.processor_count)
+        self._test_api_workers(None, self.worker_count)

    def test_api_workers_defined(self):
        self._test_api_workers(42, 42)
--- a/releasenotes/notes/modify_api_rpc_worker_defaults-1acd62728b2b55fa.yaml
+++ b/releasenotes/notes/modify_api_rpc_worker_defaults-1acd62728b2b55fa.yaml
@@ -0,0 +1,15 @@
+upgrade:
+  - The number of api and rpc workers may change on upgrade.
+    It is strongly recommended that all deployers set these
+    values in their neutron configurations, rather than
+    using the defaults.
+fixes:
+  - Neutron API workers previously defaulted to the number
+    of CPU cores, which could get high-CPU/low-memory boxes
+    into trouble. The default is now either the number of
+    CPU cores or the number of workers estimated to fit in
+    half of system memory, whichever is lower.
+    In addition, the default number of RPC workers has been
+    changed from a value of ``1`` to a value of half the
+    number of API workers.
+