Modify api and rpc default number of workers
- Limit the number of api workers so that they use roughly half of system RAM. Spawning a bunch, just to have the OOM killer nuke them regularly, is not useful. - Bump the rpc_workers default to half of the api_workers. A default of 1 falls behind on any reasonably sized node. Change-Id: I8b84a359f83133014b3d4414aafc10e6b7c6a876 Closes-bug: #1815629
This commit is contained in:
parent
418e3f398b
commit
7e09b25b96
@ -127,3 +127,25 @@ serve this job:
|
||||
# /usr/bin/neutron-rpc-server --config-file /etc/neutron/neutron.conf --config-file /etc/neutron/plugins/ml2/ml2_conf.ini
|
||||
|
||||
.. end
|
||||
|
||||
Neutron Worker Processes
|
||||
------------------------
|
||||
|
||||
Neutron will attempt to spawn a number of child processes for handling API
|
||||
and RPC requests. The number of API workers is set to the number of CPU
|
||||
cores, further limited by available memory, and the number of RPC workers
|
||||
is set to half that number.
|
||||
|
||||
It is strongly recommended that all deployers set these values themselves,
|
||||
via the api_workers and rpc_workers configuration parameters.
|
||||
|
||||
For a cloud with a high load to a relatively small number of objects,
|
||||
a smaller value for api_workers (somewhere around 4-8) will provide
|
||||
better performance than many. For a cloud with a high load to lots of
|
||||
different objects, the more workers the better. Budget for each
|
||||
neutron-server worker using about 2GB of RAM in steady-state.
|
||||
|
||||
For rpc_workers, there needs to be enough to keep up with incoming
|
||||
events from the various neutron agents. Signs that there are too few
|
||||
can be agent heartbeats arriving late, nova vif bindings timing out
|
||||
on the hypervisors, or rpc message timeout exceptions in agent logs.
|
||||
|
@ -18,6 +18,7 @@ from oslo_log import log as logging
|
||||
from oslo_upgradecheck import upgradecheck
|
||||
|
||||
from neutron.conf import common as neutron_conf_base
|
||||
from neutron.conf import service as neutron_conf_service
|
||||
|
||||
CHECKS_ENTRYPOINTS = 'neutron.status.upgrade.checks'
|
||||
LOG = logging.getLogger(__name__)
|
||||
@ -50,6 +51,8 @@ def setup_conf(conf=cfg.CONF):
|
||||
"""
|
||||
|
||||
neutron_conf_base.register_core_common_config_opts(conf)
|
||||
neutron_conf_service.register_service_opts(
|
||||
neutron_conf_service.service_opts, cfg.CONF)
|
||||
return conf
|
||||
|
||||
|
||||
|
@ -12,6 +12,7 @@
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
from oslo_config import cfg
|
||||
from oslo_upgradecheck import upgradecheck
|
||||
|
||||
from neutron._i18n import _
|
||||
@ -22,12 +23,19 @@ class CoreChecks(base.BaseChecks):
|
||||
|
||||
def get_checks(self):
    """Return the upgrade checks provided by this class.

    :returns: a list of ``(translated description, check callable)``
              tuples.
    """
    # Each entry must be comma-separated; two adjacent tuples without a
    # comma are parsed as a call on the first tuple and raise TypeError.
    return [
        (_("Check nothing"), self.noop_check),
        (_("Worker counts configured"), self.worker_count_check),
    ]
|
||||
|
||||
@staticmethod
def noop_check(checker):
    """Placeholder upgrade check that always reports success.

    :param checker: not used by this check.
    :returns: an ``upgradecheck.Result`` carrying ``Code.SUCCESS``.
    """
    # NOTE(slaweq): This is only an example no-op check; it can be removed
    # once some real check methods have been added.
    message = _("Always succeed (placeholder)")
    return upgradecheck.Result(upgradecheck.Code.SUCCESS, message)
|
||||
def worker_count_check(checker):
    """Warn when worker counts are left to the computed defaults.

    :param checker: not used by this check.
    :returns: an ``upgradecheck.Result`` with ``Code.SUCCESS`` when both
              ``api_workers`` and ``rpc_workers`` have a value, and
              ``Code.WARNING`` otherwise.
    """
    # Compare against None instead of relying on truthiness: an explicit
    # value of 0 is still a value the deployer chose, so it must not be
    # reported as "not configured".
    both_configured = (cfg.CONF.api_workers is not None and
                       cfg.CONF.rpc_workers is not None)
    if both_configured:
        return upgradecheck.Result(
            upgradecheck.Code.SUCCESS, _("Number of workers already "
            "defined in config"))

    return upgradecheck.Result(
        upgradecheck.Code.WARNING, _("The default number of workers "
        "has changed. Please see release notes for the new values, "
        "but it is strongly encouraged for deployers to manually set "
        "the values for api_workers and rpc_workers."))
|
||||
|
@ -26,10 +26,12 @@ service_opts = [
|
||||
cfg.IntOpt('api_workers',
|
||||
help=_('Number of separate API worker processes for service. '
|
||||
'If not specified, the default is equal to the number '
|
||||
'of CPUs available for best performance.')),
|
||||
'of CPUs available for best performance, capped by '
|
||||
'potential RAM usage.')),
|
||||
cfg.IntOpt('rpc_workers',
|
||||
default=1,
|
||||
help=_('Number of RPC worker processes for service.')),
|
||||
help=_('Number of RPC worker processes for service. '
|
||||
'If not specified, the default is equal to half the '
|
||||
'number of API workers.')),
|
||||
cfg.IntOpt('rpc_state_report_workers',
|
||||
default=1,
|
||||
help=_('Number of RPC worker processes dedicated to state '
|
||||
|
@ -33,6 +33,7 @@ from oslo_service import loopingcall
|
||||
from oslo_service import service as common_service
|
||||
from oslo_utils import excutils
|
||||
from oslo_utils import importutils
|
||||
import psutil
|
||||
|
||||
from neutron.common import config
|
||||
from neutron.common import profiler
|
||||
@ -148,28 +149,51 @@ class RpcReportsWorker(RpcWorker):
|
||||
start_listeners_method = 'start_rpc_state_reports_listener'
|
||||
|
||||
|
||||
def _get_rpc_workers():
|
||||
plugin = directory.get_plugin()
|
||||
def _get_worker_count():
    """Return the default number of worker processes for this host.

    The count starts from ``processutils.get_worker_count()`` (the number
    of CPUs) and is capped by total system memory as reported by psutil,
    assuming a steady-state footprint of around 2GB per worker. The result
    is never lower than one.

    NOTE(review): the arithmetic below allots one worker per 2GB of
    *total* memory; confirm that this matches the intended "half of
    system memory" budget.
    """
    bytes_per_worker = 2 * 1024 * 1024 * 1024

    cpu_limit = processutils.get_worker_count()
    total_memory = psutil.virtual_memory().total
    memory_limit = int(total_memory / bytes_per_worker)

    # Take the tighter of the two limits, but always keep at least one.
    return max(min(cpu_limit, memory_limit), 1)
|
||||
|
||||
|
||||
def _get_rpc_workers(plugin=None):
|
||||
if plugin is None:
|
||||
plugin = directory.get_plugin()
|
||||
service_plugins = directory.get_plugins().values()
|
||||
|
||||
if cfg.CONF.rpc_workers < 1:
|
||||
cfg.CONF.set_override('rpc_workers', 1)
|
||||
workers = cfg.CONF.rpc_workers
|
||||
if workers is None:
|
||||
# By default, half as many rpc workers as api workers
|
||||
workers = int(_get_worker_count() / 2)
|
||||
if workers < 1:
|
||||
workers = 1
|
||||
|
||||
# If 0 < rpc_workers then start_rpc_listeners would be called in a
|
||||
# If workers > 0 then start_rpc_listeners would be called in a
|
||||
# subprocess and we cannot simply catch the NotImplementedError. It is
|
||||
# simpler to check this up front by testing whether the plugin supports
|
||||
# multiple RPC workers.
|
||||
if not plugin.rpc_workers_supported():
|
||||
LOG.debug("Active plugin doesn't implement start_rpc_listeners")
|
||||
if 0 < cfg.CONF.rpc_workers:
|
||||
if workers > 0:
|
||||
LOG.error("'rpc_workers = %d' ignored because "
|
||||
"start_rpc_listeners is not implemented.",
|
||||
cfg.CONF.rpc_workers)
|
||||
workers)
|
||||
raise NotImplementedError()
|
||||
|
||||
# passing service plugins only, because core plugin is among them
|
||||
rpc_workers = [RpcWorker(service_plugins,
|
||||
worker_process_count=cfg.CONF.rpc_workers)]
|
||||
worker_process_count=workers)]
|
||||
|
||||
if (cfg.CONF.rpc_state_report_workers > 0 and
|
||||
plugin.rpc_state_report_workers_supported()):
|
||||
@ -283,7 +307,7 @@ def start_plugins_workers():
|
||||
def _get_api_workers():
    """Return the number of API worker processes to start.

    :returns: ``cfg.CONF.api_workers`` when it is set, otherwise the
              default computed by ``_get_worker_count()``.
    """
    workers = cfg.CONF.api_workers
    if workers is None:
        # Only the memory-aware default is needed; the earlier plain
        # processutils.get_worker_count() assignment was dead code because
        # it was immediately overwritten.
        workers = _get_worker_count()
    return workers
|
||||
|
||||
|
||||
|
@ -24,8 +24,9 @@ from neutron.tests.functional import test_server
|
||||
class TestService(base.BaseLoggingTestCase):
|
||||
|
||||
def test_api_workers_default(self):
|
||||
self.assertEqual(processutils.get_worker_count(),
|
||||
neutron_service._get_api_workers())
|
||||
# This value may end up being scaled downward based on available RAM.
|
||||
self.assertGreaterEqual(processutils.get_worker_count(),
|
||||
neutron_service._get_api_workers())
|
||||
|
||||
def test_api_workers_from_config(self):
|
||||
cfg.CONF.set_override('api_workers', 1234)
|
||||
|
@ -13,6 +13,7 @@
|
||||
# under the License.
|
||||
|
||||
import mock
|
||||
from oslo_config import cfg
|
||||
from oslo_upgradecheck.upgradecheck import Code
|
||||
|
||||
from neutron.cmd.upgrade_checks import checks
|
||||
@ -28,6 +29,26 @@ class TestChecks(base.BaseTestCase):
|
||||
def test_get_checks_list(self):
    """get_checks() must hand back a list."""
    returned_checks = self.checks.get_checks()
    self.assertIsInstance(returned_checks, list)
|
||||
|
||||
def test_noop_check(self):
    """The placeholder check reports SUCCESS."""
    outcome = checks.CoreChecks.noop_check(mock.Mock())
    self.assertEqual(Code.SUCCESS, outcome.code)
|
||||
def test_worker_check_good(self):
    """Both worker options set: the check reports SUCCESS."""
    for option, value in (("api_workers", 2), ("rpc_workers", 2)):
        cfg.CONF.set_override(option, value)
    outcome = checks.CoreChecks.worker_count_check(mock.Mock())
    self.assertEqual(Code.SUCCESS, outcome.code)
|
||||
|
||||
def test_worker_check_api_missing(self):
    """api_workers unset (None): the check reports WARNING."""
    for option, value in (("api_workers", None), ("rpc_workers", 2)):
        cfg.CONF.set_override(option, value)
    outcome = checks.CoreChecks.worker_count_check(mock.Mock())
    self.assertEqual(Code.WARNING, outcome.code)
|
||||
|
||||
def test_worker_check_rpc_missing(self):
    """rpc_workers unset (None): the check reports WARNING."""
    for option, value in (("api_workers", 2), ("rpc_workers", None)):
        cfg.CONF.set_override(option, value)
    outcome = checks.CoreChecks.worker_count_check(mock.Mock())
    self.assertEqual(Code.WARNING, outcome.code)
|
||||
|
||||
def test_worker_check_both_missing(self):
    """Neither worker option set: the check reports WARNING."""
    for option in ("api_workers", "rpc_workers"):
        cfg.CONF.set_override(option, None)
    outcome = checks.CoreChecks.worker_count_check(mock.Mock())
    self.assertEqual(Code.WARNING, outcome.code)
|
||||
|
@ -18,6 +18,7 @@ import mock
|
||||
from neutron_lib.callbacks import events
|
||||
from neutron_lib.callbacks import registry
|
||||
from neutron_lib.callbacks import resources
|
||||
from oslo_concurrency import processutils
|
||||
from oslo_config import cfg
|
||||
|
||||
from neutron import service
|
||||
@ -25,6 +26,14 @@ from neutron.tests import base
|
||||
from neutron.tests.unit import test_wsgi
|
||||
|
||||
|
||||
class TestServiceHelpers(base.BaseTestCase):
    """Tests for the module-level helpers of the service module."""

    def test_get_workers(self):
        """The computed count lies between 1 and the CPU-based count."""
        computed = service._get_worker_count()
        self.assertGreaterEqual(computed, 1)
        cpu_based = processutils.get_worker_count()
        self.assertLessEqual(computed, cpu_based)
|
||||
|
||||
|
||||
class TestRpcWorker(test_wsgi.TestServiceBase):
|
||||
|
||||
def test_reset(self):
|
||||
@ -33,12 +42,36 @@ class TestRpcWorker(test_wsgi.TestServiceBase):
|
||||
self._test_reset(rpc_worker)
|
||||
|
||||
|
||||
class TestRunRpcWorkers(base.BaseTestCase):
    """Check the ``worker_process_count`` that is handed to RpcWorker."""

    def setUp(self):
        super(TestRunRpcWorkers, self).setUp()
        # Baseline the default-path expectation is derived from.
        self.worker_count = service._get_worker_count()

    def _test_rpc_workers(self, config_value, expected_passed_value):
        """Run _get_rpc_workers() and verify the RpcWorker constructor call.

        :param config_value: value to set for ``rpc_workers``; ``None``
                             leaves the option untouched.
        :param expected_passed_value: ``worker_process_count`` expected in
                                      the RpcWorker call.
        """
        if config_value is not None:
            cfg.CONF.set_override('rpc_workers', config_value)
        with mock.patch('neutron.service.RpcWorker') as mock_rpc_worker:
            with mock.patch('neutron.service.RpcReportsWorker'):
                service._get_rpc_workers(plugin=mock.Mock())
        init_call = mock_rpc_worker.call_args
        expected_call = mock.call(
            mock.ANY, worker_process_count=expected_passed_value)
        self.assertEqual(expected_call, init_call)

    def test_rpc_workers_zero(self):
        self._test_rpc_workers(0, 1)

    def test_rpc_workers_default(self):
        # The default is half of the worker count, but never less than
        # one; without the floor this expectation would be 0 whenever the
        # computed worker count is 1.
        expected = max(1, int(self.worker_count / 2))
        self._test_rpc_workers(None, expected)

    def test_rpc_workers_defined(self):
        self._test_rpc_workers(42, 42)
|
||||
|
||||
|
||||
class TestRunWsgiApp(base.BaseTestCase):
|
||||
def setUp(self):
|
||||
super(TestRunWsgiApp, self).setUp()
|
||||
self.processor_count = mock.patch(
|
||||
'oslo_concurrency.processutils.get_worker_count'
|
||||
).start().return_value
|
||||
self.worker_count = service._get_worker_count()
|
||||
|
||||
def _test_api_workers(self, config_value, expected_passed_value):
|
||||
if config_value is not None:
|
||||
@ -54,7 +87,7 @@ class TestRunWsgiApp(base.BaseTestCase):
|
||||
self._test_api_workers(0, 0)
|
||||
|
||||
def test_api_workers_default(self):
|
||||
self._test_api_workers(None, self.processor_count)
|
||||
self._test_api_workers(None, self.worker_count)
|
||||
|
||||
def test_api_workers_defined(self):
|
||||
self._test_api_workers(42, 42)
|
||||
|
@ -0,0 +1,15 @@
|
||||
upgrade:
|
||||
- The number of api and rpc workers may change on upgrade.
|
||||
It is strongly recommended that all deployers set these
|
||||
values in their neutron configurations, rather than
|
||||
using the defaults.
|
||||
fixes:
|
||||
- Neutron API workers default to the number of CPU cores.
|
||||
This can lead to high cpu/low memory boxes getting into
|
||||
trouble. The defaults have been tweaked to attempt to
|
||||
put an upper bound on the default of either the number
|
||||
of cores, or half of system memory, whichever is lower.
|
||||
In addition, the default number of RPC workers has been
|
||||
changed from a value of ``1``, to a value of half the
|
||||
number of API workers.
|
||||
|
Loading…
Reference in New Issue
Block a user