From 224cdd726cc0ca1826273e46619769f87380f2d7 Mon Sep 17 00:00:00 2001
From: Dmitry Tantsur
Date: Mon, 2 Oct 2023 18:42:07 +0200
Subject: [PATCH] Bump workers_pool_size to 300 and remove queueing of tasks

Especially in a single-conductor environment, the number of threads
should be larger than max_concurrent_deploy, otherwise the latter
cannot be reached in practice or will cause issues with heartbeats.

On the other hand, this change fixes an issue with how we use
futurist. Due to a misunderstanding, we ended up setting the workers
pool size to 100 and then also allowing 100 more requests to be
queued. In short, this change moves from 100 threads + 100 queued to
300 threads and no queue.

Partial-Bug: #2038438
Change-Id: I1aeeda89a8925fbbc2dae752742f0be4bc23bee0
---
 ironic/conductor/base_manager.py               |  9 ++++---
 ironic/conf/conductor.py                       |  2 +-
 .../notes/workers-20ca5c225c1474e0.yaml        | 25 +++++++++++++++++++
 3 files changed, 32 insertions(+), 4 deletions(-)
 create mode 100644 releasenotes/notes/workers-20ca5c225c1474e0.yaml

diff --git a/ironic/conductor/base_manager.py b/ironic/conductor/base_manager.py
index 47f2081ac7..7287ea4ebf 100644
--- a/ironic/conductor/base_manager.py
+++ b/ironic/conductor/base_manager.py
@@ -125,9 +125,12 @@ class BaseConductorManager(object):
         self._keepalive_evt = threading.Event()
         """Event for the keepalive thread."""
 
-        # TODO(dtantsur): make the threshold configurable?
-        rejection_func = rejection.reject_when_reached(
-            CONF.conductor.workers_pool_size)
+        # NOTE(dtantsur): do not allow queuing work. Given our model, it's
+        # better to reject an incoming request with HTTP 503 or reschedule
+        # a periodic task than to end up with a hidden backlog that is hard
+        # to track and debug. Using 1 instead of 0 because of how things are
+        # ordered in futurist (it checks for rejection first).
+        rejection_func = rejection.reject_when_reached(1)
         self._executor = futurist.GreenThreadPoolExecutor(
             max_workers=CONF.conductor.workers_pool_size,
             check_and_reject=rejection_func)
diff --git a/ironic/conf/conductor.py b/ironic/conf/conductor.py
index 25b0453da3..416d30ccb0 100644
--- a/ironic/conf/conductor.py
+++ b/ironic/conf/conductor.py
@@ -22,7 +22,7 @@ from ironic.common.i18n import _
 
 opts = [
     cfg.IntOpt('workers_pool_size',
-               default=100, min=3,
+               default=300, min=3,
                help=_('The size of the workers greenthread pool. '
                       'Note that 2 threads will be reserved by the conductor '
                       'itself for handling heart beats and periodic tasks. '
diff --git a/releasenotes/notes/workers-20ca5c225c1474e0.yaml b/releasenotes/notes/workers-20ca5c225c1474e0.yaml
new file mode 100644
index 0000000000..3b55be736b
--- /dev/null
+++ b/releasenotes/notes/workers-20ca5c225c1474e0.yaml
@@ -0,0 +1,25 @@
+---
+issues:
+  - |
+    When configuring a single-conductor environment, make sure the size of
+    the worker pool (``[conductor]workers_pool_size``) is larger than
+    the maximum number of parallel deployments
+    (``[conductor]max_concurrent_deploy``). This was not the case by default
+    previously (the options used to be set to 100 and 250 respectively).
+upgrade:
+  - |
+    Because of a fix in the internal worker pool handling, you may now
+    start seeing requests rejected with HTTP 503 under a very high load
+    earlier than before. In this case, try increasing the
+    ``[conductor]workers_pool_size`` option or consider adding more conductors.
+  - |
+    The default worker pool size (the ``[conductor]workers_pool_size``
+    option) has been increased from 100 to 300. You may want to consider
+    increasing it even further if your environment allows that.
+fixes:
+  - |
+    Fixes handling new requests when the maximum number of internal workers
+    is reached. Previously, after reaching the maximum number of workers
+    (100 by default), we would queue the same number of requests (100
+    again). This was not intentional, and now Ironic no longer queues
+    requests if there are no free threads to run them.
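
As a minimal, standalone sketch (not part of the patch itself), the snippet
below illustrates the futurist behaviour the commit message relies on: with
check_and_reject=rejection.reject_when_reached(1), a submission is rejected
as soon as the executor has any backlog, instead of being queued. It uses
futurist.ThreadPoolExecutor rather than GreenThreadPoolExecutor so it runs
without eventlet; the pool size, task function and counters are made up for
illustration.

import time

import futurist
from futurist import rejection

# reject_when_reached(1) refuses new work as soon as there is any backlog,
# i.e. effectively no queueing (the old Ironic setup allowed roughly 100
# queued items on top of 100 worker threads).
executor = futurist.ThreadPoolExecutor(
    max_workers=4,
    check_and_reject=rejection.reject_when_reached(1))


def work():
    time.sleep(1)


accepted = rejected = 0
for _ in range(10):
    try:
        executor.submit(work)
        accepted += 1
    except futurist.RejectedSubmission:
        # Per the NOTE in the patch, Ironic turns this into an HTTP 503
        # for incoming requests or reschedules the periodic task.
        rejected += 1

print('accepted: %d, rejected: %d' % (accepted, rejected))
executor.shutdown()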