Do not start kuryr-daemon when worker_num <= 1

We've discovered that running kuryr-daemon with [cni_daemon]worker_num=1
breaks pyroute2.IPDB's ability to correctly close threads, leading to a
process leak. This commit makes sure kuryr-daemon will fail to start
when worker_num <= 1.

This required a few more changes to make sure that when any
kuryr-daemon subservice dies, kuryr-daemon will shut down too.

Change-Id: I41afc6fa67abfff62d2f0017db508051a1e7edf4
This commit is contained in:
Michał Dulko 2021-11-04 13:44:32 +01:00
parent 90d08658dc
commit 87981d0652
4 changed files with 30 additions and 11 deletions

View File

@ -195,6 +195,12 @@ class DaemonServer(object):
LOG.exception('Cannot start server on %s.', server_pair)
raise
if CONF.cni_daemon.worker_num <= 1:
msg = ('[cni_daemon]worker_num needs to be set to a value higher '
'than 1')
LOG.critical(msg)
raise exceptions.InvalidKuryrConfiguration(msg)
try:
self._server = serving.make_server(
address, port, self.application, threaded=False,
@ -387,6 +393,7 @@ class CNIDaemonServiceManager(cotyledon.ServiceManager):
# NOTE(mdulko): Default shutdown timeout is 60 seconds and K8s won't
# wait more by default anyway.
super(CNIDaemonServiceManager, self).__init__()
self._server_service = None
# TODO(dulek): Use cotyledon.oslo_config_glue to support conf reload.
# TODO(vikasc): Should be done using dynamically loadable OVO types
@ -403,11 +410,18 @@ class CNIDaemonServiceManager(cotyledon.ServiceManager):
healthy = multiprocessing.Value(c_bool, True)
metrics = self.manager.Queue()
self.add(CNIDaemonWatcherService, workers=1, args=(registry, healthy,))
self.add(CNIDaemonServerService, workers=1, args=(registry, healthy,
metrics,))
self._server_service = self.add(CNIDaemonServerService, workers=1,
args=(registry, healthy, metrics,))
self.add(CNIDaemonHealthServerService, workers=1, args=(healthy,))
self.add(CNIDaemonExporterService, workers=1, args=(metrics,))
self.register_hooks(on_terminate=self.terminate)
def shutdown_hook(service_id, worker_id, exit_code):
LOG.critical(f'Child Service {service_id} had exited with code '
f'{exit_code}, stopping kuryr-daemon')
self.shutdown()
self.register_hooks(on_terminate=self.terminate,
on_dead_worker=shutdown_hook)
def run(self):
# FIXME(darshna): Remove pyroute2 IPDB deprecation warning, remove
@ -440,12 +454,13 @@ class CNIDaemonServiceManager(cotyledon.ServiceManager):
def terminate(self):
self._terminate_called.set()
LOG.info("Gracefully stopping DaemonServer service..")
self.reconfigure(self._server_service, 0)
for worker in self._running_services[self._server_service]:
worker.terminate()
for worker in self._running_services[self._server_service]:
worker.join()
if self._server_service:
LOG.info("Gracefully stopping DaemonServer service..")
self.reconfigure(self._server_service, 0)
for worker in self._running_services[self._server_service]:
worker.terminate()
for worker in self._running_services[self._server_service]:
worker.join()
LOG.info("Stopping registry manager...")
self.manager.shutdown()
LOG.info("Continuing with shutdown")

View File

@ -24,6 +24,10 @@ class IntegrityError(RuntimeError):
pass
class InvalidKuryrConfiguration(RuntimeError):
pass
class ResourceNotReady(Exception):
def __init__(self, resource):
msg = resource

View File

@ -11,7 +11,7 @@ click==6.7
cliff==2.11.0
cmd2==0.8.2
contextlib2==0.5.5
cotyledon==1.5.0
cotyledon==1.7.3
coverage==4.0
ddt==1.0.1
debtcollector==1.19.0

View File

@ -2,7 +2,7 @@
# of appearance. Changing the order has an impact on the overall integration
# process, which may cause wedges in the gate later.
cotyledon>=1.5.0 # Apache-2.0
cotyledon>=1.7.3 # Apache-2.0
Flask!=0.11,>=0.12.3 # BSD
kuryr-lib>=0.5.0 # Apache-2.0
pbr!=2.1.0,>=2.0.0 # Apache-2.0