From b24837a73d41cf526a928beb281188b624e05f7a Mon Sep 17 00:00:00 2001
From: Kyle MacLeod
Date: Fri, 6 May 2022 13:51:19 -0400
Subject: [PATCH] Registration-based subprocess cleanup on service shutdown

Introduce a helper class SubprocessCleanup in dccommon which allows a
worker to register a subprocess that must be cleaned up (killed) upon
service exit.

There are two parts to this mechanism:

1. Registration:
   - The subprocess is registered for cleanup when spawned
     (see dccommon.utils.run_playbook)
   - The subprocess is also spawned using setsid in order to start a
     new process group + session

2. The service calls SubprocessCleanup.shutdown_cleanup() upon stopping.
   - All registered subprocesses are terminated via os.killpg(), which
     kills the entire subprocess process group.

Caveat: This mechanism only handles clean process exit cases. If the
process crashes or is killed non-gracefully via SIGKILL, the cleanup
will not happen.

Closes-Bug: 1972013

Test Plan:

PASS: Orchestrated prestaging:
* Perform system host-swact while prestaging packages in progress
  - ansible-playbook is terminated
  - prestaging task is marked as prestaging-failed
* Perform system host-swact while prestaging images in progress
  - ansible-playbook is terminated
  - prestaging task is marked as prestaging-failed
* Restart dcmanager-orchestrator service for the same two cases as above
  - behaviour is the same as for swact
* Kill dcmanager-orchestrator service while prestaging in progress

Non-Orchestrated prestaging:
* Perform host-swact and service restart for non-orchestrated prestaging
  - ansible-playbook is terminated
  - subcloud deploy status marked as prestaging-failed

Swact during large-scale subcloud add:
- initiate large number of subcloud add operations
- swact during 'installing' state
- swact during 'bootstrapping' state
- verify that ansible playbooks are killed
- verify that deploy status is updated with -failed state

Not covered:
Tested a sudo 'pkill -9 dcmanager-manager' (ungraceful SIGKILL)
- in this case the ansible subprocess tree is not cleaned up
- this is expected - we aren't handling a non-clean shutdown

Signed-off-by: Kyle MacLeod
Change-Id: I714398017b71c99edeeaa828933edd8163fb67cd
---
 distributedcloud/dccommon/subcloud_install.py |  2 +
 .../dccommon/subprocess_cleanup.py            | 73 +++++++++++++++++++
 .../dccommon/tests/unit/test_utils.py         |  4 +-
 distributedcloud/dccommon/utils.py            | 53 ++++++--------
 distributedcloud/dcmanager/common/prestage.py |  7 +-
 distributedcloud/dcmanager/manager/service.py |  8 +-
 .../dcmanager/orchestrator/orch_thread.py     |  3 +-
 .../dcmanager/orchestrator/service.py         |  6 +-
 8 files changed, 112 insertions(+), 44 deletions(-)
 create mode 100644 distributedcloud/dccommon/subprocess_cleanup.py

diff --git a/distributedcloud/dccommon/subcloud_install.py b/distributedcloud/dccommon/subcloud_install.py
index d05fb6906..85613f08f 100644
--- a/distributedcloud/dccommon/subcloud_install.py
+++ b/distributedcloud/dccommon/subcloud_install.py
@@ -448,6 +448,8 @@ class SubcloudInstall(object):
 
         log_file = os.path.join(log_file_dir, self.name) + '_playbook_output.log'
         try:
+            # Since this is a long-running task we want to register
+            # for cleanup on process restart/SWACT.
             run_playbook(log_file, install_command)
         except exceptions.PlaybookExecutionFailed:
             msg = ("Failed to install the subcloud %s, check individual "
diff --git a/distributedcloud/dccommon/subprocess_cleanup.py b/distributedcloud/dccommon/subprocess_cleanup.py
new file mode 100644
index 000000000..d0f5f9c80
--- /dev/null
+++ b/distributedcloud/dccommon/subprocess_cleanup.py
@@ -0,0 +1,73 @@
+#
+# Copyright (c) 2022 Wind River Systems, Inc.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+
+import os
+import signal
+import time
+
+from oslo_concurrency import lockutils
+from oslo_log import log as logging
+
+LOG = logging.getLogger(__name__)
+
+
+class SubprocessCleanup(object):
+    """Lifecycle manager for subprocesses spawned via python subprocess.
+
+    Notes:
+    - This is a best-effort cleanup. We need to preserve fast shutdown
+      times in case of a SWACT.
+    - There could potentially be hundreds of subprocesses needing
+      to be cleaned up here.
+    """
+
+    LOCK_NAME = 'subprocess-cleanup'
+    SUBPROCESS_GROUPS = {}
+
+    @staticmethod
+    def register_subprocess_group(subprocess_p):
+        SubprocessCleanup.SUBPROCESS_GROUPS[subprocess_p.pid] = subprocess_p
+
+    @staticmethod
+    def unregister_subprocess_group(subprocess_p):
+        SubprocessCleanup.SUBPROCESS_GROUPS.pop(subprocess_p.pid, None)
+
+    @staticmethod
+    @lockutils.synchronized(LOCK_NAME)
+    def shutdown_cleanup(origin='service'):
+        SubprocessCleanup._shutdown_subprocess_groups(origin)
+
+    @staticmethod
+    def _shutdown_subprocess_groups(origin):
+        num_process_groups = len(SubprocessCleanup.SUBPROCESS_GROUPS)
+        if num_process_groups > 0:
+            LOG.warn("Shutting down %d process groups via %s",
+                     num_process_groups, origin)
+            start_time = time.time()
+            for _, subp in SubprocessCleanup.SUBPROCESS_GROUPS.items():
+                kill_subprocess_group(subp)
+            LOG.info("Time for %s child processes to exit: %s",
+                     num_process_groups,
+                     time.time() - start_time)
+
+
+def kill_subprocess_group(subp, logmsg=None):
+    """Kill the subprocess and any children."""
+    exitcode = subp.poll()
+    if exitcode:
+        LOG.info("kill_subprocess_group: subprocess has already "
+                 "terminated, pid: %s, exitcode=%s", subp.pid, exitcode)
+        return
+
+    if logmsg:
+        LOG.warn(logmsg)
+    else:
+        LOG.warn("Killing subprocess group for pid: %s, args: %s",
+                 subp.pid, subp.args)
+    # Send a SIGTERM (normal kill). We do not verify if the processes
+    # are shut down (best-effort), since we don't want to wait around before
+    # issuing a SIGKILL (fast shutdown)
+    os.killpg(subp.pid, signal.SIGTERM)
diff --git a/distributedcloud/dccommon/tests/unit/test_utils.py b/distributedcloud/dccommon/tests/unit/test_utils.py
index b30656b59..78e6ebccc 100644
--- a/distributedcloud/dccommon/tests/unit/test_utils.py
+++ b/distributedcloud/dccommon/tests/unit/test_utils.py
@@ -25,7 +25,7 @@ class TestUtils(base.DCCommonTestCase):
     def test_run_playbook_timeout(self):
         testscript = ['dccommon/tests/unit/test_utils_script.sh', '30']
         self.assertRaises(PlaybookExecutionTimeout,
-                          utils.run_playbook_with_timeout,
+                          utils.run_playbook,
                           '/dev/null',
                           testscript,
                           timeout=2)
@@ -36,7 +36,7 @@ class TestUtils(base.DCCommonTestCase):
         # a hung process
         script = ['dccommon/tests/unit/test_utils_script.sh', '30', 'TERM']
         self.assertRaises(PlaybookExecutionTimeout,
-                          utils.run_playbook_with_timeout,
+                          utils.run_playbook,
                           '/dev/null',
                           script,
                           timeout=2)
diff --git a/distributedcloud/dccommon/utils.py b/distributedcloud/dccommon/utils.py
index 5e04b71eb..679c8afa4 100644
--- a/distributedcloud/dccommon/utils.py
+++ b/distributedcloud/dccommon/utils.py
@@ -23,6 +23,7 @@ from oslo_utils import timeutils
 
 from dccommon.exceptions import PlaybookExecutionFailed
 from dccommon.exceptions import PlaybookExecutionTimeout
+from dccommon.subprocess_cleanup import SubprocessCleanup
 
 LOG = logging.getLogger(__name__)
 ANSIBLE_PASSWD_PARMS = ['ansible_ssh_pass', 'ansible_become_pass']
@@ -59,40 +60,14 @@ def _strip_password_from_command(script_command):
     return logged_command
 
 
-def run_playbook(log_file, playbook_command):
-    """Run ansible playbook via subprocess"""
-    exec_env = os.environ.copy()
-    exec_env["ANSIBLE_LOG_PATH"] = "/dev/null"
-
-    with open(log_file, "a+") as f_out_log:
-        try:
-            logged_playbook_command = \
-                _strip_password_from_command(playbook_command)
-            txt = "%s Executing playbook command: %s\n" \
-                % (datetime.today().strftime('%Y-%m-%d-%H:%M:%S'),
-                   logged_playbook_command)
-            f_out_log.write(txt)
-            f_out_log.flush()
-
-            subprocess.check_call(  # pylint: disable=E1102
-                playbook_command,
-                stdout=f_out_log,
-                stderr=f_out_log,
-                env=exec_env)
-        except subprocess.CalledProcessError:
-            raise PlaybookExecutionFailed(playbook_cmd=playbook_command)
-        except Exception as ex:
-            LOG.error(str(ex))
-            raise
-
-
-def run_playbook_with_timeout(log_file,
-                              playbook_command,
-                              timeout=None):
+def run_playbook(log_file, playbook_command,
+                 timeout=None, register_cleanup=True):
     """Run ansible playbook via subprocess.
 
     log_file: Logs output to file
     timeout: Timeout in seconds. Raises PlaybookExecutionTimeout on timeout
+    register_cleanup: Register the subprocess group for cleanup on shutdown,
+                      if the underlying service supports cleanup.
     """
     exec_env = os.environ.copy()
     exec_env["ANSIBLE_LOG_PATH"] = "/dev/null"
@@ -100,7 +75,7 @@ def run_playbook_with_timeout(log_file,
     if timeout:
         # Invoke ansible-playbook via the 'timeout' command.
         # Using --kill-after=5s which will force a kill -9 if the process
-        # hasn't terminated within 10s:
+        # hasn't terminated within 5s:
         timeout_log_str = " (timeout: %ss)" % timeout
         playbook_command = ["/usr/bin/timeout", "--kill-after=5s",
                             "%ss" % timeout] + playbook_command
@@ -118,6 +93,14 @@ def run_playbook_with_timeout(log_file,
             f_out_log.write(txt)
             f_out_log.flush()
 
+            if register_cleanup:
+                # Use the same process group / session for all children
+                # This makes it easier to kill the entire process group
+                # on cleanup
+                preexec_fn = os.setsid
+            else:
+                preexec_fn = None
+
             # TODO(kmacleod) future considerations:
             # - In python3, this code can be simplified to use the new
             #   subprocess.run(timeout=val) method or Popen with
@@ -128,8 +111,12 @@ def run_playbook_with_timeout(log_file,
             subp = subprocess.Popen(playbook_command,
                                     stdout=f_out_log,
                                     stderr=f_out_log,
-                                    env=exec_env)
+                                    env=exec_env,
+                                    preexec_fn=preexec_fn)
             try:
+                if register_cleanup:
+                    SubprocessCleanup.register_subprocess_group(subp)
+
                 subp.communicate()  # wait for process to exit
 
                 if timeout and subp.returncode == TIMEOUT_EXITCODE:
@@ -143,6 +130,8 @@ def run_playbook_with_timeout(log_file,
                     raise PlaybookExecutionFailed(playbook_cmd=playbook_command)
             finally:
                 f_out_log.flush()
+                if register_cleanup:
+                    SubprocessCleanup.unregister_subprocess_group(subp)
 
         except PlaybookExecutionFailed:
             raise
diff --git a/distributedcloud/dcmanager/common/prestage.py b/distributedcloud/dcmanager/common/prestage.py
index 3b99ee3f4..741f19964 100644
--- a/distributedcloud/dcmanager/common/prestage.py
+++ b/distributedcloud/dcmanager/common/prestage.py
@@ -34,7 +34,7 @@ from dccommon.drivers.openstack.sdk_platform import OpenStackDriver
 from dccommon.drivers.openstack.sysinv_v1 import SysinvClient
 from dccommon.exceptions import PlaybookExecutionFailed
 from dccommon.exceptions import PlaybookExecutionTimeout
-from dccommon.utils import run_playbook_with_timeout
+from dccommon.utils import run_playbook
 
 from dcmanager.common import consts
 from dcmanager.common import exceptions
@@ -399,9 +399,8 @@ def _run_ansible(context, prestage_command, phase,
         oam_floating_ip,
         ansible_pass=base64.b64decode(sysadmin_password).decode('utf-8'))
     try:
-        run_playbook_with_timeout(log_file,
-                                  prestage_command,
-                                  timeout=timeout_seconds)
+        run_playbook(log_file, prestage_command,
+                     timeout=timeout_seconds, register_cleanup=True)
     except PlaybookExecutionFailed as ex:
         timeout_msg = ''
         if isinstance(ex, PlaybookExecutionTimeout):
diff --git a/distributedcloud/dcmanager/manager/service.py b/distributedcloud/dcmanager/manager/service.py
index cd92a79f5..703f6fddf 100644
--- a/distributedcloud/dcmanager/manager/service.py
+++ b/distributedcloud/dcmanager/manager/service.py
@@ -22,6 +22,7 @@ import oslo_messaging
 from oslo_service import service
 from oslo_utils import uuidutils
 
+from dccommon.subprocess_cleanup import SubprocessCleanup
 from dcmanager.audit import rpcapi as dcmanager_audit_rpc_client
 from dcmanager.common import consts
 from dcmanager.common import context
@@ -161,16 +162,17 @@ class DCManagerService(service.Service):
 
     def _stop_rpc_server(self):
         # Stop RPC connection to prevent new requests
-        LOG.debug(_("Attempting to stop engine service..."))
+        LOG.debug(_("Attempting to stop RPC service..."))
         try:
             self._rpc_server.stop()
             self._rpc_server.wait()
-            LOG.info('Engine service stopped successfully')
+            LOG.info('RPC service stopped successfully')
         except Exception as ex:
-            LOG.error('Failed to stop engine service: %s',
+            LOG.error('Failed to stop RPC service: %s',
                       six.text_type(ex))
 
     def stop(self):
+        SubprocessCleanup.shutdown_cleanup(origin="service")
         self._stop_rpc_server()
         # Terminate the engine process
         LOG.info("All threads were gone, terminating engine")
diff --git a/distributedcloud/dcmanager/orchestrator/orch_thread.py b/distributedcloud/dcmanager/orchestrator/orch_thread.py
index 480247147..f802dec88 100644
--- a/distributedcloud/dcmanager/orchestrator/orch_thread.py
+++ b/distributedcloud/dcmanager/orchestrator/orch_thread.py
@@ -92,10 +92,11 @@ class OrchThread(threading.Thread):
         self._stop.set()
 
     def run(self):
+        LOG.info("(%s) OrchThread Starting" % self.update_type)
         self.run_orch()
         # Stop any greenthreads that are still running
+        LOG.info("(%s) OrchThread Stopping" % self.update_type)
         self.thread_group_manager.stop()
-        LOG.info("(%s) OrchThread Stopped" % self.update_type)
 
     @staticmethod
     def get_ks_client(region_name=consts.DEFAULT_REGION_NAME):
diff --git a/distributedcloud/dcmanager/orchestrator/service.py b/distributedcloud/dcmanager/orchestrator/service.py
index 4c903f57f..abda5757c 100644
--- a/distributedcloud/dcmanager/orchestrator/service.py
+++ b/distributedcloud/dcmanager/orchestrator/service.py
@@ -21,6 +21,7 @@ from oslo_log import log as logging
 import oslo_messaging
 from oslo_service import service
 
+from dccommon.subprocess_cleanup import SubprocessCleanup
 from dcmanager.common import consts
 from dcmanager.common import context
 from dcmanager.common import exceptions
@@ -80,19 +81,20 @@ class DCManagerOrchestratorService(service.Service):
 
     def _stop_rpc_server(self):
         # Stop RPC connection to prevent new requests
-        LOG.debug("Attempting to stop engine service...")
+        LOG.debug("Attempting to stop RPC service...")
         if self._rpc_server is not None:
             try:
                 self._rpc_server.stop()
                 self._rpc_server.wait()
                 self._rpc_server = None
-                LOG.info('Engine service stopped successfully')
+                LOG.info('RPC service stopped successfully')
             except Exception as ex:
                 LOG.error('Failed to stop engine service: %s',
                           six.text_type(ex))
 
     def stop(self):
         """Stop anything initiated by start"""
+        SubprocessCleanup.shutdown_cleanup(origin="service")
        self._stop_rpc_server()
         if self.TG is not None:
             self.TG.stop()
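
The core pattern in this change is: spawn each ansible-playbook child in its
own session (preexec_fn=os.setsid) so it leads a new process group, then send
SIGTERM to that whole group with os.killpg() when the service stops. Below is
a minimal standalone sketch of that pattern using only the Python standard
library; the /bin/sh sleep command is a made-up stand-in for ansible-playbook
and is not part of this change.

import os
import signal
import subprocess
import time

# Spawn the child in its own session (and therefore its own process group),
# mirroring the preexec_fn=os.setsid used for ansible-playbook above.
subp = subprocess.Popen(["/bin/sh", "-c", "sleep 60 & sleep 60"],
                        preexec_fn=os.setsid)

time.sleep(1)  # stand-in for the service doing work before being stopped

# Best-effort cleanup on shutdown: SIGTERM the whole group. Because the
# child was started with setsid, its pid is also its process-group id.
if subp.poll() is None:
    os.killpg(subp.pid, signal.SIGTERM)

subp.wait()
print("child exit status:", subp.returncode)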