From 32fbf9eaaed6c52874a0e0c33c44ff869298f6c7 Mon Sep 17 00:00:00 2001
From: Tim Burke
Date: Tue, 12 Nov 2024 09:25:14 -0800
Subject: [PATCH] probe tests: Set default timeout for subprocesses

There's an upstream eventlet bug that seems to cause process hangs
during an atexit hook; unfortunately, that means that every time we call
"once" in probe tests, we can hang indefinitely waiting for a process
that won't terminate.

See https://github.com/eventlet/eventlet/issues/989

Now, wait with a timeout; if it pops, kill the offending process and
hope for the best. Do this by patching out subprocess.Popen.wait, but
only in probe tests -- this ensures that we won't impact any real
systems, while also ensuring a broad coverage of probe tests (as opposed
to, say, plumbing some new wait_timeout kwarg into all the Manager call
sites).

Closes-Bug: #2088027
Change-Id: I8983eafbb575d73d1654c354815a7de7ae141873
---
 test/probe/__init__.py | 25 +++++++++++++++++++++++++
 test/sample.conf       |  1 +
 2 files changed, 26 insertions(+)

diff --git a/test/probe/__init__.py b/test/probe/__init__.py
index fb8af91d09..e8e82ba9ee 100644
--- a/test/probe/__init__.py
+++ b/test/probe/__init__.py
@@ -17,15 +17,40 @@
 import eventlet
 eventlet.monkey_patch()
 
+import subprocess
 from test import get_config
 from swift.common.utils import config_true_value
 
 
 config = get_config('probe_test')
 CHECK_SERVER_TIMEOUT = int(config.get('check_server_timeout', 30))
+SUBPROCESS_WAIT_TIMEOUT = int(config.get('subprocess_wait_timeout',
+                                         CHECK_SERVER_TIMEOUT))
 VALIDATE_RSYNC = config_true_value(config.get('validate_rsync', False))
 PROXY_BASE_URL = config.get('proxy_base_url')
 if PROXY_BASE_URL is None:
     # TODO: find and load an "appropriate" proxy-server.conf(.d), piece
     # something together from bind_ip, bind_port, and cert_file
     PROXY_BASE_URL = 'http://127.0.0.1:8080'
+orig_popen_wait = subprocess.Popen.wait
+
+
+def wait_with_timeout(self, timeout=None, check_interval=0.01):
+    # We want to always have a timeout; no probe test should need to wait
+    # on even minute-long running processes.
+    timeout = SUBPROCESS_WAIT_TIMEOUT if timeout is None else timeout
+    try:
+        return orig_popen_wait(
+            self, timeout=timeout, check_interval=check_interval)
+    except subprocess.TimeoutExpired:
+        # Assume we tripped https://github.com/eventlet/eventlet/issues/989
+        # Kill the process (it should be mid-shutdown anyway) and log about it
+        print('WARNING: killing long running daemon after %ss: %r'
+              % (timeout, self.args))
+        self.kill()
+        # return 128 + 9 = 137 which is same as if using a command line like
+        # 'timeout -s KILL <duration> <command>'
+        return 137
+
+
+subprocess.Popen.wait = wait_with_timeout
diff --git a/test/sample.conf b/test/sample.conf
index b543988352..071b706b6c 100644
--- a/test/sample.conf
+++ b/test/sample.conf
@@ -104,6 +104,7 @@ fake_syslog = False
 
 [probe_test]
 # check_server_timeout = 30
+# subprocess_wait_timeout = 30
 # validate_rsync = false
 # proxy_base_url = http://localhost:8080
 