From 414b10fab073ba3a8a800b1635a5f812ed81d110 Mon Sep 17 00:00:00 2001 From: Sangeet Gupta <sg774j@att.com> Date: Fri, 7 Feb 2020 17:22:06 +0000 Subject: [PATCH] Fix health-probe concurrency and timings Changed the Nova and Neutron health-probe scripts to exit if the previous probe process is still running. The health-probe has an RPC call timeout of 60 seconds and 2 retries. In the worst-case scenario the probe process can run for a little over 180 seconds. Changing periodSeconds so that a probe starts only after the previous one is complete. Also changing the timeoutSeconds value to give the probe a little extra time to finish. Increasing the liveness probe periods, as they are not so critical, which will reduce the resource usage of the probes. Co-authored-by: Randeep Jalli <rj2083@att.com> Change-Id: Ife1c381d663c1e271a5099bdc6d0dfefb00d8d73 --- neutron/templates/bin/_health-probe.py.tpl | 30 +++++++++++++++++++ neutron/templates/daemonset-l2gw-agent.yaml | 8 +++--- neutron/values.yaml | 32 ++++++++++----------- nova/templates/bin/_health-probe.py.tpl | 31 ++++++++++++++++++++ nova/templates/daemonset-compute.yaml | 8 +++--- nova/templates/deployment-conductor.yaml | 8 +++--- nova/templates/deployment-consoleauth.yaml | 8 +++--- nova/templates/deployment-scheduler.yaml | 8 +++--- 8 files changed, 97 insertions(+), 36 deletions(-) diff --git a/neutron/templates/bin/_health-probe.py.tpl b/neutron/templates/bin/_health-probe.py.tpl index 284163afe6..0aa4a5647d 100644 --- a/neutron/templates/bin/_health-probe.py.tpl +++ b/neutron/templates/bin/_health-probe.py.tpl @@ -39,8 +39,10 @@ Usage example for Neutron metadata agent: import httplib2 from six.moves import http_client as httplib +import json import os import psutil +import signal import socket import sys @@ -292,8 +294,36 @@ def test_rpc_liveness(): check_agent_status(transport) +def check_pid_running(pid): + if psutil.pid_exists(int(pid)): + return True + else: + return False if __name__ == "__main__": + + if 
"liveness-probe" in ','.join(sys.argv): + pidfile = "/tmp/liveness.pid" #nosec + else: + pidfile = "/tmp/readiness.pid" #nosec + data = {} + if os.path.isfile(pidfile): + with open(pidfile,'r') as f: + data = json.load(f) + if check_pid_running(data['pid']): + if data['exit_count'] > 1: + # Third time in, kill the previous process + os.kill(int(data['pid']), signal.SIGTERM) + else: + data['exit_count'] = data['exit_count'] + 1 + with open(pidfile, 'w') as f: + json.dump(data, f) + sys.exit(0) + data['pid'] = os.getpid() + data['exit_count'] = 0 + with open(pidfile, 'w') as f: + json.dump(data, f) + if "sriov_agent.ini" in ','.join(sys.argv): sriov_readiness_check() elif "metadata_agent.ini" not in ','.join(sys.argv): diff --git a/neutron/templates/daemonset-l2gw-agent.yaml b/neutron/templates/daemonset-l2gw-agent.yaml index 50daf29a9e..3f673990cd 100644 --- a/neutron/templates/daemonset-l2gw-agent.yaml +++ b/neutron/templates/daemonset-l2gw-agent.yaml @@ -81,8 +81,8 @@ spec: - --use-fqdn {{- end }} initialDelaySeconds: 30 - periodSeconds: 15 - timeoutSeconds: 65 + periodSeconds: 190 + timeoutSeconds: 185 livenessProbe: exec: command: @@ -99,8 +99,8 @@ spec: - --use-fqdn {{- end }} initialDelaySeconds: 120 - periodSeconds: 90 - timeoutSeconds: 70 + periodSeconds: 600 + timeoutSeconds: 580 command: - /tmp/neutron-l2gw-agent.sh volumeMounts: diff --git a/neutron/values.yaml b/neutron/values.yaml index 916b4e8802..3694099bef 100644 --- a/neutron/values.yaml +++ b/neutron/values.yaml @@ -341,28 +341,28 @@ pod: enabled: true params: initialDelaySeconds: 30 - periodSeconds: 15 - timeoutSeconds: 65 + periodSeconds: 190 + timeoutSeconds: 185 liveness: enabled: true params: initialDelaySeconds: 120 - periodSeconds: 90 - timeoutSeconds: 70 + periodSeconds: 600 + timeoutSeconds: 580 l3_agent: l3_agent: readiness: enabled: true params: initialDelaySeconds: 30 - periodSeconds: 15 - timeoutSeconds: 65 + periodSeconds: 190 + timeoutSeconds: 185 liveness: enabled: true params: 
initialDelaySeconds: 120 - periodSeconds: 90 - timeoutSeconds: 70 + periodSeconds: 600 + timeoutSeconds: 580 lb_agent: lb_agent: readiness: @@ -373,14 +373,14 @@ pod: enabled: true params: initialDelaySeconds: 30 - periodSeconds: 15 - timeoutSeconds: 65 + periodSeconds: 190 + timeoutSeconds: 185 liveness: enabled: true params: initialDelaySeconds: 120 - periodSeconds: 90 - timeoutSeconds: 70 + periodSeconds: 600 + timeoutSeconds: 580 ovs_agent: ovs_agent: readiness: @@ -390,16 +390,16 @@ pod: enabled: true params: initialDelaySeconds: 120 - periodSeconds: 90 - timeoutSeconds: 70 + periodSeconds: 600 + timeoutSeconds: 580 sriov_agent: sriov_agent: readiness: enabled: true params: initialDelaySeconds: 30 - periodSeconds: 15 - timeoutSeconds: 65 + periodSeconds: 190 + timeoutSeconds: 185 server: server: readiness: diff --git a/nova/templates/bin/_health-probe.py.tpl b/nova/templates/bin/_health-probe.py.tpl index d78e70139d..d1127fb989 100644 --- a/nova/templates/bin/_health-probe.py.tpl +++ b/nova/templates/bin/_health-probe.py.tpl @@ -33,7 +33,10 @@ Usage example for Nova Compute: """ +import json +import os import psutil +import signal import socket import sys @@ -218,8 +221,36 @@ def test_rpc_liveness(): check_service_status(transport) +def check_pid_running(pid): + if psutil.pid_exists(int(pid)): + return True + else: + return False if __name__ == "__main__": + + if "liveness-probe" in ','.join(sys.argv): + pidfile = "/tmp/liveness.pid" #nosec + else: + pidfile = "/tmp/readiness.pid" #nosec + data = {} + if os.path.isfile(pidfile): + with open(pidfile,'r') as f: + data = json.load(f) + if check_pid_running(data['pid']): + if data['exit_count'] > 1: + # Third time in, kill the previous process + os.kill(int(data['pid']), signal.SIGTERM) + else: + data['exit_count'] = data['exit_count'] + 1 + with open(pidfile, 'w') as f: + json.dump(data, f) + sys.exit(0) + data['pid'] = os.getpid() + data['exit_count'] = 0 + with open(pidfile, 'w') as f: + json.dump(data, f) + 
test_rpc_liveness() sys.exit(0) # return success diff --git a/nova/templates/daemonset-compute.yaml b/nova/templates/daemonset-compute.yaml index 43e53d72d7..fd1f37f431 100644 --- a/nova/templates/daemonset-compute.yaml +++ b/nova/templates/daemonset-compute.yaml @@ -203,8 +203,8 @@ spec: - --use-fqdn {{- end }} initialDelaySeconds: 80 - periodSeconds: 90 - timeoutSeconds: 70 + periodSeconds: 190 + timeoutSeconds: 185 livenessProbe: exec: command: @@ -219,8 +219,8 @@ spec: - --use-fqdn {{- end }} initialDelaySeconds: 120 - periodSeconds: 90 - timeoutSeconds: 70 + periodSeconds: 600 + timeoutSeconds: 580 command: - /tmp/nova-compute.sh volumeMounts: diff --git a/nova/templates/deployment-conductor.yaml b/nova/templates/deployment-conductor.yaml index f927afa6ae..ad511646a1 100644 --- a/nova/templates/deployment-conductor.yaml +++ b/nova/templates/deployment-conductor.yaml @@ -69,8 +69,8 @@ spec: - --service-queue-name - conductor initialDelaySeconds: 80 - periodSeconds: 90 - timeoutSeconds: 70 + periodSeconds: 190 + timeoutSeconds: 185 livenessProbe: exec: command: @@ -82,8 +82,8 @@ spec: - conductor - --liveness-probe initialDelaySeconds: 120 - periodSeconds: 90 - timeoutSeconds: 70 + periodSeconds: 600 + timeoutSeconds: 580 command: - /tmp/nova-conductor.sh volumeMounts: diff --git a/nova/templates/deployment-consoleauth.yaml b/nova/templates/deployment-consoleauth.yaml index b9cb71732e..ddeea3381b 100644 --- a/nova/templates/deployment-consoleauth.yaml +++ b/nova/templates/deployment-consoleauth.yaml @@ -69,8 +69,8 @@ spec: - --service-queue-name - consoleauth initialDelaySeconds: 80 - periodSeconds: 90 - timeoutSeconds: 70 + periodSeconds: 190 + timeoutSeconds: 185 livenessProbe: exec: command: @@ -82,8 +82,8 @@ spec: - consoleauth - --liveness-probe initialDelaySeconds: 120 - periodSeconds: 90 - timeoutSeconds: 70 + periodSeconds: 600 + timeoutSeconds: 580 command: - /tmp/nova-consoleauth.sh volumeMounts: diff --git a/nova/templates/deployment-scheduler.yaml 
b/nova/templates/deployment-scheduler.yaml index cb9e9df35c..f2d5055e72 100644 --- a/nova/templates/deployment-scheduler.yaml +++ b/nova/templates/deployment-scheduler.yaml @@ -69,8 +69,8 @@ spec: - --service-queue-name - scheduler initialDelaySeconds: 80 - periodSeconds: 90 - timeoutSeconds: 70 + periodSeconds: 190 + timeoutSeconds: 185 livenessProbe: exec: command: @@ -82,8 +82,8 @@ spec: - scheduler - --liveness-probe initialDelaySeconds: 120 - periodSeconds: 90 - timeoutSeconds: 70 + periodSeconds: 600 + timeoutSeconds: 580 command: - /tmp/nova-scheduler.sh volumeMounts: