From 414b10fab073ba3a8a800b1635a5f812ed81d110 Mon Sep 17 00:00:00 2001
From: Sangeet Gupta <sg774j@att.com>
Date: Fri, 7 Feb 2020 17:22:06 +0000
Subject: [PATCH] Fix health-probe concurrency and timings

Changed the Nova and Neutron health-probe scripts to exit if the
previous probe process is still running.
The health-probe has an RPC call timeout of 60 seconds and makes 2
retries. In the worst-case scenario the probe process can run a little
over 180 seconds. Changing periodSeconds so that a probe starts only
after the previous one is complete. Also increasing timeoutSeconds
to give the probe a little extra time to finish.
Increasing the liveness probe periods, as they are not so critical,
which will reduce the resource usage for the probes.

Co-authored-by: Randeep Jalli <rj2083@att.com>

Change-Id: Ife1c381d663c1e271a5099bdc6d0dfefb00d8d73
---
 neutron/templates/bin/_health-probe.py.tpl  | 30 +++++++++++++++++++
 neutron/templates/daemonset-l2gw-agent.yaml |  8 +++---
 neutron/values.yaml                         | 32 ++++++++++-----------
 nova/templates/bin/_health-probe.py.tpl     | 31 ++++++++++++++++++++
 nova/templates/daemonset-compute.yaml       |  8 +++---
 nova/templates/deployment-conductor.yaml    |  8 +++---
 nova/templates/deployment-consoleauth.yaml  |  8 +++---
 nova/templates/deployment-scheduler.yaml    |  8 +++---
 8 files changed, 97 insertions(+), 36 deletions(-)

diff --git a/neutron/templates/bin/_health-probe.py.tpl b/neutron/templates/bin/_health-probe.py.tpl
index 284163afe6..0aa4a5647d 100644
--- a/neutron/templates/bin/_health-probe.py.tpl
+++ b/neutron/templates/bin/_health-probe.py.tpl
@@ -39,8 +39,10 @@ Usage example for Neutron metadata agent:
 
 import httplib2
 from six.moves import http_client as httplib
+import json
 import os
 import psutil
+import signal
 import socket
 import sys
 
@@ -292,8 +294,49 @@ def test_rpc_liveness():
 
     check_agent_status(transport)
 
+
+def check_pid_running(pid):
+    """Return True if a process with the given pid currently exists."""
+    return psutil.pid_exists(int(pid))
+
 
 if __name__ == "__main__":
+
+    # Liveness and readiness probes are tracked separately: each kind keeps
+    # its own pidfile, a JSON object with "pid" and "exit_count" keys.
+    if "liveness-probe" in ','.join(sys.argv):
+        pidfile = "/tmp/liveness.pid"  #nosec
+    else:
+        pidfile = "/tmp/readiness.pid"  #nosec
+    try:
+        with open(pidfile, 'r') as f:
+            data = json.load(f)
+    except (IOError, ValueError):
+        # No pidfile yet, or it is empty/truncated and cannot be parsed;
+        # either way there is no usable record, so start from scratch.
+        data = {}
+    prev_pid = data.get('pid')
+    if prev_pid is not None and check_pid_running(prev_pid):
+        if data.get('exit_count', 0) > 1:
+            # Third time in, kill the previous process
+            try:
+                os.kill(int(prev_pid), signal.SIGTERM)
+            except OSError:
+                # Best effort: it may have exited since the check above.
+                pass
+        else:
+            # Let the previous probe finish: record the skipped run and
+            # exit 0 (success) instead of starting a concurrent probe.
+            data['exit_count'] = data.get('exit_count', 0) + 1
+            with open(pidfile, 'w') as f:
+                json.dump(data, f)
+            sys.exit(0)
+    # Record this process as the probe that is now running.
+    data['pid'] = os.getpid()
+    data['exit_count'] = 0
+    with open(pidfile, 'w') as f:
+        json.dump(data, f)
+
     if "sriov_agent.ini" in ','.join(sys.argv):
         sriov_readiness_check()
     elif "metadata_agent.ini" not in ','.join(sys.argv):
diff --git a/neutron/templates/daemonset-l2gw-agent.yaml b/neutron/templates/daemonset-l2gw-agent.yaml
index 50daf29a9e..3f673990cd 100644
--- a/neutron/templates/daemonset-l2gw-agent.yaml
+++ b/neutron/templates/daemonset-l2gw-agent.yaml
@@ -81,8 +81,8 @@ spec:
                 - --use-fqdn
 {{- end }}
             initialDelaySeconds: 30
-            periodSeconds: 15
-            timeoutSeconds: 65
+            periodSeconds: 190
+            timeoutSeconds: 185
           livenessProbe:
             exec:
               command:
@@ -99,8 +99,8 @@ spec:
                 - --use-fqdn
 {{- end }}
             initialDelaySeconds: 120
-            periodSeconds: 90
-            timeoutSeconds: 70
+            periodSeconds: 600
+            timeoutSeconds: 580
           command:
             - /tmp/neutron-l2gw-agent.sh
           volumeMounts:
diff --git a/neutron/values.yaml b/neutron/values.yaml
index 916b4e8802..3694099bef 100644
--- a/neutron/values.yaml
+++ b/neutron/values.yaml
@@ -341,28 +341,28 @@ pod:
           enabled: true
           params:
             initialDelaySeconds: 30
-            periodSeconds: 15
-            timeoutSeconds: 65
+            periodSeconds: 190
+            timeoutSeconds: 185
         liveness:
           enabled: true
           params:
             initialDelaySeconds: 120
-            periodSeconds: 90
-            timeoutSeconds: 70
+            periodSeconds: 600
+            timeoutSeconds: 580
     l3_agent:
       l3_agent:
         readiness:
           enabled: true
           params:
             initialDelaySeconds: 30
-            periodSeconds: 15
-            timeoutSeconds: 65
+            periodSeconds: 190
+            timeoutSeconds: 185
         liveness:
           enabled: true
           params:
             initialDelaySeconds: 120
-            periodSeconds: 90
-            timeoutSeconds: 70
+            periodSeconds: 600
+            timeoutSeconds: 580
     lb_agent:
       lb_agent:
         readiness:
@@ -373,14 +373,14 @@ pod:
           enabled: true
           params:
             initialDelaySeconds: 30
-            periodSeconds: 15
-            timeoutSeconds: 65
+            periodSeconds: 190
+            timeoutSeconds: 185
         liveness:
           enabled: true
           params:
             initialDelaySeconds: 120
-            periodSeconds: 90
-            timeoutSeconds: 70
+            periodSeconds: 600
+            timeoutSeconds: 580
     ovs_agent:
       ovs_agent:
         readiness:
@@ -390,16 +390,16 @@ pod:
           enabled: true
           params:
             initialDelaySeconds: 120
-            periodSeconds: 90
-            timeoutSeconds: 70
+            periodSeconds: 600
+            timeoutSeconds: 580
     sriov_agent:
       sriov_agent:
         readiness:
           enabled: true
           params:
             initialDelaySeconds: 30
-            periodSeconds: 15
-            timeoutSeconds: 65
+            periodSeconds: 190
+            timeoutSeconds: 185
     server:
       server:
         readiness:
diff --git a/nova/templates/bin/_health-probe.py.tpl b/nova/templates/bin/_health-probe.py.tpl
index d78e70139d..d1127fb989 100644
--- a/nova/templates/bin/_health-probe.py.tpl
+++ b/nova/templates/bin/_health-probe.py.tpl
@@ -33,7 +33,10 @@ Usage example for Nova Compute:
 
 """
 
+import json
+import os
 import psutil
+import signal
 import socket
 import sys
 
@@ -218,8 +221,49 @@ def test_rpc_liveness():
 
     check_service_status(transport)
 
+
+def check_pid_running(pid):
+    """Return True if a process with the given pid currently exists."""
+    return psutil.pid_exists(int(pid))
+
 
 if __name__ == "__main__":
+
+    # Liveness and readiness probes are tracked separately: each kind keeps
+    # its own pidfile, a JSON object with "pid" and "exit_count" keys.
+    if "liveness-probe" in ','.join(sys.argv):
+        pidfile = "/tmp/liveness.pid"  #nosec
+    else:
+        pidfile = "/tmp/readiness.pid"  #nosec
+    try:
+        with open(pidfile, 'r') as f:
+            data = json.load(f)
+    except (IOError, ValueError):
+        # No pidfile yet, or it is empty/truncated and cannot be parsed;
+        # either way there is no usable record, so start from scratch.
+        data = {}
+    prev_pid = data.get('pid')
+    if prev_pid is not None and check_pid_running(prev_pid):
+        if data.get('exit_count', 0) > 1:
+            # Third time in, kill the previous process
+            try:
+                os.kill(int(prev_pid), signal.SIGTERM)
+            except OSError:
+                # Best effort: it may have exited since the check above.
+                pass
+        else:
+            # Let the previous probe finish: record the skipped run and
+            # exit 0 (success) instead of starting a concurrent probe.
+            data['exit_count'] = data.get('exit_count', 0) + 1
+            with open(pidfile, 'w') as f:
+                json.dump(data, f)
+            sys.exit(0)
+    # Record this process as the probe that is now running.
+    data['pid'] = os.getpid()
+    data['exit_count'] = 0
+    with open(pidfile, 'w') as f:
+        json.dump(data, f)
+
     test_rpc_liveness()
 
     sys.exit(0)  # return success
diff --git a/nova/templates/daemonset-compute.yaml b/nova/templates/daemonset-compute.yaml
index 43e53d72d7..fd1f37f431 100644
--- a/nova/templates/daemonset-compute.yaml
+++ b/nova/templates/daemonset-compute.yaml
@@ -203,8 +203,8 @@ spec:
                 - --use-fqdn
                 {{- end }}
             initialDelaySeconds: 80
-            periodSeconds: 90
-            timeoutSeconds: 70
+            periodSeconds: 190
+            timeoutSeconds: 185
           livenessProbe:
             exec:
               command:
@@ -219,8 +219,8 @@ spec:
                 - --use-fqdn
                 {{- end }}
             initialDelaySeconds: 120
-            periodSeconds: 90
-            timeoutSeconds: 70
+            periodSeconds: 600
+            timeoutSeconds: 580
           command:
             - /tmp/nova-compute.sh
           volumeMounts:
diff --git a/nova/templates/deployment-conductor.yaml b/nova/templates/deployment-conductor.yaml
index f927afa6ae..ad511646a1 100644
--- a/nova/templates/deployment-conductor.yaml
+++ b/nova/templates/deployment-conductor.yaml
@@ -69,8 +69,8 @@ spec:
                 - --service-queue-name
                 - conductor
             initialDelaySeconds: 80
-            periodSeconds: 90
-            timeoutSeconds: 70
+            periodSeconds: 190
+            timeoutSeconds: 185
           livenessProbe:
             exec:
               command:
@@ -82,8 +82,8 @@ spec:
                 - conductor
                 - --liveness-probe
             initialDelaySeconds: 120
-            periodSeconds: 90
-            timeoutSeconds: 70
+            periodSeconds: 600
+            timeoutSeconds: 580
           command:
             - /tmp/nova-conductor.sh
           volumeMounts:
diff --git a/nova/templates/deployment-consoleauth.yaml b/nova/templates/deployment-consoleauth.yaml
index b9cb71732e..ddeea3381b 100644
--- a/nova/templates/deployment-consoleauth.yaml
+++ b/nova/templates/deployment-consoleauth.yaml
@@ -69,8 +69,8 @@ spec:
                 - --service-queue-name
                 - consoleauth
             initialDelaySeconds: 80
-            periodSeconds: 90
-            timeoutSeconds: 70
+            periodSeconds: 190
+            timeoutSeconds: 185
           livenessProbe:
             exec:
               command:
@@ -82,8 +82,8 @@ spec:
                 - consoleauth
                 - --liveness-probe
             initialDelaySeconds: 120
-            periodSeconds: 90
-            timeoutSeconds: 70
+            periodSeconds: 600
+            timeoutSeconds: 580
           command:
             - /tmp/nova-consoleauth.sh
           volumeMounts:
diff --git a/nova/templates/deployment-scheduler.yaml b/nova/templates/deployment-scheduler.yaml
index cb9e9df35c..f2d5055e72 100644
--- a/nova/templates/deployment-scheduler.yaml
+++ b/nova/templates/deployment-scheduler.yaml
@@ -69,8 +69,8 @@ spec:
                 - --service-queue-name
                 - scheduler
             initialDelaySeconds: 80
-            periodSeconds: 90
-            timeoutSeconds: 70
+            periodSeconds: 190
+            timeoutSeconds: 185
           livenessProbe:
             exec:
               command:
@@ -82,8 +82,8 @@ spec:
                 - scheduler
                 - --liveness-probe
             initialDelaySeconds: 120
-            periodSeconds: 90
-            timeoutSeconds: 70
+            periodSeconds: 600
+            timeoutSeconds: 580
           command:
             - /tmp/nova-scheduler.sh
           volumeMounts: