
Health probe for Nova components

The health probe for Nova pods is used for both the liveness
and readiness probes.

nova-compute, nova-conductor, nova-consoleauth and nova-scheduler:
Check that the RPC sockets from the Nova pods to RabbitMQ and the
database are in the ESTABLISHED state, then send an RPC call with a
non-existent method to the component's queue. The probe succeeds if
the agent replies with a NoSuchMethod error. If the agent is not
reachable or fails to respond in time, the probe returns failure.

novnc/spice proxy: uses the Kubernetes TCP probe on the corresponding
ports they expose.

Also adds code to catch the exception raised when the Nova config
file is not present.
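
For context, a minimal sketch (not part of this change) of what the
kubelet's exec probe effectively runs inside the container; the exit
status alone decides health, and the path and flags mirror the chart
templates below:

    # Sketch only: exit code 0 marks the pod healthy, anything else
    # fails the probe.
    import subprocess
    import sys

    rc = subprocess.call([
        "python", "/tmp/health-probe.py",
        "--config-file", "/etc/nova/nova.conf",
        "--service-queue-name", "compute",
        # append "--liveness-probe" for the liveness variant, which
        # tolerates RabbitMQ/database socket failures
    ])
    sys.exit(rc)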

Change-Id: Ib8e4b93486588320fd2d562c3bc90b65844e52e5
Hemachandra Reddy · 1 month ago · commit 49b58b7e7d

nova/templates/bin/_health-probe.py.tpl (+208, -0)

@@ -0,0 +1,208 @@
+#!/usr/bin/env python2
+
+# Copyright 2019 The Openstack-Helm Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Health probe script for OpenStack services that use RPC/unix domain sockets
+for communication. Checks the RPC TCP socket status on the process and sends
+a message to the service through the RPC call method, expecting a reply. The
+expected reply is a failure from the service's RPC server, since the method
+does not exist.
+
+The script returns failure to Kubernetes only when
+  a. the TCP sockets for the RPC communication are not established,
+  b. the service is not reachable, or
+  c. the service times out sending a reply.
+
+sys.stderr.write() writes to the pod's events on failures.
+
+Usage example for Nova Compute:
+# python health-probe.py --config-file /etc/nova/nova.conf \
+#  --service-queue-name compute
+
+"""
+
+import socket
+import sys
+
+import psutil
+
+from oslo_config import cfg
+from oslo_context import context
+from oslo_log import log
+import oslo_messaging
+
+
+tcp_established = "ESTABLISHED"
+
+
+def check_service_status(transport):
+    """Verify service status. Return success if service consumes message"""
+    try:
+        target = oslo_messaging.Target(topic=cfg.CONF.service_queue_name,
+                                       server=socket.gethostname())
+        client = oslo_messaging.RPCClient(transport, target,
+                                          timeout=60,
+                                          retry=2)
+        client.call(context.RequestContext(),
+                    'pod_health_probe_method_ignore_errors')
+    except oslo_messaging.exceptions.MessageDeliveryFailure:
+        # Log to pod events
+        sys.stderr.write("Health probe unable to reach message bus")
+        sys.exit(0)  # return success
+    except oslo_messaging.rpc.client.RemoteError as re:
+        if ("Endpoint does not support RPC method" in re.message) or \
+                ("Endpoint does not support RPC version" in re.message):
+            sys.exit(0)  # Call reached the service
+        else:
+            sys.stderr.write("Health probe unable to reach service")
+            sys.exit(1)  # return failure
+    except oslo_messaging.exceptions.MessagingTimeout:
+        sys.stderr.write("Health probe timed out. Agent is down or response "
+                         "timed out")
+        sys.exit(1)  # return failure
+    except Exception as ex:
+        sys.stderr.write("Health probe caught exception sending message to "
+                         "service: %s" % ex.message)
+        sys.exit(0)
+    except:
+        sys.stderr.write("Health probe caught exception sending message to"
+                         " service")
+        sys.exit(0)
+
+
+def tcp_socket_status(process, port):
+    """Check the tcp socket status on a process"""
+    sock_count = 0
+    parentId = 0
+    for pr in psutil.pids():
+        try:
+            p = psutil.Process(pr)
+            if p.name() == process:
+                # Track the first matching process as the parent and
+                # skip its forked workers
+                if parentId == 0:
+                    parentId = p.pid
+                else:
+                    if p.ppid() == parentId:
+                        continue
+                pcon = p.connections()
+                for con in pcon:
+                    try:
+                        rport = con.raddr[1]
+                        status = con.status
+                    except IndexError:
+                        # Sockets without a remote address have no raddr
+                        continue
+                    if rport == port and status == tcp_established:
+                        sock_count = sock_count + 1
+        except psutil.NoSuchProcess:
+            continue
+
+    if sock_count == 0:
+        return 0
+    else:
+        return 1
+
+
+def configured_port_in_conf():
+    """Get the RabbitMQ/database port configured in the config file"""
+    rabbitmq_port = 0
+    database_port = 0
+    try:
+        # sys.argv[2] is the config file path passed with --config-file
+        with open(sys.argv[2]) as conf_file:
+            for line in conf_file:
+                if "transport_url" in line:
+                    rabbitmq_port = int(line.split(':', 3)[3].split('/')[0])
+                elif "connection =" in line:
+                    service = line.split(':', 3)[3].split('/')[1].rstrip('\n')
+                    if service == "nova":
+                        database_port = int(
+                            line.split(':', 3)[3].split('/')[0])
+            return rabbitmq_port, database_port
+    except IOError:
+        sys.stderr.write("Nova config file not present")
+        sys.exit(1)
+
+
+def test_tcp_socket(service):
+    """Check the tcp socket to RabbitMQ/db is in the ESTABLISHED state"""
+    dict_services = {
+        "compute": "nova-compute",
+        "conductor": "nova-conductor",
+        # Process names read from /proc may be truncated to 15 characters
+        "consoleauth": "nova-consoleaut",
+        "scheduler": "nova-scheduler"
+    }
+    r_port, d_port = configured_port_in_conf()
+
+    if service in dict_services:
+        proc = dict_services[service]
+        if r_port != 0 and tcp_socket_status(proc, r_port) == 0:
+            sys.stderr.write("RabbitMQ socket not established")
+            # Do not kill the pod if RabbitMQ is not reachable/down
+            if not cfg.CONF.liveness_probe:
+                sys.exit(1)
+
+        # Check the database socket for all services except nova-compute
+        if service != "compute":
+            if d_port != 0 and tcp_socket_status(proc, d_port) == 0:
+                sys.stderr.write("Database socket not established")
+                # Do not kill the pod if the database is not reachable/down.
+                # There could also be no socket at all, since connections
+                # typically get closed after an idle timeout.
+                # Just log it to pod events.
+                if not cfg.CONF.liveness_probe:
+                    sys.exit(1)
+
+
+def test_rpc_liveness():
+    """Test if the service can consume a message from its queue"""
+    oslo_messaging.set_transport_defaults(control_exchange='nova')
+
+    rabbit_group = cfg.OptGroup(name='oslo_messaging_rabbit',
+                                title='RabbitMQ options')
+    cfg.CONF.register_group(rabbit_group)
+    cfg.CONF.register_cli_opt(cfg.StrOpt('service-queue-name'))
+    cfg.CONF.register_cli_opt(cfg.BoolOpt('liveness-probe', default=False,
+                                          required=False))
+
+    cfg.CONF(sys.argv[1:])
+
+    log.logging.basicConfig(level=log.ERROR)
+
+    try:
+        transport = oslo_messaging.get_transport(cfg.CONF)
+    except Exception as ex:
+        sys.stderr.write("Message bus driver load error: %s" % ex.message)
+        sys.exit(0)  # return success
+
+    if not cfg.CONF.transport_url or \
+            not cfg.CONF.service_queue_name:
+        sys.stderr.write("Both the message bus URL and the service's queue "
+                         "name are required for the health probe to work")
+        sys.exit(0)  # return success
+
+    try:
+        cfg.CONF.set_override('rabbit_max_retries', 2,
+                              group=rabbit_group)  # 3 attempts
+    except cfg.NoSuchOptError:
+        cfg.CONF.register_opt(cfg.IntOpt('rabbit_max_retries', default=2),
+                              group=rabbit_group)
+
+    service = cfg.CONF.service_queue_name
+    test_tcp_socket(service)
+
+    check_service_status(transport)
+
+
+if __name__ == "__main__":
+    test_rpc_liveness()
+
+    sys.exit(0)  # return success
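
The transport_url/connection parsing above is purely positional; a
worked example with hypothetical nova.conf values (URLs and
credentials made up) shows how the ports fall out:

    # Sketch only: mirrors the split logic in configured_port_in_conf()
    transport = ("transport_url = rabbit://nova:pw"
                 "@rabbitmq.openstack.svc.cluster.local:5672/nova")
    print(int(transport.split(':', 3)[3].split('/')[0]))  # 5672

    conn = ("connection = mysql+pymysql://nova:pw"
            "@mariadb.openstack.svc.cluster.local:3306/nova")
    print(conn.split(':', 3)[3].split('/')[1])       # 'nova' (service)
    print(int(conn.split(':', 3)[3].split('/')[0]))  # 3306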

nova/templates/configmap-bin.yaml (+2, -0)

@@ -51,6 +51,8 @@ data:
   ceph-admin-keyring.sh: |
 {{ tuple "bin/_ceph-admin-keyring.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
 {{- end }}
+  health-probe.py: |
+{{ tuple "bin/_health-probe.py.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
   nova-api.sh: |
 {{ tuple "bin/_nova-api.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
   nova-api-metadata.sh: |

nova/templates/daemonset-compute.yaml (+29, -0)

@@ -180,6 +180,31 @@ spec:
             - name: LIBVIRT_CEPH_SECRET_UUID
               value: "{{ .Values.conf.ceph.secret_uuid }}"
           {{ end }}
+          readinessProbe:
+            exec:
+              command:
+              - python
+              - /tmp/health-probe.py
+              - --config-file
+              - /etc/nova/nova.conf
+              - --service-queue-name
+              - compute
+            initialDelaySeconds: 80
+            periodSeconds: 90
+            timeoutSeconds: 70
+          livenessProbe:
+            exec:
+              command:
+              - python
+              - /tmp/health-probe.py
+              - --config-file
+              - /etc/nova/nova.conf
+              - --service-queue-name
+              - compute
+              - --liveness-probe
+            initialDelaySeconds: 120
+            periodSeconds: 90
+            timeoutSeconds: 70
           command:
             - /tmp/nova-compute.sh
           volumeMounts:
@@ -187,6 +212,10 @@ spec:
               mountPath: /tmp/nova-compute.sh
               subPath: nova-compute.sh
               readOnly: true
+            - name: nova-bin
+              mountPath: /tmp/health-probe.py
+              subPath: health-probe.py
+              readOnly: true
             - name: nova-etc
               mountPath: /etc/nova/nova.conf
               subPath: nova.conf

nova/templates/deployment-conductor.yaml (+29, -0)

@@ -60,6 +60,31 @@ spec:
 {{ tuple $envAll $envAll.Values.pod.resources.conductor | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
           securityContext:
             allowPrivilegeEscalation: false
+          readinessProbe:
+            exec:
+              command:
+              - python
+              - /tmp/health-probe.py
+              - --config-file
+              - /etc/nova/nova.conf
+              - --service-queue-name
+              - conductor
+            initialDelaySeconds: 80
+            periodSeconds: 90
+            timeoutSeconds: 70
+          livenessProbe:
+            exec:
+              command:
+              - python
+              - /tmp/health-probe.py
+              - --config-file
+              - /etc/nova/nova.conf
+              - --service-queue-name
+              - conductor
+              - --liveness-probe
+            initialDelaySeconds: 120
+            periodSeconds: 90
+            timeoutSeconds: 70
           command:
             - /tmp/nova-conductor.sh
           volumeMounts:
@@ -67,6 +92,10 @@ spec:
               mountPath: /tmp/nova-conductor.sh
               subPath: nova-conductor.sh
               readOnly: true
+            - name: nova-bin
+              mountPath: /tmp/health-probe.py
+              subPath: health-probe.py
+              readOnly: true
             - name: nova-etc
               mountPath: /etc/nova/nova.conf
               subPath: nova.conf

nova/templates/deployment-consoleauth.yaml (+29, -0)

@@ -60,6 +60,31 @@ spec:
 {{ tuple $envAll $envAll.Values.pod.resources.consoleauth | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
           securityContext:
             allowPrivilegeEscalation: false
+          readinessProbe:
+            exec:
+              command:
+              - python
+              - /tmp/health-probe.py
+              - --config-file
+              - /etc/nova/nova.conf
+              - --service-queue-name
+              - consoleauth
+            initialDelaySeconds: 80
+            periodSeconds: 90
+            timeoutSeconds: 70
+          livenessProbe:
+            exec:
+              command:
+              - python
+              - /tmp/health-probe.py
+              - --config-file
+              - /etc/nova/nova.conf
+              - --service-queue-name
+              - consoleauth
+              - --liveness-probe
+            initialDelaySeconds: 120
+            periodSeconds: 90
+            timeoutSeconds: 70
           command:
             - /tmp/nova-consoleauth.sh
           volumeMounts:
@@ -67,6 +92,10 @@ spec:
               mountPath: /tmp/nova-consoleauth.sh
               subPath: nova-consoleauth.sh
               readOnly: true
+            - name: nova-bin
+              mountPath: /tmp/health-probe.py
+              subPath: health-probe.py
+              readOnly: true
             - name: nova-etc
               mountPath: /etc/nova/nova.conf
               subPath: nova.conf

nova/templates/deployment-novncproxy.yaml (+8, -0)

@@ -94,6 +94,14 @@ spec:
         - name: nova-novncproxy
 {{ tuple $envAll "nova_novncproxy" | include "helm-toolkit.snippets.image" | indent 10 }}
 {{ tuple $envAll $envAll.Values.pod.resources.novncproxy | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
+          readinessProbe:
+            tcpSocket:
+              port: {{ tuple "compute_novnc_proxy" "internal" "novnc_proxy" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
+            initialDelaySeconds: 30
+          livenessProbe:
+            tcpSocket:
+              port: {{ tuple "compute_novnc_proxy" "internal" "novnc_proxy" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
+            initialDelaySeconds: 30
           command:
             - /tmp/nova-console-proxy.sh
           ports:

nova/templates/deployment-scheduler.yaml (+29, -0)

@@ -60,6 +60,31 @@ spec:
 {{ tuple $envAll $envAll.Values.pod.resources.scheduler | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
           securityContext:
             allowPrivilegeEscalation: false
+          readinessProbe:
+            exec:
+              command:
+              - python
+              - /tmp/health-probe.py
+              - --config-file
+              - /etc/nova/nova.conf
+              - --service-queue-name
+              - scheduler
+            initialDelaySeconds: 80
+            periodSeconds: 90
+            timeoutSeconds: 70
+          livenessProbe:
+            exec:
+              command:
+              - python
+              - /tmp/health-probe.py
+              - --config-file
+              - /etc/nova/nova.conf
+              - --service-queue-name
+              - scheduler
+              - --liveness-probe
+            initialDelaySeconds: 120
+            periodSeconds: 90
+            timeoutSeconds: 70
           command:
             - /tmp/nova-scheduler.sh
           volumeMounts:
@@ -67,6 +92,10 @@ spec:
               mountPath: /tmp/nova-scheduler.sh
               subPath: nova-scheduler.sh
               readOnly: true
+            - name: nova-bin
+              mountPath: /tmp/health-probe.py
+              subPath: health-probe.py
+              readOnly: true
             - name: nova-etc
               mountPath: /etc/nova/nova.conf
               subPath: nova.conf

nova/templates/deployment-spiceproxy.yaml (+8, -0)

@@ -94,6 +94,14 @@ spec:
         - name: nova-spiceproxy
 {{ tuple $envAll "nova_spiceproxy" | include "helm-toolkit.snippets.image" | indent 10 }}
 {{ tuple $envAll $envAll.Values.pod.resources.spiceproxy | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
+          readinessProbe:
+            tcpSocket:
+              port: {{ tuple "compute_spice_proxy" "internal" "spice_proxy" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
+            initialDelaySeconds: 30
+          livenessProbe:
+            tcpSocket:
+              port: {{ tuple "compute_spice_proxy" "internal" "spice_proxy" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
+            initialDelaySeconds: 30
           command:
             - /tmp/nova-console-proxy.sh
           ports:
