Browse Source

Merge "Liveness/Readiness probe for Neutron server and its agents"

Zuul 1 month ago
parent
commit
c598004355

+ 266
- 0
neutron/templates/bin/_health-probe.py.tpl View File

@@ -0,0 +1,266 @@
1
+#!/usr/bin/env python2
2
+
3
+# Copyright 2019 The Openstack-Helm Authors.
4
+#
5
+# Licensed under the Apache License, Version 2.0 (the "License");
6
+# you may not use this file except in compliance with the License.
7
+# You may obtain a copy of the License at
8
+#
9
+#     http://www.apache.org/licenses/LICENSE-2.0
10
+#
11
+# Unless required by applicable law or agreed to in writing, software
12
+# distributed under the License is distributed on an "AS IS" BASIS,
13
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+# See the License for the specific language governing permissions and
15
+# limitations under the License.
16
+
17
+"""
18
+Health probe script for OpenStack agents that use RPC or a unix domain socket
19
+for communication. Sends a message to the agent through an RPC call and expects
20
+a reply. The agent's RPC server is expected to answer with a failure, because
21
+the called method does not exist.
22
+
23
+Script returns failure to Kubernetes only when
24
+  a. agent is not reachable or
25
+  b. agent times out sending a reply.
26
+
27
+sys.stderr.write() writes to pod's events on failures.
28
+
29
+Usage example for Neutron L3 agent:
30
+# python health-probe.py --config-file /etc/neutron/neutron.conf \
31
+#  --config-file /etc/neutron/l3_agent.ini --agent-queue-name l3_agent
32
+
33
+Usage example for Neutron metadata agent:
34
+# python health-probe.py --config-file /etc/neutron/neutron.conf \
35
+#  --config-file /etc/neutron/metadata_agent.ini
36
+"""
37
+
38
+import httplib2
39
+from six.moves import http_client as httplib
40
+import os
41
+import psutil
42
+import socket
43
+import sys
44
+
45
+from oslo_config import cfg
46
+from oslo_context import context
47
+from oslo_log import log
48
+import oslo_messaging
49
+
50
# Remote port that tcp_socket_state_check() looks for on agent connections.
rabbit_port = 5672
# Connection status string that tcp_socket_state_check() counts as healthy.
tcp_established = "ESTABLISHED"
# Module-level side effect: set up logging through basicConfig at ERROR level.
log.logging.basicConfig(level=log.ERROR)
53
+
54
+
55
def check_agent_status(transport):
    """Verify agent status by sending it an RPC call over the message bus.

    A call to a method name the agent does not implement is sent to the
    queue named by cfg.CONF.agent_queue_name on this host. Every exception
    path terminates the process through sys.exit():

    * exit 0 when the message bus itself cannot be reached, when the agent
      answers with "Endpoint does not support RPC method/version" (the
      call reached the agent), or on any other unexpected exception;
    * exit 1 when the agent answers with a different remote error or the
      call times out.

    If the call returns without raising, the function simply returns.

    :param transport: oslo.messaging transport used to build the RPC client.
    """
    try:
        target = oslo_messaging.Target(topic=cfg.CONF.agent_queue_name,
                                       server=socket.gethostname())
        client = oslo_messaging.RPCClient(transport, target,
                                          timeout=60,
                                          retry=2)
        client.call(context.RequestContext(),
                    'pod_health_probe_method_ignore_errors')
    except oslo_messaging.exceptions.MessageDeliveryFailure:
        # Log to pod events
        sys.stderr.write("Health probe unable to reach message bus")
        sys.exit(0)  # return success
    except oslo_messaging.rpc.client.RemoteError as remote_err:
        # Exceptions have no ``message`` attribute on Python 3; fall back
        # to the string form so the probe does not die with AttributeError.
        message = getattr(remote_err, "message", None) or str(remote_err)
        if ("Endpoint does not support RPC method" in message) or \
                ("Endpoint does not support RPC version" in message):
            sys.exit(0)  # Call reached the agent
        else:
            sys.stderr.write("Health probe unable to reach agent")
            sys.exit(1)  # return failure
    except oslo_messaging.exceptions.MessagingTimeout:
        sys.stderr.write("Health probe timed out. Agent is down or response "
                         "timed out")
        sys.exit(1)  # return failure
    except Exception as ex:
        detail = getattr(ex, "message", None) or str(ex)
        sys.stderr.write("Health probe caught exception sending message to "
                         "agent: %s" % detail)
        sys.exit(0)
    except BaseException:
        # Deliberate best-effort: even non-Exception errors raised by the
        # call are reported as success so the pod is not restarted.
        sys.stderr.write("Health probe caught exception sending message to"
                         " agent")
        sys.exit(0)
88
+
89
+
90
def sriov_readiness_check(
        config_file='/etc/neutron/plugins/ml2/sriov_agent.ini',
        sysfs_net_dir='/sys/class/net'):
    """Check the SR-IOV configuration of the NICs listed in the agent config.

    Every non-comment line of ``config_file`` that mentions
    ``physical_device_mappings`` is read as a comma separated list of
    ``<physnet>:<device>`` entries. For each device the file
    ``<sysfs_net_dir>/<device>/device/sriov_numvfs`` is read; the check
    passes as soon as one such file has a non-empty line.

    The function never returns: it always ends in sys.exit(), with status 0
    when at least one device passed and 1 otherwise (including a missing
    config file or an empty device list).

    :param config_file: path of the SR-IOV agent ini file.
    :param sysfs_net_dir: directory holding one sub-directory per network
        device.
    """
    return_status = 1
    try:
        with open(config_file) as conf:
            config_lines = [line.strip() for line in conf]
    except IOError:
        sys.stderr.write("IOError:No SRIOV agent config file")
        sys.exit(1)

    for line in config_lines:
        # Commented-out sample mappings must not be parsed as real ones.
        if line.startswith('#') or "physical_device_mappings" not in line:
            continue
        # partition() tolerates a line without '=' (value is then empty).
        value = line.partition('=')[2]
        # str.split(',') never returns an empty list, so blank entries are
        # filtered explicitly to make the "nothing configured" check work.
        mappings = [entry.strip() for entry in value.split(',')
                    if entry.strip()]
        if not mappings:
            sys.stderr.write("No Physical devices"
                             " configured as SRIOV NICs")
            sys.exit(1)
        for entry in mappings:
            _physnet, sep, dev = entry.partition(':')
            dev = dev.strip()
            if not sep or not dev:
                # Report and skip instead of crashing on a bad entry.
                sys.stderr.write("Malformed physical_device_mappings "
                                 "entry: %s" % entry)
                continue
            try:
                with open('%s/%s/device/sriov_numvfs'
                          % (sysfs_net_dir, dev)) as f:
                    for numvfs_line in f:
                        # NOTE(review): any non-empty content counts,
                        # including "0" - confirm whether zero VFs should
                        # be treated as not ready.
                        if numvfs_line.rstrip('\n'):
                            return_status = 0
            except IOError:
                sys.stderr.write("IOError:No sriov_numvfs config file")
    sys.exit(return_status)
114
+
115
+
116
def tcp_socket_state_check(agentq):
    """Check that the agent process has an established socket to RabbitMQ.

    The agent process name is chosen from ``agentq`` (any unknown queue
    name falls back to the metadata agent). All processes are scanned; the
    first process with that name is remembered as the parent, and later
    matches whose parent pid equals it are skipped. Connections whose
    remote port equals ``rabbit_port`` and whose status equals
    ``tcp_established`` are counted.

    When no such connection is found a message is written to stderr and,
    unless cfg.CONF.liveness_probe is set, the process exits with status 1.

    :param agentq: agent queue name used to select the process name.
    """
    # NOTE(review): names look like 15-character truncated process names;
    # confirm they match what psutil reports in the target image.
    proc_names = {
        "l3_agent": "neutron-l3-agen",
        "dhcp_agent": "neutron-dhcp-ag",
        "q-agent-notifier-tunnel-update": "neutron-openvsw",
    }
    proc = proc_names.get(agentq, "neutron-metadat")

    rabbit_sock_count = 0
    parent_id = 0

    for pid in psutil.pids():
        try:
            p = psutil.Process(pid)
            if p.name() != proc:
                continue
            if parent_id == 0:
                parent_id = p.pid
            elif p.ppid() == parent_id:
                # Child of the first matching process: not inspected.
                continue
            for con in p.connections():
                try:
                    port = con.raddr[1]
                    status = con.status
                except IndexError:
                    # No remote address on this connection.
                    continue
                if port == rabbit_port and status == tcp_established:
                    rabbit_sock_count += 1
        except psutil.NoSuchProcess:
            # Process went away while scanning.
            continue
        except psutil.AccessDenied:
            # An unreadable process must not abort the whole probe with a
            # traceback; report it and keep scanning.
            sys.stderr.write("Health probe denied access to process %s"
                             % pid)
            continue

    if rabbit_sock_count == 0:
        sys.stderr.write("RabbitMQ sockets not Established")
        # Do not kill the pod if RabbitMQ is not reachable/down
        if not cfg.CONF.liveness_probe:
            sys.exit(1)
155
+
156
+
157
class UnixDomainHTTPConnection(httplib.HTTPConnection):
    """HTTP connection that talks over a UNIX domain socket.

    The socket path is taken from cfg.CONF.metadata_proxy_socket when the
    connection object is created; host, port and strict are only forwarded
    to the base class.
    """

    def __init__(self, host, port=None, strict=None, timeout=None,
                 proxy_info=None):
        # proxy_info is accepted for signature compatibility and not used.
        httplib.HTTPConnection.__init__(self, host, port, strict)
        self.socket_path = cfg.CONF.metadata_proxy_socket
        self.timeout = timeout

    def connect(self):
        """Create the stream socket and connect it to the socket path."""
        unix_sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        self.sock = unix_sock
        if self.timeout:
            unix_sock.settimeout(self.timeout)
        unix_sock.connect(self.socket_path)
171
+
172
+
173
def test_socket_liveness():
    """Test if the metadata agent answers an HTTP request over its socket.

    Parses the command line, runs the RabbitMQ socket check for the
    metadata agent, then sends a GET request through
    UnixDomainHTTPConnection. The process exits with status 1 on a socket
    error or when the response status is 500 or above, and with status 0
    on any other exception. Otherwise the function returns normally.
    """
    cfg.CONF.register_cli_opt(cfg.BoolOpt('liveness-probe', default=False,
                                          required=False))
    cfg.CONF(sys.argv[1:])

    agentq = "metadata_agent"
    tcp_socket_state_check(agentq)

    try:
        # Attribute access only probes whether the option is registered.
        cfg.CONF.metadata_proxy_socket
    except cfg.NoSuchOptError:
        cfg.CONF.register_opt(cfg.StrOpt(
            'metadata_proxy_socket',
            default='/var/lib/neutron/openstack-helm/metadata_proxy'))

    headers = {'X-Forwarded-For': '169.254.169.254',
               'X-Neutron-Router-ID': 'pod-health-probe-check-ignore-errors'}

    h = httplib2.Http(timeout=30)

    try:
        resp, content = h.request(
            'http://169.254.169.254',
            method='GET',
            headers=headers,
            connection_type=UnixDomainHTTPConnection)
    except socket.error as se:
        msg = "Socket error: Health probe failed to connect to " \
              "Neutron Metadata agent: "
        # ``message`` does not exist on Python 3 exceptions; always fall
        # back to str() so a reason is written in every case.
        detail = se.strerror or getattr(se, "message", None) or str(se)
        sys.stderr.write(msg + detail)
        sys.exit(1)  # return failure
    except Exception as ex:
        detail = getattr(ex, "message", None) or str(ex)
        sys.stderr.write("Health probe caught exception sending message to "
                         "Neutron Metadata agent: %s" % detail)
        sys.exit(0)  # return success

    if resp.status >= 500:  # Probe expects HTTP error code 404
        msg = "Health probe failed: Neutron Metadata agent failed to" \
              " process request: "
        sys.stderr.write(msg + str(resp.__dict__))
        sys.exit(1)  # return failure
218
+
219
+
220
def test_rpc_liveness():
    """Test if the agent can consume a message from its RPC queue.

    Registers the probe's command line options, parses sys.argv, builds the
    messaging transport and then runs tcp_socket_state_check() followed by
    check_agent_status(). The process exits with status 0 (success) when
    the transport driver cannot be loaded or when the transport URL or the
    agent queue name is missing, so a misconfigured probe does not take
    the pod down.
    """
    oslo_messaging.set_transport_defaults(control_exchange='neutron')

    rabbit_group = cfg.OptGroup(name='oslo_messaging_rabbit',
                                title='RabbitMQ options')
    cfg.CONF.register_group(rabbit_group)
    cfg.CONF.register_cli_opt(cfg.StrOpt('agent-queue-name'))
    cfg.CONF.register_cli_opt(cfg.BoolOpt('liveness-probe', default=False,
                                          required=False))

    cfg.CONF(sys.argv[1:])

    try:
        transport = oslo_messaging.get_transport(cfg.CONF)
    except Exception as ex:
        # Exceptions have no ``message`` attribute on Python 3.
        detail = getattr(ex, "message", None) or str(ex)
        sys.stderr.write("Message bus driver load error: %s" % detail)
        sys.exit(0)  # return success

    if not cfg.CONF.transport_url or \
            not cfg.CONF.agent_queue_name:
        sys.stderr.write("Both message bus URL and agent queue name are "
                         "required for Health probe to work")
        sys.exit(0)  # return success

    try:
        cfg.CONF.set_override('rabbit_max_retries', 2,
                              group=rabbit_group)  # 3 attempts
    except cfg.NoSuchOptError:
        # Option not registered by the loaded driver: register it with the
        # same value instead.
        cfg.CONF.register_opt(cfg.IntOpt('rabbit_max_retries', default=2),
                              group=rabbit_group)

    agentq = cfg.CONF.agent_queue_name
    tcp_socket_state_check(agentq)

    check_agent_status(transport)
256
+
257
+
258
if __name__ == "__main__":
    # The probe type is selected from the config file names that appear on
    # the command line.
    cli_args = ','.join(sys.argv)
    if "sriov_agent.ini" in cli_args:
        sriov_readiness_check()
    elif "metadata_agent.ini" in cli_args:
        test_socket_liveness()
    else:
        test_rpc_liveness()

    sys.exit(0)  # return success

+ 2
- 0
neutron/templates/configmap-bin.yaml View File

@@ -45,6 +45,8 @@ data:
45 45
 {{- include "helm-toolkit.scripts.keystone_endpoints" . | indent 4 }}
46 46
   ks-user.sh: |
47 47
 {{- include "helm-toolkit.scripts.keystone_user" . | indent 4 }}
48
+  health-probe.py: |
49
+{{ tuple "bin/_health-probe.py.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
48 50
   neutron-dhcp-agent.sh: |
49 51
 {{ tuple "bin/_neutron-dhcp-agent.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
50 52
   neutron-l3-agent.sh: |

+ 33
- 0
neutron/templates/daemonset-dhcp-agent.yaml View File

@@ -66,6 +66,35 @@ spec:
66 66
 {{ tuple $envAll $envAll.Values.pod.resources.agent.dhcp | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
67 67
           securityContext:
68 68
             privileged: true
69
+          readinessProbe:
70
+            exec:
71
+              command:
72
+              - python
73
+              - /tmp/health-probe.py
74
+              - --config-file
75
+              - /etc/neutron/neutron.conf
76
+              - --config-file
77
+              - /etc/neutron/dhcp_agent.ini
78
+              - --agent-queue-name
79
+              - dhcp_agent
80
+            initialDelaySeconds: 30
81
+            periodSeconds: 15
82
+            timeoutSeconds: 65
83
+          livenessProbe:
84
+            exec:
85
+              command:
86
+              - python
87
+              - /tmp/health-probe.py
88
+              - --config-file
89
+              - /etc/neutron/neutron.conf
90
+              - --config-file
91
+              - /etc/neutron/dhcp_agent.ini
92
+              - --agent-queue-name
93
+              - dhcp_agent
94
+              - --liveness-probe
95
+            initialDelaySeconds: 120
96
+            periodSeconds: 90
97
+            timeoutSeconds: 70
69 98
           command:
70 99
             - /tmp/neutron-dhcp-agent.sh
71 100
           volumeMounts:
@@ -73,6 +102,10 @@ spec:
73 102
               mountPath: /tmp/neutron-dhcp-agent.sh
74 103
               subPath: neutron-dhcp-agent.sh
75 104
               readOnly: true
105
+            - name: neutron-bin
106
+              mountPath: /tmp/health-probe.py
107
+              subPath: health-probe.py
108
+              readOnly: true
76 109
             - name: neutron-etc
77 110
               mountPath: /etc/neutron/neutron.conf
78 111
               subPath: neutron.conf

+ 33
- 0
neutron/templates/daemonset-l3-agent.yaml View File

@@ -66,6 +66,35 @@ spec:
66 66
 {{ tuple $envAll $envAll.Values.pod.resources.agent.l3 | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
67 67
           securityContext:
68 68
             privileged: true
69
+          readinessProbe:
70
+            exec:
71
+              command:
72
+              - python
73
+              - /tmp/health-probe.py
74
+              - --config-file
75
+              - /etc/neutron/neutron.conf
76
+              - --config-file
77
+              - /etc/neutron/l3_agent.ini
78
+              - --agent-queue-name
79
+              - l3_agent
80
+            initialDelaySeconds: 30
81
+            periodSeconds: 15
82
+            timeoutSeconds: 65
83
+          livenessProbe:
84
+            exec:
85
+              command:
86
+              - python
87
+              - /tmp/health-probe.py
88
+              - --config-file
89
+              - /etc/neutron/neutron.conf
90
+              - --config-file
91
+              - /etc/neutron/l3_agent.ini
92
+              - --agent-queue-name
93
+              - l3_agent
94
+              - --liveness-probe
95
+            initialDelaySeconds: 120
96
+            periodSeconds: 90
97
+            timeoutSeconds: 70
69 98
           command:
70 99
             - /tmp/neutron-l3-agent.sh
71 100
           volumeMounts:
@@ -73,6 +102,10 @@ spec:
73 102
               mountPath: /tmp/neutron-l3-agent.sh
74 103
               subPath: neutron-l3-agent.sh
75 104
               readOnly: true
105
+            - name: neutron-bin
106
+              mountPath: /tmp/health-probe.py
107
+              subPath: health-probe.py
108
+              readOnly: true
76 109
             - name: neutron-etc
77 110
               mountPath: /etc/neutron/neutron.conf
78 111
               subPath: neutron.conf

+ 29
- 0
neutron/templates/daemonset-metadata-agent.yaml View File

@@ -87,6 +87,31 @@ spec:
87 87
 {{ tuple $envAll $envAll.Values.pod.resources.agent.metadata | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
88 88
           securityContext:
89 89
             privileged: true
90
+          readinessProbe:
91
+            exec:
92
+              command:
93
+              - python
94
+              - /tmp/health-probe.py
95
+              - --config-file
96
+              - /etc/neutron/neutron.conf
97
+              - --config-file
98
+              - /etc/neutron/metadata_agent.ini
99
+            initialDelaySeconds: 30
100
+            periodSeconds: 15
101
+            timeoutSeconds: 35
102
+          livenessProbe:
103
+            exec:
104
+              command:
105
+              - python
106
+              - /tmp/health-probe.py
107
+              - --config-file
108
+              - /etc/neutron/neutron.conf
109
+              - --config-file
110
+              - /etc/neutron/metadata_agent.ini
111
+              - --liveness-probe
112
+            initialDelaySeconds: 90
113
+            periodSeconds: 60
114
+            timeoutSeconds: 45
90 115
           command:
91 116
             - /tmp/neutron-metadata-agent.sh
92 117
           volumeMounts:
@@ -94,6 +119,10 @@ spec:
94 119
               mountPath: /tmp/neutron-metadata-agent.sh
95 120
               subPath: neutron-metadata-agent.sh
96 121
               readOnly: true
122
+            - name: neutron-bin
123
+              mountPath: /tmp/health-probe.py
124
+              subPath: health-probe.py
125
+              readOnly: true
97 126
             - name: neutron-etc
98 127
               mountPath: /etc/neutron/neutron.conf
99 128
               subPath: neutron.conf

+ 19
- 0
neutron/templates/daemonset-ovs-agent.yaml View File

@@ -162,11 +162,30 @@ spec:
162 162
                 - bash
163 163
                 - -c
164 164
                 - 'ovs-vsctl list-br | grep -q br-int'
165
+          livenessProbe:
166
+            exec:
167
+              command:
168
+              - python
169
+              - /tmp/health-probe.py
170
+              - --config-file
171
+              - /etc/neutron/neutron.conf
172
+              - --config-file
173
+              - /etc/neutron/plugins/ml2/openvswitch_agent.ini
174
+              - --agent-queue-name
175
+              - q-agent-notifier-tunnel-update
176
+              - --liveness-probe
177
+            initialDelaySeconds: 120
178
+            periodSeconds: 90
179
+            timeoutSeconds: 70
165 180
           volumeMounts:
166 181
             - name: neutron-bin
167 182
               mountPath: /tmp/neutron-openvswitch-agent.sh
168 183
               subPath: neutron-openvswitch-agent.sh
169 184
               readOnly: true
185
+            - name: neutron-bin
186
+              mountPath: /tmp/health-probe.py
187
+              subPath: health-probe.py
188
+              readOnly: true
170 189
             - name: pod-shared
171 190
               mountPath: /tmp/pod-shared
172 191
             - name: neutron-etc

+ 16
- 0
neutron/templates/daemonset-sriov-agent.yaml View File

@@ -129,11 +129,27 @@ spec:
129 129
             privileged: true
130 130
           command:
131 131
             - /tmp/neutron-sriov-agent.sh
132
+          readinessProbe:
133
+            exec:
134
+              command:
135
+              - python
136
+              - /tmp/health-probe.py
137
+              - --config-file
138
+              - /etc/neutron/neutron.conf
139
+              - --config-file
140
+              - /etc/neutron/sriov_agent.ini
141
+            initialDelaySeconds: 30
142
+            periodSeconds: 15
143
+            timeoutSeconds: 10
132 144
           volumeMounts:
133 145
             - name: neutron-bin
134 146
               mountPath: /tmp/neutron-sriov-agent.sh
135 147
               subPath: neutron-sriov-agent.sh
136 148
               readOnly: true
149
+            - name: neutron-bin
150
+              mountPath: /tmp/health-probe.py
151
+              subPath: health-probe.py
152
+              readOnly: true
137 153
             - name: pod-shared
138 154
               mountPath: /tmp/pod-shared
139 155
             - name: neutron-etc

+ 4
- 0
neutron/templates/deployment-server.yaml View File

@@ -81,6 +81,10 @@ spec:
81 81
           readinessProbe:
82 82
             tcpSocket:
83 83
               port: {{ tuple "network" "internal" "api" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
84
+          livenessProbe:
85
+            tcpSocket:
86
+              port: {{ tuple "network" "internal" "api" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
87
+            initialDelaySeconds: 60
84 88
           volumeMounts:
85 89
             - name: neutron-bin
86 90
               mountPath: /tmp/neutron-server.sh

Loading…
Cancel
Save