Merge "Health probe for Nova components"
This commit is contained in:
commit
00fff1d274
208
nova/templates/bin/_health-probe.py.tpl
Normal file
208
nova/templates/bin/_health-probe.py.tpl
Normal file
@ -0,0 +1,208 @@
|
||||
#!/usr/bin/env python2
|
||||
|
||||
# Copyright 2019 The Openstack-Helm Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
Health probe script for OpenStack service that uses RPC/unix domain socket for
|
||||
communication. Checks the RPC TCP socket status on the process and sends a
|
||||
message to service through rpc call method and expects a reply. It is expected
|
||||
to receive failure from the service's RPC server as the method does not exist.
|
||||
|
||||
Script returns failure to Kubernetes only when
|
||||
a. TCP socket for the RPC communication is not established,
|
||||
b. service is not reachable or
|
||||
c. service times out sending a reply.
|
||||
|
||||
sys.stderr.write() writes to pod's events on failures.
|
||||
|
||||
Usage example for Nova Compute:
|
||||
# python health-probe.py --config-file /etc/nova/nova.conf \
|
||||
# --service-queue-name compute
|
||||
|
||||
"""
|
||||
|
||||
import psutil
|
||||
import socket
|
||||
import sys
|
||||
|
||||
from oslo_config import cfg
|
||||
from oslo_context import context
|
||||
from oslo_log import log
|
||||
import oslo_messaging
|
||||
|
||||
|
||||
# Connection status string that tcp_socket_status() compares against the
# ``status`` field of each psutil connection entry.
tcp_established = "ESTABLISHED"
|
||||
|
||||
|
||||
def check_service_status(transport):
    """Verify service status. Return success if service consumes message.

    Sends an RPC ``call`` for a method name that the service does not
    implement to the queue named by ``--service-queue-name`` on this host.
    A "method/version not supported" error coming back proves the service
    consumed the message, which is what the probe wants to know.

    :param transport: oslo.messaging transport used to build the RPC client.
    :returns: None when the call completes without raising. Every handled
        error path terminates the process instead:

        * message bus unreachable (MessageDeliveryFailure): exit 0
        * RemoteError "Endpoint does not support RPC method/version": exit 0
        * any other RemoteError: exit 1
        * MessagingTimeout: exit 1
        * anything else: exit 0
    """
    try:
        target = oslo_messaging.Target(topic=cfg.CONF.service_queue_name,
                                       server=socket.gethostname())
        client = oslo_messaging.RPCClient(transport, target,
                                          timeout=60,
                                          retry=2)
        client.call(context.RequestContext(),
                    'pod_health_probe_method_ignore_errors')
    except oslo_messaging.exceptions.MessageDeliveryFailure:
        # Log to pod events
        sys.stderr.write("Health probe unable to reach message bus")
        sys.exit(0)  # return success
    except oslo_messaging.rpc.client.RemoteError as remote_err:
        # Exceptions have no ``message`` attribute on Python 3; fall back to
        # str() so this handler cannot itself fail with AttributeError.
        message = getattr(remote_err, "message", str(remote_err))
        if ("Endpoint does not support RPC method" in message) or \
                ("Endpoint does not support RPC version" in message):
            sys.exit(0)  # Call reached the service
        else:
            sys.stderr.write("Health probe unable to reach service")
            sys.exit(1)  # return failure
    except oslo_messaging.exceptions.MessagingTimeout:
        sys.stderr.write("Health probe timed out. Agent is down or response "
                         "timed out")
        sys.exit(1)  # return failure
    except Exception as ex:
        message = getattr(ex, "message", str(ex))
        sys.stderr.write("Health probe caught exception sending message to "
                         "service: %s" % message)
        sys.exit(0)
    except:  # noqa: E722 - deliberate catch-all, the probe must not crash
        sys.stderr.write("Health probe caught exception sending message to"
                         " service")
        sys.exit(0)
|
||||
|
||||
|
||||
def tcp_socket_status(process, port):
    """Check the tcp socket status on a process.

    Walks every PID known to psutil and inspects the connections of
    processes whose name equals ``process``. The first matching process is
    remembered as the parent; later matching processes that are its direct
    children are skipped.

    :param process: process name to match against ``psutil.Process.name()``.
    :param port: remote TCP port to look for.
    :returns: 1 if a matching process has a connection to remote ``port``
        in the ESTABLISHED state, otherwise 0.
    """
    parent_pid = 0
    for pid in psutil.pids():
        try:
            proc = psutil.Process(pid)
            if proc.name() != process:
                continue
            if parent_pid == 0:
                parent_pid = proc.pid
            elif proc.ppid() == parent_pid:
                continue
            for con in proc.connections():
                # Sockets without a remote end have an empty raddr.
                if len(con.raddr) < 2:
                    continue
                if con.raddr[1] == port and con.status == tcp_established:
                    # One established socket is enough; stop scanning.
                    return 1
        except psutil.Error:
            # The process exited or cannot be inspected (NoSuchProcess,
            # AccessDenied, ...). Skip it rather than crash the probe.
            continue

    return 0
|
||||
|
||||
|
||||
def _parse_port_and_db(line):
    """Extract ``(port, database)`` from a URL-valued option line.

    Recognises ``name = scheme://user:password@host:port/database``. For a
    multi-host URL the first host's port is used. Returns None when the line
    does not have that shape.
    """
    fields = line.split(':', 3)
    if len(fields) < 4:
        return None
    segments = fields[3].split('/')
    try:
        # Multi-host URLs continue with ",user:pass@host:port" after the port.
        port = int(segments[0].split(',')[0])
    except ValueError:
        return None
    database = segments[1].strip() if len(segments) > 1 else ''
    return port, database


def configured_port_in_conf():
    """Get the rabbitmq/Database port configured in config file.

    Reads the file named by ``sys.argv[2]`` (the value following
    ``--config-file`` in the probe's command line). The RabbitMQ port is
    taken from lines containing ``transport_url``. The database port is
    taken from lines containing ``connection =`` whose database name is
    ``nova``. Comment lines and lines that cannot be parsed are ignored, and
    the last usable occurrence wins.

    :returns: ``(rabbitmq_port, database_port)``; a port is 0 when it was
        not found.
    :raises SystemExit: with code 1 when the config file cannot be opened.
    """
    rabbitmq_port = 0
    database_port = 0
    try:
        with open(sys.argv[2]) as conf_file:
            for line in conf_file:
                # A commented-out option must not override the real one.
                if line.lstrip().startswith(('#', ';')):
                    continue
                if "transport_url" in line:
                    parsed = _parse_port_and_db(line)
                    if parsed is not None:
                        rabbitmq_port = parsed[0]
                elif "connection =" in line:
                    parsed = _parse_port_and_db(line)
                    if parsed is not None and parsed[1] == "nova":
                        database_port = parsed[0]
    except IOError:
        sys.stderr.write("Nova Config file not present")
        sys.exit(1)
    return rabbitmq_port, database_port
|
||||
|
||||
|
||||
def test_tcp_socket(service):
    """Check that the service's RabbitMQ/database sockets are established.

    The ports come from the config file. A port of 0 means "not configured"
    and skips that check. When a socket is missing a message is written to
    stderr; the process then exits with status 1 unless the probe runs with
    ``--liveness-probe``, in which case the failure is only logged.

    :param service: queue name of the service, e.g. ``compute``. Names not
        present in the process table below are not checked at all.
    """
    # presumably "nova-consoleaut" is truncated on purpose to match the
    # name psutil reports for that process - confirm before changing.
    process_names = {
        "compute": "nova-compute",
        "conductor": "nova-conductor",
        "consoleauth": "nova-consoleaut",
        "scheduler": "nova-scheduler"
    }
    rabbit_port, db_port = configured_port_in_conf()

    proc_name = process_names.get(service)
    if proc_name is None:
        return

    rabbit_missing = (rabbit_port != 0 and
                      tcp_socket_status(proc_name, rabbit_port) == 0)
    if rabbit_missing:
        sys.stderr.write("RabbitMQ socket not established")
        # Do not kill the pod if RabbitMQ is not reachable/down
        if not cfg.CONF.liveness_probe:
            sys.exit(1)

    # The database check is skipped for the compute service.
    if service == "compute":
        return

    if db_port != 0 and tcp_socket_status(proc_name, db_port) == 0:
        sys.stderr.write("Database socket not established")
        # Do not kill the pod if database is not reachable/down.
        # A missing socket can be normal: connections typically get
        # closed after an idle timeout. Just log it to pod events.
        if not cfg.CONF.liveness_probe:
            sys.exit(1)
|
||||
|
||||
|
||||
def test_rpc_liveness():
    """Test if service can consume message from queue.

    Registers the probe's command-line options (``--service-queue-name`` and
    ``--liveness-probe``), parses ``sys.argv``, loads the message bus
    transport, then runs the TCP socket checks followed by the RPC check.

    Exits with status 0 (probe success) when the transport driver cannot be
    loaded, or when the transport URL or the service queue name is missing.
    The checks it calls may exit the process themselves.
    """
    oslo_messaging.set_transport_defaults(control_exchange='nova')

    rabbit_group = cfg.OptGroup(name='oslo_messaging_rabbit',
                                title='RabbitMQ options')
    cfg.CONF.register_group(rabbit_group)
    cfg.CONF.register_cli_opt(cfg.StrOpt('service-queue-name'))
    cfg.CONF.register_cli_opt(cfg.BoolOpt('liveness-probe', default=False,
                                          required=False))

    cfg.CONF(sys.argv[1:])

    log.logging.basicConfig(level=log.ERROR)

    try:
        transport = oslo_messaging.get_transport(cfg.CONF)
    except Exception as ex:
        # Exceptions have no ``message`` attribute on Python 3; fall back to
        # str() so this "return success" path cannot crash instead.
        message = getattr(ex, "message", str(ex))
        sys.stderr.write("Message bus driver load error: %s" % message)
        sys.exit(0)  # return success

    if not cfg.CONF.transport_url or \
            not cfg.CONF.service_queue_name:
        sys.stderr.write("Both message bus URL and service's queue name are "
                         "required for health probe to work")
        sys.exit(0)  # return success

    try:
        cfg.CONF.set_override('rabbit_max_retries', 2,
                              group=rabbit_group)  # 3 attempts
    except cfg.NoSuchOptError:
        # The option is not registered; register it with the value we want.
        cfg.CONF.register_opt(cfg.IntOpt('rabbit_max_retries', default=2),
                              group=rabbit_group)

    service = cfg.CONF.service_queue_name
    test_tcp_socket(service)

    check_service_status(transport)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # The checks call sys.exit() themselves once they decide the outcome.
    # Falling through to the next statement means nothing failed.
    test_rpc_liveness()

    sys.exit(0)  # return success
|
@ -51,6 +51,8 @@ data:
|
||||
ceph-admin-keyring.sh: |
|
||||
{{ tuple "bin/_ceph-admin-keyring.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
|
||||
{{- end }}
|
||||
health-probe.py: |
|
||||
{{ tuple "bin/_health-probe.py.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
|
||||
nova-api.sh: |
|
||||
{{ tuple "bin/_nova-api.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
|
||||
nova-api-metadata.sh: |
|
||||
|
@ -180,6 +180,31 @@ spec:
|
||||
- name: LIBVIRT_CEPH_SECRET_UUID
|
||||
value: "{{ .Values.conf.ceph.secret_uuid }}"
|
||||
{{ end }}
|
||||
readinessProbe:
|
||||
exec:
|
||||
command:
|
||||
- python
|
||||
- /tmp/health-probe.py
|
||||
- --config-file
|
||||
- /etc/nova/nova.conf
|
||||
- --service-queue-name
|
||||
- compute
|
||||
initialDelaySeconds: 80
|
||||
periodSeconds: 90
|
||||
timeoutSeconds: 70
|
||||
livenessProbe:
|
||||
exec:
|
||||
command:
|
||||
- python
|
||||
- /tmp/health-probe.py
|
||||
- --config-file
|
||||
- /etc/nova/nova.conf
|
||||
- --service-queue-name
|
||||
- compute
|
||||
- --liveness-probe
|
||||
initialDelaySeconds: 120
|
||||
periodSeconds: 90
|
||||
timeoutSeconds: 70
|
||||
command:
|
||||
- /tmp/nova-compute.sh
|
||||
volumeMounts:
|
||||
@ -187,6 +212,10 @@ spec:
|
||||
mountPath: /tmp/nova-compute.sh
|
||||
subPath: nova-compute.sh
|
||||
readOnly: true
|
||||
- name: nova-bin
|
||||
mountPath: /tmp/health-probe.py
|
||||
subPath: health-probe.py
|
||||
readOnly: true
|
||||
- name: nova-etc
|
||||
mountPath: /etc/nova/nova.conf
|
||||
subPath: nova.conf
|
||||
|
@ -60,6 +60,31 @@ spec:
|
||||
{{ tuple $envAll $envAll.Values.pod.resources.conductor | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
readinessProbe:
|
||||
exec:
|
||||
command:
|
||||
- python
|
||||
- /tmp/health-probe.py
|
||||
- --config-file
|
||||
- /etc/nova/nova.conf
|
||||
- --service-queue-name
|
||||
- conductor
|
||||
initialDelaySeconds: 80
|
||||
periodSeconds: 90
|
||||
timeoutSeconds: 70
|
||||
livenessProbe:
|
||||
exec:
|
||||
command:
|
||||
- python
|
||||
- /tmp/health-probe.py
|
||||
- --config-file
|
||||
- /etc/nova/nova.conf
|
||||
- --service-queue-name
|
||||
- conductor
|
||||
- --liveness-probe
|
||||
initialDelaySeconds: 120
|
||||
periodSeconds: 90
|
||||
timeoutSeconds: 70
|
||||
command:
|
||||
- /tmp/nova-conductor.sh
|
||||
volumeMounts:
|
||||
@ -67,6 +92,10 @@ spec:
|
||||
mountPath: /tmp/nova-conductor.sh
|
||||
subPath: nova-conductor.sh
|
||||
readOnly: true
|
||||
- name: nova-bin
|
||||
mountPath: /tmp/health-probe.py
|
||||
subPath: health-probe.py
|
||||
readOnly: true
|
||||
- name: nova-etc
|
||||
mountPath: /etc/nova/nova.conf
|
||||
subPath: nova.conf
|
||||
|
@ -60,6 +60,31 @@ spec:
|
||||
{{ tuple $envAll $envAll.Values.pod.resources.consoleauth | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
readinessProbe:
|
||||
exec:
|
||||
command:
|
||||
- python
|
||||
- /tmp/health-probe.py
|
||||
- --config-file
|
||||
- /etc/nova/nova.conf
|
||||
- --service-queue-name
|
||||
- consoleauth
|
||||
initialDelaySeconds: 80
|
||||
periodSeconds: 90
|
||||
timeoutSeconds: 70
|
||||
livenessProbe:
|
||||
exec:
|
||||
command:
|
||||
- python
|
||||
- /tmp/health-probe.py
|
||||
- --config-file
|
||||
- /etc/nova/nova.conf
|
||||
- --service-queue-name
|
||||
- consoleauth
|
||||
- --liveness-probe
|
||||
initialDelaySeconds: 120
|
||||
periodSeconds: 90
|
||||
timeoutSeconds: 70
|
||||
command:
|
||||
- /tmp/nova-consoleauth.sh
|
||||
volumeMounts:
|
||||
@ -67,6 +92,10 @@ spec:
|
||||
mountPath: /tmp/nova-consoleauth.sh
|
||||
subPath: nova-consoleauth.sh
|
||||
readOnly: true
|
||||
- name: nova-bin
|
||||
mountPath: /tmp/health-probe.py
|
||||
subPath: health-probe.py
|
||||
readOnly: true
|
||||
- name: nova-etc
|
||||
mountPath: /etc/nova/nova.conf
|
||||
subPath: nova.conf
|
||||
|
@ -94,6 +94,14 @@ spec:
|
||||
- name: nova-novncproxy
|
||||
{{ tuple $envAll "nova_novncproxy" | include "helm-toolkit.snippets.image" | indent 10 }}
|
||||
{{ tuple $envAll $envAll.Values.pod.resources.novncproxy | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
|
||||
readinessProbe:
|
||||
tcpSocket:
|
||||
port: {{ tuple "compute_novnc_proxy" "internal" "novnc_proxy" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
|
||||
initialDelaySeconds: 30
|
||||
livenessProbe:
|
||||
tcpSocket:
|
||||
port: {{ tuple "compute_novnc_proxy" "internal" "novnc_proxy" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
|
||||
initialDelaySeconds: 30
|
||||
command:
|
||||
- /tmp/nova-console-proxy.sh
|
||||
ports:
|
||||
|
@ -60,6 +60,31 @@ spec:
|
||||
{{ tuple $envAll $envAll.Values.pod.resources.scheduler | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
readinessProbe:
|
||||
exec:
|
||||
command:
|
||||
- python
|
||||
- /tmp/health-probe.py
|
||||
- --config-file
|
||||
- /etc/nova/nova.conf
|
||||
- --service-queue-name
|
||||
- scheduler
|
||||
initialDelaySeconds: 80
|
||||
periodSeconds: 90
|
||||
timeoutSeconds: 70
|
||||
livenessProbe:
|
||||
exec:
|
||||
command:
|
||||
- python
|
||||
- /tmp/health-probe.py
|
||||
- --config-file
|
||||
- /etc/nova/nova.conf
|
||||
- --service-queue-name
|
||||
- scheduler
|
||||
- --liveness-probe
|
||||
initialDelaySeconds: 120
|
||||
periodSeconds: 90
|
||||
timeoutSeconds: 70
|
||||
command:
|
||||
- /tmp/nova-scheduler.sh
|
||||
volumeMounts:
|
||||
@ -67,6 +92,10 @@ spec:
|
||||
mountPath: /tmp/nova-scheduler.sh
|
||||
subPath: nova-scheduler.sh
|
||||
readOnly: true
|
||||
- name: nova-bin
|
||||
mountPath: /tmp/health-probe.py
|
||||
subPath: health-probe.py
|
||||
readOnly: true
|
||||
- name: nova-etc
|
||||
mountPath: /etc/nova/nova.conf
|
||||
subPath: nova.conf
|
||||
|
@ -94,6 +94,14 @@ spec:
|
||||
- name: nova-spiceproxy
|
||||
{{ tuple $envAll "nova_spiceproxy" | include "helm-toolkit.snippets.image" | indent 10 }}
|
||||
{{ tuple $envAll $envAll.Values.pod.resources.spiceproxy | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
|
||||
readinessProbe:
|
||||
tcpSocket:
|
||||
port: {{ tuple "compute_spice_proxy" "internal" "spice_proxy" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
|
||||
initialDelaySeconds: 30
|
||||
livenessProbe:
|
||||
tcpSocket:
|
||||
port: {{ tuple "compute_spice_proxy" "internal" "spice_proxy" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
|
||||
initialDelaySeconds: 30
|
||||
command:
|
||||
- /tmp/nova-console-proxy.sh
|
||||
ports:
|
||||
|
Loading…
Reference in New Issue
Block a user