Health probe for Ranger-agent pods

The health probe for Ranger-agent pods is used for both the liveness
and readiness probes.

ranger-agent-api and ranger-agent-engine pods:
- Sends an RPC call with a known method to pod's listener
  queue. Probe is successful if call returns with no error. If
  listener is not reachable or fails to respond in time, returns
  failure to probe.
- Checks that the RPC sockets from ranger-agent pods to RabbitMQ
  are in the ESTABLISHED state.

ranger-agent-api pod:
- Launch a call to pod's open interface. Probe is successful if call
  returns; otherwise failure if response has error or timed out.

Change-Id: I7a22fd50d47e58df19b413ed65ab528e2d78d609
This commit is contained in:
Chi Lo 2019-09-20 10:07:44 -07:00
parent 71fdc5fdb7
commit 3b9adc2bf0
7 changed files with 350 additions and 37 deletions

View File

@ -0,0 +1,256 @@
#!/usr/bin/env python
# Copyright 2019 The Openstack-Helm Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Health probe script for OpenStack service that uses RPC/unix domain socket for
communication. Checks the RPC TCP socket status on the process, then sends a
message to the service through an RPC call and expects a reply.
Script returns failure to Kubernetes only when
a. TCP sockets for the RPC communication are not established,
b. service is not reachable or
c. service times out sending a reply.
sys.stderr.write() writes to pod's events on failures.
Usage example for Ranger-agent-engine:
# python health-probe.py --config-file /etc/ranger-agent/ranger-agent.conf \
# --service-name ranger-agent-engine
"""
import psutil
import requests
import socket
import sys
from oslo_config import cfg
from oslo_context import context
from oslo_log import log
import oslo_messaging
try:
from configparser import ConfigParser
except ImportError:
from ConfigParser import ConfigParser
tcp_established = "ESTABLISHED"
def check_service_status(transport, service_queue_name):
    """Verify service status. Return success if service consumes message.

    Sends the ``invoke_health_probe_rpc`` RPC call to the listener on
    ``service_queue_name`` for this host and terminates the process with
    the probe result; this function never returns normally.

    Exit status:
      * 0 - the call returned and no value in the reply is 'failed'; the
        reply reported a failure but this is a liveness probe; the message
        bus could not be reached; the endpoint rejected the RPC method or
        version (the call still reached the service); or an unexpected
        exception was raised while sending.
      * 1 - the reply reported a failure on a readiness probe; the service
        raised any other remote error; or the call timed out.

    :param transport: oslo.messaging transport used to build the RPC client.
    :param service_queue_name: topic (queue name) the service listens on.
    """
    service_error = False
    try:
        target = oslo_messaging.Target(topic=service_queue_name,
                                       server=socket.gethostname())
        client = oslo_messaging.RPCClient(transport, target,
                                          timeout=75,
                                          retry=0)
        cctxt = client.prepare(version='1.0')
        results = cctxt.call(context.RequestContext(),
                             'invoke_health_probe_rpc')
        if any(value == 'failed' for value in results.values()):
            sys.stderr.write("Health probe detects problem "
                             ": %s\n" % results)
            # A failed dependency must not get the pod killed, so only the
            # readiness probe reports it as an error. The failure is kept
            # in a flag instead of calling sys.exit() here so SystemExit is
            # never raised inside the try block and mistaken for a
            # messaging error.
            service_error = not cfg.CONF.liveness_probe
    except oslo_messaging.exceptions.MessageDeliveryFailure:
        # Log to pod events
        sys.stderr.write("Health probe unable to reach message bus\n")
        sys.exit(0)  # return success
    except oslo_messaging.rpc.client.RemoteError as remote_err:
        message = getattr(remote_err, "message", str(remote_err))
        if ("Endpoint does not support RPC method" in message) or \
                ("Endpoint does not support RPC version" in message):
            sys.exit(0)  # Call reached the service
        else:
            sys.stderr.write("Health probe unable to reach service\n")
            sys.exit(1)  # return failure
    except oslo_messaging.exceptions.MessagingTimeout:
        sys.stderr.write("Health probe timed out. Service is down or "
                         "response timed out\n")
        sys.exit(1)  # return failure
    except Exception as ex:
        # Unknown problems on the probe side are not held against the pod.
        message = getattr(ex, "message", str(ex))
        sys.stderr.write("Health probe caught exception sending message to "
                         "service: %s\n" % message)
        sys.exit(0)

    if service_error:
        sys.exit(1)
    else:
        sys.exit(0)
def tcp_socket_status(process, ports):
    """Check the tcp socket status on a process.

    Walks the running processes whose name is contained in ``process`` and
    looks for a connection in the ESTABLISHED state whose remote port is in
    ``ports``. The first matching process is remembered as the parent;
    unless ``cfg.CONF.check_all_pids`` is set, later matching processes
    that are its direct children are skipped.

    :param process: process name to match; a psutil process name matches
        when it is contained in this value.
    :param ports: collection of remote ports to look for.
    :returns: 1 if at least one matching established socket is found,
        otherwise 0.
    """
    parent_id = 0
    for pid in psutil.pids():
        try:
            proc = psutil.Process(pid)
            if proc.name() not in process:
                continue
            if parent_id == 0:
                parent_id = proc.pid
            elif proc.ppid() == parent_id and not cfg.CONF.check_all_pids:
                continue
            for con in proc.connections():
                # Sockets without a remote endpoint carry an empty raddr.
                if not con.raddr:
                    continue
                if con.raddr[1] in ports and con.status == tcp_established:
                    # One established socket is enough; stop scanning.
                    return 1
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # The process went away or cannot be inspected; neither should
            # abort the probe.
            continue
    return 0
def get_rabbitmq_ports():
    """Get the rabbitmq port from config file.

    Parses the transport URL from ``cfg.CONF`` and collects the port of
    every host listed in it. If parsing fails, the error is written to
    stderr and the process exits with status 0 so that a probe-side
    configuration problem is not reported as a service failure.

    :returns: set of RabbitMQ ports.
    """
    # NOTE(review): 5672 is the port the oslo.messaging RabbitMQ driver is
    # believed to fall back to when the transport URL omits one - confirm
    # against the deployed oslo.messaging version.
    default_port = 5672
    rabbit_ports = set()
    try:
        transport_url = oslo_messaging.TransportURL.parse(cfg.CONF)
        for host in transport_url.hosts:
            # A host given without a port would otherwise add None to the
            # set; None never matches a socket's remote port, so the socket
            # check would always fail.
            rabbit_ports.add(host.port or default_port)
    except Exception as ex:
        message = getattr(ex, "message", str(ex))
        sys.stderr.write("Health probe caught exception reading "
                         "RabbitMQ ports: %s" % message)
        sys.exit(0)  # return success
    return rabbit_ports
def test_tcp_socket(service_name):
    """Check tcp socket to rabbitmq is in Established state.

    Does nothing when no RabbitMQ ports could be determined. When ports are
    known but no established socket to them is found, a message is written
    to stderr and, for a readiness probe only, the process exits with
    status 1.

    :param service_name: name of the service; it is also used as the
        process name when looking up sockets.
    """
    r_ports = get_rabbitmq_ports()
    # service_name is the same as process name for ranger-agent app
    if r_ports and tcp_socket_status(service_name, r_ports) == 0:
        sys.stderr.write("RabbitMQ socket not established\n")
        # Do not kill the pod if RabbitMQ is not reachable/down
        if not cfg.CONF.liveness_probe:
            sys.exit(1)
def test_ranger_agent_api_reachable():
    """Test ranger-agent-api for response.

    Reads the API port from the ``[api]`` section of the configuration
    file(s) given in ``cfg.CONF.config_file`` and issues a GET request to
    the local ``/v1/ord/health_check`` endpoint. Returns normally only when
    the endpoint answers with HTTP 200; any request error or other status
    is written to stderr and the process exits with status 1.
    """
    # get ranger-agent-api port
    config = ConfigParser()
    config.read(cfg.CONF.config_file)
    port = config.get('api', 'port')
    url = "http://localhost:{}/v1/ord/health_check".format(port)
    try:
        response = requests.get(url, timeout=30)
    except requests.exceptions.ConnectionError as ce:
        message = getattr(ce, "message", str(ce))
        sys.stderr.write("Health probe ConnectionError Exp: %s\n" % message)
        sys.exit(1)
    except requests.exceptions.ReadTimeout as to:
        message = getattr(to, "message", str(to))
        sys.stderr.write("Health probe ReadTimeout Exp: %s\n" % message)
        sys.exit(1)
    except Exception as ex:
        message = getattr(ex, "message", str(ex))
        sys.stderr.write("Health probe caught Unknown Exp: %s\n" % message)
        sys.exit(1)

    if response.status_code != 200:
        # Report the reason so the failure is visible in the pod events,
        # like every other failure path of the probe.
        sys.stderr.write("Health probe got unexpected status %s from %s\n"
                         % (response.status_code, url))
        sys.exit(1)
def test_rpc_liveness(rabbit_group, service_queue_name):
    """Test if service can consume message from queue.

    Builds the message bus transport from ``cfg.CONF``, makes sure the
    ``rabbit_max_retries`` option is set to 2 in ``rabbit_group`` and then
    hands over to ``check_service_status``. A transport that cannot be
    loaded, a missing transport URL or a missing queue name is reported on
    stderr and ends the process with status 0.

    :param rabbit_group: option group holding the RabbitMQ options.
    :param service_queue_name: queue name the service listens on.
    """
    try:
        bus = oslo_messaging.get_transport(cfg.CONF)
    except Exception as err:
        sys.stderr.write("Message bus driver load error: %s"
                         % getattr(err, "message", str(err)))
        sys.exit(0)  # return success

    if not (cfg.CONF.transport_url and service_queue_name):
        sys.stderr.write("Both message bus URL and service's queue name are "
                         "required for health probe to work")
        sys.exit(0)  # return success

    try:
        # 3 attempts
        cfg.CONF.set_override('rabbit_max_retries', 2, group=rabbit_group)
    except cfg.NoSuchOptError:
        # The option is not registered yet, so register it with the same
        # value as its default instead.
        retries_opt = cfg.IntOpt('rabbit_max_retries', default=2)
        cfg.CONF.register_opt(retries_opt, group=rabbit_group)

    check_service_status(bus, service_queue_name)
def run_health_check():
    """Parse the command line and run the checks for the given service.

    Registers the ``service-name``, ``liveness-probe`` and
    ``check-all-pids`` command-line options, maps the service name to its
    queue and runs the checks in order: the HTTP check (ranger-agent-api
    only), the RabbitMQ socket check and the RPC check. An unknown service
    name is reported on stderr and ends the process with status 0.
    """
    oslo_messaging.set_transport_defaults(control_exchange='openstack')

    conf = cfg.CONF
    rabbit_group = cfg.OptGroup(name='oslo_messaging_rabbit',
                                title='RabbitMQ options')
    conf.register_group(rabbit_group)
    conf.register_cli_opt(cfg.StrOpt('service-name'))
    for flag in ('liveness-probe', 'check-all-pids'):
        conf.register_cli_opt(cfg.BoolOpt(flag, default=False,
                                          required=False))

    conf(sys.argv[1:])

    log.logging.basicConfig(level=log.ERROR)

    # Queue each supported service listens on.
    queue_by_service = {
        "ranger-agent-engine": "ord-notifier-q",
        "ranger-agent-api": "ord-listener-q"
    }

    service_name = conf.service_name
    service_queue_name = queue_by_service.get(service_name)
    if service_queue_name is None:
        sys.stderr.write("Invalid service name: %s\n" % service_name)
        sys.exit(0)  # return success

    if service_name == 'ranger-agent-api':
        test_ranger_agent_api_reachable()

    test_tcp_socket(service_name)
    test_rpc_liveness(rabbit_group, service_queue_name)
# Script entry point: run the probe when executed directly.
if __name__ == "__main__":
    run_health_check()
    # Reached only if none of the checks exited the process first.
    sys.exit(0)  # return success

View File

@ -21,7 +21,7 @@ set -ex
# Come up with a ranger agent payload # Come up with a ranger agent payload
region="${REGION_NAME}" region="${REGION_NAME}"
url="${RANGER_SERVICE_URL}" url="${RANGER_SERVICE_URL}"
UUID=$(python -c 'import uuid; print uuid.uuid1()') UUID=$(python -c 'import uuid; print(uuid.uuid1())')
PAYLOAD="{\"ord-notifier\":{ PAYLOAD="{\"ord-notifier\":{
\"request-id\":\"$UUID\", \"request-id\":\"$UUID\",
@ -47,11 +47,11 @@ function assertContains()
msg="$(curl -s "$url?Id=$UUID")" msg="$(curl -s "$url?Id=$UUID")"
fi fi
if echo "$msg" | grep -q "$expected"; then if echo "$msg" | grep -q "$expected"; then
echo "***TEST IS PASSED: EXPECTED=$expected is in Responce" echo "***TEST IS PASSED: EXPECTED=$expected is in Response"
break break
else else
if [ "$n" == "5" ]; then if [ "$n" == "5" ]; then
echo "***FAILED: EXPECTED=$expected in Responce" echo "***FAILED: EXPECTED=$expected in Response"
exit 1 exit 1
fi fi
n=$[$n+1] n=$[$n+1]

View File

@ -38,8 +38,8 @@ data:
{{ tuple "bin/_ranger-agent-api.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} {{ tuple "bin/_ranger-agent-api.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
ranger-agent-engine.sh: | ranger-agent-engine.sh: |
{{ tuple "bin/_ranger-agent-engine.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} {{ tuple "bin/_ranger-agent-engine.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
health-check.sh: |+ health-probe.py: |
{{ tuple "bin/_health-check.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} {{ tuple "bin/_health-probe.py.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
ranger-agent-test.sh: |+ ranger-agent-test.sh: |+
{{ tuple "bin/_ranger-agent-test.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} {{ tuple "bin/_ranger-agent-test.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
rabbit-init.sh: | rabbit-init.sh: |

View File

@ -14,15 +14,37 @@ See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
*/}} */}}
{{- define "RangerAgentApiReadinessProbeTemplate" }}
exec:
command:
- python
- /tmp/health-probe.py
- --config-file
- /etc/ranger-agent/ranger-agent.conf
- --service-name
- ranger-agent-api
{{- end }}
{{- define "RangerAgentApiLivenessProbeTemplate" }}
exec:
command:
- python
- /tmp/health-probe.py
- --config-file
- /etc/ranger-agent/ranger-agent.conf
- --service-name
- ranger-agent-api
- --liveness-probe
{{- end }}
{{- if .Values.manifests.deployment_ranger_agent_api }} {{- if .Values.manifests.deployment_ranger_agent_api }}
{{- $envAll := . }} {{- $envAll := . }}
{{- $mounts_ranger_agent_api := .Values.pod.mounts.ranger_agent_api.ranger_agent_api }} {{- $mounts_ranger_agent_api := .Values.pod.mounts.ranger_agent_api.ranger_agent_api }}
{{- $mounts_ranger_agent_api_init := .Values.pod.mounts.ranger_agent_api.init_container }} {{- $mounts_ranger_agent_api_init := .Values.pod.mounts.ranger_agent_api.init_container }}
{{- $serviceAccountName := "ranger-agent-api" }} {{- $serviceAccountName := "ranger-agent-api" }}
{{ tuple $envAll "api" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }} {{ tuple $envAll "api" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }}
--- ---
apiVersion: apps/v1beta1 apiVersion: apps/v1beta1
kind: Deployment kind: Deployment
@ -71,16 +93,8 @@ spec:
ports: ports:
- name: ranger-api - name: ranger-api
containerPort: {{ tuple "ranger-agent" "internal" "api" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }} containerPort: {{ tuple "ranger-agent" "internal" "api" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
livenessProbe: {{ dict "envAll" $envAll "component" "api" "container" "ranger-agent-api" "type" "readiness" "probeTemplate" (include "RangerAgentApiReadinessProbeTemplate" $envAll | fromYaml) | include "helm-toolkit.snippets.kubernetes_probe" | indent 10 }}
tcpSocket: {{ dict "envAll" $envAll "component" "api" "container" "ranger-agent-api" "type" "liveness" "probeTemplate" (include "RangerAgentApiLivenessProbeTemplate" $envAll | fromYaml) | include "helm-toolkit.snippets.kubernetes_probe" | indent 10 }}
port: {{ tuple "ranger-agent" "internal" "api" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
readinessProbe:
exec:
command:
- /tmp/health-check.sh
- apireadiness
initialDelaySeconds: 30
timeoutSeconds: 5
volumeMounts: volumeMounts:
- name: pod-etc-ranger-agent - name: pod-etc-ranger-agent
mountPath: /etc/ranger-agent mountPath: /etc/ranger-agent
@ -89,8 +103,8 @@ spec:
subPath: ranger-agent-api.sh subPath: ranger-agent-api.sh
readOnly: true readOnly: true
- name: ranger-agent-bin - name: ranger-agent-bin
mountPath: /tmp/health-check.sh mountPath: /tmp/health-probe.py
subPath: health-check.sh subPath: health-probe.py
readOnly: true readOnly: true
- name: ranger-agent-etc - name: ranger-agent-etc
mountPath: /etc/ranger-agent/ranger-agent.conf mountPath: /etc/ranger-agent/ranger-agent.conf

View File

@ -13,6 +13,29 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
*/}} */}}
{{- define "RangerAgentEngineReadinessProbeTemplate" }}
exec:
command:
- python
- /tmp/health-probe.py
- --config-file
- /etc/ranger-agent/ranger-agent.conf
- --service-name
- ranger-agent-engine
{{- end }}
{{- define "RangerAgentEngineLivenessProbeTemplate" }}
exec:
command:
- python
- /tmp/health-probe.py
- --config-file
- /etc/ranger-agent/ranger-agent.conf
- --service-name
- ranger-agent-engine
- --liveness-probe
{{- end }}
{{- if .Values.manifests.deployment_ranger_agent_engine }} {{- if .Values.manifests.deployment_ranger_agent_engine }}
{{- $envAll := . }} {{- $envAll := . }}
@ -21,6 +44,7 @@ limitations under the License.
{{- $serviceAccountName := "ranger-agent-engine" }} {{- $serviceAccountName := "ranger-agent-engine" }}
{{ tuple $envAll "engine" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }} {{ tuple $envAll "engine" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }}
--- ---
apiVersion: apps/v1beta1 apiVersion: apps/v1beta1
kind: Deployment kind: Deployment
@ -92,20 +116,8 @@ spec:
command: command:
- /tmp/ranger-agent-engine.sh - /tmp/ranger-agent-engine.sh
- stop - stop
livenessProbe: {{ dict "envAll" $envAll "component" "engine" "container" "ranger-agent-engine" "type" "readiness" "probeTemplate" (include "RangerAgentEngineReadinessProbeTemplate" $envAll | fromYaml) | include "helm-toolkit.snippets.kubernetes_probe" | indent 10 }}
exec: {{ dict "envAll" $envAll "component" "engine" "container" "ranger-agent-engine" "type" "liveness" "probeTemplate" (include "RangerAgentEngineLivenessProbeTemplate" $envAll | fromYaml) | include "helm-toolkit.snippets.kubernetes_probe" | indent 10 }}
command:
- /tmp/health-check.sh
- engineliveness
initialDelaySeconds: 30
timeoutSeconds: 5
readinessProbe:
exec:
command:
- /tmp/health-check.sh
- enginereadiness
initialDelaySeconds: 30
timeoutSeconds: 5
volumeMounts: volumeMounts:
- name: pod-etc-ranger-agent - name: pod-etc-ranger-agent
mountPath: /etc/ranger-agent mountPath: /etc/ranger-agent
@ -114,8 +126,8 @@ spec:
subPath: ranger-agent-engine.sh subPath: ranger-agent-engine.sh
readOnly: true readOnly: true
- name: ranger-agent-bin - name: ranger-agent-bin
mountPath: /tmp/health-check.sh mountPath: /tmp/health-probe.py
subPath: health-check.sh subPath: health-probe.py
readOnly: true readOnly: true
- name: ranger-agent-etc - name: ranger-agent-etc
mountPath: /etc/ranger-agent/ranger-agent.conf mountPath: /etc/ranger-agent/ranger-agent.conf

View File

@ -27,8 +27,8 @@ images:
ks_service: docker.io/openstackhelm/heat:newton-ubuntu_xenial ks_service: docker.io/openstackhelm/heat:newton-ubuntu_xenial
ks_user: docker.io/openstackhelm/heat:newton-ubuntu_xenial ks_user: docker.io/openstackhelm/heat:newton-ubuntu_xenial
rabbit_init: docker.io/rabbitmq:3.7-management rabbit_init: docker.io/rabbitmq:3.7-management
ranger-agent_db_sync: quay.io/attcomdev/ranger-agent:60529ac023bf550f0e9cb9e0eb4d4eb3dbf2d5c6 ranger-agent_db_sync: quay.io/attcomdev/ranger-agent:02114b616b50c24e7f1f27d9b1ab3d722b4b20b2
ranger_agent: quay.io/attcomdev/ranger-agent:60529ac023bf550f0e9cb9e0eb4d4eb3dbf2d5c6 ranger_agent: quay.io/attcomdev/ranger-agent:02114b616b50c24e7f1f27d9b1ab3d722b4b20b2
scripted_test: docker.io/openstackhelm/heat:newton-ubuntu_xenial scripted_test: docker.io/openstackhelm/heat:newton-ubuntu_xenial
pull_policy: "IfNotPresent" pull_policy: "IfNotPresent"
local_registry: local_registry:
@ -264,6 +264,35 @@ pod:
limits: limits:
memory: "1024Mi" memory: "1024Mi"
cpu: "2000m" cpu: "2000m"
probes:
api:
ranger-agent-api:
readiness:
enabled: true
params:
initialDelaySeconds: 80
periodSeconds: 95
timeoutSeconds: 85
liveness:
enabled: true
params:
initialDelaySeconds: 120
periodSeconds: 95
timeoutSeconds: 85
engine:
ranger-agent-engine:
readiness:
enabled: true
params:
initialDelaySeconds: 80
periodSeconds: 95
timeoutSeconds: 85
liveness:
enabled: true
params:
initialDelaySeconds: 120
periodSeconds: 95
timeoutSeconds: 85
# Names of secrets used and environmental checks # Names of secrets used and environmental checks
secrets: secrets:
@ -517,6 +546,7 @@ conf:
api_paste_config: /etc/ranger-agent/api-paste.ini api_paste_config: /etc/ranger-agent/api-paste.ini
local_repo: ranger_repo local_repo: ranger_repo
resource_status_check_wait: 15 resource_status_check_wait: 15
enable_heat_health_check: true
api: api:
host: 0.0.0.0 host: 0.0.0.0
database: database:

View File

@ -7,7 +7,8 @@ tee /tmp/ranger-agent.yaml << EOF
conf: conf:
ranger_agent: ranger_agent:
DEFAULT: DEFAULT:
enable_rds_callback_check: False enable_rds_callback_check: false
enable_heat_health_check: false
ssh: ssh:
ssh_key: null ssh_key: null
ssh_config: null ssh_config: null