From 3b9adc2bf056b8e3ec5f800e189ebf31d51c0ab4 Mon Sep 17 00:00:00 2001
From: Chi Lo
Date: Fri, 20 Sep 2019 10:07:44 -0700
Subject: [PATCH] Health probe for Ranger-agent pods

Health probe for Ranger-agent pods is used for both liveness
and readiness probe.

ranger-agent-api and ranger-agent-engine pods:
- Sends an RPC call with a known method to pod's listener queue.
  Probe is successful if call returns with no error. If listener
  is not reachable or fails to respond in time, returns failure
  to probe.
- Check if the rpc socket status on ranger-agent pods to rabbitmq
  are in established state.

ranger-agent-api pod:
- Launch a call to pod's open interface. Probe is successful if
  call returns; otherwise failure if response has error or timed out.

Change-Id: I7a22fd50d47e58df19b413ed65ab528e2d78d609
---
 .../templates/bin/_health-probe.py.tpl        | 256 ++++++++++++++++++
 .../templates/bin/_ranger-agent-test.sh.tpl   |   6 +-
 ranger-agent/templates/configmap-bin.yaml     |   4 +-
 .../deployment-ranger-agent-api.yaml          |  40 ++-
 .../deployment-ranger-agent-engine.yaml       |  44 +--
 ranger-agent/values.yaml                      |  34 ++-
 tools/gate/scripts/070-deploy-ranger-agent.sh |   3 +-
 7 files changed, 350 insertions(+), 37 deletions(-)
 create mode 100644 ranger-agent/templates/bin/_health-probe.py.tpl

diff --git a/ranger-agent/templates/bin/_health-probe.py.tpl b/ranger-agent/templates/bin/_health-probe.py.tpl
new file mode 100644
index 00000000..21f6f75d
--- /dev/null
+++ b/ranger-agent/templates/bin/_health-probe.py.tpl
@@ -0,0 +1,256 @@
+#!/usr/bin/env python
+
+# Copyright 2019 The Openstack-Helm Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Health probe script for OpenStack service that uses RPC/unix domain socket for
+communication. Checks the RPC tcp socket status on the process and sends a
+message to the service through an rpc call method and expects a reply.
+
+Script returns failure to Kubernetes only when
+  a. TCP socket for the RPC communication are not established.
+  b. service is not reachable or
+  c. service times out sending a reply.
+
+sys.stderr.write() writes to pod's events on failures.
+
+Usage example for Ranger-agent-engine:
+# python health-probe.py --config-file /etc/ranger-agent/ranger-agent.conf \
+#  --service-name ranger-agent-engine
+
+"""
+
+import psutil
+import requests
+import socket
+import sys
+
+from oslo_config import cfg
+from oslo_context import context
+from oslo_log import log
+import oslo_messaging
+
+try:
+    from configparser import ConfigParser
+except ImportError:
+    from ConfigParser import ConfigParser
+
+tcp_established = "ESTABLISHED"
+
+
+def check_service_status(transport, service_queue_name):
+    """Verify service status. Return success if service consumes message.
+
+    Sends the 'invoke_health_probe_rpc' RPC call to the service queue on
+    this host and exits the process with the probe result on any error.
+    Returns normally when the call succeeded (or, for liveness probes,
+    when the service merely reported a failed check).
+    """
+    try:
+        target = oslo_messaging.Target(topic=service_queue_name,
+                                       server=socket.gethostname())
+        client = oslo_messaging.RPCClient(transport, target,
+                                          timeout=75,
+                                          retry=0)
+        cctxt = client.prepare(version='1.0')
+        results = cctxt.call(context.RequestContext(),
+                             'invoke_health_probe_rpc')
+    except oslo_messaging.exceptions.MessageDeliveryFailure:
+        # Log to pod events
+        sys.stderr.write("Health probe unable to reach message bus\n")
+        sys.exit(0)  # return success
+    except oslo_messaging.rpc.client.RemoteError as re:
+        message = getattr(re, "message", str(re))
+        if ("Endpoint does not support RPC method" in message) or \
+                ("Endpoint does not support RPC version" in message):
+            sys.exit(0)  # Call reached the service
+        else:
+            sys.stderr.write("Health probe unable to reach service\n")
+            sys.exit(1)  # return failure
+    except oslo_messaging.exceptions.MessagingTimeout:
+        sys.stderr.write("Health probe timed out. Service is down or "
+                         "response timed out\n")
+        sys.exit(1)  # return failure
+    except Exception as ex:
+        message = getattr(ex, "message", str(ex))
+        sys.stderr.write("Health probe caught exception sending message to "
+                         "service: %s\n" % message)
+        sys.exit(0)
+
+    # Inspect the reply outside of the try block: sys.exit() raises
+    # SystemExit, and no exception handler may intercept it here.
+    if isinstance(results, dict) and 'failed' in results.values():
+        sys.stderr.write("Health probe detects problem "
+                         ": %s\n" % results)
+        # Do not kill the pod on a failed dependency check
+        if not cfg.CONF.liveness_probe:
+            sys.exit(1)
+
+
+def tcp_socket_status(process, ports):
+    """Check the tcp socket status on a process.
+
+    Returns 1 if a process whose name is contained in ``process`` has an
+    ESTABLISHED connection to one of the remote ``ports``, otherwise 0.
+    """
+    parent_pid = 0
+    for pid in psutil.pids():
+        try:
+            p = psutil.Process(pid)
+            if p.name() not in process:
+                continue
+            if parent_pid == 0:
+                parent_pid = p.pid
+            elif p.ppid() == parent_pid and not cfg.CONF.check_all_pids:
+                # Skip children of the first matching process unless
+                # --check-all-pids was requested
+                continue
+            for con in p.connections():
+                # raddr is empty for sockets without a remote endpoint
+                if not con.raddr:
+                    continue
+                if con.raddr[1] in ports and con.status == tcp_established:
+                    return 1
+        except (psutil.NoSuchProcess, psutil.AccessDenied):
+            # Process went away or cannot be inspected; try the next one
+            continue
+
+    return 0
+
+
+def get_rabbitmq_ports():
+    """Get the rabbitmq port from config file"""
+    rabbit_ports = set()
+
+    try:
+        transport_url = oslo_messaging.TransportURL.parse(cfg.CONF)
+        for host in transport_url.hosts:
+            rabbit_ports.add(host.port)
+    except Exception as ex:
+        message = getattr(ex, "message", str(ex))
+        sys.stderr.write("Health probe caught exception reading "
+                         "RabbitMQ ports: %s\n" % message)
+        sys.exit(0)  # return success
+
+    return rabbit_ports
+
+
+def test_tcp_socket(service_name):
+    """Check tcp socket to rabbitmq is in Established state"""
+    r_ports = get_rabbitmq_ports()
+
+    # service_name is the same as process name for ranger-agent app
+    if r_ports and tcp_socket_status(service_name, r_ports) == 0:
+        sys.stderr.write("RabbitMQ socket not established\n")
+        # Do not kill the pod if RabbitMQ is not reachable/down
+        if not cfg.CONF.liveness_probe:
+            sys.exit(1)
+
+
+def test_ranger_agent_api_reachable():
+    """Test ranger-agent-api for response"""
+
+    # get ranger-agent-api port
+    config = ConfigParser()
+    config.read(cfg.CONF.config_file)
+    port = config.get('api', 'port')
+
+    url = "http://localhost:{}/v1/ord/health_check".format(port)
+    try:
+        response = requests.get(url, timeout=30)
+    except requests.exceptions.ConnectionError as ce:
+        message = getattr(ce, "message", str(ce))
+        sys.stderr.write("Health probe ConnectionError Exp: %s\n" % message)
+        sys.exit(1)
+    except requests.exceptions.ReadTimeout as to:
+        message = getattr(to, "message", str(to))
+        sys.stderr.write("Health probe ReadTimeout Exp: %s\n" % message)
+        sys.exit(1)
+    except Exception as ex:
+        message = getattr(ex, "message", str(ex))
+        sys.stderr.write("Health probe caught Unknown Exp: %s\n" % message)
+        sys.exit(1)
+
+    if response.status_code != 200:
+        sys.stderr.write("Health probe got HTTP status %s from %s\n"
+                         % (response.status_code, url))
+        sys.exit(1)
+
+
+def test_rpc_liveness(rabbit_group, service_queue_name):
+    """Test if service can consume message from queue"""
+    try:
+        transport = oslo_messaging.get_transport(cfg.CONF)
+    except Exception as ex:
+        message = getattr(ex, "message", str(ex))
+        sys.stderr.write("Message bus driver load error: %s\n" % message)
+        sys.exit(0)  # return success
+
+    if not cfg.CONF.transport_url or \
+            not service_queue_name:
+        sys.stderr.write("Both message bus URL and service's queue name are "
+                         "required for health probe to work\n")
+        sys.exit(0)  # return success
+
+    try:
+        cfg.CONF.set_override('rabbit_max_retries', 2,
+                              group=rabbit_group)  # 3 attempts
+    except cfg.NoSuchOptError:
+        cfg.CONF.register_opt(cfg.IntOpt('rabbit_max_retries', default=2),
+                              group=rabbit_group)
+
+    check_service_status(transport, service_queue_name)
+
+
+def run_health_check():
+    """Parse the probe options and run the checks for the given service"""
+    oslo_messaging.set_transport_defaults(control_exchange='openstack')
+
+    rabbit_group = cfg.OptGroup(name='oslo_messaging_rabbit',
+                                title='RabbitMQ options')
+    cfg.CONF.register_group(rabbit_group)
+    cfg.CONF.register_cli_opt(cfg.StrOpt('service-name'))
+    cfg.CONF.register_cli_opt(cfg.BoolOpt('liveness-probe', default=False,
+                                          required=False))
+    cfg.CONF.register_cli_opt(cfg.BoolOpt('check-all-pids', default=False,
+                                          required=False))
+
+    cfg.CONF(sys.argv[1:])
+
+    log.logging.basicConfig(level=log.ERROR)
+
+    dict_services = {
+        "ranger-agent-engine": "ord-notifier-q",
+        "ranger-agent-api": "ord-listener-q"
+    }
+
+    service_name = cfg.CONF.service_name
+    if service_name in dict_services:
+        service_queue_name = dict_services[service_name]
+    else:
+        sys.stderr.write("Invalid service name: %s\n" % service_name)
+        sys.exit(0)  # return success
+
+    if service_name == 'ranger-agent-api':
+        test_ranger_agent_api_reachable()
+
+    test_tcp_socket(service_name)
+    test_rpc_liveness(rabbit_group, service_queue_name)
+
+
+if __name__ == "__main__":
+    run_health_check()
+
+    sys.exit(0)  # return success
diff --git a/ranger-agent/templates/bin/_ranger-agent-test.sh.tpl b/ranger-agent/templates/bin/_ranger-agent-test.sh.tpl
index 3d716413..f48173ce 100644
--- a/ranger-agent/templates/bin/_ranger-agent-test.sh.tpl
+++ b/ranger-agent/templates/bin/_ranger-agent-test.sh.tpl
@@ -21,7 +21,7 @@ set -ex
 # Come up with a ranger agent payload
 region="${REGION_NAME}"
 url="${RANGER_SERVICE_URL}"
-UUID=$(python -c 'import uuid; print uuid.uuid1()')
+UUID=$(python -c 'import uuid; print(uuid.uuid1())')
 
 PAYLOAD="{\"ord-notifier\":{
     \"request-id\":\"$UUID\",
@@ -47,11 +47,11 @@ function assertContains()
       msg="$(curl -s "$url?Id=$UUID")"
     fi
     if echo "$msg" | grep -q "$expected"; then
-      echo "***TEST IS PASSED: EXPECTED=$expected is in Responce"
+      echo "***TEST IS PASSED: EXPECTED=$expected is in Response"
       break
     else
       if [ "$n" == "5" ]; then
-        echo "***FAILED: EXPECTED=$expected in Responce"
+        echo "***FAILED: EXPECTED=$expected in Response"
         exit 1
       fi
       n=$[$n+1]
diff --git a/ranger-agent/templates/configmap-bin.yaml b/ranger-agent/templates/configmap-bin.yaml
index fd6ca8db..406a30ac 100755
--- a/ranger-agent/templates/configmap-bin.yaml
+++ b/ranger-agent/templates/configmap-bin.yaml
@@ -38,8 +38,8 @@ data:
 {{ tuple "bin/_ranger-agent-api.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
   ranger-agent-engine.sh: |
 {{ tuple "bin/_ranger-agent-engine.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
-  health-check.sh: |+
-{{ tuple "bin/_health-check.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
+  health-probe.py: |
+{{ tuple "bin/_health-probe.py.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
   ranger-agent-test.sh: |+
 {{ tuple "bin/_ranger-agent-test.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
   rabbit-init.sh: |
diff --git a/ranger-agent/templates/deployment-ranger-agent-api.yaml b/ranger-agent/templates/deployment-ranger-agent-api.yaml
index 4756d2e7..8644f133 100755
--- a/ranger-agent/templates/deployment-ranger-agent-api.yaml
+++ b/ranger-agent/templates/deployment-ranger-agent-api.yaml
@@ -14,15 +14,37 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */}}
 
+{{- define "RangerAgentApiReadinessProbeTemplate" }}
+exec:
+  command:
+    - python
+    - /tmp/health-probe.py
+    - --config-file
+    - /etc/ranger-agent/ranger-agent.conf
+    - --service-name
+    - ranger-agent-api
+{{- end }}
+{{- define "RangerAgentApiLivenessProbeTemplate" }}
+exec:
+  command:
+    - python
+    - /tmp/health-probe.py
+    - --config-file
+    - /etc/ranger-agent/ranger-agent.conf
+    - --service-name
+    - ranger-agent-api
+    - --liveness-probe
+{{- end }}
+
 {{- if .Values.manifests.deployment_ranger_agent_api }}
 {{- $envAll := . }}
 
 {{- $mounts_ranger_agent_api := .Values.pod.mounts.ranger_agent_api.ranger_agent_api }}
 {{- $mounts_ranger_agent_api_init := .Values.pod.mounts.ranger_agent_api.init_container }}
 
-
 {{- $serviceAccountName := "ranger-agent-api" }}
 {{ tuple $envAll "api" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }}
+
 ---
 apiVersion: apps/v1beta1
 kind: Deployment
@@ -71,16 +93,8 @@ spec:
           ports:
             - name: ranger-api
               containerPort: {{ tuple "ranger-agent" "internal" "api" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
-          livenessProbe:
-            tcpSocket:
-              port: {{ tuple "ranger-agent" "internal" "api" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
-          readinessProbe:
-            exec:
-              command:
-                - /tmp/health-check.sh
-                - apireadiness
-            initialDelaySeconds: 30
-            timeoutSeconds: 5
+{{ dict "envAll" $envAll "component" "api" "container" "ranger-agent-api" "type" "readiness" "probeTemplate" (include "RangerAgentApiReadinessProbeTemplate" $envAll | fromYaml) | include "helm-toolkit.snippets.kubernetes_probe" | indent 10 }}
+{{ dict "envAll" $envAll "component" "api" "container" "ranger-agent-api" "type" "liveness" "probeTemplate" (include "RangerAgentApiLivenessProbeTemplate" $envAll | fromYaml) | include "helm-toolkit.snippets.kubernetes_probe" | indent 10 }}
           volumeMounts:
             - name: pod-etc-ranger-agent
               mountPath: /etc/ranger-agent
@@ -89,8 +103,8 @@ spec:
               subPath: ranger-agent-api.sh
               readOnly: true
             - name: ranger-agent-bin
-              mountPath: /tmp/health-check.sh
-              subPath: health-check.sh
+              mountPath: /tmp/health-probe.py
+              subPath: health-probe.py
               readOnly: true
             - name: ranger-agent-etc
               mountPath: /etc/ranger-agent/ranger-agent.conf
diff --git a/ranger-agent/templates/deployment-ranger-agent-engine.yaml b/ranger-agent/templates/deployment-ranger-agent-engine.yaml
index 3e140d56..e6bc66d1 100755
--- a/ranger-agent/templates/deployment-ranger-agent-engine.yaml
+++ b/ranger-agent/templates/deployment-ranger-agent-engine.yaml
@@ -13,6 +13,29 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */}}
+
+{{- define "RangerAgentEngineReadinessProbeTemplate" }}
+exec:
+  command:
+    - python
+    - /tmp/health-probe.py
+    - --config-file
+    - /etc/ranger-agent/ranger-agent.conf
+    - --service-name
+    - ranger-agent-engine
+{{- end }}
+{{- define "RangerAgentEngineLivenessProbeTemplate" }}
+exec:
+  command:
+    - python
+    - /tmp/health-probe.py
+    - --config-file
+    - /etc/ranger-agent/ranger-agent.conf
+    - --service-name
+    - ranger-agent-engine
+    - --liveness-probe
+{{- end }}
+
 {{- if .Values.manifests.deployment_ranger_agent_engine }}
 {{- $envAll := . }}
 
@@ -21,6 +44,7 @@ limitations under the License.
 
 {{- $serviceAccountName := "ranger-agent-engine" }}
 {{ tuple $envAll "engine" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }}
+
 ---
 apiVersion: apps/v1beta1
 kind: Deployment
@@ -92,20 +116,8 @@ spec:
                 command:
                   - /tmp/ranger-agent-engine.sh
                   - stop
-          livenessProbe:
-            exec:
-              command:
-                - /tmp/health-check.sh
-                - engineliveness
-            initialDelaySeconds: 30
-            timeoutSeconds: 5
-          readinessProbe:
-            exec:
-              command:
-                - /tmp/health-check.sh
-                - enginereadiness
-            initialDelaySeconds: 30
-            timeoutSeconds: 5
+{{ dict "envAll" $envAll "component" "engine" "container" "ranger-agent-engine" "type" "readiness" "probeTemplate" (include "RangerAgentEngineReadinessProbeTemplate" $envAll | fromYaml) | include "helm-toolkit.snippets.kubernetes_probe" | indent 10 }}
+{{ dict "envAll" $envAll "component" "engine" "container" "ranger-agent-engine" "type" "liveness" "probeTemplate" (include "RangerAgentEngineLivenessProbeTemplate" $envAll | fromYaml) | include "helm-toolkit.snippets.kubernetes_probe" | indent 10 }}
           volumeMounts:
             - name: pod-etc-ranger-agent
               mountPath: /etc/ranger-agent
@@ -114,8 +126,8 @@ spec:
               subPath: ranger-agent-engine.sh
               readOnly: true
             - name: ranger-agent-bin
-              mountPath: /tmp/health-check.sh
-              subPath: health-check.sh
+              mountPath: /tmp/health-probe.py
+              subPath: health-probe.py
               readOnly: true
             - name: ranger-agent-etc
               mountPath: /etc/ranger-agent/ranger-agent.conf
diff --git a/ranger-agent/values.yaml b/ranger-agent/values.yaml
index 4922d3d1..136706ba 100755
--- a/ranger-agent/values.yaml
+++ b/ranger-agent/values.yaml
@@ -27,8 +27,8 @@ images:
     ks_service: docker.io/openstackhelm/heat:newton-ubuntu_xenial
     ks_user: docker.io/openstackhelm/heat:newton-ubuntu_xenial
     rabbit_init: docker.io/rabbitmq:3.7-management
-    ranger-agent_db_sync: quay.io/attcomdev/ranger-agent:60529ac023bf550f0e9cb9e0eb4d4eb3dbf2d5c6
-    ranger_agent: quay.io/attcomdev/ranger-agent:60529ac023bf550f0e9cb9e0eb4d4eb3dbf2d5c6
+    ranger-agent_db_sync: quay.io/attcomdev/ranger-agent:02114b616b50c24e7f1f27d9b1ab3d722b4b20b2
+    ranger_agent: quay.io/attcomdev/ranger-agent:02114b616b50c24e7f1f27d9b1ab3d722b4b20b2
     scripted_test: docker.io/openstackhelm/heat:newton-ubuntu_xenial
   pull_policy: "IfNotPresent"
   local_registry:
@@ -264,6 +264,35 @@ pod:
         limits:
           memory: "1024Mi"
           cpu: "2000m"
+  probes:
+    api:
+      ranger-agent-api:
+        readiness:
+          enabled: true
+          params:
+            initialDelaySeconds: 80
+            periodSeconds: 95
+            timeoutSeconds: 85
+        liveness:
+          enabled: true
+          params:
+            initialDelaySeconds: 120
+            periodSeconds: 95
+            timeoutSeconds: 85
+    engine:
+      ranger-agent-engine:
+        readiness:
+          enabled: true
+          params:
+            initialDelaySeconds: 80
+            periodSeconds: 95
+            timeoutSeconds: 85
+        liveness:
+          enabled: true
+          params:
+            initialDelaySeconds: 120
+            periodSeconds: 95
+            timeoutSeconds: 85
 
 # Names of secrets used and environmental checks
 secrets:
@@ -517,6 +546,7 @@ conf:
       api_paste_config: /etc/ranger-agent/api-paste.ini
       local_repo: ranger_repo
       resource_status_check_wait: 15
+      enable_heat_health_check: true
     api:
       host: 0.0.0.0
     database:
diff --git a/tools/gate/scripts/070-deploy-ranger-agent.sh b/tools/gate/scripts/070-deploy-ranger-agent.sh
index 9b86afb4..d2866e7c 100755
--- a/tools/gate/scripts/070-deploy-ranger-agent.sh
+++ b/tools/gate/scripts/070-deploy-ranger-agent.sh
@@ -7,7 +7,8 @@ tee /tmp/ranger-agent.yaml << EOF
 conf:
   ranger_agent:
     DEFAULT:
-      enable_rds_callback_check: False
+      enable_rds_callback_check: false
+      enable_heat_health_check: false
   ssh:
     ssh_key: null
     ssh_config: null