From 31a1f9c8ed4ad8a212e146cc59e5273048570b32 Mon Sep 17 00:00:00 2001 From: Martin Magr Date: Tue, 14 Jul 2020 10:22:12 +0200 Subject: [PATCH] Adapt container health check for built-in podman health checks This patch removes regression which was introduced by moving from systemd health check framework to built-in podman health check support. Change-Id: I1706e04b543e8c9ff3903a9575b7c2cd74b9a0b3 (cherry picked from commit 1952a9ce647d3abd34fb37cbe793b7eddd52e56b --- .../monitoring/collectd_check_health.py | 92 +++++++++++++++++++ .../metrics/collectd-container-puppet.yaml | 22 ++--- 2 files changed, 101 insertions(+), 13 deletions(-) create mode 100755 container_config_scripts/monitoring/collectd_check_health.py diff --git a/container_config_scripts/monitoring/collectd_check_health.py b/container_config_scripts/monitoring/collectd_check_health.py new file mode 100755 index 0000000000..eea75ec31b --- /dev/null +++ b/container_config_scripts/monitoring/collectd_check_health.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 +# +# Copyright 2018 Red Hat Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
import datetime
import re
import sys

# Log file holding the podman/systemd health check messages to be analysed.
HCLOG = '/var/log/collectd/healthchecks.stdout'

# All three patterns start with the syslog prefix "<Mon DD HH:MM:SS> <host> ".
#
# systemd announces the start of a health check run for a container id.
START_RE = re.compile(
    r'(?P<timestamp>\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) (?P<host>[\w\-\.\:]*) '
    r'systemd\[.*\]: Started /usr/bin/podman healthcheck run '
    r'(?P<container_id>\w*)')
# podman logs the "container exec" event; this is the only line that ties a
# podman pid to a container id and a container name.
EXEC_RE = re.compile(
    r'(?P<timestamp>\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) (?P<host>[\w\-\.\:]*) '
    r'podman\[(?P<pid>\d*)\]: .* container exec (?P<container_id>\w*) '
    r'\(.*name=(?P<container_name>\w*).*\)')
# podman (same pid as the exec line) reports the verdict.
RESULT_RE = re.compile(
    r'(?P<timestamp>\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) (?P<host>[\w\-\.\:]*) '
    r'podman\[(?P<pid>\d*)\]: (?P<result>(un)?healthy)')

# Syslog timestamps carry no year. strptime() would default to 1900, which is
# not a leap year and therefore rejects "Feb 29"; parse against a leap year.
_PARSE_YEAR = '2000'
_TIMESTAMP_FORMAT = '%Y %b %d %H:%M:%S'


def _parse_syslog_timestamp(stamp):
    """Return a datetime for a year-less syslog stamp like 'Jul 14 10:22:12'.

    Raises ValueError when the text is not a valid date/time.
    """
    return datetime.datetime.strptime(
        '{} {}'.format(_PARSE_YEAR, stamp), _TIMESTAMP_FORMAT)


def _describe_unhealthy(container):
    """Build the report line for one unhealthy container record."""
    message = ('{container_name}: Container health check on host {host} '
               'results as {result}').format(**container)
    # The duration is only known when the "Started" line was seen and both
    # timestamps could be parsed; leave it out rather than fail.
    if 'duration' in container:
        message += ' after {duration}s'.format(**container)
    return message + '.'


def process_healthcheck_output(path_to_log):
    """Process saved output of health checks and return unhealthy containers.

    The log is read line by line, health check runs are reconstructed from
    the start/exec/result messages, and the log is emptied afterwards so the
    next invocation only sees new messages.

    :param path_to_log: path of the log file to read and then truncate
    :returns: list of human readable messages, one per container whose last
              recorded health check result is 'unhealthy'
    :raises OSError: when the log file cannot be opened for reading/writing
    """
    data = {}      # container id -> facts collected about its health check
    pid_map = {}   # podman pid -> container id (result lines only carry pid)

    with open(path_to_log, "r+") as logfile:
        for line in logfile:
            match = START_RE.search(line)
            if match:
                item = data.setdefault(match.group('container_id'), {})
                item['timestamp_start'] = match.group('timestamp')
                item['host'] = match.group('host')
                continue

            match = EXEC_RE.search(line)
            if match:
                item = data.setdefault(match.group('container_id'), {})
                item['container_name'] = match.group('container_name')
                item['host'] = match.group('host')
                item['pid'] = match.group('pid')
                pid_map[match.group('pid')] = match.group('container_id')
                continue

            match = RESULT_RE.search(line)
            if not match:
                continue
            container_id = pid_map.get(match.group('pid'))
            if container_id is None:
                # Result of a process we never saw an exec line for.
                continue
            item = data[container_id]
            item['result'] = match.group('result')
            if 'timestamp_start' not in item:
                continue
            try:
                start = _parse_syslog_timestamp(item['timestamp_start'])
                end = _parse_syslog_timestamp(match.group('timestamp'))
            except ValueError as ex:
                err = "[WARN] Failure during calculating duration: {}"
                print(err.format(ex))
                continue
            # .seconds (not total_seconds()) keeps the value sane when a run
            # crosses New Year and the year-less stamps appear to go back.
            item['duration'] = (end - start).seconds

        # Empty the file that was actually read. The position is at EOF after
        # iterating, so rewind first or truncate() would be a no-op.
        logfile.seek(0)
        logfile.truncate()

    unhealthy = []
    for container in data.values():
        result = container.get('result')
        if result is None or result == 'healthy':
            continue
        unhealthy.append(_describe_unhealthy(container))
    return unhealthy


if __name__ == "__main__":
    unhealthy = process_healthcheck_output(HCLOG)
    if unhealthy:
        print(' ; '.join(unhealthy))
        sys.exit(2)

# ---------------------------------------------------------------------------
# Patch text that followed the script on the same source line (start of the
# collectd-container-puppet.yaml diff). Text unchanged, line breaks restored:
#
# diff --git a/deployment/metrics/collectd-container-puppet.yaml b/deployment/metrics/collectd-container-puppet.yaml
# index 73099a14bb..7c11f7bcf7 100644
# --- a/deployment/metrics/collectd-container-puppet.yaml
# +++ b/deployment/metrics/collectd-container-puppet.yaml
# @@ -330,18 +330,7 @@ parameters:
# default: true
# CollectdContainerHealthCheckCommand:
# type: string
# - default: |
# - output=""
# - while read line ; do
# - i=$(echo $line | awk '//{gsub(/:/, "", $0); print $5}')
# - log=$(echo $line | awk '{split($0,a,/:\s+Error:\s+/); print a[2]}')
# - log=${log:0:-1}
# - output+=" ; ${i}: ${log}"
# - done < <(egrep "^[a-zA-Z]{3}\s+[0-9]{2}\s+[0-9\:]{8}\s+.*\s+.*:\s+[Ee]rror\:" /var/log/collectd/healthchecks.log)
# - truncate -s0 /var/log/collectd/healthchecks.log
# - if [ ! -z "${output}" ]; then
# - echo ${output:3} && exit 2;
# - fi
# + default: "/scripts/collectd_check_health.py"
# CollectdContainerHealthCheckInterval:
# type: number
# description: The frequency in seconds the docker health check is executed.
@@ -640,6 +629,12 @@ outputs: - path: /var/log/collectd owner: collectd:collectd recurse: true + container_config_scripts: + map_merge: + - {get_attr: [ContainersCommon, container_config_scripts]} + - collectd_check_health.py: + mode: "0755" + content: { get_file: ../../container_config_scripts/monitoring/collectd_check_health.py } docker_config: step_5: collectd: @@ -661,6 +656,7 @@ outputs: - /var/lib/config-data/puppet-generated/collectd:/var/lib/kolla/config_files/src:ro - /var/log/containers/collectd:/var/log/collectd:rw,z - /var/run/:/var/run:rw + - /var/lib/container-config-scripts:/scripts:ro - /sys/fs/cgroup:/sys/fs/cgroup:ro environment: KOLLA_CONFIG_STRATEGY: COPY_ALWAYS @@ -688,7 +684,7 @@ outputs: copy: dest: /etc/rsyslog.d/openstack-healthcheck.conf content: | - if $programname startswith 'healthcheck_' then -/var/log/containers/collectd/healthchecks.log + if ($programname startswith 'podman' and ($msg contains 'container exec' or $msg contains 'healthy')) or ($programname startswith 'systemd' and $msg contains 'podman healthcheck run') then -/var/log/containers/collectd/healthchecks.stdout & stop - name: Remove healthcheck log when: