diff --git a/container_config_scripts/monitoring/collectd_check_health.py b/container_config_scripts/monitoring/collectd_check_health.py deleted file mode 100755 index eea75ec31b..0000000000 --- a/container_config_scripts/monitoring/collectd_check_health.py +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env python3 -# -# Copyright 2018 Red Hat Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import datetime -import re -import sys - -HCLOG = '/var/log/collectd/healthchecks.stdout' -START_RE = re.compile( - r'(?P\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) (?P[\w\-\.\:]*) systemd\[.*\]: Started /usr/bin/podman healthcheck run (?P\w*)') -EXEC_RE = re.compile( - r'(?P\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) (?P[\w\-\.\:]*) podman\[(?P\d*)\]: (?P.*) container exec (?P\w*) \(.*name=(?P\w*).*\)') -RESULT_RE = re.compile( - r'(?P\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) (?P[\w\-\.\:]*) podman\[(?P\d*)\]: (?P(un)?healthy)') - - -def process_healthcheck_output(path_to_log): - """Process saved output of health checks and returns list of unhealthy - containers. 
- """ - data = {} - pid_map = {} - with open(path_to_log, "r+") as logfile: - for line in logfile: - match = START_RE.search(line) - if match: - item = data.setdefault(match.group('container_id'), {}) - item['timestamp_start'] = match.group('timestamp') - item['host'] = match.group('host') - continue - match = EXEC_RE.search(line) - if match: - item = data.setdefault(match.group('container_id'), {}) - item['container_name'] = match.group('container_name') - item['host'] = match.group('host') - item['pid'] = match.group('pid') - pid_map[match.group('pid')] = match.group('container_id') - continue - match = RESULT_RE.search(line) - if match: - if match.group('pid') not in pid_map: - continue - item = data[pid_map[match.group('pid')]] - item['result'] = match.group('result') - if 'timestamp_start' not in item: - continue - try: - start = datetime.datetime.strptime(item['timestamp_start'], - '%b %d %H:%M:%S') - end = datetime.datetime.strptime(match.group('timestamp'), - '%b %d %H:%M:%S') - item['duration'] = (end - start).seconds - except Exception as ex: - err = "[WARN] Failure during calculating duration: {}" - print(err.format(ex)) - continue - logfile.truncate() - - # truncate the file - with open(HCLOG, "w") as logfile: - pass - - unhealthy = [] - for container in data.values(): - if 'result' not in container: - continue - if container['result'] == 'healthy': - continue - log = ('{container_name}: Container health check on host {host} ' - 'results as {result} after {duration}s.') - unhealthy.append(log.format(**container)) - return unhealthy - - -if __name__ == "__main__": - unhealthy = process_healthcheck_output(HCLOG) - if unhealthy: - print(' ; '.join(unhealthy)) - sys.exit(2) diff --git a/deployment/metrics/collectd-container-puppet.yaml b/deployment/metrics/collectd-container-puppet.yaml index 7c11f7bcf7..73099a14bb 100644 --- a/deployment/metrics/collectd-container-puppet.yaml +++ b/deployment/metrics/collectd-container-puppet.yaml @@ -330,7 +330,18 @@ 
parameters:
     default: true
   CollectdContainerHealthCheckCommand:
     type: string
-    default: "/scripts/collectd_check_health.py"
+    default: |
+      # Join logged "Error:" entries as "<field 5>: <text>"; exit 2 when any exist.
+      output=""
+      while IFS= read -r line; do
+        i=$(printf '%s\n' "$line" | awk '{gsub(/:/, "", $0); print $5}')
+        log=$(printf '%s\n' "$line" | awk '{split($0,a,/:[[:space:]]+[Ee]rror:[[:space:]]*/); print a[2]}')
+        output+=" ; ${i}: ${log%?}"  # %? drops the last character and is safe on an empty string
+      done < <(grep -E "^[a-zA-Z]{3}\s+[0-9]{1,2}\s+[0-9:]{8}\s+.*\s+.*:\s+[Ee]rror:" /var/log/collectd/healthchecks.log)
+      truncate -s0 /var/log/collectd/healthchecks.log
+      if [ -n "${output}" ]; then
+        echo "${output:3}" && exit 2
+      fi
   CollectdContainerHealthCheckInterval:
     type: number
     description: The frequency in seconds the docker health check is executed.
@@ -629,12 +640,6 @@ outputs:
             - path: /var/log/collectd
               owner: collectd:collectd
               recurse: true
-      container_config_scripts:
-        map_merge:
-          - {get_attr: [ContainersCommon, container_config_scripts]}
-          - collectd_check_health.py:
-              mode: "0755"
-              content: { get_file: ../../container_config_scripts/monitoring/collectd_check_health.py }
       docker_config:
         step_5:
           collectd:
@@ -656,7 +661,6 @@ outputs:
                   - /var/lib/config-data/puppet-generated/collectd:/var/lib/kolla/config_files/src:ro
                   - /var/log/containers/collectd:/var/log/collectd:rw,z
                   - /var/run/:/var/run:rw
-                  - /var/lib/container-config-scripts:/scripts:ro
                   - /sys/fs/cgroup:/sys/fs/cgroup:ro
             environment:
               KOLLA_CONFIG_STRATEGY: COPY_ALWAYS
@@ -684,7 +688,7 @@ outputs:
           copy:
             dest: /etc/rsyslog.d/openstack-healthcheck.conf
             content: |
-              if ($programname startswith 'podman' and ($msg contains 'container exec' or $msg contains 'healthy')) or ($programname startswith 'systemd' and $msg contains 'podman healthcheck run') then -/var/log/containers/collectd/healthchecks.stdout
+              if $programname startswith 'healthcheck_' then -/var/log/containers/collectd/healthchecks.log
               & stop
         - name: Remove healthcheck log
           when: