diff --git a/container_config_scripts/monitoring/collectd_check_health.py b/container_config_scripts/monitoring/collectd_check_health.py
new file mode 100755
index 0000000000..86282e4035
--- /dev/null
+++ b/container_config_scripts/monitoring/collectd_check_health.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+# Copyright 2018 Red Hat Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import json
+import re
+import sys
+
+HCLOG = '/var/log/collectd/healthchecks.log'
+# Matches any log line tagged "healthcheck_<service>[<digits>]".
+SERVICE_REGX = re.compile(r"""
+    \shealthcheck_(?P<service_name>\w+)    # service
+    \[(?P<id>\d+)\]                        # pid
+    """, re.VERBOSE)
+# Matches only lines where that tag is followed by ": Error:"/": error:".
+ERROR_REGX = re.compile(r"""
+    \shealthcheck_(?P<service_name>\w+)    # service
+    \[(?P<id>\d+)\]                        # pid
+    :\s[Ee]rror: (?P<error>.+)             # error
+    """, re.VERBOSE)
+
+
+def process_healthcheck_output(logfile):
+    """Summarise the saved health check output and reset the log.
+
+    Every service seen in ``logfile`` is reported as healthy unless one of
+    its lines matches ``ERROR_REGX``; once recorded, an error is not
+    replaced by a later non-error line. The log is truncated afterwards so
+    the next run only sees entries written since this one.
+
+    :param logfile: path of the health check log to read and truncate
+    :returns: tuple ``(ret_code, output)``; ``ret_code`` is 2 when any
+              service is unhealthy and 0 otherwise, ``output`` is a list
+              holding one status dict per service
+    :raises OSError: if the log cannot be opened for reading or writing
+    """
+    data = {}
+    with open(logfile, 'r') as logs:
+        for line in logs:
+            match = SERVICE_REGX.search(line)
+            if match and match.group('service_name') not in data:
+                data[match.group('service_name')] = {
+                    'service': match.group('service_name'),
+                    'container': match.group('id'),
+                    'status': 'healthy',
+                    'healthy': 1
+                }
+            match = ERROR_REGX.search(line)
+            if match:
+                data[match.group('service_name')] = {
+                    'service': match.group('service_name'),
+                    'container': match.group('id'),
+                    'status': 'unhealthy',
+                    'healthy': 0
+                }
+
+    # Truncate so the next run starts from an empty log.
+    with open(logfile, 'w'):
+        pass
+
+    output = list(data.values())
+    # Return code 2 flags that at least one service is unhealthy.
+    ret_code = 2 if any(opt['healthy'] < 1 for opt in output) else 0
+    return ret_code, output
+
+
+if __name__ == "__main__":
+    RET_CODE, STATUS = process_healthcheck_output(HCLOG)
+    print(json.dumps(STATUS))
+    sys.exit(RET_CODE)
diff --git a/deployment/metrics/collectd-container-puppet.yaml b/deployment/metrics/collectd-container-puppet.yaml index 641db5d934..0c6fb61580 100644 --- a/deployment/metrics/collectd-container-puppet.yaml +++ b/deployment/metrics/collectd-container-puppet.yaml @@ -330,18 +330,7 @@ parameters: default: true CollectdContainerHealthCheckCommand: type: string - default: | - output="" - while read line ; do - i=$(echo $line | awk '//{gsub(/:/, "", $0); print $5}') - log=$(echo $line | awk '{split($0,a,/:\s+Error:\s+/); print a[2]}') - log=${log:0:-1} - output+=" ; ${i}: ${log}" - done < <(egrep "^[a-zA-Z]{3}\s+[0-9]{2}\s+[0-9\:]{8}\s+.*\s+.*:\s+[Ee]rror\:" /var/log/collectd/healthchecks.log) - truncate -s0 /var/log/collectd/healthchecks.log - if [ ! -z "${output}" ]; then - echo ${output:3} && exit 2; - fi + default: /scripts/collectd_check_health.py CollectdContainerHealthCheckInterval: type: number description: The frequency in seconds the docker health check is executed. 
@@ -655,6 +644,12 @@ outputs: - path: /var/log/collectd owner: collectd:collectd recurse: true + container_config_scripts: + map_merge: + - {get_attr: [ContainersCommon, container_config_scripts]} + - collectd_check_health.py: + mode: "0755" + content: { get_file: ../../container_config_scripts/monitoring/collectd_check_health.py } docker_config: step_5: collectd: @@ -676,6 +671,7 @@ outputs: - /var/lib/config-data/puppet-generated/collectd:/var/lib/kolla/config_files/src:ro - /var/log/containers/collectd:/var/log/collectd:rw,z - /var/run/:/var/run:rw + - /var/lib/container-config-scripts:/scripts:ro - /sys/fs/cgroup:/sys/fs/cgroup:ro environment: KOLLA_CONFIG_STRATEGY: COPY_ALWAYS