Browse Source

Return details in output of container health check

This patch reformats the check-container-health script
for sensubility to output JSON-formatted data instead
of semicolon-separated data. It also removes the calculation
of the duration of each container health check (HC) to keep
the runtime shorter.

Change-Id: I18bcde4b6031c79deae3f6c9ee6f2c4bb754be88
(cherry picked from commit f84655ed55)
changes/47/758447/8
Paul Leimer 11 months ago
committed by pleimer
parent
commit
f525e4ab6b
2 changed files with 78 additions and 12 deletions
  1. +70
    -0
      container_config_scripts/monitoring/collectd_check_health.py
  2. +8
    -12
      deployment/metrics/collectd-container-puppet.yaml

+ 70
- 0
container_config_scripts/monitoring/collectd_check_health.py View File

@ -0,0 +1,70 @@
#!/usr/bin/env python3
# Copyright 2018 Red Hat Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import json
import re
import sys
# Log file holding the saved output of the container health checks.
HCLOG = '/var/log/collectd/healthchecks.log'

# Matches any health check log entry of the form
# " healthcheck_<service_name>[<id>]".
SERVICE_REGX = re.compile(r"""
\shealthcheck_(?P<service_name>\w+) # service
\[(?P<id>\d+)\] # pid
""", re.VERBOSE)

# Matches a health check log entry that reports an error, i.e. the entry
# prefix above followed by ": Error: <text>" (or ": error: <text>").
ERROR_REGX = re.compile(r"""
\shealthcheck_(?P<service_name>\w+) # service
\[(?P<id>\d+)\] # pid
:\s[Ee]rror: (?P<error>.+) # error
""", re.VERBOSE)


def _service_entry(match, healthy):
    """Build the report entry for the service captured by ``match``."""
    return {
        'service': match.group('service_name'),
        'container': match.group('id'),
        'status': 'healthy' if healthy else 'unhealthy',
        'healthy': 1 if healthy else 0
    }


def process_healthcheck_output(logfile):
    """Summarize the saved health check output and truncate the log.

    Every service found in ``logfile`` gets exactly one entry in the
    result. A service is recorded as healthy the first time it shows up
    and is switched to unhealthy as soon as one of its lines reports an
    error; later non-error lines do not switch it back.

    :param logfile: path of the health check log. The file is emptied
                    after it has been read.
    :returns: tuple ``(ret_code, output)``. ``ret_code`` is 2 when at
              least one service is unhealthy and 0 otherwise; ``output``
              is a list of dicts with the keys ``service``,
              ``container``, ``status`` and ``healthy``.
    """
    data = {}
    with open(logfile, 'r') as logs:
        for line in logs:
            match = SERVICE_REGX.search(line)
            if match and match.group('service_name') not in data:
                data[match.group('service_name')] = _service_entry(
                    match, healthy=True)
            match = ERROR_REGX.search(line)
            if match:
                # An error always wins over a previously seen healthy line.
                data[match.group('service_name')] = _service_entry(
                    match, healthy=False)

    # Truncate the log so the next run only sees entries written after
    # this one.
    with open(logfile, 'w'):
        pass

    output = list(data.values())
    # Report a failure (2) when any service is unhealthy. The previous
    # condition was inverted and flagged healthy services instead.
    ret_code = 2 if any(opt['healthy'] < 1 for opt in output) else 0
    return ret_code, output
if __name__ == "__main__":
    # Script entry point: print the per-service report as JSON on stdout
    # and use the aggregate check status as the process exit code.
    EXIT_CODE, REPORT = process_healthcheck_output(HCLOG)
    print(json.dumps(REPORT))
    sys.exit(EXIT_CODE)

+ 8
- 12
deployment/metrics/collectd-container-puppet.yaml View File

@ -330,18 +330,7 @@ parameters:
default: true
CollectdContainerHealthCheckCommand:
type: string
default: |
output=""
while read line ; do
i=$(echo $line | awk '//{gsub(/:/, "", $0); print $5}')
log=$(echo $line | awk '{split($0,a,/:\s+Error:\s+/); print a[2]}')
log=${log:0:-1}
output+=" ; ${i}: ${log}"
done < <(egrep "^[a-zA-Z]{3}\s+[0-9]{2}\s+[0-9\:]{8}\s+.*\s+.*:\s+[Ee]rror\:" /var/log/collectd/healthchecks.log)
truncate -s0 /var/log/collectd/healthchecks.log
if [ ! -z "${output}" ]; then
echo ${output:3} && exit 2;
fi
default: /scripts/collectd_check_health.py
CollectdContainerHealthCheckInterval:
type: number
description: The frequency in seconds the docker health check is executed.
@ -655,6 +644,12 @@ outputs:
- path: /var/log/collectd
owner: collectd:collectd
recurse: true
container_config_scripts:
map_merge:
- {get_attr: [ContainersCommon, container_config_scripts]}
- collectd_check_health.py:
mode: "0755"
content: { get_file: ../../container_config_scripts/monitoring/collectd_check_health.py }
docker_config:
step_5:
collectd:
@ -676,6 +671,7 @@ outputs:
- /var/lib/config-data/puppet-generated/collectd:/var/lib/kolla/config_files/src:ro
- /var/log/containers/collectd:/var/log/collectd:rw,z
- /var/run/:/var/run:rw
- /var/lib/container-config-scripts:/scripts:ro
- /sys/fs/cgroup:/sys/fs/cgroup:ro
environment:
KOLLA_CONFIG_STRATEGY: COPY_ALWAYS


Loading…
Cancel
Save