From ffa6830ecf6d24264e3faf3326156c94f72bc08b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20M=C3=A1gr?= Date: Tue, 29 Nov 2022 21:18:39 +0100 Subject: [PATCH] [TRAIN-ONLY] Collect and parse all health check logs Previously some of the container health check run log records were not collected by rsyslog and so health status of some containers was not reported. This patch makes it so all relevant log records from systemd-timer based health check runs are collected and correctly parsed and so those containers with health check attached will be reported. Wallaby+ is using podman socket for fetching container status data, so this patch is needed only for stable/train. Resolves: rhbz#2149002 Change-Id: Ie0dc9edac5dc1483c9d419a169457a8689be4064 --- .../monitoring/collectd_check_health.py | 79 +++++++++++++------ .../metrics/collectd-container-puppet.yaml | 2 +- 2 files changed, 55 insertions(+), 26 deletions(-) diff --git a/container_config_scripts/monitoring/collectd_check_health.py b/container_config_scripts/monitoring/collectd_check_health.py index 604414d1b4..d429d6fbb4 100755 --- a/container_config_scripts/monitoring/collectd_check_health.py +++ b/container_config_scripts/monitoring/collectd_check_health.py @@ -18,15 +18,41 @@ import re import sys HCLOG = '/var/log/collectd/healthchecks.log' -SERVICE_REGX = re.compile(r""" - \shealthcheck_(?P\w+) # service - \[(?P\d+)\] # pid - """, re.VERBOSE) -ERROR_REGX = re.compile(r""" - \shealthcheck_(?P\w+) # service - \[(?P\d+)\] # pid - :\s[Ee]rror: (?P.+) # error - """, re.VERBOSE) +# log records when health check run was successful +SUCCESS_REXS = [ + re.compile(r'(?P\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) ' + r'(?P[\w\-\.\:]*) systemd\[.*\]: ' + r'Started (?P[\w-]+) healthcheck'), + re.compile(r'(?P\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) ' + r'(?P[\w\-\.\:]*) healthcheck_(?P[\w-]+)' + r'\[(?P\d+)\]: (?P(?![Ee][Rr][Rr][Oo][Rr]).*)'), + re.compile(r'(?P\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) ' + r'(?P[\w\-\.\:]*) systemd\[.*\]: ' + 
r'tripleo_(?P[\w-]+)_healthcheck.service: Succeeded') +] +# log records when health check run failed +FAILED_REXS = [ + re.compile(r'(?P\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) ' + r'(?P[\w\-\.\:]*) healthcheck_(?P[\w-]+)' + r'\[(?P\d+)\]: [Ee][Rr][Rr][Oo][Rr]: (?P.+)'), + re.compile(r'(?P\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) ' + r'(?P[\w\-\.\:]*) systemd\[.*\]: ' + r'Failed to start (?P[\w-]+) healthcheck') + +] +# log records when health check is executed, contains additional data +EXEC_REXS = [ + # osp-16.1 + re.compile(r'(?P\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) ' + r'(?P[\w\-\.\:]*) podman\[(?P\d*)\]: ' + r'(?P.*) container exec (?P\w*) ' + r'\(.*name=(?P[\w-]+).*\)'), + # osp-16.2 + re.compile(r'(?P\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) ' + r'(?P[\w\-\.\:]*) podman\[(?P\d*)\]: ' + r'(?P.*) container exec (?P\w*) ' + r'\(.*container_name=(?P[\w-]+).*\)') +] def process_healthcheck_output(logfile): @@ -36,22 +62,21 @@ def process_healthcheck_output(logfile): with open(logfile, 'r') as logs: data = {} for line in logs: - match = SERVICE_REGX.search(line) - if match and not match.group('service_name') in data: - data[match.group('service_name')] = { - 'service': match.group('service_name'), - 'container': match.group('id'), - 'status': 'healthy', - 'healthy': 1 - } - match = ERROR_REGX.search(line) - if match: - data[match.group('service_name')] = { - 'service': match.group('service_name'), - 'container': match.group('id'), - 'status': 'unhealthy', - 'healthy': 0 - } + for rex_list, default in [ + (SUCCESS_REXS, {'status': 'healthy', 'healthy': 1}), + (FAILED_REXS, {'status': 'unhealthy', 'healthy': 0}), + (EXEC_REXS, {'status': 'checking', 'healthy': 2})]: + for rex in rex_list: + match = rex.search(line) + if match: + groups = match.groupdict() + item = data.setdefault(groups['service'], { + 'service': groups['service'], + 'container': 'unknown', + }) + it = data[groups['service']] = {**item, **default} + if 'container_id' in groups: + it['container'] = groups['container_id'][:12] # 
truncate with open(logfile, 'w') as logs: @@ -59,6 +84,10 @@ def process_healthcheck_output(logfile): ret_code, output = 0, [] for _, opt in data.items(): + if opt['status'] == 'checking': + # incomplete parsing, e.g. exec log was located in file, + # but success log or fail log is missing + continue if opt['healthy'] < 1 and ret_code != 2: ret_code = 2 output.append(opt) diff --git a/deployment/metrics/collectd-container-puppet.yaml b/deployment/metrics/collectd-container-puppet.yaml index f407b9e042..7b550fd253 100644 --- a/deployment/metrics/collectd-container-puppet.yaml +++ b/deployment/metrics/collectd-container-puppet.yaml @@ -732,7 +732,7 @@ outputs: copy: dest: /etc/rsyslog.d/openstack-healthcheck.conf content: | - if $programname startswith 'healthcheck_' then -/var/log/containers/collectd/healthchecks.log + if ($programname startswith 'podman' and $msg contains 'container exec') or ($programname startswith 'systemd' and $msg contains 'healthcheck') or ($programname startswith 'healthcheck_') then -/var/log/containers/collectd/healthchecks.log & stop - name: Remove healthcheck log when: