Merge "[TRAIN-ONLY] Collect and parse all health check logs" into stable/train

This commit is contained in:
Zuul 2023-01-06 14:18:21 +00:00 committed by Gerrit Code Review
commit ba7270605c
2 changed files with 55 additions and 26 deletions

View File

@ -18,15 +18,41 @@ import re
import sys
HCLOG = '/var/log/collectd/healthchecks.log'
SERVICE_REGX = re.compile(r"""
\shealthcheck_(?P<service_name>\w+) # service
\[(?P<id>\d+)\] # pid
""", re.VERBOSE)
ERROR_REGX = re.compile(r"""
\shealthcheck_(?P<service_name>\w+) # service
\[(?P<id>\d+)\] # pid
:\s[Ee]rror: (?P<error>.+) # error
""", re.VERBOSE)
# log records when health check run was successful
SUCCESS_REXS = [
re.compile(r'(?P<timestamp>\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) '
r'(?P<host>[\w\-\.\:]*) systemd\[.*\]: '
r'Started (?P<service>[\w-]+) healthcheck'),
re.compile(r'(?P<timestamp>\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) '
r'(?P<host>[\w\-\.\:]*) healthcheck_(?P<service>[\w-]+)'
r'\[(?P<pid>\d+)\]: (?P<output>(?![Ee][Rr][Rr][Oo][Rr]).*)'),
re.compile(r'(?P<timestamp>\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) '
r'(?P<host>[\w\-\.\:]*) systemd\[.*\]: '
r'tripleo_(?P<service>[\w-]+)_healthcheck.service: Succeeded')
]
# log records when health check run failed
FAILED_REXS = [
re.compile(r'(?P<timestamp>\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) '
r'(?P<host>[\w\-\.\:]*) healthcheck_(?P<service>[\w-]+)'
r'\[(?P<pid>\d+)\]: [Ee][Rr][Rr][Oo][Rr]: (?P<error>.+)'),
re.compile(r'(?P<timestamp>\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) '
r'(?P<host>[\w\-\.\:]*) systemd\[.*\]: '
r'Failed to start (?P<service>[\w-]+) healthcheck')
]
# log records when health check is executed, contains additional data
EXEC_REXS = [
# osp-16.1
re.compile(r'(?P<timestamp>\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) '
r'(?P<host>[\w\-\.\:]*) podman\[(?P<pid>\d*)\]: '
r'(?P<trash>.*) container exec (?P<container_id>\w*) '
r'\(.*name=(?P<service>[\w-]+).*\)'),
# osp-16.2
re.compile(r'(?P<timestamp>\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) '
r'(?P<host>[\w\-\.\:]*) podman\[(?P<pid>\d*)\]: '
r'(?P<trash>.*) container exec (?P<container_id>\w*) '
r'\(.*container_name=(?P<service>[\w-]+).*\)')
]
def process_healthcheck_output(logfile):
@ -36,22 +62,21 @@ def process_healthcheck_output(logfile):
with open(logfile, 'r') as logs:
data = {}
for line in logs:
match = SERVICE_REGX.search(line)
if match and not match.group('service_name') in data:
data[match.group('service_name')] = {
'service': match.group('service_name'),
'container': match.group('id'),
'status': 'healthy',
'healthy': 1
}
match = ERROR_REGX.search(line)
if match:
data[match.group('service_name')] = {
'service': match.group('service_name'),
'container': match.group('id'),
'status': 'unhealthy',
'healthy': 0
}
for rex_list, default in [
(SUCCESS_REXS, {'status': 'healthy', 'healthy': 1}),
(FAILED_REXS, {'status': 'unhealthy', 'healthy': 0}),
(EXEC_REXS, {'status': 'checking', 'healthy': 2})]:
for rex in rex_list:
match = rex.search(line)
if match:
groups = match.groupdict()
item = data.setdefault(groups['service'], {
'service': groups['service'],
'container': 'unknown',
})
it = data[groups['service']] = {**item, **default}
if 'container_id' in groups:
it['container'] = groups['container_id'][:12]
# truncate
with open(logfile, 'w') as logs:
@ -59,6 +84,10 @@ def process_healthcheck_output(logfile):
ret_code, output = 0, []
for _, opt in data.items():
if opt['status'] == 'checking':
# incomplete parsing, eg. exec log was located in file,
# but success log of fail log is missing
continue
if opt['healthy'] < 1 and ret_code != 2:
ret_code = 2
output.append(opt)

View File

@ -732,7 +732,7 @@ outputs:
copy:
dest: /etc/rsyslog.d/openstack-healthcheck.conf
content: |
if $programname startswith 'healthcheck_' then -/var/log/containers/collectd/healthchecks.log
if ($programname startswith 'podman' and $msg contains 'container exec') or ($programname startswith 'systemd' and $msg contains 'healthcheck') or ($programname startswith 'healthcheck_') then -/var/log/containers/collectd/healthchecks.log
& stop
- name: Remove healthcheck log
when: