Merge "[TRAIN-ONLY] Collect and parse all health check logs" into stable/train
This commit is contained in:
commit
ba7270605c
|
@ -18,15 +18,41 @@ import re
|
|||
import sys
|
||||
|
||||
HCLOG = '/var/log/collectd/healthchecks.log'
|
||||
SERVICE_REGX = re.compile(r"""
|
||||
\shealthcheck_(?P<service_name>\w+) # service
|
||||
\[(?P<id>\d+)\] # pid
|
||||
""", re.VERBOSE)
|
||||
ERROR_REGX = re.compile(r"""
|
||||
\shealthcheck_(?P<service_name>\w+) # service
|
||||
\[(?P<id>\d+)\] # pid
|
||||
:\s[Ee]rror: (?P<error>.+) # error
|
||||
""", re.VERBOSE)
|
||||
# log records when health check run was successful
|
||||
SUCCESS_REXS = [
|
||||
re.compile(r'(?P<timestamp>\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) '
|
||||
r'(?P<host>[\w\-\.\:]*) systemd\[.*\]: '
|
||||
r'Started (?P<service>[\w-]+) healthcheck'),
|
||||
re.compile(r'(?P<timestamp>\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) '
|
||||
r'(?P<host>[\w\-\.\:]*) healthcheck_(?P<service>[\w-]+)'
|
||||
r'\[(?P<pid>\d+)\]: (?P<output>(?![Ee][Rr][Rr][Oo][Rr]).*)'),
|
||||
re.compile(r'(?P<timestamp>\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) '
|
||||
r'(?P<host>[\w\-\.\:]*) systemd\[.*\]: '
|
||||
r'tripleo_(?P<service>[\w-]+)_healthcheck.service: Succeeded')
|
||||
]
|
||||
# log records when health check run failed
|
||||
FAILED_REXS = [
|
||||
re.compile(r'(?P<timestamp>\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) '
|
||||
r'(?P<host>[\w\-\.\:]*) healthcheck_(?P<service>[\w-]+)'
|
||||
r'\[(?P<pid>\d+)\]: [Ee][Rr][Rr][Oo][Rr]: (?P<error>.+)'),
|
||||
re.compile(r'(?P<timestamp>\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) '
|
||||
r'(?P<host>[\w\-\.\:]*) systemd\[.*\]: '
|
||||
r'Failed to start (?P<service>[\w-]+) healthcheck')
|
||||
|
||||
]
|
||||
# log records when health check is executed, contains additional data
|
||||
EXEC_REXS = [
|
||||
# osp-16.1
|
||||
re.compile(r'(?P<timestamp>\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) '
|
||||
r'(?P<host>[\w\-\.\:]*) podman\[(?P<pid>\d*)\]: '
|
||||
r'(?P<trash>.*) container exec (?P<container_id>\w*) '
|
||||
r'\(.*name=(?P<service>[\w-]+).*\)'),
|
||||
# osp-16.2
|
||||
re.compile(r'(?P<timestamp>\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) '
|
||||
r'(?P<host>[\w\-\.\:]*) podman\[(?P<pid>\d*)\]: '
|
||||
r'(?P<trash>.*) container exec (?P<container_id>\w*) '
|
||||
r'\(.*container_name=(?P<service>[\w-]+).*\)')
|
||||
]
|
||||
|
||||
|
||||
def process_healthcheck_output(logfile):
|
||||
|
@ -36,22 +62,21 @@ def process_healthcheck_output(logfile):
|
|||
with open(logfile, 'r') as logs:
|
||||
data = {}
|
||||
for line in logs:
|
||||
match = SERVICE_REGX.search(line)
|
||||
if match and not match.group('service_name') in data:
|
||||
data[match.group('service_name')] = {
|
||||
'service': match.group('service_name'),
|
||||
'container': match.group('id'),
|
||||
'status': 'healthy',
|
||||
'healthy': 1
|
||||
}
|
||||
match = ERROR_REGX.search(line)
|
||||
if match:
|
||||
data[match.group('service_name')] = {
|
||||
'service': match.group('service_name'),
|
||||
'container': match.group('id'),
|
||||
'status': 'unhealthy',
|
||||
'healthy': 0
|
||||
}
|
||||
for rex_list, default in [
|
||||
(SUCCESS_REXS, {'status': 'healthy', 'healthy': 1}),
|
||||
(FAILED_REXS, {'status': 'unhealthy', 'healthy': 0}),
|
||||
(EXEC_REXS, {'status': 'checking', 'healthy': 2})]:
|
||||
for rex in rex_list:
|
||||
match = rex.search(line)
|
||||
if match:
|
||||
groups = match.groupdict()
|
||||
item = data.setdefault(groups['service'], {
|
||||
'service': groups['service'],
|
||||
'container': 'unknown',
|
||||
})
|
||||
it = data[groups['service']] = {**item, **default}
|
||||
if 'container_id' in groups:
|
||||
it['container'] = groups['container_id'][:12]
|
||||
|
||||
# truncate
|
||||
with open(logfile, 'w') as logs:
|
||||
|
@ -59,6 +84,10 @@ def process_healthcheck_output(logfile):
|
|||
|
||||
ret_code, output = 0, []
|
||||
for _, opt in data.items():
|
||||
if opt['status'] == 'checking':
|
||||
# incomplete parsing, eg. exec log was located in file,
|
||||
# but success log of fail log is missing
|
||||
continue
|
||||
if opt['healthy'] < 1 and ret_code != 2:
|
||||
ret_code = 2
|
||||
output.append(opt)
|
||||
|
|
|
@ -732,7 +732,7 @@ outputs:
|
|||
copy:
|
||||
dest: /etc/rsyslog.d/openstack-healthcheck.conf
|
||||
content: |
|
||||
if $programname startswith 'healthcheck_' then -/var/log/containers/collectd/healthchecks.log
|
||||
if ($programname startswith 'podman' and $msg contains 'container exec') or ($programname startswith 'systemd' and $msg contains 'healthcheck') or ($programname startswith 'healthcheck_') then -/var/log/containers/collectd/healthchecks.log
|
||||
& stop
|
||||
- name: Remove healthcheck log
|
||||
when:
|
||||
|
|
Loading…
Reference in New Issue