Update sensubility's container health check

The current logic parses the podman log, which no longer contains the
relevant entries after the podman update.

This patch makes the health check get container health information
straight from podman via the podman socket.

Change-Id: I2a8c7d0797c3894a5593da407dd628a25e0535b8
This commit is contained in:
Martin Mágr 2022-06-22 23:38:23 +02:00
parent 189f4afe6a
commit 4832fbfbc9
2 changed files with 64 additions and 55 deletions

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# #
# Copyright 2018 Red Hat Inc. # Copyright 2022 Red Hat Inc.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); you may # Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain # not use this file except in compliance with the License. You may obtain
@ -15,69 +15,72 @@
# under the License. # under the License.
import json import json
import re import shutil
import subprocess
import sys import sys
HCLOG = '/var/log/collectd/healthchecks.stdout' SOCKET = "unix:/run/podman/podman.sock"
START_RE = re.compile( FORMAT = ("{service: .Name, container: .Id, status: .State.Running, "
r'(?P<timestamp>\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) (?P<host>[\w\-\.\:]*) systemd\[.*\]: Started /usr/bin/podman healthcheck run (?P<container_id>\w*)') "healthy: .State.Health.Status}")
EXEC_RE = re.compile( SKIP_LIST = ['_bootstrap', 'container-puppet-', '_db_sync',
r'(?P<timestamp>\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) (?P<host>[\w\-\.\:]*) podman\[(?P<pid>\d*)\]: (?P<trash>.*) container exec (?P<container_id>\w*) \(.*name=(?P<container_name>\w*).*\)') '_ensure_', '_fix_', '_init_', '_map_', '_wait_',
RESULT_RE = re.compile( 'mysql_data_ownership', 'configure_cms_options']
r'(?P<timestamp>\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) (?P<host>[\w\-\.\:]*) podman\[(?P<pid>\d*)\]: (?P<result>(un)?healthy)')
def process_healthcheck_output(path_to_log): def execute(cmd, workdir: str = None,
"""Process saved output of health checks and returns list of unhealthy prev_proc: subprocess.Popen = None) -> subprocess.Popen:
containers. if type(cmd[0]) is list: # multiple piped commands
""" last = prev_proc
data = {} for c in cmd:
pid_map = {} last = execute(c, workdir, last)
with open(path_to_log, "r+") as logfile: return last
for line in logfile: else: # single command
match = START_RE.search(line) inpipe = prev_proc.stdout if prev_proc is not None else None
if match: proc = subprocess.Popen(cmd, cwd=workdir, stdin=inpipe,
item = data.setdefault(match.group('container_id'), {}) stdout=subprocess.PIPE, stderr=subprocess.PIPE)
item['timestamp_start'] = match.group('timestamp') if prev_proc is not None:
item['host'] = match.group('host') prev_proc.stdout.close()
continue prev_proc.stderr.close()
match = EXEC_RE.search(line) return proc
if match:
item = data.setdefault(match.group('container_id'), {})
item['container_name'] = match.group('container_name')
item['host'] = match.group('host')
item['pid'] = match.group('pid')
pid_map[match.group('pid')] = match.group('container_id')
continue
match = RESULT_RE.search(line)
if match:
if match.group('pid') not in pid_map:
continue
item = data[pid_map[match.group('pid')]]
item['result'] = match.group('result')
item['timestamp_end'] = match.group('timestamp')
# truncate the file
with open(HCLOG, "w") as logfile:
pass
rc, output = 0, [] def fetch_container_health(containers):
for cid, item in data.items(): out = {}
if 'result' not in item: for cont in set(containers.split('\n')) - set(SKIP_LIST):
if not cont:
continue continue
if item['result'] != 'healthy' and rc != 2: proc = execute([
rc = 2 if item['result'] == 'unhealthy' else 1 [shutil.which('podman-remote'),
output.append({ '--url', SOCKET, 'inspect', cont],
'container': cid, [shutil.which('jq'), '.[] | %s' % FORMAT]
'service': item['container_name'], ])
'status': item['result'], o, e = proc.communicate()
'healthy': int(item['result'] == 'healthy'), if proc.returncode != 0:
}) msg = "Failed to fetch status of %s: %s" % (cont, e.decode())
return rc, output return proc.returncode, msg
item = json.loads(o.decode())
if len(item['healthy']) > 0:
item['status'] = item['healthy']
else:
item['status'] = 'running' if item['status'] else 'stopped'
item['healthy'] = int(item['healthy'] == 'healthy')
out[item['service']] = item
return 0, out
if __name__ == "__main__": if __name__ == "__main__":
rc, status = process_healthcheck_output(HCLOG) proc = execute([shutil.which('podman-remote'), '--url', SOCKET,
'ps', '--all', '--format', '{{.Names}}'])
o, e = proc.communicate()
if proc.returncode != 0:
print("Failed to list containers:\n%s\n%s" % (o.decode(), e.decode()))
sys.exit(1)
rc, status = fetch_container_health(o.decode())
if rc != 0:
print("Failed to inspect containers:\n%s" % status)
sys.exit(rc)
print(json.dumps(status)) print(json.dumps(status))
sys.exit(rc)

View File

@ -651,3 +651,9 @@ outputs:
include_role: include_role:
name: tripleo_provision_mcelog name: tripleo_provision_mcelog
when: {get_param: CollectdEnableMcelog} when: {get_param: CollectdEnableMcelog}
- name: enable podman socket
ansible.builtin.service:
name: podman.socket
state: started
enabled: true
when: {get_param: CollectdEnableSensubility}