Update sensubility's container health check

The current logic parses the podman log, which no longer contains the relevant entries after the podman update. This patch makes the health check get container health information straight from podman via the podman socket.

Change-Id: I2a8c7d0797c3894a5593da407dd628a25e0535b8
2022-06-22 23:38:23 +02:00 · 2022-06-22 23:38:23 +02:00 · 4832fbfbc9
parent 189f4afe6a
commit 4832fbfbc9
2 changed files with 64 additions and 55 deletions
--- a/container_config_scripts/monitoring/collectd_check_health.py
+++ b/container_config_scripts/monitoring/collectd_check_health.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 #
-# Copyright 2018 Red Hat Inc.
+# Copyright 2022 Red Hat Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may
 # not use this file except in compliance with the License. You may obtain
@@ -15,69 +15,72 @@
 # under the License.
 import json
-import re
+import shutil
 import subprocess
 import sys
-HCLOG = '/var/log/collectd/healthchecks.stdout'
+SOCKET = "unix:/run/podman/podman.sock"
-START_RE = re.compile(
+FORMAT = ("{service: .Name, container: .Id, status: .State.Running, "
-    r'(?P<timestamp>\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) (?P<host>[\w\-\.\:]*) systemd\[.*\]: Started /usr/bin/podman healthcheck run (?P<container_id>\w*)')
+         "healthy: .State.Health.Status}")
-EXEC_RE = re.compile(
+SKIP_LIST = ['_bootstrap', 'container-puppet-', '_db_sync',
-    r'(?P<timestamp>\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) (?P<host>[\w\-\.\:]*) podman\[(?P<pid>\d*)\]: (?P<trash>.*) container exec (?P<container_id>\w*) \(.*name=(?P<container_name>\w*).*\)')
+             '_ensure_', '_fix_', '_init_', '_map_', '_wait_',
-RESULT_RE = re.compile(
+             'mysql_data_ownership', 'configure_cms_options']
    r'(?P<timestamp>\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) (?P<host>[\w\-\.\:]*) podman\[(?P<pid>\d*)\]: (?P<result>(un)?healthy)')
-def process_healthcheck_output(path_to_log):
+def execute(cmd, workdir: str = None,
-    """Process saved output of health checks and returns list of unhealthy
+            prev_proc: subprocess.Popen = None) -> subprocess.Popen:
-    containers.
+    if type(cmd[0]) is list:  # multiple piped commands
-    """
+        last = prev_proc
-    data = {}
+        for c in cmd:
-    pid_map = {}
+            last = execute(c, workdir, last)
-    with open(path_to_log, "r+") as logfile:
+        return last
-        for line in logfile:
+    else:  # single command
-            match = START_RE.search(line)
+        inpipe = prev_proc.stdout if prev_proc is not None else None
-            if match:
+        proc = subprocess.Popen(cmd, cwd=workdir, stdin=inpipe,
-                item = data.setdefault(match.group('container_id'), {})
+                                stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-                item['timestamp_start'] = match.group('timestamp')
+        if prev_proc is not None:
-                item['host'] = match.group('host')
+            prev_proc.stdout.close()
-                continue
+            prev_proc.stderr.close()
-            match = EXEC_RE.search(line)
+        return proc
            if match:
                item = data.setdefault(match.group('container_id'), {})
                item['container_name'] = match.group('container_name')
                item['host'] = match.group('host')
                item['pid'] = match.group('pid')
                pid_map[match.group('pid')] = match.group('container_id')
                continue
            match = RESULT_RE.search(line)
            if match:
                if match.group('pid') not in pid_map:
                    continue
                item = data[pid_map[match.group('pid')]]
                item['result'] = match.group('result')
                item['timestamp_end'] = match.group('timestamp')
    # truncate the file
    with open(HCLOG, "w") as logfile:
        pass
-    rc, output = 0, []
+def fetch_container_health(containers):
-    for cid, item in data.items():
+    out = {}
-        if 'result' not in item:
+    for cont in set(containers.split('\n')) - set(SKIP_LIST):
        if not cont:
            continue
-        if item['result'] != 'healthy' and rc != 2:
+        proc = execute([
-            rc = 2 if item['result'] == 'unhealthy' else 1
+            [shutil.which('podman-remote'),
-        output.append({
+                '--url', SOCKET, 'inspect', cont],
-            'container': cid,
+            [shutil.which('jq'), '.[] | %s' % FORMAT]
-            'service': item['container_name'],
+        ])
-            'status': item['result'],
+        o, e = proc.communicate()
-            'healthy': int(item['result'] == 'healthy'),
+        if proc.returncode != 0:
-        })
+            msg = "Failed to fetch status of %s: %s" % (cont, e.decode())
-    return rc, output
+            return proc.returncode, msg
        item = json.loads(o.decode())
        if len(item['healthy']) > 0:
            item['status'] = item['healthy']
        else:
            item['status'] = 'running' if item['status'] else 'stopped'
        item['healthy'] = int(item['healthy'] == 'healthy')
        out[item['service']] = item
    return 0, out
 if __name__ == "__main__":
-    rc, status = process_healthcheck_output(HCLOG)
+    proc = execute([shutil.which('podman-remote'), '--url', SOCKET,
                    'ps', '--all', '--format', '{{.Names}}'])
    o, e = proc.communicate()
    if proc.returncode != 0:
        print("Failed to list containers:\n%s\n%s" % (o.decode(), e.decode()))
        sys.exit(1)
    rc, status = fetch_container_health(o.decode())
    if rc != 0:
        print("Failed to inspect containers:\n%s" % status)
        sys.exit(rc)
    print(json.dumps(status))
    sys.exit(rc)
--- a/deployment/metrics/collectd-container-puppet.yaml
+++ b/deployment/metrics/collectd-container-puppet.yaml
@@ -651,3 +651,9 @@ outputs:
          include_role:
            name: tripleo_provision_mcelog
          when: {get_param: CollectdEnableMcelog}
        - name: enable podman socket
          ansible.builtin.service:
            name: podman.socket
            state: started
            enabled: true
          when: {get_param: CollectdEnableSensubility}