diff --git a/ansible/install/roles/collectd-openstack/files/collectd_pacemaker_monitoring.py b/ansible/install/roles/collectd-openstack/files/collectd_pacemaker_monitoring.py
new file mode 100644
index 000000000..6e9caba95
--- /dev/null
+++ b/ansible/install/roles/collectd-openstack/files/collectd_pacemaker_monitoring.py
@@ -0,0 +1,182 @@
+"""collectd Python plugin that turns `pcs status` output into gauges.
+
+Each read cycle writes the command ``pcs status`` to PIPE_FILE_PATH, then
+parses the most recent status report found in LOG_FILE_PATH and dispatches
+one ``gauge`` value per metric under the ``pacemaker_monitoring`` plugin.
+
+NOTE(review): whatever consumes the pipe and appends the command output to
+the log file lives outside this module -- confirm against the deployment.
+"""
+import collectd
+import os
+
+LOG_FILE_PATH = '/var/log/containers/stdouts/collectd_pacemaker.out'
+PIPE_FILE_PATH = '/collectd_pipe'
+INTERVAL = 15
+
+# Text that marks the first line of a status report in the log.
+REPORT_START_MARKER = "Cluster name:"
+
+# (metric prefix, substring identifying one instance line of that resource).
+RESOURCE_LINE_MARKERS = (
+    ("haproxy", "haproxy-bundle-"),
+    ("galera", "galera-bundle-"),
+    ("rabbitmq", "rabbitmq-bundle-"),
+    ("redis", "redis-bundle-"),
+    ("ovn", "ovn-dbs-bundle-"),
+    ("cinder", "openstack-cinder-volume-"),
+)
+
+# Daemons reported as 1 when a "NAME:" line also says "active/enabled".
+DAEMONS = ("corosync", "pacemaker", "pcsd")
+
+
+def config_func(config):
+    """Apply the plugin configuration block handed over by collectd.
+
+    Only the ``Interval`` key (matched case-insensitively) is recognised; its
+    first value replaces the module-level INTERVAL. Other keys are ignored.
+    """
+    global INTERVAL
+
+    for node in config.children:
+        if node.key.lower() == 'interval':
+            INTERVAL = int(node.values[0])
+
+
+def _latest_report(log_lines):
+    """Return the newest report in ``log_lines``, in file order.
+
+    Walks backwards from the end of the log up to and including the nearest
+    line containing REPORT_START_MARKER. If no line has the marker, the whole
+    log is returned.
+    """
+    report = []
+    for line in reversed(log_lines):
+        report.append(line)
+        if REPORT_START_MARKER in line:
+            break
+    report.reverse()
+    return report
+
+
+def _number_before(report, phrase, keyword, first_only=False):
+    """Return the integer token that precedes ``keyword`` on a matching line.
+
+    A line matches when it contains ``phrase``. With ``first_only`` the first
+    matching line decides; otherwise the last one does. Returns 0 when no
+    line matches.
+    """
+    value = 0
+    for line in report:
+        if phrase in line:
+            tokens = line.split()
+            value = int(tokens[tokens.index(keyword) - 1])
+            if first_only:
+                break
+    return value
+
+
+def _bracket_members(report, phrase, skip_guest_lines=False):
+    """Count the whitespace-separated names inside ``[ ... ]`` on a line.
+
+    A line matches when it contains ``phrase`` (and, with
+    ``skip_guest_lines``, does not contain "Guest"). The last matching line
+    decides; returns 0 when no line matches.
+    """
+    value = 0
+    for line in report:
+        if phrase not in line:
+            continue
+        if skip_guest_lines and "Guest" in line:
+            continue
+        names = line.split("[")[1].replace(" ]", "").strip().split()
+        value = len(names)
+    return value
+
+
+def _tag_failed_section(report):
+    """Yield ``(line, in_failed)`` for every line of ``report``.
+
+    ``in_failed`` turns true on a line containing "Failed" and stays true up
+    to and including the next line containing "Daemon Status", so callers can
+    keep failure entries apart from the regular resource listing.
+    """
+    in_failed = False
+    for line in report:
+        if "Failed" in line:
+            in_failed = True
+        yield line, in_failed
+        if in_failed and "Daemon Status" in line:
+            in_failed = False
+
+
+def _parse_status(report):
+    """Compute every metric for one status report.
+
+    ``report`` is the list of text lines of a single report in file order.
+    Returns a list of ``(type_instance, value)`` pairs in dispatch order.
+    """
+    tagged = list(_tag_failed_section(report))
+
+    metrics = [
+        ("total_nodes",
+         _number_before(report, "nodes configured", "nodes", first_only=True)),
+        ("online_nodes",
+         _bracket_members(report, "Online: [", skip_guest_lines=True)),
+        ("online_guests", _bracket_members(report, "GuestOnline: [")),
+        ("resource_instances",
+         _number_before(report, "resource instances configured", "resource")),
+    ]
+
+    # Instance lines outside the failed section, minus guest-node listings.
+    for resource, marker in RESOURCE_LINE_MARKERS:
+        total = sum(1 for line, in_failed in tagged
+                    if not in_failed and marker in line
+                    and "Guest" not in line)
+        metrics.append((resource + "_resource_total_count", total))
+
+    # Instance lines outside the failed section that mention "Master".
+    for resource, marker in RESOURCE_LINE_MARKERS:
+        masters = sum(1 for line, in_failed in tagged
+                      if not in_failed and marker in line
+                      and "Master" in line)
+        metrics.append((resource + "_resource_master_count", masters))
+
+    # 1 when some line has "DAEMON:" together with "active/enabled".
+    for daemon in DAEMONS:
+        active = any(daemon + ":" in line and "active/enabled" in line
+                     for line in report)
+        metrics.append((daemon + "_daemon_status", 1 if active else 0))
+
+    # Failed-section lines that mention the resource's short name.
+    for resource, _marker in RESOURCE_LINE_MARKERS:
+        failures = sum(1 for line, in_failed in tagged
+                       if in_failed and resource in line)
+        metrics.append((resource + "_resource_failures", failures))
+
+    return metrics
+
+
+def read_func():
+    """Request a `pcs status` run and dispatch gauges parsed from the log."""
+    # The log is read right after the request is written, so the report
+    # parsed below may predate this request.
+    os.system('''echo "pcs status" > ''' + PIPE_FILE_PATH)
+
+    with open(LOG_FILE_PATH, 'rb') as f:
+        # "replace" keeps an undecodable byte from aborting the whole read.
+        log_lines = [raw.decode("utf-8", "replace") for raw in f]
+
+    for name, value in _parse_status(_latest_report(log_lines)):
+        metric = collectd.Values()
+        metric.plugin = 'pacemaker_monitoring'
+        metric.interval = INTERVAL
+        metric.type = 'gauge'
+        metric.type_instance = name
+        metric.values = [value]
+        metric.dispatch()
+
+
+collectd.register_config(config_func)
+collectd.register_read(read_func)
diff --git a/ansible/install/roles/collectd-openstack/files/ovn_monitoring.sh b/ansible/install/roles/collectd-openstack/files/ovn_monitoring.sh
new file mode 100755
index 000000000..b1741d0c0
--- /dev/null
+++ b/ansible/install/roles/collectd-openstack/files/ovn_monitoring.sh
@@ -0,0 +1,13 @@
+#!/bin/sh
+# Emits a collectd PUTVAL gauge derived from an OVN database table dump.
+#   $1 - database tag used in the socket name (ovn${1}_db.sock)
+#   $2 - table name passed to `ovsdb-client dump`
+HOSTNAME="${COLLECTD_HOSTNAME:-$(hostname -f)}"
+INTERVAL="${COLLECTD_INTERVAL:-15}"
+
+while sleep "$INTERVAL"; do
+  # Line count of the dump minus one (presumably the table caption line).
+  VALUE=$(sudo ovsdb-client dump --no-headings unix:/var/lib/openvswitch/ovn/ovn${1}_db.sock $2 | wc -l)
+  VALUE=$((VALUE-1))
+  echo "PUTVAL \"$HOSTNAME/ovn-${1}db-$2/gauge-ovn_${1}db_$2\" interval=$INTERVAL N:$VALUE"
+done