From e95a430342b39388c56e22dfcd230a5e90a018ed Mon Sep 17 00:00:00 2001 From: Sanjay Chari Date: Thu, 30 Jun 2022 17:38:19 +0530 Subject: [PATCH] Add OVN RAFT monitoring This patch adds monitoring for OVN RAFT. Change-Id: I806507696d5534005c4905ecac3f87ec422f05ce --- ansible/install/group_vars/all.yml | 7 ++ ansible/install/roles/collectd/tasks/main.yml | 6 +- .../templates/controller.collectd.conf.j2 | 11 +++ ...enstack_general_system_performance.yaml.j2 | 2 + .../partials/ovn_raft_monitoring.yaml | 73 +++++++++++++++ .../collectd-openstack/Dockerfile | 1 + .../files/collectd_ovn_raft_monitoring.py | 88 +++++++++++++++++++ 7 files changed, 187 insertions(+), 1 deletion(-) create mode 100644 ansible/install/roles/grafana-dashboards/templates/partials/ovn_raft_monitoring.yaml create mode 100644 browbeat-containers/collectd-openstack/files/collectd_ovn_raft_monitoring.py diff --git a/ansible/install/group_vars/all.yml b/ansible/install/group_vars/all.yml index bcf23db0c..593517cf2 100644 --- a/ansible/install/group_vars/all.yml +++ b/ansible/install/group_vars/all.yml @@ -333,6 +333,13 @@ ovs_flows_monitoring: false # before enabling this plugin. ovn_monitoring: false +####################### +# OVN RAFT Monitoring +####################### +# Monitors OVN RAFT cluster status metrics on controllers. 
+ovn_raft_monitoring: false +ovn_raft_controller_collectd_interval: 30 + ####################### # Pacemaker Monitoring ####################### diff --git a/ansible/install/roles/collectd/tasks/main.yml b/ansible/install/roles/collectd/tasks/main.yml index e6e4f371f..176730d2e 100644 --- a/ansible/install/roles/collectd/tasks/main.yml +++ b/ansible/install/roles/collectd/tasks/main.yml @@ -259,6 +259,10 @@ -v /var/lib/openvswitch/ovn/ovnnb_db.sock:/var/lib/openvswitch/ovn/ovnnb_db.sock \ -v /var/lib/openvswitch/ovn/ovnsb_db.sock:/var/lib/openvswitch/ovn/ovnsb_db.sock \ {% endif %} + {% if ovn_raft_monitoring %} + -v /var/lib/openvswitch/ovn/ovnnb_db.ctl:/var/lib/openvswitch/ovn/ovnnb_db.ctl \ + -v /var/lib/openvswitch/ovn/ovnsb_db.ctl:/var/lib/openvswitch/ovn/ovnsb_db.ctl \ + {% endif %} {% if pacemaker_monitoring %} -v /home/{{ host_remote_user }}/collectd_pipe:/collectd_pipe \ {% endif %} @@ -277,4 +281,4 @@ podman exec -it -u root collectd-controller usermod -G wheel stack become: yes become_user: root - when: "config_type == 'controller' and ovn_monitoring" + when: "config_type == 'controller' and (ovn_monitoring or ovn_raft_monitoring)" diff --git a/ansible/install/roles/collectd/templates/controller.collectd.conf.j2 b/ansible/install/roles/collectd/templates/controller.collectd.conf.j2 index 0918d5597..9de8a4be6 100644 --- a/ansible/install/roles/collectd/templates/controller.collectd.conf.j2 +++ b/ansible/install/roles/collectd/templates/controller.collectd.conf.j2 @@ -585,6 +585,17 @@ LoadPlugin python {% endif %} +{%if ovn_raft_monitoring %} + + ModulePath "/usr/local/bin/" + Import "collectd_ovn_raft_monitoring" + + Interval {{ovn_raft_controller_collectd_interval}} + + + +{% endif %} + {%if gnocchi_status_controller_collectd_plugin %} {%if inventory_hostname == groups['Controller'][0] %} diff --git a/ansible/install/roles/grafana-dashboards/templates/openstack_general_system_performance.yaml.j2 
b/ansible/install/roles/grafana-dashboards/templates/openstack_general_system_performance.yaml.j2 index e64ed683b..c117a6767 100644 --- a/ansible/install/roles/grafana-dashboards/templates/openstack_general_system_performance.yaml.j2 +++ b/ansible/install/roles/grafana-dashboards/templates/openstack_general_system_performance.yaml.j2 @@ -169,6 +169,8 @@ dashboard: {% include 'partials/ovn_db_tables.yaml' %} {% include 'partials/pacemaker_monitoring.yaml' %} + + {% include 'partials/ovn_raft_monitoring.yaml' %} {% endif %} {% include 'partials/ovn_metrics.yaml' %} diff --git a/ansible/install/roles/grafana-dashboards/templates/partials/ovn_raft_monitoring.yaml b/ansible/install/roles/grafana-dashboards/templates/partials/ovn_raft_monitoring.yaml new file mode 100644 index 000000000..56963d110 --- /dev/null +++ b/ansible/install/roles/grafana-dashboards/templates/partials/ovn_raft_monitoring.yaml @@ -0,0 +1,73 @@ + - title: OVN RAFT Metrics + collapse: true + height: 200px + showTitle: true + panels: + - title: $Cloud - $Node - OVN RAFT Northbound DB Metrics + type: graph + legend: + alignAsTable: true + avg: false + current: true + max: true + min: true + rightSide: true + show: true + total: false + values: true + nullPointMode: 'null' + targets: + - target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-nb_disconnections, 'disconnections') + - target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-nb_election_timer, 'election_timer') + - target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-nb_entries_not_applied, 'entries_not_yet_applied') + - target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-nb_entries_not_committed, 'entries_not_yet_committed') + - target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-nb_term, 'term') + - title: $Cloud - $Node - OVN RAFT Northbound DB Leader Data + type: graph + legend: + alignAsTable: true + avg: false + current: true + max: true + min: true + rightSide: true + show: true + total: false + values: true + nullPointMode: 'null' + 
targets: + - target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-nb_is_leader, 'is_selected_controller_leader') + - title: $Cloud - $Node - OVN RAFT Southbound DB Metrics + type: graph + legend: + alignAsTable: true + avg: false + current: true + max: true + min: true + rightSide: true + show: true + total: false + values: true + nullPointMode: 'null' + targets: + - target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-sb_disconnections, 'disconnections') + - target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-sb_election_timer, 'election_timer') + - target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-sb_entries_not_applied, 'entries_not_yet_applied') + - target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-sb_entries_not_committed, 'entries_not_yet_committed') + - target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-sb_term, 'term') + - title: $Cloud - $Node - OVN RAFT Southbound DB Leader Data + type: graph + legend: + alignAsTable: true + avg: false + current: true + max: true + min: true + rightSide: true + show: true + total: false + values: true + nullPointMode: 'null' + targets: + - target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-sb_is_leader, 'is_selected_controller_leader') diff --git a/browbeat-containers/collectd-openstack/Dockerfile b/browbeat-containers/collectd-openstack/Dockerfile index 7da30095a..c082a95a3 100644 --- a/browbeat-containers/collectd-openstack/Dockerfile +++ b/browbeat-containers/collectd-openstack/Dockerfile @@ -30,6 +30,7 @@ ADD files/collectd_rabbitmq_monitoring.py /usr/local/bin/collectd_rabbitmq_monit ADD files/collectd_swift_stat.py /usr/local/bin/collectd_swift_stat.py ADD files/collectd_pacemaker_monitoring.py /usr/local/bin/collectd_pacemaker_monitoring.py ADD files/collectd_iostat_python.py /usr/local/bin/collectd_iostat_python.py +ADD files/collectd_ovn_raft_monitoring.py /usr/local/bin/collectd_ovn_raft_monitoring.py ADD files/ovs_flows.sh /usr/local/bin/ovs_flows.sh ADD files/ovn_monitoring.sh 
/usr/local/bin/ovn_monitoring.sh
diff --git a/browbeat-containers/collectd-openstack/files/collectd_ovn_raft_monitoring.py b/browbeat-containers/collectd-openstack/files/collectd_ovn_raft_monitoring.py
new file mode 100644
index 000000000..f876c2074
--- /dev/null
+++ b/browbeat-containers/collectd-openstack/files/collectd_ovn_raft_monitoring.py
@@ -0,0 +1,100 @@
+import collectd
+import subprocess
+
+INTERVAL = 30
+
+# (metric prefix, ovsdb-server control socket, clustered database name)
+DATABASES = (
+    ('nb', '/var/lib/openvswitch/ovn/ovnnb_db.ctl', 'OVN_Northbound'),
+    ('sb', '/var/lib/openvswitch/ovn/ovnsb_db.ctl', 'OVN_Southbound'),
+)
+
+# (metric suffix, label of the "cluster/status" line carrying its value)
+NUMERIC_FIELDS = (
+    ('term', 'Term:'),
+    ('election_timer', 'Election timer:'),
+    ('entries_not_committed', 'Entries not yet committed:'),
+    ('entries_not_applied', 'Entries not yet applied:'),
+    ('disconnections', 'Disconnections:'),
+)
+
+# Dispatch order of the per-database metrics.
+METRIC_NAMES = tuple(name for name, _ in NUMERIC_FIELDS) + ('is_leader',)
+
+
+def config_func(config):
+    """Read the optional "Interval" setting from the plugin config block."""
+    global INTERVAL
+
+    for node in config.children:
+        if node.key.lower() == 'interval':
+            INTERVAL = int(node.values[0])
+
+
+def run_cluster_status(ctl_path, db_name):
+    """Return the "cluster/status" text for db_name ('' if it cannot run)."""
+    cmd = ['sudo', 'ovs-appctl', '-t', ctl_path, 'cluster/status', db_name]
+    try:
+        process = subprocess.Popen(cmd, stdout=subprocess.PIPE,
+                                   stderr=subprocess.PIPE)
+        out, err = process.communicate()
+    except OSError as exc:
+        # e.g. sudo or ovs-appctl is missing: log it instead of raising
+        # out of the read callback.
+        collectd.warning('ovn_raft_monitoring: cannot run %s: %s'
+                         % (' '.join(cmd), exc))
+        return ''
+
+    if process.returncode != 0:
+        # Zeros are still dispatched by the caller, so record why.
+        collectd.warning('ovn_raft_monitoring: %s exited with %d: %s'
+                         % (' '.join(cmd), process.returncode,
+                            err.decode('utf-8', 'replace').strip()))
+    return out.decode('utf-8', 'replace')
+
+
+def parse_cluster_status(output):
+    """Return a dict of metric suffix -> int parsed from "cluster/status".
+
+    Metrics missing from the output stay 0; is_leader is 1 only when a
+    "Role: leader" line is present.
+    """
+    values = dict((name, 0) for name in METRIC_NAMES)
+    seen = set()
+
+    for line in output.split('\n'):
+        if 'Role: leader' in line:
+            values['is_leader'] = 1
+
+        for name, label in NUMERIC_FIELDS:
+            if name in seen or label not in line:
+                continue
+            seen.add(name)  # only the first matching line is used
+            # Take the first token after the label so trailing text on
+            # the line does not break int().
+            tokens = line.split(label, 1)[1].split()
+            try:
+                values[name] = int(tokens[0])
+            except (IndexError, ValueError):
+                collectd.warning('ovn_raft_monitoring: cannot parse %r'
+                                 % line)
+
+    return values
+
+
+def read_func():
+    """Dispatch one gauge per RAFT metric for the NB and SB databases."""
+    for prefix, ctl_path, db_name in DATABASES:
+        status = parse_cluster_status(run_cluster_status(ctl_path, db_name))
+        for name in METRIC_NAMES:
+            metric = collectd.Values()
+            metric.plugin = 'ovn_raft_monitoring'
+            metric.interval = INTERVAL
+            metric.type = 'gauge'
+            metric.type_instance = '%s_%s' % (prefix, name)
+            metric.values = [status[name]]
+            metric.dispatch()
+
+
+collectd.register_config(config_func)
+collectd.register_read(read_func)