Add OVN RAFT monitoring
This patch adds monitoring for OVN RAFT.

Change-Id: I806507696d5534005c4905ecac3f87ec422f05ce
This commit is contained in:
parent
38a589e729
commit
e95a430342
@ -333,6 +333,13 @@ ovs_flows_monitoring: false
|
|||||||
# before enabling this plugin.
|
# before enabling this plugin.
|
||||||
ovn_monitoring: false
|
ovn_monitoring: false
|
||||||
|
|
||||||
|
#######################
|
||||||
|
# OVN RAFT Monitoring
|
||||||
|
#######################
|
||||||
|
# Monitors OVN RAFT cluster status metrics on controllers.
|
||||||
|
ovn_raft_monitoring: false
|
||||||
|
ovn_raft_controller_collectd_interval: 30
|
||||||
|
|
||||||
#######################
|
#######################
|
||||||
# Pacemaker Monitoring
|
# Pacemaker Monitoring
|
||||||
#######################
|
#######################
|
||||||
|
@ -259,6 +259,10 @@
|
|||||||
-v /var/lib/openvswitch/ovn/ovnnb_db.sock:/var/lib/openvswitch/ovn/ovnnb_db.sock \
|
-v /var/lib/openvswitch/ovn/ovnnb_db.sock:/var/lib/openvswitch/ovn/ovnnb_db.sock \
|
||||||
-v /var/lib/openvswitch/ovn/ovnsb_db.sock:/var/lib/openvswitch/ovn/ovnsb_db.sock \
|
-v /var/lib/openvswitch/ovn/ovnsb_db.sock:/var/lib/openvswitch/ovn/ovnsb_db.sock \
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
{% if ovn_raft_monitoring %}
|
||||||
|
-v /var/lib/openvswitch/ovn/ovnnb_db.ctl:/var/lib/openvswitch/ovn/ovnnb_db.ctl \
|
||||||
|
-v /var/lib/openvswitch/ovn/ovnsb_db.ctl:/var/lib/openvswitch/ovn/ovnsb_db.ctl \
|
||||||
|
{% endif %}
|
||||||
{% if pacemaker_monitoring %}
|
{% if pacemaker_monitoring %}
|
||||||
-v /home/{{ host_remote_user }}/collectd_pipe:/collectd_pipe \
|
-v /home/{{ host_remote_user }}/collectd_pipe:/collectd_pipe \
|
||||||
{% endif %}
|
{% endif %}
|
||||||
@ -277,4 +281,4 @@
|
|||||||
podman exec -it -u root collectd-controller usermod -G wheel stack
|
podman exec -it -u root collectd-controller usermod -G wheel stack
|
||||||
become: yes
|
become: yes
|
||||||
become_user: root
|
become_user: root
|
||||||
when: "config_type == 'controller' and ovn_monitoring"
|
when: "config_type == 'controller' and (ovn_monitoring or ovn_raft_monitoring)"
|
||||||
|
@ -585,6 +585,17 @@ LoadPlugin python
|
|||||||
|
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
|
{%if ovn_raft_monitoring %}
|
||||||
|
<Plugin python>
|
||||||
|
ModulePath "/usr/local/bin/"
|
||||||
|
Import "collectd_ovn_raft_monitoring"
|
||||||
|
<Module collectd_ovn_raft_monitoring>
|
||||||
|
Interval {{ovn_raft_controller_collectd_interval}}
|
||||||
|
</Module>
|
||||||
|
</Plugin>
|
||||||
|
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
{%if gnocchi_status_controller_collectd_plugin %}
|
{%if gnocchi_status_controller_collectd_plugin %}
|
||||||
{%if inventory_hostname == groups['Controller'][0] %}
|
{%if inventory_hostname == groups['Controller'][0] %}
|
||||||
<Plugin python>
|
<Plugin python>
|
||||||
|
@ -169,6 +169,8 @@ dashboard:
|
|||||||
{% include 'partials/ovn_db_tables.yaml' %}
|
{% include 'partials/ovn_db_tables.yaml' %}
|
||||||
|
|
||||||
{% include 'partials/pacemaker_monitoring.yaml' %}
|
{% include 'partials/pacemaker_monitoring.yaml' %}
|
||||||
|
|
||||||
|
{% include 'partials/ovn_raft_monitoring.yaml' %}
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
{% include 'partials/ovn_metrics.yaml' %}
|
{% include 'partials/ovn_metrics.yaml' %}
|
||||||
|
@ -0,0 +1,73 @@
|
|||||||
|
- title: OVN RAFT Metrics
|
||||||
|
collapse: true
|
||||||
|
height: 200px
|
||||||
|
showTitle: true
|
||||||
|
panels:
|
||||||
|
- title: $Cloud - $Node - OVN RAFT Northbound DB Metrics
|
||||||
|
type: graph
|
||||||
|
legend:
|
||||||
|
alignAsTable: true
|
||||||
|
avg: false
|
||||||
|
current: true
|
||||||
|
max: true
|
||||||
|
min: true
|
||||||
|
rightSide: true
|
||||||
|
show: true
|
||||||
|
total: false
|
||||||
|
values: true
|
||||||
|
nullPointMode: 'null'
|
||||||
|
targets:
|
||||||
|
- target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-nb_disconnections, 'disconnections')
|
||||||
|
- target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-nb_election_timer, 'election_timer')
|
||||||
|
- target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-nb_entries_not_applied, 'entries_not_yet_applied')
|
||||||
|
- target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-nb_entries_not_committed, 'entries_not_yet_committed')
|
||||||
|
- target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-nb_term, 'term')
|
||||||
|
- title: $Cloud - $Node - OVN RAFT Northbound DB Leader Data
|
||||||
|
type: graph
|
||||||
|
legend:
|
||||||
|
alignAsTable: true
|
||||||
|
avg: false
|
||||||
|
current: true
|
||||||
|
max: true
|
||||||
|
min: true
|
||||||
|
rightSide: true
|
||||||
|
show: true
|
||||||
|
total: false
|
||||||
|
values: true
|
||||||
|
nullPointMode: 'null'
|
||||||
|
targets:
|
||||||
|
- target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-nb_is_leader, 'is_selected_controller_leader')
|
||||||
|
- title: $Cloud - $Node - OVN RAFT Southbound DB Metrics
|
||||||
|
type: graph
|
||||||
|
legend:
|
||||||
|
alignAsTable: true
|
||||||
|
avg: false
|
||||||
|
current: true
|
||||||
|
max: true
|
||||||
|
min: true
|
||||||
|
rightSide: true
|
||||||
|
show: true
|
||||||
|
total: false
|
||||||
|
values: true
|
||||||
|
nullPointMode: 'null'
|
||||||
|
targets:
|
||||||
|
- target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-sb_disconnections, 'disconnections')
|
||||||
|
- target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-sb_election_timer, 'election_timer')
|
||||||
|
- target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-sb_entries_not_applied, 'entries_not_yet_applied')
|
||||||
|
- target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-sb_entries_not_committed, 'entries_not_yet_committed')
|
||||||
|
- target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-sb_term, 'term')
|
||||||
|
- title: $Cloud - $Node - OVN RAFT Southbound DB Leader Data
|
||||||
|
type: graph
|
||||||
|
legend:
|
||||||
|
alignAsTable: true
|
||||||
|
avg: false
|
||||||
|
current: true
|
||||||
|
max: true
|
||||||
|
min: true
|
||||||
|
rightSide: true
|
||||||
|
show: true
|
||||||
|
total: false
|
||||||
|
values: true
|
||||||
|
nullPointMode: 'null'
|
||||||
|
targets:
|
||||||
|
- target: alias($Cloud.$Node.ovn_raft_monitoring.gauge-sb_is_leader, 'is_selected_controller_leader')
|
@ -30,6 +30,7 @@ ADD files/collectd_rabbitmq_monitoring.py /usr/local/bin/collectd_rabbitmq_monit
|
|||||||
ADD files/collectd_swift_stat.py /usr/local/bin/collectd_swift_stat.py
|
ADD files/collectd_swift_stat.py /usr/local/bin/collectd_swift_stat.py
|
||||||
ADD files/collectd_pacemaker_monitoring.py /usr/local/bin/collectd_pacemaker_monitoring.py
|
ADD files/collectd_pacemaker_monitoring.py /usr/local/bin/collectd_pacemaker_monitoring.py
|
||||||
ADD files/collectd_iostat_python.py /usr/local/bin/collectd_iostat_python.py
|
ADD files/collectd_iostat_python.py /usr/local/bin/collectd_iostat_python.py
|
||||||
|
ADD files/collectd_ovn_raft_monitoring.py /usr/local/bin/collectd_ovn_raft_monitoring.py
|
||||||
ADD files/ovs_flows.sh /usr/local/bin/ovs_flows.sh
|
ADD files/ovs_flows.sh /usr/local/bin/ovs_flows.sh
|
||||||
ADD files/ovn_monitoring.sh /usr/local/bin/ovn_monitoring.sh
|
ADD files/ovn_monitoring.sh /usr/local/bin/ovn_monitoring.sh
|
||||||
|
|
||||||
|
@ -0,0 +1,88 @@
|
|||||||
|
import collectd
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
# Interval value stamped onto every dispatched metric (`metric.interval`).
# This is the default; config_func overwrites it when the plugin's
# configuration contains an `Interval` option.
INTERVAL = 30
|
||||||
|
|
||||||
|
def config_func(config):
    """Apply the plugin configuration handed over by collectd.

    Walks the direct children of ``config`` and, for every child whose key
    is ``interval`` (compared case-insensitively), stores the integer form
    of that child's first value in the module-level ``INTERVAL``. Children
    with any other key are ignored.

    :param config: configuration node exposing ``children``; each child
        exposes ``key`` and ``values``.
    """
    global INTERVAL

    for child in config.children:
        option = child.key.lower()

        if option != 'interval':
            continue

        INTERVAL = int(child.values[0])
|
||||||
|
|
||||||
|
# Integer-valued fields scraped from `cluster/status` output, in dispatch
# order: (metric name suffix, text that introduces the value on its line).
_RAFT_INT_FIELDS = (
    ("term", "Term:"),
    ("election_timer", "Election timer:"),
    ("entries_not_committed", "Entries not yet committed:"),
    ("entries_not_applied", "Entries not yet applied:"),
    ("disconnections", "Disconnections:"),
)

# Databases to query, in dispatch order:
# (metric name prefix, control socket path, database name).
_RAFT_DATABASES = (
    ("nb", '/var/lib/openvswitch/ovn/ovnnb_db.ctl', 'OVN_Northbound'),
    ("sb", '/var/lib/openvswitch/ovn/ovnsb_db.ctl', 'OVN_Southbound'),
)


def _cluster_status(ctl_path, db_name):
    """Run `ovs-appctl cluster/status` for one database and return its stdout as text."""
    process = subprocess.Popen(['sudo', 'ovs-appctl', '-t', ctl_path,
                                'cluster/status', db_name],
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    out, err = process.communicate()

    if process.returncode != 0:
        # Keep going (the metrics fall back to 0 as before), but make the
        # failure visible instead of silently discarding stderr.
        collectd.warning('ovn_raft_monitoring: cluster/status for %s exited with %s: %s'
                         % (db_name, process.returncode,
                            err.decode("utf-8", "replace").strip()))

    # "replace" keeps one stray byte from aborting the whole read cycle.
    return out.decode("utf-8", "replace")


def _leading_int(text):
    """Return the integer at the start of ``text``, or 0 if there is none."""
    tokens = text.split()
    if not tokens:
        return 0
    try:
        return int(tokens[0])
    except ValueError:
        return 0


def _first_value(lines, label):
    """Return the integer following ``label`` on the first line containing it, else 0."""
    for line in lines:
        if label in line:
            # Only the leading token is parsed: the number may be followed
            # by extra text (e.g. "Election timer: 1000 (changing to 2000)"),
            # which a plain int() on the remainder would reject.
            return _leading_int(line.partition(label)[2])
    return 0


def _dispatch(type_instance, value):
    """Dispatch one gauge for the ovn_raft_monitoring plugin."""
    metric = collectd.Values()
    metric.plugin = 'ovn_raft_monitoring'
    metric.interval = INTERVAL
    metric.type = 'gauge'
    metric.type_instance = type_instance
    metric.values = [value]
    metric.dispatch()


def read_func():
    """Collect OVN RAFT status for the NB and SB databases and dispatch it.

    Runs ``sudo ovs-appctl -t <ctl> cluster/status <db>`` for OVN_Northbound
    and OVN_Southbound, then dispatches twelve gauges under the
    ``ovn_raft_monitoring`` plugin: for each of the ``nb_`` and ``sb_``
    prefixes, ``term``, ``election_timer``, ``entries_not_committed``,
    ``entries_not_applied``, ``disconnections`` and ``is_leader``.

    A field that is missing from the output, or whose value does not start
    with an integer, is reported as 0. ``is_leader`` is 1 when some line
    contains ``Role: leader`` and 0 otherwise.

    :raises OSError: if the ``sudo``/``ovs-appctl`` command cannot be
        started (propagated from ``subprocess.Popen``).
    """
    # Query both databases before dispatching anything, as the original did.
    statuses = [(prefix, _cluster_status(ctl_path, db_name).split("\n"))
                for prefix, ctl_path, db_name in _RAFT_DATABASES]

    for prefix, lines in statuses:
        for suffix, label in _RAFT_INT_FIELDS:
            _dispatch(prefix + "_" + suffix, _first_value(lines, label))

        is_leader = 1 if any("Role: leader" in line for line in lines) else 0
        _dispatch(prefix + "_is_leader", is_leader)
|
||||||
|
|
||||||
|
# Hand config_func to collectd as this module's configuration callback.
collectd.register_config(config_func)
|
||||||
|
# Hand read_func to collectd as this module's read callback. No interval
# argument is passed here; INTERVAL is only applied to the dispatched values.
collectd.register_read(read_func)
|
Loading…
Reference in New Issue
Block a user