Add some new metrics and update some old ones
Add cpu total cores and used cores. Switch memory metrics to report in GB. Add simpler health metric. Add dropped and errors on network. Change-Id: I5a91dba3a3a22a3e8a25055c22c8037ecdeda6e0
This commit is contained in:
@@ -243,50 +243,57 @@ Example cache (pretty-printed excerpt, see next section for complete list of ava
|
|||||||
```
|
```
|
||||||
## Per-Instance Metrics
|
## Per-Instance Metrics
|
||||||
|
|
||||||
| Name | Description | Associated Dimensions |
|
| Name | Description | Associated Dimensions |
|
||||||
| -------------------- | -------------------------------------- | ---------------------- |
|
| ------------------------- | --------------------------------------------------------- | ---------------------- |
|
||||||
| cpu.utilization_perc | Overall CPU utilization (percentage) | |
|
| cpu.total_cores | Total virtual cpus allocated to vm | |
|
||||||
| cpu.utilization_norm_perc | Normalized CPU utilization (percentage) | |
|
| cpu.used_cores | Number of cpu cores used | |
|
||||||
| disk.allocation | Total Disk allocation for a device | 'device' (ie, 'hdd') |
|
| cpu.utilization_perc | Overall CPU utilization (percentage) | |
|
||||||
| disk.capacity | Total Disk capacity for a device | 'device' (ie, 'hdd') |
|
| cpu.utilization_norm_perc | Normalized CPU utilization (percentage) | |
|
||||||
| disk.physical | Total Disk usage for a device | 'device' (ie, 'hdd') |
|
| disk.allocation | Total Disk allocation for a device | 'device' (ie, 'hdd') |
|
||||||
| disk.allocation_total| Total Disk allocation across devices for instances | |
|
| disk.capacity | Total Disk capacity for a device | 'device' (ie, 'hdd') |
|
||||||
| disk.capacity_total | Total Disk capacity across devices for instances | |
|
| disk.physical | Total Disk usage for a device | 'device' (ie, 'hdd') |
|
||||||
| disk.physical_total | Total Disk usage across devices for instances | |
|
| disk.allocation_total | Total Disk allocation across devices for instances | |
|
||||||
| host_alive_status | See [host_alive_status Codes](#host_alive_status-codes) below | |
|
| disk.capacity_total | Total Disk capacity across devices for instances | |
|
||||||
| io.read_ops_sec | Disk I/O read operations per second | 'device' (ie, 'hdd') |
|
| disk.physical_total | Total Disk usage across devices for instances | |
|
||||||
| io.read_ops | Disk I/O read operations val | 'device' (ie, 'hdd') |
|
| health_status | Reports if vm is running (0) or not (1) | |
|
||||||
| io.read_bytes | Disk I/O read bytes val | 'device' (ie, 'hdd') |
|
| host_alive_status | See [host_alive_status Codes](#host_alive_status-codes) below | |
|
||||||
| io.read_bytes_sec | Disk I/O read bytes per second | 'device' (ie, 'hdd') |
|
| io.read_ops_sec | Disk I/O read operations per second | 'device' (ie, 'hdd') |
|
||||||
| io.read_bytes_total | Total Disk I/O read bytes across all devices | |
|
| io.read_ops | Disk I/O read operations val | 'device' (ie, 'hdd') |
|
||||||
| io.read_bytes_total_sec | Total Disk I/O read bytes per second across devices | |
|
| io.read_bytes | Disk I/O read bytes val | 'device' (ie, 'hdd') |
|
||||||
| io.read_ops_total | Total Disk I/O read operations across all devices | |
|
| io.read_bytes_sec | Disk I/O read bytes per second | 'device' (ie, 'hdd') |
|
||||||
| io.read_ops_total_sec | Total Disk I/O read operations across all devices per sec | |
|
| io.read_bytes_total | Total Disk I/O read bytes across all devices | |
|
||||||
| io.write_ops_sec | Disk I/O write operations per second | 'device' (ie, 'hdd') |
|
| io.read_bytes_total_sec | Total Disk I/O read bytes per second across devices | |
|
||||||
| io.write_ops | Disk I/O write operations val | 'device' (ie, 'hdd') |
|
| io.read_ops_total | Total Disk I/O read operations across all devices | |
|
||||||
| io.write_bytes | Disk I/O write bytes val | 'device' (ie, 'hdd') |
|
| io.read_ops_total_sec | Total Disk I/O read operations across all devices per sec | |
|
||||||
| io.write_bytes_sec | Disk I/O write bytes per second | 'device' (ie, 'hdd') |
|
| io.write_ops_sec | Disk I/O write operations per second | 'device' (ie, 'hdd') |
|
||||||
| io.errors_sec | Disk I/O errors per second | 'device' (ie, 'hdd') |
|
| io.write_ops | Disk I/O write operations val | 'device' (ie, 'hdd') |
|
||||||
| io.write_bytes_total | Total Disk I/O write bytes across all devices | |
|
| io.write_bytes | Disk I/O write bytes val | 'device' (ie, 'hdd') |
|
||||||
| io.write_bytes_total_sec | Total Disk I/O Write bytes per second across devices | |
|
| io.write_bytes_sec | Disk I/O write bytes per second | 'device' (ie, 'hdd') |
|
||||||
| io.write_ops_total | Total Disk I/O write operations across all devices | |
|
| io.errors_sec | Disk I/O errors per second | 'device' (ie, 'hdd') |
|
||||||
| io.write_ops_total_sec | Total Disk I/O write operations across all devices per sec | |
|
| io.write_bytes_total | Total Disk I/O write bytes across all devices | |
|
||||||
| net.in_packets_sec | Network received packets per second | 'device' (ie, 'vnet0') |
|
| io.write_bytes_total_sec | Total Disk I/O Write bytes per second across devices | |
|
||||||
| net.out_packets_sec | Network transmitted packets per second | 'device' (ie, 'vnet0') |
|
| io.write_ops_total | Total Disk I/O write operations across all devices | |
|
||||||
| net.in_bytes_sec | Network received bytes per second | 'device' (ie, 'vnet0') |
|
| io.write_ops_total_sec | Total Disk I/O write operations across all devices per sec | |
|
||||||
| net.out_bytes_sec | Network transmitted bytes per second | 'device' (ie, 'vnet0') |
|
| net.in_packets_sec | Network received packets per second | 'device' (ie, 'vnet0') |
|
||||||
| net.in_packets | Network received total packets | 'device' (ie, 'vnet0') |
|
| net.out_packets_sec | Network transmitted packets per second | 'device' (ie, 'vnet0') |
|
||||||
| net.out_packets | Network transmitted total packets | 'device' (ie, 'vnet0') |
|
| net.in_bytes_sec | Network received bytes per second | 'device' (ie, 'vnet0') |
|
||||||
| net.in_bytes | Network received total bytes | 'device' (ie, 'vnet0') |
|
| net.out_bytes_sec | Network transmitted bytes per second | 'device' (ie, 'vnet0') |
|
||||||
| net.out_bytes | Network transmitted total bytes | 'device' (ie, 'vnet0') |
|
| net.in_dropped_sec | Network received packets dropped per second | 'device' (ie, 'vnet0') |
|
||||||
| mem.free_mb | Free memory in Mbytes | |
|
| net.out_dropped_sec | Network transmitted packets dropped per second | 'device' (ie, 'vnet0') |
|
||||||
| mem.total_mb | Total memory in Mbytes | |
|
| net.in_errors_sec | Network received packets with errors per second | 'device' (ie, 'vnet0') |
|
||||||
| mem.used_mb | Used memory in Mbytes | |
|
| net.out_errors_sec | Network transmitted packets with errors per second | 'device' (ie, 'vnet0') |
|
||||||
| mem.free_perc | Percent of memory free | |
|
| net.in_packets | Network received total packets | 'device' (ie, 'vnet0') |
|
||||||
| mem.swap_used_mb | Used swap space in Mbytes | |
|
| net.out_packets | Network transmitted total packets | 'device' (ie, 'vnet0') |
|
||||||
| ping_status | 0 for ping success, 1 for ping failure | |
|
| net.in_bytes | Network received total bytes | 'device' (ie, 'vnet0') |
|
||||||
| cpu.time_ns | Cumulative CPU time (in ns) | |
|
| net.out_bytes | Network transmitted total bytes | 'device' (ie, 'vnet0') |
|
||||||
| mem.resident_mb | Total memory used on host, an Operations-only metric | |
|
| mem.free_gb | Free memory in Gbytes | |
|
||||||
|
| mem.total_gb | Total memory in Gbytes | |
|
||||||
|
| mem.used_gb | Used memory in Gbytes | |
|
||||||
|
| mem.free_perc | Percent of memory free | |
|
||||||
|
| mem.swap_used_gb | Used swap space in Gbytes | |
|
||||||
|
| ping_status | 0 for ping success, 1 for ping failure | |
|
||||||
|
| cpu.time_ns | Cumulative CPU time (in ns) | |
|
||||||
|
| mem.resident_gb | Total memory used on host, an Operations-only metric | |
|
||||||
|
|
||||||
### host_alive_status Codes
|
### host_alive_status Codes
|
||||||
| Code | Description | value_meta 'detail' |
|
| Code | Description | value_meta 'detail' |
|
||||||
|
|||||||
@@ -401,14 +401,14 @@ class LibvirtCheck(AgentCheck):
|
|||||||
cpu_info = insp.inspect_cpus(inst)
|
cpu_info = insp.inspect_cpus(inst)
|
||||||
|
|
||||||
if 'cpu.time' in metric_cache[inst_name]:
|
if 'cpu.time' in metric_cache[inst_name]:
|
||||||
# I have a prior value, so calculate the raw_perc & push the metric
|
# I have a prior value, so calculate the used_cores & push the metric
|
||||||
cpu_diff = cpu_info.time - metric_cache[inst_name]['cpu.time']['value']
|
cpu_diff = cpu_info.time - metric_cache[inst_name]['cpu.time']['value']
|
||||||
time_diff = sample_time - float(metric_cache[inst_name]['cpu.time']['timestamp'])
|
time_diff = sample_time - float(metric_cache[inst_name]['cpu.time']['timestamp'])
|
||||||
# Convert time_diff to nanoseconds, and calculate percentage
|
# Convert time_diff to nanoseconds, and calculate the number of cores used
|
||||||
raw_perc = (cpu_diff / (time_diff * 1000000000)) * 100
|
used_cores = (cpu_diff / (time_diff * 1000000000))
|
||||||
# Divide by the number of cores to normalize the percentage
|
# Divide by the number of cores and scale by 100 to get the normalized percentage
|
||||||
normalized_perc = (raw_perc / cpu_info.number)
|
normalized_perc = (used_cores / cpu_info.number) * 100
|
||||||
if raw_perc < 0:
|
if used_cores < 0:
|
||||||
# Bad value, save current reading and skip
|
# Bad value, save current reading and skip
|
||||||
self.log.warn("Ignoring negative CPU sample for: "
|
self.log.warn("Ignoring negative CPU sample for: "
|
||||||
"{0} new cpu time: {1} old cpu time: {2}"
|
"{0} new cpu time: {1} old cpu time: {2}"
|
||||||
@@ -418,7 +418,15 @@ class LibvirtCheck(AgentCheck):
|
|||||||
'value': cpu_info.time}
|
'value': cpu_info.time}
|
||||||
return
|
return
|
||||||
|
|
||||||
self.gauge('cpu.utilization_perc', int(round(raw_perc, 0)),
|
self.gauge('cpu.total_cores', float(cpu_info.number),
|
||||||
|
dimensions=dims_customer,
|
||||||
|
delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
|
||||||
|
hostname=instance_cache.get(inst_name)['hostname'])
|
||||||
|
self.gauge('cpu.used_cores', float(used_cores),
|
||||||
|
dimensions=dims_customer,
|
||||||
|
delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
|
||||||
|
hostname=instance_cache.get(inst_name)['hostname'])
|
||||||
|
self.gauge('cpu.utilization_perc', int(round(used_cores * 100, 0)),
|
||||||
dimensions=dims_customer,
|
dimensions=dims_customer,
|
||||||
delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
|
delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
|
||||||
hostname=instance_cache.get(inst_name)['hostname'])
|
hostname=instance_cache.get(inst_name)['hostname'])
|
||||||
@@ -426,7 +434,12 @@ class LibvirtCheck(AgentCheck):
|
|||||||
dimensions=dims_customer,
|
dimensions=dims_customer,
|
||||||
delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
|
delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
|
||||||
hostname=instance_cache.get(inst_name)['hostname'])
|
hostname=instance_cache.get(inst_name)['hostname'])
|
||||||
self.gauge('vm.cpu.utilization_perc', int(round(raw_perc, 0)),
|
|
||||||
|
self.gauge('vm.cpu.total_cores', float(cpu_info.number),
|
||||||
|
dimensions=dims_operations)
|
||||||
|
self.gauge('vm.cpu.used_cores', float(used_cores),
|
||||||
|
dimensions=dims_operations)
|
||||||
|
self.gauge('vm.cpu.utilization_perc', int(round(used_cores * 100, 0)),
|
||||||
dimensions=dims_operations)
|
dimensions=dims_operations)
|
||||||
self.gauge('vm.cpu.utilization_norm_perc', int(round(normalized_perc, 0)),
|
self.gauge('vm.cpu.utilization_norm_perc', int(round(normalized_perc, 0)),
|
||||||
dimensions=dims_operations)
|
dimensions=dims_operations)
|
||||||
@@ -580,6 +593,7 @@ class LibvirtCheck(AgentCheck):
|
|||||||
"""
|
"""
|
||||||
inst_state = inst.state()
|
inst_state = inst.state()
|
||||||
dom_status = inst_state[0] - 1
|
dom_status = inst_state[0] - 1
|
||||||
|
health_status = 0 if dom_status == 0 else 1 # anything other than 'running' is considered unhealthy
|
||||||
metatag = None
|
metatag = None
|
||||||
|
|
||||||
if inst_state[0] in DOM_STATES:
|
if inst_state[0] in DOM_STATES:
|
||||||
@@ -597,6 +611,13 @@ class LibvirtCheck(AgentCheck):
|
|||||||
dimensions=dims_operations,
|
dimensions=dims_operations,
|
||||||
value_meta=metatag)
|
value_meta=metatag)
|
||||||
|
|
||||||
|
self.gauge('health_status', health_status,
|
||||||
|
dimensions=dims_customer,
|
||||||
|
delegated_tenant=instance_cache.get(inst_name)['tenant_id'],
|
||||||
|
hostname=instance_cache.get(inst_name)['hostname'])
|
||||||
|
self.gauge('vm.health_status', health_status,
|
||||||
|
dimensions=dims_operations)
|
||||||
|
|
||||||
return dom_status
|
return dom_status
|
||||||
|
|
||||||
def prepare_run(self):
|
def prepare_run(self):
|
||||||
@@ -766,10 +787,10 @@ class LibvirtCheck(AgentCheck):
|
|||||||
# (req. balloon driver; Linux kernel param CONFIG_VIRTIO_BALLOON)
|
# (req. balloon driver; Linux kernel param CONFIG_VIRTIO_BALLOON)
|
||||||
try:
|
try:
|
||||||
mem_stats = inst.memoryStats()
|
mem_stats = inst.memoryStats()
|
||||||
mem_metrics = {'mem.free_mb': float(mem_stats['unused']) / 1024,
|
mem_metrics = {'mem.free_gb': float(mem_stats['unused']) / 1024 / 1024,
|
||||||
'mem.swap_used_mb': float(mem_stats['swap_out']) / 1024,
|
'mem.swap_used_gb': float(mem_stats['swap_out']) / 1024 / 1024,
|
||||||
'mem.total_mb': float(mem_stats['available']) / 1024,
|
'mem.total_gb': float(mem_stats['available']) / 1024 / 1024,
|
||||||
'mem.used_mb': float(mem_stats['available'] - mem_stats['unused']) / 1024,
|
'mem.used_gb': float(mem_stats['available'] - mem_stats['unused']) / 1024 / 1024,
|
||||||
'mem.free_perc': float(mem_stats['unused']) / float(mem_stats['available']) * 100}
|
'mem.free_perc': float(mem_stats['unused']) / float(mem_stats['available']) * 100}
|
||||||
for name in mem_metrics:
|
for name in mem_metrics:
|
||||||
self.gauge(name, mem_metrics[name], dimensions=dims_customer,
|
self.gauge(name, mem_metrics[name], dimensions=dims_customer,
|
||||||
@@ -778,7 +799,7 @@ class LibvirtCheck(AgentCheck):
|
|||||||
self.gauge("vm.{0}".format(name), mem_metrics[name],
|
self.gauge("vm.{0}".format(name), mem_metrics[name],
|
||||||
dimensions=dims_operations)
|
dimensions=dims_operations)
|
||||||
memory_info = insp.inspect_memory_resident(inst)
|
memory_info = insp.inspect_memory_resident(inst)
|
||||||
self.gauge('vm.mem.resident_mb', float(memory_info.resident), dimensions=dims_operations)
|
self.gauge('vm.mem.resident_gb', float(memory_info.resident) / 1024, dimensions=dims_operations)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
self.log.debug("Balloon driver not active/available on guest {0} ({1})".format(inst_name,
|
self.log.debug("Balloon driver not active/available on guest {0} ({1})".format(inst_name,
|
||||||
instance_cache.get(inst_name)['hostname'].encode('utf8')))
|
instance_cache.get(inst_name)['hostname'].encode('utf8')))
|
||||||
|
|||||||
@@ -85,7 +85,9 @@ Interface = collections.namedtuple('Interface', ['name', 'mac',
|
|||||||
#
|
#
|
||||||
InterfaceStats = collections.namedtuple('InterfaceStats',
|
InterfaceStats = collections.namedtuple('InterfaceStats',
|
||||||
['rx_bytes', 'rx_packets',
|
['rx_bytes', 'rx_packets',
|
||||||
'tx_bytes', 'tx_packets'])
|
'rx_errors', 'rx_dropped',
|
||||||
|
'tx_bytes', 'tx_packets',
|
||||||
|
'tx_errors', 'tx_dropped'])
|
||||||
|
|
||||||
|
|
||||||
# Named tuple representing vNIC rate statistics.
|
# Named tuple representing vNIC rate statistics.
|
||||||
|
|||||||
@@ -147,8 +147,12 @@ class LibvirtInspector(virt_inspector.Inspector):
|
|||||||
dom_stats = domain.interfaceStats(name)
|
dom_stats = domain.interfaceStats(name)
|
||||||
stats = virt_inspector.InterfaceStats(rx_bytes=dom_stats[0],
|
stats = virt_inspector.InterfaceStats(rx_bytes=dom_stats[0],
|
||||||
rx_packets=dom_stats[1],
|
rx_packets=dom_stats[1],
|
||||||
|
rx_errors=dom_stats[2],
|
||||||
|
rx_dropped=dom_stats[3],
|
||||||
tx_bytes=dom_stats[4],
|
tx_bytes=dom_stats[4],
|
||||||
tx_packets=dom_stats[5])
|
tx_packets=dom_stats[5],
|
||||||
|
tx_errors=dom_stats[6],
|
||||||
|
tx_dropped=dom_stats[7])
|
||||||
yield (interface, stats)
|
yield (interface, stats)
|
||||||
|
|
||||||
def inspect_disks(self, instance):
|
def inspect_disks(self, instance):
|
||||||
|
|||||||
Reference in New Issue
Block a user