monasca-agent/monagent/collector/checks_d/libvirt.py

#!/usr/bin/env python
# Copyright (c) 2014 Hewlett-Packard Development Company, L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""Monasca Agent interface for libvirt metrics"""
import os
import stat
import time
import yaml
from calendar import timegm
from datetime import datetime
from monagent.collector.virt import inspector
from monagent.collector.checks import AgentCheck
class LibvirtCheck(AgentCheck):
"""Inherit Agent class and gather libvirt metrics"""
def __init__(self, name, init_config, agent_config):
AgentCheck.__init__(self, name, init_config, agent_config)
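        # Cache files live under the agent's cache_dir and persist state between collection runs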
self.instance_cache_file = "{}/{}".format(self.init_config.get('cache_dir'),
'libvirt_instances.yaml')
self.metric_cache_file = "{}/{}".format(self.init_config.get('cache_dir'),
'libvirt_metrics.yaml')
def _test_vm_probation(self, created):
"""Test to see if a VM was created within the probation period.
        Convert the ISO-8601 creation timestamp to seconds elapsed since
        creation, compare that against the configured vm_probation, and
        return the number of seconds this VM will remain in probation.
"""
dt = datetime.strptime(created, '%Y-%m-%dT%H:%M:%SZ')
created_sec = (time.time() - timegm(dt.timetuple()))
probation_time = self.init_config.get('vm_probation') - created_sec
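        # A negative result means the probation period has already expired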
return int(probation_time)
def _update_instance_cache(self):
"""Collect instance_id, project_id, and AZ for all instance UUIDs
"""
from novaclient.v3 import client
id_cache = {}
# Get a list of all instances from the Nova API
nova_client = client.Client(self.init_config.get('admin_user'),
self.init_config.get('admin_password'),
self.init_config.get('admin_tenant_name'),
self.init_config.get('identity_uri'),
service_type="compute")
instances = nova_client.servers.list(search_opts={'all_tenants': 1})
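        # Key the cache by the hypervisor-side instance name so entries can be matched against libvirt domains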
for instance in instances:
            inst_name = getattr(instance, 'OS-EXT-SRV-ATTR:instance_name')
            inst_az = getattr(instance, 'OS-EXT-AZ:availability_zone')
id_cache[inst_name] = {'instance_uuid': instance.id,
'hostname': instance.name,
'zone': inst_az,
'created': instance.created,
'tenant_id': instance.tenant_id}
id_cache['last_update'] = int(time.time())
# Write the updated cache
try:
with open(self.instance_cache_file, 'w') as cache_yaml:
yaml.safe_dump(id_cache, cache_yaml)
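            # Restrict the cache file to owner read/write only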
if stat.S_IMODE(os.stat(self.instance_cache_file).st_mode) != 0600:
os.chmod(self.instance_cache_file, 0600)
except IOError as e:
self.log.error("Cannot write to {}: {}".format(self.instance_cache_file, e))
return id_cache
def _load_instance_cache(self):
"""Load the cache if instance names to IDs.
If the cache does not yet exist, return an empty one.
"""
instance_cache = {}
try:
with open(self.instance_cache_file, 'r') as cache_yaml:
instance_cache = yaml.safe_load(cache_yaml)
# Is it time to force a refresh of this data?
if self.init_config.get('nova_refresh') is not None:
time_diff = time.time() - instance_cache['last_update']
if time_diff > self.init_config.get('nova_refresh'):
                    instance_cache = self._update_instance_cache()
except IOError:
# The file may not exist yet, and that's OK. Build it now.
instance_cache = self._update_instance_cache()
return instance_cache
def _load_metric_cache(self):
"""Load the counter metrics from the previous collection iteration
"""
metric_cache = {}
try:
with open(self.metric_cache_file, 'r') as cache_yaml:
metric_cache = yaml.safe_load(cache_yaml)
except IOError:
# The file may not exist yet.
pass
return metric_cache
def _update_metric_cache(self, metric_cache):
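        """Write the counter metric cache to disk for use on the next collection pass."""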
try:
with open(self.metric_cache_file, 'w') as cache_yaml:
yaml.safe_dump(metric_cache, cache_yaml)
if stat.S_IMODE(os.stat(self.metric_cache_file).st_mode) != 0600:
os.chmod(self.metric_cache_file, 0600)
except IOError as e:
self.log.error("Cannot write to {}: {}".format(self.metric_cache_file, e))
def check(self, instance):
"""Gather VM metrics for each instance"""
# Load metric cache
metric_cache = self._load_metric_cache()
# Load the nova-obtained instance data cache
instance_cache = self._load_instance_cache()
# Build dimensions for both the customer and for operations
dims_base = {'service': 'compute', 'component': 'vm'}
insp = inspector.get_hypervisor_inspector()
for inst in insp.inspect_instances():
# Verify that this instance exists in the cache. Add if necessary.
if inst.name not in instance_cache:
instance_cache = self._update_instance_cache()
if inst.name not in metric_cache:
metric_cache[inst.name] = {}
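            # metric_cache holds the previous counter samples used to compute deltas and rates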
# Skip instances created within the probation period
vm_probation_remaining = self._test_vm_probation(instance_cache.get(inst.name)['created'])
if (vm_probation_remaining >= 0):
self.log.info("Libvirt: {} in probation for another {} seconds".format(instance_cache.get(inst.name)['hostname'],
vm_probation_remaining))
continue
# Build customer dimensions
dims_customer = dims_base.copy()
dims_customer['resource_id'] = instance_cache.get(inst.name)['instance_uuid']
dims_customer['zone'] = instance_cache.get(inst.name)['zone']
# Add dimensions that would be helpful for operations
dims_operations = dims_customer.copy()
dims_operations['tenant_id'] = instance_cache.get(inst.name)['tenant_id']
dims_operations['cloud_tier'] = 'overcloud'
# CPU utilization percentage
sample_time = float("{:9f}".format(time.time()))
if 'cpu.time' in metric_cache[inst.name]:
# I have a prior value, so calculate the rate & push the metric
cpu_diff = insp.inspect_cpus(inst.name).time - metric_cache[inst.name]['cpu.time']['value']
time_diff = sample_time - float(metric_cache[inst.name]['cpu.time']['timestamp'])
# Convert time_diff to nanoseconds, and calculate percentage
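                # cpu.time is cumulative CPU time in nanoseconds, so dividing the
                # delta by the elapsed wall-clock time in nanoseconds gives utilization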
rate = (cpu_diff / (time_diff * 1000000000)) * 100
self.gauge('cpu.utilization_perc', int(round(rate, 0)),
dimensions=dims_customer,
delegated_tenant=instance_cache.get(inst.name)['tenant_id'],
hostname=instance_cache.get(inst.name)['hostname'])
self.gauge('vm.cpu.utilization_perc', int(round(rate, 0)),
dimensions=dims_operations)
metric_cache[inst.name]['cpu.time'] = {'timestamp': sample_time,
'value': insp.inspect_cpus(inst.name).time}
# Disk utilization
for disk in insp.inspect_disks(inst.name):
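                # Each disk entry is a (device info, stats) pair of cumulative I/O counters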
sample_time = int(time.time())
disk_dimensions = {'device': disk[0].device}
for metric in disk[1]._fields:
metric_name = "io.{}".format(metric)
if metric_name not in metric_cache[inst.name]:
metric_cache[inst.name][metric_name] = {}
                    value = int(getattr(disk[1], metric))
if disk[0].device in metric_cache[inst.name][metric_name]:
time_diff = sample_time - metric_cache[inst.name][metric_name][disk[0].device]['timestamp']
val_diff = value - metric_cache[inst.name][metric_name][disk[0].device]['value']
                        # Convert the metric name to a rate, e.g. "io.read_requests"
                        # becomes "io.read_ops_sec"
rate_name = "{}_sec".format(metric_name.replace('requests', 'ops'))
# Customer
this_dimensions = disk_dimensions.copy()
this_dimensions.update(dims_customer)
self.gauge(rate_name, val_diff, dimensions=this_dimensions,
delegated_tenant=instance_cache.get(inst.name)['tenant_id'],
hostname=instance_cache.get(inst.name)['hostname'])
                        # Operations (metric name prefixed with "vm.")
this_dimensions = disk_dimensions.copy()
this_dimensions.update(dims_operations)
self.gauge("vm.{}".format(rate_name), val_diff,
dimensions=this_dimensions)
# Save this metric to the cache
metric_cache[inst.name][metric_name][disk[0].device] = {
'timestamp': sample_time,
'value': value}
# Network utilization
for vnic in insp.inspect_vnics(inst.name):
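                # Each vnic entry is an (interface, stats) pair of cumulative rx/tx counters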
sample_time = int(time.time())
vnic_dimensions = {'device': vnic[0].name}
for metric in vnic[1]._fields:
metric_name = "net.{}".format(metric)
if metric_name not in metric_cache[inst.name]:
metric_cache[inst.name][metric_name] = {}
                    value = int(getattr(vnic[1], metric))
if vnic[0].name in metric_cache[inst.name][metric_name]:
time_diff = sample_time - metric_cache[inst.name][metric_name][vnic[0].name]['timestamp']
val_diff = value - metric_cache[inst.name][metric_name][vnic[0].name]['value']
                        # Convert the metric name to a rate, e.g. "net.rx_bytes"
                        # becomes "net.rx_bytes_sec"
rate_name = "{}_sec".format(metric_name)
# Rename "tx" to "out" and "rx" to "in"
rate_name = rate_name.replace("tx", "out")
rate_name = rate_name.replace("rx", "in")
# Customer
this_dimensions = vnic_dimensions.copy()
this_dimensions.update(dims_customer)
self.gauge(rate_name, val_diff,
dimensions=this_dimensions,
delegated_tenant=instance_cache.get(inst.name)['tenant_id'],
hostname=instance_cache.get(inst.name)['hostname'])
                        # Operations (metric name prefixed with "vm.")
this_dimensions = vnic_dimensions.copy()
this_dimensions.update(dims_operations)
self.gauge("vm.{}".format(rate_name), val_diff,
dimensions=this_dimensions)
# Save this metric to the cache
metric_cache[inst.name][metric_name][vnic[0].name] = {
'timestamp': sample_time,
'value': value}
# Save these metrics for the next collector invocation
self._update_metric_cache(metric_cache)