255 lines
12 KiB
Python
255 lines
12 KiB
Python
#!/bin/env python
|
|
|
|
# Copyright (c) 2014 Hewlett-Packard Development Company, L.P.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
|
# not use this file except in compliance with the License. You may obtain
|
|
# a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
# License for the specific language governing permissions and limitations
|
|
# under the License.
|
|
"""Monasca Agent interface for libvirt metrics"""
|
|
|
|
import os
|
|
import stat
|
|
import time
|
|
import yaml
|
|
|
|
from calendar import timegm
|
|
from datetime import datetime
|
|
from monasca_agent.collector.virt import inspector
|
|
from monasca_agent.collector.checks import AgentCheck
|
|
|
|
|
|
class LibvirtCheck(AgentCheck):

    """Inherit Agent class and gather libvirt metrics.

    Two YAML cache files are kept under the configured ``cache_dir``:

    * ``libvirt_instances.yaml`` maps hypervisor instance names to the
      data obtained from Nova (UUID, hostname, zone, creation time and
      tenant), plus a ``last_update`` epoch timestamp.
    * ``libvirt_metrics.yaml`` holds the raw counter values seen on the
      previous collection run, so that rates can be computed.
    """

    def __init__(self, name, init_config, agent_config):
        AgentCheck.__init__(self, name, init_config, agent_config)
        cache_dir = self.init_config.get('cache_dir')
        self.instance_cache_file = "{0}/{1}".format(cache_dir,
                                                    'libvirt_instances.yaml')
        self.metric_cache_file = "{0}/{1}".format(cache_dir,
                                                  'libvirt_metrics.yaml')

    def _write_cache(self, cache_file, cache):
        """Dump `cache` as YAML into `cache_file` with mode 0600.

        Failures are logged instead of raised so that an unwritable cache
        directory does not abort metric collection.
        """
        try:
            with open(cache_file, 'w') as cache_yaml:
                yaml.safe_dump(cache, cache_yaml)
            if stat.S_IMODE(os.stat(cache_file).st_mode) != 0o600:
                os.chmod(cache_file, 0o600)
        except (IOError, OSError) as e:
            # os.stat()/os.chmod() raise OSError, which is not a subclass
            # of IOError on Python 2, so both must be caught.
            self.log.error("Cannot write to {0}: {1}".format(cache_file, e))

    def _test_vm_probation(self, created):
        """Test to see if a VM was created within the probation period.

        Convert an ISO-8601 timestamp into a UNIX epoch age relative to
        now and compare that against the configured vm_probation.  Return
        the number of seconds this VM will remain in probation (negative
        once the probation period is over).

        A missing vm_probation setting is treated as zero seconds.
        """
        dt = datetime.strptime(created, '%Y-%m-%dT%H:%M:%SZ')
        age_sec = time.time() - timegm(dt.timetuple())
        probation = self.init_config.get('vm_probation') or 0
        return int(probation - age_sec)

    def _update_instance_cache(self):
        """Collect instance_id, project_id, and AZ for all instance UUIDs.

        Query Nova for the servers of all tenants, write the result to the
        instance cache file and return it as a dict keyed by the
        hypervisor instance name.
        """
        from novaclient.v3 import client
        id_cache = {}
        # Get a list of all instances from the Nova API
        nova_client = client.Client(self.init_config.get('admin_user'),
                                    self.init_config.get('admin_password'),
                                    self.init_config.get('admin_tenant_name'),
                                    self.init_config.get('identity_uri'),
                                    service_type="compute")
        instances = nova_client.servers.list(search_opts={'all_tenants': 1})

        for instance in instances:
            # These attribute names are not valid identifiers, so they can
            # only be reached through getattr().
            inst_name = getattr(instance, 'OS-EXT-SRV-ATTR:instance_name')
            inst_az = getattr(instance, 'OS-EXT-AZ:availability_zone')
            id_cache[inst_name] = {'instance_uuid': instance.id,
                                   'hostname': instance.name,
                                   'zone': inst_az,
                                   'created': instance.created,
                                   'tenant_id': instance.tenant_id}
        id_cache['last_update'] = int(time.time())

        # Write the updated cache
        self._write_cache(self.instance_cache_file, id_cache)

        return id_cache

    def _load_instance_cache(self):
        """Load the cache of instance names to IDs.

        If the cache does not yet exist, cannot be parsed, or is older
        than the configured nova_refresh (seconds), rebuild it from Nova
        and return the fresh copy.
        """
        try:
            with open(self.instance_cache_file, 'r') as cache_yaml:
                instance_cache = yaml.safe_load(cache_yaml)
        except IOError:
            # The file may not exist yet, and that's OK.  Build it now.
            return self._update_instance_cache()
        except yaml.YAMLError as e:
            self.log.warning("Cannot parse {0}: {1}".format(
                self.instance_cache_file, e))
            return self._update_instance_cache()

        # safe_load() yields None for an empty file.
        if not isinstance(instance_cache, dict):
            return self._update_instance_cache()

        # Is it time to force a refresh of this data?
        nova_refresh = self.init_config.get('nova_refresh')
        if nova_refresh is not None:
            time_diff = time.time() - instance_cache.get('last_update', 0)
            if time_diff > nova_refresh:
                # Use the refreshed data rather than the stale copy.
                instance_cache = self._update_instance_cache()

        return instance_cache

    def _load_metric_cache(self):
        """Load the counter metrics from the previous collection iteration.

        Return an empty dict when the file is missing, empty or corrupt.
        """
        try:
            with open(self.metric_cache_file, 'r') as cache_yaml:
                metric_cache = yaml.safe_load(cache_yaml)
        except IOError:
            # The file may not exist yet.
            return {}
        except yaml.YAMLError as e:
            self.log.warning("Cannot parse {0}: {1}".format(
                self.metric_cache_file, e))
            return {}

        # safe_load() yields None for an empty file.
        return metric_cache if isinstance(metric_cache, dict) else {}

    def _update_metric_cache(self, metric_cache):
        """Save the counter metrics for the next collection iteration."""
        self._write_cache(self.metric_cache_file, metric_cache)

    @staticmethod
    def _disk_rate_name(metric_name):
        """Turn a disk counter name into a rate name.

        For example "io.read_requests" becomes "io.read_ops_sec".
        """
        return "{0}_sec".format(metric_name.replace('requests', 'ops'))

    @staticmethod
    def _vnic_rate_name(metric_name):
        """Turn a network counter name into a rate name.

        For example "net.rx_bytes" becomes "net.in_bytes_sec".
        """
        rate_name = "{0}_sec".format(metric_name)
        # Rename "tx" to "out" and "rx" to "in"
        return rate_name.replace("tx", "out").replace("rx", "in")

    def _gauge_cpu(self, insp, inst_name, inst_info, inst_metrics,
                   dims_customer, dims_operations):
        """Publish CPU utilization for one instance and cache the reading.

        The cumulative CPU time reported by the inspector is treated as
        nanoseconds.  Nothing is published on the first reading, when no
        time has elapsed, or when the counter went backwards.
        """
        sample_time = round(time.time(), 6)
        # Read the counter once so the cached value is the one the rate
        # was computed from.
        cpu_time = insp.inspect_cpus(inst_name).time

        previous = inst_metrics.get('cpu.time')
        if previous is not None:
            cpu_diff = cpu_time - previous['value']
            time_diff = sample_time - float(previous['timestamp'])
            if cpu_diff >= 0 and time_diff > 0:
                # Convert time_diff to nanoseconds, and calculate percentage
                rate = (cpu_diff / (time_diff * 1000000000)) * 100
                utilization = int(round(rate, 0))
                self.gauge('cpu.utilization_perc', utilization,
                           dimensions=dims_customer,
                           delegated_tenant=inst_info['tenant_id'],
                           hostname=inst_info['hostname'])
                self.gauge('vm.cpu.utilization_perc', utilization,
                           dimensions=dims_operations)

        inst_metrics['cpu.time'] = {'timestamp': sample_time,
                                    'value': cpu_time}

    def _gauge_device_rates(self, prefix, device, stats, rate_namer,
                            inst_info, inst_metrics,
                            dims_customer, dims_operations):
        """Publish per-second rates for every counter of one device.

        `stats` is a namedtuple of cumulative counters; each field is
        cached under "<prefix>.<field>" per device.  A rate is published
        only when a previous reading exists, time has elapsed and the
        counter did not go backwards.
        """
        sample_time = int(time.time())
        device_dimensions = {'device': device}
        for metric in stats._fields:
            metric_name = "{0}.{1}".format(prefix, metric)
            per_device = inst_metrics.setdefault(metric_name, {})
            value = int(getattr(stats, metric))

            previous = per_device.get(device)
            # Save this metric to the cache
            per_device[device] = {'timestamp': sample_time, 'value': value}
            if previous is None:
                continue

            time_diff = sample_time - previous['timestamp']
            val_diff = value - previous['value']
            if time_diff <= 0 or val_diff < 0:
                continue

            # The metric names end in "_sec", so report a per-second rate
            # rather than the raw delta between two collections.
            rate = val_diff / float(time_diff)
            rate_name = rate_namer(metric_name)

            # Customer
            this_dimensions = device_dimensions.copy()
            this_dimensions.update(dims_customer)
            self.gauge(rate_name, rate, dimensions=this_dimensions,
                       delegated_tenant=inst_info['tenant_id'],
                       hostname=inst_info['hostname'])
            # Operations (metric name prefixed with "vm.")
            this_dimensions = device_dimensions.copy()
            this_dimensions.update(dims_operations)
            self.gauge("vm.{0}".format(rate_name), rate,
                       dimensions=this_dimensions)

    def check(self, instance):
        """Gather VM metrics for each instance"""

        # Load metric cache
        metric_cache = self._load_metric_cache()

        # Load the nova-obtained instance data cache
        instance_cache = self._load_instance_cache()

        # Build dimensions for both the customer and for operations
        dims_base = self._set_dimensions({'service': 'compute',
                                          'component': 'vm'}, instance)

        insp = inspector.get_hypervisor_inspector()
        cache_refreshed = False
        for inst in insp.inspect_instances():
            # Verify that this instance exists in the cache.  Refresh from
            # Nova at most once per run; one listing covers every instance.
            if inst.name not in instance_cache and not cache_refreshed:
                instance_cache = self._update_instance_cache()
                cache_refreshed = True

            inst_info = instance_cache.get(inst.name)
            if inst_info is None:
                self.log.warning(
                    "Libvirt: {0} is not known to Nova, skipping".format(
                        inst.name))
                continue

            inst_metrics = metric_cache.setdefault(inst.name, {})

            # Skip instances created within the probation period
            vm_probation_remaining = self._test_vm_probation(
                inst_info['created'])
            if vm_probation_remaining >= 0:
                self.log.info("Libvirt: {0} in probation for another {1} seconds".format(
                    inst_info['hostname'], vm_probation_remaining))
                continue

            # Build customer dimensions
            dims_customer = dims_base.copy()
            dims_customer['resource_id'] = inst_info['instance_uuid']
            dims_customer['zone'] = inst_info['zone']
            # Add dimensions that would be helpful for operations
            dims_operations = dims_customer.copy()
            dims_operations['tenant_id'] = inst_info['tenant_id']
            dims_operations['cloud_tier'] = 'overcloud'

            # CPU utilization percentage
            self._gauge_cpu(insp, inst.name, inst_info, inst_metrics,
                            dims_customer, dims_operations)

            # Disk utilization
            for disk in insp.inspect_disks(inst.name):
                self._gauge_device_rates('io', disk[0].device, disk[1],
                                         self._disk_rate_name,
                                         inst_info, inst_metrics,
                                         dims_customer, dims_operations)

            # Network utilization
            for vnic in insp.inspect_vnics(inst.name):
                self._gauge_device_rates('net', vnic[0].name, vnic[1],
                                         self._vnic_rate_name,
                                         inst_info, inst_metrics,
                                         dims_customer, dims_operations)

        # Save these metrics for the next collector invocation
        self._update_metric_cache(metric_cache)
|