monasca-agent/monagent/collector/checks_d/libvirt.py

#!/usr/bin/env python
# Copyright (c) 2014 Hewlett-Packard Development Company, L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""Monasca Agent interface for libvirt metrics"""
import os
import stat
import time
import yaml
from calendar import timegm
from datetime import datetime
from monagent.collector.virt import inspector
from monagent.collector.checks import AgentCheck
class LibvirtCheck(AgentCheck):
"""Inherit Agent class and gather libvirt metrics"""
def __init__(self, name, init_config, agent_config):
AgentCheck.__init__(self, name, init_config, agent_config)
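        # Cache files live under the agent's cache_dir and persist state between collection runs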
self.instance_cache_file = "{}/{}".format(self.init_config.get('cache_dir'),
'libvirt_instances.yaml')
self.metric_cache_file = "{}/{}".format(self.init_config.get('cache_dir'),
'libvirt_metrics.yaml')
def _test_vm_probation(self, created):
"""Test to see if a VM was created within the probation period.
        Convert the ISO-8601 creation timestamp to seconds elapsed since
        creation, compare that against the configured vm_probation, and
        return the number of seconds this VM will remain in probation.
"""
dt = datetime.strptime(created, '%Y-%m-%dT%H:%M:%SZ')
created_sec = (time.time() - timegm(dt.timetuple()))
probation_time = self.init_config.get('vm_probation') - created_sec
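        # A negative result means the probation period has already expired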
return int(probation_time)
def _update_instance_cache(self):
"""Collect instance_id, project_id, and AZ for all instance UUIDs
"""
from novaclient.v3 import client
id_cache = {}
# Get a list of all instances from the Nova API
nova_client = client.Client(self.init_config.get('admin_user'),
self.init_config.get('admin_password'),
self.init_config.get('admin_tenant_name'),
self.init_config.get('identity_uri'),
service_type="compute")
instances = nova_client.servers.list(search_opts={'all_tenants': 1})
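        # Key the cache by the hypervisor-side instance name so entries can be matched against libvirt domains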
for instance in instances:
            inst_name = getattr(instance, 'OS-EXT-SRV-ATTR:instance_name')
            inst_az = getattr(instance, 'OS-EXT-AZ:availability_zone')
id_cache[inst_name] = {'instance_uuid': instance.id,
'hostname': instance.name,
'zone': inst_az,
'created': instance.created,
'tenant_id': instance.tenant_id}
id_cache['last_update'] = int(time.time())
# Write the updated cache
try:
with open(self.instance_cache_file, 'w') as cache_yaml:
yaml.safe_dump(id_cache, cache_yaml)
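            # Restrict the cache file to owner read/write only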
if stat.S_IMODE(os.stat(self.instance_cache_file).st_mode) != 0600:
os.chmod(self.instance_cache_file, 0600)
except IOError as e:
self.log.error("Cannot write to {}: {}".format(self.instance_cache_file, e))
return id_cache
def _load_instance_cache(self):
"""Load the cache if instance names to IDs.
If the cache does not yet exist, return an empty one.
"""
instance_cache = {}
try:
with open(self.instance_cache_file, 'r') as cache_yaml:
instance_cache = yaml.safe_load(cache_yaml)
# Is it time to force a refresh of this data?
if self.init_config.get('nova_refresh') is not None:
time_diff = time.time() - instance_cache['last_update']
if time_diff > self.init_config.get('nova_refresh'):
                    instance_cache = self._update_instance_cache()
except IOError:
# The file may not exist yet, and that's OK. Build it now.
instance_cache = self._update_instance_cache()
return instance_cache
def _load_metric_cache(self):
"""Load the counter metrics from the previous collection iteration
"""
metric_cache = {}
try:
with open(self.metric_cache_file, 'r') as cache_yaml:
metric_cache = yaml.safe_load(cache_yaml)
except IOError:
# The file may not exist yet.
pass
return metric_cache
def _update_metric_cache(self, metric_cache):
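        """Write the counter metric cache to disk for use on the next collection pass."""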
try:
with open(self.metric_cache_file, 'w') as cache_yaml:
yaml.safe_dump(metric_cache, cache_yaml)
if stat.S_IMODE(os.stat(self.metric_cache_file).st_mode) != 0600:
os.chmod(self.metric_cache_file, 0600)
except IOError as e:
self.log.error("Cannot write to {}: {}".format(self.metric_cache_file, e))
def check(self, instance):
"""Gather VM metrics for each instance"""
# Load metric cache
metric_cache = self._load_metric_cache()
# Load the nova-obtained instance data cache
instance_cache = self._load_instance_cache()
# Build dimensions for both the customer and for operations
dims_base = {'service': 'compute', 'component': 'vm'}
insp = inspector.get_hypervisor_inspector()
for inst in insp.inspect_instances():
# Verify that this instance exists in the cache. Add if necessary.
if inst.name not in instance_cache:
instance_cache = self._update_instance_cache()
if inst.name not in metric_cache:
metric_cache[inst.name] = {}
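            # metric_cache holds the previous counter samples used to compute deltas and rates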
# Skip instances created within the probation period
vm_probation_remaining = self._test_vm_probation(instance_cache.get(inst.name)['created'])
if (vm_probation_remaining >= 0):
self.log.info("Libvirt: {} in probation for another {} seconds".format(instance_cache.get(inst.name)['hostname'],
vm_probation_remaining))
continue
# Build customer dimensions
dims_customer = dims_base.copy()
dims_customer['resource_id'] = instance_cache.get(inst.name)['instance_uuid']
dims_customer['zone'] = instance_cache.get(inst.name)['zone']
# Add dimensions that would be helpful for operations
dims_operations = dims_customer.copy()
dims_operations['tenant_id'] = instance_cache.get(inst.name)['tenant_id']
dims_operations['cloud_tier'] = 'overcloud'
# CPU utilization percentage
sample_time = float("{:9f}".format(time.time()))
if 'cpu.time' in metric_cache[inst.name]:
# I have a prior value, so calculate the rate & push the metric
cpu_diff = insp.inspect_cpus(inst.name).time - metric_cache[inst.name]['cpu.time']['value']
time_diff = sample_time - float(metric_cache[inst.name]['cpu.time']['timestamp'])
# Convert time_diff to nanoseconds, and calculate percentage
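                # cpu.time is cumulative CPU time in nanoseconds, so dividing the
                # delta by the elapsed wall-clock time in nanoseconds gives utilization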
rate = (cpu_diff / (time_diff * 1000000000)) * 100
self.gauge('cpu.utilization_perc', int(round(rate, 0)),
dimensions=dims_customer,
delegated_tenant=instance_cache.get(inst.name)['tenant_id'],
hostname=instance_cache.get(inst.name)['hostname'])
self.gauge('vm.cpu.utilization_perc', int(round(rate, 0)),
dimensions=dims_operations)
metric_cache[inst.name]['cpu.time'] = {'timestamp': sample_time,
'value': insp.inspect_cpus(inst.name).time}
# Disk utilization
for disk in insp.inspect_disks(inst.name):
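                # Each disk entry is a (device info, stats) pair of cumulative I/O counters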
sample_time = int(time.time())
disk_dimensions = {'device': disk[0].device}
for metric in disk[1]._fields:
metric_name = "io.{}".format(metric)
if metric_name not in metric_cache[inst.name]:
metric_cache[inst.name][metric_name] = {}
                    value = int(getattr(disk[1], metric))
if disk[0].device in metric_cache[inst.name][metric_name]:
time_diff = sample_time - metric_cache[inst.name][metric_name][disk[0].device]['timestamp']
val_diff = value - metric_cache[inst.name][metric_name][disk[0].device]['value']
                        # Convert the metric name to a rate, e.g. "io.read_requests"
                        # becomes "io.read_ops_sec"
rate_name = "{}_sec".format(metric_name.replace('requests', 'ops'))
# Customer
this_dimensions = disk_dimensions.copy()
this_dimensions.update(dims_customer)
self.gauge(rate_name, val_diff, dimensions=this_dimensions,
delegated_tenant=instance_cache.get(inst.name)['tenant_id'],
hostname=instance_cache.get(inst.name)['hostname'])
                        # Operations (metric name prefixed with "vm.")
this_dimensions = disk_dimensions.copy()
this_dimensions.update(dims_operations)
self.gauge("vm.{}".format(rate_name), val_diff,
dimensions=this_dimensions)
# Save this metric to the cache
metric_cache[inst.name][metric_name][disk[0].device] = {
'timestamp': sample_time,
'value': value}
# Network utilization
for vnic in insp.inspect_vnics(inst.name):
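                # Each vnic entry is an (interface, stats) pair of cumulative rx/tx counters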
sample_time = int(time.time())
vnic_dimensions = {'device': vnic[0].name}
for metric in vnic[1]._fields:
metric_name = "net.{}".format(metric)
if metric_name not in metric_cache[inst.name]:
metric_cache[inst.name][metric_name] = {}
                    value = int(getattr(vnic[1], metric))
if vnic[0].name in metric_cache[inst.name][metric_name]:
time_diff = sample_time - metric_cache[inst.name][metric_name][vnic[0].name]['timestamp']
val_diff = value - metric_cache[inst.name][metric_name][vnic[0].name]['value']
                        # Convert the metric name to a rate, e.g. "net.rx_bytes"
                        # becomes "net.rx_bytes_sec"
rate_name = "{}_sec".format(metric_name)
# Rename "tx" to "out" and "rx" to "in"
rate_name = rate_name.replace("tx", "out")
rate_name = rate_name.replace("rx", "in")
# Customer
this_dimensions = vnic_dimensions.copy()
this_dimensions.update(dims_customer)
self.gauge(rate_name, val_diff,
dimensions=this_dimensions,
delegated_tenant=instance_cache.get(inst.name)['tenant_id'],
hostname=instance_cache.get(inst.name)['hostname'])
                        # Operations (metric name prefixed with "vm.")
this_dimensions = vnic_dimensions.copy()
this_dimensions.update(dims_operations)
self.gauge("vm.{}".format(rate_name), val_diff,
dimensions=this_dimensions)
# Save this metric to the cache
metric_cache[inst.name][metric_name][vnic[0].name] = {
'timestamp': sample_time,
'value': value}
# Save these metrics for the next collector invocation
self._update_metric_cache(metric_cache)