monasca-agent/monasca_agent/collector/checks_d/libvirt.py

#!/bin/env python
# Copyright (c) 2014 Hewlett-Packard Development Company, L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""Monasca Agent interface for libvirt metrics"""
import os
import stat
import time
from calendar import timegm
from datetime import datetime

import yaml

from monasca_agent.collector.checks import AgentCheck
from monasca_agent.collector.virt import inspector


class LibvirtCheck(AgentCheck):
    """Inherit Agent class and gather libvirt metrics"""

    def __init__(self, name, init_config, agent_config):
        AgentCheck.__init__(self, name, init_config, agent_config)
        self.instance_cache_file = "{0}/{1}".format(self.init_config.get('cache_dir'),
                                                    'libvirt_instances.yaml')
        self.metric_cache_file = "{0}/{1}".format(self.init_config.get('cache_dir'),
                                                  'libvirt_metrics.yaml')
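    # Example (hypothetical path): with init_config cache_dir set to
    # /var/cache/monasca_agent, the two cache files above become
    # /var/cache/monasca_agent/libvirt_instances.yaml and
    # /var/cache/monasca_agent/libvirt_metrics.yaml.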
    def _test_vm_probation(self, created):
        """Test to see if a VM was created within the probation period.

        Convert an ISO-8601 timestamp into a UNIX epoch timestamp, subtract
        it from the current time, and compare the result against the
        configured vm_probation.  Return the number of seconds this VM will
        remain in probation.
        """
        dt = datetime.strptime(created, '%Y-%m-%dT%H:%M:%SZ')
        created_sec = (time.time() - timegm(dt.timetuple()))
        probation_time = self.init_config.get('vm_probation') - created_sec
        return int(probation_time)
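    # Worked example (with a hypothetical vm_probation of 300 seconds): a VM
    # created 120 seconds ago yields 300 - 120 = 180, so its metrics are
    # suppressed for roughly another 180 seconds; once the result goes
    # negative the VM is out of probation and is measured normally.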
    def _update_instance_cache(self):
        """Collect instance_id, project_id, and AZ for all instance UUIDs
        """
        from novaclient.v3 import client

        id_cache = {}
        # Get a list of all instances from the Nova API
        nova_client = client.Client(self.init_config.get('admin_user'),
                                    self.init_config.get('admin_password'),
                                    self.init_config.get('admin_tenant_name'),
                                    self.init_config.get('identity_uri'),
                                    service_type="compute")
        instances = nova_client.servers.list(search_opts={'all_tenants': 1})
        for instance in instances:
            inst_name = instance.__getattr__('OS-EXT-SRV-ATTR:instance_name')
            inst_az = instance.__getattr__('OS-EXT-AZ:availability_zone')
            id_cache[inst_name] = {'instance_uuid': instance.id,
                                   'hostname': instance.name,
                                   'zone': inst_az,
                                   'created': instance.created,
                                   'tenant_id': instance.tenant_id}
        id_cache['last_update'] = int(time.time())

        # Write the updated cache
        try:
            with open(self.instance_cache_file, 'w') as cache_yaml:
                yaml.safe_dump(id_cache, cache_yaml)
            if stat.S_IMODE(os.stat(self.instance_cache_file).st_mode) != 0600:
                os.chmod(self.instance_cache_file, 0600)
        except IOError as e:
            self.log.error("Cannot write to {0}: {1}".format(self.instance_cache_file, e))

        return id_cache
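    # The resulting libvirt_instances.yaml might look roughly like this
    # (instance names, UUIDs, and tenant IDs below are hypothetical):
    #
    #   instance-00000003:
    #     created: '2014-10-14T06:01:07Z'
    #     hostname: my-vm
    #     instance_uuid: 3f786f72-1d36-4c3f-a7b4-0f3a2b6d8e9c
    #     tenant_id: 5f2a3c4d6e7f8a9b0c1d2e3f4a5b6c7d
    #     zone: nova
    #   last_update: 1413266467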
    def _load_instance_cache(self):
        """Load the cache of instance names to IDs.

        If the cache does not yet exist, return an empty one.
        """
        instance_cache = {}
        try:
            with open(self.instance_cache_file, 'r') as cache_yaml:
                instance_cache = yaml.safe_load(cache_yaml)
            # Is it time to force a refresh of this data?
            if self.init_config.get('nova_refresh') is not None:
                time_diff = time.time() - instance_cache['last_update']
                if time_diff > self.init_config.get('nova_refresh'):
                    self._update_instance_cache()
        except IOError:
            # The file may not exist yet, and that's OK.  Build it now.
            instance_cache = self._update_instance_cache()

        return instance_cache
    def _load_metric_cache(self):
        """Load the counter metrics from the previous collection iteration
        """
        metric_cache = {}
        try:
            with open(self.metric_cache_file, 'r') as cache_yaml:
                metric_cache = yaml.safe_load(cache_yaml)
        except IOError:
            # The file may not exist yet.
            pass

        return metric_cache
    def _update_metric_cache(self, metric_cache):
        """Write the counter metrics cache for the next collection iteration
        """
        try:
            with open(self.metric_cache_file, 'w') as cache_yaml:
                yaml.safe_dump(metric_cache, cache_yaml)
            if stat.S_IMODE(os.stat(self.metric_cache_file).st_mode) != 0600:
                os.chmod(self.metric_cache_file, 0600)
        except IOError as e:
            self.log.error("Cannot write to {0}: {1}".format(self.metric_cache_file, e))
    def check(self, instance):
        """Gather VM metrics for each instance"""
        # Load the counter metric cache from the previous iteration
        metric_cache = self._load_metric_cache()

        # Load the Nova-obtained instance data cache
        instance_cache = self._load_instance_cache()

        # Build dimensions for both the customer and for operations
        dims_base = self._set_dimensions({'service': 'compute', 'component': 'vm'}, instance)

        insp = inspector.get_hypervisor_inspector()
        for inst in insp.inspect_instances():
            # Verify that this instance exists in the cache.  Add if necessary.
            if inst.name not in instance_cache:
                instance_cache = self._update_instance_cache()
            if inst.name not in metric_cache:
                metric_cache[inst.name] = {}

            # Skip instances created within the probation period
            vm_probation_remaining = self._test_vm_probation(instance_cache.get(inst.name)['created'])
            if vm_probation_remaining >= 0:
                self.log.info("Libvirt: {0} in probation for another {1} seconds".format(
                    instance_cache.get(inst.name)['hostname'],
                    vm_probation_remaining))
                continue

            # Build customer dimensions
            dims_customer = dims_base.copy()
            dims_customer['resource_id'] = instance_cache.get(inst.name)['instance_uuid']
            dims_customer['zone'] = instance_cache.get(inst.name)['zone']

            # Add dimensions that would be helpful for operations
            dims_operations = dims_customer.copy()
            dims_operations['tenant_id'] = instance_cache.get(inst.name)['tenant_id']
            dims_operations['cloud_tier'] = 'overcloud'
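            # At this point the two dimension sets differ only in the extra
            # operations-facing keys.  For a hypothetical VM they might be:
            #   dims_customer   = {'service': 'compute', 'component': 'vm',
            #                      'resource_id': <instance UUID>, 'zone': 'nova'}
            #   dims_operations = dims_customer plus 'tenant_id' and
            #                      'cloud_tier': 'overcloud'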
            # CPU utilization percentage
            sample_time = float("{:9f}".format(time.time()))
            if 'cpu.time' in metric_cache[inst.name]:
                # We have a prior value, so calculate the rate & push the metric
                cpu_diff = insp.inspect_cpus(inst.name).time - metric_cache[inst.name]['cpu.time']['value']
                time_diff = sample_time - float(metric_cache[inst.name]['cpu.time']['timestamp'])
                # cpu.time is in nanoseconds, so convert the elapsed wall-clock
                # time to nanoseconds as well, then express the ratio as a percentage
                rate = (cpu_diff / (time_diff * 1000000000)) * 100

                self.gauge('cpu.utilization_perc', int(round(rate, 0)),
                           dimensions=dims_customer,
                           delegated_tenant=instance_cache.get(inst.name)['tenant_id'],
                           hostname=instance_cache.get(inst.name)['hostname'])
                self.gauge('vm.cpu.utilization_perc', int(round(rate, 0)),
                           dimensions=dims_operations)

            metric_cache[inst.name]['cpu.time'] = {'timestamp': sample_time,
                                                   'value': insp.inspect_cpus(inst.name).time}
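            # Worked example for the CPU calculation above (hypothetical
            # numbers): if the guest consumed 15,000,000,000 ns of CPU time
            # since the previous sample and 30 seconds of wall-clock time
            # elapsed, the utilization is (15e9 / (30 * 1e9)) * 100 = 50%.
            # Because cpu.time accumulates across all vCPUs, values above
            # 100% are possible for multi-vCPU guests.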
            # Disk utilization
            for disk in insp.inspect_disks(inst.name):
                sample_time = int(time.time())
                disk_dimensions = {'device': disk[0].device}
                for metric in disk[1]._fields:
                    metric_name = "io.{0}".format(metric)
                    if metric_name not in metric_cache[inst.name]:
                        metric_cache[inst.name][metric_name] = {}

                    value = int(disk[1].__getattribute__(metric))
                    if disk[0].device in metric_cache[inst.name][metric_name]:
                        time_diff = sample_time - metric_cache[inst.name][metric_name][disk[0].device]['timestamp']
                        val_diff = value - metric_cache[inst.name][metric_name][disk[0].device]['value']
                        # Normalize the counter delta to a per-second rate
                        rate = float(val_diff) / time_diff
                        # Change the metric name to a rate, ie. "io.read_requests"
                        # gets converted to "io.read_ops_sec"
                        rate_name = "{0}_sec".format(metric_name.replace('requests', 'ops'))

                        # Customer
                        this_dimensions = disk_dimensions.copy()
                        this_dimensions.update(dims_customer)
                        self.gauge(rate_name, rate, dimensions=this_dimensions,
                                   delegated_tenant=instance_cache.get(inst.name)['tenant_id'],
                                   hostname=instance_cache.get(inst.name)['hostname'])

                        # Operations (metric name prefixed with "vm.")
                        this_dimensions = disk_dimensions.copy()
                        this_dimensions.update(dims_operations)
                        self.gauge("vm.{0}".format(rate_name), rate,
                                   dimensions=this_dimensions)

                    # Save this metric to the cache
                    metric_cache[inst.name][metric_name][disk[0].device] = {
                        'timestamp': sample_time,
                        'value': value}
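            # Worked example for the disk loop above (hypothetical numbers):
            # a read_requests counter that grew by 600 over a 30-second
            # interval is pushed as "io.read_ops_sec" = 600 / 30 = 20,
            # tagged with the block device name as a dimension.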
            # Network utilization
            for vnic in insp.inspect_vnics(inst.name):
                sample_time = int(time.time())
                vnic_dimensions = {'device': vnic[0].name}
                for metric in vnic[1]._fields:
                    metric_name = "net.{0}".format(metric)
                    if metric_name not in metric_cache[inst.name]:
                        metric_cache[inst.name][metric_name] = {}

                    value = int(vnic[1].__getattribute__(metric))
                    if vnic[0].name in metric_cache[inst.name][metric_name]:
                        time_diff = sample_time - metric_cache[inst.name][metric_name][vnic[0].name]['timestamp']
                        val_diff = value - metric_cache[inst.name][metric_name][vnic[0].name]['value']
                        # Normalize the counter delta to a per-second rate
                        rate = float(val_diff) / time_diff
                        # Change the metric name to a rate, ie. "net.rx_bytes"
                        # gets converted to "net.rx_bytes_sec"
                        rate_name = "{0}_sec".format(metric_name)
                        # Rename "tx" to "out" and "rx" to "in"
                        rate_name = rate_name.replace("tx", "out")
                        rate_name = rate_name.replace("rx", "in")

                        # Customer
                        this_dimensions = vnic_dimensions.copy()
                        this_dimensions.update(dims_customer)
                        self.gauge(rate_name, rate,
                                   dimensions=this_dimensions,
                                   delegated_tenant=instance_cache.get(inst.name)['tenant_id'],
                                   hostname=instance_cache.get(inst.name)['hostname'])

                        # Operations (metric name prefixed with "vm.")
                        this_dimensions = vnic_dimensions.copy()
                        this_dimensions.update(dims_operations)
                        self.gauge("vm.{0}".format(rate_name), rate,
                                   dimensions=this_dimensions)

                    # Save this metric to the cache
                    metric_cache[inst.name][metric_name][vnic[0].name] = {
                        'timestamp': sample_time,
                        'value': value}
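            # Worked example for the vNIC loop above (hypothetical numbers):
            # a rx_bytes counter that grew by 3,000,000 over a 30-second
            # interval is renamed to "net.in_bytes_sec" and pushed as
            # 3000000 / 30 = 100000 bytes per second.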
        # Save these metrics for the next collector invocation
        self._update_metric_cache(metric_cache)
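
# A minimal libvirt.yaml plugin configuration exercising this check might
# look like the following; the keys match what the code above reads from
# init_config, but the values shown are illustrative, not defaults:
#
#   init_config:
#     cache_dir: /var/cache/monasca_agent
#     nova_refresh: 14400      # seconds between full Nova cache rebuilds
#     vm_probation: 300        # seconds to ignore newly created VMs
#     admin_user: admin
#     admin_password: password
#     admin_tenant_name: admin
#     identity_uri: 'http://keystone.example.com:35357/v2.0'
#   instances:
#     - name: libvirt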