From b692d3a6a61fbbe4c3db8c14a7ae5423559b5144 Mon Sep 17 00:00:00 2001 From: Mehdi Abaakouk Date: Tue, 15 Nov 2016 11:24:46 +0100 Subject: [PATCH] Don't poll nova with compute agent This change introduces a new method to get instances metadata on the ceilometer-compute-agent. This switches devstack/gate to libvirt_metadata for gnocchi Change-Id: Ice1918659be49589a45d7a406044adc0a187aa27 --- ceilometer/compute/discovery.py | 171 +++++++++++++++++- ceilometer/compute/virt/libvirt/inspector.py | 55 +----- ceilometer/compute/virt/libvirt/utils.py | 104 +++++++++++ ceilometer/opts.py | 4 +- .../tests/unit/compute/test_discovery.py | 106 +++++++++++ .../compute/virt/libvirt/test_inspector.py | 3 + devstack/plugin.sh | 1 + .../less-nova-polling-ac56687da3f8b1a3.yaml | 22 +++ requirements.txt | 1 + 9 files changed, 413 insertions(+), 54 deletions(-) create mode 100644 ceilometer/compute/virt/libvirt/utils.py create mode 100644 releasenotes/notes/less-nova-polling-ac56687da3f8b1a3.yaml diff --git a/ceilometer/compute/discovery.py b/ceilometer/compute/discovery.py index 50980a1b31..41f6a7eb5f 100644 --- a/ceilometer/compute/discovery.py +++ b/ceilometer/compute/discovery.py @@ -13,17 +13,44 @@ # License for the specific language governing permissions and limitations # under the License. +import hashlib +from lxml import etree +import operator + +import cachetools +from novaclient import exceptions from oslo_config import cfg +from oslo_log import log from oslo_utils import timeutils + +try: + import libvirt +except ImportError: + libvirt = None + from ceilometer.agent import plugin_base +from ceilometer.compute.virt.libvirt import utils as libvirt_utils from ceilometer import nova_client OPTS = [ cfg.BoolOpt('workload_partitioning', default=False, + deprecated_for_removal=True, help='Enable work-load partitioning, allowing multiple ' - 'compute agents to be run simultaneously.'), + 'compute agents to be run simultaneously. 
' + '(replaced by instance_discovery_method)'), + cfg.StrOpt('instance_discovery_method', + default='naive', + choices=['naive', 'workload_partitioning', 'libvirt_metadata'], + help="Ceilometer offers many methods to discover the instance" + " running on a compute node: \n" + "* naive: poll nova to get all instances\n" + "* workload_partitioning: poll nova to get instances of " + "the compute\n" + "* libvirt_metadata: get instances from libvirt metadata " + "but without instance metadata (recommended for Gnocchi " + "backend)"), cfg.IntOpt('resource_update_interval', default=0, min=0, @@ -34,20 +61,154 @@ OPTS = [ "the instance list to poll will be updated based " "on this option's interval. Measurements relating " "to the instances will match intervals " - "defined in pipeline.") + "defined in pipeline. "), ] +LOG = log.getLogger(__name__) + + +class NovaLikeServer(object): + def __init__(self, **kwargs): + for k, v in kwargs.items(): + setattr(self, k, v) + + def __repr__(self): + return '<NovaLikeServer: %s>' % getattr(self, 'name', 'unknown-name') + class InstanceDiscovery(plugin_base.DiscoveryBase): + method = None + def __init__(self, conf): super(InstanceDiscovery, self).__init__(conf) + if not self.method: + self.method = conf.compute.instance_discovery_method + + # For backward compatibility + if self.method == "naive" and conf.compute.workload_partitioning: + self.method = "workload_partitioning" + self.nova_cli = nova_client.Client(conf) - self.last_run = None - self.instances = {} self.expiration_time = conf.compute.resource_update_interval + if self.method == "libvirt_metadata": + self._connection = None + # 4096 instances on a compute should be enough :) + self._flavor_cache = cachetools.LRUCache(4096) + else: + self.instances = {} + self.last_run = None + + @property + def connection(self): + if not self._connection: + self._connection = libvirt_utils.get_libvirt_connection(self.conf) + return self._connection def discover(self, manager, param=None): """Discover 
resources to monitor.""" + if self.method != "libvirt_metadata": + return self.discover_nova_polling(manager, param=param) + else: + return self.discover_libvirt_polling(manager, param=param) + + @staticmethod + def _safe_find_int(xml, path): + elem = xml.find("./%s" % path) + if elem is not None: + return int(elem.text) + return 0 + + @cachetools.cachedmethod(operator.attrgetter('_flavor_cache')) + def get_flavor_id(self, name): + try: + return self.nova_cli.nova_client.flavors.find(name=name).id + except exceptions.NotFound: + return None + + @libvirt_utils.retry_on_disconnect + def discover_libvirt_polling(self, manager, param=None): + instances = [] + for domain in self.connection.listAllDomains(): + full_xml = etree.fromstring(domain.XMLDesc()) + os_type_xml = full_xml.find("./os/type") + + xml_string = domain.metadata( + libvirt.VIR_DOMAIN_METADATA_ELEMENT, + "http://openstack.org/xmlns/libvirt/nova/1.0") + metadata_xml = etree.fromstring(xml_string) + + # TODO(sileht): We don't have the flavor ID here, so the Gnocchi + # resource update will fail for compute sample (or put None ?) + # We currently poll nova to get the flavor ID, but storing the + # flavor_id doesn't make any sense because the flavor description + # can change over time, we should store the detail of the + # flavor. 
this is why nova doesn't put the id in the libvirt + # metadata + + # This implements + flavor_xml = metadata_xml.find("./flavor") + flavor = { + "id": self.get_flavor_id(flavor_xml.attrib["name"]), + "name": flavor_xml.attrib["name"], + "vcpus": self._safe_find_int(flavor_xml, "vcpus"), + "ram": self._safe_find_int(flavor_xml, "memory"), + "disk": self._safe_find_int(flavor_xml, "disk"), + "ephemeral": self._safe_find_int(flavor_xml, "ephemeral"), + "swap": self._safe_find_int(flavor_xml, "swap"), + } + dom_state = domain.state()[0] + vm_state = libvirt_utils.LIBVIRT_POWER_STATE.get(dom_state) + status = libvirt_utils.LIBVIRT_STATUS.get(dom_state) + + user_id = metadata_xml.find("./owner/user").attrib["uuid"] + project_id = metadata_xml.find("./owner/project").attrib["uuid"] + + # From: + # https://github.com/openstack/nova/blob/852f40fd0c6e9d8878212ff3120556668023f1c4/nova/api/openstack/compute/views/servers.py#L214-L220 + host_id = hashlib.sha224( + (project_id + self.conf.host).encode('utf-8')).hexdigest() + + # The image description is partial, but Gnocchi only care about the + # id, so we are fine + image_xml = metadata_xml.find("./root[@type='image']") + image = ({'id': image_xml.attrib['uuid']} + if image_xml is not None else None) + + instance_data = { + "id": domain.UUIDString(), + "name": metadata_xml.find("./name").text, + "flavor": flavor, + "image": image, + "os_type": os_type_xml.text, + "architecture": os_type_xml.attrib["arch"], + + "OS-EXT-SRV-ATTR:instance_name": domain.name(), + "OS-EXT-SRV-ATTR:host": self.conf.host, + "OS-EXT-STS:vm_state": vm_state, + + "tenant_id": project_id, + "user_id": user_id, + + "hostId": host_id, + "status": status, + + # NOTE(sileht): Other fields that Ceilometer tracks + # where we can't get the value here, but they are + # retrieved by notification + "metadata": {}, + # "OS-EXT-STS:task_state" + # 'reservation_id', + # 'OS-EXT-AZ:availability_zone', + # 'kernel_id', + # 'ramdisk_id', + # some image detail + } + + 
LOG.debug("instance data: %s", instance_data) + instances.append(NovaLikeServer(**instance_data)) + return instances + + def discover_nova_polling(self, manager, param=None): secs_from_last_update = 0 if self.last_run: secs_from_last_update = timeutils.delta_seconds( @@ -80,7 +241,7 @@ class InstanceDiscovery(plugin_base.DiscoveryBase): @property def group_id(self): - if self.conf.compute.workload_partitioning: + if self.method == "workload_partitioning": return self.conf.host else: return None diff --git a/ceilometer/compute/virt/libvirt/inspector.py b/ceilometer/compute/virt/libvirt/inspector.py index 6b5230c06c..633c21b16c 100644 --- a/ceilometer/compute/virt/libvirt/inspector.py +++ b/ceilometer/compute/virt/libvirt/inspector.py @@ -15,55 +15,25 @@ """Implementation of Inspector abstraction for libvirt.""" from lxml import etree -from oslo_config import cfg from oslo_log import log as logging from oslo_utils import units import six +try: + import libvirt +except ImportError: + libvirt = None + from ceilometer.compute.pollsters import util from ceilometer.compute.virt import inspector as virt_inspector +from ceilometer.compute.virt.libvirt import utils as libvirt_utils from ceilometer.i18n import _LW, _LE, _ -libvirt = None - LOG = logging.getLogger(__name__) -OPTS = [ - cfg.StrOpt('libvirt_type', - default='kvm', - choices=['kvm', 'lxc', 'qemu', 'uml', 'xen'], - help='Libvirt domain type.'), - cfg.StrOpt('libvirt_uri', - default='', - help='Override the default libvirt URI ' - '(which is dependent on libvirt_type).'), -] - - -def retry_on_disconnect(function): - def decorator(self, *args, **kwargs): - try: - return function(self, *args, **kwargs) - except ImportError: - # NOTE(sileht): in case of libvirt failed to be imported - raise - except libvirt.libvirtError as e: - if (e.get_error_code() in (libvirt.VIR_ERR_SYSTEM_ERROR, - libvirt.VIR_ERR_INTERNAL_ERROR) and - e.get_error_domain() in (libvirt.VIR_FROM_REMOTE, - libvirt.VIR_FROM_RPC)): - 
LOG.debug('Connection to libvirt broken') - self.connection = None - return function(self, *args, **kwargs) - else: - raise - return decorator - class LibvirtInspector(virt_inspector.Inspector): - per_type_uris = dict(uml='uml:///system', xen='xen:///', lxc='lxc:///') - def __init__(self, conf): super(LibvirtInspector, self).__init__(conf) self._connection = None @@ -71,19 +41,10 @@ class LibvirtInspector(virt_inspector.Inspector): @property def connection(self): if not self._connection: - global libvirt - if libvirt is None: - libvirt = __import__('libvirt') - - uri = (self.conf.libvirt_uri or - self.per_type_uris.get(self.conf.libvirt_type, - 'qemu:///system')) - LOG.debug('Connecting to libvirt: %s', uri) - self._connection = libvirt.openReadOnly(uri) - + self._connection = libvirt_utils.get_libvirt_connection(self.conf) return self._connection - @retry_on_disconnect + @libvirt_utils.retry_on_disconnect def _lookup_by_uuid(self, instance): instance_name = util.instance_name(instance) try: diff --git a/ceilometer/compute/virt/libvirt/utils.py b/ceilometer/compute/virt/libvirt/utils.py new file mode 100644 index 0000000000..914a6de0ba --- /dev/null +++ b/ceilometer/compute/virt/libvirt/utils.py @@ -0,0 +1,104 @@ +# +# Copyright 2016 Red Hat, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +from oslo_config import cfg +from oslo_log import log as logging + +try: + import libvirt +except ImportError: + libvirt = None + +LOG = logging.getLogger(__name__) + +OPTS = [ + cfg.StrOpt('libvirt_type', + default='kvm', + choices=['kvm', 'lxc', 'qemu', 'uml', 'xen'], + help='Libvirt domain type.'), + cfg.StrOpt('libvirt_uri', + default='', + help='Override the default libvirt URI ' + '(which is dependent on libvirt_type).'), +] + +LIBVIRT_PER_TYPE_URIS = dict(uml='uml:///system', xen='xen:///', lxc='lxc:///') + + +# We don't use the libvirt constants in case libvirt is not available +VIR_DOMAIN_NOSTATE = 0 +VIR_DOMAIN_RUNNING = 1 +VIR_DOMAIN_BLOCKED = 2 +VIR_DOMAIN_PAUSED = 3 +VIR_DOMAIN_SHUTDOWN = 4 +VIR_DOMAIN_SHUTOFF = 5 +VIR_DOMAIN_CRASHED = 6 +VIR_DOMAIN_PMSUSPENDED = 7 + +# Stolen from nova +LIBVIRT_POWER_STATE = { + VIR_DOMAIN_NOSTATE: 'pending', + VIR_DOMAIN_RUNNING: 'running', + VIR_DOMAIN_BLOCKED: 'running', + VIR_DOMAIN_PAUSED: 'paused', + VIR_DOMAIN_SHUTDOWN: 'shutdown', + VIR_DOMAIN_SHUTOFF: 'shutdown', + VIR_DOMAIN_CRASHED: 'crashed', + VIR_DOMAIN_PMSUSPENDED: 'suspended', +} + +# NOTE(sileht): This is a guess of the nova +# status, should be true 99.9% of the time, +# but can be wrong during some transition state +# like shelving/rescuing +LIBVIRT_STATUS = { + VIR_DOMAIN_NOSTATE: 'building', + VIR_DOMAIN_RUNNING: 'active', + VIR_DOMAIN_BLOCKED: 'active', + VIR_DOMAIN_PAUSED: 'paused', + VIR_DOMAIN_SHUTDOWN: 'stopped', + VIR_DOMAIN_SHUTOFF: 'stopped', + VIR_DOMAIN_CRASHED: 'error', + VIR_DOMAIN_PMSUSPENDED: 'suspended', +} + + +def get_libvirt_connection(conf): + if not libvirt: + raise ImportError("python-libvirt module is missing") + uri = (conf.libvirt_uri or LIBVIRT_PER_TYPE_URIS.get(conf.libvirt_type, + 'qemu:///system')) + LOG.debug('Connecting to libvirt: %s', uri) + return libvirt.openReadOnly(uri) + + +def retry_on_disconnect(function): + def decorator(self, *args, **kwargs): + try: + return function(self, *args, **kwargs) + 
except ImportError: + # NOTE(sileht): in case libvirt failed to be imported + raise + except libvirt.libvirtError as e: + if (e.get_error_code() in (libvirt.VIR_ERR_SYSTEM_ERROR, + libvirt.VIR_ERR_INTERNAL_ERROR) and + e.get_error_domain() in (libvirt.VIR_FROM_REMOTE, + libvirt.VIR_FROM_RPC)): + LOG.debug('Connection to libvirt broken') + self._connection = None + return function(self, *args, **kwargs) + else: + raise + return decorator diff --git a/ceilometer/opts.py b/ceilometer/opts.py index 371820faee..824bad5fee 100644 --- a/ceilometer/opts.py +++ b/ceilometer/opts.py @@ -23,7 +23,7 @@ import ceilometer.api.controllers.v2.root import ceilometer.collector import ceilometer.compute.discovery import ceilometer.compute.virt.inspector -import ceilometer.compute.virt.libvirt.inspector +import ceilometer.compute.virt.libvirt.utils import ceilometer.compute.virt.vmware.inspector import ceilometer.compute.virt.xenapi.inspector import ceilometer.coordination @@ -79,7 +79,7 @@ def list_opts(): itertools.chain(ceilometer.agent.manager.OPTS, ceilometer.api.app.OPTS, ceilometer.compute.virt.inspector.OPTS, - ceilometer.compute.virt.libvirt.inspector.OPTS, + ceilometer.compute.virt.libvirt.utils.OPTS, ceilometer.dispatcher.OPTS, ceilometer.ipmi.notifications.ironic.OPTS, ceilometer.middleware.OPTS, diff --git a/ceilometer/tests/unit/compute/test_discovery.py b/ceilometer/tests/unit/compute/test_discovery.py index d6619892b5..42009ef9ba 100644 --- a/ceilometer/tests/unit/compute/test_discovery.py +++ b/ceilometer/tests/unit/compute/test_discovery.py @@ -18,9 +18,69 @@ from oslo_config import fixture as fixture_config from oslotest import mockpatch from ceilometer.compute import discovery +from ceilometer.compute.pollsters import util +from ceilometer.compute.virt.libvirt import utils import ceilometer.tests.base as base +LIBVIRT_METADATA_XML = """ + + + test.dom.com + 2016-11-16 07:35:06 + + 512 + 1 + 0 + 0 + 1 + + + admin + admin + + +""" + +LIBVIRT_DESC_XML = """ + + 
instance-00000001 + a75c2fa5-6c03-45a8-bbf7-b993cfcdec27 + + hvm + /opt/stack/data/nova/instances/a75c2fa5-6c03-45a8-bbf7-b993cfcdec27/kernel + /opt/stack/data/nova/instances/a75c2fa5-6c03-45a8-bbf7-b993cfcdec27/ramdisk + root=/dev/vda console=tty0 console=ttyS0 + + + + +""" + + +class FakeDomain(object): + def state(self): + return [1, 2] + + def name(self): + return "instance-00000001" + + def UUIDString(self): + return "a75c2fa5-6c03-45a8-bbf7-b993cfcdec27" + + def XMLDesc(self): + return LIBVIRT_DESC_XML + + def metadata(self, flags, url): + return LIBVIRT_METADATA_XML + + +class FakeConn(object): + def listAllDomains(self): + return [FakeDomain()] + + class TestDiscovery(base.BaseTestCase): def setUp(self): @@ -32,6 +92,8 @@ class TestDiscovery(base.BaseTestCase): self.instance.name) setattr(self.instance, 'OS-EXT-STS:vm_state', 'active') + # FIXME(sileht): This is wrong, this should be a uuid + # The internal id of nova can't be retrieved via API or notification self.instance.id = 1 self.instance.flavor = {'name': 'm1.small', 'id': 2, 'vcpus': 1, 'ram': 512, 'disk': 20, 'ephemeral': 0} @@ -97,3 +159,47 @@ class TestDiscovery(base.BaseTestCase): self.assertEqual(1, list(resources)[0].id) self.client.instance_get_all_by_host.assert_called_once_with( self.CONF.host, "2016-01-01T00:00:00+00:00") + + @mock.patch.object(utils, "libvirt") + @mock.patch.object(discovery, "libvirt") + def test_discovery_with_libvirt(self, libvirt, libvirt2): + self.CONF.set_override("instance_discovery_method", + "libvirt_metadata", + group="compute") + libvirt.VIR_DOMAIN_METADATA_ELEMENT = 2 + libvirt2.openReadOnly.return_value = FakeConn() + dsc = discovery.InstanceDiscovery(self.CONF) + resources = dsc.discover(mock.MagicMock()) + + self.assertEqual(1, len(resources)) + r = list(resources)[0] + s = util.make_sample_from_instance(self.CONF, r, "metric", "delta", + "carrot", 1) + self.assertEqual("a75c2fa5-6c03-45a8-bbf7-b993cfcdec27", + s.resource_id) + 
self.assertEqual("d99c829753f64057bc0f2030da309943", + s.project_id) + self.assertEqual("a1f4684e58bd4c88aefd2ecb0783b497", + s.user_id) + + metadata = s.resource_metadata + self.assertEqual(1, metadata["vcpus"]) + self.assertEqual(512, metadata["memory_mb"]) + self.assertEqual(1, metadata["disk_gb"]) + self.assertEqual(0, metadata["ephemeral_gb"]) + self.assertEqual(1, metadata["root_gb"]) + self.assertEqual("bdaf114a-35e9-4163-accd-226d5944bf11", + metadata["image_ref"]) + self.assertEqual("test.dom.com", metadata["display_name"]) + self.assertEqual("instance-00000001", metadata["name"]) + self.assertEqual("a75c2fa5-6c03-45a8-bbf7-b993cfcdec27", + metadata["instance_id"]) + self.assertEqual("m1.tiny", metadata["instance_type"]) + self.assertEqual( + "4d0bc931ea7f0513da2efd9acb4cf3a273c64b7bcc544e15c070e662", + metadata["host"]) + self.assertEqual(self.CONF.host, metadata["instance_host"]) + self.assertEqual("active", metadata["status"]) + self.assertEqual("running", metadata["state"]) + self.assertEqual("hvm", metadata["os_type"]) + self.assertEqual("x86_64", metadata["architecture"]) diff --git a/ceilometer/tests/unit/compute/virt/libvirt/test_inspector.py b/ceilometer/tests/unit/compute/virt/libvirt/test_inspector.py index b922e5ee5b..fc8fd2d6b2 100644 --- a/ceilometer/tests/unit/compute/virt/libvirt/test_inspector.py +++ b/ceilometer/tests/unit/compute/virt/libvirt/test_inspector.py @@ -27,6 +27,7 @@ from oslotest import base from ceilometer.compute.virt import inspector as virt_inspector from ceilometer.compute.virt.libvirt import inspector as libvirt_inspector +from ceilometer.compute.virt.libvirt import utils class TestLibvirtInspection(base.BaseTestCase): @@ -46,6 +47,7 @@ class TestLibvirtInspection(base.BaseTestCase): libvirt_inspector.libvirt = mock.Mock() libvirt_inspector.libvirt.VIR_DOMAIN_SHUTOFF = 5 libvirt_inspector.libvirt.libvirtError = self.fakeLibvirtError + utils.libvirt = libvirt_inspector.libvirt self.domain = mock.Mock() 
self.addCleanup(mock.patch.stopall) @@ -469,6 +471,7 @@ class TestLibvirtInspectionWithError(base.BaseTestCase): mock.MagicMock(side_effect=Exception('dummy')))) libvirt_inspector.libvirt = mock.Mock() libvirt_inspector.libvirt.libvirtError = self.fakeLibvirtError + utils.libvirt = libvirt_inspector.libvirt def test_inspect_unknown_error(self): self.assertRaises(virt_inspector.InspectorException, diff --git a/devstack/plugin.sh b/devstack/plugin.sh index bd0ceaede4..33fc3ce515 100644 --- a/devstack/plugin.sh +++ b/devstack/plugin.sh @@ -257,6 +257,7 @@ function _ceilometer_configure_storage_backend { iniset $CEILOMETER_CONF database event_connection mongodb://localhost:27017/ceilometer iniset $CEILOMETER_CONF database metering_connection mongodb://localhost:27017/ceilometer elif [ "$CEILOMETER_BACKEND" = 'gnocchi' ] ; then + iniset $CEILOMETER_CONF compute instance_discovery_method libvirt_metadata iniset $CEILOMETER_CONF DEFAULT meter_dispatchers gnocchi iniset $CEILOMETER_CONF DEFAULT event_dispatchers gnocchi # NOTE(gordc): set higher retry in case gnocchi is started after ceilometer on a slow machine diff --git a/releasenotes/notes/less-nova-polling-ac56687da3f8b1a3.yaml b/releasenotes/notes/less-nova-polling-ac56687da3f8b1a3.yaml new file mode 100644 index 0000000000..577af52a7f --- /dev/null +++ b/releasenotes/notes/less-nova-polling-ac56687da3f8b1a3.yaml @@ -0,0 +1,22 @@ +--- +features: + - The Ceilometer compute agent can now retrieve some instance metadata from + the metadata libvirt API instead of polling the Nova API. Since Mitaka, + Nova fills this metadata with some information about the instance. + To enable this feature you should set [compute]/instance_discovery_method = + libvirt_metadata in the configuration file. + + The only downside of this method is that user_metadata (and some other + instance attributes) are no longer part of the samples created by the + agent. 
But when Gnocchi is used as backend, this is not an issue since + Gnocchi doesn't store resource metadata alongside the measurements. And the + missing information is still retrieved through the Nova notifications + and will fully update the resource information in Gnocchi. +upgrade: + - If you are using Gnocchi as backend, it's strongly + recommended to switch [compute]/instance_discovery_method to + libvirt_metadata. This will reduce the load on the Nova API + especially if you have many compute nodes. +deprecations: + - The [compute]/workload_partitioning = True is deprecated in favor + of [compute]/instance_discovery_method = workload_partitioning diff --git a/requirements.txt b/requirements.txt index 3eb832eeb9..21c8c6363d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ # of appearance. Changing the order has an impact on the overall integration # process, which may cause wedges in the gate later. +cachetools>=1.1.0 # MIT License cotyledon>=1.3.0 #Apache-2.0 futures>=3.0;python_version=='2.7' or python_version=='2.6' # BSD futurist>=0.11.0 # Apache-2.0