diff --git a/prometheus-openstack-exporter/Dockerfile.ubuntu_xenial b/prometheus-openstack-exporter/Dockerfile.ubuntu_xenial new file mode 100644 index 00000000..be6deaa8 --- /dev/null +++ b/prometheus-openstack-exporter/Dockerfile.ubuntu_xenial @@ -0,0 +1,15 @@ +ARG FROM=docker.io/ubuntu:xenial +FROM ${FROM} + +RUN apt-get -y update \ + && apt-get -y install curl python-dateutil python-requests python-simplejson python-yaml python-prometheus-client\ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN mkdir /usr/local/bin/exporter +COPY exporter /usr/local/bin/exporter +RUN chmod +x /usr/local/bin/exporter/main.py + +EXPOSE 9103 + +CMD ["/usr/local/bin/exporter/main.py"] diff --git a/prometheus-openstack-exporter/build.sh b/prometheus-openstack-exporter/build.sh new file mode 100644 index 00000000..dbf95a3a --- /dev/null +++ b/prometheus-openstack-exporter/build.sh @@ -0,0 +1,14 @@ +#!/bin/bash +SCRIPT=`realpath $0` +SCRIPT_DIR=`dirname ${SCRIPT}` +## Only build from main folder +cd ${SCRIPT_DIR}/.. + +IMAGE="prometheus-openstack-exporter" +VERSION=${VERSION:-latest} +DISTRO=${DISTRO:-ubuntu_xenial} +REGISTRY_URI=${REGISTRY_URI:-"openstackhelm/"} +EXTRA_TAG_INFO=${EXTRA_TAG_INFO:-""} +docker build -f ${IMAGE}/Dockerfile.${DISTRO} --network=host -t ${REGISTRY_URI}${IMAGE}:${VERSION}-${DISTRO}${EXTRA_TAG_INFO} ${extra_build_args} ${IMAGE} + +cd - diff --git a/prometheus-openstack-exporter/exporter/__init__.py b/prometheus-openstack-exporter/exporter/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/prometheus-openstack-exporter/exporter/base.py b/prometheus-openstack-exporter/exporter/base.py new file mode 100644 index 00000000..c85e06bc --- /dev/null +++ b/prometheus-openstack-exporter/exporter/base.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python +# Copyright 2017 The Openstack-Helm Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import re + + +class OSBase(object): + FAIL = 0 + OK = 1 + UNKNOWN = 2 + GAUGE_NAME_FORMAT = "openstack_{}" + + def __init__(self, oscache, osclient): + self.oscache = oscache + self.osclient = osclient + self.oscache.cache_me(self) + + def get_cache_data(self): + return self.oscache.get_cache_data(self.get_cache_key()) + + def build_cache_data(self): + """ build a hash to store in cache """ + raise NotImplementedError("Must be implemented by the subclass!") + + def get_cache_key(self): + """ cache key """ + raise NotImplementedError("Must be implemented by the subclass!") + + def get_stats(self): + """ build stats for prometheus exporter """ + raise NotImplementedError("Must be implemented by the subclass!") + + def gauge_name_sanitize(self, input): + if input.startswith("openstack_"): + return re.sub(r'[^a-zA-Z0-9:_]', '_', input) + else: + return self.GAUGE_NAME_FORMAT.format( + re.sub(r'[^a-zA-Z0-9:_]', '_', input)) diff --git a/prometheus-openstack-exporter/exporter/check_os_api.py b/prometheus-openstack-exporter/exporter/check_os_api.py new file mode 100644 index 00000000..9778432e --- /dev/null +++ b/prometheus-openstack-exporter/exporter/check_os_api.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python +# Copyright 2017 The Openstack-Helm Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
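# Illustrative only, not part of this change: a hypothetical OSBase subclass,
# sketched to show the collector contract defined in base.py above. OSCache
# calls build_cache_data() on its polling interval and stores the result under
# get_cache_key(); get_stats() runs on every /metrics scrape and renders the
# cached dicts with prometheus_client (the real collectors below import
# CollectorRegistry, Gauge and generate_latest for this).
#
#   class ExampleStats(OSBase):
#       def get_cache_key(self):
#           return "example_stats"
#
#       def build_cache_data(self):
#           return [{'stat_name': 'example_up', 'stat_value': 1}]
#
#       def get_stats(self):
#           registry = CollectorRegistry()
#           gauge = Gauge(self.gauge_name_sanitize('example_up'),
#                         'Example statistic', ['region'], registry=registry)
#           for item in self.get_cache_data():
#               gauge.labels(self.osclient.region).set(item['stat_value'])
#           return generate_latest(registry)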
+ +from base import OSBase + +from urlparse import urlparse +from prometheus_client import CollectorRegistry, generate_latest, Gauge +import logging +logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s:%(levelname)s:%(message)s") +logger = logging.getLogger(__name__) + + +class CheckOSApi(OSBase): + """Class to check the status of OpenStack API services.""" + + CHECK_MAP = { + 'keystone': {'path': '/', 'expect': [300], 'name': 'keystone-public-api'}, + 'heat': {'path': '/', 'expect': [300], 'name': 'heat-api'}, + 'heat-cfn': {'path': '/', 'expect': [300], 'name': 'heat-cfn-api'}, + 'glance': {'path': '/', 'expect': [300], 'name': 'glance-api'}, + 'cinder': {'path': '/', 'expect': [200, 300], 'name': 'cinder-api'}, + 'cinderv2': { + 'path': '/', 'expect': [200, 300], 'name': 'cinder-v2-api'}, + 'neutron': {'path': '/', 'expect': [200], 'name': 'neutron-api'}, + 'nova': {'path': '/', 'expect': [200], 'name': 'nova-api'}, + 'ceilometer': { + 'path': 'v2/capabilities', 'expect': [200], 'auth': True, + 'name': 'ceilometer-api'}, + 'swift': {'path': '/', 'expect': [200], 'name': 'ceph'}, + 'swift_s3': { + 'path': 'healthcheck', 'expect': [200], 'name': 'swift-s3-api'}, + 'murano': {'path': '/', 'expect': [200, 300], 'name': 'murano-api'}, + 'trove': {'path': '/', 'expect': [200, 300], 'name': 'trove-api'}, + 'mistral': {'path': '/', 'expect': [200, 300], 'name': 'mistral-api'}, + 'designate': {'path': '/', 'expect': [200, 300], 'name': 'designate-api'}, + 'contrail_analytics': {'path': '/', 'expect': [200], 'name': 'contrail-analytics-api'}, + 'contrail_config': {'path': '/', 'expect': [200], 'name': 'contrail-config-api'}, + 'congress': {'path': '/', 'expect': [200], 'name': 'congress-api'}, + 'placement': {'path': '/', 'expect': [401], 'name': 'placement-api'}, + 'shipyard': {'path': 'v1.0/health', 'expect': [204, 503], 'name': 'shipyard'}, + 'armada': {'path': 'v1.0/health', 'expect': [204, 503], 'name': 'armada'}, + 'deckhand': {'path': 'v1.0/health', 'expect': [204, 503], 'name': 'deckhand'}, + 'drydock': {'path': 'v1.0/health', 'expect': [204, 503], 'name': 'drydock'}, + 'promenade': {'path': 'v1.0/health', 'expect': [204, 503], 'name': 'promenade'}, + } + + def _service_url(self, endpoint, path): + url = urlparse(endpoint) + u = '%s://%s' % (url.scheme, url.netloc) + if path != '/': + u = '%s/%s' % (u, path) + return u + + def build_cache_data(self): + """ Check the status of all the API services. + + Yields a list of dict items with 'service', 'status' (either OK, + FAIL or UNKNOWN) and 'region' keys. 
+ """ + check_array = [] + catalog = self.osclient.service_catalog + + for service in catalog: + name = service['name'] + url = None + status_code = 500 + if name not in self.CHECK_MAP: + logger.info( + "No check found for service '%s', creating one" % name) + self.CHECK_MAP[name] = { + 'path': '/', + 'expect': [200, 300, 302, 401, 404], + 'name': name, + } + check = self.CHECK_MAP[name] + url = self._service_url(service['url'], check['path']) + r = self.osclient.raw_get( + url, token_required=check.get( + 'auth', False)) + + if r is not None: + status_code = r.status_code + + if r is None or status_code not in check['expect']: + logger.info( + "Service %s check failed " + "(returned '%s' but expected '%s')" % ( + name, status_code, check['expect']) + ) + status = self.FAIL + else: + status = self.OK + + check_array.append({ + 'service': name, + 'status': status, + 'url': url, + 'status_code': status_code, + 'region': self.osclient.region, + }) + return check_array + + def get_cache_key(self): + return "check_os_api" + + def get_stats(self): + registry = CollectorRegistry() + labels = ['region', 'url', 'service'] + check_api_data_cache = self.get_cache_data() + for check_api_data in check_api_data_cache: + label_values = [ + check_api_data['region'], + check_api_data['url'], + check_api_data['service']] + gauge_name = self.gauge_name_sanitize( + "check_{}_api".format(check_api_data['service'])) + check_gauge = Gauge( + gauge_name, + 'Openstack API check. fail = 0, ok = 1 and unknown = 2', + labels, + registry=registry) + check_gauge.labels(*label_values).set(check_api_data['status']) + return generate_latest(registry) diff --git a/prometheus-openstack-exporter/exporter/cinder_services.py b/prometheus-openstack-exporter/exporter/cinder_services.py new file mode 100644 index 00000000..ff94f45d --- /dev/null +++ b/prometheus-openstack-exporter/exporter/cinder_services.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python +# Copyright 2017 The Openstack-Helm Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from base import OSBase +from collections import Counter +from collections import defaultdict +from prometheus_client import CollectorRegistry, generate_latest, Gauge +import logging +logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s:%(levelname)s:%(message)s") +logger = logging.getLogger(__name__) + + +class CinderServiceStats(OSBase): + """ Class to report the statistics on Cinder services.
+ + state of workers broken down by state + """ + + def build_cache_data(self): + + aggregated_workers = defaultdict(Counter) + + stats = self.osclient.get_workers('cinder') + for worker in stats: + service = worker['service'] + state = worker['state'] + aggregated_workers[service][state] += 1 + + for service in aggregated_workers: + totalw = sum(aggregated_workers[service].values()) + for state in self.osclient.states: + prct = (100.0 * aggregated_workers[service][state]) / totalw + stats.append({ + 'stat_name': "services_{}_{}_percent".format(service, state), + 'stat_value': prct, + 'state': state, + 'service': service + }) + stats.append({ + 'stat_name': "services_{}_{}_total".format(service, state), + 'stat_value': aggregated_workers[service][state], + 'state': state, + 'service': service + }) + + return stats + + def get_cache_key(self): + return "cinder_services_stats" + + def get_stats(self): + registry = CollectorRegistry() + labels = ['region', 'host', 'service', 'state'] + cinder_services_stats_cache = self.get_cache_data() + for cinder_services_stat in cinder_services_stats_cache: + stat_gauge = Gauge( + self.gauge_name_sanitize( + cinder_services_stat['stat_name']), + 'Openstack Cinder Service statistic', + labels, + registry=registry) + label_values = [self.osclient.region, + cinder_services_stat.get('host', ''), + cinder_services_stat.get('service', ''), + cinder_services_stat.get('state', '')] + stat_gauge.labels( + * + label_values).set( + cinder_services_stat['stat_value']) + return generate_latest(registry) diff --git a/prometheus-openstack-exporter/exporter/hypervisor_stats.py b/prometheus-openstack-exporter/exporter/hypervisor_stats.py new file mode 100644 index 00000000..df5e92c9 --- /dev/null +++ b/prometheus-openstack-exporter/exporter/hypervisor_stats.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python +# Copyright 2017 The Openstack-Helm Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
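# Illustrative only, with made-up hosts: CinderServiceStats above (and the
# Nova/Neutron collectors later in this patch) aggregate OSClient.get_workers()
# output into per-state totals and percentages. Given two cinder-volume
# workers (other keys omitted),
#
#   [{'host': 'node1', 'service': 'cinder-volume', 'state': 'up'},
#    {'host': 'node2', 'service': 'cinder-volume', 'state': 'down'}]
#
# build_cache_data() appends, for each state in {'up', 'down', 'disabled'}:
#
#   services_cinder-volume_up_percent = 50.0        services_cinder-volume_up_total = 1
#   services_cinder-volume_down_percent = 50.0      services_cinder-volume_down_total = 1
#   services_cinder-volume_disabled_percent = 0.0   services_cinder-volume_disabled_total = 0
#
# and gauge_name_sanitize() later turns these into gauges such as
# openstack_services_cinder_volume_up_percent.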
+ +from base import OSBase + +from prometheus_client import CollectorRegistry, generate_latest, Gauge +import logging +logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s:%(levelname)s:%(message)s") +logger = logging.getLogger(__name__) + + +class HypervisorStats(OSBase): + """ Class to report the statistics on Nova hypervisors.""" + VALUE_MAP = { + 'current_workload': 'running_tasks', + 'running_vms': 'running_instances', + 'local_gb_used': 'used_disk_GB', + 'free_disk_gb': 'free_disk_GB', + 'memory_mb_used': 'used_ram_MB', + 'free_ram_mb': 'free_ram_MB', + 'vcpus_used': 'used_vcpus', + } + + def __init__( + self, + oscache, + osclient, + cpu_overcommit_ratio, + ram_overcommit_ratio): + super(HypervisorStats, self).__init__(oscache, osclient) + self.cpu_overcommit_ratio = cpu_overcommit_ratio + self.ram_overcommit_ratio = ram_overcommit_ratio + + def build_cache_data(self): + cache_stats = [] + nova_aggregates = {} + r = self.osclient.get('nova', 'os-aggregates') + if not r: + logger.warning("Could not get nova aggregates") + else: + aggregates_list = r.json().get('aggregates', []) + for agg in aggregates_list: + nova_aggregates[agg['name']] = { + 'id': agg['id'], + 'hosts': [h.split('.')[0] for h in agg['hosts']], + 'metrics': {'free_vcpus': 0}, + } + nova_aggregates[agg['name']]['metrics'].update( + {v: 0 for v in self.VALUE_MAP.values()} + ) + + r = self.osclient.get('nova', 'os-hypervisors/detail') + if not r: + logger.warning("Could not get hypervisor statistics") + return + + total_stats = {v: 0 for v in self.VALUE_MAP.values()} + total_stats['free_vcpus'] = 0 + hypervisor_stats = r.json().get('hypervisors', []) + for stats in hypervisor_stats: + host = stats['hypervisor_hostname'] + for k, v in self.VALUE_MAP.iteritems(): + m_val = stats.get(k, 0) + cache_stats.append({ + 'stat_name': v, + 'stat_value': m_val, + 'host': host, + }) + total_stats[v] += m_val + for agg in nova_aggregates.keys(): + agg_hosts = nova_aggregates[agg]['hosts'] + if host in agg_hosts: + nova_aggregates[agg]['metrics'][v] += m_val + m_vcpus = stats.get('vcpus', 0) + m_vcpus_used = stats.get('vcpus_used', 0) + free = (int(self.cpu_overcommit_ratio * m_vcpus)) - m_vcpus_used + cache_stats.append({ + 'stat_name': 'free_vcpus', + 'stat_value': free, + 'host': host, + }) + total_stats['free_vcpus'] += free + for agg in nova_aggregates.keys(): + agg_hosts = nova_aggregates[agg]['hosts'] + if host in agg_hosts: + free = ((int(self.cpu_overcommit_ratio * + m_vcpus)) - + m_vcpus_used) + nova_aggregates[agg]['metrics']['free_vcpus'] += free + + # Dispatch the aggregate metrics + for agg in nova_aggregates.keys(): + agg_id = nova_aggregates[agg]['id'] + agg_total_free_ram = ( + nova_aggregates[agg]['metrics']['free_ram_MB'] + + nova_aggregates[agg]['metrics']['used_ram_MB'] + ) + if agg_total_free_ram > 0: + nova_aggregates[agg]['metrics']['free_ram_percent'] = round( + (100.0 * nova_aggregates[agg]['metrics']['free_ram_MB']) / + agg_total_free_ram, + 2) + for k, v in nova_aggregates[agg]['metrics'].iteritems(): + cache_stats.append({ + 'stat_name': 'aggregate_{}'.format(k), + 'stat_value': v, + 'aggregate': agg, + 'aggregate_id': agg_id, + }) + # Dispatch the global metrics + for k, v in total_stats.iteritems(): + cache_stats.append({ + 'stat_name': 'total_{}'.format(k), + 'stat_value': v, + }) + + return cache_stats + + def get_cache_key(self): + return "hypervisor_stats" + + def get_stats(self): + registry = CollectorRegistry() + labels = ['region', 'host', 'aggregate', 'aggregate_id'] +
hypervisor_stats_cache = self.get_cache_data() + for hypervisor_stat in hypervisor_stats_cache: + stat_gauge = Gauge( + self.gauge_name_sanitize( + hypervisor_stat['stat_name']), + 'Openstack Hypervisor statistic', + labels, + registry=registry) + label_values = [self.osclient.region, + hypervisor_stat.get('host', ''), + hypervisor_stat.get('aggregate', ''), + hypervisor_stat.get('aggregate_id', '')] + stat_gauge.labels(*label_values).set(hypervisor_stat['stat_value']) + return generate_latest(registry) diff --git a/prometheus-openstack-exporter/exporter/main.py b/prometheus-openstack-exporter/exporter/main.py new file mode 100644 index 00000000..51990850 --- /dev/null +++ b/prometheus-openstack-exporter/exporter/main.py @@ -0,0 +1,162 @@ +#!/usr/bin/env python +# Copyright 2017 The Openstack-Helm Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import yaml +import os +import urlparse +from BaseHTTPServer import BaseHTTPRequestHandler +from BaseHTTPServer import HTTPServer +from SocketServer import ForkingMixIn +from prometheus_client import CONTENT_TYPE_LATEST + +from osclient import OSClient +from oscache import OSCache +from check_os_api import CheckOSApi +from neutron_agents import NeutronAgentStats +from nova_services import NovaServiceStats +from cinder_services import CinderServiceStats +from hypervisor_stats import HypervisorStats + +import logging +logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s:%(levelname)s:%(message)s") +logger = logging.getLogger(__name__) + +collectors = [] + + +class ForkingHTTPServer(ForkingMixIn, HTTPServer): + pass + + +class OpenstackExporterHandler(BaseHTTPRequestHandler): + def __init__(self, *args, **kwargs): + BaseHTTPRequestHandler.__init__(self, *args, **kwargs) + + def do_GET(self): + url = urlparse.urlparse(self.path) + if url.path == '/metrics': + output = '' + for collector in collectors: + try: + stats = collector.get_stats() + if stats is not None: + output = output + stats + except BaseException: + logger.warning( + "Could not get stats for collector {}".format( + collector.get_cache_key())) + self.send_response(200) + self.send_header('Content-Type', CONTENT_TYPE_LATEST) + self.end_headers() + self.wfile.write(output) + elif url.path == '/': + self.send_response(200) + self.end_headers() + self.wfile.write(""" + OpenStack Exporter + +

+ <h1>OpenStack Exporter</h1>
+ <p>Visit <code>/metrics</code> to use.</p>
+ + """) + else: + self.send_response(404) + self.end_headers() + + +def handler(*args, **kwargs): + OpenstackExporterHandler(*args, **kwargs) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + usage=__doc__, + description='Prometheus OpenStack exporter', + formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('--config-file', nargs='?', + help='Configuration file path', + type=argparse.FileType('r'), + required=False) + args = parser.parse_args() + config = {} + if args.config_file: + config = yaml.safe_load(args.config_file.read()) + + os_keystone_url = config.get('OS_AUTH_URL', os.getenv('OS_AUTH_URL')) + os_password = config.get('OS_PASSWORD', os.getenv('OS_PASSWORD')) + os_tenant_name = config.get( + 'OS_PROJECT_NAME', + os.getenv('OS_PROJECT_NAME')) + os_username = config.get('OS_USERNAME', os.getenv('OS_USERNAME')) + os_user_domain = config.get( + 'OS_USER_DOMAIN_NAME', + os.getenv('OS_USER_DOMAIN_NAME')) + os_region = config.get('OS_REGION_NAME', os.getenv('OS_REGION_NAME')) + os_timeout = config.get( + 'TIMEOUT_SECONDS', int( + os.getenv( + 'TIMEOUT_SECONDS', 10))) + os_polling_interval = config.get( + 'OS_POLLING_INTERVAL', int( + os.getenv( + 'OS_POLLING_INTERVAL', 900))) + os_retries = config.get('OS_RETRIES', int(os.getenv('OS_RETRIES', 1))) + os_cpu_overcomit_ratio = config.get( + 'OS_CPU_OC_RATIO', float( + os.getenv( + 'OS_CPU_OC_RATIO', 1))) + os_ram_overcomit_ratio = config.get( + 'OS_RAM_OC_RATIO', float( + os.getenv( + 'OS_RAM_OC_RATIO', 1))) + + osclient = OSClient( + os_keystone_url, + os_password, + os_tenant_name, + os_username, + os_user_domain, + os_region, + os_timeout, + os_retries) + oscache = OSCache(os_polling_interval, os_region) + collectors.append(oscache) + + check_os_api = CheckOSApi(oscache, osclient) + collectors.append(check_os_api) + neutron_agent_stats = NeutronAgentStats(oscache, osclient) + collectors.append(neutron_agent_stats) + cinder_service_stats = CinderServiceStats(oscache, osclient) + collectors.append(cinder_service_stats) + nova_service_stats = NovaServiceStats(oscache, osclient) + collectors.append(nova_service_stats) + hypervisor_stats = HypervisorStats( + oscache, + osclient, + os_cpu_overcomit_ratio, + os_ram_overcomit_ratio) + collectors.append(hypervisor_stats) + + oscache.start() + + listen_port = config.get( + 'LISTEN_PORT', int( + os.getenv( + 'LISTEN_PORT', 9103))) + server = ForkingHTTPServer(('', listen_port), handler) + server.serve_forever() diff --git a/prometheus-openstack-exporter/exporter/neutron_agents.py b/prometheus-openstack-exporter/exporter/neutron_agents.py new file mode 100644 index 00000000..934b141d --- /dev/null +++ b/prometheus-openstack-exporter/exporter/neutron_agents.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python +# Copyright 2017 The Openstack-Helm Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
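# Illustrative only: main.py above resolves every setting from the optional
# --config-file first and falls back to the environment, e.g.
# config.get('OS_AUTH_URL', os.getenv('OS_AUTH_URL')). A hypothetical config
# file covering the keys it reads (values are placeholders) could look like:
#
#   OS_AUTH_URL: http://keystone.example.com:5000/v3
#   OS_USERNAME: monitoring
#   OS_PASSWORD: secret
#   OS_PROJECT_NAME: admin
#   OS_USER_DOMAIN_NAME: default
#   OS_REGION_NAME: RegionOne
#   TIMEOUT_SECONDS: 10
#   OS_POLLING_INTERVAL: 900
#   OS_RETRIES: 1
#   OS_CPU_OC_RATIO: 1.5
#   OS_RAM_OC_RATIO: 1.0
#   LISTEN_PORT: 9103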
+ +from base import OSBase +from collections import Counter +from collections import defaultdict +from prometheus_client import CollectorRegistry, generate_latest, Gauge +import logging +logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s:%(levelname)s:%(message)s") +logger = logging.getLogger(__name__) + + +class NeutronAgentStats(OSBase): + """ Class to report the statistics on Neutron agents. + + state of agents + """ + + def build_cache_data(self): + + # Get information of the state per agent + # State can be up or down + aggregated_agents = defaultdict(Counter) + stats = self.osclient.get_workers('neutron') + + for agent in stats: + service = agent['service'] + state = agent['state'] + aggregated_agents[service][state] += 1 + + for service in aggregated_agents: + totala = sum(aggregated_agents[service].values()) + for state in self.osclient.states: + prct = (100.0 * aggregated_agents[service][state]) / totala + stats.append({ + 'stat_name': "services_{}_{}_percent".format(service, state), + 'stat_value': prct, + 'service': service, + 'state': state + }) + stats.append({ + 'stat_name': "services_{}_{}_total".format(service, state), + 'stat_value': aggregated_agents[service][state], + 'service': service, + 'state': state, + }) + return stats + + def get_cache_key(self): + return "neutron_agent_stats" + + def get_stats(self): + registry = CollectorRegistry() + labels = ['region', 'host', 'service', 'state'] + neutron_agent_stats_cache = self.get_cache_data() + for neutron_agent_stat in neutron_agent_stats_cache: + stat_gauge = Gauge( + self.gauge_name_sanitize( + neutron_agent_stat['stat_name']), + 'Openstack Neutron agent statistic', + labels, + registry=registry) + label_values = [self.osclient.region, + neutron_agent_stat.get('host', ''), + neutron_agent_stat.get('service', ''), + neutron_agent_stat.get('state', '')] + stat_gauge.labels( + * + label_values).set( + neutron_agent_stat['stat_value']) + return generate_latest(registry) diff --git a/prometheus-openstack-exporter/exporter/nova_services.py b/prometheus-openstack-exporter/exporter/nova_services.py new file mode 100644 index 00000000..dc8672c9 --- /dev/null +++ b/prometheus-openstack-exporter/exporter/nova_services.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python +# Copyright 2017 The Openstack-Helm Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from base import OSBase +from collections import Counter +from collections import defaultdict +from prometheus_client import CollectorRegistry, generate_latest, Gauge +import logging +logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s:%(levelname)s:%(message)s") +logger = logging.getLogger(__name__) + + +class NovaServiceStats(OSBase): + """ Class to report the statistics on Nova services. 
+ + status per service broken down by state + """ + + def build_cache_data(self): + # Get information of the state per service + # State can be: 'up', 'down' or 'disabled' + aggregated_workers = defaultdict(Counter) + stats = self.osclient.get_workers('nova') + for worker in stats: + service = worker['service'] + state = worker['state'] + aggregated_workers[service][state] += 1 + + for service in aggregated_workers: + total = sum(aggregated_workers[service].values()) + for state in self.osclient.states: + prct = 0 + if total > 0: + prct = (100.0 * aggregated_workers[service][state]) / total + + stats.append({ + 'stat_name': "services_{}_{}_percent".format(service, state), + 'stat_value': prct, + 'state': state, + 'service': service + }) + stats.append({ + 'stat_name': "services_{}_{}_total".format(service, state), + 'stat_value': aggregated_workers[service][state], + 'state': state, + 'service': service + }) + return stats + + def get_cache_key(self): + return "nova_services_stats" + + def get_stats(self): + registry = CollectorRegistry() + labels = ['region', 'host', 'service', 'state'] + services_stats_cache = self.get_cache_data() + for services_stat in services_stats_cache: + stat_gauge = Gauge( + self.gauge_name_sanitize( + services_stat['stat_name']), + 'Openstack Nova Service statistic', + labels, + registry=registry) + label_values = [self.osclient.region, + services_stat.get('host', ''), + services_stat.get('service', ''), + services_stat.get('state', '')] + stat_gauge.labels(*label_values).set(services_stat['stat_value']) + return generate_latest(registry) diff --git a/prometheus-openstack-exporter/exporter/oscache.py b/prometheus-openstack-exporter/exporter/oscache.py new file mode 100644 index 00000000..50f6ffde --- /dev/null +++ b/prometheus-openstack-exporter/exporter/oscache.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python +# Copyright 2017 The Openstack-Helm Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from threading import Thread +from threading import Lock +from prometheus_client import CollectorRegistry, generate_latest, Gauge +from time import sleep, time +import logging +logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s:%(levelname)s:%(message)s") +logger = logging.getLogger(__name__) + + +class ThreadSafeDict(dict): + def __init__(self, * p_arg, ** n_arg): + dict.__init__(self, * p_arg, ** n_arg) + self._lock = Lock() + + def __enter__(self): + self._lock.acquire() + return self + + def __exit__(self, type, value, traceback): + self._lock.release() + + +class OSCache(Thread): + + def __init__(self, refresh_interval, region): + Thread.__init__(self) + self.daemon = True + self.duration = 0 + self.refresh_interval = refresh_interval + self.cache = ThreadSafeDict() + self.region = region + self.osclients = [] + + def cache_me(self, osclient): + self.osclients.append(osclient) + logger.debug("new osclient added to cache") + + def run(self): + while True: + start_time = time() + for osclient in self.osclients: + try: + self.cache[osclient.get_cache_key( + )] = osclient.build_cache_data() + except Exception as e: + logger.error(str(e)) + logger.error( + "failed to get data for cache key {}".format( + osclient.get_cache_key())) + self.duration = time() - start_time + sleep(self.refresh_interval) + + def get_cache_data(self, key): + if key in self.cache: + return self.cache[key] + else: + return [] + + def get_stats(self): + registry = CollectorRegistry() + labels = ['region'] + label_values = [self.region] + duration = Gauge('openstack_exporter_cache_refresh_duration_seconds', + 'Cache refresh duration in seconds.', + labels, registry=registry) + duration.labels(*label_values).set(self.duration) + return generate_latest(registry) diff --git a/prometheus-openstack-exporter/exporter/osclient.py b/prometheus-openstack-exporter/exporter/osclient.py new file mode 100644 index 00000000..c99ff355 --- /dev/null +++ b/prometheus-openstack-exporter/exporter/osclient.py @@ -0,0 +1,285 @@ +#!/usr/bin/env python +# Copyright 2017 The Openstack-Helm Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +import dateutil.parser +import dateutil.tz +import requests +import simplejson as json +import logging +logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s:%(levelname)s:%(message)s") +logger = logging.getLogger(__name__) + + +class KeystoneException(Exception): + pass + + +class OSClient(object): + """ Base class for querying the OpenStack API endpoints. + + It uses the Keystone service catalog to discover the API endpoints. 
+ """ + EXPIRATION_TOKEN_DELTA = datetime.timedelta(0, 30) + states = {'up': 1, 'down': 0, 'disabled': 2} + + def __init__( + self, + keystone_url, + password, + tenant_name, + username, + user_domain, + region, + timeout, + retries): + self.keystone_url = keystone_url + self.password = password + self.tenant_name = tenant_name + self.username = username + self.user_domain = user_domain + self.region = region + self.timeout = timeout + self.retries = retries + self.token = None + self.valid_until = None + self.session = requests.Session() + self.session.mount( + 'http://', requests.adapters.HTTPAdapter(max_retries=retries)) + self.session.mount( + 'https://', requests.adapters.HTTPAdapter(max_retries=retries)) + self._service_catalog = [] + + def is_valid_token(self): + now = datetime.datetime.now(tz=dateutil.tz.tzutc()) + return self.token is not None and self.valid_until is not None and self.valid_until > now + + def clear_token(self): + self.token = None + self.valid_until = None + + def get_token(self): + self.clear_token() + data = json.dumps({ + "auth": { + "identity": { + "methods": ["password"], + "password": { + "user": { + "name": self.username, + "domain": {"id": self.user_domain}, + "password": self.password + } + } + }, + "scope": { + "project": { + "name": self.tenant_name, + "domain": {"id": self.user_domain} + } + } + } + }) + logger.info("Trying to get token from '%s'" % self.keystone_url) + r = self.make_request('post', + '%s/auth/tokens' % self.keystone_url, data=data, + token_required=False) + if not r: + raise KeystoneException( + "Cannot get a valid token from {}".format( + self.keystone_url)) + + if r.status_code < 200 or r.status_code > 299: + raise KeystoneException( + "{} responded with code {}".format( + self.keystone_url, + r.status_code)) + + data = r.json() + self.token = r.headers.get("X-Subject-Token") + self.tenant_id = data['token']['project']['id'] + self.valid_until = dateutil.parser.parse( + data['token']['expires_at']) - self.EXPIRATION_TOKEN_DELTA + self._service_catalog = [] + for item in data['token']['catalog']: + internalURL = None + publicURL = None + adminURL = None + for endpoint in item['endpoints']: + if endpoint['region'] == self.region or self.region is None: + if endpoint['interface'] == 'internal': + internalURL = endpoint['url'] + elif endpoint['interface'] == 'public': + publicURL = endpoint['url'] + elif endpoint['interface'] == 'admin': + adminURL = endpoint['url'] + + if internalURL is None and publicURL is None: + logger.warning( + "Service '{}' skipped because no URL can be found".format( + item['name'])) + continue + self._service_catalog.append({ + 'name': item['name'], + 'region': self.region, + 'service_type': item['type'], + 'url': internalURL if internalURL is not None else publicURL, + 'admin_url': adminURL, + }) + + logger.debug("Got token '%s'" % self.token) + return self.token + + @property + def service_catalog(self): + if not self._service_catalog: + self.get_token() + return self._service_catalog + + @service_catalog.setter + def service_catalog(self, service_catalog): + self._service_catalog = service_catalog + + def get_service(self, service_name): + return next((x for x in self._service_catalog + if x['name'] == service_name), None) + + def raw_get(self, url, token_required=False): + return self.make_request('get', url, + token_required=token_required) + + def make_request(self, verb, url, data=None, token_required=True, + params=None): + kwargs = { + 'url': url, + 'timeout': self.timeout, + 'headers': {'Content-type':
'application/json'} + } + if token_required and not self.is_valid_token(): + self.get_token() + if not self.is_valid_token(): + logger.error("Aborting request, no valid token") + return + if token_required: + kwargs['headers']['X-Auth-Token'] = self.token + + if data is not None: + kwargs['data'] = data + + if params is not None: + kwargs['params'] = params + + func = getattr(self.session, verb.lower()) + + try: + r = func(**kwargs) + except Exception as e: + logger.error("Got exception for '%s': '%s'" % + (kwargs['url'], e)) + return + + logger.info("%s responded with status code %d" % + (kwargs['url'], r.status_code)) + + return r + + def get(self, service, resource, params=None): + url = self._build_url(service, resource) + if not url: + return + logger.info('GET({}) {}'.format(url, params)) + return self.make_request('get', url, params=params) + + def _build_url(self, service, resource): + s = (self.get_service(service) or {}) + url = s.get('url') + # v3 API must be used in order to obtain tenants in multi-domain envs + if service == 'keystone' and (resource in ['projects', + 'users', 'roles']): + url = url.replace('v2.0', 'v3') + + if url: + if url[-1] != '/': + url += '/' + url = "%s%s" % (url, resource) + else: + logger.error("Service '%s' not found in catalog" % service) + return url + + def get_workers(self, service): + """ Return the list of workers and their state + + Here is an example of returned dictionnary: + { + 'host': 'node.example.com', + 'service': 'nova-compute', + 'state': 'up' + } + + where 'state' can be 'up', 'down' or 'disabled' + """ + worker_metrics = [] + if service == 'neutron': + endpoint = 'v2.0/agents' + entry = 'agents' + else: + endpoint = 'os-services' + entry = 'services' + + ost_services_r = self.get(service, endpoint) + + msg = "Cannot get state of {} workers".format(service) + if ost_services_r is None: + logger.warning(msg) + elif ost_services_r.status_code != 200: + msg = "{}: Got {} ({})".format( + msg, ost_services_r.status_code, ost_services_r.content) + logger.warning(msg) + else: + try: + r_json = ost_services_r.json() + except ValueError: + r_json = {} + + if entry not in r_json: + msg = "{}: couldn't find '{}' key".format(msg, entry) + logger.warning(msg) + else: + for val in r_json[entry]: + data = {'host': val['host'], 'service': val['binary']} + + if service == 'neutron': + if not val['admin_state_up']: + data['state'] = 'disabled' + else: + data['state'] = 'up' if val['alive'] else 'down' + else: + if val['status'] == 'disabled': + data['state'] = 'disabled' + elif val['state'] == 'up' or val['state'] == 'down': + data['state'] = val['state'] + else: + data['state'] = 'unknown' + msg = "Unknown state for {} workers:{}".format( + service, val['state']) + logger.warning(msg) + continue + data['stat_value'] = self.states[data['state']] + data['stat_name'] = "services_{}_{}".format( + service, val['binary']) + worker_metrics.append(data) + return worker_metrics diff --git a/zuul.d/prometheus-openstack-exporter.yaml b/zuul.d/prometheus-openstack-exporter.yaml new file mode 100644 index 00000000..8f533f43 --- /dev/null +++ b/zuul.d/prometheus-openstack-exporter.yaml @@ -0,0 +1,57 @@ +--- +# Copyright 2019 The Openstack-Helm Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +- project: + check: + jobs: + - openstack-helm-images-build-prometheus-openstack-exporter + gate: + jobs: + - openstack-helm-images-upload-prometheus-openstack-exporter + promote: + jobs: + - openstack-helm-images-promote-prometheus-openstack-exporter + periodic: + jobs: + - openstack-helm-images-build-prometheus-openstack-exporter + +- job: + name: openstack-helm-images-build-prometheus-openstack-exporter + parent: openstack-helm-images-build + description: Build Prometheus OpenStack exporter image + vars: &prometheus-openstack-exporter_vars + docker_images: + - context: prometheus-openstack-exporter + repository: openstackhelm/prometheus-openstack-exporter + dockerfile: Dockerfile.ubuntu_xenial + tags: + - latest-ubuntu_xenial + files: &prometheus-openstack-exporter_files + - prometheus-openstack-exporter/.* + - zuul.d/prometheus-openstack-exporter.yaml + +- job: + name: openstack-helm-images-upload-prometheus-openstack-exporter + parent: openstack-helm-images-upload + description: Build and upload Prometheus OpenStack exporter image + vars: *prometheus-openstack-exporter_vars + files: *prometheus-openstack-exporter_files + +- job: + name: openstack-helm-images-promote-prometheus-openstack-exporter + parent: openstack-helm-images-promote + description: Promote a previously published Prometheus OpenStack exporter image. + vars: *prometheus-openstack-exporter_vars + files: *prometheus-openstack-exporter_files
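For local verification, the sketch below (an assumption, not part of the patch) scrapes a running exporter container; the image tag follows the build.sh defaults and the port comes from the Dockerfile's EXPOSE 9103 and the exporter's LISTEN_PORT default.

import requests

# Assumes the image built by build.sh is already running with OS_* credentials
# set and port 9103 published, for example:
#   docker run --rm -p 9103:9103 \
#       -e OS_AUTH_URL=... -e OS_USERNAME=... -e OS_PASSWORD=... \
#       -e OS_PROJECT_NAME=... -e OS_USER_DOMAIN_NAME=... -e OS_REGION_NAME=... \
#       openstackhelm/prometheus-openstack-exporter:latest-ubuntu_xenial
resp = requests.get("http://localhost:9103/metrics", timeout=30)
resp.raise_for_status()

# Every gauge emitted by the collectors is prefixed "openstack_" by
# OSBase.gauge_name_sanitize, so a healthy scrape should contain such samples.
samples = [line for line in resp.text.splitlines() if line.startswith("openstack_")]
print("{} openstack_* samples scraped".format(len(samples)))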