From 962888a9ff10035cdf9bf6a76d73cd0f04c2a3d4 Mon Sep 17 00:00:00 2001 From: Brad Klein Date: Mon, 9 Jan 2017 16:14:33 -0700 Subject: [PATCH] Add router max bandwidth metrics to ovs plugin This patch will help operators determine if a node that hosts routers has enough active routers to potentially exceed the network capacity of the host. It can also be used to decide if routers should be moved around/rebalanced. Three new metrics will be published if the 'publish_router_capacity' flag is set to true in the ovs.yaml file (off by default): vrouter.max_bw_kb (customer metric) ovs.vrouter.max_bw_kb (ops metric) ovs.vrouter.host_max_bw_kb (ops metric) This functionality depends on nova flavor bandwidth quotas being configured: https://wiki.openstack.org/wiki/InstanceResourceQuota#Bandwidth_limits At some point in the future when neutron supports router QOS, this functionality should be extended to also honor that. Change-Id: Ie00260d99bdd1f761dca28d2a5779b906a196564 --- docs/Ovs.md | 36 ++++-- monasca_agent/collector/checks_d/ovs.py | 156 ++++++++++++++++++++++-- monasca_setup/detection/plugins/ovs.py | 7 +- 3 files changed, 177 insertions(+), 22 deletions(-) diff --git a/docs/Ovs.md b/docs/Ovs.md index e72a87b9..c5fe02b0 100644 --- a/docs/Ovs.md +++ b/docs/Ovs.md @@ -9,6 +9,7 @@ - [Per-Router Metrics](#per-router-metrics) - [Per-DHCP port Metrics](#per-dhcp-port-metrics) - [Per-DHCP Rate Metrics](#per-dhcp-rate-metrics) + - [Per-Host Metrics](#per-host-metrics) - [Mapping Metrics to Configuration Parameters](#mapping-metrics-to-configuration-parameters) - [Router Metric Dimensions](#router-metric-dimensions) - [OVS Port Metric Dimensions](#ovs-port-metric-dimensions) @@ -101,18 +102,19 @@ instances: ## Per-Router Metrics -| Name | Description | -| -------------------------|-----------------------------------------------------------------| -| vrouter.in_bytes |Inbound bytes for the router (if `network_use_bits` is false) | -| vrouter.out_bytes | Outgoing bytes for the
router (if `network_use_bits` is false) | -| vrouter.in_bits | Inbound bits for the router (if `network_use_bits` is true) | -| vrouter.out_bits | Outgoing bits for the router (if `network_use_bits` is true) | -| vrouter.in_packets | Incoming packets for the router | -| vrouter.out_packets | Outgoing packets for the router | -| vrouter.in_dropped | Incoming dropped packets for the router | -| vrouter.out_dropped | Outgoing dropped packets for the router | -| vrouter.in_errors | Number of incoming errors for the router | -| vrouter.out_errors | Number of outgoing errors for the router | +| Name | Description | +| -------------------------|--------------------------------------------------------------------------------------------------------------------------| +| vrouter.in_bytes | Inbound bytes for the router (if `network_use_bits` is false) | +| vrouter.out_bytes | Outgoing bytes for the router (if `network_use_bits` is false) | +| vrouter.in_bits | Inbound bits for the router (if `network_use_bits` is true) | +| vrouter.out_bits | Outgoing bits for the router (if `network_use_bits` is true) | +| vrouter.in_packets | Incoming packets for the router | +| vrouter.out_packets | Outgoing packets for the router | +| vrouter.in_dropped | Incoming dropped packets for the router | +| vrouter.out_dropped | Outgoing dropped packets for the router | +| vrouter.in_errors | Number of incoming errors for the router | +| vrouter.out_errors | Number of outgoing errors for the router | +| vrouter.max_bw_kb | Maximum bandwidth possible for the router based on the instances using the router (if `publish_router_capacity` is true) | ## Per-DHCP port Metrics @@ -144,8 +146,14 @@ instances: | vswitch.out_error_sec | Outgoing errors per second for the DHCP port | | vswitch.in_error_sec | Incoming errors per second for the DHCP port | +## Per-Host Metrics + +| Name | Description | +| 
---------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| ovs.vrouter.host_max_bw_kb | Maximum bandwidth possible for routers on the host based on the instances using those routers (if `publish_router_capacity` is true). Only published to the operations tenant. | + ## Mapping Metrics to Configuration Parameters -Configuration parameters can be used to control which metrics are reported by ovs plugin. There are 3 parameters currently in ovs config file: use_rate_metrics, use_absolute_metrics and use_health_metrics. +Configuration parameters can be used to control which metrics are reported by ovs plugin. There are 4 parameters currently in ovs config file: use_rate_metrics, use_absolute_metrics, use_health_metrics and publish_router_capacity. | Tuning Knob | Admin Metric Name | Tenant Metric Name | @@ -166,6 +174,8 @@ Configuration parameters can be used to control which metrics are reported by ov | | ovs.vrouter.in_errors | vrouter.in_errors | | | ovs.vrouter.out_dropped | vrouter.out_dropped | | | ovs.vrouter.out_errors | vrouter.out_errors | +| publish_router_capacity (default: False) | ovs.vrouter.max_bw_kb | vrouter.max_bw_kb | +| | ovs.vrouter.host_max_bw_kb | N/A | NOTE: diff --git a/monasca_agent/collector/checks_d/ovs.py b/monasca_agent/collector/checks_d/ovs.py index 304fee4a..26042a04 100644 --- a/monasca_agent/collector/checks_d/ovs.py +++ b/monasca_agent/collector/checks_d/ovs.py @@ -17,6 +17,7 @@ import time from copy import deepcopy from monasca_agent.collector.checks import AgentCheck from neutronclient.v2_0 import client as neutron_client +from novaclient import client as nova_client OVS_CMD = """\ %s --columns=name,external_ids,statistics,options \ @@ -48,6 +49,7 @@ class OvsCheck(AgentCheck): self.use_absolute_metrics = self.init_config.get('use_absolute_metrics') self.use_rate_metrics =
self.init_config.get('use_rate_metrics') self.use_health_metrics = self.init_config.get('use_health_metrics') + self.publish_router_capacity = self.init_config.get('publish_router_capacity') if include_re is None: include_re = 'qg.*' else: @@ -132,6 +134,7 @@ class OvsCheck(AgentCheck): # let's publish. # tried_one_update = False + host_router_max_bw = 0 for ifx, value in ifx_deltas.iteritems(): port_uuid = value['port_uuid'] @@ -141,7 +144,7 @@ class OvsCheck(AgentCheck): # file for a missing port uuid once per wakeup. # tried_one_update = True - log_msg = "port_uuid {0} not in port cache -- updating." + log_msg = "port_uuid {0} not in port cache -- updating." self.log.info(log_msg.format(port_uuid)) port_cache = self._update_port_cache() if not port_cache: @@ -175,6 +178,13 @@ class OvsCheck(AgentCheck): this_dimensions = dims_base.copy() this_dimensions.update(ifx_dimensions) + customer_dimensions = this_dimensions.copy() + del customer_dimensions['hostname'] + ops_dimensions = this_dimensions.copy() + ops_dimensions.update({'tenant_id': tenant_id}) + if tenant_name: + ops_dimensions.update({'tenant_name': tenant_name}) + for metric_name, idx in self._get_metrics_map(measure).iteritems(): # POST to customer project interface_stats_key = self._get_interface_stats_key(idx, metric_name, measure, ifx) @@ -185,8 +195,6 @@ class OvsCheck(AgentCheck): # a value to publish for that metric this round. 
# continue - customer_dimensions = this_dimensions.copy() - del customer_dimensions['hostname'] if is_router_port: metric_name_rate = "vrouter.{0}_sec".format(metric_name) metric_name_abs = "vrouter.{0}".format(metric_name) @@ -195,10 +203,6 @@ class OvsCheck(AgentCheck): metric_name_abs = "vswitch.{0}".format(metric_name) if not self.use_health_metrics and interface_stats_key in HEALTH_METRICS: continue - ops_dimensions = this_dimensions.copy() - ops_dimensions.update({'tenant_id': tenant_id}) - if tenant_name: - ops_dimensions.update({'tenant_name': tenant_name}) if self.use_rate_metrics: self.gauge(metric_name_rate, value[interface_stats_key], dimensions=customer_dimensions, @@ -220,6 +224,15 @@ class OvsCheck(AgentCheck): # POST to operations project self.gauge("ovs.{0}".format(metric_name_abs), abs_value, dimensions=ops_dimensions) + + self._publish_max_bw_metrics(port_info, customer_dimensions, + ops_dimensions) + host_router_max_bw += self._get_port_cache_max_bw(port_info) + + if host_router_max_bw > 0: + self.gauge('ovs.vrouter.host_max_bw_kb', host_router_max_bw, + dimensions=dims_base) + self._update_counter_cache(ctr_cache, math.ceil(time.time() - time_start), measure) @@ -275,6 +288,24 @@ class OvsCheck(AgentCheck): return data return None + def _get_nova_client(self): + + username = self.init_config.get('admin_user') + password = self.init_config.get('admin_password') + tenant_name = self.init_config.get('admin_tenant_name') + auth_url = self.init_config.get('identity_uri') + region_name = self.init_config.get('region_name') + + nc = nova_client.Client(2, username, + password, + tenant_name, + auth_url, + endpoint_type='internalURL', + service_type="compute", + region_name=region_name) + + return nc + def _get_neutron_client(self): username = self.init_config.get('admin_user') @@ -429,6 +460,8 @@ class OvsCheck(AgentCheck): if tenant_name: port_cache[port_uuid]['tenant_name'] = tenant_name + port_cache = self._add_max_bw_to_port_cache(port_cache, + 
all_ports_data) port_cache['last_update'] = int(time.time()) # Write the updated cache @@ -442,6 +475,115 @@ class OvsCheck(AgentCheck): format(self.port_cache_file, e)) return port_cache + def _get_port_cache_max_bw(self, port_info): + if port_info['is_router_port'] and 'max_bw_kb' in port_info: + return port_info['max_bw_kb'] + else: + return 0 + + def _publish_max_bw_metrics(self, port_info, cust_dims, ops_dims): + max_bw_kb = self._get_port_cache_max_bw(port_info) + + if not self.publish_router_capacity or max_bw_kb == 0: + return + + metric_name = 'vrouter.max_bw_kb' + + self.gauge(metric_name, max_bw_kb, dimensions=cust_dims, + delegated_tenant=ops_dims['tenant_id'], + hostname='SUPPRESS') + + self.gauge("ovs.{0}".format(metric_name), max_bw_kb, + dimensions=ops_dims) + + def _get_max_flavor_bw(self, flavor_keys): + avg_bw = 0 + peak_bw = 0 + burst_bw = 0 + + # + # we'll sum inbound and outbound for the max possible throughput + # + avg_re = re.compile('quota:vif_.*bound_average') + peak_re = re.compile('quota:vif_.*bound_peak') + burst_re = re.compile('quota:vif_.*bound_burst') + + for key in flavor_keys: + if re.match(avg_re, key): + avg_bw += int(flavor_keys[key]) + elif re.match(peak_re, key): + peak_bw += int(flavor_keys[key]) + elif re.match(burst_re, key): + burst_bw += int(flavor_keys[key]) + + return max(avg_bw, peak_bw, burst_bw) + + def _add_max_bw_to_port_cache(self, port_cache, all_ports_data): + if not self.publish_router_capacity: + return port_cache + + tmp_port_cache = deepcopy(port_cache) + # + # No need to do a flavor get multiple times + # for the same flavor when rebuilding the cache. 
+ # + flavor_cache = {} + + try: + if not hasattr(self, 'nova_client'): + self.nova_client = self._get_nova_client() + + for uuid in port_cache: + router_max_bw_kb = 0 + port = port_cache[uuid] + + if not port['is_router_port']: + continue + + inst_ids = self._get_instance_ids(all_ports_data, port['device_uuid']) + + for instance_id in inst_ids: + instance = self.nova_client.servers.get(instance_id) + flavor_id = instance.flavor['id'] + if flavor_id not in flavor_cache: + flavor = self.nova_client.flavors.get(instance.flavor['id']) + flavor_cache[flavor_id] = flavor.get_keys() + router_max_bw_kb += self._get_max_flavor_bw(flavor_cache[flavor_id]) + + if router_max_bw_kb > 0: + tmp_port_cache[uuid]['max_bw_kb'] = router_max_bw_kb + + except Exception as e: + msg = "Unable to get the nova instance and flavor info: {0}" + self.log.error(msg.format(e)) + + return tmp_port_cache + + def _get_instance_ids(self, ports, router_uuid): + subnet_ids = self._get_port_ids(ports, + 'network:router_interface', + [router_uuid], + 'device_id', + 'network_id') + + instance_ids = self._get_port_ids(ports, + 'compute:', + subnet_ids, + 'network_id', + 'device_id') + return instance_ids + + def _get_port_ids(self, ports, owner, uuids, match_field, return_field): + return_uuids = [] + if len(uuids) == 0: + return return_uuids + + for port in ports: + if port[match_field] in uuids and owner in port['device_owner']: + return_uuids.append(port[return_field]) + + return return_uuids + def _load_port_cache(self): """Load the cache map of router/dhcp port uuids to router uuid, name, and tenant name. 
diff --git a/monasca_setup/detection/plugins/ovs.py b/monasca_setup/detection/plugins/ovs.py index 6e98ca1e..47a599a7 100644 --- a/monasca_setup/detection/plugins/ovs.py +++ b/monasca_setup/detection/plugins/ovs.py @@ -29,12 +29,14 @@ use_absolute_metrics = True use_rate_metrics = True # If set, will submit the health metrics use_health_metrics = True +# If set, router max capacity metrics will be published +publish_router_capacity = False # Acceptable arguments acceptable_args = ['admin_user', 'admin_password', 'admin_tenant_name', 'identity_uri', 'cache_dir', 'neutron_refresh', 'ovs_cmd', 'network_use_bits', 'check_router_ha', 'region_name', 'included_interface_re', 'conf_file_path', 'use_absolute_metrics', - 'use_rate_metrics', 'use_health_metrics'] + 'use_rate_metrics', 'use_health_metrics', 'publish_router_capacity'] # Arguments which must be ignored if provided ignorable_args = ['admin_user', 'admin_password', 'admin_tenant_name', 'identity_uri', 'region_name', 'conf_file_path'] @@ -128,7 +130,8 @@ class Ovs(monasca_setup.detection.Plugin): 'included_interface_re': included_interface_re, 'use_absolute_metrics': use_absolute_metrics, 'use_rate_metrics': use_rate_metrics, - 'use_health_metrics': use_health_metrics} + 'use_health_metrics': use_health_metrics, + 'publish_router_capacity': publish_router_capacity} for option in cfg_needed: init_config[option] = neutron_cfg.get(cfg_section, cfg_needed[option])