From 08ac1308b88271f8b8d6c86defe0ffc7544bf575 Mon Sep 17 00:00:00 2001 From: Fouad Benamrane Date: Thu, 19 Oct 2017 15:48:42 +0200 Subject: [PATCH] Add network congestion detection to monasca To allow monasca detecting network congestion, we add a new agent plugin for monasca-agent module based on Explicit Congestion Notification (ECN) tos bits. The agent will create a map between the number of Congestion Encountered(CE) packets/bytes and the source ip address responsible of the congestion. The source could be a compute or (if activated) a set of VMs deployed in a compute. First, we ensure that ECN marking is enabled for both edges. Second, we check ecn.packets and ecn.bytes and calculate ecn.cong.rate associated to each source of congestion. Last, the agent sends the three metrics and their timestamp to monasca API for any further processing and keeps the metric counters into a cache file. This piece of information could be used to trigger a new kind of performance alarm in monasca threshold. Change-Id: I98405524588582065c7825ca511c489d59826cda Task: 5750 Story: 2001234 --- docs/Plugins.md | 63 ++++ .../collector/checks_d/congestion.py | 336 ++++++++++++++++++ monasca_setup/detection/plugins/congestion.py | 100 ++++++ 3 files changed, 499 insertions(+) create mode 100644 monasca_agent/collector/checks_d/congestion.py create mode 100644 monasca_setup/detection/plugins/congestion.py diff --git a/docs/Plugins.md b/docs/Plugins.md index 02d4f9fe..d4f741fc 100644 --- a/docs/Plugins.md +++ b/docs/Plugins.md @@ -30,6 +30,7 @@ - [Check_MK_Local](#check_mk_local) - [Ceph](#ceph) - [Certificate Expiration (HTTPS)](#certificate-expiration-https) + - [Congestion](#congestion) - [Couch](#couch) - [Couchbase](#couchbase) - [Crash](#crash) @@ -836,6 +837,68 @@ These options can be set if desired: * collect_period: Integer time in seconds between outputting the metric. Since the metric is in days, it makes sense to output it at a slower rate. The default is 3600, once per hour * timeout: Float time in seconds before timing out the connect to the url. Increase if needed for very slow servers, but making this too long will increase the time this plugin takes to run if the server for the url is down. The default is 1.0 seconds +## Congestion + +This section describes the congestion check performed by monasca-agent. Congestion check collects metrics from special iptable chain created by the agent called congestion. Metric names that are cross-posted to the infrastructure project will have the ecn. prefix. + +Configuration +The congestion check requires a configuration file called congestion.yaml to be available in the agent conf.d configuration directory. An example of the configuration is given below. + +`auth_url` is the keystone endpoint for authentication + +`cache_dir` will be used to cache ecn metrics in a file called congestion_status.json. + +`enable_ecn` optional method that activates ecn marking in each machine. When the transmission equipment in the network encounters a congestion in its queues, it will not mark packets until the ecn is enabled by the sender. enable_ecn method ensures that ecn marking is enabled for both the sender and the receiver. This method is optional because the end user could enable ecn by changing the value of tcp_ecn from 0 to 2 or running 'echo 2 > /proc/sys/net/ipv4/tcp_ecn' in each machine. + +`enable_vm` optional method that gathers ecn metrics ecn.packets, ecn.bytes, and ecn.cong.rate of each VM hosted in a remote compute. By default the agent collects ecn metrics of computes, activating this method add fine-grained control of the congestion. + +`s_factor` Smoothing factor used to compute ecn congestion rate. + +`collect_period` Period of time in sec to collect metrics and used also in ecn congestion rate calculation. + +`password` is the password for the nova user. + +`project_name` is the project/tenant to POST ecn metrics. + +`region_name` is used to add the region dimension to metrics. + +`username` is the username capable of making administrative nova calls. + +`instances` are not used and should be empty in congestion.yaml because the plugin runs against all computes. + +Sample config (`congestion.yaml`): +``` +--- +init_config: + auth_url: http://10.10.10.10/identity_admin + cache_dir: /dev/shm + enable_ecn: true + enable_vm: true + password: admin + project_name: service + region_name: RegionOne + username: nova + collect_period: 30 + s_factor = 0.1 + +instances: + - {} +``` + +The congestion checks return the following metrics: +| Metric Name | Dimensions | Semantics | +|---------------| --------------------------------------------------------| ---------------------------------------------------------| +| ecn.packets | hostname, device, component=Neutron, service=networking | Number of packets marked as Congestion Experienced | +| ecn.bytes | hostname, device, component=Neutron, service=networking | Number of bytes marked as Congestion Experienced | +| ecn.cong.rate | hostname, device, component=Neutron, service=networking | Congestion rate in kbps calculated using ecn.bytes value | + +There is a detection plugin that should be used to configure this plugin. It is invoked as: + +$ monasca-setup -d congestion + +You can check the current congestion status of the network by simply running + +$ sudo monasca-collector -v check congestion ## Couch See [the example configuration](https://github.com/openstack/monasca-agent/blob/master/conf.d/couch.yaml.example) for how to configure the Couch plugin. diff --git a/monasca_agent/collector/checks_d/congestion.py b/monasca_agent/collector/checks_d/congestion.py new file mode 100644 index 00000000..c2fb7e97 --- /dev/null +++ b/monasca_agent/collector/checks_d/congestion.py @@ -0,0 +1,336 @@ +#!/bin/env python + +# Copyright 2017 OrangeLabs + +from copy import deepcopy +import json +import logging +import math +from monasca_agent.collector.checks import AgentCheck +from monasca_agent.common import keystone +from novaclient import client as nova_client +import os +import stat +import subprocess +import time + +log = logging.getLogger(__name__) +prerouting_chain = "PREROUTING" +congestion_chain = "congestion" +forward_chain = "FORWARD" + +"""Monasca Agent interface for congestion metrics""" + + +class Congestion(AgentCheck): + + """This Agent provides congestion metrics necessary to monitor network + performance. It uses ECN marking mechanism to discover the congestion + in the network. The iptables chains and rules are used to collect ECN + packets/bytes. Also, the agent provides congestion threshold computed + from the collected ECN bytes. + """ + + def __init__(self, name, init_config, agent_config, instances=None): + AgentCheck.__init__(self, name, init_config, + agent_config, instances=[{}]) + cache_dir = self.init_config.get('cache_dir') + self.enable_vm = self.init_config.get('enable_vm') + self.enable_ecn = self.init_config.get('enable_ecn') + self.s_factor = self.init_config.get('s_factor') + self.collect_period = self.init_config.get('collect_period') + self.cong_cache_file = os.path.join(cache_dir, + 'congestion_status.json') + self.session = keystone.get_session(**self.init_config) + self.chain_exist = False + self.rule_exist = False + self._check_chain() + self.checked = [] + if self.enable_ecn: + self._activate_ecn() + + def check(self, instance): + """Extend check method to collect and update congestion metrics. + """ + dimensions = self._set_dimensions({'service': 'networking', + 'component': 'Neutron'}, instance) + self.sample_time = float("{:9f}".format(time.time())) + """Check iptables information and verify/install the ECN rule for + specific hypervisor""" + ip_list = self._get_hypervisors() + """update congestion metrics for each remote hypervisor""" + for name, ip in ip_list.items(): + if name != self.hostname and name not in self.checked: + self.checked.append(name) + dimensions.update({'hostname': name}) + self._update_metrics(name, ip, dimensions) + """update congestion metrics for vms if this option + was enabled""" + if self.enable_vm: + ip_vm_list = self._get_vms(name) + if ip_vm_list: + for name_vm, ip_vm in ip_vm_list.items(): + if name_vm not in self.checked: + self.checked.append(name_vm) + dimensions.update({'device': name_vm}) + self._update_metrics(name_vm, ip_vm, + dimensions) + + def _update_metrics(self, name, ip, dimensions): + """This method updates congestion metrics and cache and sends + them to monasca API for further treatment or evaluation. + """ + cong_cache = self._load_cong_cache() + rule_data = self._get_counters(ip, congestion_chain) + if not rule_data: + match = "tos --tos 0x03" + action = "MARK --set-mark 3" + self._add_rule(congestion_chain, ip, match, action) + rule_data = self._get_counters(ip, congestion_chain) + if name not in cong_cache: + cong_cache[name] = {} + """initalize cache values""" + cong_cache[name]['ecn.cong.rate'] = {'value': 0} + cong_cache[name]['ecn.bytes'] = {'value': 0} + cong_cache[name]['ecn_bytes_sum'] = {'value': 0} + cong_cache[name]['ecn.packets'] = {'value': 0} + cong_cache[name]['ecn.packets_sum'] = {'value': 0} + ecn_packets = int(rule_data[0]) - \ + cong_cache[name]['ecn.packets_sum']['value'] + cong_cache[name]['ecn.packets_sum']['value'] = int(rule_data[0]) + ecn_bytes = int(rule_data[1]) - \ + cong_cache[name]['ecn_bytes_sum']['value'] + cong_cache[name]['ecn_bytes_sum']['value'] = int(rule_data[1]) + """ecn congestion average equation""" + ecn_cong_avg = self.s_factor * \ + (ecn_bytes * 8 / 1000 / self.collect_period) + ecn_cong_rate_prev = cong_cache[name]['ecn.cong.rate']['value'] + """Actual ecn congestion rate takes into consideration the previous + value with a certain percentage. The result is expressed in kbps""" + ecn_cong_rate = ecn_cong_avg + (1 - self.s_factor) * ecn_cong_rate_prev + """Update the cache file with new metric values""" + cong_cache[name]['ecn.packets'] = {'timestamp': self.sample_time, + 'value': ecn_packets} + cong_cache[name]['ecn.bytes'] = {'timestamp': self.sample_time, + 'value': ecn_bytes} + cong_cache[name]['ecn.cong.rate'] = {'timestamp': self.sample_time, + 'value': ecn_cong_rate} + self.log.info("metric updated for %s.", name) + self.gauge('ecn.packets', ecn_packets, dimensions) + self.gauge('ecn.bytes', ecn_bytes, dimensions) + self.gauge('ecn.cong.rate', ecn_cong_rate, dimensions) + self._update_cong_cache(cong_cache) + + def _check_chain(self): + """Verify if the necessary iptables' chains are in place + """ + for chain in [congestion_chain, prerouting_chain]: + self._get_rule(chain) + """Add new congestion chain if it's not in the table""" + if not self.chain_exist: + self._add_chain(congestion_chain) + """Redirect any packet received by Prerouting chain to congestion + chain""" + if not self.rule_exist: + self._add_rule(prerouting_chain, None, None, congestion_chain) + + def _activate_ecn(self): + """Ensures that the ECN marking is enable on each tap interface + """ + tos_rule = "TOS --set-tos 0x02/0xff" + tos_tap = False + """Collect tap intefaces attached to linux bridge""" + try: + taps = subprocess.check_output( + "brctl show | awk '{print $1}' | grep tap", + shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as e: + self.log.error(e.output) + taps = filter(None, taps.split('\n')) + """Collect installed rules in Forward chain""" + forw_rules = self._get_rule(forward_chain) + for tap in taps: + for rule in forw_rules: + """Check if the rule was applied to tap interface""" + if (tap + " -j " + tos_rule) in rule: + tos_tap = True + break + if not tos_tap: + """Enable ECN""" + match = "physdev --physdev-out " + tap + self._add_rule(forward_chain, None, match, tos_rule) + self.log.info("ECN is enabled for %s interface.", tap) + break + + def _add_chain(self, chain): + """This method adds 'chain' into iptables. + """ + command = "sudo iptables -t mangle -N " + chain + " -w" + try: + subprocess.check_output( + command, shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as e: + self.log.error( + "Command {} return with error (code {}): {}" + .format(e.cmd, e.returncode, e.output)) + self.log.info("New %s chain was added to mangle table.", chain) + + def _add_rule(self, chain, ip, match, action): + """Add new iptables rule based on the given arguments. + """ + basic_rule = "sudo iptables -t mangle -A " + if chain == prerouting_chain: + command = basic_rule + chain + " -j " + action + " -w " + if chain == congestion_chain: + command = basic_rule + chain + " -s " + ip + " -m " + match + \ + " -j " + action + " -w" + if chain == forward_chain: + command = basic_rule + chain + " -m " + match + \ + " -j " + action + " -w" + try: + subprocess.check_output( + command, shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as e: + self.log.error( + "Command {} return with error (code {}): {}" + .format(e.cmd, e.returncode, e.output)) + self.log.info("New rule was added to %s ", chain) + + def _get_rule(self, chain): + """Search in the iptables chains if 'chain' exist. + """ + command = "sudo iptables -S -t mangle -w" + forw_rules = [] + try: + output = subprocess.check_output( + command, shell=True, stderr=subprocess.STDOUT, + universal_newlines=True) + except subprocess.CalledProcessError as e: + self.log.error( + "Command {} return with error (code {}): {}" + .format(e.cmd, e.returncode, e.output)) + output = filter(None, output.split('\n')) + if chain == congestion_chain: + for rule in output: + if '-N congestion' in rule: + self.chain_exist = True + break + if chain == prerouting_chain: + for rule in output: + if '-A PREROUTING -j congestion' in rule: + self.rule_exist = True + break + if chain == forward_chain: + for rule in output: + if '-A FORWARD' in rule: + forw_rules.append(rule) + return forw_rules + return 0 + + def _get_hypervisors(self): + """Connect to nova client and get the name/ip of all remote compute + (hypervisor). + """ + hyp_list = {} + n_client = self._get_nova_client() + hypervisors = n_client.hypervisors.list() + for hypervisor in hypervisors: + name = hypervisor.__dict__['hypervisor_hostname'] + ip = hypervisor.__dict__['host_ip'] + if name not in hyp_list: + hyp_list[name] = ip + return hyp_list + + def _get_vms(self, compute_name): + """Connect to nova client and collect the name and the ip of all VMs + deployed in a specific compute_name. + """ + vm_list = {} + n_client = self._get_nova_client() + try: + instances = n_client.servers.list( + search_opts={'all_tenants': 1, 'host': compute_name}) + except Exception as e: + self.log.error( + "%s : No instances hosted in %s compute. ", e, compute_name) + vm_list = {} + if instances: + for instance in instances: + inst_name = instance.__getattr__('name') + for net in instance.addresses: + for ip in instance.addresses[net]: + if (ip['OS-EXT-IPS:type'] == 'fixed' and + ip['version'] == 4): + vm_list[inst_name] = ip['addr'] + return vm_list + + def _get_counters(self, ip, chain): + """Collect packets and bytes of each source 'ip' existing in 'chain'. + """ + counters = () + command = "sudo iptables -L " + chain + " -v -t mangle -w" + try: + output = subprocess.check_output( + command, shell=True, stderr=subprocess.STDOUT, + universal_newlines=True) + except subprocess.CalledProcessError as e: + self.log.error( + "Command {} return with error (code {}): {}" + .format(e.cmd, e.returncode, e.output)) + output = filter(None, output.split('\n')) + for line in output: + if 'tos match0x03' in line: + line = filter(None, line.split(' ')) + if str(ip) == line[7]: + packet = self._convert_data(line[0]) + bytes = self._convert_data(line[1]) + counters = (packet, bytes) + return counters + + def _convert_data(self, data): + """Convert any string that contains a K or M letter to an integer. + """ + if 'K' in str(data): + data = int(data.replace('K', '')) * 1000 + if 'M' in str(data): + data = int(data.replace('M', '')) * 1000000 + return data + + def _load_cong_cache(self): + """Load congestion metrics from the previous measurement stored as + cache file in the hard disk. + """ + load_cong_cache = {} + try: + with open(self.cong_cache_file, 'r') as cache_json: + load_cong_cache = json.load(cache_json) + except (IOError, TypeError, ValueError): + self.log.warning( + "Congestion cache missing or corrupt, rebuilding.") + load_cong_cache = {} + return load_cong_cache + + def _update_cong_cache(self, cong_cache): + """update cache file.""" + write_cong_cache = deepcopy(cong_cache) + try: + with open(self.cong_cache_file, 'w') as cache_json: + json.dump(write_cong_cache, cache_json) + if stat.S_IMODE(os.stat(self.cong_cache_file).st_mode) != 0o600: + os.chmod(self.cong_cache_file, 0o600) + self.log.warning("Your cache file is updated : %s ", time.time()) + except IOError as e: + self.log.error("Cannot write to {0}: {1}". + format(self.cong_cache_file, e)) + + def _get_nova_client(self): + """Get nova client object based on configured parameters. + """ + region_name = self.init_config.get('region_name') + endpoint_type = self.init_config.get("endpoint_type", "publicURL") + nc = nova_client.Client(2, session=self.session, + endpoint_type=endpoint_type, + service_type="compute", + region_name=region_name) + + return nc diff --git a/monasca_setup/detection/plugins/congestion.py b/monasca_setup/detection/plugins/congestion.py new file mode 100644 index 00000000..e0ac2644 --- /dev/null +++ b/monasca_setup/detection/plugins/congestion.py @@ -0,0 +1,100 @@ +# Copyright 2017 OrangeLabs + +import ConfigParser +import logging +from monasca_agent.common.psutil_wrapper import psutil +import monasca_setup.agent_config +import monasca_setup.detection +import os +log = logging.getLogger(__name__) + +# Directory to use for metric caches +cache_dir = "/dev/shm" +# Enable vm congestion check +enable_vm = True +# Ensure that ECN marking is enabled +enable_ecn = True +# Smoothing factor used to compute ecn congestion rate +s_factor = 0.1 +# Period of time in second to collect metrics +collect_period = 30 +# Acceptable arguments +acceptable_args = ['username', 'password', 'project_name', + 'auth_url', 'cache_dir', 'enable_vm', 'enable_ecn', + 'region_name', 's_factor', 'collect_period'] + + +class Congestion(monasca_setup.detection.Plugin): + + """Configures congestion detection plugin.""" + + def _detect(self): + """Run detection, set self.available True if the service is + detected. + """ + self.available = True + # Start with plugin-specific configuration parameters + # Try to detect the location of the Nova configuration file. + # Walk through the list of processes, searching for 'nova-compute' + # process with 'nova.conf' inside one of the parameters + nova_conf = None + for proc in psutil.process_iter(): + try: + cmd = proc.as_dict(['cmdline'])['cmdline'] + if (len(cmd) > 2 and 'python' in cmd[0] and + 'nova-compute' in cmd[1]): + conf_indexes = [cmd.index(y) + for y in cmd if 'nova.conf' in y] + if not conf_indexes: + continue + param = conf_indexes[0] + if '=' in cmd[param]: + nova_conf = cmd[param].split('=')[1] + else: + nova_conf = cmd[param] + except (IOError, psutil.NoSuchProcess): + # Process has already terminated, ignore + continue + if (nova_conf is not None and os.path.isfile(nova_conf)): + self.available = True + self.nova_conf = nova_conf + + def build_config(self): + """Build the config as a Plugins object and return. """ + config = monasca_setup.agent_config.Plugins() + log.info("Configuring congestion plugin") + nova_cfg = ConfigParser.SafeConfigParser() + log.info("\tUsing nova configuration file {0}".format(self.nova_conf)) + nova_cfg.read(self.nova_conf) + # Which configuration options are needed for the plugin YAML? + # Use a dict so that they can be renamed later if necessary + cfg_needed = { + 'username': 'username', 'password': 'password', + 'project_name': 'project_name'} + cfg_section = 'keystone_authtoken' + region_name_sec = 'neutron' + init_config = { + 'cache_dir': cache_dir, + 'enable_vm': enable_vm, + 'enable_ecn': enable_ecn, + 's_factor': s_factor, + 'collect_period': collect_period} + for option in cfg_needed: + init_config[option] = nova_cfg.get( + cfg_section, cfg_needed[option]) + init_config['region_name'] = nova_cfg.get( + region_name_sec, 'region_name') + # Create an identity URI (again, slightly different for Devstack) + if nova_cfg.has_option(cfg_section, 'auth_url'): + init_config['auth_url'] = nova_cfg.get(cfg_section, 'auth_url') + else: + init_config['auth_url'] = nova_cfg.get( + cfg_section, 'identity_uri') + + config = monasca_setup.agent_config.Plugins() + config['congestion'] = { + 'init_config': init_config, 'instances': []} + return config + + def dependencies_installed(self): + return True