Add container memory percentage metric when running in k8s

Change-Id: I42767eaaf29d872c444a36017eb1972f108176b5
This commit is contained in:
mhoppal
2017-12-13 09:36:23 -07:00
parent 9105ce9a29
commit cc6f1b9ec2
3 changed files with 44 additions and 17 deletions

View File

@@ -5,6 +5,8 @@ init_config:
connection_timeout: 3 connection_timeout: 3
# Report container metrics. Defaults to false. # Report container metrics. Defaults to false.
report_container_metrics: False report_container_metrics: False
# Report container usage percent if running in kubernetes. Defaults to true.
report_container_mem_percent: True
instances: instances:
# There are two options for getting the host that we use to connect to kubelet/cadvisor with. Either manually setting # There are two options for getting the host that we use to connect to kubelet/cadvisor with. Either manually setting
# it via host or by setting derive_host to True. We derive the host by first using the kubernetes environment # it via host or by setting derive_host to True. We derive the host by first using the kubernetes environment

View File

@@ -1557,7 +1557,7 @@ instances:
The kubernetes check returns the following metrics (note that for containers running under kubernetes and pod metrics The kubernetes check returns the following metrics (note that for containers running under kubernetes and pod metrics
can also have dimensions set from the configuration option 'kubernetes_labels' which by default will include 'app') can also have dimensions set from the configuration option 'kubernetes_labels' which by default will include 'app')
**Note** the container metrics will only be reported when the report_container_metrics is true **Note** the container metrics will only be reported when the report_container_metrics is True with the exception of container.mem.usage_percent which will be reported based on the flag report_container_mem_percent.
Common Container metrics between containers running underneath kubernetes and standalone: Common Container metrics between containers running underneath kubernetes and standalone:
@@ -1609,6 +1609,7 @@ Container metrics specific to containers running under kubernetes:
| container.memory.limit_bytes | image, name, pod_name, namespace | Limit of memory in bytes for the container | container.memory.limit_bytes | image, name, pod_name, namespace | Limit of memory in bytes for the container
| container.request.cpu | image, name, pod_name, namespace | Amount of CPU cores requested by the container | container.request.cpu | image, name, pod_name, namespace | Amount of CPU cores requested by the container
| container.request.memory_bytes | image, name, pod_name, namespace | Amount of memory in bytes requested by the container | container.request.memory_bytes | image, name, pod_name, namespace | Amount of memory in bytes requested by the container
| container.mem.usage_percent | image, name, pod_name, namespace | Percentage of memory used out of defined container limit.
Kubelet Metrics: Kubelet Metrics:

View File

@@ -1,5 +1,4 @@
# (C) Copyright 2017 Hewlett Packard Enterprise Development LP # (C) Copyright 2017 Hewlett Packard Enterprise Development LP
import json
import logging import logging
import requests import requests
@@ -96,6 +95,7 @@ class Kubernetes(checks.AgentCheck):
self.connection_timeout = int(init_config.get('connection_timeout', DEFAULT_TIMEOUT)) self.connection_timeout = int(init_config.get('connection_timeout', DEFAULT_TIMEOUT))
self.host = None self.host = None
self.report_container_metrics = init_config.get('report_container_metrics', REPORT_CONTAINER_METRICS) self.report_container_metrics = init_config.get('report_container_metrics', REPORT_CONTAINER_METRICS)
self.report_container_mem_percent = init_config.get('report_container_mem_percent', True)
self.kubernetes_connector = None self.kubernetes_connector = None
def prepare_run(self): def prepare_run(self):
@@ -118,6 +118,7 @@ class Kubernetes(checks.AgentCheck):
kubernetes_labels = instance.get('kubernetes_labels', ["app"]) kubernetes_labels = instance.get('kubernetes_labels', ["app"])
container_dimension_map = {} container_dimension_map = {}
pod_dimensions_map = {} pod_dimensions_map = {}
memory_limit_map = {}
dimensions = self._set_dimensions(None, instance) dimensions = self._set_dimensions(None, instance)
# Remove hostname from dimensions as the majority of the metrics are not tied to the hostname. # Remove hostname from dimensions as the majority of the metrics are not tied to the hostname.
del dimensions['hostname'] del dimensions['hostname']
@@ -132,11 +133,13 @@ class Kubernetes(checks.AgentCheck):
kubernetes_labels, kubernetes_labels,
dimensions, dimensions,
container_dimension_map, container_dimension_map,
pod_dimensions_map) pod_dimensions_map,
memory_limit_map)
self._process_containers(cadvisor, self._process_containers(cadvisor,
dimensions, dimensions,
container_dimension_map, container_dimension_map,
pod_dimensions_map) pod_dimensions_map,
memory_limit_map)
def _get_urls(self, instance): def _get_urls(self, instance):
base_url = "http://{}".format(self.host) base_url = "http://{}".format(self.host)
@@ -164,7 +167,8 @@ class Kubernetes(checks.AgentCheck):
break break
return api_health return api_health
def _process_pods(self, pods, kubernetes_labels, dimensions, container_dimension_map, pod_dimensions_map): def _process_pods(self, pods, kubernetes_labels, dimensions, container_dimension_map, pod_dimensions_map,
memory_limit_map):
for pod in pods: for pod in pods:
pod_status = pod['status'] pod_status = pod['status']
pod_spec = pod['spec'] pod_spec = pod['spec']
@@ -200,13 +204,13 @@ class Kubernetes(checks.AgentCheck):
pod_retry_count += container_restart_count pod_retry_count += container_restart_count
# Report limit/request metrics # Report limit/request metrics
if self.report_container_metrics: if self.report_container_metrics or self.report_container_mem_percent:
self._report_container_limits(pod_containers, container_dimension_map, name2id) self._report_container_limits(pod_containers, container_dimension_map, name2id, memory_limit_map)
self.gauge("pod.restart_count", pod_retry_count, pod_dimensions, hostname="SUPPRESS") self.gauge("pod.restart_count", pod_retry_count, pod_dimensions, hostname="SUPPRESS")
self.gauge("pod.phase", POD_PHASE.get(pod_status['phase']), pod_dimensions, hostname="SUPPRESS") self.gauge("pod.phase", POD_PHASE.get(pod_status['phase']), pod_dimensions, hostname="SUPPRESS")
def _report_container_limits(self, pod_containers, container_dimension_map, name2id): def _report_container_limits(self, pod_containers, container_dimension_map, name2id, memory_limit_map):
for container in pod_containers: for container in pod_containers:
container_name = container['name'] container_name = container['name']
container_dimensions = container_dimension_map[name2id[container_name]] container_dimensions = container_dimension_map[name2id[container_name]]
@@ -215,12 +219,18 @@ class Kubernetes(checks.AgentCheck):
if 'cpu' in container_limits: if 'cpu' in container_limits:
cpu_limit = container_limits['cpu'] cpu_limit = container_limits['cpu']
cpu_value = self._convert_cpu_to_cores(cpu_limit) cpu_value = self._convert_cpu_to_cores(cpu_limit)
self.gauge("container.cpu.limit", cpu_value, container_dimensions, hostname="SUPPRESS") if self.report_container_metrics:
self.gauge("container.cpu.limit", cpu_value, container_dimensions, hostname="SUPPRESS")
if 'memory' in container_limits: if 'memory' in container_limits:
memory_limit = container_limits['memory'] memory_limit = container_limits['memory']
memory_in_bytes = utils.convert_memory_string_to_bytes(memory_limit) memory_in_bytes = utils.convert_memory_string_to_bytes(memory_limit)
self.gauge("container.memory.limit_bytes", memory_in_bytes, container_dimensions, if self.report_container_metrics:
hostname="SUPPRESS") self.gauge("container.memory.limit_bytes", memory_in_bytes, container_dimensions,
hostname="SUPPRESS")
if self.report_container_mem_percent:
container_key = container_name + " " + container_dimensions["namespace"]
if container_key not in memory_limit_map:
memory_limit_map[container_key] = memory_in_bytes
except KeyError: except KeyError:
self.log.exception("Unable to report container limits for {}".format(container_name)) self.log.exception("Unable to report container limits for {}".format(container_name))
try: try:
@@ -228,12 +238,14 @@ class Kubernetes(checks.AgentCheck):
if 'cpu' in container_requests: if 'cpu' in container_requests:
cpu_request = container_requests['cpu'] cpu_request = container_requests['cpu']
cpu_value = self._convert_cpu_to_cores(cpu_request) cpu_value = self._convert_cpu_to_cores(cpu_request)
self.gauge("container.request.cpu", cpu_value, container_dimensions, hostname="SUPPRESS") if self.report_container_metrics:
self.gauge("container.request.cpu", cpu_value, container_dimensions, hostname="SUPPRESS")
if 'memory' in container_requests: if 'memory' in container_requests:
memory_request = container_requests['memory'] memory_request = container_requests['memory']
memory_in_bytes = utils.convert_memory_string_to_bytes(memory_request) memory_in_bytes = utils.convert_memory_string_to_bytes(memory_request)
self.gauge("container.request.memory_bytes", memory_in_bytes, container_dimensions, if self.report_container_metrics:
hostname="SUPPRESS") self.gauge("container.request.memory_bytes", memory_in_bytes, container_dimensions,
hostname="SUPPRESS")
except KeyError: except KeyError:
self.log.exception("Unable to report container requests for {}".format(container_name)) self.log.exception("Unable to report container requests for {}".format(container_name))
@@ -260,7 +272,7 @@ class Kubernetes(checks.AgentCheck):
self.gauge(metric_name, value, dimensions, self.gauge(metric_name, value, dimensions,
hostname="SUPPRESS" if "pod_name" in dimensions else None) hostname="SUPPRESS" if "pod_name" in dimensions else None)
def _parse_memory(self, memory_data, container_dimensions, pod_key, pod_map): def _parse_memory(self, memory_data, container_dimensions, pod_key, pod_map, memory_limit_map):
memory_metrics = CADVISOR_METRICS['memory_metrics'] memory_metrics = CADVISOR_METRICS['memory_metrics']
for cadvisor_key, metric_name in memory_metrics.items(): for cadvisor_key, metric_name in memory_metrics.items():
if cadvisor_key in memory_data: if cadvisor_key in memory_data:
@@ -272,6 +284,16 @@ class Kubernetes(checks.AgentCheck):
METRIC_TYPES_UNITS[metric_name][0], METRIC_TYPES_UNITS[metric_name][0],
METRIC_TYPES_UNITS[metric_name][1]) METRIC_TYPES_UNITS[metric_name][1])
self._add_pod_metric(metric_name, metric_value, pod_key, pod_map) self._add_pod_metric(metric_name, metric_value, pod_key, pod_map)
if self.report_container_mem_percent and cadvisor_key == "working_set":
if "container_name" in container_dimensions and "namespace" in container_dimensions:
container_key = container_dimensions["container_name"] + " " + container_dimensions["namespace"]
if container_key not in memory_limit_map:
continue
memory_limit = memory_limit_map[container_key]
memory_usage = metric_value
memory_usage_percent = (memory_usage / memory_limit) * 100
self.gauge("container.mem.usage_percent", memory_usage_percent, container_dimensions,
hostname="SUPPRESS")
def _parse_filesystem(self, filesystem_data, container_dimensions): def _parse_filesystem(self, filesystem_data, container_dimensions):
if not self.report_container_metrics: if not self.report_container_metrics:
@@ -387,7 +409,8 @@ class Kubernetes(checks.AgentCheck):
pod_key = None pod_key = None
return pod_key, container_dimensions return pod_key, container_dimensions
def _process_containers(self, cadvisor_url, dimensions, container_dimension_map, pod_dimension_map): def _process_containers(self, cadvisor_url, dimensions, container_dimension_map, pod_dimension_map,
memory_limit_map):
try: try:
cadvisor_spec_url = cadvisor_url + CADVISOR_SPEC_URL cadvisor_spec_url = cadvisor_url + CADVISOR_SPEC_URL
cadvisor_metric_url = cadvisor_url + CADVISOR_METRIC_URL cadvisor_metric_url = cadvisor_url + CADVISOR_METRIC_URL
@@ -410,7 +433,8 @@ class Kubernetes(checks.AgentCheck):
self._parse_memory(cadvisor_metrics['memory'], self._parse_memory(cadvisor_metrics['memory'],
container_dimensions, container_dimensions,
pod_key, pod_key,
pod_metrics) pod_metrics,
memory_limit_map)
if cadvisor_metrics['has_filesystem'] and 'filesystem' in cadvisor_metrics \ if cadvisor_metrics['has_filesystem'] and 'filesystem' in cadvisor_metrics \
and cadvisor_metrics['filesystem']: and cadvisor_metrics['filesystem']:
self._parse_filesystem(cadvisor_metrics['filesystem'], self._parse_filesystem(cadvisor_metrics['filesystem'],