diff --git a/doc/source/userguide.rst b/doc/source/userguide.rst index 762ab518b1..6fb9dd9983 100644 --- a/doc/source/userguide.rst +++ b/doc/source/userguide.rst @@ -33,6 +33,7 @@ Contents #. `Storage`_ #. `Image Management`_ #. `Notification`_ +#. `Container Monitoring`_ =========== Terminology @@ -304,7 +305,11 @@ the table are linked to more details elsewhere in the user guide. +---------------------------------------+--------------------+---------------+ | `admission_control_list`_ | see below | see below | +---------------------------------------+--------------------+---------------+ - +| `prometheus_monitoring`_ | - true | false | +| | - false | | ++---------------------------------------+--------------------+---------------+ +| `grafana_admin_passwd`_ | (any string) | "admin" | ++---------------------------------------+--------------------+---------------+ ======= Cluster @@ -2719,3 +2724,69 @@ created. This example can be applied for any ``create``, ``update`` or "publisher_id": "magnum.host1234", "timestamp": "2016-05-20 15:03:45.960280" } + + +==================== +Container Monitoring +==================== + +The offered monitoring stack relies on the following set of containers and +services: + +- cAdvisor +- Node Exporter +- Prometheus +- Grafana + +To set up this monitoring stack, users are given two configurable labels in +the Magnum cluster template's definition: + +_`prometheus_monitoring` + This label accepts a boolean value. If *True*, the monitoring stack will be + set up. By default *prometheus_monitoring = False*. + +_`grafana_admin_passwd` + This label lets users create their own *admin* user password for the Grafana + interface. It expects a string value. By default it is set to *admin*. + + +Container Monitoring in Kubernetes +---------------------------------- + +By default, all Kubernetes clusters already contain *cAdvisor* integrated +with the *Kubelet* binary. 
Its container monitoring data can be accessed on +a node level basis through *http://NODE_IP:4194*. + +Node Exporter is part of the above-mentioned monitoring stack as it can be +used to export machine metrics. Such functionality also works on a node level +which means that when `prometheus_monitoring`_ is *True*, the Kubernetes nodes +will be populated with an additional manifest under +*/etc/kubernetes/manifests*. Node Exporter is then automatically picked up +and launched as a regular Kubernetes POD. + +To aggregate and complement all the existing monitoring metrics and add a +built-in visualization layer, Prometheus is used. It is launched by the +Kubernetes master node(s) as a *Service* within a *Deployment* with one +replica and it relies on a *ConfigMap* where the Prometheus configuration +(prometheus.yml) is defined. This configuration uses Prometheus' native +support for service discovery in Kubernetes clusters, +*kubernetes_sd_configs*. The respective manifests can be found in +*/srv/kubernetes/monitoring/* on the master nodes and once the service is +up and running, Prometheus UI can be accessed through port 9090. + +Finally, for custom plotting and enhanced metric aggregation and +visualization, Prometheus can be integrated with Grafana as it provides +native compliance for Prometheus data sources. Grafana is also deployed as +a *Service* within a *Deployment* with one replica. The default user is +*admin* and the password is set according to `grafana_admin_passwd`_. +There is also a default Grafana dashboard provided with this installation, +from the official `Grafana dashboards' repository +<https://grafana.net/dashboards>`_. The Prometheus data +source is automatically added to Grafana once it is up and running, pointing +to *http://prometheus:9090* through *Proxy*. The respective manifests can +also be found in */srv/kubernetes/monitoring/* on the master nodes and once +the service is running, the Grafana dashboards can be accessed through port +3000. 
+ +For both Prometheus and Grafana, there is an assigned *systemd* service +called *kube-enable-monitoring*. diff --git a/magnum/drivers/common/templates/kubernetes/fragments/enable-monitoring.sh b/magnum/drivers/common/templates/kubernetes/fragments/enable-monitoring.sh new file mode 100644 index 0000000000..282f732251 --- /dev/null +++ b/magnum/drivers/common/templates/kubernetes/fragments/enable-monitoring.sh @@ -0,0 +1,139 @@ +#!/bin/bash + +. /etc/sysconfig/heat-params + +if [ "$(echo $PROMETHEUS_MONITORING | tr '[:upper:]' '[:lower:]')" = "false" ]; then + exit 0 +fi + +function writeFile { + # $1 is filename + # $2 is file content + + [ -f ${1} ] || { + echo "Writing File: $1" + mkdir -p $(dirname ${1}) + cat << EOF > ${1} +$2 +EOF + } +} + +KUBE_MON_BIN=/usr/local/bin/kube-enable-monitoring +KUBE_MON_SERVICE=/etc/systemd/system/kube-enable-monitoring.service +GRAFANA_DEF_DASHBOARDS="/var/lib/grafana/dashboards" +GRAFANA_DEF_DASHBOARD_FILE=$GRAFANA_DEF_DASHBOARDS"/default.json" + +# Write the binary for enable-monitoring +KUBE_MON_BIN_CONTENT='''#!/bin/sh +until curl -sf "http://127.0.0.1:8080/healthz" +do + echo "Waiting for Kubernetes API..." + sleep 5 +done + +# Check if all resources exist already before creating them +# Check if configmap Prometheus exists +kubectl get configmap prometheus -n kube-system +if [ "$?" != "0" ] && \ + [ -f "/srv/kubernetes/monitoring/prometheusConfigMap.yaml" ]; then + kubectl create -f /srv/kubernetes/monitoring/prometheusConfigMap.yaml +fi + +# Check if deployment and service Prometheus exist +kubectl get service prometheus -n kube-system | kubectl get deployment prometheus -n kube-system +if [ "${PIPESTATUS[0]}" != "0" ] && [ "${PIPESTATUS[1]}" != "0" ] && \ + [ -f "/srv/kubernetes/monitoring/prometheusService.yaml" ]; then + kubectl create -f /srv/kubernetes/monitoring/prometheusService.yaml +fi + +# Check if configmap graf-dash exists +kubectl get configmap graf-dash -n kube-system +if [ "$?" 
!= "0" ] && \ + [ -f '''$GRAFANA_DEF_DASHBOARD_FILE''' ]; then + kubectl create configmap graf-dash --from-file='''$GRAFANA_DEF_DASHBOARD_FILE''' -n kube-system +fi + +# Check if deployment and service Grafana exist +kubectl get service grafana -n kube-system | kubectl get deployment grafana -n kube-system +if [ "${PIPESTATUS[0]}" != "0" ] && [ "${PIPESTATUS[1]}" != "0" ] && \ + [ -f "/srv/kubernetes/monitoring/grafanaService.yaml" ]; then + kubectl create -f /srv/kubernetes/monitoring/grafanaService.yaml +fi + +# Wait for Grafana pod and then inject data source +while true +do + echo "Waiting for Grafana pod to be up and Running" + if [ "$(kubectl get po -n kube-system -l name=grafana -o jsonpath={..phase})" = "Running" ]; then + break + fi + sleep 2 +done + +# Which node is running Grafana +NODE_IP=`kubectl get po -n kube-system -o jsonpath={.items[0].status.hostIP} -l name=grafana` +PROM_SERVICE_IP=`kubectl get svc prometheus --namespace kube-system -o jsonpath={..clusterIP}` + +# The Grafana pod might be running but the app might still be initiating +echo "Check if Grafana is ready..." +curl --user admin:$ADMIN_PASSWD -X GET http://$NODE_IP:3000/api/datasources/1 +until [ $? 
-eq 0 ] +do + sleep 2 + curl --user admin:$ADMIN_PASSWD -X GET http://$NODE_IP:3000/api/datasources/1 +done + +# Inject Prometheus datasource into Grafana +while true +do + INJECT=`curl --user admin:$ADMIN_PASSWD -X POST \ + -H "Content-Type: application/json;charset=UTF-8" \ + --data-binary '''"'"'''{"name":"k8sPrometheus","isDefault":true, + "type":"prometheus","url":"http://'''"'"'''$PROM_SERVICE_IP'''"'"''':9090","access":"proxy"}'''"'"'''\ + "http://$NODE_IP:3000/api/datasources/"` + + if [[ "$INJECT" = *"Datasource added"* ]]; then + echo "Prometheus datasource injected into Grafana" + break + fi + echo "Trying to inject Prometheus datasource into Grafana - "$INJECT +done +''' +writeFile $KUBE_MON_BIN "$KUBE_MON_BIN_CONTENT" + + +# Write the monitoring service +KUBE_MON_SERVICE_CONTENT='''[Unit] +Requires=kubelet.service + +[Service] +Type=oneshot +Environment=HOME=/root +EnvironmentFile=-/etc/kubernetes/config +ExecStart='''${KUBE_MON_BIN}''' + +[Install] +WantedBy=multi-user.target +''' +writeFile $KUBE_MON_SERVICE "$KUBE_MON_SERVICE_CONTENT" + +chown root:root ${KUBE_MON_BIN} +chmod 0755 ${KUBE_MON_BIN} + +chown root:root ${KUBE_MON_SERVICE} +chmod 0644 ${KUBE_MON_SERVICE} + +# Download the default JSON Grafana dashboard +# Not a crucial step, so allow it to fail +# TODO: this JSON should be passed into the minions as gzip in cloud-init +GRAFANA_DASHB_URL="https://grafana.net/api/dashboards/1621/revisions/1/download" +mkdir -p $GRAFANA_DEF_DASHBOARDS +curl $GRAFANA_DASHB_URL -o $GRAFANA_DEF_DASHBOARD_FILE || echo "Failed to fetch default Grafana dashboard" +if [ -f $GRAFANA_DEF_DASHBOARD_FILE ]; then + sed -i -- 's|${DS_PROMETHEUS}|k8sPrometheus|g' $GRAFANA_DEF_DASHBOARD_FILE +fi + +# Launch the monitoring service +systemctl enable kube-enable-monitoring +systemctl start --no-block kube-enable-monitoring diff --git a/magnum/drivers/common/templates/kubernetes/fragments/enable-node-exporter.sh 
b/magnum/drivers/common/templates/kubernetes/fragments/enable-node-exporter.sh new file mode 100644 index 0000000000..fbcdfd0cd3 --- /dev/null +++ b/magnum/drivers/common/templates/kubernetes/fragments/enable-node-exporter.sh @@ -0,0 +1,27 @@ +#!/bin/sh + +. /etc/sysconfig/heat-params + +if [ "$(echo $PROMETHEUS_MONITORING | tr '[:upper:]' '[:lower:]')" = "false" ]; then + exit 0 +fi + +# Write node-exporter manifest as a regular pod +cat > /etc/kubernetes/manifests/node-exporter.yaml << EOF +apiVersion: v1 +kind: Pod +metadata: + name: node-exporter + namespace: kube-system + annotations: + prometheus.io/scrape: "true" + labels: + app: node-exporter +spec: + containers: + - name: node-exporter + image: prom/node-exporter + ports: + - containerPort: 9100 + hostPort: 9100 +EOF diff --git a/magnum/drivers/common/templates/kubernetes/fragments/write-grafana-service.yaml b/magnum/drivers/common/templates/kubernetes/fragments/write-grafana-service.yaml new file mode 100644 index 0000000000..17fe45a354 --- /dev/null +++ b/magnum/drivers/common/templates/kubernetes/fragments/write-grafana-service.yaml @@ -0,0 +1,67 @@ +#cloud-config +merge_how: dict(recurse_array)+list(append) +write_files: + - path: /srv/kubernetes/monitoring/grafanaService.yaml + owner: "root:root" + permissions: "0644" + content: | + apiVersion: v1 + kind: Service + metadata: + labels: + name: node + role: service + name: grafana + namespace: kube-system + spec: + type: "NodePort" + ports: + - port: 3000 + targetPort: 3000 + nodePort: 30603 + selector: + grafana: "true" + --- + apiVersion: extensions/v1beta1 + kind: Deployment + metadata: + name: grafana + namespace: kube-system + spec: + replicas: 1 + template: + metadata: + labels: + name: grafana + grafana: "true" + role: db + spec: + containers: + - image: grafana/grafana + imagePullPolicy: Always + name: grafana + env: + - name: GF_SECURITY_ADMIN_PASSWORD + value: $ADMIN_PASSWD + - name: GF_DASHBOARDS_JSON_ENABLED + value: "true" + - name: 
GF_DASHBOARDS_JSON_PATH + value: /var/lib/grafana/dashboards + resources: + # keep request = limit to keep this container in guaranteed class + limits: + cpu: 100m + memory: 200Mi + requests: + cpu: 100m + memory: 200Mi + volumeMounts: + - name: default-dashboard + mountPath: /var/lib/grafana/dashboards + ports: + - containerPort: 3000 + hostPort: 3000 + volumes: + - name: default-dashboard + configMap: + name: graf-dash diff --git a/magnum/drivers/common/templates/kubernetes/fragments/write-heat-params-master.yaml b/magnum/drivers/common/templates/kubernetes/fragments/write-heat-params-master.yaml index ec0f1f678a..84bd51b128 100644 --- a/magnum/drivers/common/templates/kubernetes/fragments/write-heat-params-master.yaml +++ b/magnum/drivers/common/templates/kubernetes/fragments/write-heat-params-master.yaml @@ -5,6 +5,7 @@ write_files: owner: "root:root" permissions: "0600" content: | + PROMETHEUS_MONITORING="$PROMETHEUS_MONITORING" KUBE_API_PUBLIC_ADDRESS="$KUBE_API_PUBLIC_ADDRESS" KUBE_API_PRIVATE_ADDRESS="$KUBE_API_PRIVATE_ADDRESS" KUBE_API_PORT="$KUBE_API_PORT" diff --git a/magnum/drivers/common/templates/kubernetes/fragments/write-heat-params.yaml b/magnum/drivers/common/templates/kubernetes/fragments/write-heat-params.yaml index 20a98fa909..de94057052 100644 --- a/magnum/drivers/common/templates/kubernetes/fragments/write-heat-params.yaml +++ b/magnum/drivers/common/templates/kubernetes/fragments/write-heat-params.yaml @@ -5,6 +5,7 @@ write_files: owner: "root:root" permissions: "0600" content: | + PROMETHEUS_MONITORING="$PROMETHEUS_MONITORING" KUBE_ALLOW_PRIV="$KUBE_ALLOW_PRIV" KUBE_MASTER_IP="$KUBE_MASTER_IP" KUBE_API_PORT="$KUBE_API_PORT" diff --git a/magnum/drivers/common/templates/kubernetes/fragments/write-prometheus-configmap.yaml b/magnum/drivers/common/templates/kubernetes/fragments/write-prometheus-configmap.yaml new file mode 100644 index 0000000000..7b28aac242 --- /dev/null +++ 
b/magnum/drivers/common/templates/kubernetes/fragments/write-prometheus-configmap.yaml @@ -0,0 +1,82 @@ +#cloud-config +merge_how: dict(recurse_array)+list(append) +write_files: + - path: /srv/kubernetes/monitoring/prometheusConfigMap.yaml + owner: "root:root" + permissions: "0644" + content: | + apiVersion: v1 + kind: ConfigMap + metadata: + name: prometheus + namespace: kube-system + data: + prometheus.yml: | + global: + scrape_interval: 10s + scrape_timeout: 10s + evaluation_interval: 10s + + scrape_configs: + - job_name: 'kubernetes-nodes-cadvisor' + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - source_labels: [__meta_kubernetes_role] + action: replace + target_label: kubernetes_role + - source_labels: [__address__] + regex: '(.*):10250' + replacement: '${1}:10255' + target_label: __address__ + metric_relabel_configs: + - action: replace + source_labels: [id] + regex: '^/machine\.slice/machine-rkt\\x2d([^\\]+)\\.+/([^/]+)\.service$' + target_label: rkt_container_name + replacement: '${2}-${1}' + - action: replace + source_labels: [id] + regex: '^/system\.slice/(.+)\.service$' + target_label: systemd_service_name + replacement: '${1}' + + - job_name: 'kubernetes-apiserver-cadvisor' + tls_config: + insecure_skip_verify: true + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - source_labels: [__meta_kubernetes_role] + action: replace + target_label: kubernetes_role + - source_labels: [__address__] + regex: '(.*):10250' + replacement: '${1}:10255' + target_label: __address__ + + - job_name: 'kubernetes-node-exporter' + 
tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - source_labels: [__meta_kubernetes_role] + action: replace + target_label: kubernetes_role + - source_labels: [__address__] + regex: '(.*):10250' + replacement: '${1}:9100' + target_label: __address__ diff --git a/magnum/drivers/common/templates/kubernetes/fragments/write-prometheus-service.yaml b/magnum/drivers/common/templates/kubernetes/fragments/write-prometheus-service.yaml new file mode 100644 index 0000000000..7375a30762 --- /dev/null +++ b/magnum/drivers/common/templates/kubernetes/fragments/write-prometheus-service.yaml @@ -0,0 +1,60 @@ +#cloud-config +merge_how: dict(recurse_array)+list(append) +write_files: + - path: /srv/kubernetes/monitoring/prometheusService.yaml + owner: "root:root" + permissions: "0644" + content: | + apiVersion: v1 + kind: Service + metadata: + annotations: + prometheus.io/scrape: 'true' + labels: + name: prometheus + name: prometheus + namespace: kube-system + spec: + selector: + app: prometheus + type: NodePort + ports: + - name: prometheus + protocol: TCP + port: 9090 + nodePort: 30900 + --- + apiVersion: extensions/v1beta1 + kind: Deployment + metadata: + name: prometheus + namespace: kube-system + spec: + replicas: 1 + selector: + matchLabels: + app: prometheus + template: + metadata: + name: prometheus + labels: + app: prometheus + spec: + containers: + - name: prometheus + image: prom/prometheus + args: + - '-storage.local.retention=6h' + - '-storage.local.memory-chunks=500000' + - '-config.file=/etc/prometheus/prometheus.yml' + ports: + - name: web + containerPort: 9090 + hostPort: 9090 + volumeMounts: + - name: config-volume + mountPath: /etc/prometheus + volumes: + - name: config-volume + configMap: + name: prometheus diff --git 
a/magnum/drivers/heat/k8s_template_def.py b/magnum/drivers/heat/k8s_template_def.py index a0a807fa51..fb7255f312 100644 --- a/magnum/drivers/heat/k8s_template_def.py +++ b/magnum/drivers/heat/k8s_template_def.py @@ -109,7 +109,9 @@ class K8sTemplateDefinition(template_def.BaseTemplateDefinition): 'flannel_network_subnetlen', 'system_pods_initial_delay', 'system_pods_timeout', - 'admission_control_list'] + 'admission_control_list', + 'prometheus_monitoring', + 'grafana_admin_passwd'] for label in label_list: extra_params[label] = cluster_template.labels.get(label) diff --git a/magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml b/magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml index d2fad7475a..22c8b16e26 100644 --- a/magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml +++ b/magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml @@ -40,6 +40,19 @@ parameters: default: m1.small description: flavor to use when booting the server for minions + prometheus_monitoring: + type: boolean + default: false + description: > + whether or not to have the grafana-prometheus-cadvisor monitoring setup + + grafana_admin_passwd: + type: string + default: admin + hidden: true + description: > + admin user password for the Grafana monitoring interface + dns_nameserver: type: string description: address of a DNS nameserver reachable in your environment @@ -417,6 +430,8 @@ resources: resource_def: type: kubemaster.yaml properties: + prometheus_monitoring: {get_param: prometheus_monitoring} + grafana_admin_passwd: {get_param: grafana_admin_passwd} api_public_address: {get_attr: [api_lb, floating_address]} api_private_address: {get_attr: [api_lb, address]} ssh_key_name: {get_param: ssh_key_name} @@ -474,6 +489,7 @@ resources: resource_def: type: kubeminion.yaml properties: + prometheus_monitoring: {get_param: prometheus_monitoring} ssh_key_name: {get_param: ssh_key_name} server_image: {get_param: server_image} minion_flavor: {get_param: 
minion_flavor} diff --git a/magnum/drivers/k8s_fedora_atomic_v1/templates/kubemaster.yaml b/magnum/drivers/k8s_fedora_atomic_v1/templates/kubemaster.yaml index 4ea56fe0de..32dd7162c9 100644 --- a/magnum/drivers/k8s_fedora_atomic_v1/templates/kubemaster.yaml +++ b/magnum/drivers/k8s_fedora_atomic_v1/templates/kubemaster.yaml @@ -105,6 +105,17 @@ parameters: type: string description: endpoint to retrieve TLS certs from + prometheus_monitoring: + type: boolean + description: > + whether or not to have prometheus and grafana deployed + + grafana_admin_passwd: + type: string + hidden: true + description: > + admin user password for the Grafana monitoring interface + api_public_address: type: string description: Public IP address of the Kubernetes master server. @@ -238,6 +249,7 @@ resources: str_replace: template: {get_file: ../../common/templates/kubernetes/fragments/write-heat-params-master.yaml} params: + "$PROMETHEUS_MONITORING": {get_param: prometheus_monitoring} "$KUBE_API_PUBLIC_ADDRESS": {get_attr: [api_address_switch, public_ip]} "$KUBE_API_PRIVATE_ADDRESS": {get_attr: [api_address_switch, private_ip]} "$KUBE_API_PORT": {get_param: kubernetes_port} @@ -314,6 +326,39 @@ resources: group: ungrouped config: {get_file: ../../common/templates/kubernetes/fragments/write-network-config.sh} + write_prometheus_configmap: + type: OS::Heat::SoftwareConfig + properties: + group: ungrouped + config: {get_file: ../../common/templates/kubernetes/fragments/write-prometheus-configmap.yaml} + + + write_prometheus_service: + type: OS::Heat::SoftwareConfig + properties: + group: ungrouped + config: {get_file: ../../common/templates/kubernetes/fragments/write-prometheus-service.yaml} + + write_grafana_service: + type: OS::Heat::SoftwareConfig + properties: + group: ungrouped + config: + str_replace: + template: {get_file: ../../common/templates/kubernetes/fragments/write-grafana-service.yaml} + params: + "$ADMIN_PASSWD": {get_param: grafana_admin_passwd} + + enable_monitoring: + 
type: OS::Heat::SoftwareConfig + properties: + group: ungrouped + config: + str_replace: + template: {get_file: ../../common/templates/kubernetes/fragments/enable-monitoring.sh} + params: + "$ADMIN_PASSWD": {get_param: grafana_admin_passwd} + network_config_service: type: OS::Heat::SoftwareConfig properties: @@ -394,6 +439,9 @@ resources: - config: {get_resource: add_proxy} - config: {get_resource: enable_services} - config: {get_resource: write_network_config} + - config: {get_resource: write_prometheus_configmap} + - config: {get_resource: write_prometheus_service} + - config: {get_resource: write_grafana_service} - config: {get_resource: network_config_service} - config: {get_resource: network_service} - config: {get_resource: kube_system_namespace_service} @@ -401,6 +449,7 @@ resources: - config: {get_resource: enable_kube_proxy} - config: {get_resource: kube_ui_service} - config: {get_resource: kube_examples} + - config: {get_resource: enable_monitoring} - config: {get_resource: master_wc_notify} ###################################################################### diff --git a/magnum/drivers/k8s_fedora_atomic_v1/templates/kubeminion.yaml b/magnum/drivers/k8s_fedora_atomic_v1/templates/kubeminion.yaml index 88c610d15c..f284ec0cd6 100644 --- a/magnum/drivers/k8s_fedora_atomic_v1/templates/kubeminion.yaml +++ b/magnum/drivers/k8s_fedora_atomic_v1/templates/kubeminion.yaml @@ -61,6 +61,11 @@ parameters: type: string description: endpoint to retrieve TLS certs from + prometheus_monitoring: + type: boolean + description: > + whether or not to have the node-exporter running on the node + kube_master_ip: type: string description: IP address of the Kubernetes master server. 
@@ -220,6 +225,7 @@ resources: str_replace: template: {get_file: ../../common/templates/kubernetes/fragments/write-heat-params.yaml} params: + $PROMETHEUS_MONITORING: {get_param: prometheus_monitoring} $KUBE_ALLOW_PRIV: {get_param: kube_allow_priv} $KUBE_MASTER_IP: {get_param: kube_master_ip} $KUBE_API_PORT: {get_param: kubernetes_port} @@ -321,6 +327,12 @@ resources: group: ungrouped config: {get_file: ../../common/templates/kubernetes/fragments/enable-kube-proxy-minion.sh} + enable_node_exporter: + type: OS::Heat::SoftwareConfig + properties: + group: ungrouped + config: {get_file: ../../common/templates/kubernetes/fragments/enable-node-exporter.sh} + minion_wc_notify: type: OS::Heat::SoftwareConfig properties: @@ -361,6 +373,7 @@ resources: - config: {get_resource: add_proxy} - config: {get_resource: enable_services} - config: {get_resource: enable_kube_proxy} + - config: {get_resource: enable_node_exporter} - config: {get_resource: enable_docker_registry} - config: {get_resource: minion_wc_notify} diff --git a/magnum/drivers/k8s_fedora_atomic_v1/tools/grafana-prometheus-dashboard.json b/magnum/drivers/k8s_fedora_atomic_v1/tools/grafana-prometheus-dashboard.json new file mode 100644 index 0000000000..cd69765081 --- /dev/null +++ b/magnum/drivers/k8s_fedora_atomic_v1/tools/grafana-prometheus-dashboard.json @@ -0,0 +1,2079 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "Compliant with Prometheus 1.5.2", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "3.1.1" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.3.0" + } + ], + "id": null, + "title": "Kubernetes cluster 
monitoring (via Prometheus)", + "description": "Monitors Kubernetes cluster using Prometheus. Shows overall cluster CPU / Memory / Filesystem usage as well as individual pod, containers, systemd services statistics. Uses cAdvisor metrics only.", + "tags": [ + "kubernetes" + ], + "style": "dark", + "timezone": "browser", + "editable": true, + "hideControls": false, + "sharedCrosshair": false, + "rows": [ + { + "collapse": false, + "editable": true, + "height": "200px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)", + "thresholdLine": false + }, + "height": "200px", + "id": 32, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "sideWidth": 200, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum (rate (container_network_receive_bytes_total{kubernetes_io_hostname=~\"^$Node$\"}[1m]))", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "Received", + "metric": "network", + "refId": "A", + "step": 10 + }, + { + "expr": "- sum (rate (container_network_transmit_bytes_total{kubernetes_io_hostname=~\"^$Node$\"}[1m]))", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "Sent", + "metric": "network", + "refId": "B", + "step": 10 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Network I/O pressure", + "tooltip": { + "msResolution": false, + "shared": true, + 
"sort": 0, + "value_type": "cumulative" + }, + "transparent": false, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "title": "Network I/O pressure" + }, + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "height": "180px", + "id": 4, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "sum (container_memory_working_set_bytes{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) / sum (machine_memory_bytes{kubernetes_io_hostname=~\"^$Node$\"}) * 100", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + } + ], + "thresholds": "65, 90", + "title": "Cluster memory usage", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + 
"valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "height": "180px", + "id": 6, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "sum (rate (container_cpu_usage_seconds_total{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) / sum (machine_cpu_cores{kubernetes_io_hostname=~\"^$Node$\"}) * 100", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + } + ], + "thresholds": "65, 90", + "title": "Cluster CPU usage (1m avg)", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + 
"height": "180px", + "id": 7, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "sum (container_fs_usage_bytes{device=~\"^/dev/.*$\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) / sum (container_fs_limit_bytes{device=~\"^/dev/.*$\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) * 100", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 10 + } + ], + "thresholds": "65, 90", + "title": "Cluster filesystem usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "height": "1px", + "id": 9, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "20%", + "prefix": "", + "prefixFontSize": 
"20%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "sum (container_memory_working_set_bytes{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + } + ], + "thresholds": "", + "title": "Used", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "height": "1px", + "id": 10, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "sum (machine_memory_bytes{kubernetes_io_hostname=~\"^$Node$\"})", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + } + ], + "thresholds": "", + "title": "Total", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": 
"null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "height": "1px", + "id": 11, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": " cores", + "postfixFontSize": "30%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "sum (rate (container_cpu_usage_seconds_total{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m]))", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + } + ], + "thresholds": "", + "title": "Used", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "height": "1px", + "id": 12, + "interval": null, + "isNew": true, + "links": 
[], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": " cores", + "postfixFontSize": "30%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "sum (machine_cpu_cores{kubernetes_io_hostname=~\"^$Node$\"})", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + } + ], + "thresholds": "", + "title": "Total", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "height": "1px", + "id": 13, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "sum 
(container_fs_usage_bytes{device=~\"^/dev/.*$\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + } + ], + "thresholds": "", + "title": "Used", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "height": "1px", + "id": 14, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "sum (container_fs_limit_bytes{device=~\"^/dev/.*$\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + } + ], + "thresholds": "", + "title": "Total", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "showTitle": false, + "title": "Total usage" + }, + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": 
false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 3, + "editable": true, + "error": false, + "fill": 0, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "height": "", + "id": 17, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": true, + "targets": [ + { + "expr": "sum (rate (container_cpu_usage_seconds_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (pod_name)", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "{{ pod_name }}", + "metric": "container_cpu", + "refId": "A", + "step": 10 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Pods CPU usage (1m avg)", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "transparent": false, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "none", + "label": "cores", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "showTitle": false, + "title": "Pods CPU usage" + }, + { + "collapse": true, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 3, + "editable": true, + "error": false, + "fill": 0, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + 
"threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "height": "", + "id": 23, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": true, + "targets": [ + { + "expr": "sum (rate (container_cpu_usage_seconds_total{systemd_service_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (systemd_service_name)", + "hide": false, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "{{ systemd_service_name }}", + "metric": "container_cpu", + "refId": "A", + "step": 10 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "System services CPU usage (1m avg)", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "none", + "label": "cores", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "title": "System services CPU usage" + }, + { + "collapse": true, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 3, + "editable": true, + "error": false, + "fill": 0, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "height": "", + "id": 24, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + 
"max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": true, + "targets": [ + { + "expr": "sum (rate (container_cpu_usage_seconds_total{image!=\"\",name=~\"^k8s_.*\",container_name!=\"POD\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (container_name, pod_name)", + "hide": false, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "pod: {{ pod_name }} | {{ container_name }}", + "metric": "container_cpu", + "refId": "A", + "step": 10 + }, + { + "expr": "sum (rate (container_cpu_usage_seconds_total{image!=\"\",name!~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, name, image)", + "hide": false, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name }})", + "metric": "container_cpu", + "refId": "B", + "step": 10 + }, + { + "expr": "sum (rate (container_cpu_usage_seconds_total{rkt_container_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, rkt_container_name)", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}", + "metric": "container_cpu", + "refId": "C", + "step": 10 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Containers CPU usage (1m avg)", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "none", + "label": "cores", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, 
+ "min": null, + "show": false + } + ] + } + ], + "title": "Containers CPU usage" + }, + { + "collapse": true, + "editable": true, + "height": "500px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 3, + "editable": true, + "error": false, + "fill": 0, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 20, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": true, + "targets": [ + { + "expr": "sum (rate (container_cpu_usage_seconds_total{id!=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (id)", + "hide": false, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "{{ id }}", + "metric": "container_cpu", + "refId": "A", + "step": 10 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "All processes CPU usage (1m avg)", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "none", + "label": "cores", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "showTitle": false, + "title": "All processes CPU usage" + }, + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + 
"decimals": 2, + "editable": true, + "error": false, + "fill": 0, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 25, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 200, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": true, + "targets": [ + { + "expr": "sum (container_memory_working_set_bytes{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}) by (pod_name)", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "{{ pod_name }}", + "metric": "container_memory_usage:sort_desc", + "refId": "A", + "step": 10 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Pods memory usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "title": "Pods memory usage" + }, + { + "collapse": true, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "fill": 0, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 26, + "isNew": true, + "legend": { + 
"alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 200, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": true, + "targets": [ + { + "expr": "sum (container_memory_working_set_bytes{systemd_service_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}) by (systemd_service_name)", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "{{ systemd_service_name }}", + "metric": "container_memory_usage:sort_desc", + "refId": "A", + "step": 10 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "System services memory usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "title": "System services memory usage" + }, + { + "collapse": true, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "fill": 0, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 27, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 200, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + 
"lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": true, + "targets": [ + { + "expr": "sum (container_memory_working_set_bytes{image!=\"\",name=~\"^k8s_.*\",container_name!=\"POD\",kubernetes_io_hostname=~\"^$Node$\"}) by (container_name, pod_name)", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "pod: {{ pod_name }} | {{ container_name }}", + "metric": "container_memory_usage:sort_desc", + "refId": "A", + "step": 10 + }, + { + "expr": "sum (container_memory_working_set_bytes{image!=\"\",name!~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}) by (kubernetes_io_hostname, name, image)", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name }})", + "metric": "container_memory_usage:sort_desc", + "refId": "B", + "step": 10 + }, + { + "expr": "sum (container_memory_working_set_bytes{rkt_container_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}) by (kubernetes_io_hostname, rkt_container_name)", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}", + "metric": "container_memory_usage:sort_desc", + "refId": "C", + "step": 10 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Containers memory usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "title": "Containers memory usage" + }, + { + "collapse": true, + "editable": true, + "height": "500px", + "panels": [ + { + 
"aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "fill": 0, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 28, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": 200, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": true, + "targets": [ + { + "expr": "sum (container_memory_working_set_bytes{id!=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) by (id)", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "{{ id }}", + "metric": "container_memory_usage:sort_desc", + "refId": "A", + "step": 10 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "All processes memory usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "title": "All processes memory usage" + }, + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 
112, 0.22)" + }, + "id": 16, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 200, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum (rate (container_network_receive_bytes_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (pod_name)", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "-> {{ pod_name }}", + "metric": "network", + "refId": "A", + "step": 10 + }, + { + "expr": "- sum (rate (container_network_transmit_bytes_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (pod_name)", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "<- {{ pod_name }}", + "metric": "network", + "refId": "B", + "step": 10 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Pods network I/O (1m avg)", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "title": "Pods network I/O" + }, + { + "collapse": true, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + 
"threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 30, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 200, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum (rate (container_network_receive_bytes_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (container_name, pod_name)", + "hide": false, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "-> pod: {{ pod_name }} | {{ container_name }}", + "metric": "network", + "refId": "B", + "step": 10 + }, + { + "expr": "- sum (rate (container_network_transmit_bytes_total{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (container_name, pod_name)", + "hide": false, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "<- pod: {{ pod_name }} | {{ container_name }}", + "metric": "network", + "refId": "D", + "step": 10 + }, + { + "expr": "sum (rate (container_network_receive_bytes_total{image!=\"\",name!~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, name, image)", + "hide": false, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "-> docker: {{ kubernetes_io_hostname }} | {{ image }} ({{ name }})", + "metric": "network", + "refId": "A", + "step": 10 + }, + { + "expr": "- sum (rate (container_network_transmit_bytes_total{image!=\"\",name!~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, name, image)", + "hide": false, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "<- docker: {{ kubernetes_io_hostname }} | {{ image }} 
({{ name }})", + "metric": "network", + "refId": "C", + "step": 10 + }, + { + "expr": "sum (rate (container_network_receive_bytes_total{rkt_container_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, rkt_container_name)", + "hide": false, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "-> rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}", + "metric": "network", + "refId": "E", + "step": 10 + }, + { + "expr": "- sum (rate (container_network_transmit_bytes_total{rkt_container_name!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (kubernetes_io_hostname, rkt_container_name)", + "hide": false, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "<- rkt: {{ kubernetes_io_hostname }} | {{ rkt_container_name }}", + "metric": "network", + "refId": "F", + "step": 10 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Containers network I/O (1m avg)", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "title": "Containers network I/O" + }, + { + "collapse": true, + "editable": true, + "height": "500px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 29, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": 200, + "sort": "current", + "sortDesc": true, + 
"total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum (rate (container_network_receive_bytes_total{id!=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (id)", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "-> {{ id }}", + "metric": "network", + "refId": "A", + "step": 10 + }, + { + "expr": "- sum (rate (container_network_transmit_bytes_total{id!=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (id)", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "<- {{ id }}", + "metric": "network", + "refId": "B", + "step": 10 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "All processes network I/O (1m avg)", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "title": "All processes network I/O" + } + ], + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "templating": { + "list": [ + { + "allValue": ".*", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "multi": false, + "name": "Node", + "options": [], + "query": "label_values(kubernetes_io_hostname)", + "refresh": 1, + "type": "query" + } + ] + }, + "annotations": { + "list": [] + }, + "refresh": "10s", 
+ "schemaVersion": 12, + "version": 13, + "links": [], + "gnetId": 1621 +} \ No newline at end of file diff --git a/magnum/drivers/k8s_fedora_ironic_v1/templates/kubecluster.yaml b/magnum/drivers/k8s_fedora_ironic_v1/templates/kubecluster.yaml index 03d7a8754f..e46923310c 100644 --- a/magnum/drivers/k8s_fedora_ironic_v1/templates/kubecluster.yaml +++ b/magnum/drivers/k8s_fedora_ironic_v1/templates/kubecluster.yaml @@ -43,6 +43,19 @@ parameters: default: baremetal description: flavor to use when booting the server + prometheus_monitoring: + type: boolean + default: false + description: > + whether or not to have the grafana-prometheus-cadvisor monitoring setup + + grafana_admin_passwd: + type: string + default: admin + hidden: true + description: > + admin user password for the Grafana monitoring interface + dns_nameserver: type: string description: address of a dns nameserver reachable in your environment @@ -405,6 +418,8 @@ resources: resource_def: type: kubemaster.yaml properties: + prometheus_monitoring: {get_param: prometheus_monitoring} + grafana_admin_passwd: {get_param: grafana_admin_passwd} api_public_address: {get_attr: [api_lb, floating_address]} api_private_address: {get_attr: [api_lb, address]} ssh_key_name: {get_param: ssh_key_name} @@ -491,6 +506,7 @@ resources: kubeminion_software_configs: type: kubeminion_software_configs.yaml properties: + prometheus_monitoring: {get_param: prometheus_monitoring} network_driver: {get_param: network_driver} kube_master_ip: {get_attr: [api_address_lb_switch, private_ip]} etcd_server_ip: {get_attr: [etcd_address_lb_switch, private_ip]} diff --git a/magnum/drivers/k8s_fedora_ironic_v1/templates/kubemaster.yaml b/magnum/drivers/k8s_fedora_ironic_v1/templates/kubemaster.yaml index d6e6435c94..0d9a58cfc7 100644 --- a/magnum/drivers/k8s_fedora_ironic_v1/templates/kubemaster.yaml +++ b/magnum/drivers/k8s_fedora_ironic_v1/templates/kubemaster.yaml @@ -105,6 +105,17 @@ parameters: type: string description: endpoint to 
retrieve TLS certs from + prometheus_monitoring: + type: boolean + description: > + whether or not to have prometheus and grafana deployed + + grafana_admin_passwd: + type: string + hidden: true + description: > + admin user password for the Grafana monitoring interface + api_public_address: type: string description: Public IP address of the Kubernetes master server. @@ -232,6 +243,7 @@ resources: str_replace: template: {get_file: ../../common/templates/kubernetes/fragments/write-heat-params-master.yaml} params: + "$PROMETHEUS_MONITORING": {get_param: prometheus_monitoring} "$KUBE_API_PUBLIC_ADDRESS": {get_attr: [api_address_switch, public_ip]} "$KUBE_API_PRIVATE_ADDRESS": {get_attr: [api_address_switch, private_ip]} "$KUBE_API_PORT": {get_param: kubernetes_port} @@ -307,6 +319,39 @@ resources: group: ungrouped config: {get_file: ../../common/templates/kubernetes/fragments/write-network-config.sh} + write_prometheus_configmap: + type: OS::Heat::SoftwareConfig + properties: + group: ungrouped + config: {get_file: ../../common/templates/kubernetes/fragments/write-prometheus-configmap.yaml} + + + write_prometheus_service: + type: OS::Heat::SoftwareConfig + properties: + group: ungrouped + config: {get_file: ../../common/templates/kubernetes/fragments/write-prometheus-service.yaml} + + write_grafana_service: + type: OS::Heat::SoftwareConfig + properties: + group: ungrouped + config: + str_replace: + template: {get_file: ../../common/templates/kubernetes/fragments/write-grafana-service.yaml} + params: + "$ADMIN_PASSWD": {get_param: grafana_admin_passwd} + + enable_monitoring: + type: OS::Heat::SoftwareConfig + properties: + group: ungrouped + config: + str_replace: + template: {get_file: ../../common/templates/kubernetes/fragments/enable-monitoring.sh} + params: + "$ADMIN_PASSWD": {get_param: grafana_admin_passwd} + network_config_service: type: OS::Heat::SoftwareConfig properties: @@ -387,6 +432,9 @@ resources: - config: {get_resource: add_proxy} - config: 
{get_resource: enable_services} - config: {get_resource: write_network_config} + - config: {get_resource: write_prometheus_configmap} + - config: {get_resource: write_prometheus_service} + - config: {get_resource: write_grafana_service} - config: {get_resource: network_config_service} - config: {get_resource: network_service} - config: {get_resource: kube_system_namespace_service} @@ -394,6 +442,7 @@ resources: - config: {get_resource: enable_kube_proxy} - config: {get_resource: kube_ui_service} - config: {get_resource: kube_examples} + - config: {get_resource: enable_monitoring} - config: {get_resource: master_wc_notify} ###################################################################### diff --git a/magnum/drivers/k8s_fedora_ironic_v1/templates/kubeminion_software_configs.yaml b/magnum/drivers/k8s_fedora_ironic_v1/templates/kubeminion_software_configs.yaml index d3c997a417..4ab67a18d1 100644 --- a/magnum/drivers/k8s_fedora_ironic_v1/templates/kubeminion_software_configs.yaml +++ b/magnum/drivers/k8s_fedora_ironic_v1/templates/kubeminion_software_configs.yaml @@ -43,6 +43,11 @@ parameters: type: string description: endpoint to retrieve TLS certs from + prometheus_monitoring: + type: boolean + description: > + whether or not to have the node-exporter running on the node + kube_master_ip: type: string description: IP address of the Kubernetes master server. 
@@ -176,6 +181,7 @@ resources: str_replace: template: {get_file: ../../common/templates/kubernetes/fragments/write-heat-params.yaml} params: + $PROMETHEUS_MONITORING: {get_param: prometheus_monitoring} $KUBE_ALLOW_PRIV: {get_param: kube_allow_priv} $KUBE_MASTER_IP: {get_param: kube_master_ip} $KUBE_API_PORT: {get_param: kubernetes_port} @@ -276,6 +282,12 @@ resources: group: ungrouped config: {get_file: ../../common/templates/kubernetes/fragments/enable-kube-proxy-minion.sh} + enable_node_exporter: + type: OS::Heat::SoftwareConfig + properties: + group: ungrouped + config: {get_file: ../../common/templates/kubernetes/fragments/enable-node-exporter.sh} + minion_wc_notify: type: OS::Heat::SoftwareConfig properties: @@ -316,6 +328,7 @@ resources: - config: {get_resource: add_proxy} - config: {get_resource: enable_services} - config: {get_resource: enable_kube_proxy} + - config: {get_resource: enable_node_exporter} - config: {get_resource: enable_docker_registry} - config: {get_resource: minion_wc_notify} diff --git a/magnum/tests/unit/conductor/handlers/test_k8s_cluster_conductor.py b/magnum/tests/unit/conductor/handlers/test_k8s_cluster_conductor.py index 074539b69c..7cc1b6ff81 100644 --- a/magnum/tests/unit/conductor/handlers/test_k8s_cluster_conductor.py +++ b/magnum/tests/unit/conductor/handlers/test_k8s_cluster_conductor.py @@ -51,7 +51,9 @@ class TestClusterConductorWithK8s(base.TestCase): 'flannel_backend': 'vxlan', 'system_pods_initial_delay': '15', 'system_pods_timeout': '1', - 'admission_control_list': 'fake_list'}, + 'admission_control_list': 'fake_list', + 'prometheus_monitoring': 'False', + 'grafana_admin_passwd': 'fake_pwd'}, 'tls_disabled': False, 'server_type': 'vm', 'registry_enabled': False, @@ -149,7 +151,9 @@ class TestClusterConductorWithK8s(base.TestCase): 'flannel_backend': 'vxlan', 'system_pods_initial_delay': '15', 'system_pods_timeout': '1', - 'admission_control_list': 'fake_list'}, + 'admission_control_list': 'fake_list', + 
'prometheus_monitoring': 'False', + 'grafana_admin_passwd': 'fake_pwd'}, 'http_proxy': 'http_proxy', 'https_proxy': 'https_proxy', 'no_proxy': 'no_proxy', @@ -180,6 +184,8 @@ class TestClusterConductorWithK8s(base.TestCase): 'system_pods_initial_delay': '15', 'system_pods_timeout': '1', 'admission_control_list': 'fake_list', + 'prometheus_monitoring': 'False', + 'grafana_admin_passwd': 'fake_pwd', 'http_proxy': 'http_proxy', 'https_proxy': 'https_proxy', 'no_proxy': 'no_proxy', @@ -261,6 +267,8 @@ class TestClusterConductorWithK8s(base.TestCase): 'system_pods_initial_delay': '15', 'system_pods_timeout': '1', 'admission_control_list': 'fake_list', + 'prometheus_monitoring': 'False', + 'grafana_admin_passwd': 'fake_pwd', 'http_proxy': 'http_proxy', 'https_proxy': 'https_proxy', 'magnum_url': 'http://127.0.0.1:9511/v1', @@ -344,6 +352,8 @@ class TestClusterConductorWithK8s(base.TestCase): 'system_pods_initial_delay': '15', 'system_pods_timeout': '1', 'admission_control_list': 'fake_list', + 'prometheus_monitoring': 'False', + 'grafana_admin_passwd': 'fake_pwd', 'insecure_registry_url': '10.0.0.1:5000', 'kube_version': 'fake-version', 'magnum_url': 'http://127.0.0.1:9511/v1', @@ -419,6 +429,8 @@ class TestClusterConductorWithK8s(base.TestCase): 'system_pods_initial_delay': '15', 'system_pods_timeout': '1', 'admission_control_list': 'fake_list', + 'prometheus_monitoring': 'False', + 'grafana_admin_passwd': 'fake_pwd', 'tls_disabled': False, 'registry_enabled': False, 'trustee_domain_id': self.mock_keystone.trustee_domain_id, @@ -486,6 +498,8 @@ class TestClusterConductorWithK8s(base.TestCase): 'system_pods_initial_delay': '15', 'system_pods_timeout': '1', 'admission_control_list': 'fake_list', + 'prometheus_monitoring': 'False', + 'grafana_admin_passwd': 'fake_pwd', 'tls_disabled': False, 'registry_enabled': False, 'trustee_domain_id': self.mock_keystone.trustee_domain_id, @@ -679,6 +693,8 @@ class TestClusterConductorWithK8s(base.TestCase): 'system_pods_initial_delay': 
'15', 'system_pods_timeout': '1', 'admission_control_list': 'fake_list', + 'prometheus_monitoring': 'False', + 'grafana_admin_passwd': 'fake_pwd', 'tenant_name': 'fake_tenant', 'username': 'fake_user', 'cluster_uuid': self.cluster_dict['uuid'], diff --git a/magnum/tests/unit/drivers/test_template_definition.py b/magnum/tests/unit/drivers/test_template_definition.py index cd50d653f4..2eed17a0d7 100644 --- a/magnum/tests/unit/drivers/test_template_definition.py +++ b/magnum/tests/unit/drivers/test_template_definition.py @@ -260,6 +260,10 @@ class AtomicK8sTemplateDefinitionTestCase(BaseTemplateDefinitionTestCase): 'system_pods_timeout') admission_control_list = mock_cluster_template.labels.get( 'admission_control_list') + prometheus_monitoring = mock_cluster_template.labels.get( + 'prometheus_monitoring') + grafana_admin_passwd = mock_cluster_template.labels.get( + 'grafana_admin_passwd') k8s_def = k8sa_tdef.AtomicK8sTemplateDefinition() @@ -275,6 +279,8 @@ class AtomicK8sTemplateDefinitionTestCase(BaseTemplateDefinitionTestCase): 'system_pods_initial_delay': system_pods_initial_delay, 'system_pods_timeout': system_pods_timeout, 'admission_control_list': admission_control_list, + 'prometheus_monitoring': prometheus_monitoring, + 'grafana_admin_passwd': grafana_admin_passwd, 'username': 'fake_user', 'tenant_name': 'fake_tenant', 'magnum_url': mock_osc.magnum_url.return_value, @@ -325,6 +331,10 @@ class AtomicK8sTemplateDefinitionTestCase(BaseTemplateDefinitionTestCase): 'system_pods_timeout') admission_control_list = mock_cluster_template.labels.get( 'admission_control_list') + prometheus_monitoring = mock_cluster_template.labels.get( + 'prometheus_monitoring') + grafana_admin_passwd = mock_cluster_template.labels.get( + 'grafana_admin_passwd') k8s_def = k8sa_tdef.AtomicK8sTemplateDefinition() @@ -340,6 +350,8 @@ class AtomicK8sTemplateDefinitionTestCase(BaseTemplateDefinitionTestCase): 'system_pods_initial_delay': system_pods_initial_delay, 'system_pods_timeout': 
system_pods_timeout, 'admission_control_list': admission_control_list, + 'prometheus_monitoring': prometheus_monitoring, + 'grafana_admin_passwd': grafana_admin_passwd, 'username': 'fake_user', 'tenant_name': 'fake_tenant', 'magnum_url': mock_osc.magnum_url.return_value, diff --git a/releasenotes/notes/bp-container-monitoring-d4bb1cbd0a4e44cc.yaml b/releasenotes/notes/bp-container-monitoring-d4bb1cbd0a4e44cc.yaml new file mode 100644 index 0000000000..b6b0064151 --- /dev/null +++ b/releasenotes/notes/bp-container-monitoring-d4bb1cbd0a4e44cc.yaml @@ -0,0 +1,8 @@ +--- +features: + - | + Includes a monitoring stack based on cAdvisor, node-exporter, Prometheus + and Grafana. Users can enable this stack through the label + prometheus_monitoring. Prometheus scrapes metrics from the Kubernetes + cluster and then serves them to Grafana through Grafana's Prometheus + data source. Upon completion, a default Grafana dashboard is provided.