Browse Source

[k8s] Update prometheus monitoring helm based configuration

* prometheus-operator chart version upgraded from 0.1.31 to 5.12.3
* Fix an issue where when using Feature Gate Priority the scheduler
would evict the prometheus monitoring node-exporter pods
* Fix an issue where intensive CPU utilization would make the
metrics fail intermittently or completely
* Prometheus resources are now calculated based on the MAX_NODE_COUNT
requested
* Change the sampling rate from the standard 30s to 1 minute
* Add the missing tiller CONTAINER_INFRA_PREFIX variable to the ConfigMap
* Add label prometheus_operator_chart_version to enable the user to
specify the stable/prometheus-operator chart to use

Change-Id: If42873cd6668c07e4e911e4eef5e4ae2232be66f
Task: 30777
Task: 30779
Story: 2005588
Signed-off-by: Diogo Guerra <dy090.guerra@gmail.com>
changes/03/657403/5
Diogo Guerra 2 months ago
parent
commit
9c3a30fa0e

+ 13
- 4
doc/source/user/index.rst View File

@@ -309,6 +309,8 @@ the table are linked to more details elsewhere in the user guide.
309 309
 | `monitoring_enabled`_                 | - true             | false         |
310 310
 |                                       | - false            |               |
311 311
 +---------------------------------------+--------------------+---------------+
312
+| `prometheus_operator_chart_version`_  | see below          | see below     |
313
++---------------------------------------+--------------------+---------------+
312 314
 | `swarm_strategy`_                     | - spread           | spread        |
313 315
 |                                       | - binpack          |               |
314 316
 |                                       | - random           |               |
@@ -1142,10 +1144,10 @@ _`container_infra_prefix`
1142 1144
   * gcr.io/google_containers/kubernetes-dashboard-amd64:v1.5.1
1143 1145
   * gcr.io/google-containers/hyperkube:v1.12.1
1144 1146
   * quay.io/coreos/configmap-reload:v0.0.1
1145
-  * quay.io/coreos/prometheus-config-reloader:v0.26.0
1146
-  * quay.io/coreos/prometheus-operator:v0.15.3
1147
-  * quay.io/prometheus/alertmanager:v0.15.3
1148
-  * quay.io/prometheus/prometheus:v2.5.0
1147
+  * quay.io/coreos/prometheus-config-reloader:v0.30.1
1148
+  * quay.io/coreos/prometheus-operator:v0.30.1
1149
+  * quay.io/prometheus/alertmanager:v0.17.0
1150
+  * quay.io/prometheus/prometheus:v2.9.1
1149 1151
   * k8s.gcr.io/node-problem-detector:v0.6.2
1150 1152
   * docker.io/planetlabs/draino:abf028a
1151 1153
   * docker.io/openstackmagnum/cluster-autoscaler:v1.0
@@ -1274,6 +1276,13 @@ _`monitoring_enabled`
1274 1276
   stable/prometheus-operator helm chart.
1275 1277
   Default: false
1276 1278
 
1279
+_`prometheus_operator_chart_version`
1280
+  Add prometheus_operator_chart_version to select version of the
1281
+  stable/prometheus-operator chart to install. When installing the chart,
1282
+  helm will use the default values of the tag defined and overwrite them based
1283
+  on the prometheus-operator-config ConfigMap currently defined. You must
1284
+  certify that the versions are compatible.
1285
+
1277 1286
 _`tiller_enabled`
1278 1287
   If set to true, tiller will be deployed in the kube-system namespace.
1279 1288
   Defaults to false.

+ 1
- 0
magnum/drivers/common/templates/kubernetes/fragments/write-heat-params-master.sh View File

@@ -40,6 +40,7 @@ HEAT_PARAMS=/etc/sysconfig/heat-params
40 40
       CLUSTER_UUID="$CLUSTER_UUID"
41 41
       MAGNUM_URL="$MAGNUM_URL"
42 42
       MONITORING_ENABLED="$MONITORING_ENABLED"
43
+      PROMETHEUS_OPERATOR_CHART_VERSION="$PROMETHEUS_OPERATOR_CHART_VERSION"
43 44
       VOLUME_DRIVER="$VOLUME_DRIVER"
44 45
       REGION_NAME="$REGION_NAME"
45 46
       HTTP_PROXY="$HTTP_PROXY"

+ 56
- 13
magnum/drivers/common/templates/kubernetes/helm/prometheus-operator.sh View File

@@ -10,10 +10,16 @@ printf "Starting to run ${step}\n"
10 10
 ### Configuration
11 11
 ###############################################################################
12 12
 CHART_NAME="prometheus-operator"
13
-CHART_VERSION="0.1.31"
13
+CHART_VERSION=${PROMETHEUS_OPERATOR_CHART_VERSION:-5.12.3}
14
+
14 15
 
15 16
 if [ "$(echo ${MONITORING_ENABLED} | tr '[:upper:]' '[:lower:]')" = "true" ]; then
16 17
 
18
+    # Calculate resources needed to run the Prometheus Monitoring Solution
19
+    # MAX_NODE_COUNT so we can have metrics even if cluster scales
20
+    PROMETHEUS_SERVER_CPU=$(expr 128 + 7 \* ${MAX_NODE_COUNT} )
21
+    PROMETHEUS_SERVER_RAM=$(expr 256 + 40 \* ${MAX_NODE_COUNT})
22
+
17 23
     # Validate if communication node <-> master is secure or insecure
18 24
     PROTOCOL="https"
19 25
     INSECURE_SKIP_VERIFY="False"
@@ -53,11 +59,12 @@ data:
53 59
     done
54 60
     helm repo update
55 61
 
56
-    if [[ \$(helm history prometheus-operator | grep prometheus-operator) ]]; then
62
+    if [[ \$(helm history ${CHART_NAME} | grep ${CHART_NAME}) ]]; then
57 63
         echo "${CHART_NAME} already installed on server. Continue..."
58 64
         exit 0
59 65
     else
60
-        helm install stable/${CHART_NAME} --namespace monitoring --name ${CHART_NAME} --version v${CHART_VERSION} --values /opt/magnum/install-${CHART_NAME}-values.yaml
66
+        # TODO: Set namespace to monitoring. This is needed as the Kubernetes default priorityClass can only be used in NS kube-system
67
+        helm install stable/${CHART_NAME} --namespace kube-system --name ${CHART_NAME} --version v${CHART_VERSION} --values /opt/magnum/install-${CHART_NAME}-values.yaml
61 68
     fi
62 69
 
63 70
   install-${CHART_NAME}-values.yaml:  |
@@ -68,10 +75,21 @@ data:
68 75
       alertmanagerSpec:
69 76
         image:
70 77
           repository: ${CONTAINER_INFRA_PREFIX:-quay.io/}prometheus/alertmanager
78
+        # # Needs testing
79
+        # resources:
80
+        #   requests:
81
+        #     cpu: 100m
82
+        #     memory: 256Mi
83
+        priorityClassName: "system-cluster-critical"
84
+
71 85
 
72 86
     # Dashboard
73 87
     grafana:
74 88
       #enabled: ${ENABLE_GRAFANA}
89
+      resources:
90
+        requests:
91
+          cpu: 100m
92
+          memory: 128Mi
75 93
       adminPassword: ${ADMIN_PASSWD}
76 94
 
77 95
     kubeApiServer:
@@ -91,20 +109,35 @@ data:
91 109
           k8s-app: coredns
92 110
 
93 111
     kubeEtcd:
94
-      service:
95
-        port: 4001
96
-        targetPort: 4001
97
-        selector:
98
-          k8s-app: etcd-server
112
+      endpoints:
113
+      - ${KUBE_MASTER_IP}
99 114
       serviceMonitor:
100 115
         scheme: ${PROTOCOL}
101
-        insecureSkipVerify: ${INSECURE_SKIP_VERIFY}
116
+        insecureSkipVerify: true
117
+        serverName: ${KUBE_MASTER_IP}
102 118
         ##  If Protocol is http this files should be neglected
103
-        caFile: ${CERT_DIR}/ca.crt
104
-        certFile: ${CERT_DIR}/kubelet.crt
105
-        keyFile: ${CERT_DIR}/kubelet.key
119
+        caFile: /etc/prometheus/secrets/etcd-certificates/ca.crt
120
+        certFile: /etc/prometheus/secrets/etcd-certificates/kubelet.crt
121
+        keyFile: /etc/prometheus/secrets/etcd-certificates/kubelet.key
122
+
123
+    kube-state-metrics:
124
+      priorityClassName: "system-cluster-critical"
125
+      resources:
126
+        #Guaranteed
127
+        limits:
128
+          cpu: 50m
129
+          memory: 64M
130
+
131
+    prometheus-node-exporter:
132
+      priorityClassName: "system-node-critical"
133
+      resources:
134
+        #Guaranteed
135
+        limits:
136
+          cpu: 20m
137
+          memory: 20M
106 138
 
107 139
     prometheusOperator:
140
+      priorityClassName: "system-cluster-critical"
108 141
       image:
109 142
         repository: ${CONTAINER_INFRA_PREFIX:-quay.io/}coreos/prometheus-operator
110 143
       configmapReloadImage:
@@ -116,9 +149,19 @@ data:
116 149
 
117 150
     prometheus:
118 151
       prometheusSpec:
152
+        scrapeInterval: 1m
153
+        evaluationInterval: 1m
119 154
         image:
120 155
           repository: ${CONTAINER_INFRA_PREFIX:-quay.io/}prometheus/prometheus
121 156
         retention: 14d
157
+        resources:
158
+          requests:
159
+            cpu: ${PROMETHEUS_SERVER_CPU}m
160
+            memory: ${PROMETHEUS_SERVER_RAM}M
161
+        # secrets:
162
+        # - etcd-certificates
163
+        priorityClassName: "system-cluster-critical"
164
+
122 165
 ---
123 166
 apiVersion: batch/v1
124 167
 kind: Job
@@ -132,7 +175,7 @@ spec:
132 175
       serviceAccountName: tiller
133 176
       containers:
134 177
       - name: config-helm
135
-        image: docker.io/openstackmagnum/helm-client:dev
178
+        image: ${CONTAINER_INFRA_PREFIX:-docker.io/openstackmagnum/}helm-client:dev
136 179
         command:
137 180
         - bash
138 181
         args:

+ 1
- 0
magnum/drivers/heat/k8s_fedora_template_def.py View File

@@ -132,6 +132,7 @@ class K8sFedoraTemplateDefinition(k8s_template_def.K8sTemplateDefinition):
132 132
                       'heat_container_agent_tag',
133 133
                       'keystone_auth_enabled', 'k8s_keystone_auth_tag',
134 134
                       'monitoring_enabled',
135
+                      'prometheus_operator_chart_version',
135 136
                       'tiller_enabled',
136 137
                       'tiller_tag',
137 138
                       'tiller_namespace',

+ 6
- 0
magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml View File

@@ -577,6 +577,11 @@ parameters:
577 577
     description: Enable or disable prometheus-operator monitoring solution.
578 578
     default: false
579 579
 
580
+  prometheus_operator_chart_version:
581
+    type: string
582
+    description: The stable/prometheus-operator chart version to use.
583
+    default: 5.12.3
584
+
580 585
   project_id:
581 586
     type: string
582 587
     description: >
@@ -929,6 +934,7 @@ resources:
929 934
           keystone_auth_enabled: {get_param: keystone_auth_enabled}
930 935
           k8s_keystone_auth_tag: {get_param: k8s_keystone_auth_tag}
931 936
           monitoring_enabled: {get_param: monitoring_enabled}
937
+          prometheus_operator_chart_version: {get_param: prometheus_operator_chart_version}
932 938
           project_id: {get_param: project_id}
933 939
           tiller_enabled: {get_param: tiller_enabled}
934 940
           tiller_tag: {get_param: tiller_tag}

+ 5
- 0
magnum/drivers/k8s_fedora_atomic_v1/templates/kubemaster.yaml View File

@@ -430,6 +430,10 @@ parameters:
430 430
     description: Enable or disable prometheus-operator monitoring solution.
431 431
     default: false
432 432
 
433
+  prometheus_operator_chart_version:
434
+    type: string
435
+    description: The stable/prometheus-operator chart version to use.
436
+
433 437
   project_id:
434 438
     type: string
435 439
     description: >
@@ -613,6 +617,7 @@ resources:
613 617
                   "$KEYSTONE_AUTH_ENABLED": {get_param: keystone_auth_enabled}
614 618
                   "$K8S_KEYSTONE_AUTH_TAG": {get_param: k8s_keystone_auth_tag}
615 619
                   "$MONITORING_ENABLED": {get_param: monitoring_enabled}
620
+                  "$PROMETHEUS_OPERATOR_CHART_VERSION": {get_param: PROMETHEUS_OPERATOR_CHART_VERSION}
616 621
                   "$PROJECT_ID": {get_param: project_id}
617 622
                   "$EXTERNAL_NETWORK_ID": {get_param: external_network}
618 623
                   "$TILLER_ENABLED": {get_param: tiller_enabled}

+ 6
- 0
magnum/tests/unit/drivers/test_template_definition.py View File

@@ -510,6 +510,8 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
510 510
             'k8s_keystone_auth_tag')
511 511
         monitoring_enabled = mock_cluster.labels.get(
512 512
             'monitoring_enabled')
513
+        prometheus_operator_chart_version = mock_cluster.labels.get(
514
+            'prometheus_operator_chart_version')
513 515
         project_id = mock_cluster.project_id
514 516
         tiller_enabled = mock_cluster.labels.get(
515 517
             'tiller_enabled')
@@ -589,6 +591,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
589 591
             'keystone_auth_enabled': keystone_auth_enabled,
590 592
             'k8s_keystone_auth_tag': k8s_keystone_auth_tag,
591 593
             'monitoring_enabled': monitoring_enabled,
594
+            'prometheus_operator_chart_version': prometheus_operator_chart_version,
592 595
             'project_id': project_id,
593 596
             'external_network': external_network_id,
594 597
             'tiller_enabled': tiller_enabled,
@@ -912,6 +915,8 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
912 915
             'k8s_keystone_auth_tag')
913 916
         monitoring_enabled = mock_cluster.labels.get(
914 917
             'monitoring_enabled')
918
+        prometheus_operator_chart_version = mock_cluster.labels.get(
919
+            'prometheus_operator_chart_version')
915 920
         project_id = mock_cluster.project_id
916 921
         tiller_enabled = mock_cluster.labels.get(
917 922
             'tiller_enabled')
@@ -993,6 +998,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
993 998
             'keystone_auth_enabled': keystone_auth_enabled,
994 999
             'k8s_keystone_auth_tag': k8s_keystone_auth_tag,
995 1000
             'monitoring_enabled': monitoring_enabled,
1001
+            'prometheus_operator_chart_version': prometheus_operator_chart_version,
996 1002
             'project_id': project_id,
997 1003
             'external_network': external_network_id,
998 1004
             'tiller_enabled': tiller_enabled,

+ 1
- 1
releasenotes/notes/helm-install-prometheus-operator-ea87752bc57a0945.yaml View File

@@ -5,4 +5,4 @@ features:
5 5
     solution by means of helm stable/prometheus-operator public chart.
6 6
     Defaults to false. grafana_admin_passwd label can be used to set
7 7
     grafana dashboard admin access password. If grafana_admin_passwd
8
-    is not set the password defaults to prom_operator.
8
+    is not set the password defaults to prom-operator.

Loading…
Cancel
Save