[k8s] Update prometheus monitoring helm based configuration

* prometheus-operator chart version upgraded from 0.1.31 to 5.12.3
* Fix an issue where, when using the Priority feature gate, the
scheduler would evict the prometheus monitoring node-exporter pods
* Fix an issue where intensive CPU utilization would make the
metrics fail intermittently or completely fail
* Prometheus resources are now calculated based on the MAX_NODE_COUNT
requested
* Change the sampling rate from the standard 30s to 1 minute
* Add the missing tiller CONTAINER_INFRA_PREFIX variable to the ConfigMap
* Add label prometheus_operator_chart_version to enable the user to
specify the stable/prometheus-operator chart to use

Change-Id: If42873cd6668c07e4e911e4eef5e4ae2232be66f
Task: 30777
Task: 30779
Story: 2005588
Signed-off-by: Diogo Guerra <dy090.guerra@gmail.com>
This commit is contained in:
Diogo Guerra 2019-04-30 15:17:51 +02:00
parent 3217e75b63
commit 7ac4e63269
7 changed files with 87 additions and 18 deletions

View File

@ -309,6 +309,8 @@ the table are linked to more details elsewhere in the user guide.
| `monitoring_enabled`_ | - true | false |
| | - false | |
+---------------------------------------+--------------------+---------------+
| `prometheus_operator_chart_version`_ | see below | see below |
+---------------------------------------+--------------------+---------------+
| `swarm_strategy`_ | - spread | spread |
| | - binpack | |
| | - random | |
@ -1142,10 +1144,10 @@ _`container_infra_prefix`
* gcr.io/google_containers/kubernetes-dashboard-amd64:v1.5.1
* gcr.io/google-containers/hyperkube:v1.12.1
* quay.io/coreos/configmap-reload:v0.0.1
* quay.io/coreos/prometheus-config-reloader:v0.26.0
* quay.io/coreos/prometheus-operator:v0.15.3
* quay.io/prometheus/alertmanager:v0.15.3
* quay.io/prometheus/prometheus:v2.5.0
* quay.io/coreos/prometheus-config-reloader:v0.30.1
* quay.io/coreos/prometheus-operator:v0.30.1
* quay.io/prometheus/alertmanager:v0.17.0
* quay.io/prometheus/prometheus:v2.9.1
* k8s.gcr.io/node-problem-detector:v0.6.2
* docker.io/planetlabs/draino:abf028a
* docker.io/openstackmagnum/cluster-autoscaler:v1.0
@ -1274,6 +1276,13 @@ _`monitoring_enabled`
stable/prometheus-operator helm chart.
Default: false
_`prometheus_operator_chart_version`
Add prometheus_operator_chart_version to select the version of the
stable/prometheus-operator chart to install. When installing the chart,
Helm will use the default values of the specified tag and override them
based on the currently defined prometheus-operator-config ConfigMap. You
must verify that the versions are compatible.
_`tiller_enabled`
If set to true, tiller will be deployed in the kube-system namespace.
Defaults to false.

View File

@ -40,6 +40,7 @@ HEAT_PARAMS=/etc/sysconfig/heat-params
CLUSTER_UUID="$CLUSTER_UUID"
MAGNUM_URL="$MAGNUM_URL"
MONITORING_ENABLED="$MONITORING_ENABLED"
PROMETHEUS_OPERATOR_CHART_VERSION="$PROMETHEUS_OPERATOR_CHART_VERSION"
VOLUME_DRIVER="$VOLUME_DRIVER"
REGION_NAME="$REGION_NAME"
HTTP_PROXY="$HTTP_PROXY"

View File

@ -10,10 +10,16 @@ printf "Starting to run ${step}\n"
### Configuration
###############################################################################
CHART_NAME="prometheus-operator"
CHART_VERSION="0.1.31"
CHART_VERSION=${PROMETHEUS_OPERATOR_CHART_VERSION:-5.12.3}
if [ "$(echo ${MONITORING_ENABLED} | tr '[:upper:]' '[:lower:]')" = "true" ]; then
# Calculate resources needed to run the Prometheus Monitoring Solution
# MAX_NODE_COUNT so we can have metrics even if cluster scales
PROMETHEUS_SERVER_CPU=$(expr 128 + 7 \* ${MAX_NODE_COUNT} )
PROMETHEUS_SERVER_RAM=$(expr 256 + 40 \* ${MAX_NODE_COUNT})
# Validate if communication node <-> master is secure or insecure
PROTOCOL="https"
INSECURE_SKIP_VERIFY="False"
@ -53,11 +59,12 @@ data:
done
helm repo update
if [[ \$(helm history prometheus-operator | grep prometheus-operator) ]]; then
if [[ \$(helm history ${CHART_NAME} | grep ${CHART_NAME}) ]]; then
echo "${CHART_NAME} already installed on server. Continue..."
exit 0
else
helm install stable/${CHART_NAME} --namespace monitoring --name ${CHART_NAME} --version v${CHART_VERSION} --values /opt/magnum/install-${CHART_NAME}-values.yaml
# TODO: Set namespace to monitoring. This is needed as the Kubernetes default priorityClass can only be used in NS kube-system
helm install stable/${CHART_NAME} --namespace kube-system --name ${CHART_NAME} --version v${CHART_VERSION} --values /opt/magnum/install-${CHART_NAME}-values.yaml
fi
install-${CHART_NAME}-values.yaml: |
@ -68,10 +75,21 @@ data:
alertmanagerSpec:
image:
repository: ${CONTAINER_INFRA_PREFIX:-quay.io/}prometheus/alertmanager
# # Needs testing
# resources:
# requests:
# cpu: 100m
# memory: 256Mi
priorityClassName: "system-cluster-critical"
# Dashboard
grafana:
#enabled: ${ENABLE_GRAFANA}
resources:
requests:
cpu: 100m
memory: 128Mi
adminPassword: ${ADMIN_PASSWD}
kubeApiServer:
@ -91,20 +109,35 @@ data:
k8s-app: coredns
kubeEtcd:
service:
port: 4001
targetPort: 4001
selector:
k8s-app: etcd-server
endpoints:
- ${KUBE_MASTER_IP}
serviceMonitor:
scheme: ${PROTOCOL}
insecureSkipVerify: ${INSECURE_SKIP_VERIFY}
insecureSkipVerify: true
serverName: ${KUBE_MASTER_IP}
## If Protocol is http these files should be ignored
caFile: ${CERT_DIR}/ca.crt
certFile: ${CERT_DIR}/kubelet.crt
keyFile: ${CERT_DIR}/kubelet.key
caFile: /etc/prometheus/secrets/etcd-certificates/ca.crt
certFile: /etc/prometheus/secrets/etcd-certificates/kubelet.crt
keyFile: /etc/prometheus/secrets/etcd-certificates/kubelet.key
kube-state-metrics:
priorityClassName: "system-cluster-critical"
resources:
#Guaranteed
limits:
cpu: 50m
memory: 64M
prometheus-node-exporter:
priorityClassName: "system-node-critical"
resources:
#Guaranteed
limits:
cpu: 20m
memory: 20M
prometheusOperator:
priorityClassName: "system-cluster-critical"
image:
repository: ${CONTAINER_INFRA_PREFIX:-quay.io/}coreos/prometheus-operator
configmapReloadImage:
@ -116,9 +149,19 @@ data:
prometheus:
prometheusSpec:
scrapeInterval: 1m
evaluationInterval: 1m
image:
repository: ${CONTAINER_INFRA_PREFIX:-quay.io/}prometheus/prometheus
retention: 14d
resources:
requests:
cpu: ${PROMETHEUS_SERVER_CPU}m
memory: ${PROMETHEUS_SERVER_RAM}M
# secrets:
# - etcd-certificates
priorityClassName: "system-cluster-critical"
---
apiVersion: batch/v1
kind: Job
@ -132,7 +175,7 @@ spec:
serviceAccountName: tiller
containers:
- name: config-helm
image: docker.io/openstackmagnum/helm-client:dev
image: ${CONTAINER_INFRA_PREFIX:-docker.io/openstackmagnum/}helm-client:dev
command:
- bash
args:

View File

@ -577,6 +577,11 @@ parameters:
description: Enable or disable prometheus-operator monitoring solution.
default: false
prometheus_operator_chart_version:
type: string
description: The stable/prometheus-operator chart version to use.
default: 5.12.3
project_id:
type: string
description: >
@ -929,6 +934,7 @@ resources:
keystone_auth_enabled: {get_param: keystone_auth_enabled}
k8s_keystone_auth_tag: {get_param: k8s_keystone_auth_tag}
monitoring_enabled: {get_param: monitoring_enabled}
prometheus_operator_chart_version: {get_param: prometheus_operator_chart_version}
project_id: {get_param: project_id}
tiller_enabled: {get_param: tiller_enabled}
tiller_tag: {get_param: tiller_tag}

View File

@ -430,6 +430,10 @@ parameters:
description: Enable or disable prometheus-operator monitoring solution.
default: false
prometheus_operator_chart_version:
type: string
description: The stable/prometheus-operator chart version to use.
project_id:
type: string
description: >

View File

@ -510,6 +510,8 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
'k8s_keystone_auth_tag')
monitoring_enabled = mock_cluster.labels.get(
'monitoring_enabled')
prometheus_operator_chart_version = mock_cluster.labels.get(
'prometheus_operator_chart_version')
project_id = mock_cluster.project_id
tiller_enabled = mock_cluster.labels.get(
'tiller_enabled')
@ -589,6 +591,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
'keystone_auth_enabled': keystone_auth_enabled,
'k8s_keystone_auth_tag': k8s_keystone_auth_tag,
'monitoring_enabled': monitoring_enabled,
'prometheus_operator_chart_version': prometheus_operator_chart_version,
'project_id': project_id,
'external_network': external_network_id,
'tiller_enabled': tiller_enabled,
@ -912,6 +915,8 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
'k8s_keystone_auth_tag')
monitoring_enabled = mock_cluster.labels.get(
'monitoring_enabled')
prometheus_operator_chart_version = mock_cluster.labels.get(
'prometheus_operator_chart_version')
project_id = mock_cluster.project_id
tiller_enabled = mock_cluster.labels.get(
'tiller_enabled')
@ -993,6 +998,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
'keystone_auth_enabled': keystone_auth_enabled,
'k8s_keystone_auth_tag': k8s_keystone_auth_tag,
'monitoring_enabled': monitoring_enabled,
'prometheus_operator_chart_version': prometheus_operator_chart_version,
'project_id': project_id,
'external_network': external_network_id,
'tiller_enabled': tiller_enabled,

View File

@ -5,4 +5,4 @@ features:
solution by means of helm stable/prometheus-operator public chart.
Defaults to false. grafana_admin_passwd label can be used to set
grafana dashboard admin access password. If grafana_admin_passwd
is not set the password defaults to prom_operator.
is not set the password defaults to prom-operator.