Przeglądaj źródła

[k8s] Update prometheus monitoring helm based configuration

* prometheus-operator chart version upgraded from 0.1.31 to 5.12.3
* Fix an issue where when using Feature Gate Priority the scheduler
would evict the prometheus monitoring node-exporter pods
* Fix an issue where intensive CPU utilization would make the
metrics fail intermittently or completely
* Prometheus resources are now calculated based on the MAX_NODE_COUNT
requested
* Change the sampling rate from the standard 30s to 1 minute
* Add the missing tiller CONTAINER_INFRA_PREFIX variable to the ConfigMap
* Add label prometheus_operator_chart_version to enable the user to
specify the stable/prometheus-operator chart to use

Change-Id: If42873cd6668c07e4e911e4eef5e4ae2232be66f
Task: 30777
Task: 30779
Story: 2005588
Signed-off-by: Diogo Guerra <dy090.guerra@gmail.com>
changes/03/657403/5
Diogo Guerra 6 miesięcy temu
rodzic
commit
9c3a30fa0e

+ 13
- 4
doc/source/user/index.rst Wyświetl plik

@@ -309,6 +309,8 @@ the table are linked to more details elsewhere in the user guide.
| `monitoring_enabled`_ | - true | false |
| | - false | |
+---------------------------------------+--------------------+---------------+
| `prometheus_operator_chart_version`_ | see below | see below |
+---------------------------------------+--------------------+---------------+
| `swarm_strategy`_ | - spread | spread |
| | - binpack | |
| | - random | |
@@ -1142,10 +1144,10 @@ _`container_infra_prefix`
* gcr.io/google_containers/kubernetes-dashboard-amd64:v1.5.1
* gcr.io/google-containers/hyperkube:v1.12.1
* quay.io/coreos/configmap-reload:v0.0.1
* quay.io/coreos/prometheus-config-reloader:v0.26.0
* quay.io/coreos/prometheus-operator:v0.15.3
* quay.io/prometheus/alertmanager:v0.15.3
* quay.io/prometheus/prometheus:v2.5.0
* quay.io/coreos/prometheus-config-reloader:v0.30.1
* quay.io/coreos/prometheus-operator:v0.30.1
* quay.io/prometheus/alertmanager:v0.17.0
* quay.io/prometheus/prometheus:v2.9.1
* k8s.gcr.io/node-problem-detector:v0.6.2
* docker.io/planetlabs/draino:abf028a
* docker.io/openstackmagnum/cluster-autoscaler:v1.0
@@ -1274,6 +1276,13 @@ _`monitoring_enabled`
stable/prometheus-operator helm chart.
Default: false

_`prometheus_operator_chart_version`
Use the prometheus_operator_chart_version label to select the version of the
stable/prometheus-operator chart to install. When installing the chart,
helm will use the default values of the selected chart version and override
them with the values defined in the prometheus-operator-config ConfigMap.
You must ensure that the selected chart version is compatible with those values.
Default: 5.12.3

_`tiller_enabled`
If set to true, tiller will be deployed in the kube-system namespace.
Defaults to false.

+ 1
- 0
magnum/drivers/common/templates/kubernetes/fragments/write-heat-params-master.sh Wyświetl plik

@@ -40,6 +40,7 @@ HEAT_PARAMS=/etc/sysconfig/heat-params
CLUSTER_UUID="$CLUSTER_UUID"
MAGNUM_URL="$MAGNUM_URL"
MONITORING_ENABLED="$MONITORING_ENABLED"
PROMETHEUS_OPERATOR_CHART_VERSION="$PROMETHEUS_OPERATOR_CHART_VERSION"
VOLUME_DRIVER="$VOLUME_DRIVER"
REGION_NAME="$REGION_NAME"
HTTP_PROXY="$HTTP_PROXY"

+ 56
- 13
magnum/drivers/common/templates/kubernetes/helm/prometheus-operator.sh Wyświetl plik

@@ -10,10 +10,16 @@ printf "Starting to run ${step}\n"
### Configuration
###############################################################################
CHART_NAME="prometheus-operator"
CHART_VERSION="0.1.31"
CHART_VERSION=${PROMETHEUS_OPERATOR_CHART_VERSION:-5.12.3}


if [ "$(echo ${MONITORING_ENABLED} | tr '[:upper:]' '[:lower:]')" = "true" ]; then

# Calculate the resources needed to run the Prometheus monitoring solution.
# Sizing is based on MAX_NODE_COUNT so metrics remain available even if the cluster scales.
PROMETHEUS_SERVER_CPU=$(expr 128 + 7 \* ${MAX_NODE_COUNT} )
PROMETHEUS_SERVER_RAM=$(expr 256 + 40 \* ${MAX_NODE_COUNT})

# Validate if communication node <-> master is secure or insecure
PROTOCOL="https"
INSECURE_SKIP_VERIFY="False"
@@ -53,11 +59,12 @@ data:
done
helm repo update

if [[ \$(helm history prometheus-operator | grep prometheus-operator) ]]; then
if [[ \$(helm history ${CHART_NAME} | grep ${CHART_NAME}) ]]; then
echo "${CHART_NAME} already installed on server. Continue..."
exit 0
else
helm install stable/${CHART_NAME} --namespace monitoring --name ${CHART_NAME} --version v${CHART_VERSION} --values /opt/magnum/install-${CHART_NAME}-values.yaml
# TODO: Set namespace to monitoring. This is needed as the Kubernetes default priorityClass can only be used in NS kube-system
helm install stable/${CHART_NAME} --namespace kube-system --name ${CHART_NAME} --version v${CHART_VERSION} --values /opt/magnum/install-${CHART_NAME}-values.yaml
fi

install-${CHART_NAME}-values.yaml: |
@@ -68,10 +75,21 @@ data:
alertmanagerSpec:
image:
repository: ${CONTAINER_INFRA_PREFIX:-quay.io/}prometheus/alertmanager
# # Needs testing
# resources:
# requests:
# cpu: 100m
# memory: 256Mi
priorityClassName: "system-cluster-critical"


# Dashboard
grafana:
#enabled: ${ENABLE_GRAFANA}
resources:
requests:
cpu: 100m
memory: 128Mi
adminPassword: ${ADMIN_PASSWD}

kubeApiServer:
@@ -91,20 +109,35 @@ data:
k8s-app: coredns

kubeEtcd:
service:
port: 4001
targetPort: 4001
selector:
k8s-app: etcd-server
endpoints:
- ${KUBE_MASTER_IP}
serviceMonitor:
scheme: ${PROTOCOL}
insecureSkipVerify: ${INSECURE_SKIP_VERIFY}
insecureSkipVerify: true
serverName: ${KUBE_MASTER_IP}
## If Protocol is http this files should be neglected
caFile: ${CERT_DIR}/ca.crt
certFile: ${CERT_DIR}/kubelet.crt
keyFile: ${CERT_DIR}/kubelet.key
caFile: /etc/prometheus/secrets/etcd-certificates/ca.crt
certFile: /etc/prometheus/secrets/etcd-certificates/kubelet.crt
keyFile: /etc/prometheus/secrets/etcd-certificates/kubelet.key

kube-state-metrics:
priorityClassName: "system-cluster-critical"
resources:
#Guaranteed
limits:
cpu: 50m
memory: 64M

prometheus-node-exporter:
priorityClassName: "system-node-critical"
resources:
#Guaranteed
limits:
cpu: 20m
memory: 20M

prometheusOperator:
priorityClassName: "system-cluster-critical"
image:
repository: ${CONTAINER_INFRA_PREFIX:-quay.io/}coreos/prometheus-operator
configmapReloadImage:
@@ -116,9 +149,19 @@ data:

prometheus:
prometheusSpec:
scrapeInterval: 1m
evaluationInterval: 1m
image:
repository: ${CONTAINER_INFRA_PREFIX:-quay.io/}prometheus/prometheus
retention: 14d
resources:
requests:
cpu: ${PROMETHEUS_SERVER_CPU}m
memory: ${PROMETHEUS_SERVER_RAM}M
# secrets:
# - etcd-certificates
priorityClassName: "system-cluster-critical"

---
apiVersion: batch/v1
kind: Job
@@ -132,7 +175,7 @@ spec:
serviceAccountName: tiller
containers:
- name: config-helm
image: docker.io/openstackmagnum/helm-client:dev
image: ${CONTAINER_INFRA_PREFIX:-docker.io/openstackmagnum/}helm-client:dev
command:
- bash
args:

+ 1
- 0
magnum/drivers/heat/k8s_fedora_template_def.py Wyświetl plik

@@ -132,6 +132,7 @@ class K8sFedoraTemplateDefinition(k8s_template_def.K8sTemplateDefinition):
'heat_container_agent_tag',
'keystone_auth_enabled', 'k8s_keystone_auth_tag',
'monitoring_enabled',
'prometheus_operator_chart_version',
'tiller_enabled',
'tiller_tag',
'tiller_namespace',

+ 6
- 0
magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml Wyświetl plik

@@ -577,6 +577,11 @@ parameters:
description: Enable or disable prometheus-operator monitoring solution.
default: false

prometheus_operator_chart_version:
type: string
description: The stable/prometheus-operator chart version to use.
default: 5.12.3

project_id:
type: string
description: >
@@ -929,6 +934,7 @@ resources:
keystone_auth_enabled: {get_param: keystone_auth_enabled}
k8s_keystone_auth_tag: {get_param: k8s_keystone_auth_tag}
monitoring_enabled: {get_param: monitoring_enabled}
prometheus_operator_chart_version: {get_param: prometheus_operator_chart_version}
project_id: {get_param: project_id}
tiller_enabled: {get_param: tiller_enabled}
tiller_tag: {get_param: tiller_tag}

+ 5
- 0
magnum/drivers/k8s_fedora_atomic_v1/templates/kubemaster.yaml Wyświetl plik

@@ -430,6 +430,10 @@ parameters:
description: Enable or disable prometheus-operator monitoring solution.
default: false

prometheus_operator_chart_version:
type: string
description: The stable/prometheus-operator chart version to use.

project_id:
type: string
description: >
@@ -613,6 +617,7 @@ resources:
"$KEYSTONE_AUTH_ENABLED": {get_param: keystone_auth_enabled}
"$K8S_KEYSTONE_AUTH_TAG": {get_param: k8s_keystone_auth_tag}
"$MONITORING_ENABLED": {get_param: monitoring_enabled}
# get_param must reference the parameter by its declared (lowercase) name;
# Heat parameter names are case-sensitive.
"$PROMETHEUS_OPERATOR_CHART_VERSION": {get_param: prometheus_operator_chart_version}
"$PROJECT_ID": {get_param: project_id}
"$EXTERNAL_NETWORK_ID": {get_param: external_network}
"$TILLER_ENABLED": {get_param: tiller_enabled}

+ 6
- 0
magnum/tests/unit/drivers/test_template_definition.py Wyświetl plik

@@ -510,6 +510,8 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
'k8s_keystone_auth_tag')
monitoring_enabled = mock_cluster.labels.get(
'monitoring_enabled')
prometheus_operator_chart_version = mock_cluster.labels.get(
'prometheus_operator_chart_version')
project_id = mock_cluster.project_id
tiller_enabled = mock_cluster.labels.get(
'tiller_enabled')
@@ -589,6 +591,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
'keystone_auth_enabled': keystone_auth_enabled,
'k8s_keystone_auth_tag': k8s_keystone_auth_tag,
'monitoring_enabled': monitoring_enabled,
'prometheus_operator_chart_version': prometheus_operator_chart_version,
'project_id': project_id,
'external_network': external_network_id,
'tiller_enabled': tiller_enabled,
@@ -912,6 +915,8 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
'k8s_keystone_auth_tag')
monitoring_enabled = mock_cluster.labels.get(
'monitoring_enabled')
prometheus_operator_chart_version = mock_cluster.labels.get(
'prometheus_operator_chart_version')
project_id = mock_cluster.project_id
tiller_enabled = mock_cluster.labels.get(
'tiller_enabled')
@@ -993,6 +998,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
'keystone_auth_enabled': keystone_auth_enabled,
'k8s_keystone_auth_tag': k8s_keystone_auth_tag,
'monitoring_enabled': monitoring_enabled,
'prometheus_operator_chart_version': prometheus_operator_chart_version,
'project_id': project_id,
'external_network': external_network_id,
'tiller_enabled': tiller_enabled,

+ 1
- 1
releasenotes/notes/helm-install-prometheus-operator-ea87752bc57a0945.yaml Wyświetl plik

@@ -5,4 +5,4 @@ features:
solution by means of helm stable/prometheus-operator public chart.
Defaults to false. grafana_admin_passwd label can be used to set
grafana dashboard admin access password. If grafana_admin_passwd
is not set the password defaults to prom_operator.
is not set the password defaults to prom-operator.

Ładowanie…
Anuluj
Zapisz