1. Configurable prometheus monitoring persistent storage

* Add monitoring_retention_days magnum label allowing user to specify
prometheus server scraped metrics retention days (default: 14)
* Add monitoring_retention_size magnum label allowing user to specify
prometheus server metrics storage maximum size in GiB (default: 14)
* Add monitoring_interval_seconds magnum label allowing user to specify
prometheus scrape frequency in seconds (default: 30)
* Add monitoring_storage_class_name magnum label allowing user to specify
the storageClass to use as external retention for pod fail-over data
persistency

task: 39509
story: 2006765

Change-Id: I42117837e8e3cd03f3cb723df4d73692ead0d169
Signed-off-by: Diogo Guerra <diogo.filipe.tomas.guerra@cern.ch>
This commit is contained in:
Diogo Guerra 2020-04-20 19:46:57 +09:00 committed by Diogo Guerra
parent e24bf6252f
commit 37497ccf5b
10 changed files with 201 additions and 11 deletions

View File

@ -320,6 +320,14 @@ the table are linked to more details elsewhere in the user guide.
| `monitoring_enabled`_ | - true | false |
| | - false | |
+---------------------------------------+--------------------+---------------+
| `monitoring_retention_days`_ | see below | see below |
+---------------------------------------+--------------------+---------------+
| `monitoring_retention_size`_ | see below | see below |
+---------------------------------------+--------------------+---------------+
| `monitoring_storage_class_name`_ | see below | see below |
+---------------------------------------+--------------------+---------------+
| `monitoring_interval_seconds`_ | see below | see below |
+---------------------------------------+--------------------+---------------+
| `prometheus_operator_chart_tag`_ | see below | see below |
+---------------------------------------+--------------------+---------------+
| `prometheus_adapter_enabled`_ | - true | true |
@ -1475,6 +1483,25 @@ _`monitoring_enabled`
helm_client_tag<v3.0.0.
Default: false
_`monitoring_retention_days`
The length of time (in days) that prometheus metrics should be kept.
Default: 14
_`monitoring_retention_size`
The maximum storage space (in GiB) allowed to be used by prometheus server
to store metrics.
Default: 14
_`monitoring_interval_seconds`
The time interval (in seconds) between consecutive metric scrapings.
Default: 30
_`monitoring_storage_class_name`
The kubernetes storage class name to use for the prometheus pvc.
Using this label will activate the usage of a pvc instead of local
disk space.
Default: ""
_`prometheus_adapter_enabled`
Enable installation of cluster custom metrics provided by the
stable/prometheus-adapter helm chart. This service depends on

View File

@ -59,6 +59,10 @@ VERIFY_CA="$VERIFY_CA"
CLUSTER_UUID="$CLUSTER_UUID"
MAGNUM_URL="$MAGNUM_URL"
MONITORING_ENABLED="$MONITORING_ENABLED"
MONITORING_RETENTION_DAYS="$MONITORING_RETENTION_DAYS"
MONITORING_RETENTION_SIZE="$MONITORING_RETENTION_SIZE"
MONITORING_INTERVAL_SECONDS="$MONITORING_INTERVAL_SECONDS"
MONITORING_STORAGE_CLASS_NAME="$MONITORING_STORAGE_CLASS_NAME"
PROMETHEUS_OPERATOR_CHART_TAG="$PROMETHEUS_OPERATOR_CHART_TAG"
PROMETHEUS_ADAPTER_ENABLED="$PROMETHEUS_ADAPTER_ENABLED"
PROMETHEUS_ADAPTER_CHART_TAG="$PROMETHEUS_ADAPTER_CHART_TAG"

View File

@ -21,6 +21,11 @@ EOF
PROMETHEUS_SERVER_CPU=$(expr 128 + 7 \* ${MAX_NODE_COUNT} )
PROMETHEUS_SERVER_RAM=$(expr 256 + 40 \* ${MAX_NODE_COUNT})
# Prometheus parses retentionSize units as powers of two (1GB = 1024^3 bytes),
# which is the same scale as the Gi quantity used for the PVC request, so no
# GiB -> GB conversion factor is applied. Multiplying by 1.073741824 would make
# the retention limit larger than the volume that backs it.
# The label value is only truncated to an integer here.
MONITORING_RETENTION_SIZE_GB=$(echo | awk "{print int(${MONITORING_RETENTION_SIZE})}")
# Validate if communication node <-> master is secure or insecure
PROTOCOL="https"
INSECURE_SKIP_VERIFY="False"
@ -193,6 +198,7 @@ prometheus-operator:
prometheus:
prometheusSpec:
# Scrape interval comes from the monitoring_interval_seconds label; the key
# must appear only once or the label value is overridden.
scrapeInterval: ${MONITORING_INTERVAL_SECONDS}s
evaluationInterval: 30s
image:
@ -209,6 +215,8 @@ prometheus-operator:
# - kube-controller-manager-certificates
# - kube-scheduler-certificates
# - kube-proxy-manager-certificates
retention: ${MONITORING_RETENTION_DAYS}d
retentionSize: ${MONITORING_RETENTION_SIZE_GB}GB
resources:
requests:
cpu: ${PROMETHEUS_SERVER_CPU}m
@ -216,6 +224,21 @@ prometheus-operator:
priorityClassName: "system-cluster-critical"
EOF
#######################
# Set up definitions for persistent storage using k8s storageClass
if [ "${MONITORING_STORAGE_CLASS_NAME}" != "" ]; then
cat << EOF >> ${HELM_CHART_DIR}/values.yaml
storageSpec:
volumeClaimTemplate:
spec:
storageClassName: ${MONITORING_STORAGE_CLASS_NAME}
accessModes: ["ReadWriteMany"]
resources:
requests:
storage: ${MONITORING_RETENTION_SIZE}Gi
EOF
fi #END PERSISTENT STORAGE CONFIG
#######################
# Set up definitions for ingress objects
@ -225,17 +248,17 @@ EOF
:
elif [ "${INGRESS_CONTROLLER}" == "traefik" ]; then
cat << EOF >> ${HELM_CHART_DIR}/values.yaml
additionalServiceMonitors:
- name: prometheus-traefik-metrics
selector:
matchLabels:
k8s-app: traefik
namespaceSelector:
matchNames:
- kube-system
endpoints:
- path: /metrics
port: metrics
additionalServiceMonitors:
- name: prometheus-traefik-metrics
selector:
matchLabels:
k8s-app: traefik
namespaceSelector:
matchNames:
- kube-system
endpoints:
- path: /metrics
port: metrics
EOF
fi #END INGRESS

View File

@ -98,6 +98,10 @@ class K8sFedoraTemplateDefinition(k8s_template_def.K8sTemplateDefinition):
'metrics_server_enabled',
'metrics_server_chart_tag',
'monitoring_enabled',
'monitoring_retention_days',
'monitoring_retention_size',
'monitoring_interval_seconds',
'monitoring_storage_class_name',
'prometheus_operator_chart_tag',
'prometheus_adapter_enabled',
'prometheus_adapter_chart_tag',

View File

@ -694,6 +694,28 @@ parameters:
description: Enable or disable prometheus-operator monitoring solution.
default: false
monitoring_retention_days:
type: number
description: The length of time (in days) that prometheus metrics should be kept.
default: 14
monitoring_retention_size:
type: number
description: >
The maximum storage space (in Gi) allowed to be used by prometheus server to store metrics.
default: 14
monitoring_interval_seconds:
type: number
description: >
The time interval (in seconds) between consecutive metric scrapings.
default: 30
monitoring_storage_class_name:
type: string
description: The kubernetes storage class name to use for the prometheus pvc.
default: ""
prometheus_operator_chart_tag:
type: string
description: The stable/prometheus-operator chart version to use.
@ -1228,6 +1250,10 @@ resources:
keystone_auth_enabled: {get_param: keystone_auth_enabled}
k8s_keystone_auth_tag: {get_param: k8s_keystone_auth_tag}
monitoring_enabled: {get_param: monitoring_enabled}
monitoring_retention_days: {get_param: monitoring_retention_days}
monitoring_retention_size: {get_param: monitoring_retention_size}
monitoring_interval_seconds: {get_param: monitoring_interval_seconds}
monitoring_storage_class_name: {get_param: monitoring_storage_class_name}
prometheus_operator_chart_tag: {get_param: prometheus_operator_chart_tag}
prometheus_adapter_enabled: {get_param: prometheus_adapter_enabled}
prometheus_adapter_chart_tag: {get_param: prometheus_adapter_chart_tag}

View File

@ -463,6 +463,24 @@ parameters:
type: boolean
description: Enable or disable prometheus-operator monitoring solution.
monitoring_retention_days:
type: number
description: The length of time (in days) that prometheus metrics should be kept.
monitoring_retention_size:
type: number
description: >
The maximum storage space (in Gi) allowed to be used by prometheus server to store metrics.
monitoring_interval_seconds:
type: number
description: >
The time interval (in seconds) between consecutive metric scrapings.
monitoring_storage_class_name:
type: string
description: The kubernetes storage class name to use for the prometheus pvc.
prometheus_operator_chart_tag:
type: string
description: The stable/prometheus-operator chart version to use.
@ -795,6 +813,10 @@ resources:
"$KEYSTONE_AUTH_ENABLED": {get_param: keystone_auth_enabled}
"$K8S_KEYSTONE_AUTH_TAG": {get_param: k8s_keystone_auth_tag}
"$MONITORING_ENABLED": {get_param: monitoring_enabled}
"$MONITORING_RETENTION_DAYS": {get_param: monitoring_retention_days}
"$MONITORING_RETENTION_SIZE": {get_param: monitoring_retention_size}
"$MONITORING_INTERVAL_SECONDS": {get_param: monitoring_interval_seconds}
"$MONITORING_STORAGE_CLASS_NAME": {get_param: monitoring_storage_class_name}
"$PROMETHEUS_OPERATOR_CHART_TAG": {get_param: prometheus_operator_chart_tag}
"$PROMETHEUS_ADAPTER_ENABLED": {get_param: prometheus_adapter_enabled}
"$PROMETHEUS_ADAPTER_CHART_TAG": {get_param: prometheus_adapter_chart_tag}

View File

@ -704,6 +704,28 @@ parameters:
description: Enable or disable prometheus-operator monitoring solution.
default: false
monitoring_retention_days:
type: number
description: The length of time (in days) that prometheus metrics should be kept.
default: 14
monitoring_retention_size:
type: number
description: >
The maximum storage space (in Gi) allowed to be used by prometheus server to store metrics.
default: 14
monitoring_interval_seconds:
type: number
description: >
The time interval (in seconds) between consecutive metric scrapings.
default: 30
monitoring_storage_class_name:
type: string
description: The kubernetes storage class name to use for the prometheus pvc.
default: ""
prometheus_operator_chart_tag:
type: string
description: The stable/prometheus-operator chart version to use.
@ -1256,6 +1278,10 @@ resources:
keystone_auth_enabled: {get_param: keystone_auth_enabled}
k8s_keystone_auth_tag: {get_param: k8s_keystone_auth_tag}
monitoring_enabled: {get_param: monitoring_enabled}
monitoring_retention_days: {get_param: monitoring_retention_days}
monitoring_retention_size: {get_param: monitoring_retention_size}
monitoring_interval_seconds: {get_param: monitoring_interval_seconds}
monitoring_storage_class_name: {get_param: monitoring_storage_class_name}
prometheus_operator_chart_tag: {get_param: prometheus_operator_chart_tag}
prometheus_adapter_enabled: {get_param: prometheus_adapter_enabled}
prometheus_adapter_chart_tag: {get_param: prometheus_adapter_chart_tag}

View File

@ -467,6 +467,24 @@ parameters:
type: boolean
description: Enable or disable prometheus-operator monitoring solution.
monitoring_retention_days:
type: number
description: The length of time (in days) that prometheus metrics should be kept.
monitoring_retention_size:
type: number
description: >
The maximum storage space (in Gi) allowed to be used by prometheus server to store metrics.
monitoring_interval_seconds:
type: number
description: >
The time interval (in seconds) between consecutive metric scrapings.
monitoring_storage_class_name:
type: string
description: The kubernetes storage class name to use for the prometheus pvc.
prometheus_operator_chart_tag:
type: string
description: The stable/prometheus-operator chart version to use.
@ -814,6 +832,10 @@ resources:
"$KEYSTONE_AUTH_ENABLED": {get_param: keystone_auth_enabled}
"$K8S_KEYSTONE_AUTH_TAG": {get_param: k8s_keystone_auth_tag}
"$MONITORING_ENABLED": {get_param: monitoring_enabled}
"$MONITORING_RETENTION_DAYS": {get_param: monitoring_retention_days}
"$MONITORING_RETENTION_SIZE": {get_param: monitoring_retention_size}
"$MONITORING_INTERVAL_SECONDS": {get_param: monitoring_interval_seconds}
"$MONITORING_STORAGE_CLASS_NAME": {get_param: monitoring_storage_class_name}
"$PROMETHEUS_OPERATOR_CHART_TAG": {get_param: prometheus_operator_chart_tag}
"$PROMETHEUS_ADAPTER_ENABLED": {get_param: prometheus_adapter_enabled}
"$PROMETHEUS_ADAPTER_CHART_TAG": {get_param: prometheus_adapter_chart_tag}

View File

@ -539,6 +539,14 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
'k8s_keystone_auth_tag')
monitoring_enabled = mock_cluster.labels.get(
'monitoring_enabled')
monitoring_retention_days = mock_cluster.labels.get(
'monitoring_retention_days')
monitoring_retention_size = mock_cluster.labels.get(
'monitoring_retention_size')
monitoring_interval_seconds = mock_cluster.labels.get(
'monitoring_interval_seconds')
monitoring_storage_class_name = mock_cluster.labels.get(
'monitoring_storage_class_name')
prometheus_operator_chart_tag = mock_cluster.labels.get(
'prometheus_operator_chart_tag')
prometheus_adapter_enabled = mock_cluster.labels.get(
@ -674,6 +682,10 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
'keystone_auth_enabled': keystone_auth_enabled,
'k8s_keystone_auth_tag': k8s_keystone_auth_tag,
'monitoring_enabled': monitoring_enabled,
'monitoring_retention_days': monitoring_retention_days,
'monitoring_retention_size': monitoring_retention_size,
'monitoring_interval_seconds': monitoring_interval_seconds,
'monitoring_storage_class_name': monitoring_storage_class_name,
'prometheus_operator_chart_tag': prometheus_operator_chart_tag,
'prometheus_adapter_enabled': prometheus_adapter_enabled,
'prometheus_adapter_chart_tag': prometheus_adapter_chart_tag,
@ -1070,6 +1082,14 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
'k8s_keystone_auth_tag')
monitoring_enabled = mock_cluster.labels.get(
'monitoring_enabled')
monitoring_retention_days = mock_cluster.labels.get(
'monitoring_retention_days')
monitoring_retention_size = mock_cluster.labels.get(
'monitoring_retention_size')
monitoring_interval_seconds = mock_cluster.labels.get(
'monitoring_interval_seconds')
monitoring_storage_class_name = mock_cluster.labels.get(
'monitoring_storage_class_name')
prometheus_operator_chart_tag = mock_cluster.labels.get(
'prometheus_operator_chart_tag')
prometheus_adapter_enabled = mock_cluster.labels.get(
@ -1208,6 +1228,10 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
'keystone_auth_enabled': keystone_auth_enabled,
'k8s_keystone_auth_tag': k8s_keystone_auth_tag,
'monitoring_enabled': monitoring_enabled,
'monitoring_retention_days': monitoring_retention_days,
'monitoring_retention_size': monitoring_retention_size,
'monitoring_interval_seconds': monitoring_interval_seconds,
'monitoring_storage_class_name': monitoring_storage_class_name,
'prometheus_operator_chart_tag': prometheus_operator_chart_tag,
'prometheus_adapter_enabled': prometheus_adapter_enabled,
'prometheus_adapter_chart_tag': prometheus_adapter_chart_tag,

View File

@ -0,0 +1,12 @@
---
features:
- |
Added monitoring_retention_days magnum label allowing user to specify
prometheus server scraped metrics retention days (default: 14).
Added monitoring_retention_size magnum label allowing user to specify
prometheus server metrics storage maximum size in Gi (default: 14).
Added monitoring_interval_seconds magnum label allowing user to specify
prometheus scrape frequency in seconds (default: 30).
Added monitoring_storage_class_name magnum label allowing user to specify
the storageClass to use as external retention for pod fail-over data
persistency.