Update magnum k8s monitoring infra

* Prometheus-server now runs only on master nodes.
* Update prometheus-operator helm chart and tag.
* Update prometheus-adapter version.
* Deprecation notice for prometheus_monitoring component.

task: 41569
story: 2006765

Signed-off-by: Diogo Guerra <diogo.filipe.tomas.guerra@cern.ch>
Change-Id: I05e8c2be4e4c8e66a166b485ec7851875dca8b1c
Diogo Guerra <diogo.filipe.tomas.guerra@cern.ch> authored 2021-01-07 14:20:42 +00:00, committed by root
parent 61c7f7b34b
commit 7b257e94b1
7 changed files with 114 additions and 91 deletions


@@ -1264,13 +1264,14 @@ _`container_infra_prefix`
 Images that might be needed if 'monitoring_enabled' is 'true':
-* quay.io/prometheus/alertmanager:v0.20.0
-* docker.io/squareup/ghostunnel:v1.5.2
-* docker.io/jettech/kube-webhook-certgen:v1.0.0
-* quay.io/coreos/prometheus-operator:v0.37.0
-* quay.io/coreos/configmap-reload:v0.0.1
-* quay.io/coreos/prometheus-config-reloader:v0.37.0
-* quay.io/prometheus/prometheus:v2.15.2
+* quay.io/prometheus/alertmanager:v0.21.0
+* docker.io/jettech/kube-webhook-certgen:v1.5.0
+* quay.io/prometheus-operator/prometheus-operator:v0.44.0
+* docker.io/jimmidyson/configmap-reload:v0.4.0
+* quay.io/prometheus-operator/prometheus-config-reloader:v0.44.0
+* quay.io/prometheus/prometheus:v2.22.1
+* quay.io/prometheus/node-exporter:v1.0.1
+* docker.io/directxman12/k8s-prometheus-adapter:v0.8.2
 Images that might be needed if 'cinder_csi_enabled' is 'true':
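All of these image references honour the `container_infra_prefix` label through shell default expansion in the deployment scripts; a minimal self-contained sketch of that pattern (the mirror registry name is a placeholder, not part of this change):

```shell
# ${VAR:-default} falls back to the upstream registry when the label is
# unset or empty; a non-empty private prefix replaces it.
CONTAINER_INFRA_PREFIX=""
echo "${CONTAINER_INFRA_PREFIX:-quay.io/prometheus/}prometheus"

CONTAINER_INFRA_PREFIX="registry.example.com/mirror/"
echo "${CONTAINER_INFRA_PREFIX:-quay.io/prometheus/}prometheus"
```

This is why every image above must exist under the same path in the mirror when a prefix is set: only the registry prefix is swapped, never the image name or tag.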


@@ -35,15 +35,15 @@ _`metrics_server_enabled`
 _`monitoring_enabled`
   Enable installation of cluster monitoring solution provided by the
-  stable/prometheus-operator helm chart.
+  prometheus-community/kube-prometheus-stack helm chart.
   To use this service tiller_enabled must be true when using
   helm_client_tag<v3.0.0.
   Default: false
 _`prometheus_adapter_enabled`
   Enable installation of cluster custom metrics provided by the
-  stable/prometheus-adapter helm chart. This service depends on
-  monitoring_enabled.
+  prometheus-community/prometheus-adapter helm chart.
+  This service depends on monitoring_enabled.
   Default: true
 To control deployed versions, extra labels are available:
@@ -56,14 +56,17 @@ _`metrics_server_chart_tag`
 _`prometheus_operator_chart_tag`
   Add prometheus_operator_chart_tag to select version of the
-  stable/prometheus-operator chart to install. When installing the chart,
-  helm will use the default values of the tag defined and overwrite them based
-  on the prometheus-operator-config ConfigMap currently defined. You must
-  certify that the versions are compatible.
+  prometheus-community/kube-prometheus-stack chart to install.
+  When installing the chart, helm will use the default values of the tag
+  defined and overwrite them based on the prometheus-operator-config
+  ConfigMap currently defined.
+  You must certify that the versions are compatible.
+  Wallaby-default: 17.2.0
 _`prometheus_adapter_chart_tag`
-  The stable/prometheus-adapter helm chart version to use.
+  The prometheus-community/prometheus-adapter helm chart version to use.
   Train-default: 1.4.0
+  Wallaby-default: 2.12.1

 Full fledged cluster monitoring
 +++++++++++++++++++++++++++++++
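These labels are passed as a comma-separated string at cluster-template creation time; a sketch of composing it, where the template name and the commented CLI invocation are illustrative and require OpenStack credentials:

```shell
# Build the labels string documented above; values match the Wallaby
# defaults from this change.
labels="monitoring_enabled=true"
labels="${labels},prometheus_operator_chart_tag=17.2.0"
labels="${labels},prometheus_adapter_chart_tag=2.12.1"
echo "${labels}"

# hypothetical usage (not part of this change):
# openstack coe cluster template create k8s-monitored --labels "${labels}" ...
```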


@@ -21,10 +21,11 @@ EOF
 cat << EOF >> ${HELM_CHART_DIR}/values.yaml
 prometheus-adapter:
   image:
-    repository: ${CONTAINER_INFRA_PREFIX:-docker.io/directxman12/}k8s-prometheus-adapter-${ARCH}
+    repository: ${CONTAINER_INFRA_PREFIX:-k8s.gcr.io/prometheus-adapter/}prometheus-adapter
   priorityClassName: "system-cluster-critical"
   prometheus:
-    url: http://web.tcp.prometheus-prometheus.kube-system.svc.cluster.local
+    url: http://web.tcp.magnum-kube-prometheus-sta-prometheus.kube-system.svc.cluster.local
+    path: /prometheus
   resources:
     requests:
       cpu: 150m
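The odd `-sta` fragment in the new service host is not a typo: the kube-prometheus-stack chart truncates its fullname (release name joined with the chart name). A sketch of that truncation, assuming the chart's 26-character fullname limit:

```shell
release="magnum"
chart="kube-prometheus-stack"
fullname="${release}-${chart}"   # magnum-kube-prometheus-stack (28 chars)
# Truncate to 26 characters, as the chart's helper template does
# (the limit is an assumption based on the observed service name).
echo "${fullname:0:26}"
```

The truncated prefix plus the `-prometheus` suffix yields the `magnum-kube-prometheus-sta-prometheus` service name used in the URL above.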


@@ -2,7 +2,7 @@ set +x
 . /etc/sysconfig/heat-params
 set -ex

-CHART_NAME="prometheus-operator"
+CHART_NAME="kube-prometheus-stack"

 if [ "$(echo ${MONITORING_ENABLED} | tr '[:upper:]' '[:lower:]')" = "true" ]; then
     echo "Writing ${CHART_NAME} config"
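The enable check lower-cases the heat parameter before comparing, so `True`, `TRUE`, and `true` all pass; a self-contained sketch of the same pattern:

```shell
# Heat passes booleans with varying capitalisation; normalise first.
MONITORING_ENABLED="True"
if [ "$(echo ${MONITORING_ENABLED} | tr '[:upper:]' '[:lower:]')" = "true" ]; then
    echo "monitoring on"
fi
```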
@@ -80,22 +80,18 @@ EOF
     PROTOCOL="http"
     INSECURE_SKIP_VERIFY="True"
 fi
-# FIXME: Force protocol to http as we don't want to use the cluster certs
-USE_HTTPS="False"
 if [ "$(echo ${VERIFY_CA} | tr '[:upper:]' '[:lower:]')" == "false" ]; then
     INSECURE_SKIP_VERIFY="True"
 fi

 cat << EOF >> ${HELM_CHART_DIR}/values.yaml
-prometheus-operator:
-  defaultRules:
-    rules:
-      #TODO: To enable this we need firstly take care of exposing certs
-      etcd: false
+kube-prometheus-stack:
   alertmanager:
     podDisruptionBudget:
       enabled: true
     #config:
     ingress:
       enabled: ${MONITORING_INGRESS_ENABLED}
       annotations:
@@ -108,6 +104,7 @@ ${APP_INGRESS_BASIC_AUTH_ANNOTATIONS}
       - ${CLUSTER_ROOT_DOMAIN_NAME}
       paths:
       - /alertmanager${APP_INGRESS_PATH_APPEND}
+      pathType: ImplementationSpecific
       ## TLS configuration for Alertmanager Ingress
       ## Secret must be manually created in the namespace
       tls: []
@@ -118,8 +115,8 @@ ${APP_INGRESS_BASIC_AUTH_ANNOTATIONS}
       image:
         repository: ${CONTAINER_INFRA_PREFIX:-quay.io/prometheus/}alertmanager
       logFormat: json
-      routePrefix: /alertmanager
+      externalUrl: https://${CLUSTER_ROOT_DOMAIN_NAME}/alertmanager
+      # routePrefix: /alertmanager
       # resources:
       #   requests:
       #     cpu: 100m
@@ -127,15 +124,7 @@ ${APP_INGRESS_BASIC_AUTH_ANNOTATIONS}
       priorityClassName: "system-cluster-critical"
   grafana:
-    image:
-      repository: ${CONTAINER_INFRA_PREFIX:-grafana/}grafana
     #enabled: ${ENABLE_GRAFANA}
-    sidecar:
-      image: ${CONTAINER_INFRA_PREFIX:-kiwigrid/}k8s-sidecar:0.1.99
-    resources:
-      requests:
-        cpu: 100m
-        memory: 128Mi
     adminPassword: ${GRAFANA_ADMIN_PASSWD}
     ingress:
       enabled: ${MONITORING_INGRESS_ENABLED}
@@ -146,13 +135,24 @@ ${APP_INGRESS_ANNOTATIONS}
       ## Must be provided if Ingress is enable.
       hosts:
       - ${CLUSTER_ROOT_DOMAIN_NAME}
-      path: /grafana${APP_INGRESS_PATH_APPEND}
+      paths:
+      - /grafana${APP_INGRESS_PATH_APPEND}
+      pathType: ImplementationSpecific
       ## TLS configuration for grafana Ingress
       ## Secret must be manually created in the namespace
       tls: []
       # - secretName: grafana-general-tls
       #   hosts:
       #   - grafana.example.com
+    sidecar:
+      image:
+        repository: ${CONTAINER_INFRA_PREFIX:-quay.io/kiwigrid/}k8s-sidecar
+    image:
+      repository: ${CONTAINER_INFRA_PREFIX:-grafana/}grafana
+    resources:
+      requests:
+        cpu: 100m
+        memory: 128Mi
     persistence:
       enabled: ${APP_GRAFANA_PERSISTENT_STORAGE}
       storageClassName: ${MONITORING_STORAGE_CLASS_NAME}
@@ -162,21 +162,10 @@ ${APP_INGRESS_ANNOTATIONS}
         domain: ${CLUSTER_ROOT_DOMAIN_NAME}
         root_url: https://${CLUSTER_ROOT_DOMAIN_NAME}/grafana
         serve_from_sub_path: true
-      paths:
-        data: /var/lib/grafana/data
-        logs: /var/log/grafana
-        plugins: /var/lib/grafana/plugins
-        provisioning: /etc/grafana/provisioning
-      analytics:
-        check_for_updates: true
-      log:
-        mode: console
-      log.console:
-        format: json
-      grafana_net:
-        url: https://grafana.net
-    plugins:
-    - grafana-piechart-panel
   kubeApiServer:
     tlsConfig:
@@ -198,9 +187,9 @@ ${APP_INGRESS_ANNOTATIONS}
     serviceMonitor:
       ## Enable scraping kube-controller-manager over https.
       ## Requires proper certs (not self-signed) and delegated authentication/authorization checks
-      https: ${USE_HTTPS}
+      https: "True"
       # Skip TLS certificate validation when scraping
-      insecureSkipVerify: null
+      insecureSkipVerify: "True"
       # Name of the server to use when validating TLS certificate
       serverName: null
@@ -242,19 +231,21 @@ ${APP_INGRESS_ANNOTATIONS}
     serviceMonitor:
       ## Enable scraping kube-scheduler over https.
       ## Requires proper certs (not self-signed) and delegated authentication/authorization checks
-      https: ${USE_HTTPS}
+      https: "True"
       ## Skip TLS certificate validation when scraping
-      insecureSkipVerify: null
+      insecureSkipVerify: "True"
       ## Name of the server to use when validating TLS certificate
       serverName: null
-  # kubeProxy:
-  #   ## If your kube proxy is not deployed as a pod, specify IPs it can be found on
-  #   endpoints: [] # masters + minions
-  #   serviceMonitor:
-  #     ## Enable scraping kube-proxy over https.
-  #     ## Requires proper certs (not self-signed) and delegated authentication/authorization checks
-  #     https: ${USE_HTTPS}
+  kubeProxy:
+    ## If your kube proxy is not deployed as a pod, specify IPs it can be found on
+    endpoints: ${KUBE_MASTERS_PRIVATE} # masters + minions
+    serviceMonitor:
+      ## Enable scraping kube-proxy over https.
+      ## Requires proper certs (not self-signed) and delegated authentication/authorization checks
+      https: "True"
+      ## Skip TLS certificate validation when scraping
+      insecureSkipVerify: "True"
   kube-state-metrics:
     priorityClassName: "system-cluster-critical"
@@ -271,37 +262,34 @@ ${APP_INGRESS_ANNOTATIONS}
       limits:
         cpu: 20m
         memory: 20M
-    extraArgs:
-    - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/)
-    - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
     sidecars: []
     ## - name: nvidia-dcgm-exporter
     ##   image: nvidia/dcgm-exporter:1.4.3
   prometheusOperator:
+    priorityClassName: "system-cluster-critical"
-    tlsProxy:
-      image:
-        repository: ${CONTAINER_INFRA_PREFIX:-squareup/}ghostunnel
     admissionWebhooks:
       patch:
         image:
           repository: ${CONTAINER_INFRA_PREFIX:-jettech/}kube-webhook-certgen
         priorityClassName: "system-cluster-critical"
-    resources: {}
-    # requests:
-    #   cpu: 5m
-    #   memory: 10Mi
+        resources:
+          requests:
+            cpu: 2m
+          limits:
+            memory: 30M
     # clusterDomain: ${CLUSTER_ROOT_DOMAIN_NAME}
-    priorityClassName: "system-cluster-critical"
     logFormat: json
     logLevel: info
+    resources:
+      requests:
+        cpu: 2m
+      limits:
+        memory: 32M
     image:
-      repository: ${CONTAINER_INFRA_PREFIX:-quay.io/coreos/}prometheus-operator
-    configmapReloadImage:
-      repository: ${CONTAINER_INFRA_PREFIX:-quay.io/coreos/}configmap-reload
+      repository: ${CONTAINER_INFRA_PREFIX:-quay.io/prometheus-operator/}prometheus-operator
     prometheusDefaultBaseImage: ${CONTAINER_INFRA_PREFIX:-quay.io/prometheus/}prometheus
     alertmanagerDefaultBaseImage: ${CONTAINER_INFRA_PREFIX:-quay.io/prometheus/}alertmanager
     prometheusConfigReloaderImage:
-      repository: ${CONTAINER_INFRA_PREFIX:-quay.io/coreos/}prometheus-config-reloader
-    hyperkubeImage:
-      repository: ${CONTAINER_INFRA_PREFIX:-k8s.gcr.io/}hyperkube
+      repository: ${CONTAINER_INFRA_PREFIX:-quay.io/prometheus-operator/}prometheus-config-reloader
+    thanosImage:
+      repository: ${CONTAINER_INFRA_PREFIX:-quay.io/thanos/}thanos
   prometheus:
     ingress:
@@ -317,6 +305,7 @@ ${APP_INGRESS_BASIC_AUTH_ANNOTATIONS}
       - ${CLUSTER_ROOT_DOMAIN_NAME}
       paths:
       - /prometheus${APP_INGRESS_PATH_APPEND}
+      pathType: ImplementationSpecific
       ## TLS configuration for Prometheus Ingress
       ## Secret must be manually created in the namespace
       tls: []
@@ -332,11 +321,13 @@ ${APP_INGRESS_BASIC_AUTH_ANNOTATIONS}
       bearerTokenFile:
     prometheusSpec:
-      scrapeInterval: ${MONITORING_INTERVAL_SECONDS}s
+      scrapeInterval: 30s
+      evaluationInterval: 30s
       image:
         repository: ${CONTAINER_INFRA_PREFIX:-quay.io/prometheus/}prometheus
-      retention: 14d
+      tolerations:
+      - key: "node-role.kubernetes.io/master"
+        operator: "Exists"
+        effect: "NoSchedule"
       externalLabels:
         cluster_uuid: ${CLUSTER_UUID}
       externalUrl: https://${CLUSTER_ROOT_DOMAIN_NAME}/prometheus
@@ -352,7 +343,16 @@ ${APP_INGRESS_BASIC_AUTH_ANNOTATIONS}
       retention: ${MONITORING_RETENTION_DAYS}d
       retentionSize: ${MONITORING_RETENTION_SIZE_GB}GB
       logFormat: json
-      #routePrefix: /prometheus
+      routePrefix: /prometheus
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: magnum.openstack.org/role
+                operator: In
+                values:
+                - master
       resources:
         requests:
           cpu: ${PROMETHEUS_SERVER_CPU}m
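The whole values file above is assembled by appending heredocs to `values.yaml`, with the shell substituting the heat parameters into the YAML before Helm ever sees it; a minimal self-contained sketch of that pattern (paths and values here are placeholders):

```shell
HELM_CHART_DIR="$(mktemp -d)"
MONITORING_RETENTION_DAYS=14

# An unquoted EOF delimiter lets the shell expand ${...} inside the heredoc.
cat << EOF >> "${HELM_CHART_DIR}/values.yaml"
prometheusSpec:
  retention: ${MONITORING_RETENTION_DAYS}d
EOF

grep retention "${HELM_CHART_DIR}/values.yaml"
```

Quoting the delimiter (`<< 'EOF'`) would disable expansion, which is why the script leaves it unquoted everywhere it needs heat parameters resolved.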


@@ -740,8 +740,8 @@ parameters:
   prometheus_operator_chart_tag:
     type: string
-    description: The stable/prometheus-operator chart version to use.
-    default: v8.12.13
+    description: The prometheus-community/kube-prometheus-stack chart version to use.
+    default: 17.2.0

   prometheus_adapter_enabled:
     type: boolean
@@ -750,8 +750,8 @@ parameters:
   prometheus_adapter_chart_tag:
     type: string
-    description: The stable/prometheus-adapter chart version to use.
-    default: 1.4.0
+    description: The prometheus-community/prometheus-adapter chart version to use.
+    default: 2.5.1

   prometheus_adapter_configmap:
     type: string
@@ -1051,6 +1051,10 @@ resources:
         - protocol: udp
           port_range_min: 8472
           port_range_max: 8472
+        # Prometheus Server
+        - protocol: tcp
+          port_range_min: 9090
+          port_range_max: 9090

   secgroup_kube_minion:
     condition: create_cluster_resources


@@ -754,8 +754,8 @@ parameters:
   prometheus_operator_chart_tag:
     type: string
-    description: The stable/prometheus-operator chart version to use.
-    default: v8.12.13
+    description: The prometheus-community/kube-prometheus-stack chart version to use.
+    default: 33.0.0

   prometheus_adapter_enabled:
     type: boolean
@@ -764,8 +764,8 @@ parameters:
   prometheus_adapter_chart_tag:
     type: string
-    description: The stable/prometheus-adapter chart version to use.
-    default: 1.4.0
+    description: The prometheus-community/prometheus-adapter chart version to use.
+    default: 3.0.2

   prometheus_adapter_configmap:
     type: string
@@ -1082,6 +1082,10 @@ resources:
         - protocol: udp
           port_range_min: 8472
           port_range_max: 8472
+        # Prometheus Server
+        - protocol: tcp
+          port_range_min: 9090
+          port_range_max: 9090

   secgroup_kube_minion:
     condition: create_cluster_resources
condition: create_cluster_resources


@@ -0,0 +1,10 @@
+---
+upgrade:
+  - Prometheus-Adapter helm chart updated to 2.12.1 from 1.4.0.
+  - Prometheus-Operator helm chart updated to kube-prometheus-stack:17.2.0
+    from prometheus-operator:v8.12.13.
+  - Prometheus-server now runs only on master nodes.
+deprecations:
+  - Enabling monitoring using the prometheus_monitoring label is deprecated
+    and will be removed in the X cycle.