# magnum/magnum/drivers/common/templates/kubernetes/helm/prometheus-operator.sh

# Turn command tracing off while the parameter file is loaded
# (presumably so parameter values such as passwords are not echoed to the
# log -- confirm).
set +x
# Load the cluster parameters. NOTE(review): this is presumably where the
# variables this script reads but never assigns (MONITORING_ENABLED,
# MAX_NODE_COUNT, GRAFANA_ADMIN_PASSWD, ...) come from -- confirm.
. /etc/sysconfig/heat-params
# From here on: abort on the first failing command (-e) and trace every
# command as it runs (-x).
set -ex

# Name of the Helm chart configured below; used in the log message and as the
# "name" of the requirements.yaml entry.
CHART_NAME="prometheus-operator"

if [ "$(echo ${MONITORING_ENABLED} | tr '[:upper:]' '[:lower:]')" = "true" ]; then
    # Log which chart is being configured.
    printf '%s\n' "Writing ${CHART_NAME} config"

    # Directory that receives this chart's requirements.yaml entry; the
    # values.yaml appends later in this script write to the same directory.
    HELM_CHART_DIR="/srv/magnum/kubernetes/helm/magnum"
    mkdir -p "${HELM_CHART_DIR}"

    # Append (never overwrite) the dependency entry for this chart. The
    # delimiter is unquoted, so the ${...} references are expanded on write.
    cat >> "${HELM_CHART_DIR}/requirements.yaml" << REQUIREMENTS_EOF
- name: ${CHART_NAME}
  version: ${PROMETHEUS_OPERATOR_CHART_TAG}
  repository: https://prometheus-community.github.io/helm-charts
REQUIREMENTS_EOF

    # Calculate resources needed to run the Prometheus Monitoring Solution.
    # Sized from MAX_NODE_COUNT so we still have metrics if the cluster scales.
    # expr exits non-zero on an empty or non-numeric operand, so a bad
    # MAX_NODE_COUNT fails the assignment (and trips set -e) instead of
    # silently producing a bogus request size.
    PROMETHEUS_SERVER_CPU=$(expr 128 + 7 \* "${MAX_NODE_COUNT}")
    PROMETHEUS_SERVER_RAM=$(expr 256 + 40 \* "${MAX_NODE_COUNT}")

    # The PVC size and the Prometheus retention size use different units, so a
    # conversion is needed: MONITORING_RETENTION_SIZE is in GiB, the retention
    # size is written in GB (1 GiB = 1.073741824 GB), truncated to an integer.
    # The value is handed to awk with -v rather than spliced into the program
    # text, and awk exits non-zero when it is empty or not a plain number.
    MONITORING_RETENTION_SIZE_GB=$(awk -v gib="${MONITORING_RETENTION_SIZE}" \
        'BEGIN { if (gib !~ /^[0-9]+(\.[0-9]+)?$/) exit 1; print int(gib * 1.073741824) }')

    # Grafana gets persistent storage only when a storage class is configured.
    APP_GRAFANA_PERSISTENT_STORAGE="false"
    if [ -n "${MONITORING_STORAGE_CLASS_NAME}" ]; then
        APP_GRAFANA_PERSISTENT_STORAGE="true"
    fi

    # Validate if communication node <-> master is secure or insecure.
    # NOTE(review): PROTOCOL and INSECURE_SKIP_VERIFY are not referenced by
    # the rest of this script as visible here; they are kept in case another
    # fragment consumes them -- confirm.
    PROTOCOL="https"
    INSECURE_SKIP_VERIFY="False"
    if [ "${TLS_DISABLED}" = "True" ]; then
        PROTOCOL="http"
        INSECURE_SKIP_VERIFY="True"
    fi
    # FIXME: Force protocol to http as we don't want to use the cluster certs
    USE_HTTPS="False"

    # VERIFY_CA is compared case-insensitively; "false" disables verification.
    if [ "$(printf '%s' "${VERIFY_CA}" | tr '[:upper:]' '[:lower:]')" = "false" ]; then
        INSECURE_SKIP_VERIFY="True"
    fi

    # Append the prometheus-operator section of the chart values.
    # The heredoc delimiter is unquoted, so every ${...} below is expanded by
    # the shell at the moment the file is written.
    # scrapeInterval and retention are each emitted exactly once: a YAML
    # mapping must not repeat a key, and a repeated key leaves the effective
    # value up to the parser. They fall back to 30s / 14d when
    # MONITORING_INTERVAL_SECONDS / MONITORING_RETENTION_DAYS are empty.
    # NOTE(review): adminPassword is emitted unquoted; a numeric-looking or
    # YAML-special password may not be read back as the intended string --
    # confirm how GRAFANA_ADMIN_PASSWD is generated.
    cat << EOF >> "${HELM_CHART_DIR}/values.yaml"
prometheus-operator:

  defaultRules:
    rules:
      #TODO: To enable this we need firstly take care of exposing certs
      etcd: false

  alertmanager:
    alertmanagerSpec:
      image:
        repository: ${CONTAINER_INFRA_PREFIX:-quay.io/prometheus/}alertmanager
      # # Needs testing
      # resources:
      #   requests:
      #     cpu: 100m
      #     memory: 256Mi
      priorityClassName: "system-cluster-critical"

  # Dashboard
  grafana:
    image:
      repository: ${CONTAINER_INFRA_PREFIX:-grafana/}grafana
    #enabled: ${ENABLE_GRAFANA}
    sidecar:
      image: ${CONTAINER_INFRA_PREFIX:-kiwigrid/}k8s-sidecar:0.1.99
    resources:
      requests:
        cpu: 100m
        memory: 128Mi
    adminPassword: ${GRAFANA_ADMIN_PASSWD}
    persistence:
      enabled: ${APP_GRAFANA_PERSISTENT_STORAGE}
      storageClassName: ${MONITORING_STORAGE_CLASS_NAME}
      size: 1Gi

  kubeApiServer:
    tlsConfig:
      insecureSkipVerify: "False"

  kubelet:
    serviceMonitor:
      https: "True"

  kubeControllerManager:
    ## If your kube controller manager is not deployed as a pod, specify IPs it can be found on
    endpoints: ${KUBE_MASTERS_PRIVATE}
    ## If using kubeControllerManager.endpoints only the port and targetPort are used
    service:
      port: 10252
      targetPort: 10252
      # selector:
      #   component: kube-controller-manager
    serviceMonitor:
      ## Enable scraping kube-controller-manager over https.
      ## Requires proper certs (not self-signed) and delegated authentication/authorization checks
      https: ${USE_HTTPS}
      # Skip TLS certificate validation when scraping
      insecureSkipVerify: null
      # Name of the server to use when validating TLS certificate
      serverName: null

  coreDns:
    enabled: true
    service:
      port: 9153
      targetPort: 9153
      selector:
        k8s-app: kube-dns

  kubeEtcd:
    ## If your etcd is not deployed as a pod, specify IPs it can be found on
    endpoints: ${KUBE_MASTERS_PRIVATE}
    ## Etcd service. If using kubeEtcd.endpoints only the port and targetPort are used
    service:
      port: 2379
      targetPort: 2379
      # selector:
      #   component: etcd
    ## Configure secure access to the etcd cluster by loading a secret into prometheus and
    ## specifying security configuration below. For example, with a secret named etcd-client-cert
    serviceMonitor:
      scheme: https
      insecureSkipVerify: true
      caFile: /etc/prometheus/secrets/etcd-certificates/ca.crt
      certFile: /etc/prometheus/secrets/etcd-certificates/kubelet.crt
      keyFile: /etc/prometheus/secrets/etcd-certificates/kubelet.key

  kubeScheduler:
    ## If your kube scheduler is not deployed as a pod, specify IPs it can be found on
    endpoints: ${KUBE_MASTERS_PRIVATE}
    ## If using kubeScheduler.endpoints only the port and targetPort are used
    service:
      port: 10251
      targetPort: 10251
      # selector:
      #   component: kube-scheduler
    serviceMonitor:
      ## Enable scraping kube-scheduler over https.
      ## Requires proper certs (not self-signed) and delegated authentication/authorization checks
      https: ${USE_HTTPS}
      ## Skip TLS certificate validation when scraping
      insecureSkipVerify: null
      ## Name of the server to use when validating TLS certificate
      serverName: null

  # kubeProxy:
  #   ## If your kube proxy is not deployed as a pod, specify IPs it can be found on
  #   endpoints: [] # masters + minions
  #   serviceMonitor:
  #     ## Enable scraping kube-proxy over https.
  #     ## Requires proper certs (not self-signed) and delegated authentication/authorization checks
  #     https: ${USE_HTTPS}

  kube-state-metrics:
    priorityClassName: "system-cluster-critical"
    resources:
      #Guaranteed
      limits:
        cpu: 50m
        memory: 64M

  prometheus-node-exporter:
    priorityClassName: "system-node-critical"
    resources:
      #Guaranteed
      limits:
        cpu: 20m
        memory: 20M
    extraArgs:
      - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/)
      - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
    sidecars: []
    ##  - name: nvidia-dcgm-exporter
    ##    image: nvidia/dcgm-exporter:1.4.3

  prometheusOperator:
    priorityClassName: "system-cluster-critical"
    tlsProxy:
      image:
        repository: ${CONTAINER_INFRA_PREFIX:-squareup/}ghostunnel
    admissionWebhooks:
      patch:
        image:
          repository: ${CONTAINER_INFRA_PREFIX:-jettech/}kube-webhook-certgen
        priorityClassName: "system-cluster-critical"

    resources: {}
    # requests:
    #   cpu: 5m
    #   memory: 10Mi
    image:
      repository: ${CONTAINER_INFRA_PREFIX:-quay.io/coreos/}prometheus-operator
    configmapReloadImage:
      repository: ${CONTAINER_INFRA_PREFIX:-quay.io/coreos/}configmap-reload
    prometheusConfigReloaderImage:
      repository: ${CONTAINER_INFRA_PREFIX:-quay.io/coreos/}prometheus-config-reloader
    hyperkubeImage:
      repository: ${CONTAINER_INFRA_PREFIX:-k8s.gcr.io/}hyperkube

  prometheus:
    prometheusSpec:
      scrapeInterval: ${MONITORING_INTERVAL_SECONDS:-30}s
      evaluationInterval: 30s
      image:
        repository: ${CONTAINER_INFRA_PREFIX:-quay.io/prometheus/}prometheus
      externalLabels:
        cluster_uuid: ${CLUSTER_UUID}
      ## Secrets is a list of Secrets in the same namespace as the Prometheus object, which shall be mounted into the Prometheus Pods.
      ## The Secrets are mounted into /etc/prometheus/secrets/. Secrets changes after initial creation of a Prometheus object are not
      ## reflected in the running Pods. To change the secrets mounted into the Prometheus Pods, the object must be deleted and recreated
      ## with the new list of secrets.
      # secrets:
      # - etcd-certificates
      # - kube-controller-manager-certificates
      # - kube-scheduler-certificates
      # - kube-proxy-manager-certificates
      retention: ${MONITORING_RETENTION_DAYS:-14}d
      retentionSize: ${MONITORING_RETENTION_SIZE_GB}GB
      resources:
        requests:
          cpu: ${PROMETHEUS_SERVER_CPU}m
          memory: ${PROMETHEUS_SERVER_RAM}M
      priorityClassName: "system-cluster-critical"
EOF

    #######################
    # Persistent storage for Prometheus via a k8s storageClass.
    # Only emitted when MONITORING_STORAGE_CLASS_NAME is non-empty. The block
    # is appended to values.yaml at a 6-space indent, i.e. at the same level
    # as the prometheusSpec keys written by the preceding values.yaml append.
    if [ -n "${MONITORING_STORAGE_CLASS_NAME}" ]; then
        cat >> "${HELM_CHART_DIR}/values.yaml" << STORAGE_SPEC_EOF
      storageSpec:
        volumeClaimTemplate:
          spec:
            storageClassName: ${MONITORING_STORAGE_CLASS_NAME}
            accessModes: ["ReadWriteMany"]
            resources:
              requests:
                storage: ${MONITORING_RETENTION_SIZE}Gi
STORAGE_SPEC_EOF
    fi #END PERSISTENT STORAGE CONFIG

    #######################
    # Set up definitions for ingress objects

    # Ensure name conformity: INGRESS_CONTROLLER is lower-cased in place so
    # the match below is case-insensitive.
    INGRESS_CONTROLLER=$(printf '%s' "${INGRESS_CONTROLLER}" | tr '[:upper:]' '[:lower:]')
    case "${INGRESS_CONTROLLER}" in
        traefik)
            # Appended at a 4-space indent, i.e. one level below the top-level
            # chart key in values.yaml.
            cat << EOF >> "${HELM_CHART_DIR}/values.yaml"
    additionalServiceMonitors:
    - name: prometheus-traefik-metrics
      selector:
        matchLabels:
          k8s-app: traefik
      namespaceSelector:
        matchNames:
        - kube-system
      endpoints:
      - path: /metrics
        port: metrics
EOF
            ;;
        *)
            # nginx (and any other or empty value): nothing is added here.
            ;;
    esac #END INGRESS

    # When auto scaling is enabled (AUTO_SCALING_ENABLED equals "true",
    # compared case-insensitively), append a PodMonitor that selects pods
    # labelled app=cluster-autoscaler in kube-system. Uses the POSIX "="
    # comparison, matching the MONITORING_ENABLED check at the top of the
    # script.
    if [ "$(printf '%s' "${AUTO_SCALING_ENABLED}" | tr '[:upper:]' '[:lower:]')" = "true" ]; then
        cat << EOF >> "${HELM_CHART_DIR}/values.yaml"
    additionalPodMonitors:
    - name: prometheus-cluster-autoscaler
      podMetricsEndpoints:
      - port: metrics
        scheme: http
      namespaceSelector:
        matchNames:
        - kube-system
      selector:
        matchLabels:
          app: cluster-autoscaler
EOF
    fi #END AUTOSCALING
fi