diff --git a/global/profiles/host/cp.yaml b/global/profiles/host/cp.yaml
index f976652bd..1ab68a052 100644
--- a/global/profiles/host/cp.yaml
+++ b/global/profiles/host/cp.yaml
@@ -105,4 +105,5 @@ data:
kube-ingress: enabled
beta.kubernetes.io/fluentd-ds-ready: 'true'
node-exporter: enabled
+ fluentbit: enabled
...
diff --git a/global/profiles/host/dp.yaml b/global/profiles/host/dp.yaml
index 2a454edb0..51df5f68d 100644
--- a/global/profiles/host/dp.yaml
+++ b/global/profiles/host/dp.yaml
@@ -57,4 +57,5 @@ data:
openstack-libvirt: kernel
beta.kubernetes.io/fluentd-ds-ready: 'true'
node-exporter: enabled
+ fluentbit: enabled
...
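The fluentbit: enabled label added to both host profiles pairs with the fluent-logging chart further down in this change, which moves its fluentbit and fluentd node_selector_key values onto labels of the same names. A DaemonSet nodeSelector is a plain equality match against node labels; a minimal sketch of that check (illustrative only, not the scheduler's implementation):

    # Illustrative only: nodeSelector semantics are an exact-match subset test
    # against a node's labels, which is why the host profiles must now carry
    # "fluentbit: enabled" for the fluent-logging pods to land on those nodes.
    def node_matches(node_labels: dict, node_selector: dict) -> bool:
        return all(node_labels.get(k) == v for k, v in node_selector.items())

    cp_node = {"kube-ingress": "enabled", "node-exporter": "enabled", "fluentbit": "enabled"}
    print(node_matches(cp_node, {"fluentbit": "enabled"}))  # True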
diff --git a/global/software/charts/osh-infra/osh-infra-dashboards/grafana.yaml b/global/software/charts/osh-infra/osh-infra-dashboards/grafana.yaml
index 48e36e23a..ae7ddf022 100644
--- a/global/software/charts/osh-infra/osh-infra-dashboards/grafana.yaml
+++ b/global/software/charts/osh-infra/osh-infra-dashboards/grafana.yaml
@@ -68,6 +68,12 @@ metadata:
path: .osh_infra.grafana.oslo_db
dest:
path: .values.endpoints.oslo_db.auth.user
+ - src:
+ schema: pegleg/AccountCatalogue/v1
+ name: osh_infra_service_accounts
+ path: .osh_infra.prometheus.admin
+ dest:
+ path: .values.endpoints.prometheus.auth.admin
- src:
schema: pegleg/AccountCatalogue/v1
name: osh_infra_service_accounts
@@ -120,6 +126,12 @@ metadata:
schema: deckhand/Passphrase/v1
name: osh_infra_oslo_db_admin_password
path: .
+ - dest:
+ path: .values.endpoints.prometheus.auth.admin.password
+ src:
+ schema: deckhand/Passphrase/v1
+ name: osh_infra_prometheus_admin_password
+ path: .
# LDAP Configuration Details
- src:
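Every substitutions entry in these documents copies a value out of a source document (identified by schema, name, and path) into the chart's values at the dest path before the chart is rendered, which is how the new prometheus admin account and password reach Grafana without being stored in this file. A minimal sketch of that path-copy operation, assuming simple dotted paths (not Deckhand's actual implementation):

    # Sketch of a Deckhand-style substitution: read .osh_infra.prometheus.admin from
    # the account catalogue and write it to .values.endpoints.prometheus.auth.admin.
    def get_path(doc: dict, path: str):
        node = doc
        for key in path.strip(".").split("."):
            node = node[key]
        return node

    def set_path(doc: dict, path: str, value) -> None:
        keys = path.strip(".").split(".")
        node = doc
        for key in keys[:-1]:
            node = node.setdefault(key, {})
        node[keys[-1]] = value

    catalogue = {"osh_infra": {"prometheus": {"admin": {"username": "admin"}}}}
    chart = {}
    set_path(chart, ".values.endpoints.prometheus.auth.admin",
             get_path(catalogue, ".osh_infra.prometheus.admin"))
    print(chart)  # {'values': {'endpoints': {'prometheus': {'auth': {'admin': {'username': 'admin'}}}}}}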
diff --git a/global/software/charts/osh-infra/osh-infra-logging/elasticsearch.yaml b/global/software/charts/osh-infra/osh-infra-logging/elasticsearch.yaml
index 2e7a6b640..6b16eddc6 100644
--- a/global/software/charts/osh-infra/osh-infra-logging/elasticsearch.yaml
+++ b/global/software/charts/osh-infra/osh-infra-logging/elasticsearch.yaml
@@ -54,7 +54,18 @@ metadata:
path: .osh_infra.elasticsearch.admin
dest:
path: .values.endpoints.elasticsearch.auth.admin
-
+ - src:
+ schema: pegleg/AccountCatalogue/v1
+ name: osh_infra_service_accounts
+ path: .osh_infra.ceph_object_store.admin
+ dest:
+ path: .values.endpoints.ceph_object_store.auth.admin
+ - src:
+ schema: pegleg/AccountCatalogue/v1
+ name: osh_infra_service_accounts
+ path: .osh_infra.ceph_object_store.elasticsearch
+ dest:
+ path: .values.endpoints.ceph_object_store.auth.elasticsearch
# Secrets
- dest:
path: .values.endpoints.elasticsearch.auth.admin.password
@@ -62,6 +73,30 @@ metadata:
schema: deckhand/Passphrase/v1
name: osh_infra_elasticsearch_admin_password
path: .
+ - dest:
+ path: .values.endpoints.ceph_object_store.auth.admin.access_key
+ src:
+ schema: deckhand/Passphrase/v1
+ name: osh_infra_rgw_s3_admin_access_key
+ path: .
+ - dest:
+ path: .values.endpoints.ceph_object_store.auth.admin.secret_key
+ src:
+ schema: deckhand/Passphrase/v1
+ name: osh_infra_rgw_s3_admin_secret_key
+ path: .
+ - dest:
+ path: .values.endpoints.ceph_object_store.auth.elasticsearch.access_key
+ src:
+ schema: deckhand/Passphrase/v1
+ name: osh_infra_rgw_s3_elasticsearch_access_key
+ path: .
+ - dest:
+ path: .values.endpoints.ceph_object_store.auth.elasticsearch.secret_key
+ src:
+ schema: deckhand/Passphrase/v1
+ name: osh_infra_rgw_s3_elasticsearch_secret_key
+ path: .
# LDAP Details
- src:
@@ -97,6 +132,75 @@ data:
post:
create: []
values:
+ pod:
+ replicas:
+ client: 5
+ resources:
+ enabled: true
+ apache_proxy:
+ limits:
+ memory: "1024Mi"
+ cpu: "2000m"
+ requests:
+ memory: "0"
+ cpu: "0"
+ client:
+ requests:
+ memory: "8Gi"
+ cpu: "1000m"
+ limits:
+ memory: "16Gi"
+ cpu: "2000m"
+ master:
+ requests:
+ memory: "8Gi"
+ cpu: "1000m"
+ limits:
+ memory: "16Gi"
+ cpu: "2000m"
+ data:
+ requests:
+ memory: "8Gi"
+ cpu: "1000m"
+ limits:
+ memory: "16Gi"
+ cpu: "2000m"
+ prometheus_elasticsearch_exporter:
+ requests:
+ memory: "0"
+ cpu: "0"
+ limits:
+ memory: "1024Mi"
+ cpu: "2000m"
+ jobs:
+ curator:
+ requests:
+ memory: "0"
+ cpu: "0"
+ limits:
+ memory: "1024Mi"
+ cpu: "2000m"
+ image_repo_sync:
+ requests:
+ memory: "0"
+ cpu: "0"
+ limits:
+ memory: "1024Mi"
+ cpu: "2000m"
+ snapshot_repository:
+ requests:
+ memory: "0"
+ cpu: "0"
+ limits:
+ memory: "1024Mi"
+ cpu: "2000m"
+ tests:
+ requests:
+ memory: "0"
+ cpu: "0"
+ limits:
+ memory: "1024Mi"
+ cpu: "2000m"
labels:
elasticsearch:
node_selector_key: openstack-control-plane
@@ -108,27 +212,95 @@ data:
prometheus:
enabled: true
conf:
- apache:
- host: |
-          <VirtualHost *:80>
-            <Location />
- ProxyPass http://localhost:{{ tuple "elasticsearch" "internal" "client" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}/
- ProxyPassReverse http://localhost:{{ tuple "elasticsearch" "internal" "client" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}/
-            </Location>
-            <Proxy *>
- AuthName "Elasticsearch"
- AuthType Basic
- AuthBasicProvider file ldap
- AuthUserFile /usr/local/apache2/conf/.htpasswd
- AuthLDAPBindDN {{ .Values.endpoints.ldap.auth.admin.bind }}
- AuthLDAPBindPassword {{ .Values.endpoints.ldap.auth.admin.password }}
- AuthLDAPURL {{ tuple "ldap" "public" "ldap" . | include "helm-toolkit.endpoints.keystone_endpoint_uri_lookup" }}
- Require valid-user
-            </Proxy>
-          </VirtualHost>
+ httpd: |
+ ServerRoot "/usr/local/apache2"
+ Listen 80
+ LoadModule mpm_event_module modules/mod_mpm_event.so
+ LoadModule authn_file_module modules/mod_authn_file.so
+ LoadModule authn_core_module modules/mod_authn_core.so
+ LoadModule authz_host_module modules/mod_authz_host.so
+ LoadModule authz_groupfile_module modules/mod_authz_groupfile.so
+ LoadModule authz_user_module modules/mod_authz_user.so
+ LoadModule authz_core_module modules/mod_authz_core.so
+ LoadModule access_compat_module modules/mod_access_compat.so
+ LoadModule auth_basic_module modules/mod_auth_basic.so
+ LoadModule ldap_module modules/mod_ldap.so
+ LoadModule authnz_ldap_module modules/mod_authnz_ldap.so
+ LoadModule reqtimeout_module modules/mod_reqtimeout.so
+ LoadModule filter_module modules/mod_filter.so
+ LoadModule proxy_html_module modules/mod_proxy_html.so
+ LoadModule log_config_module modules/mod_log_config.so
+ LoadModule env_module modules/mod_env.so
+ LoadModule headers_module modules/mod_headers.so
+ LoadModule setenvif_module modules/mod_setenvif.so
+ LoadModule version_module modules/mod_version.so
+ LoadModule proxy_module modules/mod_proxy.so
+ LoadModule proxy_connect_module modules/mod_proxy_connect.so
+ LoadModule proxy_http_module modules/mod_proxy_http.so
+ LoadModule proxy_balancer_module modules/mod_proxy_balancer.so
+ LoadModule slotmem_shm_module modules/mod_slotmem_shm.so
+ LoadModule slotmem_plain_module modules/mod_slotmem_plain.so
+ LoadModule unixd_module modules/mod_unixd.so
+ LoadModule status_module modules/mod_status.so
+ LoadModule autoindex_module modules/mod_autoindex.so
+
+ User daemon
+ Group daemon
+
+        <Directory />
+ AllowOverride none
+ Require all denied
+        </Directory>
+        <Files ".ht*">
+ Require all denied
+        </Files>
+ ErrorLog /dev/stderr
+ LogLevel warn
+        <IfModule log_config_module>
+ LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" combined
+ LogFormat "%h %l %u %t \"%r\" %>s %b" common
+          <IfModule logio_module>
+ LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" %I %O" combinedio
+          </IfModule>
+ CustomLog /dev/stdout common
+ CustomLog /dev/stdout combined
+        </IfModule>
+        <Directory "/usr/local/apache2/htdocs">
+ AllowOverride None
+ Options None
+ Require all granted
+        </Directory>
+        <IfModule headers_module>
+ RequestHeader unset Proxy early
+        </IfModule>
+        <IfModule proxy_html_module>
+ Include conf/extra/proxy-html.conf
+        </IfModule>
+        <VirtualHost *:80>
+          <Location />
+ ProxyPass http://localhost:{{ tuple "elasticsearch" "internal" "client" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}/
+ ProxyPassReverse http://localhost:{{ tuple "elasticsearch" "internal" "client" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}/
+          </Location>
+          <Proxy *>
+ AuthName "Elasticsearch"
+ AuthType Basic
+ AuthBasicProvider file ldap
+ AuthUserFile /usr/local/apache2/conf/.htpasswd
+ AuthLDAPBindDN {{ .Values.endpoints.ldap.auth.admin.bind }}
+ AuthLDAPBindPassword {{ .Values.endpoints.ldap.auth.admin.password }}
+ AuthLDAPURL {{ tuple "ldap" "public" "ldap" . | include "helm-toolkit.endpoints.keystone_endpoint_uri_lookup" | quote }}
+ Require valid-user
+          </Proxy>
+        </VirtualHost>
elasticsearch:
+ config:
+ http:
+ max_content_length: 2gb
+ pipelining: false
env:
- java_opts: "-Xms5g -Xmx5g"
+ java_opts: "-Xms8g -Xmx8g"
+ snapshots:
+ enabled: true
curator:
#run every 6th hour
schedule: "0 */6 * * *"
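The new ceph_object_store accounts and RGW S3 keys above support the snapshots option enabled in the same chart (snapshots: enabled: true, plus the snapshot_repository job resources), which presumably registers an S3 snapshot repository against RADOS Gateway using the elasticsearch credentials. A hedged way to sanity-check such RGW S3 credentials from outside the chart; the endpoint URL and keys below are placeholders, not values defined in this manifest:

    # Hypothetical sanity check for the RGW S3 credentials wired in above.
    # Endpoint URL and keys are placeholders; substitute the site's real values.
    import boto3

    s3 = boto3.client(
        "s3",
        endpoint_url="http://ceph-rgw.ceph.svc.cluster.local:8088",  # placeholder RGW endpoint
        aws_access_key_id="REPLACE_WITH_osh_infra_rgw_s3_elasticsearch_access_key",
        aws_secret_access_key="REPLACE_WITH_osh_infra_rgw_s3_elasticsearch_secret_key",
    )
    print([b["Name"] for b in s3.list_buckets().get("Buckets", [])])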
diff --git a/global/software/charts/osh-infra/osh-infra-logging/fluent-logging.yaml b/global/software/charts/osh-infra/osh-infra-logging/fluent-logging.yaml
index 28f9dd633..f6e41179a 100644
--- a/global/software/charts/osh-infra/osh-infra-logging/fluent-logging.yaml
+++ b/global/software/charts/osh-infra/osh-infra-logging/fluent-logging.yaml
@@ -82,12 +82,51 @@ data:
post:
create: []
values:
+ pod:
+ resources:
+ enabled: true
+ fluentbit:
+ limits:
+ memory: '4Gi'
+ cpu: '2000m'
+ requests:
+ memory: '2Gi'
+ cpu: '1000m'
+ fluentd:
+ limits:
+ memory: '4Gi'
+ cpu: '2000m'
+ requests:
+ memory: '2Gi'
+ cpu: '1000m'
+ prometheus_fluentd_exporter:
+ limits:
+ memory: '1024Mi'
+ cpu: '2000m'
+ requests:
+ memory: '0'
+ cpu: '0'
+ jobs:
+ image_repo_sync:
+ requests:
+ memory: '0'
+ cpu: '0'
+ limits:
+ memory: '1024Mi'
+ cpu: '2000m'
+ tests:
+ requests:
+ memory: '0'
+ cpu: '0'
+ limits:
+ memory: '1024Mi'
+ cpu: '2000m'
labels:
fluentd:
- node_selector_key: openstack-control-plane
+ node_selector_key: fluentd
node_selector_value: enabled
fluentbit:
- node_selector_key: openstack-control-plane
+ node_selector_key: fluentbit
node_selector_value: enabled
prometheus_fluentd_exporter:
node_selector_key: openstack-control-plane
@@ -95,20 +134,6 @@ data:
job:
node_selector_key: openstack-control-plane
node_selector_value: enabled
- dependencies:
- static:
- fluentbit:
- jobs: ""
- services:
- - endpoint: internal
- service: fluentd
- fluentd:
- jobs: ""
- services:
- - endpoint: internal
- service: elasticsearch
- manifests:
- job_elasticsearch_template: false
conf:
fluentbit:
- service:
@@ -117,6 +142,28 @@ data:
Daemon: Off
Log_Level: info
Parsers_File: parsers.conf
+ - kubelet:
+ header: input
+ Name: systemd
+ Path: /var/log/journal
+ Tag: syslog.*
+ Systemd_Filter: _SYSTEMD_UNIT=kubelet.service
+ DB: /var/log/kubelet.db
+ Mem_Buf_Limit: 5MB
+ DB.Sync: Normal
+ Buffer_Chunk_Size: 1M
+ Buffer_Max_Size: 1M
+ - docker_daemon:
+ header: input
+ Name: systemd
+ Path: /var/log/journal
+ Tag: syslog.*
+ Systemd_Filter: _SYSTEMD_UNIT=docker.service
+ DB: /var/log/docker.db
+ Mem_Buf_Limit: 5MB
+ DB.Sync: Normal
+ Buffer_Chunk_Size: 1M
+ Buffer_Max_Size: 1M
- containers_tail:
header: input
Name: tail
@@ -124,10 +171,14 @@ data:
Path: /var/log/containers/*.log
Parser: docker
DB: /var/log/flb_kube.db
+ Mem_Buf_Limit: 5MB
DB.Sync: Normal
Buffer_Chunk_Size: 1M
Buffer_Max_Size: 1M
- Mem_Buf_Limit: 5MB
+ - drop_fluentd_logs:
+ header: output
+ Name: "null"
+ Match: "**.fluentd**"
- kube_filter:
header: filter
Name: kubernetes
@@ -137,8 +188,16 @@ data:
header: output
Name: forward
Match: "*"
- Host: ${FLUENTD_HOST}
- Port: ${FLUENTD_PORT}
+ Host: fluentd-logging.osh-infra
+ Port: 24224
+ parsers:
+ - docker:
+ header: parser
+ Name: docker
+ Format: json
+ Time_Key: time
+ Time_Format: "%Y-%m-%dT%H:%M:%S.%L"
+ Time_Keep: On
td_agent:
- metrics_agent:
header: source
@@ -150,22 +209,268 @@ data:
type: forward
port: "#{ENV['FLUENTD_PORT']}"
bind: 0.0.0.0
- - elasticsearch:
+ - drop_fluent_logs:
+ header: match
+ type: "null"
+ expression: "fluent.*"
+ - add_container_name:
+ header: filter
+ type: record_transformer
+ expression: "kube.**"
+ enable_ruby: true
+ record:
+ -
+ - header: record
+ container_name: ${record["kubernetes"]["container_name"]}
+ - remove_openstack_pod_logged_events:
+ header: filter
+ type: grep
+ expression: "kube.**"
+ exclude:
+ -
+ - header: exclude
+ key: container_name
+ pattern: ^(cinder-api|cinder-scheduler|cinder-volume|cinder-backup|glance-api|glance-registry|heat-api|heat-cfn|heat-engine|keystone-api|neutron-dhcp-agent|neutron-l3-agent|neutron-server|nova-osapi|nova-api|nova-compute|nova-conductor|nova-consoleauth|nova-novncproxy|nova-scheduler)$
+ # NOTE(srwilkers): Look for specific keywords in the log key to determine
+ # log level of event
+ - tag_kubernetes_log_level:
+ header: match
+ type: rewrite_tag_filter
+ expression: "kube.var.log.containers.**.log"
+ rule:
+ -
+ - header: rule
+ key: log
+ pattern: /info/i
+ tag: info.${tag}
+ - header: rule
+ key: log
+ pattern: /warn/i
+ tag: warn.${tag}
+ - header: rule
+ key: log
+ pattern: /error/i
+ tag: error.${tag}
+ - header: rule
+ key: log
+ pattern: /critical/i
+ tag: critical.${tag}
+ - header: rule
+ key: log
+ pattern: (.+)
+ tag: info.${tag}
+ # NOTE(srwilkers): Create new key for log level, and use the tag prefix
+ # added previously
+ - add_kubernetes_log_level_and_application_key:
+ header: filter
+ type: record_transformer
+ enable_ruby: true
+ expression: "**.kube.var.log.containers.**.log"
+ record:
+ -
+ - header: record
+ level: ${tag_parts[0]}
+ application: ${record["kubernetes"]["labels"]["application"]}
+ - add_openstack_application_key:
+ header: filter
+ type: record_transformer
+ expression: "openstack.**"
+ record:
+ -
+ - header: record
+ application: ${tag_parts[1]}
+ #NOTE(srwilkers): This prefixes the tag for oslo.log entries from the
+ # fluent handler/formatter with the log level, allowing for lookups on
+ # openstack logs with a particular log level (ie: error.openstack.keystone)
+ - tag_openstack_log_level:
+ header: match
+ type: rewrite_tag_filter
+ expression: "openstack.**"
+ rule:
+ -
+ - header: rule
+ key: level
+ pattern: INFO
+ tag: info.${tag}
+ - header: rule
+ key: level
+ pattern: WARN
+ tag: warn.${tag}
+ - header: rule
+ key: level
+ pattern: ERROR
+ tag: error.${tag}
+ - header: rule
+ key: level
+ pattern: CRITICAL
+ tag: critical.${tag}
+ - syslog_elasticsearch:
header: match
type: elasticsearch
user: "#{ENV['ELASTICSEARCH_USERNAME']}"
password: "#{ENV['ELASTICSEARCH_PASSWORD']}"
- expression: "**"
+ expression: "syslog.**"
include_tag_key: true
host: "#{ENV['ELASTICSEARCH_HOST']}"
port: "#{ENV['ELASTICSEARCH_PORT']}"
logstash_format: true
+ logstash_prefix: syslog
+ buffer_type: memory
buffer_chunk_limit: 10M
- buffer_queue_limit: 32
- flush_interval: 20s
+ buffer_queue_limit: 512
+ flush_interval: 10
max_retry_wait: 300
+ request_timeout: 60
disable_retry_limit: ""
num_threads: 8
+ type_name: syslog
+ - ceph_elasticsearch:
+ header: match
+ type: elasticsearch
+ user: "#{ENV['ELASTICSEARCH_USERNAME']}"
+ password: "#{ENV['ELASTICSEARCH_PASSWORD']}"
+ expression: "ceph-**.log"
+ include_tag_key: true
+ host: "#{ENV['ELASTICSEARCH_HOST']}"
+ port: "#{ENV['ELASTICSEARCH_PORT']}"
+ logstash_format: true
+ logstash_prefix: ceph
+ buffer_chunk_limit: 10M
+ buffer_queue_limit: 512
+ flush_interval: 10
+ max_retry_wait: 300
+ request_timeout: 60
+ disable_retry_limit: ""
+ num_threads: 8
+ type_name: ceph_logs
+ - oslo_fluentd_elasticsearch:
+ header: match
+ type: elasticsearch
+ user: "#{ENV['ELASTICSEARCH_USERNAME']}"
+ password: "#{ENV['ELASTICSEARCH_PASSWORD']}"
+ expression: "**.openstack.*"
+ include_tag_key: true
+ host: "#{ENV['ELASTICSEARCH_HOST']}"
+ port: "#{ENV['ELASTICSEARCH_PORT']}"
+ logstash_format: true
+ logstash_prefix: openstack
+ buffer_type: memory
+ buffer_chunk_limit: 10M
+ buffer_queue_limit: 512
+ flush_interval: 10
+ max_retry_wait: 300
+ request_timeout: 60
+ disable_retry_limit: ""
+ num_threads: 8
+ type_name: oslo_openstack_fluentd
+ - docker_fluentd_elasticsearch:
+ header: match
+ type: elasticsearch
+ user: "#{ENV['ELASTICSEARCH_USERNAME']}"
+ password: "#{ENV['ELASTICSEARCH_PASSWORD']}"
+ expression: "**.kube.**.log"
+ include_tag_key: true
+ host: "#{ENV['ELASTICSEARCH_HOST']}"
+ port: "#{ENV['ELASTICSEARCH_PORT']}"
+ logstash_format: true
+ buffer_type: memory
+ buffer_chunk_limit: 10M
+ buffer_queue_limit: 512
+ flush_interval: 10
+ max_retry_wait: 300
+ request_timeout: 60
+ disable_retry_limit: ""
+ num_threads: 8
+ type_name: docker_fluentd
+ fluentd_exporter:
+ log:
+ format: "logger:stdout?json=true"
+ level: "info"
+ templates:
+ syslog:
+ template: "syslog-*"
+ index_patterns: "syslog-*"
+ settings:
+ number_of_shards: 1
+ mappings:
+ syslog:
+ properties:
+ cluster:
+ type: keyword
+ app:
+ type: keyword
+ pid:
+ type: integer
+ host:
+ type: keyword
+ log:
+ type: text
+ ceph_logs:
+ template: "ceph-*"
+ index_patterns: "ceph-*"
+ settings:
+ number_of_shards: 1
+ mappings:
+ ceph_logs:
+ properties:
+ log:
+ type: text
+ oslo_openstack_fluentd:
+ template: "openstack-*"
+ index_patterns: "openstack-*"
+ settings:
+ number_of_shards: 1
+ mappings:
+ oslo_openstack_fluentd:
+ properties:
+ extra:
+ properties:
+ project:
+ type: text
+ norms: false
+ version:
+ type: text
+ norms: false
+ filename:
+ type: text
+ norms: false
+ funcname:
+ type: text
+ norms: false
+ message:
+ type: text
+ norms: false
+ process_name:
+ type: keyword
+ index: false
+ docker_fluentd:
+ template: "logstash-*"
+ index_patterns: "logstash-*"
+ settings:
+ number_of_shards: 1
+ mappings:
+ docker_fluentd:
+ properties:
+ kubernetes:
+ properties:
+ container_name:
+ type: keyword
+ index: false
+ docker_id:
+ type: keyword
+ index: false
+ host:
+ type: keyword
+ index: false
+ namespace_name:
+ type: keyword
+ index: false
+ pod_id:
+ type: keyword
+ index: false
+ pod_name:
+ type: keyword
+ index: false
dependencies:
- osh-infra-helm-toolkit
...
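The tag_kubernetes_log_level and tag_openstack_log_level matches above rely on rewrite_tag_filter semantics: the first rule whose pattern matches the record wins, and its tag template (info.${tag}, warn.${tag}, ...) re-emits the event under a level-prefixed tag, which the add_kubernetes_log_level_and_application_key filter then reads back out of tag_parts[0]. A rough Python rendering of that first-match behaviour (illustrative, not the plugin's code):

    import re

    # First matching rule wins, mirroring the rewrite_tag_filter rules above;
    # the trailing (.+) rule is the catch-all that tags everything else as info.
    RULES = [
        (re.compile(r"info", re.I), "info"),
        (re.compile(r"warn", re.I), "warn"),
        (re.compile(r"error", re.I), "error"),
        (re.compile(r"critical", re.I), "critical"),
        (re.compile(r".+"), "info"),
    ]

    def retag(tag: str, log_line: str) -> str:
        for pattern, level in RULES:
            if pattern.search(log_line):
                return f"{level}.{tag}"
        return tag

    new_tag = retag("kube.var.log.containers.keystone-api.log", "ERROR: connection refused")
    print(new_tag)                # error.kube.var.log.containers.keystone-api.log
    print(new_tag.split(".")[0])  # "error" -> becomes the record's level field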
diff --git a/global/software/charts/osh-infra/osh-infra-mariadb/mariadb.yaml b/global/software/charts/osh-infra/osh-infra-mariadb/mariadb.yaml
index 310cfec33..1d90c3ccd 100644
--- a/global/software/charts/osh-infra/osh-infra-mariadb/mariadb.yaml
+++ b/global/software/charts/osh-infra/osh-infra-mariadb/mariadb.yaml
@@ -31,6 +31,13 @@ metadata:
path: .osh_infra.oslo_db
dest:
path: .values.endpoints.olso_db
+ - src:
+ schema: pegleg/EndpointCatalogue/v1
+ name: osh_infra_endpoints
+ path: .osh_infra.prometheus_mysql_exporter
+ dest:
+ path: .values.endpoints.prometheus_mysql_exporter
+
# Accounts
- src:
schema: pegleg/AccountCatalogue/v1
@@ -38,6 +45,12 @@ metadata:
path: .osh_infra.oslo_db.admin
dest:
path: .values.endpoints.oslo_db.auth.admin
+ - src:
+ schema: pegleg/AccountCatalogue/v1
+ name: osh_infra_service_accounts
+ path: .osh_infra.prometheus_mysql_exporter.user
+ dest:
+ path: .values.endpoints.prometheus_mysql_exporter.auth.user
# Secrets
- dest:
@@ -46,7 +59,12 @@ metadata:
schema: deckhand/Passphrase/v1
name: osh_infra_oslo_db_admin_password
path: .
-
+ - dest:
+ path: .values.endpoints.oslo_db.auth.exporter.password
+ src:
+ schema: deckhand/Passphrase/v1
+ name: osh_infra_oslo_db_exporter_password
+ path: .
data:
chart_name: osh-infra-mariadb
release: osh-infra-mariadb
@@ -72,6 +90,9 @@ data:
prometheus_mysql_exporter:
node_selector_key: openstack-control-plane
node_selector_value: enabled
+ monitoring:
+ prometheus:
+ enabled: true
dependencies:
- osh-helm-toolkit
...
diff --git a/global/software/charts/osh-infra/osh-infra-monitoring/chart-group.yaml b/global/software/charts/osh-infra/osh-infra-monitoring/chart-group.yaml
index 4cb879cd4..be06ca8ad 100644
--- a/global/software/charts/osh-infra/osh-infra-monitoring/chart-group.yaml
+++ b/global/software/charts/osh-infra/osh-infra-monitoring/chart-group.yaml
@@ -13,5 +13,6 @@ data:
- prometheus
- prometheus-alertmanager
- prometheus-node-exporter
+ - prometheus-process-exporter
- prometheus-kube-state-metrics
- nagios
diff --git a/global/software/charts/osh-infra/osh-infra-monitoring/nagios.yaml b/global/software/charts/osh-infra/osh-infra-monitoring/nagios.yaml
index 35ff41b3e..1830a2b86 100644
--- a/global/software/charts/osh-infra/osh-infra-monitoring/nagios.yaml
+++ b/global/software/charts/osh-infra/osh-infra-monitoring/nagios.yaml
@@ -37,6 +37,12 @@ metadata:
path: .osh_infra.monitoring
dest:
path: .values.endpoints.monitoring
+ - src:
+ schema: pegleg/EndpointCatalogue/v1
+ name: osh_infra_endpoints
+ path: .osh_infra.elasticsearch
+ dest:
+ path: .values.endpoints.elasticsearch
- src:
schema: pegleg/EndpointCatalogue/v1
name: osh_infra_endpoints
@@ -51,6 +57,18 @@ metadata:
path: .osh_infra.nagios.admin
dest:
path: .values.endpoints.nagios.auth.admin
+ - src:
+ schema: pegleg/AccountCatalogue/v1
+ name: osh_infra_service_accounts
+ path: .osh_infra.prometheus.admin
+ dest:
+ path: .values.endpoints.prometheus.auth.admin
+ - src:
+ schema: pegleg/AccountCatalogue/v1
+ name: osh_infra_service_accounts
+ path: .osh_infra.elasticsearch.admin
+ dest:
+ path: .values.endpoints.elasticsearch.auth.admin
# Secrets
- dest:
@@ -59,6 +77,18 @@ metadata:
schema: deckhand/Passphrase/v1
name: osh_infra_nagios_admin_password
path: .
+ - dest:
+ path: .values.endpoints.elasticsearch.auth.admin.password
+ src:
+ schema: deckhand/Passphrase/v1
+ name: osh_infra_elasticsearch_admin_password
+ path: .
+ - dest:
+ path: .values.endpoints.prometheus.auth.admin.password
+ src:
+ schema: deckhand/Passphrase/v1
+ name: osh_infra_prometheus_admin_password
+ path: .
# LDAP Details
- src:
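Nagios now also receives the prometheus and elasticsearch endpoints and their admin passwords, which its service checks need in order to run authenticated queries against both backends. A sketch of that kind of call; the host name and query below are placeholders rather than values from this manifest:

    # Placeholder host and query: shows only the basic-auth pattern the Nagios
    # checks rely on once the endpoint and admin password are substituted in.
    import requests

    PROM = "http://prom-metrics.osh-infra.svc.cluster.local"  # placeholder
    resp = requests.get(
        f"{PROM}/api/v1/query",
        params={"query": "up"},
        auth=("admin", "REPLACE_WITH_osh_infra_prometheus_admin_password"),
        timeout=10,
    )
    print(resp.json()["status"])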
diff --git a/global/software/charts/osh-infra/osh-infra-monitoring/prometheus-process-exporter.yaml b/global/software/charts/osh-infra/osh-infra-monitoring/prometheus-process-exporter.yaml
new file mode 100644
index 000000000..d64e8564e
--- /dev/null
+++ b/global/software/charts/osh-infra/osh-infra-monitoring/prometheus-process-exporter.yaml
@@ -0,0 +1,65 @@
+---
+schema: armada/Chart/v1
+metadata:
+ schema: metadata/Document/v1
+ name: prometheus-process-exporter
+ layeringDefinition:
+ abstract: false
+ layer: global
+ storagePolicy: cleartext
+ substitutions:
+ # Chart source
+ - src:
+ schema: pegleg/SoftwareVersions/v1
+ name: software-versions
+ path: .charts.osh_infra.prometheus_process_exporter
+ dest:
+ path: .source
+
+ # Images
+ - src:
+ schema: pegleg/SoftwareVersions/v1
+ name: software-versions
+ path: .images.osh_infra.prometheus_process_exporter
+ dest:
+ path: .values.images.tags
+
+ # Endpoints
+ - src:
+ schema: pegleg/EndpointCatalogue/v1
+ name: osh_infra_endpoints
+ path: .osh_infra.process_exporter_metrics
+ dest:
+ path: .values.endpoints.process_exporter_metrics
+
+data:
+ chart_name: prometheus-process-exporter
+ release: prometheus-process-exporter
+ namespace: kube-system
+ wait:
+ timeout: 900
+ labels:
+ release_group: airship-prometheus-process-exporter
+ install:
+ no_hooks: false
+ upgrade:
+ no_hooks: false
+ pre:
+ delete:
+ - type: job
+ labels:
+ release_group: airship-prometheus-process-exporter
+ create: []
+ post:
+ create: []
+ values:
+ labels:
+ node_exporter:
+ node_selector_key: node-exporter
+ node_selector_value: enabled
+ job:
+ node_selector_key: openstack-control-plane
+ node_selector_value: enabled
+ dependencies:
+ - osh-infra-helm-toolkit
+...
diff --git a/global/software/charts/osh-infra/osh-infra-monitoring/prometheus.yaml b/global/software/charts/osh-infra/osh-infra-monitoring/prometheus.yaml
index 9674e8897..bfde817a9 100644
--- a/global/software/charts/osh-infra/osh-infra-monitoring/prometheus.yaml
+++ b/global/software/charts/osh-infra/osh-infra-monitoring/prometheus.yaml
@@ -37,6 +37,42 @@ metadata:
path: .osh_infra.alerts
dest:
path: .values.endpoints.alerts
+ - src:
+ schema: pegleg/EndpointCatalogue/v1
+ name: osh_infra_endpoints
+ path: .osh_infra.ldap
+ dest:
+ path: .values.endpoints.ldap
+
+ # Accounts
+ - src:
+ schema: pegleg/AccountCatalogue/v1
+ name: osh_infra_service_accounts
+ path: .osh_infra.prometheus.admin
+ dest:
+ path: .values.endpoints.prometheus.auth.admin
+
+ # Secrets
+ - dest:
+ path: .values.endpoints.prometheus.auth.admin.password
+ src:
+ schema: deckhand/Passphrase/v1
+ name: osh_infra_prometheus_admin_password
+ path: .
+
+ # LDAP Details
+ - src:
+ schema: pegleg/AccountCatalogue/v1
+ name: osh_infra_service_accounts
+ path: .osh_infra.ldap.admin
+ dest:
+ path: .values.endpoints.ldap.auth.admin
+ - dest:
+ path: .values.endpoints.ldap.auth.admin.password
+ src:
+ schema: deckhand/Passphrase/v1
+ name: osh_keystone_ldap_password
+ path: .
data:
chart_name: prometheus
@@ -72,9 +108,1545 @@ data:
pod:
replicas:
prometheus: 3
+ resources:
+ enabled: true
+ prometheus:
+ limits:
+ memory: "64Gi"
+ cpu: "4000m"
+ requests:
+ memory: "16Gi"
+ cpu: "2000m"
storage:
requests:
storage: 500Gi
+ conf:
+ prometheus:
+ command_line_flags:
+ storage.tsdb.max_block_duration: 17h
+ scrape_configs:
+ global:
+ scrape_interval: 60s
+ evaluation_interval: 60s
+ scrape_configs:
+ # NOTE(srwilkers): The job definition for Prometheus should always be
+ # listed first, so we can inject the basic auth username and password
+ # via the endpoints section
+ - job_name: 'prometheus-metrics'
+ kubernetes_sd_configs:
+ - role: endpoints
+ scrape_interval: 60s
+ relabel_configs:
+ - source_labels:
+ - __meta_kubernetes_service_name
+ action: keep
+ regex: "prom-metrics"
+ - source_labels:
+ - __meta_kubernetes_service_annotation_prometheus_io_scrape
+ action: keep
+ regex: true
+ - source_labels:
+ - __meta_kubernetes_service_annotation_prometheus_io_scheme
+ action: replace
+ target_label: __scheme__
+ regex: (https?)
+ - source_labels:
+ - __meta_kubernetes_service_annotation_prometheus_io_path
+ action: replace
+ target_label: __metrics_path__
+ regex: (.+)
+ - source_labels:
+ - __address__
+ - __meta_kubernetes_service_annotation_prometheus_io_port
+ action: replace
+ target_label: __address__
+ regex: ([^:]+)(?::\d+)?;(\d+)
+ replacement: $1:$2
+ - action: labelmap
+ regex: __meta_kubernetes_service_label_(.+)
+ - source_labels:
+ - __meta_kubernetes_namespace
+ action: replace
+ target_label: kubernetes_namespace
+ - source_labels:
+ - __meta_kubernetes_service_name
+ action: replace
+ target_label: instance
+ - source_labels:
+ - __meta_kubernetes_service_name
+ action: replace
+ target_label: kubernetes_name
+ - source_labels:
+ - __meta_kubernetes_service_name
+ target_label: job
+ replacement: ${1}
+ - job_name: kubelet
+ scheme: https
+ # This TLS & bearer token file config is used to connect to the actual scrape
+ # endpoints for cluster components. This is separate to discovery auth
+ # configuration because discovery & scraping are two separate concerns in
+ # Prometheus. The discovery auth config is automatic if Prometheus runs inside
+ # the cluster. Otherwise, more config options have to be provided within the
+      # <kubernetes_sd_config>.
+ tls_config:
+ ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+ bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+ kubernetes_sd_configs:
+ - role: node
+ scrape_interval: 45s
+ relabel_configs:
+ - action: labelmap
+ regex: __meta_kubernetes_node_label_(.+)
+ - target_label: __address__
+ replacement: kubernetes.default.svc:443
+ - source_labels:
+ - __meta_kubernetes_node_name
+ regex: (.+)
+ target_label: __metrics_path__
+ replacement: /api/v1/nodes/${1}/proxy/metrics
+ - source_labels:
+ - __meta_kubernetes_node_name
+ action: replace
+ target_label: kubernetes_io_hostname
+ # Scrape config for Kubelet cAdvisor.
+ #
+ # This is required for Kubernetes 1.7.3 and later, where cAdvisor metrics
+ # (those whose names begin with 'container_') have been removed from the
+ # Kubelet metrics endpoint. This job scrapes the cAdvisor endpoint to
+ # retrieve those metrics.
+ #
+ # In Kubernetes 1.7.0-1.7.2, these metrics are only exposed on the cAdvisor
+ # HTTP endpoint; use "replacement: /api/v1/nodes/${1}:4194/proxy/metrics"
+ # in that case (and ensure cAdvisor's HTTP server hasn't been disabled with
+ # the --cadvisor-port=0 Kubelet flag).
+ #
+ # This job is not necessary and should be removed in Kubernetes 1.6 and
+ # earlier versions, or it will cause the metrics to be scraped twice.
+ - job_name: 'kubernetes-cadvisor'
+
+ # Default to scraping over https. If required, just disable this or change to
+ # `http`.
+ scheme: https
+
+ # This TLS & bearer token file config is used to connect to the actual scrape
+ # endpoints for cluster components. This is separate to discovery auth
+ # configuration because discovery & scraping are two separate concerns in
+ # Prometheus. The discovery auth config is automatic if Prometheus runs inside
+ # the cluster. Otherwise, more config options have to be provided within the
+      # <kubernetes_sd_config>.
+ tls_config:
+ ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+ bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+ kubernetes_sd_configs:
+ - role: node
+
+ relabel_configs:
+ - action: labelmap
+ regex: __meta_kubernetes_node_label_(.+)
+ - target_label: __address__
+ replacement: kubernetes.default.svc:443
+ - source_labels:
+ - __meta_kubernetes_node_name
+ regex: (.+)
+ target_label: __metrics_path__
+ replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
+ metric_relabel_configs:
+ - source_labels:
+ - __name__
+ regex: 'container_network_tcp_usage_total'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_tasks_state'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_network_udp_usage_total'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_memory_failures_total'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_cpu_load_average_10s'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_cpu_system_seconds_total'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_cpu_user_seconds_total'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_fs_inodes_free'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_fs_inodes_total'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_fs_io_current'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_fs_io_time_seconds_total'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_fs_io_time_weighted_seconds_total'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_fs_read_seconds_total'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_fs_reads_merged_total'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_fs_reads_merged_total'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_fs_reads_total'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_fs_sector_reads_total'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_fs_sector_writes_total'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_fs_write_seconds_total'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_fs_writes_bytes_total'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_fs_writes_merged_total'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_fs_writes_total'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_last_seen'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_memory_cache'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_memory_failcnt'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_memory_max_usage_bytes'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_memory_rss'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_memory_swap'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_memory_usage_bytes'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_network_receive_errors_total'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_network_receive_packets_dropped_total'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_network_receive_packets_total'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_network_transmit_errors_total'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_network_transmit_packets_dropped_total'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_network_transmit_packets_total'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_spec_cpu_period'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_spec_cpu_shares'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_spec_memory_limit_bytes'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_spec_memory_reservation_limit_bytes'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_spec_memory_swap_limit_bytes'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'container_start_time_seconds'
+ action: drop
+ # Scrape config for API servers.
+ #
+ # Kubernetes exposes API servers as endpoints to the default/kubernetes
+ # service so this uses `endpoints` role and uses relabelling to only keep
+ # the endpoints associated with the default/kubernetes service using the
+ # default named port `https`. This works for single API server deployments as
+ # well as HA API server deployments.
+ - job_name: 'apiserver'
+ kubernetes_sd_configs:
+ - role: endpoints
+ scrape_interval: 45s
+ # Default to scraping over https. If required, just disable this or change to
+ # `http`.
+ scheme: https
+ # This TLS & bearer token file config is used to connect to the actual scrape
+ # endpoints for cluster components. This is separate to discovery auth
+ # configuration because discovery & scraping are two separate concerns in
+ # Prometheus. The discovery auth config is automatic if Prometheus runs inside
+ # the cluster. Otherwise, more config options have to be provided within the
+      # <kubernetes_sd_config>.
+ tls_config:
+ ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+ # If your node certificates are self-signed or use a different CA to the
+ # master CA, then disable certificate verification below. Note that
+ # certificate verification is an integral part of a secure infrastructure
+ # so this should only be disabled in a controlled environment. You can
+ # disable certificate verification by uncommenting the line below.
+ #
+ # insecure_skip_verify: true
+ bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+ # Keep only the default/kubernetes service endpoints for the https port. This
+ # will add targets for each API server which Kubernetes adds an endpoint to
+ # the default/kubernetes service.
+ relabel_configs:
+ - source_labels:
+ - __meta_kubernetes_namespace
+ - __meta_kubernetes_service_name
+ - __meta_kubernetes_endpoint_port_name
+ action: keep
+ regex: default;kubernetes;https
+ metric_relabel_configs:
+ - source_labels:
+ - __name__
+ regex: 'apiserver_admission_controller_admission_latencies_seconds_bucket'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'rest_client_request_latency_seconds_bucket'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'apiserver_response_sizes_bucket'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'apiserver_admission_step_admission_latencies_seconds_bucket'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'apiserver_admission_controller_admission_latencies_seconds_count'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'apiserver_admission_controller_admission_latencies_seconds_sum'
+ action: drop
+ - source_labels:
+ - __name__
+ regex: 'apiserver_request_latencies_summary'
+ action: drop
+ # Scrape config for service endpoints.
+ #
+ # The relabeling allows the actual service scrape endpoint to be configured
+ # via the following annotations:
+ #
+ # * `prometheus.io/scrape`: Only scrape services that have a value of `true`
+ # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
+ # to set this to `https` & most likely set the `tls_config` of the scrape config.
+ # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
+ # * `prometheus.io/port`: If the metrics are exposed on a different port to the
+ # service then set this appropriately.
+ - job_name: 'openstack-exporter'
+ kubernetes_sd_configs:
+ - role: endpoints
+ scrape_interval: 60s
+ relabel_configs:
+ - source_labels:
+ - __meta_kubernetes_service_name
+ action: keep
+ regex: "openstack-metrics"
+ - source_labels:
+ - __meta_kubernetes_service_annotation_prometheus_io_scrape
+ action: keep
+ regex: true
+ - source_labels:
+ - __meta_kubernetes_service_annotation_prometheus_io_scheme
+ action: replace
+ target_label: __scheme__
+ regex: (https?)
+ - source_labels:
+ - __meta_kubernetes_service_annotation_prometheus_io_path
+ action: replace
+ target_label: __metrics_path__
+ regex: (.+)
+ - source_labels:
+ - __address__
+ - __meta_kubernetes_service_annotation_prometheus_io_port
+ action: replace
+ target_label: __address__
+ regex: ([^:]+)(?::\d+)?;(\d+)
+ replacement: $1:$2
+ - action: labelmap
+ regex: __meta_kubernetes_service_label_(.+)
+ - source_labels:
+ - __meta_kubernetes_namespace
+ action: replace
+ target_label: kubernetes_namespace
+ - source_labels:
+ - __meta_kubernetes_service_name
+ action: replace
+ target_label: instance
+ - source_labels:
+ - __meta_kubernetes_service_name
+ action: replace
+ target_label: kubernetes_name
+ - source_labels:
+ - __meta_kubernetes_service_name
+ target_label: job
+ replacement: ${1}
+ - job_name: 'kubernetes-service-endpoints'
+ kubernetes_sd_configs:
+ - role: endpoints
+ scrape_interval: 60s
+ relabel_configs:
+ - source_labels:
+ - __meta_kubernetes_service_name
+ action: drop
+ regex: '(openstack-metrics|prom-metrics)'
+ - source_labels:
+ - __meta_kubernetes_service_annotation_prometheus_io_scrape
+ action: keep
+ regex: true
+ - source_labels:
+ - __meta_kubernetes_service_annotation_prometheus_io_scheme
+ action: replace
+ target_label: __scheme__
+ regex: (https?)
+ - source_labels:
+ - __meta_kubernetes_service_annotation_prometheus_io_path
+ action: replace
+ target_label: __metrics_path__
+ regex: (.+)
+ - source_labels:
+ - __address__
+ - __meta_kubernetes_service_annotation_prometheus_io_port
+ action: replace
+ target_label: __address__
+ regex: ([^:]+)(?::\d+)?;(\d+)
+ replacement: $1:$2
+ - action: labelmap
+ regex: __meta_kubernetes_service_label_(.+)
+ - source_labels:
+ - __meta_kubernetes_namespace
+ action: replace
+ target_label: kubernetes_namespace
+ - source_labels:
+ - __meta_kubernetes_service_name
+ action: replace
+ target_label: kubernetes_name
+ - source_labels:
+ - __meta_kubernetes_service_name
+ target_label: job
+ replacement: ${1}
+ # Example scrape config for pods
+ #
+ # The relabeling allows the actual pod scrape endpoint to be configured via the
+ # following annotations:
+ #
+ # * `prometheus.io/scrape`: Only scrape pods that have a value of `true`
+ # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
+ # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the
+ # pod's declared ports (default is a port-free target if none are declared).
+ - job_name: 'kubernetes-pods'
+ kubernetes_sd_configs:
+ - role: pod
+ relabel_configs:
+ - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
+ action: keep
+ regex: true
+ - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
+ action: replace
+ target_label: __metrics_path__
+ regex: (.+)
+ - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
+ action: replace
+ regex: ([^:]+)(?::\d+)?;(\d+)
+ replacement: $1:$2
+ target_label: __address__
+ - action: labelmap
+ regex: __meta_kubernetes_pod_label_(.+)
+ - source_labels: [__meta_kubernetes_namespace]
+ action: replace
+ target_label: kubernetes_namespace
+ - source_labels: [__meta_kubernetes_pod_name]
+ action: replace
+ target_label: kubernetes_pod_name
+ - job_name: calico-etcd
+ kubernetes_sd_configs:
+ - role: service
+ scrape_interval: 20s
+ relabel_configs:
+ - action: labelmap
+ regex: __meta_kubernetes_service_label_(.+)
+ - action: keep
+ source_labels:
+ - __meta_kubernetes_service_name
+ regex: "calico-etcd"
+ - action: keep
+ source_labels:
+ - __meta_kubernetes_namespace
+ regex: kube-system
+ target_label: namespace
+ - source_labels:
+ - __meta_kubernetes_pod_name
+ target_label: pod
+ - source_labels:
+ - __meta_kubernetes_service_name
+ target_label: service
+ - source_labels:
+ - __meta_kubernetes_service_name
+ target_label: job
+ replacement: ${1}
+ - source_labels:
+ - __meta_kubernetes_service_label
+ target_label: job
+ regex: calico-etcd
+ replacement: ${1}
+ - target_label: endpoint
+ replacement: "calico-etcd"
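The address rewrite that recurs in the jobs above joins __address__ with the prometheus.io/port annotation (source_labels are concatenated with ';') and rewrites the pair with the regex ([^:]+)(?::\d+)?;(\d+) and replacement $1:$2, i.e. strip any existing port and substitute the annotated one. The same substitution in Python, for reference (Prometheus anchors the regex, hence fullmatch):

    import re

    # __address__ and prometheus.io/port joined with ";" as Prometheus does,
    # then rewritten with the same regex/replacement as the relabel_configs above.
    joined = ";".join(["10.23.41.7:8080", "9103"])
    m = re.fullmatch(r"([^:]+)(?::\d+)?;(\d+)", joined)
    print(m.expand(r"\1:\2") if m else joined)  # 10.23.41.7:9103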
+ alerting:
+ alertmanagers:
+ - kubernetes_sd_configs:
+ - role: pod
+ tls_config:
+ ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+ bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+ relabel_configs:
+ - source_labels: [__meta_kubernetes_pod_label_application]
+ regex: alertmanager
+ action: keep
+ - source_labels: [__meta_kubernetes_pod_container_port_name]
+ regex: alerts-api
+ action: keep
+ - source_labels: [__meta_kubernetes_pod_container_port_name]
+ regex: peer-mesh
+ action: drop
+ rules:
+ alertmanager:
+ groups:
+ - name: alertmanager.rules
+ rules:
+ - alert: AlertmanagerConfigInconsistent
+ expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ description: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.
+ summary: Alertmanager configurations are inconsistent
+ - alert: AlertmanagerDownOrMissing
+ expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ description: An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery.
+ summary: Alertmanager down or not discovered
+ - alert: FailedReload
+ expr: alertmanager_config_last_reload_successful == 0
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.
+ summary: Alertmanager configuration reload has failed
+ etcd3:
+ groups:
+ - name: etcd3.rules
+ rules:
+ - alert: etcd_InsufficientMembers
+ expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
+ for: 3m
+ labels:
+ severity: critical
+ annotations:
+              description: If one more etcd member goes down, the cluster will be unavailable
+ summary: etcd cluster insufficient members
+ - alert: etcd_NoLeader
+ expr: etcd_server_has_leader{job="etcd"} == 0
+ for: 1m
+ labels:
+ severity: critical
+ annotations:
+ description: etcd member {{ $labels.instance }} has no leader
+ summary: etcd member has no leader
+ - alert: etcd_HighNumberOfLeaderChanges
+ expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
+ labels:
+ severity: warning
+ annotations:
+ description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour
+ summary: a high number of leader changes within the etcd cluster are happening
+ - alert: etcd_HighNumberOfFailedGRPCRequests
+ expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
+ summary: a high number of gRPC requests are failing
+ - alert: etcd_HighNumberOfFailedGRPCRequests
+ expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
+ summary: a high number of gRPC requests are failing
+ - alert: etcd_GRPCRequestsSlow
+ expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
+ for: 10m
+ labels:
+ severity: critical
+ annotations:
+ description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow
+ summary: slow gRPC requests
+ - alert: etcd_HighNumberOfFailedHTTPRequests
+ expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.01
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
+ summary: a high number of HTTP requests are failing
+ - alert: etcd_HighNumberOfFailedHTTPRequests
+ expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.05
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
+ summary: a high number of HTTP requests are failing
+ - alert: etcd_HTTPRequestsSlow
+ expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow
+ summary: slow HTTP requests
+ - alert: etcd_EtcdMemberCommunicationSlow
+ expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow
+ summary: etcd member communication is slow
+ - alert: etcd_HighNumberOfFailedProposals
+ expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
+ labels:
+ severity: warning
+ annotations:
+ description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour
+ summary: a high number of proposals within the etcd cluster are failing
+ - alert: etcd_HighFsyncDurations
+ expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+              description: etcd instance {{ $labels.instance }} fsync durations are high
+ summary: high fsync durations
+ - alert: etcd_HighCommitDurations
+ expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: etcd instance {{ $labels.instance }} commit durations are high
+ summary: high commit durations
+ kube_apiserver:
+ groups:
+ - name: kube-apiserver.rules
+ rules:
+ - alert: K8SApiserverDown
+ expr: absent(up{job="apiserver"} == 1)
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ description: Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.
+ summary: API server unreachable
+ - alert: K8SApiServerLatency
+ expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) WITHOUT (instance, resource)) / 1e+06 > 1
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: 99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.
+ summary: Kubernetes apiserver latency is high
+ kube_controller_manager:
+ groups:
+ - name: kube-controller-manager.rules
+ rules:
+ - alert: K8SControllerManagerDown
+ expr: absent(up{job="kube-controller-manager-discovery"} == 1)
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ description: There is no running K8S controller manager. Deployments and replication controllers are not making progress.
+ runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
+ summary: Controller manager is down
+ kubelet:
+ groups:
+ - name: kubelet.rules
+ rules:
+ - alert: K8SNodeNotReady
+ expr: kube_node_status_ready{condition="true"} == 0
+ for: 1h
+ labels:
+ severity: warning
+ annotations:
+ description: The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour
+ summary: Node status is NotReady
+ - alert: K8SManyNodesNotReady
+ expr: count(kube_node_status_ready{condition="true"} == 0) > 1 and (count(kube_node_status_ready{condition="true"} == 0) / count(kube_node_status_ready{condition="true"})) > 0.2
+ for: 1m
+ labels:
+ severity: critical
+ annotations:
+ description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).'
+ summary: Many Kubernetes nodes are Not Ready
+ - alert: K8SKubeletDown
+ expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
+ for: 1h
+ labels:
+ severity: warning
+ annotations:
+ description: Prometheus failed to scrape {{ $value }}% of kubelets.
+ summary: Many Kubelets cannot be scraped
+ - alert: K8SKubeletDown
+ expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1
+ for: 1h
+ labels:
+ severity: critical
+ annotations:
+ description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.
+ summary: Many Kubelets cannot be scraped
+ - alert: K8SKubeletTooManyPods
+ expr: kubelet_running_pod_count > 100
+ labels:
+ severity: warning
+ annotations:
+ description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110
+ summary: Kubelet is close to pod limit
+ kubernetes:
+ groups:
+ - name: kubernetes.rules
+ rules:
+ - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes
+ expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
+ - record: cluster_namespace_controller_pod_container:spec_cpu_shares
+ expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
+ - record: cluster_namespace_controller_pod_container:cpu_usage:rate
+ expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
+ - record: cluster_namespace_controller_pod_container:memory_usage:bytes
+ expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
+ - record: cluster_namespace_controller_pod_container:memory_working_set:bytes
+ expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
+ - record: cluster_namespace_controller_pod_container:memory_rss:bytes
+ expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
+ - record: cluster_namespace_controller_pod_container:memory_cache:bytes
+ expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
+ - record: cluster_namespace_controller_pod_container:disk_usage:bytes
+ expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
+ - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate
+ expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name, scope, type)
+ - record: cluster_namespace_controller_pod_container:memory_oom:rate
+ expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name, scope, type)
+ - record: cluster:memory_allocation:percent
+ expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster)
+ - record: cluster:memory_used:percent
+ expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster)
+ - record: cluster:cpu_allocation:percent
+ expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"} * ON(cluster, instance) machine_cpu_cores) BY (cluster)
+ - record: cluster:node_cpu_use:percent
+ expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores) BY (cluster)
+ - record: cluster_resource_verb:apiserver_latency:quantile_seconds
+ expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
+ labels:
+ quantile: "0.99"
+ - record: cluster_resource_verb:apiserver_latency:quantile_seconds
+ expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
+ labels:
+ quantile: "0.9"
+ - record: cluster_resource_verb:apiserver_latency:quantile_seconds
+ expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
+ labels:
+ quantile: "0.5"
+ - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
+ expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
+ labels:
+ quantile: "0.99"
+ - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
+ expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
+ labels:
+ quantile: "0.9"
+ - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
+ expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
+ labels:
+ quantile: "0.5"
+ - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
+ expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
+ labels:
+ quantile: "0.99"
+ - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
+ expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
+ labels:
+ quantile: "0.9"
+ - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
+ expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
+ labels:
+ quantile: "0.5"
+ - record: cluster:scheduler_binding_latency:quantile_seconds
+ expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
+ labels:
+ quantile: "0.99"
+ - record: cluster:scheduler_binding_latency:quantile_seconds
+ expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
+ labels:
+ quantile: "0.9"
+ - record: cluster:scheduler_binding_latency:quantile_seconds
+ expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
+ labels:
+ quantile: "0.5"
+ - alert: kube_statefulset_replicas_unavailable
+ expr: kube_statefulset_status_replicas < kube_statefulset_replicas
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: 'statefulset {{$labels.statefulset}} has {{$value}} replicas, which is less than desired'
+              summary: '{{$labels.statefulset}}: has insufficient replicas.'
+ - alert: kube_daemonsets_misscheduled
+ expr: kube_daemonset_status_number_misscheduled > 0
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: 'Daemonset {{$labels.daemonset}} is running where it is not supposed to run'
+ summary: 'Daemonsets not scheduled correctly'
+ - alert: kube_daemonsets_not_scheduled
+ expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: '{{ $value }} of Daemonset {{$labels.daemonset}} scheduled which is less than desired number'
+ summary: 'Less than desired number of daemonsets scheduled'
+ - alert: kube_deployment_replicas_unavailable
+ expr: kube_deployment_status_replicas_unavailable > 0
+ for: 10m
+ labels:
+ severity: page
+ annotations:
+ description: 'deployment {{$labels.deployment}} has {{$value}} replicas unavailable'
+              summary: '{{$labels.deployment}}: has insufficient replicas.'
+ - alert: kube_rollingupdate_deployment_replica_less_than_spec_max_unavailable
+ expr: kube_deployment_status_replicas_available - kube_deployment_spec_strategy_rollingupdate_max_unavailable < 0
+ for: 10m
+ labels:
+ severity: page
+ annotations:
+ description: 'deployment {{$labels.deployment}} has {{$value}} replicas available, which is less than the max unavailable specified for a rolling update'
+ summary: '{{$labels.deployment}}: has insufficient replicas during a rolling update.'
+ - alert: kube_job_status_failed
+ expr: kube_job_status_failed > 0
+ for: 10m
+ labels:
+ severity: page
+ annotations:
+ description: 'Job {{$labels.exported_job}} is in failed status'
+ summary: '{{$labels.exported_job}} has failed status'
+ - alert: kube_pod_status_pending
+ expr: kube_pod_status_phase{phase="Pending"} == 1
+ for: 10m
+ labels:
+ severity: page
+ annotations:
+ description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has been in pending status for more than 10 minutes'
+ summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in pending status'
+ - alert: kube_pod_error_image_pull
+ expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
+ for: 10m
+ labels:
+ severity: page
+ annotations:
+ description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
+ summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
+ - alert: kube_pod_status_error_image_pull
+ expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
+ for: 10m
+ labels:
+ severity: page
+ annotations:
+ description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes'
+ summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
+ - alert: kube_replicaset_missing_replicas
+ expr: kube_replicaset_spec_replicas - kube_replicaset_status_ready_replicas > 0
+ for: 10m
+ labels:
+ severity: page
+ annotations:
+ description: 'Replicaset {{$labels.replicaset}} is missing desired number of replicas for more than 10 minutes'
+ summary: 'Replicaset {{$labels.replicaset}} is missing replicas'
+ - alert: kube_pod_container_terminated
+ expr: kube_pod_container_status_terminated_reason{reason=~"OOMKilled|Error|ContainerCannotRun"} > 0
+ for: 10m
+ labels:
+ severity: page
+ annotations:
+ description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a container terminated for more than 10 minutes'
+ summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status'
+ - alert: volume_claim_capacity_high_utilization
+ expr: (1 - kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.80
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: 'volume claim {{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity'
+ summary: '{{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity.'
+ basic_linux:
+ groups:
+ - name: basic_linux.rules
+ rules:
+ - alert: node_filesystem_full_80percent
+ expr: sort(node_filesystem_free{device!="ramfs"} < node_filesystem_size{device!="ramfs"}
+ * 0.2) / 1024 ^ 3
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: '{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}}
+ has less than 20% space left on its filesystem.'
+ summary: '{{$labels.alias}}: Filesystem is running out of space soon.'
+ - alert: node_filesystem_full_in_4h
+ expr: predict_linear(node_filesystem_free{device!="ramfs"}[1h], 4 * 3600) <= 0
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: '{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}}
+ is running out of space in approx. 4 hours'
+ summary: '{{$labels.alias}}: Filesystem is running out of space in 4 hours.'
+ - alert: node_filedescriptors_full_in_3h
+ expr: predict_linear(node_filefd_allocated[1h], 3 * 3600) >= node_filefd_maximum
+ for: 20m
+ labels:
+ severity: page
+ annotations:
+ description: '{{$labels.alias}} is running out of available file descriptors
+ in approx. 3 hours'
+ summary: '{{$labels.alias}} is running out of available file descriptors in
+ 3 hours.'
+ - alert: node_load1_90percent
+ expr: node_load1 / ON(alias) count(node_cpu{mode="system"}) BY (alias) >= 0.9
+ for: 1h
+ labels:
+ severity: page
+ annotations:
+ description: '{{$labels.alias}} is running with > 90% total load for at least
+ 1h.'
+ summary: '{{$labels.alias}}: Running on high load.'
+ - alert: node_cpu_util_90percent
+ expr: 100 - (avg(irate(node_cpu{mode="idle"}[5m])) BY (alias) * 100) >= 90
+ for: 1h
+ labels:
+ severity: page
+ annotations:
+ description: '{{$labels.alias}} has total CPU utilization over 90% for at least
+ 1h.'
+ summary: '{{$labels.alias}}: High CPU utilization.'
+ - alert: node_ram_using_90percent
+ expr: node_memory_MemFree + node_memory_Buffers + node_memory_Cached < node_memory_MemTotal
+ * 0.1
+ for: 30m
+ labels:
+ severity: page
+ annotations:
+ description: '{{$labels.alias}} is using at least 90% of its RAM for at least
+ 30 minutes now.'
+ summary: '{{$labels.alias}}: Using lots of RAM.'
+ - alert: node_swap_using_80percent
+ expr: node_memory_SwapTotal - (node_memory_SwapFree + node_memory_SwapCached)
+ > node_memory_SwapTotal * 0.8
+ for: 10m
+ labels:
+ severity: page
+ annotations:
+ description: '{{$labels.alias}} is using 80% of its swap space for at least
+ 10 minutes now.'
+ summary: '{{$labels.alias}}: Running out of swap soon.'
+ - alert: node_high_cpu_load
+ expr: node_load15 / on(alias) count(node_cpu{mode="system"}) by (alias) >= 1
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ description: '{{$labels.alias}} is running with load15 > 1 for at least 1 minute: {{$value}}'
+ summary: '{{$labels.alias}}: Running on high load: {{$value}}'
+ - alert: node_high_memory_load
+ expr: (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers
+ + node_memory_Cached)) / sum(node_memory_MemTotal) * 100 > 85
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ description: Host memory usage is {{ humanize $value }}%. Reported by
+ instance {{ $labels.instance }} of job {{ $labels.job }}.
+ summary: Server memory is almost full
+ - alert: node_high_storage_load
+ expr: (node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})
+ / node_filesystem_size{mountpoint="/"} * 100 > 85
+ for: 30s
+ labels:
+ severity: warning
+ annotations:
+ description: Host storage usage is {{ humanize $value }}%. Reported by
+ instance {{ $labels.instance }} of job {{ $labels.job }}.
+ summary: Server storage is almost full
+ - alert: node_high_swap
+ expr: (node_memory_SwapTotal - node_memory_SwapFree) > (node_memory_SwapTotal
+ * 0.4)
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ description: Host system has a high swap usage of {{ humanize $value }}. Reported
+ by instance {{ $labels.instance }} of job {{ $labels.job }}.
+ summary: Server has a high swap usage
+ - alert: node_high_network_drop_rcv
+ expr: node_network_receive_drop{device!="lo"} > 3000
+ for: 30s
+ labels:
+ severity: warning
+ annotations:
+ description: Host system has an unusually high drop in network reception ({{
+ humanize $value }}). Reported by instance {{ $labels.instance }} of job {{
+ $labels.job }}
+ summary: Server has a high receive drop
+ - alert: node_high_network_drop_send
+ expr: node_network_transmit_drop{device!="lo"} > 3000
+ for: 30s
+ labels:
+ severity: warning
+ annotations:
+ description: Host system has an unusually high drop in network transmission ({{
+ humanize $value }}). Reported by instance {{ $labels.instance }} of job {{
+ $labels.job }}
+ summary: Server has a high transmit drop
+ - alert: node_high_network_errs_rcv
+ expr: node_network_receive_errs{device!="lo"} > 3000
+ for: 30s
+ labels:
+ severity: warning
+ annotations:
+ description: Host system has an unusually high error rate in network reception
+ ({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job
+ {{ $labels.job }}
+ summary: Server has unusually high reception errors
+ - alert: node_high_network_errs_send
+ expr: node_network_transmit_errs{device!="lo"} > 3000
+ for: 30s
+ labels:
+ severity: warning
+ annotations:
+ description: Host system has an unusually high error rate in network transmission
+ ({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job
+ {{ $labels.job }}
+ summary: Server has unusually high transmission errors
+ - alert: node_network_conntrack_usage_80percent
+ expr: sort(node_nf_conntrack_entries{job="node-exporter"} > node_nf_conntrack_entries_limit{job="node-exporter"} * 0.8)
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: '{{$labels.instance}} has {{ $value }} network conntrack entries, which is more than 80% of the maximum limit'
+ summary: '{{$labels.instance}}: available network conntrack entries are low.'
+ - alert: node_entropy_available_low
+ expr: node_entropy_available_bits < 300
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: '{{$labels.instance}} has {{ $value }} available entropy bits, which is less than the required 300'
+ summary: '{{$labels.instance}}: is low on entropy bits.'
+ - alert: node_hwmon_high_cpu_temp
+ expr: node_hwmon_temp_crit_celsius*0.9 - node_hwmon_temp_celsius < 0 OR node_hwmon_temp_max_celsius*0.95 - node_hwmon_temp_celsius < 0
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: '{{$labels.alias}} reports hwmon sensor {{$labels.sensor}}/{{$labels.chip}} temperature value is nearly critical: {{$value}}'
+ summary: '{{$labels.alias}}: Sensor {{$labels.sensor}}/{{$labels.chip}} temp is high: {{$value}}'
+ - alert: node_vmstat_paging_rate_high
+ expr: irate(node_vmstat_pgpgin[5m]) > 80
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: '{{$labels.alias}} has a memory paging rate of change higher than 80%: {{$value}}'
+ summary: '{{$labels.alias}}: memory paging rate is high: {{$value}}'
+ - alert: node_xfs_block_allocation_high
+ expr: 100*(node_xfs_extent_allocation_blocks_allocated_total{job="node-exporter", instance=~"172.17.0.1.*"} / (node_xfs_extent_allocation_blocks_freed_total{job="node-exporter", instance=~"172.17.0.1.*"} + node_xfs_extent_allocation_blocks_allocated_total{job="node-exporter", instance=~"172.17.0.1.*"})) > 80
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: '{{$labels.alias}} has xfs allocation blocks higher than 80%: {{$value}}'
+ summary: '{{$labels.alias}}: xfs block allocation high: {{$value}}'
+ - alert: node_network_bond_slaves_down
+ expr: node_net_bonding_slaves - node_net_bonding_slaves_active > 0
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: '{{ $labels.master }} is missing {{ $value }} slave interface(s).'
+ summary: 'Instance {{ $labels.instance }}: {{ $labels.master }} missing {{ $value }} slave interface(s)'
+ - alert: node_numa_memory_used
+ expr: 100*node_memory_numa_MemUsed / node_memory_numa_MemTotal > 80
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: '{{$labels.alias}} has more than 80% NUMA memory usage: {{ $value }}'
+ summary: '{{$labels.alias}}: has high NUMA memory usage: {{$value}}'
+ - alert: node_ntp_clock_skew_high
+ expr: abs(node_ntp_drift_seconds) > 2
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: '{{$labels.alias}} has time difference of more than 2 seconds compared to NTP server: {{ $value }}'
+ summary: '{{$labels.alias}}: time is skewed by : {{$value}} seconds'
+ - alert: node_disk_read_latency
+ expr: (rate(node_disk_read_time_ms[5m]) / rate(node_disk_reads_completed[5m])) > 10
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: '{{$labels.device}} has a high read latency of {{ $value }}'
+ summary: 'High read latency observed for device {{ $labels.device }}'
+ - alert: node_disk_write_latency
+ expr: (rate(node_disk_write_time_ms[5m]) / rate(node_disk_writes_completed[5m])) > 10
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: '{{$labels.device}} has a high write latency of {{ $value }}'
+ summary: 'High write latency observed for device {{ $labels.device }}'
+ openstack:
+ groups:
+ - name: openstack.rules
+ rules:
+ - alert: os_glance_api_availability
+ expr: check_glance_api != 1
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: 'Glance API is not available at {{$labels.url}} for more than 5 minutes'
+ summary: 'Glance API is not available at {{$labels.url}}'
+ - alert: os_nova_api_availability
+ expr: check_nova_api != 1
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: 'Nova API is not available at {{$labels.url}} for more than 5 minutes'
+ summary: 'Nova API is not available at {{$labels.url}}'
+ - alert: os_keystone_api_availability
+ expr: check_keystone_api != 1
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: 'Keystone API is not available at {{$labels.url}} for more than 5 minutes'
+ summary: 'Keystone API is not available at {{$labels.url}}'
+ - alert: os_neutron_api_availability
+ expr: check_neutron_api != 1
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: 'Neutron API is not available at {{$labels.url}} for more than 5 minutes'
+ summary: 'Neutron API is not available at {{$labels.url}}'
+ - alert: os_swift_api_availability
+ expr: check_swift_api != 1
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: 'Swift API is not available at {{$labels.url}} for more than 5 minutes'
+ summary: 'Swift API is not available at {{$labels.url}}'
+ - alert: os_nova_compute_disabled
+ expr: services_nova_compute_disabled_total > 0
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: 'nova-compute is disabled on certain hosts for more than 5 minutes'
+ summary: 'OpenStack compute service nova-compute is disabled on some hosts'
+ - alert: os_nova_conductor_disabled
+ expr: services_nova_conductor_disabled_total > 0
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: 'nova-conductor is disabled on certain hosts for more than 5 minutes'
+ summary: 'OpenStack compute service nova-conductor is disabled on some hosts'
+ - alert: os_nova_consoleauth_disabled
+ expr: services_nova_consoleauth_disabled_total > 0
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: 'nova-consoleauth is disabled on certain hosts for more than 5 minutes'
+ summary: 'OpenStack compute service nova-consoleauth is disabled on some hosts'
+ - alert: os_nova_scheduler_disabled
+ expr: services_nova_scheduler_disabled_total > 0
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: 'nova-scheduler is disabled on certain hosts for more than 5 minutes'
+ summary: 'OpenStack compute service nova-scheduler is disabled on some hosts'
+ ceph:
+ groups:
+ - name: ceph.rules
+ rules:
+ - alert: ceph_monitor_quorum_low
+ expr: ceph_monitor_quorum_count < 3
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: 'ceph monitor quorum has been less than 3 for more than 5 minutes'
+ summary: 'ceph high availability is at risk'
+ - alert: ceph_cluster_usage_high
+ expr: 100* ceph_cluster_used_bytes/ceph_cluster_capacity_bytes > 80
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: 'ceph cluster capacity usage is more than 80 percent'
+ summary: 'ceph cluster usage is more than 80 percent'
+ - alert: ceph_placement_group_degrade_pct_high
+ expr: 100*ceph_degraded_pgs/ceph_total_pgs > 80
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: 'ceph placement group degradation is more than 80 percent'
+ summary: 'ceph placement groups degraded'
+ - alert: ceph_osd_down_pct_high
+ expr: 100* ceph_osds_down/(ceph_osds_down+ceph_osds_up) > 80
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: 'ceph OSDs down percent is more than 80 percent'
+ summary: 'ceph OSDs down percent is high'
+ - alert: ceph_monitor_clock_skew_high
+ expr: ceph_monitor_clock_skew_seconds > 2
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: 'ceph monitor clock skew on {{$labels.instance}} is more than 2 seconds'
+ summary: 'ceph monitor clock skew high'
+ fluentd:
+ groups:
+ - name: fluentd.rules
+ rules:
+ - alert: fluentd_not_running
+ expr: fluentd_up == 0
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: 'fluentd is down on {{$labels.instance}} for more than 5 minutes'
+ summary: 'Fluentd is down'
+ calico:
+ groups:
+ - name: calico.rules
+ rules:
+ - alert: calico_datapane_failures_high_1h
+ expr: absent(felix_int_dataplane_failures) OR increase(felix_int_dataplane_failures[1h]) > 5
+ labels:
+ severity: page
+ annotations:
+ description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} dataplane failures within the last hour'
+ summary: 'A high number of dataplane failures within Felix are happening'
+ - alert: calico_datapane_address_msg_batch_size_high_5m
+ expr: absent(felix_int_dataplane_addr_msg_batch_size_sum) OR absent(felix_int_dataplane_addr_msg_batch_size_count) OR (felix_int_dataplane_addr_msg_batch_size_sum/felix_int_dataplane_addr_msg_batch_size_count) > 5
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: 'Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane address message batch size'
+ summary: 'Felix address message batch size is high'
+ - alert: calico_datapane_iface_msg_batch_size_high_5m
+ expr: absent(felix_int_dataplane_iface_msg_batch_size_sum) OR absent(felix_int_dataplane_iface_msg_batch_size_count) OR (felix_int_dataplane_iface_msg_batch_size_sum/felix_int_dataplane_iface_msg_batch_size_count) > 5
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ description: 'Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane interface message batch size'
+ summary: 'Felix interface message batch size is high'
+ - alert: calico_ipset_errors_high_1h
+ expr: absent(felix_ipset_errors) OR increase(felix_ipset_errors[1h]) > 5
+ labels:
+ severity: page
+ annotations:
+ description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} ipset errors within the last hour'
+ summary: 'A high number of ipset errors within Felix are happening'
+ - alert: calico_iptable_save_errors_high_1h
+ expr: absent(felix_iptables_save_errors) OR increase(felix_iptables_save_errors[1h]) > 5
+ labels:
+ severity: page
+ annotations:
+ description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable save errors within the last hour'
+ summary: 'A high number of iptable save errors within Felix are happening'
+ - alert: calico_iptable_restore_errors_high_1h
+ expr: absent(felix_iptables_restore_errors) OR increase(felix_iptables_restore_errors[1h]) > 5
+ labels:
+ severity: page
+ annotations:
+ description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable restore errors within the last hour'
+ summary: 'A high number of iptable restore errors within Felix are happening'
+ rabbitmq:
+ groups:
+ - name: rabbitmq.rules
+ rules:
+ - alert: rabbitmq_network_partitions_detected
+ expr: min(partitions) by(instance) > 0
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: 'RabbitMQ at {{ $labels.instance }} has {{ $value }} partitions'
+ summary: 'RabbitMQ Network partitions detected'
+ - alert: rabbitmq_down
+ expr: min(rabbitmq_up) by(instance) != 1
+ for: 10m
+ labels:
+ severity: page
+ annotations:
+ description: 'RabbitMQ Server instance {{ $labels.instance }} is down'
+ summary: 'The RabbitMQ Server instance at {{ $labels.instance }} has been down for the last 10 mins'
+ - alert: rabbitmq_file_descriptor_usage_high
+ expr: fd_used * 100 /fd_total > 80
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: 'RabbitMQ Server instance {{ $labels.instance }} has high file descriptor usage of {{ $value }} percent.'
+ summary: 'RabbitMQ file descriptor usage has been high for the last 10 mins'
+ - alert: rabbitmq_node_disk_free_alarm
+ expr: node_disk_free_alarm > 0
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: 'RabbitMQ Server instance {{ $labels.instance }} has low disk free space available.'
+ summary: 'RabbitMQ disk space usage is high'
+ - alert: rabbitmq_node_memory_alarm
+ expr: node_mem_alarm > 0
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: 'RabbitMQ Server instance {{ $labels.instance }} has low free memory.'
+ summary: 'RabbitMQ memory usage is high'
+ - alert: rabbitmq_less_than_3_nodes
+ expr: running < 3
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: 'RabbitMQ Server has less than 3 nodes running.'
+ summary: 'RabbitMQ server is at risk of losing data'
+ - alert: rabbitmq_queue_messages_returned_high
+ expr: queue_messages_returned_total/queue_messages_published_total * 100 > 50
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ description: 'RabbitMQ Server is returning more than 50 percent of messages received.'
+ summary: 'RabbitMQ server is returning more than 50 percent of messages received.'
+ - alert: rabbitmq_consumers_low_utilization
+ expr: queue_consumer_utilisation < .4
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ description: 'RabbitMQ consumers message consumption speed is low'
+ summary: 'RabbitMQ consumers message consumption speed is low'
+ - alert: rabbitmq_high_message_load
+ expr: queue_messages_total > 17000 or increase(queue_messages_total[5m]) > 4000
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ description: 'RabbitMQ has high message load. Total Queue depth > 17000 or growth more than 4000 messages.'
+ summary: 'RabbitMQ has high message load'
+ elasticsearch:
+ groups:
+ - name: elasticsearch.rules
+ rules:
+ - alert: es_high_process_open_files_count
+ expr: sum(elasticsearch_process_open_files_count) by (host) > 64000
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: 'Elasticsearch at {{ $labels.host }} has more than 64000 open files.'
+ summary: 'Elasticsearch has a very high process open file count.'
+ - alert: es_high_process_cpu_percent
+ expr: elasticsearch_process_cpu_percent > 95
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: 'Elasticsearch at {{ $labels.instance }} has a high process CPU usage of {{ $value }} percent.'
+ summary: 'Elasticsearch process cpu usage is more than 95 percent.'
+ - alert: es_fs_usage_high
+ expr: (100 * (elasticsearch_filesystem_data_size_bytes - elasticsearch_filesystem_data_free_bytes) / elasticsearch_filesystem_data_size_bytes) > 80
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: 'Elasticsearch at {{ $labels.instance }} has filesystem usage of {{ $value }} percent.'
+ summary: 'Elasticsearch filesystem usage is high.'
+ - alert: es_unassigned_shards
+ expr: elasticsearch_cluster_health_unassigned_shards > 0
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: 'Elasticsearch has {{ $value }} unassigned shards.'
+ summary: 'Elasticsearch has unassigned shards and hence an unhealthy cluster state.'
+ - alert: es_cluster_health_timed_out
+ expr: elasticsearch_cluster_health_timed_out > 0
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: 'Elasticsearch cluster health status call timed out {{ $value }} times.'
+ summary: 'Elasticsearch cluster health status calls are timing out.'
+ - alert: es_cluster_health_status_alert
+ expr: elasticsearch_cluster_health_status > 0
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: 'Elasticsearch cluster health status is not green. One or more shards or replicas are unallocated.'
+ summary: 'Elasticsearch cluster health status is not green.'
+ - alert: es_cluster_health_too_few_nodes_running
+ expr: elasticsearch_cluster_health_number_of_nodes < 3
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: 'There are only {{$value}} < 3 ElasticSearch nodes running'
+ summary: 'ElasticSearch running on less than 3 nodes'
+ - alert: es_cluster_health_too_few_data_nodes_running
+ expr: elasticsearch_cluster_health_number_of_data_nodes < 3
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: 'There are only {{$value}} < 3 ElasticSearch data nodes running'
+ summary: 'ElasticSearch running on less than 3 data nodes'
+ mariadb:
+ groups:
+ - name: mariadb.rules
+ rules:
+ - alert: mariadb_table_lock_wait_high
+ expr: 100 * mysql_global_status_table_locks_waited/(mysql_global_status_table_locks_waited + mysql_global_status_table_locks_immediate) > 30
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: 'Mariadb has high table lock waits of {{ $value }} percent'
+ summary: 'Mariadb table lock waits are high'
+ - alert: mariadb_node_not_ready
+ expr: mysql_global_status_wsrep_ready != 1
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: '{{$labels.job}} on {{$labels.instance}} is not ready.'
+ summary: 'Galera cluster node not ready'
+ - alert: mariadb_galera_node_out_of_sync
+ expr: mysql_global_status_wsrep_local_state != 4 AND mysql_global_variables_wsrep_desync == 0
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: '{{$labels.job}} on {{$labels.instance}} is not in sync ({{$value}} != 4)'
+ summary: 'Galera cluster node out of sync'
+ - alert: mariadb_innodb_replication_fallen_behind
+ expr: (mysql_global_variables_innodb_replication_delay > 30) AND on (instance) (predict_linear(mysql_global_variables_innodb_replication_delay[5m], 60*2) > 0)
+ for: 10m
+ labels:
+ severity: warning
+ annotations:
+ description: 'The mysql innodb replication has fallen behind and is not recovering'
+ summary: 'MySQL innodb replication is lagging'
dependencies:
- osh-infra-helm-toolkit
...
diff --git a/global/software/charts/osh-infra/osh-infra-radosgw/chart-group.yaml b/global/software/charts/osh-infra/osh-infra-radosgw/chart-group.yaml
new file mode 100644
index 000000000..07d160819
--- /dev/null
+++ b/global/software/charts/osh-infra/osh-infra-radosgw/chart-group.yaml
@@ -0,0 +1,13 @@
+---
+schema: armada/ChartGroup/v1
+metadata:
+ schema: metadata/Document/v1
+ name: osh-infra-radosgw
+ layeringDefinition:
+ abstract: false
+ layer: global
+ storagePolicy: cleartext
+data:
+ description: Deploy Radosgw for OSH-Infra
+ chart_group:
+ - osh-infra-radosgw
diff --git a/global/software/charts/osh-infra/osh-infra-radosgw/radosgw.yaml b/global/software/charts/osh-infra/osh-infra-radosgw/radosgw.yaml
new file mode 100644
index 000000000..b39c703fb
--- /dev/null
+++ b/global/software/charts/osh-infra/osh-infra-radosgw/radosgw.yaml
@@ -0,0 +1,118 @@
+---
+schema: armada/Chart/v1
+metadata:
+ schema: metadata/Document/v1
+ name: osh-infra-radosgw
+ layeringDefinition:
+ abstract: false
+ layer: global
+ storagePolicy: cleartext
+ substitutions:
+ # Chart source
+ - src:
+ schema: pegleg/SoftwareVersions/v1
+ name: software-versions
+ path: .charts.ucp.ceph-rgw
+ dest:
+ path: .source
+
+ # Images
+ - src:
+ schema: pegleg/SoftwareVersions/v1
+ name: software-versions
+ path: .images.ceph.ceph-rgw
+ dest:
+ path: .values.images.tags
+
+ # IP addresses
+ - src:
+ schema: pegleg/CommonAddresses/v1
+ name: common-addresses
+ path: .storage.ceph.public_cidr
+ dest:
+ path: .values.network.public
+ - src:
+ schema: pegleg/CommonAddresses/v1
+ name: common-addresses
+ path: .storage.ceph.cluster_cidr
+ dest:
+ path: .values.network.cluster
+
+ # Endpoints
+ - src:
+ schema: pegleg/EndpointCatalogue/v1
+ name: osh_infra_endpoints
+ path: .osh_infra.ceph_object_store
+ dest:
+ path: .values.endpoints.ceph_object_store
+ - src:
+ schema: pegleg/EndpointCatalogue/v1
+ name: ucp_endpoints
+ path: .ceph.ceph_mon
+ dest:
+ path: .values.endpoints.ceph_mon
+
+ # Credentials
+ - src:
+ schema: pegleg/AccountCatalogue/v1
+ name: osh_infra_service_accounts
+ path: .osh_infra.ceph_object_store.admin
+ dest:
+ path: .values.endpoints.ceph_object_store.auth.admin
+
+ # Secrets
+ - dest:
+ path: .values.endpoints.ceph_object_store.auth.admin.access_key
+ src:
+ schema: deckhand/Passphrase/v1
+ name: osh_infra_rgw_s3_admin_access_key
+ path: .
+ - dest:
+ path: .values.endpoints.ceph_object_store.auth.admin.secret_key
+ src:
+ schema: deckhand/Passphrase/v1
+ name: osh_infra_rgw_s3_admin_secret_key
+ path: .
+
+data:
+ chart_name: osh-infra-radosgw
+ release: osh-infra-radosgw
+ namespace: osh-infra
+ wait:
+ timeout: 900
+ labels:
+ release_group: clcp-osh-infra-radosgw
+ install:
+ no_hooks: false
+ upgrade:
+ no_hooks: false
+ pre:
+ delete:
+ - type: job
+ labels:
+ release_group: clcp-osh-infra-radosgw
+ values:
+ labels:
+ job:
+ node_selector_key: openstack-control-plane
+ node_selector_value: enabled
+ rgw:
+ node_selector_key: ceph-rgw
+ node_selector_value: enabled
+ deployment:
+ storage_secrets: false
+ ceph: true
+ rbd_provisioner: false
+ cephfs_provisioner: false
+ client_secrets: false
+ rgw_keystone_user_and_endpoints: false
+ bootstrap:
+ enabled: false
+ conf:
+ rgw_s3:
+ enabled: true
+ ceph_client:
+ configmap: ceph-etc
+ dependencies:
+ - osh-infra-helm-toolkit
+...
diff --git a/global/software/charts/osh/openstack-mariadb/mariadb.yaml b/global/software/charts/osh/openstack-mariadb/mariadb.yaml
index cae3239a4..d93ef70a0 100644
--- a/global/software/charts/osh/openstack-mariadb/mariadb.yaml
+++ b/global/software/charts/osh/openstack-mariadb/mariadb.yaml
@@ -31,6 +31,13 @@ metadata:
path: .osh.oslo_db
dest:
path: .values.endpoints.olso_db
+ - src:
+ schema: pegleg/EndpointCatalogue/v1
+ name: osh_endpoints
+ path: .osh.prometheus_mysql_exporter
+ dest:
+ path: .values.endpoints.prometheus_mysql_exporter
+
# Accounts
- src:
schema: pegleg/AccountCatalogue/v1
@@ -38,6 +45,12 @@ metadata:
path: .osh.oslo_db.admin
dest:
path: .values.endpoints.oslo_db.auth.admin
+ - src:
+ schema: pegleg/AccountCatalogue/v1
+ name: osh_service_accounts
+ path: .osh.prometheus_mysql_exporter.user
+ dest:
+ path: .values.endpoints.prometheus_mysql_exporter.auth.user
# Secrets
- dest:
@@ -46,6 +59,12 @@ metadata:
schema: deckhand/Passphrase/v1
name: osh_oslo_db_admin_password
path: .
+ - dest:
+ path: .values.endpoints.oslo_db.auth.exporter.password
+ src:
+ schema: deckhand/Passphrase/v1
+ name: osh_oslo_db_exporter_password
+ path: .
data:
chart_name: openstack-mariadb
@@ -72,6 +91,9 @@ data:
prometheus_mysql_exporter:
node_selector_key: openstack-control-plane
node_selector_value: enabled
+ monitoring:
+ prometheus:
+ enabled: true
dependencies:
- osh-helm-toolkit
...
diff --git a/global/software/config/versions.yaml b/global/software/config/versions.yaml
index 3573b796e..88e39d7b2 100644
--- a/global/software/config/versions.yaml
+++ b/global/software/config/versions.yaml
@@ -182,57 +182,62 @@ data:
osh_infra:
elasticsearch:
location: https://git.openstack.org/openstack/openstack-helm-infra
- reference: 4f4e9c5838e9cdf25c453c6a5b85bfc1ce12ad91
+ reference: bc1afb87d7aa529a4ed5321d889cdfe2f1af8a44
subpath: elasticsearch
type: git
fluent_logging:
location: https://git.openstack.org/openstack/openstack-helm-infra
- reference: 4f4e9c5838e9cdf25c453c6a5b85bfc1ce12ad91
+ reference: bc1afb87d7aa529a4ed5321d889cdfe2f1af8a44
subpath: fluent-logging
type: git
grafana:
location: https://git.openstack.org/openstack/openstack-helm-infra
- reference: 4f4e9c5838e9cdf25c453c6a5b85bfc1ce12ad91
+ reference: bc1afb87d7aa529a4ed5321d889cdfe2f1af8a44
subpath: grafana
type: git
helm_toolkit:
location: https://git.openstack.org/openstack/openstack-helm-infra
- reference: 4f4e9c5838e9cdf25c453c6a5b85bfc1ce12ad91
+ reference: bc1afb87d7aa529a4ed5321d889cdfe2f1af8a44
subpath: helm-toolkit
type: git
kibana:
location: https://git.openstack.org/openstack/openstack-helm-infra
- reference: 4f4e9c5838e9cdf25c453c6a5b85bfc1ce12ad91
+ reference: bc1afb87d7aa529a4ed5321d889cdfe2f1af8a44
subpath: kibana
type: git
nagios:
location: https://git.openstack.org/openstack/openstack-helm-infra
- reference: 4f4e9c5838e9cdf25c453c6a5b85bfc1ce12ad91
+ reference: bc1afb87d7aa529a4ed5321d889cdfe2f1af8a44
subpath: nagios
type: git
prometheus:
location: https://git.openstack.org/openstack/openstack-helm-infra
- reference: 4f4e9c5838e9cdf25c453c6a5b85bfc1ce12ad91
+ reference: bc1afb87d7aa529a4ed5321d889cdfe2f1af8a44
subpath: prometheus
type: git
prometheus_alertmanager:
location: https://git.openstack.org/openstack/openstack-helm-infra
- reference: 4f4e9c5838e9cdf25c453c6a5b85bfc1ce12ad91
+ reference: bc1afb87d7aa529a4ed5321d889cdfe2f1af8a44
subpath: prometheus-alertmanager
type: git
prometheus_kube_state_metrics:
location: https://git.openstack.org/openstack/openstack-helm-infra
- reference: 4f4e9c5838e9cdf25c453c6a5b85bfc1ce12ad91
+ reference: bc1afb87d7aa529a4ed5321d889cdfe2f1af8a44
subpath: prometheus-kube-state-metrics
type: git
prometheus_node_exporter:
location: https://git.openstack.org/openstack/openstack-helm-infra
- reference: 4f4e9c5838e9cdf25c453c6a5b85bfc1ce12ad91
+ reference: bc1afb87d7aa529a4ed5321d889cdfe2f1af8a44
subpath: prometheus-node-exporter
type: git
+ prometheus_process_exporter:
+ location: https://git.openstack.org/openstack/openstack-helm-infra
+ reference: bc1afb87d7aa529a4ed5321d889cdfe2f1af8a44
+ subpath: prometheus-process-exporter
+ type: git
prometheus_openstack_exporter:
location: https://git.openstack.org/openstack/openstack-helm-infra
- reference: 4f4e9c5838e9cdf25c453c6a5b85bfc1ce12ad91
+ reference: bc1afb87d7aa529a4ed5321d889cdfe2f1af8a44
subpath: prometheus-openstack-exporter
type: git
ucp:
@@ -661,6 +666,9 @@ data:
curator: docker.io/bobrik/curator:5.2.0
dep_check: quay.io/stackanetes/kubernetes-entrypoint:v0.3.1
elasticsearch: docker.io/elasticsearch:5.6.4
+ ceph_key_placement: docker.io/port/ceph-config-helper:v1.10.3
+ s3_bucket: docker.io/port/ceph-config-helper:v1.10.3
+ s3_user: docker.io/port/ceph-config-helper:v1.10.3
helm_tests: docker.io/openstackhelm/heat:ocata
image_repo_sync: docker.io/docker:17.07.0
memory_init: docker.io/openstackhelm/heat:ocata
@@ -713,7 +721,7 @@ data:
ks_endpoints: docker.io/openstackhelm/heat:ocata
ks_service: docker.io/openstackhelm/heat:ocata
ks_user: docker.io/openstackhelm/heat:ocata
- prometheus_openstack_exporter: quay.io/attcomdev/prometheus-openstack-exporter:3231f14419f0c47547ce2551b7d884cd222104e6
+ prometheus_openstack_exporter: quay.io/attcomdev/prometheus-openstack-exporter:5010c3a532471d4940471a189ca8456bc4db46cb
ucp:
armada:
api: quay.io/airshipit/armada:90618f549c1f6d7741b11dc5c4898f3c6d536895
diff --git a/global/software/manifests/full-site.yaml b/global/software/manifests/full-site.yaml
index ed3a5015d..f51883c9f 100644
--- a/global/software/manifests/full-site.yaml
+++ b/global/software/manifests/full-site.yaml
@@ -31,6 +31,7 @@ data:
- ucp-shipyard
- osh-infra-ingress-controller
- osh-infra-ceph-config
+ - osh-infra-radosgw
- osh-infra-logging
- osh-infra-monitoring
- osh-infra-mariadb
diff --git a/site/airship-seaworthy/secrets/passphrases/osh_infra_oslo_db_exporter_password.yaml b/site/airship-seaworthy/secrets/passphrases/osh_infra_oslo_db_exporter_password.yaml
new file mode 100644
index 000000000..f134f46a9
--- /dev/null
+++ b/site/airship-seaworthy/secrets/passphrases/osh_infra_oslo_db_exporter_password.yaml
@@ -0,0 +1,11 @@
+---
+schema: deckhand/Passphrase/v1
+metadata:
+ schema: metadata/Document/v1
+ name: osh_infra_oslo_db_exporter_password
+ layeringDefinition:
+ abstract: false
+ layer: site
+ storagePolicy: cleartext
+data: password123
+...
diff --git a/site/airship-seaworthy/secrets/passphrases/osh_infra_prometheus_admin_password.yaml b/site/airship-seaworthy/secrets/passphrases/osh_infra_prometheus_admin_password.yaml
new file mode 100644
index 000000000..b3df5f659
--- /dev/null
+++ b/site/airship-seaworthy/secrets/passphrases/osh_infra_prometheus_admin_password.yaml
@@ -0,0 +1,11 @@
+---
+schema: deckhand/Passphrase/v1
+metadata:
+ schema: metadata/Document/v1
+ name: osh_infra_prometheus_admin_password
+ layeringDefinition:
+ abstract: false
+ layer: site
+ storagePolicy: cleartext
+data: password123
+...
diff --git a/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_admin_access_key.yaml b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_admin_access_key.yaml
new file mode 100644
index 000000000..7fc1eddf1
--- /dev/null
+++ b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_admin_access_key.yaml
@@ -0,0 +1,11 @@
+---
+schema: deckhand/Passphrase/v1
+metadata:
+ schema: metadata/Document/v1
+ name: osh_infra_rgw_s3_admin_access_key
+ layeringDefinition:
+ abstract: false
+ layer: site
+ storagePolicy: cleartext
+data: password123
+...
diff --git a/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_admin_secret_key.yaml b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_admin_secret_key.yaml
new file mode 100644
index 000000000..32f7d80f5
--- /dev/null
+++ b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_admin_secret_key.yaml
@@ -0,0 +1,11 @@
+---
+schema: deckhand/Passphrase/v1
+metadata:
+ schema: metadata/Document/v1
+ name: osh_infra_rgw_s3_admin_secret_key
+ layeringDefinition:
+ abstract: false
+ layer: site
+ storagePolicy: cleartext
+data: password123
+...
diff --git a/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_elasticsearch_access_key.yaml b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_elasticsearch_access_key.yaml
new file mode 100644
index 000000000..befc16e1f
--- /dev/null
+++ b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_elasticsearch_access_key.yaml
@@ -0,0 +1,11 @@
+---
+schema: deckhand/Passphrase/v1
+metadata:
+ schema: metadata/Document/v1
+ name: osh_infra_rgw_s3_elasticsearch_access_key
+ layeringDefinition:
+ abstract: false
+ layer: site
+ storagePolicy: cleartext
+data: password123
+...
diff --git a/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_elasticsearch_secret_key.yaml b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_elasticsearch_secret_key.yaml
new file mode 100644
index 000000000..6dff56e51
--- /dev/null
+++ b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_elasticsearch_secret_key.yaml
@@ -0,0 +1,11 @@
+---
+schema: deckhand/Passphrase/v1
+metadata:
+ schema: metadata/Document/v1
+ name: osh_infra_rgw_s3_elasticsearch_secret_key
+ layeringDefinition:
+ abstract: false
+ layer: site
+ storagePolicy: cleartext
+data: password123
+...
diff --git a/site/airship-seaworthy/secrets/passphrases/osh_oslo_db_exporter_password.yaml b/site/airship-seaworthy/secrets/passphrases/osh_oslo_db_exporter_password.yaml
new file mode 100644
index 000000000..61b4144ad
--- /dev/null
+++ b/site/airship-seaworthy/secrets/passphrases/osh_oslo_db_exporter_password.yaml
@@ -0,0 +1,11 @@
+---
+schema: deckhand/Passphrase/v1
+metadata:
+ schema: metadata/Document/v1
+ name: osh_oslo_db_exporter_password
+ layeringDefinition:
+ abstract: false
+ layer: site
+ storagePolicy: cleartext
+data: password123
+...
diff --git a/site/airship-seaworthy/software/config/endpoints.yaml b/site/airship-seaworthy/software/config/endpoints.yaml
index 3f67f8372..a77fbd52f 100644
--- a/site/airship-seaworthy/software/config/endpoints.yaml
+++ b/site/airship-seaworthy/software/config/endpoints.yaml
@@ -715,6 +715,19 @@ data:
default: 3306
wsrep:
default: 4567
+ prometheus_mysql_exporter:
+ namespace: openstack
+ hosts:
+ default: mysql-exporter
+ host_fqdn_override:
+ default: null
+ path:
+ default: /metrics
+ scheme:
+ default: 'http'
+ port:
+ metrics:
+ default: 9104
keystone_oslo_messaging:
namespace: openstack
hosts:
@@ -1257,6 +1270,22 @@ metadata:
# pattern: AUTH_PATH
data:
osh_infra:
+ ceph_object_store:
+ name: radosgw
+ namespace: osh-infra
+ hosts:
+ default: ceph-rgw
+ public: radosgw
+ host_fqdn_override:
+ default: null
+ path:
+ default: null
+ scheme:
+ default: "http"
+ port:
+ api:
+ default: 8088
+ public: 80
elasticsearch:
name: elasticsearch
namespace: osh-infra
@@ -1272,8 +1301,12 @@ data:
scheme:
default: "http"
port:
+ client:
+ default: 9200
http:
default: 80
+ discovery:
+ default: 9300
prometheus_elasticsearch_exporter:
namespace: null
hosts:
@@ -1327,6 +1360,19 @@ data:
port:
mysql:
default: 3306
+ prometheus_mysql_exporter:
+ namespace: openstack
+ hosts:
+ default: mysql-exporter
+ host_fqdn_override:
+ default: null
+ path:
+ default: /metrics
+ scheme:
+ default: 'http'
+ port:
+ metrics:
+ default: 9104
grafana:
name: grafana
namespace: osh-infra
@@ -1345,6 +1391,7 @@ data:
port:
grafana:
default: 3000
+ public: 80
# public: 443
monitoring:
name: prometheus
@@ -1361,7 +1408,8 @@ data:
port:
api:
default: 9090
- public: 80
+ http:
+ default: 80
kibana:
name: kibana
namespace: osh-infra
@@ -1380,6 +1428,8 @@ data:
port:
kibana:
default: 5601
+ http:
+ default: 80
# public: 443
alerts:
name: alertmanager
@@ -1438,6 +1488,19 @@ data:
default: 9100
prometheus_port:
default: 9100
+ process_exporter_metrics:
+ namespace: kube-system
+ hosts:
+ default: process-exporter
+ host_fqdn_override:
+ default: null
+ path:
+ default: null
+ scheme:
+ default: "http"
+ port:
+ metrics:
+ default: 9256
prometheus_openstack_exporter:
namespace: openstack
hosts:
diff --git a/site/airship-seaworthy/software/config/service_accounts.yaml b/site/airship-seaworthy/software/config/service_accounts.yaml
index 792072936..1320c7028 100644
--- a/site/airship-seaworthy/software/config/service_accounts.yaml
+++ b/site/airship-seaworthy/software/config/service_accounts.yaml
@@ -303,6 +303,9 @@ data:
oslo_db:
admin:
username: root
+ prometheus_mysql_exporter:
+ user:
+ username: osh-oslodb-exporter
neutron:
neutron:
role: admin
@@ -383,6 +386,11 @@ metadata:
path: .osh_infra.prometheus_openstack_exporter.user.region_name
data:
osh_infra:
+ ceph_object_store:
+ admin:
+ username: s3_admin
+ elasticsearch:
+ username: elasticsearch
grafana:
admin:
username: grafana
@@ -401,6 +409,9 @@ data:
oslo_db:
admin:
username: root
+ prometheus_mysql_exporter:
+ user:
+ username: osh-infra-oslodb-exporter
prometheus_openstack_exporter:
user:
role: admin
@@ -411,6 +422,9 @@ data:
nagios:
admin:
username: nagios
+ prometheus:
+ admin:
+ username: prometheus
ldap:
admin:
# NEWSITE-CHANGEME: Replace with the site's LDAP account used to
diff --git a/site/airship-seaworthy/software/manifests/full-site.yaml b/site/airship-seaworthy/software/manifests/full-site.yaml
index 593dfc106..77ddc3f80 100644
--- a/site/airship-seaworthy/software/manifests/full-site.yaml
+++ b/site/airship-seaworthy/software/manifests/full-site.yaml
@@ -37,6 +37,7 @@ data:
- ucp-shipyard
- osh-infra-ingress-controller
- osh-infra-ceph-config
+ - osh-infra-radosgw
- osh-infra-logging
- osh-infra-monitoring
- osh-infra-mariadb