diff --git a/global/profiles/host/cp.yaml b/global/profiles/host/cp.yaml index f976652bd..1ab68a052 100644 --- a/global/profiles/host/cp.yaml +++ b/global/profiles/host/cp.yaml @@ -105,4 +105,5 @@ data: kube-ingress: enabled beta.kubernetes.io/fluentd-ds-ready: 'true' node-exporter: enabled + fluentbit: enabled ... diff --git a/global/profiles/host/dp.yaml b/global/profiles/host/dp.yaml index 2a454edb0..51df5f68d 100644 --- a/global/profiles/host/dp.yaml +++ b/global/profiles/host/dp.yaml @@ -57,4 +57,5 @@ data: openstack-libvirt: kernel beta.kubernetes.io/fluentd-ds-ready: 'true' node-exporter: enabled + fluentbit: enabled ... diff --git a/global/software/charts/osh-infra/osh-infra-dashboards/grafana.yaml b/global/software/charts/osh-infra/osh-infra-dashboards/grafana.yaml index 48e36e23a..ae7ddf022 100644 --- a/global/software/charts/osh-infra/osh-infra-dashboards/grafana.yaml +++ b/global/software/charts/osh-infra/osh-infra-dashboards/grafana.yaml @@ -68,6 +68,12 @@ metadata: path: .osh_infra.grafana.oslo_db dest: path: .values.endpoints.oslo_db.auth.user + - src: + schema: pegleg/AccountCatalogue/v1 + name: osh_infra_service_accounts + path: .osh_infra.prometheus.admin + dest: + path: .values.endpoints.prometheus.auth.admin - src: schema: pegleg/AccountCatalogue/v1 name: osh_infra_service_accounts @@ -120,6 +126,12 @@ metadata: schema: deckhand/Passphrase/v1 name: osh_infra_oslo_db_admin_password path: . + - dest: + path: .values.endpoints.prometheus.auth.admin.password + src: + schema: deckhand/Passphrase/v1 + name: osh_infra_prometheus_admin_password + path: . # LDAP Configuration Details - src: diff --git a/global/software/charts/osh-infra/osh-infra-logging/elasticsearch.yaml b/global/software/charts/osh-infra/osh-infra-logging/elasticsearch.yaml index 2e7a6b640..6b16eddc6 100644 --- a/global/software/charts/osh-infra/osh-infra-logging/elasticsearch.yaml +++ b/global/software/charts/osh-infra/osh-infra-logging/elasticsearch.yaml @@ -54,7 +54,18 @@ metadata: path: .osh_infra.elasticsearch.admin dest: path: .values.endpoints.elasticsearch.auth.admin - + - src: + schema: pegleg/AccountCatalogue/v1 + name: osh_infra_service_accounts + path: .osh_infra.ceph_object_store.admin + dest: + path: .values.endpoints.ceph_object_store.auth.admin + - src: + schema: pegleg/AccountCatalogue/v1 + name: osh_infra_service_accounts + path: .osh_infra.ceph_object_store.elasticsearch + dest: + path: .values.endpoints.ceph_object_store.auth.elasticsearch # Secrets - dest: path: .values.endpoints.elasticsearch.auth.admin.password @@ -62,6 +73,30 @@ metadata: schema: deckhand/Passphrase/v1 name: osh_infra_elasticsearch_admin_password path: . + - dest: + path: .values.endpoints.ceph_object_store.auth.admin.access_key + src: + schema: deckhand/Passphrase/v1 + name: osh_infra_rgw_s3_admin_access_key + path: . + - dest: + path: .values.endpoints.ceph_object_store.auth.admin.secret_key + src: + schema: deckhand/Passphrase/v1 + name: osh_infra_rgw_s3_admin_secret_key + path: . + - dest: + path: .values.endpoints.ceph_object_store.auth.elasticsearch.access_key + src: + schema: deckhand/Passphrase/v1 + name: osh_infra_rgw_s3_elasticsearch_access_key + path: . + - dest: + path: .values.endpoints.ceph_object_store.auth.elasticsearch.secret_key + src: + schema: deckhand/Passphrase/v1 + name: osh_infra_rgw_s3_elasticsearch_secret_key + path: . 
# LDAP Details - src: @@ -97,6 +132,75 @@ data: post: create: [] values: + pod: + replicas: + client: 5 + resources: + enabled: true + apache_proxy: + limits: + memory: "1024Mi" + cpu: "2000m" + requests: + memory: "0" + cpu: "0" + client: + requests: + memory: "8Gi" + cpu: "1000m" + limits: + memory: "16Gi" + cpu: "2000m" + master: + requests: + memory: "8Gi" + cpu: "1000m" + limits: + memory: "16Gi" + cpu: "2000m" + data: + requests: + memory: "8Gi" + cpu: "1000m" + limits: + memory: "16Gi" + cpu: "2000m" + prometheus_elasticsearch_exporter: + requests: + memory: "0" + cpu: "0" + limits: + memory: "1024Mi" + cpu: "2000m" + jobs: + curator: + requests: + memory: "0" + cpu: "0" + limits: + memory: "1024Mi" + cpu: "2000m" + image_repo_sync: + requests: + memory: "0" + cpu: "0" + limits: + memory: "1024Mi" + cpu: "2000m" + snapshot_repository: + requests: + memory: "0" + cpu: "0" + limits: + memory: "1024Mi" + cpu: "2000m" + tests: + requests: + memory: "0" + cpu: "0" + limits: + memory: "1024Mi" + cpu: "2000m" labels: elasticsearch: node_selector_key: openstack-control-plane @@ -108,27 +212,95 @@ data: prometheus: enabled: true conf: - apache: - host: | - - - ProxyPass http://localhost:{{ tuple "elasticsearch" "internal" "client" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}/ - ProxyPassReverse http://localhost:{{ tuple "elasticsearch" "internal" "client" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}/ - - - AuthName "Elasticsearch" - AuthType Basic - AuthBasicProvider file ldap - AuthUserFile /usr/local/apache2/conf/.htpasswd - AuthLDAPBindDN {{ .Values.endpoints.ldap.auth.admin.bind }} - AuthLDAPBindPassword {{ .Values.endpoints.ldap.auth.admin.password }} - AuthLDAPURL {{ tuple "ldap" "public" "ldap" . | include "helm-toolkit.endpoints.keystone_endpoint_uri_lookup" }} - Require valid-user - - + httpd: | + ServerRoot "/usr/local/apache2" + Listen 80 + LoadModule mpm_event_module modules/mod_mpm_event.so + LoadModule authn_file_module modules/mod_authn_file.so + LoadModule authn_core_module modules/mod_authn_core.so + LoadModule authz_host_module modules/mod_authz_host.so + LoadModule authz_groupfile_module modules/mod_authz_groupfile.so + LoadModule authz_user_module modules/mod_authz_user.so + LoadModule authz_core_module modules/mod_authz_core.so + LoadModule access_compat_module modules/mod_access_compat.so + LoadModule auth_basic_module modules/mod_auth_basic.so + LoadModule ldap_module modules/mod_ldap.so + LoadModule authnz_ldap_module modules/mod_authnz_ldap.so + LoadModule reqtimeout_module modules/mod_reqtimeout.so + LoadModule filter_module modules/mod_filter.so + LoadModule proxy_html_module modules/mod_proxy_html.so + LoadModule log_config_module modules/mod_log_config.so + LoadModule env_module modules/mod_env.so + LoadModule headers_module modules/mod_headers.so + LoadModule setenvif_module modules/mod_setenvif.so + LoadModule version_module modules/mod_version.so + LoadModule proxy_module modules/mod_proxy.so + LoadModule proxy_connect_module modules/mod_proxy_connect.so + LoadModule proxy_http_module modules/mod_proxy_http.so + LoadModule proxy_balancer_module modules/mod_proxy_balancer.so + LoadModule slotmem_shm_module modules/mod_slotmem_shm.so + LoadModule slotmem_plain_module modules/mod_slotmem_plain.so + LoadModule unixd_module modules/mod_unixd.so + LoadModule status_module modules/mod_status.so + LoadModule autoindex_module modules/mod_autoindex.so + + User daemon + Group daemon + + + AllowOverride none + Require all denied + + + 
Require all denied + + ErrorLog /dev/stderr + LogLevel warn + + LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" combined + LogFormat "%h %l %u %t \"%r\" %>s %b" common + + LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" %I %O" combinedio + + CustomLog /dev/stdout common + CustomLog /dev/stdout combined + + + AllowOverride None + Options None + Require all granted + + + RequestHeader unset Proxy early + + + Include conf/extra/proxy-html.conf + + + + ProxyPass http://localhost:{{ tuple "elasticsearch" "internal" "client" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}/ + ProxyPassReverse http://localhost:{{ tuple "elasticsearch" "internal" "client" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}/ + + + AuthName "Elasticsearch" + AuthType Basic + AuthBasicProvider file ldap + AuthUserFile /usr/local/apache2/conf/.htpasswd + AuthLDAPBindDN {{ .Values.endpoints.ldap.auth.admin.bind }} + AuthLDAPBindPassword {{ .Values.endpoints.ldap.auth.admin.password }} + AuthLDAPURL {{ tuple "ldap" "public" "ldap" . | include "helm-toolkit.endpoints.keystone_endpoint_uri_lookup" | quote }} + Require valid-user + + elasticsearch: + config: + http: + max_content_length: 2gb + pipelining: false env: - java_opts: "-Xms5g -Xmx5g" + java_opts: "-Xms8g -Xmx8g" + snapshots: + enabled: true curator: #run every 6th hour schedule: "0 */6 * * *" diff --git a/global/software/charts/osh-infra/osh-infra-logging/fluent-logging.yaml b/global/software/charts/osh-infra/osh-infra-logging/fluent-logging.yaml index 28f9dd633..f6e41179a 100644 --- a/global/software/charts/osh-infra/osh-infra-logging/fluent-logging.yaml +++ b/global/software/charts/osh-infra/osh-infra-logging/fluent-logging.yaml @@ -82,12 +82,51 @@ data: post: create: [] values: + pod: + resources: + enabled: true + fluentbit: + limits: + memory: '4Gi' + cpu: '2000m' + requests: + memory: '2Gi' + cpu: '1000m' + fluentd: + limits: + memory: '4Gi' + cpu: '2000m' + requests: + memory: '2Gi' + cpu: '1000m' + prometheus_fluentd_exporter: + limits: + memory: '1024Mi' + cpu: '2000m' + requests: + memory: '0' + cpu: '0' + jobs: + image_repo_sync: + requests: + memory: '0' + cpu: '0' + limits: + memory: '1024Mi' + cpu: '2000m' + tests: + requests: + memory: '0' + cpu: '0' + limits: + memory: '1024Mi' + cpu: '2000m' labels: fluentd: - node_selector_key: openstack-control-plane + node_selector_key: fluentd node_selector_value: enabled fluentbit: - node_selector_key: openstack-control-plane + node_selector_key: fluentbit node_selector_value: enabled prometheus_fluentd_exporter: node_selector_key: openstack-control-plane @@ -95,20 +134,6 @@ data: job: node_selector_key: openstack-control-plane node_selector_value: enabled - dependencies: - static: - fluentbit: - jobs: "" - services: - - endpoint: internal - service: fluentd - fluentd: - jobs: "" - services: - - endpoint: internal - service: elasticsearch - manifests: - job_elasticsearch_template: false conf: fluentbit: - service: @@ -117,6 +142,28 @@ data: Daemon: Off Log_Level: info Parsers_File: parsers.conf + - kubelet: + header: input + Name: systemd + Path: /var/log/journal + Tag: syslog.* + Systemd_Filter: _SYSTEMD_UNIT=kubelet.service + DB: /var/log/kubelet.db + Mem_Buf_Limit: 5MB + DB.Sync: Normal + Buffer_Chunk_Size: 1M + Buffer_Max_Size: 1M + - docker_daemon: + header: input + Name: systemd + Path: /var/log/journal + Tag: syslog.* + Systemd_Filter: _SYSTEMD_UNIT=docker.service + DB: /var/log/docker.db + Mem_Buf_Limit: 5MB + DB.Sync: Normal + 
Buffer_Chunk_Size: 1M + Buffer_Max_Size: 1M - containers_tail: header: input Name: tail @@ -124,10 +171,14 @@ data: Path: /var/log/containers/*.log Parser: docker DB: /var/log/flb_kube.db + Mem_Buf_Limit: 5MB DB.Sync: Normal Buffer_Chunk_Size: 1M Buffer_Max_Size: 1M - Mem_Buf_Limit: 5MB + - drop_fluentd_logs: + header: output + Name: "null" + Match: "**.fluentd**" - kube_filter: header: filter Name: kubernetes @@ -137,8 +188,16 @@ data: header: output Name: forward Match: "*" - Host: ${FLUENTD_HOST} - Port: ${FLUENTD_PORT} + Host: fluentd-logging.osh-infra + Port: 24224 + parsers: + - docker: + header: parser + Name: docker + Format: json + Time_Key: time + Time_Format: "%Y-%m-%dT%H:%M:%S.%L" + Time_Keep: On td_agent: - metrics_agent: header: source @@ -150,22 +209,268 @@ data: type: forward port: "#{ENV['FLUENTD_PORT']}" bind: 0.0.0.0 - - elasticsearch: + - drop_fluent_logs: + header: match + type: "null" + expression: "fluent.*" + - add_container_name: + header: filter + type: record_transformer + expression: "kube.**" + enable_ruby: true + record: + - + - header: record + container_name: ${record["kubernetes"]["container_name"]} + - remove_openstack_pod_logged_events: + header: filter + type: grep + expression: "kube.**" + exclude: + - + - header: exclude + key: container_name + pattern: ^(cinder-api|cinder-scheduler|cinder-volume|cinder-backup|glance-api|glance-registry|heat-api|heat-cfn|heat-engine|keystone-api|neutron-dhcp-agent|neutron-l3-agent|neutron-server|nova-osapi|nova-api|nova-compute|nova-conductor|nova-consoleauth|nova-novncproxy|nova-scheduler)$ + # NOTE(srwilkers): Look for specific keywords in the log key to determine + # log level of event + - tag_kubernetes_log_level: + header: match + type: rewrite_tag_filter + expression: "kube.var.log.containers.**.log" + rule: + - + - header: rule + key: log + pattern: /info/i + tag: info.${tag} + - header: rule + key: log + pattern: /warn/i + tag: warn.${tag} + - header: rule + key: log + pattern: /error/i + tag: error.${tag} + - header: rule + key: log + pattern: /critical/i + tag: critical.${tag} + - header: rule + key: log + pattern: (.+) + tag: info.${tag} + # NOTE(srwilkers): Create new key for log level, and use the tag prefix + # added previously + - add_kubernetes_log_level_and_application_key: + header: filter + type: record_transformer + enable_ruby: true + expression: "**.kube.var.log.containers.**.log" + record: + - + - header: record + level: ${tag_parts[0]} + application: ${record["kubernetes"]["labels"]["application"]} + - add_openstack_application_key: + header: filter + type: record_transformer + expression: "openstack.**" + record: + - + - header: record + application: ${tag_parts[1]} + #NOTE(srwilkers): This prefixes the tag for oslo.log entries from the + # fluent handler/formatter with the log level, allowing for lookups on + # openstack logs with a particular log level (ie: error.openstack.keystone) + - tag_openstack_log_level: + header: match + type: rewrite_tag_filter + expression: "openstack.**" + rule: + - + - header: rule + key: level + pattern: INFO + tag: info.${tag} + - header: rule + key: level + pattern: WARN + tag: warn.${tag} + - header: rule + key: level + pattern: ERROR + tag: error.${tag} + - header: rule + key: level + pattern: CRITICAL + tag: critical.${tag} + - syslog_elasticsearch: header: match type: elasticsearch user: "#{ENV['ELASTICSEARCH_USERNAME']}" password: "#{ENV['ELASTICSEARCH_PASSWORD']}" - expression: "**" + expression: "syslog.**" include_tag_key: true host: 
"#{ENV['ELASTICSEARCH_HOST']}" port: "#{ENV['ELASTICSEARCH_PORT']}" logstash_format: true + logstash_prefix: syslog + buffer_type: memory buffer_chunk_limit: 10M - buffer_queue_limit: 32 - flush_interval: 20s + buffer_queue_limit: 512 + flush_interval: 10 max_retry_wait: 300 + request_timeout: 60 disable_retry_limit: "" num_threads: 8 + type_name: syslog + - ceph_elasticsearch: + header: match + type: elasticsearch + user: "#{ENV['ELASTICSEARCH_USERNAME']}" + password: "#{ENV['ELASTICSEARCH_PASSWORD']}" + expression: "ceph-**.log" + include_tag_key: true + host: "#{ENV['ELASTICSEARCH_HOST']}" + port: "#{ENV['ELASTICSEARCH_PORT']}" + logstash_format: true + logstash_prefix: ceph + buffer_chunk_limit: 10M + buffer_queue_limit: 512 + flush_interval: 10 + max_retry_wait: 300 + request_timeout: 60 + disable_retry_limit: "" + num_threads: 8 + type_name: ceph_logs + - oslo_fluentd_elasticsearch: + header: match + type: elasticsearch + user: "#{ENV['ELASTICSEARCH_USERNAME']}" + password: "#{ENV['ELASTICSEARCH_PASSWORD']}" + expression: "**.openstack.*" + include_tag_key: true + host: "#{ENV['ELASTICSEARCH_HOST']}" + port: "#{ENV['ELASTICSEARCH_PORT']}" + logstash_format: true + logstash_prefix: openstack + buffer_type: memory + buffer_chunk_limit: 10M + buffer_queue_limit: 512 + flush_interval: 10 + max_retry_wait: 300 + request_timeout: 60 + disable_retry_limit: "" + num_threads: 8 + type_name: oslo_openstack_fluentd + - docker_fluentd_elasticsearch: + header: match + type: elasticsearch + user: "#{ENV['ELASTICSEARCH_USERNAME']}" + password: "#{ENV['ELASTICSEARCH_PASSWORD']}" + expression: "**.kube.**.log" + include_tag_key: true + host: "#{ENV['ELASTICSEARCH_HOST']}" + port: "#{ENV['ELASTICSEARCH_PORT']}" + logstash_format: true + buffer_type: memory + buffer_chunk_limit: 10M + buffer_queue_limit: 512 + flush_interval: 10 + max_retry_wait: 300 + request_timeout: 60 + disable_retry_limit: "" + num_threads: 8 + type_name: docker_fluentd + fluentd_exporter: + log: + format: "logger:stdout?json=true" + level: "info" + templates: + syslog: + template: "syslog-*" + index_patterns: "syslog-*" + settings: + number_of_shards: 1 + mappings: + syslog: + properties: + cluster: + type: keyword + app: + type: keyword + pid: + type: integer + host: + type: keyword + log: + type: text + ceph_logs: + template: "ceph-*" + index_patterns: "ceph-*" + settings: + number_of_shards: 1 + mappings: + ceph_logs: + properties: + log: + type: text + oslo_openstack_fluentd: + template: "openstack-*" + index_patterns: "openstack-*" + settings: + number_of_shards: 1 + mappings: + oslo_openstack_fluentd: + properties: + extra: + properties: + project: + type: text + norms: false + version: + type: text + norms: false + filename: + type: text + norms: false + funcname: + type: text + norms: false + message: + type: text + norms: false + process_name: + type: keyword + index: false + docker_fluentd: + template: "logstash-*" + index_patterns: "logstash-*" + settings: + number_of_shards: 1 + mappings: + docker_fluentd: + properties: + kubernetes: + properties: + container_name: + type: keyword + index: false + docker_id: + type: keyword + index: false + host: + type: keyword + index: false + namespace_name: + type: keyword + index: false + pod_id: + type: keyword + index: false + pod_name: + type: keyword + index: false dependencies: - osh-infra-helm-toolkit ... 
diff --git a/global/software/charts/osh-infra/osh-infra-mariadb/mariadb.yaml b/global/software/charts/osh-infra/osh-infra-mariadb/mariadb.yaml index 310cfec33..1d90c3ccd 100644 --- a/global/software/charts/osh-infra/osh-infra-mariadb/mariadb.yaml +++ b/global/software/charts/osh-infra/osh-infra-mariadb/mariadb.yaml @@ -31,6 +31,13 @@ metadata: path: .osh_infra.oslo_db dest: path: .values.endpoints.olso_db + - src: + schema: pegleg/EndpointCatalogue/v1 + name: osh_infra_endpoints + path: .osh_infra.prometheus_mysql_exporter + dest: + path: .values.endpoints.prometheus_mysql_exporter + # Accounts - src: schema: pegleg/AccountCatalogue/v1 @@ -38,6 +45,12 @@ metadata: path: .osh_infra.oslo_db.admin dest: path: .values.endpoints.oslo_db.auth.admin + - src: + schema: pegleg/AccountCatalogue/v1 + name: osh_infra_service_accounts + path: .osh_infra.prometheus_mysql_exporter.user + dest: + path: .values.endpoints.prometheus_mysql_exporter.auth.user # Secrets - dest: @@ -46,7 +59,12 @@ metadata: schema: deckhand/Passphrase/v1 name: osh_infra_oslo_db_admin_password path: . - + - dest: + path: .values.endpoints.oslo_db.auth.exporter.password + src: + schema: deckhand/Passphrase/v1 + name: osh_infra_oslo_db_exporter_password + path: . data: chart_name: osh-infra-mariadb release: osh-infra-mariadb @@ -72,6 +90,9 @@ data: prometheus_mysql_exporter: node_selector_key: openstack-control-plane node_selector_value: enabled + monitoring: + prometheus: + enabled: true dependencies: - osh-helm-toolkit ... diff --git a/global/software/charts/osh-infra/osh-infra-monitoring/chart-group.yaml b/global/software/charts/osh-infra/osh-infra-monitoring/chart-group.yaml index 4cb879cd4..be06ca8ad 100644 --- a/global/software/charts/osh-infra/osh-infra-monitoring/chart-group.yaml +++ b/global/software/charts/osh-infra/osh-infra-monitoring/chart-group.yaml @@ -13,5 +13,6 @@ data: - prometheus - prometheus-alertmanager - prometheus-node-exporter + - prometheus-process-exporter - prometheus-kube-state-metrics - nagios diff --git a/global/software/charts/osh-infra/osh-infra-monitoring/nagios.yaml b/global/software/charts/osh-infra/osh-infra-monitoring/nagios.yaml index 35ff41b3e..1830a2b86 100644 --- a/global/software/charts/osh-infra/osh-infra-monitoring/nagios.yaml +++ b/global/software/charts/osh-infra/osh-infra-monitoring/nagios.yaml @@ -37,6 +37,12 @@ metadata: path: .osh_infra.monitoring dest: path: .values.endpoints.monitoring + - src: + schema: pegleg/EndpointCatalogue/v1 + name: osh_infra_endpoints + path: .osh_infra.elasticsearch + dest: + path: .values.endpoints.elasticsearch - src: schema: pegleg/EndpointCatalogue/v1 name: osh_infra_endpoints @@ -51,6 +57,18 @@ metadata: path: .osh_infra.nagios.admin dest: path: .values.endpoints.nagios.auth.admin + - src: + schema: pegleg/AccountCatalogue/v1 + name: osh_infra_service_accounts + path: .osh_infra.prometheus.admin + dest: + path: .values.endpoints.prometheus.auth.admin + - src: + schema: pegleg/AccountCatalogue/v1 + name: osh_infra_service_accounts + path: .osh_infra.elasticsearch.admin + dest: + path: .values.endpoints.elasticsearch.auth.admin # Secrets - dest: @@ -59,6 +77,18 @@ metadata: schema: deckhand/Passphrase/v1 name: osh_infra_nagios_admin_password path: . + - dest: + path: .values.endpoints.elasticsearch.auth.admin.password + src: + schema: deckhand/Passphrase/v1 + name: osh_infra_elasticsearch_admin_password + path: . 
+ - dest: + path: .values.endpoints.prometheus.auth.admin.password + src: + schema: deckhand/Passphrase/v1 + name: osh_infra_prometheus_admin_password + path: . # LDAP Details - src: diff --git a/global/software/charts/osh-infra/osh-infra-monitoring/prometheus-process-exporter.yaml b/global/software/charts/osh-infra/osh-infra-monitoring/prometheus-process-exporter.yaml new file mode 100644 index 000000000..d64e8564e --- /dev/null +++ b/global/software/charts/osh-infra/osh-infra-monitoring/prometheus-process-exporter.yaml @@ -0,0 +1,65 @@ +--- +schema: armada/Chart/v1 +metadata: + schema: metadata/Document/v1 + name: prometheus-process-exporter + layeringDefinition: + abstract: false + layer: global + storagePolicy: cleartext + substitutions: + # Chart source + - src: + schema: pegleg/SoftwareVersions/v1 + name: software-versions + path: .charts.osh_infra.prometheus_process_exporter + dest: + path: .source + + # Images + - src: + schema: pegleg/SoftwareVersions/v1 + name: software-versions + path: .images.osh_infra.prometheus_process_exporter + dest: + path: .values.images.tags + + # Endpoints + - src: + schema: pegleg/EndpointCatalogue/v1 + name: osh_infra_endpoints + path: .osh_infra.process_exporter_metrics + dest: + path: .values.endpoints.process_exporter_metrics + +data: + chart_name: prometheus-process-exporter + release: prometheus-process-exporter + namespace: kube-system + wait: + timeout: 900 + labels: + release_group: airship-prometheus-process-exporter + install: + no_hooks: false + upgrade: + no_hooks: false + pre: + delete: + - type: job + labels: + release_group: airship-prometheus-process-exporter + create: [] + post: + create: [] + values: + labels: + node_exporter: + node_selector_key: node-exporter + node_selector_value: enabled + job: + node_selector_key: openstack-control-plane + node_selector_value: enabled + dependencies: + - osh-infra-helm-toolkit +... diff --git a/global/software/charts/osh-infra/osh-infra-monitoring/prometheus.yaml b/global/software/charts/osh-infra/osh-infra-monitoring/prometheus.yaml index 9674e8897..bfde817a9 100644 --- a/global/software/charts/osh-infra/osh-infra-monitoring/prometheus.yaml +++ b/global/software/charts/osh-infra/osh-infra-monitoring/prometheus.yaml @@ -37,6 +37,42 @@ metadata: path: .osh_infra.alerts dest: path: .values.endpoints.alerts + - src: + schema: pegleg/EndpointCatalogue/v1 + name: osh_infra_endpoints + path: .osh_infra.ldap + dest: + path: .values.endpoints.ldap + + # Accounts + - src: + schema: pegleg/AccountCatalogue/v1 + name: osh_infra_service_accounts + path: .osh_infra.prometheus.admin + dest: + path: .values.endpoints.prometheus.auth.admin + + # Secrets + - dest: + path: .values.endpoints.prometheus.auth.admin.password + src: + schema: deckhand/Passphrase/v1 + name: osh_infra_prometheus_admin_password + path: . + + # LDAP Details + - src: + schema: pegleg/AccountCatalogue/v1 + name: osh_infra_service_accounts + path: .osh_infra.ldap.admin + dest: + path: .values.endpoints.ldap.auth.admin + - dest: + path: .values.endpoints.ldap.auth.admin.password + src: + schema: deckhand/Passphrase/v1 + name: osh_keystone_ldap_password + path: . 
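For context on the substitutions above: each one resolves against a separate Deckhand document at deployment time; endpoint and account paths come from the Pegleg catalogues, while passwords come from deckhand/Passphrase/v1 documents such as osh_infra_prometheus_admin_password referenced here. A minimal sketch of the passphrase document this chart expects, assuming the usual site-level secret layout; the placeholder value and the site layer are illustrative:

---
schema: deckhand/Passphrase/v1
metadata:
  schema: metadata/Document/v1
  name: osh_infra_prometheus_admin_password
  layeringDefinition:
    abstract: false
    layer: site
  storagePolicy: cleartext
data: a-placeholder-password  # placeholder; sites can use storagePolicy: encrypted instead
...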
data: chart_name: prometheus @@ -72,9 +108,1545 @@ data: pod: replicas: prometheus: 3 + resources: + enabled: true + prometheus: + limits: + memory: "64Gi" + cpu: "4000m" + requests: + memory: "16Gi" + cpu: "2000m" storage: requests: storage: 500Gi + conf: + prometheus: + command_line_flags: + storage.tsdb.max_block_duration: 17h + scrape_configs: + global: + scrape_interval: 60s + evaluation_interval: 60s + scrape_configs: + # NOTE(srwilkers): The job definition for Prometheus should always be + # listed first, so we can inject the basic auth username and password + # via the endpoints section + - job_name: 'prometheus-metrics' + kubernetes_sd_configs: + - role: endpoints + scrape_interval: 60s + relabel_configs: + - source_labels: + - __meta_kubernetes_service_name + action: keep + regex: "prom-metrics" + - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape + action: keep + regex: true + - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + action: replace + target_label: __scheme__ + regex: (https?) + - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + action: replace + target_label: kubernetes_namespace + - source_labels: + - __meta_kubernetes_service_name + action: replace + target_label: instance + - source_labels: + - __meta_kubernetes_service_name + action: replace + target_label: kubernetes_name + - source_labels: + - __meta_kubernetes_service_name + target_label: job + replacement: ${1} + - job_name: kubelet + scheme: https + # This TLS & bearer token file config is used to connect to the actual scrape + # endpoints for cluster components. This is separate to discovery auth + # configuration because discovery & scraping are two separate concerns in + # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # the cluster. Otherwise, more config options have to be provided within the + # . + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: node + scrape_interval: 45s + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: + - __meta_kubernetes_node_name + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics + - source_labels: + - __meta_kubernetes_node_name + action: replace + target_label: kubernetes_io_hostname + # Scrape config for Kubelet cAdvisor. + # + # This is required for Kubernetes 1.7.3 and later, where cAdvisor metrics + # (those whose names begin with 'container_') have been removed from the + # Kubelet metrics endpoint. This job scrapes the cAdvisor endpoint to + # retrieve those metrics. + # + # In Kubernetes 1.7.0-1.7.2, these metrics are only exposed on the cAdvisor + # HTTP endpoint; use "replacement: /api/v1/nodes/${1}:4194/proxy/metrics" + # in that case (and ensure cAdvisor's HTTP server hasn't been disabled with + # the --cadvisor-port=0 Kubelet flag). 
+ # + # This job is not necessary and should be removed in Kubernetes 1.6 and + # earlier versions, or it will cause the metrics to be scraped twice. + - job_name: 'kubernetes-cadvisor' + + # Default to scraping over https. If required, just disable this or change to + # `http`. + scheme: https + + # This TLS & bearer token file config is used to connect to the actual scrape + # endpoints for cluster components. This is separate to discovery auth + # configuration because discovery & scraping are two separate concerns in + # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # the cluster. Otherwise, more config options have to be provided within the + # . + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + kubernetes_sd_configs: + - role: node + + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: + - __meta_kubernetes_node_name + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + metric_relabel_configs: + - source_labels: + - __name__ + regex: 'container_network_tcp_usage_total' + action: drop + - source_labels: + - __name__ + regex: 'container_tasks_state' + action: drop + - source_labels: + - __name__ + regex: 'container_network_udp_usage_total' + action: drop + - source_labels: + - __name__ + regex: 'container_memory_failures_total' + action: drop + - source_labels: + - __name__ + regex: 'container_cpu_load_average_10s' + action: drop + - source_labels: + - __name__ + regex: 'container_cpu_system_seconds_total' + action: drop + - source_labels: + - __name__ + regex: 'container_cpu_user_seconds_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_inodes_free' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_inodes_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_io_current' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_io_time_seconds_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_io_time_weighted_seconds_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_read_seconds_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_reads_merged_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_reads_merged_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_reads_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_sector_reads_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_sector_writes_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_write_seconds_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_writes_bytes_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_writes_merged_total' + action: drop + - source_labels: + - __name__ + regex: 'container_fs_writes_total' + action: drop + - source_labels: + - __name__ + regex: 'container_last_seen' + action: drop + - source_labels: + - __name__ + regex: 'container_memory_cache' + action: drop + - source_labels: + - __name__ + regex: 'container_memory_failcnt' + action: drop + - source_labels: + - __name__ + regex: 'container_memory_max_usage_bytes' + action: drop + - source_labels: + - __name__ + 
regex: 'container_memory_rss' + action: drop + - source_labels: + - __name__ + regex: 'container_memory_swap' + action: drop + - source_labels: + - __name__ + regex: 'container_memory_usage_bytes' + action: drop + - source_labels: + - __name__ + regex: 'container_network_receive_errors_total' + action: drop + - source_labels: + - __name__ + regex: 'container_network_receive_packets_dropped_total' + action: drop + - source_labels: + - __name__ + regex: 'container_network_receive_packets_total' + action: drop + - source_labels: + - __name__ + regex: 'container_network_transmit_errors_total' + action: drop + - source_labels: + - __name__ + regex: 'container_network_transmit_packets_dropped_total' + action: drop + - source_labels: + - __name__ + regex: 'container_network_transmit_packets_total' + action: drop + - source_labels: + - __name__ + regex: 'container_spec_cpu_period' + action: drop + - source_labels: + - __name__ + regex: 'container_spec_cpu_shares' + action: drop + - source_labels: + - __name__ + regex: 'container_spec_memory_limit_bytes' + action: drop + - source_labels: + - __name__ + regex: 'container_spec_memory_reservation_limit_bytes' + action: drop + - source_labels: + - __name__ + regex: 'container_spec_memory_swap_limit_bytes' + action: drop + - source_labels: + - __name__ + regex: 'container_start_time_seconds' + action: drop + # Scrape config for API servers. + # + # Kubernetes exposes API servers as endpoints to the default/kubernetes + # service so this uses `endpoints` role and uses relabelling to only keep + # the endpoints associated with the default/kubernetes service using the + # default named port `https`. This works for single API server deployments as + # well as HA API server deployments. + - job_name: 'apiserver' + kubernetes_sd_configs: + - role: endpoints + scrape_interval: 45s + # Default to scraping over https. If required, just disable this or change to + # `http`. + scheme: https + # This TLS & bearer token file config is used to connect to the actual scrape + # endpoints for cluster components. This is separate to discovery auth + # configuration because discovery & scraping are two separate concerns in + # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # the cluster. Otherwise, more config options have to be provided within the + # . + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # If your node certificates are self-signed or use a different CA to the + # master CA, then disable certificate verification below. Note that + # certificate verification is an integral part of a secure infrastructure + # so this should only be disabled in a controlled environment. You can + # disable certificate verification by uncommenting the line below. + # + # insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + # Keep only the default/kubernetes service endpoints for the https port. This + # will add targets for each API server which Kubernetes adds an endpoint to + # the default/kubernetes service. 
+ relabel_configs: + - source_labels: + - __meta_kubernetes_namespace + - __meta_kubernetes_service_name + - __meta_kubernetes_endpoint_port_name + action: keep + regex: default;kubernetes;https + metric_relabel_configs: + - source_labels: + - __name__ + regex: 'apiserver_admission_controller_admission_latencies_seconds_bucket' + action: drop + - source_labels: + - __name__ + regex: 'rest_client_request_latency_seconds_bucket' + action: drop + - source_labels: + - __name__ + regex: 'apiserver_response_sizes_bucket' + action: drop + - source_labels: + - __name__ + regex: 'apiserver_admission_step_admission_latencies_seconds_bucket' + action: drop + - source_labels: + - __name__ + regex: 'apiserver_admission_controller_admission_latencies_seconds_count' + action: drop + - source_labels: + - __name__ + regex: 'apiserver_admission_controller_admission_latencies_seconds_sum' + action: drop + - source_labels: + - __name__ + regex: 'apiserver_request_latencies_summary' + action: drop + # Scrape config for service endpoints. + # + # The relabeling allows the actual service scrape endpoint to be configured + # via the following annotations: + # + # * `prometheus.io/scrape`: Only scrape services that have a value of `true` + # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need + # to set this to `https` & most likely set the `tls_config` of the scrape config. + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. + # * `prometheus.io/port`: If the metrics are exposed on a different port to the + # service then set this appropriately. + - job_name: 'openstack-exporter' + kubernetes_sd_configs: + - role: endpoints + scrape_interval: 60s + relabel_configs: + - source_labels: + - __meta_kubernetes_service_name + action: keep + regex: "openstack-metrics" + - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape + action: keep + regex: true + - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + action: replace + target_label: __scheme__ + regex: (https?) + - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + action: replace + target_label: kubernetes_namespace + - source_labels: + - __meta_kubernetes_service_name + action: replace + target_label: instance + - source_labels: + - __meta_kubernetes_service_name + action: replace + target_label: kubernetes_name + - source_labels: + - __meta_kubernetes_service_name + target_label: job + replacement: ${1} + - job_name: 'kubernetes-service-endpoints' + kubernetes_sd_configs: + - role: endpoints + scrape_interval: 60s + relabel_configs: + - source_labels: + - __meta_kubernetes_service_name + action: drop + regex: '(openstack-metrics|prom-metrics)' + - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape + action: keep + regex: true + - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + action: replace + target_label: __scheme__ + regex: (https?) 
+ - source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + action: replace + target_label: kubernetes_namespace + - source_labels: + - __meta_kubernetes_service_name + action: replace + target_label: kubernetes_name + - source_labels: + - __meta_kubernetes_service_name + target_label: job + replacement: ${1} + # Example scrape config for pods + # + # The relabeling allows the actual pod scrape endpoint to be configured via the + # following annotations: + # + # * `prometheus.io/scrape`: Only scrape pods that have a value of `true` + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. + # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the + # pod's declared ports (default is a port-free target if none are declared). + - job_name: 'kubernetes-pods' + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: kubernetes_pod_name + - job_name: calico-etcd + kubernetes_sd_configs: + - role: service + scrape_interval: 20s + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: keep + source_labels: + - __meta_kubernetes_service_name + regex: "calico-etcd" + - action: keep + source_labels: + - __meta_kubernetes_namespace + regex: kube-system + target_label: namespace + - source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - source_labels: + - __meta_kubernetes_service_name + target_label: service + - source_labels: + - __meta_kubernetes_service_name + target_label: job + replacement: ${1} + - source_labels: + - __meta_kubernetes_service_label + target_label: job + regex: calico-etcd + replacement: ${1} + - target_label: endpoint + replacement: "calico-etcd" + alerting: + alertmanagers: + - kubernetes_sd_configs: + - role: pod + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + relabel_configs: + - source_labels: [__meta_kubernetes_pod_label_application] + regex: alertmanager + action: keep + - source_labels: [__meta_kubernetes_pod_container_port_name] + regex: alerts-api + action: keep + - source_labels: [__meta_kubernetes_pod_container_port_name] + regex: peer-mesh + action: drop + rules: + alertmanager: + groups: + - name: alertmanager.rules + rules: + - alert: AlertmanagerConfigInconsistent + expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) GROUP_LEFT() 
label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1 + for: 5m + labels: + severity: critical + annotations: + description: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync. + summary: Alertmanager configurations are inconsistent + - alert: AlertmanagerDownOrMissing + expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1 + for: 5m + labels: + severity: warning + annotations: + description: An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery. + summary: Alertmanager down or not discovered + - alert: FailedReload + expr: alertmanager_config_last_reload_successful == 0 + for: 10m + labels: + severity: warning + annotations: + description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}. + summary: Alertmanager configuration reload has failed + etcd3: + groups: + - name: etcd3.rules + rules: + - alert: etcd_InsufficientMembers + expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) + for: 3m + labels: + severity: critical + annotations: + description: If one more etcd member goes down the cluster will be unavailable + summary: etcd cluster insufficient members + - alert: etcd_NoLeader + expr: etcd_server_has_leader{job="etcd"} == 0 + for: 1m + labels: + severity: critical + annotations: + description: etcd member {{ $labels.instance }} has no leader + summary: etcd member has no leader + - alert: etcd_HighNumberOfLeaderChanges + expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour + summary: a high number of leader changes within the etcd cluster are happening + - alert: etcd_HighNumberOfFailedGRPCRequests + expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}' + summary: a high number of gRPC requests are failing + - alert: etcd_HighNumberOfFailedGRPCRequests + expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05 + for: 5m + labels: + severity: critical + annotations: + description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}' + summary: a high number of gRPC requests are failing + - alert: etcd_GRPCRequestsSlow + expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15 + for: 10m + labels: + severity: critical + annotations: + description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow + summary: slow gRPC requests + - alert: etcd_HighNumberOfFailedHTTPRequests + expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}' + summary: a high 
number of HTTP requests are failing + - alert: etcd_HighNumberOfFailedHTTPRequests + expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.05 + for: 5m + labels: + severity: critical + annotations: + description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}' + summary: a high number of HTTP requests are failing + - alert: etcd_HTTPRequestsSlow + expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15 + for: 10m + labels: + severity: warning + annotations: + description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow + summary: slow HTTP requests + - alert: etcd_EtcdMemberCommunicationSlow + expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow + summary: etcd member communication is slow + - alert: etcd_HighNumberOfFailedProposals + expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour + summary: a high number of proposals within the etcd cluster are failing + - alert: etcd_HighFsyncDurations + expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} fync durations are high + summary: high fsync durations + - alert: etcd_HighCommitDurations + expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} commit durations are high + summary: high commit durations + kube_apiserver: + groups: + - name: kube-apiserver.rules + rules: + - alert: K8SApiserverDown + expr: absent(up{job="apiserver"} == 1) + for: 5m + labels: + severity: critical + annotations: + description: Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery. + summary: API server unreachable + - alert: K8SApiServerLatency + expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) WITHOUT (instance, resource)) / 1e+06 > 1 + for: 10m + labels: + severity: warning + annotations: + description: 99th percentile Latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s. + summary: Kubernetes apiserver latency is high + kube_controller_manager: + groups: + - name: kube-controller-manager.rules + rules: + - alert: K8SControllerManagerDown + expr: absent(up{job="kube-controller-manager-discovery"} == 1) + for: 5m + labels: + severity: critical + annotations: + description: There is no running K8S controller manager. Deployments and replication controllers are not making progress. 
+ runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager + summary: Controller manager is down + kubelet: + groups: + - name: kubelet.rules + rules: + - alert: K8SNodeNotReady + expr: kube_node_status_ready{condition="true"} == 0 + for: 1h + labels: + severity: warning + annotations: + description: The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour + summary: Node status is NotReady + - alert: K8SManyNodesNotReady + expr: count(kube_node_status_ready{condition="true"} == 0) > 1 and (count(kube_node_status_ready{condition="true"} == 0) / count(kube_node_status_ready{condition="true"})) > 0.2 + for: 1m + labels: + severity: critical + annotations: + description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady state).' + summary: Many Kubernetes nodes are Not Ready + - alert: K8SKubeletDown + expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03 + for: 1h + labels: + severity: warning + annotations: + description: Prometheus failed to scrape {{ $value }}% of kubelets. + summary: Many Kubelets cannot be scraped + - alert: K8SKubeletDown + expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1 + for: 1h + labels: + severity: critical + annotations: + description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery. + summary: Many Kubelets cannot be scraped + - alert: K8SKubeletTooManyPods + expr: kubelet_running_pod_count > 100 + labels: + severity: warning + annotations: + description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110 + summary: Kubelet is close to pod limit + kubernetes: + groups: + - name: kubernetes.rules + rules: + - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes + expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:spec_cpu_shares + expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:cpu_usage:rate + expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:memory_usage:bytes + expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:memory_working_set:bytes + expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:memory_rss:bytes + expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:memory_cache:bytes + expr: 
sum(label_replace(container_memory_cache{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:disk_usage:bytes + expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate + expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name, scope, type) + - record: cluster_namespace_controller_pod_container:memory_oom:rate + expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name, scope, type) + - record: cluster:memory_allocation:percent + expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster) + - record: cluster:memory_used:percent + expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster) + - record: cluster:cpu_allocation:percent + expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"} * ON(cluster, instance) machine_cpu_cores) BY (cluster) + - record: cluster:node_cpu_use:percent + expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores) BY (cluster) + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.5, 
sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - alert: kube_statefulset_replicas_unavailable + expr: kube_statefulset_status_replicas < kube_statefulset_replicas + for: 5m + labels: + severity: page + annotations: + description: 'statefulset {{$labels.statefulset}} has {{$value}} replicas, which is less than desired' + summary: '{{$labels.statefulset}}: has inssuficient replicas.' + - alert: kube_daemonsets_misscheduled + expr: kube_daemonset_status_number_misscheduled > 0 + for: 10m + labels: + severity: warning + annotations: + description: 'Daemonset {{$labels.daemonset}} is running where it is not supposed to run' + summary: 'Daemonsets not scheduled correctly' + - alert: kube_daemonsets_not_scheduled + expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }} of Daemonset {{$labels.daemonset}} scheduled which is less than desired number' + summary: 'Less than desired number of daemonsets scheduled' + - alert: kube_deployment_replicas_unavailable + expr: kube_deployment_status_replicas_unavailable > 0 + for: 10m + labels: + severity: page + annotations: + description: 'deployment {{$labels.deployment}} has {{$value}} replicas unavailable' + summary: '{{$labels.deployment}}: has inssuficient replicas.' + - alert: kube_rollingupdate_deployment_replica_less_than_spec_max_unavailable + expr: kube_deployment_status_replicas_available - kube_deployment_spec_strategy_rollingupdate_max_unavailable < 0 + for: 10m + labels: + severity: page + annotations: + description: 'deployment {{$labels.deployment}} has {{$value}} replicas available which is less than specified as max unavailable during a rolling update' + summary: '{{$labels.deployment}}: has inssuficient replicas during a rolling update.' 
+ - alert: kube_job_status_failed + expr: kube_job_status_failed > 0 + for: 10m + labels: + severity: page + annotations: + description: 'Job {{$labels.exported_job}} is in failed status' + summary: '{{$labels.exported_job}} has failed status' + - alert: kube_pod_status_pending + expr: kube_pod_status_phase{phase="Pending"} == 1 + for: 10m + labels: + severity: page + annotations: + description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has been in pending status for more than 10 minutes' + summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in pending status' + - alert: kube_pod_error_image_pull + expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1 + for: 10m + labels: + severity: page + annotations: + description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes' + summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status' + - alert: kube_pod_status_error_image_pull + expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1 + for: 10m + labels: + severity: page + annotations: + description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has an Image pull error for more than 10 minutes' + summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status' + - alert: kube_replicaset_missing_replicas + expr: kube_replicaset_spec_replicas - kube_replicaset_status_ready_replicas > 0 + for: 10m + labels: + severity: page + annotations: + description: 'Replicaset {{$labels.replicaset}} is missing desired number of replicas for more than 10 minutes' + summary: 'Replicaset {{$labels.replicaset}} is missing replicas' + - alert: kube_pod_container_terminated + expr: kube_pod_container_status_terminated_reason{reason=~"OOMKilled|Error|ContainerCannotRun"} > 0 + for: 10m + labels: + severity: page + annotations: + description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has a container terminated for more than 10 minutes' + summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in error status' + - alert: volume_claim_capacity_high_utilization + expr: (1 - kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes) > 0.80 + for: 5m + labels: + severity: page + annotations: + description: 'volume claim {{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity' + summary: '{{$labels.persistentvolumeclaim}} usage has exceeded 80% of total capacity.' + basic_linux: + groups: + - name: basic_linux.rules + rules: + - alert: node_filesystem_full_80percent + expr: sort(node_filesystem_free{device!="ramfs"} < node_filesystem_size{device!="ramfs"} + * 0.2) / 1024 ^ 3 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} + has less than 20% space left on its filesystem.' + summary: '{{$labels.alias}}: Filesystem is running out of space soon.' + - alert: node_filesystem_full_in_4h + expr: predict_linear(node_filesystem_free{device!="ramfs"}[1h], 4 * 3600) <= 0 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} + is running out of space in approx. 4 hours' + summary: '{{$labels.alias}}: Filesystem is running out of space in 4 hours.' 
+ - alert: node_filedescriptors_full_in_3h + expr: predict_linear(node_filefd_allocated[1h], 3 * 3600) >= node_filefd_maximum + for: 20m + labels: + severity: page + annotations: + description: '{{$labels.alias}} is running out of available file descriptors + in approx. 3 hours' + summary: '{{$labels.alias}} is running out of available file descriptors in + 3 hours.' + - alert: node_load1_90percent + expr: node_load1 / ON(alias) count(node_cpu{mode="system"}) BY (alias) >= 0.9 + for: 1h + labels: + severity: page + annotations: + description: '{{$labels.alias}} is running with > 90% total load for at least + 1h.' + summary: '{{$labels.alias}}: Running on high load.' + - alert: node_cpu_util_90percent + expr: 100 - (avg(irate(node_cpu{mode="idle"}[5m])) BY (alias) * 100) >= 90 + for: 1h + labels: + severity: page + annotations: + description: '{{$labels.alias}} has total CPU utilization over 90% for at least + 1h.' + summary: '{{$labels.alias}}: High CPU utilization.' + - alert: node_ram_using_90percent + expr: node_memory_MemFree + node_memory_Buffers + node_memory_Cached < node_memory_MemTotal + * 0.1 + for: 30m + labels: + severity: page + annotations: + description: '{{$labels.alias}} is using at least 90% of its RAM for at least + 30 minutes now.' + summary: '{{$labels.alias}}: Using lots of RAM.' + - alert: node_swap_using_80percent + expr: node_memory_SwapTotal - (node_memory_SwapFree + node_memory_SwapCached) + > node_memory_SwapTotal * 0.8 + for: 10m + labels: + severity: page + annotations: + description: '{{$labels.alias}} is using 80% of its swap space for at least + 10 minutes now.' + summary: '{{$labels.alias}}: Running out of swap soon.' + - alert: node_high_cpu_load + expr: node_load15 / on(alias) count(node_cpu{mode="system"}) by (alias) >= 1 + for: 1m + labels: + severity: warning + annotations: + description: '{{$labels.alias}} is running with a load15 per core >= 1 for at least 1 minute: {{$value}}' + summary: '{{$labels.alias}}: Running on high load: {{$value}}' + - alert: node_high_memory_load + expr: (sum(node_memory_MemTotal) - sum(node_memory_MemFree + node_memory_Buffers + + node_memory_Cached)) / sum(node_memory_MemTotal) * 100 > 85 + for: 1m + labels: + severity: warning + annotations: + description: Host memory usage is {{ humanize $value }}%. Reported by + instance {{ $labels.instance }} of job {{ $labels.job }}. + summary: Server memory is almost full + - alert: node_high_storage_load + expr: (node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}) + / node_filesystem_size{mountpoint="/"} * 100 > 85 + for: 30s + labels: + severity: warning + annotations: + description: Host storage usage is {{ humanize $value }}%. Reported by + instance {{ $labels.instance }} of job {{ $labels.job }}. + summary: Server storage is almost full + - alert: node_high_swap + expr: (node_memory_SwapTotal - node_memory_SwapFree) < (node_memory_SwapTotal + * 0.4) + for: 1m + labels: + severity: warning + annotations: + description: Host system has a high swap usage of {{ humanize $value }}. Reported + by instance {{ $labels.instance }} of job {{ $labels.job }}. + summary: Server has a high swap usage + - alert: node_high_network_drop_rcv + expr: node_network_receive_drop{device!="lo"} > 3000 + for: 30s + labels: + severity: warning + annotations: + description: Host system has an unusually high drop in network reception ({{ + humanize $value }}). 
Reported by instance {{ $labels.instance }} of job {{ + $labels.job }} + summary: Server has a high receive drop + - alert: node_high_network_drop_send + expr: node_network_transmit_drop{device!="lo"} > 3000 + for: 30s + labels: + severity: warning + annotations: + description: Host system has an unusually high drop in network transmission ({{ + humanize $value }}). Reported by instance {{ $labels.instance }} of job {{ + $labels.job }} + summary: Server has a high transmit drop + - alert: node_high_network_errs_rcv + expr: node_network_receive_errs{device!="lo"} > 3000 + for: 30s + labels: + severity: warning + annotations: + description: Host system has an unusually high error rate in network reception + ({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job + {{ $labels.job }} + summary: Server has unusually high reception errors + - alert: node_high_network_errs_send + expr: node_network_transmit_errs{device!="lo"} > 3000 + for: 30s + labels: + severity: warning + annotations: + description: Host system has an unusually high error rate in network transmission + ({{ humanize $value }}). Reported by instance {{ $labels.instance }} of job + {{ $labels.job }} + summary: Server has unusually high transmission errors + - alert: node_network_conntrack_usage_80percent + expr: sort(node_nf_conntrack_entries{job="node-exporter"} > node_nf_conntrack_entries_limit{job="node-exporter"} * 0.8) + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.instance}} has network conntrack entries of {{ $value }} which is more than 80% of the maximum limit' + summary: '{{$labels.instance}}: available network conntrack entries are low.' + - alert: node_entropy_available_low + expr: node_entropy_available_bits < 300 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.instance}} has available entropy bits of {{ $value }} which is less than the required 300' + summary: '{{$labels.instance}}: is low on entropy bits.' 
+ - alert: node_hwmon_high_cpu_temp + expr: node_hwmon_temp_crit_celsius*0.9 - node_hwmon_temp_celsius < 0 OR node_hwmon_temp_max_celsius*0.95 - node_hwmon_temp_celsius < 0 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.alias}} reports hwmon sensor {{$labels.sensor}}/{{$labels.chip}} temperature value is nearly critical: {{$value}}' + summary: '{{$labels.alias}}: Sensor {{$labels.sensor}}/{{$labels.chip}} temp is high: {{$value}}' + - alert: node_vmstat_paging_rate_high + expr: irate(node_vmstat_pgpgin[5m]) > 80 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.alias}} has a memory paging rate of change higher than 80%: {{$value}}' + summary: '{{$labels.alias}}: memory paging rate is high: {{$value}}' + - alert: node_xfs_block_allocation_high + expr: 100*(node_xfs_extent_allocation_blocks_allocated_total{job="node-exporter", instance=~"172.17.0.1.*"} / (node_xfs_extent_allocation_blocks_freed_total{job="node-exporter", instance=~"172.17.0.1.*"} + node_xfs_extent_allocation_blocks_allocated_total{job="node-exporter", instance=~"172.17.0.1.*"})) > 80 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.alias}} has xfs allocation blocks higher than 80%: {{$value}}' + summary: '{{$labels.alias}}: xfs block allocation high: {{$value}}' + - alert: node_network_bond_slaves_down + expr: node_net_bonding_slaves - node_net_bonding_slaves_active > 0 + for: 5m + labels: + severity: page + annotations: + description: '{{ $labels.master }} is missing {{ $value }} slave interface(s).' + summary: 'Instance {{ $labels.instance }}: {{ $labels.master }} missing {{ $value }} slave interface(s)' + - alert: node_numa_memory_used + expr: 100*node_memory_numa_MemUsed / node_memory_numa_MemTotal > 80 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.alias}} has more than 80% NUMA memory usage: {{ $value }}' + summary: '{{$labels.alias}}: has high NUMA memory usage: {{$value}}' + - alert: node_ntp_clock_skew_high + expr: abs(node_ntp_drift_seconds) > 2 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.alias}} has time difference of more than 2 seconds compared to NTP server: {{ $value }}' + summary: '{{$labels.alias}}: time is skewed by : {{$value}} seconds' + - alert: node_disk_read_latency + expr: (rate(node_disk_read_time_ms[5m]) / rate(node_disk_reads_completed[5m])) > 10 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.device}} has a high read latency of {{ $value }}' + summary: 'High read latency observed for device {{ $labels.device }}' + - alert: node_disk_write_latency + expr: (rate(node_disk_write_time_ms[5m]) / rate(node_disk_writes_completed[5m])) > 10 + for: 5m + labels: + severity: page + annotations: + description: '{{$labels.device}} has a high write latency of {{ $value }}' + summary: 'High write latency observed for device {{ $labels.device }}' + openstack: + groups: + - name: openstack.rules + rules: + - alert: os_glance_api_availability + expr: check_glance_api != 1 + for: 5m + labels: + severity: page + annotations: + description: 'Glance API is not available at {{$labels.url}} for more than 5 minutes' + summary: 'Glance API is not available at {{$labels.url}}' + - alert: os_nova_api_availability + expr: check_nova_api != 1 + for: 5m + labels: + severity: page + annotations: + description: 'Nova API is not available at {{$labels.url}} for more than 5 minutes' + summary: 'Nova API is not available at {{$labels.url}}' + - alert: 
os_keystone_api_availability + expr: check_keystone_api != 1 + for: 5m + labels: + severity: page + annotations: + description: 'Keystone API is not available at {{$labels.url}} for more than 5 minutes' + summary: 'Keystone API is not available at {{$labels.url}}' + - alert: os_neutron_api_availability + expr: check_neutron_api != 1 + for: 5m + labels: + severity: page + annotations: + description: 'Neutron API is not available at {{$labels.url}} for more than 5 minutes' + summary: 'Neutron API is not available at {{$labels.url}}' + - alert: os_swift_api_availability + expr: check_swift_api != 1 + for: 5m + labels: + severity: page + annotations: + description: 'Swift API is not available at {{$labels.url}} for more than 5 minutes' + summary: 'Swift API is not available at {{$labels.url}}' + - alert: os_nova_compute_disabled + expr: services_nova_compute_disabled_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'nova-compute is disabled on certain hosts for more than 5 minutes' + summary: 'Openstack compute service nova-compute is disabled on some hosts' + - alert: os_nova_conductor_disabled + expr: services_nova_conductor_disabled_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'nova-conductor is disabled on certain hosts for more than 5 minutes' + summary: 'Openstack compute service nova-conductor is disabled on some hosts' + - alert: os_nova_consoleauth_disabled + expr: services_nova_consoleauth_disabled_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'nova-consoleauth is disabled on certain hosts for more than 5 minutes' + summary: 'Openstack compute service nova-consoleauth is disabled on some hosts' + - alert: os_nova_scheduler_disabled + expr: services_nova_scheduler_disabled_total > 0 + for: 5m + labels: + severity: page + annotations: + description: 'nova-scheduler is disabled on certain hosts for more than 5 minutes' + summary: 'Openstack compute service nova-scheduler is disabled on some hosts' + ceph: + groups: + - name: ceph.rules + rules: + - alert: ceph_monitor_quorum_low + expr: ceph_monitor_quorum_count < 3 + for: 5m + labels: + severity: page + annotations: + description: 'ceph monitor quorum has been less than 3 for more than 5 minutes' + summary: 'ceph high availability is at risk' + - alert: ceph_cluster_usage_high + expr: 100* ceph_cluster_used_bytes/ceph_cluster_capacity_bytes > 80 + for: 5m + labels: + severity: page + annotations: + description: 'ceph cluster capacity usage more than 80 percent' + summary: 'ceph cluster usage is more than 80 percent' + - alert: ceph_placement_group_degrade_pct_high + expr: 100*ceph_degraded_pgs/ceph_total_pgs > 80 + for: 5m + labels: + severity: page + annotations: + description: 'ceph placement group degradation is more than 80 percent' + summary: 'ceph placement groups degraded' + - alert: ceph_osd_down_pct_high + expr: 100* ceph_osds_down/(ceph_osds_down+ceph_osds_up) > 80 + for: 5m + labels: + severity: page + annotations: + description: 'ceph OSDs down percent is more than 80 percent' + summary: 'ceph OSDs down percent is high' + - alert: ceph_monitor_clock_skew_high + expr: ceph_monitor_clock_skew_seconds > 2 + for: 5m + labels: + severity: page + annotations: + description: 'ceph monitors clock skew on {{$labels.instance}} is more than 2 seconds' + summary: 'ceph monitor clock skew high' + fluentd: + groups: + - name: fluentd.rules + rules: + - alert: fluentd_not_running + expr: fluentd_up == 0 + for: 5m + labels: + severity: page + 
annotations: + description: 'fluentd is down on {{$labels.instance}} for more than 5 minutes' + summary: 'Fluentd is down' + calico: + groups: + - name: calico.rules + rules: + - alert: calico_datapane_failures_high_1h + expr: absent(felix_int_dataplane_failures) OR increase(felix_int_dataplane_failures[1h]) > 5 + labels: + severity: page + annotations: + description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} dataplane failures within the last hour' + summary: 'A high number of dataplane failures within Felix are happening' + - alert: calico_datapane_address_msg_batch_size_high_5m + expr: absent(felix_int_dataplane_addr_msg_batch_size_sum) OR absent(felix_int_dataplane_addr_msg_batch_size_count) OR (felix_int_dataplane_addr_msg_batch_size_sum/felix_int_dataplane_addr_msg_batch_size_count) > 5 + for: 5m + labels: + severity: page + annotations: + description: 'Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane address message batch size' + summary: 'Felix address message batch size is high' + - alert: calico_datapane_iface_msg_batch_size_high_5m + expr: absent(felix_int_dataplane_iface_msg_batch_size_sum) OR absent(felix_int_dataplane_iface_msg_batch_size_count) OR (felix_int_dataplane_iface_msg_batch_size_sum/felix_int_dataplane_iface_msg_batch_size_count) > 5 + for: 5m + labels: + severity: page + annotations: + description: 'Felix instance {{ $labels.instance }} has seen a high value of {{ $value }} dataplane interface message batch size' + summary: 'Felix interface message batch size is high' + - alert: calico_ipset_errors_high_1h + expr: absent(felix_ipset_errors) OR increase(felix_ipset_errors[1h]) > 5 + labels: + severity: page + annotations: + description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} ipset errors within the last hour' + summary: 'A high number of ipset errors within Felix are happening' + - alert: calico_iptable_save_errors_high_1h + expr: absent(felix_iptables_save_errors) OR increase(felix_iptables_save_errors[1h]) > 5 + labels: + severity: page + annotations: + description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable save errors within the last hour' + summary: 'A high number of iptable save errors within Felix are happening' + - alert: calico_iptable_restore_errors_high_1h + expr: absent(felix_iptables_restore_errors) OR increase(felix_iptables_restore_errors[1h]) > 5 + labels: + severity: page + annotations: + description: 'Felix instance {{ $labels.instance }} has seen {{ $value }} iptable restore errors within the last hour' + summary: 'A high number of iptable restore errors within Felix are happening' + rabbitmq: + groups: + - name: rabbitmq.rules + rules: + - alert: rabbitmq_network_partitions_detected + expr: min(partitions) by(instance) > 0 + for: 10m + labels: + severity: warning + annotations: + description: 'RabbitMQ at {{ $labels.instance }} has {{ $value }} partitions' + summary: 'RabbitMQ Network partitions detected' + - alert: rabbitmq_down + expr: min(rabbitmq_up) by(instance) != 1 + for: 10m + labels: + severity: page + annotations: + description: 'RabbitMQ Server instance {{ $labels.instance }} is down' + summary: 'The RabbitMQ Server instance at {{ $labels.instance }} has been down for the last 10 mins' + - alert: rabbitmq_file_descriptor_usage_high + expr: fd_used * 100 /fd_total > 80 + for: 10m + labels: + severity: warning + annotations: + description: 'RabbitMQ Server instance {{ $labels.instance }} has high file descriptor usage of {{ $value }} 
percent.' + summary: 'RabbitMQ file descriptors usage is high for the last 10 mins' + - alert: rabbitmq_node_disk_free_alarm + expr: node_disk_free_alarm > 0 + for: 10m + labels: + severity: warning + annotations: + description: 'RabbitMQ Server instance {{ $labels.instance }} has low disk free space available.' + summary: 'RabbitMQ disk space usage is high' + - alert: rabbitmq_node_memory_alarm + expr: node_mem_alarm > 0 + for: 10m + labels: + severity: warning + annotations: + description: 'RabbitMQ Server instance {{ $labels.instance }} has low free memory.' + summary: 'RabbitMQ memory usage is high' + - alert: rabbitmq_less_than_3_nodes + expr: running < 3 + for: 10m + labels: + severity: warning + annotations: + description: 'RabbitMQ Server has less than 3 nodes running.' + summary: 'RabbitMQ server is at risk of losing data' + - alert: rabbitmq_queue_messages_returned_high + expr: queue_messages_returned_total/queue_messages_published_total * 100 > 50 + for: 5m + labels: + severity: warning + annotations: + description: 'RabbitMQ Server is returning more than 50 percent of messages received.' + summary: 'RabbitMQ server is returning more than 50 percent of messages received.' + - alert: rabbitmq_consumers_low_utilization + expr: queue_consumer_utilisation < .4 + for: 5m + labels: + severity: warning + annotations: + description: 'RabbitMQ consumers message consumption speed is low' + summary: 'RabbitMQ consumers message consumption speed is low' + - alert: rabbitmq_high_message_load + expr: queue_messages_total > 17000 or increase(queue_messages_total[5m]) > 4000 + for: 5m + labels: + severity: warning + annotations: + description: 'RabbitMQ has high message load. Total Queue depth > 17000 or growth more than 4000 messages.' + summary: 'RabbitMQ has high message load' + elasticsearch: + groups: + - name: elasticsearch.rules + rules: + - alert: es_high_process_open_files_count + expr: sum(elasticsearch_process_open_files_count) by (host) > 64000 + for: 10m + labels: + severity: warning + annotations: + description: 'Elasticsearch at {{ $labels.host }} has more than 64000 open files.' + summary: 'Elasticsearch has a very high process open file count.' + - alert: es_high_process_cpu_percent + expr: elasticsearch_process_cpu_percent > 95 + for: 10m + labels: + severity: warning + annotations: + description: 'Elasticsearch at {{ $labels.instance }} has high process cpu percent of {{ $value }}.' + summary: 'Elasticsearch process cpu usage is more than 95 percent.' + - alert: es_fs_usage_high + expr: (100 * (elasticsearch_filesystem_data_size_bytes - elasticsearch_filesystem_data_free_bytes) / elasticsearch_filesystem_data_size_bytes) > 80 + for: 10m + labels: + severity: warning + annotations: + description: 'Elasticsearch at {{ $labels.instance }} has filesystem usage of {{ $value }}.' + summary: 'Elasticsearch filesystem usage is high.' + - alert: es_unassigned_shards + expr: elasticsearch_cluster_health_unassigned_shards > 0 + for: 10m + labels: + severity: warning + annotations: + description: 'Elasticsearch has {{ $value }} unassigned shards.' + summary: 'Elasticsearch has unassigned shards and hence an unhealthy cluster state.' + - alert: es_cluster_health_timed_out + expr: elasticsearch_cluster_health_timed_out > 0 + for: 10m + labels: + severity: warning + annotations: + description: 'Elasticsearch cluster health status call timed out {{ $value }} times.' + summary: 'Elasticsearch cluster health status calls are timing out.' 
+ - alert: es_cluster_health_status_alert + expr: elasticsearch_cluster_health_status > 0 + for: 10m + labels: + severity: warning + annotations: + description: 'Elasticsearch cluster health status is not green. One or more shards or replicas are unallocated.' + summary: 'Elasticsearch cluster health status is not green.' + - alert: es_cluster_health_too_few_nodes_running + expr: elasticsearch_cluster_health_number_of_nodes < 3 + for: 10m + labels: + severity: warning + annotations: + description: 'There are only {{$value}} < 3 ElasticSearch nodes running' + summary: 'ElasticSearch running on less than 3 nodes' + - alert: es_cluster_health_too_few_data_nodes_running + expr: elasticsearch_cluster_health_number_of_data_nodes < 3 + for: 10m + labels: + severity: warning + annotations: + description: 'There are only {{$value}} < 3 ElasticSearch data nodes running' + summary: 'ElasticSearch running on less than 3 data nodes' + mariadb: + groups: + - name: mariadb.rules + rules: + - alert: mariadb_table_lock_wait_high + expr: 100 * mysql_global_status_table_locks_waited/(mysql_global_status_table_locks_waited + mysql_global_status_table_locks_immediate) > 30 + for: 10m + labels: + severity: warning + annotations: + description: 'Mariadb has high table lock waits of {{ $value }} percent' + summary: 'Mariadb table lock waits are high' + - alert: mariadb_node_not_ready + expr: mysql_global_status_wsrep_ready != 1 + for: 10m + labels: + severity: warning + annotations: + description: '{{$labels.job}} on {{$labels.instance}} is not ready.' + summary: 'Galera cluster node not ready' + - alert: mariadb_galera_node_out_of_sync + expr: mysql_global_status_wsrep_local_state != 4 AND mysql_global_variables_wsrep_desync == 0 + for: 10m + labels: + severity: warning + annotations: + description: '{{$labels.job}} on {{$labels.instance}} is not in sync ({{$value}} != 4)' + summary: 'Galera cluster node out of sync' + - alert: mariadb_innodb_replication_fallen_behind + expr: (mysql_global_variables_innodb_replication_delay > 30) AND on (instance) (predict_linear(mysql_global_variables_innodb_replication_delay[5m], 60*2) > 0) + for: 10m + labels: + severity: warning + annotations: + description: 'The mysql innodb replication has fallen behind and is not recovering' + summary: 'MySQL innodb replication is lagging' dependencies: - osh-infra-helm-toolkit ... 
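The alert groups above are plain Prometheus rule definitions carried as chart value overrides, so the expressions can be sanity-checked locally before the chart is rendered. A minimal sketch, assuming promtool from a Prometheus 2.x release is available and using a hypothetical scratch file name; the alert shown is copied verbatim from the basic_linux group above.

# rules-check.yaml -- hypothetical scratch file for local validation only; the
# deployed rule files are rendered by the prometheus chart from the value
# overrides above.  Validate with:  promtool check rules rules-check.yaml
groups:
  - name: basic_linux.rules
    rules:
      - alert: node_ram_using_90percent
        expr: node_memory_MemFree + node_memory_Buffers + node_memory_Cached < node_memory_MemTotal * 0.1
        for: 30m
        labels:
          severity: page
        annotations:
          description: '{{$labels.alias}} is using at least 90% of its RAM for at least 30 minutes now.'
          summary: '{{$labels.alias}}: Using lots of RAM.'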
diff --git a/global/software/charts/osh-infra/osh-infra-radosgw/chart-group.yaml b/global/software/charts/osh-infra/osh-infra-radosgw/chart-group.yaml new file mode 100644 index 000000000..07d160819 --- /dev/null +++ b/global/software/charts/osh-infra/osh-infra-radosgw/chart-group.yaml @@ -0,0 +1,13 @@ +--- +schema: armada/ChartGroup/v1 +metadata: + schema: metadata/Document/v1 + name: osh-infra-radosgw + layeringDefinition: + abstract: false + layer: global + storagePolicy: cleartext +data: + description: Deploy Radosgw for OSH-Infra + chart_group: + - osh-infra-radosgw diff --git a/global/software/charts/osh-infra/osh-infra-radosgw/radosgw.yaml b/global/software/charts/osh-infra/osh-infra-radosgw/radosgw.yaml new file mode 100644 index 000000000..b39c703fb --- /dev/null +++ b/global/software/charts/osh-infra/osh-infra-radosgw/radosgw.yaml @@ -0,0 +1,118 @@ +--- +schema: armada/Chart/v1 +metadata: + schema: metadata/Document/v1 + name: osh-infra-radosgw + layeringDefinition: + abstract: false + layer: global + storagePolicy: cleartext + substitutions: + # Chart source + - src: + schema: pegleg/SoftwareVersions/v1 + name: software-versions + path: .charts.ucp.ceph-rgw + dest: + path: .source + + # Images + - src: + schema: pegleg/SoftwareVersions/v1 + name: software-versions + path: .images.ceph.ceph-rgw + dest: + path: .values.images.tags + + # IP addresses + - src: + schema: pegleg/CommonAddresses/v1 + name: common-addresses + path: .storage.ceph.public_cidr + dest: + path: .values.network.public + - src: + schema: pegleg/CommonAddresses/v1 + name: common-addresses + path: .storage.ceph.cluster_cidr + dest: + path: .values.network.cluster + + # Endpoints + - src: + schema: pegleg/EndpointCatalogue/v1 + name: osh_infra_endpoints + path: .osh_infra.ceph_object_store + dest: + path: .values.endpoints.ceph_object_store + - src: + schema: pegleg/EndpointCatalogue/v1 + name: ucp_endpoints + path: .ceph.ceph_mon + dest: + path: .values.endpoints.ceph_mon + + # Credentials + - src: + schema: pegleg/AccountCatalogue/v1 + name: osh_infra_service_accounts + path: .osh_infra.ceph_object_store.admin + dest: + path: .values.endpoints.ceph_object_store.auth.admin + + # Secrets + - dest: + path: .values.endpoints.ceph_object_store.auth.admin.access_key + src: + schema: deckhand/Passphrase/v1 + name: osh_infra_rgw_s3_admin_access_key + path: . + - dest: + path: .values.endpoints.ceph_object_store.auth.admin.secret_key + src: + schema: deckhand/Passphrase/v1 + name: osh_infra_rgw_s3_admin_secret_key + path: . + +data: + chart_name: osh-infra-radosgw + release: osh-infra-radosgw + namespace: osh-infra + wait: + timeout: 900 + labels: + release_group: clcp-osh-infra-radosgw + install: + no_hooks: false + upgrade: + no_hooks: false + pre: + delete: + - type: job + labels: + release_group: clcp-osh-infra-radosgw + values: + labels: + job: + node_selector_key: openstack-control-plane + node_selector_value: enabled + rgw: + node_selector_key: ceph-rgw + node_selector_value: enabled + deployment: + storage_secrets: false + ceph: true + rbd_provisioner: false + cephfs_provisioner: false + client_secrets: false + rgw_keystone_user_and_endpoints: false + bootstrap: + enabled: false + conf: + rgw_s3: + enabled: true + ceph_client: + configmap: ceph-etc + dependencies: + - osh-infra-helm-toolkit +... 
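For context on the radosgw chart's credential wiring: the sketch below is illustrative only and shows the approximate shape of .values.endpoints.ceph_object_store.auth once Deckhand applies the substitutions above. The username comes from the osh_infra_service_accounts change later in this patch; the key values are placeholders standing in for the osh_infra_rgw_s3_admin_* passphrase documents.

# Illustrative rendered values, not a document in this patch; real key
# material is substituted from the osh_infra_rgw_s3_admin_* passphrases.
endpoints:
  ceph_object_store:
    auth:
      admin:
        username: s3_admin
        access_key: <osh_infra_rgw_s3_admin_access_key>
        secret_key: <osh_infra_rgw_s3_admin_secret_key>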
diff --git a/global/software/charts/osh/openstack-mariadb/mariadb.yaml b/global/software/charts/osh/openstack-mariadb/mariadb.yaml index cae3239a4..d93ef70a0 100644 --- a/global/software/charts/osh/openstack-mariadb/mariadb.yaml +++ b/global/software/charts/osh/openstack-mariadb/mariadb.yaml @@ -31,6 +31,13 @@ metadata: path: .osh.oslo_db dest: path: .values.endpoints.olso_db + - src: + schema: pegleg/EndpointCatalogue/v1 + name: osh_endpoints + path: .osh.prometheus_mysql_exporter + dest: + path: .values.endpoints.prometheus_mysql_exporter + # Accounts - src: schema: pegleg/AccountCatalogue/v1 @@ -38,6 +45,12 @@ metadata: path: .osh.oslo_db.admin dest: path: .values.endpoints.oslo_db.auth.admin + - src: + schema: pegleg/AccountCatalogue/v1 + name: osh_service_accounts + path: .osh.prometheus_mysql_exporter.user + dest: + path: .values.endpoints.prometheus_mysql_exporter.auth.user # Secrets - dest: @@ -46,6 +59,12 @@ metadata: schema: deckhand/Passphrase/v1 name: osh_oslo_db_admin_password path: . + - dest: + path: .values.endpoints.oslo_db.auth.exporter.password + src: + schema: deckhand/Passphrase/v1 + name: osh_oslo_db_exporter_password + path: . data: chart_name: openstack-mariadb @@ -72,6 +91,9 @@ data: prometheus_mysql_exporter: node_selector_key: openstack-control-plane node_selector_value: enabled + monitoring: + prometheus: + enabled: true dependencies: - osh-helm-toolkit ... diff --git a/global/software/config/versions.yaml b/global/software/config/versions.yaml index 3573b796e..88e39d7b2 100644 --- a/global/software/config/versions.yaml +++ b/global/software/config/versions.yaml @@ -182,57 +182,62 @@ data: osh_infra: elasticsearch: location: https://git.openstack.org/openstack/openstack-helm-infra - reference: 4f4e9c5838e9cdf25c453c6a5b85bfc1ce12ad91 + reference: bc1afb87d7aa529a4ed5321d889cdfe2f1af8a44 subpath: elasticsearch type: git fluent_logging: location: https://git.openstack.org/openstack/openstack-helm-infra - reference: 4f4e9c5838e9cdf25c453c6a5b85bfc1ce12ad91 + reference: bc1afb87d7aa529a4ed5321d889cdfe2f1af8a44 subpath: fluent-logging type: git grafana: location: https://git.openstack.org/openstack/openstack-helm-infra - reference: 4f4e9c5838e9cdf25c453c6a5b85bfc1ce12ad91 + reference: bc1afb87d7aa529a4ed5321d889cdfe2f1af8a44 subpath: grafana type: git helm_toolkit: location: https://git.openstack.org/openstack/openstack-helm-infra - reference: 4f4e9c5838e9cdf25c453c6a5b85bfc1ce12ad91 + reference: bc1afb87d7aa529a4ed5321d889cdfe2f1af8a44 subpath: helm-toolkit type: git kibana: location: https://git.openstack.org/openstack/openstack-helm-infra - reference: 4f4e9c5838e9cdf25c453c6a5b85bfc1ce12ad91 + reference: bc1afb87d7aa529a4ed5321d889cdfe2f1af8a44 subpath: kibana type: git nagios: location: https://git.openstack.org/openstack/openstack-helm-infra - reference: 4f4e9c5838e9cdf25c453c6a5b85bfc1ce12ad91 + reference: bc1afb87d7aa529a4ed5321d889cdfe2f1af8a44 subpath: nagios type: git prometheus: location: https://git.openstack.org/openstack/openstack-helm-infra - reference: 4f4e9c5838e9cdf25c453c6a5b85bfc1ce12ad91 + reference: bc1afb87d7aa529a4ed5321d889cdfe2f1af8a44 subpath: prometheus type: git prometheus_alertmanager: location: https://git.openstack.org/openstack/openstack-helm-infra - reference: 4f4e9c5838e9cdf25c453c6a5b85bfc1ce12ad91 + reference: bc1afb87d7aa529a4ed5321d889cdfe2f1af8a44 subpath: prometheus-alertmanager type: git prometheus_kube_state_metrics: location: https://git.openstack.org/openstack/openstack-helm-infra - reference: 
4f4e9c5838e9cdf25c453c6a5b85bfc1ce12ad91 + reference: bc1afb87d7aa529a4ed5321d889cdfe2f1af8a44 subpath: prometheus-kube-state-metrics type: git prometheus_node_exporter: location: https://git.openstack.org/openstack/openstack-helm-infra - reference: 4f4e9c5838e9cdf25c453c6a5b85bfc1ce12ad91 + reference: bc1afb87d7aa529a4ed5321d889cdfe2f1af8a44 subpath: prometheus-node-exporter type: git + prometheus_process_exporter: + location: https://git.openstack.org/openstack/openstack-helm-infra + reference: bc1afb87d7aa529a4ed5321d889cdfe2f1af8a44 + subpath: prometheus-process-exporter + type: git prometheus_openstack_exporter: location: https://git.openstack.org/openstack/openstack-helm-infra - reference: 4f4e9c5838e9cdf25c453c6a5b85bfc1ce12ad91 + reference: bc1afb87d7aa529a4ed5321d889cdfe2f1af8a44 subpath: prometheus-openstack-exporter type: git ucp: @@ -661,6 +666,9 @@ data: curator: docker.io/bobrik/curator:5.2.0 dep_check: quay.io/stackanetes/kubernetes-entrypoint:v0.3.1 elasticsearch: docker.io/elasticsearch:5.6.4 + ceph_key_placement: docker.io/port/ceph-config-helper:v1.10.3 + s3_bucket: docker.io/port/ceph-config-helper:v1.10.3 + s3_user: docker.io/port/ceph-config-helper:v1.10.3 helm_tests: docker.io/openstackhelm/heat:ocata image_repo_sync: docker.io/docker:17.07.0 memory_init: docker.io/openstackhelm/heat:ocata @@ -713,7 +721,7 @@ data: ks_endpoints: docker.io/openstackhelm/heat:ocata ks_service: docker.io/openstackhelm/heat:ocata ks_user: docker.io/openstackhelm/heat:ocata - prometheus_openstack_exporter: quay.io/attcomdev/prometheus-openstack-exporter:3231f14419f0c47547ce2551b7d884cd222104e6 + prometheus_openstack_exporter: quay.io/attcomdev/prometheus-openstack-exporter:5010c3a532471d4940471a189ca8456bc4db46cb ucp: armada: api: quay.io/airshipit/armada:90618f549c1f6d7741b11dc5c4898f3c6d536895 diff --git a/global/software/manifests/full-site.yaml b/global/software/manifests/full-site.yaml index ed3a5015d..f51883c9f 100644 --- a/global/software/manifests/full-site.yaml +++ b/global/software/manifests/full-site.yaml @@ -31,6 +31,7 @@ data: - ucp-shipyard - osh-infra-ingress-controller - osh-infra-ceph-config + - osh-infra-radosgw - osh-infra-logging - osh-infra-monitoring - osh-infra-mariadb diff --git a/site/airship-seaworthy/secrets/passphrases/osh_infra_oslo_db_exporter_password.yaml b/site/airship-seaworthy/secrets/passphrases/osh_infra_oslo_db_exporter_password.yaml new file mode 100644 index 000000000..f134f46a9 --- /dev/null +++ b/site/airship-seaworthy/secrets/passphrases/osh_infra_oslo_db_exporter_password.yaml @@ -0,0 +1,11 @@ +--- +schema: deckhand/Passphrase/v1 +metadata: + schema: metadata/Document/v1 + name: osh_infra_oslo_db_exporter_password + layeringDefinition: + abstract: false + layer: site + storagePolicy: cleartext +data: password123 +... diff --git a/site/airship-seaworthy/secrets/passphrases/osh_infra_prometheus_admin_password.yaml b/site/airship-seaworthy/secrets/passphrases/osh_infra_prometheus_admin_password.yaml new file mode 100644 index 000000000..b3df5f659 --- /dev/null +++ b/site/airship-seaworthy/secrets/passphrases/osh_infra_prometheus_admin_password.yaml @@ -0,0 +1,11 @@ +--- +schema: deckhand/Passphrase/v1 +metadata: + schema: metadata/Document/v1 + name: osh_infra_prometheus_admin_password + layeringDefinition: + abstract: false + layer: site + storagePolicy: cleartext +data: password123 +... 
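This passphrase backs the Prometheus admin credential consumed by the grafana chart's endpoint substitutions. A minimal sketch of the resolved auth block for this site, assuming the prometheus admin username added to service_accounts.yaml later in this patch together with the placeholder passphrase above:

# Illustrative rendered values for the Prometheus endpoint auth; username from
# osh_infra_service_accounts, password from osh_infra_prometheus_admin_password
# (a placeholder value in the airship-seaworthy reference site).
endpoints:
  prometheus:
    auth:
      admin:
        username: prometheus
        password: password123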
diff --git a/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_admin_access_key.yaml b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_admin_access_key.yaml new file mode 100644 index 000000000..7fc1eddf1 --- /dev/null +++ b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_admin_access_key.yaml @@ -0,0 +1,11 @@ +--- +schema: deckhand/Passphrase/v1 +metadata: + schema: metadata/Document/v1 + name: osh_infra_rgw_s3_admin_access_key + layeringDefinition: + abstract: false + layer: site + storagePolicy: cleartext +data: password123 +... diff --git a/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_admin_secret_key.yaml b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_admin_secret_key.yaml new file mode 100644 index 000000000..32f7d80f5 --- /dev/null +++ b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_admin_secret_key.yaml @@ -0,0 +1,11 @@ +--- +schema: deckhand/Passphrase/v1 +metadata: + schema: metadata/Document/v1 + name: osh_infra_rgw_s3_admin_secret_key + layeringDefinition: + abstract: false + layer: site + storagePolicy: cleartext +data: password123 +... diff --git a/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_elasticsearch_access_key.yaml b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_elasticsearch_access_key.yaml new file mode 100644 index 000000000..befc16e1f --- /dev/null +++ b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_elasticsearch_access_key.yaml @@ -0,0 +1,11 @@ +--- +schema: deckhand/Passphrase/v1 +metadata: + schema: metadata/Document/v1 + name: osh_infra_rgw_s3_elasticsearch_access_key + layeringDefinition: + abstract: false + layer: site + storagePolicy: cleartext +data: password123 +... diff --git a/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_elasticsearch_secret_key.yaml b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_elasticsearch_secret_key.yaml new file mode 100644 index 000000000..6dff56e51 --- /dev/null +++ b/site/airship-seaworthy/secrets/passphrases/osh_infra_rgw_s3_elasticsearch_secret_key.yaml @@ -0,0 +1,11 @@ +--- +schema: deckhand/Passphrase/v1 +metadata: + schema: metadata/Document/v1 + name: osh_infra_rgw_s3_elasticsearch_secret_key + layeringDefinition: + abstract: false + layer: site + storagePolicy: cleartext +data: password123 +... diff --git a/site/airship-seaworthy/secrets/passphrases/osh_oslo_db_exporter_password.yaml b/site/airship-seaworthy/secrets/passphrases/osh_oslo_db_exporter_password.yaml new file mode 100644 index 000000000..61b4144ad --- /dev/null +++ b/site/airship-seaworthy/secrets/passphrases/osh_oslo_db_exporter_password.yaml @@ -0,0 +1,11 @@ +--- +schema: deckhand/Passphrase/v1 +metadata: + schema: metadata/Document/v1 + name: osh_oslo_db_exporter_password + layeringDefinition: + abstract: false + layer: site + storagePolicy: cleartext +data: password123 +... 
diff --git a/site/airship-seaworthy/software/config/endpoints.yaml b/site/airship-seaworthy/software/config/endpoints.yaml index 3f67f8372..a77fbd52f 100644 --- a/site/airship-seaworthy/software/config/endpoints.yaml +++ b/site/airship-seaworthy/software/config/endpoints.yaml @@ -715,6 +715,19 @@ data: default: 3306 wsrep: default: 4567 + prometheus_mysql_exporter: + namespace: openstack + hosts: + default: mysql-exporter + host_fqdn_override: + default: null + path: + default: /metrics + scheme: + default: 'http' + port: + metrics: + default: 9104 keystone_oslo_messaging: namespace: openstack hosts: @@ -1257,6 +1270,22 @@ metadata: # pattern: AUTH_PATH data: osh_infra: + ceph_object_store: + name: radosgw + namespace: osh-infra + hosts: + default: ceph-rgw + public: radosgw + host_fqdn_override: + default: null + path: + default: null + scheme: + default: "http" + port: + api: + default: 8088 + public: 80 elasticsearch: name: elasticsearch namespace: osh-infra @@ -1272,8 +1301,12 @@ data: scheme: default: "http" port: + client: + default: 9200 http: default: 80 + discovery: + default: 9300 prometheus_elasticsearch_exporter: namespace: null hosts: @@ -1327,6 +1360,19 @@ data: port: mysql: default: 3306 + prometheus_mysql_exporter: + namespace: openstack + hosts: + default: mysql-exporter + host_fqdn_override: + default: null + path: + default: /metrics + scheme: + default: 'http' + port: + metrics: + default: 9104 grafana: name: grafana namespace: osh-infra @@ -1345,6 +1391,7 @@ data: port: grafana: default: 3000 + public: 80 # public: 443 monitoring: name: prometheus @@ -1361,7 +1408,8 @@ data: port: api: default: 9090 - public: 80 + http: + default: 80 kibana: name: kibana namespace: osh-infra @@ -1380,6 +1428,8 @@ data: port: kibana: default: 5601 + http: + default: 80 # public: 443 alerts: name: alertmanager @@ -1438,6 +1488,19 @@ data: default: 9100 prometheus_port: default: 9100 + process_exporter_metrics: + namespace: kube-system + hosts: + default: process-exporter + host_fqdn_override: + default: null + path: + default: null + scheme: + default: "http" + port: + metrics: + default: 9256 prometheus_openstack_exporter: namespace: openstack hosts: diff --git a/site/airship-seaworthy/software/config/service_accounts.yaml b/site/airship-seaworthy/software/config/service_accounts.yaml index 792072936..1320c7028 100644 --- a/site/airship-seaworthy/software/config/service_accounts.yaml +++ b/site/airship-seaworthy/software/config/service_accounts.yaml @@ -303,6 +303,9 @@ data: oslo_db: admin: username: root + prometheus_mysql_exporter: + user: + username: osh-oslodb-exporter neutron: neutron: role: admin @@ -383,6 +386,11 @@ metadata: path: .osh_infra.prometheus_openstack_exporter.user.region_name data: osh_infra: + ceph_object_store: + admin: + username: s3_admin + elasticsearch: + username: elasticsearch grafana: admin: username: grafana @@ -401,6 +409,9 @@ data: oslo_db: admin: username: root + prometheus_mysql_exporter: + user: + username: osh-infra-oslodb-exporter prometheus_openstack_exporter: user: role: admin @@ -411,6 +422,9 @@ data: nagios: admin: username: nagios + prometheus: + admin: + username: prometheus ldap: admin: # NEWSITE-CHANGEME: Replace with the site's LDAP account used to diff --git a/site/airship-seaworthy/software/manifests/full-site.yaml b/site/airship-seaworthy/software/manifests/full-site.yaml index 593dfc106..77ddc3f80 100644 --- a/site/airship-seaworthy/software/manifests/full-site.yaml +++ b/site/airship-seaworthy/software/manifests/full-site.yaml @@ -37,6 
+37,7 @@ data: - ucp-shipyard - osh-infra-ingress-controller - osh-infra-ceph-config + - osh-infra-radosgw - osh-infra-logging - osh-infra-monitoring - osh-infra-mariadb
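Taken together, the manifest hunks above insert the new chart group at the same point in both the global and airship-seaworthy full-site manifests. An excerpt of the resulting ordering, limited to the neighbours visible in the hunks (surrounding entries are unchanged):

# Excerpt of chart_groups after this change (both full-site manifests).
chart_groups:
  - osh-infra-ingress-controller
  - osh-infra-ceph-config
  - osh-infra-radosgw      # new: RGW comes up before osh-infra-logging, whose elasticsearch chart now uses the ceph_object_store (S3) endpoints
  - osh-infra-logging
  - osh-infra-monitoring
  - osh-infra-mariadb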