---
# PrometheusRule (prometheus-operator CRD) defining the "tempest" alert group.
#
# Two conditions are watched:
#   * TempestTestNotRunning - the last-run timestamp metric is older than
#     900 seconds (15 minutes).
#   * TempestTestFailure    - the "success" series of the last-run result
#     metric is not 1. Declared three times under the same alert name so the
#     severity label changes the longer the condition holds:
#     immediately (P5), after 8 minutes (P4), after 13 minutes (P3).
#     NOTE(review): presumably a lower P-number means higher priority, and
#     Alertmanager inhibition/routing de-duplicates the tiers - confirm.
#
# Escaping note: the annotations wrap the Prometheus template expression in a
# Helm action containing a backtick-quoted raw string. Helm therefore emits
# the double-brace $labels.instance expression literally, and it is expanded
# later when the alert is evaluated rather than at chart render time. The
# outer backticks are plain text and survive into the rendered annotation.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  # Quoted so that an all-digit namespace is still rendered as a YAML string.
  namespace: {{ .Release.Namespace | quote }}
  name: {{ include "tempest-pushgateway.fullname" . }}
  labels:
{{ include "tempest-pushgateway.labels" . | indent 4 }}
spec:
  groups:
    - name: tempest
      rules:
        # Staleness check: no "for:" clause, so the alert becomes active on
        # the first evaluation where the timestamp is more than 900 s old.
        - alert: TempestTestNotRunning
          expr: |
            time() - tempest_last_run_unixtime > 900
          labels:
            severity: P3
          annotations:
            summary: "[`{{`{{$labels.instance}}`}}`] Tempest not reporting"
            description: >
              Tempest has not reported in for over 15 minutes which means that
              the tests are not running and the state of the cloud is unknown.

        # Failure tier 1: active as soon as the expression matches.
        - alert: TempestTestFailure
          expr: |
            tempest_last_run_result{tempest_last_run_result="success"} != 1
          labels:
            severity: P5
          annotations:
            summary: "[`{{`{{$labels.instance}}`}}`] Tempest test failure"
            description: >
              The test `{{`{{$labels.instance}}`}}` has failed in its most
              recent run.

        # Failure tier 2: same expression, must hold for 8 minutes.
        - alert: TempestTestFailure
          for: 8m
          expr: |
            tempest_last_run_result{tempest_last_run_result="success"} != 1
          labels:
            severity: P4
          annotations:
            summary: "[`{{`{{$labels.instance}}`}}`] Tempest test failure"
            description: >
              The test `{{`{{$labels.instance}}`}}` has failed in its most
              recent run for 8 minutes.

        # Failure tier 3: same expression, must hold for 13 minutes.
        - alert: TempestTestFailure
          for: 13m
          expr: |
            tempest_last_run_result{tempest_last_run_result="success"} != 1
          labels:
            severity: P3
          annotations:
            summary: "[`{{`{{$labels.instance}}`}}`] Tempest test failure"
            description: >
              The test `{{`{{$labels.instance}}`}}` has failed in its most
              recent run for 13 minutes.