---
# PrometheusRule carrying hardware-health alerts built on the dell_hw_* metrics.
#
# Conventions visible in the rules below:
#   * Status gauges are compared against 0; any other value raises an alert.
#     For dell_hw_system_status the rules themselves describe 2 as a
#     non-critical state and 1 as a critical state.
#   * Severity P4 is used for warning-level rules, P3 for critical-level rules.
#   * The backtick-quoted template actions inside the annotations make Helm
#     emit the Prometheus placeholders ($labels, $value) verbatim instead of
#     trying to evaluate them at chart render time.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  namespace: {{ .Release.Namespace }}
  name: {{ include "dell-exporter.fullname" . }}
  labels:
{{ include "dell-exporter.labels" . | indent 4 }}
spec:
  groups:
    - name: dell
      rules:
        # Overall system status, warning tier (status value 2).
        - alert: DellSystemStatus
          expr: |
            dell_hw_system_status == 2
          for: 1m
          labels:
            severity: P4
          annotations:
            summary: "[`{{`{{$labels.instance}}`}}`] System warning"
            description: >
              The host `{{`{{$labels.instance}}`}}` is reporting a non-critical
              status, please investigate. If this is the only firing alert,
              please add another one to handle the specific failure.

        # Overall system status, critical tier (status value 1). Shares the
        # alert name with the rule above; only the severity label differs.
        - alert: DellSystemStatus
          expr: |
            dell_hw_system_status == 1
          for: 1m
          labels:
            severity: P3
          annotations:
            summary: "[`{{`{{$labels.instance}}`}}`] System failure"
            description: >
              The host `{{`{{$labels.instance}}`}}` is reporting a critical
              status, please investigate. If this is the only firing alert,
              please add another one to handle the specific failure.

        # Any fan whose status gauge is non-zero.
        - alert: DellFanFailure
          expr: |
            dell_hw_chassis_fan_status != 0
          for: 1m
          labels:
            severity: P4
          annotations:
            summary: "[`{{`{{$labels.instance}}`}}`] Fan failure"
            description: >
              The `{{`{{$labels.fan}}`}}` on host `{{`{{$labels.instance}}`}}`
              is reporting a failure, please replace it.

        # Any memory module whose status gauge is non-zero.
        - alert: DellMemoryFailure
          expr: |
            dell_hw_chassis_memory_status != 0
          for: 1m
          labels:
            severity: P4
          annotations:
            summary: "[`{{`{{$labels.instance}}`}}`] Memory failure"
            description: >
              The `{{`{{$labels.memory}}`}}` on host
              `{{`{{$labels.instance}}`}}` is reporting a failure, please
              replace it.

        # Any power supply whose status gauge is non-zero.
        - alert: DellPowerSupplyFailure
          expr: |
            dell_hw_ps_status != 0
          for: 1m
          labels:
            severity: P3
          annotations:
            summary: "[`{{`{{$labels.instance}}`}}`] Power failure"
            description: >
              The PSU `{{`{{$labels.id}}`}}` on host
              `{{`{{$labels.instance}}`}}` is reporting a failure, please
              investigate if power is lost or power supply requires
              replacement.

        # Any storage controller whose status gauge is non-zero.
        - alert: DellHardwareStorageFailure
          expr: |
            dell_hw_storage_controller_status != 0
          for: 1m
          labels:
            severity: P3
          annotations:
            summary: "[`{{`{{$labels.instance}}`}}`] Storage failure"
            description: >
              The storage controller `{{`{{$labels.id}}`}}` on host
              `{{`{{$labels.instance}}`}}` is reporting a failure, please
              investigate inside the host and add the appropriate alerting
              rules if no alerts except this one have fired.

        # Temperature, critical tier. This rule previously compared against
        # the warning threshold, which made it identical to the P4 rule below
        # even though its description speaks of the critical reading.
        # NOTE(review): assumes the exporter publishes the critical threshold
        # as dell_hw_chassis_temps_max_failure - confirm against the
        # exporter's metric list before rollout.
        # No "for:" clause, so the alert fires on the first matching
        # evaluation.
        - alert: DellChassisTemperature
          expr: |
            dell_hw_chassis_temps_reading > dell_hw_chassis_temps_max_failure
          labels:
            severity: P3
          annotations:
            summary: "[{{`{{$labels.instance}}`}}] {{`{{$labels.component}}`}}"
            description: >
              The component {{`{{$labels.component}}`}} is reporting
              temperatures of {{`{{$value}}`}} which is above the critical
              reading for {{`{{$labels.instance}}`}}.

        # Temperature, warning tier: reading exceeds the warning threshold.
        # No "for:" clause, so the alert fires on the first matching
        # evaluation.
        - alert: DellChassisTemperature
          expr: |
            dell_hw_chassis_temps_reading > dell_hw_chassis_temps_max_warning
          labels:
            severity: P4
          annotations:
            summary: "[{{`{{$labels.instance}}`}}] {{`{{$labels.component}}`}}"
            description: >
              The component {{`{{$labels.component}}`}} is reporting
              temperatures of {{`{{$value}}`}} which is above the warning
              reading for {{`{{$labels.instance}}`}}.