fault/fm-doc/fm_doc/events.yaml

3216 lines
114 KiB
YAML
Executable File

---
#
# Copyright (c) 2013-2018 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
############################################################################
#
# ALARM & CUSTOMER LOG DOCUMENTATION
#
############################################################################
############################################################################
#
# Record Format ... for documentation
#
# 100.001:
# Type: < Alarm | Log >
# Description: < yaml string >
# OR
# [ < yaml string >, // list of yaml strings
# < yaml string > ]
# OR
# critical: < yaml string > // i.e. dictionary of yaml strings indexed by severity
# major: < yaml string >
# minor: < yaml string >
# warning: < yaml string >
# Entity_Instance_ID: < yaml string ... e.g. host=<hostname>.interface=<ifname> >
# OR
# [ < yaml string >, // list of yaml strings
# < yaml string > ]
# Severity: < critical | major | minor | warning >
# OR
# [ critical, major ] // list of severity values
# Proposed_Repair_Action: < yaml string > // NOTE ALARM ONLY FIELD
# OR
# critical: < yaml string > // i.e. dictionary of yaml strings indexed by severity
# major: < yaml string >
# minor: < yaml string >
# warning: < yaml string >
# Maintenance_Action: < yaml string > // NOTE ALARM ONLY FIELD
# OR
# critical: < yaml string > // i.e. dictionary of yaml strings indexed by severity
# major: < yaml string >
# minor: < yaml string >
# warning: < yaml string >
# Inhibit_Alarms: < True | False > // NOTE ALARM ONLY FIELD
# Alarm_Type: < operational-violation | ... >
# Probable_Cause: < timing-problem | ... >
# OR
# [ < timing-problem | ... >, // list of probable-causes
# < timing-problem | ... > ]
# Service_Affecting: < True | False >
# Suppression: < True | False > // NOTE ALARM ONLY FIELD
# Management_Affecting_Severity: < none | critical | major | minor | warning >
# // lowest alarm level of this type that will block forced upgrades & orchestration actions
# Degrade_Affecting_Severity: < none | critical | major | minor >
# // lowest alarm level of this type sets a host to 'degraded'
#
#
# Other Notes:
# - use general record format above
# - the only dictionaries allowed are ones indexed by severity
# - if there are multiple lists in a record,
# then they must all have the same number of items, and the items at the same position in each list together describe one instance of the alarm
# - if you can't describe the alarm/log based on the above rules,
# then you can use a multi-line string format
# - DELETING alarms from events.yaml: alarms should only be deleted when going to a new Titanium Cloud release
# - if all possible alarm severities are mgmt affecting, the convention is to
# use 'warning' as the Management_Affecting_Severity, even if warning is not a possible severity for that alarm
#
# Testing:
# - Testing of events.yaml can be done by running regular make command
# and specifying fm-doc:
# nice -n 20 ionice -c Idle make -C build fm-doc.rebuild
# - When building, events.yaml will be parsed for correct format, and also
# to ensure that Alarm IDs defined in constants.py and fmAlarm.h are
# listed in events.yaml
#
############################################################################
#---------------------------------------------------------------------------
# Monitored Resource Alarms
#---------------------------------------------------------------------------
# 100.101 - Platform CPU usage threshold alarm, raised against a host.
100.101:
  Type: Alarm
  Description: |-
    Platform CPU threshold exceeded; threshold x%, actual y% .
    CRITICAL @ 95%
    MAJOR @ 90%
    MINOR @ 80%
  Entity_Instance_ID: host=<hostname>
  Severity:
    - critical
    - major
    - minor
  Proposed_Repair_Action: 'Monitor and if condition persists, contact next level of support.'
  Maintenance_Action:
    critical: degrade
    major: degrade
  Inhibit_Alarms:
  Alarm_Type: operational-violation
  Probable_Cause: threshold-crossed
  Service_Affecting: False
  Suppression: True
  Management_Affecting_Severity: major
  Degrade_Affecting_Severity: critical
# 100.102 - VSwitch CPU usage threshold alarm, raised against a host.
100.102:
  Type: Alarm
  Description: |-
    VSwitch CPU threshold exceeded; threshold x%, actual y% .
    CRITICAL @ 95%
    MAJOR @ 90%
    MINOR @ 80%
  Entity_Instance_ID: host=<hostname>
  Severity:
    - critical
    - major
    - minor
  Proposed_Repair_Action: 'Monitor and if condition persists, contact next level of support.'
  Maintenance_Action:
    critical: degrade
    major: degrade
  Inhibit_Alarms:
  Alarm_Type: operational-violation
  Probable_Cause: threshold-crossed
  Service_Affecting: False
  Suppression: True
  Management_Affecting_Severity: none
  Degrade_Affecting_Severity: none
# 100.103 - Memory usage threshold alarm, raised against a host.
100.103:
  Type: Alarm
  Description: |-
    Memory threshold exceeded; threshold x%, actual y% .
    CRITICAL @ 90%
    MAJOR @ 80%
    MINOR @ 70%
  Entity_Instance_ID: host=<hostname>
  Severity:
    - critical
    - major
    - minor
  Proposed_Repair_Action: 'Monitor and if condition persists, contact next level of support; may require additional memory on Host.'
  Maintenance_Action:
    critical: degrade
    major: degrade
  Inhibit_Alarms:
  Alarm_Type: operational-violation
  Probable_Cause: threshold-crossed
  Service_Affecting: False
  Suppression: True
  Management_Affecting_Severity: none
  Degrade_Affecting_Severity: critical
# 100.104 - Usage threshold alarm covering two entity kinds: a host file
# system or a host volume group (see the OR-separated Entity_Instance_ID).
100.104:  # NOTE This should really be split into two different Alarms.
  Type: Alarm
  Description: |-
    host=<hostname>.filesystem=<mount-dir>
    File System threshold exceeded; threshold x%, actual y% .
    CRITICAL @ 90%
    MAJOR @ 80%
    MINOR @ 70%
    OR
    host=<hostname>.volumegroup=<volumegroup-name>
    Monitor and if condition persists, consider adding additional physical volumes to the volume group.
  Entity_Instance_ID: |-
    host=<hostname>.filesystem=<mount-dir>
    OR
    host=<hostname>.volumegroup=<volumegroup-name>
  Severity:
    - critical
    - major
    - minor
  Proposed_Repair_Action: 'Monitor and if condition persists, contact next level of support.'
  Maintenance_Action:
    critical: degrade
    major: degrade
  Inhibit_Alarms:
  Alarm_Type: operational-violation
  Probable_Cause: threshold-crossed
  Service_Affecting: False
  Suppression: True
  Management_Affecting_Severity: critical
  Degrade_Affecting_Severity: critical
#--------
# 100.105: Retired (with R2 release): previously monitored /etc/nova/instances
# NFS mount from controller to computes
#--------
# 100.106 - 'OAM' port failure, raised against a host port.
100.106:
  Type: Alarm
  Description: "'OAM' Port failed."
  Entity_Instance_ID: host=<hostname>.port=<port-name>
  Severity: major
  Proposed_Repair_Action: >-
    Check cabling and far-end port configuration and status on adjacent
    equipment.
  Maintenance_Action: degrade
  Inhibit_Alarms:
  Alarm_Type: operational-violation
  Probable_Cause: unknown
  Service_Affecting: True
  Suppression: True
  Management_Affecting_Severity: warning
  Degrade_Affecting_Severity: major
# 100.107 - 'OAM' interface degraded or failed, raised against a host
# interface.
100.107:
  Type: Alarm
  Description: |-
    'OAM' Interface degraded.
    OR
    'OAM' Interface failed.
  Entity_Instance_ID: host=<hostname>.interface=<if-name>
  Severity:
    - critical
    - major
  Proposed_Repair_Action: >-
    Check cabling and far-end port configuration and status on adjacent
    equipment.
  Maintenance_Action:
    critical: degrade
    major: degrade
  Inhibit_Alarms:
  Alarm_Type: operational-violation
  Probable_Cause: unknown
  Service_Affecting: True
  Suppression: True
  Management_Affecting_Severity: warning
  Degrade_Affecting_Severity: major
# 100.108 - 'MGMT' port failure, raised against a host port.
100.108:
  Type: Alarm
  Description: "'MGMT' Port failed."
  Entity_Instance_ID: host=<hostname>.port=<port-name>
  Severity: major
  Proposed_Repair_Action: >-
    Check cabling and far-end port configuration and status on adjacent
    equipment.
  Maintenance_Action: degrade
  Inhibit_Alarms:
  Alarm_Type: operational-violation
  Probable_Cause: unknown
  Service_Affecting: True
  Suppression: True
  Management_Affecting_Severity: warning
  Degrade_Affecting_Severity: major
# 100.109 - 'MGMT' interface degraded or failed, raised against a host
# interface.
100.109:
  Type: Alarm
  Description: |-
    'MGMT' Interface degraded.
    OR
    'MGMT' Interface failed.
  Entity_Instance_ID: host=<hostname>.interface=<if-name>
  Severity:
    - critical
    - major
  Proposed_Repair_Action: >-
    Check cabling and far-end port configuration and status on adjacent
    equipment.
  Maintenance_Action:
    critical: degrade
    major: degrade
  Inhibit_Alarms:
  Alarm_Type: operational-violation
  Probable_Cause: unknown
  Service_Affecting: True
  Suppression: True
  Management_Affecting_Severity: warning
  Degrade_Affecting_Severity: major
# 100.110 - 'CLUSTER-HOST' port failure, raised against a host port.
100.110:
  Type: Alarm
  Description: "'CLUSTER-HOST' Port failed."
  Entity_Instance_ID: host=<hostname>.port=<port-name>
  Severity: major
  Proposed_Repair_Action: >-
    Check cabling and far-end port configuration and status on adjacent
    equipment.
  Maintenance_Action: degrade
  Inhibit_Alarms:
  Alarm_Type: operational-violation
  Probable_Cause: unknown
  Service_Affecting: True
  Suppression: True
  Management_Affecting_Severity: warning
  Degrade_Affecting_Severity: major
# 100.111 - 'CLUSTER-HOST' interface degraded or failed, raised against a
# host interface.
100.111:
  Type: Alarm
  Description: |-
    'CLUSTER-HOST' Interface degraded.
    OR
    'CLUSTER-HOST' Interface failed.
  Entity_Instance_ID: host=<hostname>.interface=<if-name>
  Severity:
    - critical
    - major
  Proposed_Repair_Action: >-
    Check cabling and far-end port configuration and status on adjacent
    equipment.
  Maintenance_Action:
    critical: degrade
    major: degrade
  Inhibit_Alarms:
  Alarm_Type: operational-violation
  Probable_Cause: unknown
  Service_Affecting: True
  Suppression: True
  Management_Affecting_Severity: warning
  Degrade_Affecting_Severity: major
# 100.112 - 'DATA-VRS' port down, raised against a host port.
100.112:
  Type: Alarm
  Description: "'DATA-VRS' Port down."
  Entity_Instance_ID: host=<hostname>.port=<port-name>
  Severity: major
  Proposed_Repair_Action: >-
    Check cabling and far-end port configuration and status on adjacent
    equipment.
  Maintenance_Action: degrade
  Inhibit_Alarms:
  Alarm_Type: operational-violation
  Probable_Cause: unknown
  Service_Affecting: True
  Suppression: True
  Management_Affecting_Severity: none
  Degrade_Affecting_Severity: major
# 100.113 - 'DATA-VRS' interface degraded or down, raised against a host
# interface.
100.113:
  Type: Alarm
  Description: |-
    'DATA-VRS' Interface degraded.
    OR
    'DATA-VRS' Interface down.
  Entity_Instance_ID: host=<hostname>.interface=<if-name>
  Severity:
    - critical
    - major
  Proposed_Repair_Action: >-
    Check cabling and far-end port configuration and status on adjacent
    equipment.
  # Both listed severities get an entry: Degrade_Affecting_Severity is
  # 'major', so a critical alarm degrades the host as well.
  Maintenance_Action:
    critical: degrade
    major: degrade
  Inhibit_Alarms:
  Alarm_Type: operational-violation
  Probable_Cause: unknown
  Service_Affecting: True
  Suppression: True
  Management_Affecting_Severity: none
  Degrade_Affecting_Severity: major
# 100.114 - NTP server reachability alarm; description and entity ID are
# both indexed by severity.
100.114:
  Type: Alarm
  Description:
    major: 'NTP configuration does not contain any valid or reachable NTP servers.'
    minor: 'NTP address <IP address> is not a valid or a reachable NTP server.'
  Entity_Instance_ID:
    major: host=<hostname>.ntp
    minor: host=<hostname>.ntp=<IP address>
  Severity:
    - major
    - minor
  Proposed_Repair_Action: 'Monitor and if condition persists, contact next level of support.'
  Maintenance_Action: none
  Inhibit_Alarms:
  Alarm_Type: communication
  Probable_Cause: unknown
  Service_Affecting: False
  Suppression: False
  Management_Affecting_Severity: none
  Degrade_Affecting_Severity: none
# 100.115 - VSwitch memory usage threshold alarm, raised per host processor.
100.115:
  Type: Alarm
  Description: 'VSwitch Memory Usage, processor <processor> threshold exceeded; threshold x%, actual y% .'
  Entity_Instance_ID: host=<hostname>.processor=<processor>
  Severity:
    - critical
    - major
    - minor
  Proposed_Repair_Action: 'Monitor and if condition persists, contact next level of support.'
  Maintenance_Action:
    critical: degrade
    major: degrade
  Inhibit_Alarms:
  Alarm_Type: operational-violation
  Probable_Cause: threshold-crossed
  Service_Affecting: False
  Suppression: True
  Management_Affecting_Severity: none
  Degrade_Affecting_Severity: critical
# 100.116 - Cinder LVM thinpool usage threshold alarm, raised against a host.
100.116:
  Type: Alarm
  Description: 'Cinder LVM Thinpool Usage threshold exceeded; threshold x%, actual y% .'
  Entity_Instance_ID: host=<hostname>
  Severity:
    - critical
    - major
    - minor
  Proposed_Repair_Action: 'Monitor and if condition persists, contact next level of support.'
  Maintenance_Action:
    critical: degrade
    major: degrade
  Inhibit_Alarms:
  Alarm_Type: operational-violation
  Probable_Cause: threshold-crossed
  Service_Affecting: False
  Suppression: True
  Management_Affecting_Severity: none
  Degrade_Affecting_Severity: critical
# 100.117 - Nova LVM thinpool usage threshold alarm, raised against a host.
100.117:
  Type: Alarm
  Description: 'Nova LVM Thinpool Usage threshold exceeded; threshold x%, actual y% .'
  Entity_Instance_ID: host=<hostname>
  Severity:
    - critical
    - major
    - minor
  Proposed_Repair_Action: 'Monitor and if condition persists, contact next level of support.'
  Maintenance_Action:
    critical: degrade
    major: degrade
  Inhibit_Alarms:
  Alarm_Type: operational-violation
  Probable_Cause: threshold-crossed
  Service_Affecting: False
  Suppression: True
  Management_Affecting_Severity: major
  Degrade_Affecting_Severity: critical
# 100.118 - Controller cannot reach the remote logging server.
100.118:
  Type: Alarm
  Description: 'Controller cannot establish connection with remote logging server.'
  Entity_Instance_ID: host=<hostname>
  Severity: minor
  Proposed_Repair_Action: >-
    Ensure Remote Log Server IP is reachable from Controller through OAM
    interface; otherwise contact next level of support.
  Maintenance_Action: none
  Inhibit_Alarms: False
  Alarm_Type: communication
  Probable_Cause: communication-subsystem-failure
  Service_Affecting: False
  Suppression: False
  Management_Affecting_Severity: none
  Degrade_Affecting_Severity: none
# 100.119 - PTP configuration / timestamping alarm; the entity ID lists the
# OR-separated variants the alarm can be raised against.
100.119:
  Type: Alarm
  Description:
    major: 'PTP configuration or out-of-tolerance timestamping conditions'
    minor: 'PTP out-of-tolerance timestamping condition'
  Entity_Instance_ID: |-
    host=<hostname>.ptp
    OR
    host=<hostname>.ptp=no-lock
    OR
    host=<hostname>.ptp=<interface>.unsupported=hardware-timestamping
    OR
    host=<hostname>.ptp=<interface>.unsupported=software-timestamping
    OR
    host=<hostname>.ptp=<interface>.unsupported=legacy-timestamping
    OR
    host=<hostname>.ptp=out-of-tolerance
  Severity:
    - major
    - minor
  Proposed_Repair_Action: 'Monitor and if condition persists, contact next level of support.'
  Maintenance_Action: none
  Inhibit_Alarms:
  Alarm_Type: communication
  Probable_Cause: unknown
  Service_Affecting: False
  Suppression: False
  Management_Affecting_Severity: none
  Degrade_Affecting_Severity: none
#---------------------------------------------------------------------------
# MAINTENANCE
#---------------------------------------------------------------------------
# 200.001 - Host administratively locked; this alarm inhibits other alarms.
200.001:
  Type: Alarm
  Description: '<hostname> was administratively locked to take it out-of-service.'
  Entity_Instance_ID: host=<hostname>
  Severity: warning
  Proposed_Repair_Action: 'Administratively unlock Host to bring it back in-service.'
  Maintenance_Action: none
  Inhibit_Alarms: True
  Alarm_Type: operational-violation
  Probable_Cause: out-of-service
  Service_Affecting: True
  Suppression: False
  Management_Affecting_Severity: warning
  Degrade_Affecting_Severity: none
# 200.004 - Host service-affecting failure; host is auto recovered by reboot.
200.004:
  Type: Alarm
  Description: |-
    <hostname> experienced a service-affecting failure.
    Host is being auto recovered by Reboot.
  Entity_Instance_ID: host=<hostname>
  Severity: critical
  Proposed_Repair_Action: >-
    If auto-recovery is consistently unable to recover host to the
    unlocked-enabled state contact next level of support or lock and replace
    failing host.
  Maintenance_Action: auto recover
  Inhibit_Alarms: False
  Alarm_Type: operational-violation
  Probable_Cause: application-subsystem-failure
  Service_Affecting: True
  Suppression: True
  Management_Affecting_Severity: warning
  Degrade_Affecting_Severity: none
# 200.011 - Host configuration failure during initialization; host is
# re-configured by reboot.
200.011:
  Type: Alarm
  Description: >-
    <hostname> experienced a configuration failure during initialization.
    Host is being re-configured by Reboot.
  Entity_Instance_ID: host=<hostname>
  Severity: critical
  Proposed_Repair_Action: >-
    If auto-recovery is consistently unable to recover host to the
    unlocked-enabled state contact next level of support or lock and replace
    failing host.
  Maintenance_Action: auto-recover
  Inhibit_Alarms: False
  Alarm_Type: operational-violation
  Probable_Cause: configuration-or-customization-error
  Service_Affecting: True
  Suppression: True
  Management_Affecting_Severity: warning
  Degrade_Affecting_Severity: none
# 200.010 - Access to the host's board management module has failed.
200.010:
  Type: Alarm
  Description: '<hostname> access to board management module has failed.'
  Entity_Instance_ID: host=<hostname>
  Severity: warning
  Proposed_Repair_Action: "Check Host's board management configuration and connectivity."
  Maintenance_Action: auto recover
  Inhibit_Alarms: False
  Alarm_Type: operational-violation
  Probable_Cause: communication-subsystem-failure
  Service_Affecting: False
  Suppression: False
  Management_Affecting_Severity: none
  Degrade_Affecting_Severity: none
# 200.012 - Controller function in-service failure while compute services
# remain healthy; recovery needs a manual lock/unlock.
200.012:
  Type: Alarm
  Description: >-
    <hostname> controller function has in-service failure while compute
    services remain healthy.
  Entity_Instance_ID: host=<hostname>
  Severity: major
  Proposed_Repair_Action: >-
    Lock and then Unlock host to recover. Avoid using 'Force Lock' action as
    that will impact compute services running on this host. If lock action
    fails then contact next level of support to investigate and recover.
  Maintenance_Action: 'degrade - requires manual action'
  Inhibit_Alarms: False
  Alarm_Type: operational-violation
  Probable_Cause: communication-subsystem-failure
  Service_Affecting: True
  Suppression: True
  Management_Affecting_Severity: warning
  Degrade_Affecting_Severity: major
# 200.013 - Compute service failure on the only available controller; the
# host is degraded instead of auto-recovered and needs manual action.
200.013:
  Type: Alarm
  Description: >-
    <hostname> compute service of the only available controller is not
    operational. Auto-recovery is disabled. Degrading host instead.
  Entity_Instance_ID: host=<hostname>
  Severity: major
  Proposed_Repair_Action: >-
    Enable second controller and Switch Activity (Swact) over to it as soon
    as possible. Then Lock and Unlock host to recover its local compute
    service.
  Maintenance_Action: 'degrade - requires manual action'
  Inhibit_Alarms: False
  Alarm_Type: operational-violation
  Probable_Cause: communication-subsystem-failure
  Service_Affecting: True
  Suppression: True
  Management_Affecting_Severity: warning
  Degrade_Affecting_Severity: major
# 200.005 - 'Management Network' communication failure: major for the
# intermittent (degrade) case, critical for the persistent (failure) case.
200.005:
  Type: Alarm
  # Literal block scalar: every character is part of the text, so no
  # closing quote belongs at the end of the last line.
  Description: |-
    Degrade:
    <hostname> is experiencing intermittent 'Management Network' communication failures that have exceeded its lower alarming threshold.
    Failure:
    <hostname> is experiencing a persistent critical 'Management Network' communication failure.
  Entity_Instance_ID: host=<hostname>
  Severity:
    - critical
    - major
  Proposed_Repair_Action: "Check 'Management Network' connectivity and support for multicast messaging. If problem consistently occurs after that and Host is reset, then contact next level of support or lock and replace failing host."
  Maintenance_Action: auto recover
  Inhibit_Alarms: False
  Alarm_Type: communication
  Probable_Cause: unknown
  Service_Affecting: True
  Suppression: True
  Management_Affecting_Severity: warning
  Degrade_Affecting_Severity: none
# 200.009 - 'Cluster-host Network' communication failure: major for the
# intermittent (degrade) case, critical for the persistent (failure) case.
200.009:
  Type: Alarm
  # Literal block scalar: every character is part of the text, so no
  # closing quote belongs at the end of the last line.
  Description: |-
    Degrade:
    <hostname> is experiencing intermittent 'Cluster-host Network' communication failures that have exceeded its lower alarming threshold.
    Failure:
    <hostname> is experiencing a persistent critical 'Cluster-host Network' communication failure.
  Entity_Instance_ID: host=<hostname>
  Severity:
    - critical
    - major
  Proposed_Repair_Action: "Check 'Cluster-host Network' connectivity and support for multicast messaging. If problem consistently occurs after that and Host is reset, then contact next level of support or lock and replace failing host."
  Maintenance_Action: auto recover
  Inhibit_Alarms: False
  Alarm_Type: communication
  Probable_Cause: unknown
  Service_Affecting: True
  Suppression: True
  Management_Affecting_Severity: warning
  Degrade_Affecting_Severity: none
# 200.006 - Process failure alarm: covers the Process Monitor daemon (pmond)
# itself as well as any monitored process, at critical/major/minor severity.
200.006:
  Type: Alarm
  Description: |-
    Main Process Monitor Daemon Failure (major):
    <hostname> 'Process Monitor' (pmond) process is not running or functioning properly. The system is trying to recover this process.
    Monitored Process Failure (critical/major/minor):
    Critical: <hostname> critical '<processname>' process has failed and could not be auto-recovered gracefully.
    Auto-recovery progression by host reboot is required and in progress.
    Major: <hostname> is degraded due to the failure of its '<processname>' process. Auto recovery of this major process is in progress.
    Minor: <hostname> '<processname>' process has failed. Auto recovery of this minor process is in progress.
    OR
    <hostname> '<processname>' process has failed. Manual recovery is required.
  Entity_Instance_ID: host=<hostname>.process=<processname>
  Severity:
    - critical
    - major
    - minor
  # Literal block scalar: every character is part of the text, so no
  # closing quote belongs at the end of the last line.
  Proposed_Repair_Action: |-
    If this alarm does not automatically clear after some time and continues to be asserted after Host is locked and unlocked then contact next level of support for root cause analysis and recovery.
    If problem consistently occurs after Host is locked and unlocked then contact next level of support for root cause analysis and recovery.
  Maintenance_Action:
    critical: auto-recover
    major: degrade
    minor:
  Inhibit_Alarms: False
  Alarm_Type: operational-violation
  Probable_Cause: unknown
  Service_Affecting:
    critical: True
    major: True
    minor: False
  Suppression: True
  Management_Affecting_Severity: warning
  Degrade_Affecting_Severity: major
# 200.006: // NOTE using duplicate ID of a completely analogous Alarm for this
# Type: Log
# Description: |-
# Main Process Monitor Daemon Failure (major)
# <hostname> 'Process Monitor' (pmond) process is not running or functioning properly.
# The system is trying to recover this process.
#
# Monitored Process Failure (critical/major/minor)
# critical: <hostname> critical '<processname>' process has failed and could not be auto-recovered gracefully.
# Auto-recovery progression by host reboot is required and in progress.
# major: <hostname> is degraded due to the failure of its '<processname>' process. Auto recovery of this major process is in progress.
# minor: <hostname> '<processname>' process has failed. Auto recovery of this minor process is in progress.
# OR
# <hostname> '<processname>' process has failed. Manual recovery is required.
# Entity_Instance_ID: host=<hostname>.process=<process-name>
# Severity: minor
# Alarm_Type: other
# Probable_Cause: unspecified-reason
# Service_Affecting: True
# 200.007 - Sensor out-of-tolerance alarm, raised per host sensor; the
# description, maintenance action and service impact vary by severity.
200.007:
  Type: Alarm
  Description:
    critical: "Host is degraded due to a 'critical' out-of-tolerance reading from the '<sensorname>' sensor"
    major: "Host is degraded due to a 'major' out-of-tolerance reading from the '<sensorname>' sensor"
    minor: "Host is reporting a 'minor' out-of-tolerance reading from the '<sensorname>' sensor"
  Entity_Instance_ID: host=<hostname>.sensor=<sensorname>
  Severity:
    - critical
    - major
    - minor
  Proposed_Repair_Action: >-
    If problem consistently occurs after Host is power cycled and or reset,
    contact next level of support or lock and replace failing host.
  Maintenance_Action:
    critical: degrade
    major: degrade
    minor: auto-recover (polling)
  Inhibit_Alarms:
  Alarm_Type: operational-violation
  Probable_Cause: unspecified-reason
  Service_Affecting:
    critical: True
    major: False
    minor: False
  Suppression: True
  Management_Affecting_Severity: none
  Degrade_Affecting_Severity: critical
# 200.014 - Hardware Monitor could not load, configure and monitor one or
# more hardware sensors on the host.
200.014:
  Type: Alarm
  Description: 'The Hardware Monitor was unable to load, configure and monitor one or more hardware sensors.'
  Entity_Instance_ID: host=<hostname>
  Severity: minor
  Proposed_Repair_Action: >-
    Check Board Management Controller provisioning. Try reprovisioning the
    BMC. If problem persists try power cycling the host and then the entire
    server including the BMC power. If problem persists then contact next
    level of support.
  # Lowercase 'none' is the spelling used for "no maintenance action".
  Maintenance_Action: none
  Inhibit_Alarms: False
  Alarm_Type: operational-violation
  Probable_Cause: unknown
  Service_Affecting: False
  Suppression: True
  Management_Affecting_Severity: none
  Degrade_Affecting_Severity: none
# 200.015 - One or more sensor groups could not be read from the host's
# board management controller.
200.015:
  Type: Alarm
  Description: "Unable to read one or more sensor groups from this host's board management controller"
  Entity_Instance_ID: host=<hostname>
  Severity: major
  Proposed_Repair_Action: >-
    Check board management connectivity and try rebooting the board
    management controller. If problem persists contact next level of support
    or lock and replace failing host.
  # Lowercase 'none' is the spelling used for "no maintenance action".
  Maintenance_Action: none
  Inhibit_Alarms: False
  Alarm_Type: operational-violation
  Probable_Cause: unknown
  Service_Affecting: False
  Suppression: False
  Management_Affecting_Severity: none
  Degrade_Affecting_Severity: none
# 200.020 - Host event logs. Description and Entity_Instance_ID are parallel
# lists: the item at each position describes one log variant.
200.020:
  Type: Log
  Description:
    - "<hostname> has been 'discovered' on the network"
    - "<hostname> has been 'added' to the system"
    - "<hostname> has 'entered' multi-node failure avoidance"
    - "<hostname> has 'exited' multi-node failure avoidance"
  Entity_Instance_ID:
    - host=<hostname>.event=discovered
    - host=<hostname>.event=add
    - host=<hostname>.event=mnfa_enter
    - host=<hostname>.event=mnfa_exit
  Severity: warning
  Alarm_Type: other
  Probable_Cause: unspecified-reason
  Service_Affecting: True
# 200.021 - Host command logs (BMC provisioning and manual requests).
# Description and Entity_Instance_ID are parallel lists: the item at each
# position describes one log variant.
200.021:
  Type: Log
  Description:
    - "<hostname> board management controller has been 'provisioned'"
    - "<hostname> board management controller has been 're-provisioned'"
    - "<hostname> board management controller has been 'de-provisioned'"
    - "<hostname> manual 'unlock' request"
    - "<hostname> manual 'reboot' request"
    - "<hostname> manual 'reset' request"
    - "<hostname> manual 'power-off' request"
    - "<hostname> manual 'power-on' request"
    - "<hostname> manual 'reinstall' request"
    - "<hostname> manual 'force-lock' request"
    - "<hostname> manual 'delete' request"
    - "<hostname> manual 'controller switchover' request"
  Entity_Instance_ID:
    - host=<hostname>.command=provision
    - host=<hostname>.command=reprovision
    - host=<hostname>.command=deprovision
    - host=<hostname>.command=unlock
    - host=<hostname>.command=reboot
    - host=<hostname>.command=reset
    - host=<hostname>.command=power-off
    - host=<hostname>.command=power-on
    - host=<hostname>.command=reinstall
    - host=<hostname>.command=force-lock
    - host=<hostname>.command=delete
    - host=<hostname>.command=swact
  Severity: warning
  Alarm_Type: other
  Probable_Cause: unspecified-reason
  Service_Affecting: False
# 200.022 - Host state/status change logs. Description and
# Entity_Instance_ID are parallel lists: the item at each position describes
# one log variant.
200.022:
  Type: Log
  Description:
    - "<hostname> is now 'disabled'"
    - "<hostname> is now 'enabled'"
    - "<hostname> is now 'online'"
    - "<hostname> is now 'offline'"
    - "<hostname> is 'disabled-failed' to the system"
    - "<hostname> reinstall failed"
    - "<hostname> reinstall completed successfully"
  Entity_Instance_ID:
    - host=<hostname>.state=disabled
    - host=<hostname>.state=enabled
    - host=<hostname>.status=online
    - host=<hostname>.status=offline
    - host=<hostname>.status=failed
    - host=<hostname>.status=reinstall-failed
    - host=<hostname>.status=reinstall-complete
  Severity: warning
  Alarm_Type: other
  Probable_Cause: unspecified-reason
  Service_Affecting: True
#---------------------------------------------------------------------------
# BACKUP AND RESTORE
#---------------------------------------------------------------------------
# 210.001 - System backup in progress, raised against host=controller.
210.001:
  Type: Alarm
  Description: 'System Backup in progress.'
  Entity_Instance_ID: host=controller
  Severity: minor
  Proposed_Repair_Action: 'No action required.'
  Maintenance_Action:
  Inhibit_Alarms:
  Alarm_Type: operational-violation
  Probable_Cause: unspecified-reason
  Service_Affecting: False
  Suppression: False
  Management_Affecting_Severity: warning
  Degrade_Affecting_Severity: none
#---------------------------------------------------------------------------
# SYSTEM CONFIGURATION
#---------------------------------------------------------------------------
# 250.001 - Host configuration is out-of-date; cleared by lock/unlock.
250.001:
  Type: Alarm
  Description: '<hostname> Configuration is out-of-date.'
  Entity_Instance_ID: host=<hostname>
  Severity: major
  Proposed_Repair_Action: 'Administratively lock and unlock <hostname> to update config.'
  Maintenance_Action:
  Inhibit_Alarms:
  Alarm_Type: operational-violation
  Probable_Cause: unspecified-reason
  Service_Affecting: True
  Suppression: False
  Management_Affecting_Severity: warning
  Degrade_Affecting_Severity: none
# 250.002 - Ceph cache tiering configuration is out-of-date, raised against
# a cluster entity.
250.002:
  Type: Alarm
  Description: '<hostname> Ceph cache tiering configuration is out-of-date.'
  Entity_Instance_ID: cluster=<dist-fs-uuid>
  Severity: major
  Proposed_Repair_Action: 'Apply Ceph service parameter settings.'
  Maintenance_Action:
  Inhibit_Alarms:
  Alarm_Type: operational-violation
  Probable_Cause: unspecified-reason
  Service_Affecting: False
  Suppression: False
  Management_Affecting_Severity: warning
  Degrade_Affecting_Severity: none
#---------------------------------------------------------------------------
# VM Compute Services
#---------------------------------------------------------------------------
# 270.001 - Compute services failure on a host, with an optional reason.
270.001:
  Type: Alarm
  Description: 'Host <host_name> compute services failure[, reason = <reason_text>]'
  Entity_Instance_ID: host=<host_name>.services=compute
  Severity: critical
  Proposed_Repair_Action: >-
    Wait for host services recovery to complete; if problem persists contact
    next level of support
  Maintenance_Action:
  Inhibit_Alarms:
  Alarm_Type: processing-error
  Probable_Cause: unspecified-reason
  Service_Affecting: True
  Suppression: True
  Management_Affecting_Severity: warning
  Degrade_Affecting_Severity: none
# 270.101 - Log counterpart of the host compute services failure text.
# NOTE(review): the description is about a host, yet Entity_Instance_ID is
# tenant/instance scoped - confirm the intended entity.
270.101:
  Type: Log
  Description: 'Host <host_name> compute services failure[, reason = <reason_text>]'
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: False
# 270.102 - Log: host compute services enabled.
# NOTE(review): the description is about a host, yet Entity_Instance_ID is
# tenant/instance scoped - confirm the intended entity.
270.102:
  Type: Log
  Description: 'Host <host_name> compute services enabled'
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: False
# 270.103 - Log: host compute services disabled.
# NOTE(review): the description is about a host, yet Entity_Instance_ID is
# tenant/instance scoped - confirm the intended entity.
270.103:
  Type: Log
  Description: 'Host <host_name> compute services disabled'
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: False
# 275.001 - Log: host hypervisor administrative/operational state change.
# NOTE(review): the description is about a host, yet Entity_Instance_ID is
# tenant/instance scoped - confirm the intended entity.
275.001:
  Type: Log
  Description: 'Host <host_name> hypervisor is now <administrative_state>-<operational_state>'
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: False
#---------------------------------------------------------------------------
# DISTRIBUTED CLOUD
#---------------------------------------------------------------------------
# 280.001 - Subcloud is offline.
280.001:
  Type: Alarm
  Description: '<subcloud> is offline'
  Entity_Instance_ID: subcloud=<subcloud>
  Severity: critical
  Proposed_Repair_Action: >-
    Wait for subcloud to become online; if problem persists contact next
    level of support
  Maintenance_Action:
  Inhibit_Alarms:
  Alarm_Type: communication
  Probable_Cause: loss-of-signal
  Service_Affecting: False
  Suppression: False
  Management_Affecting_Severity: warning
  Degrade_Affecting_Severity: none
# 280.002 - A subcloud resource's sync_status is out-of-sync.
# Entity_Instance_ID is kept as a one-item list.
280.002:
  Type: Alarm
  Description: '<subcloud> <resource> sync_status is out-of-sync'
  Entity_Instance_ID:
    - subcloud=<subcloud>.resource=<compute | network | platform | volumev2>
  Severity: major
  Proposed_Repair_Action: 'If problem persists contact next level of support'
  Maintenance_Action:
  Inhibit_Alarms:
  Alarm_Type: other
  Probable_Cause: application-subsystem-failure
  Service_Affecting: False
  Suppression: False
  Management_Affecting_Severity: warning
  Degrade_Affecting_Severity: none
#---------------------------------------------------------------------------
# NETWORK
#---------------------------------------------------------------------------
# 300.001 - 'Data' port failure, raised against a host port (by UUID).
300.001:
  Type: Alarm
  Description: "'Data' Port failed."
  Entity_Instance_ID: host=<hostname>.port=<port-uuid>
  Severity: major
  Proposed_Repair_Action: >-
    Check cabling and far-end port configuration and status on adjacent
    equipment.
  Maintenance_Action:
  Inhibit_Alarms:
  Alarm_Type: equipment
  Probable_Cause: loss-of-signal
  Service_Affecting: True
  Suppression: False
  Management_Affecting_Severity: warning
  Degrade_Affecting_Severity: none
# 300.002 - 'Data' interface degraded or failed, raised against a host
# interface (by UUID).
300.002:
  Type: Alarm
  Description: |-
    'Data' Interface degraded.
    OR
    'Data' Interface failed.
  Entity_Instance_ID: host=<hostname>.interface=<if-uuid>
  Severity:
    - critical
    - major
  Proposed_Repair_Action: >-
    Check cabling and far-end port configuration and status on adjacent
    equipment.
  Maintenance_Action:
  Inhibit_Alarms:
  Alarm_Type: equipment
  Probable_Cause: loss-of-signal
  Service_Affecting: True
  Suppression: False
  Management_Affecting_Severity: warning
  Degrade_Affecting_Severity: critical
# 300.003 - Networking agent on a host is not responding.
300.003:
  Type: Alarm
  Description: 'Networking Agent not responding.'
  Entity_Instance_ID: host=<hostname>.agent=<agent-uuid>
  Severity: major
  Proposed_Repair_Action: >-
    If condition persists, attempt to clear issue by administratively locking
    and unlocking the Host.
  Maintenance_Action:
  Inhibit_Alarms:
  Alarm_Type: operational-violation
  Probable_Cause: underlying-resource-unavailable
  Service_Affecting: True
  Suppression: False
  Management_Affecting_Severity: warning
  Degrade_Affecting_Severity: none
# 300.004 - No enabled compute host has connectivity to a provider network.
300.004:
  Type: Alarm
  Description: 'No enabled compute host with connectivity to provider network.'
  Entity_Instance_ID: service=networking.providernet=<pnet-uuid>
  Severity: major
  Proposed_Repair_Action: 'Enable compute hosts with required provider network connectivity.'
  Maintenance_Action:
  Inhibit_Alarms:
  Alarm_Type: operational-violation
  Probable_Cause: underlying-resource-unavailable
  Service_Affecting: True
  Suppression: False
  Management_Affecting_Severity: warning
  Degrade_Affecting_Severity: none
# Alarm: communication failure over a provider network, reported per host.
300.005:
    Type: Alarm
    Description: |-
        Communication failure detected over provider network x% for ranges y% on host z%.
        OR
        Communication failure detected over provider network x% on host z%.
    Entity_Instance_ID: host=<hostname>.service=networking.providernet=<pnet-uuid>
    Severity: major
    Proposed_Repair_Action: Check neighbour switch port VLAN assignments.
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: operational-violation
    Probable_Cause: underlying-resource-unavailable
    Service_Affecting: True
    Suppression: False
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# Alarm: ML2 driver agent reachability, responsiveness, authentication or sync problem.
300.010:
    Type: Alarm
    Description: |-
        ML2 Driver Agent non-reachable
        OR
        ML2 Driver Agent reachable but non-responsive
        OR
        ML2 Driver Agent authentication failure
        OR
        ML2 Driver Agent is unable to sync Neutron database
    Entity_Instance_ID: host=<hostname>.ml2driver=<driver>
    Severity: major
    Proposed_Repair_Action: "Monitor and if condition persists, contact next level of support."
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: processing-error
    Probable_Cause: underlying-resource-unavailable
    Service_Affecting: True
    Suppression: True
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# Alarm: connection to an Openflow controller has failed.
300.012:
    Type: Alarm
    Description: "Openflow Controller connection failed."
    Entity_Instance_ID: host=<hostname>.openflow-controller=<uri>
    Severity: major
    Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment.
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: equipment
    Probable_Cause: loss-of-signal
    Service_Affecting: True
    Suppression: False
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: critical
# Alarm: Openflow controller connections missing or disconnected for a network.
300.013:
    Type: Alarm
    Description: |-
        No active Openflow controller connections found for this network.
        OR
        One or more Openflow controller connections in disconnected state for this network.
    Entity_Instance_ID: host=<hostname>.openflow-network=<name>
    Severity: [critical, major]
    Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment.
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: equipment
    Probable_Cause: loss-of-signal
    Service_Affecting: True
    Suppression: False
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: critical
# Alarm: connection to an OVSDB manager has failed.
300.014:
    Type: Alarm
    Description: "OVSDB Manager connection failed."
    Entity_Instance_ID: host=<hostname>.sdn-controller=<uuid>
    Severity: major
    Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment.
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: equipment
    Probable_Cause: loss-of-signal
    Service_Affecting: True
    Suppression: False
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: critical
# Alarm: a host has no active OVSDB connections.
300.015:
    Type: Alarm
    Description: "No active OVSDB connections found."
    Entity_Instance_ID: host=<hostname>
    Severity: critical
    Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment.
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: equipment
    Probable_Cause: loss-of-signal
    Service_Affecting: True
    Suppression: False
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: critical
# Alarm: a dynamic routing agent lost connectivity to a BGP peer.
# Entity_Instance_ID components are '.'-separated, as in every other record.
300.016:
    Type: Alarm
    Description: "Dynamic routing agent x% lost connectivity to peer y%."
    Entity_Instance_ID: host=<hostname>.agent=<agent-uuid>.bgp-peer=<bgp-peer>
    Severity: major
    Proposed_Repair_Action: If condition persists, fix connectivity to peer.
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: operational-violation
    Probable_Cause: loss-of-signal
    Service_Affecting: True
    Suppression: True
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
#---------------------------------------------------------------------------
# HIGH AVAILABILITY
#---------------------------------------------------------------------------
# Alarm: service group failure, degrade or warning on a host.
400.001:
    Type: Alarm
    Description: |-
        Service group failure; <list of affected services>.
        OR
        Service group degraded; <list of affected services>.
        OR
        Service group warning; <list of affected services>.
    Entity_Instance_ID: service_domain=<domain_name>.service_group=<group_name>.host=<hostname>
    Severity: [critical, major, minor]
    Proposed_Repair_Action: Contact next level of support.
    Maintenance_Action:
    Inhibit_Alarms: False
    Alarm_Type: processing-error
    Probable_Cause: underlying-resource-unavailable
    Service_Affecting: True
    Suppression: True
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: major
# Alarm: a service group has lost standby or active redundancy.
# The four alternatives cover "none available" and "fewer than expected"
# for standby members and for active members respectively.
400.002:
    Type: Alarm
    Description: |-
        Service group loss of redundancy; expected <num> standby member<s> but no standby members available.
        OR
        Service group loss of redundancy; expected <num> standby member<s> but only <num> standby member<s> available.
        OR
        Service group loss of redundancy; expected <num> active member<s> but no active members available.
        OR
        Service group loss of redundancy; expected <num> active member<s> but only <num> active member<s> available.
    Entity_Instance_ID: service_domain=<domain_name>.service_group=<group_name>
    Severity: major
    Proposed_Repair_Action: "Bring a controller node back in to service, otherwise contact next level of support."
    Maintenance_Action:
    Inhibit_Alarms: False
    Alarm_Type: processing-error
    Probable_Cause: underlying-resource-unavailable
    Service_Affecting: True
    Suppression: True
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# Alarm: license key missing, expired, invalid, or evaluation period running out.
400.003:
    Type: Alarm
    Description: |-
        License key is not installed; a valid license key is required for operation.
        OR
        License key has expired or is invalid; a valid license key is required for operation.
        OR
        Evaluation license key will expire on <date>; there are <num_days> days remaining in this evaluation.
        OR
        Evaluation license key will expire on <date>; there is only 1 day remaining in this evaluation.
    Entity_Instance_ID: host=<hostname>
    Severity: critical
    Proposed_Repair_Action: Contact next level of support to obtain a new license key.
    Maintenance_Action:
    Inhibit_Alarms: False
    Alarm_Type: processing-error
    Probable_Cause: key-expired
    Service_Affecting: True
    Suppression: False
    Management_Affecting_Severity: critical
    Degrade_Affecting_Severity: none
# 400.004: // NOTE Removed
# Type: Alarm
# Description: Service group software modification detected; <list of affected files>.
# Entity_Instance_ID: host=<hostname>
# Severity: major
# Proposed_Repair_Action: Contact next level of support.
# Maintenance_Action:
# Inhibit_Alarms: False
# Alarm_Type: processing-error
# Probable_Cause: software-program-error
# Service_Affecting: True
# Suppression: False
# Alarm: communication failure with the peer over a mgmt, oam or cluster-host port.
400.005:
    Type: Alarm
    Description: |-
        Communication failure detected with peer over port <linux-ifname>.
        OR
        Communication failure detected with peer over port <linux-ifname> within the last 30 seconds.
    Entity_Instance_ID: host=<hostname>.network=<mgmt | oam | cluster-host>
    Severity: major
    Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment.
    Maintenance_Action:
    Inhibit_Alarms: False
    Alarm_Type: communication
    Probable_Cause: underlying-resource-unavailable
    Service_Affecting: True
    Suppression: True
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
#---------------------------------------------------------------------------
# SM
#---------------------------------------------------------------------------
# Log: service group state change on a host.
401.001:
    Type: Log
    Description: Service group <group> state change from <state> to <state> on host <host_name>
    Entity_Instance_ID: service_domain=<domain>.service_group=<group>.host=<host_name>
    Severity: critical
    Alarm_Type: processing-error
    Probable_Cause: unspecified-reason
    Service_Affecting: True
# Log: service group loss of standby or active redundancy.
401.002:
    Type: Log
    Description: |-
        Service group <group> loss of redundancy; expected <X> standby member but no standby members available
        or
        Service group <group> loss of redundancy; expected <X> standby member but only <Y> standby member(s) available
        or
        Service group <group> has no active members available; expected <X> active member(s)
        or
        Service group <group> loss of redundancy; expected <X> active member(s) but only <Y> active member(s) available
    Entity_Instance_ID: service_domain=<domain>.service_group=<group>
    Severity: critical
    Alarm_Type: processing-error
    Probable_Cause: unspecified-reason
    Service_Affecting: True
# Log: license key status (expired/invalid, evaluation expiry, or valid).
401.003:
    Type: Log
    Description: |-
        License key has expired or is invalid
        or
        Evaluation license key will expire on <date>
        or
        License key is valid
    Entity_Instance_ID: host=<host_name>
    Severity: critical
    Alarm_Type: processing-error
    Probable_Cause: unspecified-reason
    Service_Affecting: True
# Log: peer communication failed or was established over a port.
401.005:
    Type: Log
    Description: |-
        Communication failure detected with peer over port <port> on host <host name>
        or
        Communication failure detected with peer over port <port> on host <host name> within the last <X> seconds
        or
        Communication established with peer over port <port> on host <host name>
    Entity_Instance_ID: host=<host_name>.network=<network>
    Severity: critical
    Alarm_Type: processing-error
    Probable_Cause: unspecified-reason
    Service_Affecting: True
# Log: swact or swact-force on a host.
401.007:
    Type: Log
    Description: Swact or swact-force
    Entity_Instance_ID: host=<host_name>
    Severity: critical
    Alarm_Type: processing-error
    Probable_Cause: unspecified-reason
    Service_Affecting: True
#---------------------------------------------------------------------------
# SECURITY
#---------------------------------------------------------------------------
# Alarm: TPM initialization failed on a host.
500.100:
    Type: Alarm
    Description: TPM initialization failed on host.
    Entity_Instance_ID: host=<hostname>
    Severity: major
    Proposed_Repair_Action: reinstall HTTPS certificate; if problem persists contact next level of support.
    Maintenance_Action: degrade
    Inhibit_Alarms:
    Alarm_Type: equipment
    Probable_Cause: procedural-error
    Service_Affecting: True
    Suppression: False
    Management_Affecting_Severity: none
    Degrade_Affecting_Severity: none
# Alarm: the developer patch certificate is enabled.
500.101:
    Type: Alarm
    Description: Developer patch certificate enabled.
    Entity_Instance_ID: host=controller
    Severity: critical
    Proposed_Repair_Action: Reinstall system to disable developer certificate and remove untrusted patches.
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: operational-violation
    Probable_Cause: unspecified-reason
    Service_Affecting: False
    Suppression: False
    Management_Affecting_Severity: none
    Degrade_Affecting_Severity: none
# Log: IMA appraisal failure for a service on a host.
# The bracketed "[, reason = ...]" suffix is the optional reason text, written
# the same way as in the other log descriptions.
500.500:
    Type: Log
    Description: "Host <host_name> has IMA Appraisal failure for service <service> when executing <file>[, reason = <reason_text>]"
    Entity_Instance_ID: host=<hostname>.service=<service>
    Severity: major
    Alarm_Type: integrity-violation
    Probable_Cause: information-modification-detected
    Service_Affecting: False
#---------------------------------------------------------------------------
# VM
#---------------------------------------------------------------------------
# Alarm: an instance has failed on its host or has failed to schedule.
700.001:
    Type: Alarm
    Description: |-
        Instance <instance_name> owned by <tenant_name> has failed on host <host_name>
        OR
        Instance <instance_name> owned by <tenant_name> has failed to schedule
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Proposed_Repair_Action: The system will attempt recovery; no repair action required
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: processing-error
    Probable_Cause: software-error
    Service_Affecting: True
    Suppression: True
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# Alarm: an instance is paused.
700.002:
    Type: Alarm
    Description: Instance <instance_name> owned by <tenant_name> is paused on host <host_name>
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Proposed_Repair_Action: Unpause the instance
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: processing-error
    Probable_Cause: procedural-error
    Service_Affecting: True
    Suppression: True
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# Alarm: an instance is suspended.
700.003:
    Type: Alarm
    Description: Instance <instance_name> owned by <tenant_name> is suspended on host <host_name>
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Proposed_Repair_Action: Resume the instance
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: processing-error
    Probable_Cause: procedural-error
    Service_Affecting: True
    Suppression: True
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# Alarm: an instance is stopped.
700.004:
    Type: Alarm
    Description: Instance <instance_name> owned by <tenant_name> is stopped on host <host_name>
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Proposed_Repair_Action: Start the instance
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: processing-error
    Probable_Cause: procedural-error
    Service_Affecting: True
    Suppression: True
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# Alarm: an instance is rebooting.
700.005:
    Type: Alarm
    Description: Instance <instance_name> owned by <tenant_name> is rebooting on host <host_name>
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Proposed_Repair_Action: Wait for reboot to complete; if problem persists contact next level of support
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: processing-error
    Probable_Cause: unspecified-reason
    Service_Affecting: True
    Suppression: True
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# Alarm: an instance is rebuilding.
700.006:
    Type: Alarm
    Description: Instance <instance_name> owned by <tenant_name> is rebuilding on host <host_name>
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Proposed_Repair_Action: Wait for rebuild to complete; if problem persists contact next level of support
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: processing-error
    Probable_Cause: underlying-resource-unavailable
    Service_Affecting: True
    Suppression: True
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# Alarm: an instance is evacuating from its host.
700.007:
    Type: Alarm
    Description: Instance <instance_name> owned by <tenant_name> is evacuating from host <host_name>
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Proposed_Repair_Action: Wait for evacuate to complete; if problem persists contact next level of support
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: processing-error
    Probable_Cause: underlying-resource-unavailable
    Service_Affecting: True
    Suppression: True
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# Alarm: an instance is live migrating from its host.
700.008:
    Type: Alarm
    Description: Instance <instance_name> owned by <tenant_name> is live migrating from host <host_name>
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: warning
    Proposed_Repair_Action: Wait for live migration to complete; if problem persists contact next level of support
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: processing-error
    Probable_Cause: unspecified-reason
    Service_Affecting: True
    Suppression: True
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# Alarm: an instance is cold migrating from its host.
700.009:
    Type: Alarm
    Description: Instance <instance_name> owned by <tenant_name> is cold migrating from host <host_name>
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Proposed_Repair_Action: Wait for cold migration to complete; if problem persists contact next level of support
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: processing-error
    Probable_Cause: unspecified-reason
    Service_Affecting: True
    Suppression: True
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# Alarm: a cold-migrated instance is waiting for confirmation.
700.010:
    Type: Alarm
    Description: Instance <instance_name> owned by <tenant_name> has been cold-migrated to host <host_name> waiting for confirmation
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Proposed_Repair_Action: Confirm or revert cold-migrate of instance
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: processing-error
    Probable_Cause: unspecified-reason
    Service_Affecting: True
    Suppression: True
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# Alarm: an instance is reverting a cold migrate.
700.011:
    Type: Alarm
    Description: Instance <instance_name> owned by <tenant_name> is reverting cold migrate to host <host_name>
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Proposed_Repair_Action: "Wait for cold migration revert to complete; if problem persists contact next level of support"
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: other
    Probable_Cause: unspecified-reason
    Service_Affecting: True
    Suppression: True
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# Alarm: an instance is resizing.
700.012:
    Type: Alarm
    Description: Instance <instance_name> owned by <tenant_name> is resizing on host <host_name>
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Proposed_Repair_Action: Wait for resize to complete; if problem persists contact next level of support
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: processing-error
    Probable_Cause: unspecified-reason
    Service_Affecting: True
    Suppression: True
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# Alarm: a resized instance is waiting for confirmation.
700.013:
    Type: Alarm
    Description: Instance <instance_name> owned by <tenant_name> has been resized on host <host_name> waiting for confirmation
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Proposed_Repair_Action: Confirm or revert resize of instance
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: processing-error
    Probable_Cause: unspecified-reason
    Service_Affecting: True
    Suppression: True
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# Alarm: an instance is reverting a resize.
700.014:
    Type: Alarm
    Description: Instance <instance_name> owned by <tenant_name> is reverting resize on host <host_name>
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Proposed_Repair_Action: "Wait for resize revert to complete; if problem persists contact next level of support"
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: other
    Probable_Cause: unspecified-reason
    Service_Affecting: True
    Suppression: True
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# Alarm: Guest Heartbeat has not been established for an instance.
700.015:
    Type: Alarm
    Description: Guest Heartbeat not established for instance <instance_name> owned by <tenant_name> on host <host_name>
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: major
    Proposed_Repair_Action: "Verify that the instance is running the Guest-Client daemon, or disable Guest Heartbeat for the instance if no longer needed, otherwise contact next level of support"
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: communication
    Probable_Cause: procedural-error
    Service_Affecting: True
    Suppression: True
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# Alarm: the vim subsystem is in Multi-Node Recovery Mode.
700.016:
    Type: Alarm
    Description: Multi-Node Recovery Mode
    Entity_Instance_ID: subsystem=vim
    Severity: minor
    Proposed_Repair_Action: "Wait for the system to exit out of this mode"
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: True
    Suppression: True
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# Alarm: a server group policy was not satisfied.
700.017:
    Type: Alarm
    Description: Server group <server_group_name> <policy> policy was not satisfied
    Entity_Instance_ID: server-group=<server-group-uuid>
    Severity: minor
    Proposed_Repair_Action: "Migrate instances in an attempt to satisfy the policy; if problem persists contact next level of support"
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: processing-error
    Probable_Cause: procedural-error
    Service_Affecting: True
    Suppression: True
    Management_Affecting_Severity: none
    Degrade_Affecting_Severity: none
# Log: instance is enabled on a host.
700.101:
    Type: Log
    Description: Instance <instance_name> is enabled on host <host_name>
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance has failed or has failed to schedule.
# A literal block scalar keeps the two alternatives on separate lines; a plain
# scalar continued onto a second line would be folded into one sentence.
700.102:
    Type: Log
    Description: |-
        Instance <instance_name> owned by <tenant_name> has failed[, reason = <reason_text>]
        OR
        Instance <instance_name> owned by <tenant_name> has failed to schedule[, reason = <reason_text>]
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance create issued.
700.103:
    Type: Log
    Description: Create issued <by <tenant_name>|by the system> against <instance_name> owned by <tenant_name>
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance creation in progress.
700.104:
    Type: Log
    Description: Creating instance <instance_name> owned by <tenant_name>
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance create rejected.
700.105:
    Type: Log
    Description: "Create rejected for instance <instance_name>[, reason = <reason_text>]"
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance create cancelled.
700.106:
    Type: Log
    Description: "Create cancelled for instance <instance_name>[, reason = <reason_text>]"
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance create failed.
700.107:
    Type: Log
    Description: "Create failed for instance <instance_name>[, reason = <reason_text>]"
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance has been created.
700.108:
    Type: Log
    Description: Instance <instance_name> owned by <tenant_name> has been created
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance delete issued.
700.109:
    Type: Log
    Description: "Delete issued <by <tenant_name>|by the system> against instance <instance_name> owned by <tenant_name> on host <host_name>[, reason = <reason_text>]"
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance deletion in progress.
700.110:
    Type: Log
    Description: Deleting instance <instance_name> owned by <tenant_name>
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance delete rejected.
700.111:
    Type: Log
    Description: "Delete rejected for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance delete cancelled.
700.112:
    Type: Log
    Description: "Delete cancelled for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance delete failed.
700.113:
    Type: Log
    Description: "Delete failed for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance has been deleted.
700.114:
    Type: Log
    Description: Deleted instance <instance_name> owned by <tenant_name>
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance pause issued.
700.115:
    Type: Log
    Description: "Pause issued <by <tenant_name>|by the system> against instance <instance_name> owned by <tenant_name> on host <host_name>[, reason = <reason_text>]"
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance pause in progress.
700.116:
    Type: Log
    Description: Pause inprogress for instance <instance_name> on host <host_name>
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance pause rejected.
700.117:
    Type: Log
    Description: "Pause rejected for instance <instance_name> enabled on host <host_name>[, reason = <reason_text>]"
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance pause cancelled.
700.118:
    Type: Log
    Description: "Pause cancelled for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance pause failed.
700.119:
    Type: Log
    Description: "Pause failed for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance pause complete.
700.120:
    Type: Log
    Description: Pause complete for instance <instance_name> now paused on host <host_name>
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance unpause issued.
700.121:
    Type: Log
    Description: "Unpause issued <by <tenant_name>|by the system> against instance <instance_name> owned by <tenant_name> on host <host_name>[, reason = <reason_text>]"
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance unpause in progress.
700.122:
    Type: Log
    Description: Unpause inprogress for instance <instance_name> on host <host_name>
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance unpause rejected.
700.123:
    Type: Log
    Description: "Unpause rejected for instance <instance_name> paused on host <host_name>[, reason = <reason_text>]"
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance unpause cancelled.
700.124:
    Type: Log
    Description: "Unpause cancelled for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance unpause failed.
700.125:
    Type: Log
    Description: "Unpause failed for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance unpause complete.
700.126:
    Type: Log
    Description: Unpause complete for instance <instance_name> now enabled on host <host_name>
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance suspend issued.
700.127:
    Type: Log
    Description: "Suspend issued <by <tenant_name>|by the system> against instance <instance_name> owned by <tenant_name> on host <host_name>[, reason = <reason_text>]"
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance suspend in progress.
700.128:
    Type: Log
    Description: Suspend inprogress for instance <instance_name> on host <host_name>
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance suspend rejected.
700.129:
    Type: Log
    Description: "Suspend rejected for instance <instance_name> enabled on host <host_name>[, reason = <reason_text>]"
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance suspend cancelled.
700.130:
    Type: Log
    Description: "Suspend cancelled for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance suspend failed.
700.131:
    Type: Log
    Description: "Suspend failed for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance suspend complete.
700.132:
    Type: Log
    Description: Suspend complete for instance <instance_name> now suspended on host <host_name>
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance resume issued.
700.133:
    Type: Log
    Description: "Resume issued <by <tenant_name>|by the system> against instance <instance_name> owned by <tenant_name> on host <host_name>[, reason = <reason_text>]"
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance resume in progress.
700.134:
    Type: Log
    Description: Resume inprogress for instance <instance_name> on host <host_name>
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance resume rejected.
700.135:
    Type: Log
    Description: "Resume rejected for instance <instance_name> suspended on host <host_name>[, reason = <reason_text>]"
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance resume cancelled.
700.136:
    Type: Log
    Description: "Resume cancelled for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance resume failed.
700.137:
    Type: Log
    Description: "Resume failed for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance resume complete.
700.138:
    Type: Log
    Description: Resume complete for instance <instance_name> now enabled on host <host_name>
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance start issued.
700.139:
    Type: Log
    Description: "Start issued <by <tenant_name>|by the system> against instance <instance_name> owned by <tenant_name> on host <host_name>[, reason = <reason_text>]"
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance start in progress.
700.140:
    Type: Log
    Description: Start inprogress for instance <instance_name> on host <host_name>
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance start rejected.
700.141:
    Type: Log
    Description: "Start rejected for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance start cancelled.
700.142:
    Type: Log
    Description: "Start cancelled for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Log: instance start failed.
700.143:
    Type: Log
    Description: "Start failed for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
700.144:
Type: Log
Description: Start complete for instance <instance_name> now enabled on host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: False
# Customer logs for an instance Stop request:
# issued, inprogress, rejected, cancelled, failed, complete.
# Stop issued by a tenant, by the system, or by the instance itself.
700.145:
    Type: Log
    Description: 'Stop issued <by <tenant_name>|by the system|by the instance> against instance <instance_name> owned by <tenant_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Stop in progress.
700.146:
    Type: Log
    Description: 'Stop inprogress for instance <instance_name> on host <host_name>'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Stop rejected; the instance is reported as still enabled on the host.
700.147:
    Type: Log
    Description: 'Stop rejected for instance <instance_name> enabled on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Stop cancelled; an optional reason may be appended.
700.148:
    Type: Log
    Description: 'Stop cancelled for instance <instance_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Stop failed; an optional reason may be appended.
700.149:
    Type: Log
    Description: 'Stop failed for instance <instance_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Stop complete; the instance is reported as disabled on the host.
700.150:
    Type: Log
    Description: 'Stop complete for instance <instance_name> now disabled on host <host_name>'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Customer logs for an instance Live-Migrate request:
# issued, inprogress, rejected, cancelled, failed, complete.
# Live-Migrate issued, either by a tenant or by the system.
700.151:
    Type: Log
    Description: 'Live-Migrate issued <by <tenant_name>|by the system> against instance <instance_name> owned by <tenant_name> from host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Live-Migrate in progress away from the named host.
700.152:
    Type: Log
    Description: 'Live-Migrate inprogress for instance <instance_name> from host <host_name>'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Live-Migrate rejected; the text names the host the instance is now on.
700.153:
    Type: Log
    Description: 'Live-Migrate rejected for instance <instance_name> now on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Live-Migrate cancelled; the text names the host the instance is now on.
700.154:
    Type: Log
    Description: 'Live-Migrate cancelled for instance <instance_name> now on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Live-Migrate failed; the text names the host the instance is now on.
700.155:
    Type: Log
    Description: 'Live-Migrate failed for instance <instance_name> now on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Live-Migrate complete; the instance is reported as enabled on the host.
700.156:
    Type: Log
    Description: 'Live-Migrate complete for instance <instance_name> now enabled on host <host_name>'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Customer logs for an instance Cold-Migrate request:
# issued, inprogress, rejected, cancelled, failed, complete.
# Cold-Migrate issued, either by a tenant or by the system.
700.157:
    Type: Log
    Description: 'Cold-Migrate issued <by <tenant_name>|by the system> against instance <instance_name> owned by <tenant_name> from host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Cold-Migrate in progress away from the named host.
700.158:
    Type: Log
    Description: 'Cold-Migrate inprogress for instance <instance_name> from host <host_name>'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Cold-Migrate rejected; the text names the host the instance is now on.
700.159:
    Type: Log
    Description: 'Cold-Migrate rejected for instance <instance_name> now on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Cold-Migrate cancelled; the text names the host the instance is now on.
700.160:
    Type: Log
    Description: 'Cold-Migrate cancelled for instance <instance_name> now on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Cold-Migrate failed; the text names the host the instance is now on.
700.161:
    Type: Log
    Description: 'Cold-Migrate failed for instance <instance_name> now on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Cold-Migrate complete; the instance is reported as enabled on the host.
700.162:
    Type: Log
    Description: 'Cold-Migrate complete for instance <instance_name> now enabled on host <host_name>'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Customer logs for an instance Cold-Migrate-Confirm request:
# issued, inprogress, rejected, cancelled, failed, complete.
# Cold-Migrate-Confirm issued, either by a tenant or by the system.
700.163:
    Type: Log
    Description: 'Cold-Migrate-Confirm issued <by <tenant_name>|by the system> against instance <instance_name> owned by <tenant_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Cold-Migrate-Confirm in progress.
700.164:
    Type: Log
    Description: 'Cold-Migrate-Confirm inprogress for instance <instance_name> on host <host_name>'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Cold-Migrate-Confirm rejected; the instance is reported as enabled on the host.
700.165:
    Type: Log
    Description: 'Cold-Migrate-Confirm rejected for instance <instance_name> now enabled on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Cold-Migrate-Confirm cancelled; an optional reason may be appended.
700.166:
    Type: Log
    Description: 'Cold-Migrate-Confirm cancelled for instance <instance_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Cold-Migrate-Confirm failed; an optional reason may be appended.
700.167:
    Type: Log
    Description: 'Cold-Migrate-Confirm failed for instance <instance_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Cold-Migrate-Confirm complete; the instance is reported as enabled on the host.
700.168:
    Type: Log
    Description: 'Cold-Migrate-Confirm complete for instance <instance_name> enabled on host <host_name>'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Customer logs for an instance Cold-Migrate-Revert request:
# issued, inprogress, rejected, cancelled, failed, complete.
# Cold-Migrate-Revert issued, either by a tenant or by the system.
700.169:
    Type: Log
    Description: 'Cold-Migrate-Revert issued <by <tenant_name>|by the system> against instance <instance_name> owned by <tenant_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Cold-Migrate-Revert in progress away from the named host.
700.170:
    Type: Log
    Description: 'Cold-Migrate-Revert inprogress for instance <instance_name> from host <host_name>'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Cold-Migrate-Revert rejected; the text names the host the instance is now on.
700.171:
    Type: Log
    Description: 'Cold-Migrate-Revert rejected for instance <instance_name> now on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Cold-Migrate-Revert cancelled; an optional reason may be appended.
700.172:
    Type: Log
    Description: 'Cold-Migrate-Revert cancelled for instance <instance_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Cold-Migrate-Revert failed; an optional reason may be appended.
700.173:
    Type: Log
    Description: 'Cold-Migrate-Revert failed for instance <instance_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Cold-Migrate-Revert complete; the instance is reported as enabled on the host.
700.174:
    Type: Log
    Description: 'Cold-Migrate-Revert complete for instance <instance_name> now enabled on host <host_name>'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Customer logs for an instance Evacuate request:
# issued, evacuating, rejected, cancelled, failed, complete.
# Evacuate issued, either by a tenant or by the system.
700.175:
    Type: Log
    Description: 'Evacuate issued <by <tenant_name>|by the system> against instance <instance_name> owned by <tenant_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Evacuation under way from the named host.
700.176:
    Type: Log
    Description: 'Evacuating instance <instance_name> owned by <tenant_name> from host <host_name>'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Evacuate rejected; an optional reason may be appended.
700.177:
    Type: Log
    Description: 'Evacuate rejected for instance <instance_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Evacuate cancelled; an optional reason may be appended.
700.178:
    Type: Log
    Description: 'Evacuate cancelled for instance <instance_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Evacuate failed; an optional reason may be appended.
700.179:
    Type: Log
    Description: 'Evacuate failed for instance <instance_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Evacuate complete; the instance is reported as enabled on the host.
700.180:
    Type: Log
    Description: 'Evacuate complete for instance <instance_name> now enabled on host <host_name>'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Customer logs for an instance Reboot request:
# issued, inprogress, rejected, cancelled, failed, complete.
# Reboot (soft or hard) issued by a tenant, by the system, or by the instance.
700.181:
    Type: Log
    Description: 'Reboot <(soft-reboot)|(hard-reboot)> issued <by <tenant_name>|by the system|by the instance> against instance <instance_name> owned by <tenant_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Reboot in progress.
700.182:
    Type: Log
    Description: 'Reboot inprogress for instance <instance_name> on host <host_name>'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Reboot rejected; an optional reason may be appended.
700.183:
    Type: Log
    Description: 'Reboot rejected for instance <instance_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Reboot cancelled; an optional reason may be appended.
700.184:
    Type: Log
    Description: 'Reboot cancelled for instance <instance_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Reboot failed; an optional reason may be appended.
700.185:
    Type: Log
    Description: 'Reboot failed for instance <instance_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Reboot complete; the instance is reported as enabled on the host.
700.186:
    Type: Log
    Description: 'Reboot complete for instance <instance_name> now enabled on host <host_name>'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Customer logs for an instance Rebuild request:
# issued, inprogress, rejected, cancelled, failed, complete.
# Rebuild issued, either by a tenant or by the system; the text names the image used.
700.187:
    Type: Log
    Description: 'Rebuild issued <by <tenant_name>|by the system> against instance <instance_name> using image <image_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Rebuild in progress.
700.188:
    Type: Log
    Description: 'Rebuild inprogress for instance <instance_name> on host <host_name>'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Rebuild rejected; an optional reason may be appended.
700.189:
    Type: Log
    Description: 'Rebuild rejected for instance <instance_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Rebuild cancelled; an optional reason may be appended.
700.190:
    Type: Log
    Description: 'Rebuild cancelled for instance <instance_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Rebuild failed; an optional reason may be appended.
700.191:
    Type: Log
    Description: 'Rebuild failed for instance <instance_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Rebuild complete; the instance is reported as enabled on the host.
700.192:
    Type: Log
    Description: 'Rebuild complete for instance <instance_name> now enabled on host <host_name>'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Customer logs for an instance Resize request:
# issued, inprogress, rejected, cancelled, failed, complete.
# Resize issued, either by a tenant or by the system.
700.193:
    Type: Log
    Description: 'Resize issued <by <tenant_name>|by the system> against instance <instance_name> owned by <tenant_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Resize in progress.
700.194:
    Type: Log
    Description: 'Resize inprogress for instance <instance_name> on host <host_name>'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Resize rejected; an optional reason may be appended.
700.195:
    Type: Log
    Description: 'Resize rejected for instance <instance_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Resize cancelled; an optional reason may be appended.
700.196:
    Type: Log
    Description: 'Resize cancelled for instance <instance_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Resize failed; an optional reason may be appended.
700.197:
    Type: Log
    Description: 'Resize failed for instance <instance_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Resize complete; the instance is enabled and waiting for confirmation.
700.198:
    Type: Log
    Description: 'Resize complete for instance <instance_name> enabled on host <host_name> waiting for confirmation'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Customer logs for an instance Resize-Confirm request:
# issued, inprogress, rejected, cancelled, failed, complete.
# Resize-Confirm issued, either by a tenant or by the system.
700.199:
    Type: Log
    Description: 'Resize-Confirm issued <by <tenant_name>|by the system> against instance <instance_name> owned by <tenant_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Resize-Confirm in progress.
700.200:
    Type: Log
    Description: 'Resize-Confirm inprogress for instance <instance_name> on host <host_name>'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Resize-Confirm rejected; an optional reason may be appended.
700.201:
    Type: Log
    Description: 'Resize-Confirm rejected for instance <instance_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Resize-Confirm cancelled; an optional reason may be appended.
700.202:
    Type: Log
    Description: 'Resize-Confirm cancelled for instance <instance_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Resize-Confirm failed; an optional reason may be appended.
700.203:
    Type: Log
    Description: 'Resize-Confirm failed for instance <instance_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Resize-Confirm complete; the instance is reported as enabled on the host.
700.204:
    Type: Log
    Description: 'Resize-Confirm complete for instance <instance_name> enabled on host <host_name>'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Customer logs for an instance Resize-Revert request:
# issued, inprogress, rejected, cancelled, failed, complete.
# Resize-Revert issued, either by a tenant or by the system.
700.205:
    Type: Log
    Description: 'Resize-Revert issued <by <tenant_name>|by the system> against instance <instance_name> owned by <tenant_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Resize-Revert in progress.
700.206:
    Type: Log
    Description: 'Resize-Revert inprogress for instance <instance_name> on host <host_name>'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Resize-Revert rejected; this text also names the owning tenant.
700.207:
    Type: Log
    Description: 'Resize-Revert rejected for instance <instance_name> owned by <tenant_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Resize-Revert cancelled; an optional reason may be appended.
700.208:
    Type: Log
    Description: 'Resize-Revert cancelled for instance <instance_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Resize-Revert failed; an optional reason may be appended.
700.209:
    Type: Log
    Description: 'Resize-Revert failed for instance <instance_name> on host <host_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Resize-Revert complete; the instance is reported as enabled on the host.
700.210:
    Type: Log
    Description: 'Resize-Revert complete for instance <instance_name> enabled on host <host_name>'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Customer logs for guest heartbeat, instance rename, and guest health check.
# Guest Heartbeat established (major severity).
700.211:
    Type: Log
    Description: 'Guest Heartbeat established for instance <instance_name> on host <host_name>'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: major
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Guest Heartbeat disconnected (major severity).
700.212:
    Type: Log
    Description: 'Guest Heartbeat disconnected for instance <instance_name> on host <host_name>'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: major
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Guest Heartbeat failed; an optional reason may be appended.
700.213:
    Type: Log
    Description: 'Guest Heartbeat failed for instance <instance_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Instance renamed; the text carries both the old and the new name.
700.214:
    Type: Log
    Description: 'Instance <instance_name> has been renamed to <new_instance_name> owned by <tenant_name> on host <host_name>'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Guest Health Check failed; an optional reason may be appended.
700.215:
    Type: Log
    Description: 'Guest Health Check failed for instance <instance_name>[, reason = <reason_text>]'
    Entity_Instance_ID: 'tenant=<tenant-uuid>.instance=<instance-uuid>'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Customer logs marking entry to and exit from Multi-Node Recovery Mode.
# Both are reported against the vim subsystem rather than an instance.
# Entered Multi-Node Recovery Mode.
700.216:
    Type: Log
    Description: 'Entered Multi-Node Recovery Mode'
    Entity_Instance_ID: 'subsystem=vim'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# Exited Multi-Node Recovery Mode.
700.217:
    Type: Log
    Description: 'Exited Multi-Node Recovery Mode'
    Entity_Instance_ID: 'subsystem=vim'
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
#---------------------------------------------------------------------------
# STORAGE
#---------------------------------------------------------------------------
# Storage alarm reported per cluster at critical or major severity.
# Service_Affecting is keyed by severity: True when critical, False when major.
800.001:
    Type: Alarm
    Description: |-
        Storage Alarm Condition:
        1 mons down, quorum 1,2 controller-1,storage-0
    Entity_Instance_ID: 'cluster=<dist-fs-uuid>'
    Severity:
        - critical
        - major
    Proposed_Repair_Action: 'If problem persists, contact next level of support.'
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: equipment
    Probable_Cause: equipment-malfunction
    Service_Affecting:
        critical: True
        major: False
    Suppression: False
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# Critical storage alarm per replication peer group: no OSDs are available,
# so data loss is possible. Service affecting at critical severity.
800.010:
    Type: Alarm
    Description: |-
        Potential data loss. No available OSDs in storage replication group.
    Entity_Instance_ID: 'cluster=<dist-fs-uuid>.peergroup=<group-x>'
    Severity:
        - critical
    # Folded scalar: the three lines below join into one space-separated string.
    Proposed_Repair_Action: >-
        Ensure storage hosts from replication group are unlocked and available.
        Check if OSDs of each storage host are up and running.
        If problem persists contact next level of support.
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: equipment
    Probable_Cause: equipment-malfunction
    Service_Affecting:
        critical: True
    Suppression: False
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# Major storage alarm per replication peer group: replication has been lost.
# Service affecting at major severity.
800.011:
    Type: Alarm
    Description: |-
        Loss of replication in peergroup.
    Entity_Instance_ID: 'cluster=<dist-fs-uuid>.peergroup=<group-x>'
    Severity:
        - major
    # Folded scalar: the three lines below join into one space-separated string.
    Proposed_Repair_Action: >-
        Ensure storage hosts from replication group are unlocked and available.
        Check if OSDs of each storage host are up and running.
        If problem persists contact next level of support.
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: equipment
    Probable_Cause: equipment-malfunction
    Service_Affecting:
        major: True
    Suppression: False
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# Warning-level alarm covering image storage and image upload problems.
# Description, Entity_Instance_ID and Alarm_Type are three lists of eleven
# entries each, written in the same order.
800.002:
    Type: Alarm
    Description:
        - 'Image storage media is full: There is not enough disk space on the image storage media.'
        - 'Instance <instance name> snapshot failed: There is not enough disk space on the image storage media.'
        - "Supplied <attrs> (<supplied>) and <attrs> generated from uploaded image (<actual>) did not match. Setting image status to 'killed'."
        - 'Error in store configuration. Adding images to store is disabled.'
        - 'Forbidden upload attempt: <exception>'
        - 'Insufficient permissions on image storage media: <exception>'
        - 'Denying attempt to upload image larger than <size> bytes.'
        - 'Denying attempt to upload image because it exceeds the quota: <exception>'
        - 'Received HTTP error while uploading image <image_id>'
        - 'Client disconnected before sending all data to backend'
        - 'Failed to upload image <image_id>'
    Entity_Instance_ID:
        - 'image=<image-uuid>, instance=<instance-uuid>'
        - 'tenant=<tenant-uuid>, instance=<instance-uuid>'
        - 'image=<image-uuid>, instance=<instance-uuid>'
        - 'image=<image-uuid>, instance=<instance-uuid>'
        - 'image=<image-uuid>, instance=<instance-uuid>'
        - 'image=<image-uuid>, instance=<instance-uuid>'
        - 'image=<image-uuid>, instance=<instance-uuid>'
        - 'image=<image-uuid>, instance=<instance-uuid>'
        - 'image=<image-uuid>, instance=<instance-uuid>'
        - 'image=<image-uuid>, instance=<instance-uuid>'
        - 'image=<image-uuid>, instance=<instance-uuid>'
    Alarm_Type:
        - physical-violation
        - physical-violation
        - integrity-violation
        - integrity-violation
        - security-service-or-mechanism-violation
        - security-service-or-mechanism-violation
        - security-service-or-mechanism-violation
        - security-service-or-mechanism-violation
        - communication
        - communication
        - operational-violation
    Severity: warning
    Proposed_Repair_Action:
    Maintenance_Action:
    Inhibit_Alarms:
    Probable_Cause: unspecified-reason
    Service_Affecting: False
    Suppression: False
    Management_Affecting_Severity: none
    Degrade_Affecting_Severity: none
# Minor storage alarm per tier: the Ceph pool quotas do not add up to the
# tier size. Not service affecting.
800.003:
    Type: Alarm
    Description: |-
        Storage Alarm Condition:
        Quota/Space mismatch for the <tiername> tier. The sum of Ceph pool quotas does not match the tier size.
    Entity_Instance_ID: 'cluster=<dist-fs-uuid>.tier=<tiername>'
    Severity: minor
    Proposed_Repair_Action: 'Update ceph storage pool quotas to use all available tier space.'
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: operational-violation
    Probable_Cause: configuration-out-of-date
    Service_Affecting: False
    Suppression: False
    Management_Affecting_Severity: none
    Degrade_Affecting_Severity: none
# Major storage alarm: Cinder I/O congestion is above normal and building.
# Not service affecting.
800.100:
    Type: Alarm
    Description: |-
        Storage Alarm Condition:
        Cinder I/O Congestion is above normal range and is building
    Entity_Instance_ID: 'cinder_io_monitor'
    Severity: major
    # Folded scalar: the two lines below join into one space-separated string.
    Proposed_Repair_Action: >-
        Reduce the I/O load on the Cinder LVM backend. Use
        Cinder QoS mechanisms on high usage volumes.
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: qos
    Probable_Cause: congestion
    Service_Affecting: False
    Suppression: False
    Management_Affecting_Severity: none
    Degrade_Affecting_Severity: none
# Critical storage alarm: Cinder I/O congestion is high and impacting guest
# performance. Service_Affecting is still False for this record.
800.101:
    Type: Alarm
    Description: |-
        Storage Alarm Condition:
        Cinder I/O Congestion is high and impacting guest performance
    Entity_Instance_ID: 'cinder_io_monitor'
    Severity: critical
    # Folded scalar: the three lines below join into one space-separated string.
    Proposed_Repair_Action: >-
        Reduce the I/O load on the Cinder LVM backend.
        Cinder actions may fail until congestion is reduced.
        Use Cinder QoS mechanisms on high usage volumes.
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: qos
    Probable_Cause: congestion
    Service_Affecting: False
    Suppression: False
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# Storage alarm per physical volume, raised at critical or major severity when
# a PV configuration errors or fails to apply on a host. Service affecting.
800.102:
    Type: Alarm
    Description: |-
        Storage Alarm Condition:
        PV configuration <error/failed to apply> on <hostname>. Reason: <detailed reason>.
    Entity_Instance_ID: 'pv=<pv_uuid>'
    Severity:
        - critical
        - major
    Proposed_Repair_Action: 'Remove failed PV and associated Storage Device then recreate them.'
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: equipment
    Probable_Cause: configuration-or-customization-error
    Service_Affecting: True
    Suppression: False
    Management_Affecting_Severity: major
    Degrade_Affecting_Severity: none
# Critical storage alarm for an LVM thin pool whose metadata usage exceeded a
# threshold; Probable_Cause is threshold-crossed and it is not service affecting.
# NOTE(review): the bracketed text in Description appears to list two alternative
# message variants (with and without a failed automatic extension) — confirm
# against the code that raises this alarm.
# NOTE(review): Entity_Instance_ID starts with a bare <hostname> rather than a
# key=value pair — confirm this matches the raised entity instance ID.
800.103:
    Type: Alarm
    Description: |-
        Storage Alarm Condition:
        [ Metadata usage for LVM thin pool <VG name>/<Pool name> exceeded threshold and automatic extension failed,
        Metadata usage for LVM thin pool <VG name>/<Pool name> exceeded threshold ]; threshold x%, actual y%.
    Entity_Instance_ID: <hostname>.lvmthinpool=<VG name>/<Pool name>
    Severity: critical
    # Multi-line double-quoted scalar: the line breaks fold into single spaces.
    Proposed_Repair_Action: "Increase Storage Space Allotment for Cinder on the 'lvm' backend.
        Consult the System Administration Manual for more details.
        If problem persists, contact next level of support."
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: operational-violation
    Probable_Cause: threshold-crossed
    Service_Affecting: False
    Suppression: False
    Management_Affecting_Severity: major
    Degrade_Affecting_Severity: none
# Critical storage alarm per storage backend: its configuration failed to
# apply on a host. Service affecting.
800.104:
    Type: Alarm
    Description: |-
        Storage Alarm Condition:
        <storage-backend-name> configuration failed to apply on host: <host-uuid>.
    Entity_Instance_ID: 'storage_backend=<storage-backend-name>'
    Severity: critical
    # Folded scalar: the three lines below join into one space-separated string.
    Proposed_Repair_Action: >-
        Update backend setting to reapply configuration.
        Consult the System Administration Manual for more details.
        If problem persists, contact next level of support.
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: equipment
    Probable_Cause: configuration-or-customization-error
    Service_Affecting: True
    Suppression: False
    Management_Affecting_Severity: major
    Degrade_Affecting_Severity: none
#---------------------------------------------------------------------------
# SOFTWARE
#---------------------------------------------------------------------------
# Minor alarm against host=controller while a patching operation is in
# progress. Not service affecting.
900.001:
    Type: Alarm
    Description: 'Patching operation in progress.'
    Entity_Instance_ID: 'host=controller'
    Severity: minor
    Proposed_Repair_Action: 'Complete reboots of affected hosts.'
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: environmental
    Probable_Cause: unspecified-reason
    Service_Affecting: False
    Suppression: False
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# Warning alarm against host=controller when an obsolete patch is present in
# the system. Not service affecting.
900.002:
    Type: Alarm
    Description: 'Obsolete patch in system.'
    Entity_Instance_ID: 'host=controller'
    Severity: warning
    Proposed_Repair_Action: 'Remove and delete obsolete patches.'
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: environmental
    Probable_Cause: unspecified-reason
    Service_Affecting: False
    Suppression: False
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# Major per-host alarm raised when a patch install fails on that host.
# Not service affecting.
900.003:
    Type: Alarm
    Description: 'Patch host install failure.'
    Entity_Instance_ID: 'host=<hostname>'
    Severity: major
    Proposed_Repair_Action: 'Undo patching operation.'
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: environmental
    Probable_Cause: unspecified-reason
    Service_Affecting: False
    Suppression: False
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# Major per-host alarm for a host version mismatch; the repair action is to
# reinstall the host. Service affecting.
900.004:
    Type: Alarm
    Description: 'Host version mismatch.'
    Entity_Instance_ID: 'host=<hostname>'
    Severity: major
    Proposed_Repair_Action: 'Reinstall host to update applied load.'
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: operational-violation
    Probable_Cause: unspecified-reason
    Service_Affecting: True
    Suppression: False
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# Minor alarm against host=controller while a system upgrade is in progress.
# No repair action is required and it is not service affecting.
900.005:
    Type: Alarm
    Description: 'System Upgrade in progress.'
    Entity_Instance_ID: 'host=controller'
    Severity: minor
    Proposed_Repair_Action: 'No action required.'
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: operational-violation
    Probable_Cause: unspecified-reason
    Service_Affecting: False
    Suppression: False
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# Major alarm for sw-patch orchestration while a software patch auto-apply is
# in progress. Service affecting; Suppression is True for this record.
900.101:
    Type: Alarm
    Description: 'Software patch auto-apply inprogress'
    Entity_Instance_ID: 'orchestration=sw-patch'
    Severity: major
    Proposed_Repair_Action: 'Wait for software patch auto-apply to complete; if problem persists contact next level of support'
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: True
    Suppression: True
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# 900.102 (Alarm): software patch auto-apply is aborting.
# Reported against orchestration=sw-patch at major severity; suppression is
# enabled for this alarm.
900.102:
    Type: Alarm
    Description: Software patch auto-apply aborting
    Entity_Instance_ID: orchestration=sw-patch
    Severity: major
    # Folded scalar (>-): the two lines below are joined with a single space
    # and no trailing newline, giving the same one-line string.
    Proposed_Repair_Action: >-
        Wait for software patch auto-apply abort to complete; if problem
        persists contact next level of support
    # No value is documented for the next two fields; an explicit null parses
    # to the same value as a bare key.
    Maintenance_Action: null
    Inhibit_Alarms: null
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: True
    Suppression: True
    # Per the file header: lowest severity of this alarm that blocks forced
    # upgrades and orchestration actions.
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# 900.103 (Alarm): software patch auto-apply failed.
# Reported against orchestration=sw-patch at critical severity, with probable
# cause underlying-resource-unavailable.
900.103:
    Type: Alarm
    Description: Software patch auto-apply failed
    Entity_Instance_ID: orchestration=sw-patch
    Severity: critical
    # Folded scalar (>-): the two lines below are joined with a single space
    # and no trailing newline, giving the same one-line string.
    Proposed_Repair_Action: >-
        Attempt to apply software patches manually; if problem persists
        contact next level of support
    # No value is documented for the next two fields; an explicit null parses
    # to the same value as a bare key.
    Maintenance_Action: null
    Inhibit_Alarms: null
    Alarm_Type: equipment
    Probable_Cause: underlying-resource-unavailable
    Service_Affecting: True
    Suppression: True
    # Per the file header: lowest severity of this alarm that blocks forced
    # upgrades and orchestration actions.
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# 900.111 (Log): customer log entry for "Software patch auto-apply start",
# recorded against orchestration=sw-patch. The fields the file header marks
# as alarm-only (repair/maintenance action, inhibit, suppression) are omitted.
900.111:
    Type: Log
    Description: Software patch auto-apply start
    Entity_Instance_ID: orchestration=sw-patch
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# 900.112 (Log): customer log entry for "Software patch auto-apply
# inprogress", recorded against orchestration=sw-patch. The fields the file
# header marks as alarm-only are omitted.
900.112:
    Type: Log
    Description: Software patch auto-apply inprogress
    Entity_Instance_ID: orchestration=sw-patch
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# 900.113 (Log): customer log entry for "Software patch auto-apply rejected",
# recorded against orchestration=sw-patch. The fields the file header marks
# as alarm-only are omitted.
900.113:
    Type: Log
    Description: Software patch auto-apply rejected
    Entity_Instance_ID: orchestration=sw-patch
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# 900.114 (Log): customer log entry for "Software patch auto-apply
# cancelled", recorded against orchestration=sw-patch. The fields the file
# header marks as alarm-only are omitted.
900.114:
    Type: Log
    Description: Software patch auto-apply cancelled
    Entity_Instance_ID: orchestration=sw-patch
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# 900.115 (Log): customer log entry for "Software patch auto-apply failed",
# recorded against orchestration=sw-patch. The fields the file header marks
# as alarm-only are omitted.
900.115:
    Type: Log
    Description: Software patch auto-apply failed
    Entity_Instance_ID: orchestration=sw-patch
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# 900.116 (Log): customer log entry for "Software patch auto-apply
# completed", recorded against orchestration=sw-patch. The fields the file
# header marks as alarm-only are omitted.
900.116:
    Type: Log
    Description: Software patch auto-apply completed
    Entity_Instance_ID: orchestration=sw-patch
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# 900.117 (Log): customer log entry for "Software patch auto-apply abort",
# recorded against orchestration=sw-patch. The fields the file header marks
# as alarm-only are omitted.
900.117:
    Type: Log
    Description: Software patch auto-apply abort
    Entity_Instance_ID: orchestration=sw-patch
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# 900.118 (Log): customer log entry for "Software patch auto-apply aborting",
# recorded against orchestration=sw-patch. The fields the file header marks
# as alarm-only are omitted.
900.118:
    Type: Log
    Description: Software patch auto-apply aborting
    Entity_Instance_ID: orchestration=sw-patch
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# 900.119 (Log): customer log entry for "Software patch auto-apply abort
# rejected", recorded against orchestration=sw-patch. The fields the file
# header marks as alarm-only are omitted.
900.119:
    Type: Log
    Description: Software patch auto-apply abort rejected
    Entity_Instance_ID: orchestration=sw-patch
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# 900.120 (Log): customer log entry for "Software patch auto-apply abort
# failed", recorded against orchestration=sw-patch. The fields the file
# header marks as alarm-only are omitted.
# NOTE(review): written unquoted, this ID is a YAML float and loses its
# trailing zero (900.12) under implicit typing; presumably consumers re-format
# IDs to three decimals — confirm before relying on the parsed key.
900.120:
    Type: Log
    Description: Software patch auto-apply abort failed
    Entity_Instance_ID: orchestration=sw-patch
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# 900.121 (Log): customer log entry for "Software patch auto-apply aborted",
# recorded against orchestration=sw-patch. The fields the file header marks
# as alarm-only are omitted.
900.121:
    Type: Log
    Description: Software patch auto-apply aborted
    Entity_Instance_ID: orchestration=sw-patch
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# 900.201 (Alarm): software upgrade auto-apply is in progress.
# Reported against orchestration=sw-upgrade at major severity; suppression is
# enabled for this alarm.
900.201:
    Type: Alarm
    Description: Software upgrade auto-apply inprogress
    Entity_Instance_ID: orchestration=sw-upgrade
    Severity: major
    # Folded scalar (>-): the two lines below are joined with a single space
    # and no trailing newline, giving the same one-line string.
    Proposed_Repair_Action: >-
        Wait for software upgrade auto-apply to complete; if problem persists
        contact next level of support
    # No value is documented for the next two fields; an explicit null parses
    # to the same value as a bare key.
    Maintenance_Action: null
    Inhibit_Alarms: null
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: True
    Suppression: True
    # Per the file header: lowest severity of this alarm that blocks forced
    # upgrades and orchestration actions.
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# 900.202 (Alarm): software upgrade auto-apply is aborting.
# Reported against orchestration=sw-upgrade at major severity; suppression is
# enabled for this alarm.
900.202:
    Type: Alarm
    Description: Software upgrade auto-apply aborting
    Entity_Instance_ID: orchestration=sw-upgrade
    Severity: major
    # Folded scalar (>-): the two lines below are joined with a single space
    # and no trailing newline, giving the same one-line string.
    Proposed_Repair_Action: >-
        Wait for software upgrade auto-apply abort to complete; if problem
        persists contact next level of support
    # No value is documented for the next two fields; an explicit null parses
    # to the same value as a bare key.
    Maintenance_Action: null
    Inhibit_Alarms: null
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: True
    Suppression: True
    # Per the file header: lowest severity of this alarm that blocks forced
    # upgrades and orchestration actions.
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# 900.203 (Alarm): software upgrade auto-apply failed.
# Reported against orchestration=sw-upgrade at critical severity, with
# probable cause underlying-resource-unavailable.
900.203:
    Type: Alarm
    Description: Software upgrade auto-apply failed
    Entity_Instance_ID: orchestration=sw-upgrade
    Severity: critical
    # Folded scalar (>-): the two lines below are joined with a single space
    # and no trailing newline, giving the same one-line string.
    Proposed_Repair_Action: >-
        Attempt to apply software upgrade manually; if problem persists
        contact next level of support
    # No value is documented for the next two fields; an explicit null parses
    # to the same value as a bare key.
    Maintenance_Action: null
    Inhibit_Alarms: null
    Alarm_Type: equipment
    Probable_Cause: underlying-resource-unavailable
    Service_Affecting: True
    Suppression: True
    # Per the file header: lowest severity of this alarm that blocks forced
    # upgrades and orchestration actions.
    Management_Affecting_Severity: warning
    Degrade_Affecting_Severity: none
# 900.211 (Log): customer log entry for "Software upgrade auto-apply start",
# recorded against orchestration=sw-upgrade. The fields the file header marks
# as alarm-only (repair/maintenance action, inhibit, suppression) are omitted.
900.211:
    Type: Log
    Description: Software upgrade auto-apply start
    Entity_Instance_ID: orchestration=sw-upgrade
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# 900.212 (Log): customer log entry for "Software upgrade auto-apply
# inprogress", recorded against orchestration=sw-upgrade. The fields the file
# header marks as alarm-only are omitted.
900.212:
    Type: Log
    Description: Software upgrade auto-apply inprogress
    Entity_Instance_ID: orchestration=sw-upgrade
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# 900.213 (Log): customer log entry for "Software upgrade auto-apply
# rejected", recorded against orchestration=sw-upgrade. The fields the file
# header marks as alarm-only are omitted.
900.213:
    Type: Log
    Description: Software upgrade auto-apply rejected
    Entity_Instance_ID: orchestration=sw-upgrade
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# 900.214 (Log): customer log entry for "Software upgrade auto-apply
# cancelled", recorded against orchestration=sw-upgrade. The fields the file
# header marks as alarm-only are omitted.
900.214:
    Type: Log
    Description: Software upgrade auto-apply cancelled
    Entity_Instance_ID: orchestration=sw-upgrade
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# 900.215 (Log): customer log entry for "Software upgrade auto-apply failed",
# recorded against orchestration=sw-upgrade. The fields the file header marks
# as alarm-only are omitted.
900.215:
    Type: Log
    Description: Software upgrade auto-apply failed
    Entity_Instance_ID: orchestration=sw-upgrade
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# 900.216 (Log): customer log entry for "Software upgrade auto-apply
# completed", recorded against orchestration=sw-upgrade. The fields the file
# header marks as alarm-only are omitted.
900.216:
    Type: Log
    Description: Software upgrade auto-apply completed
    Entity_Instance_ID: orchestration=sw-upgrade
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# 900.217 (Log): customer log entry for "Software upgrade auto-apply abort",
# recorded against orchestration=sw-upgrade. The fields the file header marks
# as alarm-only are omitted.
900.217:
    Type: Log
    Description: Software upgrade auto-apply abort
    Entity_Instance_ID: orchestration=sw-upgrade
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# 900.218 (Log): customer log entry for "Software upgrade auto-apply
# aborting", recorded against orchestration=sw-upgrade. The fields the file
# header marks as alarm-only are omitted.
900.218:
    Type: Log
    Description: Software upgrade auto-apply aborting
    Entity_Instance_ID: orchestration=sw-upgrade
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# 900.219 (Log): customer log entry for "Software upgrade auto-apply abort
# rejected", recorded against orchestration=sw-upgrade. The fields the file
# header marks as alarm-only are omitted.
900.219:
    Type: Log
    Description: Software upgrade auto-apply abort rejected
    Entity_Instance_ID: orchestration=sw-upgrade
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# 900.220 (Log): customer log entry for "Software upgrade auto-apply abort
# failed", recorded against orchestration=sw-upgrade. The fields the file
# header marks as alarm-only are omitted.
# NOTE(review): written unquoted, this ID is a YAML float and loses its
# trailing zero (900.22) under implicit typing; presumably consumers re-format
# IDs to three decimals — confirm before relying on the parsed key.
900.220:
    Type: Log
    Description: Software upgrade auto-apply abort failed
    Entity_Instance_ID: orchestration=sw-upgrade
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
# 900.221 (Log): customer log entry for "Software upgrade auto-apply
# aborted", recorded against orchestration=sw-upgrade. The fields the file
# header marks as alarm-only are omitted.
900.221:
    Type: Log
    Description: Software upgrade auto-apply aborted
    Entity_Instance_ID: orchestration=sw-upgrade
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: False
...