# Source path: config/sysinv/sysinv/sysinv/sysinv/tests/events_for_testing.yaml
# (viewer metadata at capture time: 2383 lines, 86 KiB, YAML)

---
############################################################################
#
# events.yaml file for unit testing only - this is not for production!
#
############################################################################
############################################################################
#
# Record Format ... for documentation
#
# 100.001:
#     Type: < Alarm | Log >
#     Description: < yaml string >
#         OR
#         [< yaml string >,    // list of yaml strings
#          < yaml string >]
#         OR
#         critical: < yaml string >    // i.e. dictionary of yaml strings indexed by severity
#         major: < yaml string >
#         minor: < yaml string >
#         warning: < yaml string >
#     Entity_Instance_ID: < yaml string ... e.g. host=<hostname>.interface=<ifname> >
#         OR
#         [< yaml string >,    // list of yaml strings
#          < yaml string >]
#     Severity: < critical | major | minor | warning >
#         OR
#         [critical, major]    // list of severity values
#     Proposed_Repair_Action: < yaml string >    // NOTE ALARM ONLY FIELD
#         OR
#         critical: < yaml string >    // i.e. dictionary of yaml strings indexed by severity
#         major: < yaml string >
#         minor: < yaml string >
#         warning: < yaml string >
#     Maintenance_Action: < yaml string >    // NOTE ALARM ONLY FIELD
#         OR
#         critical: < yaml string >    // i.e. dictionary of yaml strings indexed by severity
#         major: < yaml string >
#         minor: < yaml string >
#         warning: < yaml string >
#     Inhibit_Alarms: < true | false >    // NOTE ALARM ONLY FIELD
#     Alarm_Type: < operational-violation | ... >
#     Probable_Cause: < timing-problem | ... >
#         OR
#         [< timing-problem | ... >,    // list of probable-causes
#          < timing-problem | ... >]
#     Service_Affecting: < true | false >
#     Suppression: < true | false >    // NOTE ALARM ONLY FIELD
#
#
# Other Notes:
#   - use the general record format above
#   - the only dictionaries allowed are ones indexed by severity
#   - if there are multiple lists in a record, then they should all have the
#     same number of items, and corresponding list items together represent
#     one instance of the alarm
#   - if you can't describe the alarm/log based on the above rules,
#     then you can use a multi-line string format
#
############################################################################
# ---------------------------------------------------------------------------
# RMON
# ---------------------------------------------------------------------------
# Alarm: platform CPU usage exceeded its threshold on a host.
100.101:
    Type: Alarm
    Description: "Platform CPU threshold exceeded; threshold x%, actual y% ."
    Entity_Instance_ID: host=<hostname>
    Severity: [critical, major, minor]
    Proposed_Repair_Action: "Monitor and if condition persists, contact next level of support."
    # Only critical and major list an action; minor has none.
    Maintenance_Action:
        critical: degrade
        major: degrade
    Inhibit_Alarms:
    Alarm_Type: operational-violation
    Probable_Cause: threshold-crossed
    Service_Affecting: false
    Suppression: true
# Alarm: VSwitch CPU usage exceeded its threshold on a host.
100.102:
    Type: Alarm
    Description: "VSwitch CPU threshold exceeded; threshold x%, actual y% ."
    Entity_Instance_ID: host=<hostname>
    Severity: [critical, major, minor]
    Proposed_Repair_Action: "Monitor and if condition persists, contact next level of support."
    # Only critical and major list an action; minor has none.
    Maintenance_Action:
        critical: degrade
        major: degrade
    Inhibit_Alarms:
    Alarm_Type: operational-violation
    Probable_Cause: threshold-crossed
    Service_Affecting: false
    Suppression: true
# Alarm: memory usage exceeded its threshold on a host.
100.103:
    Type: Alarm
    Description: "Memory threshold exceeded; threshold x%, actual y% ."
    Entity_Instance_ID: host=<hostname>
    Severity: [critical, major, minor]
    Proposed_Repair_Action: "Monitor and if condition persists, contact next level of support; may require additional memory on Host."
    # Only critical and major list an action; minor has none.
    Maintenance_Action:
        critical: degrade
        major: degrade
    Inhibit_Alarms:
    Alarm_Type: operational-violation
    Probable_Cause: threshold-crossed
    Service_Affecting: false
    Suppression: true
# Alarm: file-system or volume-group usage threshold exceeded.
# Description and Entity_Instance_ID are multi-line strings that list both
# variants (filesystem / volumegroup) separated by "OR".
100.104:  # NOTE This should really be split into two different Alarms.
    Type: Alarm
    Description: |-
        host=<hostname>.filesystem=<mount-dir>
        File System threshold exceeded; threshold x%, actual y% .
        OR
        host=<hostname>.volumegroup=<volumegroup-name>
        Monitor and if condition persists, consider adding additional physical volumes to the volume group.
    Entity_Instance_ID: |-
        host=<hostname>.filesystem=<mount-dir>
        OR
        host=<hostname>.volumegroup=<volumegroup-name>
    Severity: [critical, major, minor]
    Proposed_Repair_Action: "Monitor and if condition persists, contact next level of support."
    # Only critical and major list an action; minor has none.
    Maintenance_Action:
        critical: degrade
        major: degrade
    Inhibit_Alarms:
    Alarm_Type: operational-violation
    Probable_Cause: threshold-crossed
    Service_Affecting: false
    Suppression: true
# Alarm: host has no access to remote VM volumes.
100.105:
    Type: Alarm
    Description: No access to remote VM volumes.
    Entity_Instance_ID: host=<hostname>
    Severity: major
    Proposed_Repair_Action: Check Management and Infrastructure Networks and Controller or Storage Nodes.
    Maintenance_Action: degrade
    Inhibit_Alarms:
    Alarm_Type: operational-violation
    Probable_Cause: unknown
    Service_Affecting: true
    Suppression: true
# Alarm: an 'OAM' port failed on a host.
100.106:
    Type: Alarm
    Description: "'OAM' Port failed."
    Entity_Instance_ID: host=<hostname>.port=<port-name>
    Severity: major
    Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment.
    Maintenance_Action: degrade
    Inhibit_Alarms:
    Alarm_Type: operational-violation
    Probable_Cause: unknown
    Service_Affecting: true
    Suppression: true
# Alarm: an 'OAM' interface is degraded or failed.
100.107:
    Type: Alarm
    Description: |-
        'OAM' Interface degraded.
        OR
        'OAM' Interface failed.
    Entity_Instance_ID: host=<hostname>.interface=<if-name>
    Severity: [critical, major]
    Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment.
    Maintenance_Action:
        critical: degrade
        major: degrade
    Inhibit_Alarms:
    Alarm_Type: operational-violation
    Probable_Cause: unknown
    Service_Affecting: true
    Suppression: true
# Alarm: a 'MGMT' port failed on a host.
100.108:
    Type: Alarm
    Description: "'MGMT' Port failed."
    Entity_Instance_ID: host=<hostname>.port=<port-name>
    Severity: major
    Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment.
    Maintenance_Action: degrade
    Inhibit_Alarms:
    Alarm_Type: operational-violation
    Probable_Cause: unknown
    Service_Affecting: true
    Suppression: true
# Alarm: a 'MGMT' interface is degraded or failed.
100.109:
    Type: Alarm
    Description: |-
        'MGMT' Interface degraded.
        OR
        'MGMT' Interface failed.
    Entity_Instance_ID: host=<hostname>.interface=<if-name>
    Severity: [critical, major]
    Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment.
    Maintenance_Action:
        critical: degrade
        major: degrade
    Inhibit_Alarms:
    Alarm_Type: operational-violation
    Probable_Cause: unknown
    Service_Affecting: true
    Suppression: true
# Alarm: an 'INFRA' port failed on a host.
100.110:
    Type: Alarm
    Description: "'INFRA' Port failed."
    Entity_Instance_ID: host=<hostname>.port=<port-name>
    Severity: major
    Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment.
    Maintenance_Action: degrade
    Inhibit_Alarms:
    Alarm_Type: operational-violation
    Probable_Cause: unknown
    Service_Affecting: true
    Suppression: true
# Alarm: an 'INFRA' interface is degraded or failed.
100.111:
    Type: Alarm
    Description: |-
        'INFRA' Interface degraded.
        OR
        'INFRA' Interface failed.
    Entity_Instance_ID: host=<hostname>.interface=<if-name>
    Severity: [critical, major]
    Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment.
    Maintenance_Action:
        critical: degrade
        major: degrade
    Inhibit_Alarms:
    Alarm_Type: operational-violation
    Probable_Cause: unknown
    Service_Affecting: true
    Suppression: true
# Alarm: a 'DATA-VRS' port is down on a host.
100.112:
    Type: Alarm
    Description: "'DATA-VRS' Port down."
    Entity_Instance_ID: host=<hostname>.port=<port-name>
    Severity: major
    Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment.
    Maintenance_Action: degrade
    Inhibit_Alarms:
    Alarm_Type: operational-violation
    Probable_Cause: unknown
    Service_Affecting: true
    Suppression: true
# Alarm: a 'DATA-VRS' interface is degraded or down.
100.113:
    Type: Alarm
    Description: |-
        'DATA-VRS' Interface degraded.
        OR
        'DATA-VRS' Interface down.
    Entity_Instance_ID: host=<hostname>.interface=<if-name>
    Severity: [critical, major]
    Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment.
    # NOTE(review): Severity lists critical and major, but only major has an
    # action here - confirm whether a critical entry is missing.
    Maintenance_Action:
        major: degrade
    Inhibit_Alarms:
    Alarm_Type: operational-violation
    Probable_Cause: unknown
    Service_Affecting: true
    Suppression: true
# Alarm: NTP servers are invalid or unreachable.
# Description and Entity_Instance_ID are dictionaries indexed by severity.
100.114:
    Type: Alarm
    Description:
        major: "NTP configuration does not contain any valid or reachable NTP servers."
        minor: "NTP address <IP address> is not a valid or a reachable NTP server."
    Entity_Instance_ID:
        major: host=<hostname>.ntp
        minor: host=<hostname>.ntp=<IP address>
    Severity: [major, minor]
    Proposed_Repair_Action: "Monitor and if condition persists, contact next level of support."
    Maintenance_Action: none
    Inhibit_Alarms:
    Alarm_Type: communication
    Probable_Cause: unknown
    Service_Affecting: false
    Suppression: false
# Alarm: VSwitch memory usage exceeded its threshold for one processor.
100.115:
    Type: Alarm
    Description: "VSwitch Memory Usage, processor <processor> threshold exceeded; threshold x%, actual y% ."
    Entity_Instance_ID: host=<hostname>.processor=<processor>
    Severity: [critical, major, minor]
    Proposed_Repair_Action: "Monitor and if condition persists, contact next level of support."
    # Only critical and major list an action; minor has none.
    Maintenance_Action:
        critical: degrade
        major: degrade
    Inhibit_Alarms:
    Alarm_Type: operational-violation
    Probable_Cause: threshold-crossed
    Service_Affecting: false
    Suppression: true
# Alarm: Cinder LVM thin-pool usage exceeded its threshold.
100.116:
    Type: Alarm
    Description: "Cinder LVM Thinpool Usage threshold exceeded; threshold x%, actual y% ."
    Entity_Instance_ID: host=<hostname>.volumegroup=<volumegroup>
    Severity: [critical, major, minor]
    Proposed_Repair_Action: "Monitor and if condition persists, contact next level of support."
    # Only critical and major list an action; minor has none.
    Maintenance_Action:
        critical: degrade
        major: degrade
    Inhibit_Alarms:
    Alarm_Type: operational-violation
    Probable_Cause: threshold-crossed
    Service_Affecting: false
    Suppression: true
# Alarm: Nova thin-pool usage exceeded its threshold.
100.117:
    Type: Alarm
    Description: "Nova Thinpool Usage threshold exceeded; threshold x%, actual y% ."
    Entity_Instance_ID: host=<hostname>.volumegroup=<volumegroup>
    Severity: [critical, major, minor]
    Proposed_Repair_Action: "Monitor and if condition persists, contact next level of support."
    # Only critical and major list an action; minor has none.
    Maintenance_Action:
        critical: degrade
        major: degrade
    Inhibit_Alarms:
    Alarm_Type: operational-violation
    Probable_Cause: threshold-crossed
    Service_Affecting: false
    Suppression: true
# ---------------------------------------------------------------------------
# MAINTENANCE
# ---------------------------------------------------------------------------
# Alarm: host was administratively locked.
200.001:
    Type: Alarm
    Description: <hostname> was administratively locked to take it out-of-service.
    Entity_Instance_ID: host=<hostname>
    Severity: warning
    Proposed_Repair_Action: Administratively unlock Host to bring it back in-service.
    Maintenance_Action: none
    Inhibit_Alarms: true
    Alarm_Type: operational-violation
    Probable_Cause: out-of-service
    Service_Affecting: true
    Suppression: false
# Alarm: host had a service-affecting failure and is being rebooted.
200.004:
    Type: Alarm
    Description: |-
        <hostname> experienced a service-affecting failure.
        Host is being auto recovered by Reboot.
    Entity_Instance_ID: host=<hostname>
    Severity: critical
    Proposed_Repair_Action: If auto-recovery is consistently unable to recover host to the unlocked-enabled state contact next level of support or lock and replace failing host.
    Maintenance_Action: auto recover
    Inhibit_Alarms: false
    Alarm_Type: operational-violation
    Probable_Cause: application-subsystem-failure
    Service_Affecting: true
    Suppression: true
# Alarm: host configuration failed during initialization; host is rebooted.
200.011:
    Type: Alarm
    Description: <hostname> experienced a configuration failure during initialization. Host is being re-configured by Reboot.
    Entity_Instance_ID: host=<hostname>
    Severity: critical
    Proposed_Repair_Action: If auto-recovery is consistently unable to recover host to the unlocked-enabled state contact next level of support or lock and replace failing host.
    Maintenance_Action: auto-recover
    Inhibit_Alarms: false
    Alarm_Type: operational-violation
    Probable_Cause: configuration-or-customization-error
    Service_Affecting: true
    Suppression: true
# Alarm: access to the host's board management module failed.
200.010:
    Type: Alarm
    Description: <hostname> access to board management module has failed.
    Entity_Instance_ID: host=<hostname>
    Severity: warning
    Proposed_Repair_Action: Check Host's board management configuration and connectivity.
    Maintenance_Action: auto recover
    Inhibit_Alarms: false
    Alarm_Type: operational-violation
    Probable_Cause: communication-subsystem-failure
    Service_Affecting: false
    Suppression: false
# Alarm: controller function failed in-service while compute stays healthy.
200.012:
    Type: Alarm
    Description: <hostname> controller function has in-service failure while compute services remain healthy.
    Entity_Instance_ID: host=<hostname>
    Severity: major
    Proposed_Repair_Action: |-
        Lock and then Unlock host to recover.
        Avoid using 'Force Lock' action as that will impact compute services running on this host,
        If lock action fails then contact next level of support to investigate and recover.
    Maintenance_Action: "degrade - requires manual action"
    Inhibit_Alarms: false
    Alarm_Type: operational-violation
    Probable_Cause: communication-subsystem-failure
    Service_Affecting: true
    Suppression: true
# Alarm: compute service of the only available controller is not
# operational; the host is degraded instead of auto-recovered.
200.013:
    Type: Alarm
    Description: <hostname> compute service of the only available controller is not operational. Auto-recovery is disabled. Degrading host instead.
    Entity_Instance_ID: host=<hostname>
    Severity: major
    Proposed_Repair_Action: Enable second controller and Switch Activity (Swact) over to it as soon as possible. Then Lock and Unlock host to recover its local compute service.
    Maintenance_Action: "degrade - requires manual action"
    Inhibit_Alarms: false
    Alarm_Type: operational-violation
    Probable_Cause: communication-subsystem-failure
    Service_Affecting: true
    Suppression: true
# Alarm: 'Management Network' communication failure (degrade or failure).
# The block scalars are literal text, so they carry no surrounding quotes.
200.005:
    Type: Alarm
    Description: |-
        Degrade:
        <hostname> is experiencing an intermittent 'Management Network' communication failures that have exceeded its lower alarming threshold.
        Failure:
        <hostname> is experiencing a persistent critical 'Management Network' communication failure.
    Entity_Instance_ID: host=<hostname>
    Severity: [critical, major]
    Proposed_Repair_Action: |-
        Check 'Management Network' connectivity and support for multicast messaging.
        If problem consistently occurs after that and Host is reset, then contact next level of support or lock and replace failing host.
    Maintenance_Action: auto recover
    Inhibit_Alarms: false
    Alarm_Type: communication
    Probable_Cause: unknown
    Service_Affecting: true
    Suppression: true
# Alarm: 'Infrastructure Network' communication failure (degrade or failure).
# The block scalars are literal text, so they carry no surrounding quotes.
200.009:
    Type: Alarm
    Description: |-
        Degrade:
        <hostname> is experiencing an intermittent 'Infrastructure Network' communication failures that have exceeded its lower alarming threshold.
        Failure:
        <hostname> is experiencing a persistent critical 'Infrastructure Network' communication failure.
    Entity_Instance_ID: host=<hostname>
    Severity: [critical, major]
    Proposed_Repair_Action: |-
        Check 'Infrastructure Network' connectivity and support for multicast messaging.
        If problem consistently occurs after that and Host is reset, then contact next level of support or lock and replace failing host.
    Maintenance_Action: auto recover
    Inhibit_Alarms: false
    Alarm_Type: communication
    Probable_Cause: unknown
    Service_Affecting: true
    Suppression: true
# Alarm: process-monitor daemon failure or monitored-process failure.
# Maintenance_Action and Service_Affecting are indexed by severity.
200.006:
    Type: Alarm
    Description: |-
        Main Process Monitor Daemon Failure (major):
        <hostname> 'Process Monitor' (pmond) process is not running or functioning properly. The system is trying to recover this process.
        Monitored Process Failure (critical/major/minor):
        Critical: <hostname> critical '<processname>' process has failed and could not be auto-recovered gracefully.
        Auto-recovery progression by host reboot is required and in progress.
        Major: <hostname> is degraded due to the failure of its '<processname>' process. Auto recovery of this major process is in progress.
        Minor: <hostname> '<processname>' process has failed. Auto recovery of this minor process is in progress.
        OR
        <hostname> '<processname>' process has failed. Manual recovery is required.
    Entity_Instance_ID: host=<hostname>.process=<processname>
    Severity: [critical, major, minor]
    Proposed_Repair_Action: |-
        If this alarm does not automatically clear after some time and continues to be asserted after Host is locked and unlocked
        then contact next level of support for root cause analysis and recovery.
        If problem consistently occurs after Host is locked and unlocked then contact next level of support for root cause analysis and recovery.
    Maintenance_Action:
        critical: auto-recover
        major: degrade
        # minor is present but has no value (parses as null).
        minor:
    Inhibit_Alarms: false
    Alarm_Type: operational-violation
    Probable_Cause: unknown
    Service_Affecting:
        critical: true
        major: true
        minor: false
    Suppression: true
# 200.006: // NOTE using duplicate ID of a completely analogous Alarm for this
# Type: Log
# Description: |-
# Main Process Monitor Daemon Failure (major)
# <hostname> 'Process Monitor' (pmond) process is not running or functioning properly.
# The system is trying to recover this process.
#
# Monitored Process Failure (critical/major/minor)
# critical: <hostname> critical '<processname>' process has failed and could not be auto-recovered gracefully.
# Auto-recovery progression by host reboot is required and in progress.
# major: <hostname> is degraded due to the failure of its '<processname>' process. Auto recovery of this major process is in progress.
# minor: <hostname> '<processname>' process has failed. Auto recovery of this minor process is in progress.
# OR
# <hostname> '<processname>' process has failed. Manual recovery is required.
# Entity_Instance_ID: host=<hostname>.process=<process-name>
# Severity: minor
# Alarm_Type: other
# Probable_Cause: unspecified-reason
# Service_Affecting: true
# Alarm: out-of-tolerance sensor reading on a host.
# Description, Maintenance_Action and Service_Affecting are indexed by severity.
200.007:
    Type: Alarm
    Description:
        critical: "Host is degraded due to a 'critical' out-of-tolerance reading from the '<sensorname>' sensor"
        major: "Host is degraded due to a 'major' out-of-tolerance reading from the '<sensorname>' sensor"
        minor: "Host is reporting a 'minor' out-of-tolerance reading from the '<sensorname>' sensor"
    Entity_Instance_ID: host=<hostname>.sensor=<sensorname>
    Severity: [critical, major, minor]
    Proposed_Repair_Action: "If problem consistently occurs after Host is power cycled and or reset, contact next level of support or lock and replace failing host."
    Maintenance_Action:
        critical: degrade
        major: degrade
        minor: auto-recover (polling)
    Inhibit_Alarms:
    Alarm_Type: operational-violation
    Probable_Cause: unspecified-reason
    Service_Affecting:
        critical: true
        major: false
        minor: false
    Suppression: true
# Alarm: Hardware Monitor could not load/configure/monitor some sensors.
200.014:
    Type: Alarm
    Description: "The Hardware Monitor was unable to load, configure and monitor one or more hardware sensors."
    Entity_Instance_ID: host=<hostname>
    Severity: minor
    Proposed_Repair_Action: |-
        Check Board Management Controller provisioning. Try reprovisioning the BMC.
        If problem persists try power cycling the host and then the entire server including the BMC power.
        If problem persists then contact next level of support.
    # Lowercase "none" matches the spelling used by records 100.114 and 200.001.
    Maintenance_Action: none
    Inhibit_Alarms: false
    Alarm_Type: operational-violation
    Probable_Cause: unknown
    Service_Affecting: false
    Suppression: true
# Alarm: sensor groups could not be read from the host's BMC.
200.015:
    Type: Alarm
    Description: Unable to read one or more sensor groups from this host's board management controller
    Entity_Instance_ID: host=<hostname>
    Severity: major
    Proposed_Repair_Action: Check board management connectivity and try rebooting the board management controller. If problem persists contact next level of support or lock and replace failing host.
    # Lowercase "none" matches the spelling used by records 100.114 and 200.001.
    Maintenance_Action: none
    Inhibit_Alarms: false
    Alarm_Type: operational-violation
    Probable_Cause: unknown
    Service_Affecting: false
    Suppression: false
# Log: host discovery / add / multi-node failure avoidance events.
# Description and Entity_Instance_ID are parallel lists (4 items each).
200.020:
    Type: Log
    Description: ["<hostname> has been 'discovered' on the network",
                  "<hostname> has been 'added' to the system",
                  "<hostname> has 'entered' multi-node failure avoidance",
                  "<hostname> has 'exited' multi-node failure avoidance"]
    Entity_Instance_ID: [host=<hostname>.event=discovered,
                         host=<hostname>.event=add,
                         host=<hostname>.event=mnfa_enter,
                         host=<hostname>.event=mnfa_exit]
    Severity: warning
    Alarm_Type: other
    Probable_Cause: unspecified-reason
    Service_Affecting: true
# Log: board-management provisioning events and manual host commands.
# Description and Entity_Instance_ID are parallel lists (12 items each).
200.021:
    Type: Log
    Description: ["<hostname> board management controller has been 'provisioned'",
                  "<hostname> board management controller has been 're-provisioned'",
                  "<hostname> board management controller has been 'de-provisioned'",
                  "<hostname> manual 'unlock' request",
                  "<hostname> manual 'reboot' request",
                  "<hostname> manual 'reset' request",
                  "<hostname> manual 'power-off' request",
                  "<hostname> manual 'power-on' request",
                  "<hostname> manual 'reinstall' request",
                  "<hostname> manual 'force-lock' request",
                  "<hostname> manual 'delete' request",
                  "<hostname> manual 'controller switchover' request"]
    Entity_Instance_ID: [host=<hostname>.command=provision,
                         host=<hostname>.command=reprovision,
                         host=<hostname>.command=deprovision,
                         host=<hostname>.command=unlock,
                         host=<hostname>.command=reboot,
                         host=<hostname>.command=reset,
                         host=<hostname>.command=power-off,
                         host=<hostname>.command=power-on,
                         host=<hostname>.command=reinstall,
                         host=<hostname>.command=force-lock,
                         host=<hostname>.command=delete,
                         host=<hostname>.command=swact]
    Severity: warning
    Alarm_Type: other
    Probable_Cause: unspecified-reason
    Service_Affecting: false
# Log: host state/status changes.
# Description and Entity_Instance_ID are parallel lists (5 items each).
200.022:
    Type: Log
    Description: ["<hostname> is now 'disabled'",
                  "<hostname> is now 'enabled'",
                  "<hostname> is now 'online'",
                  "<hostname> is now 'offline'",
                  "<hostname> is 'disabled-failed' to the system"]
    Entity_Instance_ID: [host=<hostname>.state=disabled,
                         host=<hostname>.state=enabled,
                         host=<hostname>.status=online,
                         host=<hostname>.status=offline,
                         host=<hostname>.status=failed]
    Severity: warning
    Alarm_Type: other
    Probable_Cause: unspecified-reason
    Service_Affecting: true
# ---------------------------------------------------------------------------
# BACKUP AND RESTORE
# ---------------------------------------------------------------------------
# Alarm: a system backup is in progress.
210.001:
    Type: Alarm
    Description: System Backup in progress.
    Entity_Instance_ID: host=controller
    Severity: minor
    Proposed_Repair_Action: No action required.
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: operational-violation
    Probable_Cause: unspecified-reason
    Service_Affecting: false
    Suppression: false
# ---------------------------------------------------------------------------
# SYSTEM CONFIGURATION
# ---------------------------------------------------------------------------
# Alarm: host configuration is out-of-date.
250.001:
    Type: Alarm
    Description: <hostname> Configuration is out-of-date.
    Entity_Instance_ID: host=<hostname>
    Severity: major
    Proposed_Repair_Action: Administratively lock and unlock <hostname> to update config.
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: operational-violation
    Probable_Cause: unspecified-reason
    Service_Affecting: true
    Suppression: false
# ---------------------------------------------------------------------------
# VM Compute Services
# ---------------------------------------------------------------------------
# Alarm: compute services failed on a host.
270.001:
    Type: Alarm
    Description: "Host <host_name> compute services failure[, reason = <reason_text>]"
    Entity_Instance_ID: host=<host_name>.services=compute
    Severity: critical
    Proposed_Repair_Action: Wait for host services recovery to complete; if problem persists contact next level of support
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: processing-error
    Probable_Cause: unspecified-reason
    Service_Affecting: true
    Suppression: true
# Log: compute services failed on a host.
# NOTE(review): the entity ID is tenant/instance based although the
# description is about a host - confirm this is intended.
270.101:
    Type: Log
    Description: "Host <host_name> compute services failure[, reason = <reason_text>]"
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: false
# Log: compute services enabled on a host.
270.102:
    Type: Log
    Description: Host <host_name> compute services enabled
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: false
# Log: compute services disabled on a host.
270.103:
    Type: Log
    Description: Host <host_name> compute services disabled
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: false
# Log: hypervisor administrative/operational state change on a host.
275.001:
    Type: Log
    Description: Host <host_name> hypervisor is now <administrative_state>-<operational_state>
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: false
# ---------------------------------------------------------------------------
# NETWORK
# ---------------------------------------------------------------------------
# Alarm: a 'Data' port failed on a host.
300.001:
    Type: Alarm
    Description: "'Data' Port failed."
    Entity_Instance_ID: host=<hostname>.port=<port-uuid>
    Severity: major
    Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment.
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: equipment
    Probable_Cause: loss-of-signal
    Service_Affecting: true
    Suppression: false
# Alarm: a 'Data' interface is degraded or failed.
300.002:
    Type: Alarm
    Description: |-
        'Data' Interface degraded.
        OR
        'Data' Interface failed.
    Entity_Instance_ID: host=<hostname>.interface=<if-uuid>
    Severity: [critical, major]
    Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment.
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: equipment
    Probable_Cause: loss-of-signal
    Service_Affecting: true
    Suppression: false
# Alarm: networking agent on a host is not responding.
300.003:
    Type: Alarm
    Description: Networking Agent not responding.
    Entity_Instance_ID: host=<hostname>.agent=<agent-uuid>
    Severity: major
    Proposed_Repair_Action: "If condition persists, attempt to clear issue by administratively locking and unlocking the Host."
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: operational-violation
    Probable_Cause: underlying-resource-unavailable
    Service_Affecting: true
    Suppression: false
# Alarm: no enabled compute host has connectivity to a provider network.
300.004:
    Type: Alarm
    Description: No enabled compute host with connectivity to provider network.
    Entity_Instance_ID: host=<hostname>.providernet=<pnet-uuid>
    Severity: major
    Proposed_Repair_Action: Enable compute hosts with required provider network connectivity.
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: operational-violation
    Probable_Cause: underlying-resource-unavailable
    Service_Affecting: true
    Suppression: false
# Alarm: communication failure detected over a provider network.
300.005:
    Type: Alarm
    Description: |-
        Communication failure detected over provider network x% for ranges y% on host z%.
        OR
        Communication failure detected over provider network x% on host z%.
    Entity_Instance_ID: providernet=<pnet-uuid>.host=<hostname>
    Severity: major
    Proposed_Repair_Action: Check neighbour switch port VLAN assignments.
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: operational-violation
    Probable_Cause: underlying-resource-unavailable
    Service_Affecting: true
    Suppression: false
# ---------------------------------------------------------------------------
# HIGH AVAILABILITY
# ---------------------------------------------------------------------------
# Alarm: service group failure, degrade or warning.
400.001:
    Type: Alarm
    Description: |-
        Service group failure; <list of affected services>.
        OR
        Service group degraded; <list of affected services>.
        OR
        Service group warning; <list of affected services>.
    Entity_Instance_ID: service_domain=<domain_name>.service_group=<group_name>.host=<hostname>
    Severity: [critical, major, minor]
    Proposed_Repair_Action: Contact next level of support.
    Maintenance_Action:
    Inhibit_Alarms: false
    Alarm_Type: processing-error
    Probable_Cause: underlying-resource-unavailable
    Service_Affecting: true
    Suppression: true
# Alarm: service group loss of redundancy.
# The four Description alternatives cover: no standby members, too few
# standby members, no active members, too few active members.
400.002:
    Type: Alarm
    Description: |-
        Service group loss of redundancy; expected <num> standby member<s> but no standby members available.
        OR
        Service group loss of redundancy; expected <num> standby member<s> but only <num> standby member<s> available.
        OR
        Service group loss of redundancy; expected <num> active member<s> but no active members available.
        OR
        Service group loss of redundancy; expected <num> active member<s> but only <num> active member<s> available.
    Entity_Instance_ID: service_domain=<domain_name>.service_group=<group_name>
    Severity: major
    Proposed_Repair_Action: "Bring a controller node back in to service, otherwise contact next level of support."
    Maintenance_Action:
    Inhibit_Alarms: false
    Alarm_Type: processing-error
    Probable_Cause: underlying-resource-unavailable
    Service_Affecting: true
    Suppression: true
# Alarm: license key missing, expired/invalid, or evaluation expiring.
400.003:
    Type: Alarm
    Description: |-
        License key is not installed; a valid license key is required for operation.
        OR
        License key has expired or is invalid; a valid license key is required for operation.
        OR
        Evaluation license key will expire on <date>; there are <num_days> days remaining in this evaluation.
        OR
        Evaluation license key will expire on <date>; there is only 1 day remaining in this evaluation.
    Entity_Instance_ID: host=<hostname>
    Severity: critical
    Proposed_Repair_Action: Contact next level of support to obtain a new license key.
    Maintenance_Action:
    Inhibit_Alarms: false
    Alarm_Type: processing-error
    Probable_Cause: key-expired
    Service_Affecting: true
    Suppression: false
# 400.004: // NOTE Removed
# Type: Alarm
# Description: Service group software modification detected; <list of affected files>.
# Entity_Instance_ID: host=<hostname>
# Severity: major
# Proposed_Repair_Action: Contact next level of support.
# Maintenance_Action:
# Inhibit_Alarms: false
# Alarm_Type: processing-error
# Probable_Cause: software-program-error
# Service_Affecting: true
# Suppression: false
# Alarm: communication failure detected with the peer over a port.
400.005:
    Type: Alarm
    Description: |-
        Communication failure detected with peer over port <linux-ifname>.
        OR
        Communication failure detected with peer over port <linux-ifname> within the last 30 seconds.
    Entity_Instance_ID: host=<hostname>.network=<mgmt | oam | infra>
    Severity: major
    Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment.
    Maintenance_Action:
    Inhibit_Alarms: false
    Alarm_Type: communication
    Probable_Cause: underlying-resource-unavailable
    Service_Affecting: true
    Suppression: true
# ---------------------------------------------------------------------------
# SM
# ---------------------------------------------------------------------------
# Log: service group state change on a host.
401.001:
    Type: Log
    Description: Service group <group> state change from <state> to <state> on host <host_name>
    Entity_Instance_ID: service_domain=<domain>.service_group=<group>.host=<host_name>
    Severity: critical
    Alarm_Type: processing-error
    Probable_Cause: unspecified-reason
    Service_Affecting: true
# Log: service group loss of redundancy / no active members.
401.002:
    Type: Log
    Description: |-
        Service group <group> loss of redundancy; expected <X> standby member but no standby members available
        or
        Service group <group> loss of redundancy; expected <X> standby member but only <Y> standby member(s) available
        or
        Service group <group> has no active members available; expected <X> active member(s)
        or
        Service group <group> loss of redundancy; expected <X> active member(s) but only <Y> active member(s) available
    Entity_Instance_ID: service_domain=<domain>.service_group=<group>
    Severity: critical
    Alarm_Type: processing-error
    Probable_Cause: unspecified-reason
    Service_Affecting: true
# Log: license key status (expired/invalid, evaluation expiring, valid).
401.003:
    Type: Log
    Description: |-
        License key has expired or is invalid
        or
        Evaluation license key will expire on <date>
        or
        License key is valid
    Entity_Instance_ID: host=<host_name>
    Severity: critical
    Alarm_Type: processing-error
    Probable_Cause: unspecified-reason
    Service_Affecting: true
# Log: peer communication failure or re-establishment over a port.
401.005:
    Type: Log
    Description: |-
        Communication failure detected with peer over port <port> on host <host name>
        or
        Communication failure detected with peer over port <port> on host <host name> within the last <X> seconds
        or
        Communication established with peer over port <port> on host <host name>
    Entity_Instance_ID: host=<host_name>.network=<network>
    Severity: critical
    Alarm_Type: processing-error
    Probable_Cause: unspecified-reason
    Service_Affecting: true
# Log: swact or swact-force on a host.
401.007:
    Type: Log
    Description: Swact or swact-force
    Entity_Instance_ID: host=<host_name>
    Severity: critical
    Alarm_Type: processing-error
    Probable_Cause: unspecified-reason
    Service_Affecting: true
# ---------------------------------------------------------------------------
# VM
# ---------------------------------------------------------------------------
# Alarm: instance failed on a host or failed to schedule.
# The Description holds both message variants, one per line.
700.001:
    Type: Alarm
    Description: |-
        Instance <instance_name> owned by <tenant_name> has failed on host <host_name>
        Instance <instance_name> owned by <tenant_name> has failed to schedule
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Proposed_Repair_Action: The system will attempt recovery; no repair action required
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: processing-error
    Probable_Cause: software-error
    Service_Affecting: true
    Suppression: true
700.002:
Type: Alarm
Description: Instance <instance_name> owned by <tenant_name> is paused on host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Proposed_Repair_Action: Unpause the instance
Maintenance_Action:
Inhibit_Alarms:
Alarm_Type: processing-error
Probable_Cause: procedural-error
Service_Affecting: true
Suppression: true
700.003:
Type: Alarm
Description: Instance <instance_name> owned by <tenant_name> is suspended on host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Proposed_Repair_Action: Resume the instance
Maintenance_Action:
Inhibit_Alarms:
Alarm_Type: processing-error
Probable_Cause: procedural-error
Service_Affecting: true
Suppression: true
700.005:
Type: Alarm
Description: Instance <instance_name> owned by <tenant_name> is rebooting on host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Proposed_Repair_Action: Wait for reboot to complete; if problem persists contact next level of support
Maintenance_Action:
Inhibit_Alarms:
Alarm_Type: processing-error
Probable_Cause: unspecified-reason
Service_Affecting: true
Suppression: true
700.006:
Type: Alarm
Description: Instance <instance_name> owned by <tenant_name> is rebuilding on host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Proposed_Repair_Action: Wait for rebuild to complete; if problem persists contact next level of support
Maintenance_Action:
Inhibit_Alarms:
Alarm_Type: processing-error
Probable_Cause: underlying-resource-unavailable
Service_Affecting: true
Suppression: true
700.007:
Type: Alarm
Description: Instance <instance_name> owned by <tenant_name> is evacuating from host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Proposed_Repair_Action: Wait for evacuate to complete; if problem persists contact next level of support
Maintenance_Action:
Inhibit_Alarms:
Alarm_Type: processing-error
Probable_Cause: underlying-resource-unavailable
Service_Affecting: true
Suppression: true
700.008:
Type: Alarm
Description: Instance <instance_name> owned by <tenant_name> is live migrating from host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: warning
Proposed_Repair_Action: Wait for live migration to complete; if problem persists contact next level of support
Maintenance_Action:
Inhibit_Alarms:
Alarm_Type: processing-error
Probable_Cause: unspecified-reason
Service_Affecting: true
Suppression: true
700.009:
Type: Alarm
Description: Instance <instance_name> owned by <tenant_name> is cold migrating from host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Proposed_Repair_Action: Wait for cold migration to complete; if problem persists contact next level of support
Maintenance_Action:
Inhibit_Alarms:
Alarm_Type: processing-error
Probable_Cause: unspecified-reason
Service_Affecting: true
Suppression: true
700.010:
Type: Alarm
Description: Instance <instance_name> owned by <tenant_name> has been cold-migrated to host <host_name> waiting for confirmation
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Proposed_Repair_Action: Confirm or revert cold-migrate of instance
Maintenance_Action:
Inhibit_Alarms:
Alarm_Type: processing-error
Probable_Cause: unspecified-reason
Service_Affecting: true
Suppression: true
700.011:
Type: Alarm
Description: Instance <instance_name> owned by <tenant_name> is reverting cold migrate to host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Proposed_Repair_Action: "Wait for cold migration revert to complete; if problem persists contact next level of support"
Maintenance_Action:
Inhibit_Alarms:
Alarm_Type: other
Probable_Cause: unspecified-reason
Service_Affecting: true
Suppression: true
700.012:
Type: Alarm
Description: Instance <instance_name> owned by <tenant_name> is resizing on host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Proposed_Repair_Action: Wait for resize to complete; if problem persists contact next level of support
Maintenance_Action:
Inhibit_Alarms:
Alarm_Type: processing-error
Probable_Cause: unspecified-reason
Service_Affecting: true
Suppression: true
# Alarm: instance has been resized and is waiting for the resize to be
# confirmed or reverted. The entity is keyed by tenant and instance UUID.
700.013:
    Type: Alarm
    Description: Instance <instance_name> owned by <tenant_name> has been resized on host <host_name> waiting for confirmation
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Proposed_Repair_Action: Confirm or revert resize of instance
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: processing-error
    Probable_Cause: unspecified-reason
    Service_Affecting: true
    Suppression: true
# Alarm: instance is reverting a resize; keyed by tenant and instance UUID,
# with Alarm_Type "other".
700.014:
Type: Alarm
Description: Instance <instance_name> owned by <tenant_name> is reverting resize on host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Proposed_Repair_Action: "Wait for resize revert to complete; if problem persists contact next level of support"
Maintenance_Action:
Inhibit_Alarms:
Alarm_Type: other
Probable_Cause: unspecified-reason
Service_Affecting: true
Suppression: true
# Alarm: Guest Heartbeat has not been established for an instance.
# Severity is major and Alarm_Type is communication.
700.015:
    Type: Alarm
    Description: Guest Heartbeat not established for instance <instance_name> owned by <tenant_name> on host <host_name>
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: major
    Proposed_Repair_Action: "Verify that the instance is running the Guest-Client daemon, or disable Guest Heartbeat for the instance if no longer needed, otherwise contact next level of support"
    Maintenance_Action:
    Inhibit_Alarms:
    Alarm_Type: communication
    Probable_Cause: procedural-error
    Service_Affecting: true
    Suppression: true
# Alarm: Multi-Node Recovery Mode. Raised against subsystem=vim rather than
# a tenant/instance pair; Severity is major.
700.016:
Type: Alarm
Description: Multi-Node Recovery Mode
Entity_Instance_ID: subsystem=vim
Severity: major
Proposed_Repair_Action: "Wait for the system to exit out of this mode"
Maintenance_Action:
Inhibit_Alarms:
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: true
Suppression: true
# Log: instance is enabled on a host. Log records carry no
# Proposed_Repair_Action, Maintenance_Action, Inhibit_Alarms or Suppression.
700.101:
Type: Log
Description: Instance <instance_name> is enabled on host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
# Log: instance has failed, or has failed to schedule.
# The two alternative messages are kept on separate lines with a literal
# block scalar; as a multi-line plain scalar they would be folded into a
# single run-on string.
700.102:
    Type: Log
    Description: |-
        Instance <instance_name> owned by <tenant_name> has failed[, reason = <reason_text>]
        Instance <instance_name> owned by <tenant_name> has failed to schedule[, reason = <reason_text>]
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: false
# Instance create logs 700.103 - 700.107: issued, creating, rejected,
# cancelled, failed. All are Type Log keyed by tenant and instance UUID.
700.103:
Type: Log
Description: Create issued <by <tenant_name>|by the system> against <instance_name> owned by <tenant_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.104:
Type: Log
Description: Creating instance <instance_name> owned by <tenant_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.105:
Type: Log
Description: "Create rejected for instance <instance_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.106:
Type: Log
Description: "Create cancelled for instance <instance_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.107:
Type: Log
Description: "Create failed for instance <instance_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
# Log: instance has been created.
700.108:
    Type: Log
    Description: Instance <instance_name> owned by <tenant_name> has been created
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: false
# Log: delete issued against an instance, either by a tenant or by the system.
# The Description is quoted because it contains "[, reason = ...]".
700.109:
    Type: Log
    Description: "Delete issued <by <tenant_name>|by the system> against instance <instance_name> owned by <tenant_name> on host <host_name>[, reason = <reason_text>]"
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: false
# Log: instance is being deleted.
700.110:
    Type: Log
    Description: Deleting instance <instance_name> owned by <tenant_name>
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: false
# Instance delete logs 700.111 - 700.114: rejected, cancelled, failed,
# deleted. All are Type Log keyed by tenant and instance UUID.
700.111:
Type: Log
Description: "Delete rejected for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.112:
Type: Log
Description: "Delete cancelled for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.113:
Type: Log
Description: "Delete failed for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.114:
Type: Log
Description: Deleted instance <instance_name> owned by <tenant_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
# Instance pause logs 700.115 - 700.120: issued, inprogress, rejected,
# cancelled, failed, complete. All are Type Log keyed by tenant and
# instance UUID.
700.115:
Type: Log
Description: "Pause issued <by <tenant_name>|by the system> against instance <instance_name> owned by <tenant_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.116:
Type: Log
Description: Pause inprogress for instance <instance_name> on host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.117:
Type: Log
Description: "Pause rejected for instance <instance_name> enabled on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.118:
Type: Log
Description: "Pause cancelled for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.119:
Type: Log
Description: "Pause failed for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.120:
Type: Log
Description: Pause complete for instance <instance_name> now paused on host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
# Instance unpause logs 700.121 - 700.126: issued, inprogress, rejected,
# cancelled, failed, complete. All are Type Log keyed by tenant and
# instance UUID.
700.121:
Type: Log
Description: "Unpause issued <by <tenant_name>|by the system> against instance <instance_name> owned by <tenant_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.122:
Type: Log
Description: Unpause inprogress for instance <instance_name> on host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.123:
Type: Log
Description: "Unpause rejected for instance <instance_name> paused on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.124:
Type: Log
Description: "Unpause cancelled for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.125:
Type: Log
Description: "Unpause failed for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.126:
Type: Log
Description: Unpause complete for instance <instance_name> now enabled on host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
# Instance suspend logs 700.127 - 700.132: issued, inprogress, rejected,
# cancelled, failed, complete. All are Type Log keyed by tenant and
# instance UUID.
700.127:
Type: Log
Description: "Suspend issued <by <tenant_name>|by the system> against instance <instance_name> owned by <tenant_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.128:
Type: Log
Description: Suspend inprogress for instance <instance_name> on host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.129:
Type: Log
Description: "Suspend rejected for instance <instance_name> enabled on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.130:
Type: Log
Description: "Suspend cancelled for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.131:
Type: Log
Description: "Suspend failed for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.132:
Type: Log
Description: Suspend complete for instance <instance_name> now suspended on host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
# Instance resume logs 700.133 - 700.138: issued, inprogress, rejected,
# cancelled, failed, complete. All are Type Log keyed by tenant and
# instance UUID.
700.133:
Type: Log
Description: "Resume issued <by <tenant_name>|by the system> against instance <instance_name> owned by <tenant_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.134:
Type: Log
Description: Resume inprogress for instance <instance_name> on host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.135:
Type: Log
Description: "Resume rejected for instance <instance_name> suspended on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.136:
Type: Log
Description: "Resume cancelled for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.137:
Type: Log
Description: "Resume failed for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.138:
Type: Log
Description: Resume complete for instance <instance_name> now enabled on host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
# Instance start logs 700.139 - 700.144: issued, inprogress, rejected,
# cancelled, failed, complete. All are Type Log keyed by tenant and
# instance UUID.
700.139:
Type: Log
Description: "Start issued <by <tenant_name>|by the system> against instance <instance_name> owned by <tenant_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.140:
Type: Log
Description: Start inprogress for instance <instance_name> on host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.141:
Type: Log
Description: "Start rejected for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.142:
Type: Log
Description: "Start cancelled for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.143:
Type: Log
Description: "Start failed for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.144:
Type: Log
Description: Start complete for instance <instance_name> now enabled on host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
# Instance stop logs 700.145 - 700.150: issued, inprogress, rejected,
# cancelled, failed, complete. The "issued" message also allows
# "by the instance" as the originator.
700.145:
Type: Log
Description: "Stop issued <by <tenant_name>|by the system|by the instance> against instance <instance_name> owned by <tenant_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.146:
Type: Log
Description: Stop inprogress for instance <instance_name> on host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.147:
Type: Log
Description: "Stop rejected for instance <instance_name> enabled on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.148:
Type: Log
Description: "Stop cancelled for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.149:
Type: Log
Description: "Stop failed for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.150:
Type: Log
Description: Stop complete for instance <instance_name> now disabled on host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
# Instance live-migrate logs 700.151 - 700.156: issued, inprogress,
# rejected, cancelled, failed, complete. All are Type Log keyed by tenant
# and instance UUID.
700.151:
Type: Log
Description: "Live-Migrate issued <by <tenant_name>|by the system> against instance <instance_name> owned by <tenant_name> from host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.152:
Type: Log
Description: Live-Migrate inprogress for instance <instance_name> from host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.153:
Type: Log
Description: "Live-Migrate rejected for instance <instance_name> now on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.154:
Type: Log
Description: "Live-Migrate cancelled for instance <instance_name> now on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.155:
Type: Log
Description: "Live-Migrate failed for instance <instance_name> now on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.156:
Type: Log
Description: Live-Migrate complete for instance <instance_name> now enabled on host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
# Instance cold-migrate logs 700.157 - 700.162: issued, inprogress,
# rejected, cancelled, failed, complete. All are Type Log keyed by tenant
# and instance UUID.
700.157:
Type: Log
Description: "Cold-Migrate issued <by <tenant_name>|by the system> against instance <instance_name> owned by <tenant_name> from host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.158:
Type: Log
Description: Cold-Migrate inprogress for instance <instance_name> from host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.159:
Type: Log
Description: "Cold-Migrate rejected for instance <instance_name> now on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.160:
Type: Log
Description: "Cold-Migrate cancelled for instance <instance_name> now on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.161:
Type: Log
Description: "Cold-Migrate failed for instance <instance_name> now on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.162:
Type: Log
Description: Cold-Migrate complete for instance <instance_name> now enabled on host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
# Instance cold-migrate-confirm logs 700.163 - 700.168: issued, inprogress,
# rejected, cancelled, failed, complete. All are Type Log keyed by tenant
# and instance UUID.
700.163:
Type: Log
Description: "Cold-Migrate-Confirm issued <by <tenant_name>|by the system> against instance <instance_name> owned by <tenant_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.164:
Type: Log
Description: Cold-Migrate-Confirm inprogress for instance <instance_name> on host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.165:
Type: Log
Description: "Cold-Migrate-Confirm rejected for instance <instance_name> now enabled on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.166:
Type: Log
Description: "Cold-Migrate-Confirm cancelled for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.167:
Type: Log
Description: "Cold-Migrate-Confirm failed for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.168:
Type: Log
Description: Cold-Migrate-Confirm complete for instance <instance_name> enabled on host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
# Instance cold-migrate-revert logs 700.169 - 700.174: issued, inprogress,
# rejected, cancelled, failed, complete. All are Type Log keyed by tenant
# and instance UUID.
700.169:
Type: Log
Description: "Cold-Migrate-Revert issued <by <tenant_name>|by the system> against instance <instance_name> owned by <tenant_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.170:
Type: Log
Description: Cold-Migrate-Revert inprogress for instance <instance_name> from host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.171:
Type: Log
Description: "Cold-Migrate-Revert rejected for instance <instance_name> now on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.172:
Type: Log
Description: "Cold-Migrate-Revert cancelled for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.173:
Type: Log
Description: "Cold-Migrate-Revert failed for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.174:
Type: Log
Description: Cold-Migrate-Revert complete for instance <instance_name> now enabled on host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
# Instance evacuate logs 700.175 - 700.180: issued, evacuating, rejected,
# cancelled, failed, complete. All are Type Log keyed by tenant and
# instance UUID.
700.175:
Type: Log
Description: "Evacuate issued <by <tenant_name>|by the system> against instance <instance_name> owned by <tenant_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.176:
Type: Log
Description: Evacuating instance <instance_name> owned by <tenant_name> from host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.177:
Type: Log
Description: "Evacuate rejected for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.178:
Type: Log
Description: "Evacuate cancelled for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.179:
Type: Log
Description: "Evacuate failed for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.180:
Type: Log
Description: Evacuate complete for instance <instance_name> now enabled on host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
# Log: soft or hard reboot issued against an instance by a tenant, the
# system, or the instance itself.
# Written as one double-quoted line: inside a literal block scalar (|-) the
# surrounding double quotes and the line break would be part of the message.
700.181:
    Type: Log
    Description: "Reboot <(soft-reboot)|(hard-reboot)> issued <by <tenant_name>|by the system|by the instance> against instance <instance_name> owned by <tenant_name> on host <host_name>[, reason = <reason_text>]"
    Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
    Severity: critical
    Alarm_Type: equipment
    Probable_Cause: unspecified-reason
    Service_Affecting: false
# Instance reboot logs 700.182 - 700.186: inprogress, rejected, cancelled,
# failed, complete. All are Type Log keyed by tenant and instance UUID.
700.182:
Type: Log
Description: Reboot inprogress for instance <instance_name> on host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.183:
Type: Log
Description: "Reboot rejected for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.184:
Type: Log
Description: "Reboot cancelled for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.185:
Type: Log
Description: "Reboot failed for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
700.186:
Type: Log
Description: Reboot complete for instance <instance_name> now enabled on host <host_name>
Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
Severity: critical
Alarm_Type: equipment
Probable_Cause: unspecified-reason
Service_Affecting: false
# Log: rebuild issued against an instance.
700.187:
  Type: Log
  Description: "Rebuild issued <by <tenant_name>|by the system> against instance <instance_name> using image <image_name> on host <host_name>[, reason = <reason_text>]"
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Log: instance rebuild in progress.
700.188:
  Type: Log
  Description: Rebuild inprogress for instance <instance_name> on host <host_name>
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Log: instance rebuild rejected.
700.189:
  Type: Log
  Description: "Rebuild rejected for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Log: instance rebuild cancelled.
700.190:
  Type: Log
  Description: "Rebuild cancelled for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Log: instance rebuild failed.
700.191:
  Type: Log
  Description: "Rebuild failed for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Log: instance rebuild complete.
700.192:
  Type: Log
  Description: Rebuild complete for instance <instance_name> now enabled on host <host_name>
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Log: resize issued against an instance.
700.193:
  Type: Log
  Description: "Resize issued <by <tenant_name>|by the system> against instance <instance_name> owned by <tenant_name> on host <host_name>[, reason = <reason_text>]"
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Log: instance resize in progress.
700.194:
  Type: Log
  Description: Resize inprogress for instance <instance_name> on host <host_name>
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Log: instance resize rejected.
700.195:
  Type: Log
  Description: "Resize rejected for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Log: instance resize cancelled.
700.196:
  Type: Log
  Description: "Resize cancelled for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Log: instance resize failed.
700.197:
  Type: Log
  Description: "Resize failed for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Log: instance resize complete, waiting for confirmation.
700.198:
  Type: Log
  Description: Resize complete for instance <instance_name> enabled on host <host_name> waiting for confirmation
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Log: resize-confirm issued against an instance.
700.199:
  Type: Log
  Description: "Resize-Confirm issued <by <tenant_name>|by the system> against instance <instance_name> owned by <tenant_name> on host <host_name>[, reason = <reason_text>]"
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Log: instance resize-confirm in progress.
700.200:
  Type: Log
  Description: Resize-Confirm inprogress for instance <instance_name> on host <host_name>
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Log: instance resize-confirm rejected.
700.201:
  Type: Log
  Description: "Resize-Confirm rejected for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Log: instance resize-confirm cancelled.
700.202:
  Type: Log
  Description: "Resize-Confirm cancelled for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Log: instance resize-confirm failed.
700.203:
  Type: Log
  Description: "Resize-Confirm failed for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Log: instance resize-confirm complete.
700.204:
  Type: Log
  Description: Resize-Confirm complete for instance <instance_name> enabled on host <host_name>
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Log: resize-revert issued against an instance.
700.205:
  Type: Log
  Description: "Resize-Revert issued <by <tenant_name>|by the system> against instance <instance_name> owned by <tenant_name> on host <host_name>[, reason = <reason_text>]"
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Log: instance resize-revert in progress.
700.206:
  Type: Log
  Description: Resize-Revert inprogress for instance <instance_name> on host <host_name>
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Log: instance resize-revert rejected.
700.207:
  Type: Log
  Description: "Resize-Revert rejected for instance <instance_name> owned by <tenant_name> on host <host_name>[, reason = <reason_text>]"
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Log: instance resize-revert cancelled.
700.208:
  Type: Log
  Description: "Resize-Revert cancelled for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Log: instance resize-revert failed.
700.209:
  Type: Log
  Description: "Resize-Revert failed for instance <instance_name> on host <host_name>[, reason = <reason_text>]"
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Log: instance resize-revert complete.
700.210:
  Type: Log
  Description: Resize-Revert complete for instance <instance_name> enabled on host <host_name>
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Log: guest heartbeat established for an instance.
700.211:
  Type: Log
  Description: Guest Heartbeat established for instance <instance_name> on host <host_name>
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: major
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Log: guest heartbeat disconnected for an instance.
700.212:
  Type: Log
  Description: Guest Heartbeat disconnected for instance <instance_name> on host <host_name>
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: major
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Log: guest heartbeat failed for an instance.
700.213:
  Type: Log
  Description: "Guest Heartbeat failed for instance <instance_name>[, reason = <reason_text>]"
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Log: instance renamed.
700.214:
  Type: Log
  Description: Instance <instance_name> has been renamed to <new_instance_name> owned by <tenant_name> on host <host_name>
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Log: guest health check failed for an instance.
700.215:
  Type: Log
  Description: "Guest Health Check failed for instance <instance_name>[, reason = <reason_text>]"
  Entity_Instance_ID: tenant=<tenant-uuid>.instance=<instance-uuid>
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Log: entered Multi-Node Recovery Mode (entity is the vim subsystem).
700.216:
  Type: Log
  Description: "Entered Multi-Node Recovery Mode"
  Entity_Instance_ID: subsystem=vim
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Log: exited Multi-Node Recovery Mode (entity is the vim subsystem).
700.217:
  Type: Log
  Description: "Exited Multi-Node Recovery Mode"
  Entity_Instance_ID: subsystem=vim
  Severity: critical
  Alarm_Type: equipment
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# ---------------------------------------------------------------------------
# STORAGE
# ---------------------------------------------------------------------------
# Alarm: storage alarm condition on the cluster, raised as critical or major.
800.001:
  Type: Alarm
  # Literal block scalar: the line break after the heading is kept in the value.
  Description: |-
    Storage Alarm Condition:
    1 mons down, quorum 1,2 controller-1,storage-0
  Entity_Instance_ID: cluster=<dist-fs-uuid>
  Severity: [critical, major]
  Proposed_Repair_Action: "If problem persists, contact next level of support."
  Maintenance_Action:
  Inhibit_Alarms:
  Alarm_Type: equipment
  Probable_Cause: equipment-malfunction
  # Dictionary indexed by severity; one entry per value listed in Severity.
  Service_Affecting:
    critical: true
    major: false
  Suppression: false
# Alarm: potential data loss, no available OSDs in a storage replication group.
800.010:
  Type: Alarm
  Description: |-
    Potential data loss. No available OSDs in storage replication group.
  Entity_Instance_ID: cluster=<dist-fs-uuid>.peergroup=<group-x>
  Severity: [critical]
  # Multi-line double-quoted scalar: each line break folds to a single space.
  Proposed_Repair_Action: "Ensure storage hosts from replication group are unlocked and available.
    Check if OSDs of each storage host are up and running.
    If problem persists contact next level of support."
  Maintenance_Action:
  Inhibit_Alarms:
  Alarm_Type: equipment
  Probable_Cause: equipment-malfunction
  # Dictionary indexed by severity.
  Service_Affecting:
    critical: true
  Suppression: false
# Alarm: loss of replication in a storage peer group.
800.011:
  Type: Alarm
  Description: |-
    Loss of replication in peergroup.
  Entity_Instance_ID: cluster=<dist-fs-uuid>.peergroup=<group-x>
  Severity: [major]
  # Multi-line double-quoted scalar: each line break folds to a single space.
  Proposed_Repair_Action: "Ensure storage hosts from replication group are unlocked and available.
    Check if OSDs of each storage host are up and running.
    If problem persists contact next level of support."
  Maintenance_Action:
  Inhibit_Alarms:
  Alarm_Type: equipment
  Probable_Cause: equipment-malfunction
  # Dictionary indexed by severity.
  Service_Affecting:
    major: true
  Suppression: false
# Log: image storage and image upload failures.
# Description, Entity_Instance_ID and Alarm_Type are parallel lists of 11
# items each; item N of every list belongs to the same log instance.
800.002:
  Type: Log
  Description:
    - "Image storage media is full: There is not enough disk space on the image storage media."
    - "Instance <instance name> snapshot failed: There is not enough disk space on the image storage media."
    - "Supplied <attrs> (<supplied>) and <attrs> generated from uploaded image (<actual>) did not match. Setting image status to 'killed'."
    - "Error in store configuration. Adding images to store is disabled."
    - "Forbidden upload attempt: <exception>"
    - "Insufficient permissions on image storage media: <exception>"
    - "Denying attempt to upload image larger than <size> bytes."
    - "Denying attempt to upload image because it exceeds the quota: <exception>"
    - "Received HTTP error while uploading image <image_id>"
    - "Client disconnected before sending all data to backend"
    - "Failed to upload image <image_id>"
  Entity_Instance_ID:
    - "image=<image-uuid>, instance=<instance-uuid>"
    - "tenant=<tenant-uuid>, instance=<instance-uuid>"
    - "image=<image-uuid>, instance=<instance-uuid>"
    - "image=<image-uuid>, instance=<instance-uuid>"
    - "image=<image-uuid>, instance=<instance-uuid>"
    - "image=<image-uuid>, instance=<instance-uuid>"
    - "image=<image-uuid>, instance=<instance-uuid>"
    - "image=<image-uuid>, instance=<instance-uuid>"
    - "image=<image-uuid>, instance=<instance-uuid>"
    - "image=<image-uuid>, instance=<instance-uuid>"
    - "image=<image-uuid>, instance=<instance-uuid>"
  Alarm_Type:
    - physical-violation
    - physical-violation
    - integrity-violation
    - integrity-violation
    - security-service-or-mechanism-violation
    - security-service-or-mechanism-violation
    - security-service-or-mechanism-violation
    - security-service-or-mechanism-violation
    - communication
    - communication
    - operational-violation
  Severity: warning
  Probable_Cause: unspecified-reason
  Service_Affecting: false
# Alarm: ceph cluster size is greater than the sum of the pool quotas.
800.003:
  Type: Alarm
  # Literal block scalar: the line break after the heading is kept in the value.
  Description: |-
    Storage Alarm Condition:
    total ceph cluster size greater than sum of individual pool quotas
  Entity_Instance_ID: cluster=<dist-fs-uuid>
  Severity: minor
  Proposed_Repair_Action: "Update ceph storage pool quotas to use all available cluster space."
  Maintenance_Action:
  Inhibit_Alarms:
  Alarm_Type: operational-violation
  Probable_Cause: configuration-out-of-date
  Service_Affecting: false
  Suppression: false
# ---------------------------------------------------------------------------
# SOFTWARE
# ---------------------------------------------------------------------------
# Alarm: patching operation in progress.
900.001:
  Type: Alarm
  Description: Patching operation in progress.
  Entity_Instance_ID: host=controller
  Severity: minor
  Proposed_Repair_Action: Complete reboots of affected hosts.
  Maintenance_Action:
  Inhibit_Alarms:
  Alarm_Type: environmental
  Probable_Cause: unspecified-reason
  Service_Affecting: false
  Suppression: false
# Alarm: obsolete patch present in the system.
900.002:
  Type: Alarm
  Description: Obsolete patch in system.
  Entity_Instance_ID: host=controller
  Severity: warning
  Proposed_Repair_Action: Remove and delete obsolete patches.
  Maintenance_Action:
  Inhibit_Alarms:
  Alarm_Type: environmental
  Probable_Cause: unspecified-reason
  Service_Affecting: false
  Suppression: false
# Alarm: patch install failure on a host.
900.003:
  Type: Alarm
  Description: Patch host install failure.
  Entity_Instance_ID: host=<hostname>
  Severity: major
  Proposed_Repair_Action: Undo patching operation.
  Maintenance_Action:
  Inhibit_Alarms:
  Alarm_Type: environmental
  Probable_Cause: unspecified-reason
  Service_Affecting: false
  Suppression: false
...