Files
distcloud/distributedcloud/scripts/ipmi_sel_event_monitor.py
Gustavo Herzmann 5ae0ba1407 Optimize IPMI SEL event monitoring
This commit optimizes the SEL event monitoring script used during
subcloud enrollment. The script now runs a single ipmitool sel list -v
command instead of invoking ipmitool sel list followed by multiple
ipmitool sel get calls.

A new mode has been added to the monitoring script that monitors for
all success events, returning the last one, allowing the caller to
detect if multiple events were sent and skip directly to the latest
completed phase. If a failure event is detected, the script exits
immediately.

The IPMI codes used by auto-restore and enrollment have been updated
from OEM-reserved codes to the “System Event” sensor type, which the
BMC never interprets as an error.

Additionally, the script now stops sending the expected_region_name
restore override for factory restores, since the factory-installed
subcloud region name is not expected to match the system controller's
region name.

Test Plan:
01. PASS – Run subcloud enrollment and verify the following:
    - Events sent sequentially with delays are returned individually.
    - Events sent quickly without delays return only the latest event
      captured in the polling interval.
    - A failure event is returned immediately when detected.
    - Success events followed by a failure event return the failure
      event as soon as it appears.
    - If no events are detected, the script exits with a timeout error.
02. PASS – Run subcloud auto-restore and factory restore to verify the
    new codes are used and that both operations complete successfully.
03. PASS – Time the full enroll-init operation on a slow BMC and verify
    it completes faster than the previous implementation.

Story: 2011455
Task: 52858

Change-Id: Ib379e4e4c8308361357914f21fd8d73ab5bafbb4
Signed-off-by: Gustavo Herzmann <gustavo.herzmann@windriver.com>
2025-09-29 17:16:53 -03:00

720 lines
23 KiB
Python

#!/usr/bin/env python3
#
# Copyright (c) 2025 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
"""
Monitors the IPMI System Event Log (SEL) for a specific target event and
associated event data, or retrieves the last event ID.
The script operates in three modes:
- In monitoring mode, it periodically checks the SEL for the desired events.
If any of the events are found, the script returns a return code of 0 and the
matched event in json format. If an event is not found after the maximum
number of checks, a non-zero return code is returned.
- In success/failure monitoring mode, it'll return immediately on any success
event (returning the latest/most recent one found) or failure event. This
allows the playbook to call the script multiple times and get feedback about
which stage completed.
- Alternatively, using --get-last-event makes the script simply return the ID of
the most recent event in the SEL and exit. If the SEL is empty, it returns -1.
For the monitoring modes, it only monitors new events, existing events are
ignored unless --initial-event-id is specified to start monitoring from a
specific point.
"""
import argparse
import base64
import json
import logging
import os
import subprocess
import sys
import time
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union
import netaddr
import yaml
# Configure logging to stderr so stdout stays clean for Ansible JSON output
logging.basicConfig(
stream=sys.stderr,
level=logging.INFO,
format="%(asctime)s %(levelname)s: %(message)s",
)
def hex_to_int(hex_str: str) -> int:
return int(hex_str, 16)
def run_command(cmd: list[str]) -> Optional[str]:
"""Run a shell command and return its output"""
try:
result = subprocess.run(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
check=True,
)
return result.stdout
except subprocess.CalledProcessError as e:
logging.error(f"Command failed: {' '.join(cmd)}")
logging.error(f"stderr: {e.stderr.strip()}")
return None
def exit_script(
success: bool,
message: str,
matched_data: Optional[str] = None,
result_data: Optional[Dict] = None,
) -> None:
if not success:
logging.error(message)
# Output as JSON to stdout for Ansible to parse
result = {"success": success, "message": message}
# Add additional info if an event was matched
if matched_data:
result["matched_data"] = matched_data
# Add extra result data for the success/failure mode
if result_data:
result.update(result_data)
print(json.dumps(result))
sys.exit(0 if success else 1)
class IpmiTool:
def __init__(self, host, user, password):
self.base_cmd = [
"ipmitool",
"-I",
"lanplus",
"-H",
host,
"-U",
user,
"-P",
password,
]
@classmethod
def from_config(cls, path: str) -> "IpmiTool":
"""Instantiates IpmiTool using a BMC config file"""
logging.info(f"Parsing BMC config from {path}")
if not os.path.isfile(path):
exit_script(False, f"BMC config file not found: {path}")
try:
with open(path, "r", encoding="utf-8") as yaml_config:
cfg = yaml.safe_load(yaml_config)
except Exception as ex:
exit_script(
False,
f"Unable to open or parse the BMC configuration file: {path} ({ex})",
)
for key in ("bmc_password", "bmc_address", "bmc_username"):
if key not in cfg:
exit_script(False, f"{key} is missing from the BMC configuration file")
try:
cfg["bmc_password"] = base64.b64decode(cfg["bmc_password"]).decode("utf-8")
except Exception as ex:
exit_script(
False, f"Failed to decode BMC password found in config file ({ex})"
)
# Validate IP address
try:
netaddr.IPAddress(cfg["bmc_address"])
except Exception as ex:
exit_script(
False,
"Invalid bmc_address found in the BMC configuration "
f"file: {cfg['bmc_address']} ({ex})",
)
return cls(cfg["bmc_address"], cfg["bmc_username"], cfg["bmc_password"])
@staticmethod
def get_id_from_line(event_line: str) -> int:
return hex_to_int(event_line.split("|")[0].strip())
@staticmethod
def parse_verbose_sel_output(output: str) -> Dict[int, Dict[str, str]]:
"""Parse the verbose SEL list output and return a dictionary of events
ipmitools sel list -v sample output:
SEL Record ID : 01ae
Record Type : 02
Timestamp : 09/26/2025 21:10:19
Generator ID : 0041
EvM Revision : 04
Sensor Type : System Event
Sensor Number : cc
Event Type : Reserved
Event Direction : Assertion Event
Event Data : cc10ef
Description :
SEL Record ID : 01af
Record Type : 02
Timestamp : 09/26/2025 21:20:51
Generator ID : 0020
EvM Revision : 04
Sensor Type : System ACPI Power State
Sensor Number : aa
Event Type : Sensor-specific Discrete
Event Direction : Assertion Event
Event Data : 00ffff
Description : S0/G0: working
"""
events = {}
current_event = {}
current_event_id = None
for line in output.splitlines():
line = line.strip()
# Skip empty lines and known error messages
if (
not line
or line.startswith("Running")
or line.startswith("Error")
or line.startswith("Invalid")
or line.startswith("Discovered")
):
continue
# If it's a new event line, store the old one and create a new one
if line.startswith("SEL Record ID"):
if current_event_id is not None and current_event:
events[current_event_id] = current_event
current_event_id = hex_to_int(line.split(":")[1].strip())
current_event = {}
continue
if ":" in line:
key, value = line.split(":", 1)
current_event[key.strip()] = value.strip()
# Handle the last event
if current_event_id is not None and current_event:
events[current_event_id] = current_event
return events
def get_all_events_detailed(self) -> Dict[int, Dict[str, str]]:
"""Get all events with detailed information using sel list -v"""
cmd = self.base_cmd + ["sel", "list", "-v"]
output = run_command(cmd)
if not output:
return {}
return self.parse_verbose_sel_output(output)
def get_all_events(self) -> list[str]:
"""Get all events in the SEL"""
cmd = self.base_cmd + ["sel", "list"]
output = run_command(cmd)
if not output:
return []
return output.strip().splitlines()
def get_last_event_id(self) -> Optional[int]:
"""Get the ID of the most recent event in the SEL"""
lines = self.get_all_events()
if not lines:
return None
return self.get_id_from_line(lines[-1])
def get_last_event_only(ipmi_tool: IpmiTool) -> None:
"""Get the last event ID and exit"""
last_event_id = ipmi_tool.get_last_event_id()
if last_event_id is not None:
message = f"Last event ID: {last_event_id}"
result = {"success": True, "message": message, "last_event_id": last_event_id}
else:
# SEL is empty, return -1 as the last event ID
message = "SEL is empty, returning -1 as last event ID"
result = {"success": True, "message": message, "last_event_id": -1}
print(json.dumps(result))
sys.exit(0)
def get_success_failure_events(
matching_events: Dict[int, Dict[str, str]],
failure_codes_lower: List[str],
success_codes_lower: List[str],
elapsed_time: float,
) -> Optional[Tuple[bool, str, Dict]]:
"""Process events in success/failure monitoring mode.
Returns immediately on first failure event, or returns the latest success event
after processing all events. Returns None if no success or failure events found.
"""
success_events = []
# Process events in ascending order by event ID
for event_id in sorted(matching_events.keys()):
event_details = matching_events[event_id]
event_data = event_details.get("Event Data", "")
if not event_data:
logging.warning(f"Event {event_id} has no Event Data field")
continue
event_data_lower = event_data.lower()
logging.info(f"Processing event ID {event_id}, data: {event_data}")
if event_data_lower in failure_codes_lower:
failure_event = {
"event_id": event_id,
"data": event_data,
"timestamp": event_details.get("Timestamp", ""),
"sensor_type": event_details.get("Sensor Type", ""),
"sensor_number": event_details.get("Sensor Number", ""),
}
message = (
f"FAILURE event detected! Event ID: {event_id}, "
f"Event Data: {event_data}"
)
failure_result = {
"failure_detected": True,
"failure_event": failure_event,
"elapsed_time": elapsed_time,
}
logging.error(message)
# Return immediately if we found any failure event
return False, message, failure_result
elif event_data_lower in success_codes_lower:
success_events.append(
{
"event_id": event_id,
"data": event_data,
"timestamp": event_details.get("Timestamp", ""),
"sensor_type": event_details.get("Sensor Type", ""),
"sensor_number": event_details.get("Sensor Number", ""),
}
)
logging.info(
f"SUCCESS event found! Event ID: {event_id}, "
f"Event Data: {event_data}"
)
# Return the latest detected success event
if success_events:
latest_success_event = success_events[-1]
message = (
"SUCCESS event detected! "
f"Event ID: {latest_success_event['event_id']}, "
f"Event Data: {latest_success_event['data']}"
)
success_result = {
"success_event_detected": True,
"success_event": latest_success_event,
"elapsed_time": elapsed_time,
}
return True, message, success_result
return None
def get_first_matching_event(
matching_events: Dict[int, Dict[str, str]],
target_codes_lower: List[str],
) -> Optional[Tuple[bool, str, str]]:
"""Process events in standard monitoring mode.
Returns the first matching event if found, returns None otherwise.
"""
for event_id, event_details in matching_events.items():
event_data = event_details.get("Event Data", "")
if not event_data:
logging.warning(f"Event {event_id} has no Event Data field")
continue
event_data_lower = event_data.lower()
logging.info(f"Processing event ID {event_id}, data: {event_data}")
if event_data_lower in target_codes_lower:
message = (
f"Target event found! Event ID: {event_id}, "
f"Event Data: {event_data}"
)
logging.info(message)
return True, message, event_data
logging.debug(
f"Event {event_id} data '{event_data}' doesn't match target codes"
)
return None
def monitor_events(
ipmi_tool: IpmiTool,
sensor_type_filter: Optional[str],
sensor_number_filter: Optional[str],
target_codes: List[str],
interval: float,
timeout: int,
initial_event_id: Optional[int] = None,
success_codes: Optional[List[str]] = None,
failure_codes: Optional[List[str]] = None,
) -> Union[Tuple[bool, str, Optional[str]], Tuple[bool, str, Dict]]:
"""Monitor IPMI SEL for target events
Monitoring mode (success_codes=None, failure_codes=None):
- Returns first matching event from target_codes
- Returns: (success, message, matched_data)
Success/failure monitoring mode (success_codes and failure_codes provided):
- Exits immediately on any failure_codes
- Exits immediately on any success_codes (returns latest one found)
- Returns: (success, message, result_dict)
"""
success_failure_mode = success_codes is not None and failure_codes is not None
if initial_event_id is not None:
last_event_id = initial_event_id
logging.info(f"Using provided initial event ID: {last_event_id}")
else:
last_event_id = ipmi_tool.get_last_event_id()
if last_event_id is None:
message = "Failed to get initial event ID, SEL might be empty"
logging.warning(message)
last_event_id = -1
logging.info(f"Starting monitoring from event ID: {last_event_id}")
target_codes_lower = [code.lower() for code in target_codes]
if success_failure_mode:
success_codes_lower = [code.lower() for code in success_codes]
failure_codes_lower = [code.lower() for code in failure_codes]
logging.info(
f"Success/failure mode: Success codes (return on any): {success_codes}"
)
logging.info(
f"Success/failure mode: Failure codes (exit on any): {failure_codes}"
)
else:
logging.info(f"Monitoring mode: Target codes: {target_codes}")
filter_info = []
if sensor_type_filter:
filter_info.append(f"sensor type: '{sensor_type_filter}'")
if sensor_number_filter:
filter_info.append(f"sensor number: '{sensor_number_filter}'")
if filter_info:
logging.info(f"Applying sensor filters: {', '.join(filter_info)}")
last_event_id_int = last_event_id
start_time = time.monotonic()
next_check_time = start_time
check_num = 0
while time.monotonic() - start_time < timeout:
check_num += 1
current_time = time.monotonic()
if current_time < next_check_time:
sleep_time = next_check_time - current_time
logging.debug(f"Waiting {sleep_time:.2f}s until next check")
time.sleep(sleep_time)
next_check_time += interval
elapsed_time = time.monotonic() - start_time
remaining_time = timeout - elapsed_time
logging.info(
f"Check {check_num}: Checking for new events... "
f"(elapsed: {elapsed_time:.1f}s, remaining: {remaining_time:.1f}s)"
)
check_start_time = time.monotonic()
all_events_detailed = ipmi_tool.get_all_events_detailed()
if not all_events_detailed:
logging.warning("Failed to get events or SEL is empty")
continue
current_event_id_int = max(all_events_detailed.keys())
logging.debug(
f"Current last event ID: {current_event_id_int}, "
f"previous: {last_event_id_int}"
)
if current_event_id_int == last_event_id_int:
check_duration = time.monotonic() - check_start_time
logging.debug(f"No new events found (check took {check_duration:.2f}s)")
continue
if current_event_id_int < last_event_id_int:
logging.info(
"Current last event ID is smaller than the previous ID, "
"assuming SEL was cleared..."
)
last_event_id_int = 0
new_events = {
event_id: details
for event_id, details in all_events_detailed.items()
if event_id > last_event_id_int
}
logging.info(f"Found {len(new_events)} new events")
# Filter events based on sensor type and number
matching_events = {}
for event_id, event_details in new_events.items():
sensor_type = event_details.get("Sensor Type", "")
sensor_number = event_details.get("Sensor Number", "")
if sensor_type_filter and sensor_type_filter.lower() != sensor_type.lower():
continue
if (
sensor_number_filter
and sensor_number_filter.lower() != sensor_number.lower()
):
continue
matching_events[event_id] = event_details
logging.info(f"Found {len(matching_events)} events matching sensor filters")
result = None
if success_failure_mode:
result = get_success_failure_events(
matching_events, failure_codes_lower, success_codes_lower, elapsed_time
)
else:
result = get_first_matching_event(matching_events, target_codes_lower)
# If we found a matching event during this poll interval, return immediately
if result is not None:
return result
last_event_id_int = current_event_id_int
check_duration = time.monotonic() - check_start_time
logging.debug(f"Check completed in {check_duration:.2f}s")
# Timeout reached
elapsed_time = time.monotonic() - start_time
if success_failure_mode:
message = (
f"Monitoring timed out after {elapsed_time:.1f}s without "
"detecting any success or failure events"
)
logging.warning(message)
timeout_result = {
"timeout_reached": True,
"elapsed_time": elapsed_time,
}
return False, message, timeout_result
message = f"Monitoring timed out after {elapsed_time:.1f} seconds"
logging.warning(message)
return False, message, None
def main():
parser = argparse.ArgumentParser(
description=(
"Monitor new IPMI SEL entries for success/failure events or get "
"last event ID."
),
epilog="""\
Examples:
1) Query mode: get the last SEL event id (or -1 if empty) and exit.
ipmi_sel_event_monitor.py \\
--config-file rvmc-config.yaml \\
--get-last-event
2) Monitoring mode: wait for first matching event and return immediately
ipmi_sel_event_monitor.py \\
--config-file rvmc-config.yaml \\
--sensor-type "System Event" \\
--sensor-number "cc" \\
--data-values "ffffe6,ffffe7" \\
--interval 15 \\
--timeout 300
3) Success/failure monitoring mode: return immediately on ANY success event
(latest one) or failure event.
ipmi_sel_event_monitor.py \\
--config-file rvmc-config.yaml \\
--sensor-type "System Event" \\
--sensor-number "cc" \\
--success-codes "ffffe0,ffffe9,ffffe2,ffffea,ffffe4,ffffe6" \\
--failure-codes "ffffe1,ffffec,ffffed,ffffee,ffffe3,fffff0" \\
--interval 15 \\
--timeout 300
""",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
# Common args
parser.add_argument(
"--config-file",
required=True,
help="Path to BMC configuration file containing host, username, and password",
)
parser.add_argument(
"--sensor-type",
help="Filter by sensor type (e.g., 'Unknown', 'System Event')",
)
parser.add_argument(
"--sensor-number",
help="Filter by sensor number (e.g., '01', 'ff')",
)
parser.add_argument(
"--initial-event-id",
type=int,
help=(
"Initial event ID to start monitoring from "
"(only monitor events after this ID)"
),
)
parser.add_argument(
"--interval", type=int, default=30, help="Check interval in seconds"
)
parser.add_argument("--timeout", type=int, default=3600, help="Timeout in seconds")
# Monitoring mode
parser.add_argument(
"--data-values",
help="Comma-separated list of event data values to match",
)
# Failure/success mode
parser.add_argument(
"--success-codes",
help=(
"Comma-separated list of success event codes "
"(return immediately on ANY of these)"
),
)
parser.add_argument(
"--failure-codes",
help=(
"Comma-separated list of failure event codes "
"(exit immediately on ANY of these)"
),
)
# Query mode
parser.add_argument(
"--get-last-event",
action="store_true",
help="Get the last event ID and exit (ignores other monitoring parameters)",
)
args = parser.parse_args()
if args.get_last_event or args.data_values:
# For query or monitoring mode, no extra validation is necessary
pass
elif args.success_codes or args.failure_codes:
if not args.success_codes:
parser.error("--success-codes is required when using success/failure mode")
if not args.failure_codes:
parser.error("--failure-codes is required when using success/failure mode")
else:
parser.error(
"Must specify either --get-last-event, --data-values, or both "
"--success-codes and --failure-codes"
)
try:
ipmi_tool = IpmiTool.from_config(args.config_file)
if args.get_last_event:
get_last_event_only(ipmi_tool)
elif args.success_codes and args.failure_codes:
success_codes = [code.strip() for code in args.success_codes.split(",")]
failure_codes = [code.strip() for code in args.failure_codes.split(",")]
all_target_codes = success_codes + failure_codes
success, message, result_data = monitor_events(
ipmi_tool,
args.sensor_type,
args.sensor_number,
all_target_codes,
args.interval,
args.timeout,
args.initial_event_id,
success_codes,
failure_codes,
)
exit_script(success, message, None, result_data)
else:
event_data_values = [d.strip() for d in args.data_values.split(",")]
success, message, matched_data = monitor_events(
ipmi_tool,
args.sensor_type,
args.sensor_number,
event_data_values,
args.interval,
args.timeout,
args.initial_event_id,
)
exit_script(success, message, matched_data)
except Exception as e:
exit_script(
False,
f"Unexpected exception while monitoring IPMI SEL for target events: {e}",
)
if __name__ == "__main__":
main()