From d1c6b39db881376191ee3eb8e9a73234a29a8c9f Mon Sep 17 00:00:00 2001 From: Dario Oliveira Date: Wed, 12 Nov 2025 12:49:44 -0300 Subject: [PATCH] Improve test_reinstall_standby_host from weekly regression Enhance the test by adding host health checks, retrieving alarm IDs, and verifying new alarms. Increased the timeout to 500s to ensure host availability validation after the reinstall process starts. Patch 2 - Increase is_host_unlocked timeout. Patch 3 - Moved the functions alarms_id and get_alarms to alarm_list_output - Refactored check_new_alarms: renamed it to is_new_alarm_id_since and moved it to alarm_list_output - Improved the health validation of hosts - Increased the timeout passed to validate_equals_with_retry and the default timeout of wait_for_host_unlocked Change-Id: I4aa5586f5957eded45f975912e33e742c84096a9 Signed-off-by: Dario Oliveira --- .../alarms/alarm_list_keywords.py | 85 +++++++++++-------- .../alarms/objects/alarm_list_output.py | 55 +++++++++--- .../cloud_platform/health/health_keywords.py | 11 +++ .../system/host/system_host_lock_keywords.py | 2 +- .../host/system_host_reinstall_keywords.py | 2 +- .../storage/test_ceph_robustness.py | 39 ++++++--- 6 files changed, 133 insertions(+), 61 deletions(-) diff --git a/keywords/cloud_platform/fault_management/alarms/alarm_list_keywords.py b/keywords/cloud_platform/fault_management/alarms/alarm_list_keywords.py index c79208d8..99bf2c57 100644 --- a/keywords/cloud_platform/fault_management/alarms/alarm_list_keywords.py +++ b/keywords/cloud_platform/fault_management/alarms/alarm_list_keywords.py @@ -2,6 +2,7 @@ import re import time from framework.logging.automation_logger import get_logger +from framework.ssh.ssh_connection import SSHConnection from keywords.base_keyword import BaseKeyword from keywords.cloud_platform.command_wrappers import source_openrc from keywords.cloud_platform.fault_management.alarms.objects.alarm_list_object import AlarmListObject @@ -13,23 +14,36 @@ class AlarmListKeywords(BaseKeyword): Class for
alarm list keywords """ - def __init__(self, ssh_connection): + def __init__(self, ssh_connection: SSHConnection) -> None: """ - Constructor + Constructor. + Args: - ssh_connection: + ssh_connection (SSHConnection): Active SSH connection used for remote operations. """ self._ssh_connection = ssh_connection self._check_interval_in_seconds = 3 self._timeout_in_seconds = 600 - def alarm_list(self) -> [AlarmListObject]: + def get_alarm_list(self) -> AlarmListOutput: """ - Keyword to get all alarms - Args: + Keyword to get all alarms. - Returns: the list of alarms + Returns: + AlarmListOutput: List of alarm objects retrieved from the system. + """ + output = self._ssh_connection.send(source_openrc("fm alarm-list --nowrap")) + self.validate_success_return_code(self._ssh_connection) + alarms_output = AlarmListOutput(output) + return alarms_output + + def alarm_list(self) -> AlarmListObject: + """ + Keyword to get all alarms. + + Returns: + AlarmListObject: List of alarm objects retrieved from the system. """ output = self._ssh_connection.send(source_openrc("fm alarm-list --nowrap")) self.validate_success_return_code(self._ssh_connection) @@ -37,20 +51,21 @@ class AlarmListKeywords(BaseKeyword): return alarms.get_alarms() - def wait_for_all_alarms_cleared(self): + def wait_for_all_alarms_cleared(self) -> None: """ + Wait for all alarms to be cleared. + This method waits for all alarms to be cleared in this SSH connection within the period defined by - 'get_timeout_in_seconds()'. Otherwise, this method raises TimeoutError exception. + get_timeout_in_seconds(). Otherwise, this method raises TimeoutError exception. Notes: - The alarms in this SSH connection are checked every 'get_check_interval_in_seconds()' seconds. + The alarms in this SSH connection are checked every get_check_interval_in_seconds() seconds. 
- Returns: - None + Returns: None Raises: - TimeoutError: if some alarm can not be cleared within a period defined by - 'get_timeout_in_seconds()' seconds; False otherwise. + TimeoutError: if some alarm can not be cleared within a period defined by + the `get_timeout_in_seconds()` seconds; False otherwise. """ # Retrieves the current alarms on this SSH connection @@ -71,8 +86,10 @@ class AlarmListKeywords(BaseKeyword): alarm_ids = ", ".join([alarm.get_alarm_id() for alarm in alarms]) raise TimeoutError(f"The alarms with the following IDs: {alarm_ids} could not be cleared within {self.get_timeout_in_seconds()} seconds.") - def wait_for_alarms_cleared(self, alarms: list[AlarmListObject]): + def wait_for_alarms_cleared(self, alarms: list[AlarmListObject]) -> None: """ + Wait for alarms be cleared + This method waits for the alarms defined in 'alarms' to be cleared in this SSH connection within the period defined by 'get_timeout_in_seconds()'. Otherwise, a TimeoutError exception is raised. @@ -80,16 +97,14 @@ class AlarmListKeywords(BaseKeyword): The alarms in this SSH connection are checked every 'get_check_interval_in_seconds()' seconds. Args: - alarms (list[AlarmListObject]): The list of alarms to be checked to see if they have been cleared in this - SSH connection. + alarms (list[AlarmListObject]): The list of alarms to be checked to see if they have been cleared + in this SSH connection. - Returns: - None + Returns: None Raises: TimeoutError: if some alarm can not be cleared within a period defined by - 'get_timeout_in_seconds()' seconds; False otherwise. - + the `get_timeout_in_seconds()` seconds; False otherwise. 
""" current_alarms = self.alarm_list() alarm_ids = ", ".join([alarm.get_alarm_id() for alarm in alarms]) @@ -118,6 +133,8 @@ class AlarmListKeywords(BaseKeyword): def wait_for_alarms_to_appear(self, alarms: list[AlarmListObject]) -> None: """ + Wait for an alarm to appear + Waits for the specified alarms to appear on the SSH connection within the timeout period defined by 'get_timeout_in_seconds()'. Validates Alarm ID, Reason Text, and Entity ID. @@ -157,8 +174,7 @@ class AlarmListKeywords(BaseKeyword): def alarms_match(self, observed_alarm_object: AlarmListObject, expected_alarm_object: AlarmListObject) -> bool: """ - Compares two AlarmListObject instances for equality based on - alarm ID, reason text, and entity ID. + Compares two AlarmListObject instances for equality based on alarm ID, reason text, and entity ID. Args: observed_alarm_object (AlarmListObject): The current alarm object to compare against. @@ -187,17 +203,17 @@ class AlarmListKeywords(BaseKeyword): def get_timeout_in_seconds(self) -> int: """ - Gets an integer representing the maximum time in seconds to wait for the alarms to be cleared. - Default value: 600. + Gets an integer representing the maximum time in seconds to wait for the alarms to be cleared, default value: 600. Returns: - (int): An integer representing the maximum time in seconds to wait for the alarms to be cleared. + int: An integer representing the maximum time in seconds to wait for the alarms to be cleared. """ return self._timeout_in_seconds def set_timeout_in_seconds(self, timeout_in_seconds: int): """ Sets the integer representation of the maximum time in seconds to wait for the alarms to be cleared. + Args: timeout_in_seconds (int): An integer representing the maximum time to wait for the alarms to be cleared. 
""" @@ -205,29 +221,30 @@ class AlarmListKeywords(BaseKeyword): def get_check_interval_in_seconds(self) -> int: """ - Gets an integer representing the interval in seconds at which this instance will check the alarms again. - Default value: 3. + Gets an integer representing the interval in seconds at which this instance will check the alarms again, default value: 3. Returns: - (int): An integer representing the interval in seconds at which this instance will check the alarms again. + int: An integer representing the interval in seconds at which this instance will check the alarms again. """ return self._check_interval_in_seconds def set_check_interval_in_seconds(self, check_interval_in_seconds: int) -> int: """ - Sets the integer representation of the interval in seconds at which this instance will check the alarms again. - Default value: 3. + Sets the integer representation of the interval in seconds at which this instance will check the alarms again, default value: 3. + + Args: + check_interval_in_seconds (int): An integer representing the interval in seconds to check the alarms again. Returns: - (int): An integer representing the interval in seconds at which this instance will check the alarms again. - + int: An integer representing the interval in seconds at which this instance will check the alarms again. """ return self._check_interval_in_seconds - def get_ssh_connection(self): + def get_ssh_connection(self) -> SSHConnection: """ Gets the SSH connection of this AlarmListKeywords instance. + Returns: SSHConnection: the SSH connection of this AlarmListKeywords instance. 
diff --git a/keywords/cloud_platform/fault_management/alarms/objects/alarm_list_output.py b/keywords/cloud_platform/fault_management/alarms/objects/alarm_list_output.py index e9fa248d..a09cbddc 100644 --- a/keywords/cloud_platform/fault_management/alarms/objects/alarm_list_output.py +++ b/keywords/cloud_platform/fault_management/alarms/objects/alarm_list_output.py @@ -14,22 +14,51 @@ class AlarmListOutput: for value in output_values: alarm_list_object = AlarmListObject() - if 'Alarm ID' in value: - alarm_list_object.set_alarm_id(value['Alarm ID']) - if 'Reason Text' in value: - alarm_list_object.set_reason_text(value['Reason Text']) - if 'Entity ID' in value: - alarm_list_object.set_entity_id(value['Entity ID']) - if 'Severity' in value: - alarm_list_object.set_severity(value['Severity']) - if 'Time Stamp' in value: - alarm_list_object.set_time_stamp(value['Time Stamp']) + if "Alarm ID" in value: + alarm_list_object.set_alarm_id(value["Alarm ID"]) + if "Reason Text" in value: + alarm_list_object.set_reason_text(value["Reason Text"]) + if "Entity ID" in value: + alarm_list_object.set_entity_id(value["Entity ID"]) + if "Severity" in value: + alarm_list_object.set_severity(value["Severity"]) + if "Time Stamp" in value: + alarm_list_object.set_time_stamp(value["Time Stamp"]) self.alarms.append(alarm_list_object) - def get_alarms(self) -> [AlarmListObject]: + def get_alarms(self) -> list[AlarmListObject]: """ - Returns the list of alarms - Returns: + Returns the list of alarms. + Returns: + list[AlarmListObject]: List of alarm objects. """ return self.alarms + + def alarms_id(self) -> list[str]: + """ + Return a list of alarm IDs from AlarmListObject instances. + + Returns: + list[str]: List of alarm IDs. + """ + return [alarm.get_alarm_id() for alarm in self.alarms] + + @staticmethod + def is_new_alarm_id_since(alarm_ids_before: list[str], alarm_ids_after: list[str]) -> bool: + """ + Check if there are new alarms compared to a previous state. 
+ + Args: + alarm_ids_before (list[str]): Alarm IDs before the test. + alarm_ids_after (list[str]): Alarm IDs after the test. + + Returns: + bool: True if new alarms are present, False if no new alarms. + """ + alarm_ids_before_set = set(alarm_ids_before) + alarm_ids_after_set = set(alarm_ids_after) + + new_alarms = list(alarm_ids_after_set - alarm_ids_before_set) + + return len(new_alarms) != 0 diff --git a/keywords/cloud_platform/health/health_keywords.py b/keywords/cloud_platform/health/health_keywords.py index 34b54d99..6f588402 100644 --- a/keywords/cloud_platform/health/health_keywords.py +++ b/keywords/cloud_platform/health/health_keywords.py @@ -1,7 +1,9 @@ from framework.ssh.ssh_connection import SSHConnection +from framework.validation.validation import validate_equals from keywords.base_keyword import BaseKeyword from keywords.cloud_platform.fault_management.alarms.alarm_list_keywords import AlarmListKeywords from keywords.cloud_platform.system.application.system_application_list_keywords import SystemApplicationListKeywords +from keywords.cloud_platform.system.host.system_host_list_keywords import SystemHostListKeywords from keywords.k8s.pods.kubectl_get_pods_keywords import KubectlGetPodsKeywords @@ -45,3 +47,12 @@ class HealthKeywords(BaseKeyword): """Function to validate all apps are healthy and applied""" healthy_status = ["applied", "uploaded"] SystemApplicationListKeywords(self.ssh_connection).validate_all_apps_status(healthy_status) + + def validate_hosts_health(self): + """Function to validate all hosts are healthy""" + host_values = SystemHostListKeywords(self.ssh_connection).get_system_host_list().get_hosts() + + for host_value in host_values: + validate_equals(host_value.get_availability(), "available", f"The host {host_value.get_host_name()} availability is {host_value.get_availability()}") + validate_equals(host_value.get_administrative(), "unlocked", f"The host {host_value.get_host_name()} administrative is 
{host_value.get_administrative()}") + validate_equals(host_value.get_operational(), "enabled", f"The host {host_value.get_host_name()} operational is {host_value.get_operational()}") diff --git a/keywords/cloud_platform/system/host/system_host_lock_keywords.py b/keywords/cloud_platform/system/host/system_host_lock_keywords.py index 364bb000..b206476d 100644 --- a/keywords/cloud_platform/system/host/system_host_lock_keywords.py +++ b/keywords/cloud_platform/system/host/system_host_lock_keywords.py @@ -157,7 +157,7 @@ class SystemHostLockKeywords(BaseKeyword): except TimeoutError: # Alarm still exists, we can't unlock raise KeywordException("Failed unlock pre-check. Application apply was in progress") - def wait_for_host_unlocked(self, host_name: str, unlock_wait_timeout: int = 1800) -> bool: + def wait_for_host_unlocked(self, host_name: str, unlock_wait_timeout: int = 2800) -> bool: """ Wait for the host to be unlocked diff --git a/keywords/cloud_platform/system/host/system_host_reinstall_keywords.py b/keywords/cloud_platform/system/host/system_host_reinstall_keywords.py index a99443f2..bea7d7ff 100644 --- a/keywords/cloud_platform/system/host/system_host_reinstall_keywords.py +++ b/keywords/cloud_platform/system/host/system_host_reinstall_keywords.py @@ -91,7 +91,7 @@ class SystemHostReinstallKeywords(BaseKeyword): """ self.ssh_connection.send(source_openrc(f"system host-reinstall {host_name}")) self.validate_success_return_code(self.ssh_connection) - validate_equals_with_retry(lambda: SystemHostListKeywords(self.ssh_connection).get_system_host_list().get_host(host_name).get_availability(), expected_value="offline", validation_description="Waiting for host to go offline") + validate_equals_with_retry(lambda: SystemHostListKeywords(self.ssh_connection).get_system_host_list().get_host(host_name).get_availability(), expected_value="offline", validation_description="Waiting for host to go offline", timeout=500) is_host_reinstalled = 
self.wait_for_host_reinstall(host_name) if not is_host_reinstalled: host_value = SystemHostListKeywords(self.ssh_connection).get_system_host_list().get_host(host_name) diff --git a/testcases/cloud_platform/regression/storage/test_ceph_robustness.py b/testcases/cloud_platform/regression/storage/test_ceph_robustness.py index 47a533f3..0a0d6b41 100644 --- a/testcases/cloud_platform/regression/storage/test_ceph_robustness.py +++ b/testcases/cloud_platform/regression/storage/test_ceph_robustness.py @@ -4,11 +4,14 @@ from framework.logging.automation_logger import get_logger from framework.validation.validation import validate_equals from keywords.ceph.ceph_status_keywords import CephStatusKeywords from keywords.cloud_platform.fault_management.alarms.alarm_list_keywords import AlarmListKeywords +from keywords.cloud_platform.fault_management.alarms.objects.alarm_list_output import AlarmListOutput +from keywords.cloud_platform.health.health_keywords import HealthKeywords from keywords.cloud_platform.ssh.lab_connection_keywords import LabConnectionKeywords from keywords.cloud_platform.system.host.system_host_list_keywords import SystemHostListKeywords from keywords.cloud_platform.system.host.system_host_lock_keywords import SystemHostLockKeywords from keywords.cloud_platform.system.host.system_host_reboot_keywords import SystemHostRebootKeywords from keywords.cloud_platform.system.host.system_host_reinstall_keywords import SystemHostReinstallKeywords +from keywords.cloud_platform.system.host.system_host_swact_keywords import SystemHostSwactKeywords from keywords.server.power_keywords import PowerKeywords @@ -104,10 +107,13 @@ def test_reinstall_standby_host(): Test to validate standby controller reinstallation and ceph health. 
Test Steps: + - Check the hosts healthy + - Check if controller-0 is the active controller + - Get the active alarms - Lock standby controller - Reinstall standby controller - Unlock standby controller - - Checking if there are any active alarms + - Checking if there are any new active alarms - Checking storage backend health after reinstall. Args: None @@ -115,28 +121,37 @@ def test_reinstall_standby_host(): ssh_connection = LabConnectionKeywords().get_active_controller_ssh() system_host_list_keywords = SystemHostListKeywords(ssh_connection) - standby_controller = system_host_list_keywords.get_standby_controller().get_host_name() system_host_lock_keywords = SystemHostLockKeywords(ssh_connection) + standby_controller = system_host_list_keywords.get_standby_controller().get_host_name() system_host_reinstall_keywords = SystemHostReinstallKeywords(ssh_connection) ceph_status_keywords = CephStatusKeywords(ssh_connection) - alarm_list_keyword = AlarmListKeywords(ssh_connection) + alarm_list_keywords = AlarmListKeywords(ssh_connection) + health_keywords = HealthKeywords(ssh_connection) + system_host_swact_keywords = SystemHostSwactKeywords(ssh_connection) - get_logger().log_test_case_step("Checking if there are any active alarms") - alarms = alarm_list_keyword.alarm_list() - validate_equals(alarms, [], "No active alarms") + get_logger().log_test_case_step("Check the hosts healthy") + health_keywords.validate_hosts_health() - get_logger().log_test_case_step("Lock standby controller") + get_logger().log_test_case_step("Check if controller-0 is the active controller") + if standby_controller == "controller-0": + system_host_swact_keywords.host_swact() + standby_controller = system_host_list_keywords.get_standby_controller().get_host_name() + + get_logger().log_test_case_step("Get the active alarms") + initial_alarm_list_ids = alarm_list_keywords.get_alarm_list().alarms_id() + + get_logger().log_test_case_step(f"Lock {standby_controller}") 
system_host_lock_keywords.lock_host(standby_controller) - get_logger().log_test_case_step("Reinstall standby controller") + get_logger().log_test_case_step(f"Reinstall {standby_controller}") system_host_reinstall_keywords.reinstall_host(standby_controller) - get_logger().log_test_case_step("Unlock standby controller") + get_logger().log_test_case_step(f"Unlock {standby_controller}") system_host_lock_keywords.unlock_host(standby_controller) - get_logger().log_test_case_step("Checking if there are any active alarms") - alarms = alarm_list_keyword.alarm_list() - validate_equals(alarms, [], "No active alarms") + get_logger().log_test_case_step("Checking if there are any new active alarms") + final_alarm_list_ids = alarm_list_keywords.get_alarm_list().alarms_id() + validate_equals(AlarmListOutput.is_new_alarm_id_since(initial_alarm_list_ids, final_alarm_list_ids), False, "No new alarms should be present") get_logger().log_test_case_step("Checking storage backend health after reinstall.") ceph_status_keywords.wait_for_ceph_health_status(expect_health_status=True)