Inject failure and reattempt rehoming scenario

Added a simplex subcloud rehoming test that includes fault injection
and recovery validation. The test verifies system resilience by
injecting an alarm that causes the initial rehome to fail, then clearing
the alarm and successfully completing the rehome operation. Also added
the validate_none and validate_not_none helpers used by the test.

The test validates that the rehoming process properly handles failure
conditions and can recover successfully when the blocking condition
is resolved, ensuring robust distributed cloud operations.

Test Plan:
PASS: Verify alarm injection causes rehome failure
PASS: Verify alarm clearance allows rehome success
PASS: Verify subcloud health after retry operation
PASS: Verify pod counts remain consistent

Change-Id: I18be511415920f37893096a96c4224db05590199
Signed-off-by: Abhishek jaiswal <abhishek.jaiswal@windriver.com>
This commit is contained in:
Abhishek jaiswal
2025-10-15 07:02:02 -04:00
parent 5e04769f82
commit d6acc9e758
2 changed files with 170 additions and 2 deletions

View File

@@ -285,4 +285,50 @@ def validate_greater_than(observed_value: int, baseline_value: int, validation_d
get_logger().log_error(f"Validation Failed - {validation_description}")
get_logger().log_error(f"Baseline: {baseline_value}")
get_logger().log_error(f"Observed: {observed_value}")
raise Exception("Validation Failed")
raise Exception("Validation Failed")
def validate_none(observed_value: Any, validation_description: str) -> None:
    """
    Check that the observed value is None and log the outcome.

    Args:
        observed_value (Any): Value read from the system under test.
        validation_description (str): Text identifying this validation in the log output.

    Returns: None

    Raises:
        Exception: if the observed value is anything other than None.
    """
    # Failure path first: report what was expected versus what was seen, then abort.
    if observed_value is not None:
        get_logger().log_error(f"Validation Failed - {validation_description}")
        get_logger().log_error("Expected: None")
        get_logger().log_error(f"Observed: {observed_value}")
        raise Exception("Validation Failed")

    get_logger().log_info(f"Validation Successful - {validation_description}")
def validate_not_none(observed_value: Any, validation_description: str) -> None:
    """
    Check that the observed value is not None and log the outcome.

    Args:
        observed_value (Any): Value read from the system under test.
        validation_description (str): Text identifying this validation in the log output.

    Returns: None

    Raises:
        Exception: if the observed value is None.
    """
    # Failure path first: report what was expected versus what was seen, then abort.
    if observed_value is None:
        get_logger().log_error(f"Validation Failed - {validation_description}")
        get_logger().log_error("Expected: Not None")
        get_logger().log_error(f"Observed: {observed_value}")
        raise Exception("Validation Failed")

    get_logger().log_info(f"Validation Successful - {validation_description}")

View File

@@ -4,7 +4,7 @@ from config.configuration_manager import ConfigurationManager
from config.lab.objects.lab_type_enum import LabTypeEnum
from framework.logging.automation_logger import get_logger
from framework.ssh.ssh_connection import SSHConnection
from framework.validation.validation import validate_equals, validate_not_equals
from framework.validation.validation import validate_equals, validate_none, validate_not_equals, validate_not_none
from keywords.cloud_platform.dcmanager.dcmanager_kube_rootca_update_strategy_keywords import DcmanagerKubeRootcaUpdateStrategyKeywords
from keywords.cloud_platform.dcmanager.dcmanager_subcloud_add_keywords import DcManagerSubcloudAddKeywords
from keywords.cloud_platform.dcmanager.dcmanager_subcloud_delete_keywords import DcManagerSubcloudDeleteKeywords
@@ -12,6 +12,9 @@ from keywords.cloud_platform.dcmanager.dcmanager_subcloud_list_keywords import D
from keywords.cloud_platform.dcmanager.dcmanager_subcloud_manager_keywords import DcManagerSubcloudManagerKeywords
from keywords.cloud_platform.dcmanager.dcmanager_subcloud_show_keywords import DcManagerSubcloudShowKeywords
from keywords.cloud_platform.deployment_assets.host_profile_yaml_keywords import HostProfileYamlKeywords
from keywords.cloud_platform.fault_management.alarms.alarm_list_keywords import AlarmListKeywords
from keywords.cloud_platform.fault_management.fm_client_cli.fm_client_cli_keywords import FaultManagementClientCLIKeywords
from keywords.cloud_platform.fault_management.fm_client_cli.object.fm_client_cli_object import FaultManagementClientCLIObject
from keywords.cloud_platform.health.health_keywords import HealthKeywords
from keywords.cloud_platform.ssh.lab_connection_keywords import LabConnectionKeywords
from keywords.cloud_platform.sync_files.sync_deployment_assets import SyncDeploymentAssets
@@ -414,3 +417,122 @@ def test_rehome_duplex_subcloud(request):
# Validate swact back to original state
dc_swact(subcloud_ssh)
verify_subcloud_healthy(destination_system_controller_ssh, subcloud_name)
def _find_alarm_by_id(alarms, alarm_id):
    """
    Return the first alarm whose alarm_id matches, or None when there is no match.

    Args:
        alarms (Iterable): Alarm objects as returned by AlarmListKeywords.alarm_list().
        alarm_id (str): Alarm ID to look for.

    Returns:
        The first matching alarm object, or None if no alarm has that ID.
    """
    return next((alarm for alarm in alarms if alarm.alarm_id == alarm_id), None)


@mark.p2
@mark.subcloud_lab_is_simplex
@mark.lab_has_secondary_system_controller
def test_rehome_simplex_subcloud_alarm_recovery_scenario(request):
    """
    Verify rehome simplex subcloud with fault injection and recovery between two system controllers.

    This test validates the rehoming process resilience by injecting a fault (alarm) that causes
    the initial rehome operation to fail, then clearing the fault and successfully completing
    the rehome operation on retry.

    Test Steps:
        1. Get a healthy simplex subcloud from the origin system controller
        2. Count total pods on subcloud before rehoming (baseline measurement)
        3. Verify the test alarm is not already present, then raise it on the subcloud
        4. Validate alarm is present and its entity ID matches the one that was injected
        5. Synchronize deployment assets between the two system controllers
        6. Attempt rehome operation and verify it ends in rehome-failed status
        7. Clean up the failed attempt: re-manage on origin, delete from destination
        8. Clear the injected alarm to remove failure condition
        9. Retry rehome operation and verify subcloud is healthy afterwards
        10. Count pods after rehoming and validate the count matches the baseline

    Expected Results:
        - Initial rehome fails with alarm present (rehome-failed status)
        - Rehome succeeds after alarm is cleared
        - Subcloud maintains same pod count after successful rehome
        - Subcloud is healthy and in-sync after rehome completion

    Args:
        request (FixtureRequest): pytest request fixture, used to register the finalizer that
            removes the injected alarm if the test aborts before clearing it.
    """
    # Initialize SSH connections to both system controllers
    origin_system_controller_ssh = LabConnectionKeywords().get_active_controller_ssh()
    destination_system_controller_ssh = LabConnectionKeywords().get_secondary_active_controller_ssh()

    # Initialize DC manager keywords for both controllers
    origin_dcm_list_kw = DcManagerSubcloudListKeywords(origin_system_controller_ssh)
    destination_dcm_list_kw = DcManagerSubcloudListKeywords(destination_system_controller_ssh)
    origin_dcm_sc_kw = DcManagerSubcloudManagerKeywords(origin_system_controller_ssh)

    # Get a healthy simplex subcloud for testing
    get_logger().log_info("Selecting healthy simplex subcloud for rehoming test")
    simplex_subcloud = origin_dcm_list_kw.get_dcmanager_subcloud_list().get_healthy_subcloud_by_type(LabTypeEnum.SIMPLEX.value)
    subcloud_name = simplex_subcloud.get_name()

    # Retrieve deployment assets (bootstrap and install files) for the subcloud
    get_logger().log_info(f"Retrieving deployment assets for subcloud {subcloud_name}")
    subcloud_assets = ConfigurationManager.get_deployment_assets_config().get_subcloud_deployment_assets(subcloud_name)
    subcloud_bootstrap_values = subcloud_assets.get_bootstrap_file()
    subcloud_install_values = subcloud_assets.get_install_file()

    # Establish SSH connection to the target subcloud
    subcloud_ssh = LabConnectionKeywords().get_subcloud_ssh(subcloud_name)

    # Baseline measurement: count pods before any rehoming operations
    get_logger().log_info("Establishing baseline pod count before rehoming")
    pods_before_rehome = count_pods_on_subcloud(subcloud_ssh)

    # Fault injection phase: prepare and inject alarm to simulate failure condition
    fm_client_cli_keywords = FaultManagementClientCLIKeywords(subcloud_ssh)
    alarm_list_keywords = AlarmListKeywords(subcloud_ssh)

    # Verify no conflicting alarms exist before injection
    get_logger().log_info("Verifying no conflicting alarms exist on subcloud")
    existing_alarm = _find_alarm_by_id(alarm_list_keywords.alarm_list(), FaultManagementClientCLIObject.DEFAULT_ALARM_ID)
    validate_none(existing_alarm, f"Alarm with ID {FaultManagementClientCLIObject.DEFAULT_ALARM_ID} should not exist before test injection")

    # Create the test alarm used to simulate the failure condition
    fm_client_cli_object = FaultManagementClientCLIObject()
    fm_client_cli_object.set_alarm_id(FaultManagementClientCLIObject.DEFAULT_ALARM_ID)
    fm_client_cli_object.set_entity_id(f"name={subcloud_name}")

    def clear_injected_alarm():
        # Safety net: if the test aborts between injection and the explicit clear below, the
        # alarm would stay on the subcloud and make the pre-injection check of the next run fail.
        # Only delete when the alarm is still listed so the normal path does not delete twice.
        leftover_alarm = _find_alarm_by_id(alarm_list_keywords.alarm_list(), fm_client_cli_object.get_alarm_id())
        if leftover_alarm is not None:
            get_logger().log_info(f"Teardown: clearing leftover injected alarm from subcloud {subcloud_name}")
            fm_client_cli_keywords.delete_alarm(fm_client_cli_object)

    # Registered before the alarm is raised so that every later failure is covered.
    request.addfinalizer(clear_injected_alarm)

    get_logger().log_info(f"Injecting test alarm on subcloud {subcloud_name} to simulate failure condition")
    fm_client_cli_keywords.raise_alarm(fm_client_cli_object)

    # Verify alarm injection was successful
    get_logger().log_info("Verifying alarm injection was successful")
    injected_alarm = _find_alarm_by_id(alarm_list_keywords.alarm_list(), fm_client_cli_object.get_alarm_id())
    validate_not_none(injected_alarm, f"Injected alarm with ID {fm_client_cli_object.get_alarm_id()} should be present on subcloud")
    validate_equals(injected_alarm.get_entity_id(), fm_client_cli_object.get_entity_id(), "Injected alarm entity ID should match subcloud name")

    # Prepare for rehoming: synchronize deployment assets between system controllers
    get_logger().log_info(f"Synchronizing deployment assets for subcloud {subcloud_name} between system controllers")
    sync_deployment_assets_between_system_controllers(origin_system_controller_ssh, destination_system_controller_ssh, subcloud_name, subcloud_bootstrap_values, subcloud_install_values)

    # First rehome attempt: expect failure due to injected alarm
    get_logger().log_info(f"Attempting initial rehome of {subcloud_name} (expecting failure due to alarm)")
    origin_dcm_sc_kw.get_dcmanager_subcloud_unmanage(subcloud_name, 30)
    DcManagerSubcloudAddKeywords(destination_system_controller_ssh).dcmanager_subcloud_add_migrate(subcloud_name, bootstrap_values=subcloud_bootstrap_values, install_values=subcloud_install_values)

    # Verify rehome failed as expected
    destination_dcm_list_kw.validate_subcloud_status(subcloud_name, status="rehome-failed")
    get_logger().log_info(f"Rehome failed as expected due to alarm on subcloud {subcloud_name}")

    # Cleanup failed rehome attempt: re-manage on origin and delete from destination
    origin_dcm_sc_kw.get_dcmanager_subcloud_manage(subcloud_name, timeout=30)
    origin_dcm_list_kw.validate_subcloud_sync_status(subcloud_name, "in-sync")
    DcManagerSubcloudDeleteKeywords(destination_system_controller_ssh).dcmanager_subcloud_delete(subcloud_name)

    # Recovery phase: clear injected alarm to remove failure condition
    get_logger().log_info(f"Clearing injected alarm from subcloud {subcloud_name} to enable successful rehome")
    fm_client_cli_keywords.delete_alarm(fm_client_cli_object)

    # Second rehome attempt: expect success after alarm clearance
    get_logger().log_info(f"Retrying rehome of {subcloud_name} after alarm clearance (expecting success)")
    perform_rehome_operation(origin_system_controller_ssh, destination_system_controller_ssh, subcloud_name, subcloud_bootstrap_values, subcloud_install_values)

    # Post-rehome validation: verify subcloud health and functionality
    get_logger().log_info(f"Validating subcloud {subcloud_name} health after successful rehome")
    verify_subcloud_healthy(destination_system_controller_ssh, subcloud_name)
    get_logger().log_info(f"Rehome operation for subcloud {subcloud_name} completed successfully")

    # Final validation: verify pod count consistency.
    # The post-rehome count is the observed value and the baseline is the expected one, matching
    # the observed-first argument order used by the other validations in this test.
    get_logger().log_info("Performing final pod count validation")
    pods_after_rehome = count_pods_on_subcloud(subcloud_ssh)
    validate_equals(pods_after_rehome, pods_before_rehome, "Pod count must remain consistent before and after successful rehoming")