Fix AIO-DX failover issues

This change fixes unexpected AIO-DX failover behaviors:
1. The active controller reboots itself when the standby controller
   reboots or loses power.
2. The standby controller becomes degraded after the active controller
   reboots or loses power.

Closes-bug: 1927133
Change-Id: If3c9f6251f689a89cd206c672092ba296f00bd6b
Signed-off-by: Bin Qian <bin.qian@windriver.com>
This commit is contained in:
Bin Qian 2021-05-04 11:33:43 -04:00
parent cb5fa9510f
commit 0b99b594f8
4 changed files with 106 additions and 13 deletions

View File

@ -206,8 +206,8 @@ SmFailoverFailPendingState::~SmFailoverFailPendingState()
SmErrorT SmFailoverFailPendingState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
{
//SmFSMEventDataTypeT event_data_type = event_data->get_event_data_type();
bool duplex = false;
bool blind_guess = false;
switch (event)
{
case SM_FAILOVER_EVENT_IF_STATE_CHANGED:
@ -249,13 +249,10 @@ SmErrorT SmFailoverFailPendingState::event_handler(SmFailoverEventT event, const
if(healthy)
{
blind_guess_scenario_start();
}
else
{
this->fsm.set_state(SM_FAILOVER_STATE_FAILED);
blind_guess = true;
}
}
else
if( !blind_guess )
{
SmSystemFailoverStatus& failover_status = SmSystemFailoverStatus::get_status();
SmErrorT error = sm_failover_ss_get_survivor(failover_status);

View File

@ -20,6 +20,7 @@
#include "sm_types.h"
#include "sm_debug.h"
#include "sm_node_utils.h"
#include "sm_node_api.h"
#include "sm_failover.h"
#include "sm_failover_fsm.h"
#include "sm_failover_ss.h"
@ -182,9 +183,54 @@ static bool sm_failover_failed_recovery_criteria_met( void )
return (criteria_met);
}
// Performs the recovery steps for the 'Failover Failed' state, in order:
//   1. look up the peer's name and delete the peer node,
//   2. look up this host's name and recover the host node,
//   3. reset the unhealthy flag.
// Returns the error of the first lookup/delete/recover call that fails, in
// which case the remaining steps are skipped; returns SM_OKAY otherwise.
// The result of resetting the unhealthy flag is not checked.
SmErrorT proceed_recovery()
{
    // Step 1: remove the peer node.
    char peer[SM_NODE_NAME_MAX_CHAR];
    SmErrorT rc = sm_node_api_get_peername(peer);
    if(SM_OKAY != rc)
    {
        DPRINTFI("Cannot retrieve peer's hostname, error %s", sm_error_str(rc));
        return rc;
    }

    rc = sm_node_api_delete_node(peer);
    if(SM_OKAY != rc)
    {
        DPRINTFI("Failed to delete peer %s, error %s", peer, sm_error_str(rc));
        return rc;
    }
    DPRINTFI("Peer %s is deleted.", peer);

    // Step 2: bring this host back.
    char self[SM_NODE_NAME_MAX_CHAR];
    rc = sm_node_api_get_hostname(self);
    if(SM_OKAY != rc)
    {
        DPRINTFI("Cannot retrieve hostname, error %s", sm_error_str(rc));
        return rc;
    }

    rc = sm_node_api_recover_node(self);
    if(SM_OKAY != rc)
    {
        DPRINTFI("Failed to recover %s, error %s", self, sm_error_str(rc));
        return rc;
    }
    DPRINTFI("Host %s is recovered.", self);

    // Step 3: only reached once both node operations succeeded.
    sm_node_utils_reset_unhealthy_flag();
    DPRINTFI("Unhealthy flag is removed");

    return SM_OKAY;
}
// The 'Failover Failed' state recovery audit handler
SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
{
SmErrorT error;
event_data=event_data;
switch (event)
{
@ -196,15 +242,21 @@ SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmF
DPRINTFI("************************************");
DPRINTFI("** Failover Failed state recovery **");
DPRINTFI("************************************");
sm_node_utils_reset_unhealthy_flag();
sm_failover_failed_process_restart(PROCESS_SM);
for ( int i = 0 ; i < 10 ; i++ )
error = proceed_recovery();
if(SM_OKAY != error)
{
// waiting for shutdown
sleep(1);
DPRINTFE("Cannot recover from failed state");
}else
{
sm_failover_failed_process_restart(PROCESS_SM);
for ( int i = 0 ; i < 10 ; i++ )
{
// waiting for shutdown
sleep(1);
}
DPRINTFE("Restart did not occur ; reinstating unhealthy flag ; recovery will retry");
sm_node_utils_set_unhealthy();
}
DPRINTFE("Restart did not occur ; reinstating unhealthy flag ; recovery will retry");
sm_node_utils_set_unhealthy();
}
else if ( ++_log_throttle > 1 )
{

View File

@ -742,6 +742,44 @@ SmErrorT sm_node_api_fail_node( char node_name[] )
}
// ****************************************************************************
// ****************************************************************************
// Node API - Recover Node
// ======================
// Sets the named node's operational state to enabled and its availability
// status to available, keeping the administrative state read from the
// database.
//
// node_name: name of the node to recover.
//
// Returns the error from reading the node record if that read fails;
// otherwise returns the result of sm_node_api_update_node (SM_OKAY on
// success).
SmErrorT sm_node_api_recover_node( char node_name[] )
{
    SmDbNodeT node;
    SmErrorT error;

    error = sm_db_nodes_read( _sm_db_handle, node_name, &node );
    if( SM_OKAY != error )
    {
        DPRINTFE( "Failed to read node (%s) information, error=%s.",
                  node_name, sm_error_str( error ) );
        return( error );
    }

    if( node.oper_state != SM_NODE_OPERATIONAL_STATE_DISABLED ||
        node.avail_status != SM_NODE_AVAIL_STATUS_FAILED )
    {
        // NOTE(review): the node is not disabled-failed here, yet the update
        // below is still applied. Confirm whether an early return was intended.
        DPRINTFD("Not in failure mode %s", node_name);
    }

    DPRINTFE("Node %s is to recover from failure mode.", node_name);

    // The admin state is passed through unchanged; only the operational
    // state and availability status are rewritten.
    error = sm_node_api_update_node(
                node_name,
                node.admin_state,
                SM_NODE_OPERATIONAL_STATE_ENABLED,
                SM_NODE_AVAIL_STATUS_AVAILABLE);
    if( SM_OKAY != error )
    {
        // The message names the recover operation so a failure here is not
        // mistaken for a failure to mark the node failed.
        DPRINTFE( "Failed to recover node (%s), error=%s.",
                  node_name, sm_error_str( error ) );
    }

    return( error );
}
// ****************************************************************************
// ****************************************************************************
// Node API - Delete Node
// ======================

View File

@ -54,6 +54,12 @@ extern SmErrorT sm_node_api_update_node( char node_name[],
SmErrorT sm_node_api_fail_node( char node_name[] );
// ****************************************************************************
// ****************************************************************************
// Node API - Recover Node
// ======================
SmErrorT sm_node_api_recover_node( char node_name[] );
// ****************************************************************************
// ****************************************************************************
// Node API - Delete Node
// ======================