Fix AIO-DX failover issues
This change fixes unexpected AIO-DX failover behaviors: 1. the active controller reboots itself when the standby controller reboots or loses power; 2. the standby controller becomes degraded after the active controller reboots or loses power. Closes-bug: 1927133 Change-Id: If3c9f6251f689a89cd206c672092ba296f00bd6b Signed-off-by: Bin Qian <bin.qian@windriver.com>
This commit is contained in:
parent
cb5fa9510f
commit
0b99b594f8
|
@ -206,8 +206,8 @@ SmFailoverFailPendingState::~SmFailoverFailPendingState()
|
||||||
|
|
||||||
SmErrorT SmFailoverFailPendingState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
|
SmErrorT SmFailoverFailPendingState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
|
||||||
{
|
{
|
||||||
//SmFSMEventDataTypeT event_data_type = event_data->get_event_data_type();
|
|
||||||
bool duplex = false;
|
bool duplex = false;
|
||||||
|
bool blind_guess = false;
|
||||||
switch (event)
|
switch (event)
|
||||||
{
|
{
|
||||||
case SM_FAILOVER_EVENT_IF_STATE_CHANGED:
|
case SM_FAILOVER_EVENT_IF_STATE_CHANGED:
|
||||||
|
@ -249,13 +249,10 @@ SmErrorT SmFailoverFailPendingState::event_handler(SmFailoverEventT event, const
|
||||||
if(healthy)
|
if(healthy)
|
||||||
{
|
{
|
||||||
blind_guess_scenario_start();
|
blind_guess_scenario_start();
|
||||||
}
|
blind_guess = true;
|
||||||
else
|
|
||||||
{
|
|
||||||
this->fsm.set_state(SM_FAILOVER_STATE_FAILED);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
if( !blind_guess )
|
||||||
{
|
{
|
||||||
SmSystemFailoverStatus& failover_status = SmSystemFailoverStatus::get_status();
|
SmSystemFailoverStatus& failover_status = SmSystemFailoverStatus::get_status();
|
||||||
SmErrorT error = sm_failover_ss_get_survivor(failover_status);
|
SmErrorT error = sm_failover_ss_get_survivor(failover_status);
|
||||||
|
|
|
@ -20,6 +20,7 @@
|
||||||
#include "sm_types.h"
|
#include "sm_types.h"
|
||||||
#include "sm_debug.h"
|
#include "sm_debug.h"
|
||||||
#include "sm_node_utils.h"
|
#include "sm_node_utils.h"
|
||||||
|
#include "sm_node_api.h"
|
||||||
#include "sm_failover.h"
|
#include "sm_failover.h"
|
||||||
#include "sm_failover_fsm.h"
|
#include "sm_failover_fsm.h"
|
||||||
#include "sm_failover_ss.h"
|
#include "sm_failover_ss.h"
|
||||||
|
@ -182,9 +183,54 @@ static bool sm_failover_failed_recovery_criteria_met( void )
|
||||||
return (criteria_met);
|
return (criteria_met);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Perform the recovery steps used when leaving the 'Failover Failed' state.
//
// Steps, in order (the first failing step aborts the sequence):
//   1. look up the peer's name and delete the peer node,
//   2. look up the local host name and recover the local node,
//   3. reset the unhealthy flag.
//
// Returns SM_OKAY when every checked step succeeded; otherwise the error
// returned by the failing sm_node_api call. Failures are logged at error
// level, matching how the rest of this file reports failures.
SmErrorT proceed_recovery()
{
    SmErrorT error;
    // Zero-initialised so the buffers never hold indeterminate bytes.
    char peer_name[SM_NODE_NAME_MAX_CHAR] = {0};
    char host_name[SM_NODE_NAME_MAX_CHAR] = {0};

    // delete peer node
    error = sm_node_api_get_peername(peer_name);
    if(SM_OKAY != error)
    {
        DPRINTFE("Cannot retrieve peer's hostname, error %s", sm_error_str(error));
        return error;
    }

    error = sm_node_api_delete_node(peer_name);
    if(SM_OKAY != error)
    {
        DPRINTFE("Failed to delete peer %s, error %s", peer_name, sm_error_str(error));
        return error;
    }
    DPRINTFI("Peer %s is deleted.", peer_name);

    // enable host
    error = sm_node_api_get_hostname(host_name);
    if(SM_OKAY != error)
    {
        DPRINTFE("Cannot retrieve hostname, error %s", sm_error_str(error));
        return error;
    }

    error = sm_node_api_recover_node(host_name);
    if(SM_OKAY != error)
    {
        DPRINTFE("Failed to recover %s, error %s", host_name, sm_error_str(error));
        return error;
    }
    DPRINTFI("Host %s is recovered.", host_name);

    // NOTE(review): the outcome of resetting the flag is not checked here;
    // confirm whether sm_node_utils_reset_unhealthy_flag() can fail.
    sm_node_utils_reset_unhealthy_flag();
    DPRINTFI("Unhealthy flag is removed");
    return SM_OKAY;
}
|
||||||
|
|
||||||
// The 'Failover Failed' state recovery audit handler
|
// The 'Failover Failed' state recovery audit handler
|
||||||
SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
|
SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
|
||||||
{
|
{
|
||||||
|
SmErrorT error;
|
||||||
event_data=event_data;
|
event_data=event_data;
|
||||||
switch (event)
|
switch (event)
|
||||||
{
|
{
|
||||||
|
@ -196,7 +242,12 @@ SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmF
|
||||||
DPRINTFI("************************************");
|
DPRINTFI("************************************");
|
||||||
DPRINTFI("** Failover Failed state recovery **");
|
DPRINTFI("** Failover Failed state recovery **");
|
||||||
DPRINTFI("************************************");
|
DPRINTFI("************************************");
|
||||||
sm_node_utils_reset_unhealthy_flag();
|
error = proceed_recovery();
|
||||||
|
if(SM_OKAY != error)
|
||||||
|
{
|
||||||
|
DPRINTFE("Cannot recover from failed state");
|
||||||
|
}else
|
||||||
|
{
|
||||||
sm_failover_failed_process_restart(PROCESS_SM);
|
sm_failover_failed_process_restart(PROCESS_SM);
|
||||||
for ( int i = 0 ; i < 10 ; i++ )
|
for ( int i = 0 ; i < 10 ; i++ )
|
||||||
{
|
{
|
||||||
|
@ -206,6 +257,7 @@ SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmF
|
||||||
DPRINTFE("Restart did not occur ; reinstating unhealthy flag ; recovery will retry");
|
DPRINTFE("Restart did not occur ; reinstating unhealthy flag ; recovery will retry");
|
||||||
sm_node_utils_set_unhealthy();
|
sm_node_utils_set_unhealthy();
|
||||||
}
|
}
|
||||||
|
}
|
||||||
else if ( ++_log_throttle > 1 )
|
else if ( ++_log_throttle > 1 )
|
||||||
{
|
{
|
||||||
if ( _log_throttle > SM_FAILOVER_FAILED_LOG_THROTTLE_THLD )
|
if ( _log_throttle > SM_FAILOVER_FAILED_LOG_THROTTLE_THLD )
|
||||||
|
|
|
@ -742,6 +742,44 @@ SmErrorT sm_node_api_fail_node( char node_name[] )
|
||||||
}
|
}
|
||||||
// ****************************************************************************
|
// ****************************************************************************
|
||||||
|
|
||||||
|
// ****************************************************************************
|
||||||
|
// Node API - Recover Node
|
||||||
|
// ======================
|
||||||
|
// Bring the named node back to operational state ENABLED and availability
// status AVAILABLE, keeping its current administrative state.
//
// node_name: name of the node record to read and update.
//
// Returns the error from sm_db_nodes_read() if the node cannot be read,
// otherwise the result of sm_node_api_update_node().
SmErrorT sm_node_api_recover_node( char node_name[] )
{
    SmDbNodeT node;
    SmErrorT error;

    error = sm_db_nodes_read( _sm_db_handle, node_name, &node );
    if( SM_OKAY != error )
    {
        DPRINTFE( "Failed to read node (%s) information, error=%s.",
                  node_name, sm_error_str( error ) );
        return( error );
    }

    // A node that is not disabled/failed is only noted at debug level; the
    // update below is still applied.
    // NOTE(review): confirm the fall-through is intended rather than an
    // early return.
    if( node.oper_state != SM_NODE_OPERATIONAL_STATE_DISABLED ||
        node.avail_status != SM_NODE_AVAIL_STATUS_FAILED )
    {
        DPRINTFD("Not in failure mode %s", node_name);
    }

    DPRINTFE("Node %s is to recover from failure mode.", node_name);

    error = sm_node_api_update_node(
        node_name,
        node.admin_state,
        SM_NODE_OPERATIONAL_STATE_ENABLED,
        SM_NODE_AVAIL_STATUS_AVAILABLE);

    if( SM_OKAY != error )
    {
        // Message corrected: this path recovers the node, it does not fail it.
        DPRINTFE( "Failed to recover node (%s), error=%s.",
                  node_name, sm_error_str( error ) );
    }
    return( error );
}
|
||||||
|
// ****************************************************************************
|
||||||
|
|
||||||
// ****************************************************************************
|
// ****************************************************************************
|
||||||
// Node API - Delete Node
|
// Node API - Delete Node
|
||||||
// ======================
|
// ======================
|
||||||
|
|
|
@ -54,6 +54,12 @@ extern SmErrorT sm_node_api_update_node( char node_name[],
|
||||||
SmErrorT sm_node_api_fail_node( char node_name[] );
|
SmErrorT sm_node_api_fail_node( char node_name[] );
|
||||||
// ****************************************************************************
|
// ****************************************************************************
|
||||||
|
|
||||||
|
// ****************************************************************************
|
||||||
|
// Node API - Recover Node
|
||||||
|
// ======================
|
||||||
|
SmErrorT sm_node_api_recover_node( char node_name[] );
|
||||||
|
// ****************************************************************************
|
||||||
|
|
||||||
// ****************************************************************************
|
// ****************************************************************************
|
||||||
// Node API - Delete Node
|
// Node API - Delete Node
|
||||||
// ======================
|
// ======================
|
||||||
|
|
Loading…
Reference in New Issue