Merge "Fix AIO-DX failover issues"

This commit is contained in:
Zuul 2021-05-13 14:44:29 +00:00 committed by Gerrit Code Review
commit 74fda90dac
4 changed files with 106 additions and 13 deletions

View File

@ -206,8 +206,8 @@ SmFailoverFailPendingState::~SmFailoverFailPendingState()
SmErrorT SmFailoverFailPendingState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data) SmErrorT SmFailoverFailPendingState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
{ {
//SmFSMEventDataTypeT event_data_type = event_data->get_event_data_type();
bool duplex = false; bool duplex = false;
bool blind_guess = false;
switch (event) switch (event)
{ {
case SM_FAILOVER_EVENT_IF_STATE_CHANGED: case SM_FAILOVER_EVENT_IF_STATE_CHANGED:
@ -249,13 +249,10 @@ SmErrorT SmFailoverFailPendingState::event_handler(SmFailoverEventT event, const
if(healthy) if(healthy)
{ {
blind_guess_scenario_start(); blind_guess_scenario_start();
} blind_guess = true;
else
{
this->fsm.set_state(SM_FAILOVER_STATE_FAILED);
} }
} }
else if( !blind_guess )
{ {
SmSystemFailoverStatus& failover_status = SmSystemFailoverStatus::get_status(); SmSystemFailoverStatus& failover_status = SmSystemFailoverStatus::get_status();
SmErrorT error = sm_failover_ss_get_survivor(failover_status); SmErrorT error = sm_failover_ss_get_survivor(failover_status);

View File

@ -20,6 +20,7 @@
#include "sm_types.h" #include "sm_types.h"
#include "sm_debug.h" #include "sm_debug.h"
#include "sm_node_utils.h" #include "sm_node_utils.h"
#include "sm_node_api.h"
#include "sm_failover.h" #include "sm_failover.h"
#include "sm_failover_fsm.h" #include "sm_failover_fsm.h"
#include "sm_failover_ss.h" #include "sm_failover_ss.h"
@ -182,9 +183,54 @@ static bool sm_failover_failed_recovery_criteria_met( void )
return (criteria_met); return (criteria_met);
} }
// Performs the recovery steps used by the 'Failover Failed' state audit:
//   1. look up the peer's node name and delete the peer node,
//   2. look up this host's node name and recover the host node,
//   3. reset the unhealthy flag.
// Returns SM_OKAY when every step succeeded. On the first failing step the
// error from that step is logged and returned, and the remaining steps
// (including the unhealthy flag reset) are not executed.
SmErrorT proceed_recovery()
{
    // Step 1: delete peer node
    char peer[SM_NODE_NAME_MAX_CHAR];
    SmErrorT rc = sm_node_api_get_peername(peer);
    if(SM_OKAY != rc)
    {
        DPRINTFI("Cannot retrieve peer's hostname, error %s", sm_error_str(rc));
        return rc;
    }

    rc = sm_node_api_delete_node(peer);
    if(SM_OKAY != rc)
    {
        DPRINTFI("Failed to delete peer %s, error %s", peer, sm_error_str(rc));
        return rc;
    }
    DPRINTFI("Peer %s is deleted.", peer);

    // Step 2: enable host
    char self[SM_NODE_NAME_MAX_CHAR];
    rc = sm_node_api_get_hostname(self);
    if(SM_OKAY != rc)
    {
        DPRINTFI("Cannot retrieve hostname, error %s", sm_error_str(rc));
        return rc;
    }

    rc = sm_node_api_recover_node(self);
    if(SM_OKAY != rc)
    {
        DPRINTFI("Failed to recover %s, error %s", self, sm_error_str(rc));
        return rc;
    }
    DPRINTFI("Host %s is recovered.", self);

    // Step 3: only reached once both node operations succeeded
    sm_node_utils_reset_unhealthy_flag();
    DPRINTFI("Unhealthy flag is removed");
    return SM_OKAY;
}
// The 'Failover Failed' state recovery audit handler // The 'Failover Failed' state recovery audit handler
SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data) SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
{ {
SmErrorT error;
event_data=event_data; event_data=event_data;
switch (event) switch (event)
{ {
@ -196,15 +242,21 @@ SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmF
DPRINTFI("************************************"); DPRINTFI("************************************");
DPRINTFI("** Failover Failed state recovery **"); DPRINTFI("** Failover Failed state recovery **");
DPRINTFI("************************************"); DPRINTFI("************************************");
sm_node_utils_reset_unhealthy_flag(); error = proceed_recovery();
sm_failover_failed_process_restart(PROCESS_SM); if(SM_OKAY != error)
for ( int i = 0 ; i < 10 ; i++ )
{ {
// waiting for shutdown DPRINTFE("Cannot recover from failed state");
sleep(1); }else
{
sm_failover_failed_process_restart(PROCESS_SM);
for ( int i = 0 ; i < 10 ; i++ )
{
// waiting for shutdown
sleep(1);
}
DPRINTFE("Restart did not occur ; reinstating unhealthy flag ; recovery will retry");
sm_node_utils_set_unhealthy();
} }
DPRINTFE("Restart did not occur ; reinstating unhealthy flag ; recovery will retry");
sm_node_utils_set_unhealthy();
} }
else if ( ++_log_throttle > 1 ) else if ( ++_log_throttle > 1 )
{ {

View File

@ -742,6 +742,44 @@ SmErrorT sm_node_api_fail_node( char node_name[] )
} }
// **************************************************************************** // ****************************************************************************
// ****************************************************************************
// Node API - Recover Node
// ======================
// Marks the named node as recovered: the node record is read from the
// database and then updated through sm_node_api_update_node() with its
// current administrative state, operational state ENABLED and availability
// status AVAILABLE.
//
// node_name: name of the node to recover.
// Returns the error from sm_db_nodes_read() if the node cannot be read,
// otherwise the result of sm_node_api_update_node().
SmErrorT sm_node_api_recover_node( char node_name[] )
{
    SmDbNodeT node;
    SmErrorT error;

    error = sm_db_nodes_read( _sm_db_handle, node_name, &node );
    if( SM_OKAY != error )
    {
        DPRINTFE( "Failed to read node (%s) information, error=%s.",
                  node_name, sm_error_str( error ) );
        return( error );
    }

    if( node.oper_state != SM_NODE_OPERATIONAL_STATE_DISABLED ||
        node.avail_status != SM_NODE_AVAIL_STATUS_FAILED )
    {
        // NOTE(review): this only logs; the node is still updated below even
        // when it was not disabled/failed -- confirm that is intended.
        DPRINTFD("Not in failure mode %s", node_name);
    }

    DPRINTFE("Node %s is to recover from failure mode.", node_name);
    error = sm_node_api_update_node(
                node_name,
                node.admin_state,
                SM_NODE_OPERATIONAL_STATE_ENABLED,
                SM_NODE_AVAIL_STATUS_AVAILABLE);
    if( SM_OKAY != error )
    {
        // Message previously said "Failed to set node (%s) failed", which
        // described the fail-node path rather than this recovery.
        DPRINTFE( "Failed to recover node (%s), error=%s.",
                  node_name, sm_error_str( error ) );
    }

    return( error );
}
// ****************************************************************************
// **************************************************************************** // ****************************************************************************
// Node API - Delete Node // Node API - Delete Node
// ====================== // ======================

View File

@ -54,6 +54,12 @@ extern SmErrorT sm_node_api_update_node( char node_name[],
SmErrorT sm_node_api_fail_node( char node_name[] ); SmErrorT sm_node_api_fail_node( char node_name[] );
// **************************************************************************** // ****************************************************************************
// ****************************************************************************
// Node API - Recover Node
// ======================
SmErrorT sm_node_api_recover_node( char node_name[] );
// ****************************************************************************
// **************************************************************************** // ****************************************************************************
// Node API - Delete Node // Node API - Delete Node
// ====================== // ======================