diff --git a/service-mgmt/sm/src/sm_failover_fail_pending_state.cpp b/service-mgmt/sm/src/sm_failover_fail_pending_state.cpp
index 94431ba6..45cf89f0 100644
--- a/service-mgmt/sm/src/sm_failover_fail_pending_state.cpp
+++ b/service-mgmt/sm/src/sm_failover_fail_pending_state.cpp
@@ -206,8 +206,8 @@ SmFailoverFailPendingState::~SmFailoverFailPendingState()
 
 SmErrorT SmFailoverFailPendingState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
 {
-    //SmFSMEventDataTypeT event_data_type = event_data->get_event_data_type();
     bool duplex = false;
+    bool blind_guess = false;
     switch (event)
     {
         case SM_FAILOVER_EVENT_IF_STATE_CHANGED:
@@ -249,13 +249,10 @@ SmErrorT SmFailoverFailPendingState::event_handler(SmFailoverEventT event, const
                 if(healthy)
                 {
                     blind_guess_scenario_start();
-                }
-                else
-                {
-                    this->fsm.set_state(SM_FAILOVER_STATE_FAILED);
+                    blind_guess = true;
                 }
             }
-            else
+            if( !blind_guess )
             {
                 SmSystemFailoverStatus& failover_status = SmSystemFailoverStatus::get_status();
                 SmErrorT error = sm_failover_ss_get_survivor(failover_status);
diff --git a/service-mgmt/sm/src/sm_failover_failed_state.cpp b/service-mgmt/sm/src/sm_failover_failed_state.cpp
index 1f41a15c..59aaf17f 100644
--- a/service-mgmt/sm/src/sm_failover_failed_state.cpp
+++ b/service-mgmt/sm/src/sm_failover_failed_state.cpp
@@ -20,6 +20,7 @@
 #include "sm_types.h"
 #include "sm_debug.h"
 #include "sm_node_utils.h"
+#include "sm_node_api.h"
 #include "sm_failover.h"
 #include "sm_failover_fsm.h"
 #include "sm_failover_ss.h"
@@ -182,9 +183,58 @@ static bool sm_failover_failed_recovery_criteria_met( void )
     return (criteria_met);
 }
 
+// Recover this host from the 'Failover Failed' state: delete the peer node
+// via sm_node_api_delete_node, recover the local node via
+// sm_node_api_recover_node, then reset the unhealthy flag. Returns SM_OKAY on
+// success, or the first error hit; on error the remaining steps are skipped.
+SmErrorT proceed_recovery()
+{
+    SmErrorT error;
+    char peer_name[SM_NODE_NAME_MAX_CHAR];
+    char host_name[SM_NODE_NAME_MAX_CHAR];
+    // delete peer node
+    error = sm_node_api_get_peername(peer_name);
+    if(SM_OKAY != error)
+    {
+        DPRINTFE("Cannot retrieve peer's hostname, error %s", sm_error_str(error));
+        return error;
+    }
+    error = sm_node_api_delete_node(peer_name);
+    if(SM_OKAY != error)
+    {
+        DPRINTFE("Failed to delete peer %s, error %s", peer_name, sm_error_str(error));
+        return error;
+    }else
+    {
+        DPRINTFI("Peer %s is deleted.", peer_name);
+    }
+
+    // enable host
+    error = sm_node_api_get_hostname(host_name);
+    if(SM_OKAY != error)
+    {
+        DPRINTFE("Cannot retrieve hostname, error %s", sm_error_str(error));
+        return error;
+    }
+    error = sm_node_api_recover_node(host_name);
+    if(SM_OKAY != error)
+    {
+        DPRINTFE("Failed to recover %s, error %s", host_name, sm_error_str(error));
+        return error;
+    }else
+    {
+        DPRINTFI("Host %s is recovered.", host_name);
+    }
+
+    sm_node_utils_reset_unhealthy_flag();
+    DPRINTFI("Unhealthy flag is removed");
+    return SM_OKAY;
+}
+
 // The 'Failover Failed' state recovery audit handler
 SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
 {
+    SmErrorT error;
     event_data=event_data;
     switch (event)
     {
@@ -196,15 +246,21 @@ SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmF
                 DPRINTFI("************************************");
                 DPRINTFI("** Failover Failed state recovery **");
                 DPRINTFI("************************************");
-                sm_node_utils_reset_unhealthy_flag();
-                sm_failover_failed_process_restart(PROCESS_SM);
-                for ( int i = 0 ; i < 10 ; i++ )
+                error = proceed_recovery();
+                if(SM_OKAY != error)
                 {
-                    // waiting for shutdown
-                    sleep(1);
+                    DPRINTFE("Cannot recover from failed state");
+                }else
+                {
+                    sm_failover_failed_process_restart(PROCESS_SM);
+                    for ( int i = 0 ; i < 10 ; i++ )
+                    {
+                        // waiting for shutdown
+                        sleep(1);
+                    }
+                    DPRINTFE("Restart did not occur ; reinstating unhealthy flag ; recovery will retry");
+                    sm_node_utils_set_unhealthy();
                 }
-                DPRINTFE("Restart did not occur ; reinstating unhealthy flag ; recovery will retry");
-                sm_node_utils_set_unhealthy();
             }
             else if ( ++_log_throttle > 1 )
             {
diff --git a/service-mgmt/sm/src/sm_node_api.cpp b/service-mgmt/sm/src/sm_node_api.cpp
index 179ec770..c15d50b6 100644
--- a/service-mgmt/sm/src/sm_node_api.cpp
+++ b/service-mgmt/sm/src/sm_node_api.cpp
@@ -742,6 +742,48 @@ SmErrorT sm_node_api_fail_node( char node_name[] )
 }
 // ****************************************************************************
 
+// ****************************************************************************
+// Node API - Recover Node
+// ======================
+// Reads the node from the database and updates it to operational state
+// enabled / availability status available, keeping its admin state. Returns
+// the error from the read or from the update, SM_OKAY otherwise.
+SmErrorT sm_node_api_recover_node( char node_name[] )
+{
+    SmDbNodeT node;
+    SmErrorT error;
+    error = sm_db_nodes_read( _sm_db_handle, node_name, &node );
+    if( SM_OKAY != error )
+    {
+        DPRINTFE( "Failed to read node (%s) information, error=%s.",
+                  node_name, sm_error_str( error ) );
+        return( error );
+    }
+
+    if( node.oper_state != SM_NODE_OPERATIONAL_STATE_DISABLED ||
+        node.avail_status != SM_NODE_AVAIL_STATUS_FAILED )
+    {
+        // Only logged; the update below is applied regardless.
+        DPRINTFD("Not in failure mode %s", node_name);
+    }
+
+    DPRINTFE("Node %s is to recover from failure mode.", node_name);
+
+    error = sm_node_api_update_node(
+                node_name,
+                node.admin_state,
+                SM_NODE_OPERATIONAL_STATE_ENABLED,
+                SM_NODE_AVAIL_STATUS_AVAILABLE);
+
+    if( SM_OKAY != error )
+    {
+        DPRINTFE( "Failed to recover node (%s), error=%s.",
+                  node_name, sm_error_str( error ) );
+    }
+    return( error );
+}
+// ****************************************************************************
+
 // ****************************************************************************
 // Node API - Delete Node
 // ======================
diff --git a/service-mgmt/sm/src/sm_node_api.h b/service-mgmt/sm/src/sm_node_api.h
index 773b434d..e8874f52 100644
--- a/service-mgmt/sm/src/sm_node_api.h
+++ b/service-mgmt/sm/src/sm_node_api.h
@@ -54,6 +54,12 @@ extern SmErrorT sm_node_api_update_node( char node_name[],
 SmErrorT sm_node_api_fail_node( char node_name[] );
 // ****************************************************************************
 
+// ****************************************************************************
+// Node API - Recover Node
+// ======================
+SmErrorT sm_node_api_recover_node( char node_name[] );
+// ****************************************************************************
+
 // ****************************************************************************
 // Node API - Delete Node
 // ======================