Introduce failover FSM
Introduce failover FSM to handle communication failure between controllers. Failover FSM has 4 states: Normal: when system running with full redundency Fail Pending: communication failure occured Failed: the controller is determined as failure. Its peer will assume service Survived: the controller is determined as survivor. Its peer has failed The controllers are in one of the below possible state pairs: normal/normal, fail-pending/fail-pending, failed/survived A failed controller will not resume responsbility before the system restores its full redundency (normal/normal) A survivor will not fail before the system restores its full redundency (normal/normal) Future implementation may allow an administrator to force a failed controller become active, to manually recover (with possiblity of losing data), should the survivor is no longer capable to provide service. Story: 2003577 Task: 26404 Change-Id: I51635e9e60b6fb6bad89e06c9f08d3f28e21db82 Signed-off-by: Bin Qian <bin.qian@windriver.com>
This commit is contained in:
parent
68b5ce3835
commit
edc8a56472
@ -2,4 +2,4 @@ SRC_DIR=$PKG_BASE
|
||||
COPY_LIST="$PKG_BASE/LICENSE"
|
||||
TAR_NAME=sm
|
||||
VERSION=1.0.0
|
||||
TIS_PATCH_VER=24
|
||||
TIS_PATCH_VER=26
|
||||
|
@ -107,6 +107,11 @@ SRCS+=sm_swact_state.c
|
||||
SRCS+=sm_worker_thread.cpp
|
||||
SRCS+=sm_task_affining_thread.c
|
||||
SRCS+=sm_node_swact_monitor.cpp
|
||||
SRCS+=sm_failover_fsm.cpp
|
||||
SRCS+=sm_failover_normal_state.cpp
|
||||
SRCS+=sm_failover_fail_pending_state.cpp
|
||||
SRCS+=sm_failover_failed_state.cpp
|
||||
SRCS+=sm_failover_survived_state.cpp
|
||||
SRCS+=sm_failover_ss.c
|
||||
SRCS+=sm_service_domain_interface_not_in_use_state.c
|
||||
SRCS+=sm_configuration_table.c
|
||||
|
@ -34,15 +34,8 @@
|
||||
#include "sm_util_types.h"
|
||||
#include "sm_failover_ss.h"
|
||||
#include "sm_failover_utils.h"
|
||||
|
||||
typedef enum
|
||||
{
|
||||
SM_FAILOVER_INTERFACE_UNKNOWN,
|
||||
SM_FAILOVER_INTERFACE_OK,
|
||||
SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT,
|
||||
SM_FAILOVER_INTERFACE_DOWN,
|
||||
SM_FAILOVER_INTERFACE_RECOVERING
|
||||
}SmFailoverInterfaceStateT;
|
||||
#include "sm_failover_fsm.h"
|
||||
#include "sm_api.h"
|
||||
|
||||
typedef enum
|
||||
{
|
||||
@ -57,7 +50,7 @@ typedef enum
|
||||
//as part of the gradual delivery of enhancement with more
|
||||
//complex algorithm to determine the failover survivor routine
|
||||
//the SM_FAILOVER_ACTION_ROUTINE will redirect the lookup to
|
||||
//new logic. Until all actions are migrate to new logic, the
|
||||
//new logic. Until all actions are migrated to new logic, the
|
||||
//lookup tables will be eliminated.
|
||||
SM_FAILOVER_ACTION_ROUTINE = 1 << 31,
|
||||
}SmFailoverActionT;
|
||||
@ -181,6 +174,79 @@ static bool _hello_msg_alive = true;
|
||||
static SmSystemModeT _system_mode;
|
||||
static time_t _last_report_ts = 0;
|
||||
static int _heartbeat_count = 0;
|
||||
static bool _peer_ready = false;
|
||||
|
||||
// The class is to accommodate batching i/f state changes
|
||||
// during the failover_audit interval.
|
||||
class FailoverEventHolder
|
||||
{
|
||||
public:
|
||||
FailoverEventHolder();
|
||||
void push_event(SmFailoverEventT, const ISmFSMEventData* event_data);
|
||||
void send_event();
|
||||
private:
|
||||
bool _event_to_send;
|
||||
SmFailoverEventT _failover_event;
|
||||
const ISmFSMEventData* _event_data;
|
||||
};
|
||||
|
||||
FailoverEventHolder::FailoverEventHolder()
|
||||
{
|
||||
_event_to_send = false;
|
||||
}
|
||||
|
||||
void FailoverEventHolder::push_event(SmFailoverEventT event, const ISmFSMEventData* event_data)
|
||||
{
|
||||
SmFailoverEventT list[] = {
|
||||
SM_INTERFACE_DOWN,
|
||||
SM_HEARTBEAT_LOST,
|
||||
SM_INTERFACE_UP,
|
||||
SM_HEARTBEAT_RECOVER
|
||||
};
|
||||
|
||||
unsigned int i;
|
||||
if(!_event_to_send)
|
||||
{
|
||||
_event_to_send = true;
|
||||
for(i = 0; i < sizeof(list) / sizeof(SmFailoverEventT); i ++)
|
||||
{
|
||||
if(event == list[i])
|
||||
{
|
||||
_failover_event = event;
|
||||
_event_data = event_data;
|
||||
_event_to_send = true;
|
||||
DPRINTFI("Push event %d", event);
|
||||
return;
|
||||
}
|
||||
}
|
||||
DPRINTFE("Runtime error: unsupported event %d", event);
|
||||
return;
|
||||
}
|
||||
|
||||
for(i = 0; list[i] != _failover_event && i < sizeof(list) / sizeof(SmFailoverEventT); i ++)
|
||||
{
|
||||
if(list[i] == event)
|
||||
{
|
||||
_failover_event = event;
|
||||
_event_data = event_data;
|
||||
return;
|
||||
}
|
||||
}
|
||||
DPRINTFE("Runtime error: unsupported event %d", event);
|
||||
}
|
||||
|
||||
void FailoverEventHolder::send_event()
|
||||
{
|
||||
if(_event_to_send)
|
||||
{
|
||||
DPRINTFI("Send event %d", _failover_event);
|
||||
SmFailoverFSM::get_fsm().send_event(_failover_event, _event_data);
|
||||
_event_to_send = false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static FailoverEventHolder _event_holder;
|
||||
|
||||
// std 2+2/2+2+x failover actions
|
||||
SmFailoverActionPairT action_map_std_infra[16] =
|
||||
@ -435,6 +501,8 @@ void sm_failover_lost_heartbeat( SmFailoverInterfaceT* interface )
|
||||
{
|
||||
DPRINTFI("Interface %s lose heartbeat.", interface->interface_name);
|
||||
}
|
||||
|
||||
_event_holder.push_event(SM_HEARTBEAT_LOST, NULL);
|
||||
}
|
||||
// ****************************************************************************
|
||||
|
||||
@ -466,6 +534,8 @@ void sm_failover_heartbeat_restore( SmFailoverInterfaceT* interface )
|
||||
{
|
||||
DPRINTFI("Interface %s heartbeat is OK.", interface->interface_name);
|
||||
}
|
||||
|
||||
_event_holder.push_event(SM_HEARTBEAT_RECOVER, NULL);
|
||||
}
|
||||
// ****************************************************************************
|
||||
|
||||
@ -501,6 +571,8 @@ void sm_failover_interface_down( const char* const interface_name )
|
||||
{
|
||||
DPRINTFI("%d domain interfaces are impacted as i/f %s is down.",
|
||||
impacted, interface_name);
|
||||
|
||||
_event_holder.push_event(SM_INTERFACE_DOWN, NULL);
|
||||
}
|
||||
}
|
||||
// ****************************************************************************
|
||||
@ -537,6 +609,8 @@ void sm_failover_interface_up( const char* const interface_name )
|
||||
{
|
||||
DPRINTFI("%d domain interfaces are impacted as i/f %s is up.",
|
||||
impacted, interface_name);
|
||||
|
||||
_event_holder.push_event(SM_INTERFACE_UP, NULL);
|
||||
}
|
||||
}
|
||||
// ****************************************************************************
|
||||
@ -656,13 +730,22 @@ int _failover_get_if_state()
|
||||
// ****************************************************************************
|
||||
// Failover - get interface state
|
||||
// ==================
|
||||
SmErrorT sm_failover_if_state_get(SmHeartbeatMsgIfStateT *if_state)
|
||||
SmHeartbeatMsgIfStateT sm_failover_if_state_get()
|
||||
{
|
||||
mutex_holder holder(&_mutex);
|
||||
int if_state_flag = _failover_get_if_state();
|
||||
return (if_state_flag & 0b0111); //the lower 3 bits i/f state flag
|
||||
}
|
||||
// ****************************************************************************
|
||||
|
||||
// ****************************************************************************
|
||||
// Failover - get peer node interface state
|
||||
// ==================
|
||||
SmHeartbeatMsgIfStateT sm_failover_get_peer_if_state()
|
||||
{
|
||||
mutex_holder holder(&_mutex);
|
||||
|
||||
int if_state_flag = _failover_get_if_state();
|
||||
*if_state = (if_state_flag & 0b0111); //the lower 3 bits i/f state flag
|
||||
return SM_OKAY;
|
||||
return (_peer_if_state & 0b0111); //the lower 3 bits i/f state flag
|
||||
}
|
||||
// ****************************************************************************
|
||||
|
||||
@ -923,84 +1006,66 @@ bool this_controller_unlocked()
|
||||
}
|
||||
// ****************************************************************************
|
||||
|
||||
|
||||
// ****************************************************************************
|
||||
// Failover - pack schedule state into 2 bit (active/standby/failed)
|
||||
// SM starts managing failover after the node is scheduled, i.e, cannot be
|
||||
// in SM_NODE_STATE_UNKNOWN or SM_NODE_STATE_INIT state
|
||||
// =======================
|
||||
unsigned int sm_failover_pack_schedule_state(SmNodeScheduleStateT state)
|
||||
// Failover - set system to scheduled status
|
||||
// ==================
|
||||
SmErrorT sm_failover_set_system(const SmSystemFailoverStatus& failover_status)
|
||||
{
|
||||
static const unsigned int failed_state_bit_flag = 0;
|
||||
static const unsigned int active_state_bit_flag = 1;
|
||||
static const unsigned int standby_state_bit_flag = 1 << 1;
|
||||
unsigned int res;
|
||||
switch(state)
|
||||
//valid combinations of target system scheduling are:
|
||||
// active/standby
|
||||
// active/failed
|
||||
SmFailoverActionResultT result;
|
||||
SmNodeScheduleStateT host_target_state, peer_target_state;
|
||||
host_target_state = failover_status.get_host_schedule_state();
|
||||
peer_target_state = failover_status.get_peer_schedule_state();
|
||||
if(SM_NODE_STATE_ACTIVE == host_target_state)
|
||||
{
|
||||
case SM_NODE_STATE_ACTIVE:
|
||||
res = active_state_bit_flag;
|
||||
break;
|
||||
case SM_NODE_STATE_STANDBY:
|
||||
res = standby_state_bit_flag;
|
||||
break;
|
||||
case SM_NODE_STATE_FAILED:
|
||||
res = failed_state_bit_flag;
|
||||
break;
|
||||
default:
|
||||
res = failed_state_bit_flag;
|
||||
if(SM_NODE_STATE_STANDBY == _host_state &&
|
||||
SM_NODE_STATE_FAILED == peer_target_state)
|
||||
{
|
||||
result = sm_failover_activate_self();
|
||||
if(SM_FAILOVER_ACTION_RESULT_FAILED == result)
|
||||
{
|
||||
DPRINTFE("Failed to activate %s.", _host_name);
|
||||
return SM_FAILED;
|
||||
}
|
||||
result = sm_failover_disable_node(_peer_name);
|
||||
if(SM_FAILOVER_ACTION_RESULT_FAILED == result)
|
||||
{
|
||||
DPRINTFE("Failed to disable node %s.", _peer_name);
|
||||
return SM_FAILED;
|
||||
}
|
||||
}
|
||||
}else if(SM_NODE_STATE_STANDBY == host_target_state)
|
||||
{
|
||||
if(SM_NODE_STATE_ACTIVE == _host_state)
|
||||
{
|
||||
result = sm_failover_swact();
|
||||
if(SM_FAILOVER_ACTION_RESULT_FAILED == result)
|
||||
{
|
||||
DPRINTFE("Failed to swact.");
|
||||
return SM_FAILED;
|
||||
}
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
// ****************************************************************************
|
||||
|
||||
// ****************************************************************************
|
||||
// Failover - convert target scheduling state to action
|
||||
// =======================
|
||||
int sm_failover_get_action(const SmSystemFailoverStatus& failover_status)
|
||||
{
|
||||
DPRINTFI("Host to %s, Peer to %s.",
|
||||
sm_node_schedule_state_str(failover_status.host_schedule_state),
|
||||
sm_node_schedule_state_str(failover_status.peer_schedule_state)
|
||||
);
|
||||
unsigned int host_flag, peer_flag;
|
||||
host_flag = sm_failover_pack_schedule_state(failover_status.host_schedule_state);
|
||||
peer_flag = sm_failover_pack_schedule_state(failover_status.peer_schedule_state);
|
||||
|
||||
unsigned int flag = (host_flag | (peer_flag << 2));
|
||||
DPRINTFI("Failover scheduling flag %d", flag);
|
||||
SmFailoverActionPairT* actions;
|
||||
int action;
|
||||
|
||||
SmFailoverActionPairT action_map[16] = {
|
||||
{SM_FAILOVER_ACTION_FAIL_NODE, SM_FAILOVER_ACTION_FAIL_NODE}, //00 00
|
||||
{SM_FAILOVER_ACTION_DISABLE_STANDBY, SM_FAILOVER_ACTION_FAIL_NODE}, //01 00
|
||||
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //10 00
|
||||
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //11 00
|
||||
{SM_FAILOVER_ACTION_FAIL_NODE, SM_FAILOVER_ACTION_ACTIVATE}, //00 01
|
||||
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //01 01
|
||||
{SM_FAILOVER_ACTION_SWACT, SM_FAILOVER_ACTION_NO_ACTION}, //10 01
|
||||
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //11 01
|
||||
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //00 10
|
||||
{SM_FAILOVER_ACTION_NO_ACTION, SM_FAILOVER_ACTION_NO_ACTION}, //01 10
|
||||
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //10 10
|
||||
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //11 10
|
||||
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //00 11
|
||||
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //01 11
|
||||
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //10 11
|
||||
{SM_FAILOVER_ACTION_UNDEFINED, SM_FAILOVER_ACTION_UNDEFINED}, //11 11
|
||||
};
|
||||
|
||||
bool is_active = is_active_controller();
|
||||
actions = &(action_map[flag & 0xf]);
|
||||
if(is_active)
|
||||
else if(SM_NODE_STATE_FAILED == host_target_state)
|
||||
{
|
||||
action = actions->active_controller_action;
|
||||
result = sm_failover_disable_node(_host_name);
|
||||
if(SM_FAILOVER_ACTION_RESULT_FAILED == result)
|
||||
{
|
||||
DPRINTFE("Failed disable host %s.", _host_name);
|
||||
return SM_FAILED;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
action = actions->standby_controller_action;
|
||||
DPRINTFE("Runtime error. Unexpected scheduling state %s for host %s",
|
||||
sm_node_schedule_state_str(host_target_state),
|
||||
_host_name);
|
||||
return SM_FAILED;
|
||||
}
|
||||
return action;
|
||||
|
||||
return SM_OKAY;
|
||||
}
|
||||
// ****************************************************************************
|
||||
|
||||
@ -1112,6 +1177,8 @@ void sm_failover_audit()
|
||||
return;
|
||||
}
|
||||
|
||||
_event_holder.send_event();
|
||||
|
||||
int64_t curr_node_state = if_state_flag;
|
||||
|
||||
if( _hello_msg_alive )
|
||||
@ -1160,34 +1227,9 @@ void sm_failover_audit()
|
||||
_log_nodes_state(action);
|
||||
|
||||
DPRINTFI("Action to take %d", action);
|
||||
|
||||
if (action & SM_FAILOVER_ACTION_ROUTINE)
|
||||
{
|
||||
SmSystemStatusT sys_status;
|
||||
SmSystemFailoverStatus failover_status;
|
||||
sys_status.system_mode = _system_mode;
|
||||
if(if_state_flag & SM_FAILOVER_HEARTBEAT_ALIVE)
|
||||
{
|
||||
sys_status.heartbeat_state = SM_HEARTBEAT_OK;
|
||||
}else
|
||||
{
|
||||
sys_status.heartbeat_state = SM_HEARTBEAT_LOSS;
|
||||
}
|
||||
|
||||
sys_status.host_status.node_name = _host_name;
|
||||
sys_status.host_status.interface_state = if_state_flag & 0x7;
|
||||
sys_status.host_status.current_schedule_state = _host_state;
|
||||
sys_status.peer_status.node_name = _peer_name;
|
||||
sys_status.peer_status.interface_state = _peer_if_state & 0x7;
|
||||
sys_status.peer_status.current_schedule_state = sm_get_controller_state(_peer_name);
|
||||
SmErrorT error = sm_failover_ss_get_survivor(sys_status, failover_status);
|
||||
if(SM_OKAY != error)
|
||||
{
|
||||
DPRINTFE("Failed to determine failover state. ");
|
||||
return;
|
||||
}
|
||||
|
||||
action = sm_failover_get_action(failover_status);
|
||||
action = SM_FAILOVER_ACTION_NO_ACTION;
|
||||
}
|
||||
|
||||
if (action & SM_FAILOVER_ACTION_ACTIVATE)
|
||||
@ -1367,6 +1409,67 @@ SmErrorT sm_failover_action()
|
||||
}
|
||||
// ****************************************************************************
|
||||
|
||||
// ****************************************************************************
|
||||
// Failover - get node interface info
|
||||
// ==================
|
||||
SmFailoverInterfaceStateT sm_failover_get_interface_info(SmInterfaceTypeT interface_type)
|
||||
{
|
||||
SmFailoverInterfaceInfo* res = NULL;
|
||||
switch(interface_type)
|
||||
{
|
||||
case SM_INTERFACE_MGMT:
|
||||
res = _mgmt_interface_info;
|
||||
break;
|
||||
case SM_INTERFACE_INFRA:
|
||||
res = _infra_interface_info;
|
||||
break;
|
||||
case SM_INTERFACE_OAM:
|
||||
res = _oam_interface_info;
|
||||
break;
|
||||
case SM_INTERFACE_UNKNOWN:
|
||||
break;
|
||||
}
|
||||
if(NULL != res)
|
||||
{
|
||||
return res->get_state();
|
||||
}
|
||||
return SM_FAILOVER_INTERFACE_UNKNOWN;
|
||||
}
|
||||
// ****************************************************************************
|
||||
|
||||
static void sm_failover_node_set_callback( char node_name[],
|
||||
SmNodeSetActionT action, SmNodeAdminStateT admin_state,
|
||||
SmNodeOperationalStateT oper_state, SmNodeAvailStatusT avail_status,
|
||||
int seqno )
|
||||
{
|
||||
if(0 == strcmp(node_name, _host_name))
|
||||
{
|
||||
// care only the peer
|
||||
return;
|
||||
}
|
||||
|
||||
bool peer_ready = false;
|
||||
if(SM_NODE_SET_ACTION_UNLOCK == action)
|
||||
{
|
||||
peer_ready = SM_NODE_ADMIN_STATE_UNLOCKED == admin_state &&
|
||||
SM_NODE_OPERATIONAL_STATE_ENABLED == oper_state &&
|
||||
(SM_NODE_AVAIL_STATUS_AVAILABLE == avail_status ||
|
||||
SM_NODE_AVAIL_STATUS_DEGRADED == avail_status);
|
||||
}else if (SM_NODE_SET_ACTION_LOCK == action)
|
||||
{
|
||||
peer_ready = false;
|
||||
}else
|
||||
{
|
||||
return;
|
||||
}
|
||||
if(peer_ready && !_peer_ready)
|
||||
{
|
||||
DPRINTFI("Peer %s is now online.", node_name);
|
||||
}
|
||||
_peer_ready = peer_ready;
|
||||
}
|
||||
|
||||
static SmApiCallbacksT _api_callbacks = {0};
|
||||
// ****************************************************************************
|
||||
// Failover - Initialize
|
||||
// ======================
|
||||
@ -1375,6 +1478,14 @@ SmErrorT sm_failover_initialize( void )
|
||||
void* user_data[1] = {&_total_interfaces};
|
||||
bool enabled;
|
||||
SmErrorT error;
|
||||
|
||||
error = SmFailoverFSM::initialize();
|
||||
if( SM_OKAY != error )
|
||||
{
|
||||
DPRINTFE( "Failed to initialize failover FSM, error %s.", sm_error_str( error ) );
|
||||
return error;
|
||||
}
|
||||
|
||||
_system_mode = sm_node_utils_get_system_mode();
|
||||
DPRINTFI("System mode %s", sm_system_mode_str(_system_mode));
|
||||
|
||||
@ -1404,6 +1515,17 @@ SmErrorT sm_failover_initialize( void )
|
||||
return error;
|
||||
}
|
||||
|
||||
_api_callbacks.node_set = sm_failover_node_set_callback;
|
||||
|
||||
error = sm_api_register_callbacks( &_api_callbacks );
|
||||
if( SM_OKAY != error )
|
||||
{
|
||||
DPRINTFE( "Failed to register api callbacks, error=%s.",
|
||||
sm_error_str( error ) );
|
||||
return( error );
|
||||
}
|
||||
|
||||
|
||||
if( 0 == strcmp( SM_NODE_CONTROLLER_0_NAME, _host_name ) )
|
||||
{
|
||||
snprintf( _peer_name, sizeof(_peer_name), "%s", SM_NODE_CONTROLLER_1_NAME );
|
||||
@ -1491,6 +1613,13 @@ SmErrorT sm_failover_finalize( void )
|
||||
_total_interfaces = 0;
|
||||
|
||||
SmErrorT error;
|
||||
error = sm_api_deregister_callbacks( &_api_callbacks );
|
||||
if( SM_OKAY != error )
|
||||
{
|
||||
DPRINTFE( "Failed to deregister api callbacks, error=%s.",
|
||||
sm_error_str( error ) );
|
||||
}
|
||||
|
||||
sm_timer_deregister( failover_audit_timer_id );
|
||||
if( NULL != _sm_db_handle )
|
||||
{
|
||||
@ -1504,6 +1633,13 @@ SmErrorT sm_failover_finalize( void )
|
||||
_sm_db_handle = NULL;
|
||||
}
|
||||
|
||||
error = SmFailoverFSM::finalize();
|
||||
if( SM_OKAY != error )
|
||||
{
|
||||
DPRINTFE( "Failed to finalize failover FSM, error %s.", sm_error_str( error ) );
|
||||
return error;
|
||||
}
|
||||
|
||||
return SM_OKAY;
|
||||
}
|
||||
// ****************************************************************************
|
||||
@ -1575,6 +1711,11 @@ void dump_peer_if_state(FILE* fp)
|
||||
fprintf(fp, " Peer Interface state: %d\n", _peer_if_state);
|
||||
}
|
||||
|
||||
void dump_failover_fsm_state(FILE* fp)
|
||||
{
|
||||
fprintf(fp, " Failover FSM state: %d\n", SmFailoverFSM::get_fsm().get_state());
|
||||
}
|
||||
|
||||
// ****************************************************************************
|
||||
// Failover - dump state
|
||||
// ======================
|
||||
@ -1582,6 +1723,7 @@ void sm_failover_dump_state(FILE* fp)
|
||||
{
|
||||
fprintf( fp, "Failover \n\n" );
|
||||
dump_host_state(fp);
|
||||
dump_failover_fsm_state(fp);
|
||||
dump_interfaces_state(fp);
|
||||
dump_peer_if_state(fp);
|
||||
}
|
||||
|
@ -9,6 +9,7 @@
|
||||
#include "sm_types.h"
|
||||
#include "sm_service_domain_interface_table.h"
|
||||
#include "sm_db_nodes.h"
|
||||
#include "sm_failover_ss.h"
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
@ -88,7 +89,7 @@ extern void sm_failover_if_state_update(const char node_name[],
|
||||
|
||||
// ****************************************************************************
|
||||
// Failover - get local interface state
|
||||
extern SmErrorT sm_failover_if_state_get(SmHeartbeatMsgIfStateT*);
|
||||
extern SmHeartbeatMsgIfStateT sm_failover_if_state_get();
|
||||
// ****************************************************************************
|
||||
|
||||
// ****************************************************************************
|
||||
@ -115,6 +116,27 @@ SmErrorT sm_failover_degrade_clear(SmFailoverDegradeSourceT source);
|
||||
SmErrorT sm_failover_get_node(char* node_name, SmDbNodeT& node);
|
||||
// ****************************************************************************
|
||||
|
||||
// ****************************************************************************
|
||||
// Failover - get node interface info
|
||||
// ==================
|
||||
SmFailoverInterfaceStateT sm_failover_get_interface_info(SmInterfaceTypeT interface_type);
|
||||
// ****************************************************************************
|
||||
|
||||
|
||||
// ****************************************************************************
|
||||
// Failover - get peer node interface state
|
||||
// ==================
|
||||
SmHeartbeatMsgIfStateT sm_failover_get_peer_if_state();
|
||||
// ****************************************************************************
|
||||
|
||||
|
||||
// ****************************************************************************
|
||||
// Failover - set system to scheduled status
|
||||
// ==================
|
||||
SmErrorT sm_failover_set_system(const SmSystemFailoverStatus& failover_status);
|
||||
// ****************************************************************************
|
||||
|
||||
|
||||
// ****************************************************************************
|
||||
// Failover - dump state
|
||||
// ==================
|
||||
|
141
service-mgmt/sm-1.0.0/src/sm_failover_fail_pending_state.cpp
Normal file
141
service-mgmt/sm-1.0.0/src/sm_failover_fail_pending_state.cpp
Normal file
@ -0,0 +1,141 @@
|
||||
//
|
||||
// Copyright (c) 2018 Wind River Systems, Inc.
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
#include "sm_failover_fail_pending_state.h"
|
||||
#include <stdlib.h>
|
||||
#include "sm_types.h"
|
||||
#include "sm_limits.h"
|
||||
#include "sm_debug.h"
|
||||
#include "sm_failover.h"
|
||||
#include "sm_failover_fsm.h"
|
||||
#include "sm_failover_ss.h"
|
||||
#include "sm_failover_utils.h"
|
||||
#include "sm_node_utils.h"
|
||||
#include "sm_node_api.h"
|
||||
|
||||
static const int FAIL_PENDING_TIMEOUT = 2000; //2000ms
|
||||
|
||||
SmErrorT SmFailoverFailPendingState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
|
||||
{
|
||||
//SmFSMEventDataTypeT event_data_type = event_data->get_event_data_type();
|
||||
switch (event)
|
||||
{
|
||||
case SM_HEARTBEAT_LOST:
|
||||
case SM_INTERFACE_DOWN:
|
||||
//getting worse, extend wait time
|
||||
case SM_INTERFACE_UP:
|
||||
case SM_HEARTBEAT_RECOVER:
|
||||
// Things are improving, extend wait time
|
||||
this->_register_timer();
|
||||
break;
|
||||
case SM_FAIL_PENDING_TIMEOUT:
|
||||
{
|
||||
SmSystemFailoverStatus failover_status;
|
||||
SmErrorT error = sm_failover_ss_get_survivor(failover_status);
|
||||
SmNodeScheduleStateT host_state = failover_status.get_host_schedule_state();
|
||||
SmNodeScheduleStateT peer_state = failover_status.get_peer_schedule_state();
|
||||
if(SM_OKAY != error)
|
||||
{
|
||||
DPRINTFE("Failed to get failover survivor. Error %s", sm_error_str(error));
|
||||
return error;
|
||||
}
|
||||
|
||||
error = sm_failover_set_system(failover_status);
|
||||
if(SM_NODE_STATE_FAILED == host_state)
|
||||
{
|
||||
this->fsm.set_state(SM_FAILOVER_STATE_FAILED);
|
||||
}
|
||||
else if(SM_NODE_STATE_ACTIVE == host_state)
|
||||
{
|
||||
if(SM_NODE_STATE_FAILED == peer_state)
|
||||
{
|
||||
this->fsm.set_state(SM_FAILOVER_STATE_SURVIVED);
|
||||
}else
|
||||
{
|
||||
this->fsm.set_state(SM_FAILOVER_STATE_NORMAL);
|
||||
}
|
||||
}
|
||||
else if(SM_NODE_STATE_STANDBY == host_state)
|
||||
{
|
||||
this->fsm.set_state(SM_FAILOVER_STATE_NORMAL);
|
||||
}else
|
||||
{
|
||||
DPRINTFE("Runtime error: unexpected scheduling state: %s",
|
||||
sm_node_schedule_state_str(host_state));
|
||||
}
|
||||
return error;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
DPRINTFE("Runtime error, unexpected event %d, at state %d", event, this->fsm.get_state());
|
||||
}
|
||||
return SM_OKAY;
|
||||
}
|
||||
|
||||
bool SmFailoverFailPendingState::_fail_pending_timeout(
|
||||
SmTimerIdT timer_id, int64_t user_data)
|
||||
{
|
||||
SmFailoverFSM::get_fsm().send_event(SM_FAIL_PENDING_TIMEOUT, NULL);
|
||||
return false;
|
||||
}
|
||||
|
||||
SmErrorT SmFailoverFailPendingState::enter_state()
|
||||
{
|
||||
DPRINTFI("Entering Pending state");
|
||||
SmErrorT error = this->_register_timer();
|
||||
if(SM_OKAY != error)
|
||||
{
|
||||
DPRINTFE("Failed to register fail pending timer. Error %s", sm_error_str(error));
|
||||
}
|
||||
return error;
|
||||
}
|
||||
|
||||
SmErrorT SmFailoverFailPendingState::_register_timer()
|
||||
{
|
||||
SmErrorT error;
|
||||
const char* timer_name = "FAIL PENDING TIMER";
|
||||
if(SM_TIMER_ID_INVALID != this->_pending_timer_id)
|
||||
{
|
||||
this->_deregister_timer();
|
||||
}
|
||||
|
||||
error = sm_timer_register( timer_name, FAIL_PENDING_TIMEOUT,
|
||||
SmFailoverFailPendingState::_fail_pending_timeout,
|
||||
0, &this->_pending_timer_id);
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
SmErrorT SmFailoverFailPendingState::_deregister_timer()
|
||||
{
|
||||
SmErrorT error;
|
||||
if(SM_TIMER_ID_INVALID == this->_pending_timer_id)
|
||||
{
|
||||
return SM_OKAY;
|
||||
}
|
||||
|
||||
error = sm_timer_deregister(this->_pending_timer_id);
|
||||
if( SM_OKAY != error )
|
||||
{
|
||||
DPRINTFE( "Failed to cancel fail pending timer, error=%s.",
|
||||
sm_error_str( error ) );
|
||||
}else
|
||||
{
|
||||
this->_pending_timer_id = SM_TIMER_ID_INVALID;
|
||||
}
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
SmErrorT SmFailoverFailPendingState::exit_state()
|
||||
{
|
||||
SmErrorT error = this->_deregister_timer();
|
||||
if(SM_OKAY != error)
|
||||
{
|
||||
DPRINTFE("Failed to deregister fail pending timer. Error %s", sm_error_str(error));
|
||||
}
|
||||
DPRINTFI("Exiting Pending state");
|
||||
return error;
|
||||
}
|
31
service-mgmt/sm-1.0.0/src/sm_failover_fail_pending_state.h
Normal file
31
service-mgmt/sm-1.0.0/src/sm_failover_fail_pending_state.h
Normal file
@ -0,0 +1,31 @@
|
||||
//
|
||||
// Copyright (c) 2018 Wind River Systems, Inc.
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
#ifndef __SM_FAILOVER_FAIL_PENDING_STATE_H__
|
||||
#define __SM_FAILOVER_FAIL_PENDING_STATE_H__
|
||||
#include "sm_types.h"
|
||||
#include "sm_failover_fsm.h"
|
||||
#include "sm_timer.h"
|
||||
|
||||
class SmFailoverFailPendingState : public SmFSMState
|
||||
{
|
||||
public:
|
||||
SmFailoverFailPendingState(SmFailoverFSM& fsm) : SmFSMState(fsm){}
|
||||
SmErrorT enter_state();
|
||||
SmErrorT exit_state();
|
||||
|
||||
protected:
|
||||
SmErrorT event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data);
|
||||
|
||||
private:
|
||||
SmTimerIdT _pending_timer_id;
|
||||
|
||||
static bool _fail_pending_timeout(SmTimerIdT timer_id, int64_t user_data);
|
||||
SmErrorT _register_timer();
|
||||
SmErrorT _deregister_timer();
|
||||
};
|
||||
|
||||
|
||||
#endif //__SM_FAILOVER_FAIL_PENDING_STATE_H__
|
66
service-mgmt/sm-1.0.0/src/sm_failover_failed_state.cpp
Normal file
66
service-mgmt/sm-1.0.0/src/sm_failover_failed_state.cpp
Normal file
@ -0,0 +1,66 @@
|
||||
//
|
||||
// Copyright (c) 2018 Wind River Systems, Inc.
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
#include "sm_failover_failed_state.h"
|
||||
#include <stdlib.h>
|
||||
#include "sm_types.h"
|
||||
#include "sm_debug.h"
|
||||
#include "sm_failover_fsm.h"
|
||||
#include "sm_failover_ss.h"
|
||||
|
||||
SmErrorT SmFailoverFailedState::enter_state()
|
||||
{
|
||||
DPRINTFI("Entering Failed state");
|
||||
return SM_OKAY;
|
||||
}
|
||||
|
||||
SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
|
||||
{
|
||||
switch (event)
|
||||
{
|
||||
case SM_HEARTBEAT_LOST:
|
||||
case SM_INTERFACE_DOWN:
|
||||
case SM_INTERFACE_UP:
|
||||
// ignore, recover is possible only when heartbeat is recovered
|
||||
break;
|
||||
case SM_HEARTBEAT_RECOVER:
|
||||
{
|
||||
SmSystemFailoverStatus failover_status;
|
||||
SmErrorT error = sm_failover_ss_get_survivor(failover_status);
|
||||
SmNodeScheduleStateT host_state = failover_status.get_host_schedule_state();
|
||||
SmNodeScheduleStateT peer_state = failover_status.get_peer_schedule_state();
|
||||
if(SM_OKAY != error)
|
||||
{
|
||||
DPRINTFE("Failed to get failover survivor. Error %s", sm_error_str(error));
|
||||
return error;
|
||||
}
|
||||
|
||||
if(SM_NODE_STATE_FAILED == host_state)
|
||||
{
|
||||
// don't need to set to failed state, already here
|
||||
}
|
||||
else if(SM_NODE_STATE_STANDBY == host_state && SM_NODE_STATE_ACTIVE == peer_state)
|
||||
{
|
||||
// standby is the only possible state to be scheduled to from failed state
|
||||
this->fsm.set_state(SM_FAILOVER_STATE_NORMAL);
|
||||
}else
|
||||
{
|
||||
DPRINTFE("Runtime error: unexpected scheduling state: %s",
|
||||
sm_node_schedule_state_str(host_state));
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
default:
|
||||
DPRINTFE("Runtime error, unexpected event %d, at state %d", event, this->fsm.get_state());
|
||||
}
|
||||
return SM_OKAY;
|
||||
}
|
||||
|
||||
SmErrorT SmFailoverFailedState::exit_state()
|
||||
{
|
||||
DPRINTFI("Exiting Failed state");
|
||||
return SM_OKAY;
|
||||
}
|
25
service-mgmt/sm-1.0.0/src/sm_failover_failed_state.h
Normal file
25
service-mgmt/sm-1.0.0/src/sm_failover_failed_state.h
Normal file
@ -0,0 +1,25 @@
|
||||
//
|
||||
// Copyright (c) 2018 Wind River Systems, Inc.
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
#ifndef __SM_FAILOVER_FAILED_STATE_H__
|
||||
#define __SM_FAILOVER_FAILED_STATE_H__
|
||||
#include "sm_types.h"
|
||||
#include "sm_failover_fsm.h"
|
||||
#include "sm_timer.h"
|
||||
|
||||
class SmFailoverFailedState : public SmFSMState
|
||||
{
|
||||
public:
|
||||
SmFailoverFailedState(SmFailoverFSM& fsm) : SmFSMState(fsm){}
|
||||
SmErrorT enter_state();
|
||||
SmErrorT exit_state();
|
||||
|
||||
protected:
|
||||
SmErrorT event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data);
|
||||
|
||||
};
|
||||
|
||||
|
||||
#endif //__SM_FAILOVER_FAILED_STATE_H__
|
164
service-mgmt/sm-1.0.0/src/sm_failover_fsm.cpp
Normal file
164
service-mgmt/sm-1.0.0/src/sm_failover_fsm.cpp
Normal file
@ -0,0 +1,164 @@
|
||||
//
|
||||
// Copyright (c) 2018 Wind River Systems, Inc.
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
#include "sm_failover_fsm.h"
|
||||
#include <stdlib.h>
|
||||
#include "sm_debug.h"
|
||||
#include "sm_failover_normal_state.h"
|
||||
#include "sm_failover_fail_pending_state.h"
|
||||
#include "sm_failover_failed_state.h"
|
||||
#include "sm_failover_survived_state.h"
|
||||
|
||||
SmErrorT SmFSMState::enter_state()
|
||||
{
|
||||
// default implementation, nothing to do
|
||||
return SM_OKAY;
|
||||
};
|
||||
|
||||
SmErrorT SmFSMState::exit_state()
|
||||
{
|
||||
// default implementation, nothing to do
|
||||
return SM_OKAY;
|
||||
};
|
||||
|
||||
SmFailoverFSM SmFailoverFSM::_the_fsm;
|
||||
|
||||
void SmFailoverFSM::deregister_states()
|
||||
{
|
||||
for(int i = 0; i < MaxState; i ++)
|
||||
{
|
||||
SmFSMState* state = _state_handlers[i];
|
||||
if(NULL != state)
|
||||
{
|
||||
_state_handlers[i] = NULL;
|
||||
delete state;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
SmFailoverFSM::SmFailoverFSM()
|
||||
{
|
||||
for(int i = 0; i < MaxState; i ++)
|
||||
{
|
||||
_state_handlers[i] = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
SmFailoverFSM::~SmFailoverFSM()
|
||||
{
|
||||
}
|
||||
|
||||
SmErrorT SmFailoverFSM::send_event(SmFailoverEventT event, const ISmFSMEventData* event_data)
|
||||
{
|
||||
SmFSMState* state_handler = this->_state_handlers[this->get_state()];
|
||||
if(NULL == state_handler)
|
||||
{
|
||||
DPRINTFE("Runtime error. No handler for state %d", this->get_state());
|
||||
return SM_FAILED;
|
||||
}
|
||||
DPRINTFI("send_event %d\n", event);
|
||||
state_handler->event_handler(event, event_data);
|
||||
return SM_OKAY;
|
||||
}
|
||||
|
||||
SmErrorT SmFailoverFSM::set_state(SmFailoverStateT state)
|
||||
{
|
||||
if(state >= MaxState)
|
||||
{
|
||||
DPRINTFE("Runtime error. Don't understand state %d", state);
|
||||
return SM_FAILED;
|
||||
}
|
||||
if(state == this->get_state())
|
||||
{
|
||||
DPRINTFI("Already in %d state", state);
|
||||
return SM_OKAY;
|
||||
}
|
||||
|
||||
SmFSMState* state_handler = this->_state_handlers[this->get_state()];
|
||||
if(NULL == state_handler)
|
||||
{
|
||||
DPRINTFE("State %d was not registered", this->get_state());
|
||||
return SM_FAILED;
|
||||
}
|
||||
|
||||
SmErrorT error = state_handler->exit_state();
|
||||
if(SM_OKAY != error)
|
||||
{
|
||||
DPRINTFE("Exit state failed (%d), %d", this->get_state(), error);
|
||||
return SM_FAILED;
|
||||
}
|
||||
|
||||
SmFSMState* new_state_handler = this->_state_handlers[state];
|
||||
if(NULL == new_state_handler)
|
||||
{
|
||||
DPRINTFE("State %d was not registered", state);
|
||||
return SM_FAILED;
|
||||
}
|
||||
error = new_state_handler->enter_state();
|
||||
if(SM_OKAY != error)
|
||||
{
|
||||
DPRINTFE("Enter state failed (%d), %d", this->get_state(), error);
|
||||
return SM_FAILED;
|
||||
}
|
||||
|
||||
this->_current_state = state;
|
||||
return SM_OKAY;
|
||||
}
|
||||
|
||||
SmErrorT SmFailoverFSM::register_fsm_state(SmFailoverStateT state, SmFSMState* state_handler)
|
||||
{
|
||||
if(state >= MaxState)
|
||||
{
|
||||
DPRINTFE("Runtime error. Invalid state %d", state);
|
||||
return SM_FAILED;
|
||||
}
|
||||
|
||||
if(NULL != this->_state_handlers[state])
|
||||
{
|
||||
DPRINTFE("Runtime error. Duplicated state handle for state %d", state);
|
||||
return SM_FAILED;
|
||||
}
|
||||
|
||||
this->_state_handlers[state] = state_handler;
|
||||
return SM_OKAY;
|
||||
}
|
||||
|
||||
SmErrorT SmFailoverFSM::init_state()
|
||||
{
|
||||
SmErrorT error;
|
||||
SmFailoverStateT state = SM_FAILOVER_STATE_INITIAL;
|
||||
SmFSMState* new_state_handler = _state_handlers[state];
|
||||
if(NULL == new_state_handler)
|
||||
{
|
||||
DPRINTFE("State %d was not registered", state);
|
||||
return SM_FAILED;
|
||||
}
|
||||
error = new_state_handler->enter_state();
|
||||
if(SM_OKAY != error)
|
||||
{
|
||||
DPRINTFE("Enter state failed (%d), %d", this->get_state(), error);
|
||||
return SM_FAILED;
|
||||
}
|
||||
|
||||
_current_state = state;
|
||||
return SM_OKAY;
|
||||
}
|
||||
|
||||
SmErrorT SmFailoverFSM::initialize()
|
||||
{
|
||||
SmFailoverFSM& fsm = SmFailoverFSM::get_fsm();
|
||||
fsm.register_fsm_state(SM_FAILOVER_STATE_NORMAL, new SmFailoverNormalState(fsm));
|
||||
fsm.register_fsm_state(SM_FAILOVER_STATE_FAIL_PENDING, new SmFailoverFailPendingState(fsm));
|
||||
fsm.register_fsm_state(SM_FAILOVER_STATE_FAILED, new SmFailoverFailedState(fsm));
|
||||
fsm.register_fsm_state(SM_FAILOVER_STATE_SURVIVED, new SmFailoverSurvivedState(fsm));
|
||||
|
||||
return fsm.init_state();
|
||||
}
|
||||
|
||||
SmErrorT SmFailoverFSM::finalize()
|
||||
{
|
||||
SmFailoverFSM::get_fsm().deregister_states();
|
||||
return SM_OKAY;
|
||||
}
|
81
service-mgmt/sm-1.0.0/src/sm_failover_fsm.h
Normal file
81
service-mgmt/sm-1.0.0/src/sm_failover_fsm.h
Normal file
@ -0,0 +1,81 @@
|
||||
//
|
||||
// Copyright (c) 2018 Wind River Systems, Inc.
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
#ifndef __SM_FAILOVER_FSM_H__
|
||||
#define __SM_FAILOVER_FSM_H__
|
||||
#include "sm_types.h"
|
||||
|
||||
typedef enum{
|
||||
SM_FAILOVER_STATE_NORMAL,
|
||||
SM_FAILOVER_STATE_FAIL_PENDING,
|
||||
SM_FAILOVER_STATE_FAILED,
|
||||
SM_FAILOVER_STATE_SURVIVED,
|
||||
SM_FAILOVER_STATE_MAX
|
||||
}SmFailoverStateT;
|
||||
|
||||
#define SM_FAILOVER_STATE_INITIAL SM_FAILOVER_STATE_NORMAL
|
||||
|
||||
typedef enum{
|
||||
SM_HEARTBEAT_LOST,
|
||||
SM_INTERFACE_DOWN,
|
||||
SM_INTERFACE_UP,
|
||||
SM_HEARTBEAT_RECOVER,
|
||||
SM_FAIL_PENDING_TIMEOUT,
|
||||
}SmFailoverEventT;
|
||||
|
||||
typedef int SmFSMEventDataTypeT;
|
||||
|
||||
class ISmFSMEventData
|
||||
{
|
||||
public:
|
||||
virtual SmFSMEventDataTypeT get_event_data_type() const = 0;
|
||||
};
|
||||
|
||||
class SmFailoverFSM;
|
||||
|
||||
class SmFSMState
|
||||
{
|
||||
public:
|
||||
SmFSMState(SmFailoverFSM& fsm) : fsm(fsm) {};
|
||||
virtual ~SmFSMState(){};
|
||||
virtual SmErrorT enter_state();
|
||||
virtual SmErrorT exit_state();
|
||||
protected:
|
||||
SmFailoverFSM& fsm;
|
||||
virtual SmErrorT event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)=0;
|
||||
friend SmFailoverFSM;
|
||||
};
|
||||
|
||||
class SmFailoverFSM
|
||||
{
|
||||
public:
|
||||
SmFailoverFSM();
|
||||
virtual ~SmFailoverFSM();
|
||||
|
||||
SmErrorT send_event(SmFailoverEventT event, const ISmFSMEventData* event_data);
|
||||
|
||||
SmErrorT register_fsm_state(SmFailoverStateT state, SmFSMState* state_handler);
|
||||
|
||||
SmErrorT set_state(SmFailoverStateT state);
|
||||
inline int get_state() const {return this->_current_state;}
|
||||
|
||||
static SmErrorT initialize();
|
||||
static SmErrorT finalize();
|
||||
inline static SmFailoverFSM& get_fsm() {
|
||||
return _the_fsm;
|
||||
}
|
||||
|
||||
private:
|
||||
static const int MaxState = SM_FAILOVER_STATE_MAX;
|
||||
SmFSMState* _state_handlers[MaxState];
|
||||
int _current_state;
|
||||
|
||||
static SmFailoverFSM _the_fsm;
|
||||
SmErrorT init_state();
|
||||
void deregister_states();
|
||||
};
|
||||
|
||||
|
||||
#endif //__SM_FAILOVER_FSM_H__
|
42
service-mgmt/sm-1.0.0/src/sm_failover_normal_state.cpp
Normal file
42
service-mgmt/sm-1.0.0/src/sm_failover_normal_state.cpp
Normal file
@ -0,0 +1,42 @@
|
||||
//
|
||||
// Copyright (c) 2018 Wind River Systems, Inc.
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
#include "sm_failover_normal_state.h"
|
||||
#include <stdlib.h>
|
||||
#include "sm_types.h"
|
||||
#include "sm_debug.h"
|
||||
#include "sm_failover_fsm.h"
|
||||
|
||||
SmErrorT SmFailoverNormalState::enter_state()
|
||||
{
|
||||
DPRINTFI("Entering normal state");
|
||||
return SM_OKAY;
|
||||
}
|
||||
|
||||
SmErrorT SmFailoverNormalState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
|
||||
{
|
||||
switch (event)
|
||||
{
|
||||
case SM_HEARTBEAT_LOST:
|
||||
case SM_INTERFACE_DOWN:
|
||||
this->fsm.set_state(SM_FAILOVER_STATE_FAIL_PENDING);
|
||||
break;
|
||||
case SM_INTERFACE_UP:
|
||||
DPRINTFI("Interface is up");
|
||||
break;
|
||||
case SM_HEARTBEAT_RECOVER:
|
||||
DPRINTFI("Heartbeat is recovered");
|
||||
break;
|
||||
default:
|
||||
DPRINTFE("Runtime error, unexpected event %d, at state %d", event, this->fsm.get_state());
|
||||
}
|
||||
return SM_OKAY;
|
||||
}
|
||||
|
||||
SmErrorT SmFailoverNormalState::exit_state()
|
||||
{
|
||||
DPRINTFI("Exiting normal state");
|
||||
return SM_OKAY;
|
||||
}
|
22
service-mgmt/sm-1.0.0/src/sm_failover_normal_state.h
Normal file
22
service-mgmt/sm-1.0.0/src/sm_failover_normal_state.h
Normal file
@ -0,0 +1,22 @@
|
||||
//
|
||||
// Copyright (c) 2018 Wind River Systems, Inc.
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
#ifndef __SM_FAILOVER_NORMAL_STATE_H__
|
||||
#define __SM_FAILOVER_NORMAL_STATE_H__
|
||||
#include "sm_types.h"
|
||||
#include "sm_failover_fsm.h"
|
||||
|
||||
class SmFailoverNormalState : public SmFSMState
|
||||
{
|
||||
public:
|
||||
SmFailoverNormalState(SmFailoverFSM& fsm) : SmFSMState(fsm){}
|
||||
SmErrorT enter_state();
|
||||
SmErrorT exit_state();
|
||||
protected:
|
||||
SmErrorT event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data);
|
||||
};
|
||||
|
||||
|
||||
#endif //__SM_FAILOVER_NORMAL_STATE_H__
|
@ -5,8 +5,77 @@
|
||||
//
|
||||
|
||||
#include "sm_failover_ss.h"
|
||||
#include <time.h>
|
||||
#include "sm_debug.h"
|
||||
#include "sm_limits.h"
|
||||
#include "sm_node_utils.h"
|
||||
#include "sm_node_api.h"
|
||||
#include "sm_failover_utils.h"
|
||||
#include "sm_node_utils.h"
|
||||
#include "sm_node_api.h"
|
||||
#include "sm_failover.h"
|
||||
|
||||
SmSystemFailoverStatus::SmSystemFailoverStatus()
|
||||
{
|
||||
SmErrorT error;
|
||||
char host_name[SM_NODE_NAME_MAX_CHAR];
|
||||
char peer_name[SM_NODE_NAME_MAX_CHAR];
|
||||
error = sm_node_utils_get_hostname(host_name);
|
||||
if( SM_OKAY != error )
|
||||
{
|
||||
DPRINTFE( "Failed to get hostname, error=%s.",
|
||||
sm_error_str( error ) );
|
||||
return;
|
||||
}
|
||||
error = sm_node_api_get_peername(peer_name);
|
||||
if( SM_OKAY != error )
|
||||
{
|
||||
DPRINTFE( "Failed to get peername, error=%s.",
|
||||
sm_error_str( error ) );
|
||||
return;
|
||||
}
|
||||
_host_schedule_state = sm_get_controller_state(host_name);
|
||||
_peer_schedule_state = sm_get_controller_state(peer_name);
|
||||
}
|
||||
|
||||
SmSystemFailoverStatus::~SmSystemFailoverStatus()
|
||||
{
|
||||
}
|
||||
|
||||
bool SmSystemFailoverStatus::_is_valid_schedule_state(SmNodeScheduleStateT state)
|
||||
{
|
||||
return (SM_NODE_STATE_ACTIVE == state ||
|
||||
SM_NODE_STATE_STANDBY == state ||
|
||||
SM_NODE_STATE_FAILED == state);
|
||||
}
|
||||
|
||||
void SmSystemFailoverStatus::set_host_schedule_state(SmNodeScheduleStateT state)
|
||||
{
|
||||
if(_is_valid_schedule_state(state))
|
||||
{
|
||||
if(_host_schedule_state != state)
|
||||
{
|
||||
_host_schedule_state=state;
|
||||
}
|
||||
}else
|
||||
{
|
||||
DPRINTFE("Runtime error, schedule state unknown %d", state);
|
||||
}
|
||||
}
|
||||
|
||||
void SmSystemFailoverStatus::set_peer_schedule_state(SmNodeScheduleStateT state)
|
||||
{
|
||||
if(_is_valid_schedule_state(state))
|
||||
{
|
||||
if(_peer_schedule_state != state)
|
||||
{
|
||||
_peer_schedule_state = state;
|
||||
}
|
||||
}else
|
||||
{
|
||||
DPRINTFE("Runtime error, schedule state unknown %d", state);
|
||||
}
|
||||
}
|
||||
|
||||
typedef enum
|
||||
{
|
||||
@ -38,15 +107,76 @@ static int get_node_if_healthy_score(unsigned int interface_state)
|
||||
}
|
||||
// ****************************************************************************
|
||||
|
||||
SmErrorT _get_system_status(SmSystemStatusT& sys_status, char host_name[], char peer_name[])
|
||||
{
|
||||
SmErrorT error;
|
||||
SmNodeScheduleStateT host_state;
|
||||
|
||||
error = sm_node_utils_get_hostname(host_name);
|
||||
if( SM_OKAY != error )
|
||||
{
|
||||
DPRINTFE( "Failed to get hostname, error=%s.",
|
||||
sm_error_str( error ) );
|
||||
return error;
|
||||
}
|
||||
error = sm_node_api_get_peername(peer_name);
|
||||
if( SM_OKAY != error )
|
||||
{
|
||||
DPRINTFE( "Failed to get peername, error=%s.",
|
||||
sm_error_str( error ) );
|
||||
return error;
|
||||
}
|
||||
host_state = sm_get_controller_state(host_name);
|
||||
|
||||
sys_status.system_mode = sm_node_utils_get_system_mode();
|
||||
sys_status.host_status.mgmt_state = sm_failover_get_interface_info(SM_INTERFACE_MGMT);
|
||||
sys_status.host_status.infra_state = sm_failover_get_interface_info(SM_INTERFACE_INFRA);
|
||||
sys_status.host_status.oam_state = sm_failover_get_interface_info(SM_INTERFACE_OAM);
|
||||
|
||||
if(SM_FAILOVER_INTERFACE_OK == sys_status.host_status.mgmt_state ||
|
||||
SM_FAILOVER_INTERFACE_OK == sys_status.host_status.oam_state ||
|
||||
SM_FAILOVER_INTERFACE_OK == sys_status.host_status.infra_state)
|
||||
{
|
||||
sys_status.heartbeat_state = SM_HEARTBEAT_OK;
|
||||
}else
|
||||
{
|
||||
sys_status.heartbeat_state = SM_HEARTBEAT_LOSS;
|
||||
}
|
||||
|
||||
sys_status.host_status.node_name = host_name;
|
||||
sys_status.host_status.interface_state = sm_failover_if_state_get();
|
||||
sys_status.host_status.current_schedule_state = host_state;
|
||||
sys_status.peer_status.node_name = peer_name;
|
||||
|
||||
sys_status.peer_status.interface_state = sm_failover_get_peer_if_state();
|
||||
sys_status.peer_status.current_schedule_state = sm_get_controller_state(peer_name);
|
||||
return SM_OKAY;
|
||||
}
|
||||
|
||||
// ****************************************************************************
|
||||
// sm_failover_ss_get_survivor - select the failover survivor
|
||||
// This is the main entry/container for the failover logic to determine how
|
||||
// to schedule the controllers, i.e, active/standby or active/failure.
|
||||
// ===================
|
||||
SmErrorT sm_failover_ss_get_survivor(SmSystemFailoverStatus& selection)
|
||||
{
|
||||
SmSystemStatusT sys_status;
|
||||
char host_name[SM_NODE_NAME_MAX_CHAR];
|
||||
char peer_name[SM_NODE_NAME_MAX_CHAR];
|
||||
|
||||
SmErrorT error = _get_system_status(sys_status, host_name, peer_name);
|
||||
if(SM_OKAY != error)
|
||||
{
|
||||
DPRINTFE("Failed to retrieve system status. Error %s", sm_error_str(error));
|
||||
return error;
|
||||
}
|
||||
return sm_failover_ss_get_survivor(sys_status, selection);
|
||||
}
|
||||
|
||||
SmErrorT sm_failover_ss_get_survivor(const SmSystemStatusT& system_status, SmSystemFailoverStatus& selection)
|
||||
{
|
||||
selection.host_schedule_state = system_status.host_status.current_schedule_state;
|
||||
selection.peer_schedule_state = system_status.peer_status.current_schedule_state;
|
||||
selection.set_host_schedule_state(system_status.host_status.current_schedule_state);
|
||||
selection.set_peer_schedule_state(system_status.peer_status.current_schedule_state);
|
||||
if(SM_HEARTBEAT_OK == system_status.heartbeat_state)
|
||||
{
|
||||
int host_healthy_score, peer_healthy_score;
|
||||
@ -55,26 +185,26 @@ SmErrorT sm_failover_ss_get_survivor(const SmSystemStatusT& system_status, SmSys
|
||||
if( peer_healthy_score < host_healthy_score )
|
||||
{
|
||||
//host is more healthy
|
||||
selection.host_schedule_state = SM_NODE_STATE_ACTIVE;
|
||||
selection.peer_schedule_state = SM_NODE_STATE_STANDBY;
|
||||
selection.set_host_schedule_state(SM_NODE_STATE_ACTIVE);
|
||||
selection.set_peer_schedule_state(SM_NODE_STATE_STANDBY);
|
||||
}else if(peer_healthy_score > host_healthy_score)
|
||||
{
|
||||
//peer is more healthy
|
||||
selection.host_schedule_state = SM_NODE_STATE_STANDBY;
|
||||
selection.peer_schedule_state = SM_NODE_STATE_ACTIVE;
|
||||
selection.set_host_schedule_state(SM_NODE_STATE_STANDBY);
|
||||
selection.set_peer_schedule_state(SM_NODE_STATE_ACTIVE);
|
||||
}
|
||||
}
|
||||
|
||||
if(system_status.host_status.current_schedule_state != selection.host_schedule_state ||
|
||||
system_status.peer_status.current_schedule_state != selection.peer_schedule_state )
|
||||
if(system_status.host_status.current_schedule_state != selection.get_host_schedule_state() ||
|
||||
system_status.peer_status.current_schedule_state != selection.get_peer_schedule_state() )
|
||||
{
|
||||
DPRINTFI("Uncontrolled swact starts. Host from %s to %s, Peer from %s to %s.",
|
||||
sm_node_schedule_state_str(system_status.host_status.current_schedule_state),
|
||||
sm_node_schedule_state_str(selection.host_schedule_state),
|
||||
sm_node_schedule_state_str(selection.get_host_schedule_state()),
|
||||
sm_node_schedule_state_str(system_status.peer_status.current_schedule_state),
|
||||
sm_node_schedule_state_str(selection.peer_schedule_state)
|
||||
sm_node_schedule_state_str(selection.get_peer_schedule_state())
|
||||
);
|
||||
}
|
||||
return SM_OKAY;
|
||||
}
|
||||
// ****************************************************************************
|
||||
// ****************************************************************************
|
||||
|
@ -8,14 +8,18 @@
|
||||
#define __SM_FAILOVER_SS_H__
|
||||
#include <stdio.h>
|
||||
#include "sm_types.h"
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
//#ifdef __cplusplus
|
||||
//extern "C" {
|
||||
//#endif
|
||||
|
||||
typedef struct
|
||||
{
|
||||
const char* node_name;
|
||||
unsigned int interface_state;
|
||||
unsigned int heartbeat_state;
|
||||
SmFailoverInterfaceStateT mgmt_state;
|
||||
SmFailoverInterfaceStateT infra_state;
|
||||
SmFailoverInterfaceStateT oam_state;
|
||||
SmNodeScheduleStateT current_schedule_state;
|
||||
}SmNodeStatusT;
|
||||
|
||||
@ -40,18 +44,36 @@ typedef struct
|
||||
}SmSystemStatusT;
|
||||
|
||||
|
||||
typedef struct
|
||||
class SmSystemFailoverStatus
|
||||
{
|
||||
SmNodeScheduleStateT host_schedule_state;
|
||||
SmNodeScheduleStateT peer_schedule_state;
|
||||
}SmSystemFailoverStatus;
|
||||
public:
|
||||
SmSystemFailoverStatus();
|
||||
virtual ~SmSystemFailoverStatus();
|
||||
inline SmNodeScheduleStateT get_host_schedule_state() const {
|
||||
return _host_schedule_state;
|
||||
}
|
||||
void set_host_schedule_state(SmNodeScheduleStateT state);
|
||||
inline SmNodeScheduleStateT get_peer_schedule_state() const {
|
||||
return _peer_schedule_state;
|
||||
}
|
||||
void set_peer_schedule_state(SmNodeScheduleStateT state);
|
||||
private:
|
||||
SmNodeScheduleStateT _host_schedule_state;
|
||||
SmNodeScheduleStateT _peer_schedule_state;
|
||||
static const char filename[];
|
||||
static const char file_format[];
|
||||
|
||||
bool _is_valid_schedule_state(SmNodeScheduleStateT state);
|
||||
};
|
||||
|
||||
// ****************************************************************************
|
||||
// sm_failover_ss_get_survivor - select the failover survivor
|
||||
// ===================
|
||||
SmErrorT sm_failover_ss_get_survivor(SmSystemFailoverStatus& selection);
|
||||
|
||||
SmErrorT sm_failover_ss_get_survivor(const SmSystemStatusT& system_status, SmSystemFailoverStatus& selection);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
//#ifdef __cplusplus
|
||||
//}
|
||||
//#endif
|
||||
#endif // __SM_FAILOVER_SS_H__
|
||||
|
62
service-mgmt/sm-1.0.0/src/sm_failover_survived_state.cpp
Normal file
62
service-mgmt/sm-1.0.0/src/sm_failover_survived_state.cpp
Normal file
@ -0,0 +1,62 @@
|
||||
//
|
||||
// Copyright (c) 2018 Wind River Systems, Inc.
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
#include "sm_failover_survived_state.h"
|
||||
#include <stdlib.h>
|
||||
#include "sm_types.h"
|
||||
#include "sm_debug.h"
|
||||
#include "sm_failover_fsm.h"
|
||||
#include "sm_failover_ss.h"
|
||||
|
||||
SmErrorT SmFailoverSurvivedState::enter_state()
|
||||
{
|
||||
DPRINTFI("Entering Survived state");
|
||||
return SM_OKAY;
|
||||
}
|
||||
|
||||
SmErrorT SmFailoverSurvivedState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
|
||||
{
|
||||
switch (event)
|
||||
{
|
||||
case SM_HEARTBEAT_LOST:
|
||||
case SM_INTERFACE_DOWN:
|
||||
case SM_INTERFACE_UP:
|
||||
// ignore, recover is possible only when heartbeat is recovered
|
||||
break;
|
||||
case SM_HEARTBEAT_RECOVER:
|
||||
{
|
||||
SmSystemFailoverStatus failover_status;
|
||||
SmErrorT error = sm_failover_ss_get_survivor(failover_status);
|
||||
SmNodeScheduleStateT host_state = failover_status.get_host_schedule_state();
|
||||
SmNodeScheduleStateT peer_state = failover_status.get_peer_schedule_state();
|
||||
if(SM_OKAY != error)
|
||||
{
|
||||
DPRINTFE("Failed to get failover survivor. Error %s", sm_error_str(error));
|
||||
return error;
|
||||
}
|
||||
|
||||
if(SM_NODE_STATE_ACTIVE == host_state && SM_NODE_STATE_STANDBY == peer_state)
|
||||
{
|
||||
// Active is the only possible state to be scheduled to from survived state
|
||||
this->fsm.set_state(SM_FAILOVER_STATE_NORMAL);
|
||||
}else
|
||||
{
|
||||
DPRINTFE("Runtime error: unexpected scheduling state: host %s, peer %s",
|
||||
sm_node_schedule_state_str(host_state),
|
||||
sm_node_schedule_state_str(peer_state));
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
DPRINTFE("Runtime error, unexpected event %d, at state %d", event, this->fsm.get_state());
|
||||
}
|
||||
return SM_OKAY;
|
||||
}
|
||||
|
||||
SmErrorT SmFailoverSurvivedState::exit_state()
|
||||
{
|
||||
DPRINTFI("Exiting Survived state");
|
||||
return SM_OKAY;
|
||||
}
|
23
service-mgmt/sm-1.0.0/src/sm_failover_survived_state.h
Normal file
23
service-mgmt/sm-1.0.0/src/sm_failover_survived_state.h
Normal file
@ -0,0 +1,23 @@
|
||||
//
|
||||
// Copyright (c) 2018 Wind River Systems, Inc.
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
#ifndef __SM_FAILOVER_SURVIVED_STATE_H__
|
||||
#define __SM_FAILOVER_SURVIVED_STATE_H__
|
||||
#include "sm_types.h"
|
||||
#include "sm_failover_fsm.h"
|
||||
|
||||
class SmFailoverSurvivedState : public SmFSMState
|
||||
{
|
||||
public:
|
||||
SmFailoverSurvivedState(SmFailoverFSM& fsm) : SmFSMState(fsm){}
|
||||
SmErrorT enter_state();
|
||||
SmErrorT exit_state();
|
||||
|
||||
protected:
|
||||
SmErrorT event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data);
|
||||
};
|
||||
|
||||
|
||||
#endif //__SM_FAILOVER_SURVIVED_STATE_H__
|
@ -208,11 +208,7 @@ SmErrorT sm_heartbeat_msg_send_alive( SmNetworkTypeT network_type, char node_nam
|
||||
|
||||
heartbeat_msg.u.if_state_msg.msg_size = htons((uint16_t)sizeof(SmHeartbeatMsgAliveRev2T));
|
||||
SmHeartbeatMsgIfStateT if_state;
|
||||
SmErrorT error = sm_failover_if_state_get(&if_state);
|
||||
if(SM_OKAY != error)
|
||||
{
|
||||
return SM_FAILED;
|
||||
}
|
||||
if_state = sm_failover_if_state_get();
|
||||
heartbeat_msg.u.if_state_msg.if_state = htonl(if_state);
|
||||
|
||||
if( SM_AUTH_TYPE_HMAC_SHA512 == auth_type )
|
||||
|
@ -58,6 +58,36 @@ SmErrorT sm_node_api_get_hostname( char node_name[] )
|
||||
}
|
||||
// ****************************************************************************
|
||||
|
||||
// ****************************************************************************
|
||||
// Node API - Get Peer Name
|
||||
// ========================
|
||||
SmErrorT sm_node_api_get_peername(char peer_name[SM_NODE_NAME_MAX_CHAR])
|
||||
{
|
||||
char node_name[SM_NODE_NAME_MAX_CHAR];
|
||||
SmErrorT error = sm_node_api_get_hostname(node_name);
|
||||
if(SM_OKAY != error)
|
||||
{
|
||||
return error;
|
||||
}
|
||||
|
||||
char format[] = "name <> '%s'";
|
||||
char query[SM_NODE_NAME_MAX_CHAR + sizeof(format)];
|
||||
unsigned int cnt = snprintf(query, sizeof(query), format, node_name);
|
||||
if(cnt < sizeof(query) - 1)
|
||||
{
|
||||
SmDbNodeT node;
|
||||
error = sm_db_nodes_query(_sm_db_handle, query, &node);
|
||||
if(SM_OKAY != error)
|
||||
{
|
||||
return error;
|
||||
}
|
||||
strncpy(peer_name, node.name, SM_NODE_NAME_MAX_CHAR);
|
||||
return SM_OKAY;
|
||||
}
|
||||
return SM_FAILED;
|
||||
}
|
||||
// ****************************************************************************
|
||||
|
||||
// ****************************************************************************
|
||||
// Node API - Configuration Complete
|
||||
// =================================
|
||||
|
@ -10,6 +10,7 @@
|
||||
|
||||
#include "sm_types.h"
|
||||
#include "sm_uuid.h"
|
||||
#include "sm_limits.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
@ -21,6 +22,12 @@ extern "C" {
|
||||
extern SmErrorT sm_node_api_get_hostname( char node_name[] );
|
||||
// ****************************************************************************
|
||||
|
||||
// ****************************************************************************
|
||||
// Node API - Get Peer Name
|
||||
// ========================
|
||||
extern SmErrorT sm_node_api_get_peername(char peer_name[SM_NODE_NAME_MAX_CHAR]);
|
||||
// ****************************************************************************
|
||||
|
||||
// ****************************************************************************
|
||||
// Node API - Configuration Complete
|
||||
// =================================
|
||||
|
@ -706,6 +706,16 @@ typedef enum
|
||||
SM_NODE_STATE_MAX
|
||||
}SmNodeScheduleStateT;
|
||||
|
||||
|
||||
typedef enum
|
||||
{
|
||||
SM_FAILOVER_INTERFACE_UNKNOWN,
|
||||
SM_FAILOVER_INTERFACE_OK,
|
||||
SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT,
|
||||
SM_FAILOVER_INTERFACE_DOWN,
|
||||
SM_FAILOVER_INTERFACE_RECOVERING
|
||||
}SmFailoverInterfaceStateT;
|
||||
|
||||
// ****************************************************************************
|
||||
// Types - System Mode String
|
||||
// =======================================
|
||||
|
Loading…
Reference in New Issue
Block a user