split-brain avoidance improvement

This change enables one way communication via BMC (if configured)
through mtce.
when 2 controllers lost all communications to each other.
The algorithm is:
when communications all lost,
both active and standby controllers, verify its interfaces (mgmt,
infra, and oam)
if active controller is healthy, it will request a bmc reset
thorugh mtce, against standby controller.
if standby controller is healthy, it will active itself and wait
a total 45 seconds before requesting a bmc reset through mtce,
against the active controller.

Changes also include:
1. adding new initial failover state.
   initial state is a state before the node is enabled
2. remove failover thread.
   using worker thread action to perform time consuming operations
3. remove entire failover action table

Story: 2003577
Task:  24901
Change-Id: I7d294d40e84469df6b6a6f6dd490cf3c4557b711
Signed-off-by: Bin Qian <bin.qian@windriver.com>
This commit is contained in:
Bin Qian
2018-10-26 12:11:11 -04:00
parent d987f1bcdf
commit 133da10b08
28 changed files with 1031 additions and 830 deletions

View File

@@ -10,53 +10,48 @@
#include "sm_failover_fsm.h"
#include "sm_failover_ss.h"
SmErrorT SmFailoverSurvivedState::enter_state()
static void _audit_failover_state()
{
DPRINTFI("Entering Survived state");
return SM_OKAY;
SmSystemFailoverStatus& failover_status = SmSystemFailoverStatus::get_status();
SmErrorT error = sm_failover_ss_get_survivor(failover_status);
SmNodeScheduleStateT host_state = failover_status.get_host_schedule_state();
SmNodeScheduleStateT peer_state = failover_status.get_peer_schedule_state();
if(SM_OKAY != error)
{
DPRINTFE("Failed to get failover survivor. Error %s", sm_error_str(error));
return;
}
if(SM_NODE_STATE_ACTIVE == host_state && SM_NODE_STATE_FAILED == peer_state)
{
// nothing changed yet
}
else if(SM_NODE_STATE_ACTIVE == host_state && SM_NODE_STATE_STANDBY == peer_state)
{
// Active is the only possible state to be scheduled to from survived state
SmFailoverFSM::get_fsm().set_state(SM_FAILOVER_STATE_NORMAL);
}else
{
DPRINTFE("Runtime error: unexpected scheduling state: host %s, peer %s",
sm_node_schedule_state_str(host_state),
sm_node_schedule_state_str(peer_state));
}
}
SmErrorT SmFailoverSurvivedState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
{
switch (event)
{
case SM_HEARTBEAT_LOST:
case SM_INTERFACE_DOWN:
case SM_INTERFACE_UP:
// ignore, recover is possible only when heartbeat is recovered
case SM_FAILOVER_EVENT_IF_STATE_CHANGED:
break;
case SM_HEARTBEAT_RECOVER:
{
SmSystemFailoverStatus failover_status;
SmErrorT error = sm_failover_ss_get_survivor(failover_status);
SmNodeScheduleStateT host_state = failover_status.get_host_schedule_state();
SmNodeScheduleStateT peer_state = failover_status.get_peer_schedule_state();
if(SM_OKAY != error)
{
DPRINTFE("Failed to get failover survivor. Error %s", sm_error_str(error));
return error;
}
if(SM_NODE_STATE_ACTIVE == host_state && SM_NODE_STATE_STANDBY == peer_state)
{
// Active is the only possible state to be scheduled to from survived state
this->fsm.set_state(SM_FAILOVER_STATE_NORMAL);
}else
{
DPRINTFE("Runtime error: unexpected scheduling state: host %s, peer %s",
sm_node_schedule_state_str(host_state),
sm_node_schedule_state_str(peer_state));
}
}
case SM_FAILOVER_EVENT_NODE_ENABLED:
_audit_failover_state();
break;
default:
DPRINTFE("Runtime error, unexpected event %d, at state %d", event, this->fsm.get_state());
DPRINTFE("Runtime error, unexpected event %s, at state %s",
sm_failover_event_str(event),
sm_failover_state_str(this->fsm.get_state()));
}
return SM_OKAY;
}
SmErrorT SmFailoverSurvivedState::exit_state()
{
DPRINTFI("Exiting Survived state");
return SM_OKAY;
}