split-brain avoidance improvement

This change enables one way communication via BMC (if configured) through mtce. when 2 controllers lost all communications to each other. The algorithm is: when communications all lost, both active and standby controllers, verify its interfaces (mgmt, infra, and oam) if active controller is healthy, it will request a bmc reset thorugh mtce, against standby controller. if standby controller is healthy, it will active itself and wait a total 45 seconds before requesting a bmc reset through mtce, against the active controller. Changes also include: 1. adding new initial failover state. initial state is a state before the node is enabled 2. remove failover thread. using worker thread action to perform time consuming operations 3. remove entire failover action table Story: 2003577 Task: 24901 Change-Id: I7d294d40e84469df6b6a6f6dd490cf3c4557b711 Signed-off-by: Bin Qian <bin.qian@windriver.com>
2018-10-26 12:11:11 -04:00
parent d987f1bcdf
commit 133da10b08
28 changed files with 1031 additions and 830 deletions
--- a/service-mgmt/sm-1.0.0/src/sm_failover_survived_state.cpp
+++ b/service-mgmt/sm-1.0.0/src/sm_failover_survived_state.cpp
@@ -10,53 +10,48 @@
 #include "sm_failover_fsm.h"
 #include "sm_failover_ss.h"

-SmErrorT SmFailoverSurvivedState::enter_state()
+static void _audit_failover_state()
 {
-    DPRINTFI("Entering Survived state");
-    return SM_OKAY;
+    SmSystemFailoverStatus& failover_status = SmSystemFailoverStatus::get_status();
+    SmErrorT error = sm_failover_ss_get_survivor(failover_status);
+    SmNodeScheduleStateT host_state = failover_status.get_host_schedule_state();
+    SmNodeScheduleStateT peer_state = failover_status.get_peer_schedule_state();
+    if(SM_OKAY != error)
+    {
+        DPRINTFE("Failed to get failover survivor. Error %s", sm_error_str(error));
+        return;
+    }
+
+    if(SM_NODE_STATE_ACTIVE == host_state && SM_NODE_STATE_FAILED == peer_state)
+    {
+        // nothing changed yet
+    }
+    else if(SM_NODE_STATE_ACTIVE == host_state && SM_NODE_STATE_STANDBY == peer_state)
+    {
+        // Active is the only possible state to be scheduled to from survived state
+        SmFailoverFSM::get_fsm().set_state(SM_FAILOVER_STATE_NORMAL);
+    }else
+    {
+        DPRINTFE("Runtime error: unexpected scheduling state: host %s, peer %s",
+            sm_node_schedule_state_str(host_state),
+            sm_node_schedule_state_str(peer_state));
+    }
 }

 SmErrorT SmFailoverSurvivedState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
 {
    switch (event)
    {
-        case SM_HEARTBEAT_LOST:
-        case SM_INTERFACE_DOWN:
-        case SM_INTERFACE_UP:
-            // ignore, recover is possible only when heartbeat is recovered
+        case SM_FAILOVER_EVENT_IF_STATE_CHANGED:
            break;
-        case SM_HEARTBEAT_RECOVER:
-            {
-                SmSystemFailoverStatus failover_status;
-                SmErrorT error = sm_failover_ss_get_survivor(failover_status);
-                SmNodeScheduleStateT host_state = failover_status.get_host_schedule_state();
-                SmNodeScheduleStateT peer_state = failover_status.get_peer_schedule_state();
-                if(SM_OKAY != error)
-                {
-                    DPRINTFE("Failed to get failover survivor. Error %s", sm_error_str(error));
-                    return error;
-                }
-
-                if(SM_NODE_STATE_ACTIVE == host_state && SM_NODE_STATE_STANDBY == peer_state)
-                {
-                    // Active is the only possible state to be scheduled to from survived state
-                    this->fsm.set_state(SM_FAILOVER_STATE_NORMAL);
-                }else
-                {
-                    DPRINTFE("Runtime error: unexpected scheduling state: host %s, peer %s",
-                        sm_node_schedule_state_str(host_state),
-                        sm_node_schedule_state_str(peer_state));
-                }
-            }
+        case SM_FAILOVER_EVENT_NODE_ENABLED:
+            _audit_failover_state();
            break;
        default:
-            DPRINTFE("Runtime error, unexpected event %d, at state %d", event, this->fsm.get_state());
+            DPRINTFE("Runtime error, unexpected event %s, at state %s",
+                sm_failover_event_str(event),
+                sm_failover_state_str(this->fsm.get_state()));
    }
    return SM_OKAY;
 }

-SmErrorT SmFailoverSurvivedState::exit_state()
-{
-    DPRINTFI("Exiting Survived state");
-    return SM_OKAY;
-}