Use hbs cluster info to determine best survivor
Uses cluster hbs info to determine which controller is to be the survivor when communication is lost between the 2 controllers, with the new rules: 1. If a controller is the only controller to connect to storage-0, it is chosen to be the survivor 2. A controller that can reach more nodes is chosen to be the survivor. 3. A controller is chosen to be failed if it cannot reach any nodes. Story: 2003577 Task: 27704 Change-Id: I79659e1a788b865536500fc125fd65ae2f34123d Signed-off-by: Bin Qian <bin.qian@windriver.com>
This commit is contained in:
parent
28e293bda5
commit
ad8665a1b7
@ -2,4 +2,4 @@ SRC_DIR=$PKG_BASE
|
|||||||
COPY_LIST="$PKG_BASE/LICENSE"
|
COPY_LIST="$PKG_BASE/LICENSE"
|
||||||
TAR_NAME=sm
|
TAR_NAME=sm
|
||||||
VERSION=1.0.0
|
VERSION=1.0.0
|
||||||
TIS_PATCH_VER=27
|
TIS_PATCH_VER=28
|
||||||
|
@ -21,7 +21,15 @@
|
|||||||
#include "sm_debug.h"
|
#include "sm_debug.h"
|
||||||
#include "sm_limits.h"
|
#include "sm_limits.h"
|
||||||
#include "sm_selobj.h"
|
#include "sm_selobj.h"
|
||||||
#include "sm_timer.h"
|
#include "sm_worker_thread.h"
|
||||||
|
|
||||||
|
// uncomment when debugging this module to enabled DPRINTFD output to log file
|
||||||
|
// #define __DEBUG__MSG__
|
||||||
|
|
||||||
|
#ifdef __DEBUG__MSG__
|
||||||
|
#undef DPRINTFD
|
||||||
|
#define DPRINTFD DPRINTFI
|
||||||
|
#endif
|
||||||
|
|
||||||
#define LOOPBACK_IP "127.0.0.1"
|
#define LOOPBACK_IP "127.0.0.1"
|
||||||
#define SM_CLIENT_PORT_KEY "sm_client_port"
|
#define SM_CLIENT_PORT_KEY "sm_client_port"
|
||||||
@ -64,6 +72,45 @@ bool operator!=(const SmClusterHbsStateT& lhs, const SmClusterHbsStateT& rhs)
|
|||||||
return !(lhs == rhs);
|
return !(lhs == rhs);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void log_cluster_hbs_state(const SmClusterHbsStateT& state)
|
||||||
|
{
|
||||||
|
if(0 == state.last_update)
|
||||||
|
{
|
||||||
|
DPRINTFI("Cluster hbs state not available");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct timespec ts;
|
||||||
|
clock_gettime(CLOCK_REALTIME, &ts);
|
||||||
|
int secs_since_update = ts.tv_sec - state.last_update;
|
||||||
|
|
||||||
|
if(state.storage0_enabled)
|
||||||
|
{
|
||||||
|
DPRINTFI("Cluster hbs last updated %d secs ago, storage-0 is provisioned, "
|
||||||
|
"from controller-0: %d nodes enabled, %d nodes reachable, storage-0 %s responding "
|
||||||
|
"from controller-1: %d nodes enabled, %d nodes reachable, storage-0 %s responding",
|
||||||
|
secs_since_update,
|
||||||
|
state.controllers[0].number_of_node_enabled,
|
||||||
|
state.controllers[0].number_of_node_reachable,
|
||||||
|
state.controllers[0].storage0_responding ? "is" : "is not",
|
||||||
|
state.controllers[1].number_of_node_enabled,
|
||||||
|
state.controllers[1].number_of_node_reachable,
|
||||||
|
state.controllers[1].storage0_responding ? "is" : "is not"
|
||||||
|
);
|
||||||
|
}else
|
||||||
|
{
|
||||||
|
DPRINTFI("Cluster hbs last updated %d secs ago, storage-0 is not provisioned, "
|
||||||
|
"from controller-0: %d nodes enabled, %d nodes reachable, "
|
||||||
|
"from controller-1: %d nodes enabled, %d nodes reachable",
|
||||||
|
secs_since_update,
|
||||||
|
state.controllers[0].number_of_node_enabled,
|
||||||
|
state.controllers[0].number_of_node_reachable,
|
||||||
|
state.controllers[1].number_of_node_enabled,
|
||||||
|
state.controllers[1].number_of_node_reachable
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pthread_mutex_t SmClusterHbsInfoMsg::_mutex;
|
pthread_mutex_t SmClusterHbsInfoMsg::_mutex;
|
||||||
const unsigned short Invalid_Req_Id = 0;
|
const unsigned short Invalid_Req_Id = 0;
|
||||||
int SmClusterHbsInfoMsg::_sock = -1;
|
int SmClusterHbsInfoMsg::_sock = -1;
|
||||||
@ -103,8 +150,8 @@ bool SmClusterHbsInfoMsg::_process_cluster_hbs_history(mtce_hbs_cluster_history_
|
|||||||
DPRINTFE("Invalid oldest entry index %d", history.oldest_entry_index);
|
DPRINTFE("Invalid oldest entry index %d", history.oldest_entry_index);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
int newest_entry_index = (history.oldest_entry_index + history.entries) % MTCE_HBS_HISTORY_ENTRIES;
|
|
||||||
|
|
||||||
|
int newest_entry_index = (history.oldest_entry_index + history.entries - 1) % MTCE_HBS_HISTORY_ENTRIES;
|
||||||
mtce_hbs_cluster_entry_type& entry = history.entry[newest_entry_index];
|
mtce_hbs_cluster_entry_type& entry = history.entry[newest_entry_index];
|
||||||
|
|
||||||
SmClusterHbsInfoT& controller_state = state.controllers[history.controller];
|
SmClusterHbsInfoT& controller_state = state.controllers[history.controller];
|
||||||
@ -112,7 +159,11 @@ bool SmClusterHbsInfoMsg::_process_cluster_hbs_history(mtce_hbs_cluster_history_
|
|||||||
if(entry.hosts_responding > controller_state.number_of_node_reachable)
|
if(entry.hosts_responding > controller_state.number_of_node_reachable)
|
||||||
{
|
{
|
||||||
controller_state.number_of_node_reachable = entry.hosts_responding;
|
controller_state.number_of_node_reachable = entry.hosts_responding;
|
||||||
|
controller_state.number_of_node_enabled = entry.hosts_enabled;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
DPRINTFD("Oldest index %d, entries %d, newest index %d, nodes %d",
|
||||||
|
history.oldest_entry_index, history.entries, newest_entry_index, entry.hosts_responding);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -123,7 +174,6 @@ void SmClusterHbsInfoMsg::_cluster_hbs_info_msg_received( int selobj, int64_t us
|
|||||||
while(true)
|
while(true)
|
||||||
{
|
{
|
||||||
int bytes_read = recv( selobj, &msg, sizeof(msg), MSG_NOSIGNAL | MSG_DONTWAIT );
|
int bytes_read = recv( selobj, &msg, sizeof(msg), MSG_NOSIGNAL | MSG_DONTWAIT );
|
||||||
DPRINTFD("msg received %d bytes. buffer size %d", bytes_read, sizeof(msg));
|
|
||||||
if(bytes_read < 0)
|
if(bytes_read < 0)
|
||||||
{
|
{
|
||||||
if(EAGAIN != errno)
|
if(EAGAIN != errno)
|
||||||
@ -132,6 +182,7 @@ void SmClusterHbsInfoMsg::_cluster_hbs_info_msg_received( int selobj, int64_t us
|
|||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
DPRINTFD("msg received %d bytes. buffer size %d", bytes_read, sizeof(msg));
|
||||||
if(size_of_msg_header > (unsigned int)bytes_read)
|
if(size_of_msg_header > (unsigned int)bytes_read)
|
||||||
{
|
{
|
||||||
DPRINTFE("size not right, msg size %d, expected not less than %d",
|
DPRINTFE("size not right, msg size %d, expected not less than %d",
|
||||||
@ -173,6 +224,8 @@ void SmClusterHbsInfoMsg::_cluster_hbs_info_msg_received( int selobj, int64_t us
|
|||||||
{
|
{
|
||||||
_cluster_hbs_state_previous = _cluster_hbs_state_current;
|
_cluster_hbs_state_previous = _cluster_hbs_state_current;
|
||||||
_cluster_hbs_state_current = state;
|
_cluster_hbs_state_current = state;
|
||||||
|
DPRINTFD("cluster hbs state changed");
|
||||||
|
log_cluster_hbs_state(_cluster_hbs_state_current);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@ -220,6 +273,13 @@ SmErrorT SmClusterHbsInfoMsg::_get_address(const char* port_key, struct sockaddr
|
|||||||
return SM_OKAY;
|
return SM_OKAY;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void send_query(SmSimpleAction&)
|
||||||
|
{
|
||||||
|
SmClusterHbsInfoMsg::cluster_hbs_info_query();
|
||||||
|
}
|
||||||
|
|
||||||
|
static SmSimpleAction _query_hbs_cluster_info_action("send hbs-cluster query", send_query);
|
||||||
|
|
||||||
// ****************************************************************************
|
// ****************************************************************************
|
||||||
// SmClusterHbsInfoMsg::cluster_hbs_info_query -
|
// SmClusterHbsInfoMsg::cluster_hbs_info_query -
|
||||||
// trigger a query of cluster hbs info.
|
// trigger a query of cluster hbs info.
|
||||||
@ -244,35 +304,37 @@ bool SmClusterHbsInfoMsg::cluster_hbs_info_query(cluster_hbs_query_ready_callbac
|
|||||||
char query[request_size];
|
char query[request_size];
|
||||||
unsigned short reqid;
|
unsigned short reqid;
|
||||||
struct timespec ts;
|
struct timespec ts;
|
||||||
mutex_holder holder(&_mutex);
|
|
||||||
if(0 != clock_gettime(CLOCK_REALTIME, &ts))
|
|
||||||
{
|
{
|
||||||
DPRINTFE("Failed to get realtime");
|
mutex_holder holder(&_mutex);
|
||||||
reqid = (unsigned short)1;
|
if(0 != clock_gettime(CLOCK_REALTIME, &ts))
|
||||||
}else
|
{
|
||||||
{
|
DPRINTFE("Failed to get realtime");
|
||||||
unsigned short* v = (unsigned short*)(&ts.tv_nsec);
|
reqid = (unsigned short)1;
|
||||||
reqid = (*v) % 0xFFFE + 1;
|
}else
|
||||||
}
|
{
|
||||||
|
unsigned short* v = (unsigned short*)(&ts.tv_nsec);
|
||||||
|
reqid = (*v) % 0xFFFE + 1;
|
||||||
|
}
|
||||||
|
|
||||||
struct sockaddr_in addr;
|
struct sockaddr_in addr;
|
||||||
if(SM_OKAY != _get_address(SM_SERVER_PORT_KEY, &addr))
|
if(SM_OKAY != _get_address(SM_SERVER_PORT_KEY, &addr))
|
||||||
{
|
{
|
||||||
DPRINTFE("Failed to get address");
|
DPRINTFE("Failed to get address");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
int msg_size = snprintf(query, sizeof(query), json_fmt, reqid);
|
int msg_size = snprintf(query, sizeof(query), json_fmt, reqid);
|
||||||
|
|
||||||
DPRINTFD("msg (%d:%d) to send %s", msg_size + 1, strlen(query), query);
|
DPRINTFD("send %d bytes %s", msg_size, query);
|
||||||
if(0 > sendto(_sock, query, msg_size + 1, 0, (sockaddr*)&addr, sizeof(addr)))
|
if(0 > sendto(_sock, query, msg_size, 0, (sockaddr*)&addr, sizeof(addr)))
|
||||||
{
|
{
|
||||||
DPRINTFE("Failed to send msg. Error %s", strerror(errno));
|
DPRINTFE("Failed to send msg. Error %s", strerror(errno));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if(NULL != callback)
|
if(NULL != callback)
|
||||||
{
|
{
|
||||||
_callbacks.push_back(callback);
|
_callbacks.push_back(callback);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -386,6 +448,7 @@ SmErrorT SmClusterHbsInfoMsg::initialize()
|
|||||||
return SM_FAILED;
|
return SM_FAILED;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
SmWorkerThread::get_worker().add_action(&_query_hbs_cluster_info_action);
|
||||||
return SM_OKAY;
|
return SM_OKAY;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -432,6 +495,7 @@ void SmClusterHbsInfoMsg::dump_hbs_record(FILE* fp)
|
|||||||
|
|
||||||
if(0 != _cluster_hbs_state_previous.last_update)
|
if(0 != _cluster_hbs_state_previous.last_update)
|
||||||
{
|
{
|
||||||
|
t = ts.tv_sec - _cluster_hbs_state_previous.last_update;
|
||||||
fprintf(fp, "\n Previous state, since %d seconds ago\n", (int)t);
|
fprintf(fp, "\n Previous state, since %d seconds ago\n", (int)t);
|
||||||
|
|
||||||
fprintf(fp, " storage-0 is %s configured\n", _cluster_hbs_state_previous.storage0_enabled ? "" : "not");
|
fprintf(fp, " storage-0 is %s configured\n", _cluster_hbs_state_previous.storage0_enabled ? "" : "not");
|
||||||
|
@ -21,7 +21,10 @@ struct _SmClusterHbsInfoT
|
|||||||
{
|
{
|
||||||
bool storage0_responding;
|
bool storage0_responding;
|
||||||
int number_of_node_reachable;
|
int number_of_node_reachable;
|
||||||
_SmClusterHbsInfoT() : storage0_responding(false), number_of_node_reachable(0)
|
int number_of_node_enabled;
|
||||||
|
_SmClusterHbsInfoT() : storage0_responding(false),
|
||||||
|
number_of_node_reachable(0),
|
||||||
|
number_of_node_enabled(0)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -47,6 +50,12 @@ typedef struct
|
|||||||
bool operator==(const SmClusterHbsStateT& lhs, const SmClusterHbsStateT& rhs);
|
bool operator==(const SmClusterHbsStateT& lhs, const SmClusterHbsStateT& rhs);
|
||||||
bool operator!=(const SmClusterHbsStateT& lhs, const SmClusterHbsStateT& rhs);
|
bool operator!=(const SmClusterHbsStateT& lhs, const SmClusterHbsStateT& rhs);
|
||||||
|
|
||||||
|
inline bool is_valid(const SmClusterHbsStateT& state)
|
||||||
|
{
|
||||||
|
return state.last_update > 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void log_cluster_hbs_state(const SmClusterHbsStateT& state);
|
||||||
|
|
||||||
typedef void(*cluster_hbs_query_ready_callback)();
|
typedef void(*cluster_hbs_query_ready_callback)();
|
||||||
// ****************************************************************************
|
// ****************************************************************************
|
||||||
|
@ -29,6 +29,7 @@
|
|||||||
#include "sm_service_domain_neighbor_fsm.h"
|
#include "sm_service_domain_neighbor_fsm.h"
|
||||||
#include "sm_service_domain_member_table.h"
|
#include "sm_service_domain_member_table.h"
|
||||||
#include "sm_service_domain_interface_fsm.h"
|
#include "sm_service_domain_interface_fsm.h"
|
||||||
|
#include "sm_service_domain_fsm.h"
|
||||||
#include "sm_heartbeat_msg.h"
|
#include "sm_heartbeat_msg.h"
|
||||||
#include "sm_node_swact_monitor.h"
|
#include "sm_node_swact_monitor.h"
|
||||||
#include "sm_util_types.h"
|
#include "sm_util_types.h"
|
||||||
@ -602,36 +603,33 @@ SmFailoverActionResultT sm_failover_swact()
|
|||||||
// ****************************************************************************
|
// ****************************************************************************
|
||||||
// Failover - fail self
|
// Failover - fail self
|
||||||
// ==================
|
// ==================
|
||||||
SmFailoverActionResultT sm_failover_fail_self()
|
SmErrorT sm_failover_fail_self()
|
||||||
{
|
{
|
||||||
DPRINTFI("To disable %s", _host_name);
|
DPRINTFI("To disable %s", _host_name);
|
||||||
SmErrorT error = sm_node_fsm_event_handler(
|
SmErrorT error = sm_node_fsm_event_handler(
|
||||||
_host_name, SM_NODE_EVENT_DISABLED, NULL, "Host is isolated" );
|
_host_name, SM_NODE_EVENT_DISABLED, NULL, "Host is failed" );
|
||||||
if( SM_OKAY != error )
|
if( SM_OKAY != error )
|
||||||
{
|
{
|
||||||
DPRINTFE("Failed to disable %s, error: %s", _host_name, sm_error_str(error));
|
DPRINTFE("Failed to disable %s, error: %s", _host_name, sm_error_str(error));
|
||||||
return SM_FAILOVER_ACTION_RESULT_FAILED;
|
return SM_FAILED;
|
||||||
}
|
}
|
||||||
|
|
||||||
sm_node_utils_set_unhealthy();
|
sm_node_utils_set_unhealthy();
|
||||||
|
|
||||||
error = sm_node_api_fail_node( _host_name );
|
error = sm_node_api_fail_node( _host_name );
|
||||||
if (SM_OKAY == error )
|
if (SM_OKAY != error )
|
||||||
{
|
|
||||||
return SM_FAILOVER_ACTION_RESULT_OK;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
{
|
||||||
DPRINTFE("Failed to set %s failed, error %s.", _host_name, sm_error_str(error));
|
DPRINTFE("Failed to set %s failed, error %s.", _host_name, sm_error_str(error));
|
||||||
return SM_FAILOVER_ACTION_RESULT_FAILED;
|
return SM_FAILED;
|
||||||
}
|
}
|
||||||
|
return SM_OKAY;
|
||||||
}
|
}
|
||||||
// ****************************************************************************
|
// ****************************************************************************
|
||||||
|
|
||||||
// ****************************************************************************
|
// ****************************************************************************
|
||||||
// Failover - disable node
|
// Failover - disable node
|
||||||
// ==================
|
// ==================
|
||||||
SmFailoverActionResultT sm_failover_disable_node(char* node_name)
|
SmErrorT sm_failover_disable_node(char* node_name)
|
||||||
{
|
{
|
||||||
DPRINTFI("To disable %s", node_name);
|
DPRINTFI("To disable %s", node_name);
|
||||||
|
|
||||||
@ -645,9 +643,9 @@ SmFailoverActionResultT sm_failover_disable_node(char* node_name)
|
|||||||
{
|
{
|
||||||
DPRINTFE( "Failed to disable node %s, error=%s.",
|
DPRINTFE( "Failed to disable node %s, error=%s.",
|
||||||
node_name, sm_error_str( error ) );
|
node_name, sm_error_str( error ) );
|
||||||
return SM_FAILOVER_ACTION_RESULT_FAILED;
|
return SM_FAILED;
|
||||||
}
|
}
|
||||||
return SM_FAILOVER_ACTION_RESULT_OK;
|
return SM_OKAY;
|
||||||
}
|
}
|
||||||
// ****************************************************************************
|
// ****************************************************************************
|
||||||
|
|
||||||
@ -796,6 +794,21 @@ bool this_controller_unlocked()
|
|||||||
}
|
}
|
||||||
// ****************************************************************************
|
// ****************************************************************************
|
||||||
|
|
||||||
|
static SmErrorT sm_ensure_leader_scheduler()
|
||||||
|
{
|
||||||
|
char controller_domain[] = "controller";
|
||||||
|
char reason_text[SM_LOG_REASON_TEXT_MAX_CHAR] = "Loss of heartbeat";
|
||||||
|
|
||||||
|
SmErrorT error = sm_service_domain_fsm_set_state(
|
||||||
|
controller_domain,
|
||||||
|
SM_SERVICE_DOMAIN_STATE_LEADER,
|
||||||
|
reason_text );
|
||||||
|
if(SM_OKAY != error)
|
||||||
|
{
|
||||||
|
DPRINTFE("Failed to ensure leader scheduler. Error %s", sm_error_str(error));
|
||||||
|
}
|
||||||
|
return error;
|
||||||
|
}
|
||||||
// ****************************************************************************
|
// ****************************************************************************
|
||||||
// Failover - set system to scheduled status
|
// Failover - set system to scheduled status
|
||||||
// ==================
|
// ==================
|
||||||
@ -808,6 +821,16 @@ SmErrorT sm_failover_set_system(const SmSystemFailoverStatus& failover_status)
|
|||||||
SmNodeScheduleStateT host_target_state, peer_target_state;
|
SmNodeScheduleStateT host_target_state, peer_target_state;
|
||||||
host_target_state = failover_status.get_host_schedule_state();
|
host_target_state = failover_status.get_host_schedule_state();
|
||||||
peer_target_state = failover_status.get_peer_schedule_state();
|
peer_target_state = failover_status.get_peer_schedule_state();
|
||||||
|
SmHeartbeatStateT heartbeat_state = failover_status.get_heartbeat_state();
|
||||||
|
if(SM_HEARTBEAT_OK != heartbeat_state)
|
||||||
|
{
|
||||||
|
if(SM_OKAY != sm_ensure_leader_scheduler())
|
||||||
|
{
|
||||||
|
DPRINTFE("Failed to set new leader scheduler to local");
|
||||||
|
return SM_FAILED;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if(SM_NODE_STATE_ACTIVE == host_target_state)
|
if(SM_NODE_STATE_ACTIVE == host_target_state)
|
||||||
{
|
{
|
||||||
if(SM_NODE_STATE_STANDBY == _host_state &&
|
if(SM_NODE_STATE_STANDBY == _host_state &&
|
||||||
@ -818,8 +841,7 @@ SmErrorT sm_failover_set_system(const SmSystemFailoverStatus& failover_status)
|
|||||||
DPRINTFE("Failed to activate %s.", _host_name);
|
DPRINTFE("Failed to activate %s.", _host_name);
|
||||||
return SM_FAILED;
|
return SM_FAILED;
|
||||||
}
|
}
|
||||||
result = sm_failover_disable_node(_peer_name);
|
if(SM_OKAY != sm_failover_disable_node(_peer_name))
|
||||||
if(SM_FAILOVER_ACTION_RESULT_FAILED == result)
|
|
||||||
{
|
{
|
||||||
DPRINTFE("Failed to disable node %s.", _peer_name);
|
DPRINTFE("Failed to disable node %s.", _peer_name);
|
||||||
return SM_FAILED;
|
return SM_FAILED;
|
||||||
@ -839,8 +861,7 @@ SmErrorT sm_failover_set_system(const SmSystemFailoverStatus& failover_status)
|
|||||||
}
|
}
|
||||||
else if(SM_NODE_STATE_FAILED == host_target_state)
|
else if(SM_NODE_STATE_FAILED == host_target_state)
|
||||||
{
|
{
|
||||||
result = sm_failover_disable_node(_host_name);
|
if(SM_OKAY != sm_failover_fail_self())
|
||||||
if(SM_FAILOVER_ACTION_RESULT_FAILED == result)
|
|
||||||
{
|
{
|
||||||
DPRINTFE("Failed disable host %s.", _host_name);
|
DPRINTFE("Failed disable host %s.", _host_name);
|
||||||
return SM_FAILED;
|
return SM_FAILED;
|
||||||
|
@ -6,6 +6,7 @@
|
|||||||
#include "sm_failover_fail_pending_state.h"
|
#include "sm_failover_fail_pending_state.h"
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
#include "sm_cluster_hbs_info_msg.h"
|
||||||
#include "sm_types.h"
|
#include "sm_types.h"
|
||||||
#include "sm_limits.h"
|
#include "sm_limits.h"
|
||||||
#include "sm_debug.h"
|
#include "sm_debug.h"
|
||||||
@ -17,7 +18,8 @@
|
|||||||
#include "sm_node_api.h"
|
#include "sm_node_api.h"
|
||||||
#include "sm_worker_thread.h"
|
#include "sm_worker_thread.h"
|
||||||
|
|
||||||
static const int FAIL_PENDING_TIMEOUT = 2000; //2000ms
|
static const int FAIL_PENDING_TIMEOUT = 2000; // 2seconds
|
||||||
|
static const int DELAY_QUERY_HBS_MS = FAIL_PENDING_TIMEOUT - 200; // give 200ms for hbs agent to respond
|
||||||
|
|
||||||
static SmTimerIdT action_timer_id = SM_TIMER_ID_INVALID;
|
static SmTimerIdT action_timer_id = SM_TIMER_ID_INVALID;
|
||||||
static const int RESET_TIMEOUT = 10 * 1000; // 10 seconds for a reset command to reboot a node
|
static const int RESET_TIMEOUT = 10 * 1000; // 10 seconds for a reset command to reboot a node
|
||||||
@ -294,6 +296,20 @@ SmErrorT SmFailoverFailPendingState::enter_state()
|
|||||||
return error;
|
return error;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void _cluster_hbs_response_callback()
|
||||||
|
{
|
||||||
|
const SmClusterHbsStateT& cluster_hbs_state = SmClusterHbsInfoMsg::get_current_state();
|
||||||
|
log_cluster_hbs_state(cluster_hbs_state);
|
||||||
|
SmSystemFailoverStatus::get_status().set_cluster_hbs_state(cluster_hbs_state);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool SmFailoverFailPendingState::_delay_query_hbs_timeout(
|
||||||
|
SmTimerIdT timer_id, int64_t user_data)
|
||||||
|
{
|
||||||
|
SmClusterHbsInfoMsg::cluster_hbs_info_query(_cluster_hbs_response_callback);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
SmErrorT SmFailoverFailPendingState::_register_timer()
|
SmErrorT SmFailoverFailPendingState::_register_timer()
|
||||||
{
|
{
|
||||||
SmErrorT error;
|
SmErrorT error;
|
||||||
@ -303,31 +319,47 @@ SmErrorT SmFailoverFailPendingState::_register_timer()
|
|||||||
this->_deregister_timer();
|
this->_deregister_timer();
|
||||||
}
|
}
|
||||||
|
|
||||||
error = sm_timer_register( timer_name, FAIL_PENDING_TIMEOUT,
|
error = sm_timer_register(timer_name, FAIL_PENDING_TIMEOUT,
|
||||||
SmFailoverFailPendingState::_fail_pending_timeout,
|
SmFailoverFailPendingState::_fail_pending_timeout,
|
||||||
0, &this->_pending_timer_id);
|
0, &this->_pending_timer_id);
|
||||||
|
|
||||||
|
const char* delay_query_hbs_timer_name = "DELAY QUERY HBS";
|
||||||
|
|
||||||
|
error = sm_timer_register(delay_query_hbs_timer_name, DELAY_QUERY_HBS_MS,
|
||||||
|
SmFailoverFailPendingState::_delay_query_hbs_timeout,
|
||||||
|
0, &this->_delay_query_hbs_timer_id);
|
||||||
|
|
||||||
return error;
|
return error;
|
||||||
}
|
}
|
||||||
|
|
||||||
SmErrorT SmFailoverFailPendingState::_deregister_timer()
|
SmErrorT SmFailoverFailPendingState::_deregister_timer()
|
||||||
{
|
{
|
||||||
SmErrorT error;
|
SmErrorT error = SM_OKAY;
|
||||||
if(SM_TIMER_ID_INVALID == this->_pending_timer_id)
|
if(SM_TIMER_ID_INVALID != this->_pending_timer_id)
|
||||||
{
|
{
|
||||||
return SM_OKAY;
|
error = sm_timer_deregister(this->_pending_timer_id);
|
||||||
|
if( SM_OKAY != error )
|
||||||
|
{
|
||||||
|
DPRINTFE( "Failed to cancel fail pending timer, error=%s.",
|
||||||
|
sm_error_str( error ) );
|
||||||
|
}else
|
||||||
|
{
|
||||||
|
this->_pending_timer_id = SM_TIMER_ID_INVALID;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
error = sm_timer_deregister(this->_pending_timer_id);
|
if(SM_TIMER_ID_INVALID != this->_delay_query_hbs_timer_id)
|
||||||
if( SM_OKAY != error )
|
|
||||||
{
|
{
|
||||||
DPRINTFE( "Failed to cancel fail pending timer, error=%s.",
|
error = sm_timer_deregister(this->_delay_query_hbs_timer_id);
|
||||||
sm_error_str( error ) );
|
if( SM_OKAY != error )
|
||||||
}else
|
{
|
||||||
{
|
DPRINTFE( "Failed to cancel query hbs info timer, error=%s.",
|
||||||
this->_pending_timer_id = SM_TIMER_ID_INVALID;
|
sm_error_str( error ) );
|
||||||
|
}else
|
||||||
|
{
|
||||||
|
this->_delay_query_hbs_timer_id = SM_TIMER_ID_INVALID;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return error;
|
return error;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -22,8 +22,11 @@ class SmFailoverFailPendingState : public SmFSMState
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
SmTimerIdT _pending_timer_id;
|
SmTimerIdT _pending_timer_id;
|
||||||
|
SmTimerIdT _delay_query_hbs_timer_id;
|
||||||
|
|
||||||
static bool _fail_pending_timeout(SmTimerIdT timer_id, int64_t user_data);
|
static bool _fail_pending_timeout(SmTimerIdT timer_id, int64_t user_data);
|
||||||
|
static bool _delay_query_hbs_timeout(SmTimerIdT timer_id, int64_t user_data);
|
||||||
|
static void cluster_hbs_response_callback();
|
||||||
SmErrorT _register_timer();
|
SmErrorT _register_timer();
|
||||||
SmErrorT _deregister_timer();
|
SmErrorT _deregister_timer();
|
||||||
};
|
};
|
||||||
|
@ -10,41 +10,15 @@
|
|||||||
#include "sm_failover_fsm.h"
|
#include "sm_failover_fsm.h"
|
||||||
#include "sm_failover_ss.h"
|
#include "sm_failover_ss.h"
|
||||||
|
|
||||||
static void _audit_failover_state()
|
|
||||||
{
|
|
||||||
SmSystemFailoverStatus& failover_status = SmSystemFailoverStatus::get_status();
|
|
||||||
SmErrorT error = sm_failover_ss_get_survivor(failover_status);
|
|
||||||
SmNodeScheduleStateT host_state = failover_status.get_host_schedule_state();
|
|
||||||
SmNodeScheduleStateT peer_state = failover_status.get_peer_schedule_state();
|
|
||||||
if(SM_OKAY != error)
|
|
||||||
{
|
|
||||||
DPRINTFE("Failed to get failover survivor. Error %s", sm_error_str(error));
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if(SM_NODE_STATE_FAILED == host_state)
|
|
||||||
{
|
|
||||||
// don't need to set to failed state, already here
|
|
||||||
}
|
|
||||||
else if(SM_NODE_STATE_STANDBY == host_state && SM_NODE_STATE_ACTIVE == peer_state)
|
|
||||||
{
|
|
||||||
// standby is the only possible state to be scheduled to from failed state
|
|
||||||
SmFailoverFSM::get_fsm().set_state(SM_FAILOVER_STATE_NORMAL);
|
|
||||||
}else
|
|
||||||
{
|
|
||||||
DPRINTFE("Runtime error: unexpected scheduling state: %s",
|
|
||||||
sm_node_schedule_state_str(host_state));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
|
SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
|
||||||
{
|
{
|
||||||
|
// Currently the only supported scenario to recover from failure is
|
||||||
|
// reboot triggered by mtce.
|
||||||
|
// So once entering failed state, wait for reboot to reenter the normal state.
|
||||||
switch (event)
|
switch (event)
|
||||||
{
|
{
|
||||||
case SM_FAILOVER_EVENT_IF_STATE_CHANGED:
|
case SM_FAILOVER_EVENT_IF_STATE_CHANGED:
|
||||||
DPRINTFI("sm_heartbeat_recover event received.");
|
// event will be fired, but couldn't bring fsm state back to normal
|
||||||
_audit_failover_state();
|
|
||||||
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
|
@ -40,6 +40,7 @@ void SmIFStateChangedEventData::set_interface_state(
|
|||||||
DPRINTFE("Runtime error: invalid interface type %d", interface_type);
|
DPRINTFE("Runtime error: invalid interface type %d", interface_type);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
SmFailoverInterfaceStateT SmIFStateChangedEventData::get_interface_state(SmInterfaceTypeT interface_type) const
|
SmFailoverInterfaceStateT SmIFStateChangedEventData::get_interface_state(SmInterfaceTypeT interface_type) const
|
||||||
{
|
{
|
||||||
switch (interface_type)
|
switch (interface_type)
|
||||||
@ -54,7 +55,6 @@ SmFailoverInterfaceStateT SmIFStateChangedEventData::get_interface_state(SmInter
|
|||||||
DPRINTFE("Runtime error: invalid interface type %d", interface_type);
|
DPRINTFE("Runtime error: invalid interface type %d", interface_type);
|
||||||
return SM_FAILOVER_INTERFACE_UNKNOWN;
|
return SM_FAILOVER_INTERFACE_UNKNOWN;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
SmErrorT SmFSMState::enter_state()
|
SmErrorT SmFSMState::enter_state()
|
||||||
|
@ -13,6 +13,7 @@
|
|||||||
#include "sm_failover_utils.h"
|
#include "sm_failover_utils.h"
|
||||||
#include "sm_failover_fsm.h"
|
#include "sm_failover_fsm.h"
|
||||||
#include "sm_failover_ss.h"
|
#include "sm_failover_ss.h"
|
||||||
|
#include "sm_cluster_hbs_info_msg.h"
|
||||||
|
|
||||||
SmErrorT SmFailoverNormalState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
|
SmErrorT SmFailoverNormalState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
|
||||||
{
|
{
|
||||||
@ -79,6 +80,29 @@ SmErrorT SmFailoverNormalState::exit_state()
|
|||||||
failover_status.set_peer_pre_failure_schedule_state(peer_state);
|
failover_status.set_peer_pre_failure_schedule_state(peer_state);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const SmClusterHbsStateT& cluster_hbs_state_cur = SmClusterHbsInfoMsg::get_current_state();
|
||||||
|
const SmClusterHbsStateT& cluster_hbs_state_pre = SmClusterHbsInfoMsg::get_previous_state();
|
||||||
|
SmClusterHbsStateT pre_failure_cluster_hsb_state;
|
||||||
|
if(!is_valid(cluster_hbs_state_cur))
|
||||||
|
{
|
||||||
|
DPRINTFE("No cluster hbs state available");
|
||||||
|
}else
|
||||||
|
{
|
||||||
|
struct timespec ts;
|
||||||
|
clock_gettime(CLOCK_REALTIME, &ts);
|
||||||
|
if(ts.tv_sec - cluster_hbs_state_cur.last_update <= 1 && cluster_hbs_state_pre.last_update != 0)
|
||||||
|
{
|
||||||
|
// cluster hbs state changed within past 1 second, take the pre state as pre-failure state.
|
||||||
|
pre_failure_cluster_hsb_state = cluster_hbs_state_pre;
|
||||||
|
}else
|
||||||
|
{
|
||||||
|
pre_failure_cluster_hsb_state = cluster_hbs_state_cur;
|
||||||
|
}
|
||||||
|
|
||||||
|
log_cluster_hbs_state(pre_failure_cluster_hsb_state);
|
||||||
|
failover_status.set_pre_failure_cluster_hbs_state(pre_failure_cluster_hsb_state);
|
||||||
|
}
|
||||||
|
|
||||||
SmFSMState::exit_state();
|
SmFSMState::exit_state();
|
||||||
return SM_OKAY;
|
return SM_OKAY;
|
||||||
}
|
}
|
||||||
|
@ -5,6 +5,7 @@
|
|||||||
//
|
//
|
||||||
|
|
||||||
#include "sm_failover_ss.h"
|
#include "sm_failover_ss.h"
|
||||||
|
#include <string.h>
|
||||||
#include <time.h>
|
#include <time.h>
|
||||||
#include "sm_debug.h"
|
#include "sm_debug.h"
|
||||||
#include "sm_limits.h"
|
#include "sm_limits.h"
|
||||||
@ -15,6 +16,14 @@
|
|||||||
#include "sm_node_api.h"
|
#include "sm_node_api.h"
|
||||||
#include "sm_failover.h"
|
#include "sm_failover.h"
|
||||||
|
|
||||||
|
// uncomment when debugging this module to enabled DPRINTFD output to log file
|
||||||
|
// #define __DEBUG__MSG__
|
||||||
|
|
||||||
|
#ifdef __DEBUG__MSG__
|
||||||
|
#undef DPRINTFD
|
||||||
|
#define DPRINTFD DPRINTFI
|
||||||
|
#endif
|
||||||
|
|
||||||
//
|
//
|
||||||
SmErrorT _get_survivor_dc(const SmSystemStatusT& system_status, SmSystemFailoverStatus& selection);
|
SmErrorT _get_survivor_dc(const SmSystemStatusT& system_status, SmSystemFailoverStatus& selection);
|
||||||
|
|
||||||
@ -117,6 +126,26 @@ void SmSystemFailoverStatus::set_host_pre_failure_schedule_state(SmNodeScheduleS
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void SmSystemFailoverStatus::set_cluster_hbs_state(const SmClusterHbsStateT& state)
|
||||||
|
{
|
||||||
|
if( !is_valid(state) )
|
||||||
|
{
|
||||||
|
DPRINTFE("Runtime error. Invalid cluster hbs state");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
_cluster_hbs_state = state;
|
||||||
|
}
|
||||||
|
|
||||||
|
void SmSystemFailoverStatus::set_pre_failure_cluster_hbs_state(const SmClusterHbsStateT& state)
|
||||||
|
{
|
||||||
|
if( !is_valid(state) )
|
||||||
|
{
|
||||||
|
DPRINTFE("Runtime error. Invalid cluster hbs state");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
_pre_failure_cluster_hbs_state = state;
|
||||||
|
}
|
||||||
|
|
||||||
void SmSystemFailoverStatus::set_peer_schedule_state(SmNodeScheduleStateT state)
|
void SmSystemFailoverStatus::set_peer_schedule_state(SmNodeScheduleStateT state)
|
||||||
{
|
{
|
||||||
if(_is_valid_schedule_state(state))
|
if(_is_valid_schedule_state(state))
|
||||||
@ -250,6 +279,8 @@ SmErrorT _get_system_status(SmSystemStatusT& sys_status, char host_name[], char
|
|||||||
sys_status.heartbeat_state = SM_HEARTBEAT_LOSS;
|
sys_status.heartbeat_state = SM_HEARTBEAT_LOSS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
SmSystemFailoverStatus::get_status().set_heartbeat_state(sys_status.heartbeat_state);
|
||||||
|
|
||||||
sys_status.host_status.node_name = host_name;
|
sys_status.host_status.node_name = host_name;
|
||||||
sys_status.host_status.interface_state = sm_failover_if_state_get();
|
sys_status.host_status.interface_state = sm_failover_if_state_get();
|
||||||
sys_status.host_status.current_schedule_state = host_state;
|
sys_status.host_status.current_schedule_state = host_state;
|
||||||
@ -319,8 +350,154 @@ SmErrorT sm_failover_ss_get_survivor(const SmSystemStatusT& system_status, SmSys
|
|||||||
}else
|
}else
|
||||||
{
|
{
|
||||||
DPRINTFI("Loss of heartbeat ALL");
|
DPRINTFI("Loss of heartbeat ALL");
|
||||||
selection.set_host_schedule_state(SM_NODE_STATE_ACTIVE);
|
bool expect_storage_0 = false;
|
||||||
selection.set_peer_schedule_state(SM_NODE_STATE_FAILED);
|
SmClusterHbsStateT pre_failure_cluster_hbs_state = selection.get_pre_failure_cluster_hbs_state();
|
||||||
|
SmClusterHbsStateT current_cluster_hbs_state = selection.get_cluster_hbs_state();
|
||||||
|
bool has_cluser_info = true;
|
||||||
|
int max_nodes_available = 0;
|
||||||
|
if(is_valid(pre_failure_cluster_hbs_state))
|
||||||
|
{
|
||||||
|
expect_storage_0 = pre_failure_cluster_hbs_state.storage0_enabled;
|
||||||
|
for(unsigned int i = 0; i < max_controllers; i ++)
|
||||||
|
{
|
||||||
|
if(max_nodes_available < pre_failure_cluster_hbs_state.controllers[i].number_of_node_reachable)
|
||||||
|
{
|
||||||
|
max_nodes_available = pre_failure_cluster_hbs_state.controllers[i].number_of_node_reachable;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}else if(is_valid(current_cluster_hbs_state))
|
||||||
|
{
|
||||||
|
expect_storage_0 = current_cluster_hbs_state.storage0_enabled;
|
||||||
|
for(unsigned int i = 0; i < max_controllers; i ++)
|
||||||
|
{
|
||||||
|
if(max_nodes_available < pre_failure_cluster_hbs_state.controllers[i].number_of_node_reachable)
|
||||||
|
{
|
||||||
|
max_nodes_available = pre_failure_cluster_hbs_state.controllers[i].number_of_node_reachable;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}else
|
||||||
|
{
|
||||||
|
has_cluser_info = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(has_cluser_info && max_nodes_available > 1)
|
||||||
|
{
|
||||||
|
DPRINTFD("storage-0 is %s", expect_storage_0 ? "enabled":"not enabled");
|
||||||
|
int this_controller_index, peer_controller_index;
|
||||||
|
|
||||||
|
char host_name[SM_NODE_NAME_MAX_CHAR];
|
||||||
|
SmErrorT error = sm_node_utils_get_hostname(host_name);
|
||||||
|
if( SM_OKAY != error )
|
||||||
|
{
|
||||||
|
DPRINTFE( "Failed to get hostname, error=%s.",
|
||||||
|
sm_error_str( error ) );
|
||||||
|
return SM_FAILED;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(0 == strncmp(SM_NODE_CONTROLLER_0_NAME, host_name, sizeof(SM_NODE_CONTROLLER_0_NAME)))
|
||||||
|
{
|
||||||
|
this_controller_index = 0;
|
||||||
|
peer_controller_index = 1;
|
||||||
|
}else
|
||||||
|
{
|
||||||
|
this_controller_index = 1;
|
||||||
|
peer_controller_index = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool survivor_selected = false;
|
||||||
|
if(expect_storage_0)
|
||||||
|
{
|
||||||
|
if(current_cluster_hbs_state.controllers[this_controller_index].storage0_responding &&
|
||||||
|
!current_cluster_hbs_state.controllers[peer_controller_index].storage0_responding)
|
||||||
|
{
|
||||||
|
DPRINTFI("peer cannot reach storage-0. host can. host will be survivor");
|
||||||
|
selection.set_host_schedule_state(SM_NODE_STATE_ACTIVE);
|
||||||
|
selection.set_peer_schedule_state(SM_NODE_STATE_FAILED);
|
||||||
|
survivor_selected = true;
|
||||||
|
}else if(!current_cluster_hbs_state.controllers[this_controller_index].storage0_responding &&
|
||||||
|
current_cluster_hbs_state.controllers[peer_controller_index].storage0_responding)
|
||||||
|
{
|
||||||
|
DPRINTFI("host cannot reach storage-0. peer can. peer will be survivor");
|
||||||
|
selection.set_host_schedule_state(SM_NODE_STATE_FAILED);
|
||||||
|
selection.set_peer_schedule_state(SM_NODE_STATE_ACTIVE);
|
||||||
|
survivor_selected = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if(!survivor_selected)
|
||||||
|
{
|
||||||
|
// so no storage-0 or storage-0 state same on both side
|
||||||
|
if(current_cluster_hbs_state.controllers[this_controller_index].number_of_node_reachable >
|
||||||
|
current_cluster_hbs_state.controllers[peer_controller_index].number_of_node_reachable)
|
||||||
|
{
|
||||||
|
DPRINTFI("host reaches %d nodes, peer reaches %d nodes, host will be survivor",
|
||||||
|
current_cluster_hbs_state.controllers[this_controller_index].number_of_node_reachable,
|
||||||
|
current_cluster_hbs_state.controllers[peer_controller_index].number_of_node_reachable
|
||||||
|
);
|
||||||
|
selection.set_host_schedule_state(SM_NODE_STATE_ACTIVE);
|
||||||
|
selection.set_peer_schedule_state(SM_NODE_STATE_FAILED);
|
||||||
|
survivor_selected = true;
|
||||||
|
}else if (current_cluster_hbs_state.controllers[this_controller_index].number_of_node_reachable <
|
||||||
|
current_cluster_hbs_state.controllers[peer_controller_index].number_of_node_reachable)
|
||||||
|
{
|
||||||
|
DPRINTFI("host reaches %d nodes, peer reaches %d nodes, peer will be survivor",
|
||||||
|
current_cluster_hbs_state.controllers[this_controller_index].number_of_node_reachable,
|
||||||
|
current_cluster_hbs_state.controllers[peer_controller_index].number_of_node_reachable
|
||||||
|
);
|
||||||
|
selection.set_host_schedule_state(SM_NODE_STATE_FAILED);
|
||||||
|
selection.set_peer_schedule_state(SM_NODE_STATE_ACTIVE);
|
||||||
|
survivor_selected = true;
|
||||||
|
}else
|
||||||
|
{
|
||||||
|
if(pre_failure_cluster_hbs_state != current_cluster_hbs_state)
|
||||||
|
{
|
||||||
|
if(0 == current_cluster_hbs_state.controllers[this_controller_index].number_of_node_reachable)
|
||||||
|
{
|
||||||
|
// Cannot reach any nodes, I am dead
|
||||||
|
DPRINTFI("host cannot reach any nodes, peer will be survivor",
|
||||||
|
current_cluster_hbs_state.controllers[this_controller_index].number_of_node_reachable,
|
||||||
|
current_cluster_hbs_state.controllers[peer_controller_index].number_of_node_reachable
|
||||||
|
);
|
||||||
|
selection.set_host_schedule_state(SM_NODE_STATE_FAILED);
|
||||||
|
selection.set_peer_schedule_state(SM_NODE_STATE_ACTIVE);
|
||||||
|
}else
|
||||||
|
{
|
||||||
|
// equaly split, failed the standby
|
||||||
|
if(SM_NODE_STATE_ACTIVE == system_status.host_status.current_schedule_state)
|
||||||
|
{
|
||||||
|
DPRINTFI("host reaches %d nodes, peer reaches %d nodes, host will be survivor",
|
||||||
|
current_cluster_hbs_state.controllers[this_controller_index].number_of_node_reachable,
|
||||||
|
current_cluster_hbs_state.controllers[peer_controller_index].number_of_node_reachable
|
||||||
|
);
|
||||||
|
selection.set_peer_schedule_state(SM_NODE_STATE_FAILED);
|
||||||
|
}else
|
||||||
|
{
|
||||||
|
DPRINTFI("host reaches %d nodes, peer reaches %d nodes, peer will be survivor",
|
||||||
|
current_cluster_hbs_state.controllers[this_controller_index].number_of_node_reachable,
|
||||||
|
current_cluster_hbs_state.controllers[peer_controller_index].number_of_node_reachable
|
||||||
|
);
|
||||||
|
selection.set_host_schedule_state(SM_NODE_STATE_FAILED);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// no connectivity status changed? peer sm is not responding
|
||||||
|
DPRINTFI("Peer SM is not responding, host will be survivor");
|
||||||
|
selection.set_host_schedule_state(SM_NODE_STATE_ACTIVE);
|
||||||
|
selection.set_peer_schedule_state(SM_NODE_STATE_FAILED);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// no cluster info, peer is assumed down
|
||||||
|
// the connecting to majority nodes rule is postponed
|
||||||
|
DPRINTFI("No cluster hbs info, host will be survivor");
|
||||||
|
selection.set_host_schedule_state(SM_NODE_STATE_ACTIVE);
|
||||||
|
selection.set_peer_schedule_state(SM_NODE_STATE_FAILED);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if(SM_SYSTEM_MODE_CPE_DUPLEX == system_status.system_mode)
|
if(SM_SYSTEM_MODE_CPE_DUPLEX == system_status.system_mode)
|
||||||
|
@ -8,6 +8,7 @@
|
|||||||
#define __SM_FAILOVER_SS_H__
|
#define __SM_FAILOVER_SS_H__
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include "sm_types.h"
|
#include "sm_types.h"
|
||||||
|
#include "sm_cluster_hbs_info_msg.h"
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
@ -30,13 +31,13 @@ typedef enum
|
|||||||
SM_HEARTBEAT_INDIRECT,
|
SM_HEARTBEAT_INDIRECT,
|
||||||
//no heartbeat
|
//no heartbeat
|
||||||
SM_HEARTBEAT_LOSS
|
SM_HEARTBEAT_LOSS
|
||||||
}SmHeartbeatStatusT;
|
}SmHeartbeatStateT;
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
SmNodeStatusT host_status;
|
SmNodeStatusT host_status;
|
||||||
SmNodeStatusT peer_status;
|
SmNodeStatusT peer_status;
|
||||||
SmHeartbeatStatusT heartbeat_state;
|
SmHeartbeatStateT heartbeat_state;
|
||||||
SmSystemModeT system_mode;
|
SmSystemModeT system_mode;
|
||||||
}SmSystemStatusT;
|
}SmSystemStatusT;
|
||||||
|
|
||||||
@ -48,11 +49,30 @@ class SmSystemFailoverStatus
|
|||||||
inline SmNodeScheduleStateT get_host_schedule_state() const {
|
inline SmNodeScheduleStateT get_host_schedule_state() const {
|
||||||
return _host_schedule_state;
|
return _host_schedule_state;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline SmNodeScheduleStateT get_host_pre_failure_schedule_state() const {
|
inline SmNodeScheduleStateT get_host_pre_failure_schedule_state() const {
|
||||||
return _host_pre_failure_schedule_state;
|
return _host_pre_failure_schedule_state;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline SmClusterHbsStateT get_cluster_hbs_state() const {
|
||||||
|
return _cluster_hbs_state;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline SmClusterHbsStateT get_pre_failure_cluster_hbs_state() const {
|
||||||
|
return _pre_failure_cluster_hbs_state;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void set_heartbeat_state(SmHeartbeatStateT heartbeat_state)
|
||||||
|
{
|
||||||
|
_heartbeat_state = heartbeat_state;
|
||||||
|
}
|
||||||
|
inline SmHeartbeatStateT get_heartbeat_state() const {
|
||||||
|
return _heartbeat_state;
|
||||||
|
}
|
||||||
void set_host_schedule_state(SmNodeScheduleStateT state);
|
void set_host_schedule_state(SmNodeScheduleStateT state);
|
||||||
void set_host_pre_failure_schedule_state(SmNodeScheduleStateT state);
|
void set_host_pre_failure_schedule_state(SmNodeScheduleStateT state);
|
||||||
|
void set_cluster_hbs_state(const SmClusterHbsStateT& state);
|
||||||
|
void set_pre_failure_cluster_hbs_state(const SmClusterHbsStateT& state);
|
||||||
inline SmNodeScheduleStateT get_peer_schedule_state() const {
|
inline SmNodeScheduleStateT get_peer_schedule_state() const {
|
||||||
return _peer_schedule_state;
|
return _peer_schedule_state;
|
||||||
}
|
}
|
||||||
@ -68,8 +88,11 @@ class SmSystemFailoverStatus
|
|||||||
SmSystemFailoverStatus();
|
SmSystemFailoverStatus();
|
||||||
SmNodeScheduleStateT _host_pre_failure_schedule_state;
|
SmNodeScheduleStateT _host_pre_failure_schedule_state;
|
||||||
SmNodeScheduleStateT _peer_pre_failure_schedule_state;
|
SmNodeScheduleStateT _peer_pre_failure_schedule_state;
|
||||||
|
SmClusterHbsStateT _pre_failure_cluster_hbs_state;
|
||||||
SmNodeScheduleStateT _host_schedule_state;
|
SmNodeScheduleStateT _host_schedule_state;
|
||||||
SmNodeScheduleStateT _peer_schedule_state;
|
SmNodeScheduleStateT _peer_schedule_state;
|
||||||
|
SmClusterHbsStateT _cluster_hbs_state;
|
||||||
|
SmHeartbeatStateT _heartbeat_state;
|
||||||
static const char filename[];
|
static const char filename[];
|
||||||
static const char file_format[];
|
static const char file_format[];
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user