ad8665a1b7
Uses cluster hbs info to determine which controller to be the survivor when communication lost between 2 controllers with the new rules: 1. If a controller is the only controller to connect to storage-0, it is choosen to be the survivor 2. A controller that can reach more nodes is choosen to be the survivor. 3. A controller is choosen to be failed if it cannot reach any nodes. Story: 2003577 Task: 27704 Change-Id: I79659e1a788b865536500fc125fd65ae2f34123d Signed-off-by: Bin Qian <bin.qian@windriver.com>
385 lines
12 KiB
C++
385 lines
12 KiB
C++
//
|
|
// Copyright (c) 2018 Wind River Systems, Inc.
|
|
//
|
|
// SPDX-License-Identifier: Apache-2.0
|
|
//
|
|
#include "sm_failover_fail_pending_state.h"
|
|
#include <stdlib.h>
|
|
#include <unistd.h>
|
|
#include "sm_cluster_hbs_info_msg.h"
|
|
#include "sm_types.h"
|
|
#include "sm_limits.h"
|
|
#include "sm_debug.h"
|
|
#include "sm_failover.h"
|
|
#include "sm_failover_fsm.h"
|
|
#include "sm_failover_ss.h"
|
|
#include "sm_failover_utils.h"
|
|
#include "sm_node_utils.h"
|
|
#include "sm_node_api.h"
|
|
#include "sm_worker_thread.h"
|
|
|
|
static const int FAIL_PENDING_TIMEOUT = 2000; // 2seconds
|
|
static const int DELAY_QUERY_HBS_MS = FAIL_PENDING_TIMEOUT - 200; // give 200ms for hbs agent to respond
|
|
|
|
static SmTimerIdT action_timer_id = SM_TIMER_ID_INVALID;
|
|
static const int RESET_TIMEOUT = 10 * 1000; // 10 seconds for a reset command to reboot a node
|
|
static const int GO_ACTIVE_TIMEOUT = 30 * 1000; // 30 seconds for standby node go active
|
|
static const int WAIT_RESET_TIMEOUT = RESET_TIMEOUT + 5 * 1000; // extra 5 seconds for sending reset command
|
|
|
|
static void disable_peer(SmSimpleAction&)
|
|
{
|
|
SmErrorT error;
|
|
int counter = 0;
|
|
bool done = false;
|
|
DPRINTFI("To send mtce api to disable peer");
|
|
|
|
while(!done && counter < 5)
|
|
{
|
|
error = sm_failover_disable_peer();
|
|
counter ++;
|
|
if(SM_OKAY != error)
|
|
{
|
|
int wait_interval_us = 1000000; // 1 second
|
|
DPRINTFE("Failed to disable peer, reattempt in %d ms",
|
|
wait_interval_us / 1000);
|
|
usleep(wait_interval_us); // wait 1 second before retry
|
|
}else
|
|
{
|
|
done = true;
|
|
}
|
|
}
|
|
}
|
|
static SmSimpleAction _disable_peer_action("disable-peer", disable_peer);
|
|
|
|
static bool wait_for_reset_timeout(SmTimerIdT timer_id, int64_t user_data)
|
|
{
|
|
//timer never rearm, clear the timer id
|
|
action_timer_id = SM_TIMER_ID_INVALID;
|
|
sm_failover_utils_reset_stayfailed_flag();
|
|
DPRINTFI("stayfailed flag file is removed");
|
|
SmFailoverFSM::get_fsm().set_state(SM_FAILOVER_STATE_SURVIVED);
|
|
return false;
|
|
}
|
|
|
|
static bool reset_standby_timeout(SmTimerIdT timer_id, int64_t user_data)
|
|
{
|
|
//timer never rearm, clear the timer id
|
|
action_timer_id = SM_TIMER_ID_INVALID;
|
|
sm_failover_utils_set_stayfailed_flag();
|
|
DPRINTFI("stayfailed flag file is set");
|
|
const char* timer_name = "confirm standby reset";
|
|
SmErrorT error = sm_timer_register( timer_name, WAIT_RESET_TIMEOUT + WAIT_RESET_TIMEOUT,
|
|
wait_for_reset_timeout,
|
|
0, &action_timer_id);
|
|
|
|
if(SM_OKAY != error)
|
|
{
|
|
DPRINTFE("Failed to register confirm standby reset timer");
|
|
}
|
|
DPRINTFI("wait for %d secs...", (WAIT_RESET_TIMEOUT + WAIT_RESET_TIMEOUT) / 1000);
|
|
return false;
|
|
}
|
|
|
|
SmErrorT blind_guess_start_active()
|
|
{
|
|
SmErrorT error;
|
|
SmWorkerThread::get_worker().add_priority_action(&_disable_peer_action);
|
|
|
|
const char* timer_name = "reset standby";
|
|
error = sm_timer_register( timer_name, RESET_TIMEOUT,
|
|
reset_standby_timeout,
|
|
0, &action_timer_id);
|
|
|
|
if(SM_OKAY != error)
|
|
{
|
|
DPRINTFE("Failed to register reset standby timer");
|
|
return SM_FAILED;
|
|
}
|
|
|
|
DPRINTFI("wait for %d secs...", RESET_TIMEOUT / 1000);
|
|
return SM_OKAY;
|
|
}
|
|
|
|
static void standby_go_active_failed()
|
|
{
|
|
sm_failover_utils_set_stayfailed_flag();
|
|
DPRINTFI("stayfailed flag file is set");
|
|
SmFailoverFSM::get_fsm().set_state(SM_FAILOVER_STATE_FAILED);
|
|
}
|
|
|
|
bool standby_wait_timeout(SmTimerIdT timer_id, int64_t user_data)
|
|
{
|
|
//timer never rearm, clear the timer id
|
|
action_timer_id = SM_TIMER_ID_INVALID;
|
|
sm_failover_utils_reset_stayfailed_flag();
|
|
DPRINTFI("stayfailed flag file is removed");
|
|
char host_name[SM_NODE_NAME_MAX_CHAR];
|
|
SmErrorT error = sm_node_utils_get_hostname(host_name);
|
|
if( SM_OKAY != error )
|
|
{
|
|
DPRINTFE( "Failed to get hostname, error=%s.",
|
|
sm_error_str( error ) );
|
|
|
|
standby_go_active_failed();
|
|
return false;
|
|
}
|
|
|
|
SmNodeScheduleStateT host_state = sm_get_controller_state(host_name);
|
|
if(SM_NODE_STATE_ACTIVE != host_state)
|
|
{
|
|
standby_go_active_failed();
|
|
return false;
|
|
}
|
|
|
|
SmWorkerThread::get_worker().add_priority_action(&_disable_peer_action);
|
|
|
|
SmFailoverFSM::get_fsm().set_state(SM_FAILOVER_STATE_SURVIVED);
|
|
return false;
|
|
}
|
|
|
|
static void blind_guess_start_standby()
|
|
{
|
|
SmErrorT error;
|
|
sm_failover_utils_set_stayfailed_flag();
|
|
DPRINTFI("set stayfailed flag file");
|
|
|
|
sm_failover_activate_self();
|
|
const char* timer_name = "active controller action timeout";
|
|
error = sm_timer_register( timer_name, WAIT_RESET_TIMEOUT + GO_ACTIVE_TIMEOUT,
|
|
standby_wait_timeout,
|
|
0, &action_timer_id);
|
|
|
|
if(SM_OKAY != error)
|
|
{
|
|
DPRINTFE("Failed to register blindguess action timer");
|
|
}
|
|
DPRINTFI("wait for %d secs", (WAIT_RESET_TIMEOUT + GO_ACTIVE_TIMEOUT) / 1000);
|
|
}
|
|
|
|
void blind_guess_scenario_start()
|
|
{
|
|
SmSystemFailoverStatus& failover_status = SmSystemFailoverStatus::get_status();
|
|
SmNodeScheduleStateT host_state = failover_status.get_host_pre_failure_schedule_state();
|
|
SmNodeScheduleStateT peer_state = failover_status.get_peer_pre_failure_schedule_state();
|
|
DPRINTFI("Pre failure host state %s, peer state %s",
|
|
sm_node_schedule_state_str(host_state), sm_node_schedule_state_str(peer_state));
|
|
if(SM_NODE_STATE_ACTIVE == host_state)
|
|
{
|
|
if(SM_NODE_STATE_STANDBY == peer_state)
|
|
{
|
|
blind_guess_start_active();
|
|
}
|
|
}else if(SM_NODE_STATE_STANDBY == host_state)
|
|
{
|
|
blind_guess_start_standby();
|
|
}
|
|
}
|
|
|
|
SmFailoverFailPendingState::SmFailoverFailPendingState(SmFailoverFSM& fsm) : SmFSMState(fsm)
|
|
{
|
|
this->_pending_timer_id = SM_TIMER_ID_INVALID;
|
|
}
|
|
|
|
SmFailoverFailPendingState::~SmFailoverFailPendingState()
|
|
{
|
|
this->_deregister_timer();
|
|
}
|
|
|
|
SmErrorT SmFailoverFailPendingState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
|
|
{
|
|
//SmFSMEventDataTypeT event_data_type = event_data->get_event_data_type();
|
|
bool duplex = false;
|
|
switch (event)
|
|
{
|
|
case SM_FAILOVER_EVENT_IF_STATE_CHANGED:
|
|
// Situation are changing, extend wait time
|
|
this->_register_timer();
|
|
break;
|
|
|
|
case SM_FAILOVER_EVENT_FAIL_PENDING_TIMEOUT:
|
|
sm_node_utils_is_aio_duplex(&duplex);
|
|
if( duplex &&
|
|
0 == (sm_failover_get_if_state() & SM_FAILOVER_HEARTBEAT_ALIVE))
|
|
{
|
|
SmSystemModeT system_mode = sm_node_utils_get_system_mode();
|
|
SmInterfaceTypeT interfaces_to_check[3] = {SM_INTERFACE_UNKNOWN};
|
|
bool healthy = true;
|
|
if(sm_failover_utils_is_stayfailed())
|
|
{
|
|
healthy = false;
|
|
}
|
|
|
|
if(SM_SYSTEM_MODE_CPE_DUPLEX_DC == system_mode)
|
|
{
|
|
interfaces_to_check[0] = SM_INTERFACE_OAM;
|
|
}else
|
|
{
|
|
interfaces_to_check[0] = SM_INTERFACE_MGMT;
|
|
interfaces_to_check[1] = SM_INTERFACE_INFRA;
|
|
}
|
|
for(int i = 0; interfaces_to_check[i] != SM_INTERFACE_UNKNOWN; i ++)
|
|
{
|
|
SmFailoverInterfaceStateT if_state = sm_failover_get_interface_info(interfaces_to_check[i]);
|
|
if(SM_FAILOVER_INTERFACE_DOWN == if_state)
|
|
{
|
|
healthy = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if(healthy)
|
|
{
|
|
blind_guess_scenario_start();
|
|
}
|
|
}
|
|
else
|
|
{
|
|
SmSystemFailoverStatus& failover_status = SmSystemFailoverStatus::get_status();
|
|
SmErrorT error = sm_failover_ss_get_survivor(failover_status);
|
|
SmNodeScheduleStateT host_state = failover_status.get_host_schedule_state();
|
|
SmNodeScheduleStateT peer_state = failover_status.get_peer_schedule_state();
|
|
if(SM_OKAY != error)
|
|
{
|
|
DPRINTFE("Failed to get failover survivor. Error %s", sm_error_str(error));
|
|
return error;
|
|
}
|
|
|
|
error = sm_failover_set_system(failover_status);
|
|
if(SM_NODE_STATE_FAILED == host_state)
|
|
{
|
|
this->fsm.set_state(SM_FAILOVER_STATE_FAILED);
|
|
}
|
|
else if(SM_NODE_STATE_ACTIVE == host_state)
|
|
{
|
|
if(SM_NODE_STATE_FAILED == peer_state)
|
|
{
|
|
this->fsm.set_state(SM_FAILOVER_STATE_SURVIVED);
|
|
}else
|
|
{
|
|
this->fsm.set_state(SM_FAILOVER_STATE_NORMAL);
|
|
}
|
|
}
|
|
else if(SM_NODE_STATE_STANDBY == host_state)
|
|
{
|
|
this->fsm.set_state(SM_FAILOVER_STATE_NORMAL);
|
|
}else
|
|
{
|
|
DPRINTFE("Runtime error: unexpected scheduling state: %s",
|
|
sm_node_schedule_state_str(host_state));
|
|
}
|
|
return error;
|
|
}
|
|
break;
|
|
default:
|
|
DPRINTFE("Runtime error, unexpected event %s, at state %s",
|
|
sm_failover_event_str(event),
|
|
sm_failover_state_str(this->fsm.get_state()));
|
|
}
|
|
return SM_OKAY;
|
|
}
|
|
|
|
bool SmFailoverFailPendingState::_fail_pending_timeout(
|
|
SmTimerIdT timer_id, int64_t user_data)
|
|
{
|
|
SmFailoverFSM::get_fsm().send_event(SM_FAILOVER_EVENT_FAIL_PENDING_TIMEOUT, NULL);
|
|
return false;
|
|
}
|
|
|
|
SmErrorT SmFailoverFailPendingState::enter_state()
|
|
{
|
|
SmFSMState::enter_state();
|
|
SmErrorT error = this->_register_timer();
|
|
if(SM_OKAY != error)
|
|
{
|
|
DPRINTFE("Failed to register fail pending timer. Error %s", sm_error_str(error));
|
|
}
|
|
return error;
|
|
}
|
|
|
|
void _cluster_hbs_response_callback()
|
|
{
|
|
const SmClusterHbsStateT& cluster_hbs_state = SmClusterHbsInfoMsg::get_current_state();
|
|
log_cluster_hbs_state(cluster_hbs_state);
|
|
SmSystemFailoverStatus::get_status().set_cluster_hbs_state(cluster_hbs_state);
|
|
}
|
|
|
|
bool SmFailoverFailPendingState::_delay_query_hbs_timeout(
|
|
SmTimerIdT timer_id, int64_t user_data)
|
|
{
|
|
SmClusterHbsInfoMsg::cluster_hbs_info_query(_cluster_hbs_response_callback);
|
|
return false;
|
|
}
|
|
|
|
SmErrorT SmFailoverFailPendingState::_register_timer()
|
|
{
|
|
SmErrorT error;
|
|
const char* timer_name = "FAIL PENDING TIMER";
|
|
if(SM_TIMER_ID_INVALID != this->_pending_timer_id)
|
|
{
|
|
this->_deregister_timer();
|
|
}
|
|
|
|
error = sm_timer_register(timer_name, FAIL_PENDING_TIMEOUT,
|
|
SmFailoverFailPendingState::_fail_pending_timeout,
|
|
0, &this->_pending_timer_id);
|
|
|
|
const char* delay_query_hbs_timer_name = "DELAY QUERY HBS";
|
|
|
|
error = sm_timer_register(delay_query_hbs_timer_name, DELAY_QUERY_HBS_MS,
|
|
SmFailoverFailPendingState::_delay_query_hbs_timeout,
|
|
0, &this->_delay_query_hbs_timer_id);
|
|
|
|
return error;
|
|
}
|
|
|
|
SmErrorT SmFailoverFailPendingState::_deregister_timer()
|
|
{
|
|
SmErrorT error = SM_OKAY;
|
|
if(SM_TIMER_ID_INVALID != this->_pending_timer_id)
|
|
{
|
|
error = sm_timer_deregister(this->_pending_timer_id);
|
|
if( SM_OKAY != error )
|
|
{
|
|
DPRINTFE( "Failed to cancel fail pending timer, error=%s.",
|
|
sm_error_str( error ) );
|
|
}else
|
|
{
|
|
this->_pending_timer_id = SM_TIMER_ID_INVALID;
|
|
}
|
|
}
|
|
|
|
if(SM_TIMER_ID_INVALID != this->_delay_query_hbs_timer_id)
|
|
{
|
|
error = sm_timer_deregister(this->_delay_query_hbs_timer_id);
|
|
if( SM_OKAY != error )
|
|
{
|
|
DPRINTFE( "Failed to cancel query hbs info timer, error=%s.",
|
|
sm_error_str( error ) );
|
|
}else
|
|
{
|
|
this->_delay_query_hbs_timer_id = SM_TIMER_ID_INVALID;
|
|
}
|
|
}
|
|
return error;
|
|
}
|
|
|
|
SmErrorT SmFailoverFailPendingState::exit_state()
|
|
{
|
|
SmErrorT error = this->_deregister_timer();
|
|
if(SM_OKAY != error)
|
|
{
|
|
DPRINTFE("Failed to deregister fail pending timer. Error %s", sm_error_str(error));
|
|
}
|
|
if(SM_TIMER_ID_INVALID != action_timer_id)
|
|
{
|
|
error = sm_timer_deregister(action_timer_id);
|
|
action_timer_id = SM_TIMER_ID_INVALID;
|
|
if( SM_OKAY != error)
|
|
{
|
|
DPRINTFE("Failed to deregister action timer. Error %s", sm_error_str(error));
|
|
}
|
|
}
|
|
SmFSMState::exit_state();
|
|
return error;
|
|
}
|