diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h index 2d02bd97..b98c34d4 100755 --- a/mtce-common/src/common/nodeBase.h +++ b/mtce-common/src/common/nodeBase.h @@ -263,6 +263,7 @@ typedef enum #define MTC_TASK_ENABLE_WORK_FAIL "Enable Action Failed" #define MTC_TASK_ENABLE_WORK_TO "Enable Action Timeout" #define MTC_TASK_ENABLE_FAIL_HB "Enable Heartbeat Failure, re-enabling" +#define MTC_TASK_RECOVERY_FAIL_HB "Graceful Recovery Heartbeat Failure, re-enabling" #define MTC_TASK_RECOVERY_FAIL "Graceful Recovery Failed, re-enabling" #define MTC_TASK_RECOVERY_WAIT "Graceful Recovery Wait" #define MTC_TASK_RECOVERED "Gracefully Recovered" diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index 638d12fd..e2320430 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -4131,6 +4131,18 @@ int nodeLinkClass::get_uptime_refresh_ctr ( string & hostname ) return (0); } + +int nodeLinkClass::get_mtce_flags ( string & hostname ) +{ + nodeLinkClass::node* node_ptr ; + node_ptr = nodeLinkClass::getNode ( hostname ); + if ( node_ptr != NULL ) + { + return ( node_ptr->mtce_flags ); + } + return (0); +} + void nodeLinkClass::set_mtce_flags ( string hostname, int flags, int iface ) { nodeLinkClass::node* node_ptr = nodeLinkClass::getNode ( hostname ); @@ -4883,7 +4895,10 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa /* Otherwise this is a single host that has recovered * possibly as part of a mnfa group or simply a lone wolf */ - else + else if (( node_ptr->hbs_minor[MGMNT_IFACE] == false ) && + (( clstr_network_provisioned == false ) || + (( clstr_network_provisioned == true ) && + ( node_ptr->hbs_minor[CLSTR_IFACE] == false )))) { if ( node_ptr->mnfa_graceful_recovery == true ) { @@ -4891,6 +4906,8 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa mnfa_awol_list.remove(node_ptr->hostname); } + /* Don't recover until heartbeat is working over all + * monitored interfaces */ mnfa_recover_host ( node_ptr ); if ( mnfa_active == true ) @@ -5044,11 +5061,28 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface } return ; } + else if ( node_ptr->recoveryStage == MTC_RECOVERY__HEARTBEAT_SOAK ) + { + elog ("%s %s *** Heartbeat Loss *** (during recovery soak)\n", + hostname.c_str(), + get_iface_name_str(iface)); + force_full_enable ( node_ptr ); + return ; + } mnfa_add_host ( node_ptr , iface ); if ( mnfa_active == false ) { + /* if node is already in graceful recovery just ignore the event */ + if ( node_ptr->graceful_recovery_counter != 0 ) + { + dlog ("%s %s loss event ; already in graceful recovery try %d", + hostname.c_str(), + get_iface_name_str(iface), + node_ptr->graceful_recovery_counter ); + return ; + } elog ("%s %s *** Heartbeat Loss ***\n", hostname.c_str(), get_iface_name_str(iface)); if ( iface == CLSTR_IFACE ) { diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h index 0ea25e1a..5df2ce56 100755 --- a/mtce/src/common/nodeClass.h +++ b/mtce/src/common/nodeClass.h @@ -1775,6 +1775,7 @@ public: #define MTC_FLAG__I_AM_LOCKED (0x00000008) */ void set_mtce_flags ( string hostname, int flags, int iface ); + int get_mtce_flags ( string & hostname ); /** Updates the node's health code * Codes are found in nodeBase.h diff --git a/mtce/src/heartbeat/hbsAgent.cpp b/mtce/src/heartbeat/hbsAgent.cpp index c891fdb4..ecd6941a 100644 --- a/mtce/src/heartbeat/hbsAgent.cpp +++ b/mtce/src/heartbeat/hbsAgent.cpp @@ -2127,7 +2127,7 @@ void daemon_service_run ( void ) { if ( hostname != hbsInv.my_hostname ) { - hbsInv.mon_host ( hostname, false, true ); + hbsInv.mon_host ( hostname, false, false ); hbs_cluster_del ( hostname ); ilog ("%s heartbeat service disabled by stop command", hostname.c_str()); diff --git a/mtce/src/maintenance/mtcCtrlMsg.cpp b/mtce/src/maintenance/mtcCtrlMsg.cpp index 45232d43..a77131f3 100755 --- a/mtce/src/maintenance/mtcCtrlMsg.cpp +++ b/mtce/src/maintenance/mtcCtrlMsg.cpp @@ -1262,13 +1262,69 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ) { elog ("%s Failed to send inventory to heartbeat service\n", hostname.c_str()); } - /* Send the start event to the heartbeat service for all enabled hosts */ - if (( obj_ptr->get_adminState ( hostname ) == MTC_ADMIN_STATE__UNLOCKED ) && + /* Consider sending the 'start' request to the heartbeat service + * for all enabled hosts except for this active controller. */ + if (( obj_ptr->my_hostname != hostname ) && + ( obj_ptr->get_adminState ( hostname ) == MTC_ADMIN_STATE__UNLOCKED ) && ( obj_ptr->get_operState ( hostname ) == MTC_OPER_STATE__ENABLED ) && ((obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__AVAILABLE ) || (obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__DEGRADED ))) { - send_hbs_command ( hostname, MTC_CMD_START_HOST, controller ); + /* However, bypass sending heartbeat 'start' for nodes that + * are not ready to heartbeat; enabling, configuring, testing. + * Such cases are if a host is: + * + * 1. running the add_handler or + * 2. running the enable_handler or + * 3. running the enable_subf_handler or + * 4. not configured or + * 5. not tested (goenabled not complete) + * + */ + mtc_nodeAdminAction_enum current_action = + obj_ptr->get_adminAction (hostname); + if (( current_action != MTC_ADMIN_ACTION__ADD ) && + ( current_action != MTC_ADMIN_ACTION__ENABLE ) && + ( current_action != MTC_ADMIN_ACTION__ENABLE_SUBF )) + { + int mtce_flags = obj_ptr->get_mtce_flags(hostname); + if (( mtce_flags & MTC_FLAG__I_AM_CONFIGURED ) && + ( mtce_flags & MTC_FLAG__I_AM_HEALTHY ) && + ( mtce_flags & MTC_FLAG__MAIN_GOENABLED )) + { + if (( obj_ptr->system_type != SYSTEM_TYPE__NORMAL ) && + ( obj_ptr->get_nodetype ( hostname ) & CONTROLLER_TYPE )) + { + /* If its an AIO then its worker subfunction + * needs to have been be configured and tested. */ + if (( mtce_flags & MTC_FLAG__SUBF_CONFIGURED ) && + ( mtce_flags & MTC_FLAG__SUBF_GOENABLED )) + { + ilog("%s heartbeat start (AIO controller)", + hostname.c_str()); + send_hbs_command ( hostname, MTC_CMD_START_HOST, controller ); + } + else + { + wlog ("%s not heartbeat ready (subf) (oob:%x)", + hostname.c_str(), + mtce_flags); + } + } + else + { + ilog("%s heartbeat start (from ready event)", + hostname.c_str()); + send_hbs_command ( hostname, MTC_CMD_START_HOST, controller ); + } + } + else + { + wlog ("%s not heartbeat ready (main) (oob:%x)", + hostname.c_str(), + mtce_flags); + } + } } } ilog ("%s %s inventory push ... done", diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index e1924af8..49ac2684 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -1637,9 +1637,10 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->http_retries_cur = 0 ; node_ptr->unknown_health_reported = false ; - plog ("%s %sGraceful Recovery (uptime was %d)\n", + plog ("%s %sGraceful Recovery (%d) (uptime was %d)\n", node_ptr->hostname.c_str(), node_ptr->mnfa_graceful_recovery ? "MNFA " : "", + node_ptr->graceful_recovery_counter, node_ptr->uptime ); /* Cancel any outstanding timers */ @@ -1773,10 +1774,11 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) else if ( node_ptr->mnfa_graceful_recovery == true ) { - if ( node_ptr->uptime > MTC_MINS_10 ) + if ( node_ptr->uptime > MTC_MINS_15 ) { /* did not reboot case */ - wlog ("%s Connectivity Recovered ; host did not reset\n", node_ptr->hostname.c_str()); + wlog ("%s Connectivity Recovered ; host did not reset (uptime:%d)\n", + node_ptr->hostname.c_str(), node_ptr->uptime); wlog ("%s ... continuing with MNFA graceful recovery\n", node_ptr->hostname.c_str()); wlog ("%s ... with no affect to host services\n", node_ptr->hostname.c_str()); @@ -1789,7 +1791,8 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) else { /* did reboot case */ - wlog ("%s Connectivity Recovered ; host has reset\n", node_ptr->hostname.c_str()); + wlog ("%s Connectivity Recovered ; host has reset (uptime:%d)\n", + node_ptr->hostname.c_str(), node_ptr->uptime); ilog ("%s ... continuing with MNFA graceful recovery\n", node_ptr->hostname.c_str()); ilog ("%s ... without additional reboot %s\n", node_ptr->hostname.c_str(), node_ptr->bm_ip.empty() ? "or reset" : "" ); @@ -1807,12 +1810,13 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) break ; } } - else if (( node_ptr->uptime_save ) && ( node_ptr->uptime >= node_ptr->uptime_save )) + else if ( node_ptr->uptime > MTC_MINS_15 ) { /* did not reboot case */ - wlog ("%s Connectivity Recovered ; host did not reset%s\n", + wlog ("%s Connectivity Recovered ; host did not reset%s (uptime:%d)", node_ptr->hostname.c_str(), - node_ptr->was_dor_recovery_mode ? " (DOR)" : "" ); + node_ptr->was_dor_recovery_mode ? " (DOR)" : "", + node_ptr->uptime); wlog ("%s ... continuing with graceful recovery\n", node_ptr->hostname.c_str()); wlog ("%s ... with no affect to host services\n", node_ptr->hostname.c_str()); @@ -1906,7 +1910,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) { int timeout = 0 ; - /* Set the FSM task state to booting */ + /* Set the FSM task state to 'Graceful Recovery Wait' */ node_ptr->uptime = 0 ; mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_WAIT ); @@ -2443,10 +2447,10 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) } else /* success path */ { - /* allow the fsm to wait for up to 1 minute for the - * hbsClient's ready event before starting heartbeat + /* allow the fsm to wait for up to 'worker config timeout' + * for the hbsClient's ready event before starting heartbeat * test. */ - mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_1 ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_WORKER_CONFIG_TIMEOUT ); recoveryStageChange ( node_ptr, MTC_RECOVERY__HEARTBEAT_START ); } break ; @@ -2503,6 +2507,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr ) { if ( node_ptr->mtcTimer.ring == true ) { + ilog ("%s heartbeating", node_ptr->hostname.c_str()); /* if heartbeat is not working then we will * never get here and enable the host */ recoveryStageChange ( node_ptr, MTC_RECOVERY__STATE_CHANGE ); @@ -6261,6 +6266,40 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->hostname.c_str(), node_ptr->uptime ); break ; } + /* Handle catching and recovering/restoring hosts that might + * have been in the Graceful Recovery Wait state. + * + * Prevents an extra reboot for hosts that might be in + * Graceful Recovery over a maintenance process restart. */ + else if (( NOT_THIS_HOST ) && + ( !node_ptr->task.compare(MTC_TASK_RECOVERY_WAIT))) + { + ilog ("%s is in %s ; restoring state", + node_ptr->hostname.c_str(), + MTC_TASK_RECOVERY_WAIT); + + /* Complete necessary add operations before switching + * to Recovery */ + LOAD_NODETYPE_TIMERS ; + workQueue_purge ( node_ptr ); + if (( hostUtil_is_valid_bm_type ( node_ptr->bm_type )) && + ( hostUtil_is_valid_ip_addr ( node_ptr->bm_ip )) && + ( hostUtil_is_valid_username ( node_ptr->bm_un ))) + { + set_bm_prov ( node_ptr, true ) ; + } + mtcTimer_reset ( node_ptr->mtcTimer ); + adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE ); + node_ptr->addStage = MTC_ADD__START; + + /* Switch into recovery_handler's Graceful Recovery Wait + * state with the Graceful Recovery Wait timeout */ + adminActionChange ( node_ptr, MTC_ADMIN_ACTION__RECOVER ); + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, + node_ptr->mtcalive_timeout ); + recoveryStageChange ( node_ptr, MTC_RECOVERY__MTCALIVE_WAIT ); + break ; + } else { if ( is_controller(node_ptr) ) diff --git a/mtce/src/maintenance/mtcSubfHdlrs.cpp b/mtce/src/maintenance/mtcSubfHdlrs.cpp index 6f9646da..5c994f4a 100644 --- a/mtce/src/maintenance/mtcSubfHdlrs.cpp +++ b/mtce/src/maintenance/mtcSubfHdlrs.cpp @@ -110,14 +110,16 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) if ( node_ptr->mtce_flags & MTC_FLAG__SUBF_CONFIGURED ) { mtcTimer_reset (node_ptr->mtcTimer); - plog ("%s Subf Configured OK\n", name.c_str()); + plog ("%s Subf Configured OK (oob:%x)\n", + name.c_str(), node_ptr->mtce_flags); enableStageChange ( node_ptr, MTC_ENABLE__GOENABLED_TIMER ); alarm_config_clear ( node_ptr ); break ; } - if ((( !node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED )) || - (( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY ))) + if (( node_ptr->mtce_flags ) && + (( !node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED ) || + ( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY ))) { mtcTimer_reset (node_ptr->mtcTimer); @@ -140,9 +142,10 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr ) /* timeout handling */ else if ( node_ptr->mtcTimer.ring == true ) { - elog ("%s configuration timeout (%d secs)\n", + elog ("%s configuration timeout (%d secs) (oob:%x)\n", name.c_str(), - MTC_WORKER_CONFIG_TIMEOUT ); + MTC_WORKER_CONFIG_TIMEOUT, + node_ptr->mtce_flags); alarm_config_failure ( node_ptr );