From 4e132af30884d2a81700be0667a7b03cab1d3d94 Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Wed, 12 Dec 2018 08:10:30 -0500 Subject: [PATCH] Mtce: fix hbsClient active monitoring over config reload The maintenance process monitor is failing the hbsClient process over config or process reload operations. The issue is that the hbsClient's subfunction is 'last-config' and pmon does not properly gate the active monitoring FSM from starting until the passive monitoring phase is complete and the process is in the MANAGE state. Test Plan PASS: Verify active monitoring failure detection and handling PASS: Verify proper process monitoring over pmond config reload PASS: Verify proper process monitoring over SIGHUP -> pmond PASS: Verify proper process monitoring over SIGUSR2 -> pmond PASS: Verify proper process monitoring over process failure recovery PASS: Verify pmond regression test soak ; on active and inactive controllers PASS: Verify pmond regression test soak ; on compute node PASS: Verify pmond regression test soak ; kill/recovery function PASS: Verify pmond regression test soak ; restart function PASS: Verify pmond regression test soak ; alarming function PASS: Verify pmond handles critical process failure with no restart config PASS: Verify pmond handles ntpd process failure PASS: Verify AIO DX Install PASS: Verify AIO DX Inactive Controller process management over Lock/Unlock. 
Change-Id: Ie2fe7b6ce479f660725e5600498cc98f36f78337 Closes-Bug: 1807724 Signed-off-by: Eric MacDonald --- mtce/centos/build_srpm.data | 2 +- mtce/src/pmon/pmonFsm.cpp | 9 ++++++--- mtce/src/pmon/pmonHdlr.cpp | 19 ++++++++++--------- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/mtce/centos/build_srpm.data b/mtce/centos/build_srpm.data index 9e6662d0..8784f945 100644 --- a/mtce/centos/build_srpm.data +++ b/mtce/centos/build_srpm.data @@ -1,3 +1,3 @@ SRC_DIR="src" -TIS_PATCH_VER=143 +TIS_PATCH_VER=144 BUILD_IS_SLOW=5 diff --git a/mtce/src/pmon/pmonFsm.cpp b/mtce/src/pmon/pmonFsm.cpp index 69a32839..f48e9713 100644 --- a/mtce/src/pmon/pmonFsm.cpp +++ b/mtce/src/pmon/pmonFsm.cpp @@ -566,8 +566,6 @@ int pmon_passive_handler ( process_config_type * ptr ) /* Start the monitor debounce timer. */ mtcTimer_reset ( ptr->pt_ptr ); - mtcTimer_start ( ptr->pt_ptr, pmon_timer_handler, ptr->startuptime ); - passiveStageChange ( ptr, PMON_STAGE__MONITOR_WAIT ) ; /* Don't wait for the debounce timer to take this process out of 'commanded restart' mode. 
* Do it now, otherwise tight patch loop stress testing might fail */ @@ -576,6 +574,12 @@ int pmon_passive_handler ( process_config_type * ptr ) ilog ("%s Restarted\n", ptr->process ) ptr->restart = false ; ptr->registered = false ; + passiveStageChange ( ptr, PMON_STAGE__MANAGE ) ; + } + else + { + mtcTimer_start ( ptr->pt_ptr, pmon_timer_handler, ptr->startuptime ); + passiveStageChange ( ptr, PMON_STAGE__MONITOR_WAIT ) ; } break ; } @@ -614,7 +618,6 @@ int pmon_passive_handler ( process_config_type * ptr ) /* Start debounce monitor phase */ passiveStageChange ( ptr, PMON_STAGE__MONITOR ) ; - // ilog ("%s Monitor Start ...\n", ptr->process); process_running ( ptr ); ilog ("%s Monitor (%d)\n", ptr->process, ptr->pid ); } diff --git a/mtce/src/pmon/pmonHdlr.cpp b/mtce/src/pmon/pmonHdlr.cpp index 996dbccb..2024bb48 100644 --- a/mtce/src/pmon/pmonHdlr.cpp +++ b/mtce/src/pmon/pmonHdlr.cpp @@ -1014,6 +1014,7 @@ void _get_events ( void ) { bool running = false ; + /* ignore is ignore */ if ( process_config[i].ignore == true ) { process_config[i].failed = false ; @@ -1021,12 +1022,12 @@ void _get_events ( void ) continue ; } - else if ( process_config[i].stage == PMON_STAGE__POLLING ) - { - continue ; - } - - else if ( process_config[i].status_monitoring ) + /* only look for events for process that are + * - in the managed state and + * - not monitored by 'status + */ + else if (( process_config[i].stage != PMON_STAGE__MANAGE ) || + ( process_config[i].status_monitoring )) { continue ; } @@ -2016,7 +2017,8 @@ void pmon_service ( pmon_ctrl_type * ctrl_ptr ) /* Run the FSM for this failed process */ pmon_passive_handler ( &process_config[i] ) ; } - else if ( process_config[i].active_monitoring ) + else if (( process_config[i].active_monitoring ) && + ( process_config[i].stage == PMON_STAGE__MANAGE )) { // if ( process_config[i].active_failed == false ) if ( process_config[i].failed == false ) @@ -2032,8 +2034,7 @@ void pmon_service ( pmon_ctrl_type * ctrl_ptr ) /* Audit to 
ensure that running processes are * registered with the kernel */ - if (( process_config[i].stage != PMON_STAGE__POLLING ) && - ( process_config[i].stage != PMON_STAGE__START_WAIT ) && + if (( process_config[i].stage == PMON_STAGE__MANAGE ) && ( process_config[i].registered == false ) && ( _pmon_ctrl_ptr->event_mode ) && ( process_config[i].restart == false ) &&