Make pmon-restart honour startuptime config option

Each monitored process's config file contains a startuptime
label that specifies how many seconds it takes for that newly
started process to stabilize and produce its pidfile.

The pmon-restart feature needs to delay monitoring a
newly restarted process for 'startuptime' seconds.

Failing to do so can cause it to fail the restarted
process too early if there is a pidfile creation delay.

Test Plan:

PASS: Verify collectd pmon-restart function with soak ;
      > 5000+ collectd pmon-restarts.
PASS: Verify pmond regression test suite (test-pmon-action.sh)
      > restart command ; graceful restart all monitored processes. (5 loops)
      > kill command    ; kill and recover all monitored processes. (5 loops)

Regression:

PASS: Verify pmon-stop command/function
PASS: Verify pmon-start command/function also honors the startuptime.
PASS: Verify pmon-stop auto start after auto-start timeout
PASS: Verify System Install
PASS: Verify Patching (soak)

Change-Id: I9fd7bba8e49fe4c28281539ab4930bdac370ef11
Closes-Bug: #1844724
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2019-10-07 09:14:33 -04:00
parent 4c541f50d4
commit 8b757de303
2 changed files with 32 additions and 33 deletions

View File

@ -562,7 +562,7 @@ int pmon_passive_handler ( process_config_type * ptr )
/* Spawn the process */ /* Spawn the process */
case PMON_STAGE__RESPAWN: case PMON_STAGE__RESPAWN:
{ {
ilog ("%s stability period (%d secs)\n", ptr->process, ptr->debounce ); dlog ("%s stability period (%d secs)\n", ptr->process, ptr->debounce );
/* Restart the process */ /* Restart the process */
respawn_process ( ptr ) ; respawn_process ( ptr ) ;
@ -570,20 +570,9 @@ int pmon_passive_handler ( process_config_type * ptr )
/* Start the monitor debounce timer. */ /* Start the monitor debounce timer. */
mtcTimer_reset ( ptr->pt_ptr ); mtcTimer_reset ( ptr->pt_ptr );
/* Don't wait for the debounce timer to take this process out of 'commanded restart' mode. mtcTimer_start ( ptr->pt_ptr, pmon_timer_handler, ptr->startuptime );
* Do it now, otherwise tight patch loop stress testing might fail */ passiveStageChange ( ptr, PMON_STAGE__MONITOR_WAIT ) ;
if ( ptr->restart == true )
{
ilog ("%s Restarted\n", ptr->process )
ptr->restart = false ;
ptr->registered = false ;
passiveStageChange ( ptr, PMON_STAGE__MANAGE ) ;
}
else
{
mtcTimer_start ( ptr->pt_ptr, pmon_timer_handler, ptr->startuptime );
passiveStageChange ( ptr, PMON_STAGE__MONITOR_WAIT ) ;
}
break ; break ;
} }
@ -619,10 +608,21 @@ int pmon_passive_handler ( process_config_type * ptr )
/* clear the monitor debounce counter */ /* clear the monitor debounce counter */
ptr->debounce_cnt = 0 ; ptr->debounce_cnt = 0 ;
/* Start debounce monitor phase */ if ( ptr->restart == true )
passiveStageChange ( ptr, PMON_STAGE__MONITOR ) ; {
process_running ( ptr ); ptr->restart = false ;
ilog ("%s Monitor (%d)\n", ptr->process, ptr->pid ); ptr->registered = false ;
ptr->pid = get_process_pid ( ptr );
ilog ("%s Restarted (%d)\n", ptr->process, ptr->pid );
passiveStageChange ( ptr, PMON_STAGE__MANAGE ) ;
}
else
{
/* Start debounce monitor phase */
passiveStageChange ( ptr, PMON_STAGE__MONITOR ) ;
process_running ( ptr );
ilog ("%s Monitor (%d)\n", ptr->process, ptr->pid );
}
} }
ptr->sigchld_rxed = false ; ptr->sigchld_rxed = false ;

View File

@ -297,13 +297,16 @@ void manage_process_failure ( process_config_type * ptr )
* If not, then query the current system state and save it. */ * If not, then query the current system state and save it. */
if ( _pmon_ctrl_ptr->system_state != MTC_SYSTEM_STATE__STOPPING ) if ( _pmon_ctrl_ptr->system_state != MTC_SYSTEM_STATE__STOPPING )
{ {
elog ("%s failed (%d) (p:%d a:%d)\n", ptr->process,
ptr->pid,
ptr->failed,
ptr->active_failed);
/* update current state */ /* update current state */
_pmon_ctrl_ptr->system_state = get_system_state(); _pmon_ctrl_ptr->system_state = get_system_state();
} }
else
/* Ignore process failures while in stopping (i.e. shutdown) mode */
if ( _pmon_ctrl_ptr->system_state == MTC_SYSTEM_STATE__STOPPING )
{ {
/* Ignore process failures while in stopping (i.e. shutdown) mode */
/* don't report process failures during system shutdown. */ /* don't report process failures during system shutdown. */
wlog ("%s terminated by system shutdown (pid:%d) ; ignoring\n", wlog ("%s terminated by system shutdown (pid:%d) ; ignoring\n",
ptr->process , ptr->pid ); ptr->process , ptr->pid );
@ -317,7 +320,6 @@ void manage_process_failure ( process_config_type * ptr )
slog ("%s process is in the stopped state\n", ptr->process); slog ("%s process is in the stopped state\n", ptr->process);
} }
elog ("%s failed (%d) (p:%d a:%d)\n", ptr->process, ptr->pid, ptr->failed, ptr->active_failed);
passiveStageChange ( ptr, PMON_STAGE__MANAGE) ; passiveStageChange ( ptr, PMON_STAGE__MANAGE) ;
if ( ptr->failed == false ) if ( ptr->failed == false )
@ -836,15 +838,12 @@ int process_config_load (process_config_type * pc_ptr, const char * config_file_
int get_process_pid ( process_config_type * ptr ) int get_process_pid ( process_config_type * ptr )
{ {
int pid = 0 ; int pid = 0 ;
FILE * pid_file_stream = fopen ( ptr->pidfile, "r" ); if ( ptr )
if ( pid_file_stream != NULL )
{ {
int num = fscanf ( pid_file_stream, "%d", &pid); if ( daemon_is_file_present ( ptr->pidfile ) == true )
if ( num != 1 )
{ {
wlog ("fscanf failed to read pid from %s\n", ptr->pidfile ); pid = daemon_get_file_int ( ptr->pidfile );
} }
fclose (pid_file_stream);
} }
return (pid); return (pid);
} }
@ -939,7 +938,7 @@ bool kill_running_process ( int pid )
daemon_remove_file ( ptr->pidfile ); daemon_remove_file ( ptr->pidfile );
} }
} }
wlog ("%s kill succeeded (%d)\n", proc_name_ptr, pid ); wlog ("%s Killed (%d)\n", proc_name_ptr, pid );
rc = true ; rc = true ;
} }
else else
@ -1100,7 +1099,7 @@ int unregister_process ( process_config_type * ptr )
} }
else else
{ {
ilog ("%s unregistered (%d)\n", ptr->process, ptr->pid ); ilog ("%s Unregister (%d)\n", ptr->process, ptr->pid );
} }
} }
ptr->registered = false ; ptr->registered = false ;
@ -1202,7 +1201,7 @@ int respawn_process ( process_config_type * ptr )
unregister_process ( ptr ); unregister_process ( ptr );
if ( process_running ( ptr ) == true ) if ( process_running ( ptr ) == true )
{ {
ilog ("%s restart of running process\n", ptr->process ); dlog ("%s still running\n", ptr->process );
restart = true ; restart = true ;
kill_running_process ( ptr->pid ); kill_running_process ( ptr->pid );
} }
@ -1454,7 +1453,7 @@ void daemon_sigchld_hdlr ( void )
{ {
if ( process_ptr->status_monitoring == false ) if ( process_ptr->status_monitoring == false )
{ {
dlog ("%s spawn failed (rc:%d:%x) (%ld.%03ld secs)\n", ilog ("%s spawn failed (rc:%d:%x) (%ld.%03ld secs)\n",
process_ptr->process, process_ptr->process,
process_ptr->status, process_ptr->status,
process_ptr->status, process_ptr->status,