pmond: add support for no script label in conf files

Many new services being added to our system are no longer accompanied
by an init script; they ship only a service file. Even with the migration
from sysvinit to systemd, pmond still requires process conf files to
provide a script label.

This update removes that dependency. Instead, pmond will use the service
or script label to find the most appropriate process failure recovery method,
while handling the omission of either, but not both, of the service and
script labels.

The change is to first search for a service file that corresponds with the
service label in the conf file.
If the service label does not exist then the script label is looked at.
If the basename of the script has a corresponding service file then use it.
If no service file is found then the full pathed script is searched for.
If no script file is found then the process monitor errors out.

This update also improves how pmond deals with the absence of the hostw
process. The current code base blocks startup if it cannot connect to the
hostw process.

This update implements host watchdog socket failure auto recovery while
continuing to monitor processes. With this update, if the host watchdog
process is restarted or is not running, then pmond will continue to monitor
processes while periodically trying to re-establish its connection to the
host watchdog, reconnecting once the host watchdog has recovered.

Change-Id: Icf27090d4d00954195b0ac931474587c67341207
Signed-off-by: Jack Ding <jack.ding@windriver.com>
This commit is contained in:
Eric MacDonald 2018-05-13 17:53:32 -04:00 committed by Jack Ding
parent 784a5f8a6c
commit 04055390fa
9 changed files with 144 additions and 103 deletions

View File

@ -194,6 +194,9 @@ typedef enum
PMOND_RECOVERY_METHOD__SYSTEMD = 1, PMOND_RECOVERY_METHOD__SYSTEMD = 1,
} recovery_method_type ; } recovery_method_type ;
#define SYSTEMD_SERVICE_FILE_DIR1 ((const char *)"/etc/systemd/system")
#define SYSTEMD_SERVICE_FILE_DIR2 ((const char *)"/usr/lib/systemd/system")
/* /*
* Used to mark a configured process * Used to mark a configured process
* This aids in freeing duped memory over a process re-config * This aids in freeing duped memory over a process re-config
@ -267,7 +270,7 @@ int setup_signal_handler ( int rt_signal_num );
/* Monitored Process Config Bit Mask */ /* Monitored Process Config Bit Mask */
#define CONF_PROCESS (0x0001) #define CONF_PROCESS (0x0001)
#define CONF_SCRIPT (0x0002) #define CONF_RECOVERY (0x0002)
#define CONF_STYLE (0x0004) #define CONF_STYLE (0x0004)
#define CONF_PIDFILE (0x0008) #define CONF_PIDFILE (0x0008)
#define CONF_RESTARTS (0x0010) #define CONF_RESTARTS (0x0010)
@ -286,8 +289,8 @@ int setup_signal_handler ( int rt_signal_num );
/* Monitored Passive Process Config Mask */ /* Monitored Passive Process Config Mask */
#define CONF_MASK (CONF_PROCESS | \ #define CONF_MASK (CONF_PROCESS | \
CONF_SCRIPT | \
CONF_STYLE | \ CONF_STYLE | \
CONF_RECOVERY | \
CONF_PIDFILE | \ CONF_PIDFILE | \
CONF_SEVERITY | \ CONF_SEVERITY | \
CONF_RESTARTS | \ CONF_RESTARTS | \
@ -302,8 +305,8 @@ int setup_signal_handler ( int rt_signal_num );
/* Monitored Status Process Config Mask */ /* Monitored Status Process Config Mask */
#define CONF_STATUS_MON_MASK (CONF_PROCESS | \ #define CONF_STATUS_MON_MASK (CONF_PROCESS | \
CONF_SCRIPT | \
CONF_STYLE | \ CONF_STYLE | \
CONF_RECOVERY | \
CONF_SEVERITY | \ CONF_SEVERITY | \
CONF_RESTARTS | \ CONF_RESTARTS | \
CONF_INTERVAL | \ CONF_INTERVAL | \

View File

@ -524,6 +524,43 @@ void pmon_timer_handler ( int sig, siginfo_t *si, void *uc)
} }
} }
/****************************************************************************
 *
 * Name       : service_file_exists
 *
 * Description: Look in some well known places for the specified service file.
 *              The directories are probed in this order:
 *                1. SYSTEMD_SERVICE_FILE_DIR1
 *                2. SYSTEMD_SERVICE_FILE_DIR2
 *              The first directory that contains the file wins.
 *
 * Parameters : service_filename - name of the service file to look for
 *              path_n_name_ptr  - caller supplied buffer that receives the
 *                                 "<dir>/<service_filename>" candidate path
 *              max_len          - size, in bytes, of that buffer
 *
 * Returns    : Return true if the specified service file is found.
 *              Return false if it is not found, if the service file name is
 *              empty, if the supplied buffer is NULL or has no room, or if
 *              no candidate path fits in the buffer without truncation.
 *
 * Updates    : If the service file is found then update the supplied
 *              character string buffer with the full path/name of that
 *              service file. When false is returned the buffer content is
 *              not meaningful ; it may hold the last candidate tried.
 *
 ****************************************************************************/
bool service_file_exists ( string service_filename,
                           char * path_n_name_ptr,
                           int    max_len )
{
    /* Validate the output buffer BEFORE writing into it ; checking the
     * pointer only after snprintf has used it protects nothing. */
    if (( path_n_name_ptr == NULL ) || ( max_len <= 0 ))
        return false ;

    /* An empty name would degrade the probe to the directory itself */
    if ( service_filename.empty() )
        return false ;

    /* search order matters ; first match is returned to the caller */
    const char * const search_dirs[] =
    {
        SYSTEMD_SERVICE_FILE_DIR1,
        SYSTEMD_SERVICE_FILE_DIR2
    };
    const size_t search_dirs_count = sizeof(search_dirs)/sizeof(search_dirs[0]) ;

    for ( size_t i = 0 ; i < search_dirs_count ; i++ )
    {
        /* load the full path and name of the candidate service file */
        int len = snprintf ( path_n_name_ptr, max_len, "%s/%s",
                             search_dirs[i],
                             service_filename.c_str());

        /* Skip on format error or truncation ; a truncated path names a
         * different file and must never be reported as a match. */
        if (( len <= 0 ) || ( len >= max_len ))
            continue ;

        if ( daemon_is_file_present ( path_n_name_ptr ) == true )
            return true ;
    }
    return false ;
}
/***************************************************************************** /*****************************************************************************
* *
* Name : process_config_load * Name : process_config_load
@ -533,8 +570,8 @@ void pmon_timer_handler ( int sig, siginfo_t *si, void *uc)
*****************************************************************************/ *****************************************************************************/
int process_config_load (process_config_type * pc_ptr, const char * config_file_ptr ) int process_config_load (process_config_type * pc_ptr, const char * config_file_ptr )
{ {
char service_name_buf [_MAX_LEN_] ; char recovery_method_buf [_MAX_LEN_] ;
memset (service_name_buf,0, sizeof(service_name_buf)); memset (recovery_method_buf,0, sizeof(recovery_method_buf));
if ( _pmon_ctrl_ptr->processes >= MAX_PROCESSES ) if ( _pmon_ctrl_ptr->processes >= MAX_PROCESSES )
{ {
@ -566,40 +603,59 @@ int process_config_load (process_config_type * pc_ptr, const char * config_file_
pc_ptr->startuptime = PMON_MIN_START_DELAY ; pc_ptr->startuptime = PMON_MIN_START_DELAY ;
} }
/* default recovery method to process init script */ /* Many process conf files came from a sysvinit origin and might not
snprintf ( &service_name_buf[0], _MAX_LEN_, "%s", pc_ptr->script ); * have a service file label. Account for that in the following
* load of recovery_method_buf.
* Accept a script name if the service name is missing. */
bool recovery_method_found = false ;
/* Print error logs if there is no recovery method present for this service/process */ /* look for the service file */
if ( _pmon_ctrl_ptr->recovery_method == PMOND_RECOVERY_METHOD__SYSTEMD ) if ( pc_ptr->service )
{ {
/* If the config file does not specify a service name string service = pc_ptr->service ;
* then the service name defaults to the process name */ if ( service.find(".service") == string::npos )
if ( ! pc_ptr->service ) service.append(".service");
if ( service_file_exists(service, &recovery_method_buf[0], _MAX_LEN_) == true )
recovery_method_found = true ;
}
else if ( pc_ptr->script )
{ {
snprintf ( &service_name_buf[0], _MAX_LEN_, "%s/%s.service", SYSTEMD_SERVICE_FILE_DIR, pc_ptr->process ); string script = basename((char*)pc_ptr->script);
if ( daemon_is_file_present ( service_name_buf ) == false ) if ( script.find(".service") == string::npos )
script.append(".service");
if ( service_file_exists(script, &recovery_method_buf[0], _MAX_LEN_) == true )
recovery_method_found = true ;
else
{ {
if ( daemon_is_file_present ( pc_ptr->script ) == false ) /* resort to the script file only */
/* load the name of the process init script */
snprintf ( &recovery_method_buf[0], _MAX_LEN_, "%s", pc_ptr->script );
if ( daemon_is_file_present ( recovery_method_buf ) == true )
{ {
/* print a log if we have no recovery method */ recovery_method_found = true ;
wlog ("%s has no recovery method\n", pc_ptr->process ); }
wlog ("... neither %s nor %s exist\n", service_name_buf, pc_ptr->script ); else
{
wlog ("%s has script but not found (%s)\n",
pc_ptr->process, recovery_method_buf );
} }
} }
} }
else else
{
snprintf ( &service_name_buf[0], _MAX_LEN_, "%s/%s.service", SYSTEMD_SERVICE_FILE_DIR, pc_ptr->service );
if ( daemon_is_file_present ( service_name_buf ) == false )
{ {
/* print a log if we have no recovery method */ /* print a log if we have no recovery method */
wlog ("%s service has no recovery method\n", pc_ptr->service ); wlog ("%s has no recovery method ; process not monitored\n", pc_ptr->process );
wlog ("... %s does not exist\n", service_name_buf ); wlog ("... conf file has no 'service' or 'script' recovery entry\n");
} return (FAIL_NOT_FOUND);
}
} }
update_config_option ( &pc_ptr->recovery_method , service_name_buf ); if ( recovery_method_found == false )
{
wlog ("%s has no recovery method found ; process not monitored\n", pc_ptr->process );
return (FAIL_NOT_FOUND);
}
update_config_option ( &pc_ptr->recovery_method , recovery_method_buf );
if ( !strcmp ( pc_ptr->mode, "status" ) ) if ( !strcmp ( pc_ptr->mode, "status" ) )
{ {
@ -710,7 +766,7 @@ int process_config_load (process_config_type * pc_ptr, const char * config_file_
* that subfunction init is complete */ * that subfunction init is complete */
ilog ("%7s Def : %-30s %-8s - %s (%s)\n", pc_ptr->mode, ilog ("%7s Def : %-30s %-8s - %s (%s)\n", pc_ptr->mode,
pc_ptr->process, pc_ptr->process,
pc_ptr->ignore ? "ignored" : pc_ptr->severity, service_name_buf, pc_ptr->ignore ? "ignored" : pc_ptr->severity, recovery_method_buf,
pc_ptr->subfunction); pc_ptr->subfunction);
/* defer subfunction processes to the FSM to get enabled */ /* defer subfunction processes to the FSM to get enabled */
pc_ptr->stage = PMON_STAGE__POLLING ; pc_ptr->stage = PMON_STAGE__POLLING ;
@ -724,7 +780,7 @@ int process_config_load (process_config_type * pc_ptr, const char * config_file_
ilog ("%7s Mon : %-30s %-8s - %s\n", pc_ptr->mode, ilog ("%7s Mon : %-30s %-8s - %s\n", pc_ptr->mode,
pc_ptr->process, pc_ptr->process,
pc_ptr->ignore ? "ignored" : pc_ptr->severity, service_name_buf); pc_ptr->ignore ? "ignored" : pc_ptr->severity, recovery_method_buf);
pc_ptr->stage = PMON_STAGE__MANAGE ; pc_ptr->stage = PMON_STAGE__MANAGE ;
} }
// mem_log_process ( pc_ptr ); // mem_log_process ( pc_ptr );
@ -1870,6 +1926,11 @@ void pmon_service ( pmon_ctrl_type * ctrl_ptr )
if ( pmonTimer_hostwd.ring == true ) if ( pmonTimer_hostwd.ring == true )
{ {
/* inservice recovery from hostw connection failures */
if ( sock_ptr->hostwd_sock == 0 )
{
hostwd_port_init();
}
if ( ctrl_ptr->event_mode == true ) if ( ctrl_ptr->event_mode == true )
{ {
pmon_send_hostwd ( ); pmon_send_hostwd ( );

View File

@ -116,14 +116,15 @@ int pmon_process_config ( void * user,
} }
if (MATCH("process", "service")) if (MATCH("process", "service"))
{ {
ptr->mask |= CONF_RECOVERY ;
ptr->service = strdup(value); ptr->service = strdup(value);
dlog1 ("Service : %s\n", ptr->service ); dlog1 ("Service : %s\n", ptr->service );
rc = PASS ; rc = PASS ;
} }
else if (MATCH("process", "script")) else if (MATCH("process", "script"))
{ {
ptr->mask |= CONF_SCRIPT ; ptr->mask |= CONF_RECOVERY ;
ptr->status_mask |= CONF_SCRIPT ; ptr->status_mask |= CONF_RECOVERY ;
ptr->script = strdup(value); ptr->script = strdup(value);
dlog1 ("Script : %s\n", ptr->script ); dlog1 ("Script : %s\n", ptr->script );
} }
@ -423,7 +424,7 @@ int socket_init ( void )
* host watchdog process */ * host watchdog process */
if ( rc == PASS ) if ( rc == PASS )
{ {
rc = hostwd_port_init ( ); hostwd_port_init ( );
} }
pmon_inbox_init ( ); pmon_inbox_init ( );
@ -500,22 +501,8 @@ int daemon_init ( string iface, string nodetype_str )
pmon_timer_init (); pmon_timer_init ();
} }
/*
* Setup the recovery method based on the O/S
*
* WRL - SYSVINIT
* CENTOS - SYSTEMD
*
**/
if ( daemon_is_file_present ( CENTOS_RELEASE_FILE ) )
{
pmon_ctrl.recovery_method = PMOND_RECOVERY_METHOD__SYSTEMD ; pmon_ctrl.recovery_method = PMOND_RECOVERY_METHOD__SYSTEMD ;
pmon_ctrl.system_state = get_system_state(); pmon_ctrl.system_state = get_system_state();
}
else
{
pmon_ctrl.recovery_method = PMOND_RECOVERY_METHOD__SYSVINIT ;
}
ilog ("Recovery Method: %s\n", pmon_ctrl.recovery_method ? "systemd via systemctl" : "sysvinit via script" ); ilog ("Recovery Method: %s\n", pmon_ctrl.recovery_method ? "systemd via systemctl" : "sysvinit via script" );
return (rc); return (rc);
} }

View File

@ -78,24 +78,16 @@ int pulse_port_init ( void )
} }
/* Setup the Unix Host Watchdog Socket */ /* Setup the Unix Host Watchdog Socket */
#define _THROTTLE_LEVEL (5)
int hostwd_port_init ( void ) int hostwd_port_init ( void )
{ {
int rc = FAIL ;
int fail_count = 0 ;
memset(&pmon_sock.hostwd_addr, 0, sizeof(pmon_sock.hostwd_addr)); memset(&pmon_sock.hostwd_addr, 0, sizeof(pmon_sock.hostwd_addr));
while (rc == FAIL)
{
int len;
int connected;
pmon_sock.hostwd_sock = socket(AF_UNIX, SOCK_DGRAM, 0); pmon_sock.hostwd_sock = socket(AF_UNIX, SOCK_DGRAM, 0);
if (pmon_sock.hostwd_sock <= 0) { if (pmon_sock.hostwd_sock <= 0)
if ( fail_count++ > _THROTTLE_LEVEL ) { {
wlog("Could not connect to create hostwd socket - will retry\n"); wlog("Could not connect to create hostwd socket - will retry\n");
} pmon_sock.hostwd_sock = 0 ;
sleep(1); return (FAIL_SOCKET_CREATE);
continue;
} }
/* Set up the socket address */ /* Set up the socket address */
@ -109,22 +101,19 @@ int hostwd_port_init ( void )
strncpy( &(pmon_sock.hostwd_addr.sun_path[1]), strncpy( &(pmon_sock.hostwd_addr.sun_path[1]),
HOSTW_UNIX_SOCKNAME, HOSTW_UNIX_SOCKNAME,
UNIX_PATH_MAX-1); UNIX_PATH_MAX-1);
len = sizeof(pmon_sock.hostwd_addr); int len = sizeof(pmon_sock.hostwd_addr);
int connected = connect( pmon_sock.hostwd_sock, (sockaddr*) &pmon_sock.hostwd_addr,
connected = connect( pmon_sock.hostwd_sock, (sockaddr*) &pmon_sock.hostwd_addr,
len); len);
if (connected == -1) { if (connected == -1)
if ( fail_count++ > _THROTTLE_LEVEL ) { {
wlog("Could not connect to hostwd port - will retry\n"); wlog("Could not connect to hostwd port - will retry\n");
} if ( pmon_sock.hostwd_sock )
close(pmon_sock.hostwd_sock); close(pmon_sock.hostwd_sock);
pmon_sock.hostwd_sock = 0; pmon_sock.hostwd_sock = 0;
sleep(1); return (FAIL_CONNECT);
} else {
rc = PASS;
} }
} ilog ("connected to host watchdog\n");
return (rc); return (PASS);
} }
/* Build a message for host watchdog, and send it */ /* Build a message for host watchdog, and send it */
@ -174,7 +163,13 @@ int pmon_send_hostwd ( void )
{ {
elog("Error sending message to host watchdog -- error %d (%s)\n", elog("Error sending message to host watchdog -- error %d (%s)\n",
errno, strerror(errno)); errno, strerror(errno));
if ( pmon_sock.hostwd_sock )
{
close(pmon_sock.hostwd_sock);
pmon_sock.hostwd_sock = 0;
}
return (FAIL); return (FAIL);
} }
} }
return (FAIL); return (FAIL);

View File

@ -2,7 +2,6 @@
process = acpid process = acpid
service = acpid service = acpid
pidfile = /var/run/acpid.pid pidfile = /var/run/acpid.pid
script = /etc/init.d/acpid
style = lsb ; ocf or lsb style = lsb ; ocf or lsb
severity = minor ; minor, major, critical severity = minor ; minor, major, critical
restarts = 3 ; restart retries before error assertion restarts = 3 ; restart retries before error assertion

View File

@ -2,7 +2,6 @@
process = nslcd process = nslcd
service = nslcd service = nslcd
pidfile = /var/run/nslcd/nslcd.pid pidfile = /var/run/nslcd/nslcd.pid
script = /etc/init.d/openldap
style = lsb ; ocf or lsb style = lsb ; ocf or lsb
severity = major ; minor, major, critical severity = major ; minor, major, critical
restarts = 3 ; restart retries before error assertion restarts = 3 ; restart retries before error assertion

View File

@ -2,7 +2,6 @@
process = ntpd process = ntpd
service = ntpd service = ntpd
pidfile = /var/run/ntp.pid pidfile = /var/run/ntp.pid
script = /etc/init.d/ntpd
style = lsb ; ocf or lsb style = lsb ; ocf or lsb
severity = minor ; minor, major, critical severity = minor ; minor, major, critical
restarts = 0 ; restart retries before error assertion restarts = 0 ; restart retries before error assertion

View File

@ -2,7 +2,6 @@
process = sshd process = sshd
service = sshd service = sshd
pidfile = /var/run/sshd.pid pidfile = /var/run/sshd.pid
script = /etc/init.d/sshd
style = lsb ; ocf or lsb style = lsb ; ocf or lsb
severity = minor ; minor, major, critical severity = minor ; minor, major, critical
restarts = 10 ; restart retries before error assertion restarts = 10 ; restart retries before error assertion

View File

@ -2,7 +2,6 @@
process = syslog-ng process = syslog-ng
service = syslog-ng service = syslog-ng
pidfile = /var/run/syslog-ng/syslog-ng.pid pidfile = /var/run/syslog-ng/syslog-ng.pid
script = /etc/init.d/syslog
style = lsb ; ocf or lsb style = lsb ; ocf or lsb
severity = minor ; minor, major, critical severity = minor ; minor, major, critical
restarts = 2 ; restart retries before error assertion restarts = 2 ; restart retries before error assertion