Add reattempt and collect more data for SM init failure

There have been multiple reports on AIO-SX that SM failed its initialization
due to a SQL failure. The issue has not been reproduced in a DEV environment.
This change adds logging and a reattempt, and collects SM troubleshooting
data when SM fails in such a situation.
For potential recovery before pmon starts actively monitoring SM,
set the systemd option Restart=on-failure. Also set RestartSec=10 (seconds)
in order to give pmon enough time to catch the failure and restart
SM.

Partial-bug: 1915894
Change-Id: I5899e401742510158cd9c59a664b1dc329bb1075
Signed-off-by: Bin Qian <bin.qian@windriver.com>
This commit is contained in:
Bin Qian 2020-12-15 15:55:25 -05:00
parent c5f753c3bb
commit f39ca95924
3 changed files with 36 additions and 5 deletions

View File

@ -9,6 +9,8 @@
#include "sm_debug.h"
#include "sm_db.h"
#include "sm_db_iterator.h"
#include "sm_failover_utils.h"
// ****************************************************************************
// Database For-Each
@ -21,6 +23,8 @@ SmErrorT sm_db_foreach( const char* db_name, const char* db_table,
SmDbIteratorT it;
SmErrorT error, error2;
DPRINTFI("Entering db foreach");
error = sm_db_iterator_initialize( db_name, db_table, db_query, &it );
if( SM_OKAY != error )
{
@ -74,6 +78,7 @@ ERROR:
return( error );
}
DPRINTFI("Exiting db foreach");
return( error );
}
// ****************************************************************************

View File

@ -11,6 +11,8 @@ ExecStart=/etc/init.d/sm start
ExecStop=/etc/init.d/sm stop
PIDFile=/var/run/sm.pid
KillMode=process
RestartSec=10
Restart=on-failure
[Install]
WantedBy=multi-user.target

View File

@ -9,6 +9,8 @@
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <signal.h>
#include "sm_types.h"
#include "sm_debug.h"
@ -28,6 +30,7 @@
#include "sm_node_swact_monitor.h"
#include "sm_failover_fsm.h"
#include "sm_configure.h"
#include "sm_troubleshoot.h"
#define SM_NODE_AUDIT_TIMER_IN_MS 1000
#define SM_INTERFACE_AUDIT_TIMER_IN_MS 1000
@ -318,6 +321,7 @@ static void sm_main_event_handler_service_group_state_callback(
SmErrorT sm_main_event_handler_initialize( void )
{
SmErrorT error;
int i;
memset( &_api_callbacks, 0, sizeof(_api_callbacks) );
memset( &_notify_api_callbacks, 0, sizeof(_notify_api_callbacks) );
@ -367,12 +371,32 @@ SmErrorT sm_main_event_handler_initialize( void )
return( error );
}
error = sm_main_event_handler_release_service_groups();
if( SM_OKAY != error )
#define MAX_REATTEMPT 20
for(i = 0; i < MAX_REATTEMPT; i ++)
{
DPRINTFE( "Failed to release service groups, error=%s.",
sm_error_str( error ) );
return( error );
error = sm_main_event_handler_release_service_groups();
if( SM_OKAY != error )
{
DPRINTFE( "Failed to release service groups, error=%s.",
sm_error_str( error ) );
if( i == 0)
{
// collect SM troubleshooting data when it fails
DPRINTFE("Initialization failed, dumping troubleshooting data");
sm_troubleshoot_dump_data("Release service groups failed");
}
usleep(1000000);
}
else
{
break;
}
}
if (error != SM_OKAY)
{
DPRINTFE("Failed to release service groups, after %d attempts", i);
return error;
}
error = sm_api_initialize();