Merge "Failure case handling of LUKS service"
This commit is contained in:
commit
125601c2f9
@ -41,6 +41,7 @@
|
||||
#define CH_COMP_ALARM_ID ((const char *)"200.013") /* Combo Host Compute Failure - on last Controller */
|
||||
#define SENSORCFG_ALARM_ID ((const char *)"200.014") /* Sensor configuration alarm ; i.e. could not add */
|
||||
#define SENSORGROUP_ALARM_ID ((const char *)"200.015") /* Sensor Group Read Error */
|
||||
#define LUKS_ALARM_ID ((const char *)"200.016") /* LUKS volume failure alarm */
|
||||
|
||||
#define EVENT_LOG_ID ((const char *)"200.020")
|
||||
#define COMMAND_LOG_ID ((const char *)"200.021")
|
||||
|
@ -86,6 +86,7 @@ void daemon_exit ( void );
|
||||
#define MTC_FLAG__SM_DEGRADED (0x00000080)
|
||||
#define MTC_FLAG__PATCHING (0x00000100) /* Patching in progress */
|
||||
#define MTC_FLAG__PATCHED (0x00000200) /* Patched but not reset */
|
||||
#define MTC_FLAG__LUKS_VOL_FAILED (0x00000400) /* 'cryptsetup status' of the LUKS vault returned non-zero */
|
||||
#define MTC_FLAG__SM_UNHEALTHY (0x00001000)
|
||||
|
||||
#define MTC_UNHEALTHY_THRESHOLD (3)
|
||||
@ -289,6 +290,7 @@ typedef enum
|
||||
#define MTC_TASK_AR_DISABLED_SERVICES "Service Failure, threshold reached, Lock/Unlock to retry"
|
||||
#define MTC_TASK_AR_DISABLED_ENABLE "Enable Failure, threshold reached, Lock/Unlock to retry"
|
||||
#define MTC_TASK_AR_DISABLED_HEARTBEAT "Heartbeat Failure, threshold reached, Lock/Unlock to retry"
|
||||
#define MTC_TASK_AR_DISABLED_LUKS "LUKS volume failure, threshold reached, Lock/Unlock to retry"
|
||||
|
||||
#define MTC_TASK_RESET_FAIL "Reset Failed"
|
||||
#define MTC_TASK_RESET_QUEUE "Reset Failed, retrying (%d of %d)"
|
||||
@ -1020,7 +1022,7 @@ string get_configStages_str ( mtc_configStages_enum stage );
|
||||
#define DEGRADE_MASK_CONFIG 0x00000400
|
||||
#define DEGRADE_MASK_COLLECTD 0x00000800
|
||||
#define DEGRADE_MASK_ENABLE 0x00001000
|
||||
#define DEGRADE_MASK_RES4 0x00002000
|
||||
#define DEGRADE_MASK_LUKS 0x00002000
|
||||
#define DEGRADE_MASK_RES5 0x00004000
|
||||
#define DEGRADE_MASK_RES6 0x00008000
|
||||
|
||||
@ -1261,6 +1263,7 @@ typedef enum
|
||||
MTC_AR_DISABLE_CAUSE__GOENABLE,
|
||||
MTC_AR_DISABLE_CAUSE__HOST_SERVICES,
|
||||
MTC_AR_DISABLE_CAUSE__HEARTBEAT,
|
||||
MTC_AR_DISABLE_CAUSE__LUKS,
|
||||
MTC_AR_DISABLE_CAUSE__LAST,
|
||||
MTC_AR_DISABLE_CAUSE__NONE,
|
||||
} autorecovery_disable_cause_enum ;
|
||||
|
@ -39,6 +39,7 @@
|
||||
#define CH_COMP_ALARM_ID ((const char *)"200.013") /* Combo Host Compute Failure - on last Controller */
|
||||
#define SENSORCFG_ALARM_ID ((const char *)"200.014") /* Sensor configuration alarm ; i.e. could not add */
|
||||
#define SENSORGROUP_ALARM_ID ((const char *)"200.015") /* Sensor Group Read Error */
|
||||
#define LUKS_ALARM_ID ((const char *)"200.016") /* LUKS volume failure alarm */
|
||||
|
||||
#define EVENT_LOG_ID ((const char *)"200.020")
|
||||
#define COMMAND_LOG_ID ((const char *)"200.021")
|
||||
|
@ -1640,6 +1640,41 @@ int nodeLinkClass::lazy_graceful_fs_reboot ( struct nodeLinkClass::node * node_p
|
||||
return (FAIL);
|
||||
}
|
||||
|
||||
/* Generate a log and a critical alarm if the LUKS volume config failed */
|
||||
int nodeLinkClass::alarm_luks_failure ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
if ( (node_ptr->degrade_mask & DEGRADE_MASK_LUKS) == 0 )
|
||||
{
|
||||
node_ptr->degrade_mask |= DEGRADE_MASK_LUKS ;
|
||||
}
|
||||
|
||||
if ( node_ptr->alarms[MTC_ALARM_ID__LUKS] != FM_ALARM_SEVERITY_CRITICAL )
|
||||
{
|
||||
elog ("%s critical luks filesystem config failure\n", node_ptr->hostname.c_str());
|
||||
|
||||
mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__LUKS );
|
||||
node_ptr->alarms[MTC_ALARM_ID__LUKS] = FM_ALARM_SEVERITY_CRITICAL ;
|
||||
}
|
||||
return (PASS);
|
||||
}
|
||||
|
||||
/* Clear the luks alarm and degrade flag */
|
||||
int nodeLinkClass::alarm_luks_clear ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
if ( node_ptr->degrade_mask & DEGRADE_MASK_LUKS )
|
||||
{
|
||||
node_ptr->degrade_mask &= ~DEGRADE_MASK_LUKS ;
|
||||
}
|
||||
|
||||
if ( node_ptr->alarms[MTC_ALARM_ID__LUKS] != FM_ALARM_SEVERITY_CLEAR )
|
||||
{
|
||||
ilog ("%s luks config alarm clear\n", node_ptr->hostname.c_str());
|
||||
|
||||
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__LUKS );
|
||||
node_ptr->alarms[MTC_ALARM_ID__LUKS] = FM_ALARM_SEVERITY_CLEAR ;
|
||||
}
|
||||
return (PASS);
|
||||
}
|
||||
|
||||
/* Generate a log and a critical alarm if the node config failed */
|
||||
int nodeLinkClass::alarm_config_failure ( struct nodeLinkClass::node * node_ptr )
|
||||
|
@ -1106,6 +1106,9 @@ private:
|
||||
int alarm_config_clear ( struct nodeLinkClass::node * node_ptr );
|
||||
int alarm_config_failure ( struct nodeLinkClass::node * node_ptr );
|
||||
|
||||
int alarm_luks_clear ( struct nodeLinkClass::node * node_ptr );
|
||||
int alarm_luks_failure ( struct nodeLinkClass::node * node_ptr );
|
||||
|
||||
int alarm_compute_clear ( struct nodeLinkClass::node * node_ptr, bool force );
|
||||
int alarm_compute_failure ( struct nodeLinkClass::node * node_ptr , EFmAlarmSeverityT sev );
|
||||
|
||||
|
@ -184,6 +184,34 @@ void mtcAlarm_init ( void )
|
||||
"and Switch Activity (Swact) to it as soon as possible. If the alarm "
|
||||
"persists then Lock/Unlock host to recover its local compute service.");
|
||||
|
||||
/** LUKS volume config failure Alarm Entry *************************************/
|
||||
|
||||
ptr = &alarm_list[MTC_ALARM_ID__LUKS];
|
||||
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
|
||||
snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", LUKS_ALARM_ID);
|
||||
|
||||
ptr->name = "LUKS volume failure" ;
|
||||
ptr->instc_prefix = "" ;
|
||||
|
||||
ptr->minor_reason =
|
||||
ptr->major_reason =
|
||||
ptr->critl_reason = "LUKS volume is not active or functioning properly.";
|
||||
ptr->clear_reason = "'LUKS volume' has been successfully unsealed and service is functioning properly.";
|
||||
|
||||
ptr->alarm.alarm_type = FM_ALARM_OPERATIONAL;
|
||||
ptr->alarm.probable_cause = FM_ALARM_APP_SUBSYS_FAILURE ;
|
||||
ptr->alarm.inhibit_alarms = FM_FALSE ;
|
||||
ptr->alarm.service_affecting = FM_FALSE ;
|
||||
ptr->alarm.suppression = FM_TRUE ;
|
||||
|
||||
ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */
|
||||
ptr->alarm.alarm_state = FM_ALARM_STATE_CLEAR ; /* Dynamic */
|
||||
|
||||
snprintf (ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH,
|
||||
"If this alarm does not automatically clear after some time and "
|
||||
"continues to be asserted after Host is locked and unlocked then "
|
||||
"contact next level of support for root cause analysis and recovery.");
|
||||
|
||||
/** Init Event Log Entry *************************************************/
|
||||
|
||||
ptr = &alarm_list[MTC_LOG_ID__EVENT];
|
||||
@ -315,6 +343,7 @@ string _getIdentity ( mtc_alarm_id_enum id )
|
||||
case MTC_ALARM_ID__ENABLE: return (ENABLE_ALARM_ID);
|
||||
case MTC_ALARM_ID__BM: return (BM_ALARM_ID);
|
||||
case MTC_ALARM_ID__CH_COMP: return (CH_COMP_ALARM_ID);
|
||||
case MTC_ALARM_ID__LUKS: return (LUKS_ALARM_ID);
|
||||
case MTC_LOG_ID__EVENT: return (EVENT_LOG_ID);
|
||||
case MTC_LOG_ID__COMMAND: return (COMMAND_LOG_ID);
|
||||
case MTC_LOG_ID__STATECHANGE: return (STATECHANGE_LOG_ID);
|
||||
|
@ -30,6 +30,7 @@ typedef enum
|
||||
MTC_ALARM_ID__ENABLE,
|
||||
MTC_ALARM_ID__BM,
|
||||
MTC_ALARM_ID__CH_COMP, /* Combo Host Compute Failure - on last Controller */
|
||||
MTC_ALARM_ID__LUKS,
|
||||
|
||||
MTC_LOG_ID__EVENT,
|
||||
MTC_LOG_ID__COMMAND,
|
||||
|
@ -771,6 +771,11 @@ int create_mtcAlive_msg ( mtc_message_type & msg, int cmd, string identity, int
|
||||
|
||||
/* Insert the mtce flags */
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] = 0 ;
|
||||
|
||||
//Check if LUKS FS manager service is active
|
||||
int exitstatus = system("cryptsetup status luks_encrypted_vault");
|
||||
if ( 0 != exitstatus )
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__LUKS_VOL_FAILED ;
|
||||
if ( daemon_is_file_present ( CONFIG_COMPLETE_FILE ) )
|
||||
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__I_AM_CONFIGURED ;
|
||||
if ( daemon_is_file_present ( CONFIG_FAIL_FILE ) )
|
||||
|
@ -406,6 +406,8 @@ static int mtc_config_handler ( void * user,
|
||||
mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__HOST_SERVICES] = atoi(value);
|
||||
else if (MATCH("agent", "ar_heartbeat_threshold"))
|
||||
mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__HEARTBEAT] = atoi(value);
|
||||
else if (MATCH("agent", "ar_luks_threshold"))
|
||||
mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__LUKS] = atoi(value);
|
||||
|
||||
else if (MATCH("agent", "ar_config_interval"))
|
||||
mtcInv.ar_interval[MTC_AR_DISABLE_CAUSE__CONFIG] = atoi(value);
|
||||
@ -415,6 +417,8 @@ static int mtc_config_handler ( void * user,
|
||||
mtcInv.ar_interval[MTC_AR_DISABLE_CAUSE__HOST_SERVICES] = atoi(value);
|
||||
else if (MATCH("agent", "ar_heartbeat_interval"))
|
||||
mtcInv.ar_interval[MTC_AR_DISABLE_CAUSE__HEARTBEAT] = atoi(value);
|
||||
else if (MATCH("agent", "ar_luks_interval"))
|
||||
mtcInv.ar_interval[MTC_AR_DISABLE_CAUSE__LUKS] = atoi(value);
|
||||
|
||||
|
||||
else
|
||||
@ -757,6 +761,9 @@ int daemon_configure ( void )
|
||||
ilog("AR Heartbeat: %d (threshold) %d sec (retry interval)",
|
||||
mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__HEARTBEAT],
|
||||
mtcInv.ar_interval [MTC_AR_DISABLE_CAUSE__HEARTBEAT]);
|
||||
ilog("AR luks : %d (threshold) %d sec (retry interval)",
|
||||
mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__LUKS],
|
||||
mtcInv.ar_interval [MTC_AR_DISABLE_CAUSE__LUKS]);
|
||||
|
||||
/* Get this Controller Activity State */
|
||||
mtc_config.active = daemon_get_run_option ("active") ;
|
||||
|
@ -1099,8 +1099,27 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
node_ptr->hbsClient_ready = false ;
|
||||
mtcTimer_reset ( node_ptr->mtcTimer );
|
||||
|
||||
/* Check for LUKS volume availability */
|
||||
if ( node_ptr->mtce_flags & MTC_FLAG__LUKS_VOL_FAILED )
|
||||
{
|
||||
elog ("%s LUKS volume failure (oob:%x)\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->mtce_flags)
|
||||
|
||||
/* raise an alarm for the failure of the config */
|
||||
alarm_luks_failure ( node_ptr );
|
||||
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_CONFIG_FAIL );
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
|
||||
|
||||
/* handle auto recovery for this failure */
|
||||
if ( ar_manage ( node_ptr,
|
||||
MTC_AR_DISABLE_CAUSE__LUKS,
|
||||
MTC_TASK_AR_DISABLED_LUKS ) != PASS )
|
||||
break ;
|
||||
}
|
||||
/* Check to see if the host is/got configured correctly */
|
||||
if ((( !node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED )) ||
|
||||
else if ((( !node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED )) ||
|
||||
(( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY )))
|
||||
{
|
||||
elog ("%s configuration failed or incomplete (oob:%x)\n",
|
||||
@ -6341,7 +6360,8 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
(( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_CONFIG)) ||
|
||||
( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_GOENABLE))||
|
||||
( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_SERVICES))||
|
||||
( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_HEARTBEAT))))
|
||||
( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_HEARTBEAT))||
|
||||
(!node_ptr->task.compare(MTC_TASK_AR_DISABLED_LUKS))))
|
||||
{
|
||||
if ( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_CONFIG ))
|
||||
{
|
||||
@ -6362,6 +6382,11 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
node_ptr->ar_cause = MTC_AR_DISABLE_CAUSE__HEARTBEAT ;
|
||||
}
|
||||
else if ( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_LUKS ))
|
||||
{
|
||||
node_ptr->ar_cause = MTC_AR_DISABLE_CAUSE__LUKS ;
|
||||
alarm_luks_failure ( node_ptr );
|
||||
}
|
||||
node_ptr->ar_disabled = true ;
|
||||
|
||||
if ( THIS_HOST )
|
||||
@ -7949,10 +7974,19 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
/* clear the SM degrade flag */
|
||||
node_ptr->degrade_mask &= ~DEGRADE_MASK_SM ;
|
||||
|
||||
ilog ("%s sm degrade clear\n", node_ptr->hostname.c_str());
|
||||
}
|
||||
|
||||
/* In-service luks volume config failure handling */
|
||||
if ( !(node_ptr->mtce_flags & MTC_FLAG__LUKS_VOL_FAILED))
|
||||
{
|
||||
alarm_luks_clear ( node_ptr );
|
||||
}
|
||||
else
|
||||
{
|
||||
alarm_luks_failure ( node_ptr );
|
||||
}
|
||||
|
||||
/*
|
||||
* In-service Config Failure/Alarm handling
|
||||
*/
|
||||
|
@ -56,6 +56,7 @@ ar_config_threshold = 2
|
||||
ar_goenable_threshold = 2
|
||||
ar_hostservices_threshold = 2
|
||||
ar_heartbeat_threshold = 2
|
||||
ar_luks_threshold = 2
|
||||
|
||||
; Service specific Auto Recovery retry interval.
|
||||
;
|
||||
@ -68,6 +69,7 @@ ar_config_interval = 30
|
||||
ar_goenable_interval = 30
|
||||
ar_hostservices_interval = 30
|
||||
ar_heartbeat_interval = 600
|
||||
ar_luks_interval = 30
|
||||
|
||||
api_retries = 10 ; number of API retries before failure
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user