Recover service failure after success audit

This is a quick fix for the rare situation in which a service changes
state in the following sequence:

enabled-active --> enabled-active-failed --> disabled-failed -->
enabled-active-failed
When the service state changes directly from disabled-failed to
enabled-active-failed, the failed state is never recovered.

Also, in a separate situation, a service's failed state is cleared
prematurely when the service enters the enabled-active state from the
enabling state because of a failed/timed-out audit. Since a service could
get into the failed state because of an audit mismatch, the service should
be recovered to normal from the failed state after a successful audit.

The fix ensures that the service failed state is cleared after a successful
audit in the enabled-active state. No changes are made in other service states.

Change-Id: Ie4052fec9b1579e6da97e9e1486d7f38eafa74ea
Closes-Bug: 1829880
Signed-off-by: Bin Qian <bin.qian@windriver.com>
This commit is contained in:
Bin Qian 2019-05-30 11:34:55 -04:00
parent 0259011983
commit cf92d7d64d
4 changed files with 56 additions and 9 deletions

View File

@ -183,7 +183,6 @@ static bool sm_service_audit_timeout( SmTimerIdT timer_id, int64_t user_data )
}
service->action_running = SM_SERVICE_ACTION_NONE;
service->action_pid = -1;
service->action_timer_id = SM_TIMER_ID_INVALID;
error = service_audit_result_handler( service, action_running,

View File

@ -176,12 +176,9 @@ SmErrorT sm_service_enabled_active_state_exit( SmServiceT* service )
// Transition hook for the enabled-active state.
//
// service    - the service making the transition (left untouched here).
// from_state - the state being left (intentionally ignored).
//
// Returns SM_OKAY unconditionally.
SmErrorT sm_service_enabled_active_state_transition( SmServiceT* service,
    SmServiceStateT from_state )
{
    // Do not clear the failure condition here, not even when arriving from
    // the enabling state. If the failure was triggered by an audit state
    // mismatch, a full recovery cycle only ends at the first audit success,
    // so status and condition must stay as they are for now.
    (void) service;
    (void) from_state;

    return( SM_OKAY );
}
// ****************************************************************************
@ -290,7 +287,10 @@ SmErrorT sm_service_enabled_active_state_event_handler( SmServiceT* service,
break;
case SM_SERVICE_EVENT_AUDIT_SUCCESS:
DPRINTFD( "Service (%s) audit success.", service->name );
if(sm_service_clear_failure_state(service))
{
DPRINTFI( "Service (%s) audit success as recovered.", service->name );
}
break;
case SM_SERVICE_EVENT_AUDIT_MISMATCH:

View File

@ -29,6 +29,48 @@ static SmListT* _services = NULL;
// File-local database handle; starts out NULL (where it gets assigned is not
// visible in this excerpt).
static SmDbHandleT* _sm_db_handle = NULL;
// Forward declaration of the file-local add routine; it takes a user_data
// array and an opaque record pointer and reports an SmErrorT.
static SmErrorT sm_service_table_add( void* user_data[], void* record );
// ****************************************************************************
// Service Table - clear failure state
// Resets every failure indicator kept on the service (recover flag, the three
// fail counters, status and condition) and persists the service record.
// Nothing is modified or persisted when no failure indicator is set.
// returns true if the service was in a failure state before the call,
// false if there was nothing to clear
// ====================
bool sm_service_clear_failure_state(SmServiceT* service)
{
    SmErrorT error;
    bool prev_failure_condition;

    // Any one of these indicators means the service still carries a failure.
    prev_failure_condition =
        service->recover ||
        service->fail_count > 0 ||
        service->action_fail_count > 0 ||
        service->transition_fail_count > 0 ||
        service->status == SM_SERVICE_STATUS_FAILED ||
        service->condition == SM_SERVICE_CONDITION_RECOVERY_FAILURE ||
        service->condition == SM_SERVICE_CONDITION_ACTION_FAILURE ||
        service->condition == SM_SERVICE_CONDITION_FATAL_FAILURE;

    if( prev_failure_condition )
    {
        service->recover = false;
        service->fail_count = 0;
        service->action_fail_count = 0;
        service->transition_fail_count = 0;
        service->status = SM_SERVICE_STATUS_NONE;
        service->condition = SM_SERVICE_CONDITION_NONE;

        // A persist failure is logged but does not undo the in-memory reset,
        // so the caller is still told that a failure state was cleared.
        error = sm_service_table_persist( service );
        if( SM_OKAY != error )
        {
            DPRINTFE( "Failed to persist service (%s) data, error=%s.",
                      service->name, sm_error_str(error) );
        }

        // The state name needs a real %s conversion; "% state" would be read
        // as a space-flagged %s followed by the literal text "tate.".
        DPRINTFI( "Cleared previous failure condition for service (%s) "
                  "in %s state.", service->name,
                  sm_service_state_str(service->state) );
    }

    return prev_failure_condition;
}
// ****************************************************************************
// Service Table - Read
// ====================

View File

@ -60,9 +60,15 @@ typedef struct
bool provisioned;
} SmServiceT;
// Callback type taking a caller-supplied user_data array and a service;
// presumably invoked once per service by the table's for-each routine
// (that routine is not visible in this excerpt).
typedef void (*SmServiceTableForEachCallbackT)
    (void* user_data[], SmServiceT* service);
// ****************************************************************************
// Service Table - clear failure state
// Resets the service's failure indicators (recover flag, fail counters,
// status and condition) and persists the service record when any of them
// is set.
// Returns true if a failure indicator was set before the call, false
// otherwise.
// ====================
extern bool sm_service_clear_failure_state(SmServiceT* service);
// ****************************************************************************
// ****************************************************************************
// Service Table - Read
// ====================