From cf92d7d64d5c72e73a86a5729f3a081e4742229c Mon Sep 17 00:00:00 2001 From: Bin Qian Date: Thu, 30 May 2019 11:34:55 -0400 Subject: [PATCH] Recover service failure after success audit This is a quick fix for the rare situation in which a service changes state in the sequence below: enabled-active --> enabled-active-failed --> disabled-failed --> enabled-active-failed When the service state changes directly from disabled-failed to enabled-active-failed, the failed state will not recover. Also, in a separate situation, a service's failed state is cleared prematurely when the service enters the enabled-active state from the enabling state because of a failed or timed-out audit. Since a service could get into the failed state because of an audit mismatch, the service should be recovered to normal from the failed state after a successful audit. The fix ensures the service failed state is cleared after a successful audit in the enabled-active state. No changes in other service states. Change-Id: Ie4052fec9b1579e6da97e9e1486d7f38eafa74ea Closes-Bug: 1829880 Signed-off-by: Bin Qian --- service-mgmt/sm-1.0.0/src/sm_service_audit.c | 1 - .../src/sm_service_enabled_active_state.c | 14 +++---- service-mgmt/sm-1.0.0/src/sm_service_table.c | 42 +++++++++++++++++++ service-mgmt/sm-1.0.0/src/sm_service_table.h | 8 +++- 4 files changed, 56 insertions(+), 9 deletions(-) diff --git a/service-mgmt/sm-1.0.0/src/sm_service_audit.c b/service-mgmt/sm-1.0.0/src/sm_service_audit.c index 3bcf6295..4fa1cdc4 100644 --- a/service-mgmt/sm-1.0.0/src/sm_service_audit.c +++ b/service-mgmt/sm-1.0.0/src/sm_service_audit.c @@ -183,7 +183,6 @@ static bool sm_service_audit_timeout( SmTimerIdT timer_id, int64_t user_data ) } service->action_running = SM_SERVICE_ACTION_NONE; - service->action_pid = -1; service->action_timer_id = SM_TIMER_ID_INVALID; error = service_audit_result_handler( service, action_running, diff --git a/service-mgmt/sm-1.0.0/src/sm_service_enabled_active_state.c b/service-mgmt/sm-1.0.0/src/sm_service_enabled_active_state.c 
index 677b8dfa..5726309b 100644 --- a/service-mgmt/sm-1.0.0/src/sm_service_enabled_active_state.c +++ b/service-mgmt/sm-1.0.0/src/sm_service_enabled_active_state.c @@ -176,12 +176,9 @@ SmErrorT sm_service_enabled_active_state_exit( SmServiceT* service ) SmErrorT sm_service_enabled_active_state_transition( SmServiceT* service, SmServiceStateT from_state ) { - if( SM_SERVICE_STATE_ENABLING == from_state ) - { - service->status = SM_SERVICE_STATUS_NONE; - service->condition = SM_SERVICE_CONDITION_NONE; - } - + // do not clear the failure condition here. If the failure is triggered + // by an audit state mismatch, a full recovery cycle ends at the first + // audit success return( SM_OKAY ); } // **************************************************************************** @@ -290,7 +287,10 @@ SmErrorT sm_service_enabled_active_state_event_handler( SmServiceT* service, break; case SM_SERVICE_EVENT_AUDIT_SUCCESS: - DPRINTFD( "Service (%s) audit success.", service->name ); + if(sm_service_clear_failure_state(service)) + { + DPRINTFI( "Service (%s) audit success as recovered.", service->name ); + } break; case SM_SERVICE_EVENT_AUDIT_MISMATCH: diff --git a/service-mgmt/sm-1.0.0/src/sm_service_table.c b/service-mgmt/sm-1.0.0/src/sm_service_table.c index 374699a5..05cda37e 100644 --- a/service-mgmt/sm-1.0.0/src/sm_service_table.c +++ b/service-mgmt/sm-1.0.0/src/sm_service_table.c @@ -29,6 +29,48 @@ static SmListT* _services = NULL; static SmDbHandleT* _sm_db_handle = NULL; static SmErrorT sm_service_table_add( void* user_data[], void* record ); + +// **************************************************************************** +// Service Table - clear failure state +// returns true if the service was in a failure state that has now been cleared +// ==================== +bool sm_service_clear_failure_state(SmServiceT* service) +{ + SmErrorT error; + bool prev_failure_condition; + prev_failure_condition = + service->recover || + service->fail_count > 0 || + service->action_fail_count > 0 || + 
service->transition_fail_count > 0 || + service->status == SM_SERVICE_STATUS_FAILED || + service->condition == SM_SERVICE_CONDITION_RECOVERY_FAILURE || + service->condition == SM_SERVICE_CONDITION_ACTION_FAILURE || + service->condition == SM_SERVICE_CONDITION_FATAL_FAILURE; + if( prev_failure_condition ) + { + service->recover = false; + service->fail_count = 0; + service->action_fail_count = 0; + service->transition_fail_count = 0; + service->status = SM_SERVICE_STATUS_NONE; + service->condition = SM_SERVICE_CONDITION_NONE; + + error = sm_service_table_persist( service ); + if( SM_OKAY != error ) + { + DPRINTFE( "Failed to persist service (%s) data, error=%s.", + service->name, sm_error_str(error) ); + } + + DPRINTFI( "Cleared previous failure condition for service (%s) " + "in %s state.", service->name, sm_service_state_str(service->state) ); + + } + + return prev_failure_condition; +} + // **************************************************************************** // Service Table - Read // ==================== diff --git a/service-mgmt/sm-1.0.0/src/sm_service_table.h b/service-mgmt/sm-1.0.0/src/sm_service_table.h index 06b5a2af..8b8d303a 100644 --- a/service-mgmt/sm-1.0.0/src/sm_service_table.h +++ b/service-mgmt/sm-1.0.0/src/sm_service_table.h @@ -60,9 +60,15 @@ typedef struct bool provisioned; } SmServiceT; -typedef void (*SmServiceTableForEachCallbackT) +typedef void (*SmServiceTableForEachCallbackT) (void* user_data[], SmServiceT* service); +// **************************************************************************** +// Service Table - clear failure state +// ==================== +extern bool sm_service_clear_failure_state(SmServiceT* service); +// **************************************************************************** + // **************************************************************************** // Service Table - Read // ====================