metal/mtce/src/hwmon/hwmonHdlr.cpp

/*
 * Copyright (c) 2013-2017 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
 */

 /**
  * @file
  * Wind River CGCS Platform Process Monitor Service Handler
  */

#include "daemon_ini.h"

#include "nodeBase.h"     /* for ... mtce common definitions          */
#include "jsonUtil.h"     /* for ... json utilities                   */
#include "regexUtil.h"    /* for ... regexUtil_pattern_match          */
#include "tokenUtil.h"    /* for ... tokenUtil_new_token              */
#include "nodeUtil.h"     /* for ... mtce common utilities            */
#include "bmcUtil.h"      /* for ... mtce-common board management     */
#include "hwmon.h"        /* for ... service module header            */
#include "hwmonUtil.h"    /* for ... utilities, ie clear_logged_state */
#include "hwmonClass.h"   /* for ... service class definition         */
#include "hwmonSensor.h"  /* for ... this mpodule header              */
#include "hwmonHttp.h"    /* for ... hwmonHttp_mod_group              */
#include "hwmonAlarm.h"   /* for ... hwmonAlarm_major                 */
#include "hwmonBmc.h"    /* for ... QUANTA_SAMPLE_PROFILE_..         */

/* Declare the Hardware Monitor Inventory Object */
hwmonHostClass hostInv ;

/* Public interface to get the Hardware Monitor Inventory object */
hwmonHostClass * get_hwmonHostClass_ptr ( void )
{
    return (&hostInv);
}

/* Preserve a local copy of a pointer to the control struct to
 * avoid having to publish a get utility prototype into hwmon.h */
static hwmon_ctrl_type * _hwmon_ctrl_ptr = NULL ;

/* hwmonTimer_audit - get_events periodic audit timer */
static struct mtc_timer hwmonTimer_audit ;
static struct mtc_timer hwmonTimer_token ;

/** List of server profile files */
std::list<string> profile_files ;
std::list<string>::iterator string_iter_ptr ;


 /*****************************************************************************
 *
 * Name       : _stage_change
 *
 * Description: Change the sensor monitor FSM stage.
 *
 ****************************************************************************/

static std::string monitorStages_str[HWMON_SENSOR_MONITOR__STAGES+1];
void _stage_change ( string hostname, monitor_ctrl_stage_enum & nowStage, monitor_ctrl_stage_enum newStage )
{
    if ( newStage < HWMON_SENSOR_MONITOR__STAGES )
    {
        clog ("%s sensor monitor stage change from %s -> %s\n",
                  hostname.c_str(),
                  monitorStages_str[nowStage].c_str(),
                  monitorStages_str[newStage].c_str());
        nowStage = newStage ;
    }
    else
    {
        slog ("%s sensor monitor stage change to '%d' is invalid ; switching to START\n",
                  hostname.c_str(),
                  newStage );
        nowStage = HWMON_SENSOR_MONITOR__START ;
    }
}

/*******************************************************************
 *          Module Initialize and Finalizes Interfaces             *
 *******************************************************************/

/* Initial init of timers. */
/* Not run on a sighup     */
void hwmon_timer_init ( void )
{
    mtcTimer_init ( hwmonTimer_audit, "controller", "audit timer" ) ;
    mtcTimer_init ( hwmonTimer_token, "controller", "token timer") ;
}

/* Register realtime signal handler with the kernel */
int signal_hdlr_init ( int sig_num )
{
    int rc ;
    UNUSED(sig_num) ;

#ifdef WANT_MORE_SIGNAL_HANDLING
    memset (&_pmon_ctrl_ptr->info, 0, sizeof(_pmon_ctrl_ptr->info));
    memset (&_pmon_ctrl_ptr->prev, 0, sizeof(_pmon_ctrl_ptr->info));

    _pmon_ctrl_ptr->info.sa_sigaction = _process_death_hdlr ;
    _pmon_ctrl_ptr->info.sa_flags = (SA_NOCLDSTOP | SA_NOCLDWAIT | SA_SIGINFO) ;

    rc = sigaction ( sig_num, &_pmon_ctrl_ptr->info , &_pmon_ctrl_ptr->prev );
    if ( rc )
    {
        elog("Registering Realtime Signal %d - (%d) (%s)\n",
              sig_num, errno, strerror(errno));
        rc = FAIL_SIGNAL_INIT ;
    }
    else
    {
        ilog("Registering Realtime Signal %d\n", sig_num);
    }
#else
    rc = PASS ;
#endif
    return (rc) ;
}

/*
 * Init the handler
 *    - Must support re-init that might occur over a SIGHUP
 **/
int hwmon_hdlr_init ( hwmon_ctrl_type * ctrl_ptr )
{
    int rc = PASS ;

    /* Save the control pointer */
    _hwmon_ctrl_ptr = ctrl_ptr ;

    monitorStages_str[HWMON_SENSOR_MONITOR__START]   = "Start" ;
    monitorStages_str[HWMON_SENSOR_MONITOR__DELAY]   = "Delay" ;
    monitorStages_str[HWMON_SENSOR_MONITOR__READ]    = "Read"  ;
    monitorStages_str[HWMON_SENSOR_MONITOR__PARSE]   = "Parse" ;
    monitorStages_str[HWMON_SENSOR_MONITOR__CHECK]   = "Check" ;
    monitorStages_str[HWMON_SENSOR_MONITOR__UPDATE]  = "Update";
    monitorStages_str[HWMON_SENSOR_MONITOR__HANDLE]  = "Handle";
    monitorStages_str[HWMON_SENSOR_MONITOR__FAIL]    = "Fail"  ;
    monitorStages_str[HWMON_SENSOR_MONITOR__POWER]   = "Power Query" ;
    monitorStages_str[HWMON_SENSOR_MONITOR__RESTART] = "Restart" ;
    monitorStages_str[HWMON_SENSOR_MONITOR__IDLE]    = "Idle"  ;

    return (rc) ;
}


/* Module Cleanup */
void hwmon_hdlr_fini ( hwmon_ctrl_type * ctrl_ptr )
{
    UNUSED(ctrl_ptr) ;
}

/*******************************************************************
 *                   Module Utilities                              *
 ******************************************************************/
/* SIGCHLD handler support - for waitpid */
void daemon_sigchld_hdlr ( void )
{
    dlog3 ("Received SIGCHLD ...\n");
}


/* Looks up the timer ID and asserts the corresponding ringer */
void hwmonHostClass::timer_handler ( int sig, siginfo_t *si, void *uc)
{
    timer_t * tid_ptr = (void**)si->si_value.sival_ptr ;
    struct hwmonHostClass::hwmon_host * hwmon_host_ptr ;

    /* Avoid compiler errors/warnings for parms we must
     * have but currently do nothing with */
    UNUSED(sig);
    UNUSED(uc);

    if ( tid_ptr == NULL )
    {
        return ;
    }
    else if ( *tid_ptr == NULL )
    {
        return ;
    }

    /* Audit Timer */
    else if ( *tid_ptr == hwmonTimer_audit.tid )
    {
        hwmonTimer_audit.ring = true ;
        return ;
    }
    /* Token refresh Timer */
    else if ( *tid_ptr == hwmonTimer_token.tid )
    {
        mtcTimer_stop_int_safe ( hwmonTimer_token );
        hwmonTimer_token.ring = true ;
        return ;
    }
    else
    {
        hwmon_host_ptr = getHost_timer ( *tid_ptr ) ;
        if ( hwmon_host_ptr )
        {
            if (( *tid_ptr == hwmon_host_ptr->monitor_ctrl.timer.tid ) )
            {
                mtcTimer_stop_int_safe ( hwmon_host_ptr->monitor_ctrl.timer );
                hwmon_host_ptr->monitor_ctrl.timer.ring = true ;
                return ;
            }
            else if (( *tid_ptr == hwmon_host_ptr->bmc_thread_ctrl.timer.tid ) )
            {
                mtcTimer_stop_int_safe ( hwmon_host_ptr->bmc_thread_ctrl.timer );
                hwmon_host_ptr->bmc_thread_ctrl.timer.ring = true ;
                return ;
            }
            else if (( *tid_ptr == hwmon_host_ptr->ping_info.timer.tid ) )
            {
                mtcTimer_stop_int_safe ( hwmon_host_ptr->ping_info.timer );
                hwmon_host_ptr->ping_info.timer.ring = true ;
                return ;
            }
            else if (( *tid_ptr == hwmon_host_ptr->hostTimer.tid ) )
            {
                mtcTimer_stop_int_safe ( hwmon_host_ptr->hostTimer );
                hwmon_host_ptr->hostTimer.ring = true ;
                return ;
            }
            else if (( *tid_ptr == hwmon_host_ptr->addTimer.tid ) )
            {
                mtcTimer_stop_int_safe ( hwmon_host_ptr->addTimer );
                hwmon_host_ptr->addTimer.ring = true ;
                return ;
            }
            else if (( *tid_ptr == hwmon_host_ptr->relearnTimer.tid ) )
            {
                mtcTimer_stop_int_safe ( hwmon_host_ptr->relearnTimer );
                hwmon_host_ptr->relearnTimer.ring = true ;
                hwmon_host_ptr->relearn = false ;
                return ;
            }
            else if (( *tid_ptr == hwmon_host_ptr->secretTimer.tid ) )
            {
                mtcTimer_stop_int_safe ( hwmon_host_ptr->secretTimer );
                hwmon_host_ptr->secretTimer.ring = true ;
                return ;
            }
        }
    }
    mtcTimer_stop_tid_int_safe (tid_ptr);
}

#ifdef WANT_SENSOR_TOGGLE
bool toggle = false ;
#endif

void hwmon_service ( hwmon_ctrl_type * ctrl_ptr )
{
    std::list<int> socks ;
    struct timeval waitd;
    fd_set readfds;

    daemon_config_type * config_ptr = daemon_get_cfg_ptr();
    hwmon_socket_type  * sock_ptr   = getSock_ptr();

    hostInv.hostBase.my_hostname = ctrl_ptr->my_hostname ;
    hostInv.hostBase.my_local_ip = ctrl_ptr->my_local_ip ;
    hostInv.hostBase.my_float_ip = ctrl_ptr->my_float_ip ;

    if ( config_ptr->token_refresh_rate )
    {
        if ( config_ptr->token_refresh_rate < 300 )
        {
            ilog ("Starting 'Token' Refresh timer (%d seconds)\n",
                  (config_ptr->token_refresh_rate) );
        }
        else
        {
            ilog ("Starting 'Token' Refresh timer (%d minutes)\n",
                  (config_ptr->token_refresh_rate/60) );
        }
        if ( mtcTimer_start ( hwmonTimer_token,
                              hwmonTimer_handler,
                              config_ptr->token_refresh_rate ) != PASS )
        {
            elog ("Failed to start 'Token' Refresh Timer\n");
            daemon_exit ( ) ;
        }
    }

    // client_len = sizeof(client_addr);

    socks.clear();
    if ( sock_ptr->cmd_sock )
    {
        socks.push_front (sock_ptr->cmd_sock->getFD());
    }
    else
    {
        elog ("cannot service Null cmd_sock\n");
    }

    socks.sort();

    ilog ("Starting 'Audit' timer (%d secs)\n", ctrl_ptr->audit_period );
    mtcTimer_start ( hwmonTimer_audit, hwmonTimer_handler, ctrl_ptr->audit_period );

    for ( ; ; )
    {
        /* Initialize the master fd_set */
        FD_ZERO(&readfds);

        /* add the command receiver socket ro the FD set mask */
        if ( sock_ptr->cmd_sock )
        {
            if ( sock_ptr->cmd_sock->getFD())
            {
                FD_SET(sock_ptr->cmd_sock->getFD(), &readfds);
            }
            else
            {
                /* force a re-init if we have no FD */
                sock_ptr->cmd_sock->sock_ok(false);
            }
        } /* Null sockts are auto recovered below */

        waitd.tv_sec  = 0;
        waitd.tv_usec = (SOCKET_WAIT*3) ;

        /* This is used as a delay up to select_timeout */
        int rc = select( socks.back()+1, &readfds, NULL, NULL, &waitd);

        /* If the select time out expired then  */
        if (( rc < 0 ) || ( rc == 0 ))
        {
            /* Check to see if the select call failed. */
            /* ... but filter Interrupt signal         */
            if (( rc < 0 ) && ( errno != EINTR ))
            {
                elog ( "Select Failed (rc:%d) %s \n", errno, strerror(errno));
            }
        }
        else if ( FD_ISSET(sock_ptr->cmd_sock->getFD(), &readfds))
        {
            rc = hwmon_service_inbox ();
            if ( rc > RETRY )
            {
                elog ("Failure servicing inbox (rc:%d)\n", rc);
            }
        }
        else
        {
            wlog ("unexpected select (%d)\n", rc );
        }

        if ( hwmonTimer_audit.ring == true )
        {
            mtcTimer_dump_data ();
            hostInv.set_degrade_audit();
            hwmonTimer_audit.ring = false ;

#ifdef WANT_FIT_TESTING
            if ( daemon_want_fit ( FIT_CODE__HWMON__AVOID_TOKEN_REFRESH ))
            {
                if ( hwmonTimer_token.ring == true )
                    hwmonTimer_token.ring = false ;
            }
#endif
        }

        /* Handle refreshing the authentication token */
        tokenUtil_log_refresh ();
        tokenUtil_manage_token ( ctrl_ptr->httpEvent,
                                 ctrl_ptr->my_hostname,
                                 config_ptr->token_refresh_rate,
                                 hwmonTimer_token,
                                 hwmonTimer_handler );

        /* Run the FSM */
        hostInv.hwmon_fsm ( ) ;

        daemon_signal_hdlr ();

        daemon_load_fit ( );
    }
}

/* Add Host Handler
 * ---------------------------*/
int hwmonHostClass::add_host_handler ( struct hwmonHostClass::hwmon_host * host_ptr )
{
    switch ( host_ptr->addStage )
    {
        case HWMON_ADD__WAIT:
        {
            if ( mtcTimer_expired ( host_ptr->addTimer ))
            {
                host_ptr->addTimer.ring = false ;
                addStageChange ( host_ptr , HWMON_ADD__START );
            }
            break ;
        }
        case HWMON_ADD__START:
        {
            /* force load of sensors from database if sensors = 0 and they exist */
            int rc = hwmonHostClass::bmc_load_sensor_model ( host_ptr ) ;
            if ( rc == PASS )
            {
                mtcTimer_start ( host_ptr->addTimer, hwmonTimer_handler, 1);
                addStageChange (host_ptr, HWMON_ADD__STATES);
            }
            else
            {
                /* there might be issue accessing the sysinv database */
                int delay = (rand()%30)+1 ;
                wlog ("%s bmc_load_sensor_model failed (rc:%d) ; retrying in %d secs\n", host_ptr->hostname.c_str(), rc , delay);
                mtcTimer_start ( host_ptr->addTimer, hwmonTimer_handler, delay );
                addStageChange ( host_ptr , HWMON_ADD__WAIT );
            }
            break ;
        }
        case HWMON_ADD__STATES:
        {
            if ( mtcTimer_expired ( host_ptr->addTimer ))
            {
                if ( host_ptr->sensors )
                {
                    int rc ;
                    /* manage the alarm and degrade states of all the sensors over process
                     * startup when the sensor model is already found in the database ;
                     * typical case over process restart. */
                    if (( rc = manage_startup_states ( host_ptr ) ) == PASS )
                    {
                        /* run the audit right away just to update the host degrade state
                         * if it needs it ; like over a SWACT */
                        degrade_state_audit ( host_ptr ) ;

                        ilog ("%s add complete (groups:%d sensors:%d)\n", host_ptr->hostname.c_str(), host_ptr->groups, host_ptr->sensors );
                    }
                    else
                    {
                        int delay = (rand()%30)+1 ;
                        if ( host_ptr->alarmed_config == false )
                        {
                            host_ptr->alarmed_config = true ;
                            hwmonAlarm_minor ( host_ptr->hostname, HWMON_ALARM_ID__SENSORCFG, "profile", REASON_DEGRADED );
                        }
                        wlog ("%s manage_startup_states failed (rc:%d) ; retrying in %d secs\n", host_ptr->hostname.c_str(), rc, delay );
                        mtcTimer_start ( host_ptr->addTimer, hwmonTimer_handler, delay );
                        break ;
                    }
                }
                else
                {
                    ilog ("%s no sensor model ; must be learned ; using %s\n",
                              host_ptr->hostname.c_str(),
                              bmcUtil_getProtocol_str(host_ptr->protocol).c_str());
                }
                addStageChange ( host_ptr , HWMON_ADD__DONE );
            }
            break ;
        }
        case HWMON_ADD__DONE:
        {
            ilog ("%s add complete ; %d sensors %d groups\n", host_ptr->hostname.c_str(), host_ptr->sensors, host_ptr->groups );
            break ;
        }
        default:
        {
            slog ("%s invalid 'add' stage\n", host_ptr->hostname.c_str() );
            if ( host_ptr->addTimer.tid ) mtcTimer_stop ( host_ptr->addTimer );
            mtcTimer_start ( host_ptr->addTimer, hwmonTimer_handler, (rand()%10)+1);
            addStageChange ( host_ptr , HWMON_ADD__DONE );
            break ;
        }
    }
    return (PASS);
}

/* Inventory Object wrapper - does a node lookup and calls the timer handler */
void hwmonTimer_handler ( int sig, siginfo_t *si, void *uc)
{
    hwmonHostClass * obj_ptr = get_hwmonHostClass_ptr() ;
    obj_ptr->timer_handler ( sig, si, uc );
}

/*****************************************************************************
 *
 * Name       : interval_change_handler
 *
 * Purpose:   : Handles setting the monitoring audit interval.
 *
 * Description: The following conditions are handled.
 *
 *              if  host_ptr->interval is zero then it and all the groups
 *              are set to the default value.
 *
 *              If there is existing inventory then host_ptr->interval
 *              is set to the shortest group interval.
 *
 *              With no existing inventory all groups are set to
 *              HWMON_DEFAULT_AUDIT_INTERVAL
 *
 *              if host_ptr->interval is not zero then all the group intervals
 *              are set to that value.
 *
 *****************************************************************************/

int hwmonHostClass::interval_change_handler ( struct hwmonHostClass::hwmon_host * host_ptr )
{
    int rc = RETRY ;

    dlog ("%s interval change handler\n", host_ptr->hostname.c_str());

    /* Don't issue a request if there is one active already */
    if ( host_ptr->event.base == NULL )
    {
        rc = PASS ;

        if ( host_ptr->interval < HWMON_MIN_AUDIT_INTERVAL )
        {
            ilog ("%s setting audit interval\n", host_ptr->hostname.c_str());
            if ( host_ptr->groups )
            {
                int smallest = HWMON_DEFAULT_LARGE_INTERVAL ;

                /* get the smallest interval */
                for ( int g = 0 ; g < host_ptr->groups ; ++g )
                {
                    if ( smallest > host_ptr->group[g].group_interval )
                    {
                        smallest = host_ptr->group[g].group_interval ;
                    }
                }
                /* Should be no bigger than the smallest group interval setting. */
                host_ptr->interval = smallest ;
            }
            else
            {
                /* default first 'learning' audit interval */
                host_ptr->interval = 5 ;
            }
        }

        if (( host_ptr->relearn == true ) &&
            ( host_ptr->model_attributes_preserved.interval != host_ptr->interval ))
        {
            host_ptr->interval = host_ptr->model_attributes_preserved.interval ;
            ilog ("%s audit interval restored to %d seconds\n",
                         host_ptr->hostname.c_str(),
                         host_ptr->interval);
        }

        string interval_string = itos(host_ptr->interval) ;

        for ( int g = 0 ; g < host_ptr->groups ; ++g )
        {
            daemon_signal_hdlr();


            if ( host_ptr->interval != host_ptr->group[g].group_interval )
            {
                /* only updat the group if they differ */
                if ( host_ptr->group[g].group_interval != host_ptr->interval )
                {
                    /* update the group interval. Even though bmc
                     * montoring does not need it, we need to be
                     * backwards compatible.
                     *
                     * bmc monitors all groups at the same interval */
                    int old = host_ptr->group[g].group_interval ;
                    host_ptr->group[g].group_interval = host_ptr->interval ;

                    rc = hwmonHttp_mod_group ( host_ptr->hostname,
                                               host_ptr->event,
                                               host_ptr->group[g].group_uuid,
                                               "audit_interval_group",
                                               interval_string );

                    if ( rc )
                    {
                         elog ("%s failed to update '%s' group audit interval (%d of %d); will retry later\n",
                                   host_ptr->hostname.c_str(),
                                   host_ptr->group[g].group_name.c_str(),
                                   g, host_ptr->groups );
                         break ;
                    }
                    else
                    {
                        char str [100] ;
                        snprintf ( &str[0], 100, "audit interval changed from %d to %d seconds",
                                   old,
                                   host_ptr->group[g].group_interval);

                        hwmonLog ( host_ptr->hostname,
                                   HWMON_ALARM_ID__SENSORGROUP,
                                   FM_ALARM_SEVERITY_CLEAR,
                                   host_ptr->group[g].group_name, str );
                    }
                }
            }
        }
        /* retry until pass - retries are spaced by audit interval */
        if ( rc == PASS )
        {
            /* TODO: remove error detection and correction */
            if ( host_ptr->interval == 0 )
            {
                slog ("%s failed to set interval correctly\n",host_ptr->hostname.c_str());

                host_ptr->interval = HWMON_DEFAULT_AUDIT_INTERVAL ;
            }

            host_ptr->interval_changed = false ;
        }
    }

    ilog ("%s sensor monitoring period is %d seconds\n",
              host_ptr->hostname.c_str(),
              host_ptr->interval );

    return (rc);
}


/* Hardware Monitor Handler
 * --------------------------
 *
 * TODO: Need grouping to enable the groups in the database
 *     group_ptr->group_state = "enabled" ;
 *     hwmonHttp_mod_group ( host_ptr->hostname, host_ptr->event , group_ptr->group_uuid, "state" , group_ptr->group_state );
 *     if ( group_ptr->group_state.compare("enabled") )
 * TODO: Need grouping disabled on state transition from monitoring enabled to disabled
 *
 *
 *  */
int hwmonHostClass::bmc_sensor_monitor ( struct hwmonHostClass::hwmon_host * host_ptr )
{
    int rc = RETRY ;

    if ( host_ptr )
    {
        /* Check the stage */
        if ( host_ptr->monitor_ctrl.stage < HWMON_SENSOR_MONITOR__STAGES )
        {
            flog ("%s sensor monitor stage (%s)\n",
                       host_ptr->hostname.c_str(),
                       monitorStages_str[host_ptr->monitor_ctrl.stage].c_str());
        }
        else
        {
            slog ("%s bad sensor monitor state (%d) - forcing into IDLE\n",
                      host_ptr->hostname.c_str(),
                      host_ptr->monitor_ctrl.stage);

            _stage_change ( host_ptr->hostname,
                            host_ptr->monitor_ctrl.stage,
                            HWMON_SENSOR_MONITOR__START );
        }

        /* check for a new model relearn request */
        if ( host_ptr->relearn_request == true )
        {
            int relearn_time = MTC_MINS_1 ;

            /* gracefully handle delete model failure retry.
             * if there is a relearn timer running then wait for it
             * to expire. This way previously failed relear request
             * retries are throttled. */
            if ( mtcTimer_expired ( host_ptr->relearnTimer ) == false )
            {
                /* TODO: test FIT */
                return (RETRY);
            }

            ilog ("%s handling sensor model relearn request\n",
                      host_ptr->hostname.c_str());

            rc = bmc_delete_sensor_model ( host_ptr );
            if ( rc != PASS )
            {
                elog ("%s delete model failure ; retry in %d seconds\n",
                          host_ptr->hostname.c_str(), relearn_time );

                /* If we got an error then wait relearn_time
                 * before trying again */
                mtcTimer_start ( host_ptr->relearnTimer,
                                 hwmonTimer_handler,
                                 relearn_time );
                return (RETRY);
            }

            relearn_time = MTC_MINS_2;

            /* enter relearn mode */
            host_ptr->relearn = true ;

            /* exit relearn request mode.
             * allow the relearn operation to proceed */
            host_ptr->relearn_request = false ;

            this->monitor_soon ( host_ptr );

            /* start the relearn timer */
            mtcTimer_start ( host_ptr->relearnTimer,
                             hwmonTimer_handler,
                             relearn_time );
        }

        switch ( host_ptr->monitor_ctrl.stage )
        {
            /******************************************************************
             *
             * The IDLE stage is the default start and do nothing stage while
             * monitoring is disabled.
             *
             * Stage Transition: external
             *
             ******************************************************************/
            case HWMON_SENSOR_MONITOR__IDLE:
            {
                break ;
            }

            /******************************************************************
             *
             * A delayed START
             *
             *****************************************************************/
            case HWMON_SENSOR_MONITOR__RESTART:
            {
                if ( mtcTimer_expired ( host_ptr->monitor_ctrl.timer ) )
                {
                    _stage_change ( host_ptr->hostname,
                                    host_ptr->monitor_ctrl.stage,
                                    HWMON_SENSOR_MONITOR__START );
                }
                break ;
            }


            /******************************************************************
             *
             * The START stage is the default stage and starts sensor
             * monitoring if enabled for this host.
             *
             * The start process begins with adding a small randomized delay
             * before the first READ so that over a process (re)start we don't
             * jolt the process by trying to read sensors from all hosts at the
             * same time.
             *
             * Stage Transition:
             *
             *  Success path -> HWMON_SENSOR_MONITOR__DELAY
             *  Failure Path -> HWMON_SENSOR_MONITOR__IDLE
             *
             ******************************************************************/
            case HWMON_SENSOR_MONITOR__START:
            {
                mtcTimer_reset ( host_ptr->monitor_ctrl.timer );
                if ( host_ptr->monitor )
                {
                   /* Handle Audit Interval Change */
                    if ( host_ptr->interval_changed )
                    {
                        interval_change_handler ( host_ptr );
                    }

                    /* Handle power state query
                     * - don't depend on poweron if in relearn mode.
                     * - otherwise we need to ensure the model is learned
                     *   while the host power is on.
                     * See comments in HWMON_SENSOR_MONITOR__POWER for details */
                    if (( host_ptr->sensors == 0 ) &&
                        ( host_ptr->poweron == false ) &&
                        ( host_ptr->relearn == false ))
                    {
                        if ( host_ptr->bmc_thread_ctrl.id )
                        {
                            wlog ("%s sensor monitor thread is unexpectedly active ; retry soon\n", host_ptr->hostname.c_str());
                            thread_kill ( host_ptr->bmc_thread_ctrl, host_ptr->bmc_thread_info );
                            sleep (1);
                            break ;
                        }

                        host_ptr->accounting_bad_count = 0 ;
                        host_ptr->bmc_thread_ctrl.id = 0       ;
                        host_ptr->bmc_thread_ctrl.done = false ;

                        host_ptr->bmc_thread_info.data.clear() ;
                        host_ptr->bmc_thread_info.status_string.clear();
                        host_ptr->bmc_thread_info.status = -1  ;
                        host_ptr->bmc_thread_info.progress = 0 ;
                        host_ptr->bmc_thread_info.id = 0       ;
                        host_ptr->bmc_thread_info.signal = 0   ;
                        host_ptr->bmc_thread_info.command = BMC_THREAD_CMD__POWER_STATUS ;
                        host_ptr->bmc_thread_info.proto = host_ptr->protocol ;

                        /* Update / Setup the BMC query credentials */
                        host_ptr->thread_extra_info.bm_ip = host_ptr->bm_ip ;
                        host_ptr->thread_extra_info.bm_un = host_ptr->bm_un ;
                        host_ptr->thread_extra_info.bm_pw = host_ptr->bm_pw ;

                        rc = thread_launch ( host_ptr->bmc_thread_ctrl,
                                             host_ptr->bmc_thread_info ) ;
                        if ( rc != PASS )
                        {
                            host_ptr->bmc_thread_info.status = rc ;
                            host_ptr->bmc_thread_info.status_string =
                            "failed to launch power query thread" ;

                            _stage_change ( host_ptr->hostname,
                                    host_ptr->monitor_ctrl.stage,
                                    HWMON_SENSOR_MONITOR__FAIL );
                        }
                        else
                        {
                            /* Assign the extra data pointer */
                            host_ptr->bmc_thread_info.extra_info_ptr = (void*)&host_ptr->thread_extra_info ;

                            /* start an umbrella timer 5 seconds longer than
                             * the default thread FSM timout */
                            mtcTimer_start ( host_ptr->monitor_ctrl.timer,
                                             hwmonTimer_handler,
                                             (DEFAULT_THREAD_TIMEOUT_SECS+5) );

                            _stage_change ( host_ptr->hostname,
                                            host_ptr->monitor_ctrl.stage,
                                            HWMON_SENSOR_MONITOR__POWER );
                        }
                        break ;
                    }
                    else if ( host_ptr->interval )
                    {
                        /* Assign the extra data pointer */
                        host_ptr->bmc_thread_info.extra_info_ptr = (void*)&host_ptr->thread_extra_info ;

                        /* randomize the first audit a little so that over a swact we don't spike hwmond */
                        int r = (rand() % MTC_MINS_1) + 1 ;

                        /* poll all the sensors right away - between 1 and 10 seconds */
                        ilog ("%s sensor monitoring begins in %d seconds\n",
                                  host_ptr->hostname.c_str(), r );

                        mtcTimer_start ( host_ptr->monitor_ctrl.timer, hwmonTimer_handler, r );
                        _stage_change  ( host_ptr->hostname,
                                         host_ptr->monitor_ctrl.stage,
                                         HWMON_SENSOR_MONITOR__DELAY );
                        break ;
                    }
                    else
                    {
                        host_ptr->interval_changed = true ;
                        blog ("%s audit interval is zero ; auto correcting\n", host_ptr->hostname.c_str());
                        break ;
                    }
                }
                else
                {
                    ilog ("%s sensor monitoring disabled\n", host_ptr->hostname.c_str());
                }

                _stage_change ( host_ptr->hostname,
                                host_ptr->monitor_ctrl.stage,
                                HWMON_SENSOR_MONITOR__IDLE );
                break ;
            }


            /******************************************************************
             *
             * The POWER stage handles a power query response.
             *
             * The START is re-invoked if the power query fails or
             * shows that the power is off.
             *
             * Stage Transition:
             *
             *  Success path -> HWMON_SENSOR_MONITOR__DELAY
             *  Failure Path -> HWMON_SENSOR_MONITOR__START
             *
             ******************************************************************/
            case HWMON_SENSOR_MONITOR__POWER:
            {
                /* handle thread execution umbrella timeout */
                if ( mtcTimer_expired ( host_ptr->monitor_ctrl.timer ) )
                {
                    host_ptr->monitor_ctrl.timer.ring = false ;

                    wlog ("%s power query thread timeout\n",
                              host_ptr->hostname.c_str());

                    thread_kill ( host_ptr->bmc_thread_ctrl, host_ptr->bmc_thread_info );

                    mtcTimer_start ( host_ptr->monitor_ctrl.timer,
                                     hwmonTimer_handler, MTC_MINS_1 );

                    _stage_change ( host_ptr->hostname,
                                    host_ptr->monitor_ctrl.stage,
                                    HWMON_SENSOR_MONITOR__RESTART );
                }

                /* check for 'thread done' completion */
                else if ( thread_done( host_ptr->bmc_thread_ctrl ) )
                {
                    /* Consume done results */
                    mtcTimer_reset ( host_ptr->monitor_ctrl.timer );

                    if ( host_ptr->bmc_thread_info.status )
                    {
                        elog ("%s %s thread %2d failed (rc:%d) (%d:%d)\n",
                                  host_ptr->bmc_thread_ctrl.hostname.c_str(),
                                  host_ptr->bmc_thread_ctrl.name.c_str(),
                                  host_ptr->bmc_thread_info.command,
                                  host_ptr->bmc_thread_info.status,
                                  host_ptr->bmc_thread_info.progress,
                                  host_ptr->bmc_thread_info.runcount);

                        wlog ("%s ... %s\n",
                                  host_ptr->bmc_thread_ctrl.hostname.c_str(),
                                  host_ptr->bmc_thread_info.status_string.c_str());
                    }
                    else
                    {
                        dlog ("%s '%s' thread '%d' command is done ; (%d:%d) (rc:%d)\n",
                                  host_ptr->bmc_thread_ctrl.hostname.c_str(),
                                  host_ptr->bmc_thread_ctrl.name.c_str(),
                                  host_ptr->bmc_thread_info.command,
                                  host_ptr->bmc_thread_info.progress,
                                  host_ptr->bmc_thread_info.runcount,
                                  host_ptr->bmc_thread_info.status);

                        blog2("%s ... status: %s\n",
                                  host_ptr->bmc_thread_ctrl.hostname.c_str(),
                                  host_ptr->bmc_thread_info.status_string.c_str());

#ifdef WANT_FIT_TESTING
                        if ( daemon_want_fit ( FIT_CODE__HWMON__NO_DATA, host_ptr->hostname ))
                        {
                            host_ptr->bmc_thread_info.data.clear ();
                            host_ptr->bmc_thread_info.status = 0 ;
                            host_ptr->bmc_thread_info.status_string.clear ();
                            slog ("%s FIT No Power Status Data\n", host_ptr->hostname.c_str());
                        }
#endif

                        if ( host_ptr->bmc_thread_info.data.empty())
                        {
                            wlog ("%s power query status empty ; retrying query\n",
                                      host_ptr->hostname.c_str());
                        }
                        else if ( host_ptr->bmc_thread_info.data.find (BMC_POWER_ON_STATUS) == string::npos )
                        {
                            host_ptr->poweron = false ;
                            wlog ("%s sensor model learning delayed ; need power on",
                                      host_ptr->hostname.c_str());
                        }
                        else
                        {
                            /* OK, this is what we have been waiting for */
                            ilog ("%s power is on", host_ptr->hostname.c_str());
                            host_ptr->poweron = true ;
                        }
                    }

                    host_ptr->bmc_thread_ctrl.done = true ;

                    /* Start monitoring in 10 seconds */
                    int delay = MTC_SECS_10 ;

                    /* If power is off, retry in 2 minutes ; hold-off period */
                    if ( host_ptr->poweron == false )
                        delay = MTC_MINS_2 ;

                    mtcTimer_start ( host_ptr->monitor_ctrl.timer,
                                     hwmonTimer_handler, delay );

                    _stage_change ( host_ptr->hostname,
                                    host_ptr->monitor_ctrl.stage,
                                    HWMON_SENSOR_MONITOR__RESTART );
                }
                break ;
            }

            /******************************************************************
             *
             * The DELAY stage inserts time after a failure recovery or
             * between successive sensor READ intervals.
             *
             * The failure path is invoked if the 'thread' stage is not IDLE
             * when the DELAY period expires.
             *
             * Stage Transition:
             *
             *  Success path -> HWMON_SENSOR_MONITOR__READ
             *  Failure Path -> HWMON_SENSOR_MONITOR__FAIL
             *
             ******************************************************************/
            case HWMON_SENSOR_MONITOR__DELAY:
            {
                if ( mtcTimer_expired ( host_ptr->monitor_ctrl.timer ) )
                {
                    host_ptr->monitor_ctrl.timer.ring = false ;

                    /* if there was a previous connection failure being handled
                     * then give it time to resolve */
                    if ( !thread_idle ( host_ptr->bmc_thread_ctrl ) )
                    {
                        wlog ("%s rejecting thread run stage change ; FSM not IDLE (thread stage:%s)\n",
                                  host_ptr->hostname.c_str(),
                                  thread_stage(host_ptr->bmc_thread_ctrl).c_str());

                        _stage_change ( host_ptr->hostname,
                                        host_ptr->monitor_ctrl.stage,
                                        HWMON_SENSOR_MONITOR__FAIL );
                    }
                    else
                    {
                        _stage_change ( host_ptr->hostname,
                                        host_ptr->monitor_ctrl.stage,
                                        HWMON_SENSOR_MONITOR__READ );
                    }
                }
                /* Handle Audit Interval Change ...
                 * While we are waiting for the next audit check to see if we have received
                 * an monitor interval change. If we have then update the database with the
                 * new data, force this interval to finish and on the next audit the new
                 * interval will be loaded */
                else if ( host_ptr->interval_changed )
                {
                    interval_change_handler ( host_ptr );

                    /* force this audit interval to expire but don't include this in the
                     * pass case only. Give sysinv it some time before the next retry */
                    mtcTimer_stop ( host_ptr->monitor_ctrl.timer );
                    host_ptr->monitor_ctrl.timer.ring = true ;
                }
                break ;
            }


            /******************************************************************
             *
             * The READ stage requests the launch of the hwmonThread_bmc
             * thread that will read the sensor data from the specified host.
             *
             * An umbrella timeout timer is started on behalf of the PARSE
             * stage to detect threadUtil FSM not completing.
             *
             * Launch will fail if attempted if the thread is already running
             * or if the launch request returns a failure.
             *
             * Stage Transition:
             *
             *  Success path -> HWMON_SENSOR_MONITOR__PARSE
             *  Failure Path -> HWMON_SENSOR_MONITOR__FAIL
             *
             ******************************************************************/
            case HWMON_SENSOR_MONITOR__READ:
            {
                if ( host_ptr->bmc_thread_ctrl.id )
                {
                    host_ptr->bmc_thread_info.status = FAIL_THREAD_RUNNING ;
                    host_ptr->bmc_thread_info.status_string =
                    "sensor monitor thread is unexpectedly active ; handling as failure" ;
                    _stage_change ( host_ptr->hostname,
                                    host_ptr->monitor_ctrl.stage,
                                    HWMON_SENSOR_MONITOR__FAIL );
                    break ;
                }

                host_ptr->accounting_bad_count = 0 ;
                host_ptr->bmc_thread_ctrl.id = 0       ;
                host_ptr->bmc_thread_ctrl.done = false ;

                host_ptr->bmc_thread_info.data.clear() ;
                host_ptr->bmc_thread_info.status_string.clear();
                host_ptr->bmc_thread_info.status = -1  ;
                host_ptr->bmc_thread_info.progress = 0 ;
                host_ptr->bmc_thread_info.id = 0       ;
                host_ptr->bmc_thread_info.signal = 0   ;
                host_ptr->bmc_thread_info.command = BMC_THREAD_CMD__READ_SENSORS ;
                host_ptr->bmc_thread_info.proto = host_ptr->protocol ;

                /* Update / Setup the BMC query credentials */
                host_ptr->thread_extra_info.bm_ip = host_ptr->bm_ip ;
                host_ptr->thread_extra_info.bm_un = host_ptr->bm_un ;
                host_ptr->thread_extra_info.bm_pw = host_ptr->bm_pw ;

                rc = thread_launch ( host_ptr->bmc_thread_ctrl, host_ptr->bmc_thread_info ) ;
                if ( rc != PASS )
                {
                    host_ptr->bmc_thread_info.status = rc ;
                    host_ptr->bmc_thread_info.status_string =
                    "failed to launch sensor monitoring thread" ;

                    _stage_change ( host_ptr->hostname,
                                    host_ptr->monitor_ctrl.stage,
                                    HWMON_SENSOR_MONITOR__FAIL );
                }
                else
                {
                    /* start an umbrella timer 5 seconds longer than
                     * the default thread FSM timout */
                    mtcTimer_start ( host_ptr->monitor_ctrl.timer,
                                     hwmonTimer_handler,
                                     (DEFAULT_THREAD_TIMEOUT_SECS+5) );

                    _stage_change ( host_ptr->hostname,
                                    host_ptr->monitor_ctrl.stage,
                                    HWMON_SENSOR_MONITOR__PARSE );
                }
                break ;
            }

            /******************************************************************
             * The PARSE stage has 2 main functions
             *
             *  1. Wait for the bmc command completion from the READ stage
             *     while monitoring for and handling the unbrella timeout case.
             *
             *  2. PARSE the sensor data json string into the sample list
             *
             *          sample[MAX_HOST_SENSORS]
             *
             *     The number of sensors read by thread is specified in
             *
             *         thread_extra_info.samples
             *
             * Failure case is invoked for
             *  - thread completion umbrella timeout.
             *  - thread completion error
             *  - sensor data parse error
             *
             * Stage Transition:
             *
             *  Success path -> HWMON_SENSOR_MONITOR__CHECK
             *  Failure Path -> HWMON_SENSOR_MONITOR__FAIL
             *
             ******************************************************************/
            case HWMON_SENSOR_MONITOR__PARSE:
            {
                daemon_signal_hdlr ();

                /* Unbrella timeout timer check */
                if ( mtcTimer_expired ( host_ptr->monitor_ctrl.timer ) )
                {
                    host_ptr->monitor_ctrl.timer.ring = false ;
                    host_ptr->bmc_thread_info.status = FAIL_TIMEOUT ;
                    host_ptr->bmc_thread_info.status_string =
                    "timeout waiting for sensor read data" ;

                    _stage_change ( host_ptr->hostname,
                                    host_ptr->monitor_ctrl.stage,
                                    HWMON_SENSOR_MONITOR__FAIL );
                }

                /* check for 'thread done' completion */
                else if ( thread_done( host_ptr->bmc_thread_ctrl ) )
                {
                    /* Consume done results */
                    mtcTimer_stop ( host_ptr->monitor_ctrl.timer );

                    if ( host_ptr->bmc_thread_info.status )
                    {
                        host_ptr->bmc_thread_ctrl.retries++ ;
                        if (!(host_ptr->bmc_thread_ctrl.retries>MAX_THREAD_RETRIES))
                        {
                            if ( host_ptr->bmc_thread_info.status == FAIL_SYSTEM_CALL )
                            {
                                elog ("%s '%s' system call failed (retry %d of %d)",
                                          host_ptr->bmc_thread_info.log_prefix,
                                          bmcUtil_getCmd_str(
                                          host_ptr->bmc_thread_info.command).c_str(),
                                          host_ptr->bmc_thread_ctrl.retries,
                                          MAX_THREAD_RETRIES);
                            }
                            else if (( host_ptr->bmc_thread_ctrl.retries == (MAX_THREAD_RETRIES-1)) ||
                                     (  host_ptr->bmc_thread_ctrl.retries == 1 ))
                            {
                                elog ("%s '%s' failed (rc:%d) (retry %d of %d) (%d:%d)\n",
                                          host_ptr->bmc_thread_info.log_prefix,
                                          bmcUtil_getCmd_str(
                                          host_ptr->bmc_thread_info.command).c_str(),
                                          host_ptr->bmc_thread_info.status,
                                          host_ptr->bmc_thread_ctrl.retries,
                                          MAX_THREAD_RETRIES,
                                          host_ptr->bmc_thread_info.progress,
                                          host_ptr->bmc_thread_info.runcount);
                            }

                            /* don't flood the logs with the same error data over and over */
                            if ( host_ptr->bmc_thread_ctrl.retries == 1 )
                            {
                                blog ("%s ... %s\n",
                                          host_ptr->bmc_thread_ctrl.hostname.c_str(),
                                          host_ptr->bmc_thread_info.status_string.c_str());
                            }

                            host_ptr->bmc_thread_ctrl.done = true ;
                            mtcTimer_start ( host_ptr->monitor_ctrl.timer, hwmonTimer_handler, THREAD_RETRY_DELAY_SECS );
                            _stage_change ( host_ptr->hostname,
                                            host_ptr->monitor_ctrl.stage,
                                            HWMON_SENSOR_MONITOR__DELAY );
                            break ;
                        }
                    }
                    else
                    {
                        dlog ("%s '%s' thread '%d' command is done ; (%d:%d) (rc:%d)\n",
                                  host_ptr->bmc_thread_ctrl.hostname.c_str(),
                                  host_ptr->bmc_thread_ctrl.name.c_str(),
                                  host_ptr->bmc_thread_info.command,
                                  host_ptr->bmc_thread_info.progress,
                                  host_ptr->bmc_thread_info.runcount,
                                  host_ptr->bmc_thread_info.status);
                        blog2 ("%s ... data: %s\n",
                                  host_ptr->bmc_thread_ctrl.hostname.c_str(),
                                  host_ptr->bmc_thread_info.status_string.c_str());
                    }
                    host_ptr->bmc_thread_ctrl.done = true ;
                    host_ptr->bmc_thread_ctrl.retries = 0 ;

#ifdef WANT_FIT_TESTING
                    if ( daemon_want_fit ( FIT_CODE__HWMON__NO_DATA, host_ptr->hostname ))
                    {
                        host_ptr->bmc_thread_info.data.clear ();
                        host_ptr->bmc_thread_info.status = 0 ;
                        host_ptr->bmc_thread_info.status_string.clear ();
                    }
#endif

                    if ( host_ptr->bmc_thread_info.status == PASS )
                    {
                        /* NOTE: This parsing method is not leaking memory ; verified ! */

                        json_bool status ;
                        struct json_object * raw_obj = json_tokener_parse( host_ptr->bmc_thread_info.data.data() );
                        if ( raw_obj )
                        {
                            /* Look for ... BMC_JSON__SENSOR_DATA_MESSAGE_HEADER */
                            struct json_object * req_obj = (struct json_object *)(NULL) ;
                            status = json_object_object_get_ex ( raw_obj, BMC_JSON__SENSOR_DATA_MESSAGE_HEADER, &req_obj );
                            if (( status == true ) && req_obj )
                            {
                                char * msg_ptr = (char*)json_object_to_json_string(req_obj) ;
                                host_ptr->json_bmc_sensors = msg_ptr ;
                                if ( msg_ptr )
                                {
                                    host_ptr->bmc_thread_info.status = bmc_load_sensor_samples ( host_ptr , msg_ptr);
                                    if ( host_ptr->bmc_thread_info.status == PASS )
                                    {
                                        if ( host_ptr->samples != host_ptr->sensors )
                                        {
                                            if ( host_ptr->quanta_server == false )
                                            {
                                                blog ("%s read %d sensor samples but expected %d\n",
                                                          host_ptr->hostname.c_str(),
                                                          host_ptr->samples,
                                                          host_ptr->sensors );
                                            }
                                        }
                                        _stage_change ( host_ptr->hostname, host_ptr->monitor_ctrl.stage, HWMON_SENSOR_MONITOR__CHECK );

                                    }
                                    else
                                    {
                                        host_ptr->bmc_thread_info.status_string = "failed to load sensor data" ;
                                    }
                                }
                                else
                                {
                                    host_ptr->bmc_thread_info.status_string = "failed to get json message after header" ;
                                    host_ptr->bmc_thread_info.status = FAIL_JSON_PARSE ;
                                }
                            }
                            else
                            {
                                host_ptr->bmc_thread_info.status_string = "failed to find '" ;
                                host_ptr->bmc_thread_info.status_string.append(BMC_JSON__SENSOR_DATA_MESSAGE_HEADER);
                                host_ptr->bmc_thread_info.status_string.append("' label") ;
                                host_ptr->bmc_thread_info.status = FAIL_JSON_PARSE ;
                            }
                        }
                        else
                        {
                            host_ptr->bmc_thread_info.status_string = "failed to parse bmc sensor data string" ;
                            host_ptr->bmc_thread_info.status = FAIL_JSON_PARSE ;
                        }

                        if (raw_obj) json_object_put(raw_obj);
                    }

                    if ( host_ptr->bmc_thread_info.status )
                    {
                        /* Handle thread error status */
                        if ( host_ptr->groups == 0 )
                        {
                            if ( host_ptr->alarmed_config == false )
                            {
                                host_ptr->alarmed_config = true ;
                                hwmonAlarm_minor ( host_ptr->hostname, HWMON_ALARM_ID__SENSORCFG, "profile", REASON_DEGRADED );
                            }
                        }
                        else
                        {
                            bmc_set_group_state ( host_ptr, "failed" );
                        }

                        _stage_change ( host_ptr->hostname,
                                        host_ptr->monitor_ctrl.stage,
                                        HWMON_SENSOR_MONITOR__FAIL );
                    }
                } /* end handling of done command */
                break ;
            }

            /******************************************************************
             *
             * The CHECK stage is run on the last parsed sample data loaded
             * into the temporary sample sensor data list ...
             *
             *      host_ptr->sample[MAX_HOST_SENSORS]
             *
             *  The number of samples loaded into the sample is
             *  specified in
             *
             *      host_ptr->samples
             *
             *  The CHECK is intended to identify sensor data corruption or
             *  model changes that might occur over a BMC firmware upgrade.
             *
             *  The CHECK involves performing a checksum of all the sensor
             *  names in each list and comparing that checksum to the last
             *  time the sensors were read.
             *
             *  A stored checksum of zero indicates the first sample read.
             *  If at that time host_ptr->sensors == 0 then a call to
             *  bmc_create_sensor_model is made to create a new sensor
             *  model based on these last sample readings.
             *
             *  If the stored checksums do not match the current checksums
             *  then that constitutes a sensor mismatch with a design log.
             *  The mismatch counter is incremented. If the mismatch
             *  counter exceeds its threshold then the current sensor model
             *  is deleted and re-created using the new data.
             *
             *  A customer log is created whenever a host's sensor model
             *  is created or re-created.
             *
             * Stage Transition:
             *
             *  Success path -> HWMON_SENSOR_MONITOR__UPDATE
             *  Failure Path -> HWMON_SENSOR_MONITOR__FAIL
             *
             *********************************************************************/
            case HWMON_SENSOR_MONITOR__CHECK:
            {
                unsigned short temp_checksum ;

                daemon_signal_hdlr ();

                /* Handle cases where we got an incomplete sensor reading */
                if ( host_ptr->thread_extra_info.samples == 0 )
                {
                    if ( host_ptr->bmc_thread_info.status == PASS )
                    {
                        host_ptr->bmc_thread_info.status        = FAIL_INVALID_DATA ;
                        host_ptr->bmc_thread_info.status_string = "incomplete sensor data reading" ;
                    }
                    _stage_change ( host_ptr->hostname,
                                    host_ptr->monitor_ctrl.stage,
                                    HWMON_SENSOR_MONITOR__FAIL );
                    break ;
                }

                /* get the checksum for this sample set */
                temp_checksum =
                checksum_sample_profile ( host_ptr->hostname,
                                          host_ptr->thread_extra_info.samples,
                                         &host_ptr->sample[0]);

                blog1 ("%s samples profile checksum : %04x:%04x (%d:%d:%d)\n",
                           host_ptr->hostname.c_str(),
                           temp_checksum,
                           host_ptr->sample_sensor_checksum,
                           host_ptr->samples,
                           host_ptr->sensors,
                           host_ptr->thread_extra_info.samples);

                /* Initialize the sample checksums and counts for the first reading case */
                if ( host_ptr->sample_sensor_checksum == 0 )
                {
                    // host_ptr->samples = host_ptr->thread_extra_info.samples ;
                    host_ptr->sample_sensor_checksum = temp_checksum ;
                }

                /* look for first sensor reading case with an empty database profile.
                 * This can occur over a fresh provisioning or a model recreation */
                if ( host_ptr->sensors == 0 )
                {
                    ilog ("%s samples profile checksum : %04x (%d sensors) (%d samples)\n",
                              host_ptr->hostname.c_str(),
                              host_ptr->sample_sensor_checksum,
                              host_ptr->sensors,
                              host_ptr->samples);

                    /* check the sample model against known Quanta Server profile checksums and sensor numbers */
                    if (((( host_ptr->sample_sensor_checksum  == QUANTA_SAMPLE_PROFILE_CHECKSUM_VER_13_53 ) || ( host_ptr->sample_sensor_checksum ==  QUANTA_SAMPLE_PROFILE_CHECKSUM_VER_13_50 )) &&
                         (( host_ptr->samples == QUANTA_SAMPLE_PROFILE_SENSORS_VER_13_53) || (QUANTA_SAMPLE_PROFILE_CHECKSUM_VER_13_50 ))) ||
                        (( host_ptr->sample_sensor_checksum == QUANTA_SAMPLE_PROFILE_CHECKSUM_VER_13___ )) ||
                        (( host_ptr->sample_sensor_checksum == QUANTA_SAMPLE_PROFILE_CHECKSUM_VER_13_53b )) ||
                        (( host_ptr->sample_sensor_checksum == QUANTA_SAMPLE_PROFILE_CHECKSUM_VER_13_47 ) && ( host_ptr->samples == QUANTA_SAMPLE_PROFILE_SENSORS_VER_13_47 )) ||
                        (( host_ptr->sample_sensor_checksum == QUANTA_SAMPLE_PROFILE_CHECKSUM_VER_13_42 ) && ( host_ptr->samples == QUANTA_SAMPLE_PROFILE_SENSORS_VER_13_42 )) ||
                        (( host_ptr->sample_sensor_checksum == QUANTA_SAMPLE_PROFILE_CHECKSUM_VER__3_29 ) && ( host_ptr->samples == QUANTA_SAMPLE_PROFILE_SENSORS_VER__3_29 )))
                    {
                        /* TODO: can also add search for missing sensors */
                        ilog ("%s -----------------------------------------------\n", host_ptr->hostname.c_str());
                        ilog ("%s is a Quanta server based on sensor sample data\n", host_ptr->hostname.c_str());
                        ilog ("%s -----------------------------------------------\n", host_ptr->hostname.c_str());
                        host_ptr->quanta_server = true ;
                    }

                    /* Create a sensor model from 'this' sample data */
                    if ( bmc_create_sensor_model ( host_ptr ) != PASS )
                    {
                        elog ("%s failed to create sensor model (in sysinv)\n",
                                  host_ptr->hostname.c_str());
                    }
                }

                if ( host_ptr->profile_sensor_checksum == 0 )
                {
                    host_ptr->profile_sensor_checksum =
                    checksum_sensor_profile ( host_ptr->hostname,
                                              host_ptr->sensors,
                                             &host_ptr->sensor[0]);
                }

                if (( host_ptr->sensors == 0 ) || ( host_ptr->groups == 0 ))
                {
                    elog ("%s has read %d sensors but cannot process with no sensor model (%d:%d)\n",
                              host_ptr->hostname.c_str(),
                              host_ptr->thread_extra_info.samples,
                              host_ptr->sensors,
                              host_ptr->groups);

                    _stage_change ( host_ptr->hostname,
                                    host_ptr->monitor_ctrl.stage,
                                    HWMON_SENSOR_MONITOR__START );
                }
                else
                {
                    blog ("%s has read %d sensors ... processing results\n",
                              host_ptr->hostname.c_str(), host_ptr->samples);

                    _stage_change ( host_ptr->hostname,
                                    host_ptr->monitor_ctrl.stage,
                                    HWMON_SENSOR_MONITOR__UPDATE );
                }
                break ;
            }

            /******************************************************************
             *
             * The UPDATE stage translates the string based sensor sample
             * data's 'status' to a severity and adds that to the sensors'
             * sample_severity member in the sensor list.
             *
             *     host_ptr->sensor[MAX_SENSORS].sample_severity
             *
             * Stage Transition:
             *
             *  Success path -> HWMON_SENSOR_MONITOR__HANDLE
             *  Failure Path -> HWMON_SENSOR_MONITOR__FAIL
             *
             *****************************************************************/
            case HWMON_SENSOR_MONITOR__UPDATE:
            {
                if ( host_ptr->sensor_query_count++ == START_DEBOUCE_COUNT )
                {
                    /* onetime log showing debounce mode started */
                    ilog ("%s sensor status deboucing enabled\n", host_ptr->hostname.c_str());
                }

                daemon_signal_hdlr ();

                /* handle clearing the config alarm if its raised but we are
                 * now at a point where the sensors are readable */
                if ( host_ptr->alarmed_config == true )
                {
                    host_ptr->alarmed_config = false ;
                    hwmonAlarm_clear ( host_ptr->hostname, HWMON_ALARM_ID__SENSORCFG, "profile", REASON_OK );
                }

                if ( bmc_update_sensors ( host_ptr ) == PASS )
                {
                    if ( ( rc = bmc_set_group_state ( host_ptr, "enabled" ) ) == PASS )
                    {
                        _stage_change ( host_ptr->hostname,
                                        host_ptr->monitor_ctrl.stage,
                                        HWMON_SENSOR_MONITOR__HANDLE );
                    }
                    else
                    {
                        elog ("%s failed to set group state to 'enabled' (in sysinv) (rc:%d)\n",
                                  host_ptr->hostname.c_str(), rc);

                        _stage_change  ( host_ptr->hostname,
                                         host_ptr->monitor_ctrl.stage,
                                         HWMON_SENSOR_MONITOR__FAIL );
                    }
                }
                else
                {
                    elog ("%s failed to update sensor data (in hwmon) (rc:%d)\n",
                              host_ptr->hostname.c_str(), rc);

                    _stage_change ( host_ptr->hostname,
                                    host_ptr->monitor_ctrl.stage,
                                    HWMON_SENSOR_MONITOR__FAIL );
                }
                break ;
            }
            case HWMON_SENSOR_MONITOR__HANDLE:
            {
                /**************************************************************
                 *
                 * Loop over all the sensors handling their current severity.
                 *
                 * At this point the new severities are in
                 * sensor_ptr->sample_severity.
                 *
                 * After a sensor is serviced in this loop that
                 * sensor_ptr->sample_severity is copied to ptr->severity
                 * to be compared against on the next audit interval.
                 *
                 *************************************************************/
                for ( int i = 0 ; i < host_ptr->sensors ; i++ )
                {
                    /*
                     *   This variable controls whether status change actions
                     *   need to be taken at the end of this loop for sensor
                     *   in context. Assume sensor status is not changed.
                     */
                    bool mod_status = false ;

                    /* lets use a local pointer to make the code easier to read */
                    sensor_type * ptr = &host_ptr->sensor[i] ;

                    /* Local copy of new severity */
                    sensor_severity_enum severity = ptr->sample_severity ;

                    /* Things can get a little busy so lets make sure we
                     * service the signal handler and incoming http requests
                     * from sysinv.
                     */
                    daemon_signal_hdlr ();
                    hwmonHttp_server_look ();

                    /* Internasl error checking ; never seen but just in case.
                     * Skip over and swerr about null sensor name */
                    if ( ptr->sensorname.empty() )
                    {
                        slog ("%s %d sensor name is empty\n", host_ptr->hostname.c_str(), i );
                        continue ;
                    }

                    if ( ptr->updated == false )
                    {
                        host_ptr->accounting_bad_count++ ;

                        /*
                         * Force a sensor MINOR if we fail to get status from
                         * it NOT_FOUND_COUNT_BEFORE_MINOR or more times in a row
                         *
                         * This debounces the one of sensor update misses but the
                         * log above at least shows if/when this is happening.
                         */
                        if ( ++ptr->not_updated_status_change_count >= NOT_FOUND_COUNT_BEFORE_MINOR )
                        {
                            severity = HWMON_SEVERITY_MINOR ;
                        }
                    }
                    else
                    {
                        ptr->not_updated_status_change_count = 0 ;
                    }

                    if ( severity != ptr->severity)
                    {
                        ilog ("%s %s status change ; %s:%s -> %s\n",
                                  host_ptr->hostname.c_str(),
                                  ptr->sensorname.c_str(),
                                  get_severity(ptr->severity).c_str(),
                                  ptr->status.c_str(),
                                  get_severity(severity).c_str());
                    }

                    blog1 ("%s %s curr:%s this:%s last:%s\n",
                               host_ptr->hostname.c_str(),
                               ptr->sensorname.c_str(),
                               ptr->status.c_str(),
                               ptr->sample_status.c_str(),
                               ptr->sample_status_last.c_str());

                    if ( severity == HWMON_SEVERITY_GOOD )
                    {
                        if ( ptr->status.compare("ok") )
                        {
                            /* don't bother printing a log for sensors that
                             * go from offline to ok */
                            if ( ptr->status != "offline" )
                            {
                                ilog ("%s %s is ok (was %s)\n",
                                          host_ptr->hostname.c_str(),
                                          ptr->sensorname.c_str(),
                                          ptr->status.c_str());
                            }

                            /* last state was not 'ok' */
                            mod_status  = true ;
                            ptr->status = "ok" ;
                            clear_ignored_state (ptr );
                            clear_logged_state (ptr );
                        }

                        /* TODO: verify clearing sensor that has cleared over a process restart */
                        if ((( ptr->suppress == false ) && ( ptr->severity != HWMON_SEVERITY_GOOD )) ||
                             ((ptr->alarmed  == true ) || ( ptr->degraded == true )))
                        {
                            hwmonHostClass::manage_sensor_state ( host_ptr->hostname, ptr , HWMON_SEVERITY_GOOD );
                        }
                    }
                    else
                    {
                        /* Handle transition from offline to online
                         *  - clear any alarm that exhists for a sensor
                         *    coming out of the offline state is no longer
                         *    offline.
                         **/
                        if (( severity != HWMON_SEVERITY_OFFLINE ) && ( !ptr->status.compare("offline") ))
                        {
                            wlog ("%s %s sensor returned from '%s' with '%s' severity [alarmed:%s]\n",
                                      host_ptr->hostname.c_str(),
                                      ptr->sensorname.c_str(),
                                      ptr->status.c_str(),
                                      get_severity(severity).c_str(),
                                      ptr->alarmed ? "Yes" : "No");

                            /* Clear the alarm and allow it to be re-raised if the issue exists */
                            clear_asserted_alarm ( host_ptr->hostname, HWMON_ALARM_ID__SENSOR, ptr, REASON_ONLINE );
                        }

                        if ( severity == HWMON_SEVERITY_OFFLINE )
                        {
                            if ( ptr->status.compare("offline"))
                            {
                                if ( ptr->alarmed == true )
                                {
                                    hwmonAlarm_clear ( host_ptr->hostname, HWMON_ALARM_ID__SENSOR, ptr->sensorname, REASON_OFFLINE );
                                    ptr->alarmed  = false ;
                                }
                                ptr->degraded = false ;

                                if  ( ptr->critl.logged || ptr->major.logged || ptr->minor.logged )
                                {
                                     hwmonLog_clear ( host_ptr->hostname, HWMON_ALARM_ID__SENSOR, ptr->sensorname, REASON_OFFLINE );
                                     ptr->critl.logged = ptr->major.logged = ptr->minor.logged = false ;
                                }
                                mod_status  = true      ;
                                blog ("%s %s sensor status change '%s' -> 'offline'\n",
                                          host_ptr->hostname.c_str(),
                                          ptr->status.c_str(),
                                          ptr->sensorname.c_str());
                                ptr->status = "offline" ;
                            }
                        }
                        else if ( severity == HWMON_SEVERITY_MINOR )
                        {
                            /* logs and alarms state changes are handled when the ignore
                             * action is set in the modify handler so there is no need
                             * to call the manager in the ignore case */
                            if (( ptr->suppress == false ) && ( ptr->actions_minor.compare (HWMON_ACTION_IGNORE)))
                            {
                                hwmonHostClass::manage_sensor_state ( host_ptr->hostname, ptr, HWMON_SEVERITY_MINOR );
                            }
                            else
                            {
                                if ( ptr->alarmed == true )
                                {
                                    /* We may have transitioned to ignore from an alarm state so check and clear if an alarm exists */
                                    clear_asserted_alarm ( host_ptr->hostname, HWMON_ALARM_ID__SENSOR, ptr, REASON_IGNORED );
                                }
                                clear_logged_state  ( ptr ) ;
                            }

                            /* still maintain the status
                             * ... if not minor then set it to minor */
                            if ( ptr->status.compare("minor") )
                            {
                                ptr->status = "minor" ;
                                mod_status = true ;
                            }
                        }
                        else if ( severity == HWMON_SEVERITY_MAJOR )
                        {
                            /* logs and alarms state changes are handled when the ignore
                             * action is set in the modify handler so there is no need
                             * to call the manager in the ignore case */
                            if (( ptr->suppress == false ) && ( ptr->actions_major.compare (HWMON_ACTION_IGNORE)))
                            {
                                hwmonHostClass::manage_sensor_state ( host_ptr->hostname, ptr, HWMON_SEVERITY_MAJOR );
                            }
                            else
                            {
                                if ( ptr->alarmed == true )
                                {
                                    /* We may have transitioned to ignore from an alarm state so check and clear if an alarm exists */
                                    clear_asserted_alarm ( host_ptr->hostname, HWMON_ALARM_ID__SENSOR, ptr, REASON_IGNORED );
                                }
                                clear_logged_state  ( ptr ) ;
                            }

                            /* if not major then set it to major */
                            if ( ptr->status.compare("major") )
                            {
                                ptr->status = "major" ;
                                mod_status = true ;
                            }
                        }
                        else if (( severity == HWMON_SEVERITY_CRITICAL ) ||
                                 ( severity == HWMON_SEVERITY_NONRECOVERABLE ))
                        {
                            /* log and alarm state changes are handled when the ignore
                             * action is set in the modify handler so there is no need
                             * to call the manager in the ignore case */
                            if (( ptr->suppress == false ) && ( ptr->actions_critl.compare (HWMON_ACTION_IGNORE)))
                            {
                                if ( !ptr->actions_critl.compare (HWMON_ACTION_RESET))
                                {
                                    if ( host_ptr->monitor == false )
                                    {
                                        /* Ignore event while we are not monitoring */
                                        ilog ("%s %s ignoring 'reset action' while not monitoring\n",
                                                  host_ptr->hostname.c_str(),
                                                  ptr->sensorname.c_str());
                                    }
                                    else
                                    {
                                        if ( ptr->critl.alarmed == false )
                                        {
                                            hwmonAlarm_critical ( host_ptr->hostname, HWMON_ALARM_ID__SENSOR,
                                                                  ptr->sensorname, REASON_RESETTING ) ;
                                        }
                                        clear_alarmed_state ( ptr );
                                        set_alarmed_severity ( ptr, FM_ALARM_SEVERITY_CRITICAL );

                                        if ( ptr->degraded == false )
                                        {
                                            ptr->degraded = true ;
                                        }

                                        clear_ignored_state ( ptr );
                                        clear_logged_state ( ptr );

                                        /* Send reset request to mtcAgent */
                                        wlog ("%s requesting 'reset' due to critical '%s' sensor\n",
                                                  host_ptr->hostname.c_str(),
                                                  ptr->sensorname.c_str());

                                        hwmon_send_event ( host_ptr->hostname,
                                                           MTC_EVENT_HWMON_RESET,
                                                           ptr->sensorname.data());
                                    }
                                }
                                else if ( !ptr->actions_critl.compare (HWMON_ACTION_POWERCYCLE))
                                {
                                    if ( host_ptr->monitor == false )
                                    {
                                        /* Ignore event while we are not monitoring */
                                        ilog ("%s %s ignoring 'power-cycle action' while not monitoring\n",
                                                  host_ptr->hostname.c_str(),
                                                  ptr->sensorname.c_str());
                                    }
                                    else
                                    {
                                        if ( ptr->critl.alarmed == false )
                                        {
                                            hwmonAlarm_critical ( host_ptr->hostname, HWMON_ALARM_ID__SENSOR,
                                                                  ptr->sensorname, REASON_POWERCYCLING ) ;
                                        }
                                        clear_alarmed_state ( ptr );

                                        set_alarmed_severity ( ptr, FM_ALARM_SEVERITY_CRITICAL );

                                        if ( ptr->degraded == false )
                                        {
                                            ptr->degraded = true ;
                                        }

                                        clear_ignored_state ( ptr );
                                        clear_logged_state  ( ptr );

                                        wlog ("%s requesting 'powercycle' due to critical '%s' sensor\n",
                                                  host_ptr->hostname.c_str(),
                                                  ptr->sensorname.c_str());

                                        /* Send reset request to mtcAgent */
                                        hwmon_send_event ( host_ptr->hostname,
                                                           MTC_EVENT_HWMON_POWERCYCLE,
                                                           ptr->sensorname.data());
                                    }
                                }
                                else
                                {
                                    hwmonHostClass::manage_sensor_state ( host_ptr->hostname, ptr, HWMON_SEVERITY_CRITICAL );
                                }
                            }
                            else
                            {
                                if ( ptr->alarmed == true )
                                {
                                    /* We may have transitioned to ignore from an alarm state so check and clear if an alarm exists */
                                    clear_asserted_alarm ( host_ptr->hostname, HWMON_ALARM_ID__SENSOR, ptr, REASON_IGNORED );
                                }
                                else
                                {
                                    blog2 ("%s %s is not alarmed\n", host_ptr->hostname.c_str(), ptr->sensorname.c_str() );
                                }
                                clear_logged_state  ( ptr ) ;
                            }

                            /* if not critical then set it to critical */
                            if ( ptr->status.compare("critical") )
                            {
                                ptr->status = "critical" ;
                                mod_status = true ;
                            }
                        }
                        else
                        {
                             slog ("%s unknown severity (%d)\n",  host_ptr->hostname.c_str(), severity );
                        }
                    } /* end else that look at non-good severities */

                    if ( mod_status == true )
                    {
                        hwmonHttp_mod_sensor ( host_ptr->hostname, host_ptr->event , ptr->uuid, "status" , ptr->status );
                    }
                    ptr->severity = severity ;

                } /* end for loop over all sensors */
                if ( host_ptr->bmc_fw_version.empty() )
                {
                    string fn = (IPMITOOL_OUTPUT_DIR + host_ptr->hostname + "_mc_info") ;
                    if ( daemon_is_file_present ( fn.data() ) )
                    {
                        host_ptr->bmc_fw_version =
                            get_bmc_version_string ( host_ptr->hostname,
                                                     fn.data() );
                    }
                    if ( !host_ptr->bmc_fw_version.empty() )
                    {
                        ilog ("%s bmc fw version: %s\n",
                                  host_ptr->hostname.c_str(),
                                  host_ptr->bmc_fw_version.c_str());
                    }
                }

                /* Start the next group interval timer */
                if ( host_ptr->interval < HWMON_MIN_AUDIT_INTERVAL )
                {
                    ilog ("%s monitor interval set to a %d secs cadence (%d)\n",
                              host_ptr->hostname.c_str(),
                              HWMON_DEFAULT_AUDIT_INTERVAL,
                              host_ptr->interval);
                    host_ptr->interval = HWMON_DEFAULT_AUDIT_INTERVAL ;
                    interval_change_handler ( host_ptr );
                }

                /* exit sensor model relearn mode if we have sensors and groups */
                if (( host_ptr->relearn == true ) &&
                    ( host_ptr->sensors ) && ( host_ptr->groups ))
                {
                    mtcTimer_reset ( host_ptr->relearnTimer );
                    host_ptr->relearn = false ;
                    plog ("%s sensor model relearn complete\n",
                              host_ptr->hostname.c_str());
                }

                mtcTimer_start ( host_ptr->monitor_ctrl.timer,
                                 hwmonTimer_handler,
                                 host_ptr->interval );

                _stage_change ( host_ptr->hostname,
                                host_ptr->monitor_ctrl.stage,
                                HWMON_SENSOR_MONITOR__DELAY );
                break ;
            }

            case HWMON_SENSOR_MONITOR__FAIL:
            {
                host_ptr->ping_info.ok = false ;
                host_ptr->bmc_thread_ctrl.retries = 0 ;

                mtcTimer_reset ( host_ptr->monitor_ctrl.timer );

                if ( host_ptr->bmc_thread_info.status )
                {
                    elog ("%s sensor monitoring failure (rc:%d)\n",
                              host_ptr->hostname.c_str(),
                              host_ptr->bmc_thread_info.status );
                    if ( host_ptr->bmc_thread_info.data.length() )
                    {
                        string _temp = host_ptr->bmc_thread_info.status_string ;
                        size_t pos = _temp.find ("-f", 0) ;

                        if ( pos != std::string::npos )
                        {
                            /* don't log the password filename */
                            elog ("%s ... %s\n",
                                      host_ptr->hostname.c_str(),
                                      _temp.substr(0,pos).c_str());
                        }
                        else
                        {
                            elog ("%s ... %s\n",
                                      host_ptr->hostname.c_str(),
                                      host_ptr->bmc_thread_info.status_string.c_str());
                        }
                    }
                }

                if ( host_ptr->bmc_thread_ctrl.id )
                {
                    slog ("%s sensor monitor thread is unexpectedly active ; handling as failure\n",
                              host_ptr->hostname.c_str());

                    thread_kill ( host_ptr->bmc_thread_ctrl, host_ptr->bmc_thread_info );
                }

                if ( host_ptr->interval )
                {
                    bmc_set_group_state ( host_ptr, "failed" ) ;

                    _stage_change  ( host_ptr->hostname,
                                     host_ptr->monitor_ctrl.stage,
                                     HWMON_SENSOR_MONITOR__START );
                }
                else
                {
                    /* TODO: Error case that should not happen ; need to force reprovision */
                    _stage_change ( host_ptr->hostname,
                                    host_ptr->monitor_ctrl.stage,
                                    HWMON_SENSOR_MONITOR__IDLE );
                }
                break ;
            }
            case HWMON_SENSOR_MONITOR__STAGES:
            default:
            {
                slog ("%s Invalid stage (%d)\n",
                          host_ptr->hostname.c_str(),
                          host_ptr->monitor_ctrl.stage );

                _stage_change ( host_ptr->hostname,
                                host_ptr->monitor_ctrl.stage,
                                HWMON_SENSOR_MONITOR__START );
            }
        }
    }
    return (rc);
}

/* Delete Handler
 * ----------------- */
int hwmonHostClass::delete_handler ( struct hwmonHostClass::hwmon_host * host_ptr )
{
    if ( host_ptr == NULL )
    {
        slog ("delete handler called with null pointer\n");
        return (FAIL_NULL_POINTER);
    }

    switch ( host_ptr->delStage )
    {
        case HWMON_DEL__START:
        {
            ilog ("%s Delete Operation Started\n", host_ptr->hostname.c_str());
            host_ptr->retries = 0 ;

            if ( host_ptr->bmc_thread_ctrl.stage != THREAD_STAGE__IDLE )
            {
                int delay = THREAD_POST_KILL_WAIT ;
                thread_kill ( host_ptr->bmc_thread_ctrl , host_ptr->bmc_thread_info) ;

                ilog ("%s thread active ; sending kill ; waiting %d seconds\n",
                          host_ptr->hostname.c_str(), delay );
                mtcTimer_reset ( host_ptr->hostTimer );
                mtcTimer_start ( host_ptr->hostTimer, hwmonTimer_handler, delay );
                host_ptr->delStage = HWMON_DEL__WAIT ;
            }
            else
            {
                host_ptr->delStage = HWMON_DEL__DONE ;
            }


            break ;
        }
        case HWMON_DEL__WAIT:
        {
            if ( mtcTimer_expired ( host_ptr->hostTimer ) )
            {
                if ( host_ptr->bmc_thread_ctrl.stage != THREAD_STAGE__IDLE )
                {
                    if ( host_ptr->retries++ < 3 )
                    {
                        wlog ("%s still waiting on active thread ; sending another kill signal (try %d or %d)\n",
                                  host_ptr->hostname.c_str(), host_ptr->retries, 3 );

                        thread_kill ( host_ptr->bmc_thread_ctrl, host_ptr->bmc_thread_info ) ;
                        mtcTimer_start ( host_ptr->hostTimer, hwmonTimer_handler, THREAD_POST_KILL_WAIT );
                        break ;
                    }
                    else
                    {
                        elog ("%s thread refuses to stop ; giving up ...\n",
                                  host_ptr->hostname.c_str());
                    }
                }
                host_ptr->delStage = HWMON_DEL__DONE ;
            }
            break ;
        }
        case HWMON_DEL__DONE:
        {
            /* ok now delete the host */
            del_host ( host_ptr->hostname );
            this->host_deleted = true ;
            break ;
        }
        default:
        {
            ilog ("%s invalid delete stage (%d) ; correcting ...\n", host_ptr->hostname.c_str(), host_ptr->delStage );
            host_ptr->delStage = HWMON_DEL__START ;
        }
    }
    return (PASS);
}


/*****************************************************************************
 *
 * Name       : manage_startup_states
 *
 * Description: Manage the sensor startup states.
 *
 *              This means failure log, alarm and degraded states on
 *              startup for groups and sensors
 *
 *****************************************************************************/

bool hwmonHostClass::manage_startup_states ( struct hwmonHostClass::hwmon_host * host_ptr )
{
    int rc = PASS ;
    if ( host_ptr )
    {

        std::list<hwmonAlarm_entity_status_type>::iterator _iter_ptr ;
        std::list<hwmonAlarm_entity_status_type> alarm_list ;
        alarm_list.clear();

        /**********************    Manage Profile Alarms    ***********************/

        /* clear this config alarm as it is not used anymore - handles patchback case.
         * Its cheaper to send a clear than it is to query for it first */
        hwmonAlarm_clear ( host_ptr->hostname, HWMON_ALARM_ID__SENSORCFG, "sensor", REASON_OK );

#ifdef WANT_QUERY_SENSOR_CONFIG_ALARM
        /* We don't degrade for sensor config error - this is similar to a
         * BMC access error in mtcAgent where we only raise a minor alarm */
        if ( hwmon_alarm_query ( host_ptr->hostname, HWMON_ALARM_ID__SENSORCFG, "profile" ) != FM_ALARM_SEVERITY_CLEAR )
             host_ptr->alarmed_config = true ;
#endif
        if ( host_ptr->alarmed_config == false )
        {
            hwmonAlarm_clear ( host_ptr->hostname, HWMON_ALARM_ID__SENSORCFG, "profile", REASON_OK );
            host_ptr->alarmed_config = false ;
        }

        /**********************    Manage Group Alarms    ***********************/
        string entity = "host=" + host_ptr->hostname + ".sensorgroup=" ;

        /* 1. Query for all group alarms */
        rc = hwmonAlarm_query_entity ( host_ptr->hostname, entity, alarm_list );
        if ( rc != PASS )
        {
            elog ("%s sensorgroup alarm query failed\n", host_ptr->hostname.c_str() );
            return (FAIL_OPERATION);
        }

        /* 2. Search the alarm list for orphan groups
         *    - group alarms that are not in the current group list
         *    - should not occur but is a catch all for stuck group alarms */
        for ( _iter_ptr = alarm_list.begin(); _iter_ptr != alarm_list.end(); ++_iter_ptr )
        {
            bool found = false ;
            for ( int g = 0 ; g < host_ptr->groups ; g++ )
            {
                string _temp = entity + host_ptr->group[g].group_name ;
                if ( _iter_ptr->instance.compare(_temp) == 0 )
                {
                    found = true ;
                    break ;
                }
            }
            if ( found == false )
            {
                string groupname = _iter_ptr->instance.substr (entity.length()) ;
                wlog ("%s found orphan group alarm '%s' ; clearing\n", host_ptr->hostname.c_str(), groupname.c_str() );
                hwmonAlarm_clear ( host_ptr->hostname, HWMON_ALARM_ID__SENSORGROUP, groupname, REASON_DEPROVISIONED );
            }
        }

        /* 3. Look up each alarmed group and then manage that alarm */
        for ( int g = 0 ; g < host_ptr->groups ; g++ )
        {
            struct sensor_group_type * group_ptr = &host_ptr->group[g] ;
            bool found = false ;
            bool raise = false ;
            bool clear = false ;
            daemon_signal_hdlr ();

            if ( alarm_list.size() )
            {
                for ( _iter_ptr = alarm_list.begin(); _iter_ptr != alarm_list.end(); ++_iter_ptr )
                {
                    string _temp = entity + group_ptr->group_name ;
                    if ( _iter_ptr->instance.compare(_temp) == 0 )
                    {
                        ilog ("%s '%s' group '%s' alarm already set\n",
                                  host_ptr->hostname.c_str(),
                                  host_ptr->group[g].group_name.c_str(),
                                  alarmUtil_getSev_str(_iter_ptr->severity).c_str());
                        found = true ;
                        break ;
                    }
                }
            }

            /* Note: if found == true then the group_ptr points to the group that
             *       has the alarm raised and _iter_ptr point to the alarm info */

            /* Determine if this alarm needs to be raised or cleared ... or left alone
             * Database state takes precidence of all */
            if ( group_ptr->group_state.compare("failed") == 0 )
            {
                group_ptr->failed  = true ;
                group_ptr->alarmed = true ;
                if ( found == true )
                {
                    if ( _iter_ptr->severity != FM_ALARM_SEVERITY_MAJOR )
                    {
                        slog ("%s %s group alarm severity incorrect (%d:%s) ; correcting \n",
                                  host_ptr->hostname.c_str(),
                                  _iter_ptr->entity.c_str(),
                                  _iter_ptr->severity,
                                  alarmUtil_getSev_str(_iter_ptr->severity).c_str());
                        raise = true ;
                    }
                }
                else
                {
                    raise = true ;
                }
            }
            else
            {
                group_ptr->failed  = false ;
                group_ptr->alarmed = false ;
                if ( found == true )
                {
                    clear = true ;
                }
            }

            if ( raise == true )
            {
                group_ptr->failed  = true ;
                group_ptr->alarmed = true ;
                hwmonAlarm_major ( host_ptr->hostname, HWMON_ALARM_ID__SENSORGROUP, group_ptr->group_name, REASON_DEGRADED );
            }

            if ( clear == true )
            {
                group_ptr->failed  = false ;
                group_ptr->alarmed = false ;
                hwmonAlarm_clear ( host_ptr->hostname, HWMON_ALARM_ID__SENSORGROUP, group_ptr->group_name, REASON_OK );
            }
        }

        /**********************    Manage Sensor Alarms    ***********************/

        /* 1. Query Sensor Alarm States from FM */
        entity = "host=" + host_ptr->hostname + ".sensor=" ;

        rc = hwmonAlarm_query_entity ( host_ptr->hostname, entity, alarm_list );
        if ( rc != PASS )
        {
            elog ("%s sensor alarm query failed\n", host_ptr->hostname.c_str() );
            return (FAIL_OPERATION);
        }

        /* 2. Search the alarm list for orphan sensors
         *    - sensor alarms that are not in the current sensor list
         *    - should not occur but is a catch all for stuck sensor alarms */
        for ( _iter_ptr = alarm_list.begin (); _iter_ptr != alarm_list.end () ; ++_iter_ptr )
        {
            bool found = false ;
            for ( int s = 0 ; s < host_ptr->sensors ; s++ )
            {
                string _temp = entity + host_ptr->sensor[s].sensorname ;
                if ( _iter_ptr->instance.compare(_temp) == 0 )
                {
                    ilog ("%s '%s' sensor '%s' alarm already set\n",
                               host_ptr->hostname.c_str(),
                               host_ptr->sensor[s].sensorname.c_str(),
                               alarmUtil_getSev_str(_iter_ptr->severity).c_str());
                    found = true ;
                    break ;
                }
            }
            if ( found == false )
            {
                string sensorname = _iter_ptr->instance.substr (entity.length()) ;
                wlog ("%s found orphan sensor alarm '%s' ; clearing\n", host_ptr->hostname.c_str(), sensorname.c_str() );
                hwmonAlarm_clear ( host_ptr->hostname, HWMON_ALARM_ID__SENSOR, sensorname, REASON_DEPROVISIONED );
            }
        }

        /* 3. manage the state of sensors alarms */
        for ( int s = 0 ; s < host_ptr->sensors ; s++ )
        {
             std::list<hwmonAlarm_entity_status_type>::iterator _iter_ptr ;
             sensor_type * sensor_ptr = &host_ptr->sensor[s] ;
             string reason = REASON_OK ;
             bool found = false ;
             bool clear = false ;
             bool minor = false ;
             bool major = false ;
             bool critl = false ;

             daemon_signal_hdlr ();

             if ( alarm_list.size() )
             {
                 for ( _iter_ptr  = alarm_list.begin () ;
                       _iter_ptr != alarm_list.end () ;
                     ++_iter_ptr )
                 {
                     string _temp = entity + sensor_ptr->sensorname ;
                     if ( _iter_ptr->instance.compare(_temp) == 0 )
                     {
                         found = true ;
                         break ;
                     }
                 }
             }

             /* Note: if found == true then the sensor_ptr points to the sensor that
              *       has the alarm raised and _iter_ptr point to the alarm info */

             /* Determine if this alarm needs to be raised or cleared ... or left alone
              * Database state takes precidence of all */
             if ( sensor_ptr->status.compare("ok") == 0 )
             {
                 clear_alarmed_state  ( sensor_ptr );
                 clear_degraded_state ( sensor_ptr );
                 if ( found == true )
                 {
                     clear = true ;
                 }
             }
             else if ( sensor_ptr->status.compare("offline") == 0 )
             {
                 clear_alarmed_state  ( sensor_ptr );
                 clear_degraded_state ( sensor_ptr );
                 if ( found == true )
                 {
                     clear = true ;
                 }
             }
             else if ( sensor_ptr->status.compare("minor") == 0 )
             {
                 if ( sensor_ptr->actions_minor.compare("alarm"))
                 {
                     if ( found == true )
                     {
                         clear = true ;
                     }
                     if ( sensor_ptr->actions_minor.compare("log") == 0 )
                     {
                         set_logged_severity ( sensor_ptr, FM_ALARM_SEVERITY_MINOR );
                         reason = REASON_SET_TO_LOG ;
                     }
                     if ( sensor_ptr->actions_major.compare("ignore") == 0 )
                     {
                         set_ignored_severity ( sensor_ptr, FM_ALARM_SEVERITY_MINOR );
                         reason = REASON_IGNORED ;
                     }
                 }
                 else if ( sensor_ptr->suppress == true )
                 {
                     if ( found == true )
                     {
                         reason = REASON_SUPPRESSED ;
                         clear = true ;
                     }
                 }
                 /**
                  *  else this is an alarm case ...
                  *    - if no alarm found then raise the minor alarm
                  *    - if alarm found but not in proper severity then
                  *      raise the minor alarm
                  **/
                 else
                 {
                     set_alarmed_severity ( sensor_ptr , FM_ALARM_SEVERITY_MINOR );
                     clear_degraded_state ( sensor_ptr );
                     if ((  found == false ) ||
                         (( found == true ) && ( _iter_ptr->severity != FM_ALARM_SEVERITY_MINOR )))
                     {
                         /* correct the severity of the alarm */
                         minor = true ;
                     }
                 }
             }
             else if ( sensor_ptr->status.compare("major") == 0 )
             {
                 if ( sensor_ptr->actions_major.compare("alarm"))
                 {
                     if ( found == true )
                     {
                         clear = true ;
                     }
                     if ( sensor_ptr->actions_major.compare("log") == 0 )
                     {
                         set_logged_severity ( sensor_ptr, FM_ALARM_SEVERITY_MAJOR );
                         reason = REASON_SET_TO_LOG ;
                     }
                     if ( sensor_ptr->actions_major.compare("ignore") == 0 )
                     {
                         set_ignored_severity ( sensor_ptr, FM_ALARM_SEVERITY_MAJOR ) ;
                         reason = REASON_IGNORED ;
                     }
                 }
                 else if ( sensor_ptr->suppress == true )
                 {
                     if ( found == true )
                     {
                         reason = REASON_SUPPRESSED ;
                         clear = true ;
                     }
                 }
                 /**
                  *  else this is an alarm case ...
                  *    - if no alarm found then raise the major alarm
                  *    - if alarm found but not in proper severity then
                  *      raise the major alarm
                  **/
                 else
                 {
                     set_alarmed_severity ( sensor_ptr , FM_ALARM_SEVERITY_MAJOR );
                     set_degraded_state   ( sensor_ptr );
                     if ((  found == false ) ||
                         (( found == true ) && ( _iter_ptr->severity != FM_ALARM_SEVERITY_MAJOR )))
                     {
                         /* correct the severity of the alarm */
                         major = true ;
                     }
                 }
             }
             else if ( sensor_ptr->status.compare("critical") == 0 )
             {
                 if ( sensor_ptr->actions_critl.compare("alarm"))
                 {
                     if ( found == true )
                     {
                         clear = true ;
                     }
                     if ( sensor_ptr->actions_critl.compare("log") == 0 )
                     {
                         set_logged_severity ( sensor_ptr, FM_ALARM_SEVERITY_CRITICAL ) ;
                         reason = REASON_SET_TO_LOG ;
                     }
                     if ( sensor_ptr->actions_critl.compare("ignore") == 0 )
                     {
                         set_ignored_severity ( sensor_ptr , FM_ALARM_SEVERITY_CRITICAL ) ;
                         reason = REASON_IGNORED ;
                     }
                 }
                 else if ( sensor_ptr->suppress == true )
                 {
                     if ( found == true )
                     {
                         reason = REASON_SUPPRESSED ;
                         clear = true ;
                     }
                 }
                 /**
                  *  else this is an alarm case ...
                  *    - if no alarm found then raise the critical alarm
                  *    - if alarm found but not in proper severity then
                  *      raise the critical alarm
                  **/
                 else
                 {
                     set_alarmed_severity ( sensor_ptr , FM_ALARM_SEVERITY_CRITICAL );
                     set_degraded_state   ( sensor_ptr );
                     if ((  found == false ) ||
                         (( found == true ) && ( _iter_ptr->severity != FM_ALARM_SEVERITY_CRITICAL )))
                     {
                         /* correct the severity of the alarm */
                         critl = true ;
                     }
                 }
             }

             if ( clear == true )
             {
                 clear_alarmed_state ( sensor_ptr );
                 clear_degraded_state ( sensor_ptr );
                 hwmonAlarm_clear ( host_ptr->hostname, HWMON_ALARM_ID__SENSOR, sensor_ptr->sensorname, reason );
             }

             else if ( minor == true )
             {
                 clear_degraded_state ( sensor_ptr );
                 set_alarmed_severity ( sensor_ptr, FM_ALARM_SEVERITY_MINOR );
                 hwmonAlarm_minor ( host_ptr->hostname, HWMON_ALARM_ID__SENSOR, sensor_ptr->sensorname, REASON_DEGRADED );
             }

             else if ( major == true )
             {
                 set_degraded_state   ( sensor_ptr );
                 set_alarmed_severity ( sensor_ptr, FM_ALARM_SEVERITY_MAJOR );
                 hwmonAlarm_major ( host_ptr->hostname, HWMON_ALARM_ID__SENSOR, sensor_ptr->sensorname, REASON_DEGRADED );
             }

             else if ( critl == true )
             {
                 set_degraded_state ( sensor_ptr );
                 set_alarmed_severity ( sensor_ptr, FM_ALARM_SEVERITY_CRITICAL);
                 hwmonAlarm_critical ( host_ptr->hostname, HWMON_ALARM_ID__SENSOR, sensor_ptr->sensorname, REASON_DEGRADED );
             }
             // sensorState_print ( host_ptr->hostname, sensor_ptr );
         }
    }
    else
    {
        rc = FAIL_NULL_POINTER ;
    }
    return (rc);
}

/*****************************************************************************
 *
 * Name       : monitor_now
 *
 * Description: Force monitor to occur immediately.
 *
 ****************************************************************************/

void hwmonHostClass::monitor_now ( struct hwmonHostClass::hwmon_host * host_ptr )
{
    if ( host_ptr )
    {
        if (  host_ptr->monitor_ctrl.stage == HWMON_SENSOR_MONITOR__DELAY )
        {
            mtcTimer_reset ( host_ptr->monitor_ctrl.timer );
            host_ptr->monitor_ctrl.timer.ring = true ;
            dlog ("%s force monitor now\n", host_ptr->hostname.c_str() );
        }
    }
    else
    {
        slog ("null host pointer\n");
    }
}

/*****************************************************************************
 *
 * Name        : monitor_soon
 *
 * Description: Force monitor to occur in 5 seconds.
 *
 ****************************************************************************/

void hwmonHostClass::monitor_soon ( struct hwmonHostClass::hwmon_host * host_ptr )
{
    if ( host_ptr )
    {
        int delay = MTC_SECS_5 ;

        wlog ("%s sensor monitoring FSM stage (%d) aborted\n",
                  host_ptr->hostname.c_str(),
                  host_ptr->monitor_ctrl.stage);

        if ( host_ptr->bmc_thread_ctrl.id )
        {
            ilog ("%s stopping current thread (%lu)\n", host_ptr->hostname.c_str(), host_ptr->bmc_thread_ctrl.id );
            thread_kill ( host_ptr->bmc_thread_ctrl, host_ptr->bmc_thread_info );

            /* have to wait a bit longer than THREAD_POST_KILL_WAIT for the thread kill to happen */
            delay += THREAD_POST_KILL_WAIT ;
        }

        _stage_change ( host_ptr->hostname,
                        host_ptr->monitor_ctrl.stage,
                        HWMON_SENSOR_MONITOR__DELAY) ;

        mtcTimer_reset ( host_ptr->monitor_ctrl.timer );
        mtcTimer_start ( host_ptr->monitor_ctrl.timer,
                         hwmonTimer_handler, delay );

        ilog ("%s sensor monitoring will resume in %d seconds\n",
                  host_ptr->hostname.c_str(), delay );
    }
    else
    {
        slog ("null host pointer\n");
    }
}