Merge remote-tracking branch 'origin/master' into f/centos8

Signed-off-by: Charles Short <charles.short@windriver.com>
Change-Id: I63728d8d3a20b98c3114ebead4bd3007fbe187b5
2021-05-19 15:25:20 -04:00 · 2021-05-19 15:25:20 -04:00 · 6c2905e665
parent 7cd3a20539 9f861eaaa5
commit 6c2905e665
64 changed files with 1816 additions and 619 deletions
--- a/bsp-files/filter_out_from_controller
+++ b/bsp-files/filter_out_from_controller
@ -25,6 +25,7 @@ mtce-guestServer
 nfscheck
 radvd
 config-gate-worker
+isolcpus-device-plugin
 kernel-rt
 kernel-module-igb-uio
 kernel-module-igb-uio-rt
@ -33,6 +34,7 @@ kernel-rt-modules-extra
 kmod-e1000e-rt
 kmod-i40e-rt
 kmod-iavf-rt
+kmod-ice-rt
 kmod-ixgbe-rt
 kmod-ixgbevf-rt
 kmod-igb_uio-rt
@ -53,3 +55,7 @@ openvswitch-config
 pci-irq-affinity-agent
 kvm-timer-advance
 sysinv-fpga-agent
+kernel-rt-headers
+kernel-rt-devel
+kernel-headers
+kernel-devel
--- a/bsp-files/filter_out_from_smallsystem
+++ b/bsp-files/filter_out_from_smallsystem
@ -13,6 +13,7 @@ kernel-rt-modules-extra
 kmod-e1000e-rt
 kmod-i40e-rt
 kmod-iavf-rt
+kmod-ice-rt
 kmod-ixgbe-rt
 kmod-ixgbevf-rt
 kmod-igb_uio-rt
@ -26,3 +27,5 @@ qat17-rt
 kernel-rt-tools
 kernel-rt-tools-libs
 kmod-drbd-rt
+kernel-rt-headers
+kernel-rt-devel
--- a/bsp-files/filter_out_from_smallsystem_lowlatency
+++ b/bsp-files/filter_out_from_smallsystem_lowlatency
@ -11,6 +11,7 @@ kernel-module-igb-uio
 kmod-e1000e
 kmod-i40e
 kmod-iavf
+kmod-ice
 kmod-ixgbe
 kmod-ixgbevf
 kmod-igb_uio
@ -23,3 +24,5 @@ kernel-tools
 kernel-tools-libs
 kmod-drbd
 kernel-modules-extra
+kernel-headers
+kernel-devel
--- a/bsp-files/filter_out_from_storage
+++ b/bsp-files/filter_out_from_storage
@ -69,6 +69,7 @@ influxdb
 influxdb-extensions
 io-monitor
 io-scheduler
+isolcpus-device-plugin
 isomd5sum
 ipxe-roms-qemu
 kernel-module-openvswitch
@ -120,8 +121,6 @@ nova-tests
 nova-api-proxy
 nova-placement-api
 novnc
-net-snmp
-net-snmp-config
 openstack-aodh-api
 openstack-aodh-commmon
 openstack-aodh-compat
@ -256,7 +255,6 @@ qemu-kvm-ev
 qemu-kvm-tools-ev
 radvd
 rubygem-rdoc
-snmp-ext
 task-cloud-compute
 task-cloud-controller
 tgt
@ -290,6 +288,7 @@ kernel-rt-modules-extra
 kmod-e1000e-rt
 kmod-i40e-rt
 kmod-iavf-rt
+kmod-ice-rt
 kmod-ixgbe-rt
 kmod-ixgbevf-rt
 kmod-igb_uio-rt
@ -304,7 +303,6 @@ kernel-rt-tools
 kernel-rt-tools-libs
 NaviCLI-Linux-64-x86-en_US
 kmod-drbd-rt
-snmp-audittrail
 wrs-ssl
 tpm2-tools
 tss2
@ -340,6 +338,11 @@ stx-oidc-auth-helm
 stx-cert-manager-helm
 stx-nginx-ingress-controller-helm
 stx-portieris-helm
+stx-snmp-helm
 stx-vault-helm
 sysinv-fpga-agent
 k8s-pod-recovery
+kernel-rt-headers
+kernel-rt-devel
+kernel-headers
+kernel-devel
--- a/bsp-files/filter_out_from_worker
+++ b/bsp-files/filter_out_from_worker
@ -81,8 +81,6 @@ nova-tests
 nova-api-proxy
 nova-placement-api
 novnc
-net-snmp
-net-snmp-config
 openldap-backend-bdb
 openldap-backend-dnssrv
 openldap-backend-hdb
@ -138,7 +136,6 @@ python-swiftclient
 python-wsme
 fm-mgr
 fm-rest-api
-snmp-ext
 sm
 sm-api
 sm-client
@ -258,6 +255,7 @@ kernel-rt-modules-extra
 kmod-e1000e-rt
 kmod-i40e-rt
 kmod-iavf-rt
+kmod-ice-rt
 kmod-ixgbe-rt
 kmod-ixgbevf-rt
 kmod-igb_uio-rt
@ -272,7 +270,6 @@ kernel-rt-tools
 kernel-rt-tools-libs
 NaviCLI-Linux-64-x86-en_US
 kmod-drbd-rt
-snmp-audittrail
 wrs-ssl
 tpm2-tools
 tss2
@ -301,5 +298,8 @@ stx-oidc-auth-helm
 stx-cert-manager-helm
 stx-nginx-ingress-controller-helm
 stx-portieris-helm
+stx-snmp-helm
 stx-vault-helm
 k8s-pod-recovery
+kernel-rt-headers
+kernel-rt-devel
--- a/bsp-files/filter_out_from_worker_lowlatency
+++ b/bsp-files/filter_out_from_worker_lowlatency
@ -81,8 +81,6 @@ nova-tests
 nova-api-proxy
 nova-placement-api
 novnc
-net-snmp
-net-snmp-config
 neutron-plugin-ml2
 neutron-server
 neutron-tests
@ -141,7 +139,6 @@ python-swiftclient
 python-wsme
 fm-mgr
 fm-rest-api
-snmp-ext
 sm
 sm-api
 sm-client
@ -261,6 +258,7 @@ kernel-module-igb-uio
 kmod-e1000e
 kmod-i40e
 kmod-iavf
+kmod-ice
 kmod-ixgbe
 kmod-ixgbevf
 kmod-igb_uio
@ -274,7 +272,6 @@ kernel-tools-libs
 kernel-modules-extra
 NaviCLI-Linux-64-x86-en_US
 kmod-drbd-rt
-snmp-audittrail
 wrs-ssl
 tpm2-tools
 tss2
@ -302,5 +299,8 @@ stx-oidc-auth-helm
 stx-cert-manager-helm
 stx-nginx-ingress-controller-helm
 stx-portieris-helm
+stx-snmp-helm
 stx-vault-helm
 k8s-pod-recovery
+kernel-headers
+kernel-devel
--- a/bsp-files/kickstarts/pre_disk_aio.cfg
+++ b/bsp-files/kickstarts/pre_disk_aio.cfg
@ -29,11 +29,12 @@
 ## ETCD_STOR_SIZE = 5GiB
 ## CEPH_MON_SIZE = 20GiB
 ## KUBELET_STOR_SIZE = 10GiB
+## DC_VAULT_SIZE = 15GiB
 ## RESERVED_PE = 16MiB (based on pesize=32768)
 ##
-## CGCS_PV_SIZE = (10 + 2*10 + 25 + 8 + 16 + 2 + 1 + 30 + 16 + 5 + 20 + 10)GiB + 16MiB/1024 = 163.02GiB
+## CGCS_PV_SIZE = (10 + 2*10 + 25 + 8 + 16 + 2 + 1 + 30 + 16 + 5 + 20 + 10 + 15)GiB + 16MiB/1024 = 178.02GiB
 ##
-##***************************************************************************************************
+##**********************************************************************************************************
 ## Small disk install - (for disks below 240GB)
 ##  - DB size is doubled to allow for upgrades
 ##
@ -50,11 +51,12 @@
 ## ETCD_STOR_SIZE = 5GiB
 ## CEPH_MON_SIZE = 20GiB
 ## KUBELET_STOR_SIZE = 10GiB
+## DC_VAULT_SIZE = 15GiB
 ## RESERVED_PE = 16MiB (based on pesize=32768)
 ##
-## CGCS_PV_SIZE = (10 + 2*5 + 20 + 8 + 16 + 2 + 1 + 30 + 16 + 5 + 20 + 10)GiB + 16MiB/1024 = 148.02GiB
+## CGCS_PV_SIZE = (10 + 2*5 + 20 + 8 + 16 + 2 + 1 + 30 + 16 + 5 + 20 + 10 + 15)GiB + 16MiB/1024 = 163.02GiB
 ##
-##***************************************************************************************************
+##*********************************************************************************************************
 ## Tiny disk install - (for disks below 154GB)
 ##
 ## NOTE: Tiny disk setup is mainly for StarlingX running in QEMU/KVM VM.
@ -89,15 +91,15 @@ EFI_SIZE=300
 #   which are DEFAULT_SMALL_DISK_SIZE
 #             MINIMUM_SMALL_DISK_SIZE
 default_small_disk_size=240
-minimum_small_disk_size=181
+minimum_small_disk_size=196
 sz=$(blockdev --getsize64 $rootfs_device)
 # Round CGCS_PV_SIZE to the closest upper value that can be divided by 1024.
 if [ $sz -gt $(($default_small_disk_size*$gb)) ] ; then
-    # Large disk: CGCS_PV_SIZE=164GiB*1024=167936
-    CGCS_PV_SIZE=167936
+    # Large disk: CGCS_PV_SIZE=179GiB*1024=183296
+    CGCS_PV_SIZE=183296
 elif [ $sz -ge $(($minimum_small_disk_size*$gb)) ] ; then
-    # Small disk: CGCS_PV_SIZE=149GiB*1024=152576
-    CGCS_PV_SIZE=152576
+    # Small disk: CGCS_PV_SIZE=164GiB*1024=167936
+    CGCS_PV_SIZE=167936
 else
    # Tiny disk: CGCS_PV_SIZE=43GiB*1024=44032
    # Using a disk with a size under 60GiB will fail.
--- a/bsp-files/kickstarts/pre_disk_setup_common.cfg
+++ b/bsp-files/kickstarts/pre_disk_setup_common.cfg
@ -167,6 +167,13 @@ else
        # Avoid wiping ceph osds if sysinv tells us so
        if [ ${WIPE_CEPH_OSDS} == "false" ]; then
            wipe_dev="true"
+
+            # Leave the disk untouched when pvs lists it next to a name starting with ceph
+            if pvs | grep -q "$dev *ceph"; then
+                wlog "skip rook provisioned disk $dev"
+                continue
+            fi
+
            part_numbers=( `parted -s $dev print | awk '$1 == "Number" {i=1; next}; i {print $1}'` )
            # Scanning the partitions looking for CEPH OSDs and
            # skipping any disk found with such partitions
@ -178,7 +185,15 @@ else
                    wipe_dev="false"
                    break
                fi
+
+                # Keep the disk when pvs lists this partition (<dev>N or <dev>pN) next to a name starting with ceph
+                if pvs | grep -q -e "${dev}${part_number} *ceph" -e "${dev}p${part_number} *ceph"; then
+                    wlog "Rook OSD found on $dev$part_number, skip wipe"
+                    wipe_dev="false"
+                    break
+                fi
            done
+
            if [ "$wipe_dev" == "false" ]; then
                continue
            fi
--- a/installer/pxe-network-installer/centos/build_srpm.data
+++ b/installer/pxe-network-installer/centos/build_srpm.data
@ -6,6 +6,6 @@ COPY_LIST="pxe-network-installer/* \
           /import/mirrors/CentOS/stx-installer/vmlinuz \
 "

-TIS_PATCH_VER=28
+TIS_PATCH_VER=PKG_GITREVCOUNT+13
 BUILD_IS_BIG=4
 BUILD_IS_SLOW=4
--- a/installer/pxe-network-installer/centos/pxe-network-installer.spec
+++ b/installer/pxe-network-installer/centos/pxe-network-installer.spec
@ -110,6 +110,7 @@ install -v -m 644 %{_sourcedir}/efi-centos-pxe-worker_lowlatency-install \
 install -v -m 644 %{_sourcedir}/efi-centos-pxe-smallsystem_lowlatency-install \
    %{buildroot}/pxeboot/pxelinux.cfg.files/efi-pxe-smallsystem_lowlatency-install-%{platform_release}

+ln -sf /pxeboot/EFI/grubx64.efi %{buildroot}/pxeboot/grubx64.efi

 sed -i "s/xxxSW_VERSIONxxx/%{platform_release}/g" \
    %{buildroot}/pxeboot/pxelinux.cfg.files/pxe-* \
--- a/mtce-common/src/common/bmcUtil.cpp
+++ b/mtce-common/src/common/bmcUtil.cpp
@ -274,9 +274,9 @@ void bmcUtil_create_pw_file ( thread_info_type * info_ptr,
 *
 *************************************************************************/

-string bmcUtil_create_data_fn ( string & hostname,
-                                string   file_suffix,
-                     bmc_protocol_enum   protocol )
+string bmcUtil_create_data_fn ( const string & hostname,
+                                string file_suffix,
+                     bmc_protocol_enum protocol )
 {
    /* create the output filename */
    string datafile ;
--- a/mtce-common/src/common/bmcUtil.h
+++ b/mtce-common/src/common/bmcUtil.h
@ -82,6 +82,14 @@ typedef struct

 } bmc_info_type ;

+typedef struct
+{
+    string hostname;
+    string host_ip ;
+    string   bm_ip ;
+    string   bm_un ;
+    string   bm_pw ;
+} bmcUtil_accessInfo_type ;

 /* BMC commands */
 typedef enum
@ -107,6 +115,7 @@ typedef enum
 #define BMC_QUERY_FILE_SUFFIX          ((const char *)("_root_query"))
 #define BMC_INFO_FILE_SUFFIX           ((const char *)("_bmc_info"))
 #define BMC_POWER_CMD_FILE_SUFFIX      ((const char *)("_power_cmd_result"))
+#define BMC_RESET_CMD_FILE_SUFFIX      ((const char *)("_reset"))
 #define BMC_BOOTDEV_CMD_FILE_SUFFIX    ((const char *)("_bootdev"))
 #define BMC_RESTART_CAUSE_FILE_SUFFIX  ((const char *)("_restart_cause"))
 #define BMC_POWER_STATUS_FILE_SUFFIX   ((const char *)("_power_status"))
@ -137,9 +146,9 @@ void bmcUtil_create_pw_file ( thread_info_type * info_ptr,
                             bmc_protocol_enum   protocol );

 /* create the output filename */
-string bmcUtil_create_data_fn ( string & hostname,
-                                string   file_suffix,
-                     bmc_protocol_enum   protocol );
+string bmcUtil_create_data_fn ( const string & hostname,
+                                string file_suffix,
+                     bmc_protocol_enum protocol );

 /*  Get power state from query response data. */
 int bmcUtil_is_power_on ( string              hostname,
--- a/mtce-common/src/common/hostUtil.cpp
+++ b/mtce-common/src/common/hostUtil.cpp
@ -130,6 +130,14 @@ bool hostUtil_is_valid_username ( string un )
    return (false);
 }

+/* A password is treated as valid when it is non-empty and differs from NONE */
+bool hostUtil_is_valid_pw ( string pw )
+{
+    if ( pw.empty() || ( pw.compare(NONE) == 0 ))
+        return (false);
+    return (true);
+}
+
 bool hostUtil_is_valid_mac_addr ( string mac )
 {
    if ( !mac.empty() )
--- a/mtce-common/src/common/hostUtil.h
+++ b/mtce-common/src/common/hostUtil.h
@ -46,6 +46,7 @@ string hostUtil_getPrefixPath  ( void );
 bool hostUtil_is_valid_uuid    ( string uuid );
 bool hostUtil_is_valid_ip_addr ( string ip );
 bool hostUtil_is_valid_username ( string un );
+bool hostUtil_is_valid_pw      ( string pw );
 bool hostUtil_is_valid_bm_type ( string bm_type );

 int  hostUtil_mktmpfile ( string hostname, string basename, string & filename, string data );
--- a/mtce-common/src/common/ipmiUtil.cpp
+++ b/mtce-common/src/common/ipmiUtil.cpp
@ -202,3 +202,66 @@ int ipmiUtil_bmc_info_load ( string hostname, const char * filename, bmc_info_ty
    ipmiUtil_bmc_info_log ( hostname, bmc_info, rc );
    return (rc);
 }
+
+
+/* Issue an ipmitool power reset request to the BMC described by accessInfo,
+ * using a temporary password file that is removed before returning.
+ * Returns 0 if the system call returned 0, otherwise FAIL_SYSTEM_CALL. */
+int ipmiUtil_reset_host_now ( string hostname,
+                              bmcUtil_accessInfo_type accessInfo,
+                              string output_filename)
+{
+    dlog("%s %s BMC [IP:%s UN:%s]",
+          accessInfo.hostname.c_str(),
+          accessInfo.host_ip.c_str(),
+          accessInfo.bm_ip.c_str(),
+          accessInfo.bm_un.c_str());
+
+    if (daemon_is_file_present ( BMC_OUTPUT_DIR ) == false )
+        daemon_make_dir(BMC_OUTPUT_DIR) ;
+    if (daemon_is_file_present ( IPMITOOL_OUTPUT_DIR ) == false )
+        daemon_make_dir(IPMITOOL_OUTPUT_DIR) ;
+
+    /* Use common utility to create a temp pw file */
+    thread_info_type info ;
+    info.hostname = accessInfo.hostname ;
+    info.password_file = "" ;
+    info.pw_file_fd = 0 ;
+    bmcUtil_create_pw_file ( &info, accessInfo.bm_pw, BMC_PROTOCOL__IPMITOOL );
+
+    /* create request */
+    string request =
+    ipmiUtil_create_request ( IPMITOOL_POWER_RESET_CMD,
+                              accessInfo.bm_ip,
+                              accessInfo.bm_un,
+                              info.password_file,
+                              output_filename );
+
+    /* issue request
+     *
+     * Note: Could launch a thread to avoid any stall.
+     *       However, mtcClient can withstand up to a 25 second stall
+     *       before pmon will fail it due to active monitoring.
+     *       UT showed that there is no stall at all. */
+    unsigned long long latency_threshold_secs = DEFAULT_SYSTEM_REQUEST_LATENCY_SECS ;
+    unsigned long long before_time = gettime_monotonic_nsec () ;
+    int rc = system ( request.data()) ;
+    unsigned long long delta_time = gettime_monotonic_nsec () - before_time ;
+    if ( rc )
+    {
+        wlog("system call failed ; rc:%d [%d:%s]", rc, errno, strerror(errno) );
+        rc = FAIL_SYSTEM_CALL ;
+    }
+    if ( delta_time > (latency_threshold_secs*1000000000))
+    {
+        /* %09llu zero-pads the nsec remainder ; assumes NSEC_TO_SEC is 1000000000 - TODO confirm */
+        wlog ("%s bmc system call took %llu.%09llu sec", hostname.c_str(),
+              delta_time/NSEC_TO_SEC, delta_time%NSEC_TO_SEC );
+    }
+
+    /* Cleanup */
+    if ( info.pw_file_fd > 0 )
+        close(info.pw_file_fd);
+    daemon_remove_file ( info.password_file.data());
+    return (rc);
+}
--- a/mtce-common/src/common/ipmiUtil.h
+++ b/mtce-common/src/common/ipmiUtil.h
@ -57,6 +57,8 @@ int ipmiUtil_init ( void );

 int  ipmiUtil_bmc_info_load ( string hostname, const char * filename, bmc_info_type & mc_info );

+int  ipmiUtil_reset_host_now ( string hostname, bmcUtil_accessInfo_type accessInfo, string output_filename );
+
 /* Create the ipmi request */
 string ipmiUtil_create_request ( string cmd, string & ip, string & un, string & pw, string & out );

--- a/mtce-common/src/common/nodeBase.cpp
+++ b/mtce-common/src/common/nodeBase.cpp
@ -149,6 +149,8 @@ const char * get_mtcNodeCommand_str ( int cmd )
        case MTC_REQ_MTCALIVE:    return ("mtcAlive req");
        case MTC_MSG_LOCKED:      return ("locked msg");
        case MTC_CMD_LAZY_REBOOT: return ("lazy reboot");
+        case MTC_MSG_INFO:        return ("info msg");
+        case MTC_CMD_SYNC:        return ("sync");

        /* goenabled commands and messages */
        case MTC_MSG_MAIN_GOENABLED:         return ("goEnabled main msg");
@ -199,7 +201,8 @@ const char * get_mtcNodeCommand_str ( int cmd )
        case MTC_EVENT_PMON_MAJOR: return("pmon major event");
        case MTC_EVENT_PMON_MINOR: return("pmon minor event");
        case MTC_EVENT_PMON_LOG:   return("pmon log");
-        case MTC_EVENT_PMOND_RAISE: return("pmon raise");
+        case MTC_EVENT_PMOND_RAISE: return("pmond raise");
+        case MTC_EVENT_PMOND_CLEAR: return("pmond clear");

        /* data port events */
        case MTC_EVENT_AVS_CLEAR:    return("AVS clear");
@ -394,10 +397,9 @@ void mtc_stages_init ( void )
   recoveryStages_str[MTC_RECOVERY__HEARTBEAT_START    ] = "Heartbeat-Start";
   recoveryStages_str[MTC_RECOVERY__HEARTBEAT_SOAK     ] = "Heartbeat-Soak";
   recoveryStages_str[MTC_RECOVERY__STATE_CHANGE       ] = "State Change";
-   recoveryStages_str[MTC_RECOVERY__ENABLE_START       ] = "Enable-Start";
   recoveryStages_str[MTC_RECOVERY__FAILURE            ] = "Failure";
   recoveryStages_str[MTC_RECOVERY__WORKQUEUE_WAIT     ] = "WorkQ-Wait";
-   recoveryStages_str[MTC_RECOVERY__ENABLE_WAIT        ] = "Enable-Wait";
+   recoveryStages_str[MTC_RECOVERY__ENABLE             ] = "Enable";
   recoveryStages_str[MTC_RECOVERY__STAGES             ] = "unknown";

   disableStages_str [MTC_DISABLE__START               ] = "Disable-Start";
--- a/mtce-common/src/common/nodeBase.h
+++ b/mtce-common/src/common/nodeBase.h
@ -185,7 +185,7 @@ typedef enum
 #define DEFAULT_MTCALIVE_TIMEOUT    (1200)
 #define DEFAULT_GOENABLE_TIMEOUT     (300)
 #define DEFAULT_DOR_MODE_TIMEOUT      (20)
-#define DEFAULT_DOR_MODE_CPE_TIMEOUT (600)
+#define DEFAULT_DOR_MODE_AIO_TIMEOUT (600)

 /** TODO: Convert names to omit JSON part */
 #define MTC_JSON_INV_LABEL     "ihosts"
@ -263,6 +263,7 @@ typedef enum
 #define MTC_TASK_ENABLE_WORK_FAIL  "Enable Action Failed"
 #define MTC_TASK_ENABLE_WORK_TO    "Enable Action Timeout"
 #define MTC_TASK_ENABLE_FAIL_HB    "Enable Heartbeat Failure, re-enabling"
+#define MTC_TASK_RECOVERY_FAIL_HB  "Graceful Recovery Heartbeat Failure, re-enabling"
 #define MTC_TASK_RECOVERY_FAIL     "Graceful Recovery Failed, re-enabling"
 #define MTC_TASK_RECOVERY_WAIT     "Graceful Recovery Wait"
 #define MTC_TASK_RECOVERED         "Gracefully Recovered"
@ -311,7 +312,7 @@ typedef enum
 #define MTC_TASK_POWERCYCLE_FAIL   "Critical Event Power-Cycle %d; failed"
 #define MTC_TASK_POWERCYCLE_DOWN   "Critical Event Power-Down ; due to persistent critical sensor"
 #define MTC_TASK_RESETTING_HOST    "Resetting Host, critical sensor"
-#define MTC_TASK_CPE_SX_UNLOCK_MSG "Unlocking, please stand-by while the system gracefully reboots"
+#define MTC_TASK_AIO_SX_UNLOCK_MSG "Unlocking, please stand-by while the system gracefully reboots"
 #define MTC_TASK_SELF_UNLOCK_MSG   "Unlocking active controller, please stand-by while it reboots"
 #define MTC_TASK_FAILED_SWACT_REQ  "Critical failure.Requesting SWACT to enabled standby controller"
 #define MTC_TASK_FAILED_NO_BACKUP  "Critical failure.Please provision/enable standby controller"
@ -383,8 +384,8 @@ typedef enum
 /* 5 milliseconds */
 #define MTCAGENT_SELECT_TIMEOUT (5000)

-/* dedicate more idle time in CPE ; there is less maintenance to do */
-#define MTCAGENT_CPE_SELECT_TIMEOUT (10000)
+/* dedicate more idle time in AIO ; there is less maintenance to do */
+#define MTCAGENT_AIO_SELECT_TIMEOUT (10000)

 /** Number of retries maintenance will do when it experiences
 *  a REST API call failure ; any failure */
@ -751,7 +752,9 @@ typedef struct
 #define MTC_CMD_START_STORAGE_SVCS    19  /*   to host */
 #define MTC_CMD_LAZY_REBOOT           20  /*   to host */
 #define MTC_CMD_HOST_SVCS_RESULT      21  /*   to host */
-#define MTC_CMD_LAST                  22
+#define MTC_MSG_INFO                  22  /*   to host */
+#define MTC_CMD_SYNC                  23  /*   to host */
+#define MTC_CMD_LAST                  24

 #define RESET_PROG_MAX_REBOOTS_B4_RESET (5)
 #define RESET_PROG_MAX_REBOOTS_B4_RETRY (RESET_PROG_MAX_REBOOTS_B4_RESET+2)
@ -946,7 +949,7 @@ typedef enum
 string get_delStages_str ( mtc_delStages_enum stage );


-#define MTC_MAX_FAST_ENABLES (3)
+#define MTC_MAX_FAST_ENABLES (5)
 typedef enum
 {
    MTC_RECOVERY__START =  0,
@ -972,10 +975,9 @@ typedef enum
    MTC_RECOVERY__HEARTBEAT_START,
    MTC_RECOVERY__HEARTBEAT_SOAK,
    MTC_RECOVERY__STATE_CHANGE,
-    MTC_RECOVERY__ENABLE_START,
    MTC_RECOVERY__FAILURE,
    MTC_RECOVERY__WORKQUEUE_WAIT,
-    MTC_RECOVERY__ENABLE_WAIT,
+    MTC_RECOVERY__ENABLE,
    MTC_RECOVERY__STAGES,
 } mtc_recoveryStages_enum ;

@ -1263,6 +1265,14 @@ typedef enum
    MTC_AR_DISABLE_CAUSE__NONE,
 } autorecovery_disable_cause_enum ;

+/* code that represents a specific group of maintenance information
+ * ... typically for a specific feature */
+typedef enum
+{
+    MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO,
+    MTC_INFO_CODE__LAST
+} mtcInfo_enum ;
+
 /* Service Based Auto Recovery Control Structure */
 typedef struct
 {
--- a/mtce-common/src/common/threadUtil.cpp
+++ b/mtce-common/src/common/threadUtil.cpp
@ -309,6 +309,48 @@ bool thread_idle ( thread_ctrl_type & ctrl )
    return (false);
 }

+/****************************************************************************
+ *
+ * Name       : thread_done_consume
+ *
+ * Description: Return to IDLE stage. A thread that is not already IDLE is
+ *              put in the DONE stage and thread_handler is called.
+ *
+ * Returns    : PASS  - already IDLE, or DONE stage set and handler called
+ *              RETRY - thread not done yet ; thread_kill was called
+ *
+ ****************************************************************************/
+
+int thread_done_consume ( thread_ctrl_type & ctrl, thread_info_type & info )
+{
+    /* nothing to consume */
+    if ( ctrl.stage == THREAD_STAGE__IDLE )
+        return PASS ;
+
+    if ( ctrl.done == false )
+    {
+        /* info.runcount has not passed ctrl.runcount ;
+         * call thread_kill and tell the caller to retry */
+        if ( info.runcount <= ctrl.runcount )
+        {
+            thread_kill(ctrl, info);
+            return RETRY ;
+        }
+
+        ilog("%s thread cleanup ; cmd:%d ; cnt:%d:%d",
+             info.hostname.c_str(),
+             info.command,
+             ctrl.runcount,
+             info.runcount);
+        ctrl.done = true ;
+    }
+
+    /* common tail: set the DONE stage and run the handler */
+    ctrl.stage = THREAD_STAGE__DONE ;
+    thread_handler ( ctrl, info );
+    return PASS ;
+}
+
 /****************************************************************************
 *
 * Name       : thread_launch
@ -381,7 +423,7 @@ void thread_kill ( thread_ctrl_type & ctrl, thread_info_type & info )
        ( ctrl.stage != THREAD_STAGE__WAIT ) &&
        ( ctrl.stage != THREAD_STAGE__IDLE ))
    {
-        blog ("%s kill request\n", ctrl.hostname.c_str() );
+        wlog ("%s kill request\n", ctrl.hostname.c_str() );
        _stage_change ( ctrl, THREAD_STAGE__KILL );
    }
 }
--- a/mtce-common/src/common/threadUtil.h
+++ b/mtce-common/src/common/threadUtil.h
@ -284,6 +284,7 @@ bool   thread_done   ( thread_ctrl_type & ctrl );
 bool   thread_idle   ( thread_ctrl_type & ctrl );
 void   thread_kill   ( thread_ctrl_type & ctrl , thread_info_type & info );
 string thread_stage  ( thread_ctrl_type & ctrl );
+int    thread_done_consume ( thread_ctrl_type & ctrl, thread_info_type & info );

 /* Cooperative service of cancel and exit requests from parent */
 void pthread_signal_handler ( thread_info_type * info_ptr );
--- a/mtce-common/src/daemon/daemon_common.h
+++ b/mtce-common/src/daemon/daemon_common.h
@ -38,15 +38,15 @@ using namespace std ;
 /* List of different types */
 typedef enum
 {
-    SYSTEM_TYPE__NORMAL                  =0,
-    SYSTEM_TYPE__CPE_MODE__DUPLEX        =1,
-    SYSTEM_TYPE__CPE_MODE__DUPLEX_DIRECT =2,
-    SYSTEM_TYPE__CPE_MODE__SIMPLEX       =3,
+    SYSTEM_TYPE__NORMAL             =0,
+    SYSTEM_TYPE__AIO__DUPLEX        =1,
+    SYSTEM_TYPE__AIO__DUPLEX_DIRECT =2,
+    SYSTEM_TYPE__AIO__SIMPLEX       =3,
 } system_type_enum ;


 /** Called by signal handler on daemon exit
-  * Performs cleanup by closing open files 
+  * Performs cleanup by closing open files
  * and freeing used memory */
 void daemon_exit ( void );

--- a/mtce-common/src/daemon/daemon_files.cpp
+++ b/mtce-common/src/daemon/daemon_files.cpp
@ -347,7 +347,7 @@ string daemon_mgmnt_iface ( void )
 system_type_enum daemon_system_type ( void )
 {
    char buffer  [BUFFER];
-    system_type_enum system_type = SYSTEM_TYPE__CPE_MODE__SIMPLEX ;
+    system_type_enum system_type = SYSTEM_TYPE__AIO__SIMPLEX ;
    FILE * cfg_file_stream = fopen ( PLATFORM_CONF_FILE, "r" );
    if ( cfg_file_stream != NULL )
    {
@ -401,11 +401,11 @@ system_type_enum daemon_system_type ( void )
                        if ( !mode.empty() )
                        {
                            if ( mode.compare("duplex") == 0 )
-                                system_type = SYSTEM_TYPE__CPE_MODE__DUPLEX ;
+                                system_type = SYSTEM_TYPE__AIO__DUPLEX ;
                            else if ( mode.compare("duplex-direct") == 0 )
-                                system_type = SYSTEM_TYPE__CPE_MODE__DUPLEX_DIRECT ;
+                                system_type = SYSTEM_TYPE__AIO__DUPLEX_DIRECT ;
                            else if ( mode.compare("simplex") == 0 )
-                                system_type = SYSTEM_TYPE__CPE_MODE__SIMPLEX ;
+                                system_type = SYSTEM_TYPE__AIO__SIMPLEX ;
                            else
                            {
                                elog ("%s All-In-One system type ; mode unknown\n", SYSTEM_TYPE_PREFIX );
@ -438,21 +438,21 @@ system_type_enum daemon_system_type ( void )
            ilog("%s Standard System\n", SYSTEM_TYPE_PREFIX);
            break ;
        }
-        case SYSTEM_TYPE__CPE_MODE__DUPLEX_DIRECT:
+        case SYSTEM_TYPE__AIO__DUPLEX_DIRECT:
        {
            ilog ("%s All-in-one Duplex Direct Connect\n", SYSTEM_TYPE_PREFIX );
            break ;
        }
-        case SYSTEM_TYPE__CPE_MODE__DUPLEX:
+        case SYSTEM_TYPE__AIO__DUPLEX:
        {
            ilog ("%s All-in-one Duplex\n", SYSTEM_TYPE_PREFIX );
            break ;
        }
-        case SYSTEM_TYPE__CPE_MODE__SIMPLEX:
+        case SYSTEM_TYPE__AIO__SIMPLEX:
        default:
        {
            ilog ("%s All-in-one Simplex \n", SYSTEM_TYPE_PREFIX );
-            system_type = SYSTEM_TYPE__CPE_MODE__SIMPLEX ;
+            system_type = SYSTEM_TYPE__AIO__SIMPLEX ;
            break ;
        }
    }
--- a/mtce-control/src/scripts/hbsAgent.service
+++ b/mtce-control/src/scripts/hbsAgent.service
@ -1,22 +1,13 @@
 [Unit]
 Description=StarlingX Maintenance Heartbeat Agent
-After=network.target syslog.service config.service
+After=hbsClient.service
 Before=pmon.service

 [Service]
 Type=forking
 ExecStart=/etc/rc.d/init.d/hbsAgent start
-ExecStop=/etc/rc.d/init.d/hbsAgent start
+ExecStop=/etc/rc.d/init.d/hbsAgent stop
 PIDFile=/var/run/hbsAgent.pid
-KillMode=process
-SendSIGKILL=no
-
-# Process recovery is handled by pmond if its running.
-# Delay 10 seconds to give pmond a chance to recover
-# before systemd kicks in to do it as a backup plan.
-Restart=always
-RestartSec=10

 [Install]
 WantedBy=multi-user.target
-
--- a/mtce/src/alarm/scripts/mtcalarm.logrotate
+++ b/mtce/src/alarm/scripts/mtcalarm.logrotate
@ -1,17 +1,19 @@
-#daily
-nodateext
-start 1
-compress
-copytruncate
-notifempty
-missingok
+#
+# Copyright (c) 2018-2021 Wind River Systems, Inc.
+#
+# SPDX-License-Identifier: Apache-2.0

 /var/log/mtcalarmd.log
 {
+    create 0640 root root
+    start 1
    size 10M
    rotate 20
-    sharedscripts
+    compress
+    notifempty
+    missingok
    postrotate
        systemctl reload syslog-ng > /dev/null 2>&1 || true
    endscript
+    delaycompress
 }
--- a/mtce/src/common/nodeClass.cpp
+++ b/mtce/src/common/nodeClass.cpp
@ -660,7 +660,7 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
    {
        ptr->alarms[id] = FM_ALARM_SEVERITY_CLEAR ;
    }
-    ptr->alarms_loaded   = false ;
+    ptr->active_alarms = "" ; /* no active alarms */

    ptr->cfgEvent.base   = NULL ;
    ptr->sysinvEvent.base= NULL ;
@ -778,6 +778,7 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
    return ptr ;
 }

+
 struct nodeLinkClass::node* nodeLinkClass::getNode ( string hostname )
 {
   /* check for empty list condition */
@ -2706,7 +2707,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
        node_ptr->operState   = operState_str_to_enum   (inv.oper.data ());
        node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());

-        if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
+        if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
        {
            node_ptr->operState_subf   = operState_str_to_enum (inv.oper_subf.data());
            node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
@ -2818,7 +2819,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
                    node_ptr->operState   = operState_str_to_enum   (inv.oper.data ());
                    node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());

-                    if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
+                    if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
                    {
                        node_ptr->operState_subf   = operState_str_to_enum (inv.oper_subf.data());
                        node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
@ -2835,7 +2836,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
                    node_ptr->operState   = operState_str_to_enum   (inv.oper.data ());
                    node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());

-                    if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
+                    if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
                    {
                        node_ptr->operState_subf   = operState_str_to_enum (inv.oper_subf.data());
                        node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
@ -2853,7 +2854,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
                    node_ptr->operState   = operState_str_to_enum   (inv.oper.data ());
                    node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());

-                    if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
+                    if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
                    {
                        node_ptr->operState_subf   = operState_str_to_enum (inv.oper_subf.data());
                        node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
@ -2871,7 +2872,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
                    node_ptr->operState   = operState_str_to_enum   (inv.oper.data ());
                    node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());

-                    if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
+                    if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
                    {
                        node_ptr->operState_subf   = operState_str_to_enum (inv.oper_subf.data());
                        node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
@ -2889,7 +2890,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
                    node_ptr->operState   = operState_str_to_enum   (inv.oper.data ());
                    node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());

-                    if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
+                    if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
                    {
                        node_ptr->operState_subf   = operState_str_to_enum (inv.oper_subf.data());
                        node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
@ -2940,7 +2941,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
                    node_ptr->operState   = MTC_OPER_STATE__DISABLED ;
                    node_ptr->availStatus = MTC_AVAIL_STATUS__OFFLINE ;

-                    if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
+                    if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
                    {
                        node_ptr->operState_subf   = MTC_OPER_STATE__DISABLED ;
                        node_ptr->availStatus_subf = MTC_AVAIL_STATUS__OFFLINE ;
@ -2958,7 +2959,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
                node_ptr->operState   = operState_str_to_enum   (inv.oper.data ());
                node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());

-                if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
+                if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
                {
                    node_ptr->operState_subf   = operState_str_to_enum (inv.oper_subf.data());
                    node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
@ -3295,6 +3296,102 @@ void nodeLinkClass::mtcInfo_log ( struct nodeLinkClass::node * node_ptr )
    }
 }

+/***************************************************************************
+ *
+ * Name        : build_mtcInfo_dict
+ *
+ * Purpose     : Build a json dictionary for the specified info code enum
+ *
+ * Assumptions : Only MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO is supported
+ *
+ * Returns     : A json dictionary of mtcInfo keyed by controller hostname.
+ *
+ *               {
+ *                  "controller-0":{
+ *                      "mgmt_ip":"192.168.204.2",
+ *                      "bm_ip":"xxx.xxx.xx.23",
+ *                      "bm_un":"root",
+ *                      "bm_pw":"root"
+ *                  },
+ *                  "controller-1":{
+ *                      "mgmt_ip":"192.168.204.3",
+ *                      "bm_ip":"xxx.xxx.xx.24",
+ *                      "bm_un":"root",
+ *                      "bm_pw":"root"
+ *                  }
+ *               }
+ *
+ **************************************************************************/
+
+string nodeLinkClass::build_mtcInfo_dict ( mtcInfo_enum mtcInfo_code )
+{
+    string mtcInfo_dict = "" ;
+
+    /* count of controllers added so far ; the loop exits once 2 are added */
+    int temp = 0 ;
+
+    /* no nodes in the list ; return "" ; should never happen but be safe */
+    if ( head == NULL )
+        return mtcInfo_dict ;
+
+    /* force the update to be a dictionary */
+    mtcInfo_dict = "{" ;
+
+    for ( struct node * ptr = head ;  ; ptr = ptr->next )
+    {
+        if (( ptr->nodetype & CONTROLLER_TYPE ) &&
+            ( mtcInfo_code == MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO ))
+        {
+            if ( temp )
+                mtcInfo_dict.append(",");
+            mtcInfo_dict.append("\"" + ptr->hostname + "\":{");
+            mtcInfo_dict.append("\"mgmt_ip\":\"" + ptr->ip + "\",");
+            mtcInfo_dict.append("\"bm_ip\":\"" + ptr->bm_ip + "\",");
+            mtcInfo_dict.append("\"bm_un\":\"" + ptr->bm_un + "\",");
+            mtcInfo_dict.append("\"bm_pw\":\"" + ptr->bm_pw + "\"}");
+            if ( ++temp >= 2 )
+                break ;
+        }
+        if (( ptr->next == NULL ) || ( ptr == tail ))
+           break ;
+    }
+    mtcInfo_dict.append("}");
+    return mtcInfo_dict ;
+}
+
+/**************************************************************************
+ *
+ * Name          : mtcInfo_handler
+ *
+ * Purpose       : Send mtcInfo update to provisioned controllers when
+ *                 the push flag is set.
+ *
+ **************************************************************************/
+
+void nodeLinkClass::mtcInfo_handler ( void )
+{
+    /* This is set in the bmc_handler once access to the BMC using
+     * provisioned credentials has been verified. */
+    if ( this->want_mtcInfo_push )
+    {
+        /* only one info code exists ; extend here when more are introduced */
+        mtcInfo_enum mtcInfo_code = MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO ;
+
+        string mtcInfo_dict = build_mtcInfo_dict(mtcInfo_code);
+        if ( ! mtcInfo_dict.empty() )
+        {
+            string temp = CONTROLLER_0 ;
+            send_mtc_cmd ( temp, MTC_MSG_INFO, MGMNT_INTERFACE, mtcInfo_dict);
+            if ( this->controllers > 1 )
+            {
+                temp = CONTROLLER_1;
+                send_mtc_cmd ( temp, MTC_MSG_INFO, MGMNT_INTERFACE, mtcInfo_dict);
+            }
+        }
+        this->want_mtcInfo_push = false ; /* cleared after a single attempt */
+    }
+}
+
 /* Lock Rules
 *
 * 1. Cannot lock this controller
@ -4034,6 +4131,18 @@ int  nodeLinkClass::get_uptime_refresh_ctr ( string & hostname )
    return (0);
 }

+/* Return the mtce_flags of the specified host ; 0 if the host is not found */
+int nodeLinkClass::get_mtce_flags ( string & hostname )
+{
+    nodeLinkClass::node* node_ptr ;
+    node_ptr = nodeLinkClass::getNode ( hostname );
+    if ( node_ptr != NULL )
+    {
+        return ( node_ptr->mtce_flags );
+    }
+    return (0);
+}
+
 void nodeLinkClass::set_mtce_flags ( string hostname, int flags, int iface )
 {
    nodeLinkClass::node* node_ptr = nodeLinkClass::getNode ( hostname );
@ -4114,7 +4223,7 @@ void nodeLinkClass::set_mtce_flags ( string hostname, int flags, int iface )


        /* Deal with sub-function if AIO controller host */
-        if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
+        if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
        {
            if ( flags & MTC_FLAG__SUBF_GOENABLED )
            {
@ -4422,6 +4531,18 @@ string nodeLinkClass::get_bm_ip   ( string hostname )
    return ("");
 }

+string nodeLinkClass::get_bm_pw   ( string hostname )
+{
+    nodeLinkClass::node* node_ptr ;
+    node_ptr = nodeLinkClass::getNode ( hostname );
+    if ( node_ptr != NULL )
+    {
+         return (node_ptr->bm_pw); /* the bm_pw stored for this host */
+    }
+    elog ("%s bm pw lookup failed\n", hostname.c_str() );
+    return (""); /* host not found ; error logged above */
+}
+
 string nodeLinkClass::get_bm_un   ( string hostname )
 {
    nodeLinkClass::node* node_ptr ;
@ -4774,7 +4895,10 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa

            /* Otherwise this is a single host that has recovered
             * possibly as part of a mnfa group or simply a lone wolf */
-            else
+            else if (( node_ptr->hbs_minor[MGMNT_IFACE] == false ) &&
+                     (( clstr_network_provisioned == false ) ||
+                      (( clstr_network_provisioned == true ) &&
+                       ( node_ptr->hbs_minor[CLSTR_IFACE] == false ))))
            {
                if ( node_ptr->mnfa_graceful_recovery == true )
                {
@ -4782,6 +4906,8 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa
                    mnfa_awol_list.remove(node_ptr->hostname);
                }

+                /* Don't recover until heartbeat is working over all
+                 * monitored interfaces */
                mnfa_recover_host ( node_ptr );

                if ( mnfa_active == true )
@ -4819,17 +4945,17 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa
    }

     if ( temp_count != mnfa_host_count[iface] )
-     {    
+     {
         slog ("%s MNFA host tally (%s:%d incorrect - expected %d) ; correcting\n",
                   node_ptr->hostname.c_str(),
                   get_iface_name_str(iface),
                   mnfa_host_count[iface], temp_count );
                   mnfa_host_count[iface] = temp_count ;
         mnfa_host_count[iface] = temp_count ;
-     }    
+     }
     else
     {
-         wlog ("%s MNFA host tally (%s:%d)\n",
+         dlog ("%s MNFA host tally (%s:%d)\n",
                   node_ptr->hostname.c_str(),
                   get_iface_name_str(iface),
                   mnfa_host_count[iface] );
@ -4935,11 +5061,28 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
            }
            return ;
        }
+        else if ( node_ptr->recoveryStage == MTC_RECOVERY__HEARTBEAT_SOAK )
+        {
+            elog ("%s %s *** Heartbeat Loss *** (during recovery soak)\n",
+                      hostname.c_str(),
+                      get_iface_name_str(iface));
+            force_full_enable ( node_ptr );
+            return ;
+        }

        mnfa_add_host ( node_ptr , iface );

        if ( mnfa_active == false )
        {
+            /* if node is already in graceful recovery just ignore the event */
+            if ( node_ptr->graceful_recovery_counter != 0 )
+            {
+                dlog ("%s %s loss event ; already in graceful recovery try %d",
+                          hostname.c_str(),
+                          get_iface_name_str(iface),
+                          node_ptr->graceful_recovery_counter );
+                return ;
+            }
            elog ("%s %s *** Heartbeat Loss ***\n", hostname.c_str(), get_iface_name_str(iface));
            if ( iface == CLSTR_IFACE )
            {
@ -4980,6 +5123,15 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
    }
 }

+/****************************************************************************
+ *
+ * Name       : manage_heartbeat_clear
+ *
+ * Description: Manage clearing heartbeat failure status
+ *
+ * Assumptions: Called by both hbsAgent and mtcAgent
+ *
+ ***************************************************************************/
 void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface )
 {
    nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname );
@ -4995,13 +5147,17 @@ void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface )
            node_ptr->heartbeat_failed[i] = false ;
            if ( i == MGMNT_IFACE )
            {
-                node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
-                node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
+                if ( heartbeat )
+                    node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
+                if ( maintenance )
+                    node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
            }
            if ( i == CLSTR_IFACE )
            {
-                node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ;
-                node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ;
+                if ( heartbeat )
+                    node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ;
+                if ( maintenance )
+                    node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ;
            }
        }
    }
@ -5010,13 +5166,17 @@ void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface )
        node_ptr->heartbeat_failed[iface] = false ;
        if ( iface == MGMNT_IFACE )
        {
-            node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
-            node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
+            if ( heartbeat )
+                node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
+            if ( maintenance )
+                node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
        }
        else if ( iface == CLSTR_IFACE )
        {
-            node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ;
-            node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ;
+            if ( heartbeat )
+                node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ;
+            if ( maintenance )
+                node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ;
        }
    }
 }
@ -5795,9 +5955,6 @@ int nodeLinkClass::critical_process_failed( string & hostname,
                      node_ptr->hostname.c_str()); /* dlog */
        }

-        /* Start fresh the next time we enter graceful recovery handler */
-        node_ptr->graceful_recovery_counter = 0 ;
-
        /* Set node as unlocked-disabled-failed */
        allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
                                   MTC_OPER_STATE__DISABLED,
@ -6755,7 +6912,7 @@ int nodeLinkClass::disableStageChange ( struct nodeLinkClass::node * node_ptr,
 }

 /** Validate and log Recovery stage changes */
-int nodeLinkClass::recoveryStageChange  ( struct nodeLinkClass::node * node_ptr, 
+int nodeLinkClass::recoveryStageChange  ( struct nodeLinkClass::node * node_ptr,
                                          mtc_recoveryStages_enum newHdlrStage )
 {
    int rc = PASS ;
@ -6763,14 +6920,14 @@ int nodeLinkClass::recoveryStageChange  ( struct nodeLinkClass::node * node_ptr,
    if (( newHdlrStage >= MTC_RECOVERY__STAGES ) ||
        ( node_ptr->recoveryStage >= MTC_RECOVERY__STAGES ))
    {
-        slog ("%s Invalid recovery stage (%d:%d)\n", 
+        slog ("%s Invalid recovery stage (%d:%d)\n",
                  node_ptr->hostname.c_str(),
-                  node_ptr->recoveryStage, 
+                  node_ptr->recoveryStage,
                  newHdlrStage );

        if ( newHdlrStage < MTC_RECOVERY__STAGES )
        {
-            clog ("%s ? -> %s\n", 
+            clog ("%s ? -> %s\n",
               node_ptr->hostname.c_str(),
               get_recoveryStages_str(newHdlrStage).c_str());

@ -6782,11 +6939,11 @@ int nodeLinkClass::recoveryStageChange  ( struct nodeLinkClass::node * node_ptr,
            rc = FAIL ;
        }
    }
-    else 
+    else
    {
-        clog ("%s %s -> %s\n", 
+        clog ("%s %s -> %s\n",
               node_ptr->hostname.c_str(),
-               get_recoveryStages_str(node_ptr->recoveryStage).c_str(), 
+               get_recoveryStages_str(node_ptr->recoveryStage).c_str(),
               get_recoveryStages_str(newHdlrStage).c_str());

        node_ptr->recoveryStage = newHdlrStage  ;
@ -7514,7 +7671,7 @@ int nodeLinkClass::ar_manage ( struct nodeLinkClass::node * node_ptr,
        mtcInvApi_update_states ( node_ptr, "unlocked", "disabled", "failed" );

        if (( NOT_THIS_HOST ) &&
-            ( this->system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX ))
+            ( this->system_type != SYSTEM_TYPE__AIO__SIMPLEX ))
        {
            if ( ++node_ptr->ar_count[node_ptr->ar_cause] >=
                  this->ar_threshold [node_ptr->ar_cause] )
@ -7746,7 +7903,11 @@ int nodeLinkClass::mon_host ( const string & hostname, bool true_false, bool sen

            if ( true_false == true )
            {
-                ilog ("%s heartbeat start", hostname.c_str());
+                ilog ("%s %s heartbeat %sstart",
+                          hostname.c_str(),
+                          get_iface_name_str(iface),
+                          node_ptr->monitor[iface] ? "re" : "");
+
                node_ptr->no_work_log_throttle = 0 ;
                node_ptr->b2b_misses_count[iface] = 0 ;
                node_ptr->hbs_misses_count[iface] = 0 ;
@ -7758,7 +7919,12 @@ int nodeLinkClass::mon_host ( const string & hostname, bool true_false, bool sen
            }
            else
            {
-                ilog ("%s heartbeat stop", hostname.c_str());
+                if (  node_ptr->monitor[iface] == true )
+                {
+                    ilog ("%s %s heartbeat stop",
+                              hostname.c_str(),
+                              get_iface_name_str(iface));
+                }
            }
            node_ptr->monitor[iface] = true_false ;
        }
@ -7771,7 +7937,7 @@ int nodeLinkClass::mon_host ( const string & hostname, bool true_false, bool sen
 void nodeLinkClass::set_hwmond_monitor_state ( string & hostname, bool state )
 {
    if ( hostname.length() )
-    {  
+    {
        struct nodeLinkClass::node* node_ptr ;
        node_ptr = nodeLinkClass::getNode ( hostname );
        if ( node_ptr != NULL )
@ -8511,7 +8677,7 @@ void nodeLinkClass::manage_heartbeat_alarm ( struct nodeLinkClass::node * node_p



-#define HBS_LOSS_REPORT_THROTTLE (100)
+#define HBS_LOSS_REPORT_THROTTLE (100000)
 int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
 {
    int lost = 0  ;
@ -8551,6 +8717,13 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )

            if ( pulse_ptr->b2b_misses_count[iface] > 1 )
            {
+                if ( pulse_ptr->b2b_misses_count[iface] < hbs_failure_threshold )
+                {
+                    hbs_cluster_change ( pulse_ptr->hostname + " " +
+                            get_iface_name_str(iface) +
+                            " heartbeat miss " +
+                            itos(pulse_ptr->b2b_misses_count[iface]));
+                }
                if ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold )
                {
                    if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold )
@ -8657,57 +8830,43 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
                }
            }

-            /* Turn the cluster-host heartbeat loss into a degrade only
-             * condition if the clstr_degrade_only flag is set */
-            if (( iface == CLSTR_IFACE ) &&
-                ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) &&
-                ( clstr_degrade_only == true ))
-            {
-                /* Only print the log at the threshold boundary */
-                if (( pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
-                {
-                    if ( this->active_controller )
-                    {
-                        manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
-                    }
-
-                    wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n",
-                               pulse_ptr->hostname.c_str(),
-                               get_iface_name_str(iface) );
-                    hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
-                }
-            }
-
            /* Turn the clstr heartbeat loss into a degrade only
             * condition for inactive controller on normal system. */
-            else if (( iface == CLSTR_IFACE ) &&
-                     ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) &&
-                     ( this->system_type == SYSTEM_TYPE__NORMAL ) &&
-                     (( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE ))
+            if (( iface == CLSTR_IFACE ) &&
+                ((( this->system_type == SYSTEM_TYPE__NORMAL ) &&
+                 (( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE )) ||
+                 ( clstr_degrade_only == true )))
            {
                /* Only print the log at the threshold boundary */
-                if ( (pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
+                if ( pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE == hbs_failure_threshold )
                {
                    if ( this->active_controller )
                    {
                        manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
                    }
-                    wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n",
+                    wlog ( "%s %s *** Heartbeat Loss *** (degrade only due to %s)\n",
                               pulse_ptr->hostname.c_str(),
-                               get_iface_name_str(iface));
-                    hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
+                               get_iface_name_str(iface),
+                               clstr_degrade_only ? "config option" : "system type");
+                    hbs_cluster_change ( pulse_ptr->hostname + " " + get_iface_name_str(iface) + " heartbeat loss" );
                }
            }

            else if ((pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
+            // else if ( pulse_ptr->hbs_failure[iface] == false )
            {
-                elog ("%s %s *** Heartbeat Loss ***\n", pulse_ptr->hostname.c_str(),
-                                                        get_iface_name_str(iface) );
+                elog ("%s %s *** Heartbeat Loss *** (b2b_misses:0x%x)\n",
+                          pulse_ptr->hostname.c_str(),
+                          get_iface_name_str(iface),
+                          pulse_ptr->b2b_misses_count[iface]);
+                hbs_cluster_change ( pulse_ptr->hostname + " " + get_iface_name_str(iface) + " heartbeat loss" );

                if ( this->active_controller )
                {
-                    manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
-
+                    if ( pulse_ptr->hbs_failure[iface] == false )
+                    {
+                        manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
+                    }
                    /* report this host as failed */
                    if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_LOSS , iface ) == PASS )
                    {
@ -8715,10 +8874,8 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
                    }
                }
                else
-                {
                    pulse_ptr->hbs_failure[iface] = true ;
-                }
-                hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
+
                pulse_ptr->hbs_failure_count[iface]++ ;
            }
            if ( pulse_ptr->b2b_misses_count[iface] > pulse_ptr->max_count[iface] )
@ -8963,21 +9120,21 @@ void nodeLinkClass::mem_log_mtcalive ( struct nodeLinkClass::node * node_ptr )
 {
    char str[MAX_MEM_LOG_DATA] ;

-    snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tmtcAlive: online:%c offline:%c Cnt:%d Gate:%s Misses:%d\n", 
-                node_ptr->hostname.c_str(), 
+    snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tmtcAlive: online:%c offline:%c Cnt:%d Gate:%s Misses:%d\n",
+                node_ptr->hostname.c_str(),
                node_ptr->mtcAlive_online ? 'Y' : 'N',
                node_ptr->mtcAlive_offline ? 'Y' : 'N',
                node_ptr->mtcAlive_count,
                node_ptr->mtcAlive_gate ? "closed" : "open",
-                node_ptr->mtcAlive_misses); 
+                node_ptr->mtcAlive_misses);
    mem_log (str);
 }

 void nodeLinkClass::mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr )
 {
    char str[MAX_MEM_LOG_DATA] ;
-    snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tAlarm List:%s%s%s%s%s%s\n", 
-               node_ptr->hostname.c_str(), 
+    snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tAlarm List:%s%s%s%s%s%s\n",
+               node_ptr->hostname.c_str(),
               node_ptr->alarms[MTC_ALARM_ID__LOCK    ] ? " Locked"   : " .",
               node_ptr->alarms[MTC_ALARM_ID__CONFIG  ] ? " Config"   : " .",
               node_ptr->alarms[MTC_ALARM_ID__ENABLE  ] ? " Enable"   : " .",
@ -8987,6 +9144,18 @@ void nodeLinkClass::mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr )
    mem_log (str);
 }

+void nodeLinkClass::mem_log_alarm2 ( struct nodeLinkClass::node * node_ptr )
+{
+    if ( ! node_ptr->active_alarms.empty() ) /* log only if string is not empty */
+    {
+        char str[MAX_MEM_LOG_DATA] ;
+        snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tActive Alarms:%s\n",
+                   node_ptr->hostname.c_str(),
+                   node_ptr->active_alarms.c_str());
+        mem_log (str);
+    }
+}
+
 void nodeLinkClass::mem_log_stage ( struct nodeLinkClass::node * node_ptr )
 {
    char str[MAX_MEM_LOG_DATA] ;
@ -9037,8 +9206,8 @@ void nodeLinkClass::mem_log_network ( struct nodeLinkClass::node * node_ptr )
 {
    char str[MAX_MEM_LOG_DATA] ;
    snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s %s cluster_host_ip: %s Uptime: %u\n",
-                node_ptr->hostname.c_str(), 
-                node_ptr->mac.c_str(), 
+                node_ptr->hostname.c_str(),
+                node_ptr->mac.c_str(),
                node_ptr->ip.c_str(),
                node_ptr->clstr_ip.c_str(),
                node_ptr->uptime );
@ -9050,11 +9219,11 @@ void nodeLinkClass::mem_log_heartbeat ( struct nodeLinkClass::node * node_ptr )
    char str[MAX_MEM_LOG_DATA] ;
    for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
    {
-        snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s Minor:%s Degrade:%s Failed:%s  Monitor:%s\n", 
-                   node_ptr->hostname.c_str(), 
+        snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s Minor:%s Degrade:%s Failed:%s  Monitor:%s\n",
+                   node_ptr->hostname.c_str(),
                   get_iface_name_str (iface),
-                   node_ptr->hbs_minor[iface] ? "true " : "false", 
-                   node_ptr->hbs_degrade[iface] ? "true " : "false", 
+                   node_ptr->hbs_minor[iface] ? "true " : "false",
+                   node_ptr->hbs_degrade[iface] ? "true " : "false",
                   node_ptr->hbs_failure[iface] ? "true " : "false",
                   node_ptr->monitor[iface] ? "YES" : "no"  );
        mem_log (str);
@ -9083,8 +9252,8 @@ void nodeLinkClass::mem_log_hbs_cnts ( struct nodeLinkClass::node * node_ptr )
 void nodeLinkClass::mem_log_test_info ( struct nodeLinkClass::node * node_ptr )
 {
    char str[MAX_MEM_LOG_DATA] ;
-    snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tOOS Stage:%s Runs:%d - INSV Stage:%s Runs:%d\n", 
-                node_ptr->hostname.c_str(), 
+    snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tOOS Stage:%s Runs:%d - INSV Stage:%s Runs:%d\n",
+                node_ptr->hostname.c_str(),
                get_oosTestStages_str(node_ptr->oosTestStage).c_str(),
                node_ptr->oos_test_count,
                get_insvTestStages_str(node_ptr->insvTestStage).c_str(),
@ -9117,7 +9286,7 @@ void nodeLinkClass::mem_log_type_info ( struct nodeLinkClass::node * node_ptr )
                node_ptr->function);
    mem_log (str);

-    if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
+    if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
    {
        snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tSub-Function: %s (%u) (SubFunc Enabled:%c)\n",
                node_ptr->hostname.c_str(),
@ -9156,6 +9325,7 @@ void nodeLinkClass::memDumpNodeState ( string hostname )
            // mem_log_reset_info ( node_ptr );
            mem_log_power_info ( node_ptr );
            mem_log_alarm1     ( node_ptr );
+            mem_log_alarm2     ( node_ptr );
            mem_log_mtcalive   ( node_ptr );
            mem_log_stage      ( node_ptr );
            mem_log_bm         ( node_ptr );
--- a/mtce/src/common/nodeClass.h
+++ b/mtce/src/common/nodeClass.h
@ -76,11 +76,11 @@ using namespace std;
 #define LARGE_SYSTEM \
    ( this->system_type == SYSTEM_TYPE__NORMAL )

-#define CPE_SYSTEM \
+#define AIO_SYSTEM \
    ( this->system_type != SYSTEM_TYPE__NORMAL )

-#define SIMPLEX_CPE_SYSTEM \
-    ( this->system_type == SYSTEM_TYPE__CPE_MODE__SIMPLEX )
+#define SIMPLEX_AIO_SYSTEM \
+    ( this->system_type == SYSTEM_TYPE__AIO__SIMPLEX )

 /**
 * @addtogroup nodeLinkClass
@ -652,12 +652,12 @@ private:

        /** @} private_monitoring_services_variables */

-        /* List of alarms and current severity */
-        #define MAX_ALARMS           (10)
+        /* Current severity of each alarm */
        EFmAlarmSeverityT alarms[MAX_ALARMS];

-        /* tracks whether the alarms for this host have been loaded already or not */
-        bool alarms_loaded ;
+        /* string containing active alarms and their severity
+         * ... for logging purposes only */
+        string active_alarms ;

        /** true if this host has recovered before the mnfa timeout period.
         *  This bool flags the graceful recovery handler that this node
@ -665,8 +665,6 @@ private:
         *  and uptime accordingly */
        bool mnfa_graceful_recovery ;

-        int stress_iteration ;
-
        /* BMC Protocol Learning Controls and State */

        /* specifies what BMC protocol is selected for this host
@ -828,10 +826,13 @@ private:
    int oos_test_handler   ( struct nodeLinkClass::node * node_ptr );
    int insv_test_handler  ( struct nodeLinkClass::node * node_ptr );
    int stress_handler     ( struct nodeLinkClass::node * node_ptr );
-    int bmc_handler         ( struct nodeLinkClass::node * node_ptr );
+    int bmc_handler        ( struct nodeLinkClass::node * node_ptr );
    int degrade_handler    ( struct nodeLinkClass::node * node_ptr );
+
    int uptime_handler     ( void );

+    void mtcInfo_handler   ( void );
+
    int host_services_handler ( struct nodeLinkClass::node * node_ptr );

    /* Starts the specified 'reset or powercycle' recovery monitor */
@ -840,6 +841,9 @@ private:
    /* server specific power state query handler */
    bool (*is_poweron_handler) (string hostname, string query_response );

+    /* Audit that monitors and auto corrects alarm state mismatches */
+    void mtcAlarm_audit ( struct nodeLinkClass::node * node_ptr );
+
    /* Calculate the overall reset progression timeout */
    int calc_reset_prog_timeout ( struct nodeLinkClass::node * node_ptr, int retries );

@ -851,13 +855,22 @@ private:
    void ctl_mtcAlive_gate ( struct nodeLinkClass::node * node_ptr, bool gate_state );
    void set_mtcAlive      ( struct nodeLinkClass::node * node_ptr, int interface );

+    /*********               mtcInfo in the database              ************/
    int    mtcInfo_set ( struct nodeLinkClass::node * node_ptr, string key, string value );
    string mtcInfo_get ( struct nodeLinkClass::node * node_ptr, string key );
    void   mtcInfo_clr ( struct nodeLinkClass::node * node_ptr, string key );
    void   mtcInfo_log ( struct nodeLinkClass::node * node_ptr );
-
    int    set_mtcInfo ( struct nodeLinkClass::node * node_ptr, string & mtc_info );

+    /*********       mtcInfo that gets pushed out to daemons       ***********/
+
+
+    /* flag telling mtce when a mtcInfo push needs to be done */
+    bool want_mtcInfo_push = false ;
+
+    /* performs the mtcInfo push */
+    void push_mtcInfo ( void );
+
    /*****************************************************************************
     *
     * Name       : bmc_command_send
@ -1192,11 +1205,11 @@ private:
     * Set to true when the autorecovery threshold is reached
     * and we want to avoid taking further autorecovery action
     * even though it may be requested. */
-    bool autorecovery_disabled ;
+    bool autorecovery_disabled = false ;

    /* Set to true by fault detection methods that are
     * autorecoverable when in simplex mode. */
-    bool autorecovery_enabled ;
+    bool autorecovery_enabled = false ;

    /** Tracks the number of hosts that 'are currently' in service trouble
     *  wrt heartbeat (above minor threshold).
@ -1292,6 +1305,7 @@ private:
    void mem_log_state1    ( struct nodeLinkClass::node * node_ptr );
    void mem_log_state2    ( struct nodeLinkClass::node * node_ptr );
    void mem_log_alarm1    ( struct nodeLinkClass::node * node_ptr );
+    void mem_log_alarm2    ( struct nodeLinkClass::node * node_ptr );
    void mem_log_mtcalive  ( struct nodeLinkClass::node * node_ptr );
    void mem_log_stage     ( struct nodeLinkClass::node * node_ptr );
    void mem_log_test_info ( struct nodeLinkClass::node * node_ptr );
@ -1464,11 +1478,14 @@ public:

    /***********************************************************/

+    /** Number of provisioned controllers */
+    int controllers = 0 ;
+
    /** Number of provisioned hosts (nodes) */
-    int hosts  ;
+    int hosts = 0 ;

    /* Set to True while waiting for UNLOCK_READY_FILE in simplex mode */
-    bool unlock_ready_wait ;
+    bool unlock_ready_wait = false ;

    /** Host has been deleted */
    bool host_deleted ;
@ -1517,6 +1534,9 @@ public:
    /** Return the number of inventoried hosts */
    int num_hosts ( void );

+    /** Return the number of inventoried controllers */
+    int num_controllers ( void );
+
    /** **********************************************************************
      *
      * Name       : nodeLinkClass::workQueue_enqueue
@ -1664,6 +1684,9 @@ public:
    /* Clear heartbeat failed flag for all interfaces */
    void manage_heartbeat_clear   ( string hostname, iface_enum iface );

+    /* Build a json dictionary containing the maintenance info for the specified info code */
+    string build_mtcInfo_dict ( mtcInfo_enum mtcInfo_code );
+
   /** Test and Debug Members and Variables */

    /** Print node info banner */
@ -1752,6 +1775,7 @@ public:
        #define MTC_FLAG__I_AM_LOCKED      (0x00000008)
    */
    void set_mtce_flags ( string hostname, int flags, int iface );
+    int  get_mtce_flags ( string & hostname );

    /** Updates the node's health code
      * Codes are found in nodeBase.h
@ -1789,6 +1813,7 @@ public:

    string get_bm_ip   ( string hostname );
    string get_bm_un   ( string hostname );
+    string get_bm_pw   ( string hostname );
    string get_bm_type ( string hostname );

    string get_hostname_from_bm_ip ( string bm_ip );
--- a/mtce/src/fsmon/scripts/fsmon.logrotate
+++ b/mtce/src/fsmon/scripts/fsmon.logrotate
@ -1,15 +1,19 @@
-#daily
-nodateext
+#
+# Copyright (c) 2015-2021 Wind River Systems, Inc.
+#
+# SPDX-License-Identifier: Apache-2.0

 /var/log/fsmond.log
 {
-    size 10M
+    create 0640 root root
    start 1
-    missingok
+    size 10M
    rotate 20
    compress
-    sharedscripts
+    notifempty
+    missingok
    postrotate
        systemctl reload syslog-ng > /dev/null 2>&1 || true
    endscript
+    delaycompress
 }
--- a/mtce/src/heartbeat/Makefile
+++ b/mtce/src/heartbeat/Makefile
@ -13,7 +13,7 @@ LDLIBS = -lstdc++ -ldaemon -lcommon -lthreadUtil -lpthread -lfmcommon -lalarm -l
 INCLUDES = -I. -I/usr/include/mtce-daemon -I/usr/include/mtce-common
 INCLUDES += -I../common -I../alarm -I../maintenance -I../public

-CCFLAGS = -g -O2 -Wall -Wextra -Werror
+CCFLAGS = -g -O2 -Wall -Wextra -Werror -std=c++11

 STATIC_ANALYSIS_TOOL = cppcheck
 STATIC_ANALYSIS_TOOL_EXISTS = $(shell [[ -e `which $(STATIC_ANALYSIS_TOOL)` ]] && echo 1 || echo 0)
--- a/mtce/src/heartbeat/hbsAgent.cpp
+++ b/mtce/src/heartbeat/hbsAgent.cpp
@ -1381,6 +1381,7 @@ int daemon_init ( string iface, string nodetype )
        hbs_ctrl.locked = true ;
    }

+
    daemon_init_fit();
    return (rc);
 }
@ -1521,6 +1522,7 @@ void hbs_sm_handler ( void )
 *              False if time delta is greater
 *
 ***************************************************************************/
+#define HUGE_NUMBER_B2B_SM_HEARTBEAT_MISSES (10000)
 bool manage_sm_heartbeat ( void )
 {
    struct timespec ts ;
@ -1532,8 +1534,9 @@ bool manage_sm_heartbeat ( void )
    if ( delta_in_ms > SM_HEARTBEAT_PULSE_PERIOD_MSECS )
    {
        sm_heartbeat_count = 0;
-        if (( ++sm_heartbeat_count_b2b_misses < 20 )||
-            (!( sm_heartbeat_count_b2b_misses % 100 )))
+        if ((( ++sm_heartbeat_count_b2b_misses < 20 ) ||
+            (!( sm_heartbeat_count_b2b_misses % 1000 ))) &&
+            ( sm_heartbeat_count_b2b_misses < HUGE_NUMBER_B2B_SM_HEARTBEAT_MISSES ))
        {
            wlog("SM Heartbeat missing since %ld.%03ld secs ago ; HBS Period Misses:%3d ; Running HB Count:%4d",
                  delta.secs, delta.msecs,
@ -1817,6 +1820,10 @@ void daemon_service_run ( void )
                        inv.name = hbsInv.my_hostname ;
                        inv.nodetype = CONTROLLER_TYPE ;
                        hbsInv.add_heartbeat_host ( inv );
+
+                        /* add this host to local inventory */
+                        hostname_inventory.push_front(hbsInv.my_hostname);
+                        ilog ("%s added to inventory (self)", hbsInv.my_hostname.c_str());
                    }

                    /* enable the base level signal handler latency monitor */
@ -1841,7 +1848,7 @@ void daemon_service_run ( void )
                    clock_gettime (CLOCK_MONOTONIC, &sm_heartbeat_timestamp_last );

                    /* no need for the heartbeat audit in a simplex system */
-                    if ( hbsInv.system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX )
+                    if ( hbsInv.system_type != SYSTEM_TYPE__AIO__SIMPLEX )
                    {
                        /* start the state audit */
                        /* run the first audit in 30 seconds */
@ -2056,7 +2063,7 @@ void daemon_service_run ( void )
                                          hbsInv.active_controller ? "" : "in" );

                                /* no need for the heartbeat audit in a simplex system */
-                                if ( hbsInv.system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX )
+                                if ( hbsInv.system_type != SYSTEM_TYPE__AIO__SIMPLEX )
                                {
                                    /* Due to activity state change we will dump
                                     * the heartbeat cluster state at now time
@ -2074,6 +2081,7 @@ void daemon_service_run ( void )
                            inv.nodetype = msg.parm[0];
                            hbsInv.add_heartbeat_host ( inv ) ;
                            hostname_inventory.push_back ( inv.name );
+                            hostname_inventory.unique(); // avoid duplicates
                            ilog ("%s added to heartbeat service (%d)\n",
                                      inv.name.c_str(),
                                      inv.nodetype);
@ -2119,7 +2127,7 @@ void daemon_service_run ( void )
                        {
                            if ( hostname != hbsInv.my_hostname )
                            {
-                                hbsInv.mon_host ( hostname, false, true );
+                                hbsInv.mon_host ( hostname, false, false );
                                hbs_cluster_del ( hostname );
                                ilog ("%s heartbeat service disabled by stop command",
                                          hostname.c_str());
@ -2366,6 +2374,7 @@ void daemon_service_run ( void )
                    arrival_histogram[iface] = "" ;
                    unexpected_pulse_list[iface] = "" ;

+
                    rc = hbs_pulse_request ( (iface_enum)iface, seq_num, ri, rri );
                    if ( rc != 0 )
                    {
@ -2523,7 +2532,9 @@ void daemon_service_run ( void )
                }
            }
            /* log cluster throttled */
-            if (( heartbeat_ok == false ) && ( !( sm_heartbeat_count_b2b_misses % 100 )))
+            if ((( heartbeat_ok == false ) &&
+                ( !( sm_heartbeat_count_b2b_misses % 1000 ))) &&
+                ( sm_heartbeat_count_b2b_misses < HUGE_NUMBER_B2B_SM_HEARTBEAT_MISSES ))
            {
                hbs_state_audit ( );
            }
--- a/mtce/src/heartbeat/hbsBase.h
+++ b/mtce/src/heartbeat/hbsBase.h
@ -326,7 +326,7 @@ void hbs_cluster_log  ( string & hostname, mtce_hbs_cluster_type & cluster, stri
 void hbs_sm_handler ( void );

 /* send the cluster vault to SM */
-void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason );
+int hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason );

 /* copy cluster data from src to dst */
 void hbs_cluster_copy ( mtce_hbs_cluster_type & src, mtce_hbs_cluster_type & dst );
@ -338,6 +338,10 @@ void hbs_cluster_dump ( mtce_hbs_cluster_type & vault );
 /* Heartbeat service state audit */
 void hbs_state_audit ( void );

+/* Send state change message to SM if there has been a
+ * state change in the last period */
+void hbs_cluster_change_notifier ( void );
+
 /**
 * @} hbs_base
 */
--- a/mtce/src/heartbeat/hbsCluster.cpp
+++ b/mtce/src/heartbeat/hbsCluster.cpp
@ -69,6 +69,8 @@ typedef struct

    msgClassSock * sm_socket_ptr ;

+    string cluster_change_reason ;
+
 } hbs_cluster_ctrl_type ;

 /* Cluster control structure construct allocation. */
@ -122,6 +124,8 @@ void hbs_cluster_init ( unsigned short period, msgClassSock * sm_socket_ptr )
    {
        ctrl.sm_socket_ptr = sm_socket_ptr ;
    }
+    ctrl.cluster_change_reason = "";
+
    ctrl.log_throttle = 0 ;
 }

@ -173,7 +177,30 @@ void hbs_cluster_nums ( unsigned short this_controller,

 void hbs_cluster_change ( string cluster_change_reason )
 {   /* Log the reason and queue it in ctrl.cluster_change_reason */
-    hbs_cluster_send( ctrl.sm_socket_ptr, 0, cluster_change_reason );
+    ilog ("reason: %s", cluster_change_reason.c_str());
+    if ( ctrl.cluster_change_reason.empty() )   /* first pending reason */
+        ctrl.cluster_change_reason = cluster_change_reason ;
+    else                                        /* already pending ; append as a comma separated list */
+        ctrl.cluster_change_reason.append("," + cluster_change_reason) ;
+}
+
+/****************************************************************************
+ *
+ * Name        : hbs_cluster_change_notifier
+ *
+ * Description : Send the cluster info to SM when one or more cluster
+ *               change reasons are pending in ctrl.cluster_change_reason.
+ *               The pending reason string is cleared only when
+ *               hbs_cluster_send returns PASS ; otherwise it is left in
+ *               place for the next call.
+ *
+ ***************************************************************************/
+void hbs_cluster_change_notifier ( void )
+{
+    /* nothing pending ; nothing to send */
+    if ( ctrl.cluster_change_reason.empty () )
+        return ;
+
+    int send_rc = hbs_cluster_send( ctrl.sm_socket_ptr, 0,
+                                    ctrl.cluster_change_reason );
+    if ( send_rc == PASS )
+        ctrl.cluster_change_reason.clear();
 }

 /****************************************************************************
@ -444,6 +471,7 @@ void hbs_cluster_update ( iface_enum iface,
            wlog_throttled ( ctrl.log_throttle, THROTTLE_COUNT,
                             "Unable to store history beyond %d ",
                             ctrl.cluster.histories );
+            hbs_cluster_change_notifier ();
            return ;
        }
        else
@ -544,6 +572,8 @@ void hbs_cluster_update ( iface_enum iface,
    else
        history_ptr->oldest_entry_index++ ;

+    hbs_cluster_change_notifier ();
+
    /* clear the log throttle if we are updating history ok. */
    ctrl.log_throttle = 0 ;
 }
@ -647,12 +677,12 @@ unsigned short hbs_cluster_unused_bytes ( void )
 *
 ***************************************************************************/

-void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason )
+int hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason )
 {
+    int rc = FAIL_SOCKET_SENDTO ;
    ctrl.cluster.reqid = (unsigned short)reqid ;
    if (( sm_client_sock ) && ( sm_client_sock->sock_ok() == true ))
    {
-        ilog ("cluster state notification Reason: %s", reason.c_str());
        int len = sizeof(mtce_hbs_cluster_type)-hbs_cluster_unused_bytes();
        int bytes = sm_client_sock->write((char*)&ctrl.cluster, len);
        if ( bytes <= 0 )
@ -660,12 +690,19 @@ void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason
             elog ("failed to send cluster vault to SM (bytes=%d) (%d:%s)\n",
                    bytes , errno, strerror(errno));
        }
-        hbs_cluster_dump ( ctrl.cluster );
+        else
+        {
+            /* limit the string length */
+            ilog ("reason: %s", reason.substr(0,80).c_str());
+            hbs_cluster_dump ( ctrl.cluster );
+            rc = PASS ;
+        }
    }
    else
    {
        wlog ("cannot send cluster info due to socket error");
    }
+    return(rc);
 }

 /****************************************************************************
@ -689,7 +726,7 @@ void hbs_history_save ( string hostname,
        {
            if ( hbs_cluster_cmp( sample, ctrl.cluster.history[h] ) )
            {
-                 hbs_cluster_change ("peer controller cluster event " +
+                 hbs_cluster_change ("peer cluster delta " +
                 hbs_cluster_network_name((mtce_hbs_network_enum)sample.network));
            }

--- a/mtce/src/heartbeat/hbsStubs.cpp
+++ b/mtce/src/heartbeat/hbsStubs.cpp
@ -279,8 +279,14 @@ void nodeLinkClass::mnfa_enter ( void )
 void nodeLinkClass::mnfa_exit  ( bool force )
 { force = force ; }

-int send_mtc_cmd ( string & hostname, int cmd, int interface )
-{ UNUSED(hostname); UNUSED(cmd); UNUSED(interface); return PASS ; }
+int send_mtc_cmd ( string & hostname, int cmd, int interface, string json_dict)
+{
+    UNUSED(hostname);
+    UNUSED(cmd);
+    UNUSED(interface);
+    UNUSED(json_dict);
+    return PASS ;   /* stub ; every parameter is ignored and PASS is always returned */
+}

 int nodeLinkClass::mtcInvApi_subf_states ( string hostname,
                                           string oper_subf,
--- a/mtce/src/hostw/scripts/hostw.logrotate
+++ b/mtce/src/hostw/scripts/hostw.logrotate
@ -1,16 +1,19 @@
-#daily
-nodateext
+#
+# Copyright (c) 2020-2021 Wind River Systems, Inc.
+#
+# SPDX-License-Identifier: Apache-2.0

 /var/log/hostwd.log
 {
-    nodateext
-    size 10M
+    create 0640 root root
    start 1
-    missingok
+    size 10M
    rotate 20
    compress
-    sharedscripts
+    notifempty
+    missingok
    postrotate
        systemctl reload syslog-ng > /dev/null 2>&1 || true
    endscript
+    delaycompress
 }
--- a/mtce/src/hwmon/hwmonSensor.cpp
+++ b/mtce/src/hwmon/hwmonSensor.cpp
@ -254,7 +254,7 @@ void hwmonGroup_init ( string & hostname , struct sensor_group_type * group_ptr
        group_ptr->actions_critical_choices.append(HWMON_ACTION_ALARM);

        /* Don't support reset and power cycle in AIO simplex mode */
-        if ( obj_ptr->system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX )
+        if ( obj_ptr->system_type != SYSTEM_TYPE__AIO__SIMPLEX )
        {
            group_ptr->actions_critical_choices.append(",");
            group_ptr->actions_critical_choices.append(HWMON_ACTION_RESET);
--- a/mtce/src/hwmon/hwmonThreads.cpp
+++ b/mtce/src/hwmon/hwmonThreads.cpp
@ -964,6 +964,10 @@ static int _parse_redfish_sensor_data( char * json_str_ptr, thread_info_type * i
                        {
                            strcpy(_sample_list[samples].status, "cr");
                        }
+                        else if  (!strcmp (health.data(), REDFISH_SEVERITY__NONRECOVERABLE ))
+                        {
+                            strcpy(_sample_list[samples].status, "nr");
+                        }
                        else
                        {
                            strcpy(_sample_list[samples].status, "na");
--- a/mtce/src/hwmon/hwmonThreads.h
+++ b/mtce/src/hwmon/hwmonThreads.h
@ -33,6 +33,7 @@
 #define REDFISH_SEVERITY__GOOD     "OK"
 #define REDFISH_SEVERITY__MAJOR    "Warning"
 #define REDFISH_SEVERITY__CRITICAL "Critical"
+#define REDFISH_SEVERITY__NONRECOVERABLE "NonRecoverable"

 #define BMC_SENSOR_DEFAULT_UNIT_TYPE_TEMP    "degrees"
 #define BMC_SENSOR_DEFAULT_UNIT_TYPE_VOLT    "Volts"
--- a/mtce/src/hwmon/scripts/hwmon.logrotate
+++ b/mtce/src/hwmon/scripts/hwmon.logrotate
@ -1,28 +1,21 @@
-#daily
-nodateext
-start 1
-missingok
-notifempty
-compress
-sharedscripts
-postrotate
-    systemctl reload syslog-ng > /dev/null 2>&1 || true
-endscript
+#
+# Copyright (c) 2020-2021 Wind River Systems, Inc.
+#
+# SPDX-License-Identifier: Apache-2.0

 /var/log/hwmond.log
-{
-    size 50M
-    rotate 5
-}
-
 /var/log/hwmond_event.log
-{
-    size 50M
-    rotate 5
-}
-
 /var/log/hwmond_api.log
 {
+    create 0640 root root
+    start 1
    size 50M
    rotate 5
+    compress
+    notifempty
+    missingok
+    # This stanza covers three log files ; run the postrotate script once
+    # for the whole group rather than once per rotated log.
+    sharedscripts
+    postrotate
+        systemctl reload syslog-ng > /dev/null 2>&1 || true
+    endscript
+    delaycompress
 }
--- a/mtce/src/lmon/scripts/lmon.logrotate
+++ b/mtce/src/lmon/scripts/lmon.logrotate
@ -1,16 +1,19 @@
-#daily
-nodateext
+#
+# Copyright (c) 2020-2021 Wind River Systems, Inc.
+#
+# SPDX-License-Identifier: Apache-2.0

 /var/log/lmond.log
 {
-    nodateext
-    size 10M
+    create 0640 root root
    start 1
-    missingok
+    size 10M
    rotate 20
    compress
-    sharedscripts
+    notifempty
+    missingok
    postrotate
        systemctl reload syslog-ng > /dev/null 2>&1 || true
    endscript
+    delaycompress
 }
--- a/mtce/src/maintenance/Makefile
+++ b/mtce/src/maintenance/Makefile
@ -54,7 +54,7 @@ BINS = mtcAgent mtcClient
 LDLIBS += -lstdc++ -ldaemon -lcommon -lthreadUtil -lbmcUtils -lfmcommon -lalarm -lpthread -lrt -levent -ljson-c -lamon -lcrypto -luuid
 INCLUDES = -I. -I/usr/include/mtce-daemon -I/usr/include/mtce-common
 INCLUDES += -I../common -I../alarm -I../heartbeat -I../hwmon -I../public
-CCFLAGS += -g -O2 -Wall -Wextra -Werror -Wno-missing-braces
+CCFLAGS += -g -O2 -Wall -Wextra -Werror -Wno-missing-braces -std=c++11

 STATIC_ANALYSIS_TOOL = cppcheck
 STATIC_ANALYSIS_TOOL_EXISTS = $(shell [[ -e `which $(STATIC_ANALYSIS_TOOL)` ]] && echo 1 || echo 0)
--- a/mtce/src/maintenance/mtcAlarm.cpp
+++ b/mtce/src/maintenance/mtcAlarm.cpp
@ -26,6 +26,7 @@ using namespace std;
 #include "daemon_common.h" /*                                           */

 #include "nodeBase.h"      /*                                           */
+#include "nodeClass.h"     /*                                           */
 #include "nodeTimers.h"    /*                                           */
 #include "nodeUtil.h"      /*                                           */
 #include "mtcAlarm.h"      /* for ... this module header                */
@ -379,8 +380,169 @@ void mtcAlarm_clear_all ( string hostname )
    }
 }

+/****************************************************************************
+ *
+ * Name       : mtcAlarm_audit
+ *
+ * Purpose    : Monitor and Auto-Correct maintenance alarms
+ *
+ * Description: Query locked state alarm (raw)
+ *              if successful
+ *                 - Query alarms
+ *                 - compare to running state
+ *                 - correct mismatches ; internal state takes precedence
+ *                 - log all alarm state changes
+ *
+ * Parameters : node_ptr - the host (node) whose maintenance alarms
+ *                         are audited.
+ *
+ * Returns    : nothing ; returns early, without auditing any alarm, if
+ *              the initial locked alarm query fails with a code other
+ *              than FM_ERR_OK or FM_ERR_ENTITY_NOT_FOUND.
+ *
+ ****************************************************************************/
+
+void nodeLinkClass::mtcAlarm_audit ( struct nodeLinkClass::node * node_ptr )
+{
+   /*
+    * Read locked state alarm directly to detect fm access failures.
+    * If successful further reads are done using a wrapper utility.
+    *
+    * Note: alarm_query is zeroed before the query ; if FM reports
+    *       FM_ERR_ENTITY_NOT_FOUND the severity compared below is
+    *       presumably still 0, assumed to be FM_ALARM_SEVERITY_CLEAR
+    *       - TODO confirm.
+    */
+    SFmAlarmDataT alarm_query  ;
+    AlarmFilter   alarm_filter ;
+    EFmErrorT     rc           ;
+
+    memset(&alarm_query, 0, sizeof(alarm_query));
+    memset(&alarm_filter, 0, sizeof(alarm_filter));
+    snprintf ( &alarm_filter.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s",
+               LOCK_ALARM_ID);
+    snprintf ( &alarm_filter.entity_instance_id[0], FM_MAX_BUFFER_LENGTH, "%s%s",
+                    ENTITY_PREFIX, node_ptr->hostname.data());
+    rc = fm_get_fault ( &alarm_filter, &alarm_query );
+    if (( rc != FM_ERR_OK ) && ( rc != FM_ERR_ENTITY_NOT_FOUND ))
+    {
+        wlog("%s alarm query failure ; code:%d",
+                 node_ptr->hostname.c_str(),
+                 rc );
+        return ;
+    }
+
+    /* With FM comms proven working lets check the other mtc alarms.
+     * active_alarms accumulates an "<identity>:<severity>" entry, entries
+     * separated by ", ", for each alarm this audit reports as active. */
+    string active_alarms = "";
+    for ( int i = 0 ; i < MAX_ALARMS ; i++ )
+    {
+        mtc_alarm_id_enum id = (mtc_alarm_id_enum)i ;
+        if ( id == MTC_ALARM_ID__LOCK )
+        {
+            /* Unexpected severity case ; host is locked so the lock alarm
+             * queried above is expected at warning severity. If it is not
+             * then set the internal state to warning and assert it. */
+            if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
+            {
+                if ( alarm_query.severity != FM_ALARM_SEVERITY_WARNING )
+                {
+                    node_ptr->alarms[id] = FM_ALARM_SEVERITY_WARNING ;
+
+                    wlog("%s %s alarm mismatch ; %s -> %s",
+                             node_ptr->hostname.c_str(),
+                             _getIdentity(id).c_str(),
+                             alarmUtil_getSev_str(alarm_query.severity).c_str(),
+                             alarmUtil_getSev_str(node_ptr->alarms[id]).c_str());
+
+                    mtcAlarm_warning ( node_ptr->hostname, MTC_ALARM_ID__LOCK );
+
+                }
+                if (!active_alarms.empty())
+                    active_alarms.append(", ");
+                active_alarms.append(_getIdentity(id) + ":");
+                active_alarms.append(alarmUtil_getSev_str(node_ptr->alarms[id]));
+            }
+            /* Unexpected assertion case ; host is unlocked but the lock
+             * alarm queried above is not clear. Set the internal state
+             * to clear and clear the alarm. */
+            else if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
+                     (  alarm_query.severity != FM_ALARM_SEVERITY_CLEAR ))
+            {
+                node_ptr->alarms[id] = FM_ALARM_SEVERITY_CLEAR ;
+
+                wlog("%s %s alarm mismatch ; %s -> %s",
+                         node_ptr->hostname.c_str(),
+                         _getIdentity(id).c_str(),
+                         alarmUtil_getSev_str(alarm_query.severity).c_str(),
+                         alarmUtil_getSev_str(node_ptr->alarms[id]).c_str());
+
+                mtcAlarm_clear ( node_ptr->hostname, id );
+            }
+        }
+        else if (( id == MTC_ALARM_ID__CONFIG ) ||
+                 ( id == MTC_ALARM_ID__ENABLE ) ||
+                 ( id == MTC_ALARM_ID__BM     ) ||
+                 ( id == MTC_ALARM_ID__CH_CONT) ||
+                 ( id == MTC_ALARM_ID__CH_COMP))
+        {
+            EFmAlarmSeverityT severity = mtcAlarm_state ( node_ptr->hostname, id);
+            if ( severity != node_ptr->alarms[id] )
+            {
+                ilog ("%s %s alarm mismatch ; %s -> %s",
+                          node_ptr->hostname.c_str(),
+                          _getIdentity(id).c_str(),
+                           alarmUtil_getSev_str(severity).c_str(),
+                           alarmUtil_getSev_str(node_ptr->alarms[id]).c_str());
+
+                if ( node_ptr->alarms[id] == FM_ALARM_SEVERITY_CLEAR )
+                {
+                    mtcAlarm_clear ( node_ptr->hostname, id );
+                }
+                else
+                {
+                    mtcAlarm_raise ( node_ptr->hostname, id, node_ptr->alarms[id] );
+                }
+            }
+            if ( node_ptr->alarms[id] != FM_ALARM_SEVERITY_CLEAR )
+            {
+                if (!active_alarms.empty())
+                    active_alarms.append(", ");
+                active_alarms.append(_getIdentity(id) + ":");
+                active_alarms.append(alarmUtil_getSev_str(node_ptr->alarms[id]));
+            }
+        }
+        /* else don't care about other alarm ids ; logs events etc */
+    }
+
+    /* manage logging of active alarms ; the list built by this audit is
+     * compared to the one saved in node_ptr->active_alarms so that a log
+     * is only produced when the list changes */
+    if ( !active_alarms.empty() )
+    {
+        if ( node_ptr->active_alarms != active_alarms )
+        {
+            ilog ("%s active alarms: %s",
+                      node_ptr->hostname.c_str(),
+                      active_alarms.c_str());
+
+            node_ptr->active_alarms = active_alarms ;
+        }
+        /* else
+         *    do nothing because there are active alarms
+         *    that have not changed since the last audit.
+         */
+    }
+    else if ( ! node_ptr->active_alarms.empty() )
+    {
+        /* clear active alarm list since there 'were' active alarms
+         * but there are no longer active alarms */
+        node_ptr->active_alarms.clear();
+        ilog ("%s no active alarms", node_ptr->hostname.c_str());
+    }
+    /* else
+     *    no active alarms ; don't log */
+}
+
 /*************************   A L A R M I N G   **************************/

+/* Raise the specified maintenance alarm severity
+ *
+ * Dispatches to the severity specific assert utility for the
+ * specified host and alarm id.
+ *
+ *   hostname - host the alarm is raised against
+ *   id       - maintenance alarm id
+ *   severity - warning, minor, major or critical
+ *
+ * Returns the result of the severity specific utility or
+ * FAIL_BAD_PARM for any other severity ; clear included.
+ * Use mtcAlarm_clear to clear an alarm.
+ */
+int mtcAlarm_raise ( string hostname, mtc_alarm_id_enum id, EFmAlarmSeverityT severity )
+{
+    switch ( severity )
+    {
+        /* warning is a raisable severity too ; without this case a
+         * warning level request is rejected as a bad parameter */
+        case FM_ALARM_SEVERITY_WARNING:
+            return (mtcAlarm_warning(hostname,id));
+        case FM_ALARM_SEVERITY_MINOR:
+            return (mtcAlarm_minor(hostname,id));
+        case FM_ALARM_SEVERITY_MAJOR:
+            return (mtcAlarm_major(hostname,id));
+        case FM_ALARM_SEVERITY_CRITICAL:
+            return (mtcAlarm_critical(hostname,id));
+        default:
+            return (FAIL_BAD_PARM);
+    }
+}
+
 /* Clear the specified hosts's maintenance alarm */
 int mtcAlarm_clear ( string hostname, mtc_alarm_id_enum id )
 {
--- a/mtce/src/maintenance/mtcAlarm.h
+++ b/mtce/src/maintenance/mtcAlarm.h
@ -95,6 +95,9 @@ string mtcAlarm_getId_str ( mtc_alarm_id_enum id );
 /** Clear the specified maintenance alarm for specific host */
 int  mtcAlarm_clear    ( string hostname, mtc_alarm_id_enum id );

+/** Raise specified severity level alarm for the specified host */
+int mtcAlarm_raise ( string hostname, mtc_alarm_id_enum id, EFmAlarmSeverityT severity );
+
 /** Assert a specified mtce alarm against the specified host with a WARNING severity level */
 int  mtcAlarm_warning  ( string hostname, mtc_alarm_id_enum id );

--- a/mtce/src/maintenance/mtcBmcUtil.cpp
+++ b/mtce/src/maintenance/mtcBmcUtil.cpp
@ -39,6 +39,26 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr,
 {
    int rc = PASS ;

+    /* handle 'kill of in-progress' thread or 'done but not consumed' thread */
+    if ( ! thread_idle ( node_ptr->bmc_thread_ctrl ))
+    {
+        if ( ! thread_done ( node_ptr->bmc_thread_ctrl ))
+        {
+            thread_kill ( node_ptr->bmc_thread_ctrl,
+                          node_ptr->bmc_thread_info );
+            return (RETRY);
+        }
+        else
+        {
+             mtcTimer_reset ( node_ptr->bmc_thread_ctrl.timer );
+             if ( thread_done_consume ( node_ptr->bmc_thread_ctrl,
+                                        node_ptr->bmc_thread_info ) != PASS )
+             {
+                 return (RETRY);
+             }
+        }
+    }
+
    node_ptr->bmc_thread_info.command = command ;

    /* Update / Setup the BMC access credentials */
@ -437,6 +457,13 @@ bmc_command_recv_cleanup:

    if ( rc != RETRY )
    {
+        ilog ("%s %s recv '%s' command (%s) (rc:%d)",
+                  node_ptr->hostname.c_str(),
+                  node_ptr->bmc_thread_ctrl.name.c_str(),
+                  bmcUtil_getCmd_str(node_ptr->bmc_thread_info.command).c_str(),
+                  bmcUtil_getProtocol_str(node_ptr->bmc_protocol).c_str(),
+                  rc);
+
        node_ptr->bmc_thread_ctrl.done = true ;
        node_ptr->bmc_thread_ctrl.retries = 0 ;
        node_ptr->bmc_thread_ctrl.id = 0 ;
--- a/mtce/src/maintenance/mtcCompMsg.cpp
+++ b/mtce/src/maintenance/mtcCompMsg.cpp
@ -20,7 +20,7 @@

 #include <stdio.h>
 #include <string.h>
-#include <sys/un.h>      /* for ... unix domain sockets     */
+#include <sys/un.h>    /* for ... unix domain sockets     */
 #include <arpa/inet.h>
 #include <sys/socket.h>
 #include <net/if.h>
@ -29,8 +29,8 @@
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <errno.h>
-#include <list>        /* for the list of conf file names */
-
+#include <list>        /* for ... list of conf file names */
+#include <unistd.h>    /* for ... sync                    */

 using namespace std;

@ -70,11 +70,15 @@ void stop_pmon( void )
 {
    /* max pipe command response length */
    #define PIPE_COMMAND_RESPON_LEN (100)
+
+    ilog("Stopping collectd.");
+    int rc = system("/usr/local/sbin/pmon-stop collectd");
+    sleep (2);
    ilog("Stopping pmon to prevent process recovery during shutdown");
    for ( int retry = 0 ; retry < 5 ; retry++ )
    {
        char pipe_cmd_output [PIPE_COMMAND_RESPON_LEN] ;
-        int rc = system("/usr/bin/systemctl stop pmon");
+        rc = system("/usr/bin/systemctl stop pmon");
        sleep(2);

        /* confirm pmon is no longer active */
@ -204,6 +208,24 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface )
            mlog1 ("mtcAlive request received (%s network)\n", interface_name.c_str());
            return ( send_mtcAlive_msg ( sock_ptr, get_who_i_am(), interface ));
        }
+        else if ( msg.cmd == MTC_MSG_INFO )
+        {
+            mlog1("mtc 'info' message received (%s network)\n", interface_name.c_str());
+            load_mtcInfo_msg ( msg );
+            return ( PASS ); /* no ack for this message */
+        }
+        else if ( msg.cmd == MTC_CMD_SYNC )
+        {
+            ilog ("mtc '%s' message received (%s network)\n",
+                   get_mtcNodeCommand_str(msg.cmd),
+                   interface_name.c_str());
+
+            ilog ("Sync Start");
+            sync ();
+            ilog ("Sync Done");
+
+            return ( PASS ); /* no ack for this message */
+        }
        else if ( msg.cmd == MTC_MSG_LOCKED )
        {
            /* Only recreate the file if its not already present */
@ -603,7 +625,7 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface )
 }

 /** Send an event to the mtcAgent **/
-int mtce_send_event ( mtc_socket_type * sock_ptr, int cmd , const char * mtce_name_ptr )
+int mtce_send_event ( mtc_socket_type * sock_ptr, unsigned int cmd , const char * mtce_name_ptr )
 {
    mtc_message_type event ;

@ -619,6 +641,24 @@ int mtce_send_event ( mtc_socket_type * sock_ptr, int cmd , const char * mtce_na
        /* We don't use the buffer for mtce events to remove it from the size */
        bytes = ((sizeof(mtc_message_type))-(BUF_SIZE));
    }
+    else if ( cmd == MTC_EVENT_MONITOR_READY )
+    {
+        string event_info = "{\"" ;
+        event_info.append(MTC_JSON_INV_NAME);
+        event_info.append("\":\"");
+        event_info.append(get_hostname());
+        event_info.append("\",\"");
+        event_info.append(MTC_JSON_SERVICE);
+        event_info.append("\":\"");
+        event_info.append(MTC_SERVICE_MTCCLIENT_NAME );
+        event_info.append("\"}");
+
+        size_t len =  event_info.length()+1 ;
+        snprintf ( &event.hdr[0], MSG_HEADER_SIZE, "%s", get_mtce_event_header());
+        snprintf ( &event.buf[0], len, "%s", event_info.data());
+        bytes = ((sizeof(mtc_message_type))-(BUF_SIZE-len));
+        ilog ("%s %s ready", get_hostname().c_str(), MTC_SERVICE_MTCCLIENT_NAME);
+    }
    else if (( cmd == MTC_EVENT_AVS_CLEAR    ) ||
             ( cmd == MTC_EVENT_AVS_MAJOR    ) ||
             ( cmd == MTC_EVENT_AVS_CRITICAL ))
@ -666,7 +706,7 @@ int mtce_send_event ( mtc_socket_type * sock_ptr, int cmd , const char * mtce_na
    {
        if ( bytes == 0 )
        {
-           slog ("message send failed ; message size=0 for cmd:%d is 0\n", event.cmd );
+           slog ("message send failed ; message size=0 for cmd:0x%x is 0\n", event.cmd );
           rc = FAIL_NO_DATA ;
        }
        else if ((rc = sock_ptr->mtc_client_tx_socket->write((char*)&event.hdr[0], bytes))!= bytes )
@ -912,15 +952,18 @@ int send_mtcAlive_msg ( mtc_socket_type * sock_ptr, string identity, int interfa
        }

        /* Send to controller-1 cluster address */
-        if (( sock_ptr->mtc_client_tx_socket_c1_clstr ) &&
-            ( sock_ptr->mtc_client_tx_socket_c1_clstr->sock_ok() == true ))
+        if ( get_ctrl_ptr()->system_type != SYSTEM_TYPE__AIO__SIMPLEX )
        {
-            print_mtc_message ( CONTROLLER_1, MTC_CMD_TX, msg, get_iface_name_str(CLSTR_INTERFACE), false );
-            sock_ptr->mtc_client_tx_socket_c1_clstr->write((char*)&msg.hdr[0], bytes ) ;
-        }
-        else
-        {
-            elog("mtc_client_tx_socket_c1_clstr not ok");
+            if (( sock_ptr->mtc_client_tx_socket_c1_clstr ) &&
+                ( sock_ptr->mtc_client_tx_socket_c1_clstr->sock_ok() == true ))
+            {
+                print_mtc_message ( CONTROLLER_1, MTC_CMD_TX, msg, get_iface_name_str(CLSTR_INTERFACE), false );
+                sock_ptr->mtc_client_tx_socket_c1_clstr->write((char*)&msg.hdr[0], bytes ) ;
+            }
+            else
+            {
+                elog("mtc_client_tx_socket_c1_clstr not ok");
+            }
        }
    }
    else
@ -933,32 +976,59 @@ int send_mtcAlive_msg ( mtc_socket_type * sock_ptr, string identity, int interfa
    return (PASS) ;
 }

-/* Accelerated Virtual Switch 'events' socket
- * - for receiving data port state change event
- * Event strings are
-  *
-  * {"type":"port-state", "severity":"critical|major|clear"}
-  *
-  * type:port-state - the provider network data port status has changed to the supplied fault severity
-  *
-  * severity:
-  *   critical - port has failed and is not part of an aggregate or is the last port in an aggregate (degrade, disable services)
-  *   major    - port has failed and is part of an aggregate with other inservice-ports (degrade only)
-  *   clear    - port has recovered from a failed state and is operational (clear degrade, enable services)
-  *
-  * NOTE: The port status can transition from any of the above states to any other state.
-  *
-  * The neutron agent monitors the vswitch ports at a 2 second interval.
-  * If a port changes link state during the polling period, it will
-  * raise/clear the alarm, but now also calculates the impact of that port
-  * failure on the provider network data interface.
-  *
-  * The overall aggregated state across all provider network interfaces will
-  * be reported to maintenance when ports enter a link down or up state.
-  * The agent will also periodically send the current provider network port
-  * status to maintenance every 30 seconds.
-  *
-  */
+/* Send a maintenance command directly to the mtcClient at 'address':'port'.
+ *
+ * Only MTC_CMD_SYNC is handled ; any other command is logged and
+ * rejected with FAIL_BAD_CASE before anything is transmitted.
+ *
+ * Returns PASS when the socket write reports a positive result,
+ *         FAIL_SOCKET_SENDTO when the write reports zero or less,
+ *         FAIL_BAD_STATE when the tx socket is missing or not ok. */
+int send_mtcClient_cmd ( mtc_socket_type * sock_ptr, int cmd, string hostname, string address, int port)
+{
+    /* Build a zeroed message that carries the command request header
+     * and the requested command code. */
+    mtc_message_type msg ;
+    MEMSET_ZERO (msg);
+    snprintf ( &msg.hdr[0], MSG_HEADER_SIZE, "%s", get_cmd_req_msg_header());
+    msg.cmd = cmd ;
+
+    /* 'sync' is the only command this sender knows how to issue */
+    if ( cmd != MTC_CMD_SYNC )
+    {
+        slog("Unsupported command ; %s:%d", get_mtcNodeCommand_str(cmd), cmd );
+        return (FAIL_BAD_CASE);
+    }
+
+    ilog ("Sending '%s' command to %s:%s:%d",
+           get_mtcNodeCommand_str(cmd),
+           hostname.c_str(),
+           address.c_str(), port);
+
+    msg.num = 0 ;
+
+    /* the message buffer is not used by this command so it is
+     * left out of the transmit length */
+    int tx_len = ((sizeof(mtc_message_type))-(BUF_SIZE));
+
+    /* nothing can be sent without a healthy tx socket */
+    if (( sock_ptr->mtc_client_tx_socket == NULL ) ||
+        ( sock_ptr->mtc_client_tx_socket->sock_ok() != true ))
+    {
+        elog("mtc_client_tx_socket not ok");
+        return (FAIL_BAD_STATE);
+    }
+
+    /* transmit to the caller supplied address and port */
+    print_mtc_message ( hostname, MTC_CMD_TX, msg, get_iface_name_str(MGMNT_INTERFACE), false );
+    int sent = sock_ptr->mtc_client_tx_socket->write((char*)&msg.hdr[0], tx_len, address.data(), port ) ;
+    if ( sent > 0 )
+    {
+        return (PASS);
+    }
+
+    elog("failed to send command to mtcClient (%d) (%d:%s)", sent, errno, strerror(errno));
+    return (FAIL_SOCKET_SENDTO);
+}

 int mtcCompMsg_testhead ( void )
 {
--- a/mtce/src/maintenance/mtcCtrlMsg.cpp
+++ b/mtce/src/maintenance/mtcCtrlMsg.cpp
@ -443,6 +443,34 @@ int mtc_service_inbox ( nodeLinkClass   *  obj_ptr,
                    obj_ptr->declare_service_ready ( hostname, MTC_SERVICE_HEARTBEAT );
                    return (PASS);
                }
+                else if ( service == MTC_SERVICE_MTCCLIENT_NAME )
+                {
+                    ilog ("%s %s ready", hostname.c_str(), MTC_SERVICE_MTCCLIENT_NAME);
+
+                    /* if this ready event is from the mtcClient of a
+                     * controller that has valid bmc access info then
+                     * build the 'peer controller kill' mtcInfo and
+                     * send it to that mtcClient */
+                    if ( obj_ptr->get_nodetype ( hostname ) & CONTROLLER_TYPE )
+                    {
+                        string bm_pw = obj_ptr->get_bm_pw ( hostname ) ;
+                        if ( !bm_pw.empty() && ( bm_pw != NONE ))
+                        {
+                            string bm_un = obj_ptr->get_bm_un ( hostname ) ;
+                            string bm_ip = obj_ptr->get_bm_ip ( hostname ) ;
+                            if (( hostUtil_is_valid_username  ( bm_un )) &&
+                                ( hostUtil_is_valid_ip_addr   ( bm_ip )))
+                            {
+                                send_mtc_cmd ( hostname,
+                                               MTC_MSG_INFO,
+                                               MGMNT_INTERFACE,
+                                               obj_ptr->build_mtcInfo_dict (
+                                MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO));
+                            }
+                        }
+                    }
+                    return (PASS);
+                }
                if (  service == MTC_SERVICE_HWMOND_NAME )
                {
                    std::list<string>::iterator temp ;
@ -578,11 +606,12 @@ int mtc_service_inbox ( nodeLinkClass   *  obj_ptr,
    return (rc);
 }

-int send_mtc_cmd ( string & hostname, int cmd , int interface )
+int send_mtc_cmd ( string & hostname, int cmd , int interface, string json_dict )
 {
    int rc = FAIL ;
    bool force = false ;
    mtc_message_type mtc_cmd ;
+    string data = "" ;
    mtc_socket_type * sock_ptr = get_sockPtr ();
    memset (&mtc_cmd,0,sizeof(mtc_message_type));

@ -592,6 +621,16 @@ int send_mtc_cmd ( string & hostname, int cmd , int interface )

    switch ( cmd )
    {
+        case MTC_MSG_INFO:
+        {
+            snprintf ( &mtc_cmd.hdr[0], MSG_HEADER_SIZE, "%s" , get_cmd_req_msg_header() );
+            mtc_cmd.cmd = cmd ;
+            mtc_cmd.num = 0 ;
+            data = "{\"mtcInfo\":" + json_dict + "}";
+            ilog("%s mtc info update", hostname.c_str());
+            rc = PASS ;
+            break ;
+        }
        case MTC_REQ_MTCALIVE:
        {
            snprintf ( &mtc_cmd.hdr[0], MSG_HEADER_SIZE, "%s" , get_cmd_req_msg_header() );
@ -689,11 +728,20 @@ int send_mtc_cmd ( string & hostname, int cmd , int interface )
         * Note: the minus 1 is to overwrite the null */
        snprintf ( &mtc_cmd.hdr[MSG_HEADER_SIZE-1], MSG_HEADER_SIZE, "%s", obj_ptr->get_hostIfaceMac(hostname, MGMNT_IFACE).data());

-        string data = "{\"address\":\"";
-        data.append(obj_ptr->my_float_ip) ;
-        data.append("\",\"interface\":\"");
-        data.append(get_iface_name_str(interface));
-        data.append("\"}");
+        /* If data is empty then at least add where the message came from */
+        if ( data.empty() )
+        {
+            data = "{\"address\":\"";
+            data.append(obj_ptr->my_float_ip) ;
+            data.append("\",\"interface\":\"");
+            data.append(get_iface_name_str(interface));
+            data.append("\"}");
+        }
+        else
+        {
+            ; /* data is already pre loaded by the command case above */
+        }
+        /* copy data into message buffer */
        snprintf ( &mtc_cmd.buf[0], data.length()+1, "%s", data.data());
        bytes = (sizeof(mtc_message_type)-(BUF_SIZE-(data.length()+1)));

@ -1176,7 +1224,7 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr )
    else if ( msg.cmd == MTC_EVENT_HEARTBEAT_READY )
    {
        /* no heartbeating in simplex mode */
-        if ( obj_ptr->system_type == SYSTEM_TYPE__CPE_MODE__SIMPLEX )
+        if ( obj_ptr->system_type == SYSTEM_TYPE__AIO__SIMPLEX )
        {
            return (PASS);
        }
@ -1214,13 +1262,68 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr )
            {
                elog ("%s Failed to send inventory to heartbeat service\n", hostname.c_str());
            }
-            /* Send the start event to the heartbeat service for all enabled hosts */
+            /* Consider sending the 'start' request to the heartbeat service
+             * for all enabled hosts. */
            if (( obj_ptr->get_adminState  ( hostname ) == MTC_ADMIN_STATE__UNLOCKED ) &&
                ( obj_ptr->get_operState   ( hostname ) == MTC_OPER_STATE__ENABLED ) &&
                ((obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__AVAILABLE ) ||
                 (obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__DEGRADED )))
            {
-                send_hbs_command ( hostname, MTC_CMD_START_HOST, controller );
+                /* However, bypass sending heartbeat 'start' for nodes that
+                 * are not ready to heartbeat; enabling, configuring, testing.
+                 * Such cases are if a host is:
+                 *
+                 * 1. running the add_handler or
+                 * 2. running the enable_handler or
+                 * 3. running the enable_subf_handler or
+                 * 4. not configured or
+                 * 5. not tested (goenabled not complete)
+                 *
+                 */
+                mtc_nodeAdminAction_enum current_action =
+                    obj_ptr->get_adminAction (hostname);
+                if (( current_action != MTC_ADMIN_ACTION__ADD ) &&
+                    ( current_action != MTC_ADMIN_ACTION__ENABLE ) &&
+                    ( current_action != MTC_ADMIN_ACTION__ENABLE_SUBF ))
+                {
+                    int mtce_flags = obj_ptr->get_mtce_flags(hostname);
+                    if (( mtce_flags & MTC_FLAG__I_AM_CONFIGURED ) &&
+                        ( mtce_flags & MTC_FLAG__I_AM_HEALTHY  ) &&
+                        ( mtce_flags & MTC_FLAG__MAIN_GOENABLED ))
+                    {
+                        if (( obj_ptr->system_type != SYSTEM_TYPE__NORMAL ) &&
+                            ( obj_ptr->get_nodetype ( hostname ) & CONTROLLER_TYPE ))
+                        {
+                            /* If it's an AIO then its worker subfunction
+                             * needs to have been configured and tested. */
+                            if (( mtce_flags & MTC_FLAG__SUBF_CONFIGURED ) &&
+                                ( mtce_flags & MTC_FLAG__SUBF_GOENABLED ))
+                            {
+                                ilog("%s heartbeat start (AIO controller)",
+                                         hostname.c_str());
+                                send_hbs_command ( hostname, MTC_CMD_START_HOST, controller );
+                            }
+                            else
+                            {
+                                wlog ("%s not heartbeat ready (subf) (oob:%x)",
+                                          hostname.c_str(),
+                                          mtce_flags);
+                            }
+                        }
+                        else
+                        {
+                            ilog("%s heartbeat start (from ready event)",
+                                     hostname.c_str());
+                            send_hbs_command ( hostname, MTC_CMD_START_HOST, controller );
+                        }
+                    }
+                    else
+                    {
+                        wlog ("%s not heartbeat ready (main) (oob:%x)",
+                                  hostname.c_str(),
+                                  mtce_flags);
+                    }
+                }
            }
        }
        ilog ("%s %s inventory push ... done",
--- a/mtce/src/maintenance/mtcInvApi.cpp
+++ b/mtce/src/maintenance/mtcInvApi.cpp
@ -974,7 +974,7 @@ int nodeLinkClass::mtcInvApi_update_states_now ( struct nodeLinkClass::node * no
    else
        avail = " " ;

-    if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
+    if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
    {
        if ( ! oper_subf.empty() )
        {
@ -1016,7 +1016,7 @@ int nodeLinkClass::mtcInvApi_update_states_now ( struct nodeLinkClass::node * no
        this->sysinvEvent.payload.erase(len-1,1);
        this->sysinvEvent.payload.append ( "]");

-        if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
+        if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
        {
            ilog ("%s %s-%s-%s %s-%s\n",
                      node_ptr->hostname.c_str(),
--- a/mtce/src/maintenance/mtcNodeComp.cpp
+++ b/mtce/src/maintenance/mtcNodeComp.cpp
@ -43,9 +43,9 @@
 #include <signal.h>
 #include <fcntl.h>
 #include <errno.h>
-//#include <syslog.h>    /* for ... syslog                  */
 #include <sys/stat.h>
 #include <list>
+#include <json-c/json.h> /* for ... json_tokener_parse                    */

 using namespace std;

@ -56,6 +56,10 @@ using namespace std;
 #include "nodeBase.h"       /* for ... Common Definitions                 */
 #include "nodeTimers.h"     /* for ... Timer Service                      */
 #include "nodeUtil.h"       /* for ... Common Utilities                   */
+#include "hostUtil.h"       /* for ... hostUtil_is_valid_...              */
+#include "jsonUtil.h"       /* for ... jsonUtil_get_key_value_string      */
+#include "bmcUtil.h"        /* for ... bmcUtil_accessInfo_type            */
+#include "ipmiUtil.h"       /* for ... ipmiUtil_reset_host_now            */
 #include "nodeMacro.h"      /* for ... CREATE_NONBLOCK_INET_UDP_RX_SOCKET */
 #include "mtcNodeMsg.h"     /* for ... common maintenance messaging       */
 #include "mtcNodeComp.h"    /* for ... this module header                 */
@ -96,7 +100,7 @@ string get_hostname ( void )
 * Daemon Configuration Structure - The allocated struct
 * @see daemon_common.h for daemon_config_type struct format.
 */
-static daemon_config_type mtc_config ; 
+static daemon_config_type mtc_config ;
 daemon_config_type * daemon_get_cfg_ptr () { return &mtc_config ; }

 /**
@ -106,6 +110,8 @@ daemon_config_type * daemon_get_cfg_ptr () { return &mtc_config ; }
 static mtc_socket_type mtc_sock   ;
 static mtc_socket_type * sock_ptr ;

+static bmcUtil_accessInfo_type peer_controller = {"none","none","none","none","none"};
+static bmcUtil_accessInfo_type this_controller = {"none","none","none","none","none"};

 int run_goenabled_scripts ( string type );

@ -138,6 +144,16 @@ void timer_handler ( int sig, siginfo_t *si, void *uc)
        mtcTimer_stop_int_safe ( ctrl.hostservices.timer );
        ctrl.hostservices.timer.ring = true ;
    }
+    else if ( *tid_ptr == ctrl.peer_ctrlr_reset.sync_timer.tid )
+    {
+        ctrl.peer_ctrlr_reset.sync_timer.ring = true ;
+        mtcTimer_stop_int_safe ( ctrl.peer_ctrlr_reset.sync_timer );
+    }
+    else if ( *tid_ptr == ctrl.peer_ctrlr_reset.audit_timer.tid )
+    {
+        /* use auto restart */
+        ctrl.peer_ctrlr_reset.audit_timer.ring = true ;
+    }
    else
    {
        mtcTimer_stop_tid_int_safe ( tid_ptr );
@ -207,9 +223,8 @@ void daemon_exit ( void )
    exit (0) ;
 }

-                                 
 /* Startup config read */
-static int mtc_config_handler ( void * user, 
+static int mtc_config_handler ( void * user,
                          const char * section,
                          const char * name,
                          const char * value)
@ -236,11 +251,14 @@ static int mtc_config_handler ( void * user,
        config_ptr->failsafe_shutdown_delay = atoi(value);
        ilog ("Shutdown TO : %d secs\n", config_ptr->failsafe_shutdown_delay );
    }
-    else
+    if (( ctrl.nodetype & CONTROLLER_TYPE ) &&
+        (MATCH("client", "sync_b4_peer_ctrlr_reset")))
    {
-        return (PASS);
+        ctrl.peer_ctrlr_reset.sync = atoi(value);
+        ilog("SyncB4 Reset: %s",
+              ctrl.peer_ctrlr_reset.sync ? "Yes" : "No" );
    }
-    return (FAIL);
+    return (PASS);
 }

 /* Read the mtc.ini file and load control    */
@ -431,7 +449,7 @@ void setup_clstr_tx_sockets ( void )
            mtc_sock.mtc_client_tx_socket_c0_clstr->sock_ok(false);
        }
    }
-    if ( ctrl.system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX )
+    if ( ctrl.system_type != SYSTEM_TYPE__AIO__SIMPLEX )
    {
        dlog ("setup of %s TX\n", CONTROLLER_1_CLUSTER_HOST);

@ -946,6 +964,65 @@ void _manage_goenabled_tests ( void )
    _scripts_cleanup (ctrl.active_script_set) ;
 }

+/* Reset the peer controller through its BMC and, only when the reset
+ * command succeeds, remove the RESET_PEER_NOW flag file.
+ *
+ * Returns PASS on success,
+ *         FAIL when the stored peer BMC credentials are not valid,
+ *         FAIL_STRING_EMPTY when no output filename could be created,
+ *         FAIL_OPERATION when the reset command itself failed. */
+int issue_reset_and_cleanup ( void )
+{
+    const char peer_ctrlr [] = "Peer controller reset" ;
+
+    ilog("SM %s request", peer_ctrlr );
+
+    /* all three of bmc ip, username and password must look valid */
+    bool creds_ok = (( hostUtil_is_valid_ip_addr  ( peer_controller.bm_ip )) &&
+                     ( hostUtil_is_valid_username ( peer_controller.bm_un )) &&
+                     ( hostUtil_is_valid_pw       ( peer_controller.bm_pw )));
+    if ( !creds_ok )
+    {
+        elog("%s cannot reset peer BMC host at %s due to invalid credentials",
+                 ctrl.hostname, peer_controller.bm_ip.c_str());
+        return (FAIL);
+    }
+
+    /* the reset command output goes to this file ; it is left in
+     * place once the operation is done */
+    string output_filename = bmcUtil_create_data_fn ( ctrl.hostname,
+                             BMC_RESET_CMD_FILE_SUFFIX,
+                             BMC_PROTOCOL__IPMITOOL );
+    if ( output_filename.empty() )
+    {
+        elog("%s ; failed to create output filename", peer_ctrlr);
+        return (FAIL_STRING_EMPTY);
+    }
+
+    if ( ipmiUtil_reset_host_now ( ctrl.hostname,
+                                   peer_controller,
+                                   output_filename ) != PASS )
+    {
+        /* the flag file is deliberately left in place on failure */
+        elog("%s failed", peer_ctrlr);
+        return (FAIL_OPERATION);
+    }
+
+    string result = daemon_get_file_str ( output_filename.data() );
+    ilog("%s succeeded", peer_ctrlr);
+
+    /* unexpected command output is logged but does not fail the reset */
+    if ( result.compare( IPMITOOL_POWER_RESET_RESP ) != 0 )
+    {
+        dlog("... but reset command output was unexpected ; %s",
+                  result.c_str());
+    }
+
+    /* allow the host some time to reset before the flag file
+     * removal tells SM that the reset is done */
+    sleep (2) ;
+
+    dlog("removing %s", RESET_PEER_NOW );
+    daemon_remove_file ( RESET_PEER_NOW );
+
+    return (PASS);
+}
+

 /* The main service loop */
 int daemon_init ( string iface, string nodetype_str )
@ -963,6 +1040,7 @@ int daemon_init ( string iface, string nodetype_str )
    ctrl.subfunction = 0 ;
    ctrl.system_type = daemon_system_type ();
    ctrl.clstr_iface_provisioned = false ;
+    ctrl.peer_ctrlr_reset.sync = false ;

    /* convert node type to integer */
    ctrl.nodetype = get_host_function_mask ( nodetype_str ) ;
@ -1018,6 +1096,13 @@ int daemon_init ( string iface, string nodetype_str )
    mtcTimer_init ( ctrl.goenabled.timer, &ctrl.hostname[0], "goenable timer" );
    mtcTimer_init ( ctrl.hostservices.timer, &ctrl.hostname[0], "host services timer" );

+    /* initialize peer controller reset feature */
+    mtcTimer_init ( ctrl.peer_ctrlr_reset.audit_timer, &ctrl.hostname[0], "peer ctrlr reset audit timer" ),
+    mtcTimer_init ( ctrl.peer_ctrlr_reset.sync_timer, &ctrl.hostname[0], "peer ctrlr reset sync timer" ),
+    ctrl.peer_ctrlr_reset.sync_timer.ring = false ;
+    ctrl.peer_ctrlr_reset.audit_timer.ring = false ;
+    ctrl.peer_ctrlr_reset.audit_period = PEER_CTRLR_AUDIT_PERIOD ;
+
    /* initialize the script group control structures */
    script_ctrl_init ( &ctrl.goenabled    );
    script_ctrl_init ( &ctrl.hostservices );
@ -1073,6 +1158,17 @@ void daemon_service_run ( void )
    /* Send first mtcAlive ASAP */
    mtcTimer_start ( ctrl.timer, timer_handler, 1 );

+    /* Monitor for peer controller reset requests when this
+     * daemon runs on a controller */
+    if ( ctrl.nodetype & CONTROLLER_TYPE )
+    {
+        mtcTimer_start ( ctrl.peer_ctrlr_reset.audit_timer,
+                         timer_handler,
+                         ctrl.peer_ctrlr_reset.audit_period );
+    }
+
+    mtce_send_event ( sock_ptr, MTC_EVENT_MONITOR_READY, NULL );
+
    /* lets go select so that the sock does not go crazy */
    dlog ("%s running main loop with %d msecs socket timeout\n",
                       &ctrl.hostname[0], (SOCKET_WAIT/1000) );
@ -1305,8 +1401,20 @@ void daemon_service_run ( void )
                socket_reinit = true ;
            }

-            /* Clstr Tx */
-            else if (( ctrl.clstr_iface_provisioned == true ) &&
+            /* Clstr Tx ; AIO SX */
+            else if ((ctrl.system_type == SYSTEM_TYPE__AIO__SIMPLEX) &&
+                     ( ctrl.clstr_iface_provisioned == true ) &&
+                     (( mtc_sock.mtc_client_tx_socket_c0_clstr == NULL ) ||
+                      ( mtc_sock.mtc_client_tx_socket_c0_clstr->sock_ok() == false )))
+            {
+                wlog ("calling setup_clstr_tx_sockets (auto-recovery)\n");
+                setup_clstr_tx_sockets();
+                socket_reinit = true ;
+            }
+
+            /* Clstr Tx ; not AIO SX */
+            else if ((ctrl.system_type != SYSTEM_TYPE__AIO__SIMPLEX) &&
+                     ( ctrl.clstr_iface_provisioned == true ) &&
                     (( mtc_sock.mtc_client_tx_socket_c0_clstr == NULL ) ||
                      ( mtc_sock.mtc_client_tx_socket_c1_clstr == NULL ) ||
                      ( mtc_sock.mtc_client_tx_socket_c0_clstr->sock_ok() == false ) ||
@ -1384,7 +1492,51 @@ void daemon_service_run ( void )
                }
            }
        }
-
+        /* service controller specific audits */
+        if ( ctrl.nodetype & CONTROLLER_TYPE )
+        {
+            /* peer controller reset service audit */
+            if ( ctrl.peer_ctrlr_reset.audit_timer.ring )
+            {
+                if ( daemon_is_file_present ( RESET_PEER_NOW ) )
+                {
+                    if ( ctrl.peer_ctrlr_reset.sync )
+                    {
+                        if ( ctrl.peer_ctrlr_reset.sync_timer.ring )
+                        {
+                            issue_reset_and_cleanup ();
+                            ctrl.peer_ctrlr_reset.sync_timer.ring = false ;
+                        }
+                        else if ( ctrl.peer_ctrlr_reset.sync_timer.tid == NULL )
+                        {
+                            if ( send_mtcClient_cmd ( &mtc_sock,
+                                                       MTC_CMD_SYNC,
+                                                       peer_controller.hostname,
+                                                       peer_controller.host_ip,
+                                                       mtc_config.mtc_rx_mgmnt_port) == PASS )
+                            {
+                                mtcTimer_start ( ctrl.peer_ctrlr_reset.sync_timer, timer_handler, MTC_SECS_10 );
+                                ilog("... waiting for peer controller to sync - %d secs", MTC_SECS_10);
+                            }
+                            else
+                            {
+                                elog("failed to send 'sync' command to peer controller mtcClient");
+                                ctrl.peer_ctrlr_reset.sync_timer.ring = true ;
+                            }
+                        }
+                        else
+                        {
+                            ; /* wait longer */
+                        }
+                    }
+                    else
+                    {
+                        issue_reset_and_cleanup ();
+                    }
+                }
+                ctrl.peer_ctrlr_reset.audit_timer.ring = false ;
+            }
+        }
        daemon_signal_hdlr ();
    }
    daemon_exit();
@ -1573,7 +1725,7 @@ int run_hostservices_scripts ( unsigned int cmd )


    /* For the stop command we need the mtcClient to run both controller and
-     * worker stop services if we are on a CPE system.
+     * worker stop services if we are on a AIO system.
     * This saves the mtcAgent from having to issue and manage 2 commands,
     * one for controller and 1 for worker */
    if ( ctrl.system_type != SYSTEM_TYPE__NORMAL )
@ -1750,7 +1902,6 @@ void daemon_sigchld_hdlr ( void )
        }
        default:
        {
-            wlog ("child handler running with no active script set (%d)\n", ctrl.active_script_set );
            return ;
        }
    }
@ -1820,6 +1971,84 @@ void daemon_sigchld_hdlr ( void )
    }
 }

+/***************************************************************************
+ *
+ * Name       : load_mtcInfo_msg
+ *
+ * Description: Parse the json in the buffer of an MTC_MSG_INFO message
+ *              and load the peer controller's host ip and bmc access
+ *              info (ip, username, password) into 'peer_controller'.
+ *
+ *              The peer is controller-0 unless this host is itself
+ *              controller-0, in which case the peer is controller-1.
+ *
+ * Assumptions: Only serviced on hosts with the controller function.
+ *              So far only the peer controller reset feature uses this.
+ *
+ * Returns    : Nothing ; parse failures are logged.
+ *
+ ***************************************************************************/
+
+void load_mtcInfo_msg ( mtc_message_type & msg )
+{
+    /* mtcInfo is only meaningful to a controller */
+    if ( ( ctrl.nodetype & CONTROLLER_TYPE ) == 0 )
+    {
+        slog("%s got mtcInfo ; unexpected for this nodetype", ctrl.hostname);
+        return ;
+    }
+
+    mlog1("%s", &msg.buf[0]);
+
+    struct json_object * root_obj = json_tokener_parse( &msg.buf[0] );
+    if ( root_obj == NULL )
+    {
+        wlog("message buffer tokenize error ; %s", &msg.buf[0]);
+        return ;
+    }
+
+    /* work out which controller is the peer of this one */
+    if ( strcmp(&ctrl.hostname[0], CONTROLLER_0 ) == 0 )
+        peer_controller.hostname = CONTROLLER_1 ;
+    else
+        peer_controller.hostname = CONTROLLER_0 ;
+
+    /* the peer's info lives under "mtcInfo" keyed by its hostname */
+    struct json_object * info_obj = NULL ;
+    json_bool info_found = json_object_object_get_ex( root_obj,
+                                                     "mtcInfo",
+                                                     &info_obj );
+    if (( info_found != TRUE ) || ( info_obj == NULL ))
+    {
+        wlog("mtcInfo label parse error (rc:%d) ; %s",
+              info_found, &msg.buf[0]);
+    }
+    else
+    {
+        struct json_object * peer_obj = NULL ;
+        json_bool peer_found =
+        json_object_object_get_ex( info_obj,
+                                   peer_controller.hostname.data(),
+                                  &peer_obj );
+
+        if (( peer_found != TRUE ) || ( peer_obj == NULL ))
+        {
+            wlog("peer mtcInfo missing (rc:%d) ; %s",
+                  peer_found, &msg.buf[0]);
+        }
+        else
+        {
+            peer_controller.host_ip = jsonUtil_get_key_value_string(peer_obj, MTC_JSON_INV_HOSTIP) ;
+            peer_controller.bm_ip = jsonUtil_get_key_value_string(peer_obj, MTC_JSON_INV_BMIP) ;
+            peer_controller.bm_un = jsonUtil_get_key_value_string(peer_obj, "bm_un");
+            peer_controller.bm_pw = jsonUtil_get_key_value_string(peer_obj, "bm_pw");
+
+            /* log what was learned but never the bmc password itself ;
+             * only whether it looks 'ok' or is 'none' */
+            ilog ("%s is my peer [host:%s bmc:%s:%s:%s]",
+                   peer_controller.hostname.c_str(),
+                   peer_controller.host_ip.c_str(),
+                   peer_controller.bm_ip.c_str(),
+                   peer_controller.bm_un.c_str(),
+                   hostUtil_is_valid_pw(peer_controller.bm_pw) ? "ok":"none");
+        }
+    }
+
+    /* release the parsed message object on every post-parse path */
+    json_object_put(root_obj);
+}
+
+
 /* Push daemon state to log file */
 void daemon_dump_info ( void )
 {
@ -1853,13 +2082,13 @@ int daemon_run_testhead ( void )
    * STAGE 1: some test
    ************************************************/
    printf ( "| Test  %d : Maintenance Service Test ............. ", stage );
-    if ( rc != PASS )    
+    if ( rc != PASS )
    {
       FAILED_STR ;
       rc = FAIL ;
    }
    else
-       PASSED ; 
+       PASSED ;

    printf  ("+---------------------------------------------------------+\n");
    return PASS ;
--- a/mtce/src/maintenance/mtcNodeComp.h
+++ b/mtce/src/maintenance/mtcNodeComp.h
@ -17,6 +17,10 @@
 #include <string.h>
 #include <unistd.h>

+using namespace std;
+
+#include "nodeTimers.h"     /* for ... Timer Service  */
+
 /** Compute Config mask */
 #define CONFIG_CLIENT_MASK  (CONFIG_AGENT_MTC_MGMNT_PORT  |\
                             CONFIG_CLIENT_MTC_MGMNT_PORT |\
@ -59,6 +63,22 @@ typedef struct
 } script_ctrl_type ;
 void script_ctrl_init ( script_ctrl_type * script_ctrl_ptr );

+/* peer controller reset control structure and associated definitions */
+
+/* This is a flag file set by SM when SM wants maintenance to perform a
+ * BMC reset of the other (peer) controller */
+#define RESET_PEER_NOW "/var/run/.sm_reset_peer"
+
+#define PEER_CTRLR_AUDIT_PERIOD (2)
+/* Control state for the peer controller reset feature. */
+typedef struct
+{
+    struct mtc_timer  sync_timer   ; /* wait for peer to act on 'sync' before reset */
+    struct mtc_timer  audit_timer  ; /* paces the check for the reset request file  */
+    int               audit_period ; /* interval used to start the audit timer      */
+    bool              sync         ; /* true: send 'sync' to the peer before reset  */
+} peer_ctrlr_reset_type ;
+
 typedef struct
 {
    char             hostname [MAX_HOST_NAME_SIZE+1];
@ -76,7 +96,7 @@ typedef struct
    unsigned int     function ;
    unsigned int  subfunction ;

-    struct mtc_timer timer ; /* mtcAlive timer */
+    struct mtc_timer timer       ; /* mtcAlive timer */

    bool             clstr_iface_provisioned ;

@ -102,6 +122,7 @@ typedef struct
    /* Where to send events */
    string mtcAgent_ip ;

+    peer_ctrlr_reset_type peer_ctrlr_reset;
 } ctrl_type ;

 ctrl_type * get_ctrl_ptr ( void );
@ -109,5 +130,6 @@ ctrl_type * get_ctrl_ptr ( void );
 bool is_subfunction_worker ( void );
 int run_goenabled_scripts ( mtc_socket_type * sock_ptr , string requestor );
 int run_hostservices_scripts ( unsigned int cmd );
+void load_mtcInfo_msg ( mtc_message_type & msg );

 #endif
--- a/mtce/src/maintenance/mtcNodeCtrl.cpp
+++ b/mtce/src/maintenance/mtcNodeCtrl.cpp
@ -1187,15 +1187,6 @@ int _self_provision ( void )

            if ( my_identity.name == record_info.name )
            {
-                /* If the active controller was 'locked' and is being auto-corrected
-                 * to 'unlocked' then ensure that there is no locked alarm set for it */
-                if ( record_info.admin != "locked" )
-                {
-                        mtcAlarm_clear ( my_identity.name, MTC_ALARM_ID__LOCK );
-                        /* this is not required because its already inited to clear */
-                        // node_ptr->alarms[MTC_ALARM_ID__LOCK] = FM_ALARM_SEVERITY_CLEAR
-                }
-
                if ( my_identity.mac != record_info.mac )
                {
                    wlog ("%s mac address mismatch (%s - %s)\n",
@ -1326,6 +1317,7 @@ void nodeLinkClass::fsm ( void )
            daemon_signal_hdlr ();
            mtcHttpSvr_look ( mtce_event );
        }
+        mtcInv.mtcInfo_handler();
    }
 }

@ -1515,9 +1507,9 @@ void daemon_service_run ( void )

    if ( ts.tv_sec < MTC_MINS_15 )
    {
-        /* CPE DOR window is much greater in CPE since heartbeat
-         * cannot start until the inactive CPE has run both manifests */
-        int timeout = DEFAULT_DOR_MODE_CPE_TIMEOUT ;
+        /* AIO DOR window is much greater in AIO since heartbeat
+         * cannot start until the inactive AIO has run both manifests */
+        int timeout = DEFAULT_DOR_MODE_AIO_TIMEOUT ;

        /* override the timeout to a smaller value for normal system */
        if ( mtcInv.system_type == SYSTEM_TYPE__NORMAL )
@ -1601,7 +1593,7 @@ void daemon_service_run ( void )
        if ( mtcInv.system_type == SYSTEM_TYPE__NORMAL )
            mtc_sock.waitd.tv_usec = MTCAGENT_SELECT_TIMEOUT ;
        else
-            mtc_sock.waitd.tv_usec = MTCAGENT_CPE_SELECT_TIMEOUT ;
+            mtc_sock.waitd.tv_usec = MTCAGENT_AIO_SELECT_TIMEOUT ;

        /* This is used as a delay up to select_timeout */
        rc = select( socks.back()+1, &mtc_sock.readfds, NULL, NULL, &mtc_sock.waitd);
--- a/mtce/src/maintenance/mtcNodeFsm.cpp
+++ b/mtce/src/maintenance/mtcNodeFsm.cpp
@ -63,6 +63,11 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )

    /* Monitor and Manage active threads */
    thread_handler ( node_ptr->bmc_thread_ctrl, node_ptr->bmc_thread_info );
+    if ( node_ptr->bmc_thread_ctrl.stage == THREAD_STAGE__KILL )
+    {
+        /* do nothing while thread is being killed */
+        return RETRY ;
+    }

    /* manage the host connected state and board management alarms */
    nodeLinkClass::bmc_handler ( node_ptr );
@ -310,10 +315,10 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )
    }

    /****************************************************************************
-     * No Op: Do nothing for this Healthy Enabled Locked CPE Simplex Host
+     * No Op: Do nothing for this Healthy Enabled Locked AIO Simplex Host
     ****************************************************************************
     */
-    else if (( this->system_type == SYSTEM_TYPE__CPE_MODE__SIMPLEX ) &&
+    else if (( this->system_type == SYSTEM_TYPE__AIO__SIMPLEX ) &&
             ( node_ptr->adminAction == MTC_ADMIN_ACTION__NONE ) &&
             ( node_ptr->adminState  == MTC_ADMIN_STATE__LOCKED ))
    {
--- a/mtce/src/maintenance/mtcNodeHdlrs.cpp
+++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp
@ -481,7 +481,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
        if ( node_ptr->adminAction == MTC_ADMIN_ACTION__UNLOCK )
        {
            bool aio = false ;
-            if ( SIMPLEX_CPE_SYSTEM )
+            if ( SIMPLEX_AIO_SYSTEM )
                aio = true ;
            else
                aio = false ;
@ -525,7 +525,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
                }
            }
            mtcInvApi_update_states_now ( node_ptr, "unlocked", "disabled" , "offline", "disabled", "offline" );
-            mtcInvApi_update_task_now   ( node_ptr, aio ? MTC_TASK_CPE_SX_UNLOCK_MSG : MTC_TASK_SELF_UNLOCK_MSG );
+            mtcInvApi_update_task_now   ( node_ptr, aio ? MTC_TASK_AIO_SX_UNLOCK_MSG : MTC_TASK_SELF_UNLOCK_MSG );

            wlog ("%s unlocking %s with reboot\n",
                      my_hostname.c_str(),
@ -546,7 +546,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
             * Condition 1: While there is no in-service backup controller
             *              to swact to. In this case the active controller
             *              - is only degraded to avoid a system outage.
-             *              - the CPE subfunction is failed
+             *              - the AIO subfunction is failed
             *              - worker SubFunction Alarm is raised
             *              - Enable alarm is raised
             *              - A process monitor alarm may also be raised if
@ -648,7 +648,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
                }
                else
                {
-                    if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
+                    if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
                    {
                        /* Raise Critical Compute Function Alarm */
                        alarm_compute_failure ( node_ptr , FM_ALARM_SEVERITY_CRITICAL );
@ -661,7 +661,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
            node_ptr->graceful_recovery_counter = 0 ;
            node_ptr->health_threshold_counter  = 0 ;

-            if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
+            if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
            {
                node_ptr->inservice_failed_subf = true ;
                subfStateChange ( node_ptr, MTC_OPER_STATE__DISABLED,
@ -1358,7 +1358,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
                 * have a worker function and the heartbeat for those hosts
                 * are started at the end of the subfunction handler. */
                if (( THIS_HOST ) ||
-                   (( CPE_SYSTEM ) && ( is_controller(node_ptr)) ))
+                   (( AIO_SYSTEM ) && ( is_controller(node_ptr)) ))
                {
                    enableStageChange ( node_ptr, MTC_ENABLE__STATE_CHANGE );
                }
@ -1523,8 +1523,8 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
            if ( is_controller(node_ptr) )
            {
                /* Defer telling SM the controller state if
-                 * this is a CPE and this is the only controller */
-                if ( CPE_SYSTEM && ( num_controllers_enabled() > 0 ))
+                 * this is an AIO and this is the only controller */
+                if ( AIO_SYSTEM && ( num_controllers_enabled() > 0 ))
                {
                    wlog ("%s deferring SM enable notification till subfunction-enable complete\n",
                              node_ptr->hostname.c_str());
@ -1555,7 +1555,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )

            enableStageChange ( node_ptr, MTC_ENABLE__START );

-            if (( CPE_SYSTEM ) && ( is_controller(node_ptr)))
+            if (( AIO_SYSTEM ) && ( is_controller(node_ptr)))
            {
                ilog ("%s running worker sub-function enable handler\n", node_ptr->hostname.c_str());
                mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLING_SUBF );
@ -1637,9 +1637,10 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
            node_ptr->http_retries_cur = 0 ;
            node_ptr->unknown_health_reported = false ;

-            plog ("%s %sGraceful Recovery (uptime was %d)\n",
+            plog ("%s %sGraceful Recovery (%d) (uptime was %d)\n",
                      node_ptr->hostname.c_str(),
                      node_ptr->mnfa_graceful_recovery ? "MNFA " : "",
+                      node_ptr->graceful_recovery_counter,
                      node_ptr->uptime );

            /* Cancel any outstanding timers */
@ -1660,7 +1661,8 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
             *   2. Setting the node operational state to Disabled
             *   3. Setting the Enable action
             */
-            if ( ++node_ptr->graceful_recovery_counter > MTC_MAX_FAST_ENABLES )
+            node_ptr->graceful_recovery_counter++ ;
+            if ( node_ptr->graceful_recovery_counter > MTC_MAX_FAST_ENABLES )
            {
                /* gate off further mtcAlive messaging until the offline
                * handler runs. This prevents stale messages from making it
@ -1772,10 +1774,11 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )

                else if ( node_ptr->mnfa_graceful_recovery == true )
                {
-                    if ( node_ptr->uptime > MTC_MINS_10 )
+                    if ( node_ptr->uptime > MTC_MINS_15 )
                    {
                        /* did not reboot case */
-                        wlog ("%s Connectivity Recovered ; host did not reset\n", node_ptr->hostname.c_str());
+                        wlog ("%s Connectivity Recovered ; host did not reset (uptime:%d)\n",
+                                  node_ptr->hostname.c_str(), node_ptr->uptime);
                        wlog ("%s ... continuing with MNFA graceful recovery\n", node_ptr->hostname.c_str());
                        wlog ("%s ... with no affect to host services\n", node_ptr->hostname.c_str());

@ -1788,7 +1791,8 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
                    else
                    {
                        /* did reboot case */
-                        wlog ("%s Connectivity Recovered ; host has reset\n", node_ptr->hostname.c_str());
+                        wlog ("%s Connectivity Recovered ; host has reset (uptime:%d)\n",
+                                  node_ptr->hostname.c_str(),  node_ptr->uptime);
                        ilog ("%s ... continuing with MNFA graceful recovery\n", node_ptr->hostname.c_str());
                        ilog ("%s ... without additional reboot %s\n",
                                  node_ptr->hostname.c_str(), node_ptr->bm_ip.empty() ? "or reset" : "" );
@ -1806,12 +1810,13 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
                        break ;
                    }
                }
-                else if (( node_ptr->uptime_save ) && ( node_ptr->uptime >= node_ptr->uptime_save ))
+                else if ( node_ptr->uptime > MTC_MINS_15 )
                {
                    /* did not reboot case */
-                    wlog ("%s Connectivity Recovered ; host did not reset%s\n",
+                    wlog ("%s Connectivity Recovered ; host did not reset%s (uptime:%d)",
                              node_ptr->hostname.c_str(),
-                              node_ptr->was_dor_recovery_mode ? " (DOR)" : "" );
+                              node_ptr->was_dor_recovery_mode ? " (DOR)" : "",
+                              node_ptr->uptime);

                    wlog ("%s ... continuing with graceful recovery\n", node_ptr->hostname.c_str());
                    wlog ("%s ... with no affect to host services\n", node_ptr->hostname.c_str());
@ -1875,7 +1880,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
                                           MTC_OPER_STATE__DISABLED,
                                           MTC_AVAIL_STATUS__FAILED );

-                if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
+                if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
                {
                    subfStateChange ( node_ptr, MTC_OPER_STATE__DISABLED,
                                               MTC_AVAIL_STATUS__FAILED );
@ -1905,7 +1910,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
        {
            int timeout = 0 ;

-            /* Set the FSM task state to booting */
+            /* Set the FSM task state to 'Graceful Recovery Wait' */
            node_ptr->uptime = 0 ;
            mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_WAIT );

@ -2266,7 +2271,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
            {
                /* The active controller would never get/be here but
                 * if it did then just fall through to change state. */
-                if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
+                if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
                {
                    /* Here we need to run the sub-function goenable and start
                     * host services if this is the other controller in an AIO
@ -2442,10 +2447,10 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
            }
            else /* success path */
            {
-                /* allow the fsm to wait for up to 1 minute for the
-                 * hbsClient's ready event before starting heartbeat
+                /* allow the fsm to wait for up to 'worker config timeout'
+                 * for the hbsClient's ready event before starting heartbeat
                 * test. */
-                mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_1 );
+                mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_WORKER_CONFIG_TIMEOUT );
                recoveryStageChange ( node_ptr, MTC_RECOVERY__HEARTBEAT_START );
            }
            break ;
@ -2502,6 +2507,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
        {
            if ( node_ptr->mtcTimer.ring == true )
            {
+                ilog ("%s heartbeating", node_ptr->hostname.c_str());
                /* if heartbeat is not working then we will
                 * never get here and enable the host */
                recoveryStageChange ( node_ptr, MTC_RECOVERY__STATE_CHANGE );
@ -2510,7 +2516,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
        }
        case MTC_RECOVERY__STATE_CHANGE:
        {
-            if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
+            if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
            {
                /* Set node as unlocked-enabled */
                subfStateChange ( node_ptr, MTC_OPER_STATE__ENABLED,
@ -2555,7 +2561,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
            else if ( rc == PASS )
            {
                /* Start Graceful Recovery */
-                recoveryStageChange ( node_ptr, MTC_RECOVERY__ENABLE_START ) ;
+                recoveryStageChange ( node_ptr, MTC_RECOVERY__ENABLE ) ;
                break ;
            }
            else if ( rc == FAIL_WORKQ_TIMEOUT )
@ -2571,51 +2577,37 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
            nodeLinkClass::force_full_enable ( node_ptr );
            break ;
        }
-        case MTC_RECOVERY__ENABLE_START:
+        case MTC_RECOVERY__ENABLE:
        {
-            /* Create the recovery enable timer. This timer is short.
-             * A node need to stay enabled with the hartbeat service
-             * running for a period of time before declaring it enabled */
-            mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE );
-
-            recoveryStageChange ( node_ptr, MTC_RECOVERY__ENABLE_WAIT ) ;
-            break;
-        }
-        case MTC_RECOVERY__ENABLE_WAIT:
-        {
-            /* When this timer fires the host has been up for enough time */
-            if ( node_ptr->mtcTimer.ring == true )
+            if ( is_controller(node_ptr) )
            {
-                if ( is_controller(node_ptr) )
+                if ( mtcSmgrApi_request ( node_ptr,
+                                          CONTROLLER_ENABLED,
+                                          SMGR_MAX_RETRIES ) != PASS )
                {
-                    if ( mtcSmgrApi_request ( node_ptr,
-                                              CONTROLLER_ENABLED,
-                                              SMGR_MAX_RETRIES ) != PASS )
-                    {
-                        wlog ("%s Failed to send 'unlocked-disabled' to HA Service Manager ; allowing enable\n",
-                              node_ptr->hostname.c_str());
-                    }
+                    wlog ("%s Failed to send 'unlocked-enabled' to HA Service Manager ; allowing enable\n",
+                          node_ptr->hostname.c_str());
                }
-                /* Node Has Recovered */
-                node_ptr->graceful_recovery_counter = 0 ;
-                recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
-                adminActionChange   ( node_ptr, MTC_ADMIN_ACTION__NONE );
-                node_ptr->health_threshold_counter = 0 ;
-                node_ptr->enabled_count++ ;
-                node_ptr->http_retries_cur = 0 ;
-
-                doneQueue_purge ( node_ptr );
-                if ( node_ptr->was_dor_recovery_mode )
-                {
-                    report_dor_recovery (  node_ptr , "is ENABLED" );
-                }
-                else
-                {
-                    plog ("%s is ENABLED (Gracefully Recovered)\n",
-                              node_ptr->hostname.c_str());
-                }
-                alarm_enabled_clear ( node_ptr, false );
            }
+            /* Node Has Recovered */
+            node_ptr->graceful_recovery_counter = 0 ;
+            recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
+            adminActionChange   ( node_ptr, MTC_ADMIN_ACTION__NONE );
+            node_ptr->health_threshold_counter = 0 ;
+            node_ptr->enabled_count++ ;
+            node_ptr->http_retries_cur = 0 ;
+
+            doneQueue_purge ( node_ptr );
+            if ( node_ptr->was_dor_recovery_mode )
+            {
+                report_dor_recovery (  node_ptr , "is ENABLED" );
+            }
+            else
+            {
+                plog ("%s is ENABLED (Gracefully Recovered)\n",
+                          node_ptr->hostname.c_str());
+            }
+            alarm_enabled_clear ( node_ptr, false );
            break ;
        }
        default:
@ -2783,7 +2775,7 @@ int nodeLinkClass::disable_handler  ( struct nodeLinkClass::node * node_ptr )
                                           MTC_OPER_STATE__DISABLED,
                                           locked_status );

-                if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
+                if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
                {
                    subfStateChange ( node_ptr, MTC_OPER_STATE__DISABLED,
                                                locked_status );
@ -3432,7 +3424,7 @@ int nodeLinkClass::online_handler ( struct nodeLinkClass::node * node_ptr )

                        /* otherwise change state */
                        mtcInvApi_update_state(node_ptr, MTC_JSON_INV_AVAIL,"offline" );
-                        if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
+                        if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
                        {
                            mtcInvApi_update_state(node_ptr, MTC_JSON_INV_AVAIL_SUBF,"offline" );
                        }
@ -3473,7 +3465,7 @@ int nodeLinkClass::online_handler ( struct nodeLinkClass::node * node_ptr )
                                  node_ptr->hostname.c_str());

                        mtcInvApi_update_state ( node_ptr, MTC_JSON_INV_AVAIL, "online" );
-                        if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
+                        if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
                        {
                            mtcInvApi_update_state ( node_ptr, MTC_JSON_INV_AVAIL_SUBF, "online" );
                        }
@ -6093,7 +6085,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )

            mtcInfo_log(node_ptr);

-            if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
+            if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
            {
                if ( daemon_is_file_present ( CONFIG_COMPLETE_WORKER ) == false )
                {
@ -6120,52 +6112,38 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
                mtcInvApi_update_state ( node_ptr, "availability", "available" );
            }

-            /* handle other cases */
-            EFmAlarmSeverityT sev = mtcAlarm_state ( node_ptr->hostname,
-                                                     MTC_ALARM_ID__ENABLE);
+            /* Query FM for existing Enable and Config alarm status */
+            EFmAlarmSeverityT enable_alarm_severity =
+                mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__ENABLE);
+            EFmAlarmSeverityT config_alarm_severity =
+                mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__CONFIG);

-            if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
+            /* Clear generic enable alarm over process restart.
+             * Will get reasserted if the cause condition still exists */
+            if ( enable_alarm_severity != FM_ALARM_SEVERITY_CLEAR )
            {
-                node_ptr->alarms[MTC_ALARM_ID__LOCK] = FM_ALARM_SEVERITY_WARNING ;
-
-                /* If the node is locked then the Enable alarm
-                 * should not be present */
-                if ( sev != FM_ALARM_SEVERITY_CLEAR )
-                {
-                    mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
-                    sev = FM_ALARM_SEVERITY_CLEAR ;
-                }
+                ilog ("%s found enable alarm ; clearing %s",
+                          node_ptr->hostname.c_str(),
+                          alarmUtil_getSev_str(enable_alarm_severity).c_str());
+                mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
            }

-            /* Manage enable alarm over process restart.
-             *
-             * - clear the alarm in the active controller case
-             * - maintain the alarm, set degrade state in MAJOR and CRIT cases
-             * - clear alarm for all other severities.
-             */
-            if ( THIS_HOST )
+            /* The config alarm is maintained if it exists.
+             * The in-service test handler will clear the alarm
+             * if the config failure is gone */
+            if ( config_alarm_severity != FM_ALARM_SEVERITY_CLEAR )
            {
-                if ( sev != FM_ALARM_SEVERITY_CLEAR )
-                {
-                    mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
-                }
-            }
-            else
-            {
-                if (( sev == FM_ALARM_SEVERITY_CRITICAL ) ||
-                    ( sev == FM_ALARM_SEVERITY_MAJOR ))
-                {
-                    node_ptr->alarms[MTC_ALARM_ID__ENABLE] = sev ;
-                    node_ptr->degrade_mask |= DEGRADE_MASK_ENABLE ;
-                }
-                else if ( sev != FM_ALARM_SEVERITY_CLEAR )
-                {
-                    mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
-                }
+                node_ptr->degrade_mask |= DEGRADE_MASK_CONFIG ;
+                node_ptr->alarms[MTC_ALARM_ID__CONFIG] = config_alarm_severity ;
+                ilog ("%s found config alarm ; loaded %s",
+                          node_ptr->hostname.c_str(),
+                          alarmUtil_getSev_str(config_alarm_severity).c_str());
            }

            if ( is_controller(node_ptr) )
            {
+                this->controllers++ ;
+
                mtc_cmd_enum state = CONTROLLER_DISABLED ;

                if (( node_ptr->adminState   == MTC_ADMIN_STATE__UNLOCKED ) &&
@ -6199,7 +6177,6 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
                    {
                        ilog ("%s %s\n",node_ptr->hostname.c_str(), MTC_TASK_SWACT_COMPLETE );

-                        /* Work Around for issue: */
                        mtcInvApi_update_uptime ( node_ptr, node_ptr->uptime );

                        mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_COMPLETE );
@ -6233,7 +6210,6 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
                    mtcSmgrApi_request ( node_ptr, state , SWACT_FAIL_THRESHOLD );
                }
            }
-
            if ( daemon_get_cfg_ptr()->debug_level & 1 )
                nodeLinkClass::host_print (node_ptr);

@ -6290,6 +6266,40 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
                              node_ptr->hostname.c_str(), node_ptr->uptime );
                    break ;
                }
+                /* Handle catching and recovering/restoring hosts that might
+                 * have been in the Graceful Recovery Wait state.
+                 *
+                 * Prevents an extra reboot for hosts that might be in
+                 * Graceful Recovery over a maintenance process restart. */
+                else if (( NOT_THIS_HOST ) &&
+                         ( !node_ptr->task.compare(MTC_TASK_RECOVERY_WAIT)))
+                {
+                    ilog ("%s is in %s ; restoring state",
+                              node_ptr->hostname.c_str(),
+                              MTC_TASK_RECOVERY_WAIT);
+
+                    /* Complete necessary add operations before switching
+                     * to Recovery */
+                    LOAD_NODETYPE_TIMERS ;
+                    workQueue_purge ( node_ptr );
+                    if (( hostUtil_is_valid_bm_type  ( node_ptr->bm_type )) &&
+                        ( hostUtil_is_valid_ip_addr  ( node_ptr->bm_ip )) &&
+                        ( hostUtil_is_valid_username ( node_ptr->bm_un )))
+                    {
+                        set_bm_prov ( node_ptr, true ) ;
+                    }
+                    mtcTimer_reset ( node_ptr->mtcTimer );
+                    adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
+                    node_ptr->addStage = MTC_ADD__START;
+
+                    /* Switch into recovery_handler's Graceful Recovery Wait
+                     * state with the Graceful Recovery Wait timeout */
+                    adminActionChange ( node_ptr, MTC_ADMIN_ACTION__RECOVER );
+                    mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler,
+                                     node_ptr->mtcalive_timeout );
+                    recoveryStageChange ( node_ptr, MTC_RECOVERY__MTCALIVE_WAIT );
+                    break ;
+                }
                else
                {
                    if ( is_controller(node_ptr) )
@ -6354,7 +6364,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )

            send_hbs_command   ( node_ptr->hostname, MTC_CMD_ADD_HOST );

-            if ( ( CPE_SYSTEM ) || ( is_worker (node_ptr) == true ))
+            if ( ( AIO_SYSTEM ) || ( is_worker (node_ptr) == true ))
            {
                send_guest_command ( node_ptr->hostname, MTC_CMD_ADD_HOST );
            }
@ -6368,6 +6378,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
        }
        case MTC_ADD__WORKQUEUE_WAIT:
        {
+
            rc = workQueue_done ( node_ptr );
            if ( rc == RETRY )
            {
@ -6393,11 +6404,11 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
                ( node_ptr->operState  == MTC_OPER_STATE__ENABLED ))
            {
                /* start the heartbeat service in all cases except for
-                 * THIS host and CPE controller hosts */
+                 * THIS host and AIO controller hosts */
                if ( NOT_THIS_HOST )
                {
                    if (( LARGE_SYSTEM ) ||
-                        (( CPE_SYSTEM ) && ( this->dor_mode_active == false )))
+                        (( AIO_SYSTEM ) && ( this->dor_mode_active == false )))
                    {
                        send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
                    }
@ -6430,7 +6441,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
                node_ptr->configAction = MTC_CONFIG_ACTION__INSTALL_PASSWD ;
            }

-            if (( ! SIMPLEX_CPE_SYSTEM ) &&
+            if (( ! SIMPLEX_AIO_SYSTEM ) &&
                ( node_ptr->bmc_provisioned == true ))
            {
                mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__BM );
@ -6438,7 +6449,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
            }

            /* Special Add handling for the AIO system */
-            if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
+            if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
            {
                if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
                    ( node_ptr->operState  == MTC_OPER_STATE__ENABLED ))
@ -6455,6 +6466,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
            }

            node_ptr->addStage = MTC_ADD__START;
+
            plog ("%s Host Add Completed (uptime:%d)\n", node_ptr->hostname.c_str(), node_ptr->uptime );
            node_ptr->add_completed = true ;
            break ;
@ -6635,6 +6647,8 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr )
                        mtcInfo_set ( node_ptr, MTCE_INFO_KEY__BMC_PROTOCOL, BMC_PROTOCOL__IPMI_STR );
                        node_ptr->bmc_protocol = BMC_PROTOCOL__IPMITOOL ;
                    }
+                    /* store mtcInfo, which specifies the selected BMC protocol,
+                     * into the sysinv database */
                    mtcInvApi_update_mtcInfo ( node_ptr );

                    ilog ("%s bmc control using %s:%s",
@ -6751,8 +6765,15 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr )
                        node_ptr->bmc_thread_ctrl.done = true  ;
                        node_ptr->bmc_thread_info.command = 0  ;
                    }
+                    /* store mtcInfo, which specifies the selected BMC protocol,
+                     * into the sysinv database */
                    mtcInvApi_update_mtcInfo ( node_ptr );

+                    /* push the BMC access info out to the mtcClient when
+                     * a controller's BMC connection is established/verified */
+                    if ( node_ptr->nodetype & CONTROLLER_TYPE )
+                        this->want_mtcInfo_push = true ;
+
                    send_hwmon_command ( node_ptr->hostname, MTC_CMD_ADD_HOST );
                    send_hwmon_command ( node_ptr->hostname, MTC_CMD_START_HOST );
                }
@ -6942,6 +6963,11 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr )
                                }
                            } /* end power off detection handling     */

+                            /* push the BMC access info out to the mtcClient when
+                             * a controller's BMC connection is established/verified */
+                            if ( node_ptr->nodetype & CONTROLLER_TYPE )
+                                this->want_mtcInfo_push = true ;
+
                            send_hwmon_command ( node_ptr->hostname, MTC_CMD_ADD_HOST );
                            send_hwmon_command ( node_ptr->hostname, MTC_CMD_START_HOST );

@ -7199,6 +7225,9 @@ int nodeLinkClass::oos_test_handler ( struct nodeLinkClass::node * node_ptr )
                }
            }

+            /* audit alarms */
+            mtcAlarm_audit (node_ptr );
+
            break ;
        }
        case MTC_OOS_TEST__WAIT:
@ -7494,7 +7523,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
             *  In the restart case the subfunction fsm enable handler is not run so
             *  we try to detect the missing goenabled_subf flag as an inservice test.
             *
-             *  Only in CPE type
+             *  Only in AIO type
             *   - clear the alarm if the issue goes away -
             *     i.e. the goenabled tests eventually pass. Today
             *     they are not re-run in the background but someday they may be
@ -7502,7 +7531,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
             *     and we have only a single enabled controller (which must be this one)
             *     and the alarm is not already raised.
             **/
-            if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
+            if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
            {
                if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
                    ( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
@ -7597,7 +7626,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
                }
            }

-            /* Monitor the health of the host - no pass file */
+            /* Monitor the health of the host */
            if ((  node_ptr->adminState  == MTC_ADMIN_STATE__UNLOCKED ) &&
                (  node_ptr->operState   == MTC_OPER_STATE__ENABLED   ) &&
                (( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
@ -7623,6 +7652,11 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
                    ilog ("%s sm degrade clear\n", node_ptr->hostname.c_str());
                }

+                /*
+                 * In-service Config Failure/Alarm handling
+                 */
+
+                /* Detect new config failure condition */
                if ( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY)
                {
                    /* not healthy .... */
@ -7634,16 +7668,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
                        {
                            wlog_throttled ( node_ptr->health_threshold_counter, (MTC_UNHEALTHY_THRESHOLD*10), "%s is UNHEALTHY\n", node_ptr->hostname.c_str());
                            if ( node_ptr->health_threshold_counter >= MTC_UNHEALTHY_THRESHOLD )
-                            {
-                                node_ptr->degrade_mask |= DEGRADE_MASK_CONFIG ;
-
-                                /* threshold is reached so raise the config alarm if it is not already raised */
-                                if ( node_ptr->alarms[MTC_ALARM_ID__CONFIG] != FM_ALARM_SEVERITY_CRITICAL )
-                                {
-                                    mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__CONFIG );
-                                    node_ptr->alarms[MTC_ALARM_ID__CONFIG] = FM_ALARM_SEVERITY_CRITICAL ;
-                                }
-                            }
+                                alarm_config_failure ( node_ptr );
                        }
                    }
                    else
@ -7663,6 +7688,12 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
                        }
                    }
                }
+                /* or correct an alarmed config failure that has cleared */
+                else if ( node_ptr->degrade_mask & DEGRADE_MASK_CONFIG )
+                {
+                    if ( node_ptr->mtce_flags & MTC_FLAG__I_AM_HEALTHY )
+                        alarm_config_clear ( node_ptr );
+                }
                else
                {
                    node_ptr->health_threshold_counter = 0 ;
--- a/mtce/src/maintenance/mtcNodeMnfa.cpp
+++ b/mtce/src/maintenance/mtcNodeMnfa.cpp
@ -159,19 +159,20 @@ void nodeLinkClass::mnfa_recover_host ( struct nodeLinkClass::node * node_ptr )

    if ( node_ptr->mnfa_graceful_recovery == true )
    {
-        /* Restart the heartbeat for this recovered host */
-        // send_hbs_command ( node_ptr->hostname, MTC_RESTART_HBS );
-
        if ( node_ptr->adminAction != MTC_ADMIN_ACTION__RECOVER )
        {
-            ilog ("%s graceful recovery from MNFA\n", node_ptr->hostname.c_str());
-            recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
-            adminActionChange   ( node_ptr, MTC_ADMIN_ACTION__RECOVER );
+            ilog ("%s graceful recovery (graceful recover count:%d)",
+                      node_ptr->hostname.c_str(),
+                      node_ptr->graceful_recovery_counter);
        }
        else
        {
-            wlog ("%s already gracefully recovering\n", node_ptr->hostname.c_str() );
+            wlog ("%s graceful recovery restart (graceful recover count:%d)",
+                      node_ptr->hostname.c_str(),
+                      node_ptr->graceful_recovery_counter );
        }
+        recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
+        adminActionChange   ( node_ptr, MTC_ADMIN_ACTION__RECOVER );
    }
 }

@ -298,43 +299,38 @@ void nodeLinkClass::mnfa_exit ( bool force )
         * Clear heartbeat degrades */
        for ( struct node * ptr = head ;  ; ptr = ptr->next )
        {
-            if ((( ptr->hbs_minor[CLSTR_IFACE] == true ) ||
-                 ( ptr->hbs_minor[MGMNT_IFACE] == true )) &&
-                 ( ptr->operState == MTC_OPER_STATE__ENABLED ))
+            std::list<string>::iterator mnfa_awol_ptr  ;
+            for ( mnfa_awol_ptr = mnfa_awol_list.begin() ;
+                  mnfa_awol_ptr != mnfa_awol_list.end() ;
+                  mnfa_awol_ptr++ )
            {
-                ptr->hbs_minor[MGMNT_IFACE] = false ;
-                ptr->hbs_minor[CLSTR_IFACE] = false ;
+                /* skip host if not in the mnfa pool */
+                if ( ptr->hostname.compare(*(mnfa_awol_ptr)) )
+                   continue ;

-                if ( force == true )
+                if ((( ptr->hbs_minor[CLSTR_IFACE] == true ) ||
+                     ( ptr->hbs_minor[MGMNT_IFACE] == true )) &&
+                     ( ptr->operState == MTC_OPER_STATE__ENABLED ))
                {
-                    elog ("... %s failed ; auto-recovering\n",
-                               ptr->hostname.c_str());
+                    ptr->hbs_minor[MGMNT_IFACE] = false ;
+                    ptr->hbs_minor[CLSTR_IFACE] = false ;

-                    /* Set node as failed */
-                    availStatusChange ( ptr, MTC_AVAIL_STATUS__FAILED );
-                    enableStageChange ( ptr, MTC_ENABLE__START );
-                    adminActionChange ( ptr, MTC_ADMIN_ACTION__NONE );
-                }
-                else
-                {
-                    if ( ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )
+                    if ( force == true )
                    {
-                        if ( ptr->degrade_mask == 0 )
-                        {
-                            availStatusChange ( ptr, MTC_AVAIL_STATUS__AVAILABLE );
-                        }
-                    }
+                        elog ("... %s failed ; auto-recovering\n",
+                                   ptr->hostname.c_str());

-                    if ( ptr->adminAction != MTC_ADMIN_ACTION__RECOVER )
-                    {
-                        recoveryStageChange ( ptr, MTC_RECOVERY__START );
-                        adminActionChange   ( ptr, MTC_ADMIN_ACTION__RECOVER );
+                        /* Set node as failed */
+                        availStatusChange ( ptr, MTC_AVAIL_STATUS__FAILED );
+                        enableStageChange ( ptr, MTC_ENABLE__START );
+                        adminActionChange ( ptr, MTC_ADMIN_ACTION__NONE );
                    }
                    else
                    {
-                        wlog ("%s already gracefully recovering\n", ptr->hostname.c_str() );
+                        mnfa_recover_host ( ptr );
                    }
                }
+                break ;
            }
            if (( ptr->next == NULL ) || ( ptr == tail ))
                break ;
--- a/mtce/src/maintenance/mtcNodeMsg.h
+++ b/mtce/src/maintenance/mtcNodeMsg.h
@ -125,11 +125,13 @@ int send_mtcAlive_msg ( mtc_socket_type * sock_ptr, string identity, int interfa

 int recv_mtc_reply_noblock ( void );

-int send_mtc_cmd ( string & hostname, int cmd, int interface );
+int send_mtc_cmd ( string & hostname, int cmd, int interface , string json_dict="" );
 int mtc_service_command ( mtc_socket_type * sock_ptr , int interface );
 int mtc_set_availStatus ( string & hostname, mtc_nodeAvailStatus_enum status );
-int mtce_send_event    ( mtc_socket_type * sock_ptr, int cmd , const char * mtce_name_ptr );
+int mtce_send_event    ( mtc_socket_type * sock_ptr, unsigned int cmd , const char * mtce_name_ptr );
 int mtc_clstr_init     ( mtc_socket_type * sock_ptr , char * iface );
 string get_who_i_am ( void );

+int send_mtcClient_cmd ( mtc_socket_type * sock_ptr, int cmd, string hostname, string address, int port);
+
 #endif
--- a/mtce/src/maintenance/mtcSmgrApi.cpp
+++ b/mtce/src/maintenance/mtcSmgrApi.cpp
@ -96,7 +96,7 @@ int nodeLinkClass::mtcSmgrApi_request ( struct nodeLinkClass::node * node_ptr, m
    int rc = PASS ;
    string operation_string = "unknown" ;

-    if ( system_type == SYSTEM_TYPE__CPE_MODE__SIMPLEX )
+    if ( system_type == SYSTEM_TYPE__AIO__SIMPLEX )
    {
        dlog ("%s simpex mode ; SM '%d' request not sent\n", node_ptr->hostname.c_str(), operation );
        return ( PASS );
--- a/mtce/src/maintenance/mtcSubfHdlrs.cpp
+++ b/mtce/src/maintenance/mtcSubfHdlrs.cpp
@ -110,14 +110,16 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
            if ( node_ptr->mtce_flags & MTC_FLAG__SUBF_CONFIGURED )
            {
                mtcTimer_reset (node_ptr->mtcTimer);
-                plog ("%s Subf Configured OK\n", name.c_str());
+                plog ("%s Subf Configured OK (oob:%x)\n",
+                          name.c_str(), node_ptr->mtce_flags);
                enableStageChange ( node_ptr, MTC_ENABLE__GOENABLED_TIMER );
                alarm_config_clear ( node_ptr );
                break ;
            }

-            if ((( !node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED )) ||
-                ((  node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY )))
+            if (( node_ptr->mtce_flags ) &&
+                (( !node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED ) ||
+                 (  node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY )))
            {
                mtcTimer_reset (node_ptr->mtcTimer);

@ -140,9 +142,10 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
            /* timeout handling */
            else if ( node_ptr->mtcTimer.ring == true )
            {
-                elog ("%s configuration timeout (%d secs)\n",
+                elog ("%s configuration timeout (%d secs) (oob:%x)\n",
                          name.c_str(),
-                          MTC_WORKER_CONFIG_TIMEOUT );
+                          MTC_WORKER_CONFIG_TIMEOUT,
+                          node_ptr->mtce_flags);

                alarm_config_failure ( node_ptr );

@ -169,7 +172,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
             *
             * issue: subfunction go-enable patching script fails and
             * maintenance reboots the active controller when no-reboot
-             * patching maintenance in CPE.
+             * patching maintenance in AIO.
             *
             * The fix is to avoid running the subfunction go-enabled tests
             * on self while patching.
@ -490,7 +493,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )

                fail = true ;
            }
-            else if ( this->system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX )
+            else if ( this->system_type != SYSTEM_TYPE__AIO__SIMPLEX )
            {
                /* Loop over the heartbeat interfaces and fail the Enable if any of them are failing */
                for ( int i = 0 ; i < MAX_IFACES ; i++ )
--- a/mtce/src/pmon/pmon.h
+++ b/mtce/src/pmon/pmon.h
@ -231,6 +231,7 @@ typedef struct
    recovery_method_type recovery_method ; /**< How processes are recovered */
    bool reload_config ;
    bool patching_in_progress ;
+    bool last_alarm_query_pass;

 } pmon_ctrl_type ;
 void pmon_set_ctrl_ptr ( pmon_ctrl_type * ctrl_ptr );
--- a/mtce/src/pmon/pmonAlarm.cpp
+++ b/mtce/src/pmon/pmonAlarm.cpp
@ -38,14 +38,14 @@ void pmonAlarm_init ( void )
    alarmUtil_type * ptr ;

    /** Process Failure Alarm ****************************************************/
-    
+
    ptr = &alarm_list[PMON_ALARM_ID__PMOND];
    memset  (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
    snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", PMOND_ALARM_ID);

    ptr->name = "process failure" ;
    ptr->instc_prefix = "process=" ;
-     
+
    ptr->critl_reason = "";
    ptr->minor_reason = "";
    ptr->major_reason = "";
@ -56,12 +56,12 @@ void pmonAlarm_init ( void )
    ptr->alarm.inhibit_alarms    = FM_FALSE;
    ptr->alarm.service_affecting = FM_TRUE ;
    ptr->alarm.suppression       = FM_TRUE ;
-            
+
    ptr->alarm.severity          = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */
    ptr->alarm.alarm_state       = FM_ALARM_STATE_CLEAR    ; /* Dynamic */

-    snprintf (ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH, 
-              "If problem consistently occurs after Host is locked and unlocked then " 
+    snprintf (ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH,
+              "If problem consistently occurs after Host is locked and unlocked then "
              "contact next level of support for root cause analysis and recovery.");
 }

@ -97,38 +97,46 @@ EFmAlarmSeverityT pmonAlarm_state ( string hostname, pmon_alarm_id_enum id )

 /******************************************************************************
 *
- * Name       : manage_queried_alarms
+ * Name       : query_alarms
 *
 * Description: query FM for all the existing process monitor alarms and build
 *              up the callers 'saved_alarm_list' with those process names and
 *              corresponding severity.
 *
- * Assumptions: If the hostname is passed in as not empty then assume the clear
- *              is requested.
- *
 * Updates    : callers saved_alarm_list
 *
+ * Returns    : PASS if FM returns no error
+ *              FAIL_REQUEST      ... alarmUtil_query_identity failed
+ *              FAIL_OPERATION    ... fm_get_fault failed
+ *              FAIL_NULL_POINTER ... failed to get memory
+ *
 ******************************************************************************/

-void manage_queried_alarms (  list<active_process_alarms_type> & saved_alarm_list, string hostname )
+int query_alarms (  list<active_process_alarms_type> & saved_alarm_list, string hostname )
 {
+    static const char HOSTNAME_LABEL [] = "host=" ;
+    static const char PROCNAME_LABEL [] = ".process=" ;
+
+    int rc = FAIL ;
    saved_alarm_list.clear();

-    /**
-     *  Query all the pmon alarms and if there is an alarm for a
-     *  process that is functioing properly then clear the alarm.
-     **/
    SFmAlarmDataT * alarm_list_ptr = (SFmAlarmDataT*) malloc ((sizeof(SFmAlarmDataT)*PMON_MAX_ALARMS));
    if ( alarm_list_ptr )
    {
-        if ( alarmUtil_query_identity ( pmonAlarm_getId_str(PMON_ALARM_ID__PMOND), alarm_list_ptr, PMON_MAX_ALARMS ) == PASS )
+        /* Query all the pmon alarms  */
+        rc = alarmUtil_query_identity ( pmonAlarm_getId_str(PMON_ALARM_ID__PMOND), alarm_list_ptr, PMON_MAX_ALARMS );
+        if ( rc == RETRY )
+        {
+            dlog ("no %s alarms found",  pmonAlarm_getId_str(PMON_ALARM_ID__PMOND).c_str());
+            rc = PASS ;
+        }
+        else if ( rc == PASS )
        {
            for ( int i = 0 ; i < PMON_MAX_ALARMS ; ++i )
            {
                /* loop over each active alarm and maintain its activity state */
                if ( strnlen ((alarm_list_ptr+i)->entity_instance_id , MAX_FILENAME_LEN ) )
                {
-                    int rc ;
                    AlarmFilter   alarm_filter ;
                    SFmAlarmDataT alarm_query  ;
                    memset(&alarm_query, 0, sizeof(alarm_query));
@ -139,34 +147,49 @@ void manage_queried_alarms (  list<active_process_alarms_type> & saved_alarm_lis

                    if (( rc = fm_get_fault ( &alarm_filter, &alarm_query )) == FM_ERR_OK )
                    {
-                        string entity = alarm_filter.entity_instance_id ;
-                        size_t pos = entity.find("process=");
-                        if ( pos != std::string::npos )
-                        {
-                            string pn = entity.substr(pos+strlen("process="));
-                            ilog ("%s alarm is %s (process:%s)\n", alarm_filter.entity_instance_id,
-                                 alarmUtil_getSev_str(alarm_query.severity).c_str(), pn.c_str());
+                        rc = PASS ;

-                            /* filter out 'process=pmond' as that alarm is handled by hbsAgent */
-                            if ( pn.compare("pmond") )
+                        string entity = alarm_filter.entity_instance_id ;
+                        size_t pos_hn = entity.find(HOSTNAME_LABEL);
+                        size_t pos_pn = entity.find(PROCNAME_LABEL);
+
+                        if (( pos_hn != std::string::npos ) &&
+                            ( pos_pn != std::string::npos ))
+                        {
+                            string hn = entity.substr(pos_hn+strlen(HOSTNAME_LABEL), pos_pn-strlen(HOSTNAME_LABEL));
+                            string pn = entity.substr(pos_pn+strlen(PROCNAME_LABEL));
+
+                            /* verify hostname */
+                            if ( ( hn.length() == 0 ) || ( hn != hostname ) )
                            {
-                                if ( !hostname.empty() )
-                                {
-                                    pmonAlarm_clear ( hostname, PMON_ALARM_ID__PMOND, pn );
-                                }
-                                else
-                                {
-                                     active_process_alarms_type this_alarm ;
-                                     this_alarm.process  = pn ;
-                                     this_alarm.severity = alarm_query.severity ;
-                                     saved_alarm_list.push_front ( this_alarm  );
-                                }
+                                /* ignore alarms not for this host */
+                                dlog ("%s %s %s alarm not for this host",
+                                          entity.c_str(),
+                                          hn.c_str(),
+                                          pn.c_str());
+                                continue ;
+                            }
+                            dlog ("%s alarm is %s (process:%s)\n",
+                                      alarm_filter.entity_instance_id,
+                                      alarmUtil_getSev_str(alarm_query.severity).c_str(),
+                                      pn.c_str());
+
+                            /* filter out 'process=pmond'
+                             * ... that alarm is handled by hbsAgent */
+                            if ( pn != MTC_SERVICE_PMOND_NAME )
+                            {
+                                 active_process_alarms_type this_alarm ;
+                                 this_alarm.process  = pn ;
+                                 this_alarm.severity = alarm_query.severity ;
+                                 saved_alarm_list.push_front ( this_alarm  );
                            }
                        }
                    }
                    else
                    {
-                        ilog ("fm_get_fault failed (rc:%d)\n", rc );
+                        wlog ("fm_get_fault failed (rc:%d)\n", rc );
+                        rc = FAIL_OPERATION ;
+                        break ;
                    }
                }
                else
@ -174,10 +197,21 @@ void manage_queried_alarms (  list<active_process_alarms_type> & saved_alarm_lis
                    dlog2 ("last entry %d\n", i);
                    break ;
                }
-            }
+            } /* for loop */
+        }
+        else
+        {
+            wlog("failed to query alarms from fm ; rc:%d", rc);
+            rc = FAIL_REQUEST ;
        }
        free(alarm_list_ptr);
    }
+    else
+    {
+        elog ("unable to allocate memory for alarm list");
+        rc = FAIL_NULL_POINTER ;
+    }
+    return (rc);
 }

 /*************************   A L A R M I N G   **************************/
--- a/mtce/src/pmon/pmonAlarm.h
+++ b/mtce/src/pmon/pmonAlarm.h
@ -37,8 +37,10 @@ typedef struct
    EFmAlarmSeverityT severity ;
 } active_process_alarms_type   ;

-/* Clear any pending alarms if the specified hostname is valid */
-void manage_queried_alarms (  list<active_process_alarms_type> & alarm_list, string hostname="" );
+/* Query FM for a list of Process Monitor (200.006) alarms */
+int query_alarms (  list<active_process_alarms_type> & alarm_list, string hostname="" );
+
+void alarmed_process_audit ( void );

 void pmonAlarm_init ( void );

--- a/mtce/src/pmon/pmonHdlr.cpp
+++ b/mtce/src/pmon/pmonHdlr.cpp
@ -41,15 +41,6 @@ static struct mtc_timer ptimer[MAX_PROCESSES] ;
 std::list<string> config_files ;
 std::list<string>::iterator string_iter_ptr ;

-/* If there is an alarm in the list that matches one in the process list
- * then update that process with its severity and failed state.
- * If there is a process in the saved list that is not in the process list
- * then clear its alarm as it is no longer valid.
- */
-void manage_process_alarms (  list<active_process_alarms_type> & _list,
-                              process_config_type * const ptr,
-                              int const processes );
-
 static process_config_type process_config[MAX_PROCESSES] ;

 /* lookup process control by index  and return its pointer if found.
@ -216,6 +207,7 @@ void pmon_timer_init ( void )
        /* Init the timer for this process */
        mtcTimer_init ( process_config[i].pt_ptr, _pmon_ctrl_ptr->my_hostname, "process" ) ;
    }
+    _pmon_ctrl_ptr->last_alarm_query_pass = false ;
 }

 void _process_death_hdlr ( int sig_num, siginfo_t * info_ptr, void * context_ptr );
@ -371,7 +363,7 @@ void init_process_config_memory ( void )
 * all the process config files from /etc/pmon.d */
 void load_processes ( void )
 {
-    list<active_process_alarms_type> saved_alarm_list ;
+    list<active_process_alarms_type> queried_alarm_list ;

    int rc = PASS ;

@ -385,10 +377,6 @@ void load_processes ( void )
        close_process_socket ( &process_config[i] );
    }

-    /* Query fm for existing pmon process alarms and
-     * for each that is found store their 'name' and
-     * 'severity' in the passed in saved list */
-    manage_queried_alarms ( saved_alarm_list );

    /* init the process config memory */
    init_process_config_memory ();
@ -454,13 +442,8 @@ void load_processes ( void )
    }
    _pmon_ctrl_ptr->reload_config = false ;

-    /* If there were process alarms that existed over the reload
-     * then ensure that those processes are updated with that information. */
-    if ( saved_alarm_list.size () )
-    {
-        ilog ("there are %ld active alarms over reload\n", saved_alarm_list.size());
-        manage_process_alarms ( saved_alarm_list, &process_config[0], _pmon_ctrl_ptr->processes );
-    }
+    /* use the audit to clear pre-existing alarms at process startup */
+    alarmed_process_audit ();
 }


@ -1702,65 +1685,124 @@ void _process_death_hdlr ( int sig_num, siginfo_t * info_ptr, void * context_ptr
    }
 }

-/************************************************************************
+/***************************************************************************
 *
- * Name :       manage_process_alarms
+ * Name       : alarmed_process_audit
 *
- * Description: This interface manages process alarms over a process
- *              configuration reload
+ * Purpose    : Verify the process state matches the queried alarm state
 *
- * Steps:
+ * Description: To correct process alarm state mismatches.
 *
- * 1. Loop over each item in the list and mark the process as failed
- *    with the specified severity level.
- *
- * 2. If the process is not found then clear its alarm as it is no
- *    longer a valid process in the new profile and we don't want a
- *    lingering stuck alarm.
- *
- *************************************************************************/
+ ***************************************************************************/

-void manage_process_alarms (  list<active_process_alarms_type> & _list,
-                              process_config_type * const ptr,
-                              int const processes )
+void alarmed_process_audit ( void )
 {
-    /* get out if the list is empty ; should not have been called if
-     * empty but ... just in case */
-    if ( ! _list.empty() )
+    /* Skip the FM alarm audit once the last query has succeeded.
+     * There is a blocking issue that still needs to be dealt with */
+    if ( _pmon_ctrl_ptr->last_alarm_query_pass == true )
+        return ;
+
+    /*
+     * Query fm for existing pmon process alarms and
+     * for each that is found store their 'name' and
+     * 'severity' in the passed in queried_alarm_list.
+     */
+    list<active_process_alarms_type> queried_alarm_list ;
+    int rc = query_alarms ( queried_alarm_list, get_ctrl_ptr()->my_hostname );
+    _pmon_ctrl_ptr->last_alarm_query_pass = (rc == PASS);
+
+    /* just return if query failed */
+    if ( _pmon_ctrl_ptr->last_alarm_query_pass == false )
+        return ;
+
+    if ( queried_alarm_list.size () )
    {
        list<active_process_alarms_type>::iterator _iter_ptr ;

+        alog ("audit found %ld active alarms", queried_alarm_list.size());
+
        /* loop over the list ... */
-        for ( _iter_ptr=_list.begin(); _iter_ptr!=_list.end(); ++_iter_ptr )
+        for (   _iter_ptr=queried_alarm_list.begin();
+                _iter_ptr!=queried_alarm_list.end();
+              ++_iter_ptr )
        {
-            /* for each item assum it is not found */
            bool found = false ;
+            alog ("%s audit", _iter_ptr->process.c_str());

-            /* try and find this process in the new process profile */
-            for ( int i = 0 ; i < processes ; i++ )
+            /* find this process*/
+            for ( int i = 0 ; (i < _pmon_ctrl_ptr->processes) && !found ; i++ )
            {
-                if ( ! _iter_ptr->process.compare((ptr+i)->process) )
-                {
-                    /* If the process is found then mark it as failed and update its severity.
-                     * At this point we then assume that there is an alarm raised for this process. */
-                    found = true ;
+                process_config_type * ptr = &process_config[i];

-                   (ptr+i)->failed = false ;
-                    wlog ("%s process was failed critical ; clearing existing alarm\n", _iter_ptr->process.c_str() );
-                    pmonAlarm_clear ( get_ctrl_ptr()->my_hostname, PMON_ALARM_ID__PMOND, _iter_ptr->process );
+                if ( ! _iter_ptr->process.compare(ptr->process) )
+                {
+                    found = true ;
+                    if ( ptr->failed == false )
+                    {
+                        ilog ("%s stale alarm ; clearing",
+                                  _iter_ptr->process.c_str() );
+
+                        pmonAlarm_clear ( get_ctrl_ptr()->my_hostname,
+                                          PMON_ALARM_ID__PMOND,
+                                          _iter_ptr->process );
+                    }
+                    else if ( _iter_ptr->severity != ptr->alarm_severity )
+                    {
+                        wlog ("%s alarm severity mismatch ; %s -> %s ; correcting",
+                                  ptr->process,
+                                  alarmUtil_getSev_str(_iter_ptr->severity).c_str(),
+                                  alarmUtil_getSev_str(ptr->alarm_severity).c_str());
+                        if ( ptr->alarm_severity == FM_ALARM_SEVERITY_MINOR )
+                        {
+                            pmonAlarm_minor(get_ctrl_ptr()->my_hostname,
+                                            PMON_ALARM_ID__PMOND,
+                                            ptr->process, 0);
+                        }
+                        else if (ptr->alarm_severity == FM_ALARM_SEVERITY_MAJOR )
+                        {
+                            pmonAlarm_major(get_ctrl_ptr()->my_hostname,
+                                            PMON_ALARM_ID__PMOND,
+                                            ptr->process);
+                        }
+                        else if (ptr->alarm_severity == FM_ALARM_SEVERITY_CRITICAL )
+                        {
+                             pmonAlarm_critical(get_ctrl_ptr()->my_hostname,
+                                                PMON_ALARM_ID__PMOND,
+                                                ptr->process);
+                        }
+                        else
+                        {
+                            wlog ("%s unexpected severity '%s' ; clearing alarm",
+                                      ptr->process,
+                                      ptr->severity);
+
+                            pmonAlarm_clear ( get_ctrl_ptr()->my_hostname,
+                                              PMON_ALARM_ID__PMOND,
+                                              ptr->process );
+                        }
+                    }
+                    else
+                    {
+                        alog ("%s is alarmed '%s' ; audit",
+                                  ptr->process,
+                                  ptr->severity);
+                    }
                }
            }
-
            /* if not found then just clear the alarm */
            if ( found == false)
            {
-                wlog ("%s process alarm clear ; not in current process profile\n", _iter_ptr->process.c_str() );
-                pmonAlarm_clear ( get_ctrl_ptr()->my_hostname, PMON_ALARM_ID__PMOND, _iter_ptr->process );
+                wlog ("%s is not a monitored process ; clearing alarm",
+                          _iter_ptr->process.c_str());
+                pmonAlarm_clear ( get_ctrl_ptr()->my_hostname,
+                                  PMON_ALARM_ID__PMOND,
+                                  _iter_ptr->process );
            }
        }
    }
 }

+
 void pmon_service ( pmon_ctrl_type * ctrl_ptr )
 {
    std::list<int> socks ;
@ -1931,6 +1973,8 @@ void pmon_service ( pmon_ctrl_type * ctrl_ptr )
        {
            _get_events ();
            mtcTimer_start ( pmonTimer_audit, pmon_timer_handler, audit_period );
+
+            alarmed_process_audit ();
        }

        /* Run the degrade set/clear by audit */
--- a/mtce/src/pmon/scripts/pmon.logrotate
+++ b/mtce/src/pmon/scripts/pmon.logrotate
@ -1,16 +1,19 @@
-#daily
-nodateext
+#
+# Copyright (c) 2015-2021 Wind River Systems, Inc.
+#
+# SPDX-License-Identifier: Apache-2.0

 /var/log/pmond.log
 {
-    nodateext
-    size 10M
+    create 0640 root root
    start 1
-    missingok
+    size 10M
    rotate 20
    compress
-    sharedscripts
+    notifempty
+    missingok
    postrotate
        systemctl reload syslog-ng > /dev/null 2>&1 || true
    endscript
+    delaycompress
 }
--- a/mtce/src/pmon/scripts/pmond.conf
+++ b/mtce/src/pmon/scripts/pmond.conf
--- a/mtce/src/scripts/crashdump.logrotate
+++ b/mtce/src/scripts/crashdump.logrotate
@ -1,7 +1,11 @@
+#
+# Copyright (c) 2020-2021 Wind River Systems, Inc.
+#
+# SPDX-License-Identifier: Apache-2.0
+
 /var/log/crash/vmcore.tar
 /var/log/crash/vmcore_first.tar
 {
-    nodateext
    size 1K
    start 1
    rotate 1
--- a/mtce/src/scripts/mtc.conf
+++ b/mtce/src/scripts/mtc.conf
@ -87,6 +87,10 @@ sched_delay_threshold = 300  ; scheduler delay time in msecs that will trigger
 daemon_log_port = 2121       ; daemon logger port
 mtcalarm_req_port = 2122     ;

+sync_b4_peer_ctrlr_reset = 0 ; issue a sync command to peer controller mtcClient
+                             ;   before issuing BMC reset.
+
+
 [timeouts]                   ; configurable maintenance timeout values in seconds

 failsafe_shutdown_delay = 120;
--- a/mtce/src/scripts/mtce.logrotate
+++ b/mtce/src/scripts/mtce.logrotate
@ -1,59 +1,67 @@
-#daily
-
-# Apply all these options to all the logs
-nodateext
-start 1
-compress
-notifempty
-missingok
-sharedscripts
-postrotate
-    systemctl reload syslog-ng > /dev/null 2>&1 || true
-endscript
-
+#
+# Copyright (c) 2015-2021 Wind River Systems, Inc.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
 /var/log/mtcAgent.log
 {
-    size 100M
+    create 0640 root root
+    start 1
    rotate 10
+    size 100M
+    compress
+    notifempty
+    missingok
+    postrotate
+        systemctl reload syslog-ng > /dev/null 2>&1 || true
+    endscript
+    delaycompress
 }

 /var/log/hbsAgent.log
-{
-    size 20M
-    rotate 5
-}
-
 /var/log/mtcClient.log
-{
-    size 20M
-    rotate 5
-}
-
 /var/log/hbsClient.log
 {
-    size 20M
+    create 0640 root root
+    start 1
    rotate 5
+    size 20M
+    compress
+    notifempty
+    missingok
+    postrotate
+        systemctl reload syslog-ng > /dev/null 2>&1 || true
+    endscript
+    delaycompress
 }

 /var/log/mtclogd.log
 {
-    size 10M
+    create 0640 root root
+    start 1
    rotate 5
+    size 10M
+    compress
+    notifempty
+    missingok
+    postrotate
+        systemctl reload syslog-ng > /dev/null 2>&1 || true
+    endscript
+    delaycompress
 }

+# mtclogd opens and closes these log files on every log addition,
+# so no postrotate notification is required when they are rotated.
+/var/log/mtcAgent_event.log
+/var/log/mtcAgent_alarm.log
 /var/log/mtcAgent_api.log
 {
-    size 20M
+    create 0640 root root
+    start 1
    rotate 5
-}
-
-/var/log/mtcAgent_event.log
-{
-    size 20M
-    rotate 5
-}
-/var/log/mtcAgent_alarm.log
-{
    size 10M
-    rotate 5
+    compress
+    notifempty
+    missingok
+    delaycompress
 }
--- a/mtce/src/scripts/wipedisk
+++ b/mtce/src/scripts/wipedisk
@ -18,6 +18,28 @@ usage ()
    exit 1
 }

+# Unmount the filesystem on the device or partition given in $1.
+# Systemd automatically remounts all the mounted filesystems at shutdown.
+# When we are deleting a partition, we have to unmount its corresponding
+# filesystem, because remounting deleted filesystems at shutdown will throw
+# errors.
+# Returns: 0 if unmounted, 1 if umount failed, 2 if findmnt does not find $1.
+unmount_fs()
+{
+  local fs=$1
+  echo "Trying to unmount $fs"
+  if ! findmnt "$fs" > /dev/null 2>&1 ; then
+      echo "Warning! $fs is not mounted"
+      return 2
+  fi
+  if ! umount -f "$fs" ; then
+      echo "Error! Failed to unmount $fs"
+      return 1
+  fi
+  echo "$fs has been successfully unmounted"
+  return 0
+}
+
 OPTS=`getopt -o h -l force -- "$@"`
 if [ $? != 0 ]
 then
@ -100,11 +122,14 @@ fi
 BACKUP_PART_GUID="BA5EBA11-0000-1111-2222-000000000002"
 part_type_guid_str="Partition GUID code"

+# get the nodetype variable to check later if this node is a controller
+. /etc/platform/platform.conf
+
 for dev in $WIPE_HDD
 do
    if [[ -e $dev ]]
    then
-        if [ "$dev" == "$rootfs" ]
+        if [[ "$dev" == "$rootfs" && "${nodetype}" == "controller" ]]
        then
            part_numbers=( $(parted -s $dev print | awk '$1 == "Number" {i=1; next}; i {print $1}') )
            for part_number in "${part_numbers[@]}"; do
@ -128,6 +153,7 @@ do
                # Skip / or we will lose access to the tools on the system.
                if [[ $part != $rootfs_part ]]
                then
+                    unmount_fs $part
                    dd if=/dev/zero of=$part bs=512 count=34
                    dd if=/dev/zero of=$part bs=512 count=34 seek=$((`blockdev --getsz $part` - 34))
                fi
@ -141,6 +167,7 @@ do
        else
            echo "Wiping $dev..."
            wipefs -f -a $dev
+            unmount_fs $dev

            # Clearing previous GPT tables or LVM data
            # Delete the first few bytes at the start and end of the partition. This is required with