Merge remote-tracking branch 'origin/master' into f/centos8

Signed-off-by: Charles Short <charles.short@windriver.com>
Change-Id: I63728d8d3a20b98c3114ebead4bd3007fbe187b5
This commit is contained in:
Charles Short 2021-05-19 15:25:20 -04:00
commit 6c2905e665
64 changed files with 1816 additions and 619 deletions

View File

@ -25,6 +25,7 @@ mtce-guestServer
nfscheck
radvd
config-gate-worker
isolcpus-device-plugin
kernel-rt
kernel-module-igb-uio
kernel-module-igb-uio-rt
@ -33,6 +34,7 @@ kernel-rt-modules-extra
kmod-e1000e-rt
kmod-i40e-rt
kmod-iavf-rt
kmod-ice-rt
kmod-ixgbe-rt
kmod-ixgbevf-rt
kmod-igb_uio-rt
@ -53,3 +55,7 @@ openvswitch-config
pci-irq-affinity-agent
kvm-timer-advance
sysinv-fpga-agent
kernel-rt-headers
kernel-rt-devel
kernel-headers
kernel-devel

View File

@ -13,6 +13,7 @@ kernel-rt-modules-extra
kmod-e1000e-rt
kmod-i40e-rt
kmod-iavf-rt
kmod-ice-rt
kmod-ixgbe-rt
kmod-ixgbevf-rt
kmod-igb_uio-rt
@ -26,3 +27,5 @@ qat17-rt
kernel-rt-tools
kernel-rt-tools-libs
kmod-drbd-rt
kernel-rt-headers
kernel-rt-devel

View File

@ -11,6 +11,7 @@ kernel-module-igb-uio
kmod-e1000e
kmod-i40e
kmod-iavf
kmod-ice
kmod-ixgbe
kmod-ixgbevf
kmod-igb_uio
@ -23,3 +24,5 @@ kernel-tools
kernel-tools-libs
kmod-drbd
kernel-modules-extra
kernel-headers
kernel-devel

View File

@ -69,6 +69,7 @@ influxdb
influxdb-extensions
io-monitor
io-scheduler
isolcpus-device-plugin
isomd5sum
ipxe-roms-qemu
kernel-module-openvswitch
@ -120,8 +121,6 @@ nova-tests
nova-api-proxy
nova-placement-api
novnc
net-snmp
net-snmp-config
openstack-aodh-api
openstack-aodh-commmon
openstack-aodh-compat
@ -256,7 +255,6 @@ qemu-kvm-ev
qemu-kvm-tools-ev
radvd
rubygem-rdoc
snmp-ext
task-cloud-compute
task-cloud-controller
tgt
@ -290,6 +288,7 @@ kernel-rt-modules-extra
kmod-e1000e-rt
kmod-i40e-rt
kmod-iavf-rt
kmod-ice-rt
kmod-ixgbe-rt
kmod-ixgbevf-rt
kmod-igb_uio-rt
@ -304,7 +303,6 @@ kernel-rt-tools
kernel-rt-tools-libs
NaviCLI-Linux-64-x86-en_US
kmod-drbd-rt
snmp-audittrail
wrs-ssl
tpm2-tools
tss2
@ -340,6 +338,11 @@ stx-oidc-auth-helm
stx-cert-manager-helm
stx-nginx-ingress-controller-helm
stx-portieris-helm
stx-snmp-helm
stx-vault-helm
sysinv-fpga-agent
k8s-pod-recovery
kernel-rt-headers
kernel-rt-devel
kernel-headers
kernel-devel

View File

@ -81,8 +81,6 @@ nova-tests
nova-api-proxy
nova-placement-api
novnc
net-snmp
net-snmp-config
openldap-backend-bdb
openldap-backend-dnssrv
openldap-backend-hdb
@ -138,7 +136,6 @@ python-swiftclient
python-wsme
fm-mgr
fm-rest-api
snmp-ext
sm
sm-api
sm-client
@ -258,6 +255,7 @@ kernel-rt-modules-extra
kmod-e1000e-rt
kmod-i40e-rt
kmod-iavf-rt
kmod-ice-rt
kmod-ixgbe-rt
kmod-ixgbevf-rt
kmod-igb_uio-rt
@ -272,7 +270,6 @@ kernel-rt-tools
kernel-rt-tools-libs
NaviCLI-Linux-64-x86-en_US
kmod-drbd-rt
snmp-audittrail
wrs-ssl
tpm2-tools
tss2
@ -301,5 +298,8 @@ stx-oidc-auth-helm
stx-cert-manager-helm
stx-nginx-ingress-controller-helm
stx-portieris-helm
stx-snmp-helm
stx-vault-helm
k8s-pod-recovery
kernel-rt-headers
kernel-rt-devel

View File

@ -81,8 +81,6 @@ nova-tests
nova-api-proxy
nova-placement-api
novnc
net-snmp
net-snmp-config
neutron-plugin-ml2
neutron-server
neutron-tests
@ -141,7 +139,6 @@ python-swiftclient
python-wsme
fm-mgr
fm-rest-api
snmp-ext
sm
sm-api
sm-client
@ -261,6 +258,7 @@ kernel-module-igb-uio
kmod-e1000e
kmod-i40e
kmod-iavf
kmod-ice
kmod-ixgbe
kmod-ixgbevf
kmod-igb_uio
@ -274,7 +272,6 @@ kernel-tools-libs
kernel-modules-extra
NaviCLI-Linux-64-x86-en_US
kmod-drbd-rt
snmp-audittrail
wrs-ssl
tpm2-tools
tss2
@ -302,5 +299,8 @@ stx-oidc-auth-helm
stx-cert-manager-helm
stx-nginx-ingress-controller-helm
stx-portieris-helm
stx-snmp-helm
stx-vault-helm
k8s-pod-recovery
kernel-headers
kernel-devel

View File

@ -29,11 +29,12 @@
## ETCD_STOR_SIZE = 5GiB
## CEPH_MON_SIZE = 20GiB
## KUBELET_STOR_SIZE = 10GiB
## DC_VAULT_SIZE = 15GiB
## RESERVED_PE = 16MiB (based on pesize=32768)
##
## CGCS_PV_SIZE = (10 + 2*10 + 25 + 8 + 16 + 2 + 1 + 30 + 16 + 5 + 20 + 10)GiB + 16MiB/1024 = 163.02GiB
## CGCS_PV_SIZE = (10 + 2*10 + 25 + 8 + 16 + 2 + 1 + 30 + 16 + 5 + 20 + 10 + 15)GiB + 16MiB/1024 = 178.02GiB
##
##***************************************************************************************************
##**********************************************************************************************************
## Small disk install - (for disks below 240GB)
## - DB size is doubled to allow for upgrades
##
@ -50,11 +51,12 @@
## ETCD_STOR_SIZE = 5GiB
## CEPH_MON_SIZE = 20GiB
## KUBELET_STOR_SIZE = 10GiB
## DC_VAULT_SIZE = 15GiB
## RESERVED_PE = 16MiB (based on pesize=32768)
##
## CGCS_PV_SIZE = (10 + 2*5 + 20 + 8 + 16 + 2 + 1 + 30 + 16 + 5 + 20 + 10)GiB + 16MiB/1024 = 148.02GiB
## CGCS_PV_SIZE = (10 + 2*5 + 20 + 8 + 16 + 2 + 1 + 30 + 16 + 5 + 20 + 10 + 15)GiB + 16MiB/1024 = 163.02GiB
##
##***************************************************************************************************
##*********************************************************************************************************
## Tiny disk install - (for disks below 154GB)
##
## NOTE: Tiny disk setup is mainly for StarlingX running in QEMU/KVM VM.
@ -89,15 +91,15 @@ EFI_SIZE=300
# which are DEFAULT_SMALL_DISK_SIZE
# MINIMUM_SMALL_DISK_SIZE
default_small_disk_size=240
minimum_small_disk_size=181
minimum_small_disk_size=196
sz=$(blockdev --getsize64 $rootfs_device)
# Round CGCS_PV_SIZE to the closest upper value that can be divided by 1024.
if [ $sz -gt $(($default_small_disk_size*$gb)) ] ; then
# Large disk: CGCS_PV_SIZE=164GiB*1024=167936
CGCS_PV_SIZE=167936
# Large disk: CGCS_PV_SIZE=179GiB*1024=183296
CGCS_PV_SIZE=183296
elif [ $sz -ge $(($minimum_small_disk_size*$gb)) ] ; then
# Small disk: CGCS_PV_SIZE=149GiB*1024=152576
CGCS_PV_SIZE=152576
# Small disk: CGCS_PV_SIZE=164GiB*1024=167936
CGCS_PV_SIZE=167936
else
# Tiny disk: CGCS_PV_SIZE=43GiB*1024=44032
# Using a disk with a size under 60GiB will fail.

View File

@ -167,6 +167,13 @@ else
# Avoid wiping ceph osds if sysinv tells us so
if [ ${WIPE_CEPH_OSDS} == "false" ]; then
wipe_dev="true"
# Leave the whole disk alone when the LVM physical-volume listing has a
# line matching "$dev" followed by "ceph" (treated here as rook provisioned).
if pvs | grep -q "$dev *ceph"; then
    wlog "skip rook provisoned disk $dev"
    continue
fi
part_numbers=( `parted -s $dev print | awk '$1 == "Number" {i=1; next}; i {print $1}'` )
# Scanning the partitions looking for CEPH OSDs and
# skipping any disk found with such partitions
@ -178,7 +185,15 @@ else
wipe_dev="false"
break
fi
# A partition of this disk shows up in the LVM physical-volume listing next
# to "ceph"; both "<dev><n>" and "<dev>p<n>" partition namings are checked.
if pvs | grep -q -e "${dev}${part_number} *ceph" -e "${dev}p${part_number} *ceph"; then
    wlog "Rook OSD found on $dev$part_number, skip wipe"
    wipe_dev="false"
    break
fi
done
# The partition scan above clears wipe_dev when it finds something that
# must be preserved; in that case move on to the next device untouched.
if [ "$wipe_dev" == "false" ]; then
continue
fi

View File

@ -6,6 +6,6 @@ COPY_LIST="pxe-network-installer/* \
/import/mirrors/CentOS/stx-installer/vmlinuz \
"
TIS_PATCH_VER=28
TIS_PATCH_VER=PKG_GITREVCOUNT+13
BUILD_IS_BIG=4
BUILD_IS_SLOW=4

View File

@ -110,6 +110,7 @@ install -v -m 644 %{_sourcedir}/efi-centos-pxe-worker_lowlatency-install \
install -v -m 644 %{_sourcedir}/efi-centos-pxe-smallsystem_lowlatency-install \
%{buildroot}/pxeboot/pxelinux.cfg.files/efi-pxe-smallsystem_lowlatency-install-%{platform_release}
ln -sf /pxeboot/EFI/grubx64.efi %{buildroot}/pxeboot/grubx64.efi
sed -i "s/xxxSW_VERSIONxxx/%{platform_release}/g" \
%{buildroot}/pxeboot/pxelinux.cfg.files/pxe-* \

View File

@ -274,9 +274,9 @@ void bmcUtil_create_pw_file ( thread_info_type * info_ptr,
*
*************************************************************************/
string bmcUtil_create_data_fn ( string & hostname,
string file_suffix,
bmc_protocol_enum protocol )
string bmcUtil_create_data_fn ( const string & hostname,
string file_suffix,
bmc_protocol_enum protocol )
{
/* create the output filename */
string datafile ;

View File

@ -82,6 +82,14 @@ typedef struct
} bmc_info_type ;
/* Bundle of the details needed to reach one host's BMC.
 * ipmiUtil_reset_host_now consumes it: bm_ip and bm_un go into the ipmi
 * request and bm_pw is written to a temporary password file. */
typedef struct
{
string hostname; /* name of the host this access info belongs to           */
string host_ip ; /* presumably the host's management ip ; TODO confirm     */
string bm_ip ;   /* board management controller ip address                 */
string bm_un ;   /* board management controller username                   */
string bm_pw ;   /* board management controller password                   */
} bmcUtil_accessInfo_type ;
/* BMC commands */
typedef enum
@ -107,6 +115,7 @@ typedef enum
#define BMC_QUERY_FILE_SUFFIX ((const char *)("_root_query"))
#define BMC_INFO_FILE_SUFFIX ((const char *)("_bmc_info"))
#define BMC_POWER_CMD_FILE_SUFFIX ((const char *)("_power_cmd_result"))
#define BMC_RESET_CMD_FILE_SUFFIX ((const char *)("_reset"))
#define BMC_BOOTDEV_CMD_FILE_SUFFIX ((const char *)("_bootdev"))
#define BMC_RESTART_CAUSE_FILE_SUFFIX ((const char *)("_restart_cause"))
#define BMC_POWER_STATUS_FILE_SUFFIX ((const char *)("_power_status"))
@ -137,9 +146,9 @@ void bmcUtil_create_pw_file ( thread_info_type * info_ptr,
bmc_protocol_enum protocol );
/* create the output filename */
string bmcUtil_create_data_fn ( string & hostname,
string file_suffix,
bmc_protocol_enum protocol );
string bmcUtil_create_data_fn ( const string & hostname,
string file_suffix,
bmc_protocol_enum protocol );
/* Get power state from query response data. */
int bmcUtil_is_power_on ( string hostname,

View File

@ -130,6 +130,14 @@ bool hostUtil_is_valid_username ( string un )
return (false);
}
/* A password is considered valid when it is not empty
 * and does not compare equal to NONE. */
bool hostUtil_is_valid_pw ( string pw )
{
    const bool usable = ( pw.empty() == false ) && ( pw.compare(NONE) != 0 ) ;
    return (usable);
}
bool hostUtil_is_valid_mac_addr ( string mac )
{
if ( !mac.empty() )

View File

@ -46,6 +46,7 @@ string hostUtil_getPrefixPath ( void );
bool hostUtil_is_valid_uuid ( string uuid );
bool hostUtil_is_valid_ip_addr ( string ip );
bool hostUtil_is_valid_username ( string un );
bool hostUtil_is_valid_pw ( string pw );
bool hostUtil_is_valid_bm_type ( string bm_type );
int hostUtil_mktmpfile ( string hostname, string basename, string & filename, string data );

View File

@ -202,3 +202,66 @@ int ipmiUtil_bmc_info_load ( string hostname, const char * filename, bmc_info_ty
ipmiUtil_bmc_info_log ( hostname, bmc_info, rc );
return (rc);
}
/*****************************************************************************
 *
 * Name       : ipmiUtil_reset_host_now
 *
 * Description: Build an IPMITOOL_POWER_RESET_CMD request for the BMC
 *              described by 'accessInfo' and run it inline with system().
 *
 *              The BMC password is handed to the request through a temporary
 *              password file that is removed again before returning.
 *
 * Parameters : hostname        - name used to prefix the latency log
 *              accessInfo      - bmc ip, username and password plus the
 *                                host name/ip used for logging
 *              output_filename - file the request output is directed to
 *
 * Returns    : 0 when the system call returned 0 ;
 *              otherwise FAIL_SYSTEM_CALL.
 *
 *****************************************************************************/
int ipmiUtil_reset_host_now ( string hostname,
                              bmcUtil_accessInfo_type accessInfo,
                              string output_filename)
{
    dlog("%s %s BMC [IP:%s UN:%s]",
             accessInfo.hostname.c_str(),
             accessInfo.host_ip.c_str(),
             accessInfo.bm_ip.c_str(),
             accessInfo.bm_un.c_str());

    /* make sure the output directories exist before the request runs */
    if (daemon_is_file_present ( BMC_OUTPUT_DIR ) == false )
        daemon_make_dir(BMC_OUTPUT_DIR) ;
    if (daemon_is_file_present ( IPMITOOL_OUTPUT_DIR ) == false )
        daemon_make_dir(IPMITOOL_OUTPUT_DIR) ;

    /* create temp password file */
    thread_info_type info ;
    info.hostname = accessInfo.hostname ;
    info.password_file = "" ;
    info.pw_file_fd = 0 ;

    /* Use common utility to create a temp pw file */
    bmcUtil_create_pw_file ( &info, accessInfo.bm_pw, BMC_PROTOCOL__IPMITOOL );

    /* create request */
    string request =
    ipmiUtil_create_request ( IPMITOOL_POWER_RESET_CMD,
                              accessInfo.bm_ip,
                              accessInfo.bm_un,
                              info.password_file,
                              output_filename );

    /* issue request
     *
     * Note: Could launch a thread to avoid any stall.
     *       However, mtcClient can withstand up to a 25 second stall
     *       before pmon will fail it due to active monitoring.
     *       UT showed that there is no stall at all. */
    unsigned long long latency_threshold_secs = DEFAULT_SYSTEM_REQUEST_LATENCY_SECS ;
    unsigned long long before_time = gettime_monotonic_nsec () ;

    /* c_str() is the accessor guaranteed to yield a NUL terminated string */
    int rc = system ( request.c_str()) ;

    unsigned long long after_time = gettime_monotonic_nsec () ;
    unsigned long long delta_time = after_time-before_time ;
    if ( rc )
    {
        wlog("system call failed ; rc:%d [%d:%s]", rc, errno, strerror(errno) );
        rc = FAIL_SYSTEM_CALL ;
    }
    if ( delta_time > (latency_threshold_secs*1000000000))
    {
        /* Print whole seconds and the nanosecond remainder.
         * The remainder is zero padded to 9 digits so that, for example,
         * 1.005 seconds is not rendered as 1.5 ; this assumes NSEC_TO_SEC
         * is the number of nanoseconds in a second - TODO confirm. */
        wlog ("%s bmc system call took %2llu.%09llu sec", hostname.c_str(),
                 (unsigned long long)(delta_time/NSEC_TO_SEC),
                 (unsigned long long)(delta_time%NSEC_TO_SEC));
    }

    /* Cleanup */
    if ( info.pw_file_fd > 0 )
        close(info.pw_file_fd);
    daemon_remove_file ( info.password_file.c_str());

    return (rc);
}

View File

@ -57,6 +57,8 @@ int ipmiUtil_init ( void );
int ipmiUtil_bmc_info_load ( string hostname, const char * filename, bmc_info_type & mc_info );
int ipmiUtil_reset_host_now ( string hostname, bmcUtil_accessInfo_type accessInfo, string output_filename );
/* Create the ipmi request */
string ipmiUtil_create_request ( string cmd, string & ip, string & un, string & pw, string & out );

View File

@ -149,6 +149,8 @@ const char * get_mtcNodeCommand_str ( int cmd )
case MTC_REQ_MTCALIVE: return ("mtcAlive req");
case MTC_MSG_LOCKED: return ("locked msg");
case MTC_CMD_LAZY_REBOOT: return ("lazy reboot");
case MTC_MSG_INFO: return ("info msg");
case MTC_CMD_SYNC: return ("sync");
/* goenabled commands and messages */
case MTC_MSG_MAIN_GOENABLED: return ("goEnabled main msg");
@ -199,7 +201,8 @@ const char * get_mtcNodeCommand_str ( int cmd )
case MTC_EVENT_PMON_MAJOR: return("pmon major event");
case MTC_EVENT_PMON_MINOR: return("pmon minor event");
case MTC_EVENT_PMON_LOG: return("pmon log");
case MTC_EVENT_PMOND_RAISE: return("pmon raise");
case MTC_EVENT_PMOND_RAISE: return("pmond raise");
case MTC_EVENT_PMOND_CLEAR: return("pmond clear");
/* data port events */
case MTC_EVENT_AVS_CLEAR: return("AVS clear");
@ -394,10 +397,9 @@ void mtc_stages_init ( void )
recoveryStages_str[MTC_RECOVERY__HEARTBEAT_START ] = "Heartbeat-Start";
recoveryStages_str[MTC_RECOVERY__HEARTBEAT_SOAK ] = "Heartbeat-Soak";
recoveryStages_str[MTC_RECOVERY__STATE_CHANGE ] = "State Change";
recoveryStages_str[MTC_RECOVERY__ENABLE_START ] = "Enable-Start";
recoveryStages_str[MTC_RECOVERY__FAILURE ] = "Failure";
recoveryStages_str[MTC_RECOVERY__WORKQUEUE_WAIT ] = "WorkQ-Wait";
recoveryStages_str[MTC_RECOVERY__ENABLE_WAIT ] = "Enable-Wait";
recoveryStages_str[MTC_RECOVERY__ENABLE ] = "Enable";
recoveryStages_str[MTC_RECOVERY__STAGES ] = "unknown";
disableStages_str [MTC_DISABLE__START ] = "Disable-Start";

View File

@ -185,7 +185,7 @@ typedef enum
#define DEFAULT_MTCALIVE_TIMEOUT (1200)
#define DEFAULT_GOENABLE_TIMEOUT (300)
#define DEFAULT_DOR_MODE_TIMEOUT (20)
#define DEFAULT_DOR_MODE_CPE_TIMEOUT (600)
#define DEFAULT_DOR_MODE_AIO_TIMEOUT (600)
/** TODO: Convert names to omit JSON part */
#define MTC_JSON_INV_LABEL "ihosts"
@ -263,6 +263,7 @@ typedef enum
#define MTC_TASK_ENABLE_WORK_FAIL "Enable Action Failed"
#define MTC_TASK_ENABLE_WORK_TO "Enable Action Timeout"
#define MTC_TASK_ENABLE_FAIL_HB "Enable Heartbeat Failure, re-enabling"
#define MTC_TASK_RECOVERY_FAIL_HB "Graceful Recovery Heartbeat Failure, re-enabling"
#define MTC_TASK_RECOVERY_FAIL "Graceful Recovery Failed, re-enabling"
#define MTC_TASK_RECOVERY_WAIT "Graceful Recovery Wait"
#define MTC_TASK_RECOVERED "Gracefully Recovered"
@ -311,7 +312,7 @@ typedef enum
#define MTC_TASK_POWERCYCLE_FAIL "Critical Event Power-Cycle %d; failed"
#define MTC_TASK_POWERCYCLE_DOWN "Critical Event Power-Down ; due to persistent critical sensor"
#define MTC_TASK_RESETTING_HOST "Resetting Host, critical sensor"
#define MTC_TASK_CPE_SX_UNLOCK_MSG "Unlocking, please stand-by while the system gracefully reboots"
#define MTC_TASK_AIO_SX_UNLOCK_MSG "Unlocking, please stand-by while the system gracefully reboots"
#define MTC_TASK_SELF_UNLOCK_MSG "Unlocking active controller, please stand-by while it reboots"
#define MTC_TASK_FAILED_SWACT_REQ "Critical failure.Requesting SWACT to enabled standby controller"
#define MTC_TASK_FAILED_NO_BACKUP "Critical failure.Please provision/enable standby controller"
@ -383,8 +384,8 @@ typedef enum
/* 5 milliseconds */
#define MTCAGENT_SELECT_TIMEOUT (5000)
/* dedicate more idle time in CPE ; there is less maintenance to do */
#define MTCAGENT_CPE_SELECT_TIMEOUT (10000)
/* dedicate more idle time in AIO ; there is less maintenance to do */
#define MTCAGENT_AIO_SELECT_TIMEOUT (10000)
/** Number of retries maintenance will do when it experiences
* a REST API call failure ; any failure */
@ -751,7 +752,9 @@ typedef struct
#define MTC_CMD_START_STORAGE_SVCS 19 /* to host */
#define MTC_CMD_LAZY_REBOOT 20 /* to host */
#define MTC_CMD_HOST_SVCS_RESULT 21 /* to host */
#define MTC_CMD_LAST 22
#define MTC_MSG_INFO 22 /* to host */
#define MTC_CMD_SYNC 23 /* to host */
#define MTC_CMD_LAST 24
#define RESET_PROG_MAX_REBOOTS_B4_RESET (5)
#define RESET_PROG_MAX_REBOOTS_B4_RETRY (RESET_PROG_MAX_REBOOTS_B4_RESET+2)
@ -946,7 +949,7 @@ typedef enum
string get_delStages_str ( mtc_delStages_enum stage );
#define MTC_MAX_FAST_ENABLES (3)
#define MTC_MAX_FAST_ENABLES (5)
typedef enum
{
MTC_RECOVERY__START = 0,
@ -972,10 +975,9 @@ typedef enum
MTC_RECOVERY__HEARTBEAT_START,
MTC_RECOVERY__HEARTBEAT_SOAK,
MTC_RECOVERY__STATE_CHANGE,
MTC_RECOVERY__ENABLE_START,
MTC_RECOVERY__FAILURE,
MTC_RECOVERY__WORKQUEUE_WAIT,
MTC_RECOVERY__ENABLE_WAIT,
MTC_RECOVERY__ENABLE,
MTC_RECOVERY__STAGES,
} mtc_recoveryStages_enum ;
@ -1263,6 +1265,14 @@ typedef enum
MTC_AR_DISABLE_CAUSE__NONE,
} autorecovery_disable_cause_enum ;
/* Code that identifies a specific group of maintenance information,
 * typically the information needed by one specific feature. */
typedef enum
{
MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO, /* first code ; implicit value 0     */
MTC_INFO_CODE__LAST                       /* listed last ; presumably an
                                           * end-of-list marker - TODO confirm */
} mtcInfo_enum ;
/* Service Based Auto Recovery Control Structure */
typedef struct
{

View File

@ -309,6 +309,48 @@ bool thread_idle ( thread_ctrl_type & ctrl )
return (false);
}
/****************************************************************************
 *
 * Name       : thread_done_consume
 *
 * Description: Drive a thread that is not idle through the DONE stage by
 *              running the thread handler on it.
 *
 *              If the thread has not been flagged done but its info run
 *              count is ahead of the control run count, it is logged,
 *              flagged done and consumed. If the run count is not ahead,
 *              the thread is sent a kill request instead.
 *
 * Returns    : PASS  - thread already idle or the done stage was handled
 *              RETRY - thread was not done ; kill requested
 *
 ****************************************************************************/
int thread_done_consume ( thread_ctrl_type & ctrl, thread_info_type & info )
{
    /* nothing to consume */
    if ( ctrl.stage == THREAD_STAGE__IDLE )
        return PASS ;

    if ( ctrl.done == false )
    {
        /* no completed run to consume ; request a kill, caller retries */
        if ( info.runcount <= ctrl.runcount )
        {
            thread_kill(ctrl, info);
            return RETRY ;
        }

        ilog("%s thread cleanup ; cmd:%d ; cnt:%d:%d",
                 info.hostname.c_str(),
                 info.command,
                 ctrl.runcount,
                 info.runcount);
        ctrl.done = true ;
    }

    /* common tail ; let the handler process the done stage */
    ctrl.stage = THREAD_STAGE__DONE ;
    thread_handler ( ctrl, info );
    return PASS ;
}
/****************************************************************************
*
* Name : thread_launch
@ -381,7 +423,7 @@ void thread_kill ( thread_ctrl_type & ctrl, thread_info_type & info )
( ctrl.stage != THREAD_STAGE__WAIT ) &&
( ctrl.stage != THREAD_STAGE__IDLE ))
{
blog ("%s kill request\n", ctrl.hostname.c_str() );
wlog ("%s kill request\n", ctrl.hostname.c_str() );
_stage_change ( ctrl, THREAD_STAGE__KILL );
}
}

View File

@ -284,6 +284,7 @@ bool thread_done ( thread_ctrl_type & ctrl );
bool thread_idle ( thread_ctrl_type & ctrl );
void thread_kill ( thread_ctrl_type & ctrl , thread_info_type & info );
string thread_stage ( thread_ctrl_type & ctrl );
int thread_done_consume ( thread_ctrl_type & ctrl, thread_info_type & info );
/* Cooperative service of cancel and exit requests from parent */
void pthread_signal_handler ( thread_info_type * info_ptr );

View File

@ -38,15 +38,15 @@ using namespace std ;
/* List of different types */
typedef enum
{
SYSTEM_TYPE__NORMAL =0,
SYSTEM_TYPE__CPE_MODE__DUPLEX =1,
SYSTEM_TYPE__CPE_MODE__DUPLEX_DIRECT =2,
SYSTEM_TYPE__CPE_MODE__SIMPLEX =3,
SYSTEM_TYPE__NORMAL =0,
SYSTEM_TYPE__AIO__DUPLEX =1,
SYSTEM_TYPE__AIO__DUPLEX_DIRECT =2,
SYSTEM_TYPE__AIO__SIMPLEX =3,
} system_type_enum ;
/** Called by signal handler on daemon exit
* Performs cleanup by closing open files
* Performs cleanup by closing open files
* and freeing used memory */
void daemon_exit ( void );

View File

@ -347,7 +347,7 @@ string daemon_mgmnt_iface ( void )
system_type_enum daemon_system_type ( void )
{
char buffer [BUFFER];
system_type_enum system_type = SYSTEM_TYPE__CPE_MODE__SIMPLEX ;
system_type_enum system_type = SYSTEM_TYPE__AIO__SIMPLEX ;
FILE * cfg_file_stream = fopen ( PLATFORM_CONF_FILE, "r" );
if ( cfg_file_stream != NULL )
{
@ -401,11 +401,11 @@ system_type_enum daemon_system_type ( void )
if ( !mode.empty() )
{
if ( mode.compare("duplex") == 0 )
system_type = SYSTEM_TYPE__CPE_MODE__DUPLEX ;
system_type = SYSTEM_TYPE__AIO__DUPLEX ;
else if ( mode.compare("duplex-direct") == 0 )
system_type = SYSTEM_TYPE__CPE_MODE__DUPLEX_DIRECT ;
system_type = SYSTEM_TYPE__AIO__DUPLEX_DIRECT ;
else if ( mode.compare("simplex") == 0 )
system_type = SYSTEM_TYPE__CPE_MODE__SIMPLEX ;
system_type = SYSTEM_TYPE__AIO__SIMPLEX ;
else
{
elog ("%s All-In-One system type ; mode unknown\n", SYSTEM_TYPE_PREFIX );
@ -438,21 +438,21 @@ system_type_enum daemon_system_type ( void )
ilog("%s Standard System\n", SYSTEM_TYPE_PREFIX);
break ;
}
case SYSTEM_TYPE__CPE_MODE__DUPLEX_DIRECT:
case SYSTEM_TYPE__AIO__DUPLEX_DIRECT:
{
ilog ("%s All-in-one Duplex Direct Connect\n", SYSTEM_TYPE_PREFIX );
break ;
}
case SYSTEM_TYPE__CPE_MODE__DUPLEX:
case SYSTEM_TYPE__AIO__DUPLEX:
{
ilog ("%s All-in-one Duplex\n", SYSTEM_TYPE_PREFIX );
break ;
}
case SYSTEM_TYPE__CPE_MODE__SIMPLEX:
case SYSTEM_TYPE__AIO__SIMPLEX:
default:
{
ilog ("%s All-in-one Simplex \n", SYSTEM_TYPE_PREFIX );
system_type = SYSTEM_TYPE__CPE_MODE__SIMPLEX ;
system_type = SYSTEM_TYPE__AIO__SIMPLEX ;
break ;
}
}

View File

@ -1,22 +1,13 @@
[Unit]
Description=StarlingX Maintenance Heartbeat Agent
After=network.target syslog.service config.service
After=hbsClient.service
Before=pmon.service
[Service]
Type=forking
ExecStart=/etc/rc.d/init.d/hbsAgent start
ExecStop=/etc/rc.d/init.d/hbsAgent start
ExecStop=/etc/rc.d/init.d/hbsAgent stop
PIDFile=/var/run/hbsAgent.pid
KillMode=process
SendSIGKILL=no
# Process recovery is handled by pmond if its running.
# Delay 10 seconds to give pmond a chance to recover
# before systemd kicks in to do it as a backup plan.
Restart=always
RestartSec=10
[Install]
WantedBy=multi-user.target

View File

@ -1,17 +1,19 @@
#daily
nodateext
start 1
compress
copytruncate
notifempty
missingok
#
# Copyright (c) 2018-2021 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
/var/log/mtcalarmd.log
{
create 0640 root root
start 1
size 10M
rotate 20
sharedscripts
compress
notifempty
missingok
postrotate
systemctl reload syslog-ng > /dev/null 2>&1 || true
endscript
delaycompress
}

View File

@ -660,7 +660,7 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
{
ptr->alarms[id] = FM_ALARM_SEVERITY_CLEAR ;
}
ptr->alarms_loaded = false ;
ptr->active_alarms = "" ; /* no active alarms */
ptr->cfgEvent.base = NULL ;
ptr->sysinvEvent.base= NULL ;
@ -778,6 +778,7 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
return ptr ;
}
struct nodeLinkClass::node* nodeLinkClass::getNode ( string hostname )
{
/* check for empty list condition */
@ -2706,7 +2707,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
node_ptr->operState = operState_str_to_enum (inv.oper.data ());
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
@ -2818,7 +2819,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
node_ptr->operState = operState_str_to_enum (inv.oper.data ());
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
@ -2835,7 +2836,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
node_ptr->operState = operState_str_to_enum (inv.oper.data ());
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
@ -2853,7 +2854,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
node_ptr->operState = operState_str_to_enum (inv.oper.data ());
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
@ -2871,7 +2872,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
node_ptr->operState = operState_str_to_enum (inv.oper.data ());
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
@ -2889,7 +2890,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
node_ptr->operState = operState_str_to_enum (inv.oper.data ());
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
@ -2940,7 +2941,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
node_ptr->operState = MTC_OPER_STATE__DISABLED ;
node_ptr->availStatus = MTC_AVAIL_STATUS__OFFLINE ;
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
node_ptr->operState_subf = MTC_OPER_STATE__DISABLED ;
node_ptr->availStatus_subf = MTC_AVAIL_STATUS__OFFLINE ;
@ -2958,7 +2959,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
node_ptr->operState = operState_str_to_enum (inv.oper.data ());
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
@ -3295,6 +3296,102 @@ void nodeLinkClass::mtcInfo_log ( struct nodeLinkClass::node * node_ptr )
}
}
/***************************************************************************
 *
 * Name        : mtcInfo_json_escape
 *
 * Purpose     : Return 'value' made safe for use inside a double quoted
 *               json string ; quote and backslash are backslash escaped
 *               and control characters are emitted as \u00XX.
 *
 **************************************************************************/
static string mtcInfo_json_escape ( const string & value )
{
    static const char hex_digits[] = "0123456789abcdef" ;

    string escaped ;
    escaped.reserve ( value.size() );
    for ( size_t i = 0 ; i < value.size() ; ++i )
    {
        const unsigned char c = (unsigned char)value[i] ;
        if (( c == '"' ) || ( c == '\\' ))
        {
            escaped.push_back('\\');
            escaped.push_back((char)c);
        }
        else if ( c < 0x20 )
        {
            escaped.append("\\u00");
            escaped.push_back(hex_digits[c >> 4]);
            escaped.push_back(hex_digits[c & 0x0f]);
        }
        else
        {
            escaped.push_back((char)c);
        }
    }
    return escaped ;
}

/***************************************************************************
 *
 * Name        : build_mtcInfo_dict
 *
 * Purpose     : Build a json dictionary for the specified info code enum
 *
 * Assumptions : Only MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO is supported.
 *               At most two controller entries are added.
 *
 * Returns     : An empty string if the host list is empty ; otherwise a
 *               json dictionary of mtcInfo keyed by controller hostname.
 *               The dictionary is "{}" when no entry qualifies.
 *
 *  {
 *     "controller-0":{
 *        "mgmt_ip":"192.168.204.2",
 *        "bm_ip":"xxx.xxx.xx.23",
 *        "bm_un":"root",
 *        "bm_pw":"root"
 *     },
 *     "controller-1":{
 *        "mgmt_ip":"192.168.204.3",
 *        "bm_ip":"xxx.xxx.xx.24",
 *        "bm_un":"root",
 *        "bm_pw":"root"
 *     }
 *  }
 *
 **************************************************************************/
string nodeLinkClass::build_mtcInfo_dict ( mtcInfo_enum mtcInfo_code )
{
    string mtcInfo_dict = "" ;

    /* number of entries added so far ; used for comma and exit control */
    int temp = 0 ;

    /* should never happen but better to be safe */
    if ( head == NULL )
        return mtcInfo_dict ;

    /* force the update to be a dictionary */
    mtcInfo_dict = "{" ;
    for ( struct node * ptr = head ;  ; ptr = ptr->next )
    {
        if (( ptr->nodetype & CONTROLLER_TYPE ) &&
            ( mtcInfo_code == MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO ))
        {
            if ( temp )
                mtcInfo_dict.append(",");

            /* Every value is escaped before being quoted ; a password
             * containing a quote or backslash would otherwise yield
             * a dictionary that is not valid json. */
            mtcInfo_dict.append("\"" + mtcInfo_json_escape(ptr->hostname) + "\":{");
            mtcInfo_dict.append("\"mgmt_ip\":\"" + mtcInfo_json_escape(ptr->ip) + "\",");
            mtcInfo_dict.append("\"bm_ip\":\"" + mtcInfo_json_escape(ptr->bm_ip) + "\",");
            mtcInfo_dict.append("\"bm_un\":\"" + mtcInfo_json_escape(ptr->bm_un) + "\",");
            mtcInfo_dict.append("\"bm_pw\":\"" + mtcInfo_json_escape(ptr->bm_pw) + "\"}");

            /* stop once two controller entries have been added */
            if ( ++temp >= 2 )
                break ;
        }
        if (( ptr->next == NULL ) || ( ptr == tail ))
            break ;
    }
    mtcInfo_dict.append("}");
    return mtcInfo_dict ;
}
/**************************************************************************
*
* Name : mtcInfo_handler
*
* Purpose : Send mtcInfo update to provisioned controllers when
* the push flag is set.
*
**************************************************************************/
void nodeLinkClass::mtcInfo_handler ( void )
{
/* This is set in the bm_handler once access to the BMC using
* provisioned credentials have been verified. */
if ( this->want_mtcInfo_push )
{
/* handler will enhance when more codes are introduced */
mtcInfo_enum mtcInfo_code = MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO ;
string mtcInfo_dict = build_mtcInfo_dict(mtcInfo_code);
if ( ! mtcInfo_dict.empty() )
{
string temp = CONTROLLER_0 ;
send_mtc_cmd ( temp, MTC_MSG_INFO, MGMNT_INTERFACE, mtcInfo_dict);
if ( this->controllers > 1 )
{
temp = CONTROLLER_1;
send_mtc_cmd ( temp, MTC_MSG_INFO, MGMNT_INTERFACE, mtcInfo_dict);
}
}
this->want_mtcInfo_push = false ;
}
}
/* Lock Rules
*
* 1. Cannot lock this controller
@ -4034,6 +4131,18 @@ int nodeLinkClass::get_uptime_refresh_ctr ( string & hostname )
return (0);
}
/* Return the mtce flags stored for 'hostname' ; 0 if the host is not found. */
int nodeLinkClass::get_mtce_flags ( string & hostname )
{
    nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname );
    if ( node_ptr == NULL )
        return (0);

    return ( node_ptr->mtce_flags );
}
void nodeLinkClass::set_mtce_flags ( string hostname, int flags, int iface )
{
nodeLinkClass::node* node_ptr = nodeLinkClass::getNode ( hostname );
@ -4114,7 +4223,7 @@ void nodeLinkClass::set_mtce_flags ( string hostname, int flags, int iface )
/* Deal with sub-function if AIO controller host */
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
if ( flags & MTC_FLAG__SUBF_GOENABLED )
{
@ -4422,6 +4531,18 @@ string nodeLinkClass::get_bm_ip ( string hostname )
return ("");
}
/* Return the board management password stored for 'hostname'.
 * An error is logged and an empty string returned if the host is not found. */
string nodeLinkClass::get_bm_pw ( string hostname )
{
    nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname );
    if ( node_ptr == NULL )
    {
        elog ("%s bm pw lookup failed\n", hostname.c_str() );
        return ("");
    }
    return (node_ptr->bm_pw);
}
string nodeLinkClass::get_bm_un ( string hostname )
{
nodeLinkClass::node* node_ptr ;
@ -4774,7 +4895,10 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa
/* Otherwise this is a single host that has recovered
* possibly as part of a mnfa group or simply a lone wolf */
else
else if (( node_ptr->hbs_minor[MGMNT_IFACE] == false ) &&
(( clstr_network_provisioned == false ) ||
(( clstr_network_provisioned == true ) &&
( node_ptr->hbs_minor[CLSTR_IFACE] == false ))))
{
if ( node_ptr->mnfa_graceful_recovery == true )
{
@ -4782,6 +4906,8 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa
mnfa_awol_list.remove(node_ptr->hostname);
}
/* Don't recover until heartbeat is working over all
* monitored interfaces */
mnfa_recover_host ( node_ptr );
if ( mnfa_active == true )
@ -4819,17 +4945,17 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa
}
if ( temp_count != mnfa_host_count[iface] )
{
{
slog ("%s MNFA host tally (%s:%d incorrect - expected %d) ; correcting\n",
node_ptr->hostname.c_str(),
get_iface_name_str(iface),
mnfa_host_count[iface], temp_count );
mnfa_host_count[iface] = temp_count ;
mnfa_host_count[iface] = temp_count ;
}
}
else
{
wlog ("%s MNFA host tally (%s:%d)\n",
dlog ("%s MNFA host tally (%s:%d)\n",
node_ptr->hostname.c_str(),
get_iface_name_str(iface),
mnfa_host_count[iface] );
@ -4935,11 +5061,28 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
}
return ;
}
else if ( node_ptr->recoveryStage == MTC_RECOVERY__HEARTBEAT_SOAK )
{
elog ("%s %s *** Heartbeat Loss *** (during recovery soak)\n",
hostname.c_str(),
get_iface_name_str(iface));
force_full_enable ( node_ptr );
return ;
}
mnfa_add_host ( node_ptr , iface );
if ( mnfa_active == false )
{
/* if node is already in graceful recovery just ignore the event */
if ( node_ptr->graceful_recovery_counter != 0 )
{
dlog ("%s %s loss event ; already in graceful recovery try %d",
hostname.c_str(),
get_iface_name_str(iface),
node_ptr->graceful_recovery_counter );
return ;
}
elog ("%s %s *** Heartbeat Loss ***\n", hostname.c_str(), get_iface_name_str(iface));
if ( iface == CLSTR_IFACE )
{
@ -4980,6 +5123,15 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
}
}
/****************************************************************************
*
* Name : manage_heartbeat_clear
*
* Description: Manage clearing heartbeat failure status
*
* Assumptions: Called by both hbsAgent and mtcAgent
*
***************************************************************************/
void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface )
{
nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname );
@ -4995,13 +5147,17 @@ void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface )
node_ptr->heartbeat_failed[i] = false ;
if ( i == MGMNT_IFACE )
{
node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
if ( heartbeat )
node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
if ( maintenance )
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
}
if ( i == CLSTR_IFACE )
{
node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ;
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ;
if ( heartbeat )
node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ;
if ( maintenance )
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ;
}
}
}
@ -5010,13 +5166,17 @@ void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface )
node_ptr->heartbeat_failed[iface] = false ;
if ( iface == MGMNT_IFACE )
{
node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
if ( heartbeat )
node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
if ( maintenance )
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
}
else if ( iface == CLSTR_IFACE )
{
node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ;
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ;
if ( heartbeat )
node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ;
if ( maintenance )
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ;
}
}
}
@ -5795,9 +5955,6 @@ int nodeLinkClass::critical_process_failed( string & hostname,
node_ptr->hostname.c_str()); /* dlog */
}
/* Start fresh the next time we enter graceful recovery handler */
node_ptr->graceful_recovery_counter = 0 ;
/* Set node as unlocked-disabled-failed */
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
MTC_OPER_STATE__DISABLED,
@ -6755,7 +6912,7 @@ int nodeLinkClass::disableStageChange ( struct nodeLinkClass::node * node_ptr,
}
/** Validate and log Recovery stage changes */
int nodeLinkClass::recoveryStageChange ( struct nodeLinkClass::node * node_ptr,
int nodeLinkClass::recoveryStageChange ( struct nodeLinkClass::node * node_ptr,
mtc_recoveryStages_enum newHdlrStage )
{
int rc = PASS ;
@ -6763,14 +6920,14 @@ int nodeLinkClass::recoveryStageChange ( struct nodeLinkClass::node * node_ptr,
if (( newHdlrStage >= MTC_RECOVERY__STAGES ) ||
( node_ptr->recoveryStage >= MTC_RECOVERY__STAGES ))
{
slog ("%s Invalid recovery stage (%d:%d)\n",
slog ("%s Invalid recovery stage (%d:%d)\n",
node_ptr->hostname.c_str(),
node_ptr->recoveryStage,
node_ptr->recoveryStage,
newHdlrStage );
if ( newHdlrStage < MTC_RECOVERY__STAGES )
{
clog ("%s ? -> %s\n",
clog ("%s ? -> %s\n",
node_ptr->hostname.c_str(),
get_recoveryStages_str(newHdlrStage).c_str());
@ -6782,11 +6939,11 @@ int nodeLinkClass::recoveryStageChange ( struct nodeLinkClass::node * node_ptr,
rc = FAIL ;
}
}
else
else
{
clog ("%s %s -> %s\n",
clog ("%s %s -> %s\n",
node_ptr->hostname.c_str(),
get_recoveryStages_str(node_ptr->recoveryStage).c_str(),
get_recoveryStages_str(node_ptr->recoveryStage).c_str(),
get_recoveryStages_str(newHdlrStage).c_str());
node_ptr->recoveryStage = newHdlrStage ;
@ -7514,7 +7671,7 @@ int nodeLinkClass::ar_manage ( struct nodeLinkClass::node * node_ptr,
mtcInvApi_update_states ( node_ptr, "unlocked", "disabled", "failed" );
if (( NOT_THIS_HOST ) &&
( this->system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX ))
( this->system_type != SYSTEM_TYPE__AIO__SIMPLEX ))
{
if ( ++node_ptr->ar_count[node_ptr->ar_cause] >=
this->ar_threshold [node_ptr->ar_cause] )
@ -7746,7 +7903,11 @@ int nodeLinkClass::mon_host ( const string & hostname, bool true_false, bool sen
if ( true_false == true )
{
ilog ("%s heartbeat start", hostname.c_str());
ilog ("%s %s heartbeat %sstart",
hostname.c_str(),
get_iface_name_str(iface),
node_ptr->monitor[iface] ? "re" : "");
node_ptr->no_work_log_throttle = 0 ;
node_ptr->b2b_misses_count[iface] = 0 ;
node_ptr->hbs_misses_count[iface] = 0 ;
@ -7758,7 +7919,12 @@ int nodeLinkClass::mon_host ( const string & hostname, bool true_false, bool sen
}
else
{
ilog ("%s heartbeat stop", hostname.c_str());
if ( node_ptr->monitor[iface] == true )
{
ilog ("%s %s heartbeat stop",
hostname.c_str(),
get_iface_name_str(iface));
}
}
node_ptr->monitor[iface] = true_false ;
}
@ -7771,7 +7937,7 @@ int nodeLinkClass::mon_host ( const string & hostname, bool true_false, bool sen
void nodeLinkClass::set_hwmond_monitor_state ( string & hostname, bool state )
{
if ( hostname.length() )
{
{
struct nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
@ -8511,7 +8677,7 @@ void nodeLinkClass::manage_heartbeat_alarm ( struct nodeLinkClass::node * node_p
#define HBS_LOSS_REPORT_THROTTLE (100)
#define HBS_LOSS_REPORT_THROTTLE (100000)
int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
{
int lost = 0 ;
@ -8551,6 +8717,13 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
if ( pulse_ptr->b2b_misses_count[iface] > 1 )
{
if ( pulse_ptr->b2b_misses_count[iface] < hbs_failure_threshold )
{
hbs_cluster_change ( pulse_ptr->hostname + " " +
get_iface_name_str(iface) +
" heartbeat miss " +
itos(pulse_ptr->b2b_misses_count[iface]));
}
if ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold )
{
if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold )
@ -8657,57 +8830,43 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
}
}
/* Turn the cluster-host heartbeat loss into a degrade only
* condition if the clstr_degrade_only flag is set */
if (( iface == CLSTR_IFACE ) &&
( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) &&
( clstr_degrade_only == true ))
{
/* Only print the log at the threshold boundary */
if (( pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
{
if ( this->active_controller )
{
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
}
wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n",
pulse_ptr->hostname.c_str(),
get_iface_name_str(iface) );
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
}
}
/* Turn the clstr heartbeat loss into a degrade only
* condition for inactive controller on normal system. */
else if (( iface == CLSTR_IFACE ) &&
( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) &&
( this->system_type == SYSTEM_TYPE__NORMAL ) &&
(( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE ))
if (( iface == CLSTR_IFACE ) &&
((( this->system_type == SYSTEM_TYPE__NORMAL ) &&
(( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE )) ||
( clstr_degrade_only == true )))
{
/* Only print the log at the threshold boundary */
if ( (pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
if ( pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE == hbs_failure_threshold )
{
if ( this->active_controller )
{
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
}
wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n",
wlog ( "%s %s *** Heartbeat Loss *** (degrade only due to %s)\n",
pulse_ptr->hostname.c_str(),
get_iface_name_str(iface));
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
get_iface_name_str(iface),
clstr_degrade_only ? "config option" : "system type");
hbs_cluster_change ( pulse_ptr->hostname + " " + get_iface_name_str(iface) + " heartbeat loss" );
}
}
else if ((pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
// else if ( pulse_ptr->hbs_failure[iface] == false )
{
elog ("%s %s *** Heartbeat Loss ***\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface) );
elog ("%s %s *** Heartbeat Loss *** (b2b_misses:0x%x)\n",
pulse_ptr->hostname.c_str(),
get_iface_name_str(iface),
pulse_ptr->b2b_misses_count[iface]);
hbs_cluster_change ( pulse_ptr->hostname + " " + get_iface_name_str(iface) + " heartbeat loss" );
if ( this->active_controller )
{
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
if ( pulse_ptr->hbs_failure[iface] == false )
{
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
}
/* report this host as failed */
if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_LOSS , iface ) == PASS )
{
@ -8715,10 +8874,8 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
}
}
else
{
pulse_ptr->hbs_failure[iface] = true ;
}
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
pulse_ptr->hbs_failure_count[iface]++ ;
}
if ( pulse_ptr->b2b_misses_count[iface] > pulse_ptr->max_count[iface] )
@ -8963,21 +9120,21 @@ void nodeLinkClass::mem_log_mtcalive ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tmtcAlive: online:%c offline:%c Cnt:%d Gate:%s Misses:%d\n",
node_ptr->hostname.c_str(),
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tmtcAlive: online:%c offline:%c Cnt:%d Gate:%s Misses:%d\n",
node_ptr->hostname.c_str(),
node_ptr->mtcAlive_online ? 'Y' : 'N',
node_ptr->mtcAlive_offline ? 'Y' : 'N',
node_ptr->mtcAlive_count,
node_ptr->mtcAlive_gate ? "closed" : "open",
node_ptr->mtcAlive_misses);
node_ptr->mtcAlive_misses);
mem_log (str);
}
void nodeLinkClass::mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tAlarm List:%s%s%s%s%s%s\n",
node_ptr->hostname.c_str(),
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tAlarm List:%s%s%s%s%s%s\n",
node_ptr->hostname.c_str(),
node_ptr->alarms[MTC_ALARM_ID__LOCK ] ? " Locked" : " .",
node_ptr->alarms[MTC_ALARM_ID__CONFIG ] ? " Config" : " .",
node_ptr->alarms[MTC_ALARM_ID__ENABLE ] ? " Enable" : " .",
@ -8987,6 +9144,18 @@ void nodeLinkClass::mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr )
mem_log (str);
}
/* Format the host's active alarms string and hand it to mem_log ;
 * nothing is logged when the active alarms string is empty. */
void nodeLinkClass::mem_log_alarm2 ( struct nodeLinkClass::node * node_ptr )
{
    /* no active alarms ; nothing to report */
    if ( node_ptr->active_alarms.empty() )
        return ;

    char str[MAX_MEM_LOG_DATA] ;
    snprintf ( &str[0], MAX_MEM_LOG_DATA, "%s\tActive Alarms:%s\n",
               node_ptr->hostname.c_str(),
               node_ptr->active_alarms.c_str());
    mem_log (str);
}
void nodeLinkClass::mem_log_stage ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
@ -9037,8 +9206,8 @@ void nodeLinkClass::mem_log_network ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s %s cluster_host_ip: %s Uptime: %u\n",
node_ptr->hostname.c_str(),
node_ptr->mac.c_str(),
node_ptr->hostname.c_str(),
node_ptr->mac.c_str(),
node_ptr->ip.c_str(),
node_ptr->clstr_ip.c_str(),
node_ptr->uptime );
@ -9050,11 +9219,11 @@ void nodeLinkClass::mem_log_heartbeat ( struct nodeLinkClass::node * node_ptr )
char str[MAX_MEM_LOG_DATA] ;
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
{
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s Minor:%s Degrade:%s Failed:%s Monitor:%s\n",
node_ptr->hostname.c_str(),
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s Minor:%s Degrade:%s Failed:%s Monitor:%s\n",
node_ptr->hostname.c_str(),
get_iface_name_str (iface),
node_ptr->hbs_minor[iface] ? "true " : "false",
node_ptr->hbs_degrade[iface] ? "true " : "false",
node_ptr->hbs_minor[iface] ? "true " : "false",
node_ptr->hbs_degrade[iface] ? "true " : "false",
node_ptr->hbs_failure[iface] ? "true " : "false",
node_ptr->monitor[iface] ? "YES" : "no" );
mem_log (str);
@ -9083,8 +9252,8 @@ void nodeLinkClass::mem_log_hbs_cnts ( struct nodeLinkClass::node * node_ptr )
void nodeLinkClass::mem_log_test_info ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tOOS Stage:%s Runs:%d - INSV Stage:%s Runs:%d\n",
node_ptr->hostname.c_str(),
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tOOS Stage:%s Runs:%d - INSV Stage:%s Runs:%d\n",
node_ptr->hostname.c_str(),
get_oosTestStages_str(node_ptr->oosTestStage).c_str(),
node_ptr->oos_test_count,
get_insvTestStages_str(node_ptr->insvTestStage).c_str(),
@ -9117,7 +9286,7 @@ void nodeLinkClass::mem_log_type_info ( struct nodeLinkClass::node * node_ptr )
node_ptr->function);
mem_log (str);
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tSub-Function: %s (%u) (SubFunc Enabled:%c)\n",
node_ptr->hostname.c_str(),
@ -9156,6 +9325,7 @@ void nodeLinkClass::memDumpNodeState ( string hostname )
// mem_log_reset_info ( node_ptr );
mem_log_power_info ( node_ptr );
mem_log_alarm1 ( node_ptr );
mem_log_alarm2 ( node_ptr );
mem_log_mtcalive ( node_ptr );
mem_log_stage ( node_ptr );
mem_log_bm ( node_ptr );

View File

@ -76,11 +76,11 @@ using namespace std;
#define LARGE_SYSTEM \
( this->system_type == SYSTEM_TYPE__NORMAL )
#define CPE_SYSTEM \
#define AIO_SYSTEM \
( this->system_type != SYSTEM_TYPE__NORMAL )
#define SIMPLEX_CPE_SYSTEM \
( this->system_type == SYSTEM_TYPE__CPE_MODE__SIMPLEX )
#define SIMPLEX_AIO_SYSTEM \
( this->system_type == SYSTEM_TYPE__AIO__SIMPLEX )
/**
* @addtogroup nodeLinkClass
@ -652,12 +652,12 @@ private:
/** @} private_monitoring_services_variables */
/* List of alarms and current severity */
#define MAX_ALARMS (10)
/* List of alarms current severity */
EFmAlarmSeverityT alarms[MAX_ALARMS];
/* tracks whether the alarms for this host have been loaded already or not */
bool alarms_loaded ;
/* string containing active alarms and their severity
* ... for logging purposes only */
string active_alarms ;
/** true if this host has recovered before the mnfa timeout period.
* This bool flags the graceful recovery handler that this node
@ -665,8 +665,6 @@ private:
* and uptime accordingly */
bool mnfa_graceful_recovery ;
int stress_iteration ;
/* BMC Protocol Learning Controls and State */
/* specifies what BMC protocol is selected for this host
@ -828,10 +826,13 @@ private:
int oos_test_handler ( struct nodeLinkClass::node * node_ptr );
int insv_test_handler ( struct nodeLinkClass::node * node_ptr );
int stress_handler ( struct nodeLinkClass::node * node_ptr );
int bmc_handler ( struct nodeLinkClass::node * node_ptr );
int bmc_handler ( struct nodeLinkClass::node * node_ptr );
int degrade_handler ( struct nodeLinkClass::node * node_ptr );
int uptime_handler ( void );
void mtcInfo_handler ( void );
int host_services_handler ( struct nodeLinkClass::node * node_ptr );
/* Starts the specified 'reset or powercycle' recovery monitor */
@ -840,6 +841,9 @@ private:
/* server specific power state query handler */
bool (*is_poweron_handler) (string hostname, string query_response );
/* Audit that monitors and auto corrects alarm state mismatches */
void mtcAlarm_audit ( struct nodeLinkClass::node * node_ptr );
/* Calculate the overall reset progression timeout */
int calc_reset_prog_timeout ( struct nodeLinkClass::node * node_ptr, int retries );
@ -851,13 +855,22 @@ private:
void ctl_mtcAlive_gate ( struct nodeLinkClass::node * node_ptr, bool gate_state );
void set_mtcAlive ( struct nodeLinkClass::node * node_ptr, int interface );
/********* mtcInfo in the database ************/
int mtcInfo_set ( struct nodeLinkClass::node * node_ptr, string key, string value );
string mtcInfo_get ( struct nodeLinkClass::node * node_ptr, string key );
void mtcInfo_clr ( struct nodeLinkClass::node * node_ptr, string key );
void mtcInfo_log ( struct nodeLinkClass::node * node_ptr );
int set_mtcInfo ( struct nodeLinkClass::node * node_ptr, string & mtc_info );
/********* mtcInfo that gets pushed out to daemons ***********/
/* flag telling mtce when a mtcInfo push needs to be done */
bool want_mtcInfo_push = false ;
/* performs the mtcInfo push */
void push_mtcInfo ( void );
/*****************************************************************************
*
* Name : bmc_command_send
@ -1192,11 +1205,11 @@ private:
* Set to true when the autorecovery threshold is reached
* and we want to avoid taking further autorecovery action
* even though it may be requested. */
bool autorecovery_disabled ;
bool autorecovery_disabled = false ;
/* Set to true by fault detection methods that are
* autorecoverable when in simplex mode. */
bool autorecovery_enabled ;
bool autorecovery_enabled = false ;
/** Tracks the number of hosts that 'are currently' in service trouble
* wrt heartbeat (above minor threshold).
@ -1292,6 +1305,7 @@ private:
void mem_log_state1 ( struct nodeLinkClass::node * node_ptr );
void mem_log_state2 ( struct nodeLinkClass::node * node_ptr );
void mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr );
void mem_log_alarm2 ( struct nodeLinkClass::node * node_ptr );
void mem_log_mtcalive ( struct nodeLinkClass::node * node_ptr );
void mem_log_stage ( struct nodeLinkClass::node * node_ptr );
void mem_log_test_info ( struct nodeLinkClass::node * node_ptr );
@ -1464,11 +1478,14 @@ public:
/***********************************************************/
/** Number of provisioned controllers */
int controllers = 0 ;
/** Number of provisioned hosts (nodes) */
int hosts ;
int hosts = 0 ;
/* Set to True while waiting for UNLOCK_READY_FILE in simplex mode */
bool unlock_ready_wait ;
bool unlock_ready_wait = false ;
/** Host has been deleted */
bool host_deleted ;
@ -1517,6 +1534,9 @@ public:
/** Return the number of inventoried hosts */
int num_hosts ( void );
/** Return the number of inventoried controllers */
int num_controllers ( void );
/** **********************************************************************
*
* Name : nodeLinkClass::workQueue_enqueue
@ -1664,6 +1684,9 @@ public:
/* Clear heartbeat failed flag for all interfaces */
void manage_heartbeat_clear ( string hostname, iface_enum iface );
/* Build a json dictionary containing the code-specified maintenance info */
string build_mtcInfo_dict ( mtcInfo_enum mtcInfo_code );
/** Test and Debug Members and Variables */
/** Print node info banner */
@ -1752,6 +1775,7 @@ public:
#define MTC_FLAG__I_AM_LOCKED (0x00000008)
*/
void set_mtce_flags ( string hostname, int flags, int iface );
int get_mtce_flags ( string & hostname );
/** Updates the node's health code
* Codes are found in nodeBase.h
@ -1789,6 +1813,7 @@ public:
string get_bm_ip ( string hostname );
string get_bm_un ( string hostname );
string get_bm_pw ( string hostname );
string get_bm_type ( string hostname );
string get_hostname_from_bm_ip ( string bm_ip );

View File

@ -1,15 +1,19 @@
#daily
nodateext
#
# Copyright (c) 2015-2021 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
/var/log/fsmond.log
{
size 10M
create 0640 root root
start 1
missingok
size 10M
rotate 20
compress
sharedscripts
notifempty
missingok
postrotate
systemctl reload syslog-ng > /dev/null 2>&1 || true
endscript
delaycompress
}

View File

@ -13,7 +13,7 @@ LDLIBS = -lstdc++ -ldaemon -lcommon -lthreadUtil -lpthread -lfmcommon -lalarm -l
INCLUDES = -I. -I/usr/include/mtce-daemon -I/usr/include/mtce-common
INCLUDES += -I../common -I../alarm -I../maintenance -I../public
CCFLAGS = -g -O2 -Wall -Wextra -Werror
CCFLAGS = -g -O2 -Wall -Wextra -Werror -std=c++11
STATIC_ANALYSIS_TOOL = cppcheck
STATIC_ANALYSIS_TOOL_EXISTS = $(shell [[ -e `which $(STATIC_ANALYSIS_TOOL)` ]] && echo 1 || echo 0)

View File

@ -1381,6 +1381,7 @@ int daemon_init ( string iface, string nodetype )
hbs_ctrl.locked = true ;
}
daemon_init_fit();
return (rc);
}
@ -1521,6 +1522,7 @@ void hbs_sm_handler ( void )
* False if time delta is greater
*
***************************************************************************/
#define HUGE_NUMBER_B2B_SM_HEARTBEAT_MISSES (10000)
bool manage_sm_heartbeat ( void )
{
struct timespec ts ;
@ -1532,8 +1534,9 @@ bool manage_sm_heartbeat ( void )
if ( delta_in_ms > SM_HEARTBEAT_PULSE_PERIOD_MSECS )
{
sm_heartbeat_count = 0;
if (( ++sm_heartbeat_count_b2b_misses < 20 )||
(!( sm_heartbeat_count_b2b_misses % 100 )))
if ((( ++sm_heartbeat_count_b2b_misses < 20 ) ||
(!( sm_heartbeat_count_b2b_misses % 1000 ))) &&
( sm_heartbeat_count_b2b_misses < HUGE_NUMBER_B2B_SM_HEARTBEAT_MISSES ))
{
wlog("SM Heartbeat missing since %ld.%03ld secs ago ; HBS Period Misses:%3d ; Running HB Count:%4d",
delta.secs, delta.msecs,
@ -1817,6 +1820,10 @@ void daemon_service_run ( void )
inv.name = hbsInv.my_hostname ;
inv.nodetype = CONTROLLER_TYPE ;
hbsInv.add_heartbeat_host ( inv );
/* add this host to local inventory */
hostname_inventory.push_front(hbsInv.my_hostname);
ilog ("%s added to inventory (self)", hbsInv.my_hostname.c_str());
}
/* enable the base level signal handler latency monitor */
@ -1841,7 +1848,7 @@ void daemon_service_run ( void )
clock_gettime (CLOCK_MONOTONIC, &sm_heartbeat_timestamp_last );
/* no need for the heartbeat audit in a simplex system */
if ( hbsInv.system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX )
if ( hbsInv.system_type != SYSTEM_TYPE__AIO__SIMPLEX )
{
/* start the state audit */
/* run the first audit in 30 seconds */
@ -2056,7 +2063,7 @@ void daemon_service_run ( void )
hbsInv.active_controller ? "" : "in" );
/* no need for the heartbeat audit in a simplex system */
if ( hbsInv.system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX )
if ( hbsInv.system_type != SYSTEM_TYPE__AIO__SIMPLEX )
{
/* Due to activity state change we will dump
* the heartbeat cluster state at now time
@ -2074,6 +2081,7 @@ void daemon_service_run ( void )
inv.nodetype = msg.parm[0];
hbsInv.add_heartbeat_host ( inv ) ;
hostname_inventory.push_back ( inv.name );
hostname_inventory.unique(); // avoid duplicates
ilog ("%s added to heartbeat service (%d)\n",
inv.name.c_str(),
inv.nodetype);
@ -2119,7 +2127,7 @@ void daemon_service_run ( void )
{
if ( hostname != hbsInv.my_hostname )
{
hbsInv.mon_host ( hostname, false, true );
hbsInv.mon_host ( hostname, false, false );
hbs_cluster_del ( hostname );
ilog ("%s heartbeat service disabled by stop command",
hostname.c_str());
@ -2366,6 +2374,7 @@ void daemon_service_run ( void )
arrival_histogram[iface] = "" ;
unexpected_pulse_list[iface] = "" ;
rc = hbs_pulse_request ( (iface_enum)iface, seq_num, ri, rri );
if ( rc != 0 )
{
@ -2523,7 +2532,9 @@ void daemon_service_run ( void )
}
}
/* log cluster throttled */
if (( heartbeat_ok == false ) && ( !( sm_heartbeat_count_b2b_misses % 100 )))
if ((( heartbeat_ok == false ) &&
( !( sm_heartbeat_count_b2b_misses % 1000 ))) &&
( sm_heartbeat_count_b2b_misses < HUGE_NUMBER_B2B_SM_HEARTBEAT_MISSES ))
{
hbs_state_audit ( );
}

View File

@ -326,7 +326,7 @@ void hbs_cluster_log ( string & hostname, mtce_hbs_cluster_type & cluster, stri
void hbs_sm_handler ( void );
/* send the cluster vault to SM */
void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason );
int hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason );
/* copy cluster data from src to dst */
void hbs_cluster_copy ( mtce_hbs_cluster_type & src, mtce_hbs_cluster_type & dst );
@ -338,6 +338,10 @@ void hbs_cluster_dump ( mtce_hbs_cluster_type & vault );
/* Heartbeat service state audit */
void hbs_state_audit ( void );
/* Send state change message to SM if there has been a
* state change in the last period */
void hbs_cluster_change_notifier ( void );
/**
* @} hbs_base
*/

View File

@ -69,6 +69,8 @@ typedef struct
msgClassSock * sm_socket_ptr ;
string cluster_change_reason ;
} hbs_cluster_ctrl_type ;
/* Cluster control structure construct allocation. */
@ -122,6 +124,8 @@ void hbs_cluster_init ( unsigned short period, msgClassSock * sm_socket_ptr )
{
ctrl.sm_socket_ptr = sm_socket_ptr ;
}
ctrl.cluster_change_reason = "";
ctrl.log_throttle = 0 ;
}
@ -173,7 +177,30 @@ void hbs_cluster_nums ( unsigned short this_controller,
void hbs_cluster_change ( string cluster_change_reason )
{
hbs_cluster_send( ctrl.sm_socket_ptr, 0, cluster_change_reason );
ilog ("reason: %s", cluster_change_reason.c_str());
if ( ctrl.cluster_change_reason.empty() )
ctrl.cluster_change_reason = cluster_change_reason ;
else
ctrl.cluster_change_reason.append("," + cluster_change_reason) ;
}
/****************************************************************************
*
* Name : hbs_cluster_change_notifier
*
* Description : Send SM the cluster info if there has been a state change.
*
***************************************************************************/
void hbs_cluster_change_notifier ( void )
{
    /* no pending change reason ; nothing to send */
    if ( ctrl.cluster_change_reason.empty () )
        return ;

    int send_rc = hbs_cluster_send ( ctrl.sm_socket_ptr, 0,
                                     ctrl.cluster_change_reason );

    /* the pending reason is only dropped once the send reports PASS ;
     * otherwise it is left in place for a later call */
    if ( send_rc == PASS )
        ctrl.cluster_change_reason.clear();
}
/****************************************************************************
@ -444,6 +471,7 @@ void hbs_cluster_update ( iface_enum iface,
wlog_throttled ( ctrl.log_throttle, THROTTLE_COUNT,
"Unable to store history beyond %d ",
ctrl.cluster.histories );
hbs_cluster_change_notifier ();
return ;
}
else
@ -544,6 +572,8 @@ void hbs_cluster_update ( iface_enum iface,
else
history_ptr->oldest_entry_index++ ;
hbs_cluster_change_notifier ();
/* clear the log throttle if we are updating history ok. */
ctrl.log_throttle = 0 ;
}
@ -647,12 +677,12 @@ unsigned short hbs_cluster_unused_bytes ( void )
*
***************************************************************************/
void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason )
int hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason )
{
int rc = FAIL_SOCKET_SENDTO ;
ctrl.cluster.reqid = (unsigned short)reqid ;
if (( sm_client_sock ) && ( sm_client_sock->sock_ok() == true ))
{
ilog ("cluster state notification Reason: %s", reason.c_str());
int len = sizeof(mtce_hbs_cluster_type)-hbs_cluster_unused_bytes();
int bytes = sm_client_sock->write((char*)&ctrl.cluster, len);
if ( bytes <= 0 )
@ -660,12 +690,19 @@ void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason
elog ("failed to send cluster vault to SM (bytes=%d) (%d:%s)\n",
bytes , errno, strerror(errno));
}
hbs_cluster_dump ( ctrl.cluster );
else
{
/* limit the string length */
ilog ("reason: %s", reason.substr(0,80).c_str());
hbs_cluster_dump ( ctrl.cluster );
rc = PASS ;
}
}
else
{
wlog ("cannot send cluster info due to socket error");
}
return(rc);
}
/****************************************************************************
@ -689,7 +726,7 @@ void hbs_history_save ( string hostname,
{
if ( hbs_cluster_cmp( sample, ctrl.cluster.history[h] ) )
{
hbs_cluster_change ("peer controller cluster event " +
hbs_cluster_change ("peer cluster delta " +
hbs_cluster_network_name((mtce_hbs_network_enum)sample.network));
}

View File

@ -279,8 +279,14 @@ void nodeLinkClass::mnfa_enter ( void )
/* Stub implementation ; performs no action.
 * The self-assignment only references the 'force' parameter. */
void nodeLinkClass::mnfa_exit ( bool force )
{
    force = force ;
}
int send_mtc_cmd ( string & hostname, int cmd, int interface )
{ UNUSED(hostname); UNUSED(cmd); UNUSED(interface); return PASS ; }
/* Stub implementation ; every argument is passed to UNUSED and
 * PASS is always returned. */
int send_mtc_cmd ( string & hostname, int cmd, int interface, string json_dict)
{
    /* NOTE(review): UNUSED presumably just marks a parameter as
     * intentionally unreferenced - confirm against its definition */
    UNUSED(json_dict); UNUSED(interface);
    UNUSED(cmd);       UNUSED(hostname);
    return PASS ;
}
int nodeLinkClass::mtcInvApi_subf_states ( string hostname,
string oper_subf,

View File

@ -1,16 +1,19 @@
#daily
nodateext
#
# Copyright (c) 2020-2021 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
/var/log/hostwd.log
{
nodateext
size 10M
create 0640 root root
start 1
missingok
size 10M
rotate 20
compress
sharedscripts
notifempty
missingok
postrotate
systemctl reload syslog-ng > /dev/null 2>&1 || true
endscript
delaycompress
}

View File

@ -254,7 +254,7 @@ void hwmonGroup_init ( string & hostname , struct sensor_group_type * group_ptr
group_ptr->actions_critical_choices.append(HWMON_ACTION_ALARM);
/* Don't support reset and power cycle in AIO simplex mode */
if ( obj_ptr->system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX )
if ( obj_ptr->system_type != SYSTEM_TYPE__AIO__SIMPLEX )
{
group_ptr->actions_critical_choices.append(",");
group_ptr->actions_critical_choices.append(HWMON_ACTION_RESET);

View File

@ -964,6 +964,10 @@ static int _parse_redfish_sensor_data( char * json_str_ptr, thread_info_type * i
{
strcpy(_sample_list[samples].status, "cr");
}
else if (!strcmp (health.data(), REDFISH_SEVERITY__NONRECOVERABLE ))
{
strcpy(_sample_list[samples].status, "nr");
}
else
{
strcpy(_sample_list[samples].status, "na");

View File

@ -33,6 +33,7 @@
#define REDFISH_SEVERITY__GOOD "OK"
#define REDFISH_SEVERITY__MAJOR "Warning"
#define REDFISH_SEVERITY__CRITICAL "Critical"
#define REDFISH_SEVERITY__NONRECOVERABLE "NonRecoverable"
#define BMC_SENSOR_DEFAULT_UNIT_TYPE_TEMP "degrees"
#define BMC_SENSOR_DEFAULT_UNIT_TYPE_VOLT "Volts"

View File

@ -1,28 +1,21 @@
#daily
nodateext
start 1
missingok
notifempty
compress
sharedscripts
postrotate
systemctl reload syslog-ng > /dev/null 2>&1 || true
endscript
#
# Copyright (c) 2020-2021 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
/var/log/hwmond.log
{
size 50M
rotate 5
}
/var/log/hwmond_event.log
{
size 50M
rotate 5
}
/var/log/hwmond_api.log
{
create 0640 root root
start 1
size 50M
rotate 5
compress
notifempty
missingok
postrotate
systemctl reload syslog-ng > /dev/null 2>&1 || true
endscript
delaycompress
}

View File

@ -1,16 +1,19 @@
#daily
nodateext
#
# Copyright (c) 2020-2021 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
/var/log/lmond.log
{
nodateext
size 10M
create 0640 root root
start 1
missingok
size 10M
rotate 20
compress
sharedscripts
notifempty
missingok
postrotate
systemctl reload syslog-ng > /dev/null 2>&1 || true
endscript
delaycompress
}

View File

@ -54,7 +54,7 @@ BINS = mtcAgent mtcClient
LDLIBS += -lstdc++ -ldaemon -lcommon -lthreadUtil -lbmcUtils -lfmcommon -lalarm -lpthread -lrt -levent -ljson-c -lamon -lcrypto -luuid
INCLUDES = -I. -I/usr/include/mtce-daemon -I/usr/include/mtce-common
INCLUDES += -I../common -I../alarm -I../heartbeat -I../hwmon -I../public
CCFLAGS += -g -O2 -Wall -Wextra -Werror -Wno-missing-braces
CCFLAGS += -g -O2 -Wall -Wextra -Werror -Wno-missing-braces -std=c++11
STATIC_ANALYSIS_TOOL = cppcheck
STATIC_ANALYSIS_TOOL_EXISTS = $(shell [[ -e `which $(STATIC_ANALYSIS_TOOL)` ]] && echo 1 || echo 0)

View File

@ -26,6 +26,7 @@ using namespace std;
#include "daemon_common.h" /* */
#include "nodeBase.h" /* */
#include "nodeClass.h" /* */
#include "nodeTimers.h" /* */
#include "nodeUtil.h" /* */
#include "mtcAlarm.h" /* for ... this module header */
@ -379,8 +380,169 @@ void mtcAlarm_clear_all ( string hostname )
}
}
/****************************************************************************
*
* Name : mtcAlarm_audit
*
* Purpose : Monitor and Auto-Correct maintenance alarms
*
* Description: Query locked state alarm (raw)
* if successful
* - Query alarms
* - compare to running state
* - correct mismatches ; internal state takes precedence
* - log all alarm state changes
*
****************************************************************************/
void nodeLinkClass::mtcAlarm_audit ( struct nodeLinkClass::node * node_ptr )
{
    /*
     * Probe FM with a raw query of this host's locked alarm.
     * Any result other than 'ok' or 'entity not found' is treated as an
     * FM access failure and the audit is abandoned for this pass.
     * All further alarm reads go through the mtcAlarm_state wrapper.
     */
    AlarmFilter   lock_filter ;
    SFmAlarmDataT lock_alarm  ;
    memset(&lock_alarm,  0, sizeof(lock_alarm));
    memset(&lock_filter, 0, sizeof(lock_filter));
    snprintf ( &lock_filter.alarm_id[0], FM_MAX_BUFFER_LENGTH,
               "%s", LOCK_ALARM_ID);
    snprintf ( &lock_filter.entity_instance_id[0], FM_MAX_BUFFER_LENGTH,
               "%s%s", ENTITY_PREFIX, node_ptr->hostname.data());

    EFmErrorT fm_rc = fm_get_fault ( &lock_filter, &lock_alarm );
    if (( fm_rc != FM_ERR_OK ) && ( fm_rc != FM_ERR_ENTITY_NOT_FOUND ))
    {
        wlog("%s alarm query failure ; code:%d",
              node_ptr->hostname.c_str(),
              fm_rc );
        return ;
    }

    /* FM is reachable ; walk every maintenance alarm id and build a
     * "<identity>:<severity>" summary of the ones that are asserted */
    string summary = "";
    for ( int index = 0 ; index < MAX_ALARMS ; ++index )
    {
        mtc_alarm_id_enum id = (mtc_alarm_id_enum)index ;

        /* set when this id belongs in the active alarm summary */
        bool report = false ;

        if ( id == MTC_ALARM_ID__LOCK )
        {
            if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
            {
                /* locked host must have the locked alarm at warning level */
                if ( lock_alarm.severity != FM_ALARM_SEVERITY_WARNING )
                {
                    node_ptr->alarms[id] = FM_ALARM_SEVERITY_WARNING ;
                    wlog("%s %s alarm mismatch ; %s -> %s",
                          node_ptr->hostname.c_str(),
                          _getIdentity(id).c_str(),
                          alarmUtil_getSev_str(lock_alarm.severity).c_str(),
                          alarmUtil_getSev_str(node_ptr->alarms[id]).c_str());
                    mtcAlarm_warning ( node_ptr->hostname, MTC_ALARM_ID__LOCK );
                }
                report = true ;
            }
            else if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
                     ( lock_alarm.severity != FM_ALARM_SEVERITY_CLEAR ))
            {
                /* unlocked host must not have the locked alarm asserted */
                node_ptr->alarms[id] = FM_ALARM_SEVERITY_CLEAR ;
                wlog("%s %s alarm mismatch ; %s -> %s",
                      node_ptr->hostname.c_str(),
                      _getIdentity(id).c_str(),
                      alarmUtil_getSev_str(lock_alarm.severity).c_str(),
                      alarmUtil_getSev_str(node_ptr->alarms[id]).c_str());
                mtcAlarm_clear ( node_ptr->hostname, id );
            }
        }
        else
        {
            /* only these alarm ids are audited ; every other id
             * (logs, events, etc) is ignored */
            bool audited = (( id == MTC_ALARM_ID__CONFIG  ) ||
                            ( id == MTC_ALARM_ID__ENABLE  ) ||
                            ( id == MTC_ALARM_ID__BM      ) ||
                            ( id == MTC_ALARM_ID__CH_CONT ) ||
                            ( id == MTC_ALARM_ID__CH_COMP ));
            if ( audited == false )
                continue ;

            /* push the internal severity to FM whenever FM disagrees */
            EFmAlarmSeverityT fm_severity = mtcAlarm_state ( node_ptr->hostname, id);
            if ( fm_severity != node_ptr->alarms[id] )
            {
                ilog ("%s %s alarm mismatch ; %s -> %s",
                       node_ptr->hostname.c_str(),
                       _getIdentity(id).c_str(),
                       alarmUtil_getSev_str(fm_severity).c_str(),
                       alarmUtil_getSev_str(node_ptr->alarms[id]).c_str());

                if ( node_ptr->alarms[id] != FM_ALARM_SEVERITY_CLEAR )
                {
                    mtcAlarm_raise ( node_ptr->hostname, id, node_ptr->alarms[id] );
                }
                else
                {
                    mtcAlarm_clear ( node_ptr->hostname, id );
                }
            }
            report = ( node_ptr->alarms[id] != FM_ALARM_SEVERITY_CLEAR );
        }

        if ( report )
        {
            if ( summary.empty() == false )
                summary.append(", ");
            summary.append(_getIdentity(id) + ":");
            summary.append(alarmUtil_getSev_str(node_ptr->alarms[id]));
        }
    }

    /* Log the active alarm list only when it changes */
    if ( summary.empty() )
    {
        /* log the transition from 'some' to 'no' active alarms once ;
         * stay quiet if there were none last time either */
        if ( node_ptr->active_alarms.empty() == false )
        {
            node_ptr->active_alarms.clear();
            ilog ("%s no active alarms", node_ptr->hostname.c_str());
        }
        return ;
    }

    /* there are active alarms ; log only if the list differs from
     * what was recorded by the previous audit */
    if ( node_ptr->active_alarms != summary )
    {
        ilog ("%s active alarms: %s",
               node_ptr->hostname.c_str(),
               summary.c_str());
        node_ptr->active_alarms = summary ;
    }
}
/************************* A L A R M I N G **************************/
/* Raise the specified maintenance alarm at the specified severity.
 *
 * Dispatches to the severity specific assert utility for the
 * warning, minor, major and critical severity levels.
 *
 * Returns the status of the severity specific utility or
 * FAIL_BAD_PARM for any other severity ; 'clear' included,
 * use mtcAlarm_clear for that. */
int mtcAlarm_raise ( string hostname, mtc_alarm_id_enum id, EFmAlarmSeverityT severity )
{
    switch ( severity )
    {
        /* warning is a valid maintenance alarm severity ; without this
         * case a caller that re-asserts a warning level alarm would be
         * rejected with FAIL_BAD_PARM and the alarm never raised */
        case FM_ALARM_SEVERITY_WARNING:
            return (mtcAlarm_warning(hostname,id));
        case FM_ALARM_SEVERITY_MINOR:
            return (mtcAlarm_minor(hostname,id));
        case FM_ALARM_SEVERITY_MAJOR:
            return (mtcAlarm_major(hostname,id));
        case FM_ALARM_SEVERITY_CRITICAL:
            return (mtcAlarm_critical(hostname,id));
        default:
            return (FAIL_BAD_PARM);
    }
}
/* Clear the specified hosts's maintenance alarm */
int mtcAlarm_clear ( string hostname, mtc_alarm_id_enum id )
{

View File

@ -95,6 +95,9 @@ string mtcAlarm_getId_str ( mtc_alarm_id_enum id );
/** Clear the specified maintenance alarm for specific host */
int mtcAlarm_clear ( string hostname, mtc_alarm_id_enum id );
/** Raise specified severity level alarm for the specified host */
int mtcAlarm_raise ( string hostname, mtc_alarm_id_enum id, EFmAlarmSeverityT severity );
/** Assert a specified mtce alarm against the specified host with a WARNING severity level */
int mtcAlarm_warning ( string hostname, mtc_alarm_id_enum id );

View File

@ -39,6 +39,26 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr,
{
int rc = PASS ;
/* handle 'kill of in-progress' thread or 'done but not consumed' thread */
if ( ! thread_idle ( node_ptr->bmc_thread_ctrl ))
{
if ( ! thread_done ( node_ptr->bmc_thread_ctrl ))
{
thread_kill ( node_ptr->bmc_thread_ctrl,
node_ptr->bmc_thread_info );
return (RETRY);
}
else
{
mtcTimer_reset ( node_ptr->bmc_thread_ctrl.timer );
if ( thread_done_consume ( node_ptr->bmc_thread_ctrl,
node_ptr->bmc_thread_info ) != PASS )
{
return (RETRY);
}
}
}
node_ptr->bmc_thread_info.command = command ;
/* Update / Setup the BMC access credentials */
@ -437,6 +457,13 @@ bmc_command_recv_cleanup:
if ( rc != RETRY )
{
ilog ("%s %s recv '%s' command (%s) (rc:%d)",
node_ptr->hostname.c_str(),
node_ptr->bmc_thread_ctrl.name.c_str(),
bmcUtil_getCmd_str(node_ptr->bmc_thread_info.command).c_str(),
bmcUtil_getProtocol_str(node_ptr->bmc_protocol).c_str(),
rc);
node_ptr->bmc_thread_ctrl.done = true ;
node_ptr->bmc_thread_ctrl.retries = 0 ;
node_ptr->bmc_thread_ctrl.id = 0 ;

View File

@ -20,7 +20,7 @@
#include <stdio.h>
#include <string.h>
#include <sys/un.h> /* for ... unix domain sockets */
#include <sys/un.h> /* for ... unix domain sockets */
#include <arpa/inet.h>
#include <sys/socket.h>
#include <net/if.h>
@ -29,8 +29,8 @@
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>
#include <list> /* for the list of conf file names */
#include <list> /* for ... list of conf file names */
#include <unistd.h> /* for ... sync */
using namespace std;
@ -70,11 +70,15 @@ void stop_pmon( void )
{
/* max pipe command response length */
#define PIPE_COMMAND_RESPON_LEN (100)
ilog("Stopping collectd.");
int rc = system("/usr/local/sbin/pmon-stop collectd");
sleep (2);
ilog("Stopping pmon to prevent process recovery during shutdown");
for ( int retry = 0 ; retry < 5 ; retry++ )
{
char pipe_cmd_output [PIPE_COMMAND_RESPON_LEN] ;
int rc = system("/usr/bin/systemctl stop pmon");
rc = system("/usr/bin/systemctl stop pmon");
sleep(2);
/* confirm pmon is no longer active */
@ -204,6 +208,24 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface )
mlog1 ("mtcAlive request received (%s network)\n", interface_name.c_str());
return ( send_mtcAlive_msg ( sock_ptr, get_who_i_am(), interface ));
}
else if ( msg.cmd == MTC_MSG_INFO )
{
mlog1("mtc 'info' message received (%s network)\n", interface_name.c_str());
load_mtcInfo_msg ( msg );
return ( PASS ); /* no ack for this message */
}
else if ( msg.cmd == MTC_CMD_SYNC )
{
ilog ("mtc '%s' message received (%s network)\n",
get_mtcNodeCommand_str(msg.cmd),
interface_name.c_str());
ilog ("Sync Start");
sync ();
ilog ("Sync Done");
return ( PASS ); /* no ack for this message */
}
else if ( msg.cmd == MTC_MSG_LOCKED )
{
/* Only recreate the file if its not already present */
@ -603,7 +625,7 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface )
}
/** Send an event to the mtcAgent **/
int mtce_send_event ( mtc_socket_type * sock_ptr, int cmd , const char * mtce_name_ptr )
int mtce_send_event ( mtc_socket_type * sock_ptr, unsigned int cmd , const char * mtce_name_ptr )
{
mtc_message_type event ;
@ -619,6 +641,24 @@ int mtce_send_event ( mtc_socket_type * sock_ptr, int cmd , const char * mtce_na
/* We don't use the buffer for mtce events to remove it from the size */
bytes = ((sizeof(mtc_message_type))-(BUF_SIZE));
}
else if ( cmd == MTC_EVENT_MONITOR_READY )
{
string event_info = "{\"" ;
event_info.append(MTC_JSON_INV_NAME);
event_info.append("\":\"");
event_info.append(get_hostname());
event_info.append("\",\"");
event_info.append(MTC_JSON_SERVICE);
event_info.append("\":\"");
event_info.append(MTC_SERVICE_MTCCLIENT_NAME );
event_info.append("\"}");
size_t len = event_info.length()+1 ;
snprintf ( &event.hdr[0], MSG_HEADER_SIZE, "%s", get_mtce_event_header());
snprintf ( &event.buf[0], len, "%s", event_info.data());
bytes = ((sizeof(mtc_message_type))-(BUF_SIZE-len));
ilog ("%s %s ready", get_hostname().c_str(), MTC_SERVICE_MTCCLIENT_NAME);
}
else if (( cmd == MTC_EVENT_AVS_CLEAR ) ||
( cmd == MTC_EVENT_AVS_MAJOR ) ||
( cmd == MTC_EVENT_AVS_CRITICAL ))
@ -666,7 +706,7 @@ int mtce_send_event ( mtc_socket_type * sock_ptr, int cmd , const char * mtce_na
{
if ( bytes == 0 )
{
slog ("message send failed ; message size=0 for cmd:%d is 0\n", event.cmd );
slog ("message send failed ; message size=0 for cmd:0x%x is 0\n", event.cmd );
rc = FAIL_NO_DATA ;
}
else if ((rc = sock_ptr->mtc_client_tx_socket->write((char*)&event.hdr[0], bytes))!= bytes )
@ -912,15 +952,18 @@ int send_mtcAlive_msg ( mtc_socket_type * sock_ptr, string identity, int interfa
}
/* Send to controller-1 cluster address */
if (( sock_ptr->mtc_client_tx_socket_c1_clstr ) &&
( sock_ptr->mtc_client_tx_socket_c1_clstr->sock_ok() == true ))
if ( get_ctrl_ptr()->system_type != SYSTEM_TYPE__AIO__SIMPLEX )
{
print_mtc_message ( CONTROLLER_1, MTC_CMD_TX, msg, get_iface_name_str(CLSTR_INTERFACE), false );
sock_ptr->mtc_client_tx_socket_c1_clstr->write((char*)&msg.hdr[0], bytes ) ;
}
else
{
elog("mtc_client_tx_socket_c1_clstr not ok");
if (( sock_ptr->mtc_client_tx_socket_c1_clstr ) &&
( sock_ptr->mtc_client_tx_socket_c1_clstr->sock_ok() == true ))
{
print_mtc_message ( CONTROLLER_1, MTC_CMD_TX, msg, get_iface_name_str(CLSTR_INTERFACE), false );
sock_ptr->mtc_client_tx_socket_c1_clstr->write((char*)&msg.hdr[0], bytes ) ;
}
else
{
elog("mtc_client_tx_socket_c1_clstr not ok");
}
}
}
else
@ -933,32 +976,59 @@ int send_mtcAlive_msg ( mtc_socket_type * sock_ptr, string identity, int interfa
return (PASS) ;
}
/* Accelerated Virtual Switch 'events' socket
* - for receiving data port state change event
* Event strings are
*
* {"type":"port-state", "severity":"critical|major|clear"}
*
* type:port-state - the provider network data port status has changed to the supplied fault severity
*
* severity:
* critical - port has failed and is not part of an aggregate or is the last port in an aggregate (degrade, disable services)
* major - port has failed and is part of an aggregate with other inservice-ports (degrade only)
* clear - port has recovered from a failed state and is operational (clear degrade, enable services)
*
* NOTE: The port status can transition from any of the above states to any other state.
*
* The neutron agent monitors the vswitch ports at a 2 second interval.
* If a port changes link state during the polling period, it will
* raise/clear the alarm, but now also calculates the impact of that port
* failure on the provider network data interface.
*
* The overall aggregated state across all provider network interfaces will
* be reported to maintenance when ports enter a link down or up state.
* The agent will also periodically send the current provider network port
* status to maintenance every 30 seconds.
*
*/
/* Send a maintenance command to the mtcClient at the specified
 * address and port using this daemon's mtc client tx socket.
 *
 * Only the MTC_CMD_SYNC command is supported.
 *
 * Returns PASS               on successful send,
 *         FAIL_BAD_CASE      for an unsupported command,
 *         FAIL_BAD_STATE     if the tx socket is missing or not ok,
 *         FAIL_SOCKET_SENDTO if the socket write fails. */
int send_mtcClient_cmd ( mtc_socket_type * sock_ptr, int cmd, string hostname, string address, int port)
{
    mtc_message_type msg ;
    MEMSET_ZERO (msg);
    snprintf ( &msg.hdr[0], MSG_HEADER_SIZE, "%s", get_cmd_req_msg_header());
    msg.cmd = cmd ;

    /* reject anything other than the one supported command */
    if ( cmd != MTC_CMD_SYNC )
    {
        slog("Unsupported command ; %s:%d", get_mtcNodeCommand_str(cmd), cmd );
        return (FAIL_BAD_CASE);
    }

    ilog ("Sending '%s' command to %s:%s:%d",
           get_mtcNodeCommand_str(cmd),
           hostname.c_str(),
           address.c_str(), port);
    msg.num = 0 ;

    /* the sync command does not use the message buffer
     * so exclude it from the number of bytes to send */
    int bytes = ((sizeof(mtc_message_type))-(BUF_SIZE));

    /* need a usable tx socket to go any further */
    if (( ! sock_ptr->mtc_client_tx_socket ) ||
        ( sock_ptr->mtc_client_tx_socket->sock_ok() != true ))
    {
        elog("mtc_client_tx_socket not ok");
        return (FAIL_BAD_STATE);
    }

    print_mtc_message ( hostname, MTC_CMD_TX, msg, get_iface_name_str(MGMNT_INTERFACE), false );
    int sent = sock_ptr->mtc_client_tx_socket->write((char*)&msg.hdr[0], bytes, address.data(), port ) ;
    if ( sent <= 0 )
    {
        elog("failed to send command to mtcClient (%d) (%d:%s)", sent, errno, strerror(errno));
        return (FAIL_SOCKET_SENDTO);
    }
    return (PASS);
}
int mtcCompMsg_testhead ( void )
{

View File

@ -443,6 +443,34 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr,
obj_ptr->declare_service_ready ( hostname, MTC_SERVICE_HEARTBEAT );
return (PASS);
}
else if ( service == MTC_SERVICE_MTCCLIENT_NAME )
{
ilog ("%s %s ready", hostname.c_str(), MTC_SERVICE_MTCCLIENT_NAME);
/* if this ready event is from the mtcClient of a
* controller that has valid bmc access info then
* build the 'peer controller kill' mtcInfo and
* send it to that mtcClient */
if ( obj_ptr->get_nodetype ( hostname ) & CONTROLLER_TYPE )
{
string bm_pw = obj_ptr->get_bm_pw ( hostname ) ;
if ( !bm_pw.empty() && ( bm_pw != NONE ))
{
string bm_un = obj_ptr->get_bm_un ( hostname ) ;
string bm_ip = obj_ptr->get_bm_ip ( hostname ) ;
if (( hostUtil_is_valid_username ( bm_un )) &&
( hostUtil_is_valid_ip_addr ( bm_ip )))
{
send_mtc_cmd ( hostname,
MTC_MSG_INFO,
MGMNT_INTERFACE,
obj_ptr->build_mtcInfo_dict (
MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO));
}
}
}
return (PASS);
}
if ( service == MTC_SERVICE_HWMOND_NAME )
{
std::list<string>::iterator temp ;
@ -578,11 +606,12 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr,
return (rc);
}
int send_mtc_cmd ( string & hostname, int cmd , int interface )
int send_mtc_cmd ( string & hostname, int cmd , int interface, string json_dict )
{
int rc = FAIL ;
bool force = false ;
mtc_message_type mtc_cmd ;
string data = "" ;
mtc_socket_type * sock_ptr = get_sockPtr ();
memset (&mtc_cmd,0,sizeof(mtc_message_type));
@ -592,6 +621,16 @@ int send_mtc_cmd ( string & hostname, int cmd , int interface )
switch ( cmd )
{
case MTC_MSG_INFO:
{
snprintf ( &mtc_cmd.hdr[0], MSG_HEADER_SIZE, "%s" , get_cmd_req_msg_header() );
mtc_cmd.cmd = cmd ;
mtc_cmd.num = 0 ;
data = "{\"mtcInfo\":" + json_dict + "}";
ilog("%s mtc info update", hostname.c_str());
rc = PASS ;
break ;
}
case MTC_REQ_MTCALIVE:
{
snprintf ( &mtc_cmd.hdr[0], MSG_HEADER_SIZE, "%s" , get_cmd_req_msg_header() );
@ -689,11 +728,20 @@ int send_mtc_cmd ( string & hostname, int cmd , int interface )
* Note: the minus 1 is to overwrite the null */
snprintf ( &mtc_cmd.hdr[MSG_HEADER_SIZE-1], MSG_HEADER_SIZE, "%s", obj_ptr->get_hostIfaceMac(hostname, MGMNT_IFACE).data());
string data = "{\"address\":\"";
data.append(obj_ptr->my_float_ip) ;
data.append("\",\"interface\":\"");
data.append(get_iface_name_str(interface));
data.append("\"}");
/* If data is empty then at least add where the message came from */
if ( data.empty() )
{
data = "{\"address\":\"";
data.append(obj_ptr->my_float_ip) ;
data.append("\",\"interface\":\"");
data.append(get_iface_name_str(interface));
data.append("\"}");
}
else
{
; /* data is already pre loaded by the command case above */
}
/* copy data into message buffer */
snprintf ( &mtc_cmd.buf[0], data.length()+1, "%s", data.data());
bytes = (sizeof(mtc_message_type)-(BUF_SIZE-(data.length()+1)));
@ -1176,7 +1224,7 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr )
else if ( msg.cmd == MTC_EVENT_HEARTBEAT_READY )
{
/* no heartbeating in simplex mode */
if ( obj_ptr->system_type == SYSTEM_TYPE__CPE_MODE__SIMPLEX )
if ( obj_ptr->system_type == SYSTEM_TYPE__AIO__SIMPLEX )
{
return (PASS);
}
@ -1214,13 +1262,68 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr )
{
elog ("%s Failed to send inventory to heartbeat service\n", hostname.c_str());
}
/* Send the start event to the heartbeat service for all enabled hosts */
/* Consider sending the 'start' request to the heartbeat service
* for all enabled hosts. */
if (( obj_ptr->get_adminState ( hostname ) == MTC_ADMIN_STATE__UNLOCKED ) &&
( obj_ptr->get_operState ( hostname ) == MTC_OPER_STATE__ENABLED ) &&
((obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__AVAILABLE ) ||
(obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__DEGRADED )))
{
send_hbs_command ( hostname, MTC_CMD_START_HOST, controller );
/* However, bypass sending heartbeat 'start' for nodes that
* are not ready to heartbeat; enabling, configuring, testing.
* Such cases are if a host is:
*
* 1. running the add_handler or
* 2. running the enable_handler or
* 3. running the enable_subf_handler or
* 4. not configured or
* 5. not tested (goenabled not complete)
*
*/
mtc_nodeAdminAction_enum current_action =
obj_ptr->get_adminAction (hostname);
if (( current_action != MTC_ADMIN_ACTION__ADD ) &&
( current_action != MTC_ADMIN_ACTION__ENABLE ) &&
( current_action != MTC_ADMIN_ACTION__ENABLE_SUBF ))
{
int mtce_flags = obj_ptr->get_mtce_flags(hostname);
if (( mtce_flags & MTC_FLAG__I_AM_CONFIGURED ) &&
( mtce_flags & MTC_FLAG__I_AM_HEALTHY ) &&
( mtce_flags & MTC_FLAG__MAIN_GOENABLED ))
{
if (( obj_ptr->system_type != SYSTEM_TYPE__NORMAL ) &&
( obj_ptr->get_nodetype ( hostname ) & CONTROLLER_TYPE ))
{
/* If it's an AIO then its worker subfunction
* needs to have been configured and tested. */
if (( mtce_flags & MTC_FLAG__SUBF_CONFIGURED ) &&
( mtce_flags & MTC_FLAG__SUBF_GOENABLED ))
{
ilog("%s heartbeat start (AIO controller)",
hostname.c_str());
send_hbs_command ( hostname, MTC_CMD_START_HOST, controller );
}
else
{
wlog ("%s not heartbeat ready (subf) (oob:%x)",
hostname.c_str(),
mtce_flags);
}
}
else
{
ilog("%s heartbeat start (from ready event)",
hostname.c_str());
send_hbs_command ( hostname, MTC_CMD_START_HOST, controller );
}
}
else
{
wlog ("%s not heartbeat ready (main) (oob:%x)",
hostname.c_str(),
mtce_flags);
}
}
}
}
ilog ("%s %s inventory push ... done",

View File

@ -974,7 +974,7 @@ int nodeLinkClass::mtcInvApi_update_states_now ( struct nodeLinkClass::node * no
else
avail = " " ;
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
if ( ! oper_subf.empty() )
{
@ -1016,7 +1016,7 @@ int nodeLinkClass::mtcInvApi_update_states_now ( struct nodeLinkClass::node * no
this->sysinvEvent.payload.erase(len-1,1);
this->sysinvEvent.payload.append ( "]");
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
ilog ("%s %s-%s-%s %s-%s\n",
node_ptr->hostname.c_str(),

View File

@ -43,9 +43,9 @@
#include <signal.h>
#include <fcntl.h>
#include <errno.h>
//#include <syslog.h> /* for ... syslog */
#include <sys/stat.h>
#include <list>
#include <json-c/json.h> /* for ... json_tokener_parse */
using namespace std;
@ -56,6 +56,10 @@ using namespace std;
#include "nodeBase.h" /* for ... Common Definitions */
#include "nodeTimers.h" /* fpr ... Timer Service */
#include "nodeUtil.h" /* for ... Common Utilities */
#include "hostUtil.h" /* for ... hostUtil_is_valid_... */
#include "jsonUtil.h" /* for ... jsonUtil_get_key_value_string */
#include "bmcUtil.h" /* for ... bmcUtil_accessInfo_type */
#include "ipmiUtil.h" /* for ... ipmiUtil_reset_host_now */
#include "nodeMacro.h" /* for ... CREATE_NONBLOCK_INET_UDP_RX_SOCKET */
#include "mtcNodeMsg.h" /* for ... common maintenance messaging */
#include "mtcNodeComp.h" /* for ... this module header */
@ -96,7 +100,7 @@ string get_hostname ( void )
* Daemon Configuration Structure - The allocated struct
* @see daemon_common.h for daemon_config_type struct format.
*/
static daemon_config_type mtc_config ;
static daemon_config_type mtc_config ;
daemon_config_type * daemon_get_cfg_ptr () { return &mtc_config ; }
/**
@ -106,6 +110,8 @@ daemon_config_type * daemon_get_cfg_ptr () { return &mtc_config ; }
static mtc_socket_type mtc_sock ;
static mtc_socket_type * sock_ptr ;
static bmcUtil_accessInfo_type peer_controller = {"none","none","none","none","none"};
static bmcUtil_accessInfo_type this_controller = {"none","none","none","none","none"};
int run_goenabled_scripts ( string type );
@ -138,6 +144,16 @@ void timer_handler ( int sig, siginfo_t *si, void *uc)
mtcTimer_stop_int_safe ( ctrl.hostservices.timer );
ctrl.hostservices.timer.ring = true ;
}
else if ( *tid_ptr == ctrl.peer_ctrlr_reset.sync_timer.tid )
{
ctrl.peer_ctrlr_reset.sync_timer.ring = true ;
mtcTimer_stop_int_safe ( ctrl.peer_ctrlr_reset.sync_timer );
}
else if ( *tid_ptr == ctrl.peer_ctrlr_reset.audit_timer.tid )
{
/* use auto restart */
ctrl.peer_ctrlr_reset.audit_timer.ring = true ;
}
else
{
mtcTimer_stop_tid_int_safe ( tid_ptr );
@ -207,9 +223,8 @@ void daemon_exit ( void )
exit (0) ;
}
/* Startup config read */
static int mtc_config_handler ( void * user,
static int mtc_config_handler ( void * user,
const char * section,
const char * name,
const char * value)
@ -236,11 +251,14 @@ static int mtc_config_handler ( void * user,
config_ptr->failsafe_shutdown_delay = atoi(value);
ilog ("Shutdown TO : %d secs\n", config_ptr->failsafe_shutdown_delay );
}
else
if (( ctrl.nodetype & CONTROLLER_TYPE ) &&
(MATCH("client", "sync_b4_peer_ctrlr_reset")))
{
return (PASS);
ctrl.peer_ctrlr_reset.sync = atoi(value);
ilog("SyncB4 Reset: %s",
ctrl.peer_ctrlr_reset.sync ? "Yes" : "No" );
}
return (FAIL);
return (PASS);
}
/* Read the mtc.ini file and load control */
@ -431,7 +449,7 @@ void setup_clstr_tx_sockets ( void )
mtc_sock.mtc_client_tx_socket_c0_clstr->sock_ok(false);
}
}
if ( ctrl.system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX )
if ( ctrl.system_type != SYSTEM_TYPE__AIO__SIMPLEX )
{
dlog ("setup of %s TX\n", CONTROLLER_1_CLUSTER_HOST);
@ -946,6 +964,65 @@ void _manage_goenabled_tests ( void )
_scripts_cleanup (ctrl.active_script_set) ;
}
/* Service a peer controller reset request.
 *
 * Validates the learned peer controller BMC access info, issues
 * the reset through ipmiUtil_reset_host_now and, only if that
 * passes, removes the RESET_PEER_NOW flag file.
 *
 * Returns PASS              if the reset command passed,
 *         FAIL              if the peer BMC access info is invalid,
 *         FAIL_STRING_EMPTY if the output filename could not be created,
 *         FAIL_OPERATION    if the reset command failed. */
int issue_reset_and_cleanup ( void )
{
    const char peer_ctrlr [] = "Peer controller reset" ;
    ilog("SM %s request", peer_ctrlr );

    /* all of the bmc ip, username and password must look valid */
    bool creds_ok = (( hostUtil_is_valid_ip_addr  ( peer_controller.bm_ip )) &&
                     ( hostUtil_is_valid_username ( peer_controller.bm_un )) &&
                     ( hostUtil_is_valid_pw       ( peer_controller.bm_pw )));
    if ( ! creds_ok )
    {
        elog("%s cannot reset peer BMC host at %s due to invalid credentials",
              ctrl.hostname, peer_controller.bm_ip.c_str());
        return (FAIL);
    }

    /* the reset command output goes to this file ;
     * it does not need to be deleted after the operation */
    string output_filename = bmcUtil_create_data_fn ( ctrl.hostname,
                                                      BMC_RESET_CMD_FILE_SUFFIX,
                                                      BMC_PROTOCOL__IPMITOOL );
    if ( output_filename.empty() )
    {
        elog("%s ; failed to create output filename", peer_ctrlr);
        return (FAIL_STRING_EMPTY);
    }

    if ( ipmiUtil_reset_host_now ( ctrl.hostname,
                                   peer_controller,
                                   output_filename ) != PASS )
    {
        /* leave the flag file in place since the reset did not happen */
        elog("%s failed", peer_ctrlr);
        return (FAIL_OPERATION);
    }

    string result = daemon_get_file_str ( output_filename.data() );
    ilog("%s succeeded", peer_ctrlr);

    /* unexpected command output is logged but does not fail the operation */
    if ( result.compare( IPMITOOL_POWER_RESET_RESP ) != 0 )
    {
        dlog("... but reset command output was unexpected ; %s",
              result.c_str());
    }

    /* give the host a chance to reset before
     * telling SM the reset is done */
    sleep (2) ;

    /* removing the flag file is what reports completion */
    dlog("removing %s", RESET_PEER_NOW );
    daemon_remove_file ( RESET_PEER_NOW );

    return (PASS);
}
/* The main service loop */
int daemon_init ( string iface, string nodetype_str )
@ -963,6 +1040,7 @@ int daemon_init ( string iface, string nodetype_str )
ctrl.subfunction = 0 ;
ctrl.system_type = daemon_system_type ();
ctrl.clstr_iface_provisioned = false ;
ctrl.peer_ctrlr_reset.sync = false ;
/* convert node type to integer */
ctrl.nodetype = get_host_function_mask ( nodetype_str ) ;
@ -1018,6 +1096,13 @@ int daemon_init ( string iface, string nodetype_str )
mtcTimer_init ( ctrl.goenabled.timer, &ctrl.hostname[0], "goenable timer" );
mtcTimer_init ( ctrl.hostservices.timer, &ctrl.hostname[0], "host services timer" );
/* initialize peer controller reset feature ;
 * each timer init is its own statement rather than being
 * chained into the next line with the comma operator */
mtcTimer_init ( ctrl.peer_ctrlr_reset.audit_timer, &ctrl.hostname[0], "peer ctrlr reset audit timer" );
mtcTimer_init ( ctrl.peer_ctrlr_reset.sync_timer, &ctrl.hostname[0], "peer ctrlr reset sync timer" );
ctrl.peer_ctrlr_reset.sync_timer.ring = false ;
ctrl.peer_ctrlr_reset.audit_timer.ring = false ;
ctrl.peer_ctrlr_reset.audit_period = PEER_CTRLR_AUDIT_PERIOD ;
/* initialize the script group control structures */
script_ctrl_init ( &ctrl.goenabled );
script_ctrl_init ( &ctrl.hostservices );
@ -1073,6 +1158,17 @@ void daemon_service_run ( void )
/* Send first mtcAlive ASAP */
mtcTimer_start ( ctrl.timer, timer_handler, 1 );
/* Monitor for peer controller reset requests when this
* daemon runs on a controller */
if ( ctrl.nodetype & CONTROLLER_TYPE )
{
mtcTimer_start ( ctrl.peer_ctrlr_reset.audit_timer,
timer_handler,
ctrl.peer_ctrlr_reset.audit_period );
}
mtce_send_event ( sock_ptr, MTC_EVENT_MONITOR_READY, NULL );
/* lets go select so that the sock does not go crazy */
dlog ("%s running main loop with %d msecs socket timeout\n",
&ctrl.hostname[0], (SOCKET_WAIT/1000) );
@ -1305,8 +1401,20 @@ void daemon_service_run ( void )
socket_reinit = true ;
}
/* Clstr Tx */
else if (( ctrl.clstr_iface_provisioned == true ) &&
/* Clstr Tx ; AIO SX */
else if ((ctrl.system_type == SYSTEM_TYPE__AIO__SIMPLEX) &&
( ctrl.clstr_iface_provisioned == true ) &&
(( mtc_sock.mtc_client_tx_socket_c0_clstr == NULL ) ||
( mtc_sock.mtc_client_tx_socket_c0_clstr->sock_ok() == false )))
{
wlog ("calling setup_clstr_tx_sockets (auto-recovery)\n");
setup_clstr_tx_sockets();
socket_reinit = true ;
}
/* Clstr Tx ; not AIO SX */
else if ((ctrl.system_type != SYSTEM_TYPE__AIO__SIMPLEX) &&
( ctrl.clstr_iface_provisioned == true ) &&
(( mtc_sock.mtc_client_tx_socket_c0_clstr == NULL ) ||
( mtc_sock.mtc_client_tx_socket_c1_clstr == NULL ) ||
( mtc_sock.mtc_client_tx_socket_c0_clstr->sock_ok() == false ) ||
@ -1384,7 +1492,51 @@ void daemon_service_run ( void )
}
}
}
/* service controller specific audits */
if ( ctrl.nodetype & CONTROLLER_TYPE )
{
/* peer controller reset service audit */
if ( ctrl.peer_ctrlr_reset.audit_timer.ring )
{
if ( daemon_is_file_present ( RESET_PEER_NOW ) )
{
if ( ctrl.peer_ctrlr_reset.sync )
{
if ( ctrl.peer_ctrlr_reset.sync_timer.ring )
{
issue_reset_and_cleanup ();
ctrl.peer_ctrlr_reset.sync_timer.ring = false ;
}
else if ( ctrl.peer_ctrlr_reset.sync_timer.tid == NULL )
{
if ( send_mtcClient_cmd ( &mtc_sock,
MTC_CMD_SYNC,
peer_controller.hostname,
peer_controller.host_ip,
mtc_config.mtc_rx_mgmnt_port) == PASS )
{
mtcTimer_start ( ctrl.peer_ctrlr_reset.sync_timer, timer_handler, MTC_SECS_10 );
ilog("... waiting for peer controller to sync - %d secs", MTC_SECS_10);
}
else
{
elog("failed to send 'sync' command to peer controller mtcClient");
ctrl.peer_ctrlr_reset.sync_timer.ring = true ;
}
}
else
{
; /* wait longer */
}
}
else
{
issue_reset_and_cleanup ();
}
}
ctrl.peer_ctrlr_reset.audit_timer.ring = false ;
}
}
daemon_signal_hdlr ();
}
daemon_exit();
@ -1573,7 +1725,7 @@ int run_hostservices_scripts ( unsigned int cmd )
/* For the stop command we need the mtcClient to run both controller and
* worker stop services if we are on a CPE system.
* worker stop services if we are on a AIO system.
* This saves the mtcAgent from having to issue and manage 2 commands,
* one for controller and 1 for worker */
if ( ctrl.system_type != SYSTEM_TYPE__NORMAL )
@ -1750,7 +1902,6 @@ void daemon_sigchld_hdlr ( void )
}
default:
{
wlog ("child handler running with no active script set (%d)\n", ctrl.active_script_set );
return ;
}
}
@ -1820,6 +1971,84 @@ void daemon_sigchld_hdlr ( void )
}
}
/***************************************************************************
 *
 * Name       : load_mtcInfo_msg
 *
 * Description: Extract the mtc info from the MTC_MSG_INFO message.
 *
 *              The message buffer is parsed as json. The peer controller's
 *              entry under the "mtcInfo" label is loaded into the
 *              peer_controller access info ; host ip, bmc ip, bmc username
 *              and bmc password.
 *
 * Assumptions: So far only the peer controller reset feature uses this.
 *
 *              The message buffer carries the bmc password so the
 *              warning logs in this function never print the raw buffer.
 *
 * Parameters : msg - the received MTC_MSG_INFO message
 *
 * Returns    : Nothing
 *
 ***************************************************************************/
void load_mtcInfo_msg ( mtc_message_type & msg )
{
    /* this info is only meaningful to a controller */
    if ( ! ( ctrl.nodetype & CONTROLLER_TYPE ))
    {
        slog("%s got mtcInfo ; unexpected for this nodetype", ctrl.hostname);
        return ;
    }

    /* NOTE(review): this raw message dump also contains the bmc password.
     * It is left in place on the assumption that mlog1 output is only
     * produced when message debug is enabled - confirm. */
    mlog1("%s", &msg.buf[0]);

    struct json_object *_obj = json_tokener_parse( &msg.buf[0] );
    if ( _obj == NULL )
    {
        wlog("message buffer tokenize error");
        return ;
    }

    /* the peer is whichever controller this host is not */
    if ( strcmp(&ctrl.hostname[0], CONTROLLER_0 ))
        peer_controller.hostname = CONTROLLER_0 ;
    else
        peer_controller.hostname = CONTROLLER_1 ;

    struct json_object *info_obj = (struct json_object *)(NULL);
    json_bool info_rc = json_object_object_get_ex( _obj,
                                                   "mtcInfo",
                                                   &info_obj );
    if (( info_rc == TRUE ) && ( info_obj ))
    {
        /* the peer's info is keyed by its hostname */
        struct json_object *ctrl_obj = (struct json_object *)(NULL);
        json_bool peer_rc =
        json_object_object_get_ex( info_obj,
                                   peer_controller.hostname.data(),
                                   &ctrl_obj );

        if (( peer_rc == TRUE ) && ( ctrl_obj ))
        {
            peer_controller.host_ip = jsonUtil_get_key_value_string(ctrl_obj, MTC_JSON_INV_HOSTIP) ;
            peer_controller.bm_ip   = jsonUtil_get_key_value_string(ctrl_obj, MTC_JSON_INV_BMIP) ;
            peer_controller.bm_un   = jsonUtil_get_key_value_string(ctrl_obj, "bm_un");
            peer_controller.bm_pw   = jsonUtil_get_key_value_string(ctrl_obj, "bm_pw");

            /* log the mc info but not the bmc password ; only
             * indicate that it looks 'ok' or is 'none' */
            ilog ("%s is my peer [host:%s bmc:%s:%s:%s]",
                   peer_controller.hostname.c_str(),
                   peer_controller.host_ip.c_str(),
                   peer_controller.bm_ip.c_str(),
                   peer_controller.bm_un.c_str(),
                   hostUtil_is_valid_pw(peer_controller.bm_pw) ? "ok":"none");
        }
        else
        {
            /* report which key was expected rather than the raw buffer */
            wlog("peer mtcInfo missing (rc:%d) ; expected key:%s",
                  peer_rc, peer_controller.hostname.c_str());
        }
    }
    else
    {
        wlog("mtcInfo label parse error (rc:%d)", info_rc);
    }

    /* release the parsed object tree */
    json_object_put(_obj);
}
/* Push daemon state to log file */
void daemon_dump_info ( void )
{
@ -1853,13 +2082,13 @@ int daemon_run_testhead ( void )
* STAGE 1: some test
************************************************/
printf ( "| Test %d : Maintenance Service Test ............. ", stage );
if ( rc != PASS )
if ( rc != PASS )
{
FAILED_STR ;
rc = FAIL ;
}
else
PASSED ;
PASSED ;
printf ("+---------------------------------------------------------+\n");
return PASS ;

View File

@ -17,6 +17,10 @@
#include <string.h>
#include <unistd.h>
using namespace std;
#include "nodeTimers.h" /* for ... Timer Service */
/** Compute Config mask */
#define CONFIG_CLIENT_MASK (CONFIG_AGENT_MTC_MGMNT_PORT |\
CONFIG_CLIENT_MTC_MGMNT_PORT |\
@ -59,6 +63,22 @@ typedef struct
} script_ctrl_type ;
void script_ctrl_init ( script_ctrl_type * script_ctrl_ptr );
/* peer controller reset control structure and associated definitions */
/* This is a flag file set by SM when SM wants maintenance to perform a
* BMC reset of the other (peer) controller */
#define RESET_PEER_NOW "/var/run/.sm_reset_peer"
#define PEER_CTRLR_AUDIT_PERIOD (2)
/* Control structure for the peer controller reset feature */
typedef struct
{
    struct mtc_timer sync_timer   ; /* wait for the peer to sync before reset */
    struct mtc_timer audit_timer  ; /* paces the reset request audit         */
    int              audit_period ; /* audit timer period ; presumably secs  */
    bool             sync         ; /* request a peer sync before the reset  */
} peer_ctrlr_reset_type ;
typedef struct
{
char hostname [MAX_HOST_NAME_SIZE+1];
@ -76,7 +96,7 @@ typedef struct
unsigned int function ;
unsigned int subfunction ;
struct mtc_timer timer ; /* mtcAlive timer */
struct mtc_timer timer ; /* mtcAlive timer */
bool clstr_iface_provisioned ;
@ -102,6 +122,7 @@ typedef struct
/* Where to send events */
string mtcAgent_ip ;
peer_ctrlr_reset_type peer_ctrlr_reset;
} ctrl_type ;
ctrl_type * get_ctrl_ptr ( void );
@ -109,5 +130,6 @@ ctrl_type * get_ctrl_ptr ( void );
bool is_subfunction_worker ( void );
int run_goenabled_scripts ( mtc_socket_type * sock_ptr , string requestor );
int run_hostservices_scripts ( unsigned int cmd );
void load_mtcInfo_msg ( mtc_message_type & msg );
#endif

View File

@ -1187,15 +1187,6 @@ int _self_provision ( void )
if ( my_identity.name == record_info.name )
{
/* If the active controller was 'locked' and is being auto-corrected
* to 'unlocked' then ensure that there is no locked alarm set for it */
if ( record_info.admin != "locked" )
{
mtcAlarm_clear ( my_identity.name, MTC_ALARM_ID__LOCK );
/* this is not required because its already inited to clear */
// node_ptr->alarms[MTC_ALARM_ID__LOCK] = FM_ALARM_SEVERITY_CLEAR
}
if ( my_identity.mac != record_info.mac )
{
wlog ("%s mac address mismatch (%s - %s)\n",
@ -1326,6 +1317,7 @@ void nodeLinkClass::fsm ( void )
daemon_signal_hdlr ();
mtcHttpSvr_look ( mtce_event );
}
mtcInv.mtcInfo_handler();
}
}
@ -1515,9 +1507,9 @@ void daemon_service_run ( void )
if ( ts.tv_sec < MTC_MINS_15 )
{
/* CPE DOR window is much greater in CPE since heartbeat
* cannot start until the inactive CPE has run both manifests */
int timeout = DEFAULT_DOR_MODE_CPE_TIMEOUT ;
/* AIO DOR window is much greater in AIO since heartbeat
* cannot start until the inactive AIO has run both manifests */
int timeout = DEFAULT_DOR_MODE_AIO_TIMEOUT ;
/* override the timeout to a smaller value for normal system */
if ( mtcInv.system_type == SYSTEM_TYPE__NORMAL )
@ -1601,7 +1593,7 @@ void daemon_service_run ( void )
if ( mtcInv.system_type == SYSTEM_TYPE__NORMAL )
mtc_sock.waitd.tv_usec = MTCAGENT_SELECT_TIMEOUT ;
else
mtc_sock.waitd.tv_usec = MTCAGENT_CPE_SELECT_TIMEOUT ;
mtc_sock.waitd.tv_usec = MTCAGENT_AIO_SELECT_TIMEOUT ;
/* This is used as a delay up to select_timeout */
rc = select( socks.back()+1, &mtc_sock.readfds, NULL, NULL, &mtc_sock.waitd);

View File

@ -63,6 +63,11 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )
/* Monitor and Manage active threads */
thread_handler ( node_ptr->bmc_thread_ctrl, node_ptr->bmc_thread_info );
if ( node_ptr->bmc_thread_ctrl.stage == THREAD_STAGE__KILL )
{
/* do nothing while thread is being killed */
return RETRY ;
}
/* manage the host connected state and board management alarms */
nodeLinkClass::bmc_handler ( node_ptr );
@ -310,10 +315,10 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )
}
/****************************************************************************
* No Op: Do nothing for this Healthy Enabled Locked CPE Simplex Host
* No Op: Do nothing for this Healthy Enabled Locked AIO Simplex Host
****************************************************************************
*/
else if (( this->system_type == SYSTEM_TYPE__CPE_MODE__SIMPLEX ) &&
else if (( this->system_type == SYSTEM_TYPE__AIO__SIMPLEX ) &&
( node_ptr->adminAction == MTC_ADMIN_ACTION__NONE ) &&
( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED ))
{

View File

@ -481,7 +481,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__UNLOCK )
{
bool aio = false ;
if ( SIMPLEX_CPE_SYSTEM )
if ( SIMPLEX_AIO_SYSTEM )
aio = true ;
else
aio = false ;
@ -525,7 +525,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
}
}
mtcInvApi_update_states_now ( node_ptr, "unlocked", "disabled" , "offline", "disabled", "offline" );
mtcInvApi_update_task_now ( node_ptr, aio ? MTC_TASK_CPE_SX_UNLOCK_MSG : MTC_TASK_SELF_UNLOCK_MSG );
mtcInvApi_update_task_now ( node_ptr, aio ? MTC_TASK_AIO_SX_UNLOCK_MSG : MTC_TASK_SELF_UNLOCK_MSG );
wlog ("%s unlocking %s with reboot\n",
my_hostname.c_str(),
@ -546,7 +546,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
* Condition 1: While there is no in-service backup controller
* to swact to. In this case the active controller
* - is only degraded to avoid a system outage.
* - the CPE subfunction is failed
* - the AIO subfunction is failed
* - worker SubFunction Alarm is raised
* - Enable alarm is raised
* - A process monitor alarm may also be raised if
@ -648,7 +648,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
}
else
{
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
/* Raise Critical Compute Function Alarm */
alarm_compute_failure ( node_ptr , FM_ALARM_SEVERITY_CRITICAL );
@ -661,7 +661,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->graceful_recovery_counter = 0 ;
node_ptr->health_threshold_counter = 0 ;
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
node_ptr->inservice_failed_subf = true ;
subfStateChange ( node_ptr, MTC_OPER_STATE__DISABLED,
@ -1358,7 +1358,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
* have a worker function and the heartbeat for those hosts
* are started at the end of the subfunction handler. */
if (( THIS_HOST ) ||
(( CPE_SYSTEM ) && ( is_controller(node_ptr)) ))
(( AIO_SYSTEM ) && ( is_controller(node_ptr)) ))
{
enableStageChange ( node_ptr, MTC_ENABLE__STATE_CHANGE );
}
@ -1523,8 +1523,8 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
if ( is_controller(node_ptr) )
{
/* Defer telling SM the controller state if
* this is a CPE and this is the only controller */
if ( CPE_SYSTEM && ( num_controllers_enabled() > 0 ))
* this is an AIO and this is the only controller */
if ( AIO_SYSTEM && ( num_controllers_enabled() > 0 ))
{
wlog ("%s deferring SM enable notification till subfunction-enable complete\n",
node_ptr->hostname.c_str());
@ -1555,7 +1555,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
enableStageChange ( node_ptr, MTC_ENABLE__START );
if (( CPE_SYSTEM ) && ( is_controller(node_ptr)))
if (( AIO_SYSTEM ) && ( is_controller(node_ptr)))
{
ilog ("%s running worker sub-function enable handler\n", node_ptr->hostname.c_str());
mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLING_SUBF );
@ -1637,9 +1637,10 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->http_retries_cur = 0 ;
node_ptr->unknown_health_reported = false ;
plog ("%s %sGraceful Recovery (uptime was %d)\n",
plog ("%s %sGraceful Recovery (%d) (uptime was %d)\n",
node_ptr->hostname.c_str(),
node_ptr->mnfa_graceful_recovery ? "MNFA " : "",
node_ptr->graceful_recovery_counter,
node_ptr->uptime );
/* Cancel any outstanding timers */
@ -1660,7 +1661,8 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
* 2. Setting the node operational state to Disabled
* 3. Setting the Enable action
*/
if ( ++node_ptr->graceful_recovery_counter > MTC_MAX_FAST_ENABLES )
node_ptr->graceful_recovery_counter++ ;
if ( node_ptr->graceful_recovery_counter > MTC_MAX_FAST_ENABLES )
{
/* gate off further mtcAlive messaging until the offline
* handler runs. This prevents stale messages from making it
@ -1772,10 +1774,11 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
else if ( node_ptr->mnfa_graceful_recovery == true )
{
if ( node_ptr->uptime > MTC_MINS_10 )
if ( node_ptr->uptime > MTC_MINS_15 )
{
/* did not reboot case */
wlog ("%s Connectivity Recovered ; host did not reset\n", node_ptr->hostname.c_str());
wlog ("%s Connectivity Recovered ; host did not reset (uptime:%d)\n",
node_ptr->hostname.c_str(), node_ptr->uptime);
wlog ("%s ... continuing with MNFA graceful recovery\n", node_ptr->hostname.c_str());
wlog ("%s ... with no affect to host services\n", node_ptr->hostname.c_str());
@ -1788,7 +1791,8 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
else
{
/* did reboot case */
wlog ("%s Connectivity Recovered ; host has reset\n", node_ptr->hostname.c_str());
wlog ("%s Connectivity Recovered ; host has reset (uptime:%d)\n",
node_ptr->hostname.c_str(), node_ptr->uptime);
ilog ("%s ... continuing with MNFA graceful recovery\n", node_ptr->hostname.c_str());
ilog ("%s ... without additional reboot %s\n",
node_ptr->hostname.c_str(), node_ptr->bm_ip.empty() ? "or reset" : "" );
@ -1806,12 +1810,13 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
break ;
}
}
else if (( node_ptr->uptime_save ) && ( node_ptr->uptime >= node_ptr->uptime_save ))
else if ( node_ptr->uptime > MTC_MINS_15 )
{
/* did not reboot case */
wlog ("%s Connectivity Recovered ; host did not reset%s\n",
wlog ("%s Connectivity Recovered ; host did not reset%s (uptime:%d)",
node_ptr->hostname.c_str(),
node_ptr->was_dor_recovery_mode ? " (DOR)" : "" );
node_ptr->was_dor_recovery_mode ? " (DOR)" : "",
node_ptr->uptime);
wlog ("%s ... continuing with graceful recovery\n", node_ptr->hostname.c_str());
wlog ("%s ... with no affect to host services\n", node_ptr->hostname.c_str());
@ -1875,7 +1880,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
MTC_OPER_STATE__DISABLED,
MTC_AVAIL_STATUS__FAILED );
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
subfStateChange ( node_ptr, MTC_OPER_STATE__DISABLED,
MTC_AVAIL_STATUS__FAILED );
@ -1905,7 +1910,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
{
int timeout = 0 ;
/* Set the FSM task state to booting */
/* Set the FSM task state to 'Graceful Recovery Wait' */
node_ptr->uptime = 0 ;
mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_WAIT );
@ -2266,7 +2271,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
{
/* The active controller would never get/be here but
* if it did then just fall through to change state. */
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
/* Here we need to run the sub-function goenable and start
* host services if this is the other controller in a AIO
@ -2442,10 +2447,10 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
}
else /* success path */
{
/* allow the fsm to wait for up to 1 minute for the
* hbsClient's ready event before starting heartbeat
/* allow the fsm to wait for up to 'worker config timeout'
* for the hbsClient's ready event before starting heartbeat
* test. */
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_1 );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_WORKER_CONFIG_TIMEOUT );
recoveryStageChange ( node_ptr, MTC_RECOVERY__HEARTBEAT_START );
}
break ;
@ -2502,6 +2507,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
{
if ( node_ptr->mtcTimer.ring == true )
{
ilog ("%s heartbeating", node_ptr->hostname.c_str());
/* if heartbeat is not working then we will
* never get here and enable the host */
recoveryStageChange ( node_ptr, MTC_RECOVERY__STATE_CHANGE );
@ -2510,7 +2516,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
}
case MTC_RECOVERY__STATE_CHANGE:
{
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
/* Set node as unlocked-enabled */
subfStateChange ( node_ptr, MTC_OPER_STATE__ENABLED,
@ -2555,7 +2561,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
else if ( rc == PASS )
{
/* Start Graceful Recovery */
recoveryStageChange ( node_ptr, MTC_RECOVERY__ENABLE_START ) ;
recoveryStageChange ( node_ptr, MTC_RECOVERY__ENABLE ) ;
break ;
}
else if ( rc == FAIL_WORKQ_TIMEOUT )
@ -2571,51 +2577,37 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
nodeLinkClass::force_full_enable ( node_ptr );
break ;
}
case MTC_RECOVERY__ENABLE_START:
case MTC_RECOVERY__ENABLE:
{
/* Create the recovery enable timer. This timer is short.
* A node need to stay enabled with the hartbeat service
* running for a period of time before declaring it enabled */
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE );
recoveryStageChange ( node_ptr, MTC_RECOVERY__ENABLE_WAIT ) ;
break;
}
case MTC_RECOVERY__ENABLE_WAIT:
{
/* When this timer fires the host has been up for enough time */
if ( node_ptr->mtcTimer.ring == true )
if ( is_controller(node_ptr) )
{
if ( is_controller(node_ptr) )
if ( mtcSmgrApi_request ( node_ptr,
CONTROLLER_ENABLED,
SMGR_MAX_RETRIES ) != PASS )
{
if ( mtcSmgrApi_request ( node_ptr,
CONTROLLER_ENABLED,
SMGR_MAX_RETRIES ) != PASS )
{
wlog ("%s Failed to send 'unlocked-disabled' to HA Service Manager ; allowing enable\n",
node_ptr->hostname.c_str());
}
wlog ("%s Failed to send 'unlocked-enabled' to HA Service Manager ; allowing enable\n",
node_ptr->hostname.c_str());
}
/* Node Has Recovered */
node_ptr->graceful_recovery_counter = 0 ;
recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
node_ptr->health_threshold_counter = 0 ;
node_ptr->enabled_count++ ;
node_ptr->http_retries_cur = 0 ;
doneQueue_purge ( node_ptr );
if ( node_ptr->was_dor_recovery_mode )
{
report_dor_recovery ( node_ptr , "is ENABLED" );
}
else
{
plog ("%s is ENABLED (Gracefully Recovered)\n",
node_ptr->hostname.c_str());
}
alarm_enabled_clear ( node_ptr, false );
}
/* Node Has Recovered */
node_ptr->graceful_recovery_counter = 0 ;
recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
node_ptr->health_threshold_counter = 0 ;
node_ptr->enabled_count++ ;
node_ptr->http_retries_cur = 0 ;
doneQueue_purge ( node_ptr );
if ( node_ptr->was_dor_recovery_mode )
{
report_dor_recovery ( node_ptr , "is ENABLED" );
}
else
{
plog ("%s is ENABLED (Gracefully Recovered)\n",
node_ptr->hostname.c_str());
}
alarm_enabled_clear ( node_ptr, false );
break ;
}
default:
@ -2783,7 +2775,7 @@ int nodeLinkClass::disable_handler ( struct nodeLinkClass::node * node_ptr )
MTC_OPER_STATE__DISABLED,
locked_status );
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
subfStateChange ( node_ptr, MTC_OPER_STATE__DISABLED,
locked_status );
@ -3432,7 +3424,7 @@ int nodeLinkClass::online_handler ( struct nodeLinkClass::node * node_ptr )
/* otherwise change state */
mtcInvApi_update_state(node_ptr, MTC_JSON_INV_AVAIL,"offline" );
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
mtcInvApi_update_state(node_ptr, MTC_JSON_INV_AVAIL_SUBF,"offline" );
}
@ -3473,7 +3465,7 @@ int nodeLinkClass::online_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->hostname.c_str());
mtcInvApi_update_state ( node_ptr, MTC_JSON_INV_AVAIL, "online" );
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
mtcInvApi_update_state ( node_ptr, MTC_JSON_INV_AVAIL_SUBF, "online" );
}
@ -6093,7 +6085,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
mtcInfo_log(node_ptr);
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
if ( daemon_is_file_present ( CONFIG_COMPLETE_WORKER ) == false )
{
@ -6120,52 +6112,38 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
mtcInvApi_update_state ( node_ptr, "availability", "available" );
}
/* handle other cases */
EFmAlarmSeverityT sev = mtcAlarm_state ( node_ptr->hostname,
MTC_ALARM_ID__ENABLE);
/* Query FM for existing Enable and Config alarm status */
EFmAlarmSeverityT enable_alarm_severity =
mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__ENABLE);
EFmAlarmSeverityT config_alarm_severity =
mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__CONFIG);
if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
/* Clear generic enable alarm over process restart.
* Will get reasserted if the cause condition still exists */
if ( enable_alarm_severity != FM_ALARM_SEVERITY_CLEAR )
{
node_ptr->alarms[MTC_ALARM_ID__LOCK] = FM_ALARM_SEVERITY_WARNING ;
/* If the node is locked then the Enable alarm
* should not be present */
if ( sev != FM_ALARM_SEVERITY_CLEAR )
{
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
sev = FM_ALARM_SEVERITY_CLEAR ;
}
ilog ("%s found enable alarm ; clearing %s",
node_ptr->hostname.c_str(),
alarmUtil_getSev_str(enable_alarm_severity).c_str());
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
}
/* Manage enable alarm over process restart.
*
* - clear the alarm in the active controller case
* - maintain the alarm, set degrade state in MAJOR and CRIT cases
* - clear alarm for all other severities.
*/
if ( THIS_HOST )
/* The config alarm is maintained if it exists.
* The in-service test handler will clear the alarm
* if the config failure is gone */
if ( config_alarm_severity != FM_ALARM_SEVERITY_CLEAR )
{
if ( sev != FM_ALARM_SEVERITY_CLEAR )
{
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
}
}
else
{
if (( sev == FM_ALARM_SEVERITY_CRITICAL ) ||
( sev == FM_ALARM_SEVERITY_MAJOR ))
{
node_ptr->alarms[MTC_ALARM_ID__ENABLE] = sev ;
node_ptr->degrade_mask |= DEGRADE_MASK_ENABLE ;
}
else if ( sev != FM_ALARM_SEVERITY_CLEAR )
{
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
}
node_ptr->degrade_mask |= DEGRADE_MASK_CONFIG ;
node_ptr->alarms[MTC_ALARM_ID__CONFIG] = config_alarm_severity ;
ilog ("%s found config alarm ; loaded %s",
node_ptr->hostname.c_str(),
alarmUtil_getSev_str(config_alarm_severity).c_str());
}
if ( is_controller(node_ptr) )
{
this->controllers++ ;
mtc_cmd_enum state = CONTROLLER_DISABLED ;
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
@ -6199,7 +6177,6 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
{
ilog ("%s %s\n",node_ptr->hostname.c_str(), MTC_TASK_SWACT_COMPLETE );
/* Work Around for issue: */
mtcInvApi_update_uptime ( node_ptr, node_ptr->uptime );
mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_COMPLETE );
@ -6233,7 +6210,6 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
mtcSmgrApi_request ( node_ptr, state , SWACT_FAIL_THRESHOLD );
}
}
if ( daemon_get_cfg_ptr()->debug_level & 1 )
nodeLinkClass::host_print (node_ptr);
@ -6290,6 +6266,40 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->hostname.c_str(), node_ptr->uptime );
break ;
}
/* Handle catching and recovering/restoring hosts that might
* have been in the Graceful Recovery Wait state.
*
* Prevents an extra reboot for hosts that might be in
* Graceful Recovery over a maintenance process restart. */
else if (( NOT_THIS_HOST ) &&
( !node_ptr->task.compare(MTC_TASK_RECOVERY_WAIT)))
{
ilog ("%s is in %s ; restoring state",
node_ptr->hostname.c_str(),
MTC_TASK_RECOVERY_WAIT);
/* Complete necessary add operations before switching
* to Recovery */
LOAD_NODETYPE_TIMERS ;
workQueue_purge ( node_ptr );
if (( hostUtil_is_valid_bm_type ( node_ptr->bm_type )) &&
( hostUtil_is_valid_ip_addr ( node_ptr->bm_ip )) &&
( hostUtil_is_valid_username ( node_ptr->bm_un )))
{
set_bm_prov ( node_ptr, true ) ;
}
mtcTimer_reset ( node_ptr->mtcTimer );
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
node_ptr->addStage = MTC_ADD__START;
/* Switch into recovery_handler's Graceful Recovery Wait
* state with the Graceful Recovery Wait timeout */
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__RECOVER );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler,
node_ptr->mtcalive_timeout );
recoveryStageChange ( node_ptr, MTC_RECOVERY__MTCALIVE_WAIT );
break ;
}
else
{
if ( is_controller(node_ptr) )
@ -6354,7 +6364,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
send_hbs_command ( node_ptr->hostname, MTC_CMD_ADD_HOST );
if ( ( CPE_SYSTEM ) || ( is_worker (node_ptr) == true ))
if ( ( AIO_SYSTEM ) || ( is_worker (node_ptr) == true ))
{
send_guest_command ( node_ptr->hostname, MTC_CMD_ADD_HOST );
}
@ -6368,6 +6378,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
}
case MTC_ADD__WORKQUEUE_WAIT:
{
rc = workQueue_done ( node_ptr );
if ( rc == RETRY )
{
@ -6393,11 +6404,11 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
( node_ptr->operState == MTC_OPER_STATE__ENABLED ))
{
/* start the heartbeat service in all cases except for
* THIS host and CPE controller hosts */
* THIS host and AIO controller hosts */
if ( NOT_THIS_HOST )
{
if (( LARGE_SYSTEM ) ||
(( CPE_SYSTEM ) && ( this->dor_mode_active == false )))
(( AIO_SYSTEM ) && ( this->dor_mode_active == false )))
{
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
}
@ -6430,7 +6441,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->configAction = MTC_CONFIG_ACTION__INSTALL_PASSWD ;
}
if (( ! SIMPLEX_CPE_SYSTEM ) &&
if (( ! SIMPLEX_AIO_SYSTEM ) &&
( node_ptr->bmc_provisioned == true ))
{
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__BM );
@ -6438,7 +6449,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
}
/* Special Add handling for the AIO system */
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ))
@ -6455,6 +6466,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
}
node_ptr->addStage = MTC_ADD__START;
plog ("%s Host Add Completed (uptime:%d)\n", node_ptr->hostname.c_str(), node_ptr->uptime );
node_ptr->add_completed = true ;
break ;
@ -6635,6 +6647,8 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr )
mtcInfo_set ( node_ptr, MTCE_INFO_KEY__BMC_PROTOCOL, BMC_PROTOCOL__IPMI_STR );
node_ptr->bmc_protocol = BMC_PROTOCOL__IPMITOOL ;
}
/* store mtcInfo, which specifies the selected BMC protocol,
* into the sysinv database */
mtcInvApi_update_mtcInfo ( node_ptr );
ilog ("%s bmc control using %s:%s",
@ -6751,8 +6765,15 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->bmc_thread_ctrl.done = true ;
node_ptr->bmc_thread_info.command = 0 ;
}
/* store mtcInfo, which specifies the selected BMC protocol,
* into the sysinv database */
mtcInvApi_update_mtcInfo ( node_ptr );
/* push the BMC access info out to the mtcClient when
* a controller's BMC connection is established/verified */
if ( node_ptr->nodetype & CONTROLLER_TYPE )
this->want_mtcInfo_push = true ;
send_hwmon_command ( node_ptr->hostname, MTC_CMD_ADD_HOST );
send_hwmon_command ( node_ptr->hostname, MTC_CMD_START_HOST );
}
@ -6942,6 +6963,11 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr )
}
} /* end power off detection handling */
/* push the BMC access info out to the mtcClient when
* a controller's BMC connection is established/verified */
if ( node_ptr->nodetype & CONTROLLER_TYPE )
this->want_mtcInfo_push = true ;
send_hwmon_command ( node_ptr->hostname, MTC_CMD_ADD_HOST );
send_hwmon_command ( node_ptr->hostname, MTC_CMD_START_HOST );
@ -7199,6 +7225,9 @@ int nodeLinkClass::oos_test_handler ( struct nodeLinkClass::node * node_ptr )
}
}
/* audit alarms */
mtcAlarm_audit (node_ptr );
break ;
}
case MTC_OOS_TEST__WAIT:
@ -7494,7 +7523,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
* In the restart case the subfunction fsm enable handler is not run so
* we try to detect the missing goenabled_subf flag as an inservice test.
*
* Only in CPE type
* Only in AIO type
* - clear the alarm if the issue goes away -
* i.e. the goenabled tests eventually pass. Today
* they are not re-run in the background but someday they may be
@ -7502,7 +7531,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
* and we have only a single enabled controller (which must be this one)
* and the alarm is not already raised.
**/
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
@ -7597,7 +7626,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
}
}
/* Monitor the health of the host - no pass file */
/* Monitor the health of the host */
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
(( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
@ -7623,6 +7652,11 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
ilog ("%s sm degrade clear\n", node_ptr->hostname.c_str());
}
/*
* In-service Config Failure/Alarm handling
*/
/* Detect new config failure condition */
if ( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY)
{
/* not healthy .... */
@ -7634,16 +7668,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
{
wlog_throttled ( node_ptr->health_threshold_counter, (MTC_UNHEALTHY_THRESHOLD*10), "%s is UNHEALTHY\n", node_ptr->hostname.c_str());
if ( node_ptr->health_threshold_counter >= MTC_UNHEALTHY_THRESHOLD )
{
node_ptr->degrade_mask |= DEGRADE_MASK_CONFIG ;
/* threshold is reached so raise the config alarm if it is not already raised */
if ( node_ptr->alarms[MTC_ALARM_ID__CONFIG] != FM_ALARM_SEVERITY_CRITICAL )
{
mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__CONFIG );
node_ptr->alarms[MTC_ALARM_ID__CONFIG] = FM_ALARM_SEVERITY_CRITICAL ;
}
}
alarm_config_failure ( node_ptr );
}
}
else
@ -7663,6 +7688,12 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
}
}
}
/* or correct an alarmed config failure that has cleared */
else if ( node_ptr->degrade_mask & DEGRADE_MASK_CONFIG )
{
if ( node_ptr->mtce_flags & MTC_FLAG__I_AM_HEALTHY )
alarm_config_clear ( node_ptr );
}
else
{
node_ptr->health_threshold_counter = 0 ;

View File

@ -159,19 +159,20 @@ void nodeLinkClass::mnfa_recover_host ( struct nodeLinkClass::node * node_ptr )
if ( node_ptr->mnfa_graceful_recovery == true )
{
/* Restart the heartbeat for this recovered host */
// send_hbs_command ( node_ptr->hostname, MTC_RESTART_HBS );
if ( node_ptr->adminAction != MTC_ADMIN_ACTION__RECOVER )
{
ilog ("%s graceful recovery from MNFA\n", node_ptr->hostname.c_str());
recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__RECOVER );
ilog ("%s graceful recovery (graceful recover count:%d)",
node_ptr->hostname.c_str(),
node_ptr->graceful_recovery_counter);
}
else
{
wlog ("%s already gracefully recovering\n", node_ptr->hostname.c_str() );
wlog ("%s graceful recovery restart (graceful recover count:%d)",
node_ptr->hostname.c_str(),
node_ptr->graceful_recovery_counter );
}
recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__RECOVER );
}
}
@ -298,43 +299,38 @@ void nodeLinkClass::mnfa_exit ( bool force )
* Clear heartbeat degrades */
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
if ((( ptr->hbs_minor[CLSTR_IFACE] == true ) ||
( ptr->hbs_minor[MGMNT_IFACE] == true )) &&
( ptr->operState == MTC_OPER_STATE__ENABLED ))
std::list<string>::iterator mnfa_awol_ptr ;
for ( mnfa_awol_ptr = mnfa_awol_list.begin() ;
mnfa_awol_ptr != mnfa_awol_list.end() ;
mnfa_awol_ptr++ )
{
ptr->hbs_minor[MGMNT_IFACE] = false ;
ptr->hbs_minor[CLSTR_IFACE] = false ;
/* skip host if not in the mnfa pool */
if ( ptr->hostname.compare(*(mnfa_awol_ptr)) )
continue ;
if ( force == true )
if ((( ptr->hbs_minor[CLSTR_IFACE] == true ) ||
( ptr->hbs_minor[MGMNT_IFACE] == true )) &&
( ptr->operState == MTC_OPER_STATE__ENABLED ))
{
elog ("... %s failed ; auto-recovering\n",
ptr->hostname.c_str());
ptr->hbs_minor[MGMNT_IFACE] = false ;
ptr->hbs_minor[CLSTR_IFACE] = false ;
/* Set node as failed */
availStatusChange ( ptr, MTC_AVAIL_STATUS__FAILED );
enableStageChange ( ptr, MTC_ENABLE__START );
adminActionChange ( ptr, MTC_ADMIN_ACTION__NONE );
}
else
{
if ( ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )
if ( force == true )
{
if ( ptr->degrade_mask == 0 )
{
availStatusChange ( ptr, MTC_AVAIL_STATUS__AVAILABLE );
}
}
elog ("... %s failed ; auto-recovering\n",
ptr->hostname.c_str());
if ( ptr->adminAction != MTC_ADMIN_ACTION__RECOVER )
{
recoveryStageChange ( ptr, MTC_RECOVERY__START );
adminActionChange ( ptr, MTC_ADMIN_ACTION__RECOVER );
/* Set node as failed */
availStatusChange ( ptr, MTC_AVAIL_STATUS__FAILED );
enableStageChange ( ptr, MTC_ENABLE__START );
adminActionChange ( ptr, MTC_ADMIN_ACTION__NONE );
}
else
{
wlog ("%s already gracefully recovering\n", ptr->hostname.c_str() );
mnfa_recover_host ( ptr );
}
}
break ;
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;

View File

@ -125,11 +125,13 @@ int send_mtcAlive_msg ( mtc_socket_type * sock_ptr, string identity, int interfa
int recv_mtc_reply_noblock ( void );
int send_mtc_cmd ( string & hostname, int cmd, int interface );
int send_mtc_cmd ( string & hostname, int cmd, int interface , string json_dict="" );
int mtc_service_command ( mtc_socket_type * sock_ptr , int interface );
int mtc_set_availStatus ( string & hostname, mtc_nodeAvailStatus_enum status );
int mtce_send_event ( mtc_socket_type * sock_ptr, int cmd , const char * mtce_name_ptr );
int mtce_send_event ( mtc_socket_type * sock_ptr, unsigned int cmd , const char * mtce_name_ptr );
int mtc_clstr_init ( mtc_socket_type * sock_ptr , char * iface );
string get_who_i_am ( void );
int send_mtcClient_cmd ( mtc_socket_type * sock_ptr, int cmd, string hostname, string address, int port);
#endif

View File

@ -96,7 +96,7 @@ int nodeLinkClass::mtcSmgrApi_request ( struct nodeLinkClass::node * node_ptr, m
int rc = PASS ;
string operation_string = "unknown" ;
if ( system_type == SYSTEM_TYPE__CPE_MODE__SIMPLEX )
if ( system_type == SYSTEM_TYPE__AIO__SIMPLEX )
{
dlog ("%s simpex mode ; SM '%d' request not sent\n", node_ptr->hostname.c_str(), operation );
return ( PASS );

View File

@ -110,14 +110,16 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
if ( node_ptr->mtce_flags & MTC_FLAG__SUBF_CONFIGURED )
{
mtcTimer_reset (node_ptr->mtcTimer);
plog ("%s Subf Configured OK\n", name.c_str());
plog ("%s Subf Configured OK (oob:%x)\n",
name.c_str(), node_ptr->mtce_flags);
enableStageChange ( node_ptr, MTC_ENABLE__GOENABLED_TIMER );
alarm_config_clear ( node_ptr );
break ;
}
if ((( !node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED )) ||
(( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY )))
if (( node_ptr->mtce_flags ) &&
(( !node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED ) ||
( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY )))
{
mtcTimer_reset (node_ptr->mtcTimer);
@ -140,9 +142,10 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
/* timeout handling */
else if ( node_ptr->mtcTimer.ring == true )
{
elog ("%s configuration timeout (%d secs)\n",
elog ("%s configuration timeout (%d secs) (oob:%x)\n",
name.c_str(),
MTC_WORKER_CONFIG_TIMEOUT );
MTC_WORKER_CONFIG_TIMEOUT,
node_ptr->mtce_flags);
alarm_config_failure ( node_ptr );
@ -169,7 +172,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
*
* issue: subfunction go-enable patching script fails and
* maintenance reboots the active controller when no-reboot
* patching maintenance in CPE.
* patching maintenance in AIO.
*
* The fix is to avoid running the subfunction go-enabled tests
* on self while patching.
@ -490,7 +493,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
fail = true ;
}
else if ( this->system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX )
else if ( this->system_type != SYSTEM_TYPE__AIO__SIMPLEX )
{
/* Loop over the heartbeat interfaces and fail the Enable if any of them are failing */
for ( int i = 0 ; i < MAX_IFACES ; i++ )

View File

@ -231,6 +231,7 @@ typedef struct
recovery_method_type recovery_method ; /**< How processes are recovered */
bool reload_config ;
bool patching_in_progress ;
bool last_alarm_query_pass;
} pmon_ctrl_type ;
void pmon_set_ctrl_ptr ( pmon_ctrl_type * ctrl_ptr );

View File

@ -38,14 +38,14 @@ void pmonAlarm_init ( void )
alarmUtil_type * ptr ;
/** Process Failure Alarm ****************************************************/
ptr = &alarm_list[PMON_ALARM_ID__PMOND];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", PMOND_ALARM_ID);
ptr->name = "process failure" ;
ptr->instc_prefix = "process=" ;
ptr->critl_reason = "";
ptr->minor_reason = "";
ptr->major_reason = "";
@ -56,12 +56,12 @@ void pmonAlarm_init ( void )
ptr->alarm.inhibit_alarms = FM_FALSE;
ptr->alarm.service_affecting = FM_TRUE ;
ptr->alarm.suppression = FM_TRUE ;
ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */
ptr->alarm.alarm_state = FM_ALARM_STATE_CLEAR ; /* Dynamic */
snprintf (ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH,
"If problem consistently occurs after Host is locked and unlocked then "
snprintf (ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH,
"If problem consistently occurs after Host is locked and unlocked then "
"contact next level of support for root cause analysis and recovery.");
}
@ -97,38 +97,46 @@ EFmAlarmSeverityT pmonAlarm_state ( string hostname, pmon_alarm_id_enum id )
/******************************************************************************
*
* Name : manage_queried_alarms
* Name : query_alarms
*
* Description: query FM for all the existing process monitor alarms and build
* up the callers 'saved_alarm_list' with those process names and
* corresponding severity.
*
* Assumptions: If the hostname is passed in as not empty then assume the clear
* is requested.
*
* Updates : callers saved_alarm_list
*
* Returns : PASS if FM returns no error
* FAIL_REQUEST ... alarmUtil_query_identity failed
* FAIL_OPERATION ... fm_get_fault failed
* FAIL_NULL_POINTER ... failed to get memory
*
******************************************************************************/
void manage_queried_alarms ( list<active_process_alarms_type> & saved_alarm_list, string hostname )
int query_alarms ( list<active_process_alarms_type> & saved_alarm_list, string hostname )
{
static const char HOSTNAME_LABEL [] = "host=" ;
static const char PROCNAME_LABEL [] = ".process=" ;
int rc = FAIL ;
saved_alarm_list.clear();
/**
* Query all the pmon alarms and if there is an alarm for a
* process that is functioing properly then clear the alarm.
**/
SFmAlarmDataT * alarm_list_ptr = (SFmAlarmDataT*) malloc ((sizeof(SFmAlarmDataT)*PMON_MAX_ALARMS));
if ( alarm_list_ptr )
{
if ( alarmUtil_query_identity ( pmonAlarm_getId_str(PMON_ALARM_ID__PMOND), alarm_list_ptr, PMON_MAX_ALARMS ) == PASS )
/* Query all the pmon alarms */
rc = alarmUtil_query_identity ( pmonAlarm_getId_str(PMON_ALARM_ID__PMOND), alarm_list_ptr, PMON_MAX_ALARMS );
if ( rc == RETRY )
{
dlog ("no %s alarms found", pmonAlarm_getId_str(PMON_ALARM_ID__PMOND).c_str());
rc = PASS ;
}
else if ( rc == PASS )
{
for ( int i = 0 ; i < PMON_MAX_ALARMS ; ++i )
{
/* loop over each active alarm and maintain its activity state */
if ( strnlen ((alarm_list_ptr+i)->entity_instance_id , MAX_FILENAME_LEN ) )
{
int rc ;
AlarmFilter alarm_filter ;
SFmAlarmDataT alarm_query ;
memset(&alarm_query, 0, sizeof(alarm_query));
@ -139,34 +147,49 @@ void manage_queried_alarms ( list<active_process_alarms_type> & saved_alarm_lis
if (( rc = fm_get_fault ( &alarm_filter, &alarm_query )) == FM_ERR_OK )
{
string entity = alarm_filter.entity_instance_id ;
size_t pos = entity.find("process=");
if ( pos != std::string::npos )
{
string pn = entity.substr(pos+strlen("process="));
ilog ("%s alarm is %s (process:%s)\n", alarm_filter.entity_instance_id,
alarmUtil_getSev_str(alarm_query.severity).c_str(), pn.c_str());
rc = PASS ;
/* filter out 'process=pmond' as that alarm is handled by hbsAgent */
if ( pn.compare("pmond") )
string entity = alarm_filter.entity_instance_id ;
size_t pos_hn = entity.find(HOSTNAME_LABEL);
size_t pos_pn = entity.find(PROCNAME_LABEL);
if (( pos_hn != std::string::npos ) &&
( pos_pn != std::string::npos ))
{
string hn = entity.substr(pos_hn+strlen(HOSTNAME_LABEL), pos_pn-strlen(HOSTNAME_LABEL));
string pn = entity.substr(pos_pn+strlen(PROCNAME_LABEL));
/* verify hostname */
if ( ( hn.length() == 0 ) || ( hn != hostname ) )
{
if ( !hostname.empty() )
{
pmonAlarm_clear ( hostname, PMON_ALARM_ID__PMOND, pn );
}
else
{
active_process_alarms_type this_alarm ;
this_alarm.process = pn ;
this_alarm.severity = alarm_query.severity ;
saved_alarm_list.push_front ( this_alarm );
}
/* ignore alarms not for this host */
dlog ("%s %s %s alarm not for this host",
entity.c_str(),
hn.c_str(),
pn.c_str());
continue ;
}
dlog ("%s alarm is %s (process:%s)\n",
alarm_filter.entity_instance_id,
alarmUtil_getSev_str(alarm_query.severity).c_str(),
pn.c_str());
/* filter out 'process=pmond'
* ... that alarm is handled by hbsAgent */
if ( pn != MTC_SERVICE_PMOND_NAME )
{
active_process_alarms_type this_alarm ;
this_alarm.process = pn ;
this_alarm.severity = alarm_query.severity ;
saved_alarm_list.push_front ( this_alarm );
}
}
}
else
{
ilog ("fm_get_fault failed (rc:%d)\n", rc );
wlog ("fm_get_fault failed (rc:%d)\n", rc );
rc = FAIL_OPERATION ;
break ;
}
}
else
@ -174,10 +197,21 @@ void manage_queried_alarms ( list<active_process_alarms_type> & saved_alarm_lis
dlog2 ("last entry %d\n", i);
break ;
}
}
} /* for loop */
}
else
{
wlog("failed to query alarms from fm ; rc:%d", rc);
rc = FAIL_REQUEST ;
}
free(alarm_list_ptr);
}
else
{
elog ("unable to allocate memory for alarm list");
rc = FAIL_NULL_POINTER ;
}
return (rc);
}
/************************* A L A R M I N G **************************/

View File

@ -37,8 +37,10 @@ typedef struct
EFmAlarmSeverityT severity ;
} active_process_alarms_type ;
/* Clear any pending alarms if the specified hostname is valid */
void manage_queried_alarms ( list<active_process_alarms_type> & alarm_list, string hostname="" );
/* Query FM for a list of Process Monitor (200.006) alarms */
int query_alarms ( list<active_process_alarms_type> & alarm_list, string hostname="" );
void alarmed_process_audit ( void );
void pmonAlarm_init ( void );

View File

@ -41,15 +41,6 @@ static struct mtc_timer ptimer[MAX_PROCESSES] ;
std::list<string> config_files ;
std::list<string>::iterator string_iter_ptr ;
/* If there is an alarm in the list that matches one in the process list
* then update that process with its severity and failed state.
* If there is a process in the saved list that is not in the process list
* then clear its alarm as it is no longer valid.
*/
void manage_process_alarms ( list<active_process_alarms_type> & _list,
process_config_type * const ptr,
int const processes );
static process_config_type process_config[MAX_PROCESSES] ;
/* lookup process control by index and return its pointer if found.
@ -216,6 +207,7 @@ void pmon_timer_init ( void )
/* Init the timer for this process */
mtcTimer_init ( process_config[i].pt_ptr, _pmon_ctrl_ptr->my_hostname, "process" ) ;
}
_pmon_ctrl_ptr->last_alarm_query_pass = false ;
}
void _process_death_hdlr ( int sig_num, siginfo_t * info_ptr, void * context_ptr );
@ -371,7 +363,7 @@ void init_process_config_memory ( void )
* all the process config files from /etc/pmon.d */
void load_processes ( void )
{
list<active_process_alarms_type> saved_alarm_list ;
list<active_process_alarms_type> queried_alarm_list ;
int rc = PASS ;
@ -385,10 +377,6 @@ void load_processes ( void )
close_process_socket ( &process_config[i] );
}
/* Query fm for existing pmon process alarms and
* for each that is found store their 'name' and
* 'severity' in the passed in saved list */
manage_queried_alarms ( saved_alarm_list );
/* init the process config memory */
init_process_config_memory ();
@ -454,13 +442,8 @@ void load_processes ( void )
}
_pmon_ctrl_ptr->reload_config = false ;
/* If there were process alarms that existed over the reload
* then ensure that those processes are updated with that information. */
if ( saved_alarm_list.size () )
{
ilog ("there are %ld active alarms over reload\n", saved_alarm_list.size());
manage_process_alarms ( saved_alarm_list, &process_config[0], _pmon_ctrl_ptr->processes );
}
/* use the audit to clear pre-existing alarms at process startup */
alarmed_process_audit ();
}
@ -1702,65 +1685,124 @@ void _process_death_hdlr ( int sig_num, siginfo_t * info_ptr, void * context_ptr
}
}
/************************************************************************
/***************************************************************************
*
* Name : manage_process_alarms
* Name : alarmed_process_audit
*
* Description: This interface manages process alarms over a process
* configuration reload
* Purpose : Verify the process state matches the queried alarm state
*
* Steps:
* Description: To correct process alarm state mismatches.
*
* 1. Loop over each item in the list and mark the process as failed
* with the specified severity level.
*
* 2. If the process is not found then clear its alarm as it is no
* longer a valid process in the new profile and we don't want a
* lingering stuck alarm.
*
*************************************************************************/
***************************************************************************/
void manage_process_alarms ( list<active_process_alarms_type> & _list,
process_config_type * const ptr,
int const processes )
void alarmed_process_audit ( void )
{
/* get out if the list is empty ; should not have been called if
* empty but ... just in case */
if ( ! _list.empty() )
/* Don't audit FM in service after the last query was successful.
* There is a blocking issue that needs to be dealt with */
if ( _pmon_ctrl_ptr->last_alarm_query_pass == true )
return ;
/*
* Query fm for existing pmon process alarms and
* for each that is found store their 'name' and
* 'severity' in the passed in queried_alarm_list.
*/
list<active_process_alarms_type> queried_alarm_list ;
int rc = query_alarms ( queried_alarm_list, get_ctrl_ptr()->my_hostname );
_pmon_ctrl_ptr->last_alarm_query_pass = (rc == PASS);
/* just return if query failed */
if ( _pmon_ctrl_ptr->last_alarm_query_pass == false )
return ;
if ( queried_alarm_list.size () )
{
list<active_process_alarms_type>::iterator _iter_ptr ;
alog ("audit found %ld active alarms", queried_alarm_list.size());
/* loop over the list ... */
for ( _iter_ptr=_list.begin(); _iter_ptr!=_list.end(); ++_iter_ptr )
for ( _iter_ptr=queried_alarm_list.begin();
_iter_ptr!=queried_alarm_list.end();
++_iter_ptr )
{
/* for each item assume it is not found */
bool found = false ;
alog ("%s audit", _iter_ptr->process.c_str());
/* try and find this process in the new process profile */
for ( int i = 0 ; i < processes ; i++ )
/* find this process*/
for ( int i = 0 ; (i < _pmon_ctrl_ptr->processes) && !found ; i++ )
{
if ( ! _iter_ptr->process.compare((ptr+i)->process) )
{
/* If the process is found then mark it as failed and update its severity.
* At this point we then assume that there is an alarm raised for this process. */
found = true ;
process_config_type * ptr = &process_config[i];
(ptr+i)->failed = false ;
wlog ("%s process was failed critical ; clearing existing alarm\n", _iter_ptr->process.c_str() );
pmonAlarm_clear ( get_ctrl_ptr()->my_hostname, PMON_ALARM_ID__PMOND, _iter_ptr->process );
if ( ! _iter_ptr->process.compare(ptr->process) )
{
found = true ;
if ( ptr->failed == false )
{
ilog ("%s stale alarm ; clearing",
_iter_ptr->process.c_str() );
pmonAlarm_clear ( get_ctrl_ptr()->my_hostname,
PMON_ALARM_ID__PMOND,
_iter_ptr->process );
}
else if ( _iter_ptr->severity != ptr->alarm_severity )
{
wlog ("%s alarm severity mismatch ; %s -> %s ; correcting",
ptr->process,
alarmUtil_getSev_str(_iter_ptr->severity).c_str(),
alarmUtil_getSev_str(ptr->alarm_severity).c_str());
if ( ptr->alarm_severity == FM_ALARM_SEVERITY_MINOR )
{
pmonAlarm_minor(get_ctrl_ptr()->my_hostname,
PMON_ALARM_ID__PMOND,
ptr->process, 0);
}
else if (ptr->alarm_severity == FM_ALARM_SEVERITY_MAJOR )
{
pmonAlarm_major(get_ctrl_ptr()->my_hostname,
PMON_ALARM_ID__PMOND,
ptr->process);
}
else if (ptr->alarm_severity == FM_ALARM_SEVERITY_CRITICAL )
{
pmonAlarm_critical(get_ctrl_ptr()->my_hostname,
PMON_ALARM_ID__PMOND,
ptr->process);
}
else
{
wlog ("%s unexpected severity '%s' ; clearing alarm",
ptr->process,
ptr->severity);
pmonAlarm_clear ( get_ctrl_ptr()->my_hostname,
PMON_ALARM_ID__PMOND,
ptr->process );
}
}
else
{
alog ("%s is alarmed '%s' ; audit",
ptr->process,
ptr->severity);
}
}
}
/* if not found then just clear the alarm */
if ( found == false)
{
wlog ("%s process alarm clear ; not in current process profile\n", _iter_ptr->process.c_str() );
pmonAlarm_clear ( get_ctrl_ptr()->my_hostname, PMON_ALARM_ID__PMOND, _iter_ptr->process );
wlog ("%s is not a monitored process ; clearing alarm",
_iter_ptr->process.c_str());
pmonAlarm_clear ( get_ctrl_ptr()->my_hostname,
PMON_ALARM_ID__PMOND,
_iter_ptr->process );
}
}
}
}
void pmon_service ( pmon_ctrl_type * ctrl_ptr )
{
std::list<int> socks ;
@ -1931,6 +1973,8 @@ void pmon_service ( pmon_ctrl_type * ctrl_ptr )
{
_get_events ();
mtcTimer_start ( pmonTimer_audit, pmon_timer_handler, audit_period );
alarmed_process_audit ();
}
/* Run the degrade set/clear by audit */

15
mtce/src/pmon/scripts/pmon.logrotate Executable file → Normal file
View File

@ -1,16 +1,19 @@
#daily
nodateext
#
# Copyright (c) 2015-2021 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
/var/log/pmond.log
{
nodateext
size 10M
create 0640 root root
start 1
missingok
size 10M
rotate 20
compress
sharedscripts
notifempty
missingok
postrotate
systemctl reload syslog-ng > /dev/null 2>&1 || true
endscript
delaycompress
}

0
mtce/src/pmon/scripts/pmond.conf Executable file → Normal file
View File

View File

@ -1,7 +1,11 @@
#
# Copyright (c) 2020-2021 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
/var/log/crash/vmcore.tar
/var/log/crash/vmcore_first.tar
{
nodateext
size 1K
start 1
rotate 1

View File

@ -87,6 +87,10 @@ sched_delay_threshold = 300 ; scheduler delay time in msecs that will trigger
daemon_log_port = 2121 ; daemon logger port
mtcalarm_req_port = 2122 ;
sync_b4_peer_ctrlr_reset = 0 ; issue a sync command to peer controller mtcClient
; before issuing BMC reset.
[timeouts] ; configurable maintenance timeout values in seconds
failsafe_shutdown_delay = 120;

View File

@ -1,59 +1,67 @@
#daily
# Apply all these options to all the logs
nodateext
start 1
compress
notifempty
missingok
sharedscripts
postrotate
systemctl reload syslog-ng > /dev/null 2>&1 || true
endscript
#
# Copyright (c) 2015-2021 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
/var/log/mtcAgent.log
{
size 100M
create 0640 root root
start 1
rotate 10
size 100M
compress
notifempty
missingok
postrotate
systemctl reload syslog-ng > /dev/null 2>&1 || true
endscript
delaycompress
}
/var/log/hbsAgent.log
{
size 20M
rotate 5
}
/var/log/mtcClient.log
{
size 20M
rotate 5
}
/var/log/hbsClient.log
{
size 20M
create 0640 root root
start 1
rotate 5
size 20M
compress
notifempty
missingok
postrotate
systemctl reload syslog-ng > /dev/null 2>&1 || true
endscript
delaycompress
}
/var/log/mtclogd.log
{
size 10M
create 0640 root root
start 1
rotate 5
size 10M
compress
notifempty
missingok
postrotate
systemctl reload syslog-ng > /dev/null 2>&1 || true
endscript
delaycompress
}
# The mtclogd opens and closes these log files on every log addition.
# Therefore no notification is required over log rotation.
/var/log/mtcAgent_event.log
/var/log/mtcAgent_alarm.log
/var/log/mtcAgent_api.log
{
size 20M
create 0640 root root
start 1
rotate 5
}
/var/log/mtcAgent_event.log
{
size 20M
rotate 5
}
/var/log/mtcAgent_alarm.log
{
size 10M
rotate 5
compress
notifempty
missingok
delaycompress
}

View File

@ -18,6 +18,28 @@ usage ()
exit 1
}
# Systemd automatically remounts all the mounted filesystems at shutdown.
# When we are deleting a partition, we have to unmount its corresponding
# filesystem, because remounting deleted filesystems at shutdown will
# throw errors.
#
# Arguments: $1 - device or mount point, passed to findmnt and umount
# Outputs:   status messages on stdout
# Returns:   0 - filesystem was unmounted
#            1 - no argument was given, or umount failed
#            2 - findmnt does not report the filesystem as mounted
unmount_fs()
{
    local fs=${1:-}
    local ret_code=0

    # With an empty argument a bare 'findmnt' would list every mount and
    # succeed, so reject it up front instead of probing.
    if [[ -z "$fs" ]]; then
        echo "Error! Failed to unmount: no filesystem specified"
        return 1
    fi

    echo "Trying to unmount $fs"
    # "$fs" is quoted so paths with spaces or glob characters reach the
    # tools as a single word; '--' stops a leading '-' being read as an
    # option.
    if findmnt -- "$fs" > /dev/null 2>&1 ; then
        if umount -f -- "$fs" ; then
            echo "$fs has been successfully unmounted"
        else
            echo "Error! Failed to unmount $fs"
            ret_code=1
        fi
    else
        echo "Warning! $fs is not mounted"
        ret_code=2
    fi
    return $ret_code
}
OPTS=`getopt -o h -l force -- "$@"`
if [ $? != 0 ]
then
@ -100,11 +122,14 @@ fi
BACKUP_PART_GUID="BA5EBA11-0000-1111-2222-000000000002"
part_type_guid_str="Partition GUID code"
# get the nodetype variable to check later if this node is a controller
. /etc/platform/platform.conf
for dev in $WIPE_HDD
do
if [[ -e $dev ]]
then
if [ "$dev" == "$rootfs" ]
if [[ "$dev" == "$rootfs" && "${nodetype}" == "controller" ]]
then
part_numbers=( $(parted -s $dev print | awk '$1 == "Number" {i=1; next}; i {print $1}') )
for part_number in "${part_numbers[@]}"; do
@ -128,6 +153,7 @@ do
# Skip / or we will lose access to the tools on the system.
if [[ $part != $rootfs_part ]]
then
unmount_fs $part
dd if=/dev/zero of=$part bs=512 count=34
dd if=/dev/zero of=$part bs=512 count=34 seek=$((`blockdev --getsz $part` - 34))
fi
@ -141,6 +167,7 @@ do
else
echo "Wiping $dev..."
wipefs -f -a $dev
unmount_fs $dev
# Clearing previous GPT tables or LVM data
# Delete the first few bytes at the start and end of the partition. This is required with